Below is a complete implementation of tabular Q-learning in a grid-world environment with obstacles. The agent learns to navigate from the start cell to the goal, receiving penalties for bumping into walls or obstacle cells along the way.
import numpy as np
import random
class GridWorld:
    def __init__(self, size=4):
        self.size = size
        self.agent_pos = [0, 0]
        self.goal_pos = [size - 1, size - 1]
        self.obstacle_pos = [[1, 1], [2, 2]]  # Example obstacles
        # Actions: 0:up, 1:down, 2:left, 3:right
        self.actions = [0, 1, 2, 3]
        self.action_effects = {
            0: (-1, 0),  # Up
            1: (1, 0),   # Down
            2: (0, -1),  # Left
            3: (0, 1)    # Right
        }
        self.num_states = size * size
        self.num_actions = len(self.actions)

    def reset(self):
        self.agent_pos = [0, 0]
        return self.get_state()

    def get_state(self):
        # Convert 2D position to 1D state index
        return self.agent_pos[0] * self.size + self.agent_pos[1]

    def step(self, action_idx):
        action = self.action_effects[action_idx]
        new_pos = [self.agent_pos[0] + action[0], self.agent_pos[1] + action[1]]
        reward = -0.1  # Small negative reward for each step (encourages shorter paths)
        done = False
        # Check boundaries
        if not (0 <= new_pos[0] < self.size and 0 <= new_pos[1] < self.size):
            # Agent hit a wall, stay in place
            new_pos = self.agent_pos
            reward = -1.0  # Penalty for hitting wall
        elif new_pos in self.obstacle_pos:
            # Agent hit an obstacle, stay in place
            new_pos = self.agent_pos
            reward = -1.0  # Penalty for hitting obstacle
        elif new_pos == self.goal_pos:
            reward = 10.0  # Reward for reaching goal
            done = True
        self.agent_pos = new_pos
        return self.get_state(), reward, done


def q_learning(env, episodes=1000, alpha=0.1, gamma=0.99, epsilon=0.1):
    # Initialize Q-table with zeros
    Q = np.zeros((env.num_states, env.num_actions))
    for episode in range(episodes):
        state = env.reset()
        done = False
        while not done:
            # Epsilon-greedy action selection
            if random.random() < epsilon:
                action = random.randint(0, env.num_actions - 1)
            else:
                action = np.argmax(Q[state])
            # Take action and observe next state and reward
            next_state, reward, done = env.step(action)
            # Q-learning update
            Q[state, action] = Q[state, action] + alpha * (
                reward + gamma * np.max(Q[next_state]) - Q[state, action]
            )
            state = next_state
    return Q
# Example usage:
env = GridWorld(size=4)
Q = q_learning(env)
print("Learned Q-table:")
print(Q)
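The raw Q-table is hard to read on its own, so it can help to turn it into a greedy policy and inspect the route it produces. The sketch below assumes the env and Q objects from the example above; greedy_rollout and its max_steps cap are ad-hoc helpers added here as a safeguard against a policy that loops, not part of the original code.

# Minimal sketch for inspecting the learned policy (assumes `env` and `Q` from above).
action_names = ["up", "down", "left", "right"]

def greedy_rollout(env, Q, max_steps=50):
    # Follow the highest-value action from the start until the goal (or the step cap).
    state = env.reset()
    path = [tuple(env.agent_pos)]
    for _ in range(max_steps):
        action = int(np.argmax(Q[state]))
        state, reward, done = env.step(action)
        path.append(tuple(env.agent_pos))
        if done:
            break
    return path

print("Greedy path:", greedy_rollout(env, Q))

# Print the greedy action for each cell (G marks the goal, X an obstacle).
for row in range(env.size):
    cells = []
    for col in range(env.size):
        if [row, col] == env.goal_pos:
            cells.append("G")
        elif [row, col] in env.obstacle_pos:
            cells.append("X")
        else:
            cells.append(action_names[int(np.argmax(Q[row * env.size + col]))])
    print(" ".join(f"{c:>5}" for c in cells))

With the reward scheme above, the greedy path should typically reach the goal in six steps on the 4x4 grid while steering around the obstacle cells, though the exact route depends on how exploration played out during training.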