# Lesson 2: Q-Learning Agent
Now let's implement the Q-Learning algorithm from scratch! This is the core of your first RL agent.
## Learning Objectives
By the end of this lesson, you'll have:
- A complete Q-Learning implementation
- Understanding of the algorithm's components
- A working agent that can learn
## Q-Learning Algorithm
### Core Components
- Q-Table: Stores Q-values for state-action pairs
- Action Selection: ε-greedy policy for balancing exploration and exploitation
- Q-Value Update: Bellman update rule for learning (shown below)
- Episode Management: Training loop and evaluation
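The Q-value update is the standard tabular Q-Learning rule, which the `update_q_table` method below implements directly (on terminal steps the target reduces to just the reward):

$$
Q(s, a) \leftarrow Q(s, a) + \alpha \big[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \big]
$$

Here $\alpha$ is the learning rate, $\gamma$ the discount factor, $r$ the reward, and $s'$ the next state.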
## Step-by-Step Implementation
### 1. Create the Q-Learning Agent
Create `agents/q_learning.py`:
```python
import numpy as np
import random
from typing import Tuple, List


class QLearningAgent:
    def __init__(self, state_size: int, action_size: int,
                 learning_rate: float = 0.1,
                 discount_factor: float = 0.95,
                 epsilon: float = 1.0,
                 epsilon_decay: float = 0.995,
                 epsilon_min: float = 0.01):
        """
        Initialize Q-Learning agent

        Args:
            state_size: Number of possible states
            action_size: Number of possible actions
            learning_rate: Learning rate (alpha)
            discount_factor: Discount factor (gamma)
            epsilon: Initial exploration rate
            epsilon_decay: Rate of epsilon decay
            epsilon_min: Minimum epsilon value
        """
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        # Initialize Q-table with zeros
        self.q_table = np.zeros((state_size, action_size))

        # Training statistics
        self.episode_rewards = []
        self.episode_lengths = []

    def get_state_index(self, state) -> int:
        """
        Convert state to index for Q-table

        Args:
            state: Current state (can be tuple, list, or array)

        Returns:
            int: Index for Q-table
        """
        # For discrete states, convert to a single index
        if isinstance(state, (tuple, list)):
            # Simple hash-based conversion
            return hash(tuple(state)) % self.state_size
        return int(state) % self.state_size

    def choose_action(self, state) -> int:
        """
        Choose action using ε-greedy policy

        Args:
            state: Current state

        Returns:
            int: Chosen action
        """
        state_index = self.get_state_index(state)

        # Exploration vs exploitation
        if random.random() < self.epsilon:
            # Explore: choose a random action
            return random.randint(0, self.action_size - 1)
        else:
            # Exploit: choose the best known action
            return int(np.argmax(self.q_table[state_index]))

    def update_q_table(self, state, action: int, reward: float,
                       next_state, done: bool):
        """
        Update Q-table using the Bellman equation

        Args:
            state: Current state
            action: Action taken
            reward: Reward received
            next_state: Next state
            done: Whether episode is finished
        """
        state_index = self.get_state_index(state)
        next_state_index = self.get_state_index(next_state)

        # Current Q-value
        current_q = self.q_table[state_index, action]

        # Calculate target Q-value
        if done:
            target_q = reward
        else:
            target_q = reward + self.discount_factor * np.max(self.q_table[next_state_index])

        # Update Q-value using the Bellman equation
        self.q_table[state_index, action] = current_q + self.learning_rate * (target_q - current_q)

    def decay_epsilon(self):
        """Decay exploration rate"""
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def get_q_values(self, state) -> np.ndarray:
        """Get Q-values for a given state"""
        state_index = self.get_state_index(state)
        return self.q_table[state_index]

    def save_model(self, filepath: str):
        """Save Q-table to file"""
        np.save(filepath, self.q_table)

    def load_model(self, filepath: str):
        """Load Q-table from file"""
        self.q_table = np.load(filepath)
```
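A caveat on `get_state_index`: hashing can map two different states onto the same Q-table row, and such collisions silently mix their values. If your environment's states are small discrete coordinates, a direct, collision-free mapping is safer. A minimal sketch, assuming a hypothetical `(row, col)` state and a known `grid_width` (neither is defined above, so adapt it to your environment):

```python
def grid_state_to_index(state, grid_width: int) -> int:
    """Map a hypothetical (row, col) grid state to a unique Q-table row.

    Unlike the hash-based conversion, each cell gets exactly one index,
    so no two distinct states ever share Q-values.
    """
    row, col = state
    return int(row) * grid_width + int(col)
```

You could call this from `get_state_index` in place of the hash whenever the state space is a grid; just make sure `state_size` is at least `grid_height * grid_width`.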
### 2. Create the Training Loop
Create `training/train_q_learning.py`:
```python
import numpy as np
import matplotlib.pyplot as plt

from agents.q_learning import QLearningAgent
from environments.ants_saga_env import AntsSagaEnvironment


def train_q_learning(episodes: int = 1000,
                     max_steps: int = 100,
                     render: bool = False):
    """
    Train Q-Learning agent

    Args:
        episodes: Number of training episodes
        max_steps: Maximum steps per episode
        render: Whether to render environment
    """
    # Initialize environment and agent
    env = AntsSagaEnvironment()
    agent = QLearningAgent(
        state_size=env.state_size,
        action_size=env.action_size,
        learning_rate=0.1,
        discount_factor=0.95,
        epsilon=1.0,
        epsilon_decay=0.995,
        epsilon_min=0.01
    )

    # Training statistics
    episode_rewards = []
    episode_lengths = []
    epsilon_history = []

    print("Starting Q-Learning training...")

    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        steps = 0

        for step in range(max_steps):
            # Choose action
            action = agent.choose_action(state)

            # Take action
            next_state, reward, done, info = env.step(action)

            # Update Q-table
            agent.update_q_table(state, action, reward, next_state, done)

            # Update statistics
            total_reward += reward
            steps += 1
            state = next_state

            # Render if requested
            if render and episode % 100 == 0:
                env.render()

            # Break if episode is done
            if done:
                break

        # Decay epsilon once per episode
        agent.decay_epsilon()

        # Store statistics
        episode_rewards.append(total_reward)
        episode_lengths.append(steps)
        epsilon_history.append(agent.epsilon)

        # Print progress
        if episode % 100 == 0:
            avg_reward = np.mean(episode_rewards[-100:])
            print(f"Episode {episode}, Average Reward: {avg_reward:.2f}, Epsilon: {agent.epsilon:.3f}")

    # Save model
    agent.save_model("models/q_learning_model.npy")

    # Plot results
    plot_training_results(episode_rewards, episode_lengths, epsilon_history)

    return agent


def plot_training_results(rewards, lengths, epsilon_history):
    """Plot training results"""
    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10, 12))

    # Plot rewards
    ax1.plot(rewards)
    ax1.set_title("Episode Rewards")
    ax1.set_xlabel("Episode")
    ax1.set_ylabel("Total Reward")
    ax1.grid(True)

    # Plot episode lengths
    ax2.plot(lengths)
    ax2.set_title("Episode Lengths")
    ax2.set_xlabel("Episode")
    ax2.set_ylabel("Steps")
    ax2.grid(True)

    # Plot epsilon decay
    ax3.plot(epsilon_history)
    ax3.set_title("Epsilon Decay")
    ax3.set_xlabel("Episode")
    ax3.set_ylabel("Epsilon")
    ax3.grid(True)

    plt.tight_layout()
    plt.savefig("logs/training_results.png")
    plt.show()


if __name__ == "__main__":
    agent = train_q_learning(episodes=1000, render=False)
    print("Training completed!")
```
## Testing Your Implementation
### 1. Test Q-Table Initialization
```python
agent = QLearningAgent(state_size=10, action_size=4)
print(f"Q-table shape: {agent.q_table.shape}")
print(f"Initial Q-values: {agent.q_table[0]}")
```
### 2. Test Action Selection
```python
state = [1, 2, 3]
action = agent.choose_action(state)
print(f"Chosen action: {action}")
### 3. Test Q-Value Update
```python
agent.update_q_table(state, action, 1.0, [2, 3, 4], False)
print(f"Updated Q-values: {agent.get_q_values(state)}")
```
## Next Steps
Once your implementation is working:
- Proceed to the GameDev Unity section
- Review Getting Started if needed
- Check Glossary for more concepts
## Tips
- Start simple: Test with a basic environment first
- Monitor learning: Watch how rewards and epsilon change
- Tune hyperparameters: Experiment with different values
- Save frequently: Save your model after training
Excellent! You've implemented Q-Learning from scratch. Next, let's train your agent!