@@ -19,6 +19,12 @@ def train_dqn(
1919 """
2020 Train a DQN agent to play Quoridor.
2121
22+ Julian notes:
23+ - This is for now working for a trivial 3x3 board with no walls
24+ - It teaches the agent to use black (player 2) only, against a random agent
25+ Note that in a 3x3 board with no walls, black can always force a win
26+ - It's currently not assigning negative rewards for losing
27+
2228 Args:
2329 episodes: Number of episodes to train for
2430 batch_size: Size of batches to sample from replay buffer
@@ -32,11 +38,8 @@ def train_dqn(
3238 """
3339 game = env (board_size = board_size , max_walls = max_walls , step_rewards = step_rewards )
3440
35- # Calculate action space size
36- action_size = board_size ** 2 + ((board_size - 1 ) ** 2 ) * 2
37-
3841 # Create the DQN agent
39- dqn_agent = FlatDQNAgent (board_size , action_size )
42+ dqn_agent = FlatDQNAgent (board_size , epsilon_decay = 0.9999 )
4043
4144 # Create a random opponent
4245 random_agent = RandomAgent ()
@@ -50,7 +53,6 @@ def train_dqn(
5053 for episode in range (episodes ):
5154 game .reset ()
5255
53- # Reset episode-specific variables
5456 episode_reward = 0
5557 episode_losses = []
5658
@@ -59,11 +61,12 @@ def train_dqn(
5961 observation , reward , termination , truncation , _ = game .last ()
6062
6163 # If the game is over, break the loop
64+ # TODO: Assign negative reward to the DQN agent when the opponent wins
6265 if termination or truncation :
6366 break
6467
6568 # If it's the DQN agent's turn
66- if agent_name == "player_0 " :
69+ if agent_name == "player_1 " :
6770 # Get current state
6871 state = dqn_agent .preprocess_observation (observation )
6972
@@ -73,8 +76,17 @@ def train_dqn(
7376 # Execute action
7477 game .step (action )
7578
76- # Get new state, reward, etc.
77- next_observation , reward , termination , truncation , _ = game .last ()
79+ # Get the observation and rewards for THIS agent (not the opponent)
80+ # NOTE: If we used game.last() it would return the observation and rewards for the currently active agent
81+ # which, since we already did game.step(), is now the opponent
82+ next_observation = game .observe (agent_name )
83+
84+ # Make the reward much larger than 1, to make it stand out
85+ reward = game .rewards [agent_name ] * 1000
86+
87+ # See if the game is over
88+ # TODO: Understand what truncation is and whether either of these values is player-dependent
89+ _ , _ , termination , truncation , _ = game .last ()
7890
7991 # Add to episode reward
8092 episode_reward += reward
@@ -115,10 +127,16 @@ def train_dqn(
115127 # Update target network periodically
116128 if episode % update_target_every == 0 :
117129 dqn_agent .update_target_network ()
118- avg_reward = sum (total_rewards [- 100 :]) / min (100 , len (total_rewards )) if total_rewards else 0.0
119- avg_loss = sum (losses [- 100 :]) / min (100 , len (losses )) if losses else 0.0
130+ avg_reward = (
131+ sum (total_rewards [- 1 * update_target_every :]) / min (update_target_every , len (total_rewards ))
132+ if total_rewards
133+ else 0.0
134+ )
135+ avg_loss = (
136+ sum (losses [- 1 * update_target_every :]) / min (update_target_every , len (losses )) if losses else 0.0
137+ )
120138 print (
121- f"Episode { episode } /{ episodes } , Avg Reward: { avg_reward :.2f} , "
139+ f"Episode { episode + 1 } /{ episodes } , Avg Reward: { avg_reward :.2f} , "
122140 f"Avg Loss: { avg_loss :.4f} , Epsilon: { dqn_agent .epsilon :.4f} "
123141 )
124142
0 commit comments