Skip to content

Commit fea75cd

Browse files
Merge pull request #329 from jonbinney/v2_b
Improvements in training
2 parents b426e6e + 5933199 commit fea75cd

File tree

7 files changed

+94
-41
lines changed

7 files changed

+94
-41
lines changed

deep_quoridor/experiments/B5W3/base.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
run_id: cucu-26b
1+
run_id: cucu-28d
22
quoridor:
33
board_size: 5
44
max_walls: 3

deep_quoridor/src/agents/alphazero/nn_evaluator.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,14 +284,16 @@ def train_iteration_v2(self, samples):
284284
self.cache = LRUCache(max_size=self.max_cache_size)
285285

286286
policy_loss, value_loss, total_loss = self.compute_losses(samples)
287+
assert policy_loss, "Expected policy_loss"
288+
assert value_loss, "Expected value_loss"
287289
assert total_loss, "Expected total_loss"
288290

289291
# Backward pass
290292
self.optimizer.zero_grad()
291293
total_loss.backward()
292294
self.optimizer.step()
293295

294-
return total_loss.item()
296+
return policy_loss.item(), value_loss.item(), total_loss.item()
295297

296298
def train_iteration(
297299
self,

deep_quoridor/src/arena.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ def __init__(
7171
plugins: list[ArenaPlugin] = [],
7272
swap_players: bool = True,
7373
max_steps: int = 1000,
74+
verbose: bool = True,
7475
):
7576
self.board_size = board_size
7677
self.max_walls = max_walls
@@ -81,6 +82,7 @@ def __init__(
8182

8283
self.renderers = renderers
8384
self.plugins = CompositeArenaPlugin([p for p in plugins + renderers + [saver] if p is not None])
85+
self.verbose = verbose
8486

8587
def _play_game(self, agent1: Agent, agent2: Agent, game_id: str) -> GameResult:
8688
self.game.reset()
@@ -119,7 +121,7 @@ def _play_game(self, agent1: Agent, agent2: Agent, game_id: str) -> GameResult:
119121
done=True,
120122
)
121123

122-
if truncation:
124+
if truncation and self.verbose:
123125
# Print the game state to help debug.
124126
print(f"\nP1: {agent1.name()} P2: {agent2.name()}")
125127
print(self.game.render())

deep_quoridor/src/metrics.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def tournament(self, agent: Agent):
183183
Runs a tournament for the given agent against predefined benchmarks. This was created for v2
184184
architecture, and has some duplicated code from compute().
185185
"""
186-
arena = Arena(self.board_size, self.max_walls, max_steps=self.max_steps, renderers=[MatchResultsRenderer()])
186+
arena = Arena(self.board_size, self.max_walls, max_steps=self.max_steps, verbose=False)
187187
# We store the elos of the opponents playing against each other so we don't have to play those matches
188188
# every time
189189
if not self.stored_elos:

deep_quoridor/src/utils/timer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def start(cls, name: str):
3939
cls.starts[name] = time.perf_counter()
4040

4141
@classmethod
42-
def finish(cls, name: str, episode: Optional[int] = None) -> str:
42+
def finish(cls, name: str, episode: Optional[int] = None) -> float:
4343
if name not in cls.starts:
4444
print(f"TIMER: WARNING - timer for {name} was not started but trying to finish")
4545
return ""
@@ -54,7 +54,7 @@ def finish(cls, name: str, episode: Optional[int] = None) -> str:
5454
if cls.wandb_run:
5555
cls.wandb_run.log({f"time-{name}": elapsed, "Episode": episode})
5656

57-
return format_time(elapsed)
57+
return elapsed
5858

5959
@classmethod
6060
def log_cumulative(cls, x_name: str, x_value: int | float):

deep_quoridor/src/v2/TODO.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# V1 Parity
2+
3+
- Replay buffer length: right now we're not rolling out old games to respect the length
4+
- Agent Evolution benchmark
5+
- Allow to set a finish point (e.g. maximum number of models or games)
6+
- Include a CI test
7+
- Overrides from the command line
8+
- Continuation
9+
- Upload models to wandb
10+
11+
# Other improvements and new features
12+
13+
- Use a schedule for the learning rate
14+
- Use a logger class rather than just printing out.
15+
16+
# Performance
17+
18+
- In train, sample from all the training iterations together
19+
- For the replay buffer files, either:
20+
- Move them in directories based on their game number (e.g. games_1000)
21+
- Join multiple games in one file (a bit more tricky with sampling and race conditions)
22+
23+
# Ideas
24+
25+
- Self-healing processes
26+
- Allow to dynamically change the number of workers and parallel games, to experiment with performance
27+
- Mount the run directory and make other processes play from another computer
28+
- The processes could write status files and we could have a script to watch the status (e.g. elapsed time).

deep_quoridor/src/v2/trainer.py

Lines changed: 56 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,15 @@
44

55
import numpy as np
66
import wandb
7+
from pydantic_yaml import parse_yaml_file_as
8+
from utils import Timer
79
from v2.common import MockWandb, create_alphazero
810
from v2.config import Config
9-
from v2.yaml_models import LatestModel
11+
from v2.yaml_models import GameInfo, LatestModel
1012

1113

1214
def train(config: Config):
13-
global azparams
1415
batch_size = config.training.batch_size
15-
training_iterations = 1
16-
min_new_games = 25
1716

1817
if config.wandb:
1918
run_id = f"{config.run_id}-training"
@@ -38,19 +37,18 @@ def train(config: Config):
3837
alphazero_agent.save_model(filename)
3938
LatestModel.write(config, str(filename), 0)
4039

40+
training_steps = 0
4141
last_game = 0
4242
model_version = 1
4343
moves_per_game = []
4444
game_filename = []
4545

4646
while True:
47-
while True:
48-
ready = [f for f in sorted(config.paths.replay_buffers_ready.glob("*.pkl")) if f.is_file()]
49-
if len(ready) >= min_new_games:
50-
break
51-
time.sleep(1)
47+
Timer.start("waiting-to-train")
48+
49+
# Process new games: find new files, move them and extract the info used for training
50+
ready = [f for f in sorted(config.paths.replay_buffers_ready.glob("*.pkl")) if f.is_file()]
5251

53-
# Process new games
5452
for f in ready:
5553
last_game += 1
5654

@@ -59,44 +57,67 @@ def train(config: Config):
5957
yaml_file = f.with_suffix(".yaml")
6058
new_yaml_name = new_name.with_suffix(".yaml")
6159
yaml_file.rename(new_yaml_name)
60+
game_info = parse_yaml_file_as(GameInfo, new_yaml_name)
6261

6362
f.rename(new_name)
6463
with open(new_name, "rb") as f:
6564
data = pickle.load(f)
66-
game_length = len(list(data))
67-
moves_per_game.append(game_length)
65+
moves_per_game.append(game_info.game_length)
6866
game_filename.append(f.name)
69-
wandb_run.log({"game_length": game_length, "Game num": last_game, "Model version": model_version})
67+
wandb_run.log(
68+
{
69+
"game_length": game_info.game_length,
70+
"model_lag": model_version - 1 - game_info.model_version,
71+
"Game num": last_game,
72+
"Model version": model_version,
73+
}
74+
)
7075

7176
total_moves = sum(moves_per_game)
72-
if total_moves < batch_size:
73-
continue
7477

75-
t0 = time.time()
76-
for _ in range(training_iterations):
77-
# Sample
78-
# TO DO, we need to roll out games when it's longer that the replay buffer size
79-
# TO DO probably we want to sample for all the training iterations together to make it faster
80-
samples = []
78+
games_needed_to_train = config.training.games_per_training_step * (training_steps + 1)
79+
80+
if total_moves < batch_size or games_needed_to_train > last_game:
81+
time.sleep(1)
82+
continue
8183

82-
games = np.random.choice(last_game, batch_size, p=[moves / total_moves for moves in moves_per_game])
83-
samples_per_game = Counter(games)
84-
for game_number in samples_per_game:
85-
file = config.paths.replay_buffers / game_filename[game_number]
86-
with open(file, "rb") as f:
87-
data = pickle.load(f)
84+
time_waiting_to_train = Timer.finish("waiting-to-train")
8885

89-
samples.extend(np.random.choice(list(data), samples_per_game[game_number]))
86+
# Sample moves from the replay buffer files
87+
Timer.start("sample")
88+
samples = []
9089

91-
# print(f"{game_number}: {samples_per_game[game_number]}, {len(entries)}")
90+
games = np.random.choice(last_game, batch_size, p=[moves / total_moves for moves in moves_per_game])
91+
samples_per_game = Counter(games)
92+
for game_number in samples_per_game:
93+
file = config.paths.replay_buffers / game_filename[game_number]
94+
with open(file, "rb") as f:
95+
data = pickle.load(f)
9296

93-
# Train
94-
loss = alphazero_agent.evaluator.train_iteration_v2(samples)
95-
wandb_run.log({"loss": loss, "games_played": last_game, "Model version": model_version}, commit=True)
97+
samples.extend(np.random.choice(list(data), samples_per_game[game_number]))
98+
time_sample = Timer.finish("sample")
99+
100+
# Train the network for one step using the samples
101+
Timer.start("train")
102+
policy_loss, value_loss, total_loss = alphazero_agent.evaluator.train_iteration_v2(samples)
103+
training_steps += 1
104+
time_train = Timer.finish("train")
105+
106+
wandb_run.log(
107+
{
108+
"policy_loss": policy_loss,
109+
"value_loss": value_loss,
110+
"total_loss": total_loss,
111+
"games_played": last_game,
112+
"time-sample": time_sample,
113+
"time-train": time_train,
114+
"time-waiting-to-train": time_waiting_to_train,
115+
"Model version": model_version,
116+
},
117+
commit=True,
118+
)
96119

97-
print(f"Loss: {loss}")
98-
t1 = time.time()
99-
print(f"Sampling and training took {t1 - t0}")
120+
print(f"Sampling and training took {time_sample}, {time_train}")
100121

101122
new_model_filename = config.paths.checkpoints / f"model_{model_version}.pt"
102123
alphazero_agent.save_model(new_model_filename)

0 commit comments

Comments
 (0)