Merge pull request #317 from jonbinney/jdb/wandb-sweeps

jonbinney · web-flow · commit 1bf82917744e · 2025-11-27T16:03:35.000+01:00
Initial implementation of W&amp;B  Sweeps
diff --git a/deep_quoridor/src/plugins/wandb_train.py b/deep_quoridor/src/plugins/wandb_train.py
@@ -7,6 +7,7 @@
 from typing import Optional
 
 import wandb
+import wandb.wandb_run
 from agent_evolution_tournament import AgentEvolutionTournament
 from agents.core.trainable_agent import TrainableAgent
 from arena_utils import ArenaPlugin
@@ -54,6 +55,7 @@ def __init__(
         metrics: Metrics,
         agent_evolution_tournament: Optional[AgentEvolutionTournament] = None,
         include_raw_metrics: bool = False,
+        wandb_run: wandb.wandb_run.Run | None = None,
     ):
         self.params = params
         self.total_episodes = total_episodes
@@ -67,8 +69,29 @@ def __init__(
         self.agent_evolution_tournament = agent_evolution_tournament
         self.include_raw_metrics = include_raw_metrics
 
-    def _initialize(self, game):
-        assert self.agent
+        if wandb_run is None:
+            self.run = wandb.init(
+                project=self.params.project,
+                job_type="train",
+                id=self.params.run_id(),
+                group=f"{self.params.run_id()}",
+                notes=self.params.notes,
+            )
+        else:
+            self.run = wandb_run
+        Timer.set_wandb_run(self.run)
+
+        self._initialized = False
+
+    def _initialize(
+        self,
+        game,
+    ):
+        if self._initialized:
+            return
+
+        assert self.agent is not None
+        assert self.run is not None
 
         config = {
             "board_size": game.board_size,
@@ -78,23 +101,18 @@ def _initialize(self, game):
         }
         config.update(self.agent.model_hyperparameters())
 
-        self.run = wandb.init(
-            project=self.params.project,
-            job_type="train",
-            config=config,
-            tags=[self.agent.model_id(), f"-{self.params.run_id()}"],
-            id=self.params.run_id(),
-            group=f"{self.params.run_id()}",
-            notes=self.params.notes,
-        )
-        Timer.set_wandb_run(self.run)
+        self.run.config.update(config)
+        assert self.run.tags is not None
+        self.run.tags = self.run.tags + (self.agent.model_id(), f"-{self.params.run_id()}")
 
         wandb.define_metric("Loss step", hidden=True)
         wandb.define_metric("Epoch", hidden=True)
         wandb.define_metric("Episode", hidden=True)
         wandb.define_metric("loss_*", "Loss step")
         wandb.define_metric("*", "Episode")
 
+        self._initialized = True
+
     def start_game(self, game, agent1, agent2):
         if (self.agent is not None) and (self.agent != agent1) and (self.agent != agent2):
             raise ValueError("WandbTrainPlugin being used for an agent, but another agent is being trained")
diff --git a/deep_quoridor/src/train_alphazero.py b/deep_quoridor/src/train_alphazero.py
@@ -8,8 +8,11 @@
 import tempfile
 import time
 from dataclasses import asdict
+from pathlib import Path
 from typing import BinaryIO, Optional, cast
 
+import wandb
+import yaml
 from agent_evolution_tournament import AgentEvolutionTournament, AgentEvolutionTournamentParams
 from agents.alphazero.alphazero import AlphaZeroAgent, AlphaZeroBenchmarkOverrideParams, AlphaZeroParams
 from agents.alphazero.self_play_manager import GameParams, SelfPlayManager
@@ -22,12 +25,13 @@
 
 def train_alphazero(
     args: argparse.Namespace,
+    alphazero_params: str,
     wandb_train_plugin: Optional[WandbTrainPlugin],
 ):
     game_params = GameParams(args.board_size, args.max_walls, args.max_steps)
 
     # Create an agent that we'll use to do training.
-    training_params = parse_subargs(args.params, AlphaZeroParams)
+    training_params = parse_subargs(alphazero_params, AlphaZeroParams)
     assert isinstance(training_params, AlphaZeroParams)
     training_params.training_mode = True  # We only use this agent for training
     training_params.train_every = None  # We manually run training at the end of each epoch
@@ -74,7 +78,8 @@ def train_alphazero(
     if wandb_train_plugin is not None:
         # HACK: the start_game method only cares that "game" has board_size and max_walls
         # members, so we pass in a GameParams object. We have to call start_game
-        # because it calls the plugin's internal _intialize method which sets up metrics.
+        # because it calls the plugin's internal _intialize method which sets up metrics
+        # and creates the WandB run.
         wandb_train_plugin.start_game(game=args, agent1=training_agent, agent2=training_agent)
         training_agent.set_wandb_run(wandb_train_plugin.run)
         # Compute the tournament metrics with the initial model, possibly random initialized, to
@@ -168,25 +173,38 @@ def main(args):
 
     set_deterministic(args.seed)
 
-    t0 = time.time()
+    alphazero_params = args.params
 
     if args.wandb is None:
         wandb_train_plugin = None
     else:
+        # Create the benchmarks and evolution tournament, and then create the WandB training plugin
         wandb_params = parse_subargs(args.wandb, WandbParams)
         assert isinstance(wandb_params, WandbParams)
 
+        wandb_run = None
+        if args.sweep is not None:
+            # Instead of letting the wandb plugin start its own run, we create one from the sweep config
+            # and later pass it in to WandbTrainPlugin.start_game().
+            with open(args.sweep) as sweep_config_file:
+                sweep_config = yaml.load(sweep_config_file, Loader=yaml.FullLoader)
+
+            wandb_run = wandb.init(config=sweep_config)
+
+            # Apply the sweep params on top of the command line params
+            alphazero_params = override_subargs(alphazero_params, wandb_run.config["alphazero"])
+
         metrics = Metrics(
             args.board_size, args.max_walls, args.benchmarks, args.benchmarks_t, args.max_steps, args.num_workers
         )
-        agent_encoded_name = "alphazero:" + args.params
 
+        benchmark_params = alphazero_params
         if args.benchmarks_params:
-            benchmarks_params = parse_subargs(args.benchmarks_params, AlphaZeroBenchmarkOverrideParams)
-            assert isinstance(benchmarks_params, AlphaZeroBenchmarkOverrideParams)
-            override_args = {k: v for k, v in asdict(benchmarks_params).items() if v is not None}
+            benchmark_param_overrides = parse_subargs(args.benchmarks_params, AlphaZeroBenchmarkOverrideParams)
+            assert isinstance(benchmark_param_overrides, AlphaZeroBenchmarkOverrideParams)
+            override_args = {k: v for k, v in asdict(benchmark_param_overrides).items() if v is not None}
 
-            agent_encoded_name = "alphazero:" + override_subargs(args.params, override_args)
+            benchmark_params = override_subargs(benchmark_params, override_args)
 
         if args.agent_evolution is not None:
             agent_evolution_params = parse_subargs(args.agent_evolution, AgentEvolutionTournamentParams)
@@ -204,15 +222,16 @@ def main(args):
         wandb_train_plugin = WandbTrainPlugin(
             wandb_params,
             args.epochs * args.games_per_epoch,
-            agent_encoded_name,
+            "alphazero:" + benchmark_params,
             metrics,
             agent_evolution_tournament,
             include_raw_metrics=True,
+            wandb_run=wandb_run,
         )
 
     t0 = time.time()
 
-    train_alphazero(args, wandb_train_plugin=wandb_train_plugin)
+    train_alphazero(args, alphazero_params, wandb_train_plugin)
 
     t1 = time.time()
 
@@ -292,14 +311,16 @@ def main(args):
         type=str,
         help="Parameters for the Agent Evolution Tournament",
     )
+    parser.add_argument(
+        "--sweep",
+        default=None,
+        type=Path,
+        help="Path to WandB sweep config yaml file",
+    )
+
     args = parser.parse_args()
 
-    # Handle deprecated --max-game-length argument
-    if args.max_game_length is not None:
-        if args.max_steps != parser.get_default("max-steps"):  # Check if --max-steps was also provided (not default)
-            print("Warning: Both --max-game-length and --max-steps provided. Using --max-steps value.")
-        else:
-            print("Warning: --max-game-length is deprecated. Please use --max-steps instead.")
-            args.max_steps = args.max_game_length
+    if args.sweep is not None and args.wandb is None:
+        print("Enabling WandB since we're doing a sweep")
 
     main(args)