Log all in-loop eval jobs to a single wandb run, which must be separate from the training run

piperwolters · piperwolters · commit ba3a085afa87 · 2026-06-25T18:01:11.000Z
diff --git a/olmoearth_pretrain/internal/loop_eval_launch.py b/olmoearth_pretrain/internal/loop_eval_launch.py
@@ -24,6 +24,7 @@
 
 import logging
 import os
+import secrets
 import subprocess  # nosec
 import sys
 from pathlib import Path
@@ -134,6 +135,16 @@ def launch_checkpoint_eval_job(
         cmd.append(f"--trainer.callbacks.wandb.group={wandb_group}")
     if wandb_run_name is not None:
         cmd.append(f"--trainer.callbacks.wandb.name={wandb_run_name}")
+    # All in-loop eval jobs for this training run resume one shared wandb run id,
+    # generated once here (in the single-threaded training process) and stored in
+    # a fixed file under the checkpoint dir. Pre-creating it means each eval job
+    # only ever reads + resumes this id -- so per-step metrics consolidate into a
+    # single wandb run (keyed on checkpoint_step) with no race even when eval jobs
+    # overlap, instead of creating a separate run per eval step.
+    shared_runid_file = UPath(checkpoint_dir) / "loop_eval_wandb_runid.txt"
+    if not shared_runid_file.exists():
+        shared_runid_file.write_text(secrets.token_hex(4))
+    cmd.append(f"--trainer.callbacks.wandb.runid_path={shared_runid_file}")
     if tasks_to_run:
         cmd.append(
             "--trainer.callbacks.downstream_evaluator.tasks_to_run="
diff --git a/olmoearth_pretrain/train/callbacks/evaluator_callback.py b/olmoearth_pretrain/train/callbacks/evaluator_callback.py
@@ -979,6 +979,10 @@ def _launch_beaker_eval_job(self, step: int, task_names: list[str]) -> None:
             ),
             # Group all of this run's eval jobs together (and alongside training).
             wandb_group=train_run_name,
+            # All eval steps log to one consolidated wandb run (resumed via the
+            # shared runid file set in launch_checkpoint_eval_job), keyed on
+            # checkpoint_step -- instead of a separate run per eval step.
+            wandb_run_name=f"{train_run_name}_loop_evals",
             extra_overrides=self.beaker_eval_extra_overrides,
             log_dir=os.path.join(save_folder, "loop_eval_launch_logs"),
         )
diff --git a/olmoearth_pretrain/train/callbacks/wandb.py b/olmoearth_pretrain/train/callbacks/wandb.py
@@ -77,6 +77,11 @@ class OlmoEarthWandBCallback(WandBCallback):
     upload_dataset_distribution_pre_train: bool = True
     upload_modality_data_band_distribution_pre_train: bool = False
     restart_on_same_run: bool = True
+    # Optional path to the file holding this run's wandb id, honored even when
+    # restart_on_same_run is False. If the file exists its id is resumed;
+    # otherwise the new run's id is written to it. This lets separate jobs (e.g.
+    # in-loop beaker eval jobs) share one wandb run instead of one run per job.
+    runid_path: str | None = None
 
     def pre_train(self) -> None:
         """Pre-train callback for the wandb callback."""
@@ -88,10 +93,17 @@ def pre_train(self) -> None:
             wandb_dir = Path(self.trainer.save_folder) / "wandb"
             wandb_dir.mkdir(parents=True, exist_ok=True)
             resume_id = None
-            if self.restart_on_same_run:
-                runid_file = wandb_dir / "wandb_runid.txt"
-                if runid_file.exists():
-                    resume_id = runid_file.read_text().strip()
+            # Use the shared runid_path when given (in-loop beaker eval jobs), else
+            # the per-run file if restart_on_same_run. NOTE(review): the launcher
+            # builds runid_path with UPath; Path() assumes it is local -- confirm.
+            runid_file = (
+                Path(self.runid_path)
+                if self.runid_path
+                else wandb_dir / "wandb_runid.txt"
+            )
+            use_runid_file = self.runid_path is not None or self.restart_on_same_run
+            if use_runid_file and runid_file.exists():
+                resume_id = runid_file.read_text().strip()
 
             self.wandb.init(
                 dir=wandb_dir,
@@ -107,7 +119,8 @@ def pre_train(self) -> None:
                 settings=self.wandb.Settings(init_timeout=240),
             )
 
-            if not resume_id and self.restart_on_same_run:
+            if not resume_id and use_runid_file:
+                runid_file.parent.mkdir(parents=True, exist_ok=True)
                 runid_file.write_text(self.run.id)
 
             self._run_path = self.run.path  # type: ignore