Skip to content

Commit ba3a085

Browse files
author
piperwolters
committed
Log all in-loop eval jobs to one shared wandb run; this run has to be separate from the training run
1 parent 92730f2 commit ba3a085

3 files changed

Lines changed: 33 additions & 5 deletions

File tree

olmoearth_pretrain/internal/loop_eval_launch.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
import logging
2626
import os
27+
import secrets
2728
import subprocess # nosec
2829
import sys
2930
from pathlib import Path
@@ -134,6 +135,16 @@ def launch_checkpoint_eval_job(
134135
cmd.append(f"--trainer.callbacks.wandb.group={wandb_group}")
135136
if wandb_run_name is not None:
136137
cmd.append(f"--trainer.callbacks.wandb.name={wandb_run_name}")
138+
# All in-loop eval jobs for this training run resume one shared wandb run id,
139+
# generated once here (in the single-threaded training process) and stored in
140+
# a fixed file under the checkpoint dir. Pre-creating it means each eval job
141+
# only ever reads + resumes this id -- so per-step metrics consolidate into a
142+
# single wandb run (keyed on checkpoint_step) with no race even when eval jobs
143+
# overlap, instead of creating a separate run per eval step.
144+
shared_runid_file = UPath(checkpoint_dir) / "loop_eval_wandb_runid.txt"
145+
if not shared_runid_file.exists():
146+
shared_runid_file.write_text(secrets.token_hex(4))
147+
cmd.append(f"--trainer.callbacks.wandb.runid_path={shared_runid_file}")
137148
if tasks_to_run:
138149
cmd.append(
139150
"--trainer.callbacks.downstream_evaluator.tasks_to_run="

olmoearth_pretrain/train/callbacks/evaluator_callback.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -979,6 +979,10 @@ def _launch_beaker_eval_job(self, step: int, task_names: list[str]) -> None:
979979
),
980980
# Group all of this run's eval jobs together (and alongside training).
981981
wandb_group=train_run_name,
982+
# All eval steps log to one consolidated wandb run (resumed via the
983+
# shared runid file set in launch_checkpoint_eval_job), keyed on
984+
# checkpoint_step -- instead of a separate run per eval step.
985+
wandb_run_name=f"{train_run_name}_loop_evals",
982986
extra_overrides=self.beaker_eval_extra_overrides,
983987
log_dir=os.path.join(save_folder, "loop_eval_launch_logs"),
984988
)

olmoearth_pretrain/train/callbacks/wandb.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,11 @@ class OlmoEarthWandBCallback(WandBCallback):
7777
upload_dataset_distribution_pre_train: bool = True
7878
upload_modality_data_band_distribution_pre_train: bool = False
7979
restart_on_same_run: bool = True
80+
# Optional explicit path to the file storing this run's wandb id. When set,
81+
# the run resumes the id stored there (or writes its new id if absent). This lets multiple
82+
# separate jobs (e.g. the in-loop beaker eval jobs) share one wandb run id so
83+
# their metrics consolidate into a single run instead of one run per job.
84+
runid_path: str | None = None
8085

8186
def pre_train(self) -> None:
8287
"""Pre-train callback for the wandb callback."""
@@ -88,10 +93,17 @@ def pre_train(self) -> None:
8893
wandb_dir = Path(self.trainer.save_folder) / "wandb"
8994
wandb_dir.mkdir(parents=True, exist_ok=True)
9095
resume_id = None
91-
if self.restart_on_same_run:
92-
runid_file = wandb_dir / "wandb_runid.txt"
93-
if runid_file.exists():
94-
resume_id = runid_file.read_text().strip()
96+
# A shared runid_path (set for in-loop beaker eval jobs) makes every
97+
# job resume one wandb run; otherwise fall back to the per-run file
98+
# when restart_on_same_run is set.
99+
runid_file = (
100+
Path(self.runid_path)
101+
if self.runid_path
102+
else wandb_dir / "wandb_runid.txt"
103+
)
104+
use_runid_file = self.runid_path is not None or self.restart_on_same_run
105+
if use_runid_file and runid_file.exists():
106+
resume_id = runid_file.read_text().strip()
95107

96108
self.wandb.init(
97109
dir=wandb_dir,
@@ -107,7 +119,8 @@ def pre_train(self) -> None:
107119
settings=self.wandb.Settings(init_timeout=240),
108120
)
109121

110-
if not resume_id and self.restart_on_same_run:
122+
if not resume_id and use_runid_file:
123+
runid_file.parent.mkdir(parents=True, exist_ok=True)
111124
runid_file.write_text(self.run.id)
112125

113126
self._run_path = self.run.path # type: ignore

0 commit comments

Comments
 (0)