@@ -77,6 +77,11 @@ class OlmoEarthWandBCallback(WandBCallback):
7777 upload_dataset_distribution_pre_train : bool = True
7878 upload_modality_data_band_distribution_pre_train : bool = False
7979 restart_on_same_run : bool = True
80+ # Optional explicit path to the file storing this run's wandb id. When set,
81+ # the run resumes from the id in that file, or writes its own id there if the
82+ # file holds none yet. This lets multiple separate jobs (e.g. the in-loop
83+ # beaker eval jobs) share one wandb run id so their metrics consolidate into a single run instead of one run per job.
84+ runid_path : str | None = None
8085
8186 def pre_train (self ) -> None :
8287 """Pre-train callback for the wandb callback."""
@@ -88,10 +93,17 @@ def pre_train(self) -> None:
8893 wandb_dir = Path (self .trainer .save_folder ) / "wandb"
8994 wandb_dir .mkdir (parents = True , exist_ok = True )
9095 resume_id = None
91- if self .restart_on_same_run :
92- runid_file = wandb_dir / "wandb_runid.txt"
93- if runid_file .exists ():
94- resume_id = runid_file .read_text ().strip ()
96+ # A shared runid_path (set for in-loop beaker eval jobs) makes every
97+ # job resume one wandb run; otherwise fall back to the per-run file
98+ # when restart_on_same_run is set.
99+ runid_file = (
100+ Path (self .runid_path )
101+ if self .runid_path
102+ else wandb_dir / "wandb_runid.txt"
103+ )
104+ use_runid_file = self .runid_path is not None or self .restart_on_same_run
105+ if use_runid_file and runid_file .exists ():
106+ resume_id = runid_file .read_text ().strip ()
95107
96108 self .wandb .init (
97109 dir = wandb_dir ,
@@ -107,7 +119,8 @@ def pre_train(self) -> None:
107119 settings = self .wandb .Settings (init_timeout = 240 ),
108120 )
109121
110- if not resume_id and self .restart_on_same_run :
122+ if not resume_id and use_runid_file :
123+ runid_file .parent .mkdir (parents = True , exist_ok = True )
111124 runid_file .write_text (self .run .id )
112125
113126 self ._run_path = self .run .path # type: ignore
0 commit comments