Commit 6383474

Add scripts/submit_eval_jobs_new.py (#1638)
* Add scripts/submit_eval_jobs_new.py to submit olmo-eval-internal jobs via Beaker.
* Cleaned up script.
* Address PR review: dedup CHANGELOG, sanitize names, gate Weka mounts, use safe_dump.
* Address PR review: rename submit_eval_jobs scripts; add --olmo_eval_ref; deprecate old script.
* Point auto-launched evals at submit_eval_jobs_old.py to keep existing flag set working.
* Add PR link to CHANGELOG entry.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 5e93387 commit 6383474

7 files changed

Lines changed: 961 additions & 717 deletions

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ All notable changes to this project will be documented in this file.
 - Add `--no_auto_dataset_cache` to GRPO and SFT integration test scripts to avoid HuggingFace 504 timeouts on CI runner (https://github.com/allenai/open-instruct/pull/1571).
 
 ### Added
+- Replace `scripts/submit_eval_jobs.py` with a new olmo-eval-internal launcher (Beaker v2, no gantry); the previous script is preserved as `scripts/submit_eval_jobs_old.py` and emits a `DeprecationWarning` (https://github.com/allenai/open-instruct/pull/1638).
 - Add OLMo-core SFT implementation (https://github.com/allenai/open-instruct/pull/1579).
 - Add DR-TULU replication script for Qwen 3.5 4B with evolving rubrics, per-tool pool size overrides, `vllm_qwen3_xml` parser, and `<answer>` tag extraction in rubric scoring (https://github.com/allenai/open-instruct/pull/1609).
 - Add MiniMax provider support: register `minimax-m2.7` and `minimax-m2.7-highspeed` models in `PRICE_PER_TOKEN` for cost tracking and add cl100k_base encoding support in `context_window_checker` (https://github.com/allenai/open-instruct/pull/1602).
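The changelog entry says the preserved old script emits a `DeprecationWarning`. A minimal sketch of that pattern (the message text and helper name here are illustrative, not the actual script's code):

```python
import warnings


def warn_deprecated() -> None:
    # stacklevel=2 attributes the warning to the caller rather than this helper.
    warnings.warn(
        "scripts/submit_eval_jobs_old.py is deprecated; use the new "
        "olmo-eval-internal launcher instead.",
        DeprecationWarning,
        stacklevel=2,
    )


# DeprecationWarning is ignored by default outside __main__, so force it
# to be recorded here to demonstrate that it fires.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    warn_deprecated()
print(caught[0].category.__name__)  # DeprecationWarning
```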

open_instruct/launch_utils.py

Lines changed: 15 additions & 0 deletions
@@ -3,6 +3,8 @@
 
 from transformers.utils import hub as transformers_hub
 
+AUTO_CREATED_BEAKER_CONFIG_DIR = "configs/beaker_configs/auto_created"
+
 WEKA_CLUSTERS = [
     "ai2/jupiter",
     "ai2/saturn",
@@ -96,3 +98,16 @@ def upload_to_gs_bucket(src_path: str, dest_path: str) -> None:
     cmd = ["gsutil", "-o", "GSUtil:parallel_composite_upload_threshold=150M", "cp", "-r", src_path, dest_path]
     print(f"Copying model to GS bucket with command: {cmd}")
     live_subprocess_output(cmd)
+
+
+def validate_beaker_workspace(workspace: str) -> None:
+    parts = workspace.split("/")
+    if len(parts) != 2 or not all(parts):
+        raise ValueError(
+            f"--workspace must be fully qualified as '<org>/<workspace>' (e.g., 'ai2/oe-adapt-general'). Received: '{workspace}'"
+        )
+
+
+def auto_created_spec_path(experiment_name: str) -> str:
+    os.makedirs(AUTO_CREATED_BEAKER_CONFIG_DIR, exist_ok=True)
+    return os.path.join(AUTO_CREATED_BEAKER_CONFIG_DIR, f"{experiment_name}.yaml")
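The two helpers added to launch_utils.py are small enough to exercise standalone. A self-contained sketch reproducing them from the diff (assuming `os` is imported at module top, as the `os.makedirs` call implies):

```python
import os

AUTO_CREATED_BEAKER_CONFIG_DIR = "configs/beaker_configs/auto_created"


def validate_beaker_workspace(workspace: str) -> None:
    # A valid workspace is exactly '<org>/<workspace>' with both parts non-empty.
    parts = workspace.split("/")
    if len(parts) != 2 or not all(parts):
        raise ValueError(
            f"--workspace must be fully qualified as '<org>/<workspace>' "
            f"(e.g., 'ai2/oe-adapt-general'). Received: '{workspace}'"
        )


def auto_created_spec_path(experiment_name: str) -> str:
    # Ensure the auto-created config directory exists, then build the spec path.
    os.makedirs(AUTO_CREATED_BEAKER_CONFIG_DIR, exist_ok=True)
    return os.path.join(AUTO_CREATED_BEAKER_CONFIG_DIR, f"{experiment_name}.yaml")


validate_beaker_workspace("ai2/oe-adapt-general")  # passes silently
try:
    validate_beaker_workspace("oe-adapt-general")  # missing org part
except ValueError as e:
    print("rejected:", e)
print(auto_created_spec_path("my-eval"))
```

Note the validation rejects both a bare workspace name and anything with extra slashes or empty parts (e.g. `ai2/`), since `all(parts)` requires every segment to be non-empty.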

open_instruct/utils.py

Lines changed: 1 addition & 1 deletion
@@ -1237,7 +1237,7 @@ def launch_ai2_evals_on_weka(
     oe_eval_gpu_multiplier: int | None = None,
 ) -> None:
     command = f"""\
-python scripts/submit_eval_jobs.py \
+python scripts/submit_eval_jobs_old.py \
     --model_name {leaderboard_name} \
     --location {path} \
     --is_tuned \
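This change only swaps the script name inside the f-string command template; the trailing backslashes inside the triple-quoted string are escape sequences that splice the lines into one shell command. A minimal sketch of the pattern (illustrative function, not the actual `launch_ai2_evals_on_weka` body):

```python
def build_eval_command(leaderboard_name: str, path: str) -> str:
    # Each backslash-newline inside the triple-quoted f-string removes the
    # newline, so the result is a single-line shell command.
    return f"""\
python scripts/submit_eval_jobs_old.py \
--model_name {leaderboard_name} \
--location {path} \
--is_tuned"""


print(build_eval_command("my-model", "/weka/checkpoints/my-model"))
# python scripts/submit_eval_jobs_old.py --model_name my-model --location /weka/checkpoints/my-model --is_tuned
```

Pointing this template at submit_eval_jobs_old.py keeps the auto-launched evals working with the flag set the old script understands, per the commit message.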

scripts/collect_eval_results.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ def make_parser():
     ]
 
     parser = argparse.ArgumentParser(
-        description="""Point this script at a Beaker job created by `submit_eval_jobs.py`.
+        description="""Point this script at a Beaker job created by `submit_eval_jobs_old.py`.
         It will collect all evaluation metrics and dump them in a json
         file. It will also collect summary metrics for each task.""",
         epilog="""Usage example:
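The context lines show the `description`/`epilog` pattern used by collect_eval_results.py's parser. A simplified sketch of that pattern (the `--job_id` argument is illustrative; the real script's arguments differ):

```python
import argparse


def make_parser() -> argparse.ArgumentParser:
    # description appears above the options in --help; epilog appears below.
    return_parser = argparse.ArgumentParser(
        description="""Point this script at a Beaker job created by `submit_eval_jobs_old.py`.
It will collect all evaluation metrics and dump them in a json file.""",
        epilog="""Usage example:
python collect_eval_results.py --job_id <id>""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    return_parser.add_argument("--job_id", help="Beaker job id to collect metrics from.")
    return return_parser


args = make_parser().parse_args(["--job_id", "abc123"])
print(args.job_id)  # abc123
```

`RawDescriptionHelpFormatter` preserves the newlines in the triple-quoted strings; the default formatter would reflow them into one paragraph.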
