[nightshift] Deduplicate LM/DPO training setup in marin (#4255)

github-actions[bot] · Nightshift Agent · claude · web-flow · commit 34f2e2d2a127 · 2026-03-31T04:22:22.000Z
> *Forked paths converge—*
> *shared roots beneath the diff*
> *one function to bind*

## Summary

- Extracted `_prepare_training_run()` and `_submit_training_job()` from `run_levanter_train_lm` and `run_levanter_train_dpo`, which were ~50-line near-identical copies of each other
- Both public functions now delegate to the shared helpers, keeping only their unique logic (LM logs model config details; DPO does not)
- Net reduction of ~13 lines and elimination of a maintenance hazard where fixes to one path could easily be missed in the other

## Test plan

- [x] `uv run --package marin pytest tests/test_training.py -x` — 4/4 pass
- [x] `./infra/pre-commit.py --all-files --fix` — clean
- [ ] Manual: verify experiment pipelines that call `run_levanter_train_lm` / `run_levanter_train_dpo` still work end-to-end

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Nightshift Agent <nightshift-agent@marin-community.github.io>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/lib/marin/src/marin/training/training.py b/lib/marin/src/marin/training/training.py
@@ -6,6 +6,7 @@
 import os
 from copy import deepcopy
 from dataclasses import dataclass, replace
+from collections.abc import Callable
 from typing import TypeVar
 
 import draccus
@@ -218,22 +219,13 @@ def _disable_xla_autotune_subcache(env: dict) -> None:
     logger.info("XLA sub-caches disabled (compilation cache is remote: %s)", cache_dir)
 
 
-def run_levanter_train_lm(config: TrainLmOnPodConfig):
-    """
-    Run the Levanter training main function on a Ray cluster.
-
-    This function is designed to be run on your machine or with sufficient variables in the env dict/os env.
-    It should also be run with a Ray cluster already running.
-
-    - WANDB_API_KEY: The API key for Weights and Biases.
-    - RUN_ID: (Optional) The run ID for this training run. Will default to a random UID if not set.
-    - GIT_COMMIT: (Optional) The git commit hash of the current codebase. Will attempt to fetch it if not set.
+def _prepare_training_run(
+    config: TrainOnPodConfigT,
+) -> tuple[TrainOnPodConfigT, TrainLmConfig | TrainDpoConfig, dict[str, str], list[str]]:
+    """Shared setup for LM and DPO training: env vars, run ID, config adjustments.
 
-    This function makes a number of changes to the config and ensures a few things are set:
-    - The run ID is set, or sets a default if not.
-    - WANDB_API_KEY is set.
-    - It disables the auto-ray-start and auto-worker-start options since we're already in a Ray cluster.
-    - It checks that configured GCS paths are in the same region as the VM (except train/validation source URLs).
+    Returns the updated pod config, the ready-to-use train config, the
+    environment dict, and the Fray extras list.
     """
     default_launch_config = levanter.infra.cli_helpers.load_config()
 
@@ -245,7 +237,6 @@ def run_levanter_train_lm(config: TrainLmOnPodConfig):
         config.env_vars or {},
         default_launch_config.env_for_accel(config.resources.device.variant),
     )
-    # if we're on tpu, ensure we have wandb
     if isinstance(config.resources.device, TpuConfig):
         _check_for_wandb_key(env)
 
@@ -261,16 +252,6 @@ def run_levanter_train_lm(config: TrainLmOnPodConfig):
     config = _enforce_run_id(config)
     logger.info(f"Using run ID: {config.train_config.trainer.id}")
 
-    model_config = config.train_config.model
-    logger.info(
-        "Model config: type=%s seq_len=%d hidden=%d batch=%s device=%s",
-        type(model_config).__name__,
-        model_config.max_seq_len,
-        model_config.Embed.size,
-        config.train_config.trainer.train_batch_size,
-        config.resources.device,
-    )
-
     train_config = config.train_config
     train_config = _suppress_ray_config(train_config)
     train_config = _maybe_override_auto_build_caches(train_config, config.auto_build_caches)
@@ -283,87 +264,93 @@ def run_levanter_train_lm(config: TrainLmOnPodConfig):
     if not isinstance(config.resources.device, CpuConfig):
         _doublecheck_paths(config)
 
-    client = current_client()
-
-    extras = []
+    extras: list[str] = []
     if isinstance(config.resources.device, TpuConfig):
         extras.append("tpu")
     elif isinstance(config.resources.device, GpuConfig):
         extras.append("gpu")
 
-    # Note: Using a constant job name allows restarts to adopt the existing job handle
+    return config, train_config, env, extras
+
+
+def _submit_training_job(
+    *,
+    job_name: str,
+    main_fn: Callable,
+    train_config: TrainConfigT,
+    resources: ResourceConfig,
+    env: dict[str, str],
+    extras: list[str],
+) -> None:
+    """Submit a Levanter training job to Fray and block until completion."""
+    client = current_client()
+    # Using a constant job name allows restarts to adopt the existing job handle
     # instead of raising a duplicate name error (adopt_existing=True is the default).
     job_request = JobRequest(
-        name="train_lm",
-        entrypoint=Entrypoint.from_callable(train_lm.main, args=[train_config]),
-        resources=config.resources,
+        name=job_name,
+        entrypoint=Entrypoint.from_callable(main_fn, args=[train_config]),
+        resources=resources,
         environment=create_environment(env_vars=env, extras=extras),
         max_retries_failure=10,
     )
     job = client.submit(job_request)
     job.wait(raise_on_failure=True)
 
 
-def run_levanter_train_dpo(config: TrainDpoOnPodConfig):
-    """
-    Run the Levanter DPO training main function on a Ray cluster.
+def run_levanter_train_lm(config: TrainLmOnPodConfig):
+    """Run the Levanter LM training main function on a Ray cluster.
 
     This function is designed to be run on your machine or with sufficient variables in the env dict/os env.
     It should also be run with a Ray cluster already running.
-    """
-    default_launch_config = levanter.infra.cli_helpers.load_config()
 
-    if config.output_path is not None:
-        logger.info(f"Using output path: {config.output_path}")
-        config = _update_config_to_use_out_path(config)
-
-    env = _add_default_env_variables(
-        config.env_vars or {},
-        default_launch_config.env_for_accel(config.resources.device.variant),
-    )
-    if isinstance(config.resources.device, TpuConfig):
-        _check_for_wandb_key(env)
-
-    env = _add_run_env_variables(env)
-
-    if "JAX_COMPILATION_CACHE_DIR" not in env:
-        env["JAX_COMPILATION_CACHE_DIR"] = _normalize_jax_compilation_cache_dir(
-            marin_temp_bucket(ttl_days=30, prefix="compilation-cache")
-        )
-        logger.info("JAX compilation cache: %s", env["JAX_COMPILATION_CACHE_DIR"])
-    _disable_xla_autotune_subcache(env)
+    - WANDB_API_KEY: The API key for Weights and Biases.
+    - RUN_ID: (Optional) The run ID for this training run. Will default to a random UID if not set.
+    - GIT_COMMIT: (Optional) The git commit hash of the current codebase. Will attempt to fetch it if not set.
 
-    config = _enforce_run_id(config)
-    logger.info(f"Using run ID: {config.train_config.trainer.id}")
+    This function makes a number of changes to the config and ensures a few things are set:
+    - The run ID is set, or sets a default if not.
+    - WANDB_API_KEY is set.
+    - It disables the auto-ray-start and auto-worker-start options since we're already in a Ray cluster.
+    - It checks that configured GCS paths are in the same region as the VM (except train/validation source URLs).
+    """
+    config, train_config, env, extras = _prepare_training_run(config)
 
-    train_config = config.train_config
-    train_config = _suppress_ray_config(train_config)
-    train_config = _maybe_override_auto_build_caches(train_config, config.auto_build_caches)
+    model_config = train_config.model
+    logger.info(
+        "Model config: type=%s seq_len=%d hidden=%d batch=%s device=%s",
+        type(model_config).__name__,
+        model_config.max_seq_len,
+        model_config.Embed.size,
+        train_config.trainer.train_batch_size,
+        config.resources.device,
+    )
 
-    if config.resources.device.kind == "cpu":
-        trainer = replace(train_config.trainer, require_accelerator=False)
-        train_config = replace(train_config, trainer=trainer)
+    _submit_training_job(
+        job_name="train_lm",
+        main_fn=train_lm.main,
+        train_config=train_config,
+        resources=config.resources,
+        env=env,
+        extras=extras,
+    )
 
-    if not isinstance(config.resources.device, CpuConfig):
-        _doublecheck_paths(config)
 
-    client = current_client()
+def run_levanter_train_dpo(config: TrainDpoOnPodConfig):
+    """Run the Levanter DPO training main function on a Ray cluster.
 
-    extras = []
-    if isinstance(config.resources.device, TpuConfig):
-        extras.append("tpu")
-    elif isinstance(config.resources.device, GpuConfig):
-        extras.append("gpu")
+    This function is designed to be run on your machine or with sufficient variables in the env dict/os env.
+    It should also be run with a Ray cluster already running.
+    """
+    config, train_config, env, extras = _prepare_training_run(config)
 
-    job_request = JobRequest(
-        name="train_dpo",
-        entrypoint=Entrypoint.from_callable(train_dpo.main, args=[train_config]),
+    _submit_training_job(
+        job_name="train_dpo",
+        main_fn=train_dpo.main,
+        train_config=train_config,
         resources=config.resources,
-        environment=create_environment(env_vars=env, extras=extras),
-        max_retries_failure=10,
+        env=env,
+        extras=extras,
     )
-    job = client.submit(job_request)
-    job.wait(raise_on_failure=True)
 
 
 def _doublecheck_paths(config: TrainOnPodConfigT):