NVIDIA · ChenhanYu · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
@@ -21,7 +21,7 @@ Mode is determined by which args you pass, not by which tool you call. One tool,
 |---|---|
 | `list_examples` | Enumerate bundled launcher YAMLs under `tools/launcher/examples/` with model + description metadata extracted from each YAML. Discovery primitive — call this first when you don't know which YAML to launch. |
 | `verify_setup(executor, ...)` | Fail-fast probe for the named executor. Docker: `docker info` (daemon up) + `docker info --format` runtime-registry check (looks for `"nvidia"` runtime registered by the NVIDIA Container Toolkit — no image pull, daemon-fast). Slurm: `ssh -o BatchMode=yes -o ConnectTimeout=5` to the cluster login node. Returns structured failure on auth / network / daemon issues — no exception. |
-| `submit_job(yaml_path, hf_local? \| cluster_host?, ...)` | Submit a launcher YAML. Mode resolved from mutually-exclusive args. Returns `experiment_id` (Slurm) or PID (Docker) immediately; the actual job runs detached. Auto-runs `verify_setup` first by default (skippable). |
+| `submit_job(yaml_path, hf_local? \| cluster_host?, ..., dry_run?)` | Submit a launcher YAML. Mode resolved from mutually-exclusive args. Returns `experiment_id` (Slurm) or PID (Docker) immediately; the actual job runs detached. Auto-runs `verify_setup` first by default (skippable). **Pass `dry_run=True`** to validate the YAML via `launch.py --dryrun --yes` without contacting the cluster / spawning a container / running sbatch — returns `{ok, dry_run: True, validated: bool, diagnostic?, exit_code, stdout_tail, stderr_tail, argv}` instead of `experiment_id`. Used by verify-task workflow stages (deployment_support, hidden_state_dump_support, mlm_eval, ...). |
 | `job_status(experiment_id)` | Filesystem-based status from nemo_run's experiment dir (`_DONE`, `status_*.out`). Returns `done` / `failed` / `running` plus per-task statuses. No in-memory registry; survives MCP server restarts. |
 | `job_logs(experiment_id, task?, tail?)` | Read `log_<task>.out` from the experiment dir. Per-task filtering + optional tail to truncate. |
 | `wait_for_experiment(experiment_id, timeout_sec?, poll_interval_sec?)` | Block until `job_status` returns `done` / `failed`, or until `timeout_sec` elapses. Single tool call replaces the agent's `while True: status; sleep` loop — saves tool-call turns and avoids overshooting the poll interval. Returns the final status plus `waited_seconds`. |

@@ -419,19 +419,41 @@ def submit_job_impl(
     job_name: str | None,
     extra_overrides: dict[str, str] | None,
     skip_verify: bool,
+    dry_run: bool = False,
 ) -> dict:
     """Submit a launcher YAML.
 
     Mode is determined by mutually-exclusive args:
       * ``hf_local`` set → Docker (local GPU)
       * ``cluster_host`` set → Slurm (remote SSH)
-      * Neither set → error
+      * Neither set → error (unless ``dry_run=True``)
       * Both set → error
 
+    When ``dry_run=True``, the launcher is invoked with ``--dryrun`` —
+    the YAML is parsed and validated but no cluster contact / no
+    container spawn / no sbatch happens. ``hf_local`` and
+    ``cluster_host`` are optional in dry-run mode (pass one to validate
+    that the YAML's executor-specific config compiles for the intended
+    target; omit both to validate just the YAML shape). ``verify_setup``
+    is skipped automatically — there's nothing to talk to.
+
     The actual orchestration is delegated to the launcher's
     ``core.run_jobs``. We don't re-implement nemo_run integration here —
     that lives upstream.
     """
+    # ---- Dry-run branch (no cluster contact) -----------------------
+    if dry_run:
+        return _submit_job_dry_run(
+            yaml_path=yaml_path,
+            hf_local=hf_local,
+            cluster_host=cluster_host,
+            cluster_user=cluster_user,
+            identity=identity,
+            job_dir=job_dir,
+            job_name=job_name,
+            extra_overrides=extra_overrides,
+        )
+
     # ---- Mode resolution -------------------------------------------
     if hf_local and cluster_host:
         return {
@@ -681,6 +703,164 @@ def submit_job_impl(
     }
 
 
+def _tail_text(stream: str | bytes | None, limit: int = 2000) -> str:
+    """Return the last ``limit`` characters of a captured stream as text.
+
+    Partial output attached to ``subprocess.TimeoutExpired`` is
+    documented as ``bytes`` regardless of ``text=True`` and may be
+    ``None`` when nothing was captured; ``str`` is accepted as well so
+    that a platform handing back already-decoded text cannot trigger an
+    ``AttributeError`` on ``.decode``. Bytes are decoded with
+    ``errors="replace"`` so a torn multi-byte sequence cannot raise.
+    """
+    if not stream:
+        return ""
+    if isinstance(stream, bytes):
+        stream = stream.decode(errors="replace")
+    return stream[-limit:]
+
+
+def _submit_job_dry_run(
+    *,
+    yaml_path: str,
+    hf_local: str | None,
+    cluster_host: str | None,
+    cluster_user: str | None,
+    identity: str | None,
+    job_dir: str | None,
+    job_name: str | None,
+    extra_overrides: dict[str, str] | None,
+) -> dict:
+    """Validate a launcher YAML by running ``launch.py --dryrun --yes``.
+
+    No cluster contact, no container spawn, no sbatch. Used by
+    verify-task workflow stages (deployment_support,
+    hidden_state_dump_support, mlm_eval, ...) that just need to confirm
+    a YAML compiles before declaring support is ready.
+
+    Args:
+        yaml_path: Launcher YAML, resolved via ``_normalize_yaml_path``.
+        hf_local: Forwarded as the ``hf_local=...`` override when set.
+        cluster_host: Exported to the child as ``SLURM_HOST`` when set;
+            it is not added to argv.
+        cluster_user: Forwarded as the ``user=...`` override when set.
+        identity: Forwarded as the ``identity=...`` override when set.
+        job_dir: Forwarded as the ``job_dir=...`` override when set.
+        job_name: Forwarded as the ``job_name=...`` override when set.
+        extra_overrides: Extra ``key=value`` overrides appended last.
+
+    Returns:
+        A dict that never contains ``experiment_id`` or ``pid`` —
+        there's nothing to track. Shapes by outcome:
+
+        * Precondition failure (YAML or launcher dir missing):
+          ``{ok: False, dry_run: True, reason, diagnostic, ...}``.
+        * The launcher could not be started, or timed out:
+          ``{ok: False, dry_run: True, validated: False, reason,
+          exit_code: None, stdout_tail, stderr_tail, diagnostic, argv}``.
+        * The launcher ran: ``{ok: True, dry_run: True, validated: bool,
+          exit_code: int, stdout_tail, stderr_tail, argv}`` plus
+          ``diagnostic`` when ``validated`` is False.
+
+    This function does not raise for a missing ``uv`` executable or a
+    hung launcher; both are reported as structured failures.
+    """
+    # Same path resolution as the live submit, so dry-run and live use
+    # exactly the same YAML.
+    abs_yaml = _normalize_yaml_path(yaml_path)
+    if not abs_yaml.exists():
+        return {
+            "ok": False,
+            "dry_run": True,
+            "reason": "yaml_not_found",
+            "yaml_path": yaml_path,
+            "resolved_path": str(abs_yaml),
+            "diagnostic": (
+                f"YAML not found at {abs_yaml}. Pass a path under "
+                f"tools/launcher/examples/ (relative), an absolute path, "
+                f"or one of the examples returned by list_examples."
+            ),
+        }
+
+    # Build argv. ``--dryrun`` prevents actual submission while still
+    # exercising the YAML loader, factory resolution, and arg parser.
+    # ``--yes`` pairs with ``--dryrun`` in every launcher CLI example
+    # (see `tools/launcher/CLAUDE.md:28` and `:93`, plus
+    # `tools/launcher/docs/contributing.md:24`). Without it, nemo_run's
+    # `run.cli.entrypoint` blocks on its confirmation prompt — and since
+    # we're capturing stdout (no TTY), the prompt would hang until the
+    # timeout fires.
+    argv = ["uv", "run", "launch.py", "--yaml", str(abs_yaml), "--dryrun", "--yes"]
+    named_overrides = (
+        ("hf_local", hf_local),
+        ("user", cluster_user),
+        ("identity", identity),
+        ("job_dir", job_dir),
+        ("job_name", job_name),
+    )
+    # Falsy values (None or "") are skipped, so only explicitly supplied
+    # settings reach the launcher.
+    argv.extend(f"{key}={value}" for key, value in named_overrides if value)
+    argv.extend(f"{k}={v}" for k, v in (extra_overrides or {}).items())
+
+    launcher_dir = _THIS_DIR.parent.parent / "launcher"
+    if not launcher_dir.exists():
+        return {
+            "ok": False,
+            "dry_run": True,
+            "reason": "launcher_dir_not_found",
+            "diagnostic": (
+                f"Expected tools/launcher/ at {launcher_dir} but it "
+                f"doesn't exist. modelopt-mcp must be installed from a "
+                f"Model-Optimizer clone or via uvx-from-git."
+            ),
+        }
+
+    # Propagate env so the launcher's factory resolution matches what
+    # the live submit would see (mainly: SLURM_HOST for slurm-factory
+    # default when cluster_host is set).
+    child_env = os.environ.copy()
+    child_env.setdefault("NEMORUN_HOME", os.getcwd())
+    if cluster_host:
+        child_env["SLURM_HOST"] = cluster_host
+
+    # Dry-run is fast (no network, no container), so this is generous.
+    timeout_sec = 60
+
+    # List-form argv, no shell. ``argv`` members are string-literal
+    # constants ("uv", "run", "launch.py", "--yaml", "--dryrun",
+    # "--yes"), a validated filesystem path (``str(abs_yaml)``), or
+    # key=value override strings sourced from typed MCP-tool args.
+    # B603 is a false positive here, matching the precedent of the
+    # other subprocess calls in this module (live submit, the verify
+    # probes, and the SSH probe).
+    try:
+        proc = subprocess.run(  # nosec B603
+            argv,
+            cwd=str(launcher_dir),
+            env=child_env,
+            capture_output=True,
+            text=True,
+            timeout=timeout_sec,
+            check=False,
+        )
+    except subprocess.TimeoutExpired as e:
+        return {
+            "ok": False,
+            "dry_run": True,
+            "validated": False,
+            "reason": "dry_run_timeout",
+            "exit_code": None,
+            "stdout_tail": _tail_text(e.stdout),
+            "stderr_tail": _tail_text(e.stderr),
+            "diagnostic": (
+                f"launch.py --dryrun did not return within {timeout_sec} seconds. "
+                "This usually means a YAML import / factory resolution "
+                "hung."
+            ),
+            "argv": argv,
+        }
+    except OSError as e:
+        # Raised when the executable itself cannot be started (most
+        # commonly ``uv`` missing from PATH). Report it like every other
+        # failure instead of letting the exception escape the tool.
+        return {
+            "ok": False,
+            "dry_run": True,
+            "validated": False,
+            "reason": "dry_run_exec_failed",
+            "exit_code": None,
+            "stdout_tail": "",
+            "stderr_tail": "",
+            "diagnostic": (
+                f"Could not start {argv[0]!r}: {e}. Make sure `uv` is "
+                f"installed and on PATH for the MCP server process."
+            ),
+            "argv": argv,
+        }
+
+    stdout_tail = _tail_text(proc.stdout)
+    stderr_tail = _tail_text(proc.stderr)
+
+    if proc.returncode != 0:
+        return {
+            "ok": True,  # The tool itself ran cleanly
+            "dry_run": True,
+            "validated": False,  # ...but the YAML failed validation
+            "exit_code": proc.returncode,
+            "stdout_tail": stdout_tail,
+            "stderr_tail": stderr_tail,
+            "diagnostic": (
+                f"launch.py --dryrun rejected the YAML (exit code "
+                f"{proc.returncode}). Common reasons: invalid YAML "
+                f"syntax, missing required fields, factory function "
+                f"not registered, or a referenced file (HF model path, "
+                f"container tag) doesn't exist. See stderr_tail for the "
+                f"specific error."
+            ),
+            "argv": argv,
+        }
+
+    # The success branch carries the same fields as the rejection branch
+    # except ``diagnostic`` (nothing to diagnose), so callers can read
+    # stderr_tail / exit_code uniformly.
+    return {
+        "ok": True,
+        "dry_run": True,
+        "validated": True,
+        "exit_code": 0,
+        "stdout_tail": stdout_tail,
+        "stderr_tail": stderr_tail,
+        "argv": argv,
+    }
+
+
 # ---------------------------------------------------------------------------
 # job_status / job_logs — filesystem-based
 # ---------------------------------------------------------------------------

@@ -207,6 +207,28 @@ def submit_job(
                 )
             ),
         ] = False,
+        dry_run: Annotated[
+            bool,
+            Field(
+                description=(
+                    "If True, run `launch.py --dryrun --yes` to validate that "
+                    "the YAML parses, the factory resolves, and any "
+                    "referenced files exist — without contacting the "
+                    "cluster, spawning a container, or running sbatch. "
+                    "Used by verify-task workflow stages (deployment_support, "
+                    "hidden_state_dump_support, mlm_eval, ...) that only "
+                    "need to confirm a YAML compiles. Returns "
+                    "`{ok, dry_run: True, validated: bool, diagnostic?: str, "
+                    "exit_code: int|None, stdout_tail: str, stderr_tail: str, "
+                    "argv: list[str]}` with no `experiment_id`. Skips "
+                    "verify_setup automatically — "
+                    "no cluster contact happens in dry-run. `hf_local` / "
+                    "`cluster_host` are optional in this mode (pass one to "
+                    "validate executor-specific config, omit both to validate "
+                    "just the YAML shape)."
+                )
+            ),
+        ] = False,
     ) -> dict:
         return bridge.submit_job_impl(
             yaml_path=yaml_path,
@@ -218,6 +240,7 @@ def submit_job(
             job_name=job_name,
             extra_overrides=extra_overrides,
             skip_verify=skip_verify,
+            dry_run=dry_run,
         )
 
     @mcp.tool(