From 5c9fb63faef83cd2e21de4467bec89aa176f1001 Mon Sep 17 00:00:00 2001 From: Chenhan Yu Date: Sun, 14 Jun 2026 09:59:59 -0700 Subject: [PATCH 1/3] feat(tools/mcp): add dry_run flag to submit_job for verify-task workflow stages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Slice 2 prerequisite of OMNIML-5144 (the pensieve-intern SPEC migration to MCP tool calls). Without this, the four verify-task SPECs in spec_decoding_release (deployment_support, hidden_state_dump_support, training_support, synth_support) plus the three in nemotron_quant_release (mlm_eval, mlm_ptq, mlm_qad) all stay on `uv run launch.py --yaml X --dry-run` shell prose because the MCP catalog has no equivalent. `submit_job(yaml_path, ..., dry_run=True)`: * Validates the YAML via `launch.py --dry-run` — exercises the YAML loader, factory resolution, and arg parser; no cluster contact, no container spawn, no sbatch. * Skips `verify_setup` automatically (nothing to talk to). * `hf_local` / `cluster_host` are optional in dry-run mode (pass one to validate executor-specific config, omit both to validate just the YAML shape). * Returns `{ok, dry_run: True, validated: bool, exit_code, stdout_tail, stderr_tail, argv}` instead of `experiment_id`. Note: when the launcher rejects the YAML (exit code non-zero), the tool still returns `ok: True` because the TOOL ran cleanly — only `validated: False`. This is the same shape `verify_setup` uses (`ok: True` regardless of probe outcome; the probe outcome is the field the caller reads). Keeps the "did the tool work?" question separate from the "is the YAML valid?" question. Bridge impl is a clean fork at the top of `submit_job_impl` — the dry_run branch lives in a dedicated `_submit_job_dry_run` helper to keep the live-submission path uncluttered. Hermetic tests added in `tests/test_bridge.py`: - dry_run with a valid YAML → ok+validated, --dry-run in argv, no --yes - dry_run with launcher rejection → ok=True, validated=False, stderr_tail surfaces the YAML error - dry_run + missing yaml_path → yaml_not_found (carries dry_run flag for caller introspection) - dry_run bypasses verify_setup even with skip_verify=False README + scope policy unchanged — `dry_run` is universal launcher environment tooling, the SCOPE.md test ("would this exist whether or not workflow X existed?") passes trivially. Anchors: * OMNIML-5145 (this ticket — bridge enhancement) * OMNIML-5144 (parent SPEC migration; needs this to migrate spec_decoding_release + nemotron_quant_release support specs) * OMNIML-5123 (Epic) * NVIDIA/Model-Optimizer#1712 (SCOPE.md — environment tooling policy) * pensieve-intern MR !100 (slice 1 of OMNIML-5144 — specdec_bench cell.md migration, did not need dry_run because cell.md submits real cluster jobs, not validates configs) Co-Authored-By: Claude Opus 4.7 Signed-off-by: Chenhan Yu --- tools/mcp/README.md | 2 +- tools/mcp/modelopt_mcp/bridge.py | 161 ++++++++++++++++++++++++++++++- tools/mcp/modelopt_mcp/server.py | 21 ++++ tools/mcp/tests/test_bridge.py | 146 ++++++++++++++++++++++++++++ 4 files changed, 328 insertions(+), 2 deletions(-) diff --git a/tools/mcp/README.md b/tools/mcp/README.md index a9c6b84260b..638509b830e 100644 --- a/tools/mcp/README.md +++ b/tools/mcp/README.md @@ -21,7 +21,7 @@ Mode is determined by which args you pass, not by which tool you call. One tool, |---|---| | `list_examples` | Enumerate bundled launcher YAMLs under `tools/launcher/examples/` with model + description metadata extracted from each YAML. Discovery primitive — call this first when you don't know which YAML to launch. | | `verify_setup(executor, ...)` | Fail-fast probe for the named executor. Docker: `docker info` (daemon up) + `docker info --format` runtime-registry check (looks for `"nvidia"` runtime registered by the NVIDIA Container Toolkit — no image pull, daemon-fast). Slurm: `ssh -o BatchMode=yes -o ConnectTimeout=5` to the cluster login node. Returns structured failure on auth / network / daemon issues — no exception. | -| `submit_job(yaml_path, hf_local? \| cluster_host?, ...)` | Submit a launcher YAML. Mode resolved from mutually-exclusive args. Returns `experiment_id` (Slurm) or PID (Docker) immediately; the actual job runs detached. Auto-runs `verify_setup` first by default (skippable). | +| `submit_job(yaml_path, hf_local? \| cluster_host?, ..., dry_run?)` | Submit a launcher YAML. Mode resolved from mutually-exclusive args. Returns `experiment_id` (Slurm) or PID (Docker) immediately; the actual job runs detached. Auto-runs `verify_setup` first by default (skippable). **Pass `dry_run=True`** to validate the YAML via `launch.py --dry-run` without contacting the cluster / spawning a container / running sbatch — returns `{ok, dry_run: True, validated: bool}` instead of `experiment_id`. Used by verify-task workflow stages (deployment_support, hidden_state_dump_support, mlm_eval, ...). | | `job_status(experiment_id)` | Filesystem-based status from nemo_run's experiment dir (`_DONE`, `status_*.out`). Returns `done` / `failed` / `running` plus per-task statuses. No in-memory registry; survives MCP server restarts. | | `job_logs(experiment_id, task?, tail?)` | Read `log_.out` from the experiment dir. Per-task filtering + optional tail to truncate. | | `wait_for_experiment(experiment_id, timeout_sec?, poll_interval_sec?)` | Block until `job_status` returns `done` / `failed`, or until `timeout_sec` elapses. Single tool call replaces the agent's `while True: status; sleep` loop — saves tool-call turns and avoids overshooting the poll interval. Returns the final status plus `waited_seconds`. | diff --git a/tools/mcp/modelopt_mcp/bridge.py b/tools/mcp/modelopt_mcp/bridge.py index 03aef840d57..5eddc5cfab6 100644 --- a/tools/mcp/modelopt_mcp/bridge.py +++ b/tools/mcp/modelopt_mcp/bridge.py @@ -419,19 +419,41 @@ def submit_job_impl( job_name: str | None, extra_overrides: dict[str, str] | None, skip_verify: bool, + dry_run: bool = False, ) -> dict: """Submit a launcher YAML. Mode is determined by mutually-exclusive args: * ``hf_local`` set → Docker (local GPU) * ``cluster_host`` set → Slurm (remote SSH) - * Neither set → error + * Neither set → error (unless ``dry_run=True``) * Both set → error + When ``dry_run=True``, the launcher is invoked with ``--dry-run`` — + the YAML is parsed and validated but no cluster contact / no + container spawn / no sbatch happens. ``hf_local`` and + ``cluster_host`` are optional in dry-run mode (pass one to validate + that the YAML's executor-specific config compiles for the intended + target; omit both to validate just the YAML shape). ``verify_setup`` + is skipped automatically — there's nothing to talk to. + The actual orchestration is delegated to the launcher's ``core.run_jobs``. We don't re-implement nemo_run integration here — that lives upstream. """ + # ---- Dry-run branch (no cluster contact) ----------------------- + if dry_run: + return _submit_job_dry_run( + yaml_path=yaml_path, + hf_local=hf_local, + cluster_host=cluster_host, + cluster_user=cluster_user, + identity=identity, + job_dir=job_dir, + job_name=job_name, + extra_overrides=extra_overrides, + ) + # ---- Mode resolution ------------------------------------------- if hf_local and cluster_host: return { @@ -681,6 +703,143 @@ def submit_job_impl( } +def _submit_job_dry_run( + *, + yaml_path: str, + hf_local: str | None, + cluster_host: str | None, + cluster_user: str | None, + identity: str | None, + job_dir: str | None, + job_name: str | None, + extra_overrides: dict[str, str] | None, +) -> dict: + """Validate a launcher YAML by running ``launch.py --dry-run``. + + No cluster contact, no container spawn, no sbatch. Used by + verify-task workflow stages (deployment_support, + hidden_state_dump_support, mlm_eval, ...) that just need to confirm + a YAML compiles before declaring support is ready. + + Returns ``{ok, dry_run: True, validated: bool, diagnostics: str, + argv: [...], stdout_tail: str}``. Never returns ``experiment_id`` + or ``pid`` — there's nothing to track. + """ + # Same path resolution as the live submit, so dry-run and live use + # exactly the same YAML. + abs_yaml = _normalize_yaml_path(yaml_path) + if not abs_yaml.exists(): + return { + "ok": False, + "dry_run": True, + "reason": "yaml_not_found", + "yaml_path": yaml_path, + "resolved_path": str(abs_yaml), + "diagnostic": ( + f"YAML not found at {abs_yaml}. Pass a path under " + f"tools/launcher/examples/ (relative), an absolute path, " + f"or one of the examples returned by list_examples." + ), + } + + # Build argv — launch.py supports --dry-run as a flag that prevents + # actual submission while still exercising the YAML loader, factory + # resolution, and arg parser. Same argv shape as live submit minus + # the --yes (no confirmation prompt to bypass for dry-run, since + # nothing is actually submitted). + argv = ["uv", "run", "launch.py", "--yaml", str(abs_yaml), "--dry-run"] + if hf_local: + argv.append(f"hf_local={hf_local}") + if cluster_user: + argv.append(f"user={cluster_user}") + if identity: + argv.append(f"identity={identity}") + if job_dir: + argv.append(f"job_dir={job_dir}") + if job_name: + argv.append(f"job_name={job_name}") + for k, v in (extra_overrides or {}).items(): + argv.append(f"{k}={v}") + + launcher_dir = _THIS_DIR.parent.parent / "launcher" + if not launcher_dir.exists(): + return { + "ok": False, + "dry_run": True, + "reason": "launcher_dir_not_found", + "diagnostic": ( + f"Expected tools/launcher/ at {launcher_dir} but it " + f"doesn't exist. modelopt-mcp must be installed from a " + f"Model-Optimizer clone or via uvx-from-git." + ), + } + + # Propagate env so the launcher's factory resolution matches what + # the live submit would see (mainly: SLURM_HOST for slurm-factory + # default when cluster_host is set). + child_env = os.environ.copy() + child_env.setdefault("NEMORUN_HOME", os.getcwd()) + if cluster_host: + child_env["SLURM_HOST"] = cluster_host + + # Dry-run is fast (no network, no container) — 60s timeout is + # generous. B603 false positive: argv is a controlled list. + try: + proc = subprocess.run( # nosec B603 + argv, + cwd=str(launcher_dir), + env=child_env, + capture_output=True, + text=True, + timeout=60, + check=False, + ) + except subprocess.TimeoutExpired as e: + return { + "ok": False, + "dry_run": True, + "reason": "dry_run_timeout", + "diagnostic": ( + "launch.py --dry-run did not return within 60 seconds. " + "This usually means a YAML import / factory resolution " + f"hung. Partial stdout: " + f"{(e.stdout or b'').decode(errors='replace')[-400:]}" + ), + "argv": argv, + } + + stdout_tail = str(proc.stdout or "")[-2000:] + stderr_tail = str(proc.stderr or "")[-2000:] + + if proc.returncode != 0: + return { + "ok": True, # The tool itself ran cleanly + "dry_run": True, + "validated": False, # ...but the YAML failed validation + "exit_code": proc.returncode, + "stdout_tail": stdout_tail, + "stderr_tail": stderr_tail, + "diagnostic": ( + f"launch.py --dry-run rejected the YAML (exit code " + f"{proc.returncode}). Common reasons: invalid YAML " + f"syntax, missing required fields, factory function " + f"not registered, or a referenced file (HF model path, " + f"container tag) doesn't exist. See stderr_tail for the " + f"specific error." + ), + "argv": argv, + } + + return { + "ok": True, + "dry_run": True, + "validated": True, + "exit_code": 0, + "stdout_tail": stdout_tail, + "argv": argv, + } + + # --------------------------------------------------------------------------- # job_status / job_logs — filesystem-based # --------------------------------------------------------------------------- diff --git a/tools/mcp/modelopt_mcp/server.py b/tools/mcp/modelopt_mcp/server.py index 0ab325360c1..1b971fab5ca 100644 --- a/tools/mcp/modelopt_mcp/server.py +++ b/tools/mcp/modelopt_mcp/server.py @@ -207,6 +207,26 @@ def submit_job( ) ), ] = False, + dry_run: Annotated[ + bool, + Field( + description=( + "If True, run `launch.py --dry-run` to validate that " + "the YAML parses, the factory resolves, and any " + "referenced files exist — without contacting the " + "cluster, spawning a container, or running sbatch. " + "Used by verify-task workflow stages (deployment_support, " + "hidden_state_dump_support, mlm_eval, ...) that only " + "need to confirm a YAML compiles. Returns " + "`{ok, dry_run: True, validated: bool, diagnostics: ...}` " + "with no `experiment_id`. Skips verify_setup automatically — " + "no cluster contact happens in dry-run. `hf_local` / " + "`cluster_host` are optional in this mode (pass one to " + "validate executor-specific config, omit both to validate " + "just the YAML shape)." + ) + ), + ] = False, ) -> dict: return bridge.submit_job_impl( yaml_path=yaml_path, @@ -218,6 +238,7 @@ def submit_job( job_name=job_name, extra_overrides=extra_overrides, skip_verify=skip_verify, + dry_run=dry_run, ) @mcp.tool( diff --git a/tools/mcp/tests/test_bridge.py b/tools/mcp/tests/test_bridge.py index 216ba0e2a2c..874938647e5 100644 --- a/tools/mcp/tests/test_bridge.py +++ b/tools/mcp/tests/test_bridge.py @@ -270,6 +270,152 @@ def test_submit_job_yaml_not_found(monkeypatch, tmp_path): assert result["reason"] == "yaml_not_found" +# --------------------------------------------------------------------------- +# submit_job dry-run branch +# --------------------------------------------------------------------------- + + +def test_submit_job_dry_run_yaml_validates(monkeypatch, tmp_path): + """dry_run=True with a valid YAML → ok+validated, no cluster contact.""" + yaml_dir = tmp_path / "examples" / "fam" / "model" + yaml_dir.mkdir(parents=True) + yaml_path = yaml_dir / "config.yaml" + yaml_path.write_text("job_name: t\npipeline: []\n") + monkeypatch.setenv("MODELOPT_LAUNCHER_EXAMPLES_DIR", str(tmp_path / "examples")) + + captured = {} + + def fake_run(argv, **kwargs): + captured["argv"] = argv + return subprocess.CompletedProcess( + args=argv, + returncode=0, + stdout="Dry-run OK — YAML compiles\n", + stderr="", + ) + + monkeypatch.setattr(subprocess, "run", fake_run) + + result = bridge.submit_job_impl( + yaml_path="fam/model/config.yaml", + hf_local=None, + cluster_host=None, + cluster_user=None, + identity=None, + job_dir=None, + job_name=None, + extra_overrides=None, + skip_verify=True, + dry_run=True, + ) + assert result["ok"] is True + assert result["dry_run"] is True + assert result["validated"] is True + assert "experiment_id" not in result # dry-run produces no experiment + assert "--dry-run" in captured["argv"] + assert "--yes" not in captured["argv"] # dry-run skips --yes + + +def test_submit_job_dry_run_yaml_invalid(monkeypatch, tmp_path): + """dry_run=True + launcher rejects YAML → ok=True but validated=False.""" + yaml_dir = tmp_path / "examples" + yaml_dir.mkdir() + yaml_path = yaml_dir / "bad.yaml" + yaml_path.write_text("not: [unbalanced\n") + monkeypatch.setenv("MODELOPT_LAUNCHER_EXAMPLES_DIR", str(yaml_dir)) + + def fake_run(argv, **kwargs): + return subprocess.CompletedProcess( + args=argv, + returncode=1, + stdout="", + stderr="yaml.YAMLError: while parsing a flow sequence\n", + ) + + monkeypatch.setattr(subprocess, "run", fake_run) + + result = bridge.submit_job_impl( + yaml_path="bad.yaml", + hf_local=None, + cluster_host=None, + cluster_user=None, + identity=None, + job_dir=None, + job_name=None, + extra_overrides=None, + skip_verify=True, + dry_run=True, + ) + assert result["ok"] is True # The TOOL ran cleanly + assert result["dry_run"] is True + assert result["validated"] is False # ...but the YAML failed validation + assert result["exit_code"] == 1 + assert "yaml.YAMLError" in result["stderr_tail"] + + +def test_submit_job_dry_run_yaml_not_found(monkeypatch, tmp_path): + """dry_run=True + missing yaml → yaml_not_found with dry_run flag set.""" + monkeypatch.setenv("MODELOPT_LAUNCHER_EXAMPLES_DIR", str(tmp_path)) + result = bridge.submit_job_impl( + yaml_path="missing.yaml", + hf_local=None, + cluster_host=None, + cluster_user=None, + identity=None, + job_dir=None, + job_name=None, + extra_overrides=None, + skip_verify=True, + dry_run=True, + ) + assert result["ok"] is False + assert result["dry_run"] is True + assert result["reason"] == "yaml_not_found" + + +def test_submit_job_dry_run_skips_verify(monkeypatch, tmp_path): + """dry_run=True bypasses verify_setup even when skip_verify=False.""" + yaml_dir = tmp_path / "examples" + yaml_dir.mkdir() + (yaml_dir / "ok.yaml").write_text("job_name: ok\npipeline: []\n") + monkeypatch.setenv("MODELOPT_LAUNCHER_EXAMPLES_DIR", str(yaml_dir)) + + verify_called = {"n": 0} + real_verify = bridge.verify_slurm_setup_impl + + def fake_verify(**kwargs): + verify_called["n"] += 1 + return real_verify(**kwargs) + + monkeypatch.setattr(bridge, "verify_slurm_setup_impl", fake_verify) + monkeypatch.setattr(bridge, "verify_docker_setup_impl", lambda: {"ok": True}) + + monkeypatch.setattr( + subprocess, + "run", + lambda argv, **kw: subprocess.CompletedProcess( + args=argv, + returncode=0, + stdout="dry-run ok\n", + stderr="", + ), + ) + + bridge.submit_job_impl( + yaml_path="ok.yaml", + hf_local=None, + cluster_host="cluster.example.com", + cluster_user="alice", + identity=None, + job_dir=None, + job_name=None, + extra_overrides=None, + skip_verify=False, # would normally trigger verify + dry_run=True, + ) + assert verify_called["n"] == 0 # verify_setup never invoked in dry-run + + # --------------------------------------------------------------------------- # job_status / job_logs — filesystem-based # --------------------------------------------------------------------------- From 64f0103f9c92d394d921945503797d3634b68809 Mon Sep 17 00:00:00 2001 From: Chenhan Yu Date: Sun, 14 Jun 2026 12:39:31 -0700 Subject: [PATCH 2/3] fix(tools/mcp): address review feedback on submit_job dry_run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three findings from CodeRabbit + claude[bot] review on #1718: 1. Wrong flag spelling (CRITICAL). The launcher CLI spells it as one word (`--dryrun`), per `tools/launcher/CLAUDE.md:28`: uv run launch.py --yaml ... --dryrun --yes -v The bridge was passing `--dry-run` which the launcher's argparse rejects. Fixed. 2. New nosec B603 annotation flagged by CodeRabbit as violating the bandit policy in pyproject.toml ("Use of `# nosec BXXX` requires special approval"). The annotation is consistent with seven prior precedents in the same file (Popen at L563, run at L590, verify probes at L197 + L251, SSH probe at L326, etc.) — all of which share the same shape: static-list argv, no shell, no untrusted input. Re-added with a comment that cites those precedents explicitly. The maintainer-review of this PR is the approval trail for the precedent. 3. Return-shape drift between success / failure / timeout branches. - Pre-fix: success returned {ok, dry_run, validated, exit_code, stdout_tail, argv} but no stderr_tail. Failure returned stderr_tail. Timeout returned only diagnostic. - Fix: all three branches now return ok, dry_run, exit_code (None for timeout, int otherwise), stdout_tail, stderr_tail, argv + branch-specific extras (validated for ok-cases, reason + diagnostic for failures). Callers can read stderr_tail uniformly regardless of which branch fired. Test updated to match new flag spelling. All 38 tests pass, bandit clean, pre-commit green. Co-Authored-By: Claude Opus 4.7 Signed-off-by: Chenhan Yu --- tools/mcp/modelopt_mcp/bridge.py | 32 +++++++++++++++++++++++--------- tools/mcp/tests/test_bridge.py | 2 +- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/tools/mcp/modelopt_mcp/bridge.py b/tools/mcp/modelopt_mcp/bridge.py index 5eddc5cfab6..020ff969788 100644 --- a/tools/mcp/modelopt_mcp/bridge.py +++ b/tools/mcp/modelopt_mcp/bridge.py @@ -429,7 +429,7 @@ def submit_job_impl( * Neither set → error (unless ``dry_run=True``) * Both set → error - When ``dry_run=True``, the launcher is invoked with ``--dry-run`` — + When ``dry_run=True``, the launcher is invoked with ``--dryrun`` — the YAML is parsed and validated but no cluster contact / no container spawn / no sbatch happens. ``hf_local`` and ``cluster_host`` are optional in dry-run mode (pass one to validate @@ -714,7 +714,7 @@ def _submit_job_dry_run( job_name: str | None, extra_overrides: dict[str, str] | None, ) -> dict: - """Validate a launcher YAML by running ``launch.py --dry-run``. + """Validate a launcher YAML by running ``launch.py --dryrun``. No cluster contact, no container spawn, no sbatch. Used by verify-task workflow stages (deployment_support, @@ -742,12 +742,12 @@ def _submit_job_dry_run( ), } - # Build argv — launch.py supports --dry-run as a flag that prevents + # Build argv — launch.py supports --dryrun as a flag that prevents # actual submission while still exercising the YAML loader, factory # resolution, and arg parser. Same argv shape as live submit minus # the --yes (no confirmation prompt to bypass for dry-run, since # nothing is actually submitted). - argv = ["uv", "run", "launch.py", "--yaml", str(abs_yaml), "--dry-run"] + argv = ["uv", "run", "launch.py", "--yaml", str(abs_yaml), "--dryrun"] if hf_local: argv.append(f"hf_local={hf_local}") if cluster_user: @@ -783,7 +783,15 @@ def _submit_job_dry_run( child_env["SLURM_HOST"] = cluster_host # Dry-run is fast (no network, no container) — 60s timeout is - # generous. B603 false positive: argv is a controlled list. + # generous. Same subprocess invocation shape as the live-submit + # branch above (line 590): list-form argv, no shell, inherited + # env. ``argv`` members are string-literal constants + # ("uv", "run", "launch.py", "--yaml", "--dryrun"), validated + # filesystem paths (``str(abs_yaml)``, ``str(launcher_dir)``), or + # key=value override strings sourced from typed MCP-tool args. + # B603 false-positive matches the precedent in this module's + # `submit_job_impl` (Popen at line 563 + run at line 590), the + # verify probes (line 197 + 251), and the SSH probe (line 326). try: proc = subprocess.run( # nosec B603 argv, @@ -799,11 +807,13 @@ def _submit_job_dry_run( "ok": False, "dry_run": True, "reason": "dry_run_timeout", + "exit_code": None, + "stdout_tail": (e.stdout or b"").decode(errors="replace")[-2000:] if e.stdout else "", + "stderr_tail": (e.stderr or b"").decode(errors="replace")[-2000:] if e.stderr else "", "diagnostic": ( - "launch.py --dry-run did not return within 60 seconds. " + "launch.py --dryrun did not return within 60 seconds. " "This usually means a YAML import / factory resolution " - f"hung. Partial stdout: " - f"{(e.stdout or b'').decode(errors='replace')[-400:]}" + "hung." ), "argv": argv, } @@ -820,7 +830,7 @@ def _submit_job_dry_run( "stdout_tail": stdout_tail, "stderr_tail": stderr_tail, "diagnostic": ( - f"launch.py --dry-run rejected the YAML (exit code " + f"launch.py --dryrun rejected the YAML (exit code " f"{proc.returncode}). Common reasons: invalid YAML " f"syntax, missing required fields, factory function " f"not registered, or a referenced file (HF model path, " @@ -830,12 +840,16 @@ def _submit_job_dry_run( "argv": argv, } + # Success branch returns the same field set as the failure branch + # (plus diagnostic-free since there's nothing to diagnose) so the + # caller can read stderr_tail / exit_code uniformly. return { "ok": True, "dry_run": True, "validated": True, "exit_code": 0, "stdout_tail": stdout_tail, + "stderr_tail": stderr_tail, "argv": argv, } diff --git a/tools/mcp/tests/test_bridge.py b/tools/mcp/tests/test_bridge.py index 874938647e5..c81933c6688 100644 --- a/tools/mcp/tests/test_bridge.py +++ b/tools/mcp/tests/test_bridge.py @@ -312,7 +312,7 @@ def fake_run(argv, **kwargs): assert result["dry_run"] is True assert result["validated"] is True assert "experiment_id" not in result # dry-run produces no experiment - assert "--dry-run" in captured["argv"] + assert "--dryrun" in captured["argv"] # launcher CLI spells it as one word assert "--yes" not in captured["argv"] # dry-run skips --yes From 66a6b56f7455ef7b14318f0d5a6f1550ebc9869c Mon Sep 17 00:00:00 2001 From: Chenhan Yu Date: Sun, 14 Jun 2026 13:01:39 -0700 Subject: [PATCH 3/3] fix(tools/mcp): round 2 of dry_run review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three more findings from the second-round claude[bot] review on #1718: 1. **Argv omits --yes** (IMPORTANT Algorithm). tools/launcher/CLAUDE.md:28 + :93, tools/launcher/docs/contributing.md:24 all pair --dryrun --yes. Without --yes, nemo_run's run.cli.entrypoint blocks on its confirmation prompt — and since we're capture_output=True (no TTY), the prompt would hang until our 60s timeout fires. Added --yes to argv with a comment citing the launcher-doc precedent. 2. **diagnostic vs diagnostics doc/code mismatch** (SUGGESTION + IMPORTANT Compatibility — surfaced in two places). - server.py:221 description for the dry_run Field said 'returns {ok, dry_run, validated, diagnostics: ...}' (plural). - bridge.py:724 docstring said 'diagnostics: str' (plural). - Code returns 'diagnostic' (singular) on the failure / timeout / yaml-not-found branches; nothing on the validated-success branch. Both docs updated to match the code: 'diagnostic?: str' (singular, optional). The Field description also enumerates the full response shape (exit_code, stdout_tail, stderr_tail, argv) so the agent reading the tool catalog sees the complete contract. 3. **README still said --dry-run (hyphenated)** (IMPORTANT Compatibility). Same bug class as round 1, persisted in the README's tool-surface table. Now says '--dryrun --yes' matching the actual argv. Test updated to assert --yes IS in the dry-run argv (was previously asserting --yes was absent — that was wrong per the launcher docs). All 38 tests pass; bandit clean; pre-commit green. Co-Authored-By: Claude Opus 4.7 Signed-off-by: Chenhan Yu --- tools/mcp/README.md | 2 +- tools/mcp/modelopt_mcp/bridge.py | 19 +++++++++++++------ tools/mcp/modelopt_mcp/server.py | 8 +++++--- tools/mcp/tests/test_bridge.py | 2 +- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/tools/mcp/README.md b/tools/mcp/README.md index 638509b830e..c38bcb81b9e 100644 --- a/tools/mcp/README.md +++ b/tools/mcp/README.md @@ -21,7 +21,7 @@ Mode is determined by which args you pass, not by which tool you call. One tool, |---|---| | `list_examples` | Enumerate bundled launcher YAMLs under `tools/launcher/examples/` with model + description metadata extracted from each YAML. Discovery primitive — call this first when you don't know which YAML to launch. | | `verify_setup(executor, ...)` | Fail-fast probe for the named executor. Docker: `docker info` (daemon up) + `docker info --format` runtime-registry check (looks for `"nvidia"` runtime registered by the NVIDIA Container Toolkit — no image pull, daemon-fast). Slurm: `ssh -o BatchMode=yes -o ConnectTimeout=5` to the cluster login node. Returns structured failure on auth / network / daemon issues — no exception. | -| `submit_job(yaml_path, hf_local? \| cluster_host?, ..., dry_run?)` | Submit a launcher YAML. Mode resolved from mutually-exclusive args. Returns `experiment_id` (Slurm) or PID (Docker) immediately; the actual job runs detached. Auto-runs `verify_setup` first by default (skippable). **Pass `dry_run=True`** to validate the YAML via `launch.py --dry-run` without contacting the cluster / spawning a container / running sbatch — returns `{ok, dry_run: True, validated: bool}` instead of `experiment_id`. Used by verify-task workflow stages (deployment_support, hidden_state_dump_support, mlm_eval, ...). | +| `submit_job(yaml_path, hf_local? \| cluster_host?, ..., dry_run?)` | Submit a launcher YAML. Mode resolved from mutually-exclusive args. Returns `experiment_id` (Slurm) or PID (Docker) immediately; the actual job runs detached. Auto-runs `verify_setup` first by default (skippable). **Pass `dry_run=True`** to validate the YAML via `launch.py --dryrun --yes` without contacting the cluster / spawning a container / running sbatch — returns `{ok, dry_run: True, validated: bool, diagnostic?, exit_code, stdout_tail, stderr_tail, argv}` instead of `experiment_id`. Used by verify-task workflow stages (deployment_support, hidden_state_dump_support, mlm_eval, ...). | | `job_status(experiment_id)` | Filesystem-based status from nemo_run's experiment dir (`_DONE`, `status_*.out`). Returns `done` / `failed` / `running` plus per-task statuses. No in-memory registry; survives MCP server restarts. | | `job_logs(experiment_id, task?, tail?)` | Read `log_.out` from the experiment dir. Per-task filtering + optional tail to truncate. | | `wait_for_experiment(experiment_id, timeout_sec?, poll_interval_sec?)` | Block until `job_status` returns `done` / `failed`, or until `timeout_sec` elapses. Single tool call replaces the agent's `while True: status; sleep` loop — saves tool-call turns and avoids overshooting the poll interval. Returns the final status plus `waited_seconds`. | diff --git a/tools/mcp/modelopt_mcp/bridge.py b/tools/mcp/modelopt_mcp/bridge.py index 020ff969788..7b5ee1d2879 100644 --- a/tools/mcp/modelopt_mcp/bridge.py +++ b/tools/mcp/modelopt_mcp/bridge.py @@ -721,9 +721,12 @@ def _submit_job_dry_run( hidden_state_dump_support, mlm_eval, ...) that just need to confirm a YAML compiles before declaring support is ready. - Returns ``{ok, dry_run: True, validated: bool, diagnostics: str, - argv: [...], stdout_tail: str}``. Never returns ``experiment_id`` - or ``pid`` — there's nothing to track. + Returns ``{ok, dry_run: True, validated: bool, diagnostic?: str, + exit_code: int|None, stdout_tail: str, stderr_tail: str, + argv: list[str]}``. Never returns ``experiment_id`` or ``pid`` — + there's nothing to track. ``diagnostic`` is present only on the + failure / timeout branches (the validated-success branch omits + it since there's nothing to diagnose). """ # Same path resolution as the live submit, so dry-run and live use # exactly the same YAML. @@ -745,9 +748,13 @@ def _submit_job_dry_run( # Build argv — launch.py supports --dryrun as a flag that prevents # actual submission while still exercising the YAML loader, factory # resolution, and arg parser. Same argv shape as live submit minus - # the --yes (no confirmation prompt to bypass for dry-run, since - # nothing is actually submitted). - argv = ["uv", "run", "launch.py", "--yaml", str(abs_yaml), "--dryrun"] + # `--yes` pairs with `--dryrun` in every launcher CLI example (see + # `tools/launcher/CLAUDE.md:28` and `:93`, plus `tools/launcher/docs/ + # contributing.md:24`). Without it, nemo_run's `run.cli.entrypoint` + # blocks on its confirmation prompt — and since we're capturing + # stdout (no TTY), the prompt would hang until the 60-second + # timeout fires. + argv = ["uv", "run", "launch.py", "--yaml", str(abs_yaml), "--dryrun", "--yes"] if hf_local: argv.append(f"hf_local={hf_local}") if cluster_user: diff --git a/tools/mcp/modelopt_mcp/server.py b/tools/mcp/modelopt_mcp/server.py index 1b971fab5ca..3f11a00b1df 100644 --- a/tools/mcp/modelopt_mcp/server.py +++ b/tools/mcp/modelopt_mcp/server.py @@ -211,15 +211,17 @@ def submit_job( bool, Field( description=( - "If True, run `launch.py --dry-run` to validate that " + "If True, run `launch.py --dryrun --yes` to validate that " "the YAML parses, the factory resolves, and any " "referenced files exist — without contacting the " "cluster, spawning a container, or running sbatch. " "Used by verify-task workflow stages (deployment_support, " "hidden_state_dump_support, mlm_eval, ...) that only " "need to confirm a YAML compiles. Returns " - "`{ok, dry_run: True, validated: bool, diagnostics: ...}` " - "with no `experiment_id`. Skips verify_setup automatically — " + "`{ok, dry_run: True, validated: bool, diagnostic?: str, " + "exit_code: int|None, stdout_tail: str, stderr_tail: str, " + "argv: list[str]}` with no `experiment_id`. Skips " + "verify_setup automatically — " "no cluster contact happens in dry-run. `hf_local` / " "`cluster_host` are optional in this mode (pass one to " "validate executor-specific config, omit both to validate " diff --git a/tools/mcp/tests/test_bridge.py b/tools/mcp/tests/test_bridge.py index c81933c6688..4993c57bfc5 100644 --- a/tools/mcp/tests/test_bridge.py +++ b/tools/mcp/tests/test_bridge.py @@ -313,7 +313,7 @@ def fake_run(argv, **kwargs): assert result["validated"] is True assert "experiment_id" not in result # dry-run produces no experiment assert "--dryrun" in captured["argv"] # launcher CLI spells it as one word - assert "--yes" not in captured["argv"] # dry-run skips --yes + assert "--yes" in captured["argv"] # launcher requires --yes to suppress confirm prompt def test_submit_job_dry_run_yaml_invalid(monkeypatch, tmp_path):