fix(inference): preflight NEMOCLAW_VLLM_MODEL on sandbox connect

laitingsheng · laitingsheng · commit 3f3e2214492d · 2026-05-30T14:56:35.000Z
Signed-off-by: Tinson Lai <tinsonl@nvidia.com>
diff --git a/docs/reference/commands.mdx b/docs/reference/commands.mdx
@@ -1358,7 +1358,7 @@ Set them before running `nemoclaw onboard`.
 | `NEMOCLAW_SANDBOX` | sandbox name | Alternate spelling of `NEMOCLAW_SANDBOX_NAME`; used by `services` and `debug` lookups when neither a flag nor `NEMOCLAW_SANDBOX_NAME` is set. |
 | `NEMOCLAW_INSTALL_REF` | git ref | For internal installer commands: the git ref to install from. Overridden by the `--install-ref` flag. |
 | `NEMOCLAW_INSTALL_TAG` | release tag | For internal installer commands: the release tag to install. Overridden by the `--install-tag` flag. |
-| `NEMOCLAW_VLLM_MODEL` | registry slug or Hugging Face model id | Selects the model the managed-vLLM install path serves. Recognised slugs: `qwen3.6-27b`, `nemotron-3-nano-4b`, `deepseek-r1-distill-70b`. Unset uses the per-platform profile default. Gated models (e.g. `deepseek-r1-distill-70b`) require `HF_TOKEN` or `HUGGING_FACE_HUB_TOKEN`. |
+| `NEMOCLAW_VLLM_MODEL` | registry slug or Hugging Face model id | Selects the model the managed-vLLM install path serves. Recognised slugs: `qwen3.6-27b`, `nemotron-3-nano-4b`, `deepseek-r1-distill-70b`. Unset uses the per-platform profile default. Gated models (e.g. `deepseek-r1-distill-70b`) require `HF_TOKEN` or `HUGGING_FACE_HUB_TOKEN`. `nemoclaw <name> connect` only inspects the variable for fail-fast validation (unknown slug or gated model without an HF token) before attaching; it does not pull or serve a model itself. |
 | `NEMOCLAW_MODEL_ROUTER_PYTHON` | absolute path | Pins the host Python interpreter used to create the Model Router virtual environment. Strict. NemoClaw probes only that interpreter and aborts with the failure reason if it does not qualify, rather than silently falling back to another python. Relative command names such as `python3.12` are rejected. When unset, NemoClaw probes `python3.13`, `python3.12`, `python3.11`, `python3.10`, and bare `python3`, retains every interpreter whose version is in `[3.10, 3.14)` and whose `ensurepip`, `pyexpat`, `ssl`, and `venv` stdlib modules import cleanly, and tries `python -m venv` on each in priority order until one succeeds. Set the pin when the auto-discovered interpreter is broken (for example, Homebrew `python@3.14` with a `pyexpat` dlopen mismatch on macOS). |
 
 #### Linux Ollama install mode details
diff --git a/src/lib/actions/sandbox/connect.ts b/src/lib/actions/sandbox/connect.ts
@@ -19,6 +19,7 @@ import { D, G, R, YW } from "../../cli/terminal-style";
 import * as agentRuntime from "../../agent/runtime";
 import { parseGatewayInference } from "../../inference/config";
 import { findReachableOllamaHost, probeLocalProviderHealth } from "../../inference/local";
+import { preflightVllmModelEnv } from "../../inference/vllm-models";
 import {
   ensureOllamaAuthProxy,
   probeOllamaAuthProxyHealth,
@@ -755,10 +756,31 @@ function exitWithSpawnResult(result: SpawnLikeResult): void {
   process.exit(1);
 }
 
+// `connect` never consumes `NEMOCLAW_VLLM_MODEL` (only the express-vLLM install
+// path does), yet users often still have it exported in the shell they run
+// `connect` from. Apply the installer's validators on the host first, so a typo
+// or a gated model with no `HF_TOKEN` aborts before any sandbox readiness probe,
+// inference-route reset, or SSH attach, rather than being silently ignored. (#4543)
+function preflightVllmModelEnvOrExit(): void {
+  const outcome = preflightVllmModelEnv();
+  if (!outcome.ok) {
+    // Blank line first, then the validator's message and two remediation hints.
+    const report = [
+      "",
+      `  Error: ${outcome.message}`,
+      `  Hint: NEMOCLAW_VLLM_MODEL is consumed by the managed-vLLM install path, not \`${CLI_NAME} <name> connect\`.`,
+      "  Unset NEMOCLAW_VLLM_MODEL before reconnecting, or fix the value (and token) and re-run the install path that serves the model.",
+    ];
+    for (const line of report) console.error(line);
+    process.exit(1);
+  }
+}
+
 export async function connectSandbox(
   sandboxName: string,
   { probeOnly = false }: SandboxConnectOptions = {},
 ): Promise<void> {
+  preflightVllmModelEnvOrExit();
   const { isSandboxReady, parseSandboxStatus } = require("../../onboard");
   await ensureLiveSandboxOrExit(sandboxName, { allowNonReadyPhase: true });
 
diff --git a/src/lib/inference/vllm-models.test.ts b/src/lib/inference/vllm-models.test.ts
@@ -8,6 +8,7 @@ import {
   VLLM_MODELS,
   assertGatedModelAccess,
   buildVllmServeCommand,
+  preflightVllmModelEnv,
   selectVllmModelFromEnv,
 } from "../../../dist/lib/inference/vllm-models";
 
@@ -99,3 +100,55 @@ describe("vllm model registry", () => {
     expect(cmd).not.toContain("--reasoning-parser qwen3");
   });
 });
+
+describe("preflightVllmModelEnv", () => {
+  // Each case hands the function its own environment object instead of relying
+  // on the default argument, so the outcome is independent of the host shell.
+  const GATED_SLUG = "deepseek-r1-distill-70b";
+  const envOf = (vars: Record<string, string>) => vars as NodeJS.ProcessEnv;
+  const passed = { ok: true };
+
+  // Nothing to validate when the variable is absent.
+  it("succeeds when NEMOCLAW_VLLM_MODEL is unset", () => {
+    expect(preflightVllmModelEnv(envOf({}))).toEqual(passed);
+  });
+
+  // A registry slug that needs no token passes with only the slug set.
+  it("succeeds for a recognised non-gated slug", () => {
+    const verdict = preflightVllmModelEnv(envOf({ NEMOCLAW_VLLM_MODEL: "qwen3.6-27b" }));
+    expect(verdict).toEqual(passed);
+  });
+
+  // Either Hugging Face token variable is enough to unlock the gated slug.
+  it("succeeds for a gated slug when HF_TOKEN is set", () => {
+    const env = envOf({ NEMOCLAW_VLLM_MODEL: GATED_SLUG, HF_TOKEN: "hf_abc" });
+    expect(preflightVllmModelEnv(env)).toEqual(passed);
+  });
+
+  it("succeeds for a gated slug when HUGGING_FACE_HUB_TOKEN is set", () => {
+    const env = envOf({
+      NEMOCLAW_VLLM_MODEL: GATED_SLUG,
+      HUGGING_FACE_HUB_TOKEN: "hf_abc",
+    });
+    expect(preflightVllmModelEnv(env)).toEqual(passed);
+  });
+
+  it("fails fast for a gated slug with no Hugging Face token (#4543)", () => {
+    const verdict = preflightVllmModelEnv(envOf({ NEMOCLAW_VLLM_MODEL: GATED_SLUG }));
+    expect(verdict.ok).toBe(false);
+    if (verdict.ok) return;
+    // The message has to name the gate and both accepted token variables.
+    const expectedFragments = [/gated on Hugging Face/, /HF_TOKEN/, /HUGGING_FACE_HUB_TOKEN/];
+    for (const fragment of expectedFragments) {
+      expect(verdict.message).toMatch(fragment);
+    }
+  });
+
+  it("fails fast for an unknown slug", () => {
+    const verdict = preflightVllmModelEnv(envOf({ NEMOCLAW_VLLM_MODEL: "made-up-model" }));
+    expect(verdict.ok).toBe(false);
+    if (verdict.ok) return;
+    // The message is expected to quote the rejected value.
+    expect(verdict.message).toMatch(/Unknown NEMOCLAW_VLLM_MODEL='made-up-model'/);
+  });
+});
diff --git a/src/lib/inference/vllm-models.ts b/src/lib/inference/vllm-models.ts
@@ -133,6 +133,42 @@ export function assertGatedModelAccess(
   );
 }
 
+export type PreflightVllmModelResult = { ok: true } | { ok: false; message: string };
+
+/**
+ * Combined preflight for callers that hold a `NEMOCLAW_VLLM_MODEL` reference
+ * but do not themselves invoke the vLLM installer — for example
+ * `nemoclaw <name> connect`, which simply attaches to a running sandbox.
+ *
+ * The variable steers the express-vLLM install path, so on every other code
+ * path the natural behaviour is to ignore it. Silent-ignore hides two real
+ * user mistakes:
+ *
+ *   1. typos in the slug (`deepseek-r1-distill-70b` vs an old marketing
+ *      name), surfaced later as the wrong model being served and a confused
+ *      user; and
+ *   2. requesting a gated model (DeepSeek-R1 Distill Llama 70B) without
+ *      exporting `HF_TOKEN` / `HUGGING_FACE_HUB_TOKEN`, which downstream
+ *      explodes as a 401 from Hugging Face partway through the pull.
+ *
+ * Reuses the installer's `selectVllmModelFromEnv` + `assertGatedModelAccess`
+ * checks so callers get one canonical message before any side effects. (#4543)
+ * @param env - Environment to inspect; defaults to `process.env`.
+ * @returns `{ ok: true }` if unset or valid, else `{ ok: false, message }`.
+ */
+export function preflightVllmModelEnv(
+  env: NodeJS.ProcessEnv = process.env,
+): PreflightVllmModelResult {
+  try {
+    const model = selectVllmModelFromEnv(env);
+    if (model) assertGatedModelAccess(model, env);
+    return { ok: true };
+  } catch (err) {
+    // A non-Error throw has no `.message`; stringify it rather than report `undefined`.
+    return { ok: false, message: err instanceof Error ? err.message : String(err) };
+  }
+}
+
 const SHARED_VLLM_ARGS: readonly string[] = [
   "--gpu-memory-utilization",
   "0.7",