Skip to content

Commit 3f3e221

Browse files
committed
fix(inference): preflight NEMOCLAW_VLLM_MODEL on sandbox connect
Signed-off-by: Tinson Lai <tinsonl@nvidia.com>
1 parent e79461c commit 3f3e221

4 files changed

Lines changed: 112 additions & 1 deletion

File tree

docs/reference/commands.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1358,7 +1358,7 @@ Set them before running `nemoclaw onboard`.
13581358
| `NEMOCLAW_SANDBOX` | sandbox name | Alternate spelling of `NEMOCLAW_SANDBOX_NAME`; used by `services` and `debug` lookups when neither a flag nor `NEMOCLAW_SANDBOX_NAME` is set. |
13591359
| `NEMOCLAW_INSTALL_REF` | git ref | For internal installer commands: the git ref to install from. Overridden by the `--install-ref` flag. |
13601360
| `NEMOCLAW_INSTALL_TAG` | release tag | For internal installer commands: the release tag to install. Overridden by the `--install-tag` flag. |
1361-
| `NEMOCLAW_VLLM_MODEL` | registry slug or Hugging Face model id | Selects the model the managed-vLLM install path serves. Recognised slugs: `qwen3.6-27b`, `nemotron-3-nano-4b`, `deepseek-r1-distill-70b`. Unset uses the per-platform profile default. Gated models (e.g. `deepseek-r1-distill-70b`) require `HF_TOKEN` or `HUGGING_FACE_HUB_TOKEN`. |
1361+
| `NEMOCLAW_VLLM_MODEL` | registry slug or Hugging Face model id | Selects the model the managed-vLLM install path serves. Recognised slugs: `qwen3.6-27b`, `nemotron-3-nano-4b`, `deepseek-r1-distill-70b`. Unset uses the per-platform profile default. Gated models (e.g. `deepseek-r1-distill-70b`) require `HF_TOKEN` or `HUGGING_FACE_HUB_TOKEN`. `nemoclaw <name> connect` only inspects the variable for fail-fast validation (unknown slug or gated model without an HF token) before attaching; it does not pull or serve a model itself. |
13621362
| `NEMOCLAW_MODEL_ROUTER_PYTHON` | absolute path | Pins the host Python interpreter used to create the Model Router virtual environment. Strict. NemoClaw probes only that interpreter and aborts with the failure reason if it does not qualify, rather than silently falling back to another python. Relative command names such as `python3.12` are rejected. When unset, NemoClaw probes `python3.13`, `python3.12`, `python3.11`, `python3.10`, and bare `python3`, retains every interpreter whose version is in `[3.10, 3.14)` and whose `ensurepip`, `pyexpat`, `ssl`, and `venv` stdlib modules import cleanly, and tries `python -m venv` on each in priority order until one succeeds. Set the pin when the auto-discovered interpreter is broken (for example, Homebrew `python@3.14` with a `pyexpat` dlopen mismatch on macOS). |
13631363

13641364
#### Linux Ollama install mode details

src/lib/actions/sandbox/connect.ts

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import { D, G, R, YW } from "../../cli/terminal-style";
1919
import * as agentRuntime from "../../agent/runtime";
2020
import { parseGatewayInference } from "../../inference/config";
2121
import { findReachableOllamaHost, probeLocalProviderHealth } from "../../inference/local";
22+
import { preflightVllmModelEnv } from "../../inference/vllm-models";
2223
import {
2324
ensureOllamaAuthProxy,
2425
probeOllamaAuthProxyHealth,
@@ -755,10 +756,31 @@ function exitWithSpawnResult(result: SpawnLikeResult): void {
755756
process.exit(1);
756757
}
757758

759+
/**
 * Host-side guard for `connect`: validates `NEMOCLAW_VLLM_MODEL` before any
 * other connect step runs.
 *
 * The variable only steers the express-vLLM install path, yet it is commonly
 * still exported in the shell where `connect` is later invoked. Rather than
 * silently ignoring a typo'd slug or a gated model without `HF_TOKEN`, the
 * installer's validators are run here so the mistake is reported before any
 * sandbox readiness probe, inference-route reset, or SSH attach. (#4543)
 *
 * Returns normally when the preflight passes; otherwise prints the failure
 * plus remediation hints to stderr and terminates the process with exit
 * code 1.
 */
function preflightVllmModelEnvOrExit(): void {
  const outcome = preflightVllmModelEnv();
  if (!outcome.ok) {
    // One stderr line per entry, in order; the leading empty entry is the
    // blank spacer line that precedes the error.
    const stderrLines: readonly string[] = [
      "",
      ` Error: ${outcome.message}`,
      ` Hint: NEMOCLAW_VLLM_MODEL is consumed by the managed-vLLM install path, not \`${CLI_NAME} <name> connect\`.`,
      " Unset NEMOCLAW_VLLM_MODEL before reconnecting, or fix the value (and token) and re-run the install path that serves the model.",
    ];
    for (const line of stderrLines) {
      console.error(line);
    }
    process.exit(1);
  }
}
778+
758779
export async function connectSandbox(
759780
sandboxName: string,
760781
{ probeOnly = false }: SandboxConnectOptions = {},
761782
): Promise<void> {
783+
preflightVllmModelEnvOrExit();
762784
const { isSandboxReady, parseSandboxStatus } = require("../../onboard");
763785
await ensureLiveSandboxOrExit(sandboxName, { allowNonReadyPhase: true });
764786

src/lib/inference/vllm-models.test.ts

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import {
88
VLLM_MODELS,
99
assertGatedModelAccess,
1010
buildVllmServeCommand,
11+
preflightVllmModelEnv,
1112
selectVllmModelFromEnv,
1213
} from "../../../dist/lib/inference/vllm-models";
1314

@@ -99,3 +100,55 @@ describe("vllm model registry", () => {
99100
expect(cmd).not.toContain("--reasoning-parser qwen3");
100101
});
101102
});
103+
104+
// Covers the combined slug + gated-access preflight used by `connect`.
describe("preflightVllmModelEnv", () => {
  // Slug that the gated-model cases below exercise.
  const gatedSlug = "deepseek-r1-distill-70b";

  // Runs the preflight against an explicit environment so the cases never
  // depend on the variables of the process running the suite.
  const preflightWith = (vars: Record<string, string>) =>
    preflightVllmModelEnv(vars as NodeJS.ProcessEnv);

  it("succeeds when NEMOCLAW_VLLM_MODEL is unset", () => {
    const outcome = preflightWith({});
    expect(outcome).toEqual({ ok: true });
  });

  it("succeeds for a recognised non-gated slug", () => {
    const outcome = preflightWith({ NEMOCLAW_VLLM_MODEL: "qwen3.6-27b" });
    expect(outcome).toEqual({ ok: true });
  });

  it("succeeds for a gated slug when HF_TOKEN is set", () => {
    const outcome = preflightWith({
      NEMOCLAW_VLLM_MODEL: gatedSlug,
      HF_TOKEN: "hf_abc",
    });
    expect(outcome).toEqual({ ok: true });
  });

  it("succeeds for a gated slug when HUGGING_FACE_HUB_TOKEN is set", () => {
    const outcome = preflightWith({
      NEMOCLAW_VLLM_MODEL: gatedSlug,
      HUGGING_FACE_HUB_TOKEN: "hf_abc",
    });
    expect(outcome).toEqual({ ok: true });
  });

  it("fails fast for a gated slug with no Hugging Face token (#4543)", () => {
    const outcome = preflightWith({ NEMOCLAW_VLLM_MODEL: gatedSlug });
    expect(outcome.ok).toBe(false);
    if (!outcome.ok) {
      // The message must name the problem and both accepted token variables.
      expect(outcome.message).toMatch(/gated on Hugging Face/);
      expect(outcome.message).toMatch(/HF_TOKEN/);
      expect(outcome.message).toMatch(/HUGGING_FACE_HUB_TOKEN/);
    }
  });

  it("fails fast for an unknown slug", () => {
    const outcome = preflightWith({ NEMOCLAW_VLLM_MODEL: "made-up-model" });
    expect(outcome.ok).toBe(false);
    if (!outcome.ok) {
      expect(outcome.message).toMatch(/Unknown NEMOCLAW_VLLM_MODEL='made-up-model'/);
    }
  });
});

src/lib/inference/vllm-models.ts

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,42 @@ export function assertGatedModelAccess(
133133
);
134134
}
135135

136+
/**
 * Outcome of {@link preflightVllmModelEnv}: either success, or a failure
 * carrying the human-readable reason to print.
 */
export type PreflightVllmModelResult = { ok: true } | { ok: false; message: string };

/**
 * Combined preflight for callers that hold a `NEMOCLAW_VLLM_MODEL` reference
 * but do not themselves invoke the vLLM installer — for example
 * `nemoclaw <name> connect`, which simply attaches to a running sandbox.
 *
 * The variable steers the express-vLLM install path, so on every other code
 * path the natural behaviour is to ignore it. Silently ignoring it hides two
 * real user mistakes:
 *
 *   1. typos in the slug (`deepseek-r1-distill-70b` vs an old marketing
 *      name), surfaced later as the wrong model being served; and
 *   2. requesting a gated model (DeepSeek-R1 Distill Llama 70B) without
 *      exporting `HF_TOKEN` / `HUGGING_FACE_HUB_TOKEN`, which downstream
 *      fails as a 401 from Hugging Face partway through the pull.
 *
 * Running the same `selectVllmModelFromEnv` + `assertGatedModelAccess` checks
 * the installer uses gives the caller a single fail-fast surface and one
 * canonical message to print before any side effects. (#4543)
 *
 * This function never throws: anything raised by the two checks is converted
 * into a failure result.
 *
 * @param env - Environment to inspect; defaults to `process.env`.
 * @returns `{ ok: true }` when `selectVllmModelFromEnv` yields no model or
 *   when both checks pass; otherwise `{ ok: false, message }` where `message`
 *   is the thrown error's message (or the stringified value when something
 *   other than an `Error` was thrown).
 */
export function preflightVllmModelEnv(
  env: NodeJS.ProcessEnv = process.env,
): PreflightVllmModelResult {
  try {
    const model = selectVllmModelFromEnv(env);
    if (!model) return { ok: true };
    assertGatedModelAccess(model, env);
    return { ok: true };
  } catch (err: unknown) {
    // Narrow instead of asserting: a non-Error throw (e.g. a bare string) has
    // no `.message`, and the result type promises `message` is a string.
    const message = err instanceof Error ? err.message : String(err);
    return { ok: false, message };
  }
}
171+
136172
const SHARED_VLLM_ARGS: readonly string[] = [
137173
"--gpu-memory-utilization",
138174
"0.7",

0 commit comments

Comments
 (0)