Skip to content

Commit 39ee9e4

Browse files
committed
fix(cli): suppress inference probe upfront on docker/sandbox failure layers and expose failureLayer to status --json
Signed-off-by: Tinson Lai <tinsonl@nvidia.com>
1 parent b0b0fd4 commit 39ee9e4

8 files changed

Lines changed: 626 additions & 116 deletions

File tree

src/commands/sandbox/status.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,12 @@ export default class SandboxStatusCommand extends NemoClawCommand {
2828
const { args } = await this.parse(SandboxStatusCommand);
2929
if (this.jsonEnabled()) {
3030
const report = await getSandboxStatusReport(args.sandboxName);
31-
if (!report.found || report.gatewayState !== "present" || report.rpcIssue) {
31+
if (
32+
!report.found ||
33+
report.gatewayState !== "present" ||
34+
report.rpcIssue ||
35+
report.failureLayer
36+
) {
3237
process.exitCode = 1;
3338
}
3439
// #4310: route the machine-readable report through the centralized

src/lib/actions/sandbox/docker-health.ts

Lines changed: 6 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import { dockerContainerInspectFormat } from "../../adapters/docker/inspect";
55
import { dockerCapture } from "../../adapters/docker/run";
66
import * as registry from "../../state/registry";
7+
import { resolveSandboxContainerOwner } from "./sandbox-container-owner";
78

89
export type DockerHealthState =
910
| "healthy"
@@ -48,48 +49,11 @@ function resolveDockerDriverSandboxContainer(
4849
} catch {
4950
return null;
5051
}
51-
// OpenShell names sandbox containers either as `openshell-<sandbox>`
52-
// (no suffix) or `openshell-<sandbox>-<id>` where <id> is a runtime
53-
// identifier appended by openshell. Two ambiguities to avoid:
54-
//
55-
// 1. A sandbox name can be a prefix of another sandbox name
56-
// (`my` vs `my-assistant`).
57-
// 2. Even with a hyphen-free suffix, a sandbox name can be a prefix
58-
// of another sandbox name whose own suffix is hyphen-free
59-
// (`my-assistant` vs `my-assistant-prod`).
60-
//
61-
// To disambiguate, resolve each candidate to the LONGEST registered
62-
// sandbox name it could belong to. We only accept a candidate when
63-
// that resolved owner is the sandbox we are looking up. This also
64-
// gives the right answer for the `openshell-<sandbox>` exact form.
65-
const ourPrefix = `openshell-${sandboxName}-`;
66-
const ourExact = `openshell-${sandboxName}`;
67-
const knownSandboxes = new Set(deps.listSandboxNames());
68-
knownSandboxes.add(sandboxName);
69-
const candidates = deps
70-
.dockerPsNames()
71-
.split("\n")
72-
.map((line) => line.trim())
73-
.filter((line) => line === ourExact || line.startsWith(ourPrefix));
74-
// Prefer the exact-name container before considering suffixed ones.
75-
// Without this short-circuit, a suffixed `openshell-<name>-<id>` whose
76-
// <id> is a docker runtime suffix (not a registered sandbox name) would
77-
// resolve to our sandbox via the longest-match heuristic and beat the
78-
// co-existing exact `openshell-<name>` if it appeared earlier in
79-
// `docker ps`.
80-
if (candidates.includes(ourExact)) return ourExact;
81-
for (const candidate of candidates) {
82-
const stripped = candidate.replace(/^openshell-/, "");
83-
// Find the longest known sandbox whose container name pattern
84-
// matches this candidate. Longest-first defeats prefix collisions.
85-
const owner = [...knownSandboxes]
86-
.filter(
87-
(name) => stripped === name || stripped.startsWith(`${name}-`),
88-
)
89-
.sort((a, b) => b.length - a.length)[0];
90-
if (owner === sandboxName) return candidate;
91-
}
92-
return null;
52+
return resolveSandboxContainerOwner(
53+
deps.dockerPsNames(),
54+
sandboxName,
55+
deps.listSandboxNames(),
56+
);
9357
}
9458

9559
function normalizeHealthState(raw: string): DockerHealthState {

src/lib/actions/sandbox/gateway-failure-classifier.ts

Lines changed: 3 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import { dockerInfo } from "../../adapters/docker/info";
77
import { dockerCapture } from "../../adapters/docker/run";
88
import { GATEWAY_PORT } from "../../core/ports";
99
import * as registry from "../../state/registry";
10+
import { resolveSandboxContainerOwner } from "./sandbox-container-owner";
1011

1112
const DEFAULT_CONTAINER = "openshell-cluster-nemoclaw";
1213
const DOCKER_TIMEOUT_MS = 3000;
@@ -189,46 +190,6 @@ const defaultSandboxContainerRunners: SandboxContainerFailureRunners = {
189190
portProbe: defaultPortProbe,
190191
};
191192

192-
// OpenShell names sandbox containers either as `openshell-<sandbox>` (no
193-
// suffix) or `openshell-<sandbox>-<id>`, where `<id>` is appended by
194-
// openshell at runtime. Two prefix collisions are possible:
195-
//
196-
// 1. A sandbox name can be a prefix of another sandbox name
197-
// (`my` vs `my-assistant`).
198-
// 2. Even with a hyphen-free `<id>`, a sandbox name can be a prefix
199-
// of another sandbox name whose own suffix is hyphen-free
200-
// (`my-assistant` vs `my-assistant-prod`).
201-
//
202-
// Resolve each candidate to the LONGEST registered sandbox name it could
203-
// belong to and only accept candidates that resolve to the sandbox we are
204-
// looking up. The exact-name form is preferred before suffixed forms so
205-
// that an `openshell-<sandbox>` container always wins over an unrelated
206-
// `openshell-<sandbox>-<runtime-id>` co-tenant. Mirrors the resolver in
207-
// `docker-health.ts`.
208-
function findSandboxContainerName(
209-
names: string,
210-
sandboxName: string,
211-
registeredSandboxNames: readonly string[],
212-
): string | null {
213-
const ourPrefix = `openshell-${sandboxName}-`;
214-
const ourExact = `openshell-${sandboxName}`;
215-
const known = new Set<string>(registeredSandboxNames);
216-
known.add(sandboxName);
217-
const candidates = names
218-
.split("\n")
219-
.map((line) => line.trim())
220-
.filter((line) => line === ourExact || line.startsWith(ourPrefix));
221-
if (candidates.includes(ourExact)) return ourExact;
222-
const knownArr = [...known];
223-
for (const candidate of candidates) {
224-
const stripped = candidate.replace(/^openshell-/, "");
225-
const owner = knownArr
226-
.filter((name) => stripped === name || stripped.startsWith(`${name}-`))
227-
.sort((a, b) => b.length - a.length)[0];
228-
if (owner === sandboxName) return candidate;
229-
}
230-
return null;
231-
}
232193

233194
function isValidDashboardPort(port: number | null | undefined): port is number {
234195
return (
@@ -248,13 +209,13 @@ export async function classifySandboxContainerFailure(
248209
): Promise<SandboxContainerFailureResult | null> {
249210
const runners = opts.runners ?? defaultSandboxContainerRunners;
250211
const registeredSandboxNames = runners.listSandboxNames();
251-
const running = findSandboxContainerName(
212+
const running = resolveSandboxContainerOwner(
252213
runners.listRunningContainerNames(),
253214
sandboxName,
254215
registeredSandboxNames,
255216
);
256217
if (running) return null;
257-
const present = findSandboxContainerName(
218+
const present = resolveSandboxContainerOwner(
258219
runners.listAllContainerNames(),
259220
sandboxName,
260221
registeredSandboxNames,
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
/**
5+
* Resolve which OpenShell container owns a given sandbox name.
6+
*
7+
* OpenShell names sandbox containers either as `openshell-<sandbox>` (no
8+
* suffix) or `openshell-<sandbox>-<id>`, where `<id>` is appended by openshell
9+
* at runtime. Two prefix collisions are possible:
10+
*
11+
* 1. A sandbox name can be a prefix of another sandbox name
12+
* (`my` vs `my-assistant`).
13+
* 2. Even with a hyphen-free `<id>`, a sandbox name can be a prefix
14+
* of another sandbox name whose own suffix is hyphen-free
15+
* (`my-assistant` vs `my-assistant-prod`).
16+
*
17+
* The longest-owner rule resolves each candidate to the longest registered
18+
* sandbox name that could claim it, then only accepts candidates that resolve
19+
* back to the queried sandbox. The exact-name form is preferred before
20+
* suffixed forms so `openshell-<sandbox>` always wins over an unrelated
21+
* `openshell-<sandbox>-<runtime-id>` co-tenant.
22+
*/
23+
export function resolveSandboxContainerOwner(
24+
containerNamesRaw: string,
25+
sandboxName: string,
26+
registeredSandboxNames: Iterable<string>,
27+
): string | null {
28+
const ourPrefix = `openshell-${sandboxName}-`;
29+
const ourExact = `openshell-${sandboxName}`;
30+
const known = new Set<string>(registeredSandboxNames);
31+
known.add(sandboxName);
32+
const candidates = containerNamesRaw
33+
.split("\n")
34+
.map((line) => line.trim())
35+
.filter((line) => line === ourExact || line.startsWith(ourPrefix));
36+
if (candidates.includes(ourExact)) return ourExact;
37+
const knownArr = [...known];
38+
for (const candidate of candidates) {
39+
const stripped = candidate.replace(/^openshell-/, "");
40+
const owner = knownArr
41+
.filter((name) => stripped === name || stripped.startsWith(`${name}-`))
42+
.sort((a, b) => b.length - a.length)[0];
43+
if (owner === sandboxName) return candidate;
44+
}
45+
return null;
46+
}

src/lib/actions/sandbox/status.test.ts

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@ import { describe, expect, it } from "vitest";
66
import type { ProviderHealthProbeOptions } from "../../../../dist/lib/inference/health";
77
import {
88
classifySandboxContainerFailureForStatus,
9+
classifySandboxStatusPreflightFailure,
910
getSandboxStatusInferenceHealth,
1011
isDockerDaemonUnreachableForStatus,
12+
maybeGetSandboxStatusInferenceHealth,
1113
} from "../../../../dist/lib/actions/sandbox/status";
1214

1315
describe("sandbox status inference health", () => {
@@ -152,3 +154,125 @@ describe("classifySandboxContainerFailureForStatus", () => {
152154
expect(observed).toEqual([{ sandboxName: "alpha", port: null }]);
153155
});
154156
});
157+
158+
describe("maybeGetSandboxStatusInferenceHealth", () => {
159+
it("does not invoke the provider probe when suppressInferenceProbe is true even with a present gateway and string provider", () => {
160+
let probeCalls = 0;
161+
const result = maybeGetSandboxStatusInferenceHealth(
162+
true,
163+
true,
164+
"nvidia-prod",
165+
"nvidia/nemotron",
166+
(...args) => {
167+
probeCalls += 1;
168+
throw new Error(`probeProviderHealth should not be invoked (args=${JSON.stringify(args)})`);
169+
},
170+
);
171+
expect(result).toBeNull();
172+
expect(probeCalls).toBe(0);
173+
});
174+
175+
it("delegates to the probe when suppressInferenceProbe is false", () => {
176+
const calls: { provider: string; options?: ProviderHealthProbeOptions }[] = [];
177+
const result = maybeGetSandboxStatusInferenceHealth(
178+
false,
179+
true,
180+
"nvidia-prod",
181+
"nvidia/nemotron",
182+
(provider, options) => {
183+
calls.push({ provider, options });
184+
return {
185+
ok: true,
186+
probed: true,
187+
providerLabel: "NVIDIA Endpoints",
188+
endpoint: "https://integrate.api.nvidia.com/v1/chat/completions",
189+
detail: "healthy",
190+
};
191+
},
192+
);
193+
expect(result?.ok).toBe(true);
194+
expect(calls).toEqual([
195+
{ provider: "nvidia-prod", options: { model: "nvidia/nemotron" } },
196+
]);
197+
});
198+
});
199+
200+
describe("classifySandboxStatusPreflightFailure", () => {
201+
it("returns docker_unreachable when the daemon probe reports unreachable", async () => {
202+
let sandboxProbeCalled = false;
203+
const result = await classifySandboxStatusPreflightFailure(
204+
{ name: "alpha", openshellDriver: "docker" } as never,
205+
{
206+
dockerProbe: () => false,
207+
sandboxContainerProbe: async () => {
208+
sandboxProbeCalled = true;
209+
return null;
210+
},
211+
},
212+
);
213+
expect(result).toEqual({ layer: "docker_unreachable", dockerUnreachable: true });
214+
// Short-circuits: a daemon that is already known to be down must not
215+
// trigger a follow-up `docker ps` round trip.
216+
expect(sandboxProbeCalled).toBe(false);
217+
});
218+
219+
it("returns the sandbox container failure when the daemon is reachable", async () => {
220+
const result = await classifySandboxStatusPreflightFailure(
221+
{ name: "alpha", openshellDriver: "docker", dashboardPort: 18789 } as never,
222+
{
223+
dockerProbe: () => true,
224+
sandboxContainerProbe: async (sandboxName, dashboardPort) => {
225+
expect(sandboxName).toBe("alpha");
226+
expect(dashboardPort).toBe(18789);
227+
return {
228+
layer: "sandbox_dashboard_port_conflict",
229+
detail: "stub failure",
230+
};
231+
},
232+
},
233+
);
234+
expect(result).toEqual({
235+
layer: "sandbox_dashboard_port_conflict",
236+
dockerUnreachable: false,
237+
});
238+
});
239+
240+
it("returns null when the sandbox container probe finds no failure", async () => {
241+
const result = await classifySandboxStatusPreflightFailure(
242+
{ name: "alpha", openshellDriver: "docker" } as never,
243+
{
244+
dockerProbe: () => true,
245+
sandboxContainerProbe: async () => null,
246+
},
247+
);
248+
expect(result).toBeNull();
249+
});
250+
251+
it("returns null when the sandbox is not on the docker driver", async () => {
252+
let dockerCalled = false;
253+
let sandboxCalled = false;
254+
const result = await classifySandboxStatusPreflightFailure(
255+
{ name: "alpha", openshellDriver: "vm" } as never,
256+
{
257+
dockerProbe: () => {
258+
dockerCalled = true;
259+
return false;
260+
},
261+
sandboxContainerProbe: async () => {
262+
sandboxCalled = true;
263+
return null;
264+
},
265+
},
266+
);
267+
expect(result).toBeNull();
268+
// Both gates are docker-driver-only; a vm sandbox must not provoke
269+
// either probe.
270+
expect(dockerCalled).toBe(false);
271+
expect(sandboxCalled).toBe(false);
272+
});
273+
274+
it("returns null when the sandbox entry is null", async () => {
275+
const result = await classifySandboxStatusPreflightFailure(null);
276+
expect(result).toBeNull();
277+
});
278+
});

0 commit comments

Comments
 (0)