Skip to content

Commit b0b0fd4

Browse files
committed
fix(cli): resolve sandbox container by longest-owner and validate dashboardPort
Signed-off-by: Tinson Lai <tinsonl@nvidia.com>
1 parent 5a98bdf commit b0b0fd4

3 files changed

Lines changed: 151 additions & 21 deletions

File tree

src/lib/actions/sandbox/gateway-failure-classifier.ts

Lines changed: 61 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import net from "node:net";
66
import { dockerInfo } from "../../adapters/docker/info";
77
import { dockerCapture } from "../../adapters/docker/run";
88
import { GATEWAY_PORT } from "../../core/ports";
9+
import * as registry from "../../state/registry";
910

1011
const DEFAULT_CONTAINER = "openshell-cluster-nemoclaw";
1112
const DOCKER_TIMEOUT_MS = 3000;
@@ -44,6 +45,7 @@ export type SandboxContainerFailureResult = {
4445
export type SandboxContainerFailureRunners = {
4546
listAllContainerNames: () => string;
4647
listRunningContainerNames: () => string;
48+
listSandboxNames: () => string[];
4749
portProbe: (port: number) => Promise<boolean>;
4850
};
4951

@@ -172,29 +174,71 @@ function defaultListRunningContainerNames(): string {
172174
});
173175
}
174176

177+
function defaultListSandboxNames(): string[] {
178+
try {
179+
return registry.listSandboxes().sandboxes.map((entry) => entry.name);
180+
} catch {
181+
return [];
182+
}
183+
}
184+
175185
const defaultSandboxContainerRunners: SandboxContainerFailureRunners = {
176186
listAllContainerNames: defaultListAllContainerNames,
177187
listRunningContainerNames: defaultListRunningContainerNames,
188+
listSandboxNames: defaultListSandboxNames,
178189
portProbe: defaultPortProbe,
179190
};
180191

181-
function sandboxContainerNamePrefix(sandboxName: string): string {
182-
return `openshell-${sandboxName}`;
183-
}
184-
192+
// OpenShell names sandbox containers either as `openshell-<sandbox>` (no
193+
// suffix) or `openshell-<sandbox>-<id>`, where `<id>` is appended by
194+
// openshell at runtime. Two prefix collisions are possible:
195+
//
196+
// 1. A sandbox name can be a prefix of another sandbox name
197+
// (`my` vs `my-assistant`).
198+
// 2. Even with a hyphen-free `<id>`, a sandbox name can be a prefix
199+
// of another sandbox name whose own suffix is hyphen-free
200+
// (`my-assistant` vs `my-assistant-prod`).
201+
//
202+
// Resolve each candidate to the LONGEST registered sandbox name it could
203+
// belong to and only accept candidates that resolve to the sandbox we are
204+
// looking up. The exact-name form is preferred before suffixed forms so
205+
// that an `openshell-<sandbox>` container always wins over an unrelated
206+
// `openshell-<sandbox>-<runtime-id>` co-tenant. Mirrors the resolver in
207+
// `docker-health.ts`.
185208
function findSandboxContainerName(
186209
names: string,
187210
sandboxName: string,
211+
registeredSandboxNames: readonly string[],
188212
): string | null {
189-
const prefix = sandboxContainerNamePrefix(sandboxName);
190-
for (const line of names.split("\n")) {
191-
const trimmed = line.trim();
192-
if (!trimmed) continue;
193-
if (trimmed === prefix || trimmed.startsWith(`${prefix}-`)) return trimmed;
213+
const ourPrefix = `openshell-${sandboxName}-`;
214+
const ourExact = `openshell-${sandboxName}`;
215+
const known = new Set<string>(registeredSandboxNames);
216+
known.add(sandboxName);
217+
const candidates = names
218+
.split("\n")
219+
.map((line) => line.trim())
220+
.filter((line) => line === ourExact || line.startsWith(ourPrefix));
221+
if (candidates.includes(ourExact)) return ourExact;
222+
const knownArr = [...known];
223+
for (const candidate of candidates) {
224+
const stripped = candidate.replace(/^openshell-/, "");
225+
const owner = knownArr
226+
.filter((name) => stripped === name || stripped.startsWith(`${name}-`))
227+
.sort((a, b) => b.length - a.length)[0];
228+
if (owner === sandboxName) return candidate;
194229
}
195230
return null;
196231
}
197232

233+
function isValidDashboardPort(port: number | null | undefined): port is number {
234+
return (
235+
typeof port === "number" &&
236+
Number.isInteger(port) &&
237+
port >= 1 &&
238+
port <= 65535
239+
);
240+
}
241+
198242
export async function classifySandboxContainerFailure(
199243
sandboxName: string,
200244
opts: {
@@ -203,18 +247,24 @@ export async function classifySandboxContainerFailure(
203247
} = {},
204248
): Promise<SandboxContainerFailureResult | null> {
205249
const runners = opts.runners ?? defaultSandboxContainerRunners;
250+
const registeredSandboxNames = runners.listSandboxNames();
206251
const running = findSandboxContainerName(
207252
runners.listRunningContainerNames(),
208253
sandboxName,
254+
registeredSandboxNames,
209255
);
210256
if (running) return null;
211257
const present = findSandboxContainerName(
212258
runners.listAllContainerNames(),
213259
sandboxName,
260+
registeredSandboxNames,
214261
);
215262
if (!present) return null;
216-
const dashboardPort = opts.dashboardPort ?? null;
217-
if (dashboardPort && (await runners.portProbe(dashboardPort))) {
263+
const dashboardPort = opts.dashboardPort;
264+
if (
265+
isValidDashboardPort(dashboardPort) &&
266+
(await runners.portProbe(dashboardPort))
267+
) {
218268
return {
219269
layer: "sandbox_dashboard_port_conflict",
220270
detail: `Sandbox container '${present}' is stopped and dashboard port ${dashboardPort} is held by another process.`,

src/lib/actions/sandbox/status.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -293,11 +293,11 @@ export async function showSandboxStatus(sandboxName: string): Promise<void> {
293293
console.log(getLayerHeader("docker_unreachable"));
294294
process.exitCode = 1;
295295
}
296-
// #4515: when the per-sandbox container is stopped (and optionally its
297-
// dashboard port is held by a foreign listener) the host-side Inference
298-
// probe still hits the remote provider directly and falsely shows healthy.
299-
// Probe the sandbox container upfront so the failure layer is the first
300-
// user-visible signal and the misleading Inference line is suppressed.
296+
// When the per-sandbox container is stopped (and optionally its dashboard
297+
// port is held by a foreign listener) the host-side Inference probe still
298+
// hits the remote provider directly and falsely shows healthy. Probe the
299+
// sandbox container upfront so the failure layer is the first user-visible
300+
// signal and the misleading Inference line is suppressed.
301301
const sandboxFailure = dockerUnreachable
302302
? null
303303
: await classifySandboxContainerFailureForStatus(sandboxEntryEarly);

test/gateway-failure-classifier.test.ts

Lines changed: 85 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ function makeSandboxRunners(
145145
return {
146146
listAllContainerNames: () => "",
147147
listRunningContainerNames: () => "",
148+
listSandboxNames: () => [],
148149
portProbe: async () => false,
149150
...overrides,
150151
};
@@ -243,7 +244,7 @@ describe("classifySandboxContainerFailure", () => {
243244
expect(portProbeCalled).toBe(false);
244245
});
245246

246-
it("matches the exact prefix and the uuid-suffixed shape but not unrelated containers", async () => {
247+
it("matches the exact prefix and accepts uuid-suffixed shapes that resolve back to the queried sandbox", async () => {
247248
const exactResult = await classifySandboxContainerFailure("my-assistant", {
248249
runners: makeSandboxRunners({
249250
listAllContainerNames: () => "openshell-my-assistant\n",
@@ -252,14 +253,17 @@ describe("classifySandboxContainerFailure", () => {
252253
expect(exactResult?.layer).toBe("sandbox_container_stopped");
253254
expect(exactResult?.detail).toContain("openshell-my-assistant");
254255

255-
const otherSandbox = await classifySandboxContainerFailure("my-assistant", {
256+
// `openshell-my-assistant-7616dcb1` belongs to `my-assistant` because no
257+
// other registered sandbox name claims it via the longest-owner rule.
258+
const uuidResult = await classifySandboxContainerFailure("my-assistant", {
256259
runners: makeSandboxRunners({
257260
listAllContainerNames: () =>
258-
"openshell-my-assistant-evil\nopenshell-different-sandbox-abc",
261+
"openshell-my-assistant-7616dcb1\nopenshell-different-sandbox-abc",
262+
listSandboxNames: () => ["my-assistant", "different-sandbox"],
259263
}),
260264
});
261-
expect(otherSandbox?.layer).toBe("sandbox_container_stopped");
262-
expect(otherSandbox?.detail).toContain("openshell-my-assistant-evil");
265+
expect(uuidResult?.layer).toBe("sandbox_container_stopped");
266+
expect(uuidResult?.detail).toContain("openshell-my-assistant-7616dcb1");
263267

264268
const unrelated = await classifySandboxContainerFailure("my-assistant", {
265269
runners: makeSandboxRunners({
@@ -269,4 +273,80 @@ describe("classifySandboxContainerFailure", () => {
269273
});
270274
expect(unrelated).toBeNull();
271275
});
276+
277+
it("rejects a longer registered sandbox's container even when the literal prefix matches the queried name", async () => {
278+
// `openshell-my-assistant-prod-7616dcb1` resolves to the longer
279+
// `my-assistant-prod` sandbox via the longest-owner rule; the query for
280+
// `my-assistant` must not consume it. Mirrors the docker-health.ts
281+
// resolver and prevents the prefix-collision bug.
282+
const collision = await classifySandboxContainerFailure("my-assistant", {
283+
runners: makeSandboxRunners({
284+
listAllContainerNames: () =>
285+
"openshell-my-assistant-prod-7616dcb1\nopenshell-cluster-nemoclaw",
286+
listSandboxNames: () => ["my-assistant", "my-assistant-prod"],
287+
}),
288+
});
289+
expect(collision).toBeNull();
290+
});
291+
292+
it("matches an `openshell-<name>` exact container even when a co-tenant `openshell-<name>-<id>` exists in the same listing", async () => {
293+
const result = await classifySandboxContainerFailure("my-assistant", {
294+
runners: makeSandboxRunners({
295+
listAllContainerNames: () =>
296+
"openshell-my-assistant-7616dcb1\nopenshell-my-assistant",
297+
listSandboxNames: () => ["my-assistant"],
298+
}),
299+
});
300+
expect(result?.layer).toBe("sandbox_container_stopped");
301+
expect(result?.detail).toContain("openshell-my-assistant");
302+
expect(result?.detail).not.toContain("openshell-my-assistant-7616dcb1");
303+
});
304+
305+
it("ignores an out-of-range dashboardPort and falls back to sandbox_container_stopped", async () => {
306+
let portProbeCalled = false;
307+
const result = await classifySandboxContainerFailure("my-assistant", {
308+
dashboardPort: 70000,
309+
runners: makeSandboxRunners({
310+
listAllContainerNames: () => "openshell-my-assistant-7616dcb1\n",
311+
portProbe: async () => {
312+
portProbeCalled = true;
313+
return true;
314+
},
315+
}),
316+
});
317+
expect(result?.layer).toBe("sandbox_container_stopped");
318+
expect(portProbeCalled).toBe(false);
319+
});
320+
321+
it("ignores a non-integer dashboardPort and falls back to sandbox_container_stopped", async () => {
322+
let portProbeCalled = false;
323+
const result = await classifySandboxContainerFailure("my-assistant", {
324+
dashboardPort: 18789.5 as unknown as number,
325+
runners: makeSandboxRunners({
326+
listAllContainerNames: () => "openshell-my-assistant-7616dcb1\n",
327+
portProbe: async () => {
328+
portProbeCalled = true;
329+
return true;
330+
},
331+
}),
332+
});
333+
expect(result?.layer).toBe("sandbox_container_stopped");
334+
expect(portProbeCalled).toBe(false);
335+
});
336+
337+
it("ignores a zero dashboardPort and falls back to sandbox_container_stopped", async () => {
338+
let portProbeCalled = false;
339+
const result = await classifySandboxContainerFailure("my-assistant", {
340+
dashboardPort: 0,
341+
runners: makeSandboxRunners({
342+
listAllContainerNames: () => "openshell-my-assistant-7616dcb1\n",
343+
portProbe: async () => {
344+
portProbeCalled = true;
345+
return true;
346+
},
347+
}),
348+
});
349+
expect(result?.layer).toBe("sandbox_container_stopped");
350+
expect(portProbeCalled).toBe(false);
351+
});
272352
});

0 commit comments

Comments
 (0)