Skip to content

Commit b61c66e

Browse files
yuranich and cursoragent committed
fix: surface worker startup stderr
Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 863070d commit b61c66e

3 files changed

Lines changed: 141 additions & 5 deletions

File tree

packages/sdk/client/rpc/node-rpc-client.ts

Lines changed: 54 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import {
2222
import type { RuntimeContext } from "@/schemas";
2323

2424
const RPC_INIT_TIMEOUT_MS = 30_000;
25+
const WORKER_STDERR_TAIL_CHARS = 16_384;
2526

2627
const logger = getClientLogger();
2728

@@ -175,6 +176,19 @@ function bestEffortUnlinkSocket(socketPath: string | null) {
175176
}
176177
}
177178

179+
/**
 * Appends a freshly received stderr chunk to the retained tail.
 *
 * The result is capped at WORKER_STDERR_TAIL_CHARS characters: when the
 * combined text is longer, only its trailing WORKER_STDERR_TAIL_CHARS
 * characters are kept.
 *
 * @param current - Tail accumulated so far.
 * @param chunk - Newly received stderr text.
 * @returns The updated tail.
 */
function appendWorkerStderrTail(current: string, chunk: string) {
  const combined = current + chunk;
  const overflow = combined.length - WORKER_STDERR_TAIL_CHARS;
  // A positive overflow is exactly the number of leading characters to drop.
  return overflow > 0 ? combined.slice(overflow) : combined;
}
184+
185+
/**
 * Builds the Error describing a worker startup failure.
 *
 * @param details - Human-readable description of what went wrong.
 * @param stderrTail - Captured worker stderr; trailing whitespace is ignored.
 * @returns An Error whose message is `details`, followed by a
 *   "Worker stderr:" section when any stderr text remains after trimming.
 */
function createWorkerStartupError(details: string, stderrTail: string) {
  const trimmedTail = stderrTail.trimEnd();
  const message =
    trimmedTail.length === 0
      ? details
      : `${details}\n\nWorker stderr:\n${trimmedTail}`;
  return new Error(message);
}
191+
178192
function resetModuleState() {
179193
rpcInstance = null;
180194
rpcPromise = null;
@@ -230,6 +244,14 @@ interface SpawnResources {
230244
socketPath: string;
231245
}
232246

247+
/**
 * Minimal structural view of a worker's stderr stream.
 *
 * Only the "data" subscription is modelled; listeners receive each chunk as
 * either a Buffer or a string.
 */
interface WorkerStderrStream {
  on(
    event: "data",
    listener: (chunk: Buffer | string) => void,
  ): void;
}
250+
251+
/**
 * Reads the optional `stderr` stream off a spawned worker process.
 *
 * The process is viewed structurally because `stderr` is accessed through a
 * cast rather than through the `BareChildProcess` type itself.
 *
 * @param proc - The spawned worker process.
 * @returns The stderr stream, or `null` when the property is missing,
 *   `undefined`, or `null`.
 */
function getWorkerStderr(proc: BareChildProcess): WorkerStderrStream | null {
  const { stderr } = proc as { stderr?: WorkerStderrStream | null };
  return stderr ?? null;
}
254+
233255
// `bare-runtime` resolves its platform binary with
234256
// `require('bare-runtime-<platform>-<arch>')` and throws a terse
235257
// `No binaries found for target '<platform>-<arch>'` whenever that package —
@@ -322,12 +344,19 @@ async function ensureRPC(): Promise<RPC> {
322344

323345
rpcPromise = new Promise((resolve, reject) => {
324346
let settled = false;
347+
let workerStderrTail = "";
325348

326349
const timer = setTimeout(() => {
327350
if (settled) return;
328351
settled = true;
352+
const cause = workerStderrTail
353+
? createWorkerStartupError(
354+
"Worker did not establish IPC before the RPC initialization timeout",
355+
workerStderrTail,
356+
)
357+
: undefined;
329358
teardownFailedInit();
330-
reject(new RPCInitTimeoutError(RPC_INIT_TIMEOUT_MS));
359+
reject(new RPCInitTimeoutError(RPC_INIT_TIMEOUT_MS, cause));
331360
}, RPC_INIT_TIMEOUT_MS);
332361

333362
ipcServer = createServer((socket) => {
@@ -370,7 +399,7 @@ async function ensureRPC(): Promise<RPC> {
370399
],
371400
platform: process.platform,
372401
arch: process.arch,
373-
stdio: ["inherit", "inherit", "inherit"],
402+
stdio: ["inherit", "inherit", "pipe"],
374403
});
375404
} catch (error) {
376405
// `spawn` resolves the bare binary synchronously and can throw before
@@ -386,6 +415,12 @@ async function ensureRPC(): Promise<RPC> {
386415
}
387416

388417
if (bareWorkerProc) {
418+
getWorkerStderr(bareWorkerProc)?.on("data", (chunk) => {
419+
const text = chunk.toString();
420+
workerStderrTail = appendWorkerStderrTail(workerStderrTail, text);
421+
process.stderr.write(chunk);
422+
});
423+
389424
bareWorkerProc.on(
390425
"exit",
391426
(code: number | null, exitSignal: string | null) => {
@@ -397,15 +432,29 @@ async function ensureRPC(): Promise<RPC> {
397432
);
398433
return;
399434
}
400-
// Worker died before handshake — reject the init promise.
435+
// Pre-handshake failures are rejected from "close" so stderr has
436+
// drained before we assemble the startup error cause.
437+
},
438+
);
439+
440+
bareWorkerProc.on(
441+
"close",
442+
(...args: unknown[]) => {
443+
if (settled) return;
444+
const code = typeof args[0] === "number" ? args[0] : null;
445+
const exitSignal = typeof args[1] === "string" ? args[1] : null;
446+
447+
// Worker died before handshake. Use close, not exit, so piped
448+
// stderr has drained before we build the error cause.
401449
settled = true;
402450
clearTimeout(timer);
403451
teardownFailedInit();
404452
reject(
405453
new RPCInitTimeoutError(
406454
RPC_INIT_TIMEOUT_MS,
407-
new Error(
408-
`Worker process exited with code ${code} before IPC connection was established`,
455+
createWorkerStartupError(
456+
`Worker process exited with code ${code}, signal ${exitSignal} before IPC connection was established`,
457+
workerStderrTail,
409458
),
410459
),
411460
);
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import path from "node:path";
2+
import { fileURLToPath } from "node:url";
3+
4+
// ES modules have no built-in __dirname; derive it from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Marker text the reproduction expects to find in the SDK-facing error details.
const nativeLoadErrorMarker = "QVAC_REPRO_NATIVE_LOAD_ERROR";
6+
7+
/**
 * Renders an error's `cause` as a single display string.
 *
 * @param error - The error whose cause should be described; may be undefined.
 * @returns "<none>" when there is no error or the cause is falsy,
 *   "Name: message" for Error causes, the string itself for non-empty string
 *   causes, and the `Object.prototype.toString` tag for anything else.
 */
function formatCause(error: Error | undefined) {
  const cause: unknown = (error as { cause?: unknown } | undefined)?.cause;

  if (cause instanceof Error) return `${cause.name}: ${cause.message}`;
  if (typeof cause === "string" && cause !== "") return cause;
  // Everything falsy (undefined, null, "", 0, false, ...) counts as "no cause".
  if (!cause) return "<none>";
  return Object.prototype.toString.call(cause);
}
14+
15+
/**
 * Joins an error's message with the text of its direct `cause` so callers can
 * search both at once.
 *
 * @param error - The error to inspect; may be undefined.
 * @returns "" when there is no error; the error message alone when it has no
 *   cause; otherwise the message, a newline, and the cause text. Error causes
 *   contribute their message, string causes contribute themselves, and any
 *   other value contributes its `Object.prototype.toString` tag.
 */
function collectMessages(error: Error | undefined) {
  if (!error) return "";

  const cause = (error as { cause?: unknown }).cause;
  // No cause: avoid appending a meaningless "[object Undefined]" line.
  if (cause === undefined || cause === null) return error.message;

  let causeMessage: string;
  if (cause instanceof Error) {
    causeMessage = cause.message;
  } else if (typeof cause === "string") {
    // Keep string causes verbatim so their text stays searchable.
    causeMessage = cause;
  } else {
    causeMessage = Object.prototype.toString.call(cause);
  }
  return `${error.message}\n${causeMessage}`;
}
23+
24+
/**
 * Reproduction driver.
 *
 * Sets QVAC_WORKER_PATH to the native-load-failure worker fixture, calls
 * `loadModel()`, and checks that the resulting error is named
 * RPC_INIT_TIMEOUT and that its message or cause text contains the
 * native-load marker. Exits the process with 0 on success and 1 on any
 * unmet expectation.
 */
async function main() {
  const workerFixturePath = path.resolve(
    __dirname,
    "../test/unit/fixtures/native-load-failure-worker.mjs",
  );
  process.env["QVAC_WORKER_PATH"] = workerFixturePath;

  // Imported only after the env var is set, matching the original ordering.
  const { loadModel } = await import("../client/api/load-model");
  const { close } = await import("../client/rpc/rpc-client");

  let startupError: Error | undefined;
  try {
    await loadModel({
      modelSrc: "/tmp/qvac-repro-model.gguf",
      modelType: "llamacpp-completion",
    });
  } catch (caught) {
    startupError = caught as Error;
  }

  try {
    await close();
  } catch {
    // Best-effort cleanup: a failure to close must not mask the result.
  }

  if (startupError === undefined || startupError === null || !startupError) {
    console.error("Expected loadModel() to fail, but it resolved.");
    process.exit(1);
  }

  const errorName = (startupError as { name?: string }).name;
  const combinedMessages = collectMessages(startupError);

  if (errorName !== "RPC_INIT_TIMEOUT") {
    console.error(`Expected RPC_INIT_TIMEOUT, got ${errorName ?? "<missing>"}.`);
    console.error(startupError);
    process.exit(1);
  }

  console.log("\n=== SDK-facing error caught by the caller ===");
  console.log(`name: ${errorName}`);
  console.log(`message: ${startupError.message}`);
  console.log(`cause: ${formatCause(startupError)}`);

  const markerFound = combinedMessages.includes(nativeLoadErrorMarker);
  if (!markerFound) {
    console.error(
      `Expected SDK error details to include ${nativeLoadErrorMarker}.`,
    );
    process.exit(1);
  }

  console.log("\n=== Reproduction result ===");
  console.log("Fixed: loadModel() failed with RPC_INIT_TIMEOUT.");
  console.log(
    `The SDK error includes the underlying ${nativeLoadErrorMarker} marker.`,
  );
  process.exit(0);
}
80+
81+
// Script entry point: any rejection escaping main() is logged and the
// process exits with a failing status.
main().catch((reason: unknown) => {
  console.error(reason);
  process.exit(1);
});
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
// Bare worker fixture: throws at module load, before any IPC handshake takes
// place, the way a native addon dlopen error would.

const simulatedFailure = 'QVAC_REPRO_NATIVE_LOAD_ERROR: simulated dlopen failure before worker handshake'

throw new Error(simulatedFailure)

0 commit comments

Comments
 (0)