Skip to content

Commit 98d66d6

Browse files
Fenrur and claude committed
fix: add per-context subprocess timeout to prevent daemon freeze (PR moazbuilds#38)
- Configurable timeouts per context (telegram, heartbeat, job)
- SIGTERM → SIGKILL grace period for stuck subprocesses
- Timeout detection in Telegram error messages
- Skip fallback retry on timeout (only on rate limit)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent fef42a4 commit 98d66d6

File tree

3 files changed

+109
-20
lines changed

3 files changed

+109
-20
lines changed

src/commands/telegram.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -980,7 +980,10 @@ async function handleMessage(message: TelegramMessage): Promise<void> {
980980
}
981981

982982
if (result.exitCode !== 0) {
983-
const errText = `Error (exit ${result.exitCode}): ${result.stderr || "Unknown error"}`;
983+
const isKilled = result.exitCode === 143 || result.exitCode === 137;
984+
const errText = isKilled
985+
? `⏱ Request timed out (exit ${result.exitCode}: ${result.exitCode === 143 ? "SIGTERM" : "SIGKILL"}) — the subprocess took too long and was killed. Try again or split into smaller steps.`
986+
: `Error (exit ${result.exitCode}): ${result.stderr || "Unknown error"}`;
984987
if (streamMsgId) {
985988
await callApi(config.token, "editMessageText", {
986989
chat_id: chatId, message_id: streamMsgId, text: errText,

src/config.ts

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ const DEFAULT_SETTINGS: Settings = {
3535
web: { enabled: false, host: "127.0.0.1", port: 4632 },
3636
stt: { baseUrl: "", model: "" },
3737
additionalDirs: [],
38+
timeouts: { telegram: 5, heartbeat: 15, job: 30, default: 5 },
3839
};
3940

4041
export interface HeartbeatExcludeWindow {
@@ -64,7 +65,6 @@ export interface TelegramConfig {
6465
export interface DiscordConfig {
6566
token: string;
6667
allowedUserIds: string[]; // Discord snowflake IDs exceed Number.MAX_SAFE_INTEGER
67-
listenChannels: string[]; // Channel IDs where bot responds to all messages (no mention needed)
6868
}
6969

7070
export type SecurityLevel =
@@ -79,6 +79,17 @@ export interface SecurityConfig {
7979
disallowedTools: string[];
8080
}
8181

82+
export interface TimeoutsConfig {
83+
/** Max minutes for a telegram message subprocess. Default: 5 min. */
84+
telegram: number;
85+
/** Max minutes for a heartbeat subprocess. Default: 15 min. */
86+
heartbeat: number;
87+
/** Max minutes for a scheduled job subprocess. Default: 30 min. */
88+
job: number;
89+
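/** Max minutes for all other subprocesses (bootstrap, trigger, etc.). Default: 5 min. NOTE(review): resolveTimeoutMs in src/runner.ts does not read this field and applies the `job` timeout to these contexts — confirm which is intended. */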
90+
default: number;
91+
}
92+
8293
export interface Settings {
8394
model: string;
8495
api: string;
@@ -94,6 +105,7 @@ export interface Settings {
94105
stt: SttConfig;
95106
additionalDirs: string[];
96107
apiToken?: string;
108+
timeouts: TimeoutsConfig;
97109
}
98110

99111
export interface AgenticConfig {
@@ -184,9 +196,6 @@ function parseSettings(raw: Record<string, any>, discordUserIds?: string[]): Set
184196
: Array.isArray(raw.discord?.allowedUserIds)
185197
? raw.discord.allowedUserIds.map(String)
186198
: [],
187-
listenChannels: Array.isArray(raw.discord?.listenChannels)
188-
? raw.discord.listenChannels.map(String)
189-
: [],
190199
},
191200
security: {
192201
level,
@@ -212,6 +221,12 @@ function parseSettings(raw: Record<string, any>, discordUserIds?: string[]): Set
212221
.map((d: string) => d.trim())
213222
: [],
214223
apiToken: typeof raw.apiToken === "string" && raw.apiToken.trim() ? raw.apiToken.trim() : undefined,
224+
timeouts: {
225+
telegram: Number.isFinite(raw.timeouts?.telegram) ? Number(raw.timeouts.telegram) : 5,
226+
heartbeat: Number.isFinite(raw.timeouts?.heartbeat) ? Number(raw.timeouts.heartbeat) : 15,
227+
job: Number.isFinite(raw.timeouts?.job) ? Number(raw.timeouts.job) : 30,
228+
default: Number.isFinite(raw.timeouts?.default) ? Number(raw.timeouts.default) : 5,
229+
},
215230
};
216231
}
217232

src/runner.ts

Lines changed: 86 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ const LEGACY_PROJECT_CLAUDE_MD = join(process.cwd(), ".claude", "CLAUDE.md");
1717
const CLAUDECLAW_BLOCK_START = "<!-- claudeclaw:managed:start -->";
1818
const CLAUDECLAW_BLOCK_END = "<!-- claudeclaw:managed:end -->";
1919

20+
// Grace period between SIGTERM and SIGKILL when a subprocess times out.
21+
const SIGKILL_GRACE_MS = 5_000;
22+
2023
export interface RunResult {
2124
stdout: string;
2225
stderr: string;
@@ -45,6 +48,7 @@ function parseRateLimitResetTime(text: string): number | null {
4548
const now = new Date();
4649
const reset = new Date(now);
4750
reset.setUTCHours(hours, minutes, 0, 0);
51+
// If the reset time is in the past, it means tomorrow
4852
if (reset.getTime() <= now.getTime()) {
4953
reset.setUTCDate(reset.getUTCDate() + 1);
5054
}
@@ -140,12 +144,36 @@ function buildChildEnv(baseEnv: Record<string, string>, model: string, api: stri
140144
return childEnv;
141145
}
142146

147+
/**
148+
* Resolve the subprocess timeout (in ms) for a given invocation name.
149+
* Values are read fresh from settings on every call, so hot-reload works
150+
* automatically: edit settings.json and the next subprocess picks it up.
151+
*
152+
* Name mapping:
153+
* "telegram" → settings.timeouts.telegram (default 5 min)
154+
* "heartbeat" → settings.timeouts.heartbeat (default 15 min)
155+
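* anything else (jobs, bootstrap, trigger…) → settings.timeouts.job (default 30 min)
* NOTE(review): settings.timeouts.default is never read by this function — confirm whether non-job contexts (bootstrap, trigger…) should use it instead of the job timeout.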
156+
*/
157+
function resolveTimeoutMs(name: string): number {
158+
const t = getSettings().timeouts;
159+
let minutes: number;
160+
if (name === "telegram") {
161+
minutes = t.telegram;
162+
} else if (name === "heartbeat") {
163+
minutes = t.heartbeat;
164+
} else {
165+
minutes = t.job;
166+
}
167+
return minutes * 60_000;
168+
}
169+
143170
async function runClaudeOnce(
144171
baseArgs: string[],
145172
model: string,
146173
api: string,
147-
baseEnv: Record<string, string>
148-
): Promise<{ rawStdout: string; stderr: string; exitCode: number }> {
174+
baseEnv: Record<string, string>,
175+
timeoutMs: number
176+
): Promise<{ rawStdout: string; stderr: string; exitCode: number; timedOut: boolean }> {
149177
const args = [...baseArgs];
150178
const normalizedModel = model.trim().toLowerCase();
151179
if (model.trim() && normalizedModel !== "glm") args.push("--model", model.trim());
@@ -157,17 +185,33 @@ async function runClaudeOnce(
157185
});
158186

159187
activeProc = proc;
188+
189+
let timedOut = false;
190+
let sigkillTimer: ReturnType<typeof setTimeout> | null = null;
191+
192+
const killTimer = setTimeout(() => {
193+
timedOut = true;
194+
try { proc.kill("SIGTERM"); } catch { /* already dead */ }
195+
sigkillTimer = setTimeout(() => {
196+
try { proc.kill("SIGKILL"); } catch { /* already dead */ }
197+
}, SIGKILL_GRACE_MS);
198+
}, timeoutMs);
199+
160200
const [rawStdout, stderr] = await Promise.all([
161201
new Response(proc.stdout).text(),
162202
new Response(proc.stderr).text(),
163203
]);
164204
await proc.exited;
165205
if (activeProc === proc) activeProc = null;
166206

207+
clearTimeout(killTimer);
208+
if (sigkillTimer !== null) clearTimeout(sigkillTimer);
209+
167210
return {
168211
rawStdout,
169212
stderr,
170213
exitCode: proc.exitCode ?? 1,
214+
timedOut,
171215
};
172216
}
173217

@@ -208,8 +252,9 @@ async function runClaudeStreaming(
208252
api: string,
209253
baseEnv: Record<string, string>,
210254
onChunk?: (text: string) => void,
211-
onToolEvent?: (line: string) => void
212-
): Promise<{ result: string; stderr: string; exitCode: number; sessionId?: string; isRateLimit: boolean }> {
255+
onToolEvent?: (line: string) => void,
256+
timeoutMs?: number
257+
): Promise<{ result: string; stderr: string; exitCode: number; sessionId?: string; isRateLimit: boolean; timedOut: boolean }> {
213258
const args = [...baseArgs];
214259
const normalizedModel = model.trim().toLowerCase();
215260
if (model.trim() && normalizedModel !== "glm") args.push("--model", model.trim());
@@ -221,6 +266,21 @@ async function runClaudeStreaming(
221266
});
222267

223268
activeProc = proc;
269+
270+
let timedOut = false;
271+
let sigkillTimer: ReturnType<typeof setTimeout> | null = null;
272+
let killTimer: ReturnType<typeof setTimeout> | null = null;
273+
274+
if (timeoutMs) {
275+
killTimer = setTimeout(() => {
276+
timedOut = true;
277+
try { proc.kill("SIGTERM"); } catch { /* already dead */ }
278+
sigkillTimer = setTimeout(() => {
279+
try { proc.kill("SIGKILL"); } catch { /* already dead */ }
280+
}, SIGKILL_GRACE_MS);
281+
}, timeoutMs);
282+
}
283+
224284
const stderrPromise = new Response(proc.stderr).text();
225285

226286
let finalResult = "";
@@ -297,11 +357,14 @@ async function runClaudeStreaming(
297357
await proc.exited;
298358
if (activeProc === proc) activeProc = null;
299359

360+
if (killTimer !== null) clearTimeout(killTimer);
361+
if (sigkillTimer !== null) clearTimeout(sigkillTimer);
362+
300363
const stderr = await stderrPromise;
301364
// Also check stderr for rate limit signals
302365
if (!isRateLimit) isRateLimit = RATE_LIMIT_PATTERN.test(stderr);
303366

304-
return { result: finalResult, stderr, exitCode: proc.exitCode ?? 1, sessionId, isRateLimit };
367+
return { result: finalResult, stderr, exitCode: proc.exitCode ?? 1, sessionId, isRateLimit, timedOut };
305368
}
306369

307370
const PROJECT_DIR = process.cwd();
@@ -465,14 +528,17 @@ async function execClaude(name: string, prompt: string, onChunk?: (text: string)
465528
primaryConfig = { model, api };
466529
}
467530

531+
const timeoutMs = resolveTimeoutMs(name);
532+
const timeoutMin = timeoutMs / 60_000;
533+
468534
const fallbackConfig: ModelConfig = {
469535
model: fallback?.model ?? "",
470536
api: fallback?.api ?? "",
471537
};
472538
const securityArgs = buildSecurityArgs(security);
473539

474540
console.log(
475-
`[${new Date().toLocaleTimeString()}] Running: ${name} (${isNew ? "new session" : `resume ${existing.sessionId.slice(0, 8)}`}, security: ${security.level})`
541+
`[${new Date().toLocaleTimeString()}] Running: ${name} (${isNew ? "new session" : `resume ${existing.sessionId.slice(0, 8)}`}, security: ${security.level}, timeout: ${timeoutMin}m)`
476542
);
477543

478544
// Always use stream-json — session_id comes from the result event for both new and resumed
@@ -505,33 +571,37 @@ async function execClaude(name: string, prompt: string, onChunk?: (text: string)
505571
const { CLAUDECODE: _, ...cleanEnv } = process.env;
506572
const baseEnv = { ...cleanEnv } as Record<string, string>;
507573

508-
let exec = await runClaudeStreaming(args, primaryConfig.model, primaryConfig.api, baseEnv, onChunk, onToolEvent);
574+
let exec = await runClaudeStreaming(args, primaryConfig.model, primaryConfig.api, baseEnv, onChunk, onToolEvent, timeoutMs);
509575
let usedFallback = false;
510576

511-
if (exec.isRateLimit && hasModelConfig(fallbackConfig) && !sameModelConfig(primaryConfig, fallbackConfig)) {
577+
if (exec.timedOut) {
578+
console.warn(
579+
`[${new Date().toLocaleTimeString()}] TIMEOUT: ${name} subprocess killed after ${timeoutMin}m (SIGTERM+SIGKILL)`
580+
);
581+
}
582+
583+
// Only retry with fallback on rate limit — not on timeout
584+
if (!exec.timedOut && exec.isRateLimit && hasModelConfig(fallbackConfig) && !sameModelConfig(primaryConfig, fallbackConfig)) {
512585
console.warn(
513586
`[${new Date().toLocaleTimeString()}] Claude limit reached; retrying with fallback${fallbackConfig.model ? ` (${fallbackConfig.model})` : ""}...`
514587
);
515-
// Strip --resume to avoid mixing thinking block signatures from
516-
// different providers in the same session history (see issue #18).
517588
const fallbackArgs = args.filter(
518589
(a) => a !== "--resume" && a !== existing?.sessionId
519590
);
520-
exec = await runClaudeStreaming(fallbackArgs, fallbackConfig.model, fallbackConfig.api, baseEnv, onChunk, onToolEvent);
591+
exec = await runClaudeStreaming(fallbackArgs, fallbackConfig.model, fallbackConfig.api, baseEnv, onChunk, onToolEvent, timeoutMs);
521592
usedFallback = true;
522593
}
523594

524595
// Auto-detect corrupted session from thinking block signature mismatch.
525-
// Back up the broken session and retry with a fresh one (issue #18).
526-
if (exec.exitCode !== 0 && !isNew && SIGNATURE_ERROR.test((exec.result ?? "") + exec.stderr)) {
596+
if (!exec.timedOut && exec.exitCode !== 0 && !isNew && SIGNATURE_ERROR.test((exec.result ?? "") + exec.stderr)) {
527597
const backupName = await backupSession();
528598
console.warn(
529599
`[${new Date().toLocaleTimeString()}] Detected corrupted session (thinking block signature mismatch). Backed up to ${backupName}, retrying with fresh session...`
530600
);
531601
const freshArgs = args.filter(
532602
(a) => a !== "--resume" && a !== existing?.sessionId
533603
);
534-
exec = await runClaudeStreaming(freshArgs, primaryConfig.model, primaryConfig.api, baseEnv, onChunk, onToolEvent);
604+
exec = await runClaudeStreaming(freshArgs, primaryConfig.model, primaryConfig.api, baseEnv, onChunk, onToolEvent, timeoutMs);
535605
}
536606

537607
const { result: stdout, stderr, exitCode, sessionId: streamedSessionId } = exec;
@@ -541,7 +611,7 @@ async function execClaude(name: string, prompt: string, onChunk?: (text: string)
541611
if (exec.isRateLimit) {
542612
const combined = stdout + stderr;
543613
const resetTime = parseRateLimitResetTime(combined);
544-
rateLimitResetAt = resetTime ?? (Date.now() + 60 * 60_000); // fallback: 1 hour
614+
rateLimitResetAt = resetTime ?? (Date.now() + 60 * 60_000);
545615
rateLimitNotified = false;
546616
console.warn(
547617
`[${new Date().toLocaleTimeString()}] Rate limit detected. Reset at: ${new Date(rateLimitResetAt).toISOString()}`
@@ -561,6 +631,7 @@ async function execClaude(name: string, prompt: string, onChunk?: (text: string)
561631
`Date: ${new Date().toISOString()}`,
562632
`Session: ${sessionId} (${isNew ? "new" : "resumed"})`,
563633
`Model config: ${usedFallback ? "fallback" : "primary"}`,
634+
`Timeout: ${timeoutMin}m${exec.timedOut ? " [TIMED OUT]" : ""}`,
564635
...(agentic.enabled ? [`Task type: ${taskType}`, `Routing: ${routingReasoning}`] : []),
565636
`Prompt: ${prompt}`,
566637
`Exit code: ${exitCode}`,

0 commit comments

Comments
 (0)