Skip to content

Commit c76ecb4

Browse files
authored
Merge pull request #2597 from kartast/fix/db-malformed-self-restart
fix(agent-runner): exit on persistent inbound.db corruption errors
2 parents 8f332e0 + 9dc9efa commit c76ecb4

2 files changed

Lines changed: 68 additions & 0 deletions

File tree

container/agent-runner/src/poll-loop.test.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import { initTestSessionDb, closeSessionDb, getInboundDb, getOutboundDb } from '
44
import { getPendingMessages, markCompleted } from './db/messages-in.js';
55
import { getUndeliveredMessages } from './db/messages-out.js';
66
import { formatMessages, extractRouting } from './formatter.js';
7+
import { isCorruptionError } from './poll-loop.js';
78
import { MockProvider } from './providers/mock.js';
89

910
beforeEach(() => {
@@ -377,3 +378,20 @@ describe('end-to-end with mock provider', () => {
377378
expect(outMessages[0].in_reply_to).toBe('m1');
378379
});
379380
});
381+
382+
// Unit coverage for the message classifier used by the follow-up poll to
// decide whether an error counts toward the corruption exit streak.
describe('isCorruptionError', () => {
  it('matches the Docker Desktop macOS torn-read symptom', () => {
    const verdict = isCorruptionError('database disk image is malformed');
    expect(verdict).toBe(true);
  });

  it('matches wrapped SQLite corruption codes', () => {
    // Checked in the same order as before: prefixed result code, then plain text.
    const corruptMessages = ['SqliteError: SQLITE_CORRUPT_VTAB: ...', 'file is not a database'];
    for (const message of corruptMessages) {
      expect(isCorruptionError(message)).toBe(true);
    }
  });

  it('returns false for unrelated errors', () => {
    // Lock contention, schema errors, and the empty string must not count.
    const unrelatedMessages = ['database is locked', 'no such table: messages_in', ''];
    for (const message of unrelatedMessages) {
      expect(isCorruptionError(message)).toBe(false);
    }
  });
});

container/agent-runner/src/poll-loop.ts

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,30 @@ import type { AgentProvider, AgentQuery, ProviderEvent } from './providers/types
1818
const POLL_INTERVAL_MS = 1000;
1919
const ACTIVE_POLL_INTERVAL_MS = 500;
2020

21+
/**
22+
* Number of consecutive SQLite corruption errors (as classified by isCorruptionError) after which
23+
* the follow-up poll gives up and exits the process. At ACTIVE_POLL_INTERVAL_MS
24+
* = 500ms this is roughly 5 seconds — long enough to dodge a transient torn
25+
* read during a host write, short enough to recover quickly from a poisoned
26+
* page cache (host-sweep then respawns with a fresh mount).
27+
*/
28+
const CORRUPTION_STREAK_EXIT = 10;
29+
30+
/**
 * True for SQLite errors that indicate a corrupt READ view — almost always a
 * cross-mount page-cache coherency issue on Docker Desktop macOS rather than
 * actual file damage (host-side integrity_check passes). Reopening the DB
 * handle inside this process does NOT recover; only a fresh container mount
 * does. Caller's job is to exit so host-sweep respawns the container.
 *
 * Matching is a case-sensitive substring search, so the signature is found
 * whether the message is bare or wrapped (e.g. `SqliteError: SQLITE_CORRUPT_VTAB: ...`).
 *
 * @param msg - Error message text to inspect.
 * @returns `true` when `msg` contains any known corruption signature,
 *   `false` otherwise (including for the empty string).
 */
export function isCorruptionError(msg: string): boolean {
  return (
    msg.includes('database disk image is malformed') ||
    // Prefix match: also covers extended codes such as SQLITE_CORRUPT_VTAB.
    msg.includes('SQLITE_CORRUPT') ||
    msg.includes('file is not a database') ||
    // Symbolic result code whose text is 'file is not a database'; catches
    // wrapped errors that carry only the code, mirroring SQLITE_CORRUPT above.
    msg.includes('SQLITE_NOTADB')
  );
}
44+
2145
/**
 * Writes one diagnostic line to stderr, tagged with the `[poll-loop]` prefix.
 *
 * @param msg - Text to emit after the prefix.
 */
function log(msg: string): void {
  const line = `[poll-loop] ${msg}`;
  console.error(line);
}
@@ -291,6 +315,7 @@ async function processQuery(
291315
// will kill the container and messages get reset to pending.
292316
let pollInFlight = false;
293317
let endedForCommand = false;
318+
let corruptionStreak = 0;
294319
const pollHandle = setInterval(() => {
295320
if (done || pollInFlight || endedForCommand) return;
296321
pollInFlight = true;
@@ -362,6 +387,31 @@ async function processQuery(
362387
// path is not, so it needs its own.
363388
const errMsg = err instanceof Error ? err.message : String(err);
364389
log(`Follow-up poll error: ${errMsg}`);
390+
391+
// Detect SQLite cross-mount corruption: the Docker Desktop macOS virtiofs /
392+
// gRPC-FUSE coherency bug — the kernel page cache for the inbound.db
393+
// bind mount can latch a torn snapshot mid-host-write, after which
394+
// every fresh openInboundDb() in this process sees the same broken
395+
// view. Reopening inside the container does NOT recover; only a fresh
396+
// container mount does. Exit so the host sweep respawns us.
397+
if (isCorruptionError(errMsg)) {
398+
corruptionStreak += 1;
399+
if (corruptionStreak >= CORRUPTION_STREAK_EXIT) {
400+
log(
401+
`Follow-up poll: ${corruptionStreak} consecutive '${errMsg}' errors — ` +
402+
`inbound.db page cache is poisoned. Exiting so host respawns with a fresh mount.`,
403+
);
404+
// Stop touching the heartbeat so host-sweep stale detection fires
405+
// promptly even if exit() races with in-flight async work.
406+
done = true;
407+
clearInterval(pollHandle);
408+
// Defer exit one tick so this log line flushes through Docker's
409+
// log driver before the process dies.
410+
setTimeout(() => process.exit(75), 100);
411+
}
412+
} else {
413+
corruptionStreak = 0;
414+
}
365415
} finally {
366416
pollInFlight = false;
367417
}

0 commit comments

Comments
 (0)