|
| 1 | +/** |
| 2 | + * ProcessFootprintMonitor — the per-machine process-footprint measurement that |
| 3 | + * was MISSING when steady-state process accumulation (multiple full agent stacks |
| 4 | + * + their heavy MCP servers — a whole Chromium, an Electron) climbed unwatched |
| 5 | + * until the host hit a kernel limit and panicked (2026-06-26, os_refcnt overflow). |
| 6 | + * |
| 7 | + * The host spawn-cap bounds INSTANTANEOUS spawn bursts; the idle-session reapers |
| 8 | + * bound idle SESSIONS. Neither MEASURES the slow climb of the total process count. |
| 9 | + * This monitor does exactly that and nothing more: on an interval it counts the |
| 10 | + * instar-relevant processes on this machine, classifies them (agent CLIs vs MCP |
| 11 | + * servers vs other node), keeps a bounded rolling window so a TREND is visible, |
| 12 | + * and — only when explicitly enabled — raises ONE de-duplicated heads-up when the |
| 13 | + * count crosses a threshold. It is OBSERVE-ONLY: it never kills, throttles, or |
| 14 | + * gates anything (the reapers own reclamation). Ships DARK by default. |
| 15 | + * |
| 16 | + * Pure core: all process input is injected (`listProcesses`) so the classifier and |
| 17 | + * trend logic are unit-testable without scanning the real host. |
| 18 | + */ |
| 19 | + |
| 20 | +import { execFileSync } from 'node:child_process'; |
| 21 | +import { MCP_PROCESS_SIGNATURES } from './mcpProcessSignatures.js'; |
| 22 | +import { withSyncOp } from '../core/InFlightSyncOpMarker.js'; |
| 23 | + |
| 24 | +/** A live process as seen by the scanner (only the fields we classify on). */ |
| 25 | +export interface FootprintProcess { |
| 26 | + pid: number; |
| 27 | + /** Full command line (argv joined) — matched against signatures/patterns. */ |
| 28 | + command: string; |
| 29 | + /** Resident set size in bytes (0 if unknown). */ |
| 30 | + rssBytes: number; |
| 31 | +} |
| 32 | + |
| 33 | +export type FootprintKind = 'agent-cli' | 'mcp' | 'other-node'; |
| 34 | + |
| 35 | +/** One point-in-time footprint reading. */ |
| 36 | +export interface FootprintSample { |
| 37 | + ts: number; |
| 38 | + /** Total instar-relevant processes counted. */ |
| 39 | + total: number; |
| 40 | + byKind: Record<FootprintKind, number>; |
| 41 | + /** Summed RSS of the counted processes. */ |
| 42 | + rssBytes: number; |
| 43 | +} |
| 44 | + |
| 45 | +export interface ProcessFootprintMonitorConfig { |
| 46 | + /** Master switch — DARK by default. When false, start() is a no-op. */ |
| 47 | + enabled: boolean; |
| 48 | + sampleIntervalMs: number; |
| 49 | + /** Ring-buffer size (how many samples of history to keep for the trend). */ |
| 50 | + windowSamples: number; |
| 51 | + /** |
| 52 | + * Total-process count at/above which a heads-up is raised. 0 disables the alert |
| 53 | + * regardless of `alertEnabled`. The alert is observe-only (one attention item). |
| 54 | + */ |
| 55 | + alertThreshold: number; |
| 56 | + /** Alert is opt-in even when the monitor is enabled (measure first). */ |
| 57 | + alertEnabled: boolean; |
| 58 | +} |
| 59 | + |
| 60 | +export const DEFAULT_PROCESS_FOOTPRINT_MONITOR_CONFIG: ProcessFootprintMonitorConfig = { |
| 61 | + enabled: false, // DARK by default (observe-only, but no reason to sample on the fleet yet) |
| 62 | + sampleIntervalMs: 5 * 60 * 1000, |
| 63 | + windowSamples: 288, // 24h at 5-min cadence |
| 64 | + alertThreshold: 220, // the panic snapshot showed ~280 node refs; warn well before |
| 65 | + alertEnabled: false, // opt-in: measure before paging |
| 66 | +}; |
| 67 | + |
| 68 | +export interface ProcessFootprintMonitorDeps { |
| 69 | + /** Returns the instar-relevant processes on this machine. Injected for tests. */ |
| 70 | + listProcesses: () => FootprintProcess[]; |
| 71 | + now?: () => number; |
| 72 | + /** Observe-only heads-up sink (the attention queue). Absent ⇒ alert is inert. */ |
| 73 | + emitAttention?: (item: { id: string; title: string; body: string }) => void; |
| 74 | +} |
| 75 | + |
| 76 | +/** |
| 77 | + * Production scanner: enumerate the host's processes via `ps`. Returns [] on any |
| 78 | + * failure (fail-safe — a missing reading must never crash the monitor). The scan |
| 79 | + * is off-hot-path (the monitor samples on a multi-minute interval, ships dark) and |
| 80 | + * funnels through withSyncOp so the in-flight marker sees the blocking spawn. |
| 81 | + */ |
| 82 | +export function defaultListProcesses(): FootprintProcess[] { |
| 83 | + let out: string; |
| 84 | + try { |
| 85 | + // lint-allow-blocking-scan: off-hot-path (multi-minute sampling interval, dark |
| 86 | + // by default), bounded 15s timeout — same posture as the AgentWorktreeReaper's |
| 87 | + // lsof scan. The monitor only READS process metadata; it never kills or gates. |
| 88 | + out = withSyncOp(() => execFileSync('ps', ['-A', '-o', 'pid=,rss=,command='], { |
| 89 | + encoding: 'utf-8', timeout: 15_000, maxBuffer: 32 * 1024 * 1024, |
| 90 | + })); |
| 91 | + } catch { |
| 92 | + return []; // @silent-fallback-ok — no ps reading ⇒ no sample (keeps last) |
| 93 | + } |
| 94 | + const procs: FootprintProcess[] = []; |
| 95 | + for (const line of out.split('\n')) { |
| 96 | + const m = line.match(/^\s*(\d+)\s+(\d+)\s+(.*)$/); |
| 97 | + if (!m) continue; |
| 98 | + procs.push({ pid: Number(m[1]), rssBytes: Number(m[2]) * 1024 /* ps rss is KB */, command: m[3] }); |
| 99 | + } |
| 100 | + return procs; |
| 101 | +} |
| 102 | + |
| 103 | +/** Classify a single process. Returns null for processes we don't count. */ |
| 104 | +export function classifyFootprintProcess(p: FootprintProcess): FootprintKind | null { |
| 105 | + const cmd = (p.command || '').toLowerCase(); |
| 106 | + if (!cmd) return null; |
| 107 | + // MCP servers — the heavy, mostly-idle ones (Chromium for Playwright, Electron, |
| 108 | + // mcp-remote bridges). Matched via the SAME allow-listed signatures the reaper uses. |
| 109 | + for (const sig of MCP_PROCESS_SIGNATURES) { |
| 110 | + if (sig.commandIncludesAll.every((needle) => cmd.includes(needle.toLowerCase()))) { |
| 111 | + return 'mcp'; |
| 112 | + } |
| 113 | + } |
| 114 | + // Agent CLIs — the per-session reasoning processes. |
| 115 | + if (/\b(claude|codex|gemini)\b/.test(cmd) && !cmd.includes('grep')) return 'agent-cli'; |
| 116 | + // Other instar node processes (servers, lifelines, MCP wrappers not matched above). |
| 117 | + if (/\bnode\b/.test(cmd) || cmd.includes('/.instar/') || cmd.includes('instar/dist')) return 'other-node'; |
| 118 | + return null; |
| 119 | +} |
| 120 | + |
| 121 | +/** Build a footprint sample from a process list (pure). */ |
| 122 | +export function buildFootprintSample(procs: FootprintProcess[], ts: number): FootprintSample { |
| 123 | + const byKind: Record<FootprintKind, number> = { 'agent-cli': 0, mcp: 0, 'other-node': 0 }; |
| 124 | + let total = 0; |
| 125 | + let rssBytes = 0; |
| 126 | + for (const p of procs) { |
| 127 | + const kind = classifyFootprintProcess(p); |
| 128 | + if (!kind) continue; |
| 129 | + byKind[kind]++; |
| 130 | + total++; |
| 131 | + rssBytes += Math.max(0, p.rssBytes || 0); |
| 132 | + } |
| 133 | + return { ts, total, byKind, rssBytes }; |
| 134 | +} |
| 135 | + |
| 136 | +export interface FootprintStatus { |
| 137 | + enabled: boolean; |
| 138 | + latest: FootprintSample | null; |
| 139 | + /** Direction over the window: rising if the latest exceeds the window median by |
| 140 | + * a margin, falling if below, else stable. Coarse on purpose. */ |
| 141 | + trend: 'rising' | 'stable' | 'falling' | 'insufficient-data'; |
| 142 | + windowSize: number; |
| 143 | + alertThreshold: number; |
| 144 | + alertEnabled: boolean; |
| 145 | + /** True while the most recent sample is at/over the threshold. */ |
| 146 | + overThreshold: boolean; |
| 147 | + samples: FootprintSample[]; |
| 148 | +} |
| 149 | + |
| 150 | +export class ProcessFootprintMonitor { |
| 151 | + private readonly cfg: ProcessFootprintMonitorConfig; |
| 152 | + private readonly deps: ProcessFootprintMonitorDeps; |
| 153 | + private readonly now: () => number; |
| 154 | + private ring: FootprintSample[] = []; |
| 155 | + private timer?: NodeJS.Timeout; |
| 156 | + /** Per-episode alert latch: one heads-up per threshold-crossing episode. */ |
| 157 | + private alerted = false; |
| 158 | + |
| 159 | + constructor(deps: ProcessFootprintMonitorDeps, cfg?: Partial<ProcessFootprintMonitorConfig>) { |
| 160 | + this.deps = deps; |
| 161 | + this.cfg = { ...DEFAULT_PROCESS_FOOTPRINT_MONITOR_CONFIG, ...(cfg ?? {}) }; |
| 162 | + this.now = deps.now ?? (() => Date.now()); |
| 163 | + } |
| 164 | + |
| 165 | + start(): void { |
| 166 | + if (this.timer || !this.cfg.enabled) return; |
| 167 | + this.sample(); // one immediate reading |
| 168 | + this.timer = setInterval(() => this.sample(), this.cfg.sampleIntervalMs); |
| 169 | + if (typeof this.timer.unref === 'function') this.timer.unref(); |
| 170 | + } |
| 171 | + |
| 172 | + stop(): void { |
| 173 | + if (this.timer) { clearInterval(this.timer); this.timer = undefined; } |
| 174 | + } |
| 175 | + |
| 176 | + /** Take one reading (also callable directly in tests). Returns the sample. */ |
| 177 | + sample(): FootprintSample { |
| 178 | + let procs: FootprintProcess[]; |
| 179 | + try { procs = this.deps.listProcesses(); } |
| 180 | + catch { return this.ring[this.ring.length - 1] ?? buildFootprintSample([], this.now()); } // fail-safe: keep last |
| 181 | + const s = buildFootprintSample(procs, this.now()); |
| 182 | + this.ring.push(s); |
| 183 | + while (this.ring.length > this.cfg.windowSamples) this.ring.shift(); |
| 184 | + this.maybeAlert(s); |
| 185 | + return s; |
| 186 | + } |
| 187 | + |
| 188 | + private maybeAlert(s: FootprintSample): void { |
| 189 | + if (!this.cfg.alertEnabled || this.cfg.alertThreshold <= 0) return; |
| 190 | + if (s.total >= this.cfg.alertThreshold) { |
| 191 | + if (!this.alerted && this.deps.emitAttention) { |
| 192 | + this.alerted = true; // one per episode |
| 193 | + this.deps.emitAttention({ |
| 194 | + id: 'process-footprint:over-threshold', |
| 195 | + title: `Process footprint high (${s.total} processes)`, |
| 196 | + body: `This machine is running ${s.total} instar-relevant processes ` + |
| 197 | + `(${s.byKind['agent-cli']} agent CLIs, ${s.byKind.mcp} MCP servers, ` + |
| 198 | + `${s.byKind['other-node']} other node) — at/over the ${this.cfg.alertThreshold} ` + |
| 199 | + `heads-up threshold. Steady-state process accumulation is the footprint that ` + |
| 200 | + `preceded the resource-exhaustion panic; consider offloading idle MCP servers ` + |
| 201 | + `or consolidating agent stacks.`, |
| 202 | + }); |
| 203 | + } |
| 204 | + } else if (s.total < this.cfg.alertThreshold * 0.9) { |
| 205 | + this.alerted = false; // re-arm with hysteresis once it recovers |
| 206 | + } |
| 207 | + } |
| 208 | + |
| 209 | + private computeTrend(): FootprintStatus['trend'] { |
| 210 | + if (this.ring.length < 4) return 'insufficient-data'; |
| 211 | + const totals = this.ring.map((s) => s.total).slice().sort((a, b) => a - b); |
| 212 | + const median = totals[Math.floor(totals.length / 2)]; |
| 213 | + const latest = this.ring[this.ring.length - 1].total; |
| 214 | + const margin = Math.max(2, Math.ceil(median * 0.15)); |
| 215 | + if (latest >= median + margin) return 'rising'; |
| 216 | + if (latest <= median - margin) return 'falling'; |
| 217 | + return 'stable'; |
| 218 | + } |
| 219 | + |
| 220 | + status(): FootprintStatus { |
| 221 | + return { |
| 222 | + enabled: this.cfg.enabled, |
| 223 | + latest: this.ring[this.ring.length - 1] ?? null, |
| 224 | + trend: this.computeTrend(), |
| 225 | + windowSize: this.ring.length, |
| 226 | + alertThreshold: this.cfg.alertThreshold, |
| 227 | + alertEnabled: this.cfg.alertEnabled, |
| 228 | + overThreshold: (this.ring[this.ring.length - 1]?.total ?? 0) >= this.cfg.alertThreshold && this.cfg.alertThreshold > 0, |
| 229 | + samples: this.ring.slice(), |
| 230 | + }; |
| 231 | + } |
| 232 | +} |
0 commit comments