From f9f382ec9a3a89ec596f7ded4d3cfce71613a3a1 Mon Sep 17 00:00:00 2001
From: Jon Mischo
Date: Sat, 25 Apr 2026 00:56:28 -0700
Subject: [PATCH 1/5] fix(heartbeat): escalate stranded issue when recovery
 retry succeeds without execution path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The reconciler queues an issue_continuation_needed (or
assignment_recovery) retry when an assigned issue has no live execution
path. The escalation gate previously only tripped on
failed/cancelled/timed_out terminal statuses, so a recovery run that
exited successfully (e.g., posted a comment and returned) without
re-establishing a real execution path would leave the issue in the same
state, causing the reconciler to re-queue another recovery run on every
tick (default 30s). This produced an indefinite loop until manual
intervention.

The hasActiveExecutionPath check earlier in the same branch already
guarantees we only reach this guard when the issue is still stranded,
so any terminal status of the recovery retry — including succeeded —
should trigger escalation to blocked.

Rename didAutomaticRecoveryFail to didAutomaticRecoveryExhaust to
reflect that succeeded retries are now also considered exhausted.
---
 .../heartbeat-process-recovery.test.ts  | 35 +++++++++----------
 server/src/services/heartbeat.ts        | 11 +++---
 server/src/services/recovery/service.ts | 14 +++++---
 3 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/server/src/__tests__/heartbeat-process-recovery.test.ts b/server/src/__tests__/heartbeat-process-recovery.test.ts
index 8f870ac602c..f1798fcfb86 100644
--- a/server/src/__tests__/heartbeat-process-recovery.test.ts
+++ b/server/src/__tests__/heartbeat-process-recovery.test.ts
@@ -1486,8 +1486,8 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
     expect(wakeups).toHaveLength(1);
   });
 
-  it("re-enqueues continuation when the latest automatic continuation succeeded without closing the issue", async () => {
-    const { agentId, issueId, runId } = await seedStrandedIssueFixture({
+  it("escalates to blocked when the latest automatic continuation succeeded without closing the issue", async () => {
+    const { companyId, agentId, issueId, runId } = await seedStrandedIssueFixture({
       status: "in_progress",
       runStatus: "succeeded",
       retryReason: "issue_continuation_needed",
     });
 
     const heartbeat = heartbeatService(db);
     const result = await heartbeat.reconcileStrandedAssignedIssues();
-    expect(result.continuationRequeued).toBe(1);
-    expect(result.escalated).toBe(0);
+    expect(result.continuationRequeued).toBe(0);
+    expect(result.escalated).toBe(1);
     expect(result.issueIds).toEqual([issueId]);
 
     const issue = await db.select().from(issues).where(eq(issues.id, issueId)).then((rows) => rows[0] ?? null);
-    expect(issue?.status).toBe("in_progress");
-
-    const comments = await db.select().from(issueComments).where(eq(issueComments.issueId, issueId));
-    expect(comments).toHaveLength(0);
+    expect(issue?.status).toBe("blocked");
 
-    const runs = await db
-      .select()
-      .from(heartbeatRuns)
-      .where(eq(heartbeatRuns.agentId, agentId));
-    expect(runs).toHaveLength(2);
+    await expectStrandedRecoveryArtifacts({
+      companyId,
+      agentId,
+      issueId,
+      runId,
+      previousStatus: "in_progress",
+      retryReason: "issue_continuation_needed",
+    });
 
-    const retryRun = runs.find((row) => row.id !== runId);
-    expect(retryRun?.id).toBeTruthy();
-    expect((retryRun?.contextSnapshot as Record<string, unknown>)?.retryReason).toBe("issue_continuation_needed");
-    if (retryRun) {
-      await waitForRunToSettle(heartbeat, retryRun.id);
-    }
+    const comments = await db.select().from(issueComments).where(eq(issueComments.issueId, issueId));
+    expect(comments).toHaveLength(1);
+    expect(comments[0]?.body).toContain("retried continuation");
   });
 
   it("does not reconcile user-assigned work through the agent stranded-work recovery path", async () => {
diff --git a/server/src/services/heartbeat.ts b/server/src/services/heartbeat.ts
index 0d2c715f4da..356dc3a7bba 100644
--- a/server/src/services/heartbeat.ts
+++ b/server/src/services/heartbeat.ts
@@ -994,7 +994,7 @@ function summarizeRunFailureForIssueComment(
   return null;
 }
 
-function didAutomaticRecoveryFail(
+function didAutomaticRecoveryExhaust(
   latestRun: Pick<typeof heartbeatRuns.$inferSelect, "status" | "contextSnapshot"> | null,
   expectedRetryReason: "assignment_recovery" | "issue_continuation_needed",
 ) {
@@ -1002,10 +1002,13 @@
 
   const latestContext = parseObject(latestRun.contextSnapshot);
   const latestRetryReason = readNonEmptyString(latestContext.retryReason);
+  // A succeeded recovery run is also considered exhausted: call sites verify there is no
+  // active execution path before reaching this check, so a run that exited successfully
+  // without re-establishing one left the issue stranded and should trigger escalation.
   return (
     latestRetryReason === expectedRetryReason &&
-    UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES.includes(
-      latestRun.status as (typeof UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES)[number],
+    HEARTBEAT_RUN_TERMINAL_STATUSES.includes(
+      latestRun.status as (typeof HEARTBEAT_RUN_TERMINAL_STATUSES)[number],
     )
   );
 }
@@ -6031,7 +6034,7 @@ export function heartbeatService(db: Db, options: HeartbeatServiceOptions = {})
   const shouldBlockImmediately =
     !recoveryAgentInvokable ||
     !recoveryAgent ||
-    didAutomaticRecoveryFail(run, issue.status === "todo" ? "assignment_recovery" : "issue_continuation_needed");
+    didAutomaticRecoveryExhaust(run, issue.status === "todo" ? "assignment_recovery" : "issue_continuation_needed");
   if (shouldBlockImmediately) {
     const comment = buildImmediateExecutionPathRecoveryComment({
       status: issue.status as "todo" | "in_progress",
diff --git a/server/src/services/recovery/service.ts b/server/src/services/recovery/service.ts
index f20dd05915f..a8914b71113 100644
--- a/server/src/services/recovery/service.ts
+++ b/server/src/services/recovery/service.ts
@@ -35,6 +35,7 @@ import { isAutomaticRecoverySuppressedByPauseHold } from "./pause-hold-guard.js"
 
 const EXECUTION_PATH_HEARTBEAT_RUN_STATUSES = ["queued", "running", "scheduled_retry"] as const;
 const UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES = ["failed", "cancelled", "timed_out"] as const;
+const HEARTBEAT_RUN_TERMINAL_STATUSES = ["succeeded", "failed", "cancelled", "timed_out"] as const;
 const ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_MIN_STALE_MS = 24 * 60 * 60 * 1000;
 export const ACTIVE_RUN_OUTPUT_SUSPICION_THRESHOLD_MS = 60 * 60 * 1000;
 export const ACTIVE_RUN_OUTPUT_CRITICAL_THRESHOLD_MS = 4 * 60 * 60 * 1000;
@@ -111,7 +112,7 @@ function summarizeRunFailureForIssueComment(run: LatestIssueRun) {
   return null;
 }
 
-function didAutomaticRecoveryFail(
+function didAutomaticRecoveryExhaust(
   latestRun: LatestIssueRun,
   expectedRetryReason: "assignment_recovery" | "issue_continuation_needed",
 ) {
@@ -119,9 +120,12 @@
 
   const latestContext = parseObject(latestRun.contextSnapshot);
   const latestRetryReason = readNonEmptyString(latestContext.retryReason);
+  // A succeeded recovery run is also considered exhausted: call sites verify there is no
+  // active execution path before reaching this check, so a run that exited successfully
+  // without re-establishing one left the issue stranded and should trigger escalation.
   return latestRetryReason === expectedRetryReason &&
-    UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES.includes(
-      latestRun.status as (typeof UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES)[number],
+    HEARTBEAT_RUN_TERMINAL_STATUSES.includes(
+      latestRun.status as (typeof HEARTBEAT_RUN_TERMINAL_STATUSES)[number],
     );
 }
 
@@ -1454,7 +1458,7 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
       continue;
     }
 
-    if (didAutomaticRecoveryFail(latestRun, "assignment_recovery")) {
+    if (didAutomaticRecoveryExhaust(latestRun, "assignment_recovery")) {
       const failureSummary = summarizeRunFailureForIssueComment(latestRun);
       const updated = await escalateStrandedAssignedIssue({
         issue,
@@ -1495,7 +1499,7 @@
       result.skipped += 1;
       continue;
     }
-    if (didAutomaticRecoveryFail(latestRun, "issue_continuation_needed")) {
+    if (didAutomaticRecoveryExhaust(latestRun, "issue_continuation_needed")) {
      const failureSummary = summarizeRunFailureForIssueComment(latestRun);
      const updated = await escalateStrandedAssignedIssue({
        issue,

From 14604ce8a156d9f433ed55f6e4a1e64dfe2f291f Mon Sep 17 00:00:00 2001
From: Jon Mischo
Date: Sat, 25 Apr 2026 03:19:34 -0700
Subject: [PATCH 2/5] refactor(heartbeat): consolidate
 didAutomaticRecoveryExhaust into recovery service

Remove the parallel copy of didAutomaticRecoveryExhaust from
heartbeat.ts and export the canonical implementation from
recovery/service.ts. The execution-path recovery caller in heartbeat.ts
now imports it directly, completing the consolidation started when
reconcileStrandedAssignedIssues was moved into recovery/service.ts.
---
 server/src/services/heartbeat.ts        | 21 +--------------------
 server/src/services/recovery/service.ts |  2 +-
 2 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/server/src/services/heartbeat.ts b/server/src/services/heartbeat.ts
index 356dc3a7bba..cf7035511fb 100644
--- a/server/src/services/heartbeat.ts
+++ b/server/src/services/heartbeat.ts
@@ -107,7 +107,7 @@ import {
   readContinuationAttempt,
 } from "./recovery/index.js";
 import { isAutomaticRecoverySuppressedByPauseHold } from "./recovery/pause-hold-guard.js";
-import { recoveryService } from "./recovery/service.js";
+import { recoveryService, didAutomaticRecoveryExhaust } from "./recovery/service.js";
 import { withAgentStartLock } from "./agent-start-lock.js";
 import { redactCurrentUserText, redactCurrentUserValue } from "../log-redaction.js";
 import {
@@ -994,25 +994,6 @@ function summarizeRunFailureForIssueComment(
   return null;
 }
 
-function didAutomaticRecoveryExhaust(
-  latestRun: Pick<typeof heartbeatRuns.$inferSelect, "status" | "contextSnapshot"> | null,
-  expectedRetryReason: "assignment_recovery" | "issue_continuation_needed",
-) {
-  if (!latestRun) return false;
-
-  const latestContext = parseObject(latestRun.contextSnapshot);
-  const latestRetryReason = readNonEmptyString(latestContext.retryReason);
-  // A succeeded recovery run is also considered exhausted: call sites verify there is no
-  // active execution path before reaching this check, so a run that exited successfully
-  // without re-establishing one left the issue stranded and should trigger escalation.
-  return (
-    latestRetryReason === expectedRetryReason &&
-    HEARTBEAT_RUN_TERMINAL_STATUSES.includes(
-      latestRun.status as (typeof HEARTBEAT_RUN_TERMINAL_STATUSES)[number],
-    )
-  );
-}
-
 function normalizeLedgerBillingType(value: unknown): BillingType {
   const raw = readNonEmptyString(value);
   switch (raw) {
diff --git a/server/src/services/recovery/service.ts b/server/src/services/recovery/service.ts
index a8914b71113..54145b5211f 100644
--- a/server/src/services/recovery/service.ts
+++ b/server/src/services/recovery/service.ts
@@ -112,7 +112,7 @@ function summarizeRunFailureForIssueComment(run: LatestIssueRun) {
   return null;
 }
 
-function didAutomaticRecoveryExhaust(
+export function didAutomaticRecoveryExhaust(
   latestRun: LatestIssueRun,
   expectedRetryReason: "assignment_recovery" | "issue_continuation_needed",
 ) {

From 98c7d2d72e7438636173e771037dea7343db6896 Mon Sep 17 00:00:00 2001
From: Jon Mischo
Date: Sat, 25 Apr 2026 03:35:17 -0700
Subject: [PATCH 3/5] refactor(heartbeat): remove unused
 UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES constant
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The constant was made redundant when callers were updated to use the
broader HEARTBEAT_RUN_TERMINAL_STATUSES set (which includes
'succeeded'). Remove both copies — the one in recovery/service.ts and
the parallel copy in heartbeat.ts.

Self-reviewed by gpt-5.4-mini high; 0 findings.
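
For reference, the removed failure-only set next to the superset that
replaced it (annotations added here for the log; they are not in the
source):

    const UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES = ["failed", "cancelled", "timed_out"] as const; // removed
    const HEARTBEAT_RUN_TERMINAL_STATUSES = ["succeeded", "failed", "cancelled", "timed_out"] as const; // kept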
---
 server/src/services/heartbeat.ts        | 1 -
 server/src/services/recovery/service.ts | 1 -
 2 files changed, 2 deletions(-)

diff --git a/server/src/services/heartbeat.ts b/server/src/services/heartbeat.ts
index cf7035511fb..418f4f5c6d9 100644
--- a/server/src/services/heartbeat.ts
+++ b/server/src/services/heartbeat.ts
@@ -151,7 +151,6 @@ const execFile = promisify(execFileCallback);
 const EXECUTION_PATH_HEARTBEAT_RUN_STATUSES = ["queued", "running", "scheduled_retry"] as const;
 const CANCELLABLE_HEARTBEAT_RUN_STATUSES = ["queued", "running", "scheduled_retry"] as const;
 const HEARTBEAT_RUN_TERMINAL_STATUSES = ["succeeded", "failed", "cancelled", "timed_out"] as const;
-const UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES = ["failed", "cancelled", "timed_out"] as const;
 export {
   ACTIVE_RUN_OUTPUT_CONTINUE_REARM_MS,
   ACTIVE_RUN_OUTPUT_CRITICAL_THRESHOLD_MS,
diff --git a/server/src/services/recovery/service.ts b/server/src/services/recovery/service.ts
index 54145b5211f..04bab224e7e 100644
--- a/server/src/services/recovery/service.ts
+++ b/server/src/services/recovery/service.ts
@@ -34,7 +34,6 @@ import {
 import { isAutomaticRecoverySuppressedByPauseHold } from "./pause-hold-guard.js";
 
 const EXECUTION_PATH_HEARTBEAT_RUN_STATUSES = ["queued", "running", "scheduled_retry"] as const;
-const UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES = ["failed", "cancelled", "timed_out"] as const;
 const HEARTBEAT_RUN_TERMINAL_STATUSES = ["succeeded", "failed", "cancelled", "timed_out"] as const;
 const ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_MIN_STALE_MS = 24 * 60 * 60 * 1000;
 export const ACTIVE_RUN_OUTPUT_SUSPICION_THRESHOLD_MS = 60 * 60 * 1000;
 export const ACTIVE_RUN_OUTPUT_CRITICAL_THRESHOLD_MS = 4 * 60 * 60 * 1000;

From e1331243b291b6b2400810f35c4eec535a6ec6c6 Mon Sep 17 00:00:00 2001
From: Jon Mischo
Date: Mon, 27 Apr 2026 18:45:39 -0700
Subject: [PATCH 4/5] chore(recovery): drop heartbeat cleanup from stranded
 escalation PR

Earlier patches in this series also rewrote the immediate
execution-path recovery guard in heartbeat.ts: patch 1 widened its
terminal-status check and patches 2 and 3 consolidated the helper into
recovery/service.ts. Revert those hunks so this PR only changes the
reconciler path in recovery/service.ts. heartbeat.ts keeps its local
didAutomaticRecoveryFail with the failure-only
UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES set; the consolidation can
land separately.
---
 server/src/services/heartbeat.ts | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/server/src/services/heartbeat.ts b/server/src/services/heartbeat.ts
index 3a0dc242102..653b03c2478 100644
--- a/server/src/services/heartbeat.ts
+++ b/server/src/services/heartbeat.ts
@@ -109,7 +109,7 @@ import {
   readContinuationAttempt,
 } from "./recovery/index.js";
 import { isAutomaticRecoverySuppressedByPauseHold } from "./recovery/pause-hold-guard.js";
-import { recoveryService, didAutomaticRecoveryExhaust } from "./recovery/service.js";
+import { recoveryService } from "./recovery/service.js";
 import { withAgentStartLock } from "./agent-start-lock.js";
 import { redactCurrentUserText, redactCurrentUserValue } from "../log-redaction.js";
 import {
@@ -154,6 +154,7 @@ const execFile = promisify(execFileCallback);
 const EXECUTION_PATH_HEARTBEAT_RUN_STATUSES = ["queued", "running", "scheduled_retry"] as const;
 const CANCELLABLE_HEARTBEAT_RUN_STATUSES = ["queued", "running", "scheduled_retry"] as const;
 const HEARTBEAT_RUN_TERMINAL_STATUSES = ["succeeded", "failed", "cancelled", "timed_out"] as const;
+const UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES = ["failed", "cancelled", "timed_out"] as const;
 export {
   ACTIVE_RUN_OUTPUT_CONTINUE_REARM_MS,
   ACTIVE_RUN_OUTPUT_CRITICAL_THRESHOLD_MS,
@@ -996,6 +997,22 @@ function summarizeRunFailureForIssueComment(
   return null;
 }
 
+function didAutomaticRecoveryFail(
+  latestRun: Pick<typeof heartbeatRuns.$inferSelect, "status" | "contextSnapshot"> | null,
+  expectedRetryReason: "assignment_recovery" | "issue_continuation_needed",
+) {
+  if (!latestRun) return false;
+
+  const latestContext = parseObject(latestRun.contextSnapshot);
+  const latestRetryReason = readNonEmptyString(latestContext.retryReason);
+  return (
+    latestRetryReason === expectedRetryReason &&
+    UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES.includes(
+      latestRun.status as (typeof UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES)[number],
+    )
+  );
+}
+
 function normalizeLedgerBillingType(value: unknown): BillingType {
   const raw = readNonEmptyString(value);
   switch (raw) {
@@ -6181,7 +6198,7 @@ export function heartbeatService(db: Db, options: HeartbeatServiceOptions = {})
     issue.originKind === RECOVERY_ORIGIN_KINDS.strandedIssueRecovery ||
     !recoveryAgentInvokable ||
     !recoveryAgent ||
-    didAutomaticRecoveryExhaust(run, issue.status === "todo" ? "assignment_recovery" : "issue_continuation_needed");
+    didAutomaticRecoveryFail(run, issue.status === "todo" ? "assignment_recovery" : "issue_continuation_needed");
   if (shouldBlockImmediately) {
     const comment = buildImmediateExecutionPathRecoveryComment({
       status: issue.status as "todo" | "in_progress",

From f685b8502afefe97688401653fef48c7c976e967 Mon Sep 17 00:00:00 2001
From: Jon Mischo
Date: Mon, 27 Apr 2026 19:06:23 -0700
Subject: [PATCH 5/5] fix(recovery): let continuation caps handle successful
 retries

For issue_continuation_needed, stop treating a succeeded retry as
exhausted: a recovery run that exits successfully no longer escalates
the issue to blocked on the next reconciler tick. Repeated successful
continuations are bounded by the existing continuation attempt caps
instead. assignment_recovery keeps the stricter gate: any terminal
status of the latest retry, including succeeded, still escalates when
no execution path was re-established. The guard comment is updated to
match.
---
 server/src/services/recovery/service.ts | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/server/src/services/recovery/service.ts b/server/src/services/recovery/service.ts
index c2df3475c88..4e453684f43 100644
--- a/server/src/services/recovery/service.ts
+++ b/server/src/services/recovery/service.ts
@@ -118,6 +118,12 @@ export function didAutomaticRecoveryExhaust(
 
   const latestContext = parseObject(latestRun.contextSnapshot);
   const latestRetryReason = readNonEmptyString(latestContext.retryReason);
-  // A succeeded recovery run is also considered exhausted: call sites verify there is no
-  // active execution path before reaching this check, so a run that exited successfully
-  // without re-establishing one left the issue stranded and should trigger escalation.
+  if (expectedRetryReason === "issue_continuation_needed" && latestRun.status === "succeeded") {
+    return false;
+  }
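+  // A succeeded issue_continuation_needed retry is deliberately not exhausted here;
+  // repeated successful continuations are bounded by the continuation attempt caps.
+  // For assignment_recovery, a succeeded run still counts as exhausted: call sites
+  // verify there is no active execution path before reaching this check, so a run that
+  // exited successfully without re-establishing one left the issue stranded and should
+  // trigger escalation.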