Skip to content

Commit 176759d

Browse files
authored
fix(init): recover run-level stale resume errors (#1025)
## Summary Handles Mastra's run-level \"was not suspended\" response as stale-resume recovery instead of surfacing the raw 500 to users. The CLI now polls run state briefly, which covers the small window where the resume was processed but D1/runById has not settled yet. ## Changes - Treat both step-level and run-level not-suspended responses as stale resume cases. - Retry run-state recovery with a short backoff before giving up. - Add coverage for run-level recovery and transient runById failures. ## Test Plan - `pnpm exec vitest run test/lib/init/wizard-runner.test.ts` - `pnpm exec tsc --noEmit --pretty false` - `git diff --check`
1 parent 1bec084 commit 176759d

4 files changed

Lines changed: 74 additions & 29 deletions

File tree

src/lib/init/tools/run-commands.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ export async function runCommands(
5454
message: `Command failed: ${command.original}`,
5555
data: {
5656
exitCode: result.exitCode,
57+
stdout: result.stdout.slice(0, 500),
5758
stderr: result.stderr.slice(0, 500),
5859
cwd: payload.cwd,
5960
},

src/lib/init/wizard-runner.ts

Lines changed: 36 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,8 @@ async function preamble(
437437

438438
const MAX_RESUME_RETRIES = 3;
439439
const RETRY_BACKOFF_MS = [2000, 4000, 8000];
440+
const RUN_STATE_RECOVERY_BACKOFF_MS = [0, 250, 750, 1500];
441+
const RUN_STATE_RECOVERY_TIMEOUT_MS = 10_000;
440442

441443
type ResumeRetryArgs = {
442444
run: {
@@ -454,11 +456,13 @@ type ResumeRetryArgs = {
454456
};
455457

456458
/**
457-
* Detect Mastra's "step not suspended" 500 — means the server already
459+
* Detect Mastra's "not suspended" 500 — means the server already
458460
* processed this step (our previous request succeeded but the response was
459461
* dropped before we received it). The MastraClientError message embeds the
460462
* server body, e.g.:
461463
* "HTTP error! status: 500 - {"error":"This workflow step 'X' was not suspended..."}"
464+
* or:
465+
* "HTTP error! status: 500 - {"error":"This workflow run was not suspended"}"
462466
*/
463467
function isStepAlreadyAdvancedError(err: unknown): boolean {
464468
return err instanceof Error && err.message.includes("was not suspended");
@@ -473,31 +477,39 @@ async function tryRecoverCurrentRunState(
473477
workflow: ResumeRetryArgs["workflow"],
474478
runId: string
475479
): Promise<WorkflowRunResult | null> {
476-
try {
477-
const raw = await withTimeout(
478-
workflow.runById(runId, {
479-
fields: ["steps", "activeStepsPath", "result"],
480-
}),
481-
API_TIMEOUT_MS,
482-
"Run state recovery"
483-
);
484-
// runById returns activeStepsPath (Record<stepId, executionPath>) but
485-
// not suspended (string[][]). The main loop reads result.suspended to
486-
// find the active step; without it, stepId falls back to "unknown" and
487-
// extractSuspendPayload iterates all steps — picking the first with any
488-
// suspendPayload, which could be a completed step with stale D1 data.
489-
// Derive suspended from the activeStepsPath keys so the lookup is
490-
// deterministic: those keys are exactly the currently-active step IDs.
491-
const state = raw as Record<string, unknown>;
492-
if (!state.suspended && state.activeStepsPath) {
493-
state.suspended = Object.keys(
494-
state.activeStepsPath as Record<string, unknown>
495-
).map((id) => [id]);
480+
for (const delayMs of RUN_STATE_RECOVERY_BACKOFF_MS) {
481+
if (delayMs > 0) {
482+
await new Promise((resolve) => setTimeout(resolve, delayMs));
483+
}
484+
try {
485+
const raw = await withTimeout(
486+
workflow.runById(runId, {
487+
fields: ["steps", "activeStepsPath", "result"],
488+
}),
489+
RUN_STATE_RECOVERY_TIMEOUT_MS,
490+
"Run state recovery"
491+
);
492+
// runById returns activeStepsPath (Record<stepId, executionPath>) but
493+
// not suspended (string[][]). The main loop reads result.suspended to
494+
// find the active step; without it, stepId falls back to "unknown" and
495+
// extractSuspendPayload iterates all steps — picking the first with any
496+
// suspendPayload, which could be a completed step with stale D1 data.
497+
// Derive suspended from the activeStepsPath keys so the lookup is
498+
// deterministic: those keys are exactly the currently-active step IDs.
499+
const state = raw as Record<string, unknown>;
500+
if (!state.suspended && state.activeStepsPath) {
501+
state.suspended = Object.keys(
502+
state.activeStepsPath as Record<string, unknown>
503+
).map((id) => [id]);
504+
}
505+
return assertWorkflowResult(state);
506+
} catch {
507+
// Mastra/D1 can briefly return a not-yet-readable or intermediate run
508+
// state immediately after rejecting a stale resume. Poll a few times
509+
// before surfacing the original 500 to the user.
496510
}
497-
return assertWorkflowResult(state);
498-
} catch {
499-
return null;
500511
}
512+
return null;
501513
}
502514

503515
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: retry loop branches across transient errors, stale-step recovery, and backoff

test/lib/init/wizard-runner.test.ts

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -976,6 +976,13 @@ describe("runWizard — resumeWithRetry stale-step recovery", () => {
976976
);
977977
}
978978

979+
function staleRunError(): Error {
980+
return new Error(
981+
"HTTP error! status: 500 - " +
982+
JSON.stringify({ error: "This workflow run was not suspended" })
983+
);
984+
}
985+
979986
test("recovers when server has already advanced to the next step", async () => {
980987
mockStartResult = {
981988
status: "suspended",
@@ -1005,13 +1012,37 @@ describe("runWizard — resumeWithRetry stale-step recovery", () => {
10051012
expect(resumeCount).toBe(1);
10061013
});
10071014

1008-
test("throws immediately when stale-step error occurs and runById fails", async () => {
1015+
test("recovers from run-level not-suspended errors after transient runById failure", async () => {
1016+
mockStartResult = {
1017+
status: "suspended",
1018+
suspended: [["tool-step"]],
1019+
steps: { "tool-step": { suspendPayload: toolPayload } },
1020+
};
1021+
runByIdMock
1022+
.mockRejectedValueOnce(new Error("D1 snapshot not ready"))
1023+
.mockResolvedValueOnce({ status: "success" });
1024+
1025+
let resumeCount = 0;
1026+
makeStaleStepRun(() => {
1027+
resumeCount += 1;
1028+
return Promise.reject(staleRunError());
1029+
});
1030+
1031+
await runWizard(makeOptions());
1032+
1033+
expect(formatResultSpy).toHaveBeenCalled();
1034+
expect(runByIdMock).toHaveBeenCalledTimes(2);
1035+
expect(resumeCount).toBe(1);
1036+
});
1037+
1038+
test("throws when stale-step error occurs and runById keeps failing", async () => {
10091039
mockStartResult = {
10101040
status: "suspended",
10111041
suspended: [["tool-step"]],
10121042
steps: { "tool-step": { suspendPayload: toolPayload } },
10131043
};
1014-
// runById is unreachable — recovery fails, wizard throws without retrying.
1044+
// runById is unreachable — recovery fails, wizard throws without retrying
1045+
// the stale resume request.
10151046
mockRunByIdResult = new Error("runById network error");
10161047

10171048
let resumeCount = 0;
@@ -1022,9 +1053,9 @@ describe("runWizard — resumeWithRetry stale-step recovery", () => {
10221053

10231054
await expect(runWizard(makeOptions())).rejects.toThrow(WizardError);
10241055

1025-
// Threw immediately after recovery failed — no futile retries of the stale step.
1056+
// Threw after recovery polling failed — no futile retries of the stale step.
10261057
expect(resumeCount).toBe(1);
1027-
expect(runByIdMock).toHaveBeenCalledTimes(1);
1058+
expect(runByIdMock).toHaveBeenCalledTimes(4);
10281059
});
10291060
});
10301061

test/lib/run-commands.mocked.test.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ import type { RunCommandsPayload } from "../../src/lib/init/types.js";
1515
type Breadcrumb = {
1616
level: string;
1717
message: string;
18-
data: { exitCode: number; stderr: string; cwd: string };
18+
data: { exitCode: number; stdout: string; stderr: string; cwd: string };
1919
};
2020

2121
const { breadcrumbs } = vi.hoisted(() => ({
@@ -60,6 +60,7 @@ describe("runCommands breadcrumb on failure", () => {
6060
expect(crumb.level).toBe("error");
6161
expect(crumb.message).toContain("ls");
6262
expect(crumb.data.exitCode).not.toBe(0);
63+
expect(typeof crumb.data.stdout).toBe("string");
6364
expect(typeof crumb.data.stderr).toBe("string");
6465
expect(crumb.data.cwd).toBe("/tmp");
6566
});

0 commit comments

Comments
 (0)