Skip to content

Commit 3d8574e

Browse files
Addono and Copilot committed
fix: harden ralph evaluation json parsing
Improve fitness evaluation reliability by extracting the first valid score payload from mixed model output, including fenced JSON and malformed-leading-object cases. Add focused unit tests for parsing edge cases and wire the helper into evaluateFitness to reduce fallback aggregate=0 outcomes. Update IMPLEMENTATION_PLAN.md with the completed resilience task and validation notes. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 6e004e0 commit 3d8574e

4 files changed

Lines changed: 149 additions & 10 deletions

File tree

IMPLEMENTATION_PLAN.md

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,19 @@ This plan lists prioritized tasks required to bring the implementation into full
236236
- **Dependencies:** Task 16
237237
- **Notes:**
238238
- Targets the regression observed at iteration 25 where evaluation timed out and fallback scoring forced `aggregate=0`.
239-
- Expanded timeout detection to inspect string errors, `Error` instances, and nested `cause` chains used by SDK-wrapped errors.
240-
- Leaves the existing retry behavior unchanged while reducing false negatives in timeout detection.
239+
- Expanded timeout detection to inspect string errors, `Error` instances, and nested `cause` chains used by SDK-wrapped errors.
240+
- Leaves the existing retry behavior unchanged while reducing false negatives in timeout detection.
241+
- Validation run after this change: `npm run typecheck`, `npm run lint`, `npm test`, and `npm audit --production` all pass; audit reports 0 vulnerabilities.
242+
243+
## 19. Ralph Loop Evaluation JSON Extraction Resilience
244+
- **Task:** Harden fitness-evaluation response parsing so valid scoring JSON is recovered from mixed prose/code-fence outputs instead of triggering fallback aggregate scoring. **[COMPLETE]**
245+
- **Spec:** Ralph-loop/spec.md (Evaluation JSON schema, Fitness evaluation process), Logging/spec.md (score trajectory continuity)
246+
- **Files:** src/ralph/evaluation.ts, ralph-loop.ts
247+
- **Tests:** test/unit/ralph/evaluation.test.ts
248+
- **Dependencies:** Task 18
249+
- **Notes:**
250+
- Targets the score-regression pattern where evaluation responses may include extra wrapper text and cause JSON parse misses that force fallback scores (`aggregate=0`).
251+
- Added `extractFitnessJsonPayload()` with balanced-brace scanning to find the first valid JSON object containing required fitness score fields, including content embedded in markdown code fences.
252+
- Updated `evaluateFitness()` in `ralph-loop.ts` to use the new helper, preserving existing score clamping and checklist normalization.
253+
- Added unit coverage for plain JSON, fenced JSON with surrounding text, malformed-leading-object recovery, and null return when no valid payload exists.
241254
- Validation run after this change: `npm run typecheck`, `npm run lint`, `npm test`, and `npm audit --production` all pass; audit reports 0 vulnerabilities.

ralph-loop.ts

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import {
99
type SessionEvent,
1010
} from "@github/copilot-sdk";
1111
import {
12+
extractFitnessJsonPayload,
1213
isSessionIdleTimeoutError,
1314
resolveEvaluationTimeoutMs,
1415
} from "./src/ralph/evaluation.ts";
@@ -371,15 +372,10 @@ Respond with ONLY a valid JSON object — no markdown, no code fences, no extra
371372
evaluationTimeoutMs,
372373
);
373374

374-
// Strip optional markdown code fences then extract outermost JSON object
375375
const raw = response?.data?.content ?? "";
376-
const stripped = raw
377-
.replace(/^```(?:json)?\s*/im, "")
378-
.replace(/```\s*$/im, "")
379-
.trim();
380-
const jsonMatch = stripped.match(/\{[\s\S]*\}/);
381-
if (jsonMatch) {
382-
const parsed = JSON.parse(jsonMatch[0]) as Partial<FitnessScores>;
376+
const parsedPayload = extractFitnessJsonPayload(raw);
377+
if (parsedPayload) {
378+
const parsed = parsedPayload as Partial<FitnessScores>;
383379
const clamp = (n: unknown): number =>
384380
Math.min(100, Math.max(0, Math.round(Number(n) || 0)));
385381
return {

src/ralph/evaluation.ts

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,96 @@ export function isSessionIdleTimeoutError(error: unknown): boolean {
2727
);
2828
}
2929

30+
/** Score fields that must all be present for an object to count as a fitness payload. */
const FITNESS_SCORE_KEYS = [
  "specCompliance",
  "testCoverage",
  "codeQuality",
  "buildHealth",
  "aggregate",
] as const;

/**
 * Extract and parse a fitness-score JSON object from model output.
 *
 * The raw text is scanned first, then the body of every markdown code fence.
 * Surrounding prose, malformed objects and objects lacking the score fields
 * are skipped.
 *
 * Selection order:
 * 1. The first object whose score fields are all finite numbers or numeric
 *    strings.
 * 2. Otherwise, the first object that merely contains every score field.
 *
 * Step 1 exists so that an echoed schema/template object with placeholder
 * values (for example `"aggregate": "<0-100>"`) does not shadow a real payload
 * that appears later in the same response.
 *
 * @param content - Raw model response text; may mix prose, code fences and JSON.
 * @returns The selected payload, or `null` when no parsed object contains all
 *   score fields.
 */
export function extractFitnessJsonPayload(
  content: string,
): Record<string, unknown> | null {
  let keyCompleteFallback: Record<string, unknown> | null = null;
  for (const candidate of [content, ...extractFencedBlocks(content)]) {
    for (const payload of getFitnessObjects(candidate)) {
      if (hasUsableScores(payload)) return payload;
      // Remember the earliest key-complete object so callers still receive
      // something when no payload carries usable numeric scores.
      if (keyCompleteFallback === null) keyCompleteFallback = payload;
    }
  }
  return keyCompleteFallback;
}

/**
 * Collect the trimmed, non-empty bodies of all triple-backtick fences.
 * An optional `json` language tag directly after the opening fence is dropped.
 */
function extractFencedBlocks(content: string): string[] {
  const blocks: string[] = [];
  const fenceRegex = /```(?:json)?\s*([\s\S]*?)```/gi;
  let match = fenceRegex.exec(content);
  while (match) {
    const body = match[1]?.trim();
    if (body) blocks.push(body);
    match = fenceRegex.exec(content);
  }
  return blocks;
}

/**
 * Yield, in order of appearance, every brace-balanced slice of `text` that
 * parses as JSON and contains all fitness score fields.
 */
function* getFitnessObjects(
  text: string,
): Generator<Record<string, unknown>> {
  for (const jsonSlice of getJsonObjectSlices(text)) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(jsonSlice);
    } catch {
      // Not valid JSON; keep scanning for later valid objects.
      continue;
    }
    if (isFitnessPayload(parsed)) yield parsed;
  }
}

/**
 * Yield a brace-balanced slice for each `{` in `text`, ordered by start index.
 *
 * Braces inside double-quoted strings (honouring backslash escapes) are not
 * counted. Slices are only balanced, not guaranteed to be valid JSON. Because
 * every `{` is tried as a start, nested objects are yielded after the object
 * that encloses them. A `{` that is never closed yields nothing.
 */
function* getJsonObjectSlices(text: string): Generator<string> {
  for (
    let start = text.indexOf("{");
    start !== -1;
    start = text.indexOf("{", start + 1)
  ) {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = start; i < text.length; i++) {
      const char = text[i];
      if (!char) continue;
      if (inString) {
        if (escaped) {
          // The character after a backslash is consumed verbatim.
          escaped = false;
        } else if (char === "\\") {
          escaped = true;
        } else if (char === "\"") {
          inString = false;
        }
        continue;
      }
      if (char === "\"") {
        inString = true;
        continue;
      }
      if (char === "{") {
        depth++;
      } else if (char === "}") {
        depth--;
        if (depth === 0) {
          yield text.slice(start, i + 1);
          break;
        }
      }
    }
  }
}

/** Type guard: a non-null object that has every fitness score key. */
function isFitnessPayload(value: unknown): value is Record<string, unknown> {
  if (typeof value !== "object" || value === null) return false;
  return FITNESS_SCORE_KEYS.every((key) => key in value);
}

/** True when every score field holds a value that converts to a finite number. */
function hasUsableScores(payload: Record<string, unknown>): boolean {
  return FITNESS_SCORE_KEYS.every((key) => isUsableScore(payload[key]));
}

/**
 * A score is usable when it is a finite number or a non-blank string that
 * `Number()` converts to a finite number.
 */
function isUsableScore(value: unknown): boolean {
  if (typeof value === "number") return Number.isFinite(value);
  if (typeof value === "string") {
    // Blank strings convert to 0 via Number(), so reject them explicitly.
    return value.trim() !== "" && Number.isFinite(Number(value));
  }
  return false;
}
119+
30120
function collectErrorMessages(error: unknown, depth = 0): string[] {
31121
if (depth > 4 || error === null || error === undefined) return [];
32122

test/unit/ralph/evaluation.test.ts

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { describe, expect, it } from "vitest";
22
import {
3+
extractFitnessJsonPayload,
34
isSessionIdleTimeoutError,
45
resolveEvaluationTimeoutMs,
56
} from "../../../src/ralph/evaluation";
@@ -46,3 +47,42 @@ describe("isSessionIdleTimeoutError", () => {
4647
expect(isSessionIdleTimeoutError(new Error("Network failure"))).toBe(false);
4748
});
4849
});
50+
51+
// Unit tests for extractFitnessJsonPayload: plain JSON, fenced JSON with
// surrounding prose, recovery after unusable objects, and the no-payload case.
describe("extractFitnessJsonPayload", () => {
52+
// The whole input is a single JSON object carrying every score field.
it("parses plain JSON payloads", () => {
53+
const raw = JSON.stringify({
54+
specCompliance: 80,
55+
testCoverage: 85,
56+
codeQuality: 90,
57+
buildHealth: 95,
58+
aggregate: 87,
59+
notes: "ok",
60+
checklist: [],
61+
});
62+
expect(extractFitnessJsonPayload(raw)?.aggregate).toBe(87);
63+
});
64+
65+
// The payload sits inside a ```json fence between two prose lines.
it("extracts JSON from fenced blocks with surrounding text", () => {
66+
const raw = [
67+
"Here are your scores:",
68+
"```json",
69+
'{"specCompliance":70,"testCoverage":60,"codeQuality":65,"buildHealth":75,"aggregate":68,"notes":"x","checklist":[]}',
70+
"```",
71+
"Done.",
72+
].join("\n");
73+
expect(extractFitnessJsonPayload(raw)?.specCompliance).toBe(70);
74+
});
75+
76+
// Line 1 holds an object without the score keys, line 2 is not parseable
// JSON, and line 3 is the complete payload that must be returned.
it("skips malformed JSON objects and finds the next valid payload", () => {
77+
const raw = [
78+
'noise {"not":"fitness"}',
79+
'{"specCompliance": bad-json }',
80+
'{"specCompliance":88,"testCoverage":89,"codeQuality":90,"buildHealth":91,"aggregate":90,"notes":"good","checklist":[]}',
81+
].join("\n");
82+
expect(extractFitnessJsonPayload(raw)?.buildHealth).toBe(91);
83+
});
84+
85+
// Valid JSON that lacks the score fields yields null.
it("returns null when no valid fitness payload is present", () => {
86+
expect(extractFitnessJsonPayload('{"hello":"world"}')).toBeNull();
87+
});
88+
});

0 commit comments

Comments
 (0)