fix: harden ralph evaluation json parsing

Addono · Copilot · Addono · commit 3d8574efd340 · 2026-02-28T13:18:43.000Z
Improve fitness evaluation reliability by extracting the first valid score payload from mixed model output, including fenced JSON and malformed-leading-object cases.

Add focused unit tests for parsing edge cases and wire the helper into evaluateFitness to reduce fallback aggregate=0 outcomes.

Update IMPLEMENTATION_PLAN.md with the completed resilience task and validation notes.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md
@@ -236,6 +236,19 @@ This plan lists prioritized tasks required to bring the implementation into full
   - **Dependencies:** Task 16
   - **Notes:**
     - Targets the regression observed at iteration 25 where evaluation timed out and fallback scoring forced `aggregate=0`.
-    - Expanded timeout detection to inspect string errors, `Error` instances, and nested `cause` chains used by SDK-wrapped errors.
-    - Keeps retry behavior behavior-safe while reducing false negatives in timeout detection.
+    - Expanded timeout detection to inspect string errors, `Error` instances, and nested `cause` chains used by SDK-wrapped errors.
+    - Keeps retry handling behavior-safe while reducing false negatives in timeout detection.
+    - Validation run after this change: `npm run typecheck`, `npm run lint`, `npm test`, and `npm audit --production` all pass; audit reports 0 vulnerabilities.
+
+## 19. Ralph Loop Evaluation JSON Extraction Resilience
+- **Task:** Harden fitness-evaluation response parsing so valid scoring JSON is recovered from mixed prose/code-fence outputs instead of triggering fallback aggregate scoring. **[COMPLETE]**
+  - **Spec:** Ralph-loop/spec.md (Evaluation JSON schema, Fitness evaluation process), Logging/spec.md (score trajectory continuity)
+  - **Files:** src/ralph/evaluation.ts, ralph-loop.ts
+  - **Tests:** test/unit/ralph/evaluation.test.ts
+  - **Dependencies:** Task 18
+  - **Notes:**
+    - Targets the score-regression pattern where evaluation responses may include extra wrapper text and cause JSON parse misses that force fallback scores (`aggregate=0`).
+    - Added `extractFitnessJsonPayload()` with balanced-brace scanning to find the first valid JSON object containing required fitness score fields, including content embedded in markdown code fences.
+    - Updated `evaluateFitness()` in `ralph-loop.ts` to use the new helper, preserving existing score clamping and checklist normalization.
+    - Added unit coverage for plain JSON, fenced JSON with surrounding text, malformed-leading-object recovery, and null return when no valid payload exists.
     - Validation run after this change: `npm run typecheck`, `npm run lint`, `npm test`, and `npm audit --production` all pass; audit reports 0 vulnerabilities.
diff --git a/ralph-loop.ts b/ralph-loop.ts
@@ -9,6 +9,7 @@ import {
   type SessionEvent,
 } from "@github/copilot-sdk";
 import {
+  extractFitnessJsonPayload,
   isSessionIdleTimeoutError,
   resolveEvaluationTimeoutMs,
 } from "./src/ralph/evaluation.ts";
@@ -371,15 +372,10 @@ Respond with ONLY a valid JSON object — no markdown, no code fences, no extra
         evaluationTimeoutMs,
       );
 
-      // Strip optional markdown code fences then extract outermost JSON object
       const raw = response?.data?.content ?? "";
-      const stripped = raw
-        .replace(/^```(?:json)?\s*/im, "")
-        .replace(/```\s*$/im, "")
-        .trim();
-      const jsonMatch = stripped.match(/\{[\s\S]*\}/);
-      if (jsonMatch) {
-        const parsed = JSON.parse(jsonMatch[0]) as Partial<FitnessScores>;
+      const parsedPayload = extractFitnessJsonPayload(raw);
+      if (parsedPayload) {
+        const parsed = parsedPayload as Partial<FitnessScores>;
         const clamp = (n: unknown): number =>
           Math.min(100, Math.max(0, Math.round(Number(n) || 0)));
         return {
diff --git a/src/ralph/evaluation.ts b/src/ralph/evaluation.ts
@@ -27,6 +27,96 @@ export function isSessionIdleTimeoutError(error: unknown): boolean {
   );
 }
 
+/**
+ * Extract and parse the first valid fitness-score JSON object from model output.
+ * This is resilient to surrounding prose/code fences and skips malformed objects.
+ */
+export function extractFitnessJsonPayload(
+  content: string,
+): Record<string, unknown> | null {
+  const candidates = [content, ...extractFencedBlocks(content)];
+  for (const candidate of candidates) {
+    const parsed = extractFirstValidFitnessObject(candidate);
+    if (parsed) return parsed;
+  }
+  return null;
+}
+
+function extractFencedBlocks(content: string): string[] {
+  const blocks: string[] = [];
+  const fenceRegex = /```(?:json)?\s*([\s\S]*?)```/gi;
+  let match = fenceRegex.exec(content);
+  while (match) {
+    const body = match[1]?.trim();
+    if (body) blocks.push(body);
+    match = fenceRegex.exec(content);
+  }
+  return blocks;
+}
+
+function extractFirstValidFitnessObject(
+  text: string,
+): Record<string, unknown> | null {
+  for (const jsonSlice of getJsonObjectSlices(text)) {
+    try {
+      const parsed = JSON.parse(jsonSlice);
+      if (isFitnessPayload(parsed)) return parsed;
+    } catch {
+      // Keep scanning for later valid objects.
+    }
+  }
+  return null;
+}
+
+function *getJsonObjectSlices(text: string): Generator<string> {
+  for (let start = 0; start < text.length; start++) {
+    if (text[start] !== "{") continue;
+    let depth = 0;
+    let inString = false;
+    let escaped = false;
+    for (let i = start; i < text.length; i++) {
+      const char = text[i];
+      if (!char) continue;
+      if (inString) {
+        if (escaped) {
+          escaped = false;
+          continue;
+        }
+        if (char === "\\") {
+          escaped = true;
+          continue;
+        }
+        if (char === "\"") inString = false;
+        continue;
+      }
+      if (char === "\"") {
+        inString = true;
+        continue;
+      }
+      if (char === "{") depth++;
+      if (char === "}") {
+        depth--;
+        if (depth === 0) {
+          yield text.slice(start, i + 1);
+          break;
+        }
+      }
+    }
+  }
+}
+
+function isFitnessPayload(value: unknown): value is Record<string, unknown> {
+  if (!value || typeof value !== "object") return false;
+  const raw = value as Record<string, unknown>;
+  return [
+    "specCompliance",
+    "testCoverage",
+    "codeQuality",
+    "buildHealth",
+    "aggregate",
+  ].every((key) => key in raw);
+}
+
 function collectErrorMessages(error: unknown, depth = 0): string[] {
   if (depth > 4 || error === null || error === undefined) return [];
 
diff --git a/test/unit/ralph/evaluation.test.ts b/test/unit/ralph/evaluation.test.ts
@@ -1,5 +1,6 @@
 import { describe, expect, it } from "vitest";
 import {
+  extractFitnessJsonPayload,
   isSessionIdleTimeoutError,
   resolveEvaluationTimeoutMs,
 } from "../../../src/ralph/evaluation";
@@ -46,3 +47,42 @@ describe("isSessionIdleTimeoutError", () => {
     expect(isSessionIdleTimeoutError(new Error("Network failure"))).toBe(false);
   });
 });
+
+describe("extractFitnessJsonPayload", () => {
+  it("parses plain JSON payloads", () => {
+    const raw = JSON.stringify({
+      specCompliance: 80,
+      testCoverage: 85,
+      codeQuality: 90,
+      buildHealth: 95,
+      aggregate: 87,
+      notes: "ok",
+      checklist: [],
+    });
+    expect(extractFitnessJsonPayload(raw)?.aggregate).toBe(87);
+  });
+
+  it("extracts JSON from fenced blocks with surrounding text", () => {
+    const raw = [
+      "Here are your scores:",
+      "```json",
+      '{"specCompliance":70,"testCoverage":60,"codeQuality":65,"buildHealth":75,"aggregate":68,"notes":"x","checklist":[]}',
+      "```",
+      "Done.",
+    ].join("\n");
+    expect(extractFitnessJsonPayload(raw)?.specCompliance).toBe(70);
+  });
+
+  it("skips malformed JSON objects and finds the next valid payload", () => {
+    const raw = [
+      'noise {"not":"fitness"}',
+      '{"specCompliance": bad-json }',
+      '{"specCompliance":88,"testCoverage":89,"codeQuality":90,"buildHealth":91,"aggregate":90,"notes":"good","checklist":[]}',
+    ].join("\n");
+    expect(extractFitnessJsonPayload(raw)?.buildHealth).toBe(91);
+  });
+
+  it("returns null when no valid fitness payload is present", () => {
+    expect(extractFitnessJsonPayload('{"hello":"world"}')).toBeNull();
+  });
+});