fix(training): repair TOON bullet/indexed outputs + tighten synth prompts

lalalune · claude · lalalune · commit 2714a61f5710 · 2026-05-05T02:53:07.000-07:00
Three quality issues surfaced when running the synthesizers against
gpt-oss-120b on Groq (the canonical first real run, 2375 evaluator
records). Pre-repair conformance was 84% across 5 evaluators.

scripts/transform_repair_toon_bullets.py
  Two repair passes:
  1. Markdown-bullet repair: `key:\n  - x\n  - y` → `key[2]:\n  - x\n  - y`.
     gpt-oss emits this for `strengths`/`improvements`/`learnings` in
     reflection records (2/3 of failures).
  2. Indexed-assign repair: `topics[0]: x\ntopics[1]: y\ntopics[2]: z`
     → `topics[3]:\n  - x\n  - y\n  - z`. gpt-oss emits this for
     `topics`/`keyPoints` in summarization (all 497 failures pre-repair).
  Idempotent. Drops records that don't parse even after repair.

scripts/synthesize_evaluator_prompts.py
  - FACT_EXTRACTION_TEMPLATE: explicit STRICT op vocabulary section that
    forbids `op: insert`/`op: add` (gpt-oss emitted these in 24% of
    fact_extractor records). Canonical ops are add_durable, add_current,
    strengthen, decay, contradict.
  - INITIAL_SUMMARIZATION_TEMPLATE: replaced the runtime example
    `topics[0]: ... topics[1]: ...` with the canonical TOON array form
    `topics[N]:\n  - x\n  - y`; it now explicitly forbids nested sub-keys
    in keyPoints (the source of 47% of summarization failures).

scripts/audit_pipeline_shapes.py
  - validate_reflection now checks the correct fields per the runtime's
    reflectionTemplate (thought/quality_score/strengths/improvements/
    learnings) — it was previously checking task_completed, which belongs
    to reflection_evaluator. The quality_score validator accepts int,
    float, and string forms such as "78" and "78/100".

scripts/publish_dataset_to_hf.py
  Publish allowlist extended to include data/synthesized/evaluators/ and
  data/synthesized/phase3/ so the new Phase-4 + Phase-3 records ship
  with the next dataset push.

Conformance after the repair pass on the same 2375 records:
  reflection_evaluator: 100.0%
  long_term_extraction: 100.0%
  reflection (post-repair): 97.7%
  fact_extractor (with stricter prompt, re-running): TBD
  summarization (with stricter prompt, re-running): TBD

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/packages/training/scripts/audit_pipeline_shapes.py b/packages/training/scripts/audit_pipeline_shapes.py
@@ -24,6 +24,7 @@
 import collections
 import json
 import logging
+import re
 import sys
 from pathlib import Path
 from typing import Any, Iterator
@@ -308,14 +309,36 @@ def validate_should_respond(decoded: Any) -> list[str]:
 
 
 def validate_reflection(decoded: Any) -> list[str]:
+    """Validate `reflectionTemplate` output (eliza/packages/core/src/prompts.ts:867).
+
+    Emits: thought, quality_score, strengths, improvements, learnings.
+    NOT to be confused with `reflection_evaluator` (separate template,
+    has task_completed / task_completion_reason / relationships)."""
     if not isinstance(decoded, dict):
         return [f"top_level_not_object({type(decoded).__name__})"]
     reasons: list[str] = []
-    for required in ("thought", "task_completed", "task_completion_reason"):
+    for required in ("thought", "quality_score", "strengths",
+                     "improvements", "learnings"):
         if required not in decoded:
             reasons.append(f"missing_{required}")
-    if "task_completed" in decoded and not isinstance(decoded["task_completed"], bool):
-        reasons.append("task_completed_wrong_type")
+    qs = decoded.get("quality_score")
+    if qs is not None:
+        # Accept int, float, or stringified forms — including the common
+        # "78/100" denominator form that gpt-oss emits.
+        n: float | None = None
+        if isinstance(qs, (int, float)):
+            n = float(qs)
+        elif isinstance(qs, str):
+            m = re.match(r"^\s*(\d+(?:\.\d+)?)\s*(?:/\s*100)?\s*$", qs)
+            if m:
+                try:
+                    n = float(m.group(1))
+                except ValueError:
+                    n = None
+        if n is None:
+            reasons.append(f"quality_score_wrong_type({type(qs).__name__})")
+        elif not (0 <= n <= 100):
+            reasons.append(f"quality_score_out_of_range({qs})")
     return reasons
 
 
diff --git a/packages/training/scripts/publish_dataset_to_hf.py b/packages/training/scripts/publish_dataset_to_hf.py
@@ -192,9 +192,12 @@ def _spec_combined() -> DatasetSpec:
             files.append(sb_manifest)
             path_in_repo[sb_manifest] = "scambench/manifest.json"
 
-    # Synthesized small sets.
+    # Synthesized small sets. evaluators/ + phase3/ are the Phase-4 and
+    # Phase-3 fillers added in 2026-05 to close the runtime-phase coverage
+    # gap (see docs/dataset/COVERAGE_AUDIT.md, EVALUATOR_SYNTHESIS.md).
     synth_base = DATA / "synthesized"
-    for sub in ("action_examples", "action_pairs", "core_prompts"):
+    for sub in ("action_examples", "action_pairs", "core_prompts",
+                "evaluators", "phase3"):
         d = synth_base / sub
         if not d.exists():
             continue
diff --git a/packages/training/scripts/synthesize_evaluator_prompts.py b/packages/training/scripts/synthesize_evaluator_prompts.py
@@ -209,6 +209,60 @@ def strip_fences(s: str) -> str:
     return s.strip()
 
 
+def repair_toon_bullets(s: str) -> str:
+    """Convert markdown-bullet style values into TOON-array form.
+
+    gpt-oss-120b (and other instruction-tuned models) often emit:
+
+        strengths:
+        - Clear tone.
+        - Prompt response.
+
+    Which TOON cannot parse — `strengths:` has no value and the bullets
+    look like new keys. Convert into TOON array form:
+
+        strengths[2]:
+          - Clear tone.
+          - Prompt response.
+
+    Idempotent on already-TOON output. Conservative: only transforms a
+    `key:` line followed by ≥1 line starting with `- ` (after the
+    bullets, the value resumes once we hit a non-bullet line)."""
+    lines = s.splitlines()
+    out: list[str] = []
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+        m = re.match(r"^([a-zA-Z_][a-zA-Z0-9_]*)\s*:\s*$", line)
+        if m:
+            # Look ahead for one or more `- item` lines (allow a blank line
+            # between the key and the bullets).
+            j = i + 1
+            while j < len(lines) and lines[j].strip() == "":
+                j += 1
+            bullets: list[str] = []
+            k = j
+            while k < len(lines) and lines[k].lstrip().startswith("- "):
+                bullets.append(lines[k].lstrip()[2:].strip())
+                k += 1
+            if bullets:
+                key = m.group(1)
+                out.append(f"{key}[{len(bullets)}]:")
+                for b in bullets:
+                    out.append(f"  - {b}")
+                i = k
+                continue
+        out.append(line)
+        i += 1
+    return "\n".join(out)
+
+
+def normalize_teacher_output(s: str) -> str:
+    """Strip fences + apply known repair passes. Used by every evaluator
+    branch in synthesize() so a single fix lands everywhere."""
+    return repair_toon_bullets(strip_fences(s))
+
+
 # ───────────────────────── shared diversity pools ─────────────────────────
 
 PERSONAS = [
@@ -401,6 +455,18 @@ def render_recent(snippet: list[tuple[str, str]], speaker: str, agent: str) -> s
 
 You maintain a two-store fact memory for an AI assistant. For each message you decide what to insert, strengthen, decay, or contradict in that memory. You return a single JSON object with an `ops` array — nothing else.
 
+## STRICT op vocabulary (these are the ONLY accepted op values)
+
+- `add_durable`     — for stable identity-level claims (where someone lives, allergies, founded a company, life events)
+- `add_current`     — for time-bound state (anxious today, debugging X, working on Y, traveling next week)
+- `strengthen`      — when a known fact is restated; include `factId`
+- `decay`           — when a current fact looks resolved; include `factId`
+- `contradict`      — when a fact is directly contradicted; include `factId` and `proposedText`
+
+DO NOT emit `op: insert`, `op: add`, `op: update`, or any value not in the
+list above. Use `add_durable` for durable claims and `add_current` for
+time-bound state — never the bare `add` or `insert`.
+
 (see eliza/packages/core/src/prompts.ts:752 for the full description; the
 inputs below replicate the runtime substitution.)
 
@@ -449,12 +515,28 @@ def render_recent(snippet: list[tuple[str, str]], speaker: str, agent: str) -> s
 - **Topics**: List of main topics discussed (comma-separated)
 - **Key Points**: Important facts or decisions (bullet points)
 
-Respond in TOON:
-text: Your comprehensive summary here
-topics[0]: topic1
-topics[1]: topic2
-keyPoints[0]: First key point
-keyPoints[1]: Second key point"""
+## STRICT TOON output
+
+Each `topics[N]` and `keyPoints[N]` entry MUST be a single flat string —
+never an indented sub-object with sub-keys. Do not use markdown bullets.
+
+Use the EXACT layout below (replace placeholders, keep the array form):
+
+text: Your comprehensive summary here.
+topics[3]:
+  - topic1
+  - topic2
+  - topic3
+keyPoints[5]:
+  - First key point as a single sentence.
+  - Second key point as a single sentence.
+  - Third key point as a single sentence.
+  - Fourth key point as a single sentence.
+  - Fifth key point as a single sentence.
+
+If you have a different number of topics or key points, change the index
+length to match (e.g. `topics[2]:`). Each item must be one line, no
+nested keys, no markdown bullets, no leading numbering."""
 
 
 LONG_TERM_EXTRACTION_TEMPLATE = """# Task: Extract Long-Term Memory (Strict Criteria)
@@ -866,7 +948,7 @@ def _generate_one(
         if dry_run:
             target_text = stub_reflection(encoder, rng, ctx, False)
         else:
-            target_text = strip_fences(call_teacher(
+            target_text = normalize_teacher_output(call_teacher(
                 teacher,
                 "You are generating supervised TOON output for the elizaOS "
                 "reflection evaluator. Emit ONE TOON document and nothing else.",
@@ -881,7 +963,7 @@ def _generate_one(
         if dry_run:
             target_text = stub_reflection_evaluator(encoder, rng, ctx, entity_ids, False)
         else:
-            target_text = strip_fences(call_teacher(
+            target_text = normalize_teacher_output(call_teacher(
                 teacher,
                 "You are generating supervised TOON output for the elizaOS "
                 "reflectionEvaluator. Emit ONE TOON document and nothing else. "
@@ -901,7 +983,7 @@ def _generate_one(
         if dry_run:
             target_text = stub_fact_extractor(encoder, rng, ctx, force_empty)
         else:
-            target_text = strip_fences(call_teacher(
+            target_text = normalize_teacher_output(call_teacher(
                 teacher,
                 "You are the elizaOS fact_extractor. Return exactly one JSON "
                 "object `{\"ops\":[...]}`. Empty `{\"ops\":[]}` is a "
@@ -922,7 +1004,7 @@ def _generate_one(
         if dry_run:
             target_text = stub_summarization(encoder, rng, ctx, bucket)
         else:
-            target_text = strip_fences(call_teacher(
+            target_text = normalize_teacher_output(call_teacher(
                 teacher,
                 "You are the elizaOS summarization evaluator. Emit ONE TOON "
                 "document with `text`, `topics[N]`, `keyPoints[M]`. Nothing "
@@ -942,7 +1024,7 @@ def _generate_one(
             band = (0.85, 0.94) if rng.random() < 0.625 else (0.95, 1.0)
             target_text = stub_long_term(encoder, rng, ctx, force_empty, band)
         else:
-            target_text = strip_fences(call_teacher(
+            target_text = normalize_teacher_output(call_teacher(
                 teacher,
                 "You are the elizaOS long_term_extraction evaluator. ULTRA-"
                 "STRICT: when in doubt, emit no memories entries — empty "
diff --git a/packages/training/scripts/transform_repair_toon_bullets.py b/packages/training/scripts/transform_repair_toon_bullets.py