Skip to content

Commit 2714a61

Browse files
lalaluneclaude
andcommitted
fix(training): repair TOON bullet/indexed outputs + tighten synth prompts
Three quality issues surfaced when running the synthesizers against gpt-oss-120b on Groq (the canonical first real run, 2375 evaluator records). Pre-repair conformance was 84% across 5 evaluators.

scripts/transform_repair_toon_bullets.py
Two repair passes:
1. Markdown-bullet repair: `key:\\n - x\\n - y` → `key[2]:\\n - x\\n - y`. gpt-oss emits this for `strengths`/`improvements`/`learnings` in reflection records (2/3 of failures).
2. Indexed-assign repair: `topics[0]: x\\ntopics[1]: y\\ntopics[2]: z` → `topics[3]:\\n - x\\n - y\\n - z`. gpt-oss emits this for `topics`/`keyPoints` in summarization (all 497 failures pre-repair).
Idempotent. Drops records that don't parse even after repair.

scripts/synthesize_evaluator_prompts.py
- FACT_EXTRACTION_TEMPLATE: explicit STRICT op vocabulary section that forbids `op: insert`/`op: add` (gpt-oss emitted these in 24% of fact_extractor records). Canonical ops are add_durable, add_current, strengthen, decay, contradict.
- INITIAL_SUMMARIZATION_TEMPLATE: replaced the runtime example `topics[0]: ... topics[1]: ...` with the canonical TOON array form `topics[N]:\\n - x\\n - y` and explicitly forbids nested sub-keys in keyPoints (the source of 47% of summarization failures).

scripts/audit_pipeline_shapes.py
- validate_reflection now checks the correct fields per the runtime's reflectionTemplate (thought/quality_score/strengths/improvements/learnings) — it was previously checking task_completed, which belongs to reflection_evaluator. The quality_score validator accepts int, float, "78", and "78/100" forms.

scripts/publish_dataset_to_hf.py
Publish allowlist extended to include data/synthesized/evaluators/ and data/synthesized/phase3/ so the new Phase-4 + Phase-3 records ship with the next dataset push.
Conformance after the repair pass on the same 2375 records:
- reflection_evaluator: 100.0%
- long_term_extraction: 100.0%
- reflection (post-repair): 97.7%
- fact_extractor (with stricter prompt, re-running): TBD
- summarization (with stricter prompt, re-running): TBD

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 50e940e commit 2714a61

4 files changed

Lines changed: 326 additions & 16 deletions

File tree

packages/training/scripts/audit_pipeline_shapes.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import collections
2525
import json
2626
import logging
27+
import re
2728
import sys
2829
from pathlib import Path
2930
from typing import Any, Iterator
@@ -308,14 +309,36 @@ def validate_should_respond(decoded: Any) -> list[str]:
308309

309310

310311
def validate_reflection(decoded: Any) -> list[str]:
    """Validate `reflectionTemplate` output (eliza/packages/core/src/prompts.ts:867).

    Emits: thought, quality_score, strengths, improvements, learnings.
    NOT to be confused with `reflection_evaluator` (separate template,
    has task_completed / task_completion_reason / relationships).

    Args:
        decoded: The decoded record. Anything other than a dict fails
            immediately with a single `top_level_not_object(...)` reason.

    Returns:
        A list of failure-reason strings; empty when the record is valid.
        `quality_score` must be an int, a float, or a string of the form
        "78" / "78.5" / "78/100", and must lie in the range 0..100.
    """
    if not isinstance(decoded, dict):
        return [f"top_level_not_object({type(decoded).__name__})"]

    reasons: list[str] = [
        f"missing_{required}"
        for required in ("thought", "quality_score", "strengths",
                         "improvements", "learnings")
        if required not in decoded
    ]
    if "quality_score" not in decoded:
        # Already reported as missing above; nothing further to type-check.
        return reasons

    qs = decoded["quality_score"]
    n: float | None = None
    if isinstance(qs, bool):
        # bool is an int subclass, but true/false is not a score.
        n = None
    elif isinstance(qs, (int, float)):
        # Keep the raw number: comparing it directly avoids the
        # OverflowError that float() raises on very large ints.
        n = qs
    elif isinstance(qs, str):
        # Accept stringified forms, including the common "78/100"
        # denominator form that gpt-oss emits.
        m = re.fullmatch(r"\s*(\d+(?:\.\d+)?)\s*(?:/\s*100)?\s*", qs)
        if m:
            n = float(m.group(1))

    if n is None:
        # Covers an explicit null as well as bools, lists, dicts and
        # unparseable strings.
        reasons.append(f"quality_score_wrong_type({type(qs).__name__})")
    elif not (0 <= n <= 100):
        # NaN also lands here because every comparison with it is False.
        reasons.append(f"quality_score_out_of_range({qs})")
    return reasons
320343

321344

packages/training/scripts/publish_dataset_to_hf.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,9 +192,12 @@ def _spec_combined() -> DatasetSpec:
192192
files.append(sb_manifest)
193193
path_in_repo[sb_manifest] = "scambench/manifest.json"
194194

195-
# Synthesized small sets.
195+
# Synthesized small sets. evaluators/ + phase3/ are the Phase-4 and
196+
# Phase-3 fillers added in 2026-05 to close the runtime-phase coverage
197+
# gap (see docs/dataset/COVERAGE_AUDIT.md, EVALUATOR_SYNTHESIS.md).
196198
synth_base = DATA / "synthesized"
197-
for sub in ("action_examples", "action_pairs", "core_prompts"):
199+
for sub in ("action_examples", "action_pairs", "core_prompts",
200+
"evaluators", "phase3"):
198201
d = synth_base / sub
199202
if not d.exists():
200203
continue

packages/training/scripts/synthesize_evaluator_prompts.py

Lines changed: 93 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,60 @@ def strip_fences(s: str) -> str:
209209
return s.strip()
210210

211211

212+
def repair_toon_bullets(s: str) -> str:
    """Convert markdown-bullet style values into TOON-array form.

    gpt-oss-120b (and other instruction-tuned models) often emit:

        strengths:
        - Clear tone.
        - Prompt response.

    Which TOON cannot parse — `strengths:` has no value and the bullets
    look like new keys. Convert into TOON array form:

        strengths[2]:
         - Clear tone.
         - Prompt response.

    Idempotent on already-TOON output (a `key[N]:` header does not match
    the bare-`key:` pattern). Conservative: only transforms a `key:` line
    with an empty value followed by ≥1 line starting with `- `. Blank
    lines between the key and the first bullet, and between consecutive
    bullets, are tolerated and dropped, so a "loose" markdown list is
    repaired as one array instead of being cut off at the first gap.
    Blank lines after the last bullet are preserved.

    Args:
        s: Raw teacher output (fences already stripped).

    Returns:
        The text with every bullet-style value rewritten; lines are
        re-joined with "\\n" and no trailing newline is added.
    """
    # Compiled once instead of being re-looked-up for every line.
    key_only = re.compile(r"^([a-zA-Z_][a-zA-Z0-9_]*)\s*:\s*$")
    lines = s.splitlines()
    total = len(lines)

    def skip_blank(pos: int) -> int:
        """Return the index of the first non-blank line at or after pos."""
        while pos < total and not lines[pos].strip():
            pos += 1
        return pos

    out: list[str] = []
    i = 0
    while i < total:
        line = lines[i]
        m = key_only.match(line)
        if m:
            bullets: list[str] = []
            # `end` is the index just past the last line consumed by the
            # list; blank lines are only consumed when a bullet follows.
            end = i + 1
            k = skip_blank(end)
            while k < total and lines[k].lstrip().startswith("- "):
                bullets.append(lines[k].lstrip()[2:].strip())
                end = k + 1
                k = skip_blank(end)
            if bullets:
                out.append(f"{m.group(1)}[{len(bullets)}]:")
                # NOTE(review): item indent reproduced exactly as the
                # existing literal; confirm it matches the indent width
                # the TOON decoder expects.
                out.extend(f" - {b}" for b in bullets)
                i = end
                continue
        out.append(line)
        i += 1
    return "\n".join(out)
258+
259+
260+
def normalize_teacher_output(s: str) -> str:
    """Clean raw teacher output: strip fences, then run the repair passes.

    Every evaluator branch in synthesize() routes through this helper so
    a single fix lands everywhere.
    """
    unfenced = strip_fences(s)
    return repair_toon_bullets(unfenced)
264+
265+
212266
# ───────────────────────── shared diversity pools ─────────────────────────
213267

214268
PERSONAS = [
@@ -401,6 +455,18 @@ def render_recent(snippet: list[tuple[str, str]], speaker: str, agent: str) -> s
401455
402456
You maintain a two-store fact memory for an AI assistant. For each message you decide what to insert, strengthen, decay, or contradict in that memory. You return a single JSON object with an `ops` array — nothing else.
403457
458+
## STRICT op vocabulary (these are the ONLY accepted op values)
459+
460+
- `add_durable` — for stable identity-level claims (where someone lives, allergies, founded a company, life events)
461+
- `add_current` — for time-bound state (anxious today, debugging X, working on Y, traveling next week)
462+
- `strengthen` — when a known fact is restated; include `factId`
463+
- `decay` — when a current fact looks resolved; include `factId`
464+
- `contradict` — when a fact is directly contradicted; include `factId` and `proposedText`
465+
466+
DO NOT emit `op: insert`, `op: add`, `op: update`, or any value not in the
467+
list above. Use `add_durable` for durable claims and `add_current` for
468+
time-bound state — never the bare `add` or `insert`.
469+
404470
(see eliza/packages/core/src/prompts.ts:752 for the full description; the
405471
inputs below replicate the runtime substitution.)
406472
@@ -449,12 +515,28 @@ def render_recent(snippet: list[tuple[str, str]], speaker: str, agent: str) -> s
449515
- **Topics**: List of main topics discussed (comma-separated)
450516
- **Key Points**: Important facts or decisions (bullet points)
451517
452-
Respond in TOON:
453-
text: Your comprehensive summary here
454-
topics[0]: topic1
455-
topics[1]: topic2
456-
keyPoints[0]: First key point
457-
keyPoints[1]: Second key point"""
518+
## STRICT TOON output
519+
520+
Each `topics[N]` and `keyPoints[N]` entry MUST be a single flat string —
521+
never an indented sub-object with sub-keys. Do not use markdown bullets.
522+
523+
Use the EXACT layout below (replace placeholders, keep the array form):
524+
525+
text: Your comprehensive summary here.
526+
topics[3]:
527+
- topic1
528+
- topic2
529+
- topic3
530+
keyPoints[5]:
531+
- First key point as a single sentence.
532+
- Second key point as a single sentence.
533+
- Third key point as a single sentence.
534+
- Fourth key point as a single sentence.
535+
- Fifth key point as a single sentence.
536+
537+
If you have a different number of topics or key points, change the index
538+
length to match (e.g. `topics[2]:`). Each item must be one line, no
539+
nested keys, no markdown bullets, no leading numbering."""
458540

459541

460542
LONG_TERM_EXTRACTION_TEMPLATE = """# Task: Extract Long-Term Memory (Strict Criteria)
@@ -866,7 +948,7 @@ def _generate_one(
866948
if dry_run:
867949
target_text = stub_reflection(encoder, rng, ctx, False)
868950
else:
869-
target_text = strip_fences(call_teacher(
951+
target_text = normalize_teacher_output(call_teacher(
870952
teacher,
871953
"You are generating supervised TOON output for the elizaOS "
872954
"reflection evaluator. Emit ONE TOON document and nothing else.",
@@ -881,7 +963,7 @@ def _generate_one(
881963
if dry_run:
882964
target_text = stub_reflection_evaluator(encoder, rng, ctx, entity_ids, False)
883965
else:
884-
target_text = strip_fences(call_teacher(
966+
target_text = normalize_teacher_output(call_teacher(
885967
teacher,
886968
"You are generating supervised TOON output for the elizaOS "
887969
"reflectionEvaluator. Emit ONE TOON document and nothing else. "
@@ -901,7 +983,7 @@ def _generate_one(
901983
if dry_run:
902984
target_text = stub_fact_extractor(encoder, rng, ctx, force_empty)
903985
else:
904-
target_text = strip_fences(call_teacher(
986+
target_text = normalize_teacher_output(call_teacher(
905987
teacher,
906988
"You are the elizaOS fact_extractor. Return exactly one JSON "
907989
"object `{\"ops\":[...]}`. Empty `{\"ops\":[]}` is a "
@@ -922,7 +1004,7 @@ def _generate_one(
9221004
if dry_run:
9231005
target_text = stub_summarization(encoder, rng, ctx, bucket)
9241006
else:
925-
target_text = strip_fences(call_teacher(
1007+
target_text = normalize_teacher_output(call_teacher(
9261008
teacher,
9271009
"You are the elizaOS summarization evaluator. Emit ONE TOON "
9281010
"document with `text`, `topics[N]`, `keyPoints[M]`. Nothing "
@@ -942,7 +1024,7 @@ def _generate_one(
9421024
band = (0.85, 0.94) if rng.random() < 0.625 else (0.95, 1.0)
9431025
target_text = stub_long_term(encoder, rng, ctx, force_empty, band)
9441026
else:
945-
target_text = strip_fences(call_teacher(
1027+
target_text = normalize_teacher_output(call_teacher(
9461028
teacher,
9471029
"You are the elizaOS long_term_extraction evaluator. ULTRA-"
9481030
"STRICT: when in doubt, emit no memories entries — empty "

0 commit comments

Comments
 (0)