Skip to content

Commit 5c2fd0e

Browse files
author
Shaw
committed
Merge branch 'develop' of https://github.com/elizaOS/eliza into develop
2 parents c900c8b + 2714a61 commit 5c2fd0e

4 files changed

Lines changed: 326 additions & 16 deletions

File tree

packages/training/scripts/audit_pipeline_shapes.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import collections
2525
import json
2626
import logging
27+
import re
2728
import sys
2829
from pathlib import Path
2930
from typing import Any, Iterator
@@ -308,14 +309,36 @@ def validate_should_respond(decoded: Any) -> list[str]:
308309

309310

310311
def validate_reflection(decoded: Any) -> list[str]:
    """Validate `reflectionTemplate` output (eliza/packages/core/src/prompts.ts:867).

    Emits: thought, quality_score, strengths, improvements, learnings.
    NOT to be confused with `reflection_evaluator` (separate template,
    has task_completed / task_completion_reason / relationships).

    Args:
        decoded: The decoded model output. Anything other than a dict is
            rejected immediately.

    Returns:
        A list of reason codes; empty when the document is valid. Possible
        codes: ``top_level_not_object(<type>)``, ``missing_<field>``,
        ``quality_score_wrong_type(<type>)`` and
        ``quality_score_out_of_range(<value>)``.
    """
    if not isinstance(decoded, dict):
        return [f"top_level_not_object({type(decoded).__name__})"]

    reasons: list[str] = [
        f"missing_{required}"
        for required in ("thought", "quality_score", "strengths",
                         "improvements", "learnings")
        if required not in decoded
    ]

    # A missing key is already reported above; only inspect a present value.
    if "quality_score" not in decoded:
        return reasons
    qs = decoded["quality_score"]

    # Accept int, float, or stringified forms — including the common
    # "78/100" denominator form that gpt-oss emits.
    n: float | None = None
    if isinstance(qs, bool):
        # bool is a subclass of int; True/False is not a score.
        n = None
    elif isinstance(qs, (int, float)):
        # Compare the number as-is: float(huge_int) would raise OverflowError.
        n = qs
    elif isinstance(qs, str):
        m = re.match(r"^\s*(\d+(?:\.\d+)?)\s*(?:/\s*100)?\s*$", qs)
        if m:
            # The pattern only admits plain decimals, so float() cannot fail.
            n = float(m.group(1))

    if n is None:
        # Covers explicit null, bool, containers, and unparseable strings.
        reasons.append(f"quality_score_wrong_type({type(qs).__name__})")
    elif not (0 <= n <= 100):
        reasons.append(f"quality_score_out_of_range({qs})")
    return reasons
320343

321344

packages/training/scripts/publish_dataset_to_hf.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,9 +191,12 @@ def _spec_combined() -> DatasetSpec:
191191
files.append(sb_manifest)
192192
path_in_repo[sb_manifest] = "scambench/manifest.json"
193193

194-
# Synthesized small sets.
194+
# Synthesized small sets. evaluators/ + phase3/ are the Phase-4 and
195+
# Phase-3 fillers added in 2026-05 to close the runtime-phase coverage
196+
# gap (see docs/dataset/COVERAGE_AUDIT.md, EVALUATOR_SYNTHESIS.md).
195197
synth_base = DATA / "synthesized"
196-
for sub in ("action_examples", "action_pairs", "core_prompts"):
198+
for sub in ("action_examples", "action_pairs", "core_prompts",
199+
"evaluators", "phase3"):
197200
d = synth_base / sub
198201
if not d.exists():
199202
continue

packages/training/scripts/synthesize_evaluator_prompts.py

Lines changed: 93 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,60 @@ def strip_fences(s: str) -> str:
209209
return s.strip()
210210

211211

212+
def repair_toon_bullets(s: str) -> str:
    """Rewrite markdown-style bullet lists under a bare key as TOON arrays.

    gpt-oss-120b (and other instruction-tuned models) often emit:

        strengths:
        - Clear tone.
        - Prompt response.

    TOON cannot parse that: `strengths:` carries no value and the bullets
    read as new keys. The block is rewritten into array form:

        strengths[2]:
          - Clear tone.
          - Prompt response.

    Only a column-0 `key:` line with nothing after the colon, followed
    (optionally after blank lines) by at least one line whose first
    non-blank characters are `- `, is transformed. Headers that already
    carry a `[N]` length do not match and pass through untouched, so the
    function is idempotent. Lines are re-joined with a single newline.

    NOTE(review): bullet text is copied verbatim with no TOON quoting —
    confirm that items containing `:` still decode as plain strings.
    """
    bare_key = re.compile(r"^([a-zA-Z_][a-zA-Z0-9_]*)\s*:\s*$")
    source = s.splitlines()
    total = len(source)
    repaired: list[str] = []
    pos = 0
    while pos < total:
        current = source[pos]
        pos += 1
        header = bare_key.match(current)
        if header is None:
            repaired.append(current)
            continue

        # Tolerate blank lines between the key and its first bullet.
        cursor = pos
        while cursor < total and not source[cursor].strip():
            cursor += 1

        # Collect the contiguous run of `- item` lines, at any indent.
        items: list[str] = []
        while cursor < total:
            stripped = source[cursor].lstrip()
            if not stripped.startswith("- "):
                break
            items.append(stripped[2:].strip())
            cursor += 1

        if not items:
            # Not a bullet list: keep the key line, and let the main loop
            # emit any blank lines that followed it.
            repaired.append(current)
            continue

        repaired.append(f"{header.group(1)}[{len(items)}]:")
        repaired.extend(f"  - {item}" for item in items)
        pos = cursor  # resume after the last bullet consumed
    return "\n".join(repaired)
258+
259+
260+
def normalize_teacher_output(s: str) -> str:
    """Clean raw teacher text: strip fences first, then repair bullet lists.

    Kept as one shared entry point so that a repair pass added here applies
    to every caller at once.
    """
    unfenced = strip_fences(s)
    return repair_toon_bullets(unfenced)
264+
265+
212266
# ───────────────────────── shared diversity pools ─────────────────────────
213267

214268
PERSONAS = [
@@ -401,6 +455,18 @@ def render_recent(snippet: list[tuple[str, str]], speaker: str, agent: str) -> s
401455
402456
You maintain a two-store fact memory for an AI assistant. For each message you decide what to insert, strengthen, decay, or contradict in that memory. You return a single JSON object with an `ops` array — nothing else.
403457
458+
## STRICT op vocabulary (these are the ONLY accepted op values)
459+
460+
- `add_durable` — for stable identity-level claims (where someone lives, allergies, founded a company, life events)
461+
- `add_current` — for time-bound state (anxious today, debugging X, working on Y, traveling next week)
462+
- `strengthen` — when a known fact is restated; include `factId`
463+
- `decay` — when a current fact looks resolved; include `factId`
464+
- `contradict` — when a fact is directly contradicted; include `factId` and `proposedText`
465+
466+
DO NOT emit `op: insert`, `op: add`, `op: update`, or any value not in the
467+
list above. Use `add_durable` for durable claims and `add_current` for
468+
time-bound state — never the bare `add` or `insert`.
469+
404470
(see eliza/packages/core/src/prompts.ts:752 for the full description; the
405471
inputs below replicate the runtime substitution.)
406472
@@ -449,12 +515,28 @@ def render_recent(snippet: list[tuple[str, str]], speaker: str, agent: str) -> s
449515
- **Topics**: List of main topics discussed (comma-separated)
450516
- **Key Points**: Important facts or decisions (bullet points)
451517
452-
Respond in TOON:
453-
text: Your comprehensive summary here
454-
topics[0]: topic1
455-
topics[1]: topic2
456-
keyPoints[0]: First key point
457-
keyPoints[1]: Second key point"""
518+
## STRICT TOON output
519+
520+
Each `topics[N]` and `keyPoints[N]` entry MUST be a single flat string —
521+
never an indented sub-object with sub-keys. Do not use markdown bullets.
522+
523+
Use the EXACT layout below (replace placeholders, keep the array form):
524+
525+
text: Your comprehensive summary here.
526+
topics[3]:
527+
- topic1
528+
- topic2
529+
- topic3
530+
keyPoints[5]:
531+
- First key point as a single sentence.
532+
- Second key point as a single sentence.
533+
- Third key point as a single sentence.
534+
- Fourth key point as a single sentence.
535+
- Fifth key point as a single sentence.
536+
537+
If you have a different number of topics or key points, change the index
538+
length to match (e.g. `topics[2]:`). Each item must be one line, no
539+
nested keys, no markdown bullets, no leading numbering."""
458540

459541

460542
LONG_TERM_EXTRACTION_TEMPLATE = """# Task: Extract Long-Term Memory (Strict Criteria)
@@ -865,7 +947,7 @@ def _generate_one(
865947
if dry_run:
866948
target_text = stub_reflection(encoder, rng, ctx, False)
867949
else:
868-
target_text = strip_fences(call_teacher(
950+
target_text = normalize_teacher_output(call_teacher(
869951
teacher,
870952
"You are generating supervised TOON output for the elizaOS "
871953
"reflection evaluator. Emit ONE TOON document and nothing else.",
@@ -880,7 +962,7 @@ def _generate_one(
880962
if dry_run:
881963
target_text = stub_reflection_evaluator(encoder, rng, ctx, entity_ids, False)
882964
else:
883-
target_text = strip_fences(call_teacher(
965+
target_text = normalize_teacher_output(call_teacher(
884966
teacher,
885967
"You are generating supervised TOON output for the elizaOS "
886968
"reflectionEvaluator. Emit ONE TOON document and nothing else. "
@@ -900,7 +982,7 @@ def _generate_one(
900982
if dry_run:
901983
target_text = stub_fact_extractor(encoder, rng, ctx, force_empty)
902984
else:
903-
target_text = strip_fences(call_teacher(
985+
target_text = normalize_teacher_output(call_teacher(
904986
teacher,
905987
"You are the elizaOS fact_extractor. Return exactly one JSON "
906988
"object `{\"ops\":[...]}`. Empty `{\"ops\":[]}` is a "
@@ -921,7 +1003,7 @@ def _generate_one(
9211003
if dry_run:
9221004
target_text = stub_summarization(encoder, rng, ctx, bucket)
9231005
else:
924-
target_text = strip_fences(call_teacher(
1006+
target_text = normalize_teacher_output(call_teacher(
9251007
teacher,
9261008
"You are the elizaOS summarization evaluator. Emit ONE TOON "
9271009
"document with `text`, `topics[N]`, `keyPoints[M]`. Nothing "
@@ -941,7 +1023,7 @@ def _generate_one(
9411023
band = (0.85, 0.94) if rng.random() < 0.625 else (0.95, 1.0)
9421024
target_text = stub_long_term(encoder, rng, ctx, force_empty, band)
9431025
else:
944-
target_text = strip_fences(call_teacher(
1026+
target_text = normalize_teacher_output(call_teacher(
9451027
teacher,
9461028
"You are the elizaOS long_term_extraction evaluator. ULTRA-"
9471029
"STRICT: when in doubt, emit no memories entries — empty "

0 commit comments

Comments
 (0)