Skip to content

Commit f404833

Browse files
feat: LLM-as-judge scorer, dataset auto-sampling, eval --ci baseline (#78)
- Add score_llm_judge() scorer: calls any OpenAI-compatible endpoint, parses JSON {score, reason}, clamps to [0,1], strips markdown fences
- Dispatch llm_judge through run_scorer() alongside existing scorers
- Add auto_populate() to dataset: 6 signal filters (has-errors, high-retry, cost-above, wide-blast, long-duration, low-eval-score), since_days window, dedup, optional label
- Extend cmd_eval_ci() with --baseline, --save-baseline, --tolerance, --github-summary flags; _load_baseline/_save_baseline/_write_github_summary
- GitHub summary writes PR-comment-ready Markdown with delta vs baseline
- 25 new tests covering all three features (700 total, all passing)

Co-authored-by: Ona <no-reply@ona.com>
1 parent a5feb00 commit f404833

5 files changed

Lines changed: 764 additions & 7 deletions

File tree

src/agent_trace/cli.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,14 @@ def build_parser() -> argparse.ArgumentParser:
575575
p_eval_ci = eval_sub.add_parser("ci", help="run evals and exit 1 if any scorer fails")
576576
p_eval_ci.add_argument("session_id", nargs="?", help="session ID or prefix (default: latest)")
577577
p_eval_ci.add_argument("--config", default=".agent-evals.yaml")
578+
p_eval_ci.add_argument("--baseline", metavar="FILE",
579+
help="compare scores against a saved baseline JSON")
580+
p_eval_ci.add_argument("--save-baseline", dest="save_baseline", metavar="FILE",
581+
help="save current scores as a baseline and exit")
582+
p_eval_ci.add_argument("--tolerance", type=float, default=0.0, metavar="N",
583+
help="allow up to N regression vs baseline before failing (default: 0)")
584+
p_eval_ci.add_argument("--github-summary", dest="github_summary", action="store_true",
585+
help="write PR-comment Markdown to .agent-traces/eval-summary.md")
578586

579587
p_eval_dataset = eval_sub.add_parser("dataset", help="manage eval datasets")
580588
dataset_sub = p_eval_dataset.add_subparsers(dest="dataset_command")
@@ -587,6 +595,15 @@ def build_parser() -> argparse.ArgumentParser:
587595
)
588596
p_ds_export = dataset_sub.add_parser("export", help="export dataset to JSONL")
589597
p_ds_export.add_argument("--dataset", default=".agent-traces/datasets/default.jsonl")
598+
p_ds_auto = dataset_sub.add_parser("auto", help="auto-populate dataset from sessions by signal filter")
599+
p_ds_auto.add_argument("--name", default="default", help="dataset name (default: default)")
600+
p_ds_auto.add_argument("--dataset", default="", help="explicit dataset path (overrides --name)")
601+
p_ds_auto.add_argument("--filter", default="has-errors",
602+
help="filter: has-errors, high-retry, cost-above:N, wide-blast, "
603+
"long-duration:Ns, low-eval-score:N (default: has-errors)")
604+
p_ds_auto.add_argument("--since", default="7d", metavar="Nd",
605+
help="look back N days (default: 7d)")
606+
p_ds_auto.add_argument("--label", default="", help="label for added entries")
590607

591608
# watch
592609
p_watch = sub.add_parser("watch", help="monitor a live session with circuit breakers")

src/agent_trace/eval/dataset.py

Lines changed: 149 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@
1313
import uuid
1414
from dataclasses import asdict, dataclass, field
1515
from pathlib import Path
16+
from typing import TYPE_CHECKING
17+
18+
if TYPE_CHECKING:
19+
from ..store import TraceStore
1620

1721

1822

@@ -68,6 +72,138 @@ def export_entries(dataset_path: str | Path, out=sys.stdout) -> None:
6872
out.write(entry.to_json() + "\n")
6973

7074

75+
# ---------------------------------------------------------------------------
76+
# Auto-sampling: populate a dataset from stored sessions by signal filter
77+
# ---------------------------------------------------------------------------
78+
79+
def _session_passes_filter(
    store: "TraceStore",
    session_id: str,
    filter_spec: str,
    eval_threshold: float = 0.8,
) -> bool:
    """Decide whether a stored session matches ``filter_spec``.

    The spec is stripped and lower-cased before matching. Recognised forms:

        has-errors        at least one event has type ERROR
        high-retry        more than 30% of tool calls count as retries; a
                          call counts once the same ``tool_name`` has been
                          repeated at least twice in a row
        cost-above:N      ``total_tokens / 1_000_000 * 3.0`` exceeds N
        wide-blast        more than 10 distinct paths among FILE_WRITE events
        long-duration:Ns  ``total_duration_ms / 1000`` exceeds N (the
                          trailing ``s`` is optional)
        low-eval-score:N  mean ``score`` in the session's eval.json is below
                          N; an unparsable N falls back to ``eval_threshold``

    Returns False when the session's events or metadata cannot be loaded,
    when the spec is not recognised, when the numeric argument of
    ``cost-above`` / ``long-duration`` is malformed, or when eval.json is
    missing, unreadable, or has no results.
    """
    from ..models import EventType

    try:
        events = store.load_events(session_id)
        meta = store.load_meta(session_id)
    except Exception:
        # Best effort: a session that cannot be read never matches.
        return False

    spec = filter_spec.strip().lower()
    # Text after the first ":"; only meaningful for the parameterised filters.
    argument = spec.partition(":")[2]

    if spec == "has-errors":
        for event in events:
            if event.event_type == EventType.ERROR:
                return True
        return False

    if spec == "high-retry":
        calls = [ev for ev in events if ev.event_type == EventType.TOOL_CALL]
        if not calls:
            return False
        retry_count = 0
        last_tool = None
        repeats = 0
        for call in calls:
            tool = call.data.get("tool_name", "")
            if tool != last_tool:
                # A different tool starts a fresh run.
                last_tool = tool
                repeats = 0
                continue
            repeats += 1
            if repeats >= 2:
                retry_count += 1
        return retry_count / len(calls) > 0.30

    if spec.startswith("cost-above:"):
        try:
            dollar_limit = float(argument)
        except ValueError:
            return False
        estimated_cost = meta.total_tokens / 1_000_000 * 3.0
        return estimated_cost > dollar_limit

    if spec == "wide-blast":
        written = set()
        for event in events:
            if event.event_type != EventType.FILE_WRITE:
                continue
            target = event.data.get("path") or event.data.get("file_path") or ""
            if target:
                written.add(target)
        return len(written) > 10

    if spec.startswith("long-duration:"):
        try:
            seconds_limit = float(argument.rstrip("s"))
        except ValueError:
            return False
        if meta.total_duration_ms:
            elapsed = meta.total_duration_ms / 1000
        else:
            elapsed = 0.0
        return elapsed > seconds_limit

    if spec.startswith("low-eval-score:"):
        try:
            cutoff = float(argument)
        except ValueError:
            cutoff = eval_threshold
        eval_file = store.base_dir / session_id / "eval.json"
        if not eval_file.exists():
            return False
        try:
            payload = json.loads(eval_file.read_text())
            rows = payload.get("results") or payload.get("judges") or []
            if not rows:
                return False
            mean_score = sum(float(row.get("score", 0)) for row in rows) / len(rows)
            return mean_score < cutoff
        except Exception:
            # Malformed eval.json is treated as "no match".
            return False

    return False
170+
171+
172+
def auto_populate(
    store: "TraceStore",
    dataset_path: str | Path,
    filter_spec: str,
    since_days: float = 7.0,
    label: str = "",
    limit: int = 500,
) -> int:
    """Append sessions that match ``filter_spec`` to the dataset file.

    Sessions that started more than ``since_days`` days ago, or whose ID is
    already present in the dataset, are skipped. Scanning stops once
    ``limit`` entries have been added. New entries are labelled with
    ``label``, or with ``filter_spec`` when ``label`` is empty.

    Returns the number of entries added.
    """
    oldest_allowed = time.time() - since_days * 86400
    known_ids = {item.session_id for item in list_entries(dataset_path)}
    entry_label = label or filter_spec

    count = 0
    for session in store.list_sessions():
        if session.started_at < oldest_allowed or session.session_id in known_ids:
            continue
        if count >= limit:
            break
        if not _session_passes_filter(store, session.session_id, filter_spec):
            continue
        add_entry(
            dataset_path,
            DatasetEntry(session_id=session.session_id, label=entry_label),
        )
        count += 1

    return count
205+
206+
71207
# ---------------------------------------------------------------------------
72208
# CLI handler
73209
# ---------------------------------------------------------------------------
@@ -104,5 +240,17 @@ def cmd_dataset(args: argparse.Namespace) -> int:
104240
export_entries(dataset_path)
105241
return 0
106242

107-
sys.stderr.write("Usage: agent-strace eval dataset <add|list|export>\n")
243+
if dataset_command == "auto":
244+
from ..store import TraceStore
245+
filter_spec = getattr(args, "filter", "has-errors") or "has-errors"
246+
since_raw = getattr(args, "since", "7d") or "7d"
247+
since_days = float(since_raw.rstrip("d"))
248+
label = getattr(args, "label", "") or filter_spec
249+
trace_dir = getattr(args, "trace_dir", ".agent-traces")
250+
store = TraceStore(trace_dir)
251+
added = auto_populate(store, dataset_path, filter_spec, since_days=since_days, label=label)
252+
sys.stdout.write(f"Added {added} session(s) to {dataset_path} (filter: {filter_spec})\n")
253+
return 0
254+
255+
sys.stderr.write("Usage: agent-strace eval dataset <add|list|export|auto>\n")
108256
return 1

src/agent_trace/eval/runner.py

Lines changed: 105 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import argparse
99
import json
1010
import sys
11+
from pathlib import Path
1112
from dataclasses import dataclass, field
1213

1314
from ..store import TraceStore
@@ -211,8 +212,74 @@ def cmd_eval_compare(args: argparse.Namespace) -> int:
211212
return 0
212213

213214

215+
def _load_baseline(path: str) -> dict[str, float]:
    """Load a saved baseline as a ``{scorer_name: score}`` mapping.

    Args:
        path: Location of the baseline JSON file.

    Returns:
        The parsed mapping with every score coerced to ``float``. An empty
        dict is returned when the file is missing, unreadable, not valid
        JSON, or does not contain a JSON object. Entries whose value is not
        a number are dropped, so callers can safely do arithmetic on every
        value they get back.
    """
    try:
        raw = json.loads(Path(path).read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: invalid JSON or
        # undecodable bytes (JSONDecodeError and UnicodeDecodeError are both
        # ValueError subclasses).
        return {}

    if not isinstance(raw, dict):
        # A list or scalar would break the ``baseline.get(...)`` lookups
        # that callers perform.
        return {}

    baseline: dict[str, float] = {}
    for name, score in raw.items():
        # bool is an int subclass; a true/false value is not a score.
        if isinstance(score, bool) or not isinstance(score, (int, float)):
            continue
        baseline[str(name)] = float(score)
    return baseline
224+
225+
226+
def _save_baseline(path: str, report: "EvalReport") -> None:
    """Write the report's per-scorer scores to ``path`` as indented JSON.

    Missing parent directories are created. The file holds a single JSON
    object mapping each result's ``scorer`` to its ``score``.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    scores = {}
    for result in report.results:
        scores[result.scorer] = result.score

    serialized = json.dumps(scores, indent=2)
    target.write_text(serialized)
232+
233+
234+
def _write_github_summary(report: "EvalReport", baseline: dict[str, float], tolerance: float) -> None:
    """Write a PR-comment-ready Markdown summary to .agent-traces/eval-summary.md.

    Args:
        report: Eval report; reads ``results`` (each with ``scorer``,
            ``score``, ``passed``, ``threshold``, ``reason``),
            ``overall_passed`` and ``failed``.
        baseline: ``{scorer_name: score}`` from a previous run; may be empty.
        tolerance: A scorer counts as regressed when its score dropped by
            more than this amount relative to the baseline.

    A row is marked as failing when the scorer is below its own threshold
    OR regressed beyond ``tolerance``. The overall result is FAIL when the
    report did not pass, or when the report passed but at least one scorer
    regressed -- the same rule ``cmd_eval_ci`` uses for its exit code -- so
    the summary never reports PASS for a run that fails CI.
    """
    lines = ["## agent-strace eval\n"]
    lines.append("| Judge | Pass rate | Baseline | Delta | Status |")
    lines.append("|---|---|---|---|---|")

    regressed_count = 0
    for r in report.results:
        base_score = baseline.get(r.scorer)
        if base_score is not None:
            delta = r.score - base_score
            regressed = delta < -tolerance
            delta_str = f"{delta:+.0%}"
            base_str = f"{base_score:.0%}"
        else:
            # No baseline entry for this scorer: nothing to compare against.
            regressed = False
            delta_str = "—"
            base_str = "—"
        if regressed:
            regressed_count += 1
        # A scorer below its threshold is failing even if it did not regress.
        status = "✅" if r.passed and not regressed else "❌"
        lines.append(f"| `{r.scorer}` | {r.score:.0%} | {base_str} | {delta_str} | {status} |")

    lines.append("")
    if not report.overall_passed:
        lines.append(f"**Result: FAIL** — {report.failed} scorer(s) below threshold.")
    elif regressed_count:
        lines.append(
            f"**Result: FAIL** — {regressed_count} scorer(s) regressed vs baseline "
            f"(tolerance {tolerance:.2f})."
        )
    else:
        lines.append("**Result: PASS**")

    failing = [r for r in report.results if not r.passed]
    if failing:
        lines.append("")
        lines.append("<details>")
        lines.append("<summary>Failing scorers</summary>")
        lines.append("")
        for r in failing:
            lines.append(f"- `{r.scorer}` — score {r.score:.2f} (threshold {r.threshold:.2f}): {r.reason}")
        lines.append("")
        lines.append("</details>")

    summary_path = Path(".agent-traces/eval-summary.md")
    summary_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: the summary contains emoji and em dashes, which a
    # non-UTF-8 locale default encoding cannot encode.
    summary_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    sys.stderr.write(f"GitHub summary written to {summary_path}\n")
274+
275+
214276
def cmd_eval_ci(args: argparse.Namespace) -> int:
215-
"""Run evals and exit 1 if any scorer fails (for CI integration)."""
277+
"""Run evals and exit 1 if any scorer fails (for CI integration).
278+
279+
Supports baseline comparison (--baseline), saving baselines
280+
(--save-baseline), regression tolerance (--tolerance), and
281+
GitHub Actions PR comment output (--github-summary).
282+
"""
216283
store = TraceStore(args.trace_dir)
217284
config = load_config(getattr(args, "config", ".agent-evals.yaml"))
218285

@@ -222,12 +289,44 @@ def cmd_eval_ci(args: argparse.Namespace) -> int:
222289
return 1
223290

224291
report = run_eval(store, session_id, config)
225-
# Route table to stderr so CI output is pipeable without noise
226292
format_report_table(report, out=sys.stderr)
227293

228-
if report.overall_passed:
229-
sys.stderr.write("CI: all scorers passed\n")
294+
# Save baseline if requested
295+
save_baseline_path = getattr(args, "save_baseline", None)
296+
if save_baseline_path:
297+
_save_baseline(save_baseline_path, report)
298+
sys.stderr.write(f"Baseline saved to {save_baseline_path}\n")
230299
return 0
231-
else:
232-
sys.stderr.write(f"CI: {report.failed} scorer(s) failed\n")
300+
301+
# Load baseline for comparison
302+
baseline_path = getattr(args, "baseline", None)
303+
baseline: dict[str, float] = {}
304+
if baseline_path:
305+
baseline = _load_baseline(baseline_path)
306+
307+
tolerance = float(getattr(args, "tolerance", 0.0) or 0.0)
308+
309+
# GitHub summary
310+
if getattr(args, "github_summary", False):
311+
_write_github_summary(report, baseline, tolerance)
312+
313+
# Determine pass/fail with optional baseline regression check
314+
failed = False
315+
if not report.overall_passed:
316+
failed = True
317+
elif baseline:
318+
for r in report.results:
319+
base_score = baseline.get(r.scorer)
320+
if base_score is not None and (r.score - base_score) < -tolerance:
321+
sys.stderr.write(
322+
f"CI: {r.scorer} regressed {r.score:.2f} vs baseline {base_score:.2f} "
323+
f"(tolerance {tolerance:.2f})\n"
324+
)
325+
failed = True
326+
327+
if failed:
328+
sys.stderr.write(f"CI: FAIL — {report.failed} scorer(s) failed\n")
233329
return 1
330+
331+
sys.stderr.write("CI: PASS — all scorers passed\n")
332+
return 0

0 commit comments

Comments
 (0)