Skip to content

Commit 5cd6592

Browse files
feat: evaluation framework — score, compare, and regression-test sessions (#10)
Add `agent-strace eval` subcommand with run, compare, ci, and dataset management.

Scorers (zero new dependencies):
- no_errors: 1.0 if no ERROR events
- regex: pattern match against any event type
- cost_under: proportional score against a dollar budget
- files_scoped: all file ops within allowed path prefixes
- duration_under: session duration within a time budget
- custom: any callable returning float in [0, 1]

Commands:
  agent-strace eval run <session-id> [--format table|json]
  agent-strace eval compare <session-a> <session-b>
  agent-strace eval ci <session-id>  # exits 1 on any failure
  agent-strace eval dataset add|list|export

Config via .agent-evals.yaml (stdlib-only YAML parser, no PyYAML).
Dataset storage is local JSONL in .agent-traces/datasets/.

41 new tests covering all scorers, dataset CRUD, runner formatting, compare output, and CI exit codes.

Co-authored-by: Ona <no-reply@ona.com>
1 parent e1dbe18 commit 5cd6592

6 files changed

Lines changed: 1266 additions & 0 deletions

File tree

src/agent_trace/eval/__init__.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
"""Evaluation framework for agent sessions.
2+
3+
Score, compare, and regression-test agent sessions against configurable
4+
scorers. All storage is local JSONL — no external service, no database.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
import argparse
10+
import sys
11+
12+
from .runner import cmd_eval_run, cmd_eval_compare, cmd_eval_ci
13+
from .dataset import cmd_dataset
14+
15+
16+
def cmd_eval(args: argparse.Namespace) -> int:
    """Route ``agent-strace eval`` to the handler for the chosen subcommand.

    Reads ``args.eval_command`` and delegates to the matching handler,
    returning that handler's exit status. When no subcommand is given, or
    the subcommand is not recognised, a message is written to stderr and
    1 is returned.
    """
    subcommand = getattr(args, "eval_command", None)

    if subcommand:
        if subcommand == "run":
            return cmd_eval_run(args)
        elif subcommand == "compare":
            return cmd_eval_compare(args)
        elif subcommand == "ci":
            return cmd_eval_ci(args)
        elif subcommand == "dataset":
            return cmd_dataset(args)
        message = f"Unknown eval subcommand: {subcommand}\n"
    else:
        message = (
            "Usage: agent-strace eval <run|compare|ci|dataset> ...\n"
            "Run `agent-strace eval --help` for details.\n"
        )

    sys.stderr.write(message)
    return 1

src/agent_trace/eval/config.py

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
"""Eval configuration loader (.agent-evals.yaml).
2+
3+
Parses a YAML-like config file using stdlib only (no PyYAML dependency).
4+
Supports a minimal subset of YAML: string keys, scalar values, and lists.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
import re
10+
from dataclasses import dataclass, field
11+
from pathlib import Path
12+
13+
14+
@dataclass
class ScorerConfig:
    """Settings for a single scorer entry.

    Only ``type`` is required; ``threshold`` and ``weight`` both default
    to 1.0 and ``params`` defaults to a fresh empty dict per instance.
    """

    type: str
    threshold: float = 1.0
    weight: float = 1.0
    # Any remaining scorer-specific options, kept as a plain dict.
    params: dict = field(default_factory=dict)
21+
22+
23+
@dataclass
class EvalConfig:
    """Top-level evaluation settings: the scorers plus pass/warn thresholds."""

    scorers: list[ScorerConfig] = field(default_factory=list)
    pass_threshold: float = 0.85
    warn_threshold: float = 0.70

    @classmethod
    def default(cls) -> "EvalConfig":
        """Build the fallback configuration used when no config file is present.

        The fallback runs a single ``no_errors`` scorer with threshold 1.0.
        """
        fallback_scorers = [ScorerConfig(type="no_errors", threshold=1.0)]
        return cls(
            scorers=fallback_scorers,
            pass_threshold=0.85,
            warn_threshold=0.70,
        )
39+
40+
41+
# ---------------------------------------------------------------------------
42+
# Minimal YAML parser (stdlib only)
43+
# ---------------------------------------------------------------------------
44+
45+
def _parse_yaml_value(raw: str):
46+
"""Parse a scalar YAML value to Python type."""
47+
raw = raw.strip()
48+
if raw in ("true", "True", "yes"):
49+
return True
50+
if raw in ("false", "False", "no"):
51+
return False
52+
if raw in ("null", "~", ""):
53+
return None
54+
try:
55+
return int(raw)
56+
except ValueError:
57+
pass
58+
try:
59+
return float(raw)
60+
except ValueError:
61+
pass
62+
# Strip surrounding quotes
63+
if (raw.startswith('"') and raw.endswith('"')) or \
64+
(raw.startswith("'") and raw.endswith("'")):
65+
return raw[1:-1]
66+
return raw
67+
68+
69+
def _parse_minimal_yaml(text: str) -> dict:
70+
"""Parse a minimal YAML document into a nested dict/list structure.
71+
72+
Supports:
73+
- key: value (scalar mapping)
74+
- key: (mapping block — indented children follow)
75+
- - key: value (list of mappings)
76+
- - scalar (list of scalars)
77+
"""
78+
lines = text.splitlines()
79+
80+
def _skip_blanks(idx: int) -> int:
81+
while idx < len(lines) and (not lines[idx].strip() or lines[idx].strip().startswith("#")):
82+
idx += 1
83+
return idx
84+
85+
def _indent(line: str) -> int:
86+
return len(line) - len(line.lstrip())
87+
88+
def _parse_block(start: int, base_indent: int) -> tuple[int, object]:
89+
"""Parse a block starting at *start* with *base_indent* indentation.
90+
91+
Returns (next_line_index, parsed_value).
92+
"""
93+
i = _skip_blanks(start)
94+
if i >= len(lines):
95+
return i, {}
96+
97+
first_line = lines[i]
98+
first_indent = _indent(first_line)
99+
first_content = first_line.strip()
100+
101+
# Determine block type from first non-blank line
102+
if first_content.startswith("- "):
103+
# List block
104+
result_list: list = []
105+
while i < len(lines):
106+
i = _skip_blanks(i)
107+
if i >= len(lines):
108+
break
109+
line = lines[i]
110+
ind = _indent(line)
111+
content = line.strip()
112+
if ind < base_indent:
113+
break
114+
if not content.startswith("- "):
115+
break
116+
117+
item_content = content[2:].strip()
118+
if ":" in item_content:
119+
# Mapping item: parse key: value on this line, then sub-keys
120+
item_dict: dict = {}
121+
key, _, rest = item_content.partition(":")
122+
key = key.strip()
123+
rest = rest.strip()
124+
if rest:
125+
item_dict[key] = _parse_yaml_value(rest)
126+
else:
127+
# value is a sub-block
128+
i += 1
129+
i, sub = _parse_block(i, ind + 2)
130+
item_dict[key] = sub
131+
result_list.append(item_dict)
132+
continue
133+
134+
# Collect additional key: value lines at deeper indent
135+
i += 1
136+
while i < len(lines):
137+
i = _skip_blanks(i)
138+
if i >= len(lines):
139+
break
140+
sub_line = lines[i]
141+
sub_ind = _indent(sub_line)
142+
sub_content = sub_line.strip()
143+
if sub_ind <= ind or sub_content.startswith("- "):
144+
break
145+
if ":" in sub_content:
146+
k, _, v = sub_content.partition(":")
147+
item_dict[k.strip()] = _parse_yaml_value(v.strip())
148+
i += 1
149+
result_list.append(item_dict)
150+
else:
151+
result_list.append(_parse_yaml_value(item_content))
152+
i += 1
153+
154+
return i, result_list
155+
else:
156+
# Mapping block
157+
result_dict: dict = {}
158+
while i < len(lines):
159+
i = _skip_blanks(i)
160+
if i >= len(lines):
161+
break
162+
line = lines[i]
163+
ind = _indent(line)
164+
content = line.strip()
165+
if ind < base_indent:
166+
break
167+
if content.startswith("- "):
168+
break
169+
if ":" not in content:
170+
i += 1
171+
continue
172+
173+
key, _, rest = content.partition(":")
174+
key = key.strip()
175+
rest = rest.strip()
176+
i += 1
177+
178+
if rest:
179+
result_dict[key] = _parse_yaml_value(rest)
180+
else:
181+
# Look ahead for child block
182+
j = _skip_blanks(i)
183+
if j < len(lines) and _indent(lines[j]) > ind:
184+
i, child = _parse_block(i, _indent(lines[j]))
185+
result_dict[key] = child
186+
else:
187+
result_dict[key] = None
188+
189+
return i, result_dict
190+
191+
_, result = _parse_block(0, 0)
192+
if not isinstance(result, dict):
193+
return {}
194+
return result
195+
196+
197+
def load_config(path: str | Path = ".agent-evals.yaml") -> EvalConfig:
    """Load the eval configuration from *path*.

    Returns ``EvalConfig.default()`` when the file does not exist or cannot
    be read or parsed. Otherwise builds an ``EvalConfig`` from the file's
    ``scorers`` list and ``thresholds`` mapping:

    * Each scorer entry must be a mapping with a non-empty ``type``; other
      entries are skipped.
    * ``threshold`` falls back to the entry's ``weight`` and then to 1.0.
    * ``weight`` defaults to 1.0.
    * All remaining keys of a scorer entry are collected into ``params``.
    * ``thresholds.pass`` / ``thresholds.warn`` default to 0.85 / 0.70.
    * Missing or non-numeric numbers fall back to their defaults instead of
      raising.
    * If no usable scorer is found, the default scorer list is used.
    """
    p = Path(path)
    if not p.exists():
        return EvalConfig.default()

    try:
        data = _parse_minimal_yaml(p.read_text(encoding="utf-8"))
    except Exception:
        # Deliberate best-effort: an unreadable or unparsable config file
        # must not abort the command, so fall back to the defaults.
        return EvalConfig.default()

    def _as_float(value, fallback: float) -> float:
        """Convert *value* to float, using *fallback* for null or junk."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return fallback

    # "scorers:" with no value parses to None, so validate the container.
    raw_scorers = data.get("scorers")
    if not isinstance(raw_scorers, list):
        raw_scorers = []

    scorers: list[ScorerConfig] = []
    for s in raw_scorers:
        if not isinstance(s, dict):
            continue
        scorer_type = s.get("type")
        if scorer_type is None or scorer_type == "":
            continue
        # Kept for backward compatibility: an entry that only specifies
        # "weight" uses that number as its threshold too.
        threshold = _as_float(s.get("threshold", s.get("weight")), 1.0)
        weight = _as_float(s.get("weight"), 1.0)
        params = {
            k: v for k, v in s.items()
            if k not in ("type", "threshold", "weight")
        }
        scorers.append(
            ScorerConfig(
                type=str(scorer_type),
                threshold=threshold,
                weight=weight,
                params=params,
            )
        )

    thresholds = data.get("thresholds")
    if not isinstance(thresholds, dict):
        thresholds = {}
    pass_t = _as_float(thresholds.get("pass"), 0.85)
    warn_t = _as_float(thresholds.get("warn"), 0.70)

    if not scorers:
        scorers = EvalConfig.default().scorers

    return EvalConfig(scorers=scorers, pass_threshold=pass_t, warn_threshold=warn_t)

src/agent_trace/eval/dataset.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
"""Dataset management for eval sessions.
2+
3+
Datasets are JSONL files stored in .agent-traces/datasets/.
4+
Each entry records a session ID, label, and scorer configuration.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
import argparse
10+
import json
11+
import sys
12+
import time
13+
import uuid
14+
from dataclasses import asdict, dataclass, field
15+
from pathlib import Path
16+
17+
from ..store import TraceStore
18+
19+
20+
@dataclass
class DatasetEntry:
    """One dataset record, stored as a single line of JSON.

    ``entry_id`` defaults to the first 12 hex characters of a random UUID
    and ``added_at`` to the current ``time.time()`` value.
    """

    entry_id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    session_id: str = ""
    label: str = ""
    added_at: float = field(default_factory=time.time)
    scorers: list[dict] = field(default_factory=list)

    def to_json(self) -> str:
        """Serialise this entry as compact JSON (no spaces after separators)."""
        payload = asdict(self)
        return json.dumps(payload, separators=(",", ":"))

    @classmethod
    def from_json(cls, line: str) -> "DatasetEntry":
        """Rebuild an entry from one JSON line, using its keys as field names."""
        fields_by_name = json.loads(line)
        return cls(**fields_by_name)
34+
35+
36+
# ---------------------------------------------------------------------------
37+
# CRUD
38+
# ---------------------------------------------------------------------------
39+
40+
def _ensure_dir(path: Path) -> None:
41+
path.parent.mkdir(parents=True, exist_ok=True)
42+
43+
44+
def add_entry(dataset_path: str | Path, entry: DatasetEntry) -> None:
    """Append *entry* as one JSON line to the dataset file at *dataset_path*.

    The file's parent directory is created first if it is missing.
    """
    target = Path(dataset_path)
    _ensure_dir(target)
    with target.open("a", encoding="utf-8") as handle:
        handle.write(f"{entry.to_json()}\n")
49+
50+
51+
def list_entries(dataset_path: str | Path) -> list[DatasetEntry]:
    """Read every entry from the dataset file at *dataset_path*.

    Returns an empty list when the file does not exist. Blank lines are
    ignored, and lines that are not valid JSON or do not match the entry
    fields are skipped.
    """
    source = Path(dataset_path)
    loaded = []
    if source.exists():
        for raw in source.read_text(encoding="utf-8").splitlines():
            record = raw.strip()
            if not record:
                continue
            try:
                loaded.append(DatasetEntry.from_json(record))
            except (json.JSONDecodeError, TypeError):
                # Tolerate corrupt or incompatible lines.
                continue
    return loaded
64+
65+
66+
def export_entries(dataset_path: str | Path, out=None) -> None:
    """Write every entry of the dataset at *dataset_path* to *out* as JSONL.

    *out* is any object with a ``write`` method. When omitted, the current
    ``sys.stdout`` is looked up at call time, so output redirection that
    happens after import (e.g. ``contextlib.redirect_stdout``) is honoured.
    """
    stream = sys.stdout if out is None else out
    for entry in list_entries(dataset_path):
        stream.write(entry.to_json() + "\n")
69+
70+
71+
# ---------------------------------------------------------------------------
72+
# CLI handler
73+
# ---------------------------------------------------------------------------
74+
75+
def cmd_dataset(args: argparse.Namespace) -> int:
    """Handle ``agent-strace eval dataset <add|list|export>``.

    Reads ``dataset_command``, ``dataset``, ``session`` and ``label`` from
    *args*. Attributes that are missing or ``None`` fall back to their
    defaults; for ``dataset`` that is
    ``.agent-traces/datasets/default.jsonl``.

    * ``add``: appends a new entry for ``session`` and confirms on stderr.
      Returns 1 if no session was given.
    * ``list``: prints the entries, or a "No entries" notice, to stdout.
    * ``export``: writes the entries as JSONL to stdout.

    Returns 0 on success, or 1 after printing usage to stderr when the
    subcommand is missing or unknown.
    """
    dataset_command = getattr(args, "dataset_command", None)
    # ``or`` (not just the getattr default) because argparse sets options
    # that were not supplied to None rather than leaving them undefined.
    dataset_path = getattr(args, "dataset", None) or ".agent-traces/datasets/default.jsonl"

    if dataset_command == "add":
        session_id = getattr(args, "session", None) or ""
        label = getattr(args, "label", None) or ""
        if not session_id:
            sys.stderr.write("--session is required\n")
            return 1
        entry = DatasetEntry(session_id=session_id, label=label)
        add_entry(dataset_path, entry)
        sys.stderr.write(f"Added session {session_id} to dataset {dataset_path}\n")
        return 0

    if dataset_command == "list":
        entries = list_entries(dataset_path)
        if not entries:
            sys.stdout.write(f"No entries in {dataset_path}\n")
            return 0
        sys.stdout.write(f"\nDataset: {dataset_path} ({len(entries)} entries)\n")
        sys.stdout.write(f"{'─' * 60}\n")
        for e in entries:
            label = f" {e.label}" if e.label else ""
            sys.stdout.write(f" {e.entry_id} {e.session_id}{label}\n")
        sys.stdout.write(f"{'─' * 60}\n\n")
        return 0

    if dataset_command == "export":
        export_entries(dataset_path)
        return 0

    sys.stderr.write("Usage: agent-strace eval dataset <add|list|export>\n")
    return 1

0 commit comments

Comments
 (0)