Skip to content

Commit 5035079

Browse files
Merge feat/issue-10-eval-framework: evaluation framework (#16)
Co-authored-by: Ona <no-reply@ona.com>
2 parents b4d1215 + a6822c3 commit 5035079

6 files changed

Lines changed: 1263 additions & 0 deletions

File tree

src/agent_trace/eval/__init__.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
"""Evaluation framework for agent sessions.
2+
3+
Score, compare, and regression-test agent sessions against configurable
4+
scorers. All storage is local JSONL — no external service, no database.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
import argparse
10+
import sys
11+
12+
from .runner import cmd_eval_run, cmd_eval_compare, cmd_eval_ci
13+
from .dataset import cmd_dataset
14+
15+
16+
def cmd_eval(args: argparse.Namespace) -> int:
    """Route ``agent-strace eval <subcommand>`` to the matching handler.

    Args:
        args: Parsed CLI namespace. The subcommand name is read from
            ``args.eval_command``; a missing attribute is treated like an
            empty one.

    Returns:
        Whatever the selected handler returns, or ``1`` after writing a
        message to stderr when the subcommand is missing or unrecognised.
    """
    subcommand = getattr(args, "eval_command", None)
    if not subcommand:
        sys.stderr.write(
            "Usage: agent-strace eval <run|compare|ci|dataset> ...\n"
            "Run `agent-strace eval --help` for details.\n"
        )
        return 1

    # Each handler is wrapped in a lambda so that it is only looked up and
    # invoked when its subcommand is the one that was requested.
    routes = (
        ("run", lambda: cmd_eval_run(args)),
        ("compare", lambda: cmd_eval_compare(args)),
        ("ci", lambda: cmd_eval_ci(args)),
        ("dataset", lambda: cmd_dataset(args)),
    )
    for name, dispatch in routes:
        if subcommand == name:
            return dispatch()

    sys.stderr.write(f"Unknown eval subcommand: {subcommand}\n")
    return 1

src/agent_trace/eval/config.py

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
"""Eval configuration loader (.agent-evals.yaml).
2+
3+
Parses a YAML-like config file using stdlib only (no PyYAML dependency).
4+
Supports a minimal subset of YAML: string keys, scalar values, and lists.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
from dataclasses import dataclass, field
10+
from pathlib import Path
11+
12+
13+
@dataclass
class ScorerConfig:
    """Settings for a single scorer entry of the eval configuration."""

    # Scorer identifier, e.g. "no_errors".
    type: str
    threshold: float = 1.0
    weight: float = 1.0
    # Any additional scorer-specific settings, keyed by name. A fresh dict is
    # created per instance so instances never share state.
    params: dict = field(default_factory=dict)
20+
21+
22+
@dataclass
class EvalConfig:
    """Top-level eval configuration: the scorers plus pass/warn thresholds."""

    scorers: list[ScorerConfig] = field(default_factory=list)
    pass_threshold: float = 0.85
    warn_threshold: float = 0.70

    @classmethod
    def default(cls) -> "EvalConfig":
        """Build the fallback configuration used when no config file applies.

        Returns:
            A config with a single ``no_errors`` scorer (threshold 1.0), a
            pass threshold of 0.85 and a warn threshold of 0.70.
        """
        fallback_scorer = ScorerConfig(type="no_errors", threshold=1.0)
        return cls(
            scorers=[fallback_scorer],
            pass_threshold=0.85,
            warn_threshold=0.70,
        )
38+
39+
40+
# ---------------------------------------------------------------------------
41+
# Minimal YAML parser (stdlib only)
42+
# ---------------------------------------------------------------------------
43+
44+
def _parse_yaml_value(raw: str):
45+
"""Parse a scalar YAML value to Python type."""
46+
raw = raw.strip()
47+
if raw in ("true", "True", "yes"):
48+
return True
49+
if raw in ("false", "False", "no"):
50+
return False
51+
if raw in ("null", "~", ""):
52+
return None
53+
try:
54+
return int(raw)
55+
except ValueError:
56+
pass
57+
try:
58+
return float(raw)
59+
except ValueError:
60+
pass
61+
# Strip surrounding quotes
62+
if (raw.startswith('"') and raw.endswith('"')) or \
63+
(raw.startswith("'") and raw.endswith("'")):
64+
return raw[1:-1]
65+
return raw
66+
67+
68+
def _parse_minimal_yaml(text: str) -> dict:
    """Parse a minimal YAML document into a nested dict/list structure.

    Supports:
      - key: value          (scalar mapping)
      - key:                (mapping block — indented children follow)
      - - key: value        (list of mappings)
      - - scalar            (list of scalars)

    Beyond those basics this parser also accepts:
      - a list written at the same indent as its key
        (``scorers:`` followed by ``- type: ...`` in the same column);
      - a key whose only trailing text is a comment (``scorers:  # note``),
        which is treated as opening a block / having no value;
      - nested blocks under any key of a list item, not just the first.

    Blank lines and lines whose first non-space character is ``#`` are
    ignored. A key with no value and no nested block maps to ``None``.

    Args:
        text: The raw document.

    Returns:
        The parsed top-level mapping, or ``{}`` when the document is empty or
        its top level is not a mapping.
    """
    lines = text.splitlines()

    def _skip_blanks(idx: int) -> int:
        """Return the first index >= *idx* holding a non-blank, non-comment line."""
        while idx < len(lines):
            stripped = lines[idx].strip()
            if stripped and not stripped.startswith("#"):
                break
            idx += 1
        return idx

    def _indent(line: str) -> int:
        """Return the number of leading whitespace characters of *line*."""
        return len(line) - len(line.lstrip())

    def _split_pair(content: str) -> tuple[str, str]:
        """Split ``key: rest`` at the first colon; a comment-only rest is empty."""
        key, _, rest = content.partition(":")
        rest = rest.strip()
        if rest.startswith("#"):
            # "key:  # comment" carries no value of its own.
            rest = ""
        return key.strip(), rest

    def _parse_child(start: int, key_indent: int) -> tuple[int, object]:
        """Parse the block nested under a key located at column *key_indent*.

        The next content line belongs to the key when it is indented deeper
        than the key, or when it is a ``- `` item in the key's own column.
        Returns (next_line_index, value); value is None when there is no
        nested block.
        """
        nxt = _skip_blanks(start)
        if nxt < len(lines):
            child_indent = _indent(lines[nxt])
            is_item = lines[nxt].strip().startswith("- ")
            if child_indent > key_indent or (is_item and child_indent == key_indent):
                return _parse_block(start, child_indent)
        return start, None

    def _parse_block(start: int, base_indent: int) -> tuple[int, object]:
        """Parse a block starting at *start* with *base_indent* indentation.

        Returns (next_line_index, parsed_value).
        """
        i = _skip_blanks(start)
        if i >= len(lines):
            return i, {}

        # The first content line decides whether this block is a list or a mapping.
        if lines[i].strip().startswith("- "):
            result_list: list = []
            while True:
                i = _skip_blanks(i)
                if i >= len(lines):
                    break
                line = lines[i]
                ind = _indent(line)
                content = line.strip()
                if ind < base_indent or not content.startswith("- "):
                    break

                item_content = content[2:].strip()
                i += 1

                if ":" not in item_content:
                    result_list.append(_parse_yaml_value(item_content))
                    continue

                # Mapping item: the first key sits on the dash line itself.
                item_dict: dict = {}
                key, rest = _split_pair(item_content)
                if rest:
                    item_dict[key] = _parse_yaml_value(rest)
                else:
                    # Column where the key text starts (dash indent + "- " + padding).
                    key_col = ind + len(content) - len(item_content)
                    i, item_dict[key] = _parse_child(i, key_col)

                # Remaining keys of this item: lines indented deeper than the dash.
                while True:
                    i = _skip_blanks(i)
                    if i >= len(lines):
                        break
                    sub_line = lines[i]
                    sub_ind = _indent(sub_line)
                    sub_content = sub_line.strip()
                    if sub_ind <= ind or sub_content.startswith("- "):
                        break
                    i += 1
                    if ":" not in sub_content:
                        continue
                    k, v = _split_pair(sub_content)
                    if v:
                        item_dict[k] = _parse_yaml_value(v)
                    else:
                        # Keeps a nested list/mapping attached to this key
                        # instead of leaking its lines into the outer list.
                        i, item_dict[k] = _parse_child(i, sub_ind)
                result_list.append(item_dict)

            return i, result_list

        # Mapping block
        result_dict: dict = {}
        while True:
            i = _skip_blanks(i)
            if i >= len(lines):
                break
            line = lines[i]
            ind = _indent(line)
            content = line.strip()
            if ind < base_indent or content.startswith("- "):
                break
            i += 1
            if ":" not in content:
                # Not a key/value line; ignored.
                continue

            key, rest = _split_pair(content)
            if rest:
                result_dict[key] = _parse_yaml_value(rest)
            else:
                i, result_dict[key] = _parse_child(i, ind)

        return i, result_dict

    _, result = _parse_block(0, 0)
    if not isinstance(result, dict):
        return {}
    return result
192+
193+
194+
def load_config(path: str | Path = ".agent-evals.yaml") -> EvalConfig:
    """Load eval config from *path*, falling back to defaults where needed.

    Loading is best-effort and never raises for a missing or malformed file:
      - if the file cannot be read or parsed, ``EvalConfig.default()`` is
        returned;
      - a ``scorers`` value that is not a list is treated as empty;
      - scorer entries that are not mappings, or have no ``type``, are skipped;
      - thresholds that are missing or not convertible to ``float`` fall back
        to their defaults (1.0 per scorer, 0.85 pass, 0.70 warn);
      - if no usable scorer remains, the default scorer list is used.

    Args:
        path: Location of the config file.

    Returns:
        The resulting ``EvalConfig``.
    """

    def _as_float(value: object, fallback: float) -> float:
        """Convert *value* to float, or return *fallback* if it cannot be converted."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return fallback

    try:
        # Reading directly (rather than checking exists() first) also covers
        # directories and unreadable files; UnicodeDecodeError is a ValueError.
        text = Path(path).read_text(encoding="utf-8")
        data = _parse_minimal_yaml(text)
    except (OSError, ValueError, RecursionError):
        return EvalConfig.default()

    # "scorers:" with no value parses to None, so .get()'s default is not enough.
    raw_scorers = data.get("scorers")
    if not isinstance(raw_scorers, list):
        raw_scorers = []

    scorers: list[ScorerConfig] = []
    for s in raw_scorers:
        if not isinstance(s, dict):
            continue
        raw_type = s.get("type")
        if raw_type is None or raw_type == "":
            continue
        # NOTE(review): "weight" is only used as a fallback for the threshold
        # and is never passed to ScorerConfig.weight — confirm this is intended.
        threshold = _as_float(s.get("threshold", s.get("weight", 1.0)), 1.0)
        params = {k: v for k, v in s.items() if k not in ("type", "threshold", "weight")}
        scorers.append(ScorerConfig(type=str(raw_type), threshold=threshold, params=params))

    thresholds = data.get("thresholds")
    if not isinstance(thresholds, dict):
        thresholds = {}
    pass_t = _as_float(thresholds.get("pass", 0.85), 0.85)
    warn_t = _as_float(thresholds.get("warn", 0.70), 0.70)

    if not scorers:
        scorers = EvalConfig.default().scorers

    return EvalConfig(scorers=scorers, pass_threshold=pass_t, warn_threshold=warn_t)

src/agent_trace/eval/dataset.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
"""Dataset management for eval sessions.
2+
3+
Datasets are JSONL files stored in .agent-traces/datasets/.
4+
Each entry records a session ID, label, and scorer configuration.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
import argparse
10+
import json
11+
import sys
12+
import time
13+
import uuid
14+
from dataclasses import asdict, dataclass, field
15+
from pathlib import Path
16+
17+
18+
19+
20+
@dataclass
class DatasetEntry:
    """One dataset record, stored as a single line of JSON."""

    # Defaults to the first 12 hex characters of a freshly generated UUID4.
    entry_id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    session_id: str = ""
    label: str = ""
    # Defaults to time.time() at the moment the entry is constructed.
    added_at: float = field(default_factory=time.time)
    scorers: list[dict] = field(default_factory=list)

    def to_json(self) -> str:
        """Serialise this entry as compact JSON (no spaces after separators)."""
        payload = asdict(self)
        return json.dumps(payload, separators=(",", ":"))

    @classmethod
    def from_json(cls, line: str) -> "DatasetEntry":
        """Build an entry from one JSON line.

        The decoded object is passed to the constructor as keyword arguments,
        so ``json.JSONDecodeError`` is raised for invalid JSON and
        ``TypeError`` for keys that are not fields of this class.
        """
        decoded = json.loads(line)
        return cls(**decoded)
34+
35+
36+
# ---------------------------------------------------------------------------
37+
# CRUD
38+
# ---------------------------------------------------------------------------
39+
40+
def _ensure_dir(path: Path) -> None:
41+
path.parent.mkdir(parents=True, exist_ok=True)
42+
43+
44+
def add_entry(dataset_path: str | Path, entry: DatasetEntry) -> None:
    """Append *entry* to the dataset file as one JSON line.

    The file's parent directory is created first if needed; the file is
    opened in append mode with UTF-8 encoding.
    """
    target = Path(dataset_path)
    _ensure_dir(target)
    serialised = entry.to_json()
    with target.open("a", encoding="utf-8") as handle:
        handle.write(f"{serialised}\n")
49+
50+
51+
def list_entries(dataset_path: str | Path) -> list[DatasetEntry]:
    """Read every valid entry from the dataset file.

    Returns:
        The entries in file order. A missing file yields an empty list.
        Blank lines are ignored, and lines that fail to decode as JSON or
        that do not match the entry fields are skipped.
    """
    source = Path(dataset_path)
    if not source.exists():
        return []

    loaded = []
    for raw_line in source.read_text(encoding="utf-8").splitlines():
        record = raw_line.strip()
        if not record:
            continue
        try:
            parsed = DatasetEntry.from_json(record)
        except (json.JSONDecodeError, TypeError):
            # Skip lines that cannot be turned into an entry.
            continue
        loaded.append(parsed)
    return loaded
64+
65+
66+
def export_entries(dataset_path: str | Path, out=None) -> None:
    """Write every entry of the dataset to *out*, one JSON line per entry.

    Args:
        dataset_path: Path of the dataset file to read.
        out: Object with a ``write(str)`` method. When omitted, the current
            ``sys.stdout`` is used. It is resolved at call time rather than
            at function-definition time, so output follows any redirection
            of ``sys.stdout`` made after this module was imported.
    """
    stream = sys.stdout if out is None else out
    for entry in list_entries(dataset_path):
        stream.write(entry.to_json() + "\n")
69+
70+
71+
# ---------------------------------------------------------------------------
72+
# CLI handler
73+
# ---------------------------------------------------------------------------
74+
75+
def cmd_dataset(args: argparse.Namespace) -> int:
    """Handle ``agent-strace eval dataset <add|list|export>``.

    Reads from *args*:
      - ``dataset_command``: ``"add"``, ``"list"`` or ``"export"``;
      - ``dataset``: dataset file path, defaulting to
        ``.agent-traces/datasets/default.jsonl``;
      - ``session`` and ``label``: used by ``add`` only.

    Each attribute may be absent or ``None``; both are treated as "not given".

    Returns:
        ``0`` on success; ``1`` when ``add`` is called without a session or
        when the subcommand is missing or unrecognised (a message is written
        to stderr in both cases).
    """
    dataset_command = getattr(args, "dataset_command", None)
    # getattr's default only applies when the attribute is missing. argparse
    # stores None for an option that was not supplied, so fall back on any
    # falsy value instead of passing None on as a path.
    dataset_path = getattr(args, "dataset", None) or ".agent-traces/datasets/default.jsonl"

    if dataset_command == "add":
        session_id = getattr(args, "session", None) or ""
        # Keep the label a string so it is never stored as JSON null.
        label = getattr(args, "label", None) or ""
        if not session_id:
            sys.stderr.write("--session is required\n")
            return 1
        entry = DatasetEntry(session_id=session_id, label=label)
        add_entry(dataset_path, entry)
        sys.stderr.write(f"Added session {session_id} to dataset {dataset_path}\n")
        return 0

    if dataset_command == "list":
        entries = list_entries(dataset_path)
        if not entries:
            sys.stdout.write(f"No entries in {dataset_path}\n")
            return 0
        sys.stdout.write(f"\nDataset: {dataset_path} ({len(entries)} entries)\n")
        sys.stdout.write(f"{'─' * 60}\n")
        for e in entries:
            label = f" {e.label}" if e.label else ""
            sys.stdout.write(f" {e.entry_id} {e.session_id}{label}\n")
        sys.stdout.write(f"{'─' * 60}\n\n")
        return 0

    if dataset_command == "export":
        export_entries(dataset_path)
        return 0

    sys.stderr.write("Usage: agent-strace eval dataset <add|list|export>\n")
    return 1

0 commit comments

Comments
 (0)