Skip to content

Commit c341be0

Browse files
feat(compare): add session regression testing command (closes #114)
Wraps diff.compare_sessions with a first-class CLI for regression testing.

- compare <id-a> <id-b>: structured report with cost/duration/tool-call deltas
- --tag TAG: compare last N sessions matching the tag
- --rerun: surfaces the stored user_prompt with instructions (automation deferred)
- --format json: machine-readable output including decision_divergence score
- decision_divergence: Levenshtein distance on decision event text sequences
- 34 tests covering edit distance, decision extraction, tag lookup, CLI paths

Co-authored-by: Ona <no-reply@ona.com>
1 parent 14ae284 commit c341be0

5 files changed

Lines changed: 647 additions & 1 deletion

File tree

README.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,40 @@ Supported models: `sonnet` (default), `opus`, `haiku`, `gpt4`, `gpt4o`. Token co
303303

304304
See [examples/session_analysis.md](examples/session_analysis.md) for a full walkthrough combining `import`, `explain`, and `cost`.
305305

306+
### Session regression testing (compare)
307+
308+
Compare two sessions structurally and get a verdict on whether agent behaviour improved or regressed. Useful when changing models, prompts, or tool implementations.
309+
310+
```bash
311+
# Compare two existing sessions
312+
agent-strace compare <session-id-a> <session-id-b>
313+
314+
# Compare the last 2 sessions with a given task tag
315+
agent-strace compare --tag refactor-auth
316+
317+
# Machine-readable output
318+
agent-strace compare <session-id-a> <session-id-b> --format json
319+
```
320+
321+
Example output:
322+
323+
```
324+
Session Comparison
325+
─────────────────────────────────────────────────────────────────
326+
a84664242afa bf1207728ee6 change
327+
─────────────────────────────────────────────────────────────────
328+
Duration 18m 00s 12m 00s -33%
329+
Total cost $4.2300 $2.8700 -32%
330+
Tool calls 14 11 -21%
331+
Files modified 2 2 (same)
332+
Errors 0 0
333+
─────────────────────────────────────────────────────────────────
334+
Verdict: bf1207728ee6 was 32% cheaper, 33% faster
335+
Decision divergence: 2 point(s)
336+
```
337+
338+
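`Decision divergence` is the Levenshtein edit distance between the two sessions' sequences of decision-event texts, computed locally — no LLM call required. `--tag` looks at the last N sessions (`--last`, default 2) whose `agent_name` or `command` contains the tag string (case-insensitive) and compares the two most recent of them, older as baseline and newer as candidate.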
339+
306340
### Weekly spend digest (budget-report)
307341

308342
Aggregate cost across sessions for a configurable time window. Shows total spend, top sessions, cost by tool, and savings from watchdog budget ceilings.

src/agent_trace/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""agent-trace: strace for AI agents."""
22

3-
__version__ = "0.48.0"
3+
__version__ = "0.49.0"

src/agent_trace/cli.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
from .anonymize import cmd_anonymize_export
4949
from .integrations import detect_and_instrument, _INTEGRATIONS
5050
from .budget_report import cmd_budget_report
51+
from .compare import cmd_compare
5152
from .lint import cmd_lint
5253
from .retention import cmd_retention
5354
from .sample import cmd_sample
@@ -882,6 +883,23 @@ def build_parser() -> argparse.ArgumentParser:
882883
p_sample.add_argument("--seed", type=int, default=None,
883884
help="random seed for reproducible random sampling")
884885

886+
# compare
887+
p_compare = sub.add_parser("compare", help="session-to-session regression report")
888+
p_compare.add_argument("session_id_a", nargs="?",
889+
help="first session ID (baseline)")
890+
p_compare.add_argument("session_id_b", nargs="?",
891+
help="second session ID (candidate)")
892+
p_compare.add_argument("--rerun", action="store_true",
893+
help="re-run the original prompt and compare live")
894+
p_compare.add_argument("--model", metavar="MODEL",
895+
help="model to use for --rerun")
896+
p_compare.add_argument("--tag", metavar="TAG",
897+
help="compare the last N sessions matching this tag")
898+
p_compare.add_argument("--last", type=int, default=2, metavar="N",
899+
help="number of tagged sessions to compare (default: 2)")
900+
p_compare.add_argument("--format", choices=["text", "json"], default="text",
901+
dest="format", help="output format (default: text)")
902+
885903
# budget-report
886904
p_budget = sub.add_parser("budget-report", help="weekly spend digest across sessions")
887905
p_budget.add_argument("--since", metavar="DATE",
@@ -1034,6 +1052,7 @@ def main() -> None:
10341052
"mcp": cmd_mcp,
10351053
"auto": cmd_auto,
10361054
"budget-report": cmd_budget_report,
1055+
"compare": cmd_compare,
10371056
"lint": cmd_lint,
10381057
"retention": cmd_retention,
10391058
"sample": cmd_sample,

src/agent_trace/compare.py

Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
"""Session-to-session regression testing: agent-strace compare.
2+
3+
Wraps diff.compare_sessions with a first-class CLI workflow for regression
4+
testing: compare two sessions directly, compare the last N sessions with a
5+
given tag, or (with --rerun) print a session's stored prompt so it can be re-run by hand; automated live re-run is not implemented yet.
6+
7+
Decision divergence is computed as edit distance on decision event text —
8+
no LLM call required.
9+
"""
10+
11+
from __future__ import annotations
12+
13+
import argparse
14+
import json
15+
import sys
16+
from dataclasses import asdict
17+
from typing import TextIO
18+
19+
from .diff import compare_sessions, format_compare, CompareReport
20+
from .models import EventType, SessionMeta
21+
from .store import TraceStore
22+
23+
24+
# ---------------------------------------------------------------------------
25+
# Decision divergence (edit distance on decision text)
26+
# ---------------------------------------------------------------------------
27+
28+
def _decision_texts(store: TraceStore, session_id: str) -> list[str]:
    """Collect the text of every decision event in a session, in event order.

    For each event whose ``event_type`` is ``EventType.DECISION`` the first
    truthy value among the ``"text"``, ``"content"`` and ``"reasoning"`` keys
    of ``ev.data`` is taken and converted with ``str``.  Decision events with
    no truthy value under any of those keys are skipped.

    Returns an empty list when ``store.load_events`` raises any exception
    (best-effort: a session that cannot be loaded contributes no decisions).
    """
    try:
        recorded = store.load_events(session_id)
    except Exception:
        return []

    collected: list[str] = []
    for event in recorded:
        if event.event_type != EventType.DECISION:
            continue
        payload = event.data
        # Keys are tried in priority order; the first non-empty one wins.
        for key in ("text", "content", "reasoning"):
            candidate = payload.get(key)
            if candidate:
                collected.append(str(candidate))
                break
    return collected
46+
47+
48+
def _edit_distance(a: list[str], b: list[str]) -> int:
49+
"""Levenshtein distance between two lists of strings (token-level)."""
50+
m, n = len(a), len(b)
51+
if m == 0:
52+
return n
53+
if n == 0:
54+
return m
55+
dp = list(range(n + 1))
56+
for i in range(1, m + 1):
57+
prev = dp[0]
58+
dp[0] = i
59+
for j in range(1, n + 1):
60+
temp = dp[j]
61+
if a[i - 1] == b[j - 1]:
62+
dp[j] = prev
63+
else:
64+
dp[j] = 1 + min(prev, dp[j], dp[j - 1])
65+
prev = temp
66+
return dp[n]
67+
68+
69+
def decision_divergence(store: TraceStore, session_a: str, session_b: str) -> int:
    """Return the edit distance between two sessions' decision-text sequences.

    The decision texts of each session are extracted with ``_decision_texts``
    and the two resulting lists are compared with ``_edit_distance``, which
    treats each decision text as a single token matched by exact equality.
    """
    sequences = [_decision_texts(store, sid) for sid in (session_a, session_b)]
    return _edit_distance(*sequences)
74+
75+
76+
# ---------------------------------------------------------------------------
77+
# JSON serialisation of CompareReport
78+
# ---------------------------------------------------------------------------
79+
80+
def _report_to_dict(report: CompareReport, divergence: int) -> dict:
81+
return {
82+
"session_a": report.session_a,
83+
"session_b": report.session_b,
84+
"label_a": report.label_a,
85+
"label_b": report.label_b,
86+
"duration_a": report.duration_a,
87+
"duration_b": report.duration_b,
88+
"cost_a": report.cost_a,
89+
"cost_b": report.cost_b,
90+
"tool_calls_a": report.tool_calls_a,
91+
"tool_calls_b": report.tool_calls_b,
92+
"files_modified_a": report.files_modified_a,
93+
"files_modified_b": report.files_modified_b,
94+
"errors_a": report.errors_a,
95+
"errors_b": report.errors_b,
96+
"redundant_reads_a": report.redundant_reads_a,
97+
"redundant_reads_b": report.redundant_reads_b,
98+
"decision_divergence": divergence,
99+
"divergence_points": [
100+
{"step": step, "description_a": da, "description_b": db}
101+
for step, da, db in report.divergence_points
102+
],
103+
"verdict": report.verdict,
104+
}
105+
106+
107+
# ---------------------------------------------------------------------------
108+
# Tag-based session lookup
109+
# ---------------------------------------------------------------------------
110+
111+
def _sessions_by_tag(store: TraceStore, tag: str, last: int = 2) -> list[str]:
112+
"""Return the last N session IDs whose agent_name or command contains tag."""
113+
all_sessions = store.list_sessions()
114+
matched = [
115+
s for s in all_sessions
116+
if tag.lower() in (s.agent_name or "").lower()
117+
or tag.lower() in (s.command or "").lower()
118+
]
119+
# list_sessions returns newest-first
120+
return [s.session_id for s in matched[:last]]
121+
122+
123+
# ---------------------------------------------------------------------------
124+
# --rerun support
125+
# ---------------------------------------------------------------------------
126+
127+
def _get_user_prompt(store: TraceStore, session_id: str) -> str | None:
    """Return the prompt stored on the session's first USER_PROMPT event.

    Only the first event whose ``event_type`` is ``EventType.USER_PROMPT`` is
    examined; the result is the first truthy value among its ``"content"``,
    ``"text"`` and ``"prompt"`` data keys, or ``None`` if all are empty.

    Also returns ``None`` when ``store.load_events`` raises or when the
    session contains no USER_PROMPT event.
    """
    try:
        recorded = store.load_events(session_id)
    except Exception:
        return None

    first_prompt = next(
        (event for event in recorded if event.event_type == EventType.USER_PROMPT),
        None,
    )
    if first_prompt is None:
        return None

    payload = first_prompt.data
    for key in ("content", "text", "prompt"):
        value = payload.get(key)
        if value:
            return value
    return None
142+
143+
144+
# ---------------------------------------------------------------------------
145+
# CLI handler
146+
# ---------------------------------------------------------------------------
147+
148+
def cmd_compare(args: argparse.Namespace) -> int:
    """CLI handler for ``agent-strace compare``.

    Resolves a baseline (a) and a candidate (b) session, runs
    ``compare_sessions`` on them and writes the report to stdout, either as
    JSON (``--format json``) or as text followed by the decision-divergence
    line.

    Session resolution:
      * ``--tag TAG``: the two most recent sessions returned by
        ``_sessions_by_tag`` are compared, older as baseline.  Positional
        IDs and ``--rerun`` are not consulted in this mode.
      * otherwise: positional ``session_id_a`` / ``session_id_b`` are looked
        up with ``store.find_session``.
      * ``--rerun`` (with one positional ID): automated re-execution is not
        implemented; the stored prompt is printed to stderr.

    Returns:
        0 when a report was written; 1 for usage errors, unknown sessions,
        a failed comparison, and for the ``--rerun`` path.
    """
    store = TraceStore(args.trace_dir)
    fmt = getattr(args, "format", "text")
    tag = getattr(args, "tag", None)
    last = getattr(args, "last", 2)
    rerun = getattr(args, "rerun", False)
    # ``--model`` is intentionally not read here: it only applies to
    # ``--rerun``, which does not execute anything yet.

    # Resolve the two session IDs
    session_a: str | None = None
    session_b: str | None = None

    if tag:
        # A window smaller than 2 can never produce a pair.  Reject it up
        # front so the error does not misreport how many sessions matched.
        if last is not None and last < 2:
            sys.stderr.write(
                f"--last must be at least 2 to compare sessions, got {last}.\n"
            )
            return 1

        ids = _sessions_by_tag(store, tag, last=last)
        if len(ids) < 2:
            sys.stderr.write(
                f"Need at least 2 sessions tagged {tag!r}, found {len(ids)}.\n"
            )
            return 1
        session_a, session_b = ids[1], ids[0]  # older first, newer second

    else:
        raw_a = getattr(args, "session_id_a", None)
        raw_b = getattr(args, "session_id_b", None)

        if not raw_a:
            sys.stderr.write(
                "Usage: agent-strace compare <session-id-a> <session-id-b>\n"
                "       agent-strace compare <session-id> --rerun [--model MODEL]\n"
                "       agent-strace compare --tag TAG [--last N]\n"
            )
            return 1

        full_a = store.find_session(raw_a)
        if not full_a:
            sys.stderr.write(f"Session not found: {raw_a}\n")
            return 1
        session_a = full_a

        if rerun:
            # --rerun: surface the original prompt; running it is manual.
            prompt = _get_user_prompt(store, session_a)
            if not prompt:
                sys.stderr.write(
                    f"Session {session_a[:12]} has no stored user_prompt. "
                    "Cannot --rerun without a recorded prompt.\n"
                )
                return 1
            sys.stderr.write(
                "[compare] --rerun is not yet automated. "
                f"Original prompt for {session_a[:12]}:\n\n{prompt}\n\n"
                "Run the agent with this prompt, then compare the two sessions manually.\n"
            )
            # Non-zero because no comparison was actually produced.
            return 1

        if not raw_b:
            sys.stderr.write("Provide two session IDs or use --tag / --rerun.\n")
            return 1

        full_b = store.find_session(raw_b)
        if not full_b:
            sys.stderr.write(f"Session not found: {raw_b}\n")
            return 1
        session_b = full_b

    # Run comparison
    try:
        report = compare_sessions(store, session_a, session_b)
    except Exception as exc:
        sys.stderr.write(f"[compare] Failed: {exc}\n")
        return 1

    divergence = decision_divergence(store, session_a, session_b)

    if fmt == "json":
        sys.stdout.write(json.dumps(_report_to_dict(report, divergence), indent=2) + "\n")
    else:
        # Text: use existing format_compare, then append decision divergence
        format_compare(report, sys.stdout)
        sys.stdout.write(f"Decision divergence: {divergence} point(s)\n\n")

    return 0

0 commit comments

Comments
 (0)