Skip to content

Commit 13a916d

Browse files
Siddhant-K-codeOnaona-agent
authored
feat: v0.22.0 — semantic session diff (compare outcomes between two runs) (#38)
diff.py gains a --semantic mode that compares two sessions at the outcome level: cost, duration, errors, retries, files read/written, commands run, and optional eval scores. It reports which files and commands were unique to each session and gives a verdict (A is better / B is better / inconclusive) based on errors, cost, duration, and retries.

CLI: agent-strace diff <session-a> <session-b> --semantic [--eval-config]

Closes #28

Co-authored-by: Ona <ona@gitpod.io>
Co-authored-by: Ona <no-reply@ona.com>
1 parent 5ac2eda commit 13a916d

2 files changed

Lines changed: 338 additions & 5 deletions

File tree

src/agent_trace/diff.py

Lines changed: 209 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
1-
"""Session diff: structural behavioral comparison of two sessions.
1+
"""Session diff: structural and semantic comparison of two sessions.
22
3-
Compares two sessions by their phase structure (from explain), finds the
4-
divergence point, and reports differences in files touched, commands run,
5-
outcomes, duration, and cost.
3+
Two modes:
4+
- Structural (default): compares phase structure, divergence point, files/commands per phase.
5+
- Semantic (--semantic): compares outcome-level metrics — files touched, commands run,
6+
cost, duration, errors, and eval scores.
67
"""
78

89
from __future__ import annotations
910

1011
import argparse
12+
import json
1113
import sys
12-
from dataclasses import dataclass
14+
from dataclasses import dataclass, field
1315
from typing import TextIO
1416

1517
from .explain import Phase, build_phases, explain_session
@@ -228,6 +230,202 @@ def format_diff(result: SessionDiff, out: TextIO = sys.stdout) -> None:
228230
f"{result.retries_b} retries\n\n")
229231

230232

233+
# ---------------------------------------------------------------------------
234+
# Semantic diff
235+
# ---------------------------------------------------------------------------
236+
237+
@dataclass
class SemanticDiffReport:
    """Outcome-level comparison of two sessions.

    Every ``*_a`` / ``*_b`` pair holds the same measurement for the first
    and second session. The ``*_both`` / ``*_a_only`` / ``*_b_only`` lists
    partition a set of items (files or commands) by which session(s) they
    appeared in.
    """

    # Identifiers of the two sessions being compared.
    session_a: str
    session_b: str

    # Scalar metrics, one value per session (required, positional order matters).
    duration_a: float
    duration_b: float
    cost_a: float
    cost_b: float
    errors_a: int
    errors_b: int
    tool_calls_a: int
    tool_calls_b: int
    llm_requests_a: int
    llm_requests_b: int
    retries_a: int
    retries_b: int

    # Files read: shared by both sessions, then unique to each.
    files_read_both: list[str] = field(default_factory=list)
    files_read_a_only: list[str] = field(default_factory=list)
    files_read_b_only: list[str] = field(default_factory=list)

    # Files written: shared by both sessions, then unique to each.
    files_written_both: list[str] = field(default_factory=list)
    files_written_a_only: list[str] = field(default_factory=list)
    files_written_b_only: list[str] = field(default_factory=list)

    # Commands run: shared by both sessions, then unique to each.
    cmds_both: list[str] = field(default_factory=list)
    cmds_a_only: list[str] = field(default_factory=list)
    cmds_b_only: list[str] = field(default_factory=list)

    # Optional eval scores keyed by scorer name; empty when no evals ran.
    eval_scores_a: dict = field(default_factory=dict)
    eval_scores_b: dict = field(default_factory=dict)

    # One of "A is better", "B is better", "inconclusive" (empty until set).
    verdict: str = ""
270+
271+
272+
def semantic_diff(
    store: TraceStore,
    session_a: str,
    session_b: str,
    eval_config: str = ".agent-evals.yaml",
) -> SemanticDiffReport:
    """Compare two sessions at the outcome level.

    Args:
        store: Trace store that holds both sessions.
        session_a: ID of the first ("A") session.
        session_b: ID of the second ("B") session.
        eval_config: Path to an eval config file. Eval scores are collected
            only when this path exists on disk.

    Returns:
        A ``SemanticDiffReport`` with per-session metrics, the files and
        commands shared by / unique to each session, optional eval scores,
        and a verdict.
    """
    import os

    from .cost import estimate_cost

    result_a = explain_session(store, session_a)
    result_b = explain_session(store, session_b)
    meta_a = store.load_meta(session_a)
    meta_b = store.load_meta(session_b)

    def _estimate(session_id: str) -> float | None:
        """Return the estimated cost of a session, or None if it is unavailable."""
        try:
            return estimate_cost(store, session_id).total_cost
        except Exception:
            # Best-effort: a failed estimate must not abort the whole diff.
            return None

    known_cost_a = _estimate(session_a)
    known_cost_b = _estimate(session_b)
    # The report still shows 0.0 for an unavailable cost (unchanged output),
    # but the verdict below only uses cost when both estimates are real.
    cost_a = 0.0 if known_cost_a is None else known_cost_a
    cost_b = 0.0 if known_cost_b is None else known_cost_b

    def _collect(result) -> tuple[set[str], set[str], set[str]]:
        """Union files read, files written, and commands across all phases."""
        reads: set[str] = set()
        writes: set[str] = set()
        cmds: set[str] = set()
        for phase in result.phases:
            reads.update(phase.files_read)
            writes.update(phase.files_written)
            cmds.update(phase.commands)
        return reads, writes, cmds

    reads_a, writes_a, cmds_a = _collect(result_a)
    reads_b, writes_b, cmds_b = _collect(result_b)

    # Eval scores are optional: skipped when the config file is missing, and
    # any failure while importing or running evals leaves the scores as-is.
    eval_a: dict = {}
    eval_b: dict = {}
    try:
        from .eval import run_evals

        if os.path.exists(eval_config):
            eval_a = {r.scorer_name: r.score for r in run_evals(store, session_a, eval_config)}
            eval_b = {r.scorer_name: r.score for r in run_evals(store, session_b, eval_config)}
    except Exception:
        pass

    # Verdict: every compared metric is "lower is better". A session wins the
    # verdict only if it is better on at least one metric and worse on none.
    compared = [
        (meta_a.errors, meta_b.errors),
        (result_a.total_duration, result_b.total_duration),
        (result_a.total_retries, result_b.total_retries),
    ]
    if known_cost_a is not None and known_cost_b is not None:
        # Skip cost when either estimate failed; otherwise the 0.0 fallback
        # would make the session with the failed estimate look cheaper.
        compared.append((known_cost_a, known_cost_b))

    a_wins = sum(1 for va, vb in compared if va < vb)
    b_wins = sum(1 for va, vb in compared if vb < va)
    if b_wins > 0 and a_wins == 0:
        verdict = "B is better"
    elif a_wins > 0 and b_wins == 0:
        verdict = "A is better"
    else:
        verdict = "inconclusive"

    return SemanticDiffReport(
        session_a=session_a,
        session_b=session_b,
        duration_a=result_a.total_duration,
        duration_b=result_b.total_duration,
        cost_a=cost_a,
        cost_b=cost_b,
        errors_a=meta_a.errors,
        errors_b=meta_b.errors,
        tool_calls_a=meta_a.tool_calls,
        tool_calls_b=meta_b.tool_calls,
        llm_requests_a=meta_a.llm_requests,
        llm_requests_b=meta_b.llm_requests,
        retries_a=result_a.total_retries,
        retries_b=result_b.total_retries,
        files_read_both=sorted(reads_a & reads_b),
        files_read_a_only=sorted(reads_a - reads_b),
        files_read_b_only=sorted(reads_b - reads_a),
        files_written_both=sorted(writes_a & writes_b),
        files_written_a_only=sorted(writes_a - writes_b),
        files_written_b_only=sorted(writes_b - writes_a),
        cmds_both=sorted(cmds_a & cmds_b),
        cmds_a_only=sorted(cmds_a - cmds_b),
        cmds_b_only=sorted(cmds_b - cmds_a),
        eval_scores_a=eval_a,
        eval_scores_b=eval_b,
        verdict=verdict,
    )
373+
374+
375+
def _pct_change(a: float, b: float) -> str:
376+
if a == 0:
377+
return "n/a"
378+
pct = (b - a) / a * 100
379+
sign = "+" if pct > 0 else ""
380+
return f"{sign}{pct:.0f}%"
381+
382+
383+
def format_semantic_diff(report: SemanticDiffReport, out: TextIO = sys.stdout) -> None:
    """Write a human-readable table for *report* to *out*.

    The output has a metrics table (value for A, value for B, percent
    change), the files/commands shared by or unique to each session, any
    eval scores, and the verdict.

    Args:
        report: The semantic diff to render.
        out: Text stream to write to.
    """
    w = out.write
    rule = "─" * 69 + "\n"
    a = report.session_a[:12]
    b = report.session_b[:12]

    w(f"\nSemantic diff: {a} vs {b}\n")
    w(rule)
    w(f" {'':30} {'Session A':>12} {'Session B':>12} {'Change':>8}\n")
    w(rule)

    def _row(label: str, va, vb, fmt=str) -> None:
        """Write one metric row; *fmt* renders the raw values for display."""
        # Percent change is computed from the raw numbers, never from the
        # formatted text, so every numeric row gets a Change value.
        numeric = isinstance(va, (int, float)) and isinstance(vb, (int, float))
        change = _pct_change(float(va), float(vb)) if numeric else ""
        w(f" {label:<30} {fmt(va):>12} {fmt(vb):>12} {change:>8}\n")

    _row("Duration", report.duration_a, report.duration_b, fmt=_fmt_duration)
    _row("Cost", report.cost_a, report.cost_b, fmt=lambda v: f"${v:.4f}")
    _row("Errors", report.errors_a, report.errors_b)
    _row("Tool calls", report.tool_calls_a, report.tool_calls_b)
    _row("LLM requests", report.llm_requests_a, report.llm_requests_b)
    _row("Retries", report.retries_a, report.retries_b)
    w(rule)

    shown = 3  # at most this many entries are listed per category

    def _file_rows(label: str, both: list, a_only: list, b_only: list) -> None:
        """Write shared and per-session entries, truncated to *shown* items."""
        if both:
            w(f" {label} (both) {', '.join(both[:shown])}{'...' if len(both) > shown else ''}\n")
        for tag, items in (("A only", a_only), ("B only", b_only)):
            for item in items[:shown]:
                w(f" {label} ({tag}) {item}\n")
            if len(items) > shown:
                # Make the truncation visible instead of silently dropping entries.
                w(f" {label} ({tag}) ... (+{len(items) - shown} more)\n")

    _file_rows("Files read", report.files_read_both, report.files_read_a_only, report.files_read_b_only)
    _file_rows("Files written", report.files_written_both, report.files_written_a_only, report.files_written_b_only)
    _file_rows("Commands", report.cmds_both, report.cmds_a_only, report.cmds_b_only)

    if report.eval_scores_a or report.eval_scores_b:
        w(rule)
        all_scorers = sorted(set(report.eval_scores_a) | set(report.eval_scores_b))
        for scorer in all_scorers:
            # A scorer present for only one session shows "n/a" for the other.
            sa = report.eval_scores_a.get(scorer, "n/a")
            sb = report.eval_scores_b.get(scorer, "n/a")
            w(f" Eval {scorer:<25} {str(sa):>12} {str(sb):>12}\n")

    w(rule)
    w(f" Verdict: {report.verdict}\n\n")
427+
428+
231429
# ---------------------------------------------------------------------------
232430
# CLI handler
233431
# ---------------------------------------------------------------------------
@@ -245,6 +443,12 @@ def cmd_diff(args: argparse.Namespace) -> int:
245443
sys.stderr.write(f"Session not found: {args.session_b}\n")
246444
return 1
247445

446+
if getattr(args, "semantic", False):
447+
eval_config = getattr(args, "eval_config", ".agent-evals.yaml") or ".agent-evals.yaml"
448+
report = semantic_diff(store, id_a, id_b, eval_config=eval_config)
449+
format_semantic_diff(report)
450+
return 0
451+
248452
result = diff_sessions(store, id_a, id_b)
249453
format_diff(result)
250454
return 0

tests/test_semantic_diff.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""Tests for semantic session diff (issue #28)."""
2+
3+
import os
4+
import sys
5+
import tempfile
6+
import unittest
7+
import io
8+
9+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
10+
11+
from agent_trace.diff import (
12+
SemanticDiffReport,
13+
format_semantic_diff,
14+
semantic_diff,
15+
)
16+
from agent_trace.models import EventType, SessionMeta, TraceEvent
17+
from agent_trace.store import TraceStore
18+
19+
20+
def _make_session(store, tool_calls=2, errors=0, tokens=1000, files_read=None, files_written=None, cmds=None):
    """Create a session in *store* with synthetic tool-call events and return its ID.

    One TOOL_CALL event is appended per entry, in this order: a "read" call
    for each path in *files_read*, a "write" call for each path in
    *files_written*, then a "bash" call for each command in *cmds*.
    """
    meta = SessionMeta(
        agent_name="test",
        tool_calls=tool_calls,
        errors=errors,
        total_tokens=tokens,
        total_duration_ms=5000,
    )
    store.create_session(meta)
    sid = meta.session_id

    # (tool name, argument key, values) for each kind of synthetic event.
    planned = (
        ("read", "file_path", files_read),
        ("write", "file_path", files_written),
        ("bash", "command", cmds),
    )
    for tool_name, arg_key, values in planned:
        for value in values or []:
            event = TraceEvent(
                event_type=EventType.TOOL_CALL,
                session_id=sid,
                data={"tool_name": tool_name, "arguments": {arg_key: value}},
            )
            store.append_event(sid, event)

    return sid
55+
56+
57+
class TestSemanticDiff(unittest.TestCase):
    """Checks the metrics, file/command sets, and verdict from ``semantic_diff``."""

    def setUp(self):
        # Use a self-cleaning temp dir so each test leaves nothing on disk.
        tmp = tempfile.TemporaryDirectory()
        self.addCleanup(tmp.cleanup)
        self.tmpdir = tmp.name
        self.store = TraceStore(self.tmpdir)

    def test_basic_diff(self):
        sid_a = _make_session(self.store, errors=0, tokens=1000)
        sid_b = _make_session(self.store, errors=2, tokens=2000)
        report = semantic_diff(self.store, sid_a, sid_b)
        self.assertEqual(report.errors_a, 0)
        self.assertEqual(report.errors_b, 2)

    def test_verdict_b_better(self):
        sid_a = _make_session(self.store, errors=3, tokens=5000)
        sid_b = _make_session(self.store, errors=0, tokens=1000)
        report = semantic_diff(self.store, sid_a, sid_b)
        self.assertEqual(report.verdict, "B is better")

    def test_verdict_a_better(self):
        sid_a = _make_session(self.store, errors=0, tokens=1000)
        sid_b = _make_session(self.store, errors=3, tokens=5000)
        report = semantic_diff(self.store, sid_a, sid_b)
        self.assertEqual(report.verdict, "A is better")

    def test_file_sets(self):
        sid_a = _make_session(self.store, files_read=["src/a.py", "src/b.py"])
        sid_b = _make_session(self.store, files_read=["src/b.py", "src/c.py"])
        report = semantic_diff(self.store, sid_a, sid_b)
        self.assertIn("src/b.py", report.files_read_both)
        self.assertIn("src/a.py", report.files_read_a_only)
        self.assertIn("src/c.py", report.files_read_b_only)

    def test_command_sets(self):
        sid_a = _make_session(self.store, cmds=["pytest", "make build"])
        sid_b = _make_session(self.store, cmds=["pytest", "make test"])
        report = semantic_diff(self.store, sid_a, sid_b)
        self.assertIn("pytest", report.cmds_both)
        self.assertIn("make build", report.cmds_a_only)
        self.assertIn("make test", report.cmds_b_only)

    def test_identical_sessions_inconclusive(self):
        sid_a = _make_session(self.store, errors=0, tokens=1000)
        sid_b = _make_session(self.store, errors=0, tokens=1000)
        report = semantic_diff(self.store, sid_a, sid_b)
        self.assertEqual(report.verdict, "inconclusive")
102+
103+
104+
class TestFormatSemanticDiff(unittest.TestCase):
    """Checks that ``format_semantic_diff`` renders a report without crashing."""

    def setUp(self):
        # Use a self-cleaning temp dir so each test leaves nothing on disk.
        tmp = tempfile.TemporaryDirectory()
        self.addCleanup(tmp.cleanup)
        self.store = TraceStore(tmp.name)

    def _render(self, sid_a, sid_b):
        """Run the semantic diff for two sessions and return the formatted text."""
        report = semantic_diff(self.store, sid_a, sid_b)
        buf = io.StringIO()
        format_semantic_diff(report, out=buf)
        return buf.getvalue()

    def test_format_no_crash(self):
        sid_a = _make_session(self.store, errors=1, tokens=2000)
        sid_b = _make_session(self.store, errors=0, tokens=1000)
        output = self._render(sid_a, sid_b)
        self.assertIn("Semantic diff", output)
        self.assertIn("Verdict", output)

    def test_format_shows_verdict(self):
        sid_a = _make_session(self.store, errors=5)
        sid_b = _make_session(self.store, errors=0)
        self.assertIn("B is better", self._render(sid_a, sid_b))
126+
127+
128+
if __name__ == "__main__":
129+
unittest.main()

0 commit comments

Comments
 (0)