feat(compare): add session regression testing command (closes #114)

Siddhant-K-code · ona-agent · Siddhant-K-code · commit c341be0190c0 · 2026-05-23T07:46:09.000Z
Wraps diff.compare_sessions with a first-class CLI for regression testing.

- compare <id-a> <id-b>: structured report with cost/duration/tool-call deltas
- --tag TAG: compare last N sessions matching the tag
- --rerun: surfaces the stored user_prompt with instructions (automation deferred)
- --format json: machine-readable output including decision_divergence score
- decision_divergence: Levenshtein distance on decision event text sequences
- 34 tests covering edit distance, decision extraction, tag lookup, CLI paths

Co-authored-by: Ona <no-reply@ona.com>
diff --git a/README.md b/README.md
@@ -303,6 +303,40 @@ Supported models: `sonnet` (default), `opus`, `haiku`, `gpt4`, `gpt4o`. Token co
 
 See [examples/session_analysis.md](examples/session_analysis.md) for a full walkthrough combining `import`, `explain`, and `cost`.
 
+### Session regression testing (compare)
+
+Compare two sessions structurally and get a verdict on whether agent behaviour improved or regressed. Useful when changing models, prompts, or tool implementations.
+
+```bash
+# Compare two existing sessions
+agent-strace compare <session-id-a> <session-id-b>
+
+# Compare the last 2 sessions with a given task tag
+agent-strace compare --tag refactor-auth
+
+# Machine-readable output
+agent-strace compare <session-id-a> <session-id-b> --format json
+```
+
+Example output:
+
+```
+Session Comparison
+─────────────────────────────────────────────────────────────────
+                                 a84664242afa    bf1207728ee6    change
+─────────────────────────────────────────────────────────────────
+  Duration                              18m 00s         12m 00s     -33%
+  Total cost                            $4.2300         $2.8700     -32%
+  Tool calls                                 14              11     -21%
+  Files modified                              2               2    (same)
+  Errors                                      0               0
+─────────────────────────────────────────────────────────────────
+Verdict: bf1207728ee6 was 32% cheaper, 33% faster
+Decision divergence:  2 point(s)
+```
+
+`decision divergence` is the edit distance between the two sessions' decision event sequences — no LLM call required. `--tag` selects the most recent sessions (up to `--last N`, default 2) whose `agent_name` or `command` contains the tag string, case-insensitively, and compares the two newest of them.
+
 ### Weekly spend digest (budget-report)
 
 Aggregate cost across sessions for a configurable time window. Shows total spend, top sessions, cost by tool, and savings from watchdog budget ceilings.
diff --git a/src/agent_trace/__init__.py b/src/agent_trace/__init__.py
@@ -1,3 +1,3 @@
 """agent-trace: strace for AI agents."""
 
-__version__ = "0.48.0"
+__version__ = "0.49.0"
diff --git a/src/agent_trace/cli.py b/src/agent_trace/cli.py
@@ -48,6 +48,7 @@
 from .anonymize import cmd_anonymize_export
 from .integrations import detect_and_instrument, _INTEGRATIONS
 from .budget_report import cmd_budget_report
+from .compare import cmd_compare
 from .lint import cmd_lint
 from .retention import cmd_retention
 from .sample import cmd_sample
@@ -882,6 +883,23 @@ def build_parser() -> argparse.ArgumentParser:
     p_sample.add_argument("--seed", type=int, default=None,
                           help="random seed for reproducible random sampling")
 
+    # compare
+    p_compare = sub.add_parser("compare", help="session-to-session regression report")
+    p_compare.add_argument("session_id_a", nargs="?",
+                           help="first session ID (baseline)")
+    p_compare.add_argument("session_id_b", nargs="?",
+                           help="second session ID (candidate)")
+    p_compare.add_argument("--rerun", action="store_true",
+                           help="re-run the original prompt and compare live")
+    p_compare.add_argument("--model", metavar="MODEL",
+                           help="model to use for --rerun")
+    p_compare.add_argument("--tag", metavar="TAG",
+                           help="compare the last N sessions matching this tag")
+    p_compare.add_argument("--last", type=int, default=2, metavar="N",
+                           help="number of tagged sessions to compare (default: 2)")
+    p_compare.add_argument("--format", choices=["text", "json"], default="text",
+                           dest="format", help="output format (default: text)")
+
     # budget-report
     p_budget = sub.add_parser("budget-report", help="weekly spend digest across sessions")
     p_budget.add_argument("--since", metavar="DATE",
@@ -1034,6 +1052,7 @@ def main() -> None:
         "mcp": cmd_mcp,
         "auto": cmd_auto,
         "budget-report": cmd_budget_report,
+        "compare": cmd_compare,
         "lint": cmd_lint,
         "retention": cmd_retention,
         "sample": cmd_sample,
diff --git a/src/agent_trace/compare.py b/src/agent_trace/compare.py
@@ -0,0 +1,229 @@
+"""Session-to-session regression testing: agent-strace compare.
+
+Wraps diff.compare_sessions with a first-class CLI workflow for regression
+testing: compare two sessions directly, compare the most recent sessions
+with a given tag, or print a session's original prompt so it can be re-run
+manually (automated --rerun is not implemented yet).
+
+Decision divergence is computed as edit distance on decision event text —
+no LLM call required.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from dataclasses import asdict
+from typing import TextIO
+
+from .diff import compare_sessions, format_compare, CompareReport
+from .models import EventType, SessionMeta
+from .store import TraceStore
+
+
+# ---------------------------------------------------------------------------
+# Decision divergence (edit distance on decision text)
+# ---------------------------------------------------------------------------
+
+def _decision_texts(store: TraceStore, session_id: str) -> list[str]:
+    """Collect the non-empty text of every DECISION event in a session.
+
+    For each decision event, the first truthy value found under the
+    ``text``, ``content`` or ``reasoning`` key of ``ev.data`` (checked in
+    that order) is converted with ``str`` and kept. Events with no truthy
+    value under any of those keys are skipped.
+
+    Returns an empty list when ``store.load_events`` raises, so a session
+    that cannot be loaded is indistinguishable from one without decisions.
+    """
+    try:
+        loaded = store.load_events(session_id)
+    except Exception:
+        return []
+
+    collected: list[str] = []
+    for event in loaded:
+        if event.event_type != EventType.DECISION:
+            continue
+        chosen = ""
+        # Same precedence as an ``or`` chain: stop at the first truthy value.
+        for key in ("text", "content", "reasoning"):
+            chosen = event.data.get(key)
+            if chosen:
+                break
+        if chosen:
+            collected.append(str(chosen))
+    return collected
+
+
+def _edit_distance(a: list[str], b: list[str]) -> int:
+    """Return the Levenshtein distance between two sequences of strings.
+
+    Each list element is treated as a single token: two elements either
+    match exactly or they do not. Insertions, deletions and substitutions
+    all cost 1. Only one row of the DP table is kept at a time.
+    """
+    if len(a) == 0:
+        return len(b)
+    if len(b) == 0:
+        return len(a)
+
+    # row[j] holds the distance between the processed prefix of ``a`` and b[:j].
+    row = list(range(len(b) + 1))
+    for i, item_a in enumerate(a, start=1):
+        diagonal = row[0]  # value from the previous row, previous column
+        row[0] = i
+        for j, item_b in enumerate(b, start=1):
+            above = row[j]  # value from the previous row, same column
+            if item_a == item_b:
+                row[j] = diagonal
+            else:
+                row[j] = 1 + min(diagonal, above, row[j - 1])
+            diagonal = above
+    return row[len(b)]
+
+
+def decision_divergence(store: TraceStore, session_a: str, session_b: str) -> int:
+    """Return the edit distance between two sessions' decision-text sequences.
+
+    The decision texts of each session are extracted with
+    ``_decision_texts`` and compared with ``_edit_distance``, where two
+    decisions match only if their text is exactly equal.
+    """
+    return _edit_distance(
+        _decision_texts(store, session_a),
+        _decision_texts(store, session_b),
+    )
+
+
+# ---------------------------------------------------------------------------
+# JSON serialisation of CompareReport
+# ---------------------------------------------------------------------------
+
+def _report_to_dict(report: CompareReport, divergence: int) -> dict:
+    """Flatten a compare report plus the divergence score into a plain dict.
+
+    The per-session attributes are copied under their own names,
+    ``divergence`` is stored as ``decision_divergence``, and each
+    ``(step, description_a, description_b)`` entry of
+    ``report.divergence_points`` becomes a small dict.
+    """
+    copied_fields = (
+        "session_a",
+        "session_b",
+        "label_a",
+        "label_b",
+        "duration_a",
+        "duration_b",
+        "cost_a",
+        "cost_b",
+        "tool_calls_a",
+        "tool_calls_b",
+        "files_modified_a",
+        "files_modified_b",
+        "errors_a",
+        "errors_b",
+        "redundant_reads_a",
+        "redundant_reads_b",
+    )
+    payload: dict = {}
+    for field_name in copied_fields:
+        payload[field_name] = getattr(report, field_name)
+
+    payload["decision_divergence"] = divergence
+
+    points = []
+    for step, da, db in report.divergence_points:
+        points.append({"step": step, "description_a": da, "description_b": db})
+    payload["divergence_points"] = points
+
+    payload["verdict"] = report.verdict
+    return payload
+
+
+# ---------------------------------------------------------------------------
+# Tag-based session lookup
+# ---------------------------------------------------------------------------
+
+def _sessions_by_tag(store: TraceStore, tag: str, last: int = 2) -> list[str]:
+    """Return up to ``last`` session IDs whose agent_name or command contains tag.
+
+    Matching is a case-insensitive substring test; a missing or empty
+    ``agent_name`` / ``command`` is treated as an empty string. IDs come
+    back in the order ``store.list_sessions()`` yields them.
+
+    A non-positive ``last`` returns an empty list.
+    """
+    if last <= 0:
+        # Without this guard a negative count would slice from the end
+        # (``matched[:-1]``) and return all but the last match.
+        return []
+
+    needle = tag.lower()  # hoisted: invariant across all sessions
+    matched = [
+        s for s in store.list_sessions()
+        if needle in (s.agent_name or "").lower()
+        or needle in (s.command or "").lower()
+    ]
+    # list_sessions returns newest-first
+    return [s.session_id for s in matched[:last]]
+
+
+# ---------------------------------------------------------------------------
+# --rerun support
+# ---------------------------------------------------------------------------
+
+def _get_user_prompt(store: TraceStore, session_id: str) -> str | None:
+    """Return the prompt stored on a session's first USER_PROMPT event.
+
+    Only the first USER_PROMPT event is inspected. Its ``content``,
+    ``text`` and ``prompt`` keys are checked in that order and the first
+    truthy value is returned. Returns None when the events cannot be
+    loaded, when there is no USER_PROMPT event, or when that event holds
+    no truthy value under those keys.
+    """
+    try:
+        loaded = store.load_events(session_id)
+    except Exception:
+        return None
+
+    first_prompt = next(
+        (event for event in loaded if event.event_type == EventType.USER_PROMPT),
+        None,
+    )
+    if first_prompt is None:
+        return None
+
+    for key in ("content", "text", "prompt"):
+        value = first_prompt.data.get(key)
+        if value:
+            return value
+    return None
+
+
+# ---------------------------------------------------------------------------
+# CLI handler
+# ---------------------------------------------------------------------------
+
+def cmd_compare(args: argparse.Namespace) -> int:
+    """Handle ``agent-strace compare`` and return a process exit code.
+
+    Session selection:
+
+    * ``--tag TAG``: look up matching sessions with ``_sessions_by_tag`` and
+      compare the two newest (older as A, newer as B). ``--last`` limits how
+      many matches are fetched and must be at least 2; only the first two
+      returned IDs are compared even when more are fetched.
+    * ``<id-a> <id-b>``: resolve both through ``store.find_session``.
+    * ``<id> --rerun``: print the session's stored user prompt with manual
+      instructions and exit with 1 (the re-run itself is not automated).
+
+    On success the report is written to stdout (text or JSON depending on
+    ``--format``) and 0 is returned. Every error path writes a message to
+    stderr and returns 1.
+    """
+    store = TraceStore(args.trace_dir)
+    fmt = getattr(args, "format", "text")
+    tag = getattr(args, "tag", None)
+    last = getattr(args, "last", 2)
+    rerun = getattr(args, "rerun", False)
+
+    # Resolve the two session IDs
+    session_a: str | None = None
+    session_b: str | None = None
+
+    if tag:
+        if last < 2:
+            # A comparison needs two sessions. Rejecting this up front avoids
+            # a misleading "found N" count, and avoids a negative value being
+            # used as an end-relative slice by the lookup.
+            sys.stderr.write(
+                f"--last must be at least 2 to compare sessions, got {last}.\n"
+            )
+            return 1
+        ids = _sessions_by_tag(store, tag, last=last)
+        if len(ids) < 2:
+            sys.stderr.write(
+                f"Need at least 2 sessions tagged {tag!r}, found {len(ids)}.\n"
+            )
+            return 1
+        session_a, session_b = ids[1], ids[0]  # older first, newer second
+
+    else:
+        raw_a = getattr(args, "session_id_a", None)
+        raw_b = getattr(args, "session_id_b", None)
+
+        if not raw_a:
+            sys.stderr.write(
+                "Usage: agent-strace compare <session-id-a> <session-id-b>\n"
+                "       agent-strace compare <session-id> --rerun [--model MODEL]\n"
+                "       agent-strace compare --tag TAG [--last N]\n"
+            )
+            return 1
+
+        full_a = store.find_session(raw_a)
+        if not full_a:
+            sys.stderr.write(f"Session not found: {raw_a}\n")
+            return 1
+        session_a = full_a
+
+        if rerun:
+            # --rerun: surface the original prompt; execution is still manual
+            prompt = _get_user_prompt(store, session_a)
+            if not prompt:
+                sys.stderr.write(
+                    f"Session {session_a[:12]} has no stored user_prompt. "
+                    "Cannot --rerun without a recorded prompt.\n"
+                )
+                return 1
+            sys.stderr.write(
+                f"[compare] --rerun is not yet automated. "
+                f"Original prompt for {session_a[:12]}:\n\n{prompt}\n\n"
+                "Run the agent with this prompt, then compare the two sessions manually.\n"
+            )
+            return 1
+
+        if not raw_b:
+            sys.stderr.write("Provide two session IDs or use --tag / --rerun.\n")
+            return 1
+
+        full_b = store.find_session(raw_b)
+        if not full_b:
+            sys.stderr.write(f"Session not found: {raw_b}\n")
+            return 1
+        session_b = full_b
+
+    # Run comparison
+    try:
+        report = compare_sessions(store, session_a, session_b)
+    except Exception as exc:
+        sys.stderr.write(f"[compare] Failed: {exc}\n")
+        return 1
+
+    divergence = decision_divergence(store, session_a, session_b)
+
+    if fmt == "json":
+        sys.stdout.write(json.dumps(_report_to_dict(report, divergence), indent=2) + "\n")
+    else:
+        # Text: use existing format_compare, then append decision divergence
+        format_compare(report, sys.stdout)
+        sys.stdout.write(f"Decision divergence:  {divergence} point(s)\n\n")
+
+    return 0
diff --git a/tests/test_compare.py b/tests/test_compare.py

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,3 @@`
`1`	`1`	`"""agent-trace: strace for AI agents."""`
`2`	`2`
`3`		`-__version__ = "0.48.0"`
	`3`	`+__version__ = "0.49.0"`