|
| 1 | +"""Session-to-session regression testing: agent-strace compare. |
| 2 | +
|
| 3 | +Wraps diff.compare_sessions with a first-class CLI workflow for regression |
| 4 | +testing: compare two sessions directly, compare the last N sessions with a |
| 5 | +given tag, or re-run a session's original prompt and compare live. |
| 6 | +
|
| 7 | +Decision divergence is computed as edit distance on decision event text — |
| 8 | +no LLM call required. |
| 9 | +""" |
| 10 | + |
| 11 | +from __future__ import annotations |
| 12 | + |
| 13 | +import argparse |
| 14 | +import json |
| 15 | +import sys |
| 16 | +from dataclasses import asdict |
| 17 | +from typing import TextIO |
| 18 | + |
| 19 | +from .diff import compare_sessions, format_compare, CompareReport |
| 20 | +from .models import EventType, SessionMeta |
| 21 | +from .store import TraceStore |
| 22 | + |
| 23 | + |
| 24 | +# --------------------------------------------------------------------------- |
| 25 | +# Decision divergence (edit distance on decision text) |
| 26 | +# --------------------------------------------------------------------------- |
| 27 | + |
def _decision_texts(store: TraceStore, session_id: str) -> list[str]:
    """Collect the text of every decision event recorded for a session.

    For each event whose type is ``EventType.DECISION`` the first truthy
    value among the ``"text"``, ``"content"`` and ``"reasoning"`` keys of
    ``ev.data`` is taken and converted with ``str``. Decision events with
    none of those keys set are skipped.

    Returns an empty list when ``store.load_events`` raises, so callers
    treat an unreadable session as having no decisions.
    """
    try:
        loaded = store.load_events(session_id)
    except Exception:
        # Best-effort: an unreadable session contributes no decision text.
        return []

    collected: list[str] = []
    for event in loaded:
        if event.event_type != EventType.DECISION:
            continue
        payload = event.data
        # Keys are tried in priority order; the first non-empty one wins.
        for key in ("text", "content", "reasoning"):
            value = payload.get(key)
            if value:
                collected.append(str(value))
                break
    return collected
| 46 | + |
| 47 | + |
| 48 | +def _edit_distance(a: list[str], b: list[str]) -> int: |
| 49 | + """Levenshtein distance between two lists of strings (token-level).""" |
| 50 | + m, n = len(a), len(b) |
| 51 | + if m == 0: |
| 52 | + return n |
| 53 | + if n == 0: |
| 54 | + return m |
| 55 | + dp = list(range(n + 1)) |
| 56 | + for i in range(1, m + 1): |
| 57 | + prev = dp[0] |
| 58 | + dp[0] = i |
| 59 | + for j in range(1, n + 1): |
| 60 | + temp = dp[j] |
| 61 | + if a[i - 1] == b[j - 1]: |
| 62 | + dp[j] = prev |
| 63 | + else: |
| 64 | + dp[j] = 1 + min(prev, dp[j], dp[j - 1]) |
| 65 | + prev = temp |
| 66 | + return dp[n] |
| 67 | + |
| 68 | + |
def decision_divergence(store: TraceStore, session_a: str, session_b: str) -> int:
    """Return the edit distance between two sessions' decision texts.

    Each session's decision events are reduced to a list of strings and
    the two lists are compared element-wise by exact equality, so the
    result counts the decision insertions, deletions and replacements
    needed to turn one sequence into the other.
    """
    return _edit_distance(
        _decision_texts(store, session_a),
        _decision_texts(store, session_b),
    )
| 74 | + |
| 75 | + |
| 76 | +# --------------------------------------------------------------------------- |
| 77 | +# JSON serialisation of CompareReport |
| 78 | +# --------------------------------------------------------------------------- |
| 79 | + |
| 80 | +def _report_to_dict(report: CompareReport, divergence: int) -> dict: |
| 81 | + return { |
| 82 | + "session_a": report.session_a, |
| 83 | + "session_b": report.session_b, |
| 84 | + "label_a": report.label_a, |
| 85 | + "label_b": report.label_b, |
| 86 | + "duration_a": report.duration_a, |
| 87 | + "duration_b": report.duration_b, |
| 88 | + "cost_a": report.cost_a, |
| 89 | + "cost_b": report.cost_b, |
| 90 | + "tool_calls_a": report.tool_calls_a, |
| 91 | + "tool_calls_b": report.tool_calls_b, |
| 92 | + "files_modified_a": report.files_modified_a, |
| 93 | + "files_modified_b": report.files_modified_b, |
| 94 | + "errors_a": report.errors_a, |
| 95 | + "errors_b": report.errors_b, |
| 96 | + "redundant_reads_a": report.redundant_reads_a, |
| 97 | + "redundant_reads_b": report.redundant_reads_b, |
| 98 | + "decision_divergence": divergence, |
| 99 | + "divergence_points": [ |
| 100 | + {"step": step, "description_a": da, "description_b": db} |
| 101 | + for step, da, db in report.divergence_points |
| 102 | + ], |
| 103 | + "verdict": report.verdict, |
| 104 | + } |
| 105 | + |
| 106 | + |
| 107 | +# --------------------------------------------------------------------------- |
| 108 | +# Tag-based session lookup |
| 109 | +# --------------------------------------------------------------------------- |
| 110 | + |
| 111 | +def _sessions_by_tag(store: TraceStore, tag: str, last: int = 2) -> list[str]: |
| 112 | + """Return the last N session IDs whose agent_name or command contains tag.""" |
| 113 | + all_sessions = store.list_sessions() |
| 114 | + matched = [ |
| 115 | + s for s in all_sessions |
| 116 | + if tag.lower() in (s.agent_name or "").lower() |
| 117 | + or tag.lower() in (s.command or "").lower() |
| 118 | + ] |
| 119 | + # list_sessions returns newest-first |
| 120 | + return [s.session_id for s in matched[:last]] |
| 121 | + |
| 122 | + |
| 123 | +# --------------------------------------------------------------------------- |
| 124 | +# --rerun support |
| 125 | +# --------------------------------------------------------------------------- |
| 126 | + |
def _get_user_prompt(store: TraceStore, session_id: str) -> str | None:
    """Return the prompt stored on a session's first user-prompt event.

    Only the first event of type ``EventType.USER_PROMPT`` is inspected;
    its ``"content"``, ``"text"`` and ``"prompt"`` keys are tried in that
    order and the first truthy value is returned.

    Returns ``None`` when the events cannot be loaded, when there is no
    user-prompt event, or when that event carries none of those keys.
    """
    try:
        loaded = store.load_events(session_id)
    except Exception:
        return None

    first_prompt = next(
        (ev for ev in loaded if ev.event_type == EventType.USER_PROMPT),
        None,
    )
    if first_prompt is None:
        return None

    payload = first_prompt.data
    for key in ("content", "text", "prompt"):
        value = payload.get(key)
        if value:
            return value
    return None
| 142 | + |
| 143 | + |
| 144 | +# --------------------------------------------------------------------------- |
| 145 | +# CLI handler |
| 146 | +# --------------------------------------------------------------------------- |
| 147 | + |
def _resolve_session_pair(
    store: TraceStore, args: argparse.Namespace
) -> tuple[str, str] | None:
    """Work out which two sessions ``cmd_compare`` should compare.

    Returns ``(session_a, session_b)`` as resolved session IDs, or ``None``
    after writing an explanatory message to stderr when no pair can be
    produced (too few tagged sessions, missing or unknown ID, or ``--rerun``).
    """
    tag = getattr(args, "tag", None)
    if tag:
        ids = _sessions_by_tag(store, tag, last=getattr(args, "last", 2))
        if len(ids) < 2:
            sys.stderr.write(
                f"Need at least 2 sessions tagged {tag!r}, found {len(ids)}.\n"
            )
            return None
        return ids[1], ids[0]  # older first, newer second

    raw_a = getattr(args, "session_id_a", None)
    raw_b = getattr(args, "session_id_b", None)

    if not raw_a:
        sys.stderr.write(
            "Usage: agent-strace compare <session-id-a> <session-id-b>\n"
            " agent-strace compare <session-id> --rerun [--model MODEL]\n"
            " agent-strace compare --tag TAG [--last N]\n"
        )
        return None

    session_a = store.find_session(raw_a)
    if not session_a:
        sys.stderr.write(f"Session not found: {raw_a}\n")
        return None

    if getattr(args, "rerun", False):
        # --rerun is not automated: show the recorded prompt (if any) and
        # stop, so the user can run the agent and compare manually.
        prompt = _get_user_prompt(store, session_a)
        if not prompt:
            sys.stderr.write(
                f"Session {session_a[:12]} has no stored user_prompt. "
                "Cannot --rerun without a recorded prompt.\n"
            )
            return None
        sys.stderr.write(
            "[compare] --rerun is not yet automated. "
            f"Original prompt for {session_a[:12]}:\n\n{prompt}\n\n"
            "Run the agent with this prompt, then compare the two sessions manually.\n"
        )
        return None

    if not raw_b:
        sys.stderr.write("Provide two session IDs or use --tag / --rerun.\n")
        return None

    session_b = store.find_session(raw_b)
    if not session_b:
        sys.stderr.write(f"Session not found: {raw_b}\n")
        return None

    return session_a, session_b


def cmd_compare(args: argparse.Namespace) -> int:
    """CLI handler for ``agent-strace compare``.

    Resolves two sessions (from ``--tag``, or from the positional session
    IDs), runs ``compare_sessions`` on them, computes the decision
    divergence, and writes the result to stdout as JSON when
    ``args.format == "json"`` and as text otherwise.

    Args:
        args: Parsed arguments. ``trace_dir`` is required; ``format``,
            ``tag``, ``last``, ``rerun``, ``session_id_a`` and
            ``session_id_b`` are read with defaults when absent.

    Returns:
        ``0`` on success; ``1`` when the sessions cannot be resolved, when
        ``--rerun`` is requested, or when the comparison raises.
    """
    store = TraceStore(args.trace_dir)

    pair = _resolve_session_pair(store, args)
    if pair is None:
        return 1
    session_a, session_b = pair

    # Run comparison
    try:
        report = compare_sessions(store, session_a, session_b)
    except Exception as exc:
        sys.stderr.write(f"[compare] Failed: {exc}\n")
        return 1

    divergence = decision_divergence(store, session_a, session_b)

    if getattr(args, "format", "text") == "json":
        sys.stdout.write(
            json.dumps(_report_to_dict(report, divergence), indent=2) + "\n"
        )
    else:
        # Text: use existing format_compare, then append decision divergence
        format_compare(report, sys.stdout)
        sys.stdout.write(f"Decision divergence: {divergence} point(s)\n\n")

    return 0
0 commit comments