1- """Session diff: structural behavioral comparison of two sessions.
1+ """Session diff: structural and semantic comparison of two sessions.
22
3- Compares two sessions by their phase structure (from explain), finds the
4- divergence point, and reports differences in files touched, commands run,
5- outcomes, duration, and cost.
3+ Two modes:
4+ - Structural (default): compares phase structure, divergence point, files/commands per phase.
5+ - Semantic (--semantic): compares outcome-level metrics — files touched, commands run,
6+ cost, duration, errors, and eval scores.
67"""
78
89from __future__ import annotations
910
1011import argparse
12+ import json
1113import sys
12- from dataclasses import dataclass
14+ from dataclasses import dataclass , field
1315from typing import TextIO
1416
1517from .explain import Phase , build_phases , explain_session
@@ -228,6 +230,202 @@ def format_diff(result: SessionDiff, out: TextIO = sys.stdout) -> None:
228230 f"{ result .retries_b } retries\n \n " )
229231
230232
233+ # ---------------------------------------------------------------------------
234+ # Semantic diff
235+ # ---------------------------------------------------------------------------
236+
@dataclass
class SemanticDiffReport:
    """Outcome-level comparison of two sessions, labelled A and B.

    The paired ``*_a`` / ``*_b`` fields hold the same metric for each
    session. The list fields partition the files read, files written and
    commands of the two sessions into "both", "A only" and "B only".
    Field order is part of the interface: the fourteen required fields come
    first so the report can be built positionally.
    """

    # Identifiers of the two sessions being compared.
    session_a: str
    session_b: str

    # Scalar metrics, one value per session.
    duration_a: float
    duration_b: float
    cost_a: float
    cost_b: float
    errors_a: int
    errors_b: int
    tool_calls_a: int
    tool_calls_b: int
    llm_requests_a: int
    llm_requests_b: int
    retries_a: int
    retries_b: int

    # Files read: shared between sessions, then exclusive to each.
    files_read_both: list[str] = field(default_factory=list)
    files_read_a_only: list[str] = field(default_factory=list)
    files_read_b_only: list[str] = field(default_factory=list)

    # Files written: shared between sessions, then exclusive to each.
    files_written_both: list[str] = field(default_factory=list)
    files_written_a_only: list[str] = field(default_factory=list)
    files_written_b_only: list[str] = field(default_factory=list)

    # Commands: shared between sessions, then exclusive to each.
    cmds_both: list[str] = field(default_factory=list)
    cmds_a_only: list[str] = field(default_factory=list)
    cmds_b_only: list[str] = field(default_factory=list)

    # Optional eval scores keyed by scorer name; empty when no evals ran.
    eval_scores_a: dict = field(default_factory=dict)
    eval_scores_b: dict = field(default_factory=dict)

    # One of "A is better", "B is better" or "inconclusive".
    verdict: str = ""
270+
271+
def semantic_diff(
    store: TraceStore,
    session_a: str,
    session_b: str,
    eval_config: str = ".agent-evals.yaml",
) -> SemanticDiffReport:
    """Compare two sessions at the outcome level.

    Gathers per-session metrics (duration, cost, errors, tool calls, LLM
    requests, retries), the sets of files read/written and commands run
    across all phases, and, when ``eval_config`` exists on disk, the eval
    scores of both sessions.

    Args:
        store: Trace store passed to ``explain_session``, ``load_meta``,
            ``estimate_cost`` and ``run_evals``.
        session_a: Identifier of the first (baseline) session.
        session_b: Identifier of the second session.
        eval_config: Path of the eval configuration; evals are skipped when
            the file does not exist.

    Returns:
        A populated ``SemanticDiffReport``. The verdict is "A is better" or
        "B is better" only when that session wins at least one of the
        compared metrics (errors, cost, duration, retries; lower is better)
        and loses none; otherwise it is "inconclusive". Eval scores are
        reported but do not influence the verdict.
    """
    import logging
    import os

    from .cost import estimate_cost

    logger = logging.getLogger(__name__)

    result_a = explain_session(store, session_a)
    result_b = explain_session(store, session_b)
    meta_a = store.load_meta(session_a)
    meta_b = store.load_meta(session_b)

    def _cost_of(session_id: str) -> tuple[float, bool]:
        """Return (cost, succeeded); cost is 0.0 when estimation failed."""
        try:
            return estimate_cost(store, session_id).total_cost, True
        except Exception:
            # Best effort: the report still renders, but record the failure
            # instead of swallowing it silently.
            logger.debug("cost estimate failed for %s", session_id, exc_info=True)
            return 0.0, False

    cost_a, cost_a_ok = _cost_of(session_a)
    cost_b, cost_b_ok = _cost_of(session_b)

    def _collect(result) -> tuple[set[str], set[str], set[str]]:
        """Union files read, files written and commands over all phases."""
        reads: set[str] = set()
        writes: set[str] = set()
        cmds: set[str] = set()
        for phase in result.phases:
            reads.update(phase.files_read)
            writes.update(phase.files_written)
            cmds.update(phase.commands)
        return reads, writes, cmds

    reads_a, writes_a, cmds_a = _collect(result_a)
    reads_b, writes_b, cmds_b = _collect(result_b)

    # Eval scores are optional: any failure leaves whatever was gathered so
    # far in place and the comparison continues without the rest.
    eval_a: dict = {}
    eval_b: dict = {}
    try:
        from .eval import run_evals

        if os.path.exists(eval_config):
            eval_a = {r.scorer_name: r.score for r in run_evals(store, session_a, eval_config)}
            eval_b = {r.scorer_name: r.score for r in run_evals(store, session_b, eval_config)}
    except Exception:
        logger.debug("eval scoring failed for %s", eval_config, exc_info=True)

    def _verdict() -> str:
        """Pick a winner only when one session is never worse than the other."""
        # All compared metrics are lower-is-better.
        pairs = [
            (meta_a.errors, meta_b.errors),
            (result_a.total_duration, result_b.total_duration),
            (result_a.total_retries, result_b.total_retries),
        ]
        # A failed estimate is reported as 0.0; comparing it would award the
        # cost metric to the session whose estimate failed, so only compare
        # costs when both estimates are real.
        if cost_a_ok and cost_b_ok:
            pairs.append((cost_a, cost_b))

        a_wins = sum(1 for va, vb in pairs if va < vb)
        b_wins = sum(1 for va, vb in pairs if vb < va)
        if b_wins > 0 and a_wins == 0:
            return "B is better"
        if a_wins > 0 and b_wins == 0:
            return "A is better"
        return "inconclusive"

    return SemanticDiffReport(
        session_a=session_a,
        session_b=session_b,
        duration_a=result_a.total_duration,
        duration_b=result_b.total_duration,
        cost_a=cost_a,
        cost_b=cost_b,
        errors_a=meta_a.errors,
        errors_b=meta_b.errors,
        tool_calls_a=meta_a.tool_calls,
        tool_calls_b=meta_b.tool_calls,
        llm_requests_a=meta_a.llm_requests,
        llm_requests_b=meta_b.llm_requests,
        retries_a=result_a.total_retries,
        retries_b=result_b.total_retries,
        files_read_both=sorted(reads_a & reads_b),
        files_read_a_only=sorted(reads_a - reads_b),
        files_read_b_only=sorted(reads_b - reads_a),
        files_written_both=sorted(writes_a & writes_b),
        files_written_a_only=sorted(writes_a - writes_b),
        files_written_b_only=sorted(writes_b - writes_a),
        cmds_both=sorted(cmds_a & cmds_b),
        cmds_a_only=sorted(cmds_a - cmds_b),
        cmds_b_only=sorted(cmds_b - cmds_a),
        eval_scores_a=eval_a,
        eval_scores_b=eval_b,
        verdict=_verdict(),
    )
373+
374+
375+ def _pct_change (a : float , b : float ) -> str :
376+ if a == 0 :
377+ return "n/a"
378+ pct = (b - a ) / a * 100
379+ sign = "+" if pct > 0 else ""
380+ return f"{ sign } { pct :.0f} %"
381+
382+
def format_semantic_diff(report: SemanticDiffReport, out: TextIO = sys.stdout) -> None:
    """Write a plain-text rendering of *report* to *out*.

    The output has a header naming both sessions (ids cut to 12 characters),
    a metrics table with one column per session plus the percent change of
    B relative to A, the file and command set differences, the eval scores
    when either session has any, and finally the verdict.

    Args:
        report: The semantic comparison to render.
        out: Text stream to write to.
    """
    w = out.write
    rule = "─" * 69 + "\n"
    max_listed = 3  # entries shown per list before the rest is summarised
    a = report.session_a[:12]
    b = report.session_b[:12]

    w(f"\nSemantic diff: {a} vs {b}\n")
    w(rule)
    w(f"  {'':30} {'Session A':>12} {'Session B':>12} {'Change':>8}\n")
    w(rule)

    def _row(label: str, va, vb, fmt=str) -> None:
        """Write one metric row; *fmt* renders each raw value for display."""
        # The change is computed from the raw numbers, so callers pass
        # unformatted values and let *fmt* handle presentation.
        numeric = isinstance(va, (int, float)) and isinstance(vb, (int, float))
        change = _pct_change(float(va), float(vb)) if numeric else ""
        w(f"  {label:<30} {str(fmt(va)):>12} {str(fmt(vb)):>12} {change:>8}\n")

    _row("Duration", report.duration_a, report.duration_b, fmt=_fmt_duration)
    _row("Cost", report.cost_a, report.cost_b, fmt=lambda v: f"${v:.4f}")
    _row("Errors", report.errors_a, report.errors_b)
    _row("Tool calls", report.tool_calls_a, report.tool_calls_b)
    _row("LLM requests", report.llm_requests_a, report.llm_requests_b)
    _row("Retries", report.retries_a, report.retries_b)
    w(rule)

    def _file_rows(label: str, both: list, a_only: list, b_only: list) -> None:
        """Write the shared entries on one line, then exclusive entries one per line."""
        if both:
            more = "..." if len(both) > max_listed else ""
            w(f"  {label} (both) {', '.join(both[:max_listed])}{more}\n")
        for tag, items in (("A only", a_only), ("B only", b_only)):
            for item in items[:max_listed]:
                w(f"  {label} ({tag}) {item}\n")
            hidden = len(items) - max_listed
            if hidden > 0:
                # Make the truncation visible instead of dropping entries silently.
                w(f"  {label} ({tag}) ... (+{hidden} more)\n")

    _file_rows("Files read", report.files_read_both, report.files_read_a_only, report.files_read_b_only)
    _file_rows(
        "Files written",
        report.files_written_both,
        report.files_written_a_only,
        report.files_written_b_only,
    )
    _file_rows("Commands", report.cmds_both, report.cmds_a_only, report.cmds_b_only)

    if report.eval_scores_a or report.eval_scores_b:
        w(rule)
        all_scorers = sorted(set(report.eval_scores_a) | set(report.eval_scores_b))
        for scorer in all_scorers:
            # A scorer present for only one session shows "n/a" for the other.
            sa = report.eval_scores_a.get(scorer, "n/a")
            sb = report.eval_scores_b.get(scorer, "n/a")
            w(f"  Eval {scorer:<25} {str(sa):>12} {str(sb):>12}\n")

    w(rule)
    w(f"  Verdict: {report.verdict}\n\n")
427+
428+
231429# ---------------------------------------------------------------------------
232430# CLI handler
233431# ---------------------------------------------------------------------------
@@ -245,6 +443,12 @@ def cmd_diff(args: argparse.Namespace) -> int:
245443 sys .stderr .write (f"Session not found: { args .session_b } \n " )
246444 return 1
247445
446+ if getattr (args , "semantic" , False ):
447+ eval_config = getattr (args , "eval_config" , ".agent-evals.yaml" ) or ".agent-evals.yaml"
448+ report = semantic_diff (store , id_a , id_b , eval_config = eval_config )
449+ format_semantic_diff (report )
450+ return 0
451+
248452 result = diff_sessions (store , id_a , id_b )
249453 format_diff (result )
250454 return 0
0 commit comments