77 Evaluates declarative rules on every event. Supports pause (SIGSTOP/SIGCONT)
88 and kill (SIGTERM) actions with optional notifications. Auto-generates a
99 postmortem when a kill action fires.
10+
11+ Watchdog mode (--timeout / --budget):
12+ Enforces a wall-clock timeout and/or token-cost ceiling. When either limit
13+ is breached the agent process is terminated and a structured post-mortem
14+ JSON file is written to the session directory. An optional --on-death
15+ command is invoked with the post-mortem path substituted for
16+ {post_mortem_path}.
1017"""
1118
1219from __future__ import annotations
1724import os
1825import re
1926import signal
27+ import subprocess
2028import sys
2129import time
2230import urllib .request
3038from .store import TraceStore
3139
3240
41+ # ---------------------------------------------------------------------------
42+ # Duration parsing
43+ # ---------------------------------------------------------------------------
44+
# One "<number><unit>" token. The unit is optional and defaults to seconds.
_DURATION_TOKEN = re.compile(r"\s*(\d+(?:\.\d+)?)\s*([smhd]?)\s*")

# Seconds represented by each unit suffix ("" = unit omitted inside a compound).
_DURATION_UNIT_SECONDS = {"": 1.0, "s": 1.0, "m": 60.0, "h": 3600.0, "d": 86400.0}


def _parse_duration(value: str) -> float:
    """Parse a human-readable duration string to seconds.

    Accepts: 30s, 5m, 2h, 1d, 1h30m, 1.5h, 90 (bare number = seconds).
    Units are case-insensitive and whitespace between tokens is allowed.

    Args:
        value: The duration text to parse.

    Returns:
        The duration in seconds.

    Raises:
        ValueError: If ``value`` is empty, contains anything other than
            ``<number><unit>`` tokens, is negative or non-finite, or is a
            unit-form duration that sums to zero.
    """
    import math  # function-scope so the block does not depend on a file-level import

    value = value.strip()
    if not value:
        raise ValueError("empty duration")

    # Bare number -> seconds. float() also accepts "nan", "inf" and negative
    # values, none of which is a meaningful timeout, so reject those.
    try:
        seconds = float(value)
    except ValueError:
        pass
    else:
        if not math.isfinite(seconds) or seconds < 0:
            raise ValueError(f"cannot parse duration: {value!r}")
        return seconds

    # Consume the string token by token. Every character must belong to a
    # token, so stray text such as "5ms" or "10 minutes" is rejected instead
    # of being silently skipped.
    text = value.lower()
    total = 0.0
    pos = 0
    while pos < len(text):
        token = _DURATION_TOKEN.match(text, pos)
        if token is None:
            raise ValueError(f"cannot parse duration: {value!r}")
        total += float(token.group(1)) * _DURATION_UNIT_SECONDS[token.group(2)]
        pos = token.end()

    # A unit-form duration of zero is rejected, as it was before this change.
    if total == 0.0 or not math.isfinite(total):
        raise ValueError(f"cannot parse duration: {value!r}")
    return total
75+
76+
3377# ---------------------------------------------------------------------------
3478# Alert actions
3579# ---------------------------------------------------------------------------
@@ -285,6 +329,8 @@ class WatcherConfig:
285329 operation_rules : list [OperationRule ] = field (default_factory = list )
286330 # Token budget threshold (1–100, percentage of context window)
287331 max_context_pct : int = 90
332+ # Watchdog: command to run after kill, with {post_mortem_path} substituted
333+ on_death_cmd : str = ""
288334
289335 @classmethod
290336 def from_dict (cls , d : dict ) -> "WatcherConfig" :
@@ -414,6 +460,8 @@ def _dispatch_alert(
414460 action : str | None = None ,
415461 notify : str = "" ,
416462 dry_run : bool = False ,
463+ store : TraceStore | None = None ,
464+ session_id : str = "" ,
417465) -> None :
418466 action = action or config .on_violation
419467 # terminal is always shown regardless of action so the operator watching
@@ -437,9 +485,80 @@ def _dispatch_alert(
437485 state .paused = True
438486 elif action == "kill" and state .agent_pid :
439487 _alert_terminal (f"Killing agent process { state .agent_pid } " )
488+ # Write watchdog post-mortem JSON before killing
489+ pm_path : Path | None = None
490+ if store and session_id :
491+ pm_path = _write_watchdog_postmortem (store , session_id , state , reason = message )
492+ if pm_path :
493+ _alert_terminal (f"Post-mortem written to { pm_path } " )
494+ if config .on_death_cmd :
495+ _invoke_on_death (config .on_death_cmd , pm_path )
440496 _kill_process (state .agent_pid )
441497
442498
def _write_watchdog_postmortem(
    store: TraceStore,
    session_id: str,
    state: WatchState,
    reason: str,
) -> Path | None:
    """Write a structured JSON post-mortem to the session directory.

    The post-mortem records the session id, termination time, reason, elapsed
    time, cost so far, and the data of the most recent tool-call and
    LLM-response events. It is written to ``watchdog-postmortem.json`` inside
    the directory returned by ``store._session_dir(session_id)``.

    This is best-effort: any failure is reported on stderr and turned into a
    ``None`` return, so a post-mortem problem never raises into the caller.

    Args:
        store: Trace store used to load the session and locate its directory.
        session_id: Identifier of the session being terminated.
        state: Watch state; ``start_time`` and ``estimated_cost`` are read.
        reason: Human-readable explanation of why the session is terminated.

    Returns:
        The path written, or None on failure.
    """
    try:
        events = store.load_events(session_id)
        # The metadata is not used below. The call is kept so that a session
        # whose metadata cannot be loaded still yields no post-mortem.
        store.load_meta(session_id)
    except Exception as exc:
        sys.stderr.write(f"[watch] post-mortem skipped, cannot load session: {exc}\n")
        return None

    # Walk backwards to find the most recent event of each kind.
    last_tool_call = None
    last_llm_response = None
    for ev in reversed(events):
        if last_tool_call is None and ev.event_type == EventType.TOOL_CALL:
            last_tool_call = ev.data
        if last_llm_response is None and ev.event_type == EventType.LLM_RESPONSE:
            last_llm_response = ev.data
        # Compare against None, matching the guards above, so that empty
        # event data still counts as found and ends the scan.
        if last_tool_call is not None and last_llm_response is not None:
            break

    try:
        # One clock sample keeps terminated_at and elapsed_seconds consistent.
        now = time.time()
        elapsed = now - state.start_time
        pm = {
            "session_id": session_id,
            "terminated_at": now,
            "reason": reason,
            "elapsed_seconds": round(elapsed, 2),
            "cost_at_death": round(state.estimated_cost, 6),
            "last_tool_call": last_tool_call,
            "last_llm_response": last_llm_response,
            "recovery_context": (
                f"Session {session_id} was terminated after {elapsed:.0f}s "
                f"(${state.estimated_cost:.4f} spent). "
                f"Reason: {reason}. "
                "Resume from the last tool call above."
            ),
        }
        pm_path = store._session_dir(session_id) / "watchdog-postmortem.json"
        # default=str is deliberate: a single non-JSON value inside event data
        # should degrade to its string form, not discard the whole post-mortem.
        pm_path.write_text(json.dumps(pm, indent=2, default=str), encoding="utf-8")
    except Exception as exc:
        sys.stderr.write(f"[watch] failed to write post-mortem: {exc}\n")
        return None
    return pm_path
548+
549+
def _invoke_on_death(on_death_cmd: str, pm_path: Path | None) -> None:
    """Run the --on-death command with {post_mortem_path} substituted.

    The command is started through the shell and is not waited for. A failure
    to start it is reported on stderr and never raised.

    Args:
        on_death_cmd: Shell command template. An empty string makes this a
            no-op. Write the ``{post_mortem_path}`` placeholder without
            surrounding quotes, because the path is shell-quoted here.
        pm_path: Path of the post-mortem file, or None if none was written,
            in which case the placeholder expands to an empty string.
    """
    import shlex  # function-scope so the block does not depend on a file-level import

    if not on_death_cmd:
        return

    # Quote the path so that spaces or shell metacharacters in it cannot split
    # or alter the command. shlex.quote leaves ordinary paths unchanged.
    path_str = shlex.quote(str(pm_path)) if pm_path is not None else ""
    cmd = on_death_cmd.replace("{post_mortem_path}", path_str)
    try:
        subprocess.Popen(cmd, shell=True)
    except Exception as exc:
        sys.stderr.write(f"[watch] on-death command failed: {exc}\n")
560+
561+
443562def _dispatch_nanny_rule (
444563 rule : NannyRule ,
445564 event : TraceEvent ,
@@ -455,14 +574,12 @@ def _dispatch_nanny_rule(
455574
456575 # Auto-generate postmortem on kill
457576 if rule .action == "kill" and not dry_run :
458- try :
459- from .postmortem import generate_postmortem , format_postmortem
460- pm = generate_postmortem (store , session_id )
461- pm_path = Path (config .alert_log ).parent / f"postmortem-{ session_id [:12 ]} .md"
462- pm_path .write_text (format_postmortem (pm ))
463- _alert_terminal (f"Postmortem written to { pm_path } " )
464- except Exception :
465- pass
577+ pm_path = _write_watchdog_postmortem (store , session_id , state , reason = msg )
578+ if pm_path :
579+ _alert_terminal (f"Post-mortem written to { pm_path } " )
580+ on_death = getattr (config , "on_death_cmd" , "" )
581+ if on_death :
582+ _invoke_on_death (on_death , pm_path )
466583
467584
468585# ---------------------------------------------------------------------------
@@ -724,7 +841,10 @@ def watch_session(
724841
725842 violations = check_event (event , config , state )
726843 for msg in violations :
727- _dispatch_alert (msg , config , state , dry_run = dry_run )
844+ _dispatch_alert (
845+ msg , config , state , dry_run = dry_run ,
846+ store = store , session_id = session_id ,
847+ )
728848
729849 # --- Nanny rule evaluation ---
730850 if nanny_rules :
@@ -759,12 +879,33 @@ def cmd_watch(args: argparse.Namespace) -> int:
759879 if config_path :
760880 config = WatcherConfig .load (config_path )
761881 else :
882+ # --timeout is a friendlier alias for --max-duration
883+ max_duration = getattr (args , "max_duration" , 1800 )
884+ timeout_str = getattr (args , "timeout" , None )
885+ if timeout_str :
886+ try :
887+ max_duration = _parse_duration (timeout_str )
888+ except ValueError as exc :
889+ sys .stderr .write (f"[watch] invalid --timeout value: { exc } \n " )
890+ return 1
891+
892+ # --budget is a friendlier alias for --max-cost
893+ max_cost = getattr (args , "max_cost" , 10.0 )
894+ budget_str = getattr (args , "budget" , None )
895+ if budget_str is not None :
896+ try :
897+ max_cost = float (budget_str )
898+ except ValueError :
899+ sys .stderr .write (f"[watch] invalid --budget value: { budget_str !r} \n " )
900+ return 1
901+
762902 config = WatcherConfig (
763903 max_retries = getattr (args , "max_retries" , 5 ),
764- max_cost_dollars = getattr ( args , " max_cost" , 10.0 ) ,
765- max_duration_seconds = getattr ( args , " max_duration" , 1800 ) ,
904+ max_cost_dollars = max_cost ,
905+ max_duration_seconds = max_duration ,
766906 on_violation = getattr (args , "on_violation" , "terminal" ),
767907 webhook_url = getattr (args , "webhook" , "" ) or "" ,
908+ on_death_cmd = getattr (args , "on_death" , "" ) or "" ,
768909 )
769910
770911 # Load nanny rules if --rules provided
0 commit comments