fix: agent lifecycle reliability — stale claim, stderr, heartbeat, worktree

chernistry · claude · chernistry · commit a56302ec4401 · 2026-04-08T17:50:10.000+03:00
Four root causes for "agent died without output" failures:

1. Stale claim used task.created_at instead of actual claim time — tasks
   created hours ago were immediately marked stale when freshly claimed.
   Added claimed_at field to Task model, set on claim, used for timeout.
   Default timeout increased from 10m to 15m.

2. Agent stderr redirected to /dev/null — auth failures, MCP config
   errors, and CLI crashes were invisible. Now captured in .stderr.log.

3. Heartbeat file touched after spawn instead of before — race window
   where running agent had no heartbeat and looked dead to watchdog.
   Now touched before adapter.spawn() call.

4. Worktree creation failure didn't block spawn — agent launched into
   nonexistent directory and crashed silently. Now raises SpawnError
   so retry logic handles it properly.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/src/bernstein/adapters/claude.py b/src/bernstein/adapters/claude.py
@@ -412,6 +412,8 @@ def _launch_process(
                 parent environment is inherited (legacy behaviour).
         """
         log_file = log_path.open("w")
+        stderr_path = log_path.with_suffix(".stderr.log")
+        stderr_file = stderr_path.open("w")
         preexec_fn = self._get_preexec_fn()
         try:
             try:
@@ -420,7 +422,7 @@ def _launch_process(
                     cwd=workdir,
                     env=env,
                     stdout=subprocess.PIPE,
-                    stderr=subprocess.DEVNULL,
+                    stderr=stderr_file,
                     start_new_session=True,
                     preexec_fn=preexec_fn,
                 )
@@ -434,7 +436,7 @@ def _launch_process(
                     [sys.executable, "-c", wrapper],
                     stdin=claude_proc.stdout,
                     stdout=log_file,
-                    stderr=subprocess.DEVNULL,
+                    stderr=stderr_file,
                     start_new_session=True,
                     cwd=workdir,
                     env=env,
@@ -444,6 +446,7 @@ def _launch_process(
                 raise
         finally:
             log_file.close()
+            stderr_file.close()
 
         # Allow claude_proc to receive SIGPIPE if wrapper dies
         if claude_proc.stdout:
diff --git a/src/bernstein/core/models.py b/src/bernstein/core/models.py
@@ -261,6 +261,7 @@ class Task:
     max_output_tokens: int | None = None  # Escalated limit for model output
     meta_messages: list[str] = field(default_factory=list[str])  # Operational nudges/hints (T423)
     created_at: float = field(default_factory=time.time)
+    claimed_at: float | None = None  # Epoch timestamp when task was claimed by an agent
     completed_at: float | None = None  # Epoch timestamp when task completed/failed
     closed_at: float | None = None  # Epoch timestamp when task was verified and closed
     deadline: float | None = None  # Epoch timestamp when task must be complete
@@ -349,6 +350,7 @@ def from_dict(cls, raw: dict[str, Any]) -> Task:
             max_output_tokens=raw.get("max_output_tokens"),
             meta_messages=list(raw.get("meta_messages", [])),
             created_at=raw.get("created_at", time.time()),
+            claimed_at=raw.get("claimed_at"),
             completed_at=raw.get("completed_at"),
             closed_at=raw.get("closed_at"),
             deadline=raw.get("deadline"),
@@ -1055,6 +1057,7 @@ class OrchestratorConfig:
     permission_mode: str | None = None  # "bypass" | "plan" | "auto" | "default" — see permission_mode.py
     agent_resource_limits: Any | None = None  # ResourceLimits | None — OS-level limits for non-sandboxed spawns
     shutdown_stagger_delay_s: float = 5.0  # Seconds between SHUTDOWN signals during drain
+    stale_claim_timeout_s: float = 900.0  # Seconds before a claimed task with no live agent is released
 
     def __post_init__(self) -> None:
         """Parse nested workflow config if dict provided."""
diff --git a/src/bernstein/core/orchestrator.py b/src/bernstein/core/orchestrator.py
@@ -246,7 +246,7 @@ class Orchestrator:
     _MAX_PROCESSED_DONE: int = 500  # cap _processed_done_tasks set size
     _MANAGER_REVIEW_COMPLETION_THRESHOLD: int = 7  # trigger review after this many completions
     _MANAGER_REVIEW_STALL_S: float = 900.0  # trigger review after 15 min of no progress
-    _STALE_CLAIM_TIMEOUT_S: float = 600.0  # 10 minutes — release claimed tasks older than this
+    _STALE_CLAIM_TIMEOUT_S: float = 900.0  # default fallback; prefer config.stale_claim_timeout_s
 
     def __init__(
         self,
@@ -2013,9 +2013,8 @@ def _release_stale_claims(self, claimed_tasks: list[Task]) -> int:
 
         When an agent dies silently (no crash signal, no heartbeat timeout),
         its claimed tasks stay in "claimed" forever.  This method detects
-        tasks with no matching live agent that have exceeded
-        ``_STALE_CLAIM_TIMEOUT_S`` and marks them failed so they can be
-        retried.
+        tasks with no matching live agent that have exceeded the stale claim
+        timeout and marks them failed so they can be retried.
 
         Args:
             claimed_tasks: Tasks with status "claimed" from the current tick.
@@ -2024,6 +2023,7 @@ def _release_stale_claims(self, claimed_tasks: list[Task]) -> int:
             Number of tasks released.
         """
         now = time.time()
+        timeout = self._config.stale_claim_timeout_s
         released = 0
         for task in claimed_tasks:
             # Skip tasks that have a known live agent in this session
@@ -2033,9 +2033,12 @@ def _release_stale_claims(self, claimed_tasks: list[Task]) -> int:
                 if agent is not None and agent.status != "dead":
                     continue
 
-            # Use created_at as lower-bound proxy (task model has no claimed_at)
-            age_s = now - task.created_at
-            if age_s < self._STALE_CLAIM_TIMEOUT_S:
+            # Use claimed_at (when available) to measure actual time in claimed
+            # state.  Fall back to created_at for legacy tasks that pre-date the
+            # claimed_at field — this is conservative (over-counts) but safe.
+            claim_epoch = task.claimed_at if task.claimed_at is not None else task.created_at
+            age_s = now - claim_epoch
+            if age_s < timeout:
                 continue
 
             try:
diff --git a/src/bernstein/core/server.py b/src/bernstein/core/server.py
@@ -366,6 +366,7 @@ class TaskResponse(BaseModel):
     slack_context: dict[str, Any] | None = None
     metadata: dict[str, Any] = Field(default_factory=dict)
     created_at: float
+    claimed_at: float | None = None
     deadline: float | None = None
     progress_log: list[ProgressEntry] = Field(default_factory=list)
     version: int = 1
@@ -849,6 +850,7 @@ def task_to_response(task: Task) -> TaskResponse:
         slack_context=task.slack_context,
         metadata=task.metadata,
         created_at=task.created_at,
+        claimed_at=task.claimed_at,
         progress_log=list(cast("list[ProgressEntry]", task.progress_log)),  # type: ignore[reportUnknownMemberType]
         version=task.version,
         parent_session_id=task.parent_session_id,
diff --git a/src/bernstein/core/spawner.py b/src/bernstein/core/spawner.py
@@ -1269,13 +1269,10 @@ def _spawn_for_tasks_internal(self, tasks: list[Task], model_override: str | Non
                 self._worktree_paths[session_id] = spawn_cwd
                 self._worktree_roots[session_id] = worktree_repo_root
             except WorktreeError as exc:
-                logger.warning(
-                    "Cannot create workspace for agent %s. "
-                    "Reason: %s. "
-                    "Fix: run 'bernstein stop' then restart, or delete .sdd/worktrees/ manually",
-                    session_id,
-                    exc,
-                )
+                raise SpawnError(
+                    f"Cannot create workspace for agent {session_id}: {exc}. "
+                    "Fix: run 'bernstein stop' then restart, or delete .sdd/worktrees/ manually"
+                ) from exc
 
         # Build per-task MCP config: auto-detected servers merged with base config
         effective_mcp = self._mcp_config
@@ -1375,6 +1372,16 @@ def _spawn_for_tasks_internal(self, tasks: list[Task], model_override: str | Non
         _unattended_attempt = 0
         result: SpawnResult | None = None
 
+        # Touch heartbeat file BEFORE spawn so the watchdog sees the agent as
+        # alive from the moment it starts — avoids a race window where the
+        # process is running but no heartbeat file exists yet.
+        try:
+            hb_dir = self._workdir / ".sdd" / "runtime" / "heartbeats"
+            hb_dir.mkdir(parents=True, exist_ok=True)
+            (hb_dir / session_id).touch()
+        except OSError:
+            pass
+
         while True:
             # Remote spawn already succeeded — skip the local adapter loop entirely
             if remote_spawned:
@@ -1596,17 +1603,6 @@ def _spawn_for_tasks_internal(self, tasks: list[Task], model_override: str | Non
             if result.log_path:
                 session.log_path = str(result.log_path)
 
-            # Touch heartbeat file immediately so the agent starts with a
-            # fresh timestamp — prevents idle recycling from killing agents
-            # that take a long time before emitting their first stream-json
-            # event (e.g. Claude Code thinking for 2+ minutes).
-            try:
-                hb_dir = self._workdir / ".sdd" / "runtime" / "heartbeats"
-                hb_dir.mkdir(parents=True, exist_ok=True)
-                (hb_dir / session_id).touch()
-            except OSError:
-                pass
-
         if session.status != "working":
             transition_agent(
                 session,
diff --git a/src/bernstein/core/task_store.py b/src/bernstein/core/task_store.py
@@ -74,6 +74,7 @@ class TaskRecord(TypedDict):
     risk_level: str
     slack_context: dict[str, Any] | None
     version: int
+    claimed_at: float | None
     completed_at: float | None
     closed_at: float | None
     claimed_by_session: str | None
@@ -434,6 +435,7 @@ def recover_stale_claimed_tasks(self) -> int:
             for task in list(self._by_status.get(stale_status, {}).values()):
                 self._index_remove(task)
                 task.status = TaskStatus.OPEN
+                task.claimed_at = None
                 task.claimed_by_session = None
                 self._index_add(task)
                 reset_count += 1
@@ -604,6 +606,7 @@ def _task_to_record(self, task: Task) -> TaskRecord:
             "risk_level": task.risk_level,
             "slack_context": task.slack_context,
             "version": task.version,
+            "claimed_at": task.claimed_at,
             "completed_at": task.completed_at,
             "closed_at": task.closed_at,
             "claimed_by_session": task.claimed_by_session,
@@ -984,6 +987,7 @@ async def claim_next(
                 return None
             self._index_remove(task)
             transition_task(task, TaskStatus.CLAIMED, actor="task_store", reason="claim_next")
+            task.claimed_at = time.time()
             task.claimed_by_session = claimed_by_session
             task.version += 1
             self._index_add(task)
@@ -1042,6 +1046,7 @@ async def claim_by_id(
                     raise ValueError(overlap_msg)
                 self._index_remove(task)
                 transition_task(task, TaskStatus.CLAIMED, actor="task_store", reason="claim_by_id")
+                task.claimed_at = time.time()
                 task.claimed_by_session = claimed_by_session
                 task.version += 1
                 self._index_add(task)
@@ -1087,6 +1092,7 @@ async def claim_batch(
                     continue
                 self._index_remove(task)
                 transition_task(task, TaskStatus.CLAIMED, actor="task_store", reason=f"claim_batch by {agent_id}")
+                task.claimed_at = time.time()
                 task.assigned_agent = agent_id
                 task.claimed_by_session = claimed_by_session
                 task.version += 1
@@ -1595,6 +1601,7 @@ async def force_claim(self, task_id: str) -> Task:
                 transition_task(task, TaskStatus.OPEN, actor="task_store", reason="force_claim")
                 self._index_add(task)
             task.priority = 0
+            task.claimed_at = None  # Clear claim timestamp on force-claim
             task.claimed_by_session = None  # Clear ownership on force-claim
             task.version += 1
             await self._append_jsonl(self._task_to_record(task))
diff --git a/tests/unit/test_orchestrator.py b/tests/unit/test_orchestrator.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import json
+import subprocess
 import time
 from pathlib import Path
 from types import SimpleNamespace
@@ -533,6 +534,10 @@ def test_starving_role_gets_slot_when_capacity_is_tight(self, tmp_path: Path) ->
         """When max_agents capacity is near-full, a starving role gets the last slot
         instead of a role that already has an agent and is still under its per-role cap.
         """
+        # Worktree creation requires a git repo with at least one commit.
+        subprocess.run(["git", "init", str(tmp_path)], capture_output=True, check=True)
+        subprocess.run(["git", "-C", str(tmp_path), "commit", "--allow-empty", "-m", "init"], capture_output=True, check=True)
+
         backend_task = _make_task(id="T-be", role="backend", priority=2)
         qa_task = _make_task(id="T-qa", role="qa", priority=2)
         all_tasks = [_task_as_dict(backend_task), _task_as_dict(qa_task)]