fix: auto-respawn agents after abnormal exit (#59)

AliceLJY · claude · AliceLJY · commit 037d0e8450ab · 2026-04-06T22:28:26.000+08:00
When an agent exits abnormally with pending tasks, automatically attempt
to respawn it (max 2 attempts) using the circuit breaker health tracking.
Respawn is triggered both from the on-exit hook and as a fallback from
the waiter's dead-agent detection loop.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/clawteam/cli/commands.py b/clawteam/cli/commands.py
@@ -1760,6 +1760,9 @@ def lifecycle_on_exit(
         if t.owner == agent and t.status == TaskStatus.in_progress
     ]
 
+    # Save spawn info BEFORE unregistering — needed for auto-respawn.
+    saved_spawn_info = get_agent_info(team, agent)
+
     # Unregister from spawn registry so is_agent_alive returns None for this agent.
     # Guard: only unregister if the agent is already dead (avoids removing a live entry
     # if the hook fires before the process actually exits).
@@ -1820,6 +1823,40 @@ def lifecycle_on_exit(
         ),
     )
 
+    # --- Auto-respawn: attempt to restart the agent if pending tasks remain ---
+    pending_tasks = [t for t in store.list_tasks() if t.status == TaskStatus.pending]
+    if pending_tasks and saved_spawn_info:
+        from clawteam.spawn.respawn import respawn_agent
+
+        respawn_result = respawn_agent(team, agent, spawn_info=saved_spawn_info)
+        if respawn_result.startswith("ok:"):
+            _output(
+                {"status": "agent_respawned", "agent": agent, "detail": respawn_result},
+                lambda d: console.print(
+                    f"  [green]Auto-respawned agent '{agent}'.[/green] {d['detail']}"
+                ),
+            )
+            if leader_name:
+                mailbox.send(
+                    from_agent=agent,
+                    to=leader_name,
+                    content=f"Agent '{agent}' auto-respawned. {respawn_result}",
+                )
+        else:
+            _output(
+                {"status": "respawn_failed", "agent": agent, "detail": respawn_result},
+                lambda d: console.print(
+                    f"  [red]Auto-respawn failed for '{agent}':[/red] {d['detail']}"
+                ),
+            )
+            if leader_name:
+                mailbox.send(
+                    from_agent=agent,
+                    to=leader_name,
+                    content=f"Auto-respawn failed for '{agent}': {respawn_result}. "
+                            "Manual intervention may be needed.",
+                )
+
 
 @lifecycle_app.command("check-zombies")
 def lifecycle_check_zombies(
diff --git a/clawteam/spawn/respawn.py b/clawteam/spawn/respawn.py
@@ -0,0 +1,86 @@
+"""Auto-respawn logic for agents that exit abnormally."""
+
+from __future__ import annotations
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+MAX_RESPAWN_ATTEMPTS = 2
+
+
+def respawn_agent(
+    team_name: str,
+    agent_name: str,
+    spawn_info: dict | None = None,
+) -> str:
+    """Attempt to respawn a dead agent using its recorded spawn info.
+
+    A failed outcome is recorded for the agent on every call, before any
+    other check, so the health tracker's ``consecutive_failures`` counter
+    reflects the crash.  Calling this more than once for the same crash
+    therefore counts that crash more than once.  Once the counter exceeds
+    ``MAX_RESPAWN_ATTEMPTS`` the respawn is refused.
+
+    Args:
+        team_name: Team the agent belongs to.
+        agent_name: Logical name of the agent to respawn.
+        spawn_info: Previously captured spawn registry entry.  When the caller
+            already read the info before unregistering the dead agent it should
+            pass it here; otherwise the function reads from the live registry.
+
+    Returns:
+        Status string — starts with ``"ok: "`` on success, ``"Error: "`` on
+        failure.  Exceptions raised by the backend lookup or the spawn call
+        are logged and reported through the return value, not propagated.
+    """
+    # Imported lazily inside the function; the tests rely on this to patch
+    # the source modules before the names are bound here.
+    from clawteam.spawn import get_backend, spawn_with_retry
+    from clawteam.spawn.registry import get_agent_info, record_outcome
+    from clawteam.team.manager import TeamManager
+
+    # Record the crash so consecutive_failures increments.
+    health = record_outcome(team_name, agent_name, success=False)
+    crash_count = health.consecutive_failures
+
+    if crash_count > MAX_RESPAWN_ATTEMPTS:
+        logger.warning(
+            "Not respawning agent '%s': crash count %d exceeds limit %d",
+            agent_name,
+            crash_count,
+            MAX_RESPAWN_ATTEMPTS,
+        )
+        return (
+            f"Error: agent '{agent_name}' crashed {crash_count} times "
+            f"(max respawn attempts: {MAX_RESPAWN_ATTEMPTS}), not respawning"
+        )
+
+    # Prefer the caller's snapshot: the live registry entry may already have
+    # been removed by the time the respawn is attempted.
+    info = spawn_info or get_agent_info(team_name, agent_name)
+    if not info:
+        return f"Error: no spawn info found for agent '{agent_name}'"
+
+    member = TeamManager.get_member(team_name, agent_name)
+    if not member:
+        return f"Error: agent '{agent_name}' not found in team config"
+
+    backend_name = info.get("backend", "tmux")
+    command = info.get("command", [])
+    if not command:
+        return f"Error: no command recorded for agent '{agent_name}'"
+
+    try:
+        backend = get_backend(backend_name)
+        result = spawn_with_retry(
+            backend,
+            max_retries=1,
+            command=command,
+            agent_name=agent_name,
+            agent_id=member.agent_id,
+            agent_type=member.agent_type,
+            team_name=team_name,
+            model=member.model_name or None,
+        )
+        # Evaluated inside the try so an unexpected (non-string) result is
+        # still reported as an error string instead of raising.
+        spawn_failed = result.startswith("Error")
+    except Exception as exc:
+        # Best-effort boundary: callers expect a status string, but keep the
+        # traceback in the log so the failure is diagnosable.
+        logger.exception("Respawn of agent '%s' raised an exception", agent_name)
+        return f"Error: respawn failed for '{agent_name}': {exc}"
+
+    if spawn_failed:
+        logger.warning("Respawn of agent '%s' failed: %s", agent_name, result)
+        return f"Error: respawn failed for '{agent_name}': {result}"
+
+    logger.info(
+        "Respawned agent '%s' (crash count: %d/%d)",
+        agent_name,
+        crash_count,
+        MAX_RESPAWN_ATTEMPTS,
+    )
+    return (
+        f"ok: respawned agent '{agent_name}' "
+        f"(crash {crash_count}/{MAX_RESPAWN_ATTEMPTS})"
+    )
diff --git a/clawteam/team/waiter.py b/clawteam/team/waiter.py
@@ -183,6 +183,14 @@ def _check_dead_agents(self) -> None:
             if abandoned and self.on_agent_dead:
                 self.on_agent_dead(agent_name, abandoned)
 
+            # Auto-respawn if there are pending tasks (fallback for when on-exit hook didn't fire)
+            if abandoned:
+                try:
+                    from clawteam.spawn.respawn import respawn_agent
+                    respawn_agent(self.team_name, agent_name)
+                except Exception:
+                    pass  # Best-effort; on-exit hook is the primary respawn path
+
 
 def _task_summary(task: TaskItem) -> dict:
     """Summarize a task for the wait result."""
diff --git a/tests/test_respawn.py b/tests/test_respawn.py
@@ -0,0 +1,151 @@
+"""Tests for clawteam.spawn.respawn module."""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from clawteam.spawn.registry import AgentHealth, HealthState
+from clawteam.spawn.respawn import MAX_RESPAWN_ATTEMPTS, respawn_agent
+
+# Patch targets: since respawn_agent uses lazy imports (from X import Y inside
+# the function body), we must patch the *source* modules so the fresh local
+# bindings pick up the mocks.
+_PATCH_RECORD = "clawteam.spawn.registry.record_outcome"
+_PATCH_GET_INFO = "clawteam.spawn.registry.get_agent_info"
+_PATCH_TM = "clawteam.team.manager.TeamManager"
+_PATCH_BACKEND = "clawteam.spawn.get_backend"
+_PATCH_SWR = "clawteam.spawn.spawn_with_retry"
+
+
+@pytest.fixture
+def mock_team():
+    """Provide the team and agent names shared by the respawn tests."""
+    names = {}
+    names["team"] = "test-team"
+    names["agent"] = "worker-1"
+    return names
+
+
+class TestRespawnAgent:
+    """Unit tests for ``respawn_agent`` with registry, team and spawn layers mocked."""
+
+    @staticmethod
+    def _make_member(agent_id="abc", agent_type="worker"):
+        """Build a stand-in team member exposing the attributes respawn reads."""
+        member = MagicMock()
+        member.agent_id = agent_id
+        member.agent_type = agent_type
+        member.model_name = ""
+        return member
+
+    def test_respawn_succeeds_on_first_crash(self, mock_team):
+        """First crash: the agent is respawned with its recorded command."""
+        health = AgentHealth(
+            agent_name=mock_team["agent"],
+            consecutive_failures=1,
+            state=HealthState.degraded,
+        )
+        member = self._make_member(agent_id="abc123", agent_type="researcher")
+        spawn_info = {
+            "backend": "tmux",
+            "command": ["claude", "--dangerously-skip-permissions"],
+            "tmux_target": "clawteam-test:0",
+            "pid": 0,
+        }
+
+        with (
+            patch(_PATCH_RECORD, return_value=health),
+            patch(_PATCH_GET_INFO, return_value=spawn_info),
+            patch(_PATCH_TM) as mock_tm,
+            patch(_PATCH_BACKEND),
+            patch(_PATCH_SWR, return_value="ok") as mock_swr,
+        ):
+            mock_tm.get_member.return_value = member
+            result = respawn_agent(mock_team["team"], mock_team["agent"], spawn_info=spawn_info)
+
+        assert result.startswith("ok:")
+        mock_swr.assert_called_once()
+        # The spawn call must reuse the recorded command and member identity;
+        # an empty model name is passed on as None.
+        spawn_kwargs = mock_swr.call_args.kwargs
+        assert spawn_kwargs["command"] == spawn_info["command"]
+        assert spawn_kwargs["agent_id"] == "abc123"
+        assert spawn_kwargs["agent_type"] == "researcher"
+        assert spawn_kwargs["team_name"] == mock_team["team"]
+        assert spawn_kwargs["model"] is None
+
+    def test_respawn_blocked_after_max_attempts(self, mock_team):
+        """Past the crash limit the respawn is refused and nothing is spawned."""
+        health = AgentHealth(
+            agent_name=mock_team["agent"],
+            consecutive_failures=MAX_RESPAWN_ATTEMPTS + 1,
+            state=HealthState.open,
+        )
+
+        with (
+            patch(_PATCH_RECORD, return_value=health),
+            patch(_PATCH_SWR) as mock_swr,
+        ):
+            result = respawn_agent(mock_team["team"], mock_team["agent"])
+
+        assert result.startswith("Error:")
+        assert "not respawning" in result
+        mock_swr.assert_not_called()
+
+    def test_respawn_fails_no_spawn_info(self, mock_team):
+        """No snapshot passed and nothing in the registry yields an error."""
+        health = AgentHealth(
+            agent_name=mock_team["agent"],
+            consecutive_failures=1,
+        )
+
+        with (
+            patch(_PATCH_RECORD, return_value=health),
+            patch(_PATCH_GET_INFO, return_value=None),
+        ):
+            result = respawn_agent(mock_team["team"], mock_team["agent"])
+
+        assert result.startswith("Error:")
+        assert "no spawn info" in result
+
+    def test_respawn_fails_no_team_member(self, mock_team):
+        """An agent missing from the team config cannot be respawned."""
+        health = AgentHealth(
+            agent_name=mock_team["agent"],
+            consecutive_failures=1,
+        )
+        spawn_info = {"backend": "tmux", "command": ["claude"]}
+
+        with (
+            patch(_PATCH_RECORD, return_value=health),
+            patch(_PATCH_GET_INFO, return_value=spawn_info),
+            patch(_PATCH_TM) as mock_tm,
+        ):
+            mock_tm.get_member.return_value = None
+            result = respawn_agent(mock_team["team"], mock_team["agent"], spawn_info=spawn_info)
+
+        assert result.startswith("Error:")
+        assert "not found in team config" in result
+
+    def test_respawn_fails_no_command(self, mock_team):
+        """Spawn info without a recorded command is rejected before spawning."""
+        health = AgentHealth(
+            agent_name=mock_team["agent"],
+            consecutive_failures=1,
+        )
+        spawn_info = {"backend": "tmux", "command": []}
+
+        with (
+            patch(_PATCH_RECORD, return_value=health),
+            patch(_PATCH_TM) as mock_tm,
+            patch(_PATCH_SWR) as mock_swr,
+        ):
+            mock_tm.get_member.return_value = self._make_member()
+            result = respawn_agent(mock_team["team"], mock_team["agent"], spawn_info=spawn_info)
+
+        assert result.startswith("Error:")
+        assert "no command recorded" in result
+        mock_swr.assert_not_called()
+
+    def test_respawn_fails_spawn_error(self, mock_team):
+        """An error string from the spawn layer is surfaced in the result."""
+        health = AgentHealth(
+            agent_name=mock_team["agent"],
+            consecutive_failures=1,
+        )
+        member = self._make_member()
+        spawn_info = {"backend": "tmux", "command": ["claude"]}
+
+        with (
+            patch(_PATCH_RECORD, return_value=health),
+            patch(_PATCH_GET_INFO, return_value=spawn_info),
+            patch(_PATCH_TM) as mock_tm,
+            patch(_PATCH_BACKEND),
+            patch(_PATCH_SWR, return_value="Error: tmux not found"),
+        ):
+            mock_tm.get_member.return_value = member
+            result = respawn_agent(mock_team["team"], mock_team["agent"], spawn_info=spawn_info)
+
+        assert result.startswith("Error:")
+        assert "tmux not found" in result
+
+    def test_respawn_fails_on_spawn_exception(self, mock_team):
+        """An exception raised while spawning is reported, not propagated."""
+        health = AgentHealth(
+            agent_name=mock_team["agent"],
+            consecutive_failures=1,
+        )
+        spawn_info = {"backend": "tmux", "command": ["claude"]}
+
+        with (
+            patch(_PATCH_RECORD, return_value=health),
+            patch(_PATCH_TM) as mock_tm,
+            patch(_PATCH_BACKEND),
+            patch(_PATCH_SWR, side_effect=RuntimeError("backend exploded")),
+        ):
+            mock_tm.get_member.return_value = self._make_member()
+            result = respawn_agent(mock_team["team"], mock_team["agent"], spawn_info=spawn_info)
+
+        assert result.startswith("Error:")
+        assert "backend exploded" in result
+
+    def test_respawn_second_attempt_still_allowed(self, mock_team):
+        """Second crash (consecutive_failures=2) should still allow respawn."""
+        health = AgentHealth(
+            agent_name=mock_team["agent"],
+            consecutive_failures=2,
+            state=HealthState.degraded,
+        )
+        member = self._make_member()
+        spawn_info = {"backend": "tmux", "command": ["claude"]}
+
+        with (
+            patch(_PATCH_RECORD, return_value=health),
+            patch(_PATCH_GET_INFO, return_value=spawn_info),
+            patch(_PATCH_TM) as mock_tm,
+            patch(_PATCH_BACKEND),
+            patch(_PATCH_SWR, return_value="ok") as mock_swr,
+        ):
+            mock_tm.get_member.return_value = member
+            result = respawn_agent(mock_team["team"], mock_team["agent"], spawn_info=spawn_info)
+
+        assert result.startswith("ok:")
+        mock_swr.assert_called_once()