Skip to content

Commit 037d0e8

Browse files
AliceLJY and claude committed
fix: auto-respawn agents after abnormal exit (#59)
When an agent exits abnormally while pending tasks remain, automatically attempt to respawn it (at most 2 attempts), using the circuit-breaker health tracking to count crashes. Respawn is triggered from the on-exit hook and, as a fallback, from the waiter's dead-agent detection loop.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c595f0e commit 037d0e8

4 files changed

Lines changed: 282 additions & 0 deletions

File tree

clawteam/cli/commands.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1760,6 +1760,9 @@ def lifecycle_on_exit(
17601760
if t.owner == agent and t.status == TaskStatus.in_progress
17611761
]
17621762

1763+
# Save spawn info BEFORE unregistering — needed for auto-respawn.
1764+
saved_spawn_info = get_agent_info(team, agent)
1765+
17631766
# Unregister from spawn registry so is_agent_alive returns None for this agent.
17641767
# Guard: only unregister if the agent is already dead (avoids removing a live entry
17651768
# if the hook fires before the process actually exits).
@@ -1820,6 +1823,40 @@ def lifecycle_on_exit(
18201823
),
18211824
)
18221825

1826+
# --- Auto-respawn: attempt to restart the agent if pending tasks remain ---
1827+
pending_tasks = [t for t in store.list_tasks() if t.status == TaskStatus.pending]
1828+
if pending_tasks and saved_spawn_info:
1829+
from clawteam.spawn.respawn import respawn_agent
1830+
1831+
respawn_result = respawn_agent(team, agent, spawn_info=saved_spawn_info)
1832+
if respawn_result.startswith("ok:"):
1833+
_output(
1834+
{"status": "agent_respawned", "agent": agent, "detail": respawn_result},
1835+
lambda d: console.print(
1836+
f" [green]Auto-respawned agent '{agent}'.[/green] {d['detail']}"
1837+
),
1838+
)
1839+
if leader_name:
1840+
mailbox.send(
1841+
from_agent=agent,
1842+
to=leader_name,
1843+
content=f"Agent '{agent}' auto-respawned. {respawn_result}",
1844+
)
1845+
else:
1846+
_output(
1847+
{"status": "respawn_failed", "agent": agent, "detail": respawn_result},
1848+
lambda d: console.print(
1849+
f" [red]Auto-respawn failed for '{agent}':[/red] {d['detail']}"
1850+
),
1851+
)
1852+
if leader_name:
1853+
mailbox.send(
1854+
from_agent=agent,
1855+
to=leader_name,
1856+
content=f"Auto-respawn failed for '{agent}': {respawn_result}. "
1857+
"Manual intervention may be needed.",
1858+
)
1859+
18231860

18241861
@lifecycle_app.command("check-zombies")
18251862
def lifecycle_check_zombies(

clawteam/spawn/respawn.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
"""Auto-respawn logic for agents that exit abnormally."""
2+
3+
from __future__ import annotations
4+
5+
import logging
6+
7+
logger = logging.getLogger(__name__)
8+
9+
MAX_RESPAWN_ATTEMPTS = 2
10+
11+
12+
def respawn_agent(
    team_name: str,
    agent_name: str,
    spawn_info: dict | None = None,
) -> str:
    """Attempt to respawn a dead agent using its recorded spawn info.

    A failure outcome is recorded first; the ``consecutive_failures`` value
    on the returned health record is then compared with
    ``MAX_RESPAWN_ATTEMPTS`` to decide whether another respawn is allowed.

    Every failure is logged as well as returned, because best-effort callers
    may discard the return value and the failure would otherwise be invisible.

    NOTE(review): a failure outcome is recorded on *every* call, so invoking
    this more than once for the same crash presumably counts that crash more
    than once -- confirm against ``record_outcome`` and the callers.

    Args:
        team_name: Team the agent belongs to.
        agent_name: Logical name of the agent to respawn.
        spawn_info: Previously captured spawn registry entry. When the caller
            already read the info before unregistering the dead agent it
            should pass it here; when omitted (or empty) the function reads
            from the live registry instead.

    Returns:
        Status string -- starts with ``"ok: "`` on success and ``"Error: "``
        on failure. Exceptions raised while spawning are caught and reported
        through the return value rather than propagated.
    """
    from clawteam.spawn import get_backend, spawn_with_retry
    from clawteam.spawn.registry import get_agent_info, record_outcome
    from clawteam.team.manager import TeamManager

    def _fail(message: str) -> str:
        # Log before returning so the failure is visible even when the
        # caller ignores the status string.
        logger.warning("%s", message)
        return message

    # Record the crash so consecutive_failures increments.
    health = record_outcome(team_name, agent_name, success=False)
    crash_count = health.consecutive_failures

    if crash_count > MAX_RESPAWN_ATTEMPTS:
        return _fail(
            f"Error: agent '{agent_name}' crashed {crash_count} times "
            f"(max respawn attempts: {MAX_RESPAWN_ATTEMPTS}), not respawning"
        )

    # Prefer the snapshot handed in by the caller; the live registry entry
    # may already have been removed by the time we get here.
    info = spawn_info or get_agent_info(team_name, agent_name)
    if not info:
        return _fail(f"Error: no spawn info found for agent '{agent_name}'")

    member = TeamManager.get_member(team_name, agent_name)
    if not member:
        return _fail(f"Error: agent '{agent_name}' not found in team config")

    backend_name = info.get("backend", "tmux")
    command = info.get("command", [])
    if not command:
        return _fail(f"Error: no command recorded for agent '{agent_name}'")

    try:
        backend = get_backend(backend_name)
        result = spawn_with_retry(
            backend,
            max_retries=1,
            command=command,
            agent_name=agent_name,
            agent_id=member.agent_id,
            agent_type=member.agent_type,
            team_name=team_name,
            model=member.model_name or None,
        )

        if result.startswith("Error"):
            return _fail(f"Error: respawn failed for '{agent_name}': {result}")
    except Exception as exc:
        # Boundary handler: respawn is best-effort and must not crash the
        # caller, but keep the traceback in the log for diagnosis.
        logger.exception("Respawn of agent '%s' raised an exception", agent_name)
        return f"Error: respawn failed for '{agent_name}': {exc}"

    logger.info(
        "Respawned agent '%s' (crash count: %d/%d)",
        agent_name,
        crash_count,
        MAX_RESPAWN_ATTEMPTS,
    )
    return (
        f"ok: respawned agent '{agent_name}' "
        f"(crash {crash_count}/{MAX_RESPAWN_ATTEMPTS})"
    )

clawteam/team/waiter.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,14 @@ def _check_dead_agents(self) -> None:
183183
if abandoned and self.on_agent_dead:
184184
self.on_agent_dead(agent_name, abandoned)
185185

186+
# Auto-respawn if there are pending tasks (fallback for when on-exit hook didn't fire)
187+
if abandoned:
188+
try:
189+
from clawteam.spawn.respawn import respawn_agent
190+
respawn_agent(self.team_name, agent_name)
191+
except Exception:
192+
pass # Best-effort; on-exit hook is the primary respawn path
193+
186194

187195
def _task_summary(task: TaskItem) -> dict:
188196
"""Summarize a task for the wait result."""

tests/test_respawn.py

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
"""Tests for clawteam.spawn.respawn module."""
2+
3+
from __future__ import annotations
4+
5+
from unittest.mock import MagicMock, patch
6+
7+
import pytest
8+
9+
from clawteam.spawn.registry import AgentHealth, HealthState
10+
from clawteam.spawn.respawn import MAX_RESPAWN_ATTEMPTS, respawn_agent
11+
12+
# Patch targets: since respawn_agent uses lazy imports (from X import Y inside
13+
# the function body), we must patch the *source* modules so the fresh local
14+
# bindings pick up the mocks.
15+
_PATCH_RECORD = "clawteam.spawn.registry.record_outcome"
16+
_PATCH_GET_INFO = "clawteam.spawn.registry.get_agent_info"
17+
_PATCH_TM = "clawteam.team.manager.TeamManager"
18+
_PATCH_BACKEND = "clawteam.spawn.get_backend"
19+
_PATCH_SWR = "clawteam.spawn.spawn_with_retry"
20+
21+
22+
@pytest.fixture
def mock_team():
    """Provide the team and agent names shared by the respawn tests."""
    names = dict(team="test-team", agent="worker-1")
    return names
25+
26+
27+
class TestRespawnAgent:
    """Unit tests for ``respawn_agent``.

    The registry, team manager and spawn backend are all patched at their
    source modules (see the ``_PATCH_*`` constants), so no real agent is
    ever started.
    """

    @staticmethod
    def _make_member(agent_id: str = "abc", agent_type: str = "worker") -> MagicMock:
        """Build a stand-in team member exposing the attributes respawn reads."""
        member = MagicMock()
        member.agent_id = agent_id
        member.agent_type = agent_type
        member.model_name = ""
        return member

    def test_respawn_succeeds_on_first_crash(self, mock_team):
        """First crash: respawn is attempted once with the recorded command."""
        health = AgentHealth(
            agent_name=mock_team["agent"],
            consecutive_failures=1,
            state=HealthState.degraded,
        )
        member = self._make_member(agent_id="abc123", agent_type="researcher")
        spawn_info = {
            "backend": "tmux",
            "command": ["claude", "--dangerously-skip-permissions"],
            "tmux_target": "clawteam-test:0",
            "pid": 0,
        }

        with (
            patch(_PATCH_RECORD, return_value=health),
            patch(_PATCH_GET_INFO, return_value=spawn_info),
            patch(_PATCH_TM) as mock_tm,
            patch(_PATCH_BACKEND),
            patch(_PATCH_SWR, return_value="ok") as mock_swr,
        ):
            mock_tm.get_member.return_value = member
            result = respawn_agent(mock_team["team"], mock_team["agent"], spawn_info=spawn_info)

        assert result.startswith("ok:")
        mock_swr.assert_called_once()
        # The respawn must reuse the recorded command and the member identity.
        spawn_kwargs = mock_swr.call_args.kwargs
        assert spawn_kwargs["command"] == spawn_info["command"]
        assert spawn_kwargs["agent_name"] == mock_team["agent"]
        assert spawn_kwargs["agent_id"] == "abc123"

    def test_respawn_blocked_after_max_attempts(self, mock_team):
        """Past the limit: an error is returned and nothing is spawned."""
        health = AgentHealth(
            agent_name=mock_team["agent"],
            consecutive_failures=MAX_RESPAWN_ATTEMPTS + 1,
            state=HealthState.open,
        )

        with (
            patch(_PATCH_RECORD, return_value=health),
            patch(_PATCH_SWR) as mock_swr,
        ):
            result = respawn_agent(mock_team["team"], mock_team["agent"])

        assert result.startswith("Error:")
        assert "not respawning" in result
        # The limit check must short-circuit before any spawn attempt.
        mock_swr.assert_not_called()

    def test_respawn_fails_no_spawn_info(self, mock_team):
        """No caller-supplied info and an empty registry yields an error."""
        health = AgentHealth(
            agent_name=mock_team["agent"],
            consecutive_failures=1,
        )

        with (
            patch(_PATCH_RECORD, return_value=health),
            patch(_PATCH_GET_INFO, return_value=None),
        ):
            result = respawn_agent(mock_team["team"], mock_team["agent"])

        assert result.startswith("Error:")
        assert "no spawn info" in result

    def test_respawn_fails_no_team_member(self, mock_team):
        """An agent missing from the team config cannot be respawned."""
        health = AgentHealth(
            agent_name=mock_team["agent"],
            consecutive_failures=1,
        )
        spawn_info = {"backend": "tmux", "command": ["claude"]}

        with (
            patch(_PATCH_RECORD, return_value=health),
            patch(_PATCH_GET_INFO, return_value=spawn_info),
            patch(_PATCH_TM) as mock_tm,
        ):
            mock_tm.get_member.return_value = None
            result = respawn_agent(mock_team["team"], mock_team["agent"], spawn_info=spawn_info)

        assert result.startswith("Error:")
        assert "not found in team config" in result

    def test_respawn_fails_spawn_error(self, mock_team):
        """A backend error string is surfaced in the returned error."""
        health = AgentHealth(
            agent_name=mock_team["agent"],
            consecutive_failures=1,
        )
        member = self._make_member()
        spawn_info = {"backend": "tmux", "command": ["claude"]}

        with (
            patch(_PATCH_RECORD, return_value=health),
            patch(_PATCH_GET_INFO, return_value=spawn_info),
            patch(_PATCH_TM) as mock_tm,
            patch(_PATCH_BACKEND),
            patch(_PATCH_SWR, return_value="Error: tmux not found"),
        ):
            mock_tm.get_member.return_value = member
            result = respawn_agent(mock_team["team"], mock_team["agent"], spawn_info=spawn_info)

        assert result.startswith("Error:")
        # The underlying backend message must not be swallowed.
        assert "tmux not found" in result

    def test_respawn_second_attempt_still_allowed(self, mock_team):
        """Second crash (consecutive_failures=2) should still allow respawn."""
        health = AgentHealth(
            agent_name=mock_team["agent"],
            consecutive_failures=2,
            state=HealthState.degraded,
        )
        member = self._make_member()
        spawn_info = {"backend": "tmux", "command": ["claude"]}

        with (
            patch(_PATCH_RECORD, return_value=health),
            patch(_PATCH_GET_INFO, return_value=spawn_info),
            patch(_PATCH_TM) as mock_tm,
            patch(_PATCH_BACKEND),
            patch(_PATCH_SWR, return_value="ok") as mock_swr,
        ):
            mock_tm.get_member.return_value = member
            result = respawn_agent(mock_team["team"], mock_team["agent"], spawn_info=spawn_info)

        assert result.startswith("ok:")
        mock_swr.assert_called_once()

0 commit comments

Comments
 (0)