|
5 | 5 | import json |
6 | 6 | import subprocess |
7 | 7 | import time |
| 8 | +from enum import Enum |
8 | 9 | from pathlib import Path |
9 | 10 |
|
| 11 | +from pydantic import BaseModel, Field |
| 12 | + |
10 | 13 | from clawteam.fileutil import atomic_write_text, file_locked |
11 | 14 | from clawteam.paths import ensure_within_root, validate_identifier |
12 | 15 | from clawteam.team.models import get_data_dir |
13 | 16 |
|
| 17 | +# --------------------------------------------------------------------------- |
| 18 | +# Circuit Breaker — agent health tracking |
| 19 | +# --------------------------------------------------------------------------- |
| 20 | + |
class HealthState(str, Enum):
    """Circuit-breaker state of an agent.

    Members are also plain strings (``str`` mixin), so they compare equal to
    their values and serialize directly to JSON.
    """

    # No recorded failure streak.
    healthy = "healthy"
    # At least one recent consecutive failure, but still below the threshold.
    degraded = "degraded"
    # Failure threshold reached; the breaker is open.
    open = "open"
| 25 | + |
| 26 | + |
class AgentHealth(BaseModel):
    """Circuit-breaker health record for one spawned agent.

    Every field except ``state`` declares a camelCase alias; instances may be
    populated using either the Python field name or that alias.
    """

    # Allow construction by field name in addition to the camelCase alias.
    model_config = {"populate_by_name": True}

    # Name of the agent this record belongs to (required).
    agent_name: str = Field(alias="agentName")
    # Current breaker state; new records start out healthy.
    state: HealthState = HealthState.healthy
    # Score that starts at 1.0.
    quality_score: float = Field(default=1.0, alias="qualityScore")
    # Length of the current failure streak.
    consecutive_failures: int = Field(default=0, alias="consecutiveFailures")
    # Lifetime outcome counters.
    total_successes: int = Field(default=0, alias="totalSuccesses")
    total_failures: int = Field(default=0, alias="totalFailures")
    # ``time.time()`` timestamp of the latest failure; 0.0 means "never".
    last_failure_at: float = Field(default=0.0, alias="lastFailureAt")
    # How long an open breaker stays closed to new tasks after a failure.
    cooldown_seconds: float = Field(default=60.0, alias="cooldownSeconds")

    @property
    def is_accepting_tasks(self) -> bool:
        """Return True if the agent may be handed a new task.

        Any state other than ``open`` accepts tasks. An open breaker accepts
        tasks again (half-open) only once a failure timestamp is recorded and
        at least ``cooldown_seconds`` have elapsed since it.
        """
        if self.state != HealthState.open:
            return True
        if not self.last_failure_at:
            # Open with no recorded failure time: no cooldown can elapse.
            return False
        elapsed = time.time() - self.last_failure_at
        return elapsed >= self.cooldown_seconds
| 50 | + |
| 51 | + |
# Default number of consecutive failures at which an agent's breaker opens.
DEFAULT_FAILURE_THRESHOLD: int = 3
# Default cooldown, in seconds, recorded on an agent's health entry.
DEFAULT_COOLDOWN_SECONDS: float = 60.0
| 54 | + |
| 55 | + |
def _health_path(team_name: str) -> Path:
    """Return the location of the ``agent_health.json`` file for *team_name*.

    The team name is passed through ``validate_identifier`` and the result is
    resolved by ``ensure_within_root`` against the ``teams`` directory under
    the data dir (presumably rejecting names that would escape that root —
    confirm against ``clawteam.paths``).
    """
    teams_root = get_data_dir() / "teams"
    checked_name = validate_identifier(team_name, "team name")
    return ensure_within_root(teams_root, checked_name, "agent_health.json")
| 62 | + |
| 63 | + |
def _load_health(team_name: str) -> dict[str, dict]:
    """Load the raw per-agent health records stored for *team_name*.

    Returns:
        The decoded top-level JSON object of the team's health file. An empty
        dict is returned when the file is missing or unreadable, cannot be
        decoded as UTF-8 or parsed as JSON, or when its top-level value is
        not a JSON object, so callers can always treat the result as a
        mapping.
    """
    path = _health_path(team_name)
    try:
        # Read directly instead of checking exists() first: a missing file
        # surfaces as FileNotFoundError (an OSError) without a check/read race.
        parsed = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return {}
    # Valid JSON that is not an object (e.g. [] or null) is treated as corrupt.
    return parsed if isinstance(parsed, dict) else {}
| 72 | + |
| 73 | + |
def _save_health(team_name: str, data: dict[str, dict]) -> None:
    """Serialize *data* as 2-space-indented JSON and write it to the team's health file."""
    target = _health_path(team_name)
    payload = json.dumps(data, indent=2)
    atomic_write_text(target, payload)
| 76 | + |
| 77 | + |
def get_agent_health(team_name: str, agent_name: str) -> AgentHealth:
    """Return the health record for *agent_name* within *team_name*.

    When the agent has no stored record, a fresh default ``AgentHealth`` is
    returned; nothing is written to disk by this function.
    """
    records = _load_health(team_name)
    if agent_name not in records:
        return AgentHealth(agent_name=agent_name)
    return AgentHealth.model_validate(records[agent_name])
| 84 | + |
| 85 | + |
def get_all_health(team_name: str) -> dict[str, AgentHealth]:
    """Return a mapping of agent name to validated health record for every tracked agent."""
    by_agent: dict[str, AgentHealth] = {}
    for agent, record in _load_health(team_name).items():
        by_agent[agent] = AgentHealth.model_validate(record)
    return by_agent
| 93 | + |
| 94 | + |
def record_outcome(
    team_name: str,
    agent_name: str,
    success: bool,
    failure_threshold: int = DEFAULT_FAILURE_THRESHOLD,
    cooldown_seconds: float = DEFAULT_COOLDOWN_SECONDS,
) -> AgentHealth:
    """Record one task outcome for an agent and persist its updated health.

    The read-modify-write of the team's health file happens while holding
    ``file_locked`` on that file's path. The given ``cooldown_seconds`` is
    stored on the record on every call.

    On success:
        - the failure streak is reset to 0 and ``total_successes`` grows by 1
        - ``quality_score`` rises by 0.1, capped at 1.0
        - the state becomes ``healthy``, whatever it was before

    On failure:
        - the failure streak and ``total_failures`` each grow by 1
        - ``last_failure_at`` is set to the current ``time.time()``
        - ``quality_score`` drops by 0.2, floored at 0.0
        - the state becomes ``open`` once the streak reaches
          ``failure_threshold``, otherwise ``degraded`` for a streak >= 1

    Returns:
        The updated ``AgentHealth`` record.
    """
    with file_locked(_health_path(team_name)):
        records = _load_health(team_name)
        # Untracked agents start from a default record keyed by alias.
        stored = records.get(agent_name, {"agentName": agent_name})
        entry = AgentHealth.model_validate(stored)
        entry.cooldown_seconds = cooldown_seconds

        if not success:
            entry.total_failures += 1
            entry.consecutive_failures += 1
            entry.last_failure_at = time.time()
            entry.quality_score = max(0.0, entry.quality_score - 0.2)
            streak = entry.consecutive_failures
            if streak >= failure_threshold:
                entry.state = HealthState.open
            elif streak >= 1:
                entry.state = HealthState.degraded
        else:
            entry.state = HealthState.healthy
            entry.quality_score = min(1.0, entry.quality_score + 0.1)
            entry.total_successes += 1
            entry.consecutive_failures = 0

        # Round-trip through JSON so the stored value is plain JSON data
        # using the camelCase aliases.
        serialized = entry.model_dump_json(by_alias=True)
        records[agent_name] = json.loads(serialized)
        _save_health(team_name, records)
        return entry
| 135 | + |
14 | 136 |
|
15 | 137 | def _registry_path(team_name: str) -> Path: |
16 | 138 | return ensure_within_root( |
|
0 commit comments