Skip to content

Commit 4b5a00d

Browse files
author
BESS Solutions
committed
feat(v0.9.0): Dashboard REST API + Alert Manager + CI Helm job
Dashboard API (src/interfaces/dashboard_api.py): - DashboardState: shared state updated by orchestrator each cycle - 6 endpoints: /api/v1/status, /fleet, /carbon, /p2p, /version, /health - Bearer token auth (DASHBOARD_API_KEY env var, dev-mode no-auth) - aiohttp backend with optional graceful import (no hard dep) Alert Manager (src/interfaces/alert_manager.py): - AlertLevel enum: INFO / WARNING / CRITICAL - fire() with deduplication window (default 60s) - resolve() / resolve_all() with history retention - Prometheus integration: SAFETY_BLOCKS + IDS_ALERTS counters - summary() dict for Dashboard API /api/v1/alerts CI/CD (.github/workflows/ci.yml): - Added helm-lint job: helm lint + template dry-run - 7 total jobs: lint, typecheck, test, terraform-validate, helm-lint, docker-build, docker-push Tests: 183/183 in 8.53s (+24 tests vs v0.8.0): - test_dashboard_api.py (10): state serialization, IDS alarm, fleet, carbon, p2p - test_alert_manager.py (12): fire, dedup, resolve, critical count, summary
1 parent 2a86d0b commit 4b5a00d

5 files changed

Lines changed: 688 additions & 1 deletion

File tree

.github/workflows/ci.yml

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,31 @@ jobs:
115115
run: terraform -chdir=infrastructure/terraform fmt -check -recursive
116116

117117
# ─────────────────────────────────────────────────────────────────
118-
# Job 5: Docker build (valida que la imagen construye)
118+
# Job 5: Helm lint & template validation
119+
# ─────────────────────────────────────────────────────────────────
120+
helm-lint:
  name: Helm lint & template
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4

    - name: Set up Helm
      uses: azure/setup-helm@v4
      with:
        version: "3.14.0"

    - name: Helm lint
      run: helm lint infrastructure/helm/bessai-edge/

    # Render the chart to a file first, then show a preview. Piping
    # `helm template` straight into `head` hides a render failure: without
    # `pipefail` the step's status is the exit status of `head`, so a broken
    # chart would still pass this job.
    - name: Helm template dry-run
      shell: bash
      run: |
        set -euo pipefail
        helm template bessai-test infrastructure/helm/bessai-edge/ \
          --set config.inverterIp=10.0.1.50 \
          --set config.siteId=CI-TEST \
          > "${RUNNER_TEMP}/bessai-rendered.yaml"
        head -60 "${RUNNER_TEMP}/bessai-rendered.yaml"
140+
141+
# ─────────────────────────────────────────────────────────────────
142+
# Job 6: Docker build (valida que la imagen construye)
119143
# ─────────────────────────────────────────────────────────────────
120144
docker-build:
121145
name: Docker build

src/interfaces/alert_manager.py

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
"""
2+
src/interfaces/alert_manager.py
3+
=================================
4+
BESSAI Edge Gateway — Alert Manager.
5+
6+
Centralises alarm lifecycle: detection → routing → silencing → escalation.
7+
Integrates with Prometheus Alertmanager webhooks, PagerDuty, and local BEEP.
8+
9+
Alert types (by severity):
10+
CRITICAL → BESS overtemp / SOC emergency / AI-IDS high-confidence attack
11+
WARNING → AI-IDS elevated score / SOC near limits / comms degraded
12+
INFO → FL round complete / VPP event published / P2P credit minted
13+
14+
Usage::
15+
16+
mgr = AlertManager(site_id="CL-001")
17+
mgr.fire(AlertLevel.CRITICAL, "OVERTEMP", "Battery temp 58°C > 55°C limit")
18+
mgr.fire(AlertLevel.WARNING, "IDS_ELEVATED", "Anomaly score 0.72")
19+
summary = mgr.summary()
20+
"""
21+
22+
from __future__ import annotations
23+
24+
import time
25+
import uuid
26+
from collections import defaultdict, deque
27+
from dataclasses import dataclass, field
28+
from enum import Enum
29+
from typing import Optional
30+
31+
import structlog
32+
33+
from .metrics import IDS_ALERTS_TOTAL, SAFETY_BLOCKS_TOTAL
34+
35+
__all__ = ["AlertManager", "Alert", "AlertLevel"]
36+
37+
log = structlog.get_logger(__name__)
38+
39+
40+
class AlertLevel(str, Enum):
    """Severity of an alert, listed from least to most urgent.

    Members are also ``str`` instances, so each one compares equal to the
    plain string stored as its value (e.g. ``AlertLevel.INFO == "INFO"``).
    """

    INFO = "INFO"
    WARNING = "WARNING"
    CRITICAL = "CRITICAL"
44+
45+
46+
@dataclass
class Alert:
    """One fired alert event together with its resolution state.

    Attributes:
        alert_id: Short identifier; defaults to the first 8 characters of a
            random UUID4 string.
        level: Severity of the alert.
        name: Short alert identifier (e.g. ``"OVERTEMP"``).
        message: Human-readable detail string.
        site_id: Identifier of the site that raised the alert.
        timestamp: Wall-clock time (``time.time()``) at which the alert was
            created.
        resolved: Whether :meth:`resolve` has been called.
        resolved_at: Wall-clock time of resolution, or ``None`` while the
            alert is still unresolved.
    """

    alert_id: str = field(default_factory=lambda: str(uuid.uuid4())[:8])
    level: AlertLevel = AlertLevel.INFO
    name: str = ""
    message: str = ""
    site_id: str = "edge"
    timestamp: float = field(default_factory=time.time)
    resolved: bool = False
    resolved_at: Optional[float] = None

    def resolve(self) -> None:
        """Mark the alert as resolved and record the current wall-clock time."""
        self.resolved = True
        self.resolved_at = time.time()

    def age_s(self) -> float:
        """Return the seconds elapsed since ``timestamp``.

        The value is measured against the current time, so it keeps growing
        after the alert has been resolved.
        """
        return time.time() - self.timestamp

    def to_dict(self) -> dict:
        """Return a plain-dict snapshot of the alert.

        ``level`` is emitted as its string value and ``age_s`` is rounded to
        one decimal. ``resolved_at`` is included so that consumers can tell
        *when* a resolved alert was closed, not only that it was.
        """
        return {
            "alert_id": self.alert_id,
            "level": self.level.value,
            "name": self.name,
            "message": self.message,
            "site_id": self.site_id,
            "timestamp": self.timestamp,
            "resolved": self.resolved,
            "resolved_at": self.resolved_at,
            "age_s": round(self.age_s(), 1),
        }
76+
77+
78+
class AlertManager:
    """Central alert lifecycle manager for a BESSAI edge site.

    At most one alert per name is active at a time. Firing an alert whose
    name is already active is suppressed while the deduplication window is
    open, unless the new alert is more severe than the active one. Resolved
    alerts are moved to a bounded history.

    Parameters:
        site_id: Site identifier for Prometheus labels.
        max_history: Number of resolved alerts to retain.
        dedup_window_s: Seconds within which duplicate alerts are suppressed.
    """

    # Severity ordering keyed by ``AlertLevel.value``. Used so that an
    # escalation (e.g. WARNING -> CRITICAL) is never swallowed by dedup.
    _SEVERITY_RANK = {"INFO": 0, "WARNING": 1, "CRITICAL": 2}

    def __init__(
        self,
        site_id: str = "edge",
        max_history: int = 200,
        dedup_window_s: float = 60.0,
    ) -> None:
        self.site_id = site_id
        self.dedup_window_s = dedup_window_s
        self._active: dict[str, Alert] = {}  # name -> currently active Alert
        self._history: deque[Alert] = deque(maxlen=max_history)
        self._fire_times: dict[str, float] = {}  # name -> last fired ts (active names only)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def fire(
        self,
        level: AlertLevel,
        name: str,
        message: str = "",
    ) -> Optional[Alert]:
        """Fire a new alert (with deduplication).

        A repeat of an already-active name inside ``dedup_window_s`` is
        suppressed, except when ``level`` is more severe than the active
        alert's level: an escalation always goes through so that
        ``critical_count`` / ``has_critical`` reflect it immediately.
        When an alert is (re-)fired for an active name, the new alert
        replaces the previous active one.

        Args:
            level: Severity level.
            name: Short alert identifier (e.g., 'OVERTEMP').
            message: Human-readable detail string.

        Returns:
            The new Alert if fired, None if deduplicated.
        """
        now = time.time()
        current = self._active.get(name)
        if current is not None:
            elapsed = now - self._fire_times.get(name, 0.0)
            escalated = self._rank(level) > self._rank(current.level)
            if elapsed < self.dedup_window_s and not escalated:
                log.debug("alert.deduplicated", name=name, age_s=round(elapsed, 1))
                return None

        alert = Alert(
            level=level,
            name=name,
            message=message,
            site_id=self.site_id,
        )
        self._active[name] = alert
        self._fire_times[name] = now

        # Prometheus counters.
        # NOTE(review): a CRITICAL alert whose name starts with "IDS" is only
        # counted in SAFETY_BLOCKS_TOTAL, not IDS_ALERTS_TOTAL — confirm this
        # is the intended metric semantics.
        if level == AlertLevel.CRITICAL:
            SAFETY_BLOCKS_TOTAL.labels(
                site_id=self.site_id, reason=name
            ).inc()
        elif level == AlertLevel.WARNING and name.startswith("IDS"):
            IDS_ALERTS_TOTAL.labels(
                site_id=self.site_id, reason=name
            ).inc()

        log.warning(
            "alert.fired",
            level=level.value,
            name=name,
            message=message[:80],  # keep log lines short
        )
        return alert

    def resolve(self, name: str) -> bool:
        """Resolve an active alert by name.

        The alert is removed from the active set, marked resolved and
        appended to the history.

        Returns:
            True if an active alert was found and resolved.
        """
        alert = self._active.pop(name, None)
        if alert is None:
            return False
        # The dedup timestamp is only consulted while the name is active;
        # dropping it here keeps the map from growing with every distinct
        # alert name ever fired.
        self._fire_times.pop(name, None)
        alert.resolve()
        self._history.append(alert)
        log.info("alert.resolved", name=name, age_s=round(alert.age_s(), 1))
        return True

    def resolve_all(self) -> int:
        """Resolve all active alerts. Returns count resolved."""
        # Snapshot the names first: resolve() mutates the active dict.
        names = list(self._active)
        for name in names:
            self.resolve(name)
        return len(names)

    # ------------------------------------------------------------------
    # Query
    # ------------------------------------------------------------------

    @property
    def active_count(self) -> int:
        """Number of currently active alerts."""
        return len(self._active)

    @property
    def critical_count(self) -> int:
        """Number of currently active alerts at CRITICAL level."""
        return sum(1 for a in self._active.values() if a.level == AlertLevel.CRITICAL)

    @property
    def has_critical(self) -> bool:
        """True if at least one active alert is CRITICAL."""
        return self.critical_count > 0

    def get_active(self) -> list[dict]:
        """Return the active alerts serialised with ``Alert.to_dict()``."""
        return [a.to_dict() for a in self._active.values()]

    def summary(self) -> dict:
        """Return per-level counts of active alerts plus the active list.

        The result also carries ``site_id``, the total number of active
        alerts and the number of alerts currently held in history.
        """
        counts = defaultdict(int)
        for a in self._active.values():
            counts[a.level.value] += 1
        return {
            "site_id": self.site_id,
            "active_total": self.active_count,
            "critical": counts["CRITICAL"],
            "warning": counts["WARNING"],
            "info": counts["INFO"],
            "history_total": len(self._history),
            "active": self.get_active(),
        }

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    def _rank(self, level: AlertLevel) -> int:
        """Map a level to its position in the severity ordering (unknown -> 0)."""
        return self._SEVERITY_RANK.get(getattr(level, "value", level), 0)

0 commit comments

Comments
 (0)