
Commit fb35f81
feat: production monitoring/alerting + fix all test failures (657/657 pass)
- Add vera/monitoring/ module: MetricsCollector, deep health checks, AlertManager
- Add /health (deep), /metrics, /alerts endpoints with request timing middleware
- Wire metrics into LLM calls, scheduler loops, and agent dispatch
- Add MonitoringSettings config (VERA_MONITORING_* env vars)
- Fix spending alert month_start bug (replace(day=1) kept current time, not midnight)
- Fix test_api.py health check assertion for new deep health response
- Fix ruff lint/format issues
- 25 monitoring tests + 26 staging validation tests
- All 657 tests pass, 0 failures
1 parent 5fa32c8 commit fb35f81
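
One fix in the message deserves a closer look: the spending-alert month_start bug. `datetime.replace(day=1)` only rewinds the day field, so the computed "start of month" still carried the current time of day. The patched file is not among the diffs shown below, so this is a minimal sketch of the bug and the shape of the fix, not the committed code:

```python
from datetime import datetime

now = datetime(2026, 3, 17, 14, 32, 5)

# Buggy: replace(day=1) keeps the current time of day, so the
# "month start" came out as 2026-03-01 14:32:05 instead of midnight.
month_start_buggy = now.replace(day=1)
assert month_start_buggy == datetime(2026, 3, 1, 14, 32, 5)

# Fixed: zero out the time fields as well.
month_start = now.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
assert month_start == datetime(2026, 3, 1)
```

The practical effect of the bug: any spending recorded before the current time of day was cut out of the month-to-date window, understating the total used for alerts.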

13 files changed: 1264 additions & 9 deletions


.env.example

Lines changed: 9 additions & 0 deletions
```diff
@@ -206,3 +206,12 @@ VERA_BROKER_IBKR_HOST=127.0.0.1
 VERA_BROKER_IBKR_PORT=7497
 VERA_BROKER_IBKR_CLIENT_ID=1
 VERA_BROKER_AUTO_TRADE_LIMIT=500
+
+# --- Production Monitoring & Alerting ---
+
+VERA_MONITORING_ENABLED=true
+VERA_MONITORING_METRICS_ENDPOINT=true
+VERA_MONITORING_ALERT_ERROR_RATE_THRESHOLD=0.1
+VERA_MONITORING_ALERT_LATENCY_THRESHOLD_MS=5000
+VERA_MONITORING_ALERT_SCHEDULER_FAIL_THRESHOLD=3
+VERA_MONITORING_ALERT_CHECK_INTERVAL_S=60
```

config.py

Lines changed: 14 additions & 0 deletions
```diff
@@ -374,6 +374,19 @@ class BrokerSettings(BaseSettings):
     model_config = {"env_prefix": "VERA_BROKER_"}


+class MonitoringSettings(BaseSettings):
+    """Production monitoring and alerting configuration."""
+
+    enabled: bool = Field(True, description="Enable production monitoring")
+    metrics_endpoint: bool = Field(True, description="Expose /metrics endpoint")
+    alert_error_rate_threshold: float = Field(0.1, description="HTTP 5xx error rate threshold (10%)")
+    alert_latency_threshold_ms: float = Field(5000.0, description="Average response latency threshold (ms)")
+    alert_scheduler_fail_threshold: int = Field(3, description="Consecutive scheduler failures before alert")
+    alert_check_interval_s: int = Field(60, description="Alert check interval (seconds)")
+
+    model_config = {"env_prefix": "VERA_MONITORING_"}
+
+
 class Settings(BaseSettings):
     """Root settings — aggregates all configuration groups."""

@@ -397,6 +410,7 @@ class Settings(BaseSettings):
     media: MediaSettings = Field(default_factory=MediaSettings)
     spotify: SpotifySettings = Field(default_factory=SpotifySettings)
     ssh: SSHSettings = Field(default_factory=SSHSettings)
+    monitoring: MonitoringSettings = Field(default_factory=MonitoringSettings)
     debug: bool = Field(False, description="Enable debug logging")
     data_dir: Path = Field(default_factory=_resolve_data_dir, description="Data storage directory")
```
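
Because MonitoringSettings uses pydantic-settings with `env_prefix = "VERA_MONITORING_"`, each field maps to one of the .env.example variables above. A quick sketch of the override behavior, assuming standard pydantic-settings semantics; the flat `config` import path is taken from the diff header and may differ if config.py lives inside a package:

```python
import os

# Hypothetical usage: set an env var before instantiating the settings.
os.environ["VERA_MONITORING_ALERT_LATENCY_THRESHOLD_MS"] = "2500"

from config import MonitoringSettings  # module path assumed from the diff

m = MonitoringSettings()
assert m.alert_latency_threshold_ms == 2500.0  # env override beats the 5000.0 default
assert m.alert_error_rate_threshold == 0.1     # untouched default
```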

tests/test_api.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -47,7 +47,7 @@ async def test_health(app):
     async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
         resp = await client.get("/health")
         assert resp.status_code == 200
-        assert resp.json()["status"] == "ok"
+        assert resp.json()["status"] in ("ok", "healthy", "degraded")


 @pytest.mark.asyncio
```
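
The widened assertion accommodates the new deep health payload. Inferred from the assertions in tests/test_monitoring.py below, a degraded response would look roughly like this (illustrative values only, not captured output):

```python
# Shape inferred from the health-check tests; values are made up.
degraded_example = {
    "status": "degraded",          # "healthy" | "degraded" | "unhealthy"
    "version": "1.0.0",
    "uptime_seconds": 4213.7,
    "timestamp": "2026-02-03T12:00:00Z",
    "checks": {
        "memory_vault": {"status": "ok"},
        "scheduler": {"status": "ok"},
        "providers": {"status": "degraded"},  # e.g. one provider down
    },
}
```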

tests/test_monitoring.py

Lines changed: 298 additions & 0 deletions
```python
"""Tests for eVera production monitoring and alerting."""

from __future__ import annotations

import time
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from vera.monitoring.alerts import AlertManager
from vera.monitoring.metrics import MetricsCollector

# ─── MetricsCollector tests ───────────────────────────────────────────


class TestMetricsCollector:
    def setup_method(self):
        self.m = MetricsCollector()

    def test_record_request(self):
        self.m.record_request("GET", "/health", 200, 5.0)
        self.m.record_request("GET", "/health", 200, 10.0)
        data = self.m.get_metrics()
        key = "GET /health 200"
        assert key in data["requests"]
        assert data["requests"][key]["count"] == 2
        assert data["requests"][key]["avg"] == 7.5
        assert data["requests"][key]["errors"] == 0

    def test_record_request_error(self):
        self.m.record_request("POST", "/chat", 500, 100.0)
        data = self.m.get_metrics()
        key = "POST /chat 500"
        assert data["requests"][key]["errors"] == 1
        assert len(data["recent_errors"]) == 1

    def test_record_llm_call(self):
        self.m.record_llm_call("openai", "gpt-4o", "SPECIALIST", 500, 1200.0)
        self.m.record_llm_call("openai", "gpt-4o", "SPECIALIST", 300, 800.0, error=True)
        data = self.m.get_metrics()
        key = "openai/gpt-4o (SPECIALIST)"
        assert key in data["llm"]
        assert data["llm"][key]["count"] == 2
        assert data["llm"][key]["tokens"] == 800
        assert data["llm"][key]["errors"] == 1

    def test_record_scheduler_run(self):
        self.m.record_scheduler_run("reminder", success=True, duration_ms=50.0)
        self.m.record_scheduler_run("reminder", success=False, duration_ms=10.0)
        data = self.m.get_metrics()
        assert "reminder" in data["scheduler"]
        assert data["scheduler"]["reminder"]["count"] == 2
        assert data["scheduler"]["reminder"]["errors"] == 1
        assert data["scheduler"]["reminder"]["last_run"] is not None

    def test_record_agent_dispatch(self):
        self.m.record_agent_dispatch("companion", 300.0)
        self.m.record_agent_dispatch("companion", 500.0, error=True)
        data = self.m.get_metrics()
        assert "companion" in data["agents"]
        assert data["agents"]["companion"]["count"] == 2
        assert data["agents"]["companion"]["errors"] == 1

    def test_get_request_error_rate(self):
        self.m.record_request("GET", "/health", 200, 5.0)
        self.m.record_request("GET", "/chat", 500, 100.0)
        rate = self.m.get_request_error_rate()
        assert rate == 0.5  # 1 error out of 2

    def test_get_llm_error_rate(self):
        self.m.record_llm_call("openai", "gpt-4o", "SPECIALIST", 100, 500.0)
        assert self.m.get_llm_error_rate() == 0.0
        self.m.record_llm_call("openai", "gpt-4o", "SPECIALIST", 0, 0, error=True)
        assert self.m.get_llm_error_rate() == 0.5

    def test_get_avg_request_latency(self):
        self.m.record_request("GET", "/a", 200, 100.0)
        self.m.record_request("GET", "/b", 200, 200.0)
        assert self.m.get_avg_request_latency() == 150.0

    def test_uptime(self):
        data = self.m.get_metrics()
        assert data["uptime_seconds"] >= 0

    def test_reset(self):
        self.m.record_request("GET", "/", 200, 5.0)
        self.m.record_llm_call("x", "y", "z", 1, 1.0)
        self.m.reset()
        data = self.m.get_metrics()
        assert data["requests"] == {}
        assert data["llm"] == {}
        assert data["scheduler"] == {}
        assert data["agents"] == {}
        assert data["recent_errors"] == []


# ─── AlertManager tests ──────────────────────────────────────────────


class TestAlertManager:
    def setup_method(self):
        self.metrics = MetricsCollector()
        self.am = AlertManager(
            self.metrics,
            error_rate_threshold=0.1,
            latency_threshold_ms=1000.0,
            scheduler_fail_threshold=2,
        )

    def test_no_alerts_when_clean(self):
        alerts = self.am.check()
        assert alerts == []

    def test_error_rate_alert(self):
        # 1 success, 1 error = 50% error rate > 10% threshold
        self.metrics.record_request("GET", "/", 200, 5.0)
        self.metrics.record_request("GET", "/bad", 500, 5.0)
        alerts = self.am.check()
        names = [a.name for a in alerts]
        assert "high_error_rate" in names

    def test_error_rate_alert_not_duplicate(self):
        self.metrics.record_request("GET", "/bad", 500, 5.0)
        self.am.check()
        # Second check should not create a duplicate
        alerts = self.am.check()
        assert len([a for a in alerts if a.name == "high_error_rate"]) == 0

    def test_llm_error_rate_alert(self):
        self.metrics.record_llm_call("openai", "gpt-4o", "SPEC", 0, 0, error=True)
        alerts = self.am.check()
        names = [a.name for a in alerts]
        assert "high_llm_error_rate" in names

    def test_latency_alert(self):
        self.metrics.record_request("GET", "/slow", 200, 2000.0)
        alerts = self.am.check()
        names = [a.name for a in alerts]
        assert "high_latency" in names

    def test_scheduler_failure_alert(self):
        self.metrics.record_scheduler_run("reminder", success=False)
        self.metrics.record_scheduler_run("reminder", success=False)
        alerts = self.am.check()
        names = [a.name for a in alerts]
        assert "scheduler_fail_reminder" in names

    def test_alert_resolves(self):
        self.metrics.record_request("GET", "/bad", 500, 5.0)
        self.am.check()
        # Reset and record a success so the error rate falls below the threshold
        self.metrics.reset()
        self.metrics.record_request("GET", "/", 200, 5.0)
        self.am.check()
        active = self.am.get_alerts(include_resolved=False)
        # high_error_rate should be resolved
        error_alerts = [a for a in active if a["name"] == "high_error_rate"]
        assert len(error_alerts) == 0

    def test_get_alerts_with_resolved(self):
        self.metrics.record_request("GET", "/bad", 500, 5.0)
        self.am.check()
        self.metrics.reset()
        self.metrics.record_request("GET", "/", 200, 5.0)
        self.am.check()
        all_alerts = self.am.get_alerts(include_resolved=True)
        assert len(all_alerts) >= 1

    def test_reset(self):
        self.metrics.record_request("GET", "/bad", 500, 5.0)
        self.am.check()
        self.am.reset()
        assert self.am.get_alerts() == []


# ─── Health check tests ──────────────────────────────────────────────


class TestHealthCheck:
    def test_healthy_status(self):
        from vera.monitoring.health import deep_health_check

        brain = MagicMock()
        brain.memory_vault.semantic.get_all.return_value = {"key": "value"}
        brain.memory_vault.get_stats.return_value = {"episodic_events": 5}

        task_mock = MagicMock()
        task_mock.done.return_value = False
        brain.scheduler._tasks = [task_mock]

        brain.provider_manager._provider_health = {"openai": True}

        result = deep_health_check(brain)
        assert result["status"] == "healthy"
        assert result["version"] == "1.0.0"
        assert "uptime_seconds" in result
        assert "timestamp" in result
        assert result["checks"]["memory_vault"]["status"] == "ok"
        assert result["checks"]["scheduler"]["status"] == "ok"
        assert result["checks"]["providers"]["status"] == "ok"

    def test_degraded_when_provider_partial(self):
        from vera.monitoring.health import deep_health_check

        brain = MagicMock()
        brain.memory_vault.semantic.get_all.return_value = {}
        brain.memory_vault.get_stats.return_value = {}

        task_mock = MagicMock()
        task_mock.done.return_value = False
        brain.scheduler._tasks = [task_mock]

        brain.provider_manager._provider_health = {
            "openai": True,
            "ollama": False,
        }

        result = deep_health_check(brain)
        assert result["status"] == "degraded"
        assert result["checks"]["providers"]["status"] == "degraded"

    def test_unhealthy_when_memory_fails(self):
        from vera.monitoring.health import deep_health_check

        brain = MagicMock()
        brain.memory_vault.semantic.get_all.side_effect = Exception("DB error")

        task_mock = MagicMock()
        task_mock.done.return_value = False
        brain.scheduler._tasks = [task_mock]
        brain.provider_manager._provider_health = {}

        result = deep_health_check(brain)
        assert result["status"] == "unhealthy"
        assert result["checks"]["memory_vault"]["status"] == "error"


# ─── Endpoint tests ──────────────────────────────────────────────────


class TestEndpoints:
    @pytest.fixture
    def client(self):
        """Create a test client with a mocked brain."""
        from unittest.mock import PropertyMock

        from fastapi.testclient import TestClient

        mock_brain = MagicMock()
        mock_brain.memory_vault.semantic.get_all.return_value = {}
        mock_brain.memory_vault.get_stats.return_value = {}
        mock_brain.provider_manager._provider_health = {}

        task_mock = MagicMock()
        task_mock.done.return_value = False
        mock_brain.scheduler._tasks = [task_mock]
        mock_brain.scheduler._notification_handlers = []
        mock_brain.scheduler.add_notification_handler = MagicMock()
        mock_brain.scheduler.remove_notification_handler = MagicMock()

        # Patch settings to avoid needing .env
        with patch("vera.app.settings") as mock_settings:
            mock_settings.server.cors_origins = ["http://localhost:8000"]
            mock_settings.server.api_key = ""
            mock_settings.server.webhook_secret = ""
            mock_settings.monitoring.alert_error_rate_threshold = 0.1
            mock_settings.monitoring.alert_latency_threshold_ms = 5000.0
            mock_settings.monitoring.alert_scheduler_fail_threshold = 3

            from vera.app import create_app

            app = create_app(brain=mock_brain)
            yield TestClient(app)

    def test_health_endpoint(self, client):
        resp = client.get("/health")
        assert resp.status_code == 200
        data = resp.json()
        assert "status" in data
        assert "version" in data
        assert "uptime_seconds" in data
        assert "checks" in data

    def test_metrics_endpoint(self, client):
        resp = client.get("/metrics")
        assert resp.status_code == 200
        data = resp.json()
        assert "uptime_seconds" in data
        assert "requests" in data
        assert "llm" in data
        assert "scheduler" in data
        assert "agents" in data

    def test_alerts_endpoint(self, client):
        resp = client.get("/alerts")
        assert resp.status_code == 200
        data = resp.json()
        assert isinstance(data, list)
```
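
vera/monitoring/metrics.py and alerts.py themselves are not part of this excerpt. For orientation, here is a hedged sketch of the MetricsCollector surface the tests above exercise; everything below is inferred from the assertions and may differ from the shipped implementation:

```python
import time
from typing import Any


class MetricsCollector:
    """Sketch inferred from tests/test_monitoring.py; not the actual module."""

    def __init__(self) -> None:
        self._started = time.monotonic()
        self.reset()

    def reset(self) -> None:
        # Per test_reset: counters clear, but uptime keeps ticking.
        self._requests: dict[str, dict[str, Any]] = {}
        self._llm: dict[str, dict[str, Any]] = {}
        self._scheduler: dict[str, dict[str, Any]] = {}
        self._agents: dict[str, dict[str, Any]] = {}
        self._recent_errors: list[dict[str, Any]] = []

    def record_request(self, method: str, path: str, status: int, duration_ms: float) -> None:
        key = f"{method} {path} {status}"  # key format taken from the tests
        e = self._requests.setdefault(key, {"count": 0, "total_ms": 0.0, "errors": 0})
        e["count"] += 1
        e["total_ms"] += duration_ms
        if status >= 500:
            e["errors"] += 1
            self._recent_errors.append({"key": key, "at": time.time()})

    def get_request_error_rate(self) -> float:
        total = sum(e["count"] for e in self._requests.values())
        errors = sum(e["errors"] for e in self._requests.values())
        return errors / total if total else 0.0

    def get_avg_request_latency(self) -> float:
        total = sum(e["count"] for e in self._requests.values())
        ms = sum(e["total_ms"] for e in self._requests.values())
        return ms / total if total else 0.0

    def get_metrics(self) -> dict[str, Any]:
        return {
            "uptime_seconds": time.monotonic() - self._started,
            "requests": {
                k: {"count": e["count"], "errors": e["errors"], "avg": e["total_ms"] / e["count"]}
                for k, e in self._requests.items()
            },
            "llm": self._llm,
            "scheduler": self._scheduler,
            "agents": self._agents,
            "recent_errors": self._recent_errors,
        }
```

record_llm_call, record_scheduler_run, and record_agent_dispatch would follow the same accumulate-per-key pattern (keyed "provider/model (ROLE)" or by name), and AlertManager.check() presumably compares these aggregates against its thresholds, firing each alert once while the condition holds and resolving it when the metric drops back under the threshold, as the duplicate- and resolve-tests assert.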
