|
| 1 | +"""Tests for eVera production monitoring and alerting.""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +import time |
| 6 | +from unittest.mock import AsyncMock, MagicMock, patch |
| 7 | + |
| 8 | +import pytest |
| 9 | + |
| 10 | +from vera.monitoring.alerts import AlertManager |
| 11 | +from vera.monitoring.metrics import MetricsCollector |
| 12 | + |
| 13 | +# ─── MetricsCollector tests ─────────────────────────────────────────── |
| 14 | + |
| 15 | + |
class TestMetricsCollector:
    """Behavioural tests for MetricsCollector counters and aggregates."""

    def setup_method(self):
        # A fresh collector per test keeps counters isolated between cases.
        self.collector = MetricsCollector()

    def test_record_request(self):
        # Two successful requests on the same route should share one bucket.
        for latency in (5.0, 10.0):
            self.collector.record_request("GET", "/health", 200, latency)
        snapshot = self.collector.get_metrics()
        key = "GET /health 200"
        assert key in snapshot["requests"]
        bucket = snapshot["requests"][key]
        assert bucket["count"] == 2
        assert bucket["avg"] == 7.5
        assert bucket["errors"] == 0

    def test_record_request_error(self):
        # A 5xx response counts as an error and lands in recent_errors.
        self.collector.record_request("POST", "/chat", 500, 100.0)
        snapshot = self.collector.get_metrics()
        assert snapshot["requests"]["POST /chat 500"]["errors"] == 1
        assert len(snapshot["recent_errors"]) == 1

    def test_record_llm_call(self):
        # One clean call plus one errored call against the same model key.
        self.collector.record_llm_call("openai", "gpt-4o", "SPECIALIST", 500, 1200.0)
        self.collector.record_llm_call("openai", "gpt-4o", "SPECIALIST", 300, 800.0, error=True)
        snapshot = self.collector.get_metrics()
        key = "openai/gpt-4o (SPECIALIST)"
        assert key in snapshot["llm"]
        stats = snapshot["llm"][key]
        assert stats["count"] == 2
        assert stats["tokens"] == 800
        assert stats["errors"] == 1

    def test_record_scheduler_run(self):
        # Mixed success/failure runs for a single task name.
        self.collector.record_scheduler_run("reminder", success=True, duration_ms=50.0)
        self.collector.record_scheduler_run("reminder", success=False, duration_ms=10.0)
        snapshot = self.collector.get_metrics()
        assert "reminder" in snapshot["scheduler"]
        task_stats = snapshot["scheduler"]["reminder"]
        assert task_stats["count"] == 2
        assert task_stats["errors"] == 1
        assert task_stats["last_run"] is not None

    def test_record_agent_dispatch(self):
        self.collector.record_agent_dispatch("companion", 300.0)
        self.collector.record_agent_dispatch("companion", 500.0, error=True)
        snapshot = self.collector.get_metrics()
        assert "companion" in snapshot["agents"]
        agent_stats = snapshot["agents"]["companion"]
        assert agent_stats["count"] == 2
        assert agent_stats["errors"] == 1

    def test_get_request_error_rate(self):
        self.collector.record_request("GET", "/health", 200, 5.0)
        self.collector.record_request("GET", "/chat", 500, 100.0)
        # One failing request out of two total.
        assert self.collector.get_request_error_rate() == 0.5

    def test_get_llm_error_rate(self):
        self.collector.record_llm_call("openai", "gpt-4o", "SPECIALIST", 100, 500.0)
        assert self.collector.get_llm_error_rate() == 0.0
        self.collector.record_llm_call("openai", "gpt-4o", "SPECIALIST", 0, 0, error=True)
        assert self.collector.get_llm_error_rate() == 0.5

    def test_get_avg_request_latency(self):
        self.collector.record_request("GET", "/a", 200, 100.0)
        self.collector.record_request("GET", "/b", 200, 200.0)
        # Mean of 100ms and 200ms.
        assert self.collector.get_avg_request_latency() == 150.0

    def test_uptime(self):
        assert self.collector.get_metrics()["uptime_seconds"] >= 0

    def test_reset(self):
        # Populate a couple of categories, then verify reset empties them all.
        self.collector.record_request("GET", "/", 200, 5.0)
        self.collector.record_llm_call("x", "y", "z", 1, 1.0)
        self.collector.reset()
        snapshot = self.collector.get_metrics()
        for section in ("requests", "llm", "scheduler", "agents"):
            assert snapshot[section] == {}
        assert snapshot["recent_errors"] == []
| 96 | + |
| 97 | +# ─── AlertManager tests ────────────────────────────────────────────── |
| 98 | + |
| 99 | + |
class TestAlertManager:
    """Tests for AlertManager threshold checks and alert lifecycle."""

    def setup_method(self):
        # Low thresholds so a single bad sample trips the corresponding alert.
        self.collector = MetricsCollector()
        self.manager = AlertManager(
            self.collector,
            error_rate_threshold=0.1,
            latency_threshold_ms=1000.0,
            scheduler_fail_threshold=2,
        )

    def _alert_names(self):
        """Run a check and return the names of the alerts it raised."""
        return [alert.name for alert in self.manager.check()]

    def test_no_alerts_when_clean(self):
        assert self.manager.check() == []

    def test_error_rate_alert(self):
        # 1 success, 1 error = 50% error rate > 10% threshold
        self.collector.record_request("GET", "/", 200, 5.0)
        self.collector.record_request("GET", "/bad", 500, 5.0)
        assert "high_error_rate" in self._alert_names()

    def test_error_rate_alert_not_duplicate(self):
        self.collector.record_request("GET", "/bad", 500, 5.0)
        self.manager.check()
        # Second check should not create duplicate
        repeated = [name for name in self._alert_names() if name == "high_error_rate"]
        assert len(repeated) == 0

    def test_llm_error_rate_alert(self):
        self.collector.record_llm_call("openai", "gpt-4o", "SPEC", 0, 0, error=True)
        assert "high_llm_error_rate" in self._alert_names()

    def test_latency_alert(self):
        # 2000ms average is well over the 1000ms threshold.
        self.collector.record_request("GET", "/slow", 200, 2000.0)
        assert "high_latency" in self._alert_names()

    def test_scheduler_failure_alert(self):
        # Two consecutive failures meet the scheduler_fail_threshold of 2.
        for _ in range(2):
            self.collector.record_scheduler_run("reminder", success=False)
        assert "scheduler_fail_reminder" in self._alert_names()

    def test_alert_resolves(self):
        self.collector.record_request("GET", "/bad", 500, 5.0)
        self.manager.check()
        # Now add many successes to bring rate below threshold
        self.collector.reset()
        self.collector.record_request("GET", "/", 200, 5.0)
        self.manager.check()
        active = self.manager.get_alerts(include_resolved=False)
        # high_error_rate should be resolved
        still_active = [a for a in active if a["name"] == "high_error_rate"]
        assert len(still_active) == 0

    def test_get_alerts_with_resolved(self):
        self.collector.record_request("GET", "/bad", 500, 5.0)
        self.manager.check()
        self.collector.reset()
        self.collector.record_request("GET", "/", 200, 5.0)
        self.manager.check()
        # Resolved alerts remain visible when explicitly requested.
        assert len(self.manager.get_alerts(include_resolved=True)) >= 1

    def test_reset(self):
        self.collector.record_request("GET", "/bad", 500, 5.0)
        self.manager.check()
        self.manager.reset()
        assert self.manager.get_alerts() == []
| 175 | + |
| 176 | +# ─── Health check tests ────────────────────────────────────────────── |
| 177 | + |
| 178 | + |
class TestHealthCheck:
    """Tests for deep_health_check status aggregation over a mocked brain."""

    @staticmethod
    def _brain_stub():
        """Return a MagicMock brain with one live scheduler task and no providers."""
        brain = MagicMock()
        running_task = MagicMock()
        running_task.done.return_value = False
        brain.scheduler._tasks = [running_task]
        brain.provider_manager._provider_health = {}
        return brain

    def test_healthy_status(self):
        from vera.monitoring.health import deep_health_check

        brain = self._brain_stub()
        brain.memory_vault.semantic.get_all.return_value = {"key": "value"}
        brain.memory_vault.get_stats.return_value = {"episodic_events": 5}
        brain.provider_manager._provider_health = {"openai": True}

        report = deep_health_check(brain)
        assert report["status"] == "healthy"
        assert report["version"] == "1.0.0"
        assert "uptime_seconds" in report
        assert "timestamp" in report
        for check in ("memory_vault", "scheduler", "providers"):
            assert report["checks"][check]["status"] == "ok"

    def test_degraded_when_provider_partial(self):
        from vera.monitoring.health import deep_health_check

        brain = self._brain_stub()
        brain.memory_vault.semantic.get_all.return_value = {}
        brain.memory_vault.get_stats.return_value = {}
        # One healthy provider alongside one down provider → degraded overall.
        brain.provider_manager._provider_health = {
            "openai": True,
            "ollama": False,
        }

        report = deep_health_check(brain)
        assert report["status"] == "degraded"
        assert report["checks"]["providers"]["status"] == "degraded"

    def test_unhealthy_when_memory_fails(self):
        from vera.monitoring.health import deep_health_check

        brain = self._brain_stub()
        # Simulate the memory vault blowing up on read.
        brain.memory_vault.semantic.get_all.side_effect = Exception("DB error")

        report = deep_health_check(brain)
        assert report["status"] == "unhealthy"
        assert report["checks"]["memory_vault"]["status"] == "error"
| 237 | + |
| 238 | +# ─── Endpoint tests ────────────────────────────────────────────────── |
| 239 | + |
| 240 | + |
class TestEndpoints:
    """Integration tests for the monitoring HTTP endpoints via TestClient."""

    @pytest.fixture
    def client(self):
        """Create a test client over an app wired to a fully mocked brain.

        Settings are patched while the app is constructed so no .env file
        is required; the patch stays active for the client's lifetime.
        """
        # Fix: removed unused local import of PropertyMock.
        from fastapi.testclient import TestClient

        mock_brain = MagicMock()
        mock_brain.memory_vault.semantic.get_all.return_value = {}
        mock_brain.memory_vault.get_stats.return_value = {}
        mock_brain.provider_manager._provider_health = {}

        # One non-finished task so the scheduler health check reports it running.
        task_mock = MagicMock()
        task_mock.done.return_value = False
        mock_brain.scheduler._tasks = [task_mock]
        mock_brain.scheduler._notification_handlers = []
        mock_brain.scheduler.add_notification_handler = MagicMock()
        mock_brain.scheduler.remove_notification_handler = MagicMock()

        # Patch settings to avoid needing .env
        with patch("vera.app.settings") as mock_settings:
            mock_settings.server.cors_origins = ["http://localhost:8000"]
            mock_settings.server.api_key = ""
            mock_settings.server.webhook_secret = ""
            mock_settings.monitoring.alert_error_rate_threshold = 0.1
            mock_settings.monitoring.alert_latency_threshold_ms = 5000.0
            mock_settings.monitoring.alert_scheduler_fail_threshold = 3

            # Imported here so create_app reads the patched settings.
            from vera.app import create_app

            app = create_app(brain=mock_brain)
            yield TestClient(app)

    def test_health_endpoint(self, client):
        resp = client.get("/health")
        assert resp.status_code == 200
        data = resp.json()
        assert "status" in data
        assert "version" in data
        assert "uptime_seconds" in data
        assert "checks" in data

    def test_metrics_endpoint(self, client):
        resp = client.get("/metrics")
        assert resp.status_code == 200
        data = resp.json()
        assert "uptime_seconds" in data
        assert "requests" in data
        assert "llm" in data
        assert "scheduler" in data
        assert "agents" in data

    def test_alerts_endpoint(self, client):
        resp = client.get("/alerts")
        assert resp.status_code == 200
        data = resp.json()
        assert isinstance(data, list)
0 commit comments