forked from microsoft/agent-governance-toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathslo_alerting.py
More file actions
112 lines (89 loc) · 3.97 KB
/
slo_alerting.py
File metadata and controls
112 lines (89 loc) · 3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
SLO Alerting Example — Set up SLOs for an AI agent with error budgets.

Demonstrates defining SLIs (latency, tool-call accuracy, task success rate),
combining them into an SLO with an error budget, and triggering alerts when
the budget burns too fast.

Run:
    pip install agent-sre
    python examples/slo_alerting.py
"""
import random
from agent_sre import SLO, ErrorBudget
from agent_sre.slo.indicators import (
ResponseLatency,
TaskSuccessRate,
ToolCallAccuracy,
)
from agent_sre.slo.objectives import ExhaustionAction, SLOStatus
# ── 1. Register SLIs ───────────────────────────────────────────────────
# Three indicators, each constructed with its own target and window; all of
# them are attached to a single SLO further down in this script.
latency = ResponseLatency(
    target_ms=3000.0,
    percentile=0.95,
    window="1h",
)
accuracy = ToolCallAccuracy(
    target=0.99,
    window="24h",
)
success = TaskSuccessRate(
    target=0.95,
    window="24h",
)
# ── 2. Create SLO with error budget ───────────────────────────────────
# The budget is bound to its own name because the alert section reads
# `budget.exhaustion_action` directly when the SLO status is EXHAUSTED.
budget = ErrorBudget(
    # 5% error budget (1 - 0.95 target)
    total=0.05,
    # Warn at 2× normal consumption
    burn_rate_alert=2.0,
    # Critical at 10× normal consumption
    burn_rate_critical=10.0,
    exhaustion_action=ExhaustionAction.FREEZE_DEPLOYMENTS,
)

# The SLO name and the agent id are the same string in this example.
agent_name = "code-review-agent"
slo = SLO(
    name=agent_name,
    description="Reliability targets for code-review agent",
    indicators=[latency, accuracy, success],
    error_budget=budget,
    agent_id=agent_name,
)
# ── 3. Simulate agent work ─────────────────────────────────────────────
print("SLO Alerting Example")
print("=" * 60)
print()

# Feed 200 synthetic observations into the three indicators and the SLO.
for _ in range(200):
    # The draw order (task, tool, latency) is kept fixed so that a seeded
    # RNG would replay exactly the same sequence of outcomes.
    task_ok = random.random() < 0.92  # 92% success (below 95% target)
    tool_ok = random.random() < 0.98  # 98% accuracy (below 99% target)
    # Gaussian latency sample (mu=2500, sigma=800), floored at 100 so the
    # recorded value is never zero or negative.
    latency_ms = max(100, random.gauss(2500, 800))

    success.record_task(task_ok)
    accuracy.record_call(tool_ok)
    latency.record_latency(latency_ms)

    # An event is reported as good only when both the task and the tool
    # call succeeded.
    slo.record_event(good=task_ok and tool_ok)
# ── 4. Check error budget consumption ──────────────────────────────────
print("SLO Status")
print("-" * 40)

# Evaluate once; the resulting status is reused by the alert section below.
status = slo.evaluate()
error_budget = slo.error_budget

print(f" Name: {slo.name}")
print(f" Status: {status.value}")
print(f" Error Budget Left: {error_budget.remaining_percent:.1f}%")
print(f" Budget Exhausted: {error_budget.is_exhausted}")
# 3600 is presumably a window in seconds, matching the "(1h)" label — confirm
# against the ErrorBudget.burn_rate signature.
print(f" Burn Rate (1h): {error_budget.burn_rate(3600):.1f}×")
print()
# ── 5. Check indicators ───────────────────────────────────────────────
print("Indicators")
print("-" * 40)

for indicator in slo.indicators:
    current = indicator.current_value()
    compliance = indicator.compliance()
    # Indicators that report no current value or no compliance are skipped.
    if current is None or compliance is None:
        continue
    # NOTE(review): compliance is printed as a percentage below, yet it is
    # compared against `target`. For the latency indicator (constructed with
    # target_ms=3000.0) the target may not be a 0–1 ratio — confirm that this
    # comparison is meaningful for every indicator type.
    marker = "✅" if compliance >= indicator.target else "❌"
    print(f" {marker} {indicator.name}: {current:.3f} (target: {indicator.target}, compliance: {compliance:.1%})")
print()
# ── 6. Trigger alerts based on budget ──────────────────────────────────
print("Alerts")
print("-" * 40)

# List every alert the error budget currently reports as firing.
firing = slo.error_budget.firing_alerts()
if firing:
    for alert in firing:
        print(f" 🔔 {alert.severity.upper()}: {alert.name} (threshold: {alert.rate:.1f}×)")
else:
    print(" No alerts firing")

# Follow-up line driven by the status returned from slo.evaluate(); printed
# regardless of whether any individual alert fired above.
if status == SLOStatus.EXHAUSTED:
    print()
    print(f" 🚨 Error budget exhausted — action: {budget.exhaustion_action.value}")
elif status in (SLOStatus.CRITICAL, SLOStatus.WARNING):
    print()
    # Plain string literal: nothing is interpolated, so no f-prefix is needed.
    print(" ⚠️ SLO at risk — consider slowing deployments")

print()
print("─" * 60)
print("Use SLOs to define 'reliable' and error budgets to know when to slow down.")