forked from microsoft/agent-governance-toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcanary_rollout.py
More file actions
113 lines (94 loc) · 3.86 KB
/
canary_rollout.py
File metadata and controls
113 lines (94 loc) · 3.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Staged Rollout Example — Safely deploy a new agent version.

Compares agent v1 vs v2 decision quality using golden test cases,
with automatic rollback if the new version performs worse.

Run:
    pip install agent-sre
    python examples/canary_rollout.py
"""
import random
from agent_sre.delivery.rollout import (
CanaryRollout,
RollbackCondition,
RolloutStep,
)
# ── Define rollout stages ───────────────────────────────────────────────
# Candidate traffic share grows 5% → 25% → 100% across the three stages.
canary_stages = [
    RolloutStep(name="5% canary", weight=0.05, duration_seconds=60),
    RolloutStep(name="25% ramp", weight=0.25, duration_seconds=120),
    RolloutStep(name="full rollout", weight=1.0, duration_seconds=0),
]

# Guard rails handed to the rollout: error_rate at 10% and
# hallucination_rate at 8%, both with the "gte" comparator.
# NOTE(review): presumably any single breached condition triggers a
# rollback — confirm against CanaryRollout.check_rollback.
rollback_guards = [
    RollbackCondition(metric="error_rate", comparator="gte", threshold=0.10),
    RollbackCondition(metric="hallucination_rate", comparator="gte", threshold=0.08),
]

rollout = CanaryRollout(
    name="assistant-v2",
    steps=canary_stages,
    rollback_conditions=rollback_guards,
)
# ── Simulate golden test evaluation ────────────────────────────────────
def evaluate_agent_version(version: str, n_tasks: int = 50) -> dict[str, float]:
    """Return canned golden-test metrics for the requested agent version.

    Args:
        version: ``"v1"`` selects the baseline scores; any other value
            selects the candidate (v2) scores.
        n_tasks: Accepted for signature compatibility; this simulation
            does not use it.

    Returns:
        A freshly built dict with the keys ``accuracy``, ``error_rate``,
        ``hallucination_rate``, ``avg_latency_ms`` and ``cost_per_task``.
    """
    metric_names = (
        "accuracy",
        "error_rate",
        "hallucination_rate",
        "avg_latency_ms",
        "cost_per_task",
    )
    baseline_scores = (0.94, 0.06, 0.04, 1200, 0.35)
    # The candidate is faster and cheaper, but its hallucination rate of
    # 0.12 is the simulated regression.
    candidate_scores = (0.91, 0.09, 0.12, 900, 0.28)

    chosen = baseline_scores if version == "v1" else candidate_scores
    return dict(zip(metric_names, chosen))
print("Staged Rollout Example")
print("=" * 60)
print()

# ── Start rollout ──────────────────────────────────────────────────────
rollout.start()
print(f"Started rollout: {rollout.name}")
print(f"Status: {rollout.state.value}")
print()

# ── Step through canary stages ─────────────────────────────────────────
# Each pass evaluates the candidate at the current stage, then either
# rolls back (and stops) or advances to the next stage.
stage_index = 0
while rollout.current_step is not None:
    stage = rollout.current_step
    stage_index += 1
    print(f"Stage {stage_index}: {stage.name} (traffic: {stage.weight:.0%})")

    # Evaluate candidate
    metrics = evaluate_agent_version("v2")
    stage_report = (
        f" Accuracy: {metrics['accuracy']:.1%}",
        f" Error Rate: {metrics['error_rate']:.1%}",
        f" Hallucination: {metrics['hallucination_rate']:.1%}",
        f" Latency: {metrics['avg_latency_ms']:.0f}ms",
        f" Cost/Task: ${metrics['cost_per_task']:.2f}",
    )
    print("\n".join(stage_report))

    # Check rollback conditions
    if rollout.check_rollback(metrics):
        rollout.rollback(reason="Canary metrics breached rollback thresholds")
        print()
        print(" 🛑 ROLLBACK triggered!")
        # NOTE(review): the reason below is hard-coded to the hallucination
        # threshold; it would be misleading if a different condition
        # (e.g. error_rate) were the one that fired.
        print(f" Reason: hallucination_rate {metrics['hallucination_rate']:.1%} >= 8% threshold")
        break

    # Advance to next stage; stop when the rollout reports it did not advance.
    if not rollout.advance():
        break
    print(" ✅ Metrics within bounds, advancing...")
    print()

print()
final_state = rollout.state.value
print(f"Final Status: {final_state}")
print(f"Progress: {rollout.progress_percent:.0f}%")
print()
print("─" * 60)
if final_state != "rolled_back":
    print("Agent v2 promoted to full traffic.")
else:
    print("Agent v2 was automatically rolled back due to hallucination regression.")
    print("Users continue to receive agent v1 — zero impact.")