-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathopenqoe-alert-rules.yml
More file actions
310 lines (293 loc) · 11.8 KB
/
openqoe-alert-rules.yml
File metadata and controls
310 lines (293 loc) · 11.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# OpenQoE Alert Rules
# Production-ready alerts for video QoE monitoring
#
# Rule groups defined in this file, in order:
#   quality_critical, business_impact, performance, live_streaming,
#   pipeline_health
# Every rule carries severity/component/metric labels plus summary,
# description and impact annotations; some also link a runbook.
groups:
# Critical Quality Alerts
# Core playback-quality signals (startup time, rebuffering, errors),
# evaluated every 30 seconds and broken down per org and player.
- name: quality_critical
  interval: 30s
  rules:
  # Video Startup Time (VST), critical tier: P95 above 3 seconds.
  - alert: HighVideoStartupTime
    expr: |-
      histogram_quantile(
        0.95,
        sum by (le, org_id, player_id) (
          rate(openqoe_video_startup_seconds_bucket[5m])
        )
      ) > 3
    for: 5m
    labels:
      severity: critical
      component: player
      metric: video_startup_time
    annotations:
      summary: "High video startup time detected"
      description: "P95 video startup time is {{ $value | humanizeDuration }} for org={{ $labels.org_id }}, player={{ $labels.player_id }}. Threshold: 3 seconds."
      impact: "Users experiencing slow video loading, likely to abandon playback"
      runbook: "https://docs.openqoe.com/runbooks/high-vst"
  # Video Startup Time, warning tier: same P95 query with a lower 2 second
  # threshold, so this condition also holds whenever the critical rule's does.
  - alert: ElevatedVideoStartupTime
    expr: |-
      histogram_quantile(
        0.95,
        sum by (le, org_id, player_id) (
          rate(openqoe_video_startup_seconds_bucket[5m])
        )
      ) > 2
    for: 5m
    labels:
      severity: warning
      component: player
      metric: video_startup_time
    annotations:
      summary: "Elevated video startup time"
      description: "P95 video startup time is {{ $value | humanizeDuration }} for org={{ $labels.org_id }}, player={{ $labels.player_id }}. Threshold: 2 seconds."
      impact: "Degraded user experience during video initialization"
  # Rebuffer rate, critical tier: rebuffer events per started view above 0.1.
  - alert: HighRebufferRate
    expr: |-
      (
        sum by (org_id, player_id) (rate(openqoe_rebuffer_events_total[5m]))
        /
        sum by (org_id, player_id) (rate(openqoe_views_started_total[5m]))
      ) > 0.1
    for: 5m
    labels:
      severity: critical
      component: streaming
      metric: rebuffer_rate
    annotations:
      summary: "High rebuffer rate detected"
      description: "Rebuffer rate is {{ $value | humanizePercentage }} for org={{ $labels.org_id }}, player={{ $labels.player_id }}. Threshold: 10%."
      impact: "More than 10% of views experiencing buffering interruptions"
      runbook: "https://docs.openqoe.com/runbooks/high-rebuffer-rate"
  # Rebuffer rate, warning tier: same ratio with a 0.05 threshold.
  - alert: ElevatedRebufferRate
    expr: |-
      (
        sum by (org_id, player_id) (rate(openqoe_rebuffer_events_total[5m]))
        /
        sum by (org_id, player_id) (rate(openqoe_views_started_total[5m]))
      ) > 0.05
    for: 5m
    labels:
      severity: warning
      component: streaming
      metric: rebuffer_rate
    annotations:
      summary: "Elevated rebuffer rate"
      description: "Rebuffer rate is {{ $value | humanizePercentage }} for org={{ $labels.org_id }}, player={{ $labels.player_id }}. Threshold: 5%."
      impact: "Users experiencing increased buffering interruptions"
  # Error rate: errors as a fraction of all events, above 0.05.
  - alert: HighErrorRate
    expr: |-
      (
        sum by (org_id, player_id) (rate(openqoe_errors_total[5m]))
        /
        sum by (org_id, player_id) (rate(openqoe_events_total[5m]))
      ) > 0.05
    for: 5m
    labels:
      severity: critical
      component: player
      metric: error_rate
    annotations:
      summary: "High error rate detected"
      description: "Error rate is {{ $value | humanizePercentage }} for org={{ $labels.org_id }}, player={{ $labels.player_id }}. Threshold: 5%."
      impact: "More than 5% of events are errors, indicating systemic issues"
      runbook: "https://docs.openqoe.com/runbooks/high-error-rate"
  # Network errors: only error_family="network" errors, as a fraction of all
  # events, above 0.02. Uses a shorter 3 minute hold than the rules above.
  - alert: NetworkErrorsSpike
    expr: |-
      (
        sum by (org_id, player_id) (rate(openqoe_errors_total{error_family="network"}[5m]))
        /
        sum by (org_id, player_id) (rate(openqoe_events_total[5m]))
      ) > 0.02
    for: 3m
    labels:
      severity: warning
      component: network
      metric: network_errors
    annotations:
      summary: "Network error spike detected"
      description: "Network error rate is {{ $value | humanizePercentage }} for org={{ $labels.org_id }}, player={{ $labels.player_id }}."
      impact: "Potential CDN or network connectivity issues"
# Business Impact Alerts
# Engagement signals (completion rate, watch time), evaluated every minute.
- name: business_impact
  interval: 1m
  rules:
  # Low Completion Rate (warning tier): average completion rate per
  # org/player/video below 0.5 for 10 minutes.
  - alert: LowCompletionRate
    expr: |
      avg(openqoe_completion_rate) by (org_id, player_id, video_id) < 0.5
    for: 10m
    labels:
      severity: warning
      component: engagement
      metric: completion_rate
    annotations:
      summary: "Low video completion rate"
      description: "Completion rate is {{ $value | humanizePercentage }} for org={{ $labels.org_id }}, video={{ $labels.video_id }}. Threshold: 50%."
      impact: "Low viewer engagement, potential content or quality issues"
      runbook: "https://docs.openqoe.com/runbooks/low-completion"
  # Very Low Completion Rate (critical tier): same query, 0.25 threshold,
  # shorter 5 minute hold.
  - alert: VeryLowCompletionRate
    expr: |
      avg(openqoe_completion_rate) by (org_id, player_id, video_id) < 0.25
    for: 5m
    labels:
      severity: critical
      component: engagement
      metric: completion_rate
    annotations:
      summary: "Very low video completion rate"
      description: "Completion rate is {{ $value | humanizePercentage }} for org={{ $labels.org_id }}, video={{ $labels.video_id }}. Threshold: 25%."
      impact: "Severe engagement issues, viewers abandoning content"
  # Dropping Watch Time: per-org playing-time rate over the last 30 minutes
  # compared with the same window one hour earlier.
  # The expression is written as a positive drop fraction,
  # (previous - current) / previous > 0.3, so that $value renders as
  # "decreased by 35%" rather than "decreased by -35%". The firing condition
  # is identical to (current - previous) / previous < -0.3.
  # NOTE(review): rate() assumes openqoe_playing_time_seconds is a counter;
  # confirm against the exporter.
  - alert: DroppingWatchTime
    expr: |
      (
        sum(rate(openqoe_playing_time_seconds[30m] offset 1h)) by (org_id)
        -
        sum(rate(openqoe_playing_time_seconds[30m])) by (org_id)
      )
      /
      sum(rate(openqoe_playing_time_seconds[30m] offset 1h)) by (org_id)
      > 0.3
    for: 15m
    labels:
      severity: warning
      component: engagement
      metric: watch_time
    annotations:
      summary: "Watch time dropping significantly"
      description: "Watch time has decreased by {{ $value | humanizePercentage }} for org={{ $labels.org_id }} compared to 1 hour ago."
      impact: "Overall engagement declining, potential quality issues"
# Performance Alerts
# Player responsiveness and rendering signals, evaluated every minute.
- name: performance
  interval: 1m
  rules:
  # Seek latency: P95 per org/player above 2 seconds.
  - alert: HighSeekLatency
    expr: |-
      histogram_quantile(
        0.95,
        sum by (le, org_id, player_id) (
          rate(openqoe_seek_latency_seconds_bucket[5m])
        )
      ) > 2
    for: 5m
    labels:
      severity: warning
      component: player
      metric: seek_latency
    annotations:
      summary: "High seek latency detected"
      description: "P95 seek latency is {{ $value | humanizeDuration }} for org={{ $labels.org_id }}, player={{ $labels.player_id }}. Threshold: 2 seconds."
      impact: "Users experiencing slow scrubbing/seeking operations"
  # Dropped frames: summed per-second drop rate per org/player/device
  # category above 10. The threshold applies to the sum over all matching
  # series, not to an individual viewer.
  - alert: HighDroppedFramesRate
    expr: |-
      sum by (org_id, player_id, device_category) (
        rate(openqoe_dropped_frames_total[5m])
      ) > 10
    for: 5m
    labels:
      severity: warning
      component: rendering
      metric: dropped_frames
    annotations:
      summary: "High dropped frames rate"
      description: "Dropped frames rate is {{ $value }} fps for org={{ $labels.org_id }}, device={{ $labels.device_category }}."
      impact: "Visual playback quality degradation, potential device or bitrate issues"
  # Rebuffer duration: P95 per org/player above 5 seconds.
  - alert: LongRebufferDuration
    expr: |-
      histogram_quantile(
        0.95,
        sum by (le, org_id, player_id) (
          rate(openqoe_rebuffer_duration_seconds_bucket[5m])
        )
      ) > 5
    for: 5m
    labels:
      severity: warning
      component: streaming
      metric: rebuffer_duration
    annotations:
      summary: "Long rebuffer duration detected"
      description: "P95 rebuffer duration is {{ $value | humanizeDuration }} for org={{ $labels.org_id }}, player={{ $labels.player_id }}. Threshold: 5 seconds."
      impact: "Extended buffering interruptions affecting user experience"
# Live Streaming Alerts
# Per-stream audience and join-time signals, evaluated every 10 seconds.
- name: live_streaming
  interval: 10s
  rules:
  # No Concurrent Viewers (Stream May Be Down): summed viewer gauge per
  # org/stream equals zero for 2 minutes.
  # NOTE(review): this only matches while the gauge series still exists and
  # reports 0; if the series disappears the comparison yields no result and
  # the alert stays silent. Confirm how the exporter reports ended streams.
  - alert: NoLiveConcurrentViewers
    expr: |
      sum(openqoe_concurrent_views_current) by (org_id, stream_id) == 0
    for: 2m
    labels:
      severity: critical
      component: live_streaming
      metric: concurrent_viewers
    annotations:
      summary: "No concurrent viewers for live stream"
      description: "Live stream {{ $labels.stream_id }} for org={{ $labels.org_id }} has zero viewers."
      impact: "Stream may be down or experiencing critical issues"
      runbook: "https://docs.openqoe.com/runbooks/no-live-viewers"
  # Concurrent Viewers Dropped Significantly: viewers now versus 5 minutes
  # ago, per org/stream.
  # The expression is written as a positive drop fraction,
  # (previous - current) / previous > 0.5, so that $value renders as
  # "dropped by 60%" rather than "dropped by -60%". The firing condition is
  # identical to (current - previous) / previous < -0.5.
  - alert: LiveViewersDrop
    expr: |
      (
        sum(openqoe_concurrent_views_current offset 5m) by (org_id, stream_id)
        -
        sum(openqoe_concurrent_views_current) by (org_id, stream_id)
      )
      /
      sum(openqoe_concurrent_views_current offset 5m) by (org_id, stream_id)
      > 0.5
    for: 2m
    labels:
      severity: warning
      component: live_streaming
      metric: concurrent_viewers
    annotations:
      summary: "Live stream viewership dropped sharply"
      description: "Concurrent viewers dropped by {{ $value | humanizePercentage }} for stream={{ $labels.stream_id }}."
      impact: "Potential stream quality or delivery issue causing viewer abandonment"
  # High Live Join Time: P95 startup time restricted to content_type="live",
  # per org/stream, above 5 seconds.
  - alert: HighLiveJoinTime
    expr: |
      histogram_quantile(0.95,
        sum(rate(openqoe_video_startup_seconds_bucket{content_type="live"}[5m])) by (le, org_id, stream_id)
      ) > 5
    for: 3m
    labels:
      severity: warning
      component: live_streaming
      metric: join_time
    annotations:
      summary: "High live stream join time"
      description: "P95 join time is {{ $value | humanizeDuration }} for stream={{ $labels.stream_id }}. Threshold: 5 seconds."
      impact: "Users experiencing slow live stream startup"
# Data Pipeline Alerts (for Phase 2 - Worker Health)
# Ingestion-level signals, evaluated every minute.
- name: pipeline_health
  interval: 1m
  rules:
  # No Events Received (Worker or SDK Issue)
  # The rate is aggregated per org so that one idle label combination
  # (e.g. a single quiet player) cannot raise a critical page on its own;
  # the annotation only references org_id, which the aggregation preserves.
  # `or absent(...)` covers the case where the metric is missing altogether:
  # rate() over a missing series returns nothing, so without it the alert
  # would stay silent during a full ingestion outage. In that branch the
  # alert carries no org_id label and the description renders it empty.
  - alert: NoEventsReceived
    expr: |
      sum by (org_id) (rate(openqoe_events_total[5m])) == 0
      or
      absent(openqoe_events_total)
    for: 5m
    labels:
      severity: critical
      component: worker
      metric: event_ingestion
    annotations:
      summary: "No QoE events being received"
      description: "Worker has received zero events for 5 minutes for org={{ $labels.org_id }}."
      impact: "Data pipeline broken, no monitoring data available"
      runbook: "https://docs.openqoe.com/runbooks/no-events"
  # Events Dropped Significantly: per-org event rate now versus 30 minutes
  # ago. Aggregated by org_id for the same reason as the rule above.
  # The expression is written as a positive drop fraction,
  # (previous - current) / previous > 0.7, so that $value renders as
  # "decreased by 80%" rather than "decreased by -80%". The threshold is
  # equivalent to (current - previous) / previous < -0.7.
  - alert: EventsDroppedSignificantly
    expr: |
      (
        sum by (org_id) (rate(openqoe_events_total[5m] offset 30m))
        -
        sum by (org_id) (rate(openqoe_events_total[5m]))
      )
      /
      sum by (org_id) (rate(openqoe_events_total[5m] offset 30m))
      > 0.7
    for: 10m
    labels:
      severity: warning
      component: worker
      metric: event_ingestion
    annotations:
      summary: "Event ingestion dropped significantly"
      description: "Event rate decreased by {{ $value | humanizePercentage }} for org={{ $labels.org_id }}."
      impact: "Potential SDK deployment issue or traffic drop"