-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdatacenter.yml
More file actions
181 lines (170 loc) · 7.04 KB
/
datacenter.yml
File metadata and controls
181 lines (170 loc) · 7.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# Prometheus rule group "datacenter": recording rules and alert rules built
# on fastly_rt_* metrics, aggregated per datacenter label.
groups:
- name: datacenter
rules:
##
## Basic Transformations
## - counter -> rate
## - histogram -> latency
## - etc
##
# Per-datacenter request rate: rate() of fastly_rt_requests_total over a
# 120s window, summed over every label except datacenter.
- record: fastly_datacenter:requests
  expr: |
    sum by (datacenter) (
      rate(fastly_rt_requests_total[120s])
    )
# Share of responses in the 4xx status group, per datacenter: the 4xx rate
# divided by the rate across all status groups (both over 120s).
- record: fastly_datacenter:4xx_ratio
  expr: |
    sum by (datacenter) (
      rate(fastly_rt_status_group_total{status_group="4xx"}[120s])
    )
    /
    sum by (datacenter) (
      rate(fastly_rt_status_group_total[120s])
    )
# Share of responses in the 5xx status group, per datacenter: the 5xx rate
# divided by the rate across all status groups (both over 120s).
- record: fastly_datacenter:5xx_ratio
  expr: |
    sum by (datacenter) (
      rate(fastly_rt_status_group_total{status_group="5xx"}[120s])
    )
    /
    sum by (datacenter) (
      rate(fastly_rt_status_group_total[120s])
    )
# Errors per request, per datacenter: the 120s rate of fastly_rt_errors_total
# divided by the recorded fastly_datacenter:requests series.
- record: fastly_datacenter:errors_ratio
  expr: |
    sum by (datacenter) (
      rate(fastly_rt_errors_total[120s])
    )
    / fastly_datacenter:requests
# Median (0.5 quantile) of the fastly_rt_miss_duration_seconds histogram per
# datacenter, estimated from 120s bucket rates.
- record: fastly_datacenter:latency_p50_seconds
  expr: |
    histogram_quantile(
      0.5,
      sum by (le, datacenter) (
        rate(fastly_rt_miss_duration_seconds_bucket[120s])
      )
    )
# 0.99 quantile of the fastly_rt_miss_duration_seconds histogram per
# datacenter, estimated from 120s bucket rates.
- record: fastly_datacenter:latency_p99_seconds
  expr: |
    histogram_quantile(
      0.99,
      sum by (le, datacenter) (
        rate(fastly_rt_miss_duration_seconds_bucket[120s])
      )
    )
##
## Thresholds
## - Hysteresis, for more details see
## https://promcon.io/2019-munich/talks/improved-alerting-with-prometheus-and-alertmanager/
##
# Warn threshold for fastly_datacenter:4xx_ratio, with hysteresis.
# A datacenter that has a fastly_datacenter:alerts_recently series for
# fastlyDatacenter4xxPercent gets 0.0375; any other datacenter present in
# fastly_datacenter:requests gets 0.05. Multiplying by 0 keeps the datacenter
# label while zeroing the value, so only the constant remains.
- record: fastly_datacenter:4xx_warn_ratio
  expr: |
    (
      0.0375 + 0 * count by (datacenter) (
        fastly_datacenter:alerts_recently{alertname="fastlyDatacenter4xxPercent"}
      )
    )
    or
    (
      0.05 + 0 * fastly_datacenter:requests
    )
# Warn threshold for fastly_datacenter:5xx_ratio, with hysteresis.
# A datacenter that has a fastly_datacenter:alerts_recently series for
# fastlyDatacenter5xxPercent gets 0.0375; any other datacenter present in
# fastly_datacenter:requests gets 0.05. Multiplying by 0 keeps the datacenter
# label while zeroing the value, so only the constant remains.
- record: fastly_datacenter:5xx_warn_ratio
  expr: |
    (
      0.0375 + 0 * count by (datacenter) (
        fastly_datacenter:alerts_recently{alertname="fastlyDatacenter5xxPercent"}
      )
    )
    or
    (
      0.05 + 0 * fastly_datacenter:requests
    )
# Warn threshold for fastly_datacenter:errors_ratio, with hysteresis.
# A datacenter that has a fastly_datacenter:alerts_recently series for
# fastlyDatacenterErrorPercent gets 0.0375; any other datacenter present in
# fastly_datacenter:requests gets 0.05. Multiplying by 0 keeps the datacenter
# label while zeroing the value, so only the constant remains.
- record: fastly_datacenter:errors_warn_ratio
  expr: |
    (
      0.0375 + 0 * count by (datacenter) (
        fastly_datacenter:alerts_recently{alertname="fastlyDatacenterErrorPercent"}
      )
    )
    or
    (
      0.05 + 0 * fastly_datacenter:requests
    )
# Warn threshold (seconds) for fastly_datacenter:latency_p50_seconds, with
# hysteresis: 0.620 for datacenters with a recent fastlyDatacenterP50Latency
# alert, 0.826 for every other datacenter present in fastly_datacenter:requests.
# 826ms would put you in the worst 20% of sites,
# per https://www.littledata.io/average/server-response-time
- record: fastly_datacenter:latency_p50_warn_seconds
  expr: |
    (
      0.620 + 0 * count by (datacenter) (
        fastly_datacenter:alerts_recently{alertname="fastlyDatacenterP50Latency"}
      )
    )
    or
    (
      0.826 + 0 * fastly_datacenter:requests
    )
# Warn threshold (seconds) for fastly_datacenter:latency_p99_seconds, with
# hysteresis: 1.859 for datacenters with a recent fastlyDatacenterP99Latency
# alert, 2.478 for every other datacenter present in fastly_datacenter:requests.
# 2.478 is three times the 0.826 used for P50, following the rule of thumb
# "Our P50 latency, for example, tends to be less than one third of our P99
# latency", per https://medium.com/@djsmith42/how-to-metric-edafaf959fc7
- record: fastly_datacenter:latency_p99_warn_seconds
  expr: |
    (
      1.859 + 0 * count by (datacenter) (
        fastly_datacenter:alerts_recently{alertname="fastlyDatacenterP99Latency"}
      )
    )
    or
    (
      2.478 + 0 * fastly_datacenter:requests
    )
##
## Alert Status
##
# Number of firing ALERTS series per (alertname, datacenter); only series
# with a non-empty datacenter label are counted.
- record: fastly_datacenter:alerts
  expr: |
    count by (alertname, datacenter) (
      ALERTS{alertstate="firing", datacenter=~".+"}
    )
# Sum of firing ALERTS samples over the last hour per (alertname, datacenter).
# A series exists only where a matching alert had firing samples in that hour,
# which is what the threshold and alert rules key on.
- record: fastly_datacenter:alerts_recently
  expr: |
    sum by (alertname, datacenter) (
      sum_over_time(
        ALERTS{alertstate="firing", datacenter=~".+"}[1h]
      )
    )
##
## Criteria
##
# Notice: some minimum level of traffic over time. A datacenter qualifies when
# the lowest fastly_datacenter:requests sample in the last 5m is above 1 and
# that window holds more than 2 samples. The recorded value is the 5m minimum.
- record: fastly_datacenter:criteria_notice
  expr: |
    (
      min_over_time(fastly_datacenter:requests[5m]) > 1
    )
    and
    (
      count_over_time(fastly_datacenter:requests[5m]) > 2
    )
# Number of fastly_datacenter:criteria_notice samples in the last hour, i.e.
# how often each datacenter met the notice criteria recently.
- record: fastly_datacenter:criteria_notice_recently
  expr: |
    count_over_time(
      fastly_datacenter:criteria_notice[1h]
    )
# Warning: datacenters that meet the notice criteria and whose
# fastly_datacenter:criteria_notice value exceeds 1% of
# fastly_datacenter:requests summed across all datacenters.
#
# NOTE: a commented-out alternative that selected the "top 20% of notice"
# is kept for reference only; it is NOT what this rule evaluates:
#   topk(
#     scalar(
#       count(fastly_datacenter:criteria_notice) * .2
#     ), sum(fastly_datacenter:criteria_notice) by (datacenter)
#   ) or count(fastly_datacenter:alerts_recently) by (datacenter)
- record: fastly_datacenter:criteria_warning
  expr: |
    fastly_datacenter:criteria_notice > scalar(
      sum(fastly_datacenter:requests) * 0.01
    )
# Number of fastly_datacenter:criteria_warning samples in the last hour, i.e.
# how often each datacenter met the warning criteria recently.
- record: fastly_datacenter:criteria_warning_recently
  expr: |
    count_over_time(
      fastly_datacenter:criteria_warning[1h]
    )
##
## Alert Rules
## - Hold down timers / latch, for more details see
## https://utcc.utoronto.ca/~cks/space/blog/sysadmin/PrometheusAlertmanagerFlapping
## - Example: max_over_time(min_over_time(fastly_datacenter:4xx_ratio[3m])[10m:15s])
## - Once the event persists for 3m
## - Hold down / latch for 10m
## - The alert rules in this group use a 10m persistence window and a 30m hold down
##
# Fires for a datacenter when, at some point in the last 30m (sampled every
# 15s), the 10m minimum of fastly_datacenter:4xx_ratio exceeded the current
# fastly_datacenter:4xx_warn_ratio. It is limited to datacenters that have a
# fastly_datacenter:criteria_warning_recently series or a recent
# fastlyDatacenter4xxPercent alert.
- alert: fastlyDatacenter4xxPercent
  expr: |
    (
      max_over_time(
        min_over_time(fastly_datacenter:4xx_ratio[10m])[30m:15s]
      )
      > fastly_datacenter:4xx_warn_ratio
    )
    and
    (
      fastly_datacenter:criteria_warning_recently
      or
      count by (datacenter) (
        fastly_datacenter:alerts_recently{alertname="fastlyDatacenter4xxPercent"}
      )
    )
  labels:
    job: fastlyDatacenter
    severity: warning
  annotations:
    description: "High ({{- $value | humanizePercentage }})"
# Fires for a datacenter when, at some point in the last 30m (sampled every
# 15s), the 10m minimum of fastly_datacenter:5xx_ratio exceeded the current
# fastly_datacenter:5xx_warn_ratio. It is limited to datacenters that have a
# fastly_datacenter:criteria_warning_recently series or a recent
# fastlyDatacenter5xxPercent alert.
- alert: fastlyDatacenter5xxPercent
  expr: |
    (
      max_over_time(
        min_over_time(fastly_datacenter:5xx_ratio[10m])[30m:15s]
      )
      > fastly_datacenter:5xx_warn_ratio
    )
    and
    (
      fastly_datacenter:criteria_warning_recently
      or
      count by (datacenter) (
        fastly_datacenter:alerts_recently{alertname="fastlyDatacenter5xxPercent"}
      )
    )
  labels:
    job: fastlyDatacenter
    severity: warning
  annotations:
    description: "High ({{- $value | humanizePercentage }})"
# Fires for a datacenter when, at some point in the last 30m (sampled every
# 15s), the 10m minimum of fastly_datacenter:errors_ratio exceeded the current
# fastly_datacenter:errors_warn_ratio. It is limited to datacenters that have
# a fastly_datacenter:criteria_warning_recently series or a recent
# fastlyDatacenterErrorPercent alert.
- alert: fastlyDatacenterErrorPercent
  expr: |
    (
      max_over_time(
        min_over_time(fastly_datacenter:errors_ratio[10m])[30m:15s]
      )
      > fastly_datacenter:errors_warn_ratio
    )
    and
    (
      fastly_datacenter:criteria_warning_recently
      or
      count by (datacenter) (
        fastly_datacenter:alerts_recently{alertname="fastlyDatacenterErrorPercent"}
      )
    )
  labels:
    job: fastlyDatacenter
    severity: warning
  annotations:
    description: "High ({{- $value | humanizePercentage }})"
# Fires for a datacenter when, at some point in the last 30m (sampled every
# 15s), the 10m minimum of fastly_datacenter:latency_p50_seconds exceeded the
# current fastly_datacenter:latency_p50_warn_seconds. It is limited to
# datacenters that have a fastly_datacenter:criteria_warning_recently series
# or a recent fastlyDatacenterP50Latency alert.
- alert: fastlyDatacenterP50Latency
  expr: |
    (
      max_over_time(
        min_over_time(fastly_datacenter:latency_p50_seconds[10m])[30m:15s]
      )
      > fastly_datacenter:latency_p50_warn_seconds
    )
    and
    (
      fastly_datacenter:criteria_warning_recently
      or
      count by (datacenter) (
        fastly_datacenter:alerts_recently{alertname="fastlyDatacenterP50Latency"}
      )
    )
  labels:
    job: fastlyDatacenter
    severity: warning
  annotations:
    description: "High ({{- $value | humanizeDuration }})"
# Fires for a datacenter when, at some point in the last 30m (sampled every
# 15s), the 10m minimum of fastly_datacenter:latency_p99_seconds exceeded the
# current fastly_datacenter:latency_p99_warn_seconds. It is limited to
# datacenters that have a fastly_datacenter:criteria_warning_recently series
# or a recent fastlyDatacenterP99Latency alert.
- alert: fastlyDatacenterP99Latency
  expr: |
    (
      max_over_time(
        min_over_time(fastly_datacenter:latency_p99_seconds[10m])[30m:15s]
      )
      > fastly_datacenter:latency_p99_warn_seconds
    )
    and
    (
      fastly_datacenter:criteria_warning_recently
      or
      count by (datacenter) (
        fastly_datacenter:alerts_recently{alertname="fastlyDatacenterP99Latency"}
      )
    )
  labels:
    job: fastlyDatacenter
    severity: warning
  annotations:
    description: "High ({{- $value | humanizeDuration }})"