Skip to content

Commit 8652b72

Browse files
authored
Merge pull request #360 from ooni/fix-ecs-alerts
Improve ECS-related alerts
2 parents 5dcbea5 + 3dedb7b commit 8652b72

2 files changed

Lines changed: 17 additions & 15 deletions

File tree

ansible/roles/prometheus/files/alert_rules.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@ groups:
99

1010
# including http scraping failure
1111
- alert: InstanceDown
12-
expr: up != 1
12+
expr: up{job!="ecs-tasks"} != 1
1313
for: 5m
1414
annotations:
15-
summary: '{{ $labels.instance }} {{if $labels.ec2_host}} ({{$labels.ec2_host}}) {{end}} is not `up`'
15+
summary: '{{if $labels.ec2_name}}{{$labels.ec2_name}}{{else}}{{$labels.instance}}{{end}} {{if $labels.ec2_host}} ({{$labels.ec2_host}}) {{end}}{{if $labels.env}} [{{$labels.env}}] {{else if $labels.environment}} [{{$labels.environment}}] {{end}} is not `up`'
1616

1717
- alert: systemd # yes, just "systemd", it's unclear what's going wrong :-)
1818
expr: node_systemd_system_running != 1 # that's basically the output of `systemctl is-system-running`

ansible/roles/prometheus/templates/prometheus.yml

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -214,16 +214,16 @@ scrape_configs:
214214
# - targets:
215215
# - backend-hel.ooni.org:444
216216

217-
# EC2 instances monitoring:
217+
# EC2 instances monitoring:
218218
- job_name: 'ooni-aws-ec2-dev'
219219
scrape_interval: 5s
220-
scheme: https
221-
metrics_path: "/metrics"
220+
scheme: https
221+
metrics_path: "/metrics"
222222
ec2_sd_configs:
223-
- access_key: "{{prometheus_aws_access_key_dev}}"
223+
- access_key: "{{prometheus_aws_access_key_dev}}"
224224
secret_key: "{{prometheus_aws_secret_key_dev}}"
225225
region: "eu-central-1"
226-
port: 9100
226+
port: 9100
227227
filters: &instance_filters
228228
- name: instance-state-name
229229
values: ['running']
@@ -232,6 +232,8 @@ scrape_configs:
232232
relabel_configs: &ec2_relabeling # Change the host to the proxy host with relabeling
233233
- source_labels: [__meta_ec2_tag_Environment] # take environment from tags
234234
target_label: env
235+
- source_labels: [__meta_ec2_tag_Name] # EC2 instance Name tag
236+
target_label: ec2_name
235237
- source_labels: [__address__]
236238
regex: "([0-9\\.]+):([0-9]+)" # <ip>:<port>
237239
replacement: "$1"
@@ -267,14 +269,14 @@ scrape_configs:
267269

268270
- job_name: 'ooni-aws-ec2-prod'
269271
scrape_interval: 5s
270-
scheme: https
271-
metrics_path: "/metrics"
272+
scheme: https
273+
metrics_path: "/metrics"
272274
ec2_sd_configs:
273-
- access_key: "{{prometheus_aws_access_key_prod}}"
275+
- access_key: "{{prometheus_aws_access_key_prod}}"
274276
secret_key: "{{prometheus_aws_secret_key_prod}}"
275277
region: "eu-central-1"
276-
port: 9100
277-
filters:
278+
port: 9100
279+
filters:
278280
*instance_filters
279281
relabel_configs: # Change the host to the proxy host with relabeling
280282
*ec2_relabeling
@@ -287,7 +289,7 @@ scrape_configs:
287289
username: 'prom'
288290
password: '{{ prometheus_metrics_password }}'
289291
file_sd_configs:
290-
- files:
292+
- files:
291293
- '/var/lib/prometheus/file_discovery/*.json'
292294
relabel_configs: # Change the host to the proxy host with relabeling
293295
# Store ip in ecs_host
@@ -326,7 +328,7 @@ scrape_configs:
326328

327329
- job_name: "fastpath"
328330
static_configs:
329-
- targets:
331+
- targets:
330332
- fastpath.dev.ooni.io:9102
331333
- fastpath.prod.ooni.io:9102
332334
scrape_interval: 5s
@@ -374,7 +376,7 @@ scrape_configs:
374376
static_configs:
375377
- targets:
376378
- testlists.dev.ooni.io:9102
377-
- testlists.prod.ooni.io:9102
379+
# - testlists.prod.ooni.io:9102
378380
scrape_interval: 5s
379381
scheme: https
380382
relabel_configs: # Change the host to the proxy host with relabeling

0 commit comments

Comments
 (0)