Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
**/.DS_Store
*.env
!wakusim.env
# Rendered from alertmanager-config.yml.template by the env_replacer service;
# it contains the expanded Discord webhook, so keep it out of version control.
alertmanager-config.yml
28 changes: 27 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,15 @@ services:
image: prom/prometheus:latest
volumes:
- ./monitoring/prometheus-config.yml:/etc/prometheus/prometheus.yml:z
- ./monitoring/alert-manager/alert-rules.yml:/etc/prometheus/alert-rules.yml:z
command:
- --config.file=/etc/prometheus/prometheus.yml
- --storage.tsdb.retention.time=7d
ports:
- 127.0.0.1:9090:9090
restart: on-failure
depends_on:
- alertmanager
networks:
- simulation

Expand Down Expand Up @@ -213,4 +216,27 @@ services:
- redis
- foundry
networks:
- simulation
- simulation

# One-shot helper: installs gettext, then uses envsubst to expand the
# DISCORD_WEBHOOK placeholder in the Alertmanager template and writes the
# rendered config next to it on the host. The container exits when done.
env_replacer:
  image: alpine:3.19.1
  environment:
    - DISCORD_WEBHOOK=$DISCORD_WEBHOOK
  volumes:
    # Mount the whole directory instead of the individual output file: the
    # rendered alertmanager-config.yml is gitignored, and bind-mounting a host
    # file that does not exist yet makes Docker create a directory in its
    # place, which the shell redirect below could not write to.
    - ./monitoring/alert-manager:/etc/alertmanager:z
  command:
    - /bin/sh
    - "-c"
    - >-
      apk add --no-cache gettext &&
      envsubst < /etc/alertmanager/alertmanager-config.yml.template
      > /etc/alertmanager/alertmanager-config.yml

# Alertmanager instance that receives alerts from Prometheus and forwards
# them to the receiver defined in the rendered config.
alertmanager:
  image: prom/alertmanager:latest
  volumes:
    - ./monitoring/alert-manager/alertmanager-config.yml:/etc/alertmanager/alertmanager.yml:z
  command:
    - --config.file=/etc/alertmanager/alertmanager.yml
  ports:
    - 127.0.0.1:9093:9093
  restart: on-failure
  networks:
    - simulation
  depends_on:
    # Wait until the config has actually been rendered, not merely until the
    # helper container has started; otherwise Alertmanager can come up before
    # its config file exists. Requires a Compose version that supports the
    # long-form depends_on conditions.
    env_replacer:
      condition: service_completed_successfully
23 changes: 23 additions & 0 deletions monitoring/alert-manager/alert-rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Alerting and recording rules for the nim-waku nodes.
groups:
  - name: waku
    rules:
      # Fires when a node's Nim GC memory stays above 1 GB for five minutes.
      # The metric name indicates a value in bytes, so the threshold is
      # expressed in bytes (1e9 = 1 GB) to match the description below; the
      # previous threshold of 1 would trigger for practically every node.
      - alert: HighNimWakuMemUsage
        expr: >
          nim_gc_mem_bytes{} > 1e9
        for: 5m
        annotations:
          summary: "Too high memory usage for {{ $labels.instance }}"
          description: "Host {{ $labels.instance }} running nim-waku has GC memory usage higher than 1GB"
          current_value: "{{ $value }}"

      # Recorded copy of libp2p_peers; it is defined before the alert that
      # uses it so both the instant value and the 12h average read one series.
      - record: job:waku_libp2p_peers
        expr: libp2p_peers{}

      # Fires when the current peer count has been below half of its own
      # 12-hour average for fifteen minutes.
      - alert: NimWakuPeersDecrease
        expr: >
          (job:waku_libp2p_peers / avg_over_time(job:waku_libp2p_peers[12h])) < 0.50
        for: 15m
        annotations:
          summary: "Drop of libp2p_peers on {{ $labels.instance }}"
          description: "Host {{ $labels.instance }} running nim-waku has more than 50% drop of peers compared to 12h average"
          current_value: "{{ $value }}"
11 changes: 11 additions & 0 deletions monitoring/alert-manager/alertmanager-config.yml.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Alertmanager configuration template. The DISCORD_WEBHOOK placeholder below
# is expanded by envsubst before Alertmanager reads the rendered file.
route:
  # Alerts sharing an alert name are batched into one notification.
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 2m
  receiver: 'discord'

receivers:
  - name: 'discord'
    discord_configs:
      # Quoted so the rendered value is always a YAML string, even when the
      # variable expands to an empty value or to text with YAML-special
      # characters.
      - webhook_url: '${DISCORD_WEBHOOK}'
9 changes: 9 additions & 0 deletions monitoring/prometheus-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@ global:
external_labels:
monitor: "Monitoring"

# Deliver fired alerts to the Alertmanager reachable at alertmanager:9093
# over plain HTTP.
alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets:
            - 'alertmanager:9093'

# Rule files (alerting and recording rules) that Prometheus loads.
rule_files:
  - './alert-rules.yml'

scrape_configs:
- job_name: cadvisor
scrape_interval: 5s
Expand Down