Skip to content

Commit df75cae

Browse files
m0nikasinghdknecht
authored andcommitted
2024-03: Minimizing on-call burnout through alerts observability
1 parent dba3407 commit df75cae

File tree

16 files changed

+2339
-0
lines changed

16 files changed

+2339
-0
lines changed

.gitguardian.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
# ggshield (GitGuardian) configuration, schema version 2.
version: 2
secret:
  # Paths excluded from secret scanning: the demo directory ships
  # placeholder credentials that would otherwise be reported.
  ignored-paths:
    - '2024-03-alerts-observability/*'
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# DIY alerts observability
2+
3+
This demo uses `vector.dev` to collect data from different sources and write it to the datastore.
4+
We use one Vector `http_server` source to receive Alertmanager webhook notifications,
5+
two `http_client` sources to query Alertmanager's alerts and silences API endpoints, and
6+
two `sinks` to write all the state logs into the ClickHouse `alerts` and `silences` tables.
7+
8+
The docker-compose will bring up several containers:
9+
10+
* `Cadvisor` is used to generate system metrics for monitoring.
11+
* `Prometheus` is used to monitor and generate alerts.
12+
* `Alertmanager` routes alerts and exposes the alert events via webhook and API.
13+
* `alertmanager_silence` creates an Alertmanager silence.
14+
* `blackbox_exporter` is for monitoring the sites and generating alerts.
15+
* `ClickHouse` is used to write the Alertmanager alert events into the datastore for alerts observability.
16+
* `Vector.dev` collects data from the Alertmanager webhook and the alerts and silences APIs, transforms it, and writes it into ClickHouse.
17+
* `Grafana` is used to visualize the logs.
18+
19+
## Prerequisites
20+
`docker`
21+
22+
## Getting started:
23+
24+
* Set up a password for ClickHouse and bring up the containers using Docker Compose:
25+
```console
26+
foo@bar:~$ export CLICKHOUSE_PASSWORD="<PASSWORD here>"
27+
foo@bar:~$ docker compose up
28+
```
29+
30+
Please wait for about 5 minutes for the alerts to be triggered and
31+
visit http://localhost:3000/ to explore the `Alerts and silences overview` dashboard and play around.
32+
33+
![alerts and silences overview](images/alerts-silences-overview.png "alerts and silences overview")
27.3 MB
Binary file not shown.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
# Alertmanager configuration for the demo: every alert is routed to a
# single webhook receiver, which forwards notifications to Vector.
route:
  receiver: 'webhook'

receivers:
  - name: 'webhook'
    webhook_configs:
      # Vector's HTTP listener; host and port match the `vector` service
      # published on 8888 in docker-compose.
      - url: 'http://vector:8888'

templates:
  - 'template/*.tmpl'
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/bin/sh
# Creates a one-hour Alertmanager silence for the `service_down` alert.
#
# Runs as the entrypoint of the one-shot `alertmanager_silence` container.
# The request body is written to ./post-data and POSTed with wget.
set -eu

# Wait before talking to Alertmanager.
# NOTE(review): presumably this gives Alertmanager time to start and the
# alert time to fire - confirm the intended reason for 90 seconds.
sleep 90

# Timestamps are computed after the wait so that startsAt reflects the
# moment the silence is actually submitted.
# `date -Ins` prints e.g. 2024-03-01T12:00:00,123456789+00:00; the sed calls
# turn the UTC offset into `Z` and the decimal comma into a dot.
starts_at="$(date -Ins | sed s/+00:00/Z/ | sed s/,/./)"
# Rendering the clock in a UTC+1 zone and then relabelling the offset as `Z`
# yields a timestamp exactly one hour after "now".
ends_at="$(TZ='UTC-1:00' date -Ins | sed s/+01:00/Z/ | sed s/,/./)"

cat > post-data <<EOF
{
  "matchers": [
    {
      "name": "alertname",
      "value": "service_down",
      "isRegex": false
    }
  ],
  "startsAt": "${starts_at}",
  "endsAt": "${ends_at}",
  "createdBy": "demouser",
  "comment": "Silence"
}
EOF

# API v2 is used because v1 is deprecated; the payload fields are the same.
wget 'http://alertmanager:9093/api/v2/silences' --header='Content-Type: application/json' --post-file=post-data
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
---
# Blackbox exporter probe modules used by the demo.
modules:
  # Plain HTTP GET probe with a 5 second timeout.
  http_2xx:
    prober: http
    timeout: 5s
    http:
      valid_http_versions: ["HTTP/1.1", "HTTP/2"]
      # An empty list leaves the exporter's default accepted status codes.
      valid_status_codes: []
      method: GET
      no_follow_redirects: false
      # Neither the presence nor the absence of TLS fails the probe.
      fail_if_ssl: false
      fail_if_not_ssl: false
      tls_config:
        insecure_skip_verify: false
      preferred_ip_protocol: "ip4"
15+
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
#!/bin/sh
# First-run initialisation for the ClickHouse container.
#
# Mounted into /docker-entrypoint-initdb.d/ by docker-compose. It:
#   1. writes a users.d entry for ${CLICKHOUSE_USER},
#   2. creates the ${CLICKHOUSE_DB} database,
#   3. creates the `alerts` and `silences` tables inside that database.
#
# NOTE(review): the image entrypoint may source this file instead of executing
# it, so no `set -e` / `exit` is used here - confirm before adding either.

CLICKHOUSE_DB="${CLICKHOUSE_DB:-database}"
CLICKHOUSE_USER="${CLICKHOUSE_USER:-user}"
CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-password}"

# Overwrite rather than append: appending a second <yandex> root element to an
# existing file would leave it as invalid XML.
cat <<EOT > /etc/clickhouse-server/users.d/user.xml
<yandex>
  <!-- Docs: <https://clickhouse.tech/docs/en/operations/settings/settings_users/> -->
  <users>
    <${CLICKHOUSE_USER}>
      <profile>default</profile>
      <networks>
        <ip>::/0</ip>
      </networks>
      <password>${CLICKHOUSE_PASSWORD}</password>
      <quota>default</quota>
    </${CLICKHOUSE_USER}>
  </users>
</yandex>
EOT
#cat /etc/clickhouse-server/users.d/user.xml;

clickhouse-client --query "CREATE DATABASE IF NOT EXISTS ${CLICKHOUSE_DB}"

# NOTE(review): SET only affects the client session that runs it, so these
# three statements do not change settings for later connections. If inserts
# rely on them, move the settings into a user profile or the writer's
# configuration.
clickhouse-client --query "SET input_format_import_nested_json = 1"
clickhouse-client --query "SET output_format_json_array_of_rows = 1"
clickhouse-client --query "SET date_time_input_format = 'best_effort'"

# The tables are created in ${CLICKHOUSE_DB} (selected with --database) rather
# than in a hard-coded database name, so the script agrees with the CREATE
# DATABASE statement above. The heredoc delimiter is quoted so the shell
# leaves the SQL backticks untouched.

# One row per alert state snapshot.
clickhouse-client --database "${CLICKHOUSE_DB}" <<'SQL'
CREATE TABLE IF NOT EXISTS alerts
(
    `date` Date DEFAULT toDate(now()),
    `datetime` DateTime DEFAULT now(),
    `timestamp` DateTime64(3) DEFAULT now() CODEC(Delta(4), ZSTD(1)),
    `startsAt` DateTime64(3),
    `endsAt` DateTime64(3),
    `updatedAt` DateTime64(3),
    `status.inhibitedBy` Array(String),
    `status.silencedBy` String,
    `status.state` LowCardinality(String),
    `annotations.summary` String,
    `annotations.dashboard` String,
    `annotations.link` String,
    `fingerprint` String,
    `receivers` Array(String),
    `labelsmap` Map(String, String),
    `labels.alertname` String,
    `labels.component` String,
    `labels.service` String,
    `labels.instance` String,
    `labels.job` String,
    `labels.metal` String,
    `labels.notify` String,
    `labels.priority` String,
    `labels.prometheus` String,
    `labels.region` String,
    `labels.severity` String
)
ENGINE = MergeTree
PARTITION BY toStartOfHour(datetime)
ORDER BY labels.alertname
SETTINGS index_granularity = 8192;
SQL

# One row per silence; ReplacingMergeTree collapses rows that share the
# same (id, startsAt, endsAt) sorting key during merges.
clickhouse-client --database "${CLICKHOUSE_DB}" <<'SQL'
CREATE TABLE IF NOT EXISTS silences
(
    `date` Date DEFAULT toDate(now()),
    `datetime` DateTime DEFAULT now(),
    `id` String,
    `status.state` LowCardinality(String),
    `updatedAt` DateTime64(3),
    `startsAt` DateTime64(3),
    `createdBy` LowCardinality(String),
    `endsAt` DateTime64(3),
    `matchers` Map(String, String),
    `comment` String
)
ENGINE = ReplacingMergeTree
PARTITION BY toStartOfHour(datetime)
ORDER BY (id, startsAt, endsAt)
SETTINGS index_granularity = 8192;
SQL
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
---
# Demo stack for alerts observability:
# cAdvisor + blackbox exporter -> Prometheus -> Alertmanager -> Vector
# -> ClickHouse -> Grafana.
#
# CLICKHOUSE_PASSWORD must be exported in the calling shell; Compose
# interpolates it into the clickhouse, vector and grafana services below.
version: '3.1'

volumes:
  prometheus_data: {}
  grafana_data: {}

services:

  cadvisor:
    image: gcr.io/cadvisor/cadvisor
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - "8080:8080"
    restart: always
    deploy:
      mode: global

  prometheus:
    image: prom/prometheus:v2.49.1
    volumes:
      - ./prometheus/:/etc/prometheus/
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"
    links:
      - cadvisor:cadvisor
      - alertmanager:alertmanager
    depends_on:
      - cadvisor
    restart: always
    deploy:
      mode: global

  alertmanager:
    image: prom/alertmanager:v0.26.0
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/:/etc/alertmanager/
      - ./alertmanager/amtool:/bin/amtool
    restart: always
    command:
      - '--config.file=/etc/alertmanager/config.yml'
    deploy:
      mode: global

  # One-shot helper: runs silence_entrypoint.sh once and is not restarted.
  alertmanager_silence:
    image: prom/alertmanager:v0.26.0
    volumes:
      - ./alertmanager/:/etc/alertmanager/
      - ./alertmanager/amtool:/bin/amtool
      - ./alertmanager/silence_entrypoint.sh:/silence_entrypoint.sh
    depends_on:
      alertmanager:
        condition: service_started
    restart: 'no'
    entrypoint: '/silence_entrypoint.sh'

  blackbox_exporter:
    image: prom/blackbox-exporter:v0.24.0
    ports:
      - "9115:9115"
    volumes:
      - ./blackboxexporter/:/etc/blackboxexporter/
    command:
      - '--config.file=/etc/blackboxexporter/config.yml'
    restart: always

  clickhouse:
    image: clickhouse/clickhouse-server:23.3.19.32
    ports:
      - "8123:8123"
      - "9008:9008"
      - "9009:9009"
    environment:
      # Default user and database will be created using `init-defaults.sh` script
      CLICKHOUSE_DB: r0
      CLICKHOUSE_USER: demouser
      CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD}
    hostname: clickhouse-0.localhost
    ulimits:
      nproc: 65535
      nofile:
        soft: 262144
        hard: 262144
    volumes:
      - ./clickhouse/init-defaults.sh:/docker-entrypoint-initdb.d/init-defaults.sh:ro
    # Vector waits on this check (service_healthy) before it starts.
    healthcheck:
      test:
        - "CMD-SHELL"
        - "http_proxy='' wget -nv -t1 --spider 'http://localhost:8123/' || exit 1"
      interval: 10s
      timeout: 10s
      retries: 30

  vector:
    image: timberio/vector:0.27.0-debian
    container_name: vector
    ports:
      - "8888:8888"
    environment:
      CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD}
    volumes:
      - ./vector/vector.toml:/etc/vector/vector.toml:ro
    depends_on:
      clickhouse:
        condition: service_healthy

  grafana:
    build: ./grafana/
    environment:
      - GF_PATHS_PROVISIONING=/etc/grafana/provisioning
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_PLUGINS_ALLOW_LOADING_UNSIGNED_PLUGINS=grafana-clickhouse-datasource
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin
    # Writes the datasource and dashboard provisioning files, then hands
    # over to the image's /run.sh.
    # ${CLICKHOUSE_PASSWORD} is substituted by Compose before the shell
    # runs, so the heredoc delimiters are quoted: the shell then copies the
    # text verbatim and never re-expands `$` or backticks in the password.
    entrypoint:
      - sh
      - -euc
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<'EOF' > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
        - name: 'ClickHouse'
          type: 'grafana-clickhouse-datasource'
          isDefault: true
          jsonData:
            defaultDatabase: r0
            port: 9000
            server: clickhouse-0.localhost
            username: demouser
            tlsSkipVerify: false
          secureJsonData:
            password: ${CLICKHOUSE_PASSWORD}
          editable: true
        EOF
        mkdir -p /etc/grafana/provisioning/dashboards
        cat <<'EOF' > /etc/grafana/provisioning/dashboards/dashboard.yaml
        apiVersion: 1
        providers:
        - name: demo
          type: file
          updateIntervalSeconds: 30
          options:
            path: /var/lib/grafana/dashboards
            foldersFromFilesStructure: true
        EOF
        /run.sh
    ports:
      - "3000:3000"
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Grafana image for the demo, with the ClickHouse datasource plugin
# installed and the dashboard JSON files copied in.
FROM grafana/grafana:10.2.3-ubuntu

WORKDIR /tmp
USER root

# Install the datasource plugin and create the directory that will hold
# the dashboard JSON files.
RUN grafana cli plugins install grafana-clickhouse-datasource \
    && mkdir -p /var/lib/grafana/dashboards

COPY dashboards/*.json /var/lib/grafana/dashboards/

0 commit comments

Comments
 (0)