Skip to content

Commit 9b55e81

Browse files
author
Jorgen-5
committed
Added alert files
1 parent d8b5faa commit 9b55e81

File tree

4 files changed

+190
-0
lines changed

4 files changed

+190
-0
lines changed

.github/workflows/alert-deploy.yml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
# Deploys the alert manifests matching .nais/test/*-alerts.yaml and
# .nais/prod/*-alerts.yaml to the "test" and "prod" clusters.
name: Deploy alerts
run-name: Deploy alerts for klass to test and prod

on:
  # Deploy only from main. An unfiltered pull_request trigger would run both
  # jobs, including the prod deploy, for every pull request in the repository.
  push:
    branches:
      - main
    paths:
      - '.nais/test/*-alerts.yaml'
      - '.nais/prod/*-alerts.yaml'
      - '.github/workflows/alert-deploy.yml'

permissions:
  # Listing any permission sets every unlisted one to "none", so the read
  # access that actions/checkout needs has to be granted explicitly.
  contents: read
  # Allows the job to request an OIDC token.
  # NOTE(review): presumably what the deploy action authenticates with - confirm.
  id-token: write

jobs:
  test-deploy:
    name: Deploy alerts to test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Collect test alert files
        id: alerts
        run: |
          # Expand the glob here and hand the deploy step a comma-separated list.
          # nullglob makes an unmatched pattern expand to nothing instead of
          # being passed on as the literal string.
          shopt -s nullglob
          files=(.nais/test/*-alerts.yaml)
          if [ "${#files[@]}" -eq 0 ]; then
            echo "No alert files found in .nais/test" >&2
            exit 1
          fi
          resources=$(IFS=,; echo "${files[*]}")
          echo "Deploying $resources to test"
          echo "resources=$resources" >> "$GITHUB_OUTPUT"
      - name: Deploy test alerts
        # The runner image does not ship a `nais` binary, so use the deploy
        # action rather than calling a CLI that was never installed.
        uses: nais/deploy/actions/deploy@v2
        env:
          CLUSTER: test
          RESOURCE: ${{ steps.alerts.outputs.resources }}
          DEPLOY_SERVER: "deploy.ssb.cloud.nais.io:443"

  prod-deploy:
    name: Deploy alerts to prod
    runs-on: ubuntu-latest
    # Roll out to prod only after the test deploy has succeeded.
    needs: test-deploy
    steps:
      - uses: actions/checkout@v4
      - name: Collect prod alert files
        id: alerts
        run: |
          # Same glob expansion as the test job, for the prod directory.
          shopt -s nullglob
          files=(.nais/prod/*-alerts.yaml)
          if [ "${#files[@]}" -eq 0 ]; then
            echo "No alert files found in .nais/prod" >&2
            exit 1
          fi
          resources=$(IFS=,; echo "${files[*]}")
          echo "Deploying $resources to prod"
          echo "resources=$resources" >> "$GITHUB_OUTPUT"
      - name: Deploy prod alerts
        uses: nais/deploy/actions/deploy@v2
        env:
          CLUSTER: prod
          RESOURCE: ${{ steps.alerts.outputs.resources }}
          DEPLOY_SERVER: "deploy.ssb.cloud.nais.io:443"

.nais/prod/klass-api-alerts.yaml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
# Prometheus alert rules for klass-api, labelled environment: prod.
apiVersion: "monitoring.coreos.com/v1"
kind: PrometheusRule
metadata:
  name: alert-klass-api
  namespace: dapla-metadata
  labels:
    team: dapla-metadata
spec:
  groups:
    - name: dapla-metadata
      rules:
        # Fires when error-level log events exceed 10% of all log events
        # (3m rate window) and the condition holds for 3 minutes.
        - alert: High number of errors
          expr: (100 * sum by (app, namespace) (rate(logback_events_total{app="klass-api",level="error"}[3m])) / sum by (app, namespace) (rate(logback_events_total{app="klass-api"}[3m]))) > 10
          for: 3m
          annotations:
            title: "High number of errors logged"
            consequence: "There can be different causes for errors, check logs for cause and evaluation of consequences."
            action: "`kubectl describe pod -l app=klass-api -n dapla-metadata` -> `kubectl logs <podname>`"
          labels:
            service: klass-api
            namespace: dapla-metadata
            severity: critical
            environment: prod

        # Fires when outgoing HTTP client requests end with a non-2xx status.
        # The matcher is a regex so that other success codes (201, 204, ...)
        # are not reported as an unavailable client; `status!="200"` alerted
        # on those as well. Any status value that is not three characters
        # starting with "2" still matches.
        - alert: A Klass-api client is unavailable
          expr: rate(http_client_requests_seconds_count{app="klass-api", status!~"2.."}[1m]) > 0
          for: 1m
          annotations:
            title: "A Klass-api client is unavailable "
            consequence: "The service may lack some functionality"
            description: "Client {{ $labels.serviceId }} responded with {{ $labels.status }} causing {{ $labels.exception }}"
          labels:
            service: klass-api
            namespace: dapla-metadata
            severity: critical
            environment: prod

        # Fires when the deployment has had zero available replicas for 1 minute.
        # The namespace matcher keeps a same-named deployment in another
        # namespace from triggering this alert.
        - alert: Klass-api is unavailable
          expr: kube_deployment_status_replicas_available{deployment="klass-api", namespace="dapla-metadata"} == 0
          for: 1m
          annotations:
            title: "Klass-api is unavailable"
            consequence: "Service is unavailable to users. "
          labels:
            service: klass-api
            namespace: dapla-metadata
            severity: critical
            environment: prod

.nais/test/klass-api-alerts.yaml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
# Prometheus alert rules for klass-api, labelled environment: test.
apiVersion: "monitoring.coreos.com/v1"
kind: PrometheusRule
metadata:
  name: alert-klass-api
  namespace: dapla-metadata
  labels:
    team: dapla-metadata
spec:
  groups:
    - name: dapla-metadata
      rules:
        # Fires when error-level log events exceed 10% of all log events
        # (3m rate window) and the condition holds for 3 minutes.
        - alert: High number of errors
          expr: (100 * sum by (app, namespace) (rate(logback_events_total{app="klass-api",level="error"}[3m])) / sum by (app, namespace) (rate(logback_events_total{app="klass-api"}[3m]))) > 10
          for: 3m
          annotations:
            title: "High number of errors logged"
            consequence: "There can be different causes for errors, check logs for cause and evaluation of consequences."
            action: "`kubectl describe pod -l app=klass-api -n dapla-metadata` -> `kubectl logs <podname>`"
          labels:
            service: klass-api
            namespace: dapla-metadata
            severity: critical
            environment: test

        # Fires when outgoing HTTP client requests end with a non-2xx status.
        # The matcher is a regex so that other success codes (201, 204, ...)
        # are not reported as an unavailable client; `status!="200"` alerted
        # on those as well. Any status value that is not three characters
        # starting with "2" still matches.
        - alert: A Klass-api client is unavailable
          expr: rate(http_client_requests_seconds_count{app="klass-api", status!~"2.."}[1m]) > 0
          for: 1m
          annotations:
            title: "A Klass-api client is unavailable "
            consequence: "The service may lack some functionality"
            description: "Client {{ $labels.serviceId }} responded with {{ $labels.status }} causing {{ $labels.exception }}"
          labels:
            service: klass-api
            namespace: dapla-metadata
            severity: critical
            environment: test

        # Fires when the deployment has had zero available replicas for 1 minute.
        # The namespace matcher keeps a same-named deployment in another
        # namespace from triggering this alert.
        - alert: Klass-api is unavailable
          expr: kube_deployment_status_replicas_available{deployment="klass-api", namespace="dapla-metadata"} == 0
          for: 1m
          annotations:
            title: "Klass-api is unavailable"
            consequence: "Service is unavailable to users. "
          labels:
            service: klass-api
            namespace: dapla-metadata
            severity: critical
            environment: test
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
# Prometheus alert rules for klass-forvaltning, labelled environment: test.
apiVersion: "monitoring.coreos.com/v1"
kind: PrometheusRule
metadata:
  name: alert-klass-forvaltning
  namespace: dapla-metadata
  labels:
    team: dapla-metadata
spec:
  groups:
    - name: dapla-metadata
      rules:
        # Fires when error-level log events exceed 10% of all log events
        # (3m rate window) and the condition holds for 3 minutes.
        - alert: High number of errors
          expr: (100 * sum by (app, namespace) (rate(logback_events_total{app="klass-forvaltning",level="error"}[3m])) / sum by (app, namespace) (rate(logback_events_total{app="klass-forvaltning"}[3m]))) > 10
          for: 3m
          annotations:
            title: "High number of errors logged"
            consequence: "There can be different causes for errors, check logs for cause and evaluation of consequences."
            action: "`kubectl describe pod -l app=klass-forvaltning -n dapla-metadata` -> `kubectl logs <podname>`"
          labels:
            service: klass-forvaltning
            namespace: dapla-metadata
            severity: critical
            environment: test

        # Fires when outgoing HTTP client requests end with a non-2xx status.
        # The matcher is a regex so that other success codes (201, 204, ...)
        # are not reported as an unavailable client; `status!="200"` alerted
        # on those as well. Any status value that is not three characters
        # starting with "2" still matches.
        - alert: A Klass-forvaltning client is unavailable
          expr: rate(http_client_requests_seconds_count{app="klass-forvaltning", status!~"2.."}[1m]) > 0
          for: 1m
          annotations:
            title: "A Klass-forvaltning client is unavailable "
            consequence: "The service may lack some functionality"
            description: "Client {{ $labels.serviceId }} responded with {{ $labels.status }} causing {{ $labels.exception }}"
          labels:
            service: klass-forvaltning
            namespace: dapla-metadata
            severity: critical
            environment: test

        # Fires when the deployment has had zero available replicas for 1 minute.
        # The namespace matcher keeps a same-named deployment in another
        # namespace from triggering this alert.
        - alert: Klass-forvaltning is unavailable
          expr: kube_deployment_status_replicas_available{deployment="klass-forvaltning", namespace="dapla-metadata"} == 0
          for: 1m
          annotations:
            title: "Klass-forvaltning is unavailable"
            consequence: "Service is unavailable to users. "
          labels:
            service: klass-forvaltning
            namespace: dapla-metadata
            severity: critical
            environment: test

0 commit comments

Comments
 (0)