Skip to content

Commit ff56c92

Browse files
authored
Add HTTP Status code alert (#506)
* Add HTTP Status code alert
* Fix deployment name
* Update prod alerts
1 parent 37969d1 commit ff56c92

File tree

2 files changed

+47
-8
lines changed

2 files changed

+47
-8
lines changed

.nais/prod/klass-api-alerts.yaml

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,34 @@ spec:
99
groups:
1010
- name: dapla-metadata
1111
rules:
12+
- alert: HTTP error responses
13+
expr: 100 * sum(rate(nginx_ingress_controller_request_duration_seconds_count{ingress="klass-nais-ingress-eaceac17",status!~"[4-5].*"}[10m])) by (ingress) / sum(rate(nginx_ingress_controller_request_duration_seconds_count{ingress="klass-nais-ingress-eaceac17"}[10m])) by (ingress) < 75.0
14+
for: 3m
15+
annotations:
16+
title: "HTTP error responses"
17+
consequence: "A high number of HTTP requests are receiving error responses. This may indicate a problem with the application."
18+
action: "Investigate whether this indicates an app failure or user errors."
19+
labels:
20+
service: klass-api
21+
namespace: dapla-metadata
22+
severity: critical
23+
environment: prod
24+
25+
- alert: High heap memory usage
26+
expr: (100 * sum by (instance) (jvm_memory_used_bytes{application="klass", area="heap"})) / (sum by (instance) (jvm_memory_max_bytes{application="klass", area="heap"}) ) > 70.0
27+
for: 3m
28+
annotations:
29+
title: "High heap memory usage"
30+
consequence: "If this increase continues then the app could run out of memory and either lock or crash."
31+
action: "Immediate: Restart the app from the Nais console\nShort term: Investigate the cause of high heap usage and either fix the bug or increase the available heap."
32+
labels:
33+
service: klass-api
34+
namespace: dapla-metadata
35+
severity: critical
36+
environment: prod
37+
1238
- alert: High number of errors
13-
expr: (100 * sum by (app, namespace) (rate(logback_events_total{app="klass-api",level="error"}[3m])) / sum by (app, namespace) (rate(logback_events_total{app="klass-api"}[3m]))) > 10
39+
expr: (100 * sum by (app, namespace) (rate(logback_events_total{application="klass",level="error"}[3m])) / sum by (app, namespace) (rate(logback_events_total{application="klass"}[3m]))) > 1
1440
for: 3m
1541
annotations:
1642
title: "High number of errors logged"
@@ -23,7 +49,7 @@ spec:
2349
environment: prod
2450

2551
- alert: A Klass-api client is unavailable
26-
expr: rate(http_client_requests_seconds_count{app="klass-api", status!="200"}[1m]) > 0
52+
expr: rate(http_client_requests_seconds_count{application="klass", status!="200"}[1m]) > 0
2753
for: 1m
2854
annotations:
2955
title: "A Klass-api client is unavailable "
@@ -35,8 +61,8 @@ spec:
3561
severity: critical
3662
environment: prod
3763

38-
- alert: Klass-api is unavailable
39-
expr: kube_deployment_status_replicas_available{deployment="klass-api"} == 0
64+
- alert: Klass is unavailable
65+
expr: kube_deployment_status_replicas_available{deployment="klass"} == 0
4066
for: 1m
4167
annotations:
4268
title: "Klass-api is unavailable"
@@ -45,4 +71,4 @@ spec:
4571
service: klass-api
4672
namespace: dapla-metadata
4773
severity: critical
48-
environment: prod
74+
environment: prod

.nais/test/klass-api-alerts.yaml

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,26 @@ spec:
99
groups:
1010
- name: dapla-metadata
1111
rules:
12+
- alert: HTTP error responses
13+
expr: 100 * sum(rate(nginx_ingress_controller_request_duration_seconds_count{ingress="klass-nais-ingress-eaceac17",status!~"[4-5].*"}[10m])) by (ingress) / sum(rate(nginx_ingress_controller_request_duration_seconds_count{ingress="klass-nais-ingress-eaceac17"}[10m])) by (ingress) < 75.0
14+
for: 3m
15+
annotations:
16+
title: "HTTP error responses"
17+
consequence: "A high number of HTTP requests are receiving error responses. This may indicate a problem with the application."
18+
action: "Investigate whether this indicates an app failure or user errors."
19+
labels:
20+
service: klass-api
21+
namespace: dapla-metadata
22+
severity: critical
23+
environment: test
24+
1225
- alert: High heap memory usage
1326
expr: (100 * sum by (instance) (jvm_memory_used_bytes{application="klass", area="heap"})) / (sum by (instance) (jvm_memory_max_bytes{application="klass", area="heap"}) ) > 70.0
1427
for: 3m
1528
annotations:
1629
title: "High heap memory usage"
1730
consequence: "If this increase continues then the app could run out of memory and either lock or crash."
18-
action: "Immediate: Restart the app from the Nais console\nShort term: Investigate the cause of high heap usage and either fix the bug or "
31+
action: "Immediate: Restart the app from the Nais console\nShort term: Investigate the cause of high heap usage and either fix the bug or increase the available heap."
1932
labels:
2033
service: klass-api
2134
namespace: dapla-metadata
@@ -48,8 +61,8 @@ spec:
4861
severity: critical
4962
environment: test
5063

51-
- alert: Klass-api is unavailable
52-
expr: kube_deployment_status_replicas_available{deployment="klass-api"} == 0
64+
- alert: Klass is unavailable
65+
expr: kube_deployment_status_replicas_available{deployment="klass"} == 0
5366
for: 1m
5467
annotations:
5568
title: "Klass-api is unavailable"

0 commit comments

Comments
 (0)