Skip to content

Commit e0bca42

Browse files
prometheus: add an alert for CronJob owned Job failure and another for non Cronjob owned Job
1 parent c4df491 commit e0bca42

File tree

5 files changed

+52
-19
lines changed

5 files changed

+52
-19
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44

55
### Enhancements
66

7+
- Add an alert for CronJob-owned Job failures and another one for
8+
non-CronJob-owned Job failures
9+
(PR[#4584](https://github.com/scality/metalk8s/pull/4584))
10+
711
- Bump Kubernetes version to
812
[1.30.11](https://github.com/kubernetes/kubernetes/releases/tag/v1.30.11)
913
(PR[#4578](https://github.com/scality/metalk8s/pull/4578))

charts/drop-prometheus-rules.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ node-exporter:
1616
- NodeFileDescriptorLimit
1717
kubernetes-apps:
1818
- KubeJobNotCompleted
19+
- KubeJobFailed
1920
# workaround: this fires upon install
2021
# revert the entire commit after the fix is merged
2122
etcd:

salt/metalk8s/addons/prometheus-operator/config/prometheus.yaml

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ spec:
99
replicas: 1
1010
config:
1111
retention_time: "10d"
12-
retention_size: "0" # "0" to disable size-based retention
12+
retention_size: "0" # "0" to disable size-based retention
1313
enable_admin_api: false
1414
serviceMonitor:
1515
kubelet:
@@ -19,37 +19,43 @@ spec:
1919
kube_job_not_completed:
2020
warning:
2121
hours: 24 # Hours of job active before we trigger alert
22+
kube_cronjob_owned_job_failed:
23+
warning:
24+
minutes: 5 # Minutes a CronJob-owned Job must stay failed before we trigger the alert
25+
kube_job_failed:
26+
warning:
27+
minutes: 5 # Minutes a Job must stay failed before we trigger the alert
2228
node_exporter:
2329
node_filesystem_space_filling_up:
2430
warning:
25-
hours: 24 # Hours before there is no space left
26-
threshold: 40 # Min space left to trigger prediction
31+
hours: 24 # Hours before there is no space left
32+
threshold: 40 # Min space left to trigger prediction
2733
critical:
2834
hours: 4
2935
threshold: 20
3036
node_filesystem_almost_out_of_space:
3137
warning:
32-
available: 20 # Percentage of free space left
38+
available: 20 # Percentage of free space left
3339
critical:
3440
available: 12
3541
node_filesystem_files_filling_up:
3642
warning:
37-
hours: 24 # Hours before there is no inode left
38-
threshold: 40 # Min space left to trigger prediction
43+
hours: 24 # Hours before there is no inode left
44+
threshold: 40 # Min space left to trigger prediction
3945
critical:
4046
hours: 4
4147
threshold: 20
4248
node_filesystem_almost_out_of_files:
4349
warning:
44-
available: 15 # Percentage of free inodes left
50+
available: 15 # Percentage of free inodes left
4551
critical:
4652
available: 8
4753
node_network_receive_errors:
4854
warning:
49-
error_rate: 0.01 # Rate of receive errors for the last 2m
55+
error_rate: 0.01 # Rate of receive errors for the last 2m
5056
node_network_transmit_errors:
5157
warning:
52-
error_rate: 0.01 # Rate of transmit errors for the last 2m
58+
error_rate: 0.01 # Rate of transmit errors for the last 2m
5359
node_high_number_conntrack_entries_used:
5460
warning:
5561
threshold: 0.75

salt/metalk8s/addons/prometheus-operator/deployed/chart.sls

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -73953,16 +73953,6 @@ spec:
7395373953
for: 15m
7395473954
labels:
7395573955
severity: warning
73956-
- alert: KubeJobFailed
73957-
annotations:
73958-
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to
73959-
complete. Removing failed job after investigation should clear this alert.
73960-
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
73961-
summary: Job failed to complete.
73962-
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
73963-
for: 15m
73964-
labels:
73965-
severity: warning
7396673956
- alert: KubeHpaReplicasMismatch
7396773957
annotations:
7396873958
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}

salt/metalk8s/addons/prometheus-operator/deployed/kube-alerts-rules.sls

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,36 @@ spec:
3737
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > ({% endraw %}{{ rules.kube_apps.kube_job_not_completed.warning.hours }}{% raw %} * 60 * 60)
3838
labels:
3939
severity: warning
40+
- alert: KubeCronJobOwnedJobFailed
41+
annotations:
42+
description: Job {{ $labels.job_name }} created by CronJob {{ $labels.namespace }}/{{ $labels.owner_name }} failed to complete.
43+
Check the logs of the Job and the CronJob state to understand the failure.
44+
Removing failed job after investigation should clear this alert.
45+
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecronjobownedjobfailed
46+
summary: Job owned by CronJob failed to complete.
47+
for: {% endraw %}{{ rules.kube_apps.kube_cronjob_owned_job_failed.warning.minutes }}{% raw %}m
48+
expr: |-
49+
(
50+
kube_job_status_failed{job_name!=""} == 1
51+
and on(job_name, namespace)
52+
kube_job_owner{owner_kind="CronJob", job_name!=""}
53+
)
54+
* on(job_name, namespace) group_left(owner_name)
55+
kube_job_owner{owner_kind="CronJob", job_name!=""}
56+
labels:
57+
severity: warning
58+
- alert: KubeJobFailed
59+
annotations:
60+
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
61+
Check the logs of the Job to understand the failure.
62+
Removing failed job after investigation should clear this alert.
63+
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
64+
summary: Job failed to complete.
65+
expr: |-
66+
kube_job_status_failed{job_name!=""} == 1
67+
unless on(job_name, namespace)
68+
kube_job_owner{owner_kind="CronJob", job_name!=""}
69+
for: {% endraw %}{{ rules.kube_apps.kube_job_failed.warning.minutes }}{% raw %}m
70+
labels:
71+
severity: warning
4072
{%- endraw %}

0 commit comments

Comments
 (0)