Skip to content

Commit 26243d6

Browse files
committed
Merge branch 'improvement/add-cronjob-jobs-alerts' into q/130.0
2 parents ce11d6e + ad9c130 commit 26243d6

File tree

7 files changed

+740
-680
lines changed

7 files changed

+740
-680
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@
5858
[4.14.1](https://github.com/prometheus-community/helm-charts/releases/tag/prometheus-adapter-4.14.1)
5959
(PR[#4563](https://github.com/scality/metalk8s/pull/4563))
6060

61+
- Add alerts for a CronJob-owned Job failure and another one for a
62+
non-CronJob-owned Job failure
63+
(PR[#4584](https://github.com/scality/metalk8s/pull/4584))
64+
6165
## Release 129.0.3 (in development)
6266

6367
### Enhancements

charts/drop-prometheus-rules.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ node-exporter:
1616
- NodeFileDescriptorLimit
1717
kubernetes-apps:
1818
- KubeJobNotCompleted
19+
- KubeJobFailed
1920
# workaround: this fires upon install
2021
# revert the entire commit after the fix is merged
2122
etcd:

salt/metalk8s/addons/prometheus-operator/config/prometheus.yaml

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ spec:
99
replicas: 1
1010
config:
1111
retention_time: "10d"
12-
retention_size: "0" # "0" to disable size-based retention
12+
retention_size: "0" # "0" to disable size-based retention
1313
enable_admin_api: false
1414
serviceMonitor:
1515
kubelet:
@@ -19,37 +19,43 @@ spec:
1919
kube_job_not_completed:
2020
warning:
2121
hours: 24 # Hours of job active before we trigger alert
22+
kube_cronjob_owned_job_failed:
23+
warning:
24+
minutes: 5 # Minutes the job stays failed before we trigger alert
25+
kube_job_failed:
26+
warning:
27+
minutes: 5 # Minutes the job stays failed before we trigger alert
2228
node_exporter:
2329
node_filesystem_space_filling_up:
2430
warning:
25-
hours: 24 # Hours before there is no space left
26-
threshold: 40 # Min space left to trigger prediction
31+
hours: 24 # Hours before there is no space left
32+
threshold: 40 # Min space left to trigger prediction
2733
critical:
2834
hours: 4
2935
threshold: 20
3036
node_filesystem_almost_out_of_space:
3137
warning:
32-
available: 20 # Percentage of free space left
38+
available: 20 # Percentage of free space left
3339
critical:
3440
available: 12
3541
node_filesystem_files_filling_up:
3642
warning:
37-
hours: 24 # Hours before there is no inode left
38-
threshold: 40 # Min space left to trigger prediction
43+
hours: 24 # Hours before there is no inode left
44+
threshold: 40 # Min space left to trigger prediction
3945
critical:
4046
hours: 4
4147
threshold: 20
4248
node_filesystem_almost_out_of_files:
4349
warning:
44-
available: 15 # Percentage of free inodes left
50+
available: 15 # Percentage of free inodes left
4551
critical:
4652
available: 8
4753
node_network_receive_errors:
4854
warning:
49-
error_rate: 0.01 # Rate of receive errors for the last 2m
55+
error_rate: 0.01 # Rate of receive errors for the last 2m
5056
node_network_transmit_errors:
5157
warning:
52-
error_rate: 0.01 # Rate of transmit errors for the last 2m
58+
error_rate: 0.01 # Rate of transmit errors for the last 2m
5359
node_high_number_conntrack_entries_used:
5460
warning:
5561
threshold: 0.75

salt/metalk8s/addons/prometheus-operator/deployed/chart.sls

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -73953,16 +73953,6 @@ spec:
7395373953
for: 15m
7395473954
labels:
7395573955
severity: warning
73956-
- alert: KubeJobFailed
73957-
annotations:
73958-
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to
73959-
complete. Removing failed job after investigation should clear this alert.
73960-
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
73961-
summary: Job failed to complete.
73962-
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
73963-
for: 15m
73964-
labels:
73965-
severity: warning
7396673956
- alert: KubeHpaReplicasMismatch
7396773957
annotations:
7396873958
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}

salt/metalk8s/addons/prometheus-operator/deployed/kube-alerts-rules.sls

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,45 @@ spec:
2929
annotations:
3030
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking
3131
more than {% endraw %}{{ rules.kube_apps.kube_job_not_completed.warning.hours }}{% raw %} hours to complete.
32-
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
3332
summary: Job did not complete in time
3433
expr: |-
3534
time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
3635
and
3736
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > ({% endraw %}{{ rules.kube_apps.kube_job_not_completed.warning.hours }}{% raw %} * 60 * 60)
3837
labels:
3938
severity: warning
39+
- alert: KubeCronJobOwnedJobFailed
40+
annotations:
41+
description: Job {{ $labels.job_name }} created by CronJob {{ $labels.namespace }}/{{ $labels.cronjob_name }} failed to complete.
42+
Check the logs of the Job and the CronJob state to understand the failure.
43+
Removing failed job after investigation should clear this alert.
44+
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
45+
summary: Job owned by CronJob failed to complete.
46+
for: {% endraw %}{{ rules.kube_apps.kube_cronjob_owned_job_failed.warning.minutes }}{% raw %}m
47+
expr: |-
48+
kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
49+
and on(job_name, namespace)
50+
(
51+
topk by (owner_name, namespace) (1,
52+
kube_job_created{job="kube-state-metrics"}
53+
* on(job_name, namespace) group_left(owner_name, owner_kind)
54+
kube_job_owner{job="kube-state-metrics", owner_kind="CronJob"}
55+
)
56+
)
57+
labels:
58+
severity: warning
59+
- alert: KubeJobFailed
60+
annotations:
61+
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
62+
Check the logs of the Job to understand the failure.
63+
Removing failed job after investigation should clear this alert.
64+
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
65+
summary: Job failed to complete.
66+
expr: |-
67+
kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
68+
unless on(job_name, namespace)
69+
kube_job_owner{job="kube-state-metrics", owner_kind="CronJob"}
70+
for: {% endraw %}{{ rules.kube_apps.kube_job_failed.warning.minutes }}{% raw %}m
71+
labels:
72+
severity: warning
4073
{%- endraw %}

tools/rule_extractor/alerting_rules.json

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,18 @@
161161
"query": "sum by (persistentvolumeclaim, namespace, instance) (ALERTS{alertname=\"KubePersistentVolumeFillingUp\",alertstate=\"firing\",severity=\"warning\"}) >= 1",
162162
"severity": "warning"
163163
},
164+
{
165+
"message": "Job owned by CronJob failed to complete.",
166+
"name": "KubeCronJobOwnedJobFailed",
167+
"query": "kube_job_failed{job=\"kube-state-metrics\",namespace=~\".*\"} > 0 and on (job_name, namespace) (topk by (owner_name, namespace) (1, kube_job_created{job=\"kube-state-metrics\"} * on (job_name, namespace) group_left (owner_name, owner_kind) kube_job_owner{job=\"kube-state-metrics\",owner_kind=\"CronJob\"}))",
168+
"severity": "warning"
169+
},
170+
{
171+
"message": "Job failed to complete.",
172+
"name": "KubeJobFailed",
173+
"query": "kube_job_failed{job=\"kube-state-metrics\",namespace=~\".*\"} > 0 unless on (job_name, namespace) kube_job_owner{job=\"kube-state-metrics\",owner_kind=\"CronJob\"}",
174+
"severity": "warning"
175+
},
164176
{
165177
"message": "Job did not complete in time",
166178
"name": "KubeJobNotCompleted",
@@ -533,12 +545,6 @@
533545
"query": "(kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} > kube_horizontalpodautoscaler_spec_min_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} < kube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and changes(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}[15m]) == 0",
534546
"severity": "warning"
535547
},
536-
{
537-
"message": "Job failed to complete.",
538-
"name": "KubeJobFailed",
539-
"query": "kube_job_failed{job=\"kube-state-metrics\",namespace=~\".*\"} > 0",
540-
"severity": "warning"
541-
},
542548
{
543549
"message": "Pod is crash looping.",
544550
"name": "KubePodCrashLooping",

0 commit comments

Comments
 (0)