Skip to content

Commit 26243d6

Browse files
committed
Merge branch 'improvement/add-cronjob-jobs-alerts' into q/130.0
2 parents ce11d6e + ad9c130 commit 26243d6

File tree

7 files changed

+740
-680
lines changed

7 files changed

+740
-680
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@
5858
[4.14.1](https://github.com/prometheus-community/helm-charts/releases/tag/prometheus-adapter-4.14.1)
5959
(PR[#4563](https://github.com/scality/metalk8s/pull/4563))
6060

61+
- Add alerts for a CronJob-owned Job failure and another one for a
62+
non-CronJob-owned Job failure
63+
(PR[#4584](https://github.com/scality/metalk8s/pull/4584))
64+
6165
## Release 129.0.3 (in development)
6266

6367
### Enhancements

charts/drop-prometheus-rules.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ node-exporter:
1616
- NodeFileDescriptorLimit
1717
kubernetes-apps:
1818
- KubeJobNotCompleted
19+
- KubeJobFailed
1920
# workaround: this fires upon install
2021
# revert the entire commit after the fix is merged
2122
etcd:

salt/metalk8s/addons/prometheus-operator/config/prometheus.yaml

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ spec:
99
replicas: 1
1010
config:
1111
retention_time: "10d"
12-
retention_size: "0" # "0" to disable size-based retention
12+
retention_size: "0" # "0" to disable size-based retention
1313
enable_admin_api: false
1414
serviceMonitor:
1515
kubelet:
@@ -19,37 +19,43 @@ spec:
1919
kube_job_not_completed:
2020
warning:
2121
hours: 24 # Hours of job active before we trigger alert
22+
kube_cronjob_owned_job_failed:
23+
warning:
24+
minutes: 5 # Minutes the job stays failed before we trigger alert
25+
kube_job_failed:
26+
warning:
27+
minutes: 5 # Minutes the job stays failed before we trigger alert
2228
node_exporter:
2329
node_filesystem_space_filling_up:
2430
warning:
25-
hours: 24 # Hours before there is no space left
26-
threshold: 40 # Min space left to trigger prediction
31+
hours: 24 # Hours before there is no space left
32+
threshold: 40 # Min space left to trigger prediction
2733
critical:
2834
hours: 4
2935
threshold: 20
3036
node_filesystem_almost_out_of_space:
3137
warning:
32-
available: 20 # Percentage of free space left
38+
available: 20 # Percentage of free space left
3339
critical:
3440
available: 12
3541
node_filesystem_files_filling_up:
3642
warning:
37-
hours: 24 # Hours before there is no inode left
38-
threshold: 40 # Min space left to trigger prediction
43+
hours: 24 # Hours before there is no inode left
44+
threshold: 40 # Min space left to trigger prediction
3945
critical:
4046
hours: 4
4147
threshold: 20
4248
node_filesystem_almost_out_of_files:
4349
warning:
44-
available: 15 # Percentage of free inodes left
50+
available: 15 # Percentage of free inodes left
4551
critical:
4652
available: 8
4753
node_network_receive_errors:
4854
warning:
49-
error_rate: 0.01 # Rate of receive errors for the last 2m
55+
error_rate: 0.01 # Rate of receive errors for the last 2m
5056
node_network_transmit_errors:
5157
warning:
52-
error_rate: 0.01 # Rate of transmit errors for the last 2m
58+
error_rate: 0.01 # Rate of transmit errors for the last 2m
5359
node_high_number_conntrack_entries_used:
5460
warning:
5561
threshold: 0.75

salt/metalk8s/addons/prometheus-operator/deployed/chart.sls

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -73953,16 +73953,6 @@ spec:
7395373953
for: 15m
7395473954
labels:
7395573955
severity: warning
73956-
- alert: KubeJobFailed
73957-
annotations:
73958-
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to
73959-
complete. Removing failed job after investigation should clear this alert.
73960-
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
73961-
summary: Job failed to complete.
73962-
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
73963-
for: 15m
73964-
labels:
73965-
severity: warning
7396673956
- alert: KubeHpaReplicasMismatch
7396773957
annotations:
7396873958
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}

salt/metalk8s/addons/prometheus-operator/deployed/kube-alerts-rules.sls

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,45 @@ spec:
2929
annotations:
3030
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking
3131
more than {% endraw %}{{ rules.kube_apps.kube_job_not_completed.warning.hours }}{% raw %} hours to complete.
32-
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
3332
summary: Job did not complete in time
3433
expr: |-
3534
time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
3635
and
3736
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > ({% endraw %}{{ rules.kube_apps.kube_job_not_completed.warning.hours }}{% raw %} * 60 * 60)
3837
labels:
3938
severity: warning
39+
- alert: KubeCronJobOwnedJobFailed
40+
annotations:
41+
description: Job {{ $labels.job_name }} created by CronJob {{ $labels.namespace }}/{{ $labels.cronjob_name }} failed to complete.
42+
Check the logs of the Job and the CronJob state to understand the failure.
43+
Removing failed job after investigation should clear this alert.
44+
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
45+
summary: Job owned by CronJob failed to complete.
46+
for: {% endraw %}{{ rules.kube_apps.kube_cronjob_owned_job_failed.warning.minutes }}{% raw %}m
47+
expr: |-
48+
kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
49+
and on(job_name, namespace)
50+
(
51+
topk by (owner_name, namespace) (1,
52+
kube_job_created{job="kube-state-metrics"}
53+
* on(job_name, namespace) group_left(owner_name, owner_kind)
54+
kube_job_owner{job="kube-state-metrics", owner_kind="CronJob"}
55+
)
56+
)
57+
labels:
58+
severity: warning
59+
- alert: KubeJobFailed
60+
annotations:
61+
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
62+
Check the logs of the Job to understand the failure.
63+
Removing failed job after investigation should clear this alert.
64+
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
65+
summary: Job failed to complete.
66+
expr: |-
67+
kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
68+
unless on(job_name, namespace)
69+
kube_job_owner{job="kube-state-metrics", owner_kind="CronJob"}
70+
for: {% endraw %}{{ rules.kube_apps.kube_job_failed.warning.minutes }}{% raw %}m
71+
labels:
72+
severity: warning
4073
{%- endraw %}

tools/rule_extractor/alerting_rules.json

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,18 @@
161161
"query": "sum by (persistentvolumeclaim, namespace, instance) (ALERTS{alertname=\"KubePersistentVolumeFillingUp\",alertstate=\"firing\",severity=\"warning\"}) >= 1",
162162
"severity": "warning"
163163
},
164+
{
165+
"message": "Job owned by CronJob failed to complete.",
166+
"name": "KubeCronJobOwnedJobFailed",
167+
"query": "kube_job_failed{job=\"kube-state-metrics\",namespace=~\".*\"} > 0 and on (job_name, namespace) (topk by (owner_name, namespace) (1, kube_job_created{job=\"kube-state-metrics\"} * on (job_name, namespace) group_left (owner_name, owner_kind) kube_job_owner{job=\"kube-state-metrics\",owner_kind=\"CronJob\"}))",
168+
"severity": "warning"
169+
},
170+
{
171+
"message": "Job failed to complete.",
172+
"name": "KubeJobFailed",
173+
"query": "kube_job_failed{job=\"kube-state-metrics\",namespace=~\".*\"} > 0 unless on (job_name, namespace) kube_job_owner{job=\"kube-state-metrics\",owner_kind=\"CronJob\"}",
174+
"severity": "warning"
175+
},
164176
{
165177
"message": "Job did not complete in time",
166178
"name": "KubeJobNotCompleted",
@@ -533,12 +545,6 @@
533545
"query": "(kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} > kube_horizontalpodautoscaler_spec_min_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} < kube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and changes(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}[15m]) == 0",
534546
"severity": "warning"
535547
},
536-
{
537-
"message": "Job failed to complete.",
538-
"name": "KubeJobFailed",
539-
"query": "kube_job_failed{job=\"kube-state-metrics\",namespace=~\".*\"} > 0",
540-
"severity": "warning"
541-
},
542548
{
543549
"message": "Pod is crash looping.",
544550
"name": "KubePodCrashLooping",

0 commit comments

Comments
 (0)