chore(monitoring): fix alerts grouping (#11849)

bissquit · web-flow · commit b3adde808805 · 2025-09-17T10:15:38.000+02:00
Signed-off-by: Egor Vasilyev <e.s.vasilyev@mail.ru>
diff --git a/packaging/examples/metrics/prometheus-install/prometheus-rules/prometheus-kafka-connect-rules.yaml b/packaging/examples/metrics/prometheus-install/prometheus-rules/prometheus-kafka-connect-rules.yaml
@@ -18,18 +18,18 @@ spec:
         summary: 'All Kafka Connect containers down or in CrashLookBackOff status'
         description: 'All Kafka Connect containers have been down or in CrashLookBackOff status for 3 minutes'
     - alert: ConnectFailedConnector
-      expr: sum(kafka_connect_connector_status{status="failed"}) > 0
+      expr: sum(kafka_connect_connector_status{status="failed"}) by (namespace, pod) > 0  # one result series per namespace/pod
       for: 5m
       labels:
         severity: major
       annotations:
         summary: 'Kafka Connect Connector Failure'
-        description: 'One or more connectors have been in failed state for 5 minutes,'
+        description: 'Kafka Connect instance on pod {{ $labels.namespace }}/{{ $labels.pod }} has {{ $value }} connector(s) in failed state for the last 5 minutes'
     - alert: ConnectFailedTask
-      expr: sum(kafka_connect_worker_connector_failed_task_count) > 0
+      expr: sum(kafka_connect_worker_connector_failed_task_count) by (namespace, pod) > 0  # one result series per namespace/pod
       for: 5m
       labels:
         severity: major
       annotations:
         summary: 'Kafka Connect Task Failure'
-        description: 'One or more tasks have been in failed state for 5 minutes.'
+        description: 'Kafka Connect instance on pod {{ $labels.namespace }}/{{ $labels.pod }} has {{ $value }} task(s) in failed state for the last 5 minutes'
diff --git a/packaging/examples/metrics/prometheus-install/prometheus-rules/prometheus-kafka-rules.yaml b/packaging/examples/metrics/prometheus-install/prometheus-rules/prometheus-kafka-rules.yaml
@@ -26,21 +26,21 @@ spec:
         summary: 'Kafka under replicated partitions'
         description: 'There are {{ $value }} under replicated partitions on {{ $labels.kubernetes_pod_name }}'
     - alert: AbnormalControllerState
-      expr: sum(kafka_controller_kafkacontroller_activecontrollercount) by (strimzi_io_name) != 1
+      expr: sum(kafka_controller_kafkacontroller_activecontrollercount) by (strimzi_io_name, namespace) != 1  # do not group by pod: a per-pod sum would fire for every pod reporting 0
       for: 10s
       labels:
         severity: warning
       annotations:
         summary: 'Kafka abnormal controller state'
-        description: 'There are {{ $value }} active controllers in the cluster'
+        description: 'There are {{ $value }} active controllers in {{ $labels.namespace }}/{{ $labels.strimzi_io_name }}'
     - alert: OfflinePartitions
-      expr: sum(kafka_controller_kafkacontroller_offlinepartitionscount) > 0
+      expr: sum by (namespace, pod) (kafka_controller_kafkacontroller_offlinepartitionscount) > 0  # offline partition count summed per namespace/pod
       for: 10s
       labels:
         severity: warning
       annotations:
         summary: 'Kafka offline partitions'
-        description: 'One or more partitions have no leader'
+        description: 'Kafka instance on pod {{ $labels.namespace }}/{{ $labels.pod }} has {{ $value }} partition(s) with no leader'
     - alert: UnderMinIsrPartitionCount
       expr: kafka_server_replicamanager_underminisrpartitioncount > 0
       for: 10s