Skip to content

Commit b3adde8

Browse files
authored
chore(monitoring): fix alerts grouping (#11849)
Signed-off-by: Egor Vasilyev <e.s.vasilyev@mail.ru>
1 parent 1a867d3 commit b3adde8

File tree

2 files changed

+8
-8
lines changed

2 files changed

+8
-8
lines changed

packaging/examples/metrics/prometheus-install/prometheus-rules/prometheus-kafka-connect-rules.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,18 @@ spec:
1818
summary: 'All Kafka Connect containers down or in CrashLoopBackOff status'
1919
description: 'All Kafka Connect containers have been down or in CrashLoopBackOff status for 3 minutes'
2020
- alert: ConnectFailedConnector
21-
expr: sum(kafka_connect_connector_status{status="failed"}) > 0
21+
expr: sum(kafka_connect_connector_status{status="failed"}) by (namespace, pod) > 0
2222
for: 5m
2323
labels:
2424
severity: major
2525
annotations:
2626
summary: 'Kafka Connect Connector Failure'
27-
description: 'One or more connectors have been in failed state for 5 minutes,'
27+
description: 'Kafka instance on pod {{ $labels.namespace }}/{{ $labels.pod }} has {{ $value }} connector(s) in failed state during last 5 minutes'
2828
- alert: ConnectFailedTask
29-
expr: sum(kafka_connect_worker_connector_failed_task_count) > 0
29+
expr: sum(kafka_connect_worker_connector_failed_task_count) by (namespace, pod) > 0
3030
for: 5m
3131
labels:
3232
severity: major
3333
annotations:
3434
summary: 'Kafka Connect Task Failure'
35-
description: 'One or more tasks have been in failed state for 5 minutes.'
35+
description: 'Kafka instance on pod {{ $labels.namespace }}/{{ $labels.pod }} has {{ $value }} task(s) in failed state during last 5 minutes'

packaging/examples/metrics/prometheus-install/prometheus-rules/prometheus-kafka-rules.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,21 +26,21 @@ spec:
2626
summary: 'Kafka under replicated partitions'
2727
description: 'There are {{ $value }} under replicated partitions on {{ $labels.kubernetes_pod_name }}'
2828
- alert: AbnormalControllerState
29-
expr: sum(kafka_controller_kafkacontroller_activecontrollercount) by (strimzi_io_name) != 1
29+
expr: sum(kafka_controller_kafkacontroller_activecontrollercount) by (strimzi_io_name, namespace, pod) != 1
3030
for: 10s
3131
labels:
3232
severity: warning
3333
annotations:
3434
summary: 'Kafka abnormal controller state'
35-
description: 'There are {{ $value }} active controllers in the cluster'
35+
description: 'Kafka instance on pod {{ $labels.namespace }}/{{ $labels.pod }} has {{ $value }} active controllers'
3636
- alert: OfflinePartitions
37-
expr: sum(kafka_controller_kafkacontroller_offlinepartitionscount) > 0
37+
expr: sum(kafka_controller_kafkacontroller_offlinepartitionscount) by (namespace, pod) > 0
3838
for: 10s
3939
labels:
4040
severity: warning
4141
annotations:
4242
summary: 'Kafka offline partitions'
43-
description: 'One or more partitions have no leader'
43+
description: 'Kafka instance on pod {{ $labels.namespace }}/{{ $labels.pod }} has {{ $value }} partition(s) with no leader'
4444
- alert: UnderMinIsrPartitionCount
4545
expr: kafka_server_replicamanager_underminisrpartitioncount > 0
4646
for: 10s

0 commit comments

Comments
 (0)