File tree Expand file tree Collapse file tree 2 files changed +8
-8
lines changed
packaging/examples/metrics/prometheus-install/prometheus-rules Expand file tree Collapse file tree 2 files changed +8
-8
lines changed Original file line number Diff line number Diff line change @@ -18,18 +18,18 @@ spec:
1818 summary : ' All Kafka Connect containers down or in CrashLoopBackOff status'
1919 description : ' All Kafka Connect containers have been down or in CrashLoopBackOff status for 3 minutes'
2020 - alert : ConnectFailedConnector
21- expr : sum(kafka_connect_connector_status{status="failed"}) > 0
21+ expr : sum(kafka_connect_connector_status{status="failed"}) by (namespace, pod) > 0
2222 for : 5m
2323 labels :
2424 severity : major
2525 annotations :
2626 summary : ' Kafka Connect Connector Failure'
27- description : ' One or more connectors have been in failed state for 5 minutes, '
27+ description : ' Kafka instance on pod {{ $labels.namespace }}/{{ $labels.pod }} has {{ $value }} connector(s) in a failed state for the last 5 minutes'
2828 - alert : ConnectFailedTask
29- expr : sum(kafka_connect_worker_connector_failed_task_count) > 0
29+ expr : sum(kafka_connect_worker_connector_failed_task_count) by (namespace, pod) > 0
3030 for : 5m
3131 labels :
3232 severity : major
3333 annotations :
3434 summary : ' Kafka Connect Task Failure'
35- description : ' One or more tasks have been in failed state for 5 minutes. '
35+ description : ' Kafka instance on pod {{ $labels.namespace }}/{{ $labels.pod }} has {{ $value }} task(s) in a failed state for the last 5 minutes'
Original file line number Diff line number Diff line change @@ -26,21 +26,21 @@ spec:
2626 summary : ' Kafka under replicated partitions'
2727 description : ' There are {{ $value }} under replicated partitions on {{ $labels.kubernetes_pod_name }}'
2828 - alert : AbnormalControllerState
29- expr : sum(kafka_controller_kafkacontroller_activecontrollercount) by (strimzi_io_name) != 1
29+ expr : sum(kafka_controller_kafkacontroller_activecontrollercount) by (strimzi_io_name, namespace, pod ) != 1
3030 for : 10s
3131 labels :
3232 severity : warning
3333 annotations :
3434 summary : ' Kafka abnormal controller state'
35- description : ' There are {{ $value }} active controllers in the cluster '
35+ description : ' Kafka instance on pod {{ $labels.namespace }}/{{ $labels.pod }} has {{ $value }} active controllers '
3636 - alert : OfflinePartitions
37- expr : sum(kafka_controller_kafkacontroller_offlinepartitionscount) > 0
37+ expr : sum(kafka_controller_kafkacontroller_offlinepartitionscount) by (namespace, pod) > 0
3838 for : 10s
3939 labels :
4040 severity : warning
4141 annotations :
4242 summary : ' Kafka offline partitions'
43- description : ' One or more partitions have no leader'
43+ description : ' Kafka instance on pod {{ $labels.namespace }}/{{ $labels.pod }} has {{ $value }} partition(s) with no leader'
4444 - alert : UnderMinIsrPartitionCount
4545 expr : kafka_server_replicamanager_underminisrpartitioncount > 0
4646 for : 10s
You can’t perform that action at this time.
0 commit comments