Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@
* `MimirIngesterStuckProcessingRecordsFromKafka` → `MimirIngesterKafkaProcessingStuck`
* `MimirStrongConsistencyOffsetNotPropagatedToIngesters` → `MimirStrongConsistencyOffsetMissing`
* `MimirKafkaClientBufferedProduceBytesTooHigh` → `MimirKafkaClientProduceBufferHigh`
* [CHANGE] Alerts: Add more native histogram versions of alerts using classic histograms. #13814
* [ENHANCEMENT] Dashboards: Support native histograms in the Alertmanager, Compactor, Queries, Rollout operator, Reads, RemoteRuler-Reads, Ruler, and Writes dashboards. #13556 #13621 #13629 #13673 #13690 #13678 #13633 #13672
* [ENHANCEMENT] Alerts: Add `MimirFewerIngestersConsumingThanActivePartitions` alert. #13159
* [ENHANCEMENT] Querier and query-frontend: Add alerts for querier ring, which is used when performing query planning in query-frontends and distributing portions of the plan to queriers for execution. #13165
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ spec:
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency
expr: |
cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop|debug_pprof"}
>
>
2.5
for: 15m
labels:
Expand Down Expand Up @@ -143,14 +143,32 @@ spec:
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure
expr: |
(
sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m]))
sum by (cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m]))
/
sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m]))
sum by (cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{}[1m]))
)
# We want to get alerted only in case there's a constant failure.
== 1
for: 5m
labels:
histogram: classic
severity: critical
# Native-histogram variant of the MimirKVStoreFailure alert (labelled `histogram: native`).
# The expression takes histogram_count() of the 1m rate of cortex_kv_request_duration_seconds
# for requests whose status_code does not match "2.+", divides it by the same quantity for
# all requests (both sides grouped by cluster, namespace, pod, status_code and kv_name), and
# keeps only results equal to 1. The condition must hold for 5m; severity is critical.
# NOTE(review): both operands group by status_code, so each non-2xx series looks like it is
# divided by the total for that same status_code rather than by all requests - confirm
# this is the intended "constant failure" check.
- alert: MimirKVStoreFailure
annotations:
message: |
Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to talk to the KV store {{ $labels.kv_name }}.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure
expr: |
(
sum by (cluster, namespace, pod, status_code, kv_name) (histogram_count(rate(cortex_kv_request_duration_seconds{status_code!~"2.+"}[1m])))
/
sum by (cluster, namespace, pod, status_code, kv_name) (histogram_count(rate(cortex_kv_request_duration_seconds{}[1m])))
)
# We want to get alerted only in case there's a constant failure.
== 1
for: 5m
labels:
histogram: native
severity: critical
- alert: MimirMemoryMapAreasTooHigh
annotations:
Expand Down Expand Up @@ -1298,6 +1316,21 @@ spec:
)[5m:1m]) > 0
for: 5m
labels:
histogram: classic
severity: warning
# Native-histogram variant of MimirStartingIngesterKafkaDelayGrowing (labelled `histogram: native`).
# Per cluster/namespace/pod, the inner ratio divides histogram_sum() by histogram_count() of the
# 1m rate of cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}, i.e. the mean
# observed value of that histogram. deriv() is evaluated over a [5m:1m] subquery of the ratio,
# and the rule matches while the resulting slope is above 0. The condition must hold for 5m;
# severity is warning.
- alert: MimirStartingIngesterKafkaDelayGrowing
annotations:
message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "starting" phase is not reducing consumption lag of write requests read from Kafka.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkadelaygrowing
expr: |
deriv((
sum by (cluster, namespace, pod) (histogram_sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
/
sum by (cluster, namespace, pod) (histogram_count(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
)[5m:1m]) > 0
for: 5m
labels:
histogram: native
severity: warning
- alert: MimirRunningIngesterReceiveDelayTooHigh
annotations:
Expand All @@ -1311,6 +1344,22 @@ spec:
) > (2 * 60)
for: 3m
labels:
histogram: classic
severity: critical
threshold: very_high_for_short_period
# Native-histogram variant of MimirRunningIngesterReceiveDelayTooHigh for the
# `very_high_for_short_period` threshold (labelled `histogram: native`).
# Per cluster/namespace/pod, the expression divides histogram_sum() by histogram_count() of the
# 1m rate of cortex_ingest_storage_reader_receive_delay_seconds{phase="running"} (the mean
# observed value) and matches when it exceeds 2 * 60. The condition must hold for 3m;
# severity is critical.
# NOTE(review): the metric name ends in `_seconds`, so 2 * 60 presumably means two minutes -
# confirm against the metric definition.
- alert: MimirRunningIngesterReceiveDelayTooHigh
annotations:
message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
expr: |
(
sum by (cluster, namespace, pod) (histogram_sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
/
sum by (cluster, namespace, pod) (histogram_count(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
) > (2 * 60)
for: 3m
labels:
histogram: native
severity: critical
threshold: very_high_for_short_period
- alert: MimirRunningIngesterReceiveDelayTooHigh
Expand All @@ -1325,6 +1374,22 @@ spec:
) > 30
for: 15m
labels:
histogram: classic
severity: critical
threshold: relatively_high_for_long_period
# Native-histogram variant of MimirRunningIngesterReceiveDelayTooHigh for the
# `relatively_high_for_long_period` threshold (labelled `histogram: native`).
# Per cluster/namespace/pod, the expression divides histogram_sum() by histogram_count() of the
# 1m rate of cortex_ingest_storage_reader_receive_delay_seconds{phase="running"} (the mean
# observed value) and matches when it exceeds 30. The condition must hold for 15m;
# severity is critical.
# NOTE(review): the metric name ends in `_seconds`, so 30 presumably means thirty seconds -
# confirm against the metric definition.
- alert: MimirRunningIngesterReceiveDelayTooHigh
annotations:
message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
expr: |
(
sum by (cluster, namespace, pod) (histogram_sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
/
sum by (cluster, namespace, pod) (histogram_count(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
) > 30
for: 15m
labels:
histogram: native
severity: critical
threshold: relatively_high_for_long_period
- alert: MimirIngesterKafkaProcessingFailed
Expand Down
71 changes: 68 additions & 3 deletions operations/mimir-mixin-compiled-baremetal/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ groups:
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency
expr: |
cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop|debug_pprof"}
>
>
2.5
for: 15m
labels:
Expand Down Expand Up @@ -131,14 +131,32 @@ groups:
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure
expr: |
(
sum by(cluster, namespace, instance, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m]))
sum by (cluster, namespace, instance, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m]))
/
sum by(cluster, namespace, instance, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m]))
sum by (cluster, namespace, instance, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{}[1m]))
)
# We want to get alerted only in case there's a constant failure.
== 1
for: 5m
labels:
histogram: classic
severity: critical
# Native-histogram variant of the MimirKVStoreFailure alert (labelled `histogram: native`);
# this copy groups by `instance` where the other compiled rule set shown on this page uses `pod`.
# The expression takes histogram_count() of the 1m rate of cortex_kv_request_duration_seconds
# for requests whose status_code does not match "2.+", divides it by the same quantity for
# all requests (both sides grouped by cluster, namespace, instance, status_code and kv_name),
# and keeps only results equal to 1. The condition must hold for 5m; severity is critical.
# NOTE(review): both operands group by status_code, so each non-2xx series looks like it is
# divided by the total for that same status_code rather than by all requests - confirm
# this is the intended "constant failure" check.
- alert: MimirKVStoreFailure
annotations:
message: |
Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to talk to the KV store {{ $labels.kv_name }}.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure
expr: |
(
sum by (cluster, namespace, instance, status_code, kv_name) (histogram_count(rate(cortex_kv_request_duration_seconds{status_code!~"2.+"}[1m])))
/
sum by (cluster, namespace, instance, status_code, kv_name) (histogram_count(rate(cortex_kv_request_duration_seconds{}[1m])))
)
# We want to get alerted only in case there's a constant failure.
== 1
for: 5m
labels:
histogram: native
severity: critical
- alert: MimirMemoryMapAreasTooHigh
annotations:
Expand Down Expand Up @@ -1272,6 +1290,21 @@ groups:
)[5m:1m]) > 0
for: 5m
labels:
histogram: classic
severity: warning
# Native-histogram variant of MimirStartingIngesterKafkaDelayGrowing (labelled `histogram: native`),
# grouped by `instance`.
# Per cluster/namespace/instance, the inner ratio divides histogram_sum() by histogram_count() of
# the 1m rate of cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}, i.e. the
# mean observed value of that histogram. deriv() is evaluated over a [5m:1m] subquery of the
# ratio, and the rule matches while the resulting slope is above 0. The condition must hold
# for 5m; severity is warning.
- alert: MimirStartingIngesterKafkaDelayGrowing
annotations:
message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "starting" phase is not reducing consumption lag of write requests read from Kafka.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkadelaygrowing
expr: |
deriv((
sum by (cluster, namespace, instance) (histogram_sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
/
sum by (cluster, namespace, instance) (histogram_count(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
)[5m:1m]) > 0
for: 5m
labels:
histogram: native
severity: warning
- alert: MimirRunningIngesterReceiveDelayTooHigh
annotations:
Expand All @@ -1285,6 +1318,22 @@ groups:
) > (2 * 60)
for: 3m
labels:
histogram: classic
severity: critical
threshold: very_high_for_short_period
# Native-histogram variant of MimirRunningIngesterReceiveDelayTooHigh for the
# `very_high_for_short_period` threshold (labelled `histogram: native`), grouped by `instance`.
# Per cluster/namespace/instance, the expression divides histogram_sum() by histogram_count() of
# the 1m rate of cortex_ingest_storage_reader_receive_delay_seconds{phase="running"} (the mean
# observed value) and matches when it exceeds 2 * 60. The condition must hold for 3m;
# severity is critical.
# NOTE(review): the metric name ends in `_seconds`, so 2 * 60 presumably means two minutes -
# confirm against the metric definition.
- alert: MimirRunningIngesterReceiveDelayTooHigh
annotations:
message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
expr: |
(
sum by (cluster, namespace, instance) (histogram_sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
/
sum by (cluster, namespace, instance) (histogram_count(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
) > (2 * 60)
for: 3m
labels:
histogram: native
severity: critical
threshold: very_high_for_short_period
- alert: MimirRunningIngesterReceiveDelayTooHigh
Expand All @@ -1299,6 +1348,22 @@ groups:
) > 30
for: 15m
labels:
histogram: classic
severity: critical
threshold: relatively_high_for_long_period
# Native-histogram variant of MimirRunningIngesterReceiveDelayTooHigh for the
# `relatively_high_for_long_period` threshold (labelled `histogram: native`), grouped by `instance`.
# Per cluster/namespace/instance, the expression divides histogram_sum() by histogram_count() of
# the 1m rate of cortex_ingest_storage_reader_receive_delay_seconds{phase="running"} (the mean
# observed value) and matches when it exceeds 30. The condition must hold for 15m;
# severity is critical.
# NOTE(review): the metric name ends in `_seconds`, so 30 presumably means thirty seconds -
# confirm against the metric definition.
- alert: MimirRunningIngesterReceiveDelayTooHigh
annotations:
message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
expr: |
(
sum by (cluster, namespace, instance) (histogram_sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
/
sum by (cluster, namespace, instance) (histogram_count(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
) > 30
for: 15m
labels:
histogram: native
severity: critical
threshold: relatively_high_for_long_period
- alert: MimirIngesterKafkaProcessingFailed
Expand Down
Loading