grafana · zenador · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -136,6 +136,7 @@
   * `MimirIngesterStuckProcessingRecordsFromKafka` → `MimirIngesterKafkaProcessingStuck`
   * `MimirStrongConsistencyOffsetNotPropagatedToIngesters` → `MimirStrongConsistencyOffsetMissing`
   * `MimirKafkaClientBufferedProduceBytesTooHigh` → `MimirKafkaClientProduceBufferHigh`
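+* [CHANGE] Alerts: Add native histogram versions of more alerts that previously only used classic histograms: `MimirKVStoreFailure`, `MimirStartingIngesterKafkaDelayGrowing`, and `MimirRunningIngesterReceiveDelayTooHigh`. The classic and native versions of each alert carry a `histogram` label (`classic` or `native`) to tell them apart. #13814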
 * [ENHANCEMENT] Dashboards: Support native histograms in the Alertmanager, Compactor, Queries, Rollout operator, Reads, RemoteRuler-Reads, Ruler, and Writes dashboards. #13556 #13621 #13629 #13673 #13690 #13678 #13633 #13672
 * [ENHANCEMENT] Alerts: Add `MimirFewerIngestersConsumingThanActivePartitions` alert. #13159
 * [ENHANCEMENT] Querier and query-frontend: Add alerts for querier ring, which is used when performing query planning in query-frontends and distributing portions of the plan to queriers for execution. #13165

@@ -65,7 +65,7 @@ spec:
               runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency
             expr: |
               cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop|debug_pprof"}
-                 >
+                >
               2.5
             for: 15m
             labels:
@@ -143,14 +143,32 @@ spec:
               runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure
             expr: |
               (
-                sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m]))
+                sum by (cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m]))
                 /
-                sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m]))
+                sum by (cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{}[1m]))
               )
               # We want to get alerted only in case there's a constant failure.
               == 1
             for: 5m
             labels:
+              histogram: classic
+              severity: critical
+          - alert: MimirKVStoreFailure
+            annotations:
+              message: |
+                  Mimir {{ $labels.pod }} in  {{ $labels.cluster }}/{{ $labels.namespace }} is failing to talk to the KV store {{ $labels.kv_name }}.
+              runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure
+            expr: |
+              (
+                sum by (cluster, namespace, pod, status_code, kv_name) (histogram_count(rate(cortex_kv_request_duration_seconds{status_code!~"2.+"}[1m])))
+                /
+                sum by (cluster, namespace, pod, status_code, kv_name) (histogram_count(rate(cortex_kv_request_duration_seconds{}[1m])))
+              )
+              # We want to get alerted only in case there's a constant failure.
+              == 1
+            for: 5m
+            labels:
+              histogram: native
               severity: critical
           - alert: MimirMemoryMapAreasTooHigh
             annotations:
@@ -1298,6 +1316,21 @@ spec:
               )[5m:1m]) > 0
             for: 5m
             labels:
+              histogram: classic
+              severity: warning
+          - alert: MimirStartingIngesterKafkaDelayGrowing
+            annotations:
+              message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "starting" phase is not reducing consumption lag of write requests read from Kafka.
+              runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkadelaygrowing
+            expr: |
+              deriv((
+                  sum by (cluster, namespace, pod) (histogram_sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+                  /
+                  sum by (cluster, namespace, pod) (histogram_count(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+              )[5m:1m]) > 0
+            for: 5m
+            labels:
+              histogram: native
               severity: warning
           - alert: MimirRunningIngesterReceiveDelayTooHigh
             annotations:
@@ -1311,6 +1344,22 @@ spec:
               ) > (2 * 60)
             for: 3m
             labels:
+              histogram: classic
+              severity: critical
+              threshold: very_high_for_short_period
+          - alert: MimirRunningIngesterReceiveDelayTooHigh
+            annotations:
+              message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka.
+              runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
+            expr: |
+              (
+                sum by (cluster, namespace, pod) (histogram_sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+                /
+                sum by (cluster, namespace, pod) (histogram_count(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+              ) > (2 * 60)
+            for: 3m
+            labels:
+              histogram: native
               severity: critical
               threshold: very_high_for_short_period
           - alert: MimirRunningIngesterReceiveDelayTooHigh
@@ -1325,6 +1374,22 @@ spec:
               ) > 30
             for: 15m
             labels:
+              histogram: classic
+              severity: critical
+              threshold: relatively_high_for_long_period
+          - alert: MimirRunningIngesterReceiveDelayTooHigh
+            annotations:
+              message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka.
+              runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
+            expr: |
+              (
+                sum by (cluster, namespace, pod) (histogram_sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+                /
+                sum by (cluster, namespace, pod) (histogram_count(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+              ) > 30
+            for: 15m
+            labels:
+              histogram: native
               severity: critical
               threshold: relatively_high_for_long_period
           - alert: MimirIngesterKafkaProcessingFailed

@@ -53,7 +53,7 @@ groups:
             runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency
           expr: |
             cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop|debug_pprof"}
-               >
+              >
             2.5
           for: 15m
           labels:
@@ -131,14 +131,32 @@ groups:
             runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure
           expr: |
             (
-              sum by(cluster, namespace, instance, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m]))
+              sum by (cluster, namespace, instance, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m]))
               /
-              sum by(cluster, namespace, instance, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m]))
+              sum by (cluster, namespace, instance, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{}[1m]))
             )
             # We want to get alerted only in case there's a constant failure.
             == 1
           for: 5m
           labels:
+            histogram: classic
+            severity: critical
+        - alert: MimirKVStoreFailure
+          annotations:
+            message: |
+                Mimir {{ $labels.instance }} in  {{ $labels.cluster }}/{{ $labels.namespace }} is failing to talk to the KV store {{ $labels.kv_name }}.
+            runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure
+          expr: |
+            (
+              sum by (cluster, namespace, instance, status_code, kv_name) (histogram_count(rate(cortex_kv_request_duration_seconds{status_code!~"2.+"}[1m])))
+              /
+              sum by (cluster, namespace, instance, status_code, kv_name) (histogram_count(rate(cortex_kv_request_duration_seconds{}[1m])))
+            )
+            # We want to get alerted only in case there's a constant failure.
+            == 1
+          for: 5m
+          labels:
+            histogram: native
             severity: critical
         - alert: MimirMemoryMapAreasTooHigh
           annotations:
@@ -1272,6 +1290,21 @@ groups:
             )[5m:1m]) > 0
           for: 5m
           labels:
+            histogram: classic
+            severity: warning
+        - alert: MimirStartingIngesterKafkaDelayGrowing
+          annotations:
+            message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "starting" phase is not reducing consumption lag of write requests read from Kafka.
+            runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkadelaygrowing
+          expr: |
+            deriv((
+                sum by (cluster, namespace, instance) (histogram_sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+                /
+                sum by (cluster, namespace, instance) (histogram_count(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="starting"}[1m])))
+            )[5m:1m]) > 0
+          for: 5m
+          labels:
+            histogram: native
             severity: warning
         - alert: MimirRunningIngesterReceiveDelayTooHigh
           annotations:
@@ -1285,6 +1318,22 @@ groups:
             ) > (2 * 60)
           for: 3m
           labels:
+            histogram: classic
+            severity: critical
+            threshold: very_high_for_short_period
+        - alert: MimirRunningIngesterReceiveDelayTooHigh
+          annotations:
+            message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka.
+            runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
+          expr: |
+            (
+              sum by (cluster, namespace, instance) (histogram_sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+              /
+              sum by (cluster, namespace, instance) (histogram_count(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+            ) > (2 * 60)
+          for: 3m
+          labels:
+            histogram: native
             severity: critical
             threshold: very_high_for_short_period
         - alert: MimirRunningIngesterReceiveDelayTooHigh
@@ -1299,6 +1348,22 @@ groups:
             ) > 30
           for: 15m
           labels:
+            histogram: classic
+            severity: critical
+            threshold: relatively_high_for_long_period
+        - alert: MimirRunningIngesterReceiveDelayTooHigh
+          annotations:
+            message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka.
+            runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh
+          expr: |
+            (
+              sum by (cluster, namespace, instance) (histogram_sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+              /
+              sum by (cluster, namespace, instance) (histogram_count(rate(cortex_ingest_storage_reader_receive_delay_seconds{phase="running"}[1m])))
+            ) > 30
+          for: 15m
+          labels:
+            histogram: native
             severity: critical
             threshold: relatively_high_for_long_period
         - alert: MimirIngesterKafkaProcessingFailed