Skip to content

Commit bc86453

Browse files
authored
Respect base_alerts_range_interval_minutes for all alert rules with short range selectors (#15083)
This PR fixes a handful of alerts with short-duration range selectors that did not use `$.alertRangeInterval` and therefore did not respect `base_alerts_range_interval_minutes`. This is important so that the alerts can be configured to behave correctly when running against clusters where the scrape interval has been increased from the default 15s.
1 parent 7aadeb2 commit bc86453

File tree

3 files changed

+51
-21
lines changed

3 files changed

+51
-21
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,7 @@
340340
* [BUGFIX] Dashboards: Filter out 0s from `max_series` limit on Writes Resources > Ingester > In-memory series panel. #13419
341341
* [BUGFIX] Dashboards: Fix issue where the "Tenant gateway requests" panels on Tenants dashboard would show data from all components. #13940
342342
* [BUGFIX] Dashboards: Fix issue where the MQE-related dashboard panels on the Queries dashboard would show data from both queriers and query-frontends, instead of just queriers. #14029
343+
* [BUGFIX] Alerts: Fix alert definitions with short range vector selectors that did not respect the configured `base_alerts_range_interval_minutes`. #15083
343344

344345
### Jsonnet
345346

operations/mimir-mixin/alerts/alerts.libsonnet

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1116,8 +1116,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
11161116
{
11171117
alert: $.alertName('MemberlistZoneAwareRoutingAutoFailover'),
11181118
expr: |||
1119-
sum by (%(alert_aggregation_labels)s) (rate(memberlist_client_zone_aware_routing_select_nodes_skipped_total[1m])) > 0
1120-
||| % $._config,
1119+
sum by (%(alert_aggregation_labels)s) (rate(memberlist_client_zone_aware_routing_select_nodes_skipped_total[%(range)s])) > 0
1120+
||| % {
1121+
alert_aggregation_labels: $._config.alert_aggregation_labels,
1122+
range: $.alertRangeInterval(1),
1123+
},
11211124
'for': '10m',
11221125
labels: {
11231126
severity: 'warning',

operations/mimir-mixin/alerts/ingest-storage.libsonnet

Lines changed: 45 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
8989
// We aggregate by node_id so that the alert only fires when errors against the same Kafka node keep repeating.
9090
// If the errors are spread across different nodes (e.g. during a rollout), that is expected, and we don't need to trigger the alert.
9191
expr: |||
92-
sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m]))
92+
sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[%(range)s]))
9393
> 0
94-
||| % $._config,
94+
||| % {
95+
alert_aggregation_labels: $._config.alert_aggregation_labels,
96+
per_instance_label: $._config.per_instance_label,
97+
range: $.alertRangeInterval(1),
98+
},
9599
labels: {
96100
severity: 'critical',
97101
},
@@ -133,16 +137,20 @@ local utils = import 'mixin-utils/utils.libsonnet';
133137
(
134138
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (
135139
# This is the old metric name. We're keeping support for backward compatibility.
136-
rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])
140+
rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[%(range)s])
137141
or
138-
rate(cortex_ingest_storage_reader_requests_failed_total{cause="server"}[1m])
142+
rate(cortex_ingest_storage_reader_requests_failed_total{cause="server"}[%(range)s])
139143
) > 0
140144
)
141145
142146
# Tolerate failures during the forced TSDB head compaction, because samples older than the
143147
# new "head min time" will fail to be appended while the forced compaction is running.
144-
unless (max by (%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cortex_ingester_tsdb_forced_compactions_in_progress[1m])) > 0)
145-
||| % $._config,
148+
unless (max by (%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cortex_ingester_tsdb_forced_compactions_in_progress[%(range)s])) > 0)
149+
||| % {
150+
alert_aggregation_labels: $._config.alert_aggregation_labels,
151+
per_instance_label: $._config.per_instance_label,
152+
range: $.alertRangeInterval(1),
153+
},
146154
labels: {
147155
severity: 'critical',
148156
},
@@ -194,8 +202,12 @@ local utils = import 'mixin-utils/utils.libsonnet';
194202
alert: $.alertName('StrongConsistencyEnforcementFailed'),
195203
'for': '5m',
196204
expr: |||
197-
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0
198-
||| % $._config,
205+
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingest_storage_strong_consistency_failures_total[%(range)s])) > 0
206+
||| % {
207+
alert_aggregation_labels: $._config.alert_aggregation_labels,
208+
per_instance_label: $._config.per_instance_label,
209+
range: $.alertRangeInterval(1),
210+
},
199211
labels: {
200212
severity: 'critical',
201213
},
@@ -209,11 +221,14 @@ local utils = import 'mixin-utils/utils.libsonnet';
209221
alert: $.alertName('StrongConsistencyOffsetMissing'),
210222
'for': '5m',
211223
expr: |||
212-
sum by (%(alert_aggregation_labels)s) (rate(cortex_ingest_storage_strong_consistency_requests_total{component="partition-reader", with_offset="false"}[1m]))
224+
sum by (%(alert_aggregation_labels)s) (rate(cortex_ingest_storage_strong_consistency_requests_total{component="partition-reader", with_offset="false"}[%(range)s]))
213225
/
214-
sum by (%(alert_aggregation_labels)s) (rate(cortex_ingest_storage_strong_consistency_requests_total{component="partition-reader"}[1m]))
226+
sum by (%(alert_aggregation_labels)s) (rate(cortex_ingest_storage_strong_consistency_requests_total{component="partition-reader"}[%(range)s]))
215227
* 100 > 5
216-
||| % $._config,
228+
||| % {
229+
alert_aggregation_labels: $._config.alert_aggregation_labels,
230+
range: $.alertRangeInterval(1),
231+
},
217232
labels: {
218233
severity: 'warning',
219234
},
@@ -229,15 +244,19 @@ local utils = import 'mixin-utils/utils.libsonnet';
229244
expr: |||
230245
max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (
231246
# New metric.
232-
max_over_time(cortex_ingest_storage_writer_buffered_produce_bytes_distribution{quantile="1.0"}[1m])
247+
max_over_time(cortex_ingest_storage_writer_buffered_produce_bytes_distribution{quantile="1.0"}[%(range)s])
233248
or
234249
# Old metric.
235-
max_over_time(cortex_ingest_storage_writer_buffered_produce_bytes{quantile="1.0"}[1m])
250+
max_over_time(cortex_ingest_storage_writer_buffered_produce_bytes{quantile="1.0"}[%(range)s])
236251
)
237252
/
238-
min by(%(alert_aggregation_labels)s, %(per_instance_label)s) (min_over_time(cortex_ingest_storage_writer_buffered_produce_bytes_limit[1m]))
253+
min by(%(alert_aggregation_labels)s, %(per_instance_label)s) (min_over_time(cortex_ingest_storage_writer_buffered_produce_bytes_limit[%(range)s]))
239254
* 100 > 50
240-
||| % $._config,
255+
||| % {
256+
alert_aggregation_labels: $._config.alert_aggregation_labels,
257+
per_instance_label: $._config.per_instance_label,
258+
range: $.alertRangeInterval(1),
259+
},
241260
labels: {
242261
severity: 'critical',
243262
},
@@ -265,8 +284,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
265284
{
266285
alert: $.alertName('BlockBuilderCompactAndUploadFailed'),
267286
expr: |||
268-
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0
269-
||| % $._config,
287+
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[%(range)s])) > 0
288+
||| % {
289+
290+
alert_aggregation_labels: $._config.alert_aggregation_labels,
291+
per_instance_label: $._config.per_instance_label,
292+
range: $.alertRangeInterval(1),
293+
},
270294
labels: {
271295
severity: 'critical',
272296
},
@@ -334,8 +358,10 @@ local utils = import 'mixin-utils/utils.libsonnet';
334358
{
335359
alert: $.alertName('BlockBuilderPersistentJobFailure'),
336360
expr: |||
337-
increase(cortex_blockbuilder_scheduler_persistent_job_failures_total[1m]) > 0
338-
||| % $._config,
361+
increase(cortex_blockbuilder_scheduler_persistent_job_failures_total[%(range)s]) > 0
362+
||| % {
363+
range: $.alertRangeInterval(1),
364+
},
339365
labels: {
340366
severity: 'critical',
341367
},

0 commit comments

Comments
 (0)