Skip to content

Commit 1abee28

Browse files
authored
Merge pull request #1176 from cloudflare/source
Include query fragments in alerts/template
2 parents 7651ec9 + 1331c38 commit 1abee28

File tree

9 files changed

+439
-165
lines changed

9 files changed

+439
-165
lines changed

cmd/pint/tests/0076_ci_group_errors.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ rules.yml:29-30 Bug: `summary` annotation is required. (alerts/annotation)
263263
29 | annotations:
264264
30 | instance: 'sum on {{ $labels.instance }} is {{ $value }}'
265265

266-
rules.yml:30 Bug: Template is using `instance` label but the query removes it. (alerts/template)
266+
rules.yml:30 Bug: Template is using `instance` label but the query results won't have this label. Query is using aggregation with `by(foo)`, only labels included inside `by(...)` will be present on the results. (alerts/template)
267267
30 | instance: 'sum on {{ $labels.instance }} is {{ $value }}'
268268

269269
rules.yml:32-33 Bug: `link` annotation is required. (alerts/annotation)

cmd/pint/tests/0087_dedup.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@ level=INFO msg="Finding all rules to check" paths=["rules"]
77
rules/01.yml:5 Warning: Alert query doesn't have any condition, it will always fire if the metric exists. (alerts/comparison)
88
5 | expr: sum(up{job="bar"}) / sum(foo) / sum(bar)
99

10-
rules/01.yml:12 Bug: Template is using `cluster` label but the query removes it. (alerts/template)
10+
rules/01.yml:12 Bug: Template is using `cluster` label but the query results won't have this label. Query is using aggregation that removes all labels. (alerts/template)
1111
12 | summary: "Server {{ $labels.instance }} in cluster {{ $labels.cluster }} has gone down"
1212

13-
rules/01.yml:12 Bug: Template is using `instance` label but the query removes it. (alerts/template)
13+
rules/01.yml:12 Bug: Template is using `instance` label but the query results won't have this label. Query is using aggregation that removes all labels. (alerts/template)
1414
12 | summary: "Server {{ $labels.instance }} in cluster {{ $labels.cluster }} has gone down"
1515

16-
rules/01.yml:13 Bug: Template is using `cluster` label but the query removes it. (alerts/template)
16+
rules/01.yml:13 Bug: Template is using `cluster` label but the query results won't have this label. Query is using aggregation that removes all labels. (alerts/template)
1717
13 | dashboard: "https://grafana.example.com/dashboard?var-cluster={{ $labels.cluster }}&var-instance={{ $labels.cluster }}"
1818

1919
level=INFO msg="Problems found" Bug=3 Warning=1

docs/changelog.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,15 @@
3333
}
3434
```
3535

36+
### Changed
37+
38+
- [alerts/template](checks/alerts/template.md) check was refactored and will now produce more accurate results.
39+
Messages produced by this check might include details of the PromQL query fragment causing the problem
40+
if the query is complex enough.
41+
3642
### Fixed
3743

3844
- Don't try to create GitLab comments on unmodified lines - [#1147](https://github.com/cloudflare/pint/pull/1147).
39-
- [alerts/template](checks/alerts/template.md) check was refactored and will now produce more accurate results.
4045

4146
## v0.67.0
4247

internal/checks/alerts_absent.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ func (c AlertsAbsentCheck) Check(ctx context.Context, _ discovery.Path, rule par
5959
}
6060

6161
var hasAbsent bool
62-
src := utils.LabelsSource(rule.AlertingRule.Expr.Query)
62+
src := utils.LabelsSource(rule.AlertingRule.Expr.Value.Value, rule.AlertingRule.Expr.Query)
6363
for _, s := range append(src.Alternatives, src) {
6464
if s.Operation == "absent" {
6565
hasAbsent = true

internal/checks/alerts_template.go

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ func (c TemplateCheck) Check(ctx context.Context, _ discovery.Path, rule parser.
115115
return nil
116116
}
117117

118-
src := utils.LabelsSource(rule.AlertingRule.Expr.Query)
118+
src := utils.LabelsSource(rule.AlertingRule.Expr.Value.Value, rule.AlertingRule.Expr.Query)
119119
data := promTemplate.AlertTemplateData(map[string]string{}, map[string]string{}, "", promql.Sample{})
120120

121121
if rule.AlertingRule.Labels != nil {
@@ -144,7 +144,7 @@ func (c TemplateCheck) Check(ctx context.Context, _ discovery.Path, rule parser.
144144
})
145145
}
146146

147-
for _, problem := range checkQueryLabels(label.Key.Value, label.Value.Value, src) {
147+
for _, problem := range checkQueryLabels(rule.AlertingRule.Expr.Value.Value, label.Key.Value, label.Value.Value, src) {
148148
problems = append(problems, Problem{
149149
Lines: parser.LineRange{
150150
First: label.Key.Lines.First,
@@ -174,7 +174,7 @@ func (c TemplateCheck) Check(ctx context.Context, _ discovery.Path, rule parser.
174174
})
175175
}
176176

177-
for _, problem := range checkQueryLabels(annotation.Key.Value, annotation.Value.Value, src) {
177+
for _, problem := range checkQueryLabels(rule.AlertingRule.Expr.Value.Value, annotation.Key.Value, annotation.Value.Value, src) {
178178
problems = append(problems, Problem{
179179
Lines: parser.LineRange{
180180
First: annotation.Key.Lines.First,
@@ -436,7 +436,7 @@ func findTemplateVariables(name, text string) (vars [][]string, aliases aliasMap
436436
return vars, aliases, true
437437
}
438438

439-
func checkQueryLabels(labelName, labelValue string, src utils.Source) (problems []exprProblem) {
439+
func checkQueryLabels(query, labelName, labelValue string, src utils.Source) (problems []exprProblem) {
440440
vars, aliases, ok := findTemplateVariables(labelName, labelValue)
441441
if !ok {
442442
return nil
@@ -452,11 +452,11 @@ func checkQueryLabels(labelName, labelValue string, src utils.Source) (problems
452452
}
453453
for _, s := range append(src.Alternatives, src) {
454454
if s.FixedLabels && !slices.Contains(s.IncludedLabels, v[1]) {
455-
problems = append(problems, textForProblem(v[1], "", s, Bug))
455+
problems = append(problems, textForProblem(query, v[1], "", s, Bug))
456456
goto NEXT
457457
}
458458
if slices.Contains(s.ExcludedLabels, v[1]) {
459-
problems = append(problems, textForProblem(v[1], v[1], s, Bug))
459+
problems = append(problems, textForProblem(query, v[1], v[1], s, Bug))
460460
goto NEXT
461461
}
462462
}
@@ -469,7 +469,7 @@ func checkQueryLabels(labelName, labelValue string, src utils.Source) (problems
469469
return problems
470470
}
471471

472-
func textForProblem(label, reasonLabel string, src utils.Source, severity Severity) exprProblem {
472+
func textForProblem(query, label, reasonLabel string, src utils.Source, severity Severity) exprProblem {
473473
switch {
474474
case src.Operation == "absent":
475475
return exprProblem{
@@ -489,23 +489,26 @@ func textForProblem(label, reasonLabel string, src utils.Source, severity Severi
489489
details: TemplateCheckLabelsDetails,
490490
severity: severity,
491491
}
492-
case slices.Contains([]string{
493-
promParser.CardOneToOne.String(),
494-
promParser.CardOneToMany.String(),
495-
promParser.CardManyToMany.String(),
496-
promParser.CardManyToOne.String(),
497-
}, src.Operation):
492+
case src.Operation == promParser.CardOneToOne.String():
498493
return exprProblem{
499494
text: fmt.Sprintf("Template is using `%s` label but the query results won't have this label. %s",
500-
label, src.ExcludeReason[reasonLabel]),
501-
details: TemplateCheckOnDetails,
495+
label, src.ExcludeReason[reasonLabel].Reason),
496+
details: maybeAddQueryFragment(query, src.ExcludeReason[reasonLabel].Fragment, TemplateCheckOnDetails),
502497
severity: severity,
503498
}
504499
default:
505500
return exprProblem{
506-
text: fmt.Sprintf("Template is using `%s` label but the query removes it.", label),
507-
details: TemplateCheckAggregationDetails,
501+
text: fmt.Sprintf("Template is using `%s` label but the query results won't have this label. %s",
502+
label, src.ExcludeReason[reasonLabel].Reason),
503+
details: maybeAddQueryFragment(query, src.ExcludeReason[reasonLabel].Fragment, TemplateCheckAggregationDetails),
508504
severity: severity,
509505
}
510506
}
511507
}
508+
509+
func maybeAddQueryFragment(query, fragment, msg string) string {
510+
if fragment == query {
511+
return msg
512+
}
513+
return fmt.Sprintf("%s\nQuery fragment causing this problem: `%s`.", msg, fragment)
514+
}

internal/checks/alerts_template_test.go

Lines changed: 24 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -294,8 +294,8 @@ func TestTemplateCheck(t *testing.T) {
294294
Last: 4,
295295
},
296296
Reporter: checks.TemplateCheckName,
297-
Text: "Template is using `job` label but the query removes it.",
298-
Details: checks.TemplateCheckAggregationDetails,
297+
Text: "Template is using `job` label but the query results won't have this label. Query is using aggregation that removes all labels.",
298+
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum(foo)`.",
299299
Severity: checks.Bug,
300300
},
301301
}
@@ -314,8 +314,8 @@ func TestTemplateCheck(t *testing.T) {
314314
Last: 4,
315315
},
316316
Reporter: checks.TemplateCheckName,
317-
Text: "Template is using `job` label but the query removes it.",
318-
Details: checks.TemplateCheckAggregationDetails,
317+
Text: "Template is using `job` label but the query results won't have this label. Query is using aggregation that removes all labels.",
318+
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum(foo)`.",
319319
Severity: checks.Bug,
320320
},
321321
}
@@ -334,8 +334,8 @@ func TestTemplateCheck(t *testing.T) {
334334
Last: 4,
335335
},
336336
Reporter: checks.TemplateCheckName,
337-
Text: "Template is using `job` label but the query removes it.",
338-
Details: checks.TemplateCheckAggregationDetails,
337+
Text: "Template is using `job` label but the query results won't have this label. Query is using aggregation with `without(job)`, all labels included inside `without(...)` will be removed from the results.",
338+
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum(foo) without(job)`.",
339339
Severity: checks.Bug,
340340
},
341341
}
@@ -354,8 +354,8 @@ func TestTemplateCheck(t *testing.T) {
354354
Last: 4,
355355
},
356356
Reporter: checks.TemplateCheckName,
357-
Text: "Template is using `job` label but the query removes it.",
358-
Details: checks.TemplateCheckAggregationDetails,
357+
Text: "Template is using `job` label but the query results won't have this label. Query is using aggregation with `without(job)`, all labels included inside `without(...)` will be removed from the results.",
358+
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum(foo) without(job)`.",
359359
Severity: checks.Bug,
360360
},
361361
}
@@ -374,8 +374,8 @@ func TestTemplateCheck(t *testing.T) {
374374
Last: 4,
375375
},
376376
Reporter: checks.TemplateCheckName,
377-
Text: "Template is using `job` label but the query removes it.",
378-
Details: checks.TemplateCheckAggregationDetails,
377+
Text: "Template is using `job` label but the query results won't have this label. Query is using aggregation with `without(job)`, all labels included inside `without(...)` will be removed from the results.",
378+
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum(foo) without(job)`.",
379379
Severity: checks.Bug,
380380
},
381381
}
@@ -394,8 +394,8 @@ func TestTemplateCheck(t *testing.T) {
394394
Last: 4,
395395
},
396396
Reporter: checks.TemplateCheckName,
397-
Text: "Template is using `job` label but the query removes it.",
398-
Details: checks.TemplateCheckAggregationDetails,
397+
Text: "Template is using `job` label but the query results won't have this label. Query is using aggregation that removes all labels.",
398+
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum(bar)`.",
399399
Severity: checks.Bug,
400400
},
401401
}
@@ -414,8 +414,8 @@ func TestTemplateCheck(t *testing.T) {
414414
Last: 4,
415415
},
416416
Reporter: checks.TemplateCheckName,
417-
Text: "Template is using `job` label but the query removes it.",
418-
Details: checks.TemplateCheckAggregationDetails,
417+
Text: "Template is using `job` label but the query results won't have this label. Query is using aggregation with `by(notjob)`, only labels included inside `by(...)` will be present on the results.",
418+
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum(foo) by(notjob)`.",
419419
Severity: checks.Bug,
420420
},
421421
}
@@ -440,8 +440,8 @@ func TestTemplateCheck(t *testing.T) {
440440
Last: 6,
441441
},
442442
Reporter: checks.TemplateCheckName,
443-
Text: "Template is using `ixtance` label but the query removes it.",
444-
Details: checks.TemplateCheckAggregationDetails,
443+
Text: "Template is using `ixtance` label but the query results won't have this label. Query is using aggregation with `by(instance, version)`, only labels included inside `by(...)` will be present on the results.",
444+
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `count(build_info) by (instance, version)`.",
445445
Severity: checks.Bug,
446446
},
447447
}
@@ -1312,14 +1312,12 @@ func TestTemplateCheck(t *testing.T) {
13121312
{
13131313
description: "multiple or",
13141314
content: `
1315-
- alert: Prefix_Advertised_On_Very_Few_Routers
1315+
- alert: Foo
13161316
expr: >
1317-
avg without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case!~".*offpeak.*|.*multicolo.*|.*aggregate.*|.*test.*|.*tier1.*|.*regional.*|.*brat.*|.*utopia.*|.*byoip.*",prefix!~"141.101.112.0/20|190.93.240.0/20"})
1317+
avg without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case!~".*offpeak.*"})
13181318
< 0.5 > 0
13191319
or avg without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*multicolo.*"})
13201320
< 0.4 > 0
1321-
or sum without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*aggregate.*"} OR router_anycast_prefix_enabled{prefix=~"141.101.112.0/20|190.93.240.0/20"})
1322-
< 20 > 0
13231321
or sum without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*offpeak.*"})
13241322
< 8 > 0
13251323
or sum without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*tier1.*"})
@@ -1340,14 +1338,12 @@ func TestTemplateCheck(t *testing.T) {
13401338
{
13411339
description: "multiple or / missing group_left()",
13421340
content: `
1343-
- alert: Prefix_Advertised_On_Very_Few_Routers
1341+
- alert: Foo
13441342
expr: >
1345-
avg without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case!~".*offpeak.*|.*multicolo.*|.*aggregate.*|.*test.*|.*tier1.*|.*regional.*|.*brat.*|.*utopia.*|.*byoip.*",prefix!~"141.101.112.0/20|190.93.240.0/20"})
1343+
avg without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case!~".*offpeak.*"})
13461344
< 0.5 > 0
13471345
or avg without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*multicolo.*"})
13481346
< 0.4 > 0
1349-
or sum without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*aggregate.*"} OR router_anycast_prefix_enabled{prefix=~"141.101.112.0/20|190.93.240.0/20"})
1350-
< 20 > 0
13511347
or sum without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*offpeak.*"})
13521348
< 8 > 0
13531349
or sum without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*tier1.*"})
@@ -1367,12 +1363,12 @@ func TestTemplateCheck(t *testing.T) {
13671363
return []checks.Problem{
13681364
{
13691365
Lines: parser.LineRange{
1370-
First: 21,
1371-
Last: 21,
1366+
First: 19,
1367+
Last: 19,
13721368
},
13731369
Reporter: checks.TemplateCheckName,
1374-
Text: "Template is using `prefix` label but the query removes it.",
1375-
Details: checks.TemplateCheckAggregationDetails,
1370+
Text: "Template is using `prefix` label but the query results won't have this label. Query is using one-to-one vector matching with `on()`, only labels included inside `on(...)` will be present on the results.",
1371+
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~\".*tier1.*\"}) < on() count(colo_router_tier:disabled_pops:max{tier=\"1\",router=~\"edge.*\"}) * 0.4`.",
13761372
Severity: checks.Bug,
13771373
},
13781374
}

internal/checks/promql_fragile.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ func (c FragileCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rul
6464
}
6565

6666
if rule.AlertingRule != nil {
67-
for _, problem := range c.checkSampling(expr.Query) {
67+
for _, problem := range c.checkSampling(expr.Value.Value, expr.Query) {
6868
problems = append(problems, Problem{
6969
Lines: expr.Value.Lines,
7070
Reporter: c.Reporter(),
@@ -126,8 +126,8 @@ NEXT:
126126
return problems
127127
}
128128

129-
func (c FragileCheck) checkSampling(node *parser.PromQLNode) (problems []exprProblem) {
130-
s := utils.LabelsSource(node)
129+
func (c FragileCheck) checkSampling(expr string, node *parser.PromQLNode) (problems []exprProblem) {
130+
s := utils.LabelsSource(expr, node)
131131
for _, src := range append(s.Alternatives, s) {
132132
if src.Type != utils.AggregateSource {
133133
continue

0 commit comments

Comments
 (0)