Skip to content

Commit 9024afa

Browse files
authored
Merge pull request #1138 from cloudflare/fallback
Limit how long we can check other servers
2 parents 87bfd23 + 740164c commit 9024afa

File tree

7 files changed

+142
-10
lines changed

7 files changed

+142
-10
lines changed

docs/changelog.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
# Changelog
22

3-
## v0.65.4
3+
## v0.66.0
4+
5+
### Added
6+
7+
- Added `fallbackTimeout` option to the [promql/series](checks/promql/series.md) check
8+
that controls how much time pint can spend checking other Prometheus servers for missing
9+
metrics.
410

511
### Fixed
612

docs/checks/promql/series.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,8 +141,11 @@ Syntax:
141141

142142
```js
143143
check "promql/series" {
144+
lookbackRange = "7d"
145+
lookbackStep = "5m"
144146
ignoreMetrics = [ "(.*)", ... ]
145147
ignoreLabelsValue = { "...": [ "...", ... ] }
148+
fallbackTimeout = "5m"
146149
}
147150
```
148151

@@ -166,6 +169,13 @@ check "promql/series" {
166169
comments, see below.
167170
The value of this option is a map where the key is a metric selector to match on and the value
168171
is the list of label names.
172+
- `fallbackTimeout` - if a query uses a metric that is missing from a Prometheus server, pint will
173+
check if that metric is present on any other Prometheus server and report any findings.
174+
This option controls how long these extra checks can take if there is a long list of additional
175+
servers to check. pint will abort checking more Prometheus servers when it reaches that time limit.
176+
This is a timeout for the whole operation of checking other Prometheus servers. With the default limit
177+
of 5 minutes, if there are 10 extra Prometheus servers to check and it takes 5 minutes to check the first
178+
4 servers, then pint will abort checking the remaining 6 servers.
169179

170180
Example:
171181

internal/checks/base_test.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,10 +152,11 @@ func runTests(t *testing.T, testCases []checkTest) {
152152
entries, err := parseContent(tc.content)
153153
require.NoError(t, err, "cannot parse rule content")
154154
for _, entry := range entries {
155-
ctx := context.WithValue(context.Background(), promapi.AllPrometheusServers, proms)
155+
ctx := context.Background()
156156
if tc.ctx != nil {
157157
ctx = tc.ctx(uri)
158158
}
159+
ctx = context.WithValue(ctx, promapi.AllPrometheusServers, proms)
159160
problems := tc.checker(prom).Check(ctx, entry.Path, entry.Rule, tc.entries)
160161
require.Equal(t, tc.problems(uri), problems)
161162
}
@@ -468,11 +469,15 @@ func (mr metadataResponse) respond(w http.ResponseWriter, _ *http.Request) {
468469
}
469470

470471
type sleepResponse struct {
472+
resp responseWriter
471473
sleep time.Duration
472474
}
473475

474-
func (sr sleepResponse) respond(_ http.ResponseWriter, _ *http.Request) {
476+
func (sr sleepResponse) respond(w http.ResponseWriter, r *http.Request) {
475477
time.Sleep(sr.sleep)
478+
if sr.resp != nil {
479+
sr.resp.respond(w, r)
480+
}
476481
}
477482

478483
var (

internal/checks/promql_series.go

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,11 @@ type PromqlSeriesSettings struct {
2727
LookbackRange string `hcl:"lookbackRange,optional" json:"lookbackRange,omitempty"`
2828
LookbackStep string `hcl:"lookbackStep,optional" json:"lookbackStep,omitempty"`
2929
IgnoreMetrics []string `hcl:"ignoreMetrics,optional" json:"ignoreMetrics,omitempty"`
30+
FallbackTimeout string `hcl:"fallbackTimeout,optional" json:"fallbackTimeout,omitempty"`
3031
ignoreMetricsRe []*regexp.Regexp
3132
lookbackRangeDuration time.Duration
3233
lookbackStepDuration time.Duration
34+
fallbackTimeout time.Duration
3335
}
3436

3537
func (c *PromqlSeriesSettings) Validate() error {
@@ -59,6 +61,15 @@ func (c *PromqlSeriesSettings) Validate() error {
5961
c.lookbackStepDuration = time.Duration(dur)
6062
}
6163

64+
c.fallbackTimeout = time.Minute * 5
65+
if c.FallbackTimeout != "" {
66+
dur, err := model.ParseDuration(c.FallbackTimeout)
67+
if err != nil {
68+
return err
69+
}
70+
c.fallbackTimeout = time.Duration(dur)
71+
}
72+
6273
for selector := range c.IgnoreLabelsValue {
6374
if _, err := promParser.ParseMetricSelector(selector); err != nil {
6475
return fmt.Errorf("%q is not a valid PromQL metric selector: %w", selector, err)
@@ -300,7 +311,7 @@ func (c SeriesCheck) Check(ctx context.Context, _ discovery.Path, rule parser.Ru
300311
Lines: expr.Value.Lines,
301312
Reporter: c.Reporter(),
302313
Text: text,
303-
Details: c.checkOtherServer(ctx, selector.String()),
314+
Details: c.checkOtherServer(ctx, selector.String(), settings.fallbackTimeout),
304315
Severity: severity,
305316
})
306317
slog.Debug("No historical series for base metric", slog.String("check", c.Reporter()), slog.String("selector", (&bareSelector).String()))
@@ -564,10 +575,15 @@ func (c SeriesCheck) Check(ctx context.Context, _ discovery.Path, rule parser.Ru
564575
return problems
565576
}
566577

567-
func (c SeriesCheck) checkOtherServer(ctx context.Context, query string) string {
578+
func (c SeriesCheck) checkOtherServer(ctx context.Context, query string, timeout time.Duration) string {
568579
var servers []*promapi.FailoverGroup
569580
if val := ctx.Value(promapi.AllPrometheusServers); val != nil {
570-
servers = val.([]*promapi.FailoverGroup)
581+
for _, s := range val.([]*promapi.FailoverGroup) {
582+
if s.Name() == c.prom.Name() {
583+
continue
584+
}
585+
servers = append(servers, s)
586+
}
571587
}
572588

573589
if len(servers) == 0 {
@@ -579,10 +595,31 @@ func (c SeriesCheck) checkOtherServer(ctx context.Context, query string) string
579595
buf.WriteString(query)
580596
buf.WriteString("` was found on other prometheus servers:\n\n")
581597

582-
var matches, skipped int
598+
start := time.Now()
599+
var tested, matches, skipped int
583600
for _, prom := range servers {
584-
slog.Debug("Checking if metric exists on any other Prometheus server", slog.String("check", c.Reporter()), slog.String("selector", query))
585-
601+
if time.Since(start) >= timeout {
602+
slog.Debug("Time limit reached for checking if metric exists on any other Prometheus server",
603+
slog.String("check", c.Reporter()),
604+
slog.String("selector", query),
605+
)
606+
buf.WriteString("\npint tried to check ")
607+
buf.WriteString(strconv.Itoa(len(servers)))
608+
buf.WriteString(" server(s) but stopped after checking ")
609+
buf.WriteString(strconv.Itoa(tested))
610+
buf.WriteString(" server(s) due to reaching time limit (")
611+
buf.WriteString(output.HumanizeDuration(timeout))
612+
buf.WriteString(").\n")
613+
break
614+
}
615+
616+
slog.Debug("Checking if metric exists on any other Prometheus server",
617+
slog.String("check", c.Reporter()),
618+
slog.String("name", prom.Name()),
619+
slog.String("selector", query),
620+
)
621+
622+
tested++
586623
qr, err := prom.Query(ctx, fmt.Sprintf("count(%s)", query))
587624
if err != nil {
588625
continue

internal/checks/promql_series_test.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4070,6 +4070,60 @@ func TestSeriesCheck(t *testing.T) {
40704070
},
40714071
},
40724072
},
4073+
{
4074+
description: "series present on other servers / timeout",
4075+
content: "- record: foo\n expr: notfound\n",
4076+
checker: newSeriesCheck,
4077+
prometheus: newSimpleProm,
4078+
ctx: func(_ string) context.Context {
4079+
s := checks.PromqlSeriesSettings{
4080+
FallbackTimeout: "50ms",
4081+
}
4082+
if err := s.Validate(); err != nil {
4083+
t.Error(err)
4084+
t.FailNow()
4085+
}
4086+
return context.WithValue(context.Background(), checks.SettingsKey(checks.SeriesCheckName), &s)
4087+
},
4088+
otherProms: func(uri string) []*promapi.FailoverGroup {
4089+
var proms []*promapi.FailoverGroup
4090+
for i := range 15 {
4091+
proms = append(proms, simpleProm(fmt.Sprintf("prom%d", i), uri+"/other", time.Second, false))
4092+
}
4093+
return proms
4094+
},
4095+
problems: func(uri string) []checks.Problem {
4096+
return []checks.Problem{
4097+
{
4098+
Lines: parser.LineRange{
4099+
First: 2,
4100+
Last: 2,
4101+
},
4102+
Reporter: checks.SeriesCheckName,
4103+
Text: noMetricText("prom", uri, "notfound", "1w"),
4104+
Details: fmt.Sprintf("`notfound` was found on other prometheus servers:\n\n- [prom0](%s/other/graph?g0.expr=notfound)\n- [prom1](%s/other/graph?g0.expr=notfound)\n- [prom2](%s/other/graph?g0.expr=notfound)\n\npint tried to check 15 server(s) but stopped after checking 3 server(s) due to reaching time limit (50ms).\n\nYou might be trying to deploy this rule to the wrong Prometheus server instance.\n", uri, uri, uri),
4105+
Severity: checks.Bug,
4106+
},
4107+
}
4108+
},
4109+
mocks: []*prometheusMock{
4110+
{
4111+
conds: []requestCondition{requestPathCond{path: "/other/api/v1/query"}},
4112+
resp: sleepResponse{
4113+
sleep: time.Millisecond * 20,
4114+
resp: respondWithSingleInstantVector(),
4115+
},
4116+
},
4117+
{
4118+
conds: []requestCondition{requireQueryPath},
4119+
resp: respondWithEmptyVector(),
4120+
},
4121+
{
4122+
conds: []requestCondition{requireRangeQueryPath},
4123+
resp: respondWithEmptyMatrix(),
4124+
},
4125+
},
4126+
},
40734127
}
40744128
runTests(t, testCases)
40754129
}

internal/config/config_test.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2277,6 +2277,18 @@ func TestConfigErrors(t *testing.T) {
22772277
}`,
22782278
err: `"foo{" is not a valid PromQL metric selector: 1:5: parse error: unexpected end of input inside braces`,
22792279
},
2280+
{
2281+
config: `check "promql/series" { lookbackRange = "1x" }`,
2282+
err: `unknown unit "x" in duration "1x"`,
2283+
},
2284+
{
2285+
config: `check "promql/series" { lookbackStep = "1x" }`,
2286+
err: `unknown unit "x" in duration "1x"`,
2287+
},
2288+
{
2289+
config: `check "promql/series" { fallbackTimeout = "1x" }`,
2290+
err: `unknown unit "x" in duration "1x"`,
2291+
},
22802292
{
22812293
config: `rule {
22822294
link ".+++" {}

internal/promapi/failover.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,15 @@ func (fg *FailoverGroup) Config(ctx context.Context, cacheTTL time.Duration) (cf
199199

200200
func (fg *FailoverGroup) Query(ctx context.Context, expr string) (qr *QueryResult, err error) {
201201
var uri string
202-
for _, prom := range fg.servers {
202+
for try, prom := range fg.servers {
203+
if try > 0 {
204+
slog.Debug(
205+
"Using failover URI",
206+
slog.String("name", fg.name),
207+
slog.Int("retry", try),
208+
slog.String("uri", prom.safeURI),
209+
)
210+
}
203211
uri = prom.safeURI
204212
qr, err = prom.Query(ctx, expr)
205213
if err == nil {

0 commit comments

Comments
 (0)