Skip to content

[bugfix] fix avg_over_time & compare queries panic with max series #5016

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ configurable via the throughput_bytes_slo field, and it will populate op="traces
* [BUGFIX] Correctly cache frontend jobs for query range (TraceQL Metrics). [#4771](https://github.com/grafana/tempo/pull/4771) (@joe-elliott)
* [BUGFIX] Various edge case fixes for query range (TraceQL Metrics) [#4962](https://github.com/grafana/tempo/pull/4962) (@ruslan-mikhailov)
* [BUGFIX] Fix `TempoBlockListRisingQuickly` alert grouping. [#4876](https://github.com/grafana/tempo/pull/4876) (@mapno)
* [BUGFIX] Fix panic when aggregating avg_over_time metrics with truncated timeseries [#5016](https://github.com/grafana/tempo/pull/5016) (@ie-pham)

# v2.7.2

Expand Down
107 changes: 97 additions & 10 deletions modules/frontend/combiner/metrics_query_range.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,12 @@ func NewQueryRange(req *tempopb.QueryRangeRequest, maxSeriesLimit int) (Combiner
}

sortResponse(resp)
if combiner.MaxSeriesReached() {
// Truncating the final response because even if we bail as soon as len(resp.Series) >= maxSeries
// it's possible that the last response pushed us over the max series limit.
resp.Series = resp.Series[:maxSeries]
truncateResponse(resp, maxSeries, req)

// partial is when the max series is reached either in the querier or generators
// since it might have been truncated - we need to add the warning the it may be inaccurate
// max series reached is when the max series is actually reached at the query-frontend level
if combiner.MaxSeriesReached() || combiner.Partial() {
resp.Status = tempopb.PartialStatus_PARTIAL
resp.Message = maxSeriesReachedErrorMsg
}
Expand All @@ -73,19 +75,15 @@ func NewQueryRange(req *tempopb.QueryRangeRequest, maxSeriesLimit int) (Combiner
}

sortResponse(resp)
if combiner.MaxSeriesReached() {
// Truncating the final response because even if we bail as soon as len(resp.Series) >= maxSeries
// it's possible that the last response pushed us over the max series limit.
resp.Series = resp.Series[:maxSeries]
}
truncateResponse(resp, maxSeries, req)
attachExemplars(req, resp)

// compare with prev resp and only return diffs
diff := diffResponse(prevResp, resp)
// store resp for next diff
prevResp = resp

if combiner.MaxSeriesReached() {
if combiner.Partial() || combiner.MaxSeriesReached() {
diff.Status = tempopb.PartialStatus_PARTIAL
diff.Message = maxSeriesReachedErrorMsg
}
Expand Down Expand Up @@ -233,3 +231,92 @@ func attachExemplars(req *tempopb.QueryRangeRequest, res *tempopb.QueryRangeResp
}
}
}

func truncateResponse(resp *tempopb.QueryRangeResponse, maxSeries int, req *tempopb.QueryRangeRequest) {
if maxSeries == 0 || len(resp.Series) <= maxSeries {
return
}
rootexpr, err := traceql.Parse(req.Query)
if err != nil {
return
}
// if this query is a compare, we need make sure there is one total series per key
// only include if both total series and corresponding count series exist
const (
baselinePrefix = "baseline"
selectionPrefix = "selection"
baselineTotalPrefix = "baseline_total"
selectionTotalPrefix = "selection_total"
metaTypePrefix = "__meta_type"
)
if _, ok := rootexpr.MetricsPipeline.(*traceql.MetricsCompare); ok {
baselineCountMap := make(map[string][]int) // count is one single series for each key/value pair
baselineTotalMap := make(map[string]int) // total is always a single series for each key
selectionCountMap := make(map[string][]int)
selectionTotalMap := make(map[string]int)
results := make([]*tempopb.TimeSeries, maxSeries)
resultsIdx := 0

for i, series := range resp.Series {
for _, label := range series.Labels {
if label.Key != metaTypePrefix {
// record the corresponding index
if strings.Contains(series.PromLabels, baselineTotalPrefix) {
baselineTotalMap[label.Key] = i
continue
}
// if it's not baselineTotal but has baseline it's baselineCount
if strings.Contains(series.PromLabels, baselinePrefix) {
baselineCountMap[label.Key] = append(baselineCountMap[label.Key], i)
continue
}
if strings.Contains(series.PromLabels, selectionTotalPrefix) {
selectionTotalMap[label.Key] = i
continue
}
// if it's not selectionTotal but has selection it's selectionCount
if strings.Contains(series.PromLabels, selectionPrefix) {
selectionCountMap[label.Key] = append(selectionCountMap[label.Key], i)
continue
}
}
}
}

// do baseline first,
// the total is more important so just check total first
for a, i := range baselineTotalMap {
// check if we have a count for this total
if _, ok := baselineCountMap[a]; ok && resultsIdx < maxSeries {
results[resultsIdx] = resp.Series[i]
resultsIdx++
for _, series := range baselineCountMap[a] {
if resultsIdx >= maxSeries {
break
}
results[resultsIdx] = resp.Series[series]
resultsIdx++
}
}
}
// then do selection
for a, i := range selectionTotalMap {
// check if we have a count for this total
if _, ok := selectionCountMap[a]; ok && resultsIdx < maxSeries {
results[resultsIdx] = resp.Series[i]
resultsIdx++
for _, series := range selectionCountMap[a] {
if resultsIdx >= maxSeries {
break
}
results[resultsIdx] = resp.Series[series]
resultsIdx++
}
}
}
resp.Series = results[:resultsIdx]
return
}
// otherwise just truncate
resp.Series = resp.Series[:maxSeries]
}
214 changes: 214 additions & 0 deletions modules/frontend/combiner/metrics_query_range_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,198 @@ func TestQueryRangemaxSeriesShouldQuit(t *testing.T) {
require.True(t, queryRangeCombiner.ShouldQuit())
}

func TestTruncateResponse(t *testing.T) {
type testCase struct {
name string
query string
series []*tempopb.TimeSeries
maxSeries int
expectedSeries []*tempopb.TimeSeries
}

defaultSeries := func(series int) []*tempopb.TimeSeries {
randomValues := []string{"bar", "baz", "bat", "cat", "dog", "cow", "pig", "hen", "boo", "moo"}
seriesList := make([]*tempopb.TimeSeries, series)
for i := 0; i < series; i++ {
seriesList[i] = &tempopb.TimeSeries{
PromLabels: "foo=" + randomValues[i],
Labels: []v1.KeyValue{
{Key: "foo", Value: &v1.AnyValue{Value: &v1.AnyValue_StringValue{StringValue: randomValues[i]}}},
},
}
}
return seriesList
}

tcs := []testCase{
{
name: "Less than max series",
query: "{} | rate()",
series: defaultSeries(3),
maxSeries: 4,
expectedSeries: defaultSeries(3),
},
{
name: "More than max series",
query: "{} | rate()",
series: defaultSeries(10),
maxSeries: 4,
expectedSeries: defaultSeries(4),
},
{
name: "No limit",
query: "{} | rate()",
series: defaultSeries(10),
maxSeries: 0,
expectedSeries: defaultSeries(10),
},
{
name: "Compare less than max series",
query: "{} | compare({status=error})",
series: []*tempopb.TimeSeries{
makeCompareSeries("baseline", "resource.service.name", "odd", false),
makeCompareSeries("baseline_total", "resource.service.name", "<nil>", true),
makeCompareSeries("selection", "resource.service.name", "even", false),
makeCompareSeries("selection_total", "resource.service.name", "<nil>", true),
},
maxSeries: 4,
expectedSeries: []*tempopb.TimeSeries{
makeCompareSeries("baseline", "resource.service.name", "odd", false),
makeCompareSeries("baseline_total", "resource.service.name", "<nil>", true),
makeCompareSeries("selection", "resource.service.name", "even", false),
makeCompareSeries("selection_total", "resource.service.name", "<nil>", true),
},
},
{
name: "Compare more than max series with extra count series same key",
query: "{} | compare({status=error})",
series: []*tempopb.TimeSeries{
makeCompareSeries("baseline", "resource.service.name", "odd", false),
makeCompareSeries("baseline_total", "resource.service.name", "<nil>", true),
makeCompareSeries("selection", "resource.service.name", "even", false),
makeCompareSeries("selection", "resource.service.name", "odd", false), // will be removed after truncate
makeCompareSeries("selection_total", "resource.service.name", "<nil>", true),
},
maxSeries: 4,
expectedSeries: []*tempopb.TimeSeries{
makeCompareSeries("baseline", "resource.service.name", "odd", false),
makeCompareSeries("baseline_total", "resource.service.name", "<nil>", true),
makeCompareSeries("selection", "resource.service.name", "even", false),
makeCompareSeries("selection_total", "resource.service.name", "<nil>", true),
},
},
{
name: "Compare more than max series with extra total baseline series diff key",
query: "{} | compare({status=error})",
series: []*tempopb.TimeSeries{
makeCompareSeries("baseline", "resource.service.name", "odd", false),
makeCompareSeries("baseline_total", "resource.service.name", "<nil>", true),
makeCompareSeries("baseline_total", "namespace", "<nil>", true), // will be removed bc no matching count series
makeCompareSeries("selection", "resource.service.name", "even", false),
makeCompareSeries("selection_total", "resource.service.name", "<nil>", true),
},
maxSeries: 4,
expectedSeries: []*tempopb.TimeSeries{
makeCompareSeries("baseline", "resource.service.name", "odd", false),
makeCompareSeries("baseline_total", "resource.service.name", "<nil>", true),
makeCompareSeries("selection", "resource.service.name", "even", false),
makeCompareSeries("selection_total", "resource.service.name", "<nil>", true),
},
},
{
name: "Compare more than max series with extra total selection series diff key",
query: "{} | compare({status=error})",
series: []*tempopb.TimeSeries{
makeCompareSeries("baseline", "resource.service.name", "odd", false),
makeCompareSeries("baseline_total", "resource.service.name", "<nil>", true),
makeCompareSeries("selection_total", "namespace", "<nil>", true), // will be removed bc no matching count series
makeCompareSeries("selection", "resource.service.name", "even", false),
makeCompareSeries("selection_total", "resource.service.name", "<nil>", true),
},
maxSeries: 4,
expectedSeries: []*tempopb.TimeSeries{
makeCompareSeries("baseline", "resource.service.name", "odd", false),
makeCompareSeries("baseline_total", "resource.service.name", "<nil>", true),
makeCompareSeries("selection", "resource.service.name", "even", false),
makeCompareSeries("selection_total", "resource.service.name", "<nil>", true),
},
},
{
name: "Compare more than max series with extra count selection series diff key",
query: "{} | compare({status=error})",
series: []*tempopb.TimeSeries{
makeCompareSeries("baseline", "resource.service.name", "odd", false),
makeCompareSeries("baseline_total", "resource.service.name", "<nil>", true),
makeCompareSeries("selection", "resource.service.name", "even", false),
makeCompareSeries("selection_total", "resource.service.name", "<nil>", true),
makeCompareSeries("selection", "namespace", "even", false), // will be removed bc no matching total series
},
maxSeries: 4,
expectedSeries: []*tempopb.TimeSeries{
makeCompareSeries("baseline", "resource.service.name", "odd", false),
makeCompareSeries("baseline_total", "resource.service.name", "<nil>", true),
makeCompareSeries("selection", "resource.service.name", "even", false),
makeCompareSeries("selection_total", "resource.service.name", "<nil>", true),
},
},
{
name: "Compare more than max series with extra count baseline series diff key",
query: "{} | compare({status=error})",
series: []*tempopb.TimeSeries{
makeCompareSeries("baseline", "resource.service.name", "odd", false),
makeCompareSeries("baseline_total", "resource.service.name", "<nil>", true),
makeCompareSeries("selection", "resource.service.name", "even", false),
makeCompareSeries("selection_total", "resource.service.name", "<nil>", true),
makeCompareSeries("baseline", "namespace", "even", false), // will be removed bc no matching total series
},
maxSeries: 4,
expectedSeries: []*tempopb.TimeSeries{
makeCompareSeries("baseline", "resource.service.name", "odd", false),
makeCompareSeries("baseline_total", "resource.service.name", "<nil>", true),
makeCompareSeries("selection", "resource.service.name", "even", false),
makeCompareSeries("selection_total", "resource.service.name", "<nil>", true),
},
},
{
name: "Compare more than max series with zero matching series",
query: "{} | compare({status=error})",
series: []*tempopb.TimeSeries{
makeCompareSeries("baseline", "resource.service.name", "odd", false),
makeCompareSeries("baseline_total", "namespace", "<nil>", true),
makeCompareSeries("selection", "resource.service.name", "even", false),
makeCompareSeries("selection_total", "foo", "<nil>", true),
makeCompareSeries("baseline", "evenodd", "even", false),
},
maxSeries: 4,
expectedSeries: []*tempopb.TimeSeries{},
},
}

for _, tc := range tcs {
t.Run(tc.name, func(t *testing.T) {
req := &tempopb.QueryRangeRequest{
Query: tc.query,
Start: 0,
End: 1000,
Step: 1,
MaxSeries: uint32(tc.maxSeries),
}

resp := &tempopb.QueryRangeResponse{
Series: tc.series,
}
expectedResp := &tempopb.QueryRangeResponse{
Series: tc.expectedSeries,
}
truncateResponse(resp, tc.maxSeries, req)

sortResponse(resp)
sortResponse(expectedResp)
require.Equal(t, tc.expectedSeries, resp.Series)
})
}
}

func BenchmarkDiffSeriesAndMarshal(b *testing.B) {
prev, curr := seriesWithTenPercentDiff()

Expand Down Expand Up @@ -703,3 +895,25 @@ func ts(samples []tempopb.Sample, exemplars []tempopb.Exemplar, kvs ...string) *

return ts
}

func makeCompareSeries(meta, key, value string, total bool) *tempopb.TimeSeries {
Labels := []v1.KeyValue{
tempopb.MakeKeyValueString("__meta_type", meta),
tempopb.MakeKeyValueString(key, value),
}

PromLabels := `{__meta_type="` + meta + `", "` + key + `"="` + value + `"}`

if total {
Labels = []v1.KeyValue{
tempopb.MakeKeyValueString("__meta_type", "meta"),
tempopb.MakeKeyValueString(key, "nil"),
}
PromLabels = `{__meta_type="` + meta + `", "` + key + `"="<nil>"}`
}

return &tempopb.TimeSeries{
PromLabels: PromLabels,
Labels: Labels,
}
}
Loading