Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 44 additions & 7 deletions model/labels/cost.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,18 @@ func (m *Matcher) SingleMatchCost() float64 {
panic("labels.Matcher.SingleMatchCost: invalid match type " + m.Type.String() + m.String())
}

// EstimateSelectivity is the estimated fraction of all strings that it would match.
// EstimateSelectivity returns the estimated fraction of label values this matcher would match.
// sampleValues is a representative sample of actual label values for this label name.
// If totalLabelValues is 0, then the selectivity is assumed to be 1.0.
// For example:
// * namespace!="" will match all values, so its selectivity is 1;
// * namespace=~"foo" will match only a single value, so its selectivity across 100 values is 0.01;
// * namespace=~"foo|bar" will match two values, so its selectivity across 100 values is 0.02.
func (m *Matcher) EstimateSelectivity(totalLabelValues uint64) float64 {
//
// For complex regexes where we can't determine the number of matching values statically,
// selectivity is computed by testing against sampleValues and cached in the matcher,
// so subsequent calls reuse the cached value.
func (m *Matcher) EstimateSelectivity(totalLabelValues uint64, sampleValues []string) float64 {
if totalLabelValues == 0 {
return 1.0
}
Expand All @@ -74,19 +79,16 @@ func (m *Matcher) EstimateSelectivity(totalLabelValues uint64) float64 {
case MatchRegexp, MatchNotRegexp:
// If we have optimized set matches, we know exactly how many values we'll match.
// We assume that all of them will be present in the corpus we're testing against.
// Note: setMatches selectivity depends on totalLabelValues, so we compute it fresh each time.
switch setMatchesSize := len(m.re.setMatches); {
case setMatchesSize > 0:
selectivity = float64(setMatchesSize) / float64(totalLabelValues)
case m.Value == "":
selectivity = 0
case m.re.prefix != "":
// For prefix matches, estimate we'll match ~10% of values.
selectivity = 0.1
case m.Value == ".+" || m.Value == ".*":
selectivity = 1.0
default:
// For unoptimized regex, assume we'll match ~10% of values
selectivity = 0.1
selectivity = m.estimateComplexRegexSelectivity(sampleValues)
}
}
selectivity = max(0.0, min(selectivity, 1.0))
Expand All @@ -99,6 +101,41 @@ func (m *Matcher) EstimateSelectivity(totalLabelValues uint64) float64 {
return selectivity
}

// estimateComplexRegexSelectivity estimates selectivity for regex matchers that don't have
// simple optimizations (set matches, empty, or .+/.* patterns).
//
// When sampleValues is non-empty, the estimate is the fraction of samples this matcher
// matches (as counted by matchesN). That measured estimate is cached on m.re, so subsequent
// calls return it without re-running the regex against the samples.
//
// When sampleValues is empty, an arbitrary 10% heuristic is returned. The heuristic is
// deliberately NOT cached: storing it would pin the guess forever and prevent a later call
// that does carry sample values from replacing it with a measured estimate.
//
// NOTE(review): the cache lives on m.re rather than on the Matcher itself. If the same
// FastRegexMatcher instance is shared by matchers for different label names, the cached
// value reflects whichever label's samples were seen first — confirm, and move the cache
// to the Matcher if that is the case.
func (m *Matcher) estimateComplexRegexSelectivity(sampleValues []string) float64 {
	// A negative cached value is the "not yet computed" sentinel.
	if cached := m.re.estimatedSelectivity.Load(); cached >= 0 {
		return cached
	}

	if len(sampleValues) == 0 {
		// No sample values available, use an arbitrary value. Do not cache it, so a
		// later call with real samples can still compute a measured selectivity.
		return 0.1
	}

	// Use sample values to estimate selectivity, then cache the measured result.
	selectivity := float64(m.matchesN(sampleValues)) / float64(len(sampleValues))
	m.re.estimatedSelectivity.Store(selectivity)
	return selectivity
}
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Cached selectivity shared incorrectly across different labels

The estimatedSelectivity is cached on FastRegexMatcher, which is shared via global cache across all Matcher instances using the same regex pattern, regardless of label name. When two matchers like label_a=~"pattern" and label_b=~"pattern" are created, they share the same FastRegexMatcher. If EstimateSelectivity is called for label_a with its specific sample values, that selectivity is cached and incorrectly returned for label_b even though label_b may have completely different value distributions. The cache granularity is wrong - selectivity depends on label-specific sample values but is cached at the regex-pattern level.

Additional Locations (1)

Fix in Cursor Fix in Web

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

damn, you're good. and right. this is a problem


// matchesN reports how many entries of values satisfy m.Matches.
// The result is floored at 1: when nothing matches (or values is empty) it still
// returns 1. The floor exists to avoid division by zero in selectivity calculations.
func (m *Matcher) matchesN(values []string) int {
	matched := 0
	for i := range values {
		if !m.Matches(values[i]) {
			continue
		}
		matched++
	}
	if matched < 1 {
		return 1
	}
	return matched
}

func (m *FastRegexMatcher) SingleMatchCost() float64 {
parsed := m.parsedRe
if parsed == nil {
Expand Down
61 changes: 60 additions & 1 deletion model/labels/cost_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,12 +174,71 @@ func TestSelectivity(t *testing.T) {
t.Run(fmt.Sprintf("%d series {%s}", tt.numSeries, matcher), func(t *testing.T) {
// Tolerate a single value error in 10M values
const tolerance = 1e-7
actualSelectivity := matcher.EstimateSelectivity(tt.numSeries)
actualSelectivity := matcher.EstimateSelectivity(tt.numSeries, nil)
require.InDelta(t, tt.selectivity, actualSelectivity, tolerance)
})
}
}

// TestSelectivityWithSampleValues verifies that EstimateSelectivity, when handed a set of
// sample label values, reports the fraction of those samples matched by a regex matcher
// (and the complement of that fraction for a negated regex matcher).
func TestSelectivityWithSampleValues(t *testing.T) {
	const tolerance = 1e-7

	// Ten samples, so each matching value contributes 0.1 to the expected selectivity.
	samples := []string{
		"foo_value",
		"bar_value",
		"baz_value",
		"qux_value",
		"prefix_one",
		"prefix_two",
		"prefix_three",
		"something_else",
		"another_thing",
		"final_item",
	}

	type testCase struct {
		name      string
		numSeries uint64
		want      float64
		labelName string
		matchType MatchType
		value     string
	}

	cases := []testCase{
		// Regex that matches 3 out of 10 sample values (30%)
		{name: "prefix match", numSeries: 100, want: 0.3, labelName: "name", matchType: MatchRegexp, value: "prefix_.*"},
		// Regex that matches 1 out of 10 sample values (10%)
		{name: "single match", numSeries: 100, want: 0.1, labelName: "name", matchType: MatchRegexp, value: "foo_.*"},
		// Regex that matches nothing (but matchesN returns at least 1)
		{name: "no match", numSeries: 100, want: 0.1, labelName: "name", matchType: MatchRegexp, value: "nomatch_.*"},
		// Not-regex inverts selectivity
		{name: "not prefix match", numSeries: 100, want: 0.7, labelName: "name", matchType: MatchNotRegexp, value: "prefix_.*"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			m, err := NewMatcher(tc.matchType, tc.labelName, tc.value)
			require.NoError(t, err)

			got := m.EstimateSelectivity(tc.numSeries, samples)
			require.InDelta(t, tc.want, got, tolerance)
		})
	}
}

// TestSelectivityCaching pins the caching contract of EstimateSelectivity for complex
// regexes: the estimate computed from the first set of sample values is reused on later
// calls, even when those calls pass a different set of samples.
func TestSelectivityCaching(t *testing.T) {
	const (
		tolerance = 1e-7
		// Two of the three values in the first sample set match "test_.*".
		want = 2.0 / 3.0
	)

	m, err := NewMatcher(MatchRegexp, "name", "test_.*")
	require.NoError(t, err)

	first := []string{"test_one", "test_two", "other"}             // 2/3 match
	second := []string{"test_one", "other", "another", "more"}     // 1/4 match

	// The initial call measures selectivity against the first sample set and caches it.
	require.InDelta(t, want, m.EstimateSelectivity(100, first), tolerance)

	// The follow-up call must return the cached 2/3, not the 1/4 the second set would give.
	require.InDelta(t, want, m.EstimateSelectivity(100, second), tolerance)
}

func BenchmarkStringEquality(b *testing.B) {
benchmarks := []struct {
name string
Expand Down
8 changes: 7 additions & 1 deletion model/labels/regexp.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/dgraph-io/ristretto"
"github.com/grafana/regexp"
"github.com/grafana/regexp/syntax"
"go.uber.org/atomic"
"golang.org/x/text/unicode/norm"
)

Expand Down Expand Up @@ -68,6 +69,10 @@ type FastRegexMatcher struct {
// matchString is the "compiled" function to run by MatchString().
matchString func(string) bool
parsedRe *syntax.Regexp

// estimatedSelectivity is a cached selectivity value for this regex.
// A value < 0 means not yet computed.
estimatedSelectivity *atomic.Float64
}

func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
Expand All @@ -90,7 +95,8 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {

func newFastRegexMatcherWithoutCache(v string) (*FastRegexMatcher, error) {
m := &FastRegexMatcher{
reString: v,
reString: v,
estimatedSelectivity: atomic.NewFloat64(-1.0), // Not yet computed
}

m.stringMatcher, m.setMatches = optimizeAlternatingLiterals(v)
Expand Down
Loading