grafana · dimitarvdimitrov · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025
diff --git a/model/labels/cost.go b/model/labels/cost.go
@@ -50,13 +50,18 @@ func (m *Matcher) SingleMatchCost() float64 {
 	panic("labels.Matcher.SingleMatchCost: invalid match type " + m.Type.String() + m.String())
 }
 
-// EstimateSelectivity is the estimated fraction of all strings that it would match.
+// EstimateSelectivity returns the estimated fraction of label values this matcher would match.
+// sampleValues is a representative sample of actual label values for this label name.
 // If totalLabelValues is 0, then the selectivity is assumed to be 1.0.
 // For example:
 // * namespace!="" will match all values, so its selectivity is 1;
 // * namespace=~"foo" will match only a single value, so its selectivity across 100 values is 0.01;
 // * namespace=~"foo|bar" will match two values, so its selectivity across 100 values is 0.02.
-func (m *Matcher) EstimateSelectivity(totalLabelValues uint64) float64 {
+//
+// For complex regexes where we can't determine the number of matching values statically,
+// selectivity is computed by testing against sampleValues and cached in the matcher,
+// so subsequent calls reuse the cached value.
+func (m *Matcher) EstimateSelectivity(totalLabelValues uint64, sampleValues []string) float64 {
 	if totalLabelValues == 0 {
 		return 1.0
 	}
@@ -74,19 +79,16 @@ func (m *Matcher) EstimateSelectivity(totalLabelValues uint64) float64 {
 	case MatchRegexp, MatchNotRegexp:
 		// If we have optimized set matches, we know exactly how many values we'll match.
 		// We assume that all of them will be present in the corpus we're testing against.
+		// Note: setMatches selectivity depends on totalLabelValues, so we compute it fresh each time.
 		switch setMatchesSize := len(m.re.setMatches); {
 		case setMatchesSize > 0:
 			selectivity = float64(setMatchesSize) / float64(totalLabelValues)
 		case m.Value == "":
 			selectivity = 0
-		case m.re.prefix != "":
-			// For prefix matches, estimate we'll match ~10% of values.
-			selectivity = 0.1
 		case m.Value == ".+" || m.Value == ".*":
 			selectivity = 1.0
 		default:
-			// For unoptimized regex, assume we'll match ~10% of values
-			selectivity = 0.1
+			selectivity = m.estimateComplexRegexSelectivity(sampleValues)
 		}
 	}
 	selectivity = max(0.0, min(selectivity, 1.0))
@@ -99,6 +101,41 @@ func (m *Matcher) EstimateSelectivity(totalLabelValues uint64) float64 {
 	return selectivity
 }
 
+// estimateComplexRegexSelectivity estimates selectivity for regex matchers that don't have
+// simple optimizations (set matches, empty, or .+/.* patterns).
+// Uses sample values when available, otherwise falls back to a 10% heuristic.
+func (m *Matcher) estimateComplexRegexSelectivity(sampleValues []string) float64 {
+	// Check cache first
+	if cached := m.re.estimatedSelectivity.Load(); cached >= 0 {
+		return cached
+	}
+
+	var selectivity float64
+	if len(sampleValues) > 0 {
+		// Use sample values to estimate selectivity
+		selectivity = float64(m.matchesN(sampleValues)) / float64(len(sampleValues))
+	} else {
+		// No sample values available, use an arbitrary value
+		selectivity = 0.1
+	}
+
+	// Cache the computed selectivity
+	m.re.estimatedSelectivity.Store(selectivity)
+	return selectivity
+}
+
+// matchesN reports how many of the given values satisfy this matcher. The result is
+// clamped to a minimum of 1 (even for an empty slice), so a ratio built on it is never 0.
+func (m *Matcher) matchesN(values []string) int {
+	matched := 0
+	for i := range values {
+		if m.Matches(values[i]) {
+			matched++
+		}
+	}
+	return max(matched, 1)
+}
+
 func (m *FastRegexMatcher) SingleMatchCost() float64 {
 	parsed := m.parsedRe
 	if parsed == nil {

diff --git a/model/labels/cost_test.go b/model/labels/cost_test.go
@@ -174,12 +174,71 @@ func TestSelectivity(t *testing.T) {
 		t.Run(fmt.Sprintf("%d series {%s}", tt.numSeries, matcher), func(t *testing.T) {
 			// Tolerate a single value error in 10M values
 			const tolerance = 1e-7
-			actualSelectivity := matcher.EstimateSelectivity(tt.numSeries)
+			actualSelectivity := matcher.EstimateSelectivity(tt.numSeries, nil)
 			require.InDelta(t, tt.selectivity, actualSelectivity, tolerance)
 		})
 	}
 }
 
+// TestSelectivityWithSampleValues checks that regex matchers without a static
+// optimization derive their selectivity from the supplied sample of label values.
+func TestSelectivityWithSampleValues(t *testing.T) {
+	// Ten values, so each matched value contributes 0.1 to the selectivity.
+	samples := []string{
+		"foo_value", "bar_value",
+		"baz_value", "qux_value",
+		"prefix_one", "prefix_two",
+		"prefix_three", "something_else",
+		"another_thing", "final_item",
+	}
+
+	type testCase struct {
+		name      string
+		matchType MatchType
+		label     string
+		pattern   string
+		numSeries uint64
+		want      float64
+	}
+	cases := []testCase{
+		// Regex that matches 3 out of 10 sample values (30%)
+		{name: "prefix match", matchType: MatchRegexp, label: "name", pattern: "prefix_.*", numSeries: 100, want: 0.3},
+		// Regex that matches 1 out of 10 sample values (10%)
+		{name: "single match", matchType: MatchRegexp, label: "name", pattern: "foo_.*", numSeries: 100, want: 0.1},
+		// Regex that matches nothing (but matchesN returns at least 1)
+		{name: "no match", matchType: MatchRegexp, label: "name", pattern: "nomatch_.*", numSeries: 100, want: 0.1},
+		// Not-regex inverts selectivity
+		{name: "not prefix match", matchType: MatchNotRegexp, label: "name", pattern: "prefix_.*", numSeries: 100, want: 0.7},
+	}
+
+	const tolerance = 1e-7
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			m, err := NewMatcher(tc.matchType, tc.label, tc.pattern)
+			require.NoError(t, err)
+
+			got := m.EstimateSelectivity(tc.numSeries, samples)
+			require.InDelta(t, tc.want, got, tolerance)
+		})
+	}
+}
+
+func TestSelectivityCaching(t *testing.T) {
+	matcher, err := NewMatcher(MatchRegexp, "name", "test_.*")
+	require.NoError(t, err)
+
+	sampleValues1 := []string{"test_one", "test_two", "other"}        // 2/3 match
+	sampleValues2 := []string{"test_one", "other", "another", "more"} // 1/4 match
+
+	// First call computes and caches selectivity based on sampleValues1
+	selectivity1 := matcher.EstimateSelectivity(100, sampleValues1)
+	require.InDelta(t, 2.0/3.0, selectivity1, 1e-7)
+
+	// Second call should return cached value, ignoring sampleValues2
+	selectivity2 := matcher.EstimateSelectivity(100, sampleValues2)
+	require.InDelta(t, 2.0/3.0, selectivity2, 1e-7) // Still 2/3, not 1/4
+}
+
 func BenchmarkStringEquality(b *testing.B) {
 	benchmarks := []struct {
 		name     string

diff --git a/model/labels/regexp.go b/model/labels/regexp.go
@@ -24,6 +24,7 @@ import (
 	"github.com/dgraph-io/ristretto"
 	"github.com/grafana/regexp"
 	"github.com/grafana/regexp/syntax"
+	"go.uber.org/atomic"
 	"golang.org/x/text/unicode/norm"
 )
 
@@ -68,6 +69,10 @@ type FastRegexMatcher struct {
 	// matchString is the "compiled" function to run by MatchString().
 	matchString func(string) bool
 	parsedRe    *syntax.Regexp
+
+	// estimatedSelectivity is a cached selectivity value for this regex.
+	// A value < 0 means not yet computed.
+	estimatedSelectivity *atomic.Float64
 }
 
 func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
@@ -90,7 +95,8 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
 
 func newFastRegexMatcherWithoutCache(v string) (*FastRegexMatcher, error) {
 	m := &FastRegexMatcher{
-		reString: v,
+		reString:             v,
+		estimatedSelectivity: atomic.NewFloat64(-1.0), // Not yet computed
 	}
 
 	m.stringMatcher, m.setMatches = optimizeAlternatingLiterals(v)