-
Notifications
You must be signed in to change notification settings - Fork 10
labels: improve regex selectivity estimation with sample values #1051
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
96ecdbd
f769137
68ecfbd
7b840db
32e305b
f40a7ba
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -50,13 +50,18 @@ func (m *Matcher) SingleMatchCost() float64 { | |
| panic("labels.Matcher.SingleMatchCost: invalid match type " + m.Type.String() + m.String()) | ||
| } | ||
|
|
||
| // EstimateSelectivity is the estimated fraction of all strings that it would match. | ||
| // EstimateSelectivity returns the estimated fraction of label values this matcher would match. | ||
| // sampleValues is a representative sample of actual label values for this label name. | ||
| // If totalLabelValues is 0, then the selectivity is assumed to be 1.0. | ||
| // For example: | ||
| // * namespace!="" will match all values, so its selectivity is 1; | ||
| // * namespace=~"foo" will match only a single value, so its selectivity across 100 values is 0.01; | ||
| // * namespace=~"foo|bar" will match two values, so its selectivity across 100 values is 0.02. | ||
| func (m *Matcher) EstimateSelectivity(totalLabelValues uint64) float64 { | ||
| // | ||
| // For complex regexes where we can't determine the number of matching values statically, | ||
| // selectivity is computed by testing against sampleValues and cached in the matcher, | ||
| // so subsequent calls reuse the cached value. | ||
| func (m *Matcher) EstimateSelectivity(totalLabelValues uint64, sampleValues []string) float64 { | ||
| if totalLabelValues == 0 { | ||
| return 1.0 | ||
| } | ||
|
|
@@ -74,19 +79,16 @@ func (m *Matcher) EstimateSelectivity(totalLabelValues uint64) float64 { | |
| case MatchRegexp, MatchNotRegexp: | ||
| // If we have optimized set matches, we know exactly how many values we'll match. | ||
| // We assume that all of them will be present in the corpus we're testing against. | ||
| // Note: setMatches selectivity depends on totalLabelValues, so we compute it fresh each time. | ||
| switch setMatchesSize := len(m.re.setMatches); { | ||
| case setMatchesSize > 0: | ||
| selectivity = float64(setMatchesSize) / float64(totalLabelValues) | ||
| case m.Value == "": | ||
| selectivity = 0 | ||
| case m.re.prefix != "": | ||
| // For prefix matches, estimate we'll match ~10% of values. | ||
| selectivity = 0.1 | ||
| case m.Value == ".+" || m.Value == ".*": | ||
| selectivity = 1.0 | ||
| default: | ||
| // For unoptimized regex, assume we'll match ~10% of values | ||
| selectivity = 0.1 | ||
| selectivity = m.estimateComplexRegexSelectivity(sampleValues) | ||
| } | ||
| } | ||
| selectivity = max(0.0, min(selectivity, 1.0)) | ||
|
|
@@ -99,6 +101,41 @@ func (m *Matcher) EstimateSelectivity(totalLabelValues uint64) float64 { | |
| return selectivity | ||
| } | ||
|
|
||
| // estimateComplexRegexSelectivity estimates selectivity for regex matchers that don't have | ||
| // simple optimizations (set matches, empty, or .+/.* patterns). | ||
| // Uses sample values when available, otherwise falls back to a 10% heuristic. | ||
| func (m *Matcher) estimateComplexRegexSelectivity(sampleValues []string) float64 { | ||
| // Check cache first | ||
| if cached := m.re.estimatedSelectivity.Load(); cached >= 0 { | ||
| return cached | ||
| } | ||
|
|
||
| var selectivity float64 | ||
| if len(sampleValues) > 0 { | ||
| // Use sample values to estimate selectivity | ||
| selectivity = float64(m.matchesN(sampleValues)) / float64(len(sampleValues)) | ||
| } else { | ||
| // No sample values available, use an arbitrary value | ||
| selectivity = 0.1 | ||
| } | ||
|
|
||
| // Cache the computed selectivity | ||
| m.re.estimatedSelectivity.Store(selectivity) | ||
| return selectivity | ||
| } | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Bug: Cached selectivity shared incorrectly across different labels. The … [comment body truncated] Additional Locations (1)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. damn, you're good. and right. this is a problem |
||
|
|
||
| // matchesN counts how many values in the slice this matcher matches. | ||
| // Returns at least 1 to avoid division by zero in selectivity calculations. | ||
| func (m *Matcher) matchesN(values []string) int { | ||
| count := 0 | ||
| for _, v := range values { | ||
| if m.Matches(v) { | ||
| count++ | ||
| } | ||
| } | ||
| return max(1, count) | ||
| } | ||
dimitarvdimitrov marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| func (m *FastRegexMatcher) SingleMatchCost() float64 { | ||
| parsed := m.parsedRe | ||
| if parsed == nil { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.