grafana · dimitarvdimitrov · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025
@@ -11354,6 +11354,39 @@
                   "fieldType": "int",
                   "fieldCategory": "advanced"
                 },
+                {
+                  "kind": "field",
+                  "name": "sample_values_probability",
+                  "required": false,
+                  "desc": "Probability of sampling a label value for regex selectivity estimation (0.0-1.0). Sampled values are held in memory in addition to series. Set to 0 to disable; regexes will be estimated to select 10% of values.",
+                  "fieldValue": null,
+                  "fieldDefaultValue": 0.01,
+                  "fieldFlag": "blocks-storage.tsdb.index-lookup-planning.sample-values-probability",
+                  "fieldType": "float",
+                  "fieldCategory": "advanced"
+                },
+                {
+                  "kind": "field",
+                  "name": "sample_values_max_count",
+                  "required": false,
+                  "desc": "Maximum number of sampled values to store per label name. Set to 0 to disable; regexes will be estimated to select 10% of values.",
+                  "fieldValue": null,
+                  "fieldDefaultValue": 1024,
+                  "fieldFlag": "blocks-storage.tsdb.index-lookup-planning.sample-values-max-count",
+                  "fieldType": "int",
+                  "fieldCategory": "advanced"
+                },
+                {
+                  "kind": "field",
+                  "name": "sample_values_max_bytes",
+                  "required": false,
+                  "desc": "Maximum total size in bytes of sampled values per label name. Set to 0 to disable; regexes will be estimated to select 10% of values.",
+                  "fieldValue": null,
+                  "fieldDefaultValue": 65536,
+                  "fieldFlag": "blocks-storage.tsdb.index-lookup-planning.sample-values-max-bytes",
+                  "fieldType": "int",
+                  "fieldCategory": "advanced"
+                },
                 {
                   "kind": "field",
                   "name": "index_lookup_planning_enabled",

@@ -829,6 +829,12 @@ Usage of ./cmd/mimir/mimir:
     	Cost for retrieving the posting list from disk or from memory. (default 10)
   -blocks-storage.tsdb.index-lookup-planning.retrieved-series-cost float
     	Cost for retrieving series from the index and checking if a series belongs to the query's shard. (default 15)
+  -blocks-storage.tsdb.index-lookup-planning.sample-values-max-bytes int
+    	Maximum total size in bytes of sampled values per label name. Set to 0 to disable; regexes will be estimated to select 10% of values. (default 65536)
+  -blocks-storage.tsdb.index-lookup-planning.sample-values-max-count int
+    	Maximum number of sampled values to store per label name. Set to 0 to disable; regexes will be estimated to select 10% of values. (default 1024)
+  -blocks-storage.tsdb.index-lookup-planning.sample-values-probability float
+    	Probability of sampling a label value for regex selectivity estimation (0.0-1.0). Sampled values are held in memory in addition to series. Set to 0 to disable; regexes will be estimated to select 10% of values. (default 0.01)
   -blocks-storage.tsdb.index-lookup-planning.statistics-collection-frequency duration
     	[experimental] How frequently to collect block statistics, which are used in query execution optimization. 0 to disable. (default 1h0m0s)
   -blocks-storage.tsdb.memory-snapshot-on-shutdown

@@ -5257,6 +5257,23 @@ tsdb:
     # CLI flag: -blocks-storage.tsdb.index-lookup-planning.label-cardinality-for-smaller-sketch
     [label_cardinality_for_smaller_sketch: <int> | default = 1000]
 
+    # (advanced) Probability of sampling a label value for regex selectivity
+    # estimation (0.0-1.0). Sampled values are held in memory in addition to
+    # series. Set to 0 to disable; regexes will be estimated to select 10% of
+    # values.
+    # CLI flag: -blocks-storage.tsdb.index-lookup-planning.sample-values-probability
+    [sample_values_probability: <float> | default = 0.01]
+
+    # (advanced) Maximum number of sampled values to store per label name. Set
+    # to 0 to disable; regexes will be estimated to select 10% of values.
+    # CLI flag: -blocks-storage.tsdb.index-lookup-planning.sample-values-max-count
+    [sample_values_max_count: <int> | default = 1024]
+
+    # (advanced) Maximum total size in bytes of sampled values per label name.
+    # Set to 0 to disable; regexes will be estimated to select 10% of values.
+    # CLI flag: -blocks-storage.tsdb.index-lookup-planning.sample-values-max-bytes
+    [sample_values_max_bytes: <int> | default = 65536]
+
     # (experimental) Controls the collection of statistics and whether to defer
     # some vector selector matchers to sequential scans. This leads to better
     # performance.

@@ -801,6 +801,9 @@
   "blocks-storage.tsdb.index-lookup-planning.min-series-per-block-for-query-planning": 10000,
   "blocks-storage.tsdb.index-lookup-planning.label-cardinality-for-larger-sketch": 1000000,
   "blocks-storage.tsdb.index-lookup-planning.label-cardinality-for-smaller-sketch": 1000,
+  "blocks-storage.tsdb.index-lookup-planning.sample-values-probability": 0.01,
+  "blocks-storage.tsdb.index-lookup-planning.sample-values-max-count": 1024,
+  "blocks-storage.tsdb.index-lookup-planning.sample-values-max-bytes": 65536,
   "blocks-storage.tsdb.index-lookup-planning.comparison-portion": 0,
   "blocks-storage.tsdb.index-lookup-planning.statistics-collection-frequency": 3600000000000,
   "compactor.block-ranges": [],

@@ -9,7 +9,6 @@ import (
 	"math"
 
 	"github.com/prometheus/prometheus/model/labels"
-	"github.com/prometheus/prometheus/tsdb/index"
 
 	"github.com/grafana/mimir/pkg/storage/sharding"
 )
@@ -64,7 +63,7 @@ func (pq plans) Iterator() iter.Seq[plan] {
 
 // generatePlansBranchAndBound uses branch-and-bound to explore the space of possible plans.
 // It prunes branches that cannot possibly lead to a better plan than the current best.
-func (p CostBasedPlanner) generatePlansBranchAndBound(ctx context.Context, statistics index.Statistics, matchers []*labels.Matcher, pools *costBasedPlannerPools, shard *sharding.ShardSelector) iter.Seq[plan] {
+func (p CostBasedPlanner) generatePlansBranchAndBound(ctx context.Context, statistics Statistics, matchers []*labels.Matcher, pools *costBasedPlannerPools, shard *sharding.ShardSelector) iter.Seq[plan] {
 	// Initialize priority queue with the root plan (all predicates undecided)
 	prospectPlans := pools.GetPlans(maxPlansForPlanning)
 

@@ -14,13 +14,19 @@ const (
 	DefaultMinSeriesPerBlockForQueryPlanning = 10_000
 	DefaultLabelCardinalityForLargerSketch   = 1e6
 	DefaultLabelCardinalityForSmallerSketch  = 1e3
+	DefaultSampleValuesProbability           = 0.01 // 1% sample rate
+	DefaultSampleValuesMaxCount              = 1024
+	DefaultSampleValuesMaxBytes              = 64 * 1024 // 64KB
 )
 
 var defaultCostConfig = CostConfig{
 	RetrievedPostingCost:              DefaultRetrievedPostingCost,
 	RetrievedSeriesCost:               DefaultRetrievedSeriesCost,
 	RetrievedPostingListCost:          DefaultRetrievedPostingListCost,
 	MinSeriesPerBlockForQueryPlanning: DefaultMinSeriesPerBlockForQueryPlanning,
+	SampleValuesProbability:           DefaultSampleValuesProbability,
+	SampleValuesMaxCount:              DefaultSampleValuesMaxCount,
+	SampleValuesMaxBytes:              DefaultSampleValuesMaxBytes,
 }
 
 type CostConfig struct {
@@ -43,6 +49,19 @@ type CostConfig struct {
 
 	// LabelCardinalityForSmallerSketch is the number of series with a label for that label name to be allocated a smaller count-min sketch.
 	LabelCardinalityForSmallerSketch uint64 `yaml:"label_cardinality_for_smaller_sketch" category:"advanced"`
+
+	// SampleValuesProbability is the probability of sampling a label value for selectivity estimation.
+	// Sampled values are held in memory in addition to the in-memory storage of series.
+	// Set to 0 to disable sampling; regular expressions will be estimated to select 10% of all values of a label.
+	SampleValuesProbability float64 `yaml:"sample_values_probability" category:"advanced"`
+
+	// SampleValuesMaxCount is the maximum number of sampled values to store per label name.
+	// Set to 0 to disable sampling; regular expressions will be estimated to select 10% of all values of a label.
+	SampleValuesMaxCount int `yaml:"sample_values_max_count" category:"advanced"`
+
+	// SampleValuesMaxBytes is the maximum total size in bytes of sampled values per label name.
+	// Set to 0 to disable sampling; regular expressions will be estimated to select 10% of all values of a label.
+	SampleValuesMaxBytes int `yaml:"sample_values_max_bytes" category:"advanced"`
 }
 
 func (cfg *CostConfig) RegisterFlags(f *flag.FlagSet, prefix string) {
@@ -52,6 +71,9 @@ func (cfg *CostConfig) RegisterFlags(f *flag.FlagSet, prefix string) {
 	f.Uint64Var(&cfg.MinSeriesPerBlockForQueryPlanning, prefix+"min-series-per-block-for-query-planning", DefaultMinSeriesPerBlockForQueryPlanning, "Minimum number of series a block must have for query planning to be used.")
 	f.Uint64Var(&cfg.LabelCardinalityForLargerSketch, prefix+"label-cardinality-for-larger-sketch", DefaultLabelCardinalityForLargerSketch, "Number of series for a label name above which larger count-min sketches are used for that label.")
 	f.Uint64Var(&cfg.LabelCardinalityForSmallerSketch, prefix+"label-cardinality-for-smaller-sketch", DefaultLabelCardinalityForSmallerSketch, "Number of series for a label name above which smaller count-min sketches are used for that label.")
+	f.Float64Var(&cfg.SampleValuesProbability, prefix+"sample-values-probability", DefaultSampleValuesProbability, "Probability of sampling a label value for regex selectivity estimation (0.0-1.0). Sampled values are held in memory in addition to series. Set to 0 to disable; regexes will be estimated to select 10% of values.")
+	f.IntVar(&cfg.SampleValuesMaxCount, prefix+"sample-values-max-count", DefaultSampleValuesMaxCount, "Maximum number of sampled values to store per label name. Set to 0 to disable; regexes will be estimated to select 10% of values.")
+	f.IntVar(&cfg.SampleValuesMaxBytes, prefix+"sample-values-max-bytes", DefaultSampleValuesMaxBytes, "Maximum total size in bytes of sampled values per label name. Set to 0 to disable; regexes will be estimated to select 10% of values.")
 }
 
 func (cfg *CostConfig) Validate() error {

@@ -7,7 +7,7 @@ import (
 	"fmt"
 )
 
-// mockStatistics implements the index.Statistics interface with hardcoded data for testing
+// mockStatistics implements the Statistics interface with hardcoded data for testing
 type mockStatistics struct {
 	// seriesPerValue maps label name -> label value -> number of series
 	seriesPerValue map[string]map[string]uint64
@@ -93,6 +93,24 @@ func (m *mockStatistics) LabelValuesCardinality(_ context.Context, name string,
 	return total
 }
 
+// SampleValues returns a DefaultSampleValuesProbability fraction of the values recorded for
+// name (rounded down, at least one), picked in map iteration order; nil if there are none.
+func (m *mockStatistics) SampleValues(_ context.Context, name string) []string {
+	known := m.seriesPerValue[name]
+	if len(known) == 0 {
+		return nil
+	}
+	want := max(1, int(DefaultSampleValuesProbability*float64(len(known))))
+	picked := make([]string, 0, want)
+	for v := range known {
+		if len(picked) == want {
+			break
+		}
+		picked = append(picked, v)
+	}
+	return picked
+}
+
 // newHighCardinalityMockStatistics creates a mockStatistics with higher cardinality
 // to test the planner's behavior with realistic scale data
 func newHighCardinalityMockStatistics() *mockStatistics {

@@ -7,7 +7,6 @@ import (
 	"slices"
 
 	"github.com/prometheus/prometheus/model/labels"
-	"github.com/prometheus/prometheus/tsdb/index"
 	"go.opentelemetry.io/otel/attribute"
 	"go.opentelemetry.io/otel/trace"
 
@@ -38,7 +37,7 @@ type plan struct {
 }
 
 // newScanOnlyPlan returns a plan in which all predicates would be used to scan and none to reach from the index.
-func newScanOnlyPlan(ctx context.Context, stats index.Statistics, config CostConfig, matchers []*labels.Matcher, predicatesPool *pool.SlabPool[bool], shard *sharding.ShardSelector) plan {
+func newScanOnlyPlan(ctx context.Context, stats Statistics, config CostConfig, matchers []*labels.Matcher, predicatesPool *pool.SlabPool[bool], shard *sharding.ShardSelector) plan {
 	p := plan{
 		predicates:          make([]planPredicate, 0, len(matchers)),
 		indexPredicate:      make([]bool, 0, len(matchers)),
@@ -58,7 +57,7 @@ func newScanOnlyPlan(ctx context.Context, stats index.Statistics, config CostCon
 	return p
 }
 
-func newIndexOnlyPlan(ctx context.Context, stats index.Statistics, config CostConfig, matchers []*labels.Matcher, predicatesPool *pool.SlabPool[bool], shard *sharding.ShardSelector) plan {
+func newIndexOnlyPlan(ctx context.Context, stats Statistics, config CostConfig, matchers []*labels.Matcher, predicatesPool *pool.SlabPool[bool], shard *sharding.ShardSelector) plan {
 	p := newScanOnlyPlan(ctx, stats, config, matchers, predicatesPool, shard)
 	for i := range p.indexPredicate {
 		p.indexPredicate[i] = true

@@ -70,11 +70,11 @@ func (p *costBasedPlannerPools) Release() {
 
 type CostBasedPlanner struct {
 	config  CostConfig
-	stats   index.Statistics
+	stats   Statistics
 	metrics Metrics
 }
 
-func NewCostBasedPlanner(metrics Metrics, statistics index.Statistics, config CostConfig) *CostBasedPlanner {
+func NewCostBasedPlanner(metrics Metrics, statistics Statistics, config CostConfig) *CostBasedPlanner {
 	return &CostBasedPlanner{
 		config:  config,
 		metrics: metrics,

@@ -40,7 +40,7 @@ func (p *PlannerFactory) CreatePlanner(meta tsdb.BlockMeta, reader tsdb.IndexRea
 		level.Info(logger).Log("msg", "skipping query planning for small block", "planning_threshold_series", p.config.MinSeriesPerBlockForQueryPlanning)
 		return NoopPlanner{}
 	}
-	stats, err := p.statsGenerator.Stats(meta, reader, p.config.LabelCardinalityForSmallerSketch, p.config.LabelCardinalityForLargerSketch)
+	stats, err := p.statsGenerator.Stats(meta, reader, p.config)
 	if err != nil {
 		level.Warn(logger).Log("msg", "failed to generate statistics; queries for this block won't use query planning", "err", err)
 		return NoopPlanner{}

@@ -6,7 +6,6 @@ import (
 	"context"
 
 	"github.com/prometheus/prometheus/model/labels"
-	"github.com/prometheus/prometheus/tsdb/index"
 )
 
 type planPredicate struct {
@@ -25,7 +24,7 @@ type planPredicate struct {
 	indexScanCost float64
 }
 
-func newPlanPredicate(ctx context.Context, m *labels.Matcher, stats index.Statistics, config CostConfig) planPredicate {
+func newPlanPredicate(ctx context.Context, m *labels.Matcher, stats Statistics, config CostConfig) planPredicate {
 	pred := planPredicate{
 		matcher:         m,
 		singleMatchCost: m.SingleMatchCost(),
@@ -66,7 +65,7 @@ func estimatePredicateIndexScanCost(pred planPredicate, m *labels.Matcher) float
 	panic("estimatePredicateIndexScanCost called with unhandled matcher type: " + m.Type.String() + m.String())
 }
 
-func estimatePredicateCardinality(ctx context.Context, m *labels.Matcher, stats index.Statistics, selectivity float64) uint64 {
+func estimatePredicateCardinality(ctx context.Context, m *labels.Matcher, stats Statistics, selectivity float64) uint64 {
 	switch m.Type {
 	case labels.MatchEqual:
 		return estimateEqualMatcherCardinality(ctx, m, stats)
@@ -81,23 +80,23 @@ func estimatePredicateCardinality(ctx context.Context, m *labels.Matcher, stats
 	}
 }
 
-func estimateEqualMatcherCardinality(ctx context.Context, m *labels.Matcher, stats index.Statistics) uint64 {
+func estimateEqualMatcherCardinality(ctx context.Context, m *labels.Matcher, stats Statistics) uint64 {
 	if m.Matches("") { // foo=""
 		return numSeriesWithoutLabel(ctx, m.Name, stats)
 	}
 	// foo="bar"
 	return stats.LabelValuesCardinality(ctx, m.Name, m.Value)
 }
 
-func estimateNotEqualMatcherCardinality(ctx context.Context, m *labels.Matcher, stats index.Statistics) uint64 {
+func estimateNotEqualMatcherCardinality(ctx context.Context, m *labels.Matcher, stats Statistics) uint64 {
 	if m.Value == "" { // foo!=""
 		return stats.LabelValuesCardinality(ctx, m.Name)
 	}
 	// foo!="bar" matches all series except those with foo="bar"
 	return stats.TotalSeries() - stats.LabelValuesCardinality(ctx, m.Name, m.Value)
 }
 
-func estimateRegexMatcherCardinality(ctx context.Context, m *labels.Matcher, stats index.Statistics, selectivity float64) uint64 {
+func estimateRegexMatcherCardinality(ctx context.Context, m *labels.Matcher, stats Statistics, selectivity float64) uint64 {
 	var matchedSeries uint64
 	if setMatches := m.SetMatches(); len(setMatches) > 0 { // foo=~"bar|baz", foo=~"|bar"
 		matchedSeries = stats.LabelValuesCardinality(ctx, m.Name, setMatches...)
@@ -113,7 +112,7 @@ func estimateRegexMatcherCardinality(ctx context.Context, m *labels.Matcher, sta
 	return matchedSeries
 }
 
-func estimateNotRegexMatcherCardinality(ctx context.Context, m *labels.Matcher, stats index.Statistics, selectivity float64) uint64 {
+func estimateNotRegexMatcherCardinality(ctx context.Context, m *labels.Matcher, stats Statistics, selectivity float64) uint64 {
 	matchedSeries := uint64(0)
 
 	// Calculate how many series are matched by the regex as if the regex was positive.
@@ -135,7 +134,7 @@ func estimateNotRegexMatcherCardinality(ctx context.Context, m *labels.Matcher,
 	return matchedSeries
 }
 
-func numSeriesWithoutLabel(ctx context.Context, labelName string, stats index.Statistics) uint64 {
+func numSeriesWithoutLabel(ctx context.Context, labelName string, stats Statistics) uint64 {
 	return stats.TotalSeries() - stats.LabelValuesCardinality(ctx, labelName)
 }
 

@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+
+package lookupplan
+
+import "context"
+
+// Statistics exposes the cardinality figures of a TSDB component that query planning relies on.
+type Statistics interface {
+	// TotalSeries reports how many series the TSDB component holds.
+	TotalSeries() uint64
+
+	// LabelValuesCount reports how many values exist for the given label name.
+	// Implementations may return 0 for a label name that does not exist.
+	LabelValuesCount(ctx context.Context, name string) uint64
+
+	// LabelValuesCardinality reports how many series carry the given label name. When values are
+	// passed, the result is the combined cardinality of exactly those values; when none are passed,
+	// it is the total cardinality across every value of the label name.
+	// A label name that does not exist yields 0.
+	LabelValuesCardinality(ctx context.Context, name string, values ...string) uint64
+
+	// SampleValues returns a representative sample of the values of the given label name.
+	// Regex matchers use the sample for selectivity estimation.
+	// A nil result means no sample is available.
+	SampleValues(ctx context.Context, name string) []string
+}