Skip to content

Commit 194d1d7

Browse files
committed
executor: use analyze NDV rate option
1 parent c9464b1 commit 194d1d7

8 files changed

Lines changed: 218 additions & 18 deletions

File tree

pkg/executor/analyze_utils_test.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@ package executor
1717
import (
1818
"context"
1919
"fmt"
20+
"math"
2021
"testing"
2122

23+
"github.com/pingcap/tidb/pkg/parser/ast"
2224
"github.com/pingcap/tidb/pkg/planner/core"
2325
"github.com/pingcap/tidb/pkg/statistics"
2426
"github.com/pingcap/tidb/pkg/types"
@@ -72,6 +74,20 @@ func BuildExecutorForTest(ctx context.Context, stmt *ExecStmt) error {
7274
}
7375

7476
func TestEstimateSamplingNDV(t *testing.T) {
77+
t.Run("configured NDV sample rate", func(t *testing.T) {
78+
require.Equal(t, float64(statistics.DefaultNDVSampleRate), configuredAnalyzeNDVSampleRate(nil))
79+
require.Equal(t, 0.25, configuredAnalyzeNDVSampleRate(map[ast.AnalyzeOptionType]uint64{
80+
ast.AnalyzeOptNDVRate: math.Float64bits(0.25),
81+
}))
82+
})
83+
84+
t.Run("effective NDV sample rate clamp", func(t *testing.T) {
85+
// configured >= rowSampleRate: pass through.
86+
require.Equal(t, 0.25, effectiveAnalyzeNDVSampleRate(0.25, 0.1))
87+
// configured < rowSampleRate: clamp up.
88+
require.Equal(t, 0.5, effectiveAnalyzeNDVSampleRate(0.25, 0.5))
89+
})
90+
7591
rootCollector := statistics.NewReservoirRowSampleCollector(1, 2)
7692
rootCollector.Count = 100
7793
rootCollector.FMSketches = append(rootCollector.FMSketches,

pkg/executor/builder.go

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3169,7 +3169,6 @@ func (b *executorBuilder) buildAnalyzeSamplingPushdown(
31693169
modifyCount = int64(val.(int))
31703170
})
31713171
sampleRate := new(float64)
3172-
ndvRate := float64(statistics.DefaultNDVSampleRate)
31733172
var sampleRateReason string
31743173
if opts[ast.AnalyzeOptNumSamples] == 0 {
31753174
*sampleRate = math.Float64frombits(opts[ast.AnalyzeOptSampleRate])
@@ -3195,10 +3194,30 @@ func (b *executorBuilder) buildAnalyzeSamplingPushdown(
31953194
}
31963195
}
31973196
}
3198-
// NDV estimation needs at least as many rows as the row sample to remain statistically
3199-
// useful; align ndvRate up to sampleRate when a smaller value was configured.
3200-
if ndvRate < *sampleRate {
3201-
ndvRate = *sampleRate
3197+
configuredNDVRate := configuredAnalyzeNDVSampleRate(opts)
3198+
ndvRate := effectiveAnalyzeNDVSampleRate(configuredNDVRate, *sampleRate)
3199+
if ndvRate > configuredNDVRate {
3200+
// handleAnalyzeOptions already rejects the case where both samplerate and
3201+
// ndvrate are explicit and ndvrate is smaller, so this only fires when
3202+
// samplerate was auto-adjusted above the user-configured ndvrate.
3203+
if task.PartitionName != "" {
3204+
sc.AppendNote(errors.NewNoStackErrorf(
3205+
`Analyze raised NDV sample rate from %f to %f to match the auto-adjusted row sample rate for table %s.%s's partition %s`,
3206+
configuredNDVRate,
3207+
ndvRate,
3208+
task.DBName,
3209+
task.TableName,
3210+
task.PartitionName,
3211+
))
3212+
} else {
3213+
sc.AppendNote(errors.NewNoStackErrorf(
3214+
`Analyze raised NDV sample rate from %f to %f to match the auto-adjusted row sample rate for table %s.%s`,
3215+
configuredNDVRate,
3216+
ndvRate,
3217+
task.DBName,
3218+
task.TableName,
3219+
))
3220+
}
32023221
}
32033222
job := &statistics.AnalyzeJob{
32043223
DBName: task.DBName,
@@ -3250,6 +3269,28 @@ func (b *executorBuilder) buildAnalyzeSamplingPushdown(
32503269
return &analyzeTask{taskType: colTask, colExec: e, job: job}
32513270
}
32523271

3272+
// configuredAnalyzeNDVSampleRate returns the NDV sample rate explicitly set in
3273+
// the analyze options, or DefaultNDVSampleRate when the option is absent.
3274+
func configuredAnalyzeNDVSampleRate(opts map[ast.AnalyzeOptionType]uint64) float64 {
3275+
if val, ok := opts[ast.AnalyzeOptNDVRate]; ok {
3276+
return math.Float64frombits(val)
3277+
}
3278+
return float64(statistics.DefaultNDVSampleRate)
3279+
}
3280+
3281+
// effectiveAnalyzeNDVSampleRate raises the configured NDV sample rate up to
3282+
// rowSampleRate. TiKV builds singleton sketches at ndvrate first and then
3283+
// draws the row sample from that stream at samplerate, so ndvrate < samplerate
3284+
// is incoherent — the row sample cannot exceed the population it is drawn
3285+
// from. The usual trigger is a small table whose samplerate gets auto-adjusted
3286+
// up to 1.0 (full scan) above a smaller configured ndvrate.
3287+
func effectiveAnalyzeNDVSampleRate(configured, rowSampleRate float64) float64 {
3288+
if configured < rowSampleRate {
3289+
return rowSampleRate
3290+
}
3291+
return configured
3292+
}
3293+
32533294
// getAdjustedSampleRate calculates the sample rate from the table size. If the table size cannot be obtained, 0.001 is used as the default sample rate.
32543295
// From the paper "Random sampling for histogram construction: how much is enough?"'s Corollary 1 to Theorem 5,
32553296
// for a table size n, histogram size k, maximum relative error in bin size f, and error probability gamma,

pkg/executor/test/analyzetest/BUILD.bazel

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ go_test(
99
"main_test.go",
1010
],
1111
flaky = True,
12-
shard_count = 38,
12+
shard_count = 40,
1313
deps = [
1414
"//pkg/config",
1515
"//pkg/config/kerneltype",

pkg/executor/test/analyzetest/analyze_test.go

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,83 @@ func TestSnapshotAnalyzeAndMaxTSAnalyze(t *testing.T) {
455455
}
456456
}
457457

458+
func TestAnalyzeUsesNDVRateOption(t *testing.T) {
459+
store, dom := testkit.CreateMockStoreAndDomain(t)
460+
tk := testkit.NewTestKit(t, store)
461+
462+
tk.MustExec("use test")
463+
tk.MustExec("set @@tidb_analyze_version = 2")
464+
tk.MustExec("create table t_ndv_rate(a int, b int, index idx_b(b))")
465+
// 200 rows: col a is fully unique; col b has 10 distinct values, each repeated 20 times.
466+
values := make([]string, 0, 200)
467+
for i := 1; i <= 200; i++ {
468+
values = append(values, fmt.Sprintf("(%d,%d)", i, i%10))
469+
}
470+
tk.MustExec("insert into t_ndv_rate values " + strings.Join(values, ","))
471+
472+
tbl, err := dom.InfoSchema().TableByName(context.Background(), ast.NewCIStr("test"), ast.NewCIStr("t_ndv_rate"))
473+
require.NoError(t, err)
474+
tblInfo := tbl.Meta()
475+
readNDV := func(histID int64, isIndex int) int64 {
476+
rows := tk.MustQuery(fmt.Sprintf(
477+
"select distinct_count from mysql.stats_histograms where table_id = %d and hist_id = %d and is_index = %d",
478+
tblInfo.ID,
479+
histID,
480+
isIndex,
481+
)).Rows()
482+
require.Len(t, rows, 1)
483+
n, err := strconv.ParseInt(rows[0][0].(string), 10, 64)
484+
require.NoError(t, err)
485+
return n
486+
}
487+
488+
t.Run("exact path with ndvrate=1", func(t *testing.T) {
489+
tk.MustExec("analyze table t_ndv_rate with 1 samplerate, 1 ndvrate")
490+
require.Equal(t, int64(200), readNDV(tblInfo.Columns[0].ID, 0))
491+
require.Equal(t, int64(10), readNDV(tblInfo.Columns[1].ID, 0))
492+
require.Equal(t, int64(10), readNDV(tblInfo.Indices[0].ID, 1))
493+
})
494+
495+
t.Run("singleton-sketch path with ndvrate<1", func(t *testing.T) {
496+
// Pin the unistore analyze RNG seed so the Bernoulli sketch sampling and the
497+
// resulting GEE estimate are deterministic.
498+
require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/pkg/store/mockstore/unistore/cophandler/mockAnalyzeSamplingSeed", "return(1)"))
499+
defer func() {
500+
require.NoError(t, failpoint.Disable("github.com/pingcap/tidb/pkg/store/mockstore/unistore/cophandler/mockAnalyzeSamplingSeed"))
501+
}()
502+
tk.MustExec("analyze table t_ndv_rate with 0.5 samplerate, 0.5 ndvrate")
503+
// True NDVs are 200/10/10. Column `a` lands at 141 (instead of 200) because
504+
// GEE under-estimates NDV on fully-unique columns from a half-sample; this
505+
// confirms the singleton-sketch path is exercised rather than the exact one.
506+
require.Equal(t, int64(141), readNDV(tblInfo.Columns[0].ID, 0))
507+
require.Equal(t, int64(10), readNDV(tblInfo.Columns[1].ID, 0))
508+
require.Equal(t, int64(10), readNDV(tblInfo.Indices[0].ID, 1))
509+
})
510+
}
511+
512+
func TestAnalyzeRaisesNDVRateNote(t *testing.T) {
513+
store, dom := testkit.CreateMockStoreAndDomain(t)
514+
tk := testkit.NewTestKit(t, store)
515+
516+
tk.MustExec("use test")
517+
tk.MustExec("set @@tidb_analyze_version = 2")
518+
tk.MustExec("create table t_ndv_note(a int)")
519+
tk.MustExec("insert into t_ndv_note values (1),(2),(3)")
520+
521+
// Force stats_meta to a tiny count so getAdjustedSampleRate returns 1, which
522+
// will exceed the user's configured ndvrate of 0.5 and trigger the clamp.
523+
statsHandle := dom.StatsHandle()
524+
tbl, err := dom.InfoSchema().TableByName(context.Background(), ast.NewCIStr("test"), ast.NewCIStr("t_ndv_note"))
525+
require.NoError(t, err)
526+
tk.MustExec(fmt.Sprintf("update mysql.stats_meta set count = 3 where table_id = %d", tbl.Meta().ID))
527+
require.NoError(t, statsHandle.Update(context.Background(), dom.InfoSchema()))
528+
529+
tk.MustExec("analyze table t_ndv_note with 0.5 ndvrate")
530+
tk.MustQuery("show warnings").CheckContain(
531+
"Analyze raised NDV sample rate from 0.500000 to 1.000000 to match the auto-adjusted row sample rate for table test.t_ndv_note",
532+
)
533+
}
534+
458535
func TestAdjustSampleRateNote(t *testing.T) {
459536
store := testkit.CreateMockStore(t)
460537
tk := testkit.NewTestKit(t, store)

pkg/planner/core/physical_plan_test.go

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import (
3939
"github.com/pingcap/tidb/pkg/planner/util"
4040
"github.com/pingcap/tidb/pkg/planner/util/coretestsdk"
4141
"github.com/pingcap/tidb/pkg/session"
42+
"github.com/pingcap/tidb/pkg/statistics"
4243
"github.com/pingcap/tidb/pkg/store/mockstore"
4344
"github.com/pingcap/tidb/pkg/testkit"
4445
"github.com/pingcap/tidb/pkg/testkit/external"
@@ -61,10 +62,18 @@ func TestAnalyzeBuildSucc(t *testing.T) {
6162
sql: "analyze table t with 0.1 samplerate",
6263
succ: true,
6364
},
65+
{
66+
sql: "analyze table t with 0.5 ndvrate",
67+
succ: true,
68+
},
6469
{
6570
sql: "analyze table t with 10 samplerate",
6671
succ: false,
6772
},
73+
{
74+
sql: "analyze table t with 2 ndvrate",
75+
succ: false,
76+
},
6877
{
6978
sql: "analyze table t with 0.1 samplerate, 100000 samples",
7079
succ: false,
@@ -101,20 +110,29 @@ func TestAnalyzeSetRate(t *testing.T) {
101110
tk.MustExec("use test")
102111
tk.MustExec("create table t(a int)")
103112
tests := []struct {
104-
sql string
105-
rate float64
113+
sql string
114+
rate float64
115+
ndvRate float64
106116
}{
107117
{
108-
sql: "analyze table t",
109-
rate: -1,
118+
sql: "analyze table t",
119+
rate: -1,
120+
ndvRate: statistics.DefaultNDVSampleRate,
110121
},
111122
{
112-
sql: "analyze table t with 0.1 samplerate",
113-
rate: 0.1,
123+
sql: "analyze table t with 0.1 samplerate",
124+
rate: 0.1,
125+
ndvRate: statistics.DefaultNDVSampleRate,
126+
},
127+
{
128+
sql: "analyze table t with 0.25 ndvrate",
129+
rate: -1,
130+
ndvRate: 0.25,
114131
},
115132
{
116-
sql: "analyze table t with 10000 samples",
117-
rate: -1,
133+
sql: "analyze table t with 10000 samples",
134+
rate: -1,
135+
ndvRate: statistics.DefaultNDVSampleRate,
118136
},
119137
}
120138

@@ -132,6 +150,7 @@ func TestAnalyzeSetRate(t *testing.T) {
132150
require.NoError(t, err, comment)
133151
ana := p.(*core.Analyze)
134152
require.Equal(t, tt.rate, math.Float64frombits(ana.Opts[ast.AnalyzeOptSampleRate]))
153+
require.Equal(t, tt.ndvRate, math.Float64frombits(ana.Opts[ast.AnalyzeOptNDVRate]))
135154
}
136155
}
137156

pkg/planner/core/planbuilder.go

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3111,6 +3111,7 @@ var analyzeOptionLimit = map[ast.AnalyzeOptionType]uint64{
31113111
ast.AnalyzeOptCMSketchDepth: CMSketchSizeLimit,
31123112
ast.AnalyzeOptNumSamples: 5000000,
31133113
ast.AnalyzeOptSampleRate: math.Float64bits(1),
3114+
ast.AnalyzeOptNDVRate: math.Float64bits(statistics.DefaultNDVSampleRate),
31143115
}
31153116

31163117
// TopN reduced from 500 to 100 due to concerns over large number of TopN values collected for customers with many tables.
@@ -3122,6 +3123,7 @@ var analyzeOptionDefaultV2 = map[ast.AnalyzeOptionType]uint64{
31223123
ast.AnalyzeOptCMSketchDepth: 5,
31233124
ast.AnalyzeOptNumSamples: 0,
31243125
ast.AnalyzeOptSampleRate: math.Float64bits(-1),
3126+
ast.AnalyzeOptNDVRate: math.Float64bits(statistics.DefaultNDVSampleRate),
31253127
}
31263128

31273129
// GetAnalyzeOptionDefaultV2ForTest returns the default analyze options for test.
@@ -3133,7 +3135,7 @@ func GetAnalyzeOptionDefaultV2ForTest() map[ast.AnalyzeOptionType]uint64 {
31333135
// explicitly specified in the statement.
31343136
func handleAnalyzeOptions(opts []ast.AnalyzeOpt) (map[ast.AnalyzeOptionType]uint64, error) {
31353137
optMap := make(map[ast.AnalyzeOptionType]uint64, len(analyzeOptionDefaultV2))
3136-
sampleNum, sampleRate := uint64(0), 0.0
3138+
sampleNum, sampleRate, ndvRate := uint64(0), 0.0, 0.0
31373139
for _, opt := range opts {
31383140
datumValue := opt.Value.(*driver.ValueExpr).Datum
31393141
switch opt.Type {
@@ -3143,7 +3145,7 @@ func handleAnalyzeOptions(opts []ast.AnalyzeOpt) (map[ast.AnalyzeOptionType]uint
31433145
return nil, errors.Errorf("Value of analyze option %s should not be larger than %d", ast.AnalyzeOptionString[opt.Type], analyzeOptionLimit[opt.Type])
31443146
}
31453147
optMap[opt.Type] = v
3146-
case ast.AnalyzeOptSampleRate:
3148+
case ast.AnalyzeOptSampleRate, ast.AnalyzeOptNDVRate:
31473149
// Only Int/Float/Decimal values are accepted, so converting with the no-warning statement context here is safe.
31483150
fVal, err := datumValue.ToFloat64(types.DefaultStmtNoWarningContext)
31493151
if err != nil {
@@ -3153,7 +3155,12 @@ func handleAnalyzeOptions(opts []ast.AnalyzeOpt) (map[ast.AnalyzeOptionType]uint
31533155
if fVal <= 0 || fVal > limit {
31543156
return nil, errors.Errorf("Value of analyze option %s should not larger than %f, and should be greater than 0", ast.AnalyzeOptionString[opt.Type], limit)
31553157
}
3156-
sampleRate = fVal
3158+
switch opt.Type {
3159+
case ast.AnalyzeOptSampleRate:
3160+
sampleRate = fVal
3161+
case ast.AnalyzeOptNDVRate:
3162+
ndvRate = fVal
3163+
}
31573164
optMap[opt.Type] = math.Float64bits(fVal)
31583165
default:
31593166
v := datumValue.GetUint64()
@@ -3169,6 +3176,15 @@ func handleAnalyzeOptions(opts []ast.AnalyzeOpt) (map[ast.AnalyzeOptionType]uint
31693176
if sampleNum > 0 && sampleRate > 0 {
31703177
return nil, errors.Errorf("You can only either set the value of the sample num or set the value of the sample rate. Don't set both of them")
31713178
}
3179+
// TiKV builds singleton sketches at ndvrate first and then draws the row
3180+
// sample from that stream at samplerate, so ndvrate < samplerate is
3181+
// incoherent by construction — the row sample cannot exceed the population
3182+
// it is drawn from. Reject the explicit conflict here so the user sees the
3183+
// mistake instead of letting execution-time clamping silently raise the
3184+
// value.
3185+
if ndvRate > 0 && sampleRate > 0 && ndvRate < sampleRate {
3186+
return nil, errors.Errorf("Value of analyze option NDVRATE (%f) must not be smaller than SAMPLERATE (%f)", ndvRate, sampleRate)
3187+
}
31723188

31733189
return optMap, nil
31743190
}

pkg/planner/core/planbuilder_test.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -670,6 +670,16 @@ func TestHandleAnalyzeOptions(t *testing.T) {
670670
},
671671
ExpectedErr: "Value of analyze option SAMPLERATE should not larger than 1.000000, and should be greater than 0",
672672
},
673+
{
674+
name: "Too big NDVRate option",
675+
opts: []ast.AnalyzeOpt{
676+
{
677+
Type: ast.AnalyzeOptNDVRate,
678+
Value: ast.NewValueExpr(2, "", ""),
679+
},
680+
},
681+
ExpectedErr: "Value of analyze option NDVRATE should not larger than 1.000000, and should be greater than 0",
682+
},
673683
{
674684
name: "Too big NumBuckets option",
675685
opts: []ast.AnalyzeOpt{
@@ -694,6 +704,20 @@ func TestHandleAnalyzeOptions(t *testing.T) {
694704
},
695705
ExpectedErr: "ou can only either set the value of the sample num or set the value of the sample rate. Don't set both of them",
696706
},
707+
{
708+
name: "NDVRate below SampleRate",
709+
opts: []ast.AnalyzeOpt{
710+
{
711+
Type: ast.AnalyzeOptSampleRate,
712+
Value: ast.NewValueExpr(0.5, "", ""),
713+
},
714+
{
715+
Type: ast.AnalyzeOptNDVRate,
716+
Value: ast.NewValueExpr(0.1, "", ""),
717+
},
718+
},
719+
ExpectedErr: "Value of analyze option NDVRATE (0.100000) must not be smaller than SAMPLERATE (0.500000)",
720+
},
697721
}
698722

699723
for _, tt := range tests {

0 commit comments

Comments
 (0)