Skip to content

Commit 49c7713

Browse files
authored
executor, statistics: SampleItem refactor (#66336)
close #66335
1 parent 5c19988 commit 49c7713

8 files changed

Lines changed: 41 additions & 81 deletions

File tree

pkg/executor/analyze_col_sampling.go

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -762,9 +762,9 @@ workLoop:
762762
sampleItems := make([]*statistics.SampleItem, 0, sampleNum)
763763
// consume mandatory memory at the beginning, including empty SampleItems of all sample rows, if exceeds, fast fail
764764
// 8 means the pointer size of sampleItems slice.
765-
// types.EmptyDatumSize means the empty datum size we shallow copied from row.Columns to SampleItem.Value.
765+
// statistics.EmptySampleItemSize already accounts for the embedded types.Datum in SampleItem.Value.
766766
// The real underlying byte slice of Datum in row.Columns has already be accounted FromProto().
767-
collectorMemSize := int64(sampleNum) * (8 + statistics.EmptySampleItemSize + types.EmptyDatumSize)
767+
collectorMemSize := int64(sampleNum) * (8 + statistics.EmptySampleItemSize)
768768
e.memTracker.Consume(collectorMemSize)
769769
var collator collate.Collator
770770
ft := e.colsInfo[task.slicePos].FieldType
@@ -789,7 +789,7 @@ workLoop:
789789
consumeBuffered(deltaSize)
790790
}
791791
sampleItems = append(sampleItems, &statistics.SampleItem{
792-
Value: &val,
792+
Value: val,
793793
Ordinal: j,
794794
})
795795
}
@@ -810,8 +810,8 @@ workLoop:
810810
sampleItems := make([]*statistics.SampleItem, 0, sampleNum)
811811
// consume mandatory memory at the beginning, including all SampleItems, if exceeds, fast fail
812812
// 8 is size of reference, 8 is the size of "b := make([]byte, 0, 8)"
813-
// types.EmptyDatumSize: same meaning as above branch.
814-
collectorMemSize := int64(sampleNum) * (8 + statistics.EmptySampleItemSize + 8 + types.EmptyDatumSize)
813+
// statistics.EmptySampleItemSize already accounts for the embedded types.Datum in SampleItem.Value.
814+
collectorMemSize := int64(sampleNum) * (8 + statistics.EmptySampleItemSize + 8)
815815
e.memTracker.Consume(collectorMemSize)
816816
errCtx := e.ctx.GetSessionVars().StmtCtx.ErrCtx()
817817
indexSampleCollectLoop:
@@ -848,9 +848,8 @@ workLoop:
848848
// here we need to account the remaining bytes.
849849
consumeBuffered(int64(cap(b) - 8))
850850
}
851-
tmp := types.NewBytesDatum(b)
852851
sampleItems = append(sampleItems, &statistics.SampleItem{
853-
Value: &tmp,
852+
Value: types.NewBytesDatum(b),
854853
})
855854
}
856855
flushBuffered(&collectorMemSize)

pkg/statistics/builder.go

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ import (
2727
"github.com/pingcap/tidb/pkg/util/codec"
2828
"github.com/pingcap/tidb/pkg/util/collate"
2929
"github.com/pingcap/tidb/pkg/util/generic"
30-
"github.com/pingcap/tidb/pkg/util/intest"
3130
"github.com/pingcap/tidb/pkg/util/memory"
3231
)
3332

@@ -266,7 +265,7 @@ func buildHist(
266265
// In extreme cases, it could be that this value only appears once, and that one row happens to be sampled.
267266
// Therefore, if the sample count of this value is only once, we use a more conservative ndvFactor.
268267
// However, if the calculated ndvFactor is larger than the sampleFactor, we still use the sampleFactor.
269-
hg.AppendBucket(samples[firstSampleIdx].Value, samples[firstSampleIdx].Value, int64(sampleFactor), int64(ndvFactor))
268+
hg.AppendBucket(&samples[firstSampleIdx].Value, &samples[firstSampleIdx].Value, int64(sampleFactor), int64(ndvFactor))
270269
bufferedMemSize := int64(0)
271270
bufferedReleaseSize := int64(0)
272271
defer func() {
@@ -296,7 +295,7 @@ func buildHist(
296295
memTracker.BufferedConsume(&bufferedMemSize, deltaSize)
297296
memTracker.BufferedRelease(&bufferedReleaseSize, deltaSize)
298297
}
299-
cmp, err := upper.Compare(sc.TypeCtx(), samples[i].Value, collate.GetBinaryCollator())
298+
cmp, err := upper.Compare(sc.TypeCtx(), &samples[i].Value, collate.GetBinaryCollator())
300299
if err != nil {
301300
return 0, errors.Trace(err)
302301
}
@@ -321,13 +320,13 @@ func buildHist(
321320
}
322321
} else if totalCount-float64(lastCount) <= valuesPerBucket {
323322
// The bucket still has room to store a new item, update the bucket.
324-
hg.updateLastBucket(samples[i].Value, int64(totalCount), int64(ndvFactor), false)
323+
hg.updateLastBucket(&samples[i].Value, int64(totalCount), int64(ndvFactor), false)
325324
} else {
326325
lastCount = hg.Buckets[bucketIdx].Count
327326
// The bucket is full, store the item in the next bucket.
328327
bucketIdx++
329328
// Refer to the comments for the first bucket for the reason why we use ndvFactor here.
330-
hg.AppendBucket(samples[i].Value, samples[i].Value, int64(totalCount), int64(ndvFactor))
329+
hg.AppendBucket(&samples[i].Value, &samples[i].Value, int64(totalCount), int64(ndvFactor))
331330
}
332331
}
333332
return corrXYSum, nil
@@ -427,11 +426,7 @@ func BuildHistAndTopN(
427426
return cmp.Compare(a.Count, b.Count) // min-heap: smaller counts at root
428427
})
429428

430-
intest.Assert(samples[0].Value != nil, "sample item value should not be nil")
431-
if samples[0].Value == nil {
432-
return nil, nil, errors.Errorf("sample item value is nil")
433-
}
434-
cur, err := getComparedBytes(*samples[0].Value)
429+
cur, err := getComparedBytes(samples[0].Value)
435430
if err != nil {
436431
return nil, nil, errors.Trace(err)
437432
}
@@ -451,11 +446,7 @@ func BuildHistAndTopN(
451446
if numTopN == 0 {
452447
continue
453448
}
454-
intest.Assert(samples[i].Value != nil, "sample item value should not be nil")
455-
if samples[i].Value == nil {
456-
return nil, nil, errors.Errorf("sample item value is nil")
457-
}
458-
sampleBytes, err := getComparedBytes(*samples[i].Value)
449+
sampleBytes, err := getComparedBytes(samples[i].Value)
459450
if err != nil {
460451
return nil, nil, errors.Trace(err)
461452
}

pkg/statistics/builder_test.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -37,31 +37,31 @@ func BenchmarkBuildHistAndTopN(b *testing.B) {
3737
d := types.NewIntDatum(int64(i))
3838
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
3939
require.NoError(b, err)
40-
data = append(data, &SampleItem{Value: &d})
40+
data = append(data, &SampleItem{Value: d})
4141
}
4242
for i := 1; i < 10; i++ {
4343
d := types.NewIntDatum(int64(2))
4444
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
4545
require.NoError(b, err)
46-
data = append(data, &SampleItem{Value: &d})
46+
data = append(data, &SampleItem{Value: d})
4747
}
4848
for i := 1; i < 7; i++ {
4949
d := types.NewIntDatum(int64(4))
5050
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
5151
require.NoError(b, err)
52-
data = append(data, &SampleItem{Value: &d})
52+
data = append(data, &SampleItem{Value: d})
5353
}
5454
for i := 1; i < 5; i++ {
5555
d := types.NewIntDatum(int64(7))
5656
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
5757
require.NoError(b, err)
58-
data = append(data, &SampleItem{Value: &d})
58+
data = append(data, &SampleItem{Value: d})
5959
}
6060
for i := 1; i < 3; i++ {
6161
d := types.NewIntDatum(int64(11))
6262
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
6363
require.NoError(b, err)
64-
data = append(data, &SampleItem{Value: &d})
64+
data = append(data, &SampleItem{Value: d})
6565
}
6666
collector := &SampleCollector{
6767
Samples: data,
@@ -92,29 +92,29 @@ func BenchmarkBuildHistAndTopNWithLowNDV(b *testing.B) {
9292
d := types.NewIntDatum(int64(1000))
9393
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
9494
require.NoError(b, err)
95-
data = append(data, &SampleItem{Value: &d})
95+
data = append(data, &SampleItem{Value: d})
9696
}
9797
for i := 1; i <= 1_000; i++ {
9898
total++
9999
d := types.NewIntDatum(int64(2000))
100100
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
101101
require.NoError(b, err)
102-
data = append(data, &SampleItem{Value: &d})
102+
data = append(data, &SampleItem{Value: d})
103103
}
104104
end := total / 2
105105
for range end {
106106
total++
107107
d := types.NewIntDatum(rand.Int63n(50))
108108
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
109109
require.NoError(b, err)
110-
data = append(data, &SampleItem{Value: &d})
110+
data = append(data, &SampleItem{Value: d})
111111
}
112112
end = cnt - total
113113
for range end {
114114
d := types.NewIntDatum(rand.Int63n(100))
115115
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
116116
require.NoError(b, err)
117-
data = append(data, &SampleItem{Value: &d})
117+
data = append(data, &SampleItem{Value: d})
118118
}
119119
collector := &SampleCollector{
120120
Samples: data,

pkg/statistics/fmsketch_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ import (
2929
func extractSampleItemsDatums(items []*SampleItem) []types.Datum {
3030
datums := make([]types.Datum, len(items))
3131
for i, item := range items {
32-
datums[i] = *item.Value
32+
datums[i] = item.Value
3333
}
3434
return datums
3535
}

pkg/statistics/main_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ func createTestStatisticsSamples(t *testing.T) *testStatisticsSamples {
8888
s.count = 100000
8989
samples := make([]*SampleItem, 10000)
9090
for i := range samples {
91-
samples[i] = &SampleItem{Value: &types.Datum{}}
91+
samples[i] = &SampleItem{Value: types.Datum{}}
9292
}
9393
start := 1000
9494
samples[0].Value.SetInt64(0)

pkg/statistics/sample.go

Lines changed: 9 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ import (
3131
"github.com/pingcap/tidb/pkg/util/chunk"
3232
"github.com/pingcap/tidb/pkg/util/collate"
3333
"github.com/pingcap/tidb/pkg/util/fastrand"
34-
"github.com/pingcap/tidb/pkg/util/intest"
3534
"github.com/pingcap/tidb/pkg/util/sqlexec"
3635
"github.com/pingcap/tipb/go-tipb"
3736
"github.com/twmb/murmur3"
@@ -40,7 +39,7 @@ import (
4039
// SampleItem is an item of sampled column value.
4140
type SampleItem struct {
4241
// Value is the sampled column value.
43-
Value *types.Datum
42+
Value types.Datum
4443
// Handle is the handle of the sample in its key.
4544
// This property is used to calculate Ordinal in fast analyze.
4645
Handle kv.Handle
@@ -49,38 +48,14 @@ type SampleItem struct {
4948
Ordinal int
5049
}
5150

52-
// EmptySampleItemSize is the size of an empty SampleItem on 64-bit platforms: 32 = 8 (Value pointer) + 16 (Handle interface) + 8 (Ordinal int).
51+
// EmptySampleItemSize is the size of empty SampleItem, 96 = 72 (datum) + 8 (int) + 16.
5352
const EmptySampleItemSize = int64(unsafe.Sizeof(SampleItem{}))
5453

55-
// CopySampleItems returns a deep copy of SampleItem slice.
56-
func CopySampleItems(items []*SampleItem) []*SampleItem {
57-
n := make([]*SampleItem, len(items))
58-
for i, item := range items {
59-
ni := &SampleItem{
60-
Handle: item.Handle,
61-
Ordinal: item.Ordinal,
62-
}
63-
if item.Value != nil {
64-
ni.Value = &types.Datum{}
65-
item.Value.Copy(ni.Value)
66-
}
67-
n[i] = ni
68-
}
69-
return n
70-
}
71-
7254
func sortSampleItems(sc *stmtctx.StatementContext, items []*SampleItem) error {
7355
var err error
74-
if intest.InTest {
75-
for _, item := range items {
76-
if item.Value == nil {
77-
return errors.Errorf("sample item value is nil")
78-
}
79-
}
80-
}
8156
slices.SortStableFunc(items, func(i, j *SampleItem) int {
8257
var cmp int
83-
cmp, err = i.Value.Compare(sc.TypeCtx(), j.Value, collate.GetBinaryCollator())
58+
cmp, err = i.Value.Compare(sc.TypeCtx(), &j.Value, collate.GetBinaryCollator())
8459
if err != nil {
8560
return -1
8661
}
@@ -130,11 +105,7 @@ func (c *SampleCollector) MergeSampleCollector(sc *stmtctx.StatementContext, rc
130105
terror.Log(errors.Trace(err))
131106
}
132107
for _, item := range rc.Samples {
133-
if item.Value == nil {
134-
terror.Log(errors.Errorf("sample item value is nil"))
135-
continue
136-
}
137-
err := c.collect(sc, *item.Value)
108+
err := c.collect(sc, item.Value)
138109
terror.Log(errors.Trace(err))
139110
}
140111
}
@@ -173,8 +144,7 @@ func SampleCollectorFromProto(collector *tipb.SampleCollector) *SampleCollector
173144
for _, val := range collector.Samples {
174145
// When store the histogram bucket boundaries to kv, we need to limit the length of the value.
175146
if len(val) <= MaxSampleValueLength {
176-
tmp := types.NewBytesDatum(val)
177-
item := &SampleItem{Value: &tmp}
147+
item := &SampleItem{Value: types.NewBytesDatum(val)}
178148
s.Samples = append(s.Samples, item)
179149
}
180150
}
@@ -202,15 +172,15 @@ func (c *SampleCollector) collect(sc *stmtctx.StatementContext, d types.Datum) e
202172
// to the underlying slice, GC can't free them which lead to memory leak eventually.
203173
// TODO: Refactor the proto to avoid copying here.
204174
if len(c.Samples) < int(c.MaxSampleSize) {
205-
newItem := &SampleItem{Value: &types.Datum{}}
206-
d.Copy(newItem.Value)
175+
newItem := &SampleItem{}
176+
d.Copy(&newItem.Value)
207177
c.Samples = append(c.Samples, newItem)
208178
} else {
209179
shouldAdd := int64(fastrand.Uint64N(uint64(c.seenValues))) < c.MaxSampleSize
210180
if shouldAdd {
211181
idx := int(fastrand.Uint32N(uint32(c.MaxSampleSize)))
212-
newItem := &SampleItem{Value: &types.Datum{}}
213-
d.Copy(newItem.Value)
182+
newItem := &SampleItem{}
183+
d.Copy(&newItem.Value)
214184
// To keep the order of the elements, we use delete and append, not direct replacement.
215185
c.Samples = slices.Delete(c.Samples, idx, idx+1)
216186
c.Samples = append(c.Samples, newItem)

pkg/statistics/sample_test.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -146,31 +146,31 @@ func TestBuildStatsOnRowSample(t *testing.T) {
146146
d := types.NewIntDatum(int64(i))
147147
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
148148
require.NoError(t, err)
149-
data = append(data, &SampleItem{Value: &d})
149+
data = append(data, &SampleItem{Value: d})
150150
}
151151
for i := 1; i < 10; i++ {
152152
d := types.NewIntDatum(int64(2))
153153
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
154154
require.NoError(t, err)
155-
data = append(data, &SampleItem{Value: &d})
155+
data = append(data, &SampleItem{Value: d})
156156
}
157157
for i := 1; i < 7; i++ {
158158
d := types.NewIntDatum(int64(4))
159159
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
160160
require.NoError(t, err)
161-
data = append(data, &SampleItem{Value: &d})
161+
data = append(data, &SampleItem{Value: d})
162162
}
163163
for i := 1; i < 5; i++ {
164164
d := types.NewIntDatum(int64(7))
165165
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
166166
require.NoError(t, err)
167-
data = append(data, &SampleItem{Value: &d})
167+
data = append(data, &SampleItem{Value: d})
168168
}
169169
for i := 1; i < 3; i++ {
170170
d := types.NewIntDatum(int64(11))
171171
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
172172
require.NoError(t, err)
173-
data = append(data, &SampleItem{Value: &d})
173+
data = append(data, &SampleItem{Value: d})
174174
}
175175
collector := &SampleCollector{
176176
Samples: data,
@@ -205,19 +205,19 @@ func TestBuildSampleFullNDV(t *testing.T) {
205205
d := types.NewIntDatum(int64(2))
206206
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
207207
require.NoError(t, err)
208-
data = append(data, &SampleItem{Value: &d})
208+
data = append(data, &SampleItem{Value: d})
209209
}
210210
for i := 1; i < 31; i++ {
211211
d := types.NewIntDatum(int64(4))
212212
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
213213
require.NoError(t, err)
214-
data = append(data, &SampleItem{Value: &d})
214+
data = append(data, &SampleItem{Value: d})
215215
}
216216
for i := 1; i < 25; i++ {
217217
d := types.NewIntDatum(int64(7))
218218
err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
219219
require.NoError(t, err)
220-
data = append(data, &SampleItem{Value: &d})
220+
data = append(data, &SampleItem{Value: d})
221221
}
222222

223223
// Add many more distinct values to the FMSketch to make column NDV > sample NDV

pkg/statistics/statistics_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -629,7 +629,7 @@ func SubTestBuild() func(*testing.T) {
629629

630630
datum := types.Datum{}
631631
datum.SetMysqlJSON(types.BinaryJSON{TypeCode: types.JSONTypeCodeLiteral})
632-
item := &SampleItem{Value: &datum}
632+
item := &SampleItem{Value: datum}
633633
collector = &SampleCollector{
634634
Count: 1,
635635
NullCount: 0,

0 commit comments

Comments
 (0)