Skip to content

Commit 307b6ce

Browse files
recsplit: use uint32 for bucketIdx - to reduce amount of intermediate data (#19535)
recsplit currently produces a large number of `etl` files; this PR reduces that amount by about 25%. Not a game-changer, but the PR is simple: the bucket number is small enough to always fit in a u32.
1 parent 093a0c8 commit 307b6ce

File tree

1 file changed

+11
-6
lines changed

1 file changed

+11
-6
lines changed

db/recsplit/recsplit.go

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ type RecSplit struct {
154154
baseDataID uint64 // Minimal app-specific ID of entries of this index - helps app understand what data stored in given shard - persistent field
155155
bucketCount uint64 // Number of buckets
156156
salt uint32 // Murmur3 hash used for converting keys to 64-bit values and assigning to buckets
157-
bucketKeyBuf [16]byte
157+
bucketKeyBuf [12]byte
158158
numBuf [8]byte
159159
collision bool
160160
enums bool // Whether to build two level index with perfect hash table pointing to enumeration and enumeration pointing to offsets
@@ -214,6 +214,9 @@ func NewRecSplit(args RecSplitArgs, logger log.Logger) (*RecSplit, error) {
214214
0x4ef95e25f4b4983d, 0x81175195173b92d3, 0x4e50927d8dd15978, 0x1ea2099d1fafae7f, 0x425c8a06fbaaa815, 0xcd4216006c74052a}
215215
}
216216
bucketCount := (args.KeyCount + args.BucketSize - 1) / args.BucketSize
217+
if bucketCount > math.MaxUint32 {
218+
return nil, fmt.Errorf("recsplit: bucketCount %d exceeds uint32 max (too many keys for bucketSize=%d)", bucketCount, args.BucketSize)
219+
}
217220
rs := &RecSplit{
218221
dataStructureVersion: version.DataStructureVersion(args.Version),
219222
bucketSize: args.BucketSize, keyExpectedCount: uint64(args.KeyCount), bucketCount: uint64(bucketCount),
@@ -469,8 +472,9 @@ func (rs *RecSplit) AddKey(key []byte, offset uint64) error {
469472
return errors.New("cannot add keys after perfect hash function had been built")
470473
}
471474
hi, lo := murmur3.Sum128WithSeed(key, rs.salt)
472-
binary.BigEndian.PutUint64(rs.bucketKeyBuf[:], remap(hi, rs.bucketCount))
473-
binary.BigEndian.PutUint64(rs.bucketKeyBuf[8:], lo)
475+
bucketIdx := uint32(remap(hi, rs.bucketCount))
476+
binary.BigEndian.PutUint32(rs.bucketKeyBuf[:], bucketIdx)
477+
binary.BigEndian.PutUint64(rs.bucketKeyBuf[4:], lo)
474478
binary.BigEndian.PutUint64(rs.numBuf[:], offset)
475479
if offset > rs.maxOffset {
476480
rs.maxOffset = offset
@@ -778,8 +782,9 @@ func recsplit(level int, bucket []uint64, offsets []uint64, unary []uint64, rs *
778782

779783
// loadFuncBucket is required to satisfy the type etl.LoadFunc type, to use with collector.Load
780784
func (rs *RecSplit) loadFuncBucket(k, v []byte, _ etl.CurrentTableReader, _ etl.LoadNextFunc) error {
781-
// k is the BigEndian encoding of the bucket number, and the v is the key that is assigned into that bucket
782-
bucketIdx := binary.BigEndian.Uint64(k)
785+
// k is the BigEndian encoding of the bucket number (4 bytes) + fingerprint (8 bytes),
786+
// and v is the offset/enum value assigned into that bucket
787+
bucketIdx := uint64(binary.BigEndian.Uint32(k))
783788
if rs.currentBucketIdx != bucketIdx {
784789
if rs.currentBucketIdx != math.MaxUint64 {
785790
if err := rs.recsplitCurrentBucket(); err != nil {
@@ -788,7 +793,7 @@ func (rs *RecSplit) loadFuncBucket(k, v []byte, _ etl.CurrentTableReader, _ etl.
788793
}
789794
rs.currentBucketIdx = bucketIdx
790795
}
791-
rs.currentBucket = append(rs.currentBucket, binary.BigEndian.Uint64(k[8:]))
796+
rs.currentBucket = append(rs.currentBucket, binary.BigEndian.Uint64(k[4:]))
792797
rs.currentBucketOffs = append(rs.currentBucketOffs, binary.BigEndian.Uint64(v))
793798
return nil
794799
}

0 commit comments

Comments (0)