@@ -154,7 +154,7 @@ type RecSplit struct {
154154 baseDataID uint64 // Minimal app-specific ID of entries of this index - helps app understand what data stored in given shard - persistent field
155155 bucketCount uint64 // Number of buckets
156156 salt uint32 // Murmur3 hash used for converting keys to 64-bit values and assigning to buckets
157- bucketKeyBuf [16 ]byte
157+ bucketKeyBuf [12 ]byte
158158 numBuf [8 ]byte
159159 collision bool
160160 enums bool // Whether to build two level index with perfect hash table pointing to enumeration and enumeration pointing to offsets
@@ -214,6 +214,9 @@ func NewRecSplit(args RecSplitArgs, logger log.Logger) (*RecSplit, error) {
214214 0x4ef95e25f4b4983d , 0x81175195173b92d3 , 0x4e50927d8dd15978 , 0x1ea2099d1fafae7f , 0x425c8a06fbaaa815 , 0xcd4216006c74052a }
215215 }
216216 bucketCount := (args .KeyCount + args .BucketSize - 1 ) / args .BucketSize
217+ if bucketCount > math .MaxUint32 {
218+ return nil , fmt .Errorf ("recsplit: bucketCount %d exceeds uint32 max (too many keys for bucketSize=%d)" , bucketCount , args .BucketSize )
219+ }
217220 rs := & RecSplit {
218221 dataStructureVersion : version .DataStructureVersion (args .Version ),
219222 bucketSize : args .BucketSize , keyExpectedCount : uint64 (args .KeyCount ), bucketCount : uint64 (bucketCount ),
@@ -469,8 +472,9 @@ func (rs *RecSplit) AddKey(key []byte, offset uint64) error {
469472 return errors .New ("cannot add keys after perfect hash function had been built" )
470473 }
471474 hi , lo := murmur3 .Sum128WithSeed (key , rs .salt )
472- binary .BigEndian .PutUint64 (rs .bucketKeyBuf [:], remap (hi , rs .bucketCount ))
473- binary .BigEndian .PutUint64 (rs .bucketKeyBuf [8 :], lo )
475+ bucketIdx := uint32 (remap (hi , rs .bucketCount ))
476+ binary .BigEndian .PutUint32 (rs .bucketKeyBuf [:], bucketIdx )
477+ binary .BigEndian .PutUint64 (rs .bucketKeyBuf [4 :], lo )
474478 binary .BigEndian .PutUint64 (rs .numBuf [:], offset )
475479 if offset > rs .maxOffset {
476480 rs .maxOffset = offset
@@ -778,8 +782,9 @@ func recsplit(level int, bucket []uint64, offsets []uint64, unary []uint64, rs *
778782
779783// loadFuncBucket is required to satisfy the etl.LoadFunc type, for use with collector.Load
780784func (rs * RecSplit ) loadFuncBucket (k , v []byte , _ etl.CurrentTableReader , _ etl.LoadNextFunc ) error {
781- // k is the BigEndian encoding of the bucket number, and the v is the key that is assigned into that bucket
782- bucketIdx := binary .BigEndian .Uint64 (k )
785+ // k is the BigEndian encoding of the bucket number (4 bytes) + fingerprint (8 bytes),
786+ // and v is the offset/enum value assigned into that bucket
787+ bucketIdx := uint64 (binary .BigEndian .Uint32 (k ))
783788 if rs .currentBucketIdx != bucketIdx {
784789 if rs .currentBucketIdx != math .MaxUint64 {
785790 if err := rs .recsplitCurrentBucket (); err != nil {
@@ -788,7 +793,7 @@ func (rs *RecSplit) loadFuncBucket(k, v []byte, _ etl.CurrentTableReader, _ etl.
788793 }
789794 rs .currentBucketIdx = bucketIdx
790795 }
791- rs .currentBucket = append (rs .currentBucket , binary .BigEndian .Uint64 (k [8 :]))
796+ rs .currentBucket = append (rs .currentBucket , binary .BigEndian .Uint64 (k [4 :]))
792797 rs .currentBucketOffs = append (rs .currentBucketOffs , binary .BigEndian .Uint64 (v ))
793798 return nil
794799}
0 commit comments