Skip to content

Commit 450a110

Browse files
Sequence: Builder to avoid intermediate EF representation (#19552)
We know `count` in advance, which means the builder can skip the intermediate EF representation and either:
- write directly to the rebased EF, or
- write directly to SimpleSeq.

```
┌────────┬──────────────────────┬───────────────────────┐
│        │ Builder              │ Merge                 │
├────────┼──────────────────────┼───────────────────────┤
│ time   │ 5.5µs → 1.8µs (−67%) │ 16.7µs → 9.0µs (−46%) │
├────────┼──────────────────────┼───────────────────────┤
│ memory │ 1896B → 696B (−63%)  │ 3117B → 1175B (−63%)  │
├────────┼──────────────────────┼───────────────────────┤
│ allocs │ 9 → 6 (−33%)         │ 10 → 7 (−30%)         │
└────────┴──────────────────────┴───────────────────────┘
```
1 parent 0fc700c commit 450a110

File tree

4 files changed

+280
-46
lines changed

4 files changed

+280
-46
lines changed

db/recsplit/multiencseq/sequence_builder.go

Lines changed: 28 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,10 @@ const SIMPLE_SEQUENCE_MAX_THRESHOLD = 16
2929
//
3030
// This is the "writer" counterpart of SequenceReader.
3131
type SequenceBuilder struct {
32-
baseNum uint64
33-
ef *eliasfano32.EliasFano
32+
baseNum uint64
33+
smallBuf [SIMPLE_SEQUENCE_MAX_THRESHOLD]uint32 // rebased values for simple encoding (count <= 16)
34+
smallCount uint8
35+
rebasedEf *eliasfano32.EliasFano // direct rebased EF for large sequences (count > 16)
3436
}
3537

3638
// Creates a new builder. The builder is not meant to be reused. The construction
@@ -47,67 +49,48 @@ type SequenceBuilder struct {
4749
// count: this is the number of elements in the sequence, used in case of elias fano
4850
// maxOffset: this is maximum value in the sequence, used in case of elias fano
4951
func NewBuilder(baseNum, count, maxOffset uint64) *SequenceBuilder {
50-
return &SequenceBuilder{
51-
baseNum: baseNum,
52-
ef: eliasfano32.NewEliasFano(count, maxOffset),
52+
if count > SIMPLE_SEQUENCE_MAX_THRESHOLD {
53+
// For large sequences, target rebased EF directly. AddOffset subtracts baseNum
54+
// on the fly, so AppendBytes can serialize without a second pass.
55+
return &SequenceBuilder{
56+
baseNum: baseNum,
57+
rebasedEf: eliasfano32.NewEliasFano(count, maxOffset-baseNum),
58+
}
5359
}
60+
return &SequenceBuilder{baseNum: baseNum}
5461
}
5562

5663
func (b *SequenceBuilder) AddOffset(offset uint64) {
57-
// TODO: write offset already subtracting baseNum now that PlainEF is gone
58-
b.ef.AddOffset(offset)
64+
if b.rebasedEf != nil {
65+
b.rebasedEf.AddOffset(offset - b.baseNum)
66+
return
67+
}
68+
b.smallBuf[b.smallCount] = uint32(offset - b.baseNum)
69+
b.smallCount++
5970
}
6071

6172
func (b *SequenceBuilder) Build() {
62-
b.ef.Build()
73+
if b.rebasedEf != nil {
74+
b.rebasedEf.Build()
75+
}
6376
}
6477

6578
func (b *SequenceBuilder) AppendBytes(buf []byte) []byte {
66-
if b.ef.Count() <= SIMPLE_SEQUENCE_MAX_THRESHOLD {
67-
return b.simpleEncoding(buf)
79+
if b.rebasedEf != nil {
80+
buf = append(buf, byte(RebasedEliasFano))
81+
return b.rebasedEf.AppendBytes(buf)
6882
}
69-
70-
return b.rebasedEliasFano(buf)
83+
return b.simpleEncoding(buf)
7184
}
7285

7386
func (b *SequenceBuilder) simpleEncoding(buf []byte) []byte {
7487
// Simple encoding type + size: [0x80, 0x8F]
75-
count := b.ef.Count()
76-
enc := byte(count-1) & byte(0b00001111)
77-
enc |= byte(SimpleEncoding)
88+
enc := (b.smallCount-1)&0x0F | byte(SimpleEncoding)
7889
buf = append(buf, enc)
7990

80-
// Encode elems
81-
var bn [4]byte
82-
for it := b.ef.Iterator(); it.HasNext(); {
83-
n, err := it.Next()
84-
if err != nil {
85-
// TODO: err
86-
panic(err)
87-
}
88-
n -= b.baseNum
89-
90-
binary.BigEndian.PutUint32(bn[:], uint32(n))
91-
buf = append(buf, bn[:]...)
91+
for _, v := range b.smallBuf[:b.smallCount] {
92+
buf = binary.BigEndian.AppendUint32(buf, v)
9293
}
9394

9495
return buf
9596
}
96-
97-
func (b *SequenceBuilder) rebasedEliasFano(buf []byte) []byte {
98-
// Reserved encoding type 0x90 == rebased elias fano
99-
buf = append(buf, byte(RebasedEliasFano))
100-
101-
// Rebased ef
102-
rbef := eliasfano32.NewEliasFano(b.ef.Count(), b.ef.Max()-b.baseNum)
103-
for it := b.ef.Iterator(); it.HasNext(); {
104-
n, err := it.Next()
105-
if err != nil {
106-
panic(err)
107-
}
108-
109-
rbef.AddOffset(n - b.baseNum)
110-
}
111-
rbef.Build()
112-
return rbef.AppendBytes(buf)
113-
}

db/recsplit/multiencseq/sequence_builder_test.go

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,3 +101,77 @@ func TestMultiEncodingSeqBuilder(t *testing.T) {
101101
}
102102
})
103103
}
104+
105+
func BenchmarkBuilder(b *testing.B) {
106+
const baseNum = 1_000_000
107+
const n = 500
108+
109+
vals := make([]uint64, n)
110+
for i := range vals {
111+
vals[i] = baseNum + uint64(i)*2
112+
}
113+
114+
b.ResetTimer()
115+
for i := 0; i < b.N; i++ {
116+
sb := NewBuilder(baseNum, n, vals[n-1])
117+
for _, v := range vals {
118+
sb.AddOffset(v)
119+
}
120+
sb.Build()
121+
_ = sb.AppendBytes(nil)
122+
}
123+
}
124+
125+
// TestBuilderRoundTrip verifies that serialized output can be read back correctly via
126+
// SequenceReader for both encoding paths, including the direct-rebased-EF path (count > 16).
127+
func TestBuilderRoundTrip(t *testing.T) {
128+
check := func(t *testing.T, baseNum uint64, vals []uint64) {
129+
t.Helper()
130+
raw := buildTestSeq(baseNum, vals...)
131+
s := ReadMultiEncSeq(baseNum, raw)
132+
require.Equal(t, uint64(len(vals)), s.Count())
133+
require.Equal(t, vals[0], s.Min())
134+
require.Equal(t, vals[len(vals)-1], s.Max())
135+
for i, want := range vals {
136+
require.Equal(t, want, s.Get(uint64(i)), "index %d", i)
137+
}
138+
var it SequenceIterator
139+
it.Reset(s, 0)
140+
for i := 0; it.HasNext(); i++ {
141+
v, err := it.Next()
142+
require.NoError(t, err)
143+
require.Equal(t, vals[i], v, "iterator index %d", i)
144+
}
145+
}
146+
147+
t.Run("boundary: 16 elements uses simple encoding", func(t *testing.T) {
148+
vals := make([]uint64, 16)
149+
for i := range vals {
150+
vals[i] = 5000 + uint64(i)*3
151+
}
152+
raw := buildTestSeq(5000, vals...)
153+
require.Equal(t, byte(SimpleEncoding)|15, raw[0])
154+
check(t, 5000, vals)
155+
})
156+
157+
t.Run("boundary: 17 elements uses rebased EF", func(t *testing.T) {
158+
vals := make([]uint64, 17)
159+
for i := range vals {
160+
vals[i] = 5000 + uint64(i)*3
161+
}
162+
raw := buildTestSeq(5000, vals...)
163+
require.Equal(t, byte(RebasedEliasFano), raw[0])
164+
check(t, 5000, vals)
165+
})
166+
167+
t.Run("large sequence with high baseNum", func(t *testing.T) {
168+
const baseNum = 1_000_000_000
169+
vals := make([]uint64, 100)
170+
for i := range vals {
171+
vals[i] = baseNum + uint64(i)*7
172+
}
173+
raw := buildTestSeq(baseNum, vals...)
174+
require.Equal(t, byte(RebasedEliasFano), raw[0])
175+
check(t, baseNum, vals)
176+
})
177+
}

db/recsplit/multiencseq/sequence_reader.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ func (s *SequenceReader) ReverseIterator(v int) stream.U64 {
195195
}
196196

197197
// Merge merges the other sequence into this one, returning a built SequenceBuilder
198-
// with outBaseNum. Both sequences must be pre-sorted.
198+
// with outBaseNum. Both sequences must be pre-sorted with s.Max() <= other.Min().
199199
// Call AppendBytes on the result to serialize.
200200
func (s *SequenceReader) Merge(other *SequenceReader, outBaseNum uint64, it1, it2 *SequenceIterator) (*SequenceBuilder, error) {
201201
it1.Reset(s, 0)

db/recsplit/multiencseq/sequence_reader_test.go

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,15 @@ import (
99
"github.com/erigontech/erigon/db/recsplit/simpleseq"
1010
)
1111

12+
func buildTestSeq(baseNum uint64, vals ...uint64) []byte {
13+
b := NewBuilder(baseNum, uint64(len(vals)), vals[len(vals)-1])
14+
for _, v := range vals {
15+
b.AddOffset(v)
16+
}
17+
b.Build()
18+
return b.AppendBytes(nil)
19+
}
20+
1221
func TestMultiEncSeq(t *testing.T) {
1322

1423
t.Run("plain elias fano", func(t *testing.T) {
@@ -112,6 +121,174 @@ func TestMultiEncSeq(t *testing.T) {
112121
})
113122
}
114123

124+
func TestMerge(t *testing.T) {
125+
t.Run("small sequences (simple encoding path)", func(t *testing.T) {
126+
// 3 + 3 = 6 elements: stays within SIMPLE_SEQUENCE_MAX_THRESHOLD
127+
s1 := ReadMultiEncSeq(1000, buildTestSeq(1000, 1001, 1003, 1005))
128+
s2 := ReadMultiEncSeq(1000, buildTestSeq(1000, 1007, 1009, 1011))
129+
130+
var it1, it2 SequenceIterator
131+
merged, err := s1.Merge(s2, 1000, &it1, &it2)
132+
require.NoError(t, err)
133+
134+
out := merged.AppendBytes(nil)
135+
result := ReadMultiEncSeq(1000, out)
136+
require.Equal(t, uint64(6), result.Count())
137+
require.Equal(t, uint64(1001), result.Min())
138+
require.Equal(t, uint64(1011), result.Max())
139+
for i, want := range []uint64{1001, 1003, 1005, 1007, 1009, 1011} {
140+
require.Equal(t, want, result.Get(uint64(i)))
141+
}
142+
})
143+
144+
t.Run("large sequences (rebased EF fast path)", func(t *testing.T) {
145+
// 10 + 10 = 20 elements: exceeds SIMPLE_SEQUENCE_MAX_THRESHOLD
146+
vals1 := make([]uint64, 10)
147+
vals2 := make([]uint64, 10)
148+
for i := range vals1 {
149+
vals1[i] = 1000 + uint64(i)*2
150+
vals2[i] = 1020 + uint64(i)*2
151+
}
152+
s1 := ReadMultiEncSeq(1000, buildTestSeq(1000, vals1...))
153+
s2 := ReadMultiEncSeq(1000, buildTestSeq(1000, vals2...))
154+
155+
var it1, it2 SequenceIterator
156+
merged, err := s1.Merge(s2, 1000, &it1, &it2)
157+
require.NoError(t, err)
158+
159+
out := merged.AppendBytes(nil)
160+
require.Equal(t, byte(RebasedEliasFano), out[0], "expected rebased EF encoding")
161+
162+
result := ReadMultiEncSeq(1000, out)
163+
require.Equal(t, uint64(20), result.Count())
164+
require.Equal(t, uint64(1000), result.Min())
165+
require.Equal(t, uint64(1038), result.Max())
166+
for i := uint64(0); i < 20; i++ {
167+
require.Equal(t, 1000+i*2, result.Get(i))
168+
}
169+
})
170+
}
171+
172+
func TestMergeEncodingBoundary(t *testing.T) {
173+
merge := func(baseNum uint64, raw1, raw2 []byte) []byte {
174+
s1 := ReadMultiEncSeq(baseNum, raw1)
175+
s2 := ReadMultiEncSeq(baseNum, raw2)
176+
var it1, it2 SequenceIterator
177+
merged, err := s1.Merge(s2, baseNum, &it1, &it2)
178+
if err != nil {
179+
t.Fatal(err)
180+
}
181+
return merged.AppendBytes(nil)
182+
}
183+
184+
// 8+8=16: must stay simple encoding
185+
raw16 := merge(1000,
186+
buildTestSeq(1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008),
187+
buildTestSeq(1000, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016),
188+
)
189+
require.Equal(t, byte(SimpleEncoding)|15, raw16[0], "8+8=16 must use simple encoding")
190+
191+
// 8+9=17: must flip to rebased EF
192+
raw17 := merge(1000,
193+
buildTestSeq(1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008),
194+
buildTestSeq(1000, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017),
195+
)
196+
require.Equal(t, byte(RebasedEliasFano), raw17[0], "8+9=17 must use rebased EF")
197+
}
198+
199+
func TestMergeSeek(t *testing.T) {
200+
// merge two large sequences so output uses rebased EF
201+
vals1 := make([]uint64, 10)
202+
vals2 := make([]uint64, 10)
203+
for i := range vals1 {
204+
vals1[i] = 1000 + uint64(i)*2 // 1000,1002,...,1018
205+
vals2[i] = 1020 + uint64(i)*2 // 1020,1022,...,1038
206+
}
207+
s1 := ReadMultiEncSeq(1000, buildTestSeq(1000, vals1...))
208+
s2 := ReadMultiEncSeq(1000, buildTestSeq(1000, vals2...))
209+
var it1, it2 SequenceIterator
210+
merged, err := s1.Merge(s2, 1000, &it1, &it2)
211+
require.NoError(t, err)
212+
result := ReadMultiEncSeq(1000, merged.AppendBytes(nil))
213+
214+
// Seek to existing value
215+
n, ok := result.Seek(1010)
216+
require.True(t, ok)
217+
require.Equal(t, uint64(1010), n)
218+
219+
// Seek to gap — returns next
220+
n, ok = result.Seek(1011)
221+
require.True(t, ok)
222+
require.Equal(t, uint64(1012), n)
223+
224+
// Seek past end
225+
_, ok = result.Seek(1039)
226+
require.False(t, ok)
227+
228+
// Has
229+
require.True(t, result.Has(1020))
230+
require.False(t, result.Has(1021))
231+
}
232+
233+
func TestBuilderFreeFunctions(t *testing.T) {
234+
const baseNum = uint64(5000)
235+
vals := []uint64{5003, 5007, 5015}
236+
b := NewBuilder(baseNum, uint64(len(vals)), vals[len(vals)-1])
237+
for _, v := range vals {
238+
b.AddOffset(v)
239+
}
240+
b.Build()
241+
raw := b.AppendBytes(nil)
242+
243+
require.Equal(t, uint64(3), Count(baseNum, raw))
244+
245+
n, ok := Seek(baseNum, raw, 5006)
246+
require.True(t, ok)
247+
require.Equal(t, uint64(5007), n)
248+
249+
n, ok = Seek(baseNum, raw, 5007)
250+
require.True(t, ok)
251+
require.Equal(t, uint64(5007), n)
252+
253+
_, ok = Seek(baseNum, raw, 5016)
254+
require.False(t, ok)
255+
}
256+
257+
func BenchmarkMerge(b *testing.B) {
258+
const baseNum = 1_000_000
259+
const n = 500 // elements per sequence
260+
261+
raw1 := func() []byte {
262+
sb := NewBuilder(baseNum, n, baseNum+n*2-2)
263+
for i := uint64(0); i < n; i++ {
264+
sb.AddOffset(baseNum + i*2)
265+
}
266+
sb.Build()
267+
return sb.AppendBytes(nil)
268+
}()
269+
raw2 := func() []byte {
270+
sb := NewBuilder(baseNum, n, baseNum+n*2+n*2-2)
271+
for i := uint64(0); i < n; i++ {
272+
sb.AddOffset(baseNum + n*2 + i*2)
273+
}
274+
sb.Build()
275+
return sb.AppendBytes(nil)
276+
}()
277+
278+
var s1, s2 SequenceReader
279+
var it1, it2 SequenceIterator
280+
b.ResetTimer()
281+
for i := 0; i < b.N; i++ {
282+
s1.Reset(baseNum, raw1)
283+
s2.Reset(baseNum, raw2)
284+
merged, err := s1.Merge(&s2, baseNum, &it1, &it2)
285+
if err != nil {
286+
b.Fatal(err)
287+
}
288+
_ = merged.AppendBytes(nil)
289+
}
290+
}
291+
115292
func requireSequenceChecks(t *testing.T, s *SequenceReader) {
116293
t.Helper()
117294

0 commit comments

Comments
 (0)