Skip to content

Commit 9c0d499

Browse files
seg: SamplingFactor>1 produced only 1 sample (erigontech#21806)
## Problem `Compressor.AddWord` decided when a superstring window was full from `len(c.superstring)`. But skipped (non-sampled) windows never append to that buffer, so once `SamplingFactor>1` reached the first skipped window the buffer stopped growing, the overflow branch never fired again, the window counter froze, and **the pattern dictionary was built from the first `superstringLimit` (16MB) of the file only.** This affects every `SamplingFactor>1` user: `seg.DefaultCfg` and `BlockCompressCfg` both use `SamplingFactor=4`, so tx/bodies/headers segments have been built from first-16MB dictionaries. ## Fix Track scanned bytes per window in `scannedBytes`, advanced on **every** word regardless of sampling, and use it (not `len(c.superstring)`) for the overflow check. Window boundaries now advance through skipped windows, so `SamplingFactor` honestly samples every Nth window across the whole file. While here, the windowing logic moved into a small `advanceScan` helper so the byte accounting can't drift from the rollover it feeds, and the send condition is now `len(superstring) > 0` (non-empty ⟺ sampled): this stops pushing empty buffers to the workers and only fetches a fresh pool buffer when one is actually handed off. ## Scope / impact - **Affected:** `seg.DefaultCfg`, `BlockCompressCfg` (`SamplingFactor=4`) — tx/bodies/headers segments now build from a true 25% whole-file sample (slower build, smaller output; offset by erigontech#21625's matcher speedups). - **Not affected:** `DomainCompressCfg` / `HistoryCompressCfg` (`SamplingFactor=1`) — for `SamplingFactor=1` the new accounting is provably identical to the old, so domain/history snapshots stay **byte-identical** (verified by the unchanged checksum tests). Addresses the core bug in erigontech#21628. Per-config `SamplingFactor` re-tuning (the issue's other open thread) is intentionally left out of this PR. ## Testing - New `TestCompressSamplingCoversWholeFile` pins the invariant "the number of windows a file splits into must not depend on `SamplingFactor`" — fails on the old code (SF=1 → 90 windows, SF=4 → stuck at 1), passes after the fix. - Full `db/seg` suite passes under `-race`; existing checksum-asserting tests unchanged (single-window output byte-identical).
1 parent 3fd3193 commit 9c0d499

2 files changed

Lines changed: 61 additions & 10 deletions

File tree

db/seg/compress.go

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ type Compressor struct {
123123
// this is needed for using ordinary (one string) suffix sorting algorithm instead of a generalised (many superstrings) suffix
124124
// sorting algorithm
125125
superstring []byte
126+
scannedBytes int
127+
superstringLimit int
126128
wordsCount uint64
127129
superstringCount uint64
128130
uncompressedBytes int
@@ -187,6 +189,7 @@ func NewCompressor(ctx context.Context, logPrefix, outputFile, tmpDir string, cf
187189
wg: wg,
188190
logger: logger,
189191
version: FileCompressionFormatV1,
192+
superstringLimit: superstringLimit,
190193
}
191194

192195
if cfg.ValuesOnCompressedPage > 0 {
@@ -267,16 +270,8 @@ func (c *Compressor) AddWord(word []byte) error {
267270
}
268271
}
269272

270-
l := 2*len(word) + 2
271-
if len(c.superstring)+l > superstringLimit {
272-
if c.superstringCount%c.SamplingFactor == 0 {
273-
c.superstrings <- c.superstring
274-
}
275-
c.superstringCount++
276-
c.superstring = superStringsPool.Get().([]byte)[:0]
277-
}
278-
279-
if c.superstringCount%c.SamplingFactor == 0 {
273+
c.advanceScan(2*len(word) + 2)
274+
if c.sampledSuperstring() {
280275
for _, a := range word {
281276
c.superstring = append(c.superstring, 1, a)
282277
}
@@ -287,6 +282,25 @@ func (c *Compressor) AddWord(word []byte) error {
287282
return c.uncompressedFile.Append(word)
288283
}
289284

285+
func (c *Compressor) sampledSuperstring() bool { return c.superstringCount%c.SamplingFactor == 0 }
286+
287+
// advanceScan accounts for the next word (encoded size l) and cuts a new superstring when the
288+
// current one would overflow. Bytes are counted for every word so boundaries don't depend on
289+
// sampling; a superstring fills only when sampled, so only a non-empty buffer is sent and replaced.
290+
func (c *Compressor) advanceScan(l int) {
291+
if c.scannedBytes+l <= c.superstringLimit {
292+
c.scannedBytes += l
293+
return
294+
}
295+
296+
if len(c.superstring) > 0 {
297+
c.superstrings <- c.superstring
298+
c.superstring = superStringsPool.Get().([]byte)[:0]
299+
}
300+
c.superstringCount++
301+
c.scannedBytes = 0
302+
}
303+
290304
func (c *Compressor) AddUncompressedWord(word []byte) error {
291305
c.wordsCount++
292306
if c.wordsCount%1024 == 0 {

db/seg/compress_test.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,3 +387,40 @@ func TestCompressNoWordPatterns(t *testing.T) {
387387
t.Errorf("fast-path output differs from main, checksum=%d", cs)
388388
}
389389
}
390+
391+
// The number of superstring windows a file is split into is a property of the data, so it
392+
// must not depend on SamplingFactor — sampling only selects which windows feed the dict.
393+
func TestCompressSamplingCoversWholeFile(t *testing.T) {
394+
const word = "mykeyabc"
395+
const nWords = 5000
396+
397+
windowCount := func(samplingFactor uint64) uint64 {
398+
logger := log.New()
399+
tmpDir := t.TempDir()
400+
file := filepath.Join(tmpDir, "compressed")
401+
cfg := DefaultCfg
402+
cfg.MinPatternScore = 1
403+
cfg.SamplingFactor = samplingFactor
404+
c, err := NewCompressor(t.Context(), t.Name(), file, tmpDir, cfg, log.LvlDebug, logger)
405+
require.NoError(t, err)
406+
defer c.Close()
407+
c.superstringLimit = 1000
408+
409+
for i := 0; i < nWords; i++ {
410+
require.NoError(t, c.AddWord([]byte(word)))
411+
}
412+
count := c.superstringCount
413+
require.NoError(t, c.Compress())
414+
415+
d, err := NewDecompressor(file)
416+
require.NoError(t, err)
417+
defer d.Close()
418+
require.EqualValues(t, nWords, d.Count())
419+
return count
420+
}
421+
422+
full := windowCount(1)
423+
require.Greater(t, full, uint64(1))
424+
require.Equal(t, full, windowCount(4))
425+
require.Equal(t, full, windowCount(8))
426+
}

0 commit comments

Comments
 (0)