const (
	// NOTE(review): the leading comment for MaxCompactionInterval was
	// truncated in this view; the visible tail reads: "... SSTs per shard
	// once the DB grew past steady-state." Restore the full sentence from
	// history before merging.
	MaxCompactionInterval = int64(50000)

	// WaitTimeBetweenCompactions throttles successive shard compactions.
	// Linux's vm.dirty_writeback_centisecs defaults to 5s; pauses shorter
	// than that don't cross a writeback boundary so the kernel can't reliably
	// drain dirty pages between bursts. 2s gets us past the writeback flush
	// without idling forever and is the dominant lever for keeping page
	// cache from accumulating across shards.
	WaitTimeBetweenCompactions = 2 * time.Second

	// SubshardSplitThresholdBytes is the per-shard write-byte threshold above
	// which the next prune cycle subdivides that shard. We cap subshard count
	// at SubshardMaxSplit so a 4 GB shard becomes 8 sub-shards of ~500 MB,
	// not 80 sub-shards. Below this threshold a shard runs un-split.
	SubshardSplitThresholdBytes = uint64(500 * 1024 * 1024) // 500 MiB
	SubshardMaxSplit            = 8

	// compactHistoryPrefix scopes adaptive-sharding metadata (per-shard prior
	// dWrite). Stored alongside CompactPrefix in the same DB so resume +
	// adaptive-split state share a key namespace.
	compactHistoryPrefix = "compact_history_"
)
3647
3748var (
@@ -181,18 +192,15 @@ func CompactPrefixHex256(db dbm.DB, prefix string, label string) error {
181192 shardLabel = fmt .Sprintf ("%s %s ff-fg" , label , prefix )
182193 }
183194
184- hasAny , err := shardHasKeys (db , start , end )
195+ dw , err := compactShardAdaptive (db , start , end , label , b )
185196 if err != nil {
186- return fmt .Errorf ("compaction %s probe failed : %w" , shardLabel , err )
197+ return fmt .Errorf ("compaction %s: %w" , shardLabel , err )
187198 }
188- if ! hasAny {
199+ if dw == 0 {
189200 skipped ++
190201 continue
191202 }
192203 compacted ++
193- if err := compactAndLog (db , start , end , shardLabel ); err != nil {
194- return err
195- }
196204 }
197205
198206 log .Printf ("compaction %s prefix %q DONE in %s (compacted=%d skipped_empty=%d)" ,
@@ -227,18 +235,15 @@ func CompactSharded256(db dbm.DB, label string) error {
227235 shardLabel = fmt .Sprintf ("%s shard %02x-%02x" , label , b , b + 1 )
228236 }
229237
230- hasAny , err := shardHasKeys (db , start , end )
238+ dw , err := compactShardAdaptive (db , start , end , label , b )
231239 if err != nil {
232- return fmt .Errorf ("compaction %s probe failed : %w" , shardLabel , err )
240+ return fmt .Errorf ("compaction %s: %w" , shardLabel , err )
233241 }
234- if ! hasAny {
242+ if dw == 0 {
235243 skipped ++
236244 continue
237245 }
238246 compacted ++
239- if err := compactAndLog (db , start , end , shardLabel ); err != nil {
240- return err
241- }
242247 }
243248
244249 log .Printf ("compaction %s DONE in %s (compacted=%d skipped_empty=%d)" ,
@@ -259,12 +264,146 @@ func shardHasKeys(db dbm.DB, start, end []byte) (bool, error) {
259264 return it .Valid (), nil
260265}
261266
267+ // historyKey returns the metadata key under which we persist the prior-cycle
268+ // dWrite for a (label, shardIdx) pair. The shard index is the byte b in
269+ // CompactSharded256 / CompactPrefixHex256.
270+ func historyKey (label string , shardIdx int ) []byte {
271+ return []byte (fmt .Sprintf ("%s%s_%03d" , compactHistoryPrefix , label , shardIdx ))
272+ }
273+
274+ func readHistoryDWrite (db dbm.DB , label string , shardIdx int ) uint64 {
275+ bz , err := db .Get (historyKey (label , shardIdx ))
276+ if err != nil || len (bz ) != 8 {
277+ return 0
278+ }
279+ return binary .BigEndian .Uint64 (bz )
280+ }
281+
282+ func writeHistoryDWrite (db dbm.DB , label string , shardIdx int , dWrite uint64 ) {
283+ var b [8 ]byte
284+ binary .BigEndian .PutUint64 (b [:], dWrite )
285+ _ = db .Set (historyKey (label , shardIdx ), b [:])
286+ }
287+
288+ // splitFactorFromHistory maps prior-cycle dWrite to a sub-shard count.
289+ // Below threshold: 1 (no split). Above: ceil(prior / threshold), capped.
290+ func splitFactorFromHistory (priorDWrite uint64 ) int {
291+ if priorDWrite < SubshardSplitThresholdBytes {
292+ return 1
293+ }
294+ n := int ((priorDWrite + SubshardSplitThresholdBytes - 1 ) / SubshardSplitThresholdBytes )
295+ if n > SubshardMaxSplit {
296+ n = SubshardMaxSplit
297+ }
298+ if n < 1 {
299+ n = 1
300+ }
301+ return n
302+ }
303+
// byteRangeSplit divides [start, end) into n sub-ranges by interpolating an
// extra byte after `start`. The returned ranges are contiguous and cover
// exactly [start, end). For the open-ended case end==nil the final sub-range
// stays open-ended, so [start, ∞) is still fully covered.
//
// Examples (n=4):
//
//	[0x00, 0x01) -> [0x00, 0x0040), [0x0040, 0x0080), [0x0080, 0x00c0), [0x00c0, 0x01)
//	[0x42, 0x43) -> [0x42, 0x4240), ..., [0x42c0, 0x43)
func byteRangeSplit(start, end []byte, n int) [][2][]byte {
	if n <= 1 {
		return [][2][]byte{{start, end}}
	}
	out := make([][2][]byte, 0, n)
	for i := 0; i < n; i++ {
		// First sub-range keeps the original start; last keeps the original
		// end (which may be nil / open-ended). Interior boundaries get an
		// interpolated extra byte.
		sStart, sEnd := start, end
		if i > 0 {
			sStart = appendInterpolant(start, i, n)
		}
		if i < n-1 {
			sEnd = appendInterpolant(start, i+1, n)
		}
		out = append(out, [2][]byte{sStart, sEnd})
	}
	return out
}

// appendInterpolant returns start || byte(i*256/n). Used to interpolate
// sub-shard boundaries inside a single-byte shard range; requires 0 < i < n
// so the appended byte is in (0x00, 0xff].
func appendInterpolant(start []byte, i, n int) []byte {
	out := make([]byte, len(start)+1)
	copy(out, start)
	out[len(start)] = byte((i * 256) / n)
	return out
}
345+
346+ // compactShardAdaptive runs a single shard, optionally subdividing based on
347+ // the prior-cycle write history persisted in the DB. Returns the cumulative
348+ // dWrite observed for this shard (which becomes the next cycle's history).
349+ //
350+ // The skip-empty probe runs at the *sub*-shard level too: a shard whose
351+ // data clusters in part of its range will only re-compact the populated
352+ // sub-ranges next cycle.
353+ func compactShardAdaptive (db dbm.DB , start , end []byte , label string , shardIdx int ) (uint64 , error ) {
354+ prior := readHistoryDWrite (db , label , shardIdx )
355+ splitN := splitFactorFromHistory (prior )
356+ subs := byteRangeSplit (start , end , splitN )
357+
358+ var totalDW uint64
359+ for i , sub := range subs {
360+ var subLabel string
361+ if splitN == 1 {
362+ subLabel = fmt .Sprintf ("%s shard %02x" , label , shardIdx )
363+ } else {
364+ subLabel = fmt .Sprintf ("%s shard %02x sub %d/%d" , label , shardIdx , i + 1 , splitN )
365+ }
366+ hasAny , err := shardHasKeys (db , sub [0 ], sub [1 ])
367+ if err != nil {
368+ return totalDW , fmt .Errorf ("%s probe failed: %w" , subLabel , err )
369+ }
370+ if ! hasAny {
371+ continue
372+ }
373+ dw , err := compactAndMeasure (db , sub [0 ], sub [1 ], subLabel )
374+ if err != nil {
375+ return totalDW , err
376+ }
377+ totalDW += dw
378+ }
379+ if totalDW > 0 {
380+ // Only persist when we actually wrote something. Writing zeros for
381+ // every empty shard would pollute the keyspace with metadata keys
382+ // inside the same byte-shard ranges we're sweeping (history keys
383+ // start with 'c' = 0x63, which would otherwise fall into shard 0x63
384+ // on the next sweep and force a spurious compaction).
385+ writeHistoryDWrite (db , label , shardIdx , totalDW )
386+ }
387+ return totalDW , nil
388+ }
389+
262390// CompactAndLog compacts [start, limit) and logs the range, duration, and
263391// the process-level read/write byte deltas observed during the call. The
264392// byte deltas come from /proc/self/io and capture *all* I/O issued by the
265393// process during the compact (including background goroutines), so they're
266394// upper bounds — but they pin per-shard amplification cost in production.
267395func CompactAndLog (db dbm.DB , start , limit []byte , label string ) error {
396+ _ , err := compactAndMeasure (db , start , limit , label )
397+ return err
398+ }
399+
400+ // compactAndMeasure is the shared implementation for both the dWrite-returning
401+ // adaptive path and the legacy CompactAndLog signature. Returns the observed
402+ // /proc/self/io write_bytes delta so callers can persist it for adaptive
403+ // sub-sharding decisions on the next cycle.
404+ var compactAndMeasure = realCompactAndMeasure
405+
406+ func realCompactAndMeasure (db dbm.DB , start , limit []byte , label string ) (uint64 , error ) {
268407 time .Sleep (WaitTimeBetweenCompactions )
269408
270409 rng := fmt .Sprintf ("[%s, %s)" , prettyKey (start ), prettyKey (limit ))
@@ -277,11 +416,12 @@ func CompactAndLog(db dbm.DB, start, limit []byte, label string) error {
277416
278417 if err != nil {
279418 log .Printf ("compaction %s range %s FAILED after %s: %v" , label , rng , elapsed , err )
280- return err
419+ return 0 , err
281420 }
421+ dWrite := wb1 - wb0
282422 log .Printf ("compaction %s range %s done in %s dRead=%dB dWrite=%dB" ,
283- label , rng , elapsed , rb1 - rb0 , wb1 - wb0 )
284- return nil
423+ label , rng , elapsed , rb1 - rb0 , dWrite )
424+ return dWrite , nil
285425}
286426
287427// procIOBytes reads /proc/self/io and returns (read_bytes, write_bytes).
0 commit comments