Skip to content

Commit 85abedf

Browse files
integrity: CheckKvi parallel (#19502)
1 parent 320cdd3 commit 85abedf

File tree

2 files changed

+112
-81
lines changed

2 files changed

+112
-81
lines changed

db/integrity/commitment_integirty.go

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,11 @@ func checkCommitmentRootViaSd(ctx context.Context, tx kv.TemporalTx, f state.Vis
244244
}
245245

246246
func checkCommitmentRootViaRecompute(ctx context.Context, tx kv.TemporalTx, sd *execctx.SharedDomains, info commitmentRootInfo, f state.VisibleFile, logger log.Logger) error {
247+
trace := logger.Enabled(ctx, log.LvlTrace)
247248
touchLoggingVisitor := func(k []byte) {
248-
logger.Debug("account touch for root block", "key", common.Address(k), "blockNum", info.blockNum, "file", filepath.Base(f.Fullpath()))
249+
if trace {
250+
logger.Trace("account touch for root block", "key", common.Address(k), "blockNum", info.blockNum, "file", filepath.Base(f.Fullpath()))
251+
}
249252
}
250253
touches, err := touchHistoricalKeys(sd, tx, kv.AccountsDomain, info.blockMinTxNum, info.txNum+1, touchLoggingVisitor)
251254
if err != nil {
@@ -353,6 +356,7 @@ func checkCommitmentKvDeref(ctx context.Context, file state.VisibleFile, stepSiz
353356
)
354357
return derefCounts{}, nil
355358
}
359+
trace := logger.Enabled(ctx, log.LvlTrace)
356360
logger.Info("[integrity] commitment deref in", "kv", fileName, "startTxNum", startTxNum, "endTxNum", endTxNum)
357361
commDecomp, err := seg.NewDecompressor(file.Fullpath())
358362
if err != nil {
@@ -424,7 +428,7 @@ func checkCommitmentKvDeref(ctx context.Context, file state.VisibleFile, stepSiz
424428
counts.branchKeys++
425429
branchData := commitment.BranchData(branchValue)
426430
newBranchData, err := branchData.ReplacePlainKeys(newBranchValueBuf[:0], func(key []byte, isStorage bool) ([]byte, error) {
427-
if logger.Enabled(ctx, log.LvlTrace) {
431+
if trace {
428432
logger.Trace(
429433
"checking commitment deref for branch",
430434
"branchKey", hex.EncodeToString(branchKey),
@@ -435,7 +439,7 @@ func checkCommitmentKvDeref(ctx context.Context, file state.VisibleFile, stepSiz
435439
}
436440
if isStorage {
437441
if len(key) == length.Addr+length.Hash {
438-
if logger.Enabled(ctx, log.LvlTrace) {
442+
if trace {
439443
logger.Trace(
440444
"skipping, not a storage reference",
441445
"branchKey", hex.EncodeToString(branchKey),
@@ -468,7 +472,7 @@ func checkCommitmentKvDeref(ctx context.Context, file state.VisibleFile, stepSiz
468472
integrityErr = fmt.Errorf("%w: %w", ErrIntegrity, err)
469473
return key, nil
470474
}
471-
if logger.Enabled(ctx, log.LvlTrace) {
475+
if trace {
472476
logger.Trace(
473477
"dereferenced storage key",
474478
"branchKey", hex.EncodeToString(branchKey),
@@ -482,7 +486,7 @@ func checkCommitmentKvDeref(ctx context.Context, file state.VisibleFile, stepSiz
482486
return plainKey, nil
483487
}
484488
if len(key) == length.Addr {
485-
if logger.Enabled(ctx, log.LvlTrace) {
489+
if trace {
486490
logger.Trace(
487491
"skipping, not an account reference",
488492
"branchKey", hex.EncodeToString(branchKey),
@@ -514,7 +518,7 @@ func checkCommitmentKvDeref(ctx context.Context, file state.VisibleFile, stepSiz
514518
integrityErr = fmt.Errorf("%w: %w", ErrIntegrity, err)
515519
return key, nil
516520
}
517-
if logger.Enabled(ctx, log.LvlTrace) {
521+
if trace {
518522
logger.Trace(
519523
"dereferenced account key",
520524
"branchKey", hex.EncodeToString(branchKey),
@@ -770,12 +774,15 @@ func CheckCommitmentHistAtBlk(ctx context.Context, db kv.TemporalRoDB, br servic
770774
return fmt.Errorf("commitment state txNum doesn't match maxTxNum: %d != %d", latestTxNum, maxTxNum)
771775
}
772776
logger.Info("commitment recalc info", "blockNum", blockNum, "minTxNum", minTxNum, "maxTxNum", maxTxNum, "toTxNum", toTxNum)
777+
trace := logger.Enabled(ctx, log.LvlTrace)
773778
touchLoggingVisitor := func(k []byte) {
774-
args := []any{"key", common.Address(k[:length.Addr])}
775-
if len(k) > length.Addr {
776-
args = append(args, "slot", common.Hash(k[length.Addr:]))
779+
if trace {
780+
args := []any{"key", common.Address(k[:length.Addr])}
781+
if len(k) > length.Addr {
782+
args = append(args, "slot", common.Hash(k[length.Addr:]))
783+
}
784+
logger.Trace("commitment touched key", args...)
777785
}
778-
logger.Debug("commitment touched key", args...)
779786
}
780787
touchStart := time.Now()
781788
accTouches, err := touchHistoricalKeys(sd, tx, kv.AccountsDomain, minTxNum, toTxNum, touchLoggingVisitor)

db/integrity/integrity_kvi.go

Lines changed: 95 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,18 @@
1717
package integrity
1818

1919
import (
20+
"bytes"
2021
"context"
2122
"encoding/hex"
2223
"errors"
2324
"fmt"
2425
"path/filepath"
2526
"strings"
26-
"sync/atomic"
2727
"time"
2828

2929
"golang.org/x/sync/errgroup"
3030

31-
"github.com/erigontech/erigon/common/dbg"
31+
"github.com/erigontech/erigon/common/estimate"
3232
"github.com/erigontech/erigon/common/log/v3"
3333
"github.com/erigontech/erigon/db/kv"
3434
"github.com/erigontech/erigon/db/recsplit"
@@ -41,22 +41,14 @@ import (
4141
// ErrIntegrity is useful to differentiate integrity errors from program errors.
4242
var ErrIntegrity = errors.New("integrity error")
4343

44+
// CheckKvis checks all kvi index files for a domain sequentially (one file at a time),
45+
// parallelizing the lookup work inside each file.
4446
func CheckKvis(ctx context.Context, tx kv.TemporalTx, domain kv.Domain, failFast bool, logger log.Logger) error {
4547
start := time.Now()
4648
aggTx := state.AggTx(tx)
4749
files := aggTx.Files(domain)
4850
kvCompression := statecfg.Schema.GetDomainCfg(domain).Compression
49-
var eg *errgroup.Group
50-
if failFast {
51-
// if 1 goroutine fails, fail others
52-
eg, ctx = errgroup.WithContext(ctx)
53-
} else {
54-
eg = &errgroup.Group{}
55-
}
56-
if dbg.EnvBool("CHECK_KVIS_SEQUENTIAL", false) {
57-
eg.SetLimit(1)
58-
}
59-
var keyCount atomic.Uint64
51+
var keyCount uint64
6052
for _, file := range files {
6153
if !strings.HasSuffix(file.Fullpath(), ".kv") {
6254
continue
@@ -76,26 +68,24 @@ func CheckKvis(ctx context.Context, tx kv.TemporalTx, domain kv.Domain, failFast
7668
if !ok {
7769
return fmt.Errorf("kvi not found for %s", kvPath)
7870
}
79-
eg.Go(func() error {
80-
keys, err := CheckKvi(ctx, kviPath, kvPath, kvCompression, failFast, logger)
81-
if err == nil {
82-
keyCount.Add(keys)
83-
return nil
84-
}
85-
if !failFast {
86-
logger.Warn(err.Error())
71+
keys, err := CheckKvi(ctx, kviPath, kvPath, kvCompression, failFast, logger)
72+
keyCount += keys
73+
if err != nil {
74+
if failFast {
75+
return err
8776
}
88-
return err
89-
})
90-
}
91-
err := eg.Wait()
92-
if err != nil {
93-
return err
77+
logger.Warn(err.Error())
78+
}
9479
}
95-
logger.Info("checked kvi files in", "dur", time.Since(start), "files", len(files), "keys", keyCount.Load())
80+
logger.Info("checked kvi files in", "dur", time.Since(start), "files", len(files), "keys", keyCount)
9681
return nil
9782
}
9883

84+
// kviWorkItem is one unit of lookup work that the kv-scanning producer in
// CheckKvi sends over a channel to the worker goroutines.
type kviWorkItem struct {
85+
// key is the key read from the .kv file; the producer sends a clone of its
// reusable read buffer, so each item owns its own bytes.
key []byte
86+
// offset is the value the worker expects the kvi lookup of key to return.
// The producer takes it from the preceding kvReader.Skip() call (zero for
// the first key) — presumably the key's start position in the .kv file;
// TODO confirm against seg.Reader.
offset uint64
87+
}
88+
9989
func CheckKvi(ctx context.Context, kviPath string, kvPath string, kvCompression seg.FileCompression, failFast bool, logger log.Logger) (uint64, error) {
10090
kviFileName := filepath.Base(kviPath)
10191
kvFileName := filepath.Base(kvPath)
@@ -106,74 +96,108 @@ func CheckKvi(ctx context.Context, kviPath string, kvPath string, kvCompression
10696
return 0, err
10797
}
10898
defer kvi.Close()
109-
kviReader := kvi.GetReaderFromPool()
11099
kvDecompressor, err := seg.NewDecompressor(kvPath)
111100
if err != nil {
112101
return 0, err
113102
}
114103
defer kvDecompressor.Close()
115104
kvReader := seg.NewReader(kvDecompressor.MakeGetter(), kvCompression)
116-
var integrityErr error
105+
106+
var firstErr error
117107
if kvKeyCount := uint64(kvReader.Count()) / 2; kvKeyCount != kvi.KeyCount() {
118108
err = fmt.Errorf("kv key count %d != kvi key count %d in %s", kvKeyCount, kvi.KeyCount(), kviFileName)
119109
if failFast {
120110
return 0, err
121111
}
122112
logger.Warn(err.Error())
123-
integrityErr = fmt.Errorf("%w: %w", ErrIntegrity, err)
113+
firstErr = fmt.Errorf("%w: %w", ErrIntegrity, err)
114+
}
115+
116+
trace := logger.Enabled(ctx, log.LvlTrace)
117+
checkOne := func(kviReader *recsplit.IndexReader, work kviWorkItem) error {
118+
if trace {
119+
logger.Trace("[integrity] checking kvi for", "key", hex.EncodeToString(work.key), "offset", work.offset, "kvi", kviFileName)
120+
}
121+
kviOffset, found := kviReader.Lookup(work.key)
122+
if !found {
123+
return fmt.Errorf("%w: key %x not found in %s", ErrIntegrity, work.key, kviFileName)
124+
}
125+
if kviOffset != work.offset {
126+
return fmt.Errorf("%w: key %x offset mismatch %d != %d in %s", ErrIntegrity, work.key, work.offset, kviOffset, kviFileName)
127+
}
128+
return nil
124129
}
125-
logTicker := time.NewTicker(30 * time.Second)
126-
defer logTicker.Stop()
127-
var keyBuf []byte
128-
var keyOffset, keyCount uint64
129-
var atValue bool
130-
for i := 0; kvReader.HasNext(); i++ {
131-
if i%1024 == 0 {
130+
131+
var keyCount uint64
132+
eg, ctx := errgroup.WithContext(ctx)
133+
numWorkers := estimate.AlmostAllCPUs()
134+
workCh := make(chan kviWorkItem, numWorkers*4)
135+
136+
for range numWorkers {
137+
eg.Go(func() error {
138+
kviReader := kvi.GetReaderFromPool()
139+
defer kviReader.Close()
140+
for {
141+
select {
142+
case <-ctx.Done():
143+
return ctx.Err()
144+
case work, ok := <-workCh:
145+
if !ok {
146+
return nil
147+
}
148+
if err := checkOne(kviReader, work); err != nil {
149+
if !failFast {
150+
logger.Warn(err.Error())
151+
}
152+
return err
153+
}
154+
}
155+
}
156+
})
157+
}
158+
159+
// Producer: scan kv file sequentially, emit (key, offset) pairs to workers.
160+
eg.Go(func() error {
161+
defer close(workCh)
162+
logTicker := time.NewTicker(30 * time.Second)
163+
defer logTicker.Stop()
164+
var keyBuf []byte
165+
var keyOffset uint64
166+
var atValue bool
167+
for kvReader.HasNext() {
168+
if atValue {
169+
keyOffset, _ = kvReader.Skip()
170+
atValue = false
171+
continue
172+
}
173+
keyBuf, _ = kvReader.Next(keyBuf[:0])
174+
keyCount++
175+
atValue = true
176+
132177
select {
133178
case <-ctx.Done():
134-
return 0, ctx.Err()
179+
return nil
180+
case workCh <- kviWorkItem{key: bytes.Clone(keyBuf), offset: keyOffset}:
181+
}
182+
183+
select {
135184
case <-logTicker.C:
136185
at := fmt.Sprintf("%d/%d", keyCount, kvi.KeyCount())
137186
percent := fmt.Sprintf("%.1f%%", float64(keyCount)/float64(kvi.KeyCount())*100)
138187
rate := float64(keyCount) / time.Since(start).Seconds()
139188
eta := time.Duration(float64(kvi.KeyCount()-keyCount)/rate) * time.Second
140-
logger.Info("[integrity] checking kvi progress", "at", at, "p", percent, "k/s", rate, "kvi", kviFileName, "eta", eta)
141-
default: // proceed
189+
logger.Info("[integrity] kvi progress", "at", at, "p", percent, "k/s", rate, "eta", eta, "kvi", kviFileName)
190+
default:
142191
}
143192
}
193+
return nil
194+
})
144195

145-
if atValue {
146-
keyOffset, _ = kvReader.Skip()
147-
atValue = false
148-
continue
149-
}
150-
keyBuf, _ = kvReader.Next(keyBuf[:0])
151-
if logger.Enabled(ctx, log.LvlTrace) {
152-
logger.Trace("[integrity] checking kvi for", "key", hex.EncodeToString(keyBuf), "kvi", kviFileName, "offset", keyOffset)
153-
}
154-
keyCount++
155-
atValue = true
156-
kviOffset, ok := kviReader.Lookup(keyBuf)
157-
if !ok {
158-
err = fmt.Errorf("key %x not found in %s", keyBuf, kviFileName)
159-
if failFast {
160-
return 0, err
161-
}
162-
logger.Warn(err.Error())
163-
integrityErr = fmt.Errorf("%w: %w", ErrIntegrity, err)
164-
continue
165-
}
166-
if kviOffset != keyOffset {
167-
err = fmt.Errorf("key %x offset mismatch %d != %d in %s", keyBuf, keyOffset, kviOffset, kviFileName)
168-
if failFast {
169-
return 0, err
170-
}
171-
logger.Warn(err.Error())
172-
integrityErr = fmt.Errorf("%w: %w", ErrIntegrity, err)
173-
}
196+
if err := eg.Wait(); err != nil {
197+
return keyCount, err
174198
}
175199
duration := time.Since(start)
176200
rate := float64(keyCount) / duration.Seconds()
177201
logger.Info("checked kvi in", "dur", duration, "keys", keyCount, "k/s", rate, "kvi", kviFileName, "kv", kvFileName)
178-
return keyCount, integrityErr
202+
return keyCount, firstErr
179203
}

0 commit comments

Comments
 (0)