@@ -43,10 +43,11 @@ type MissingBlockRetryConfig struct {
4343 TipRecheckThreshold int
4444 // RetryDelay keeps retry pressure low while still reacting quickly to transient backend gaps.
4545 RetryDelay time.Duration
46- // MaxStallDuration caps the wall-clock time a single block fetch may spend
47- // in the retry loop before yielding control to the outer resync machinery.
48- // Without this cap, a backend that keeps returning ErrBlockNotFound while
49- // shouldRestartSyncOnMissingBlock keeps reporting "no reorg" loops forever.
46+ // MaxStallDuration caps the wall-clock time a single block fetch may spend in
47+ // the retry loop before yielding errResync. Liveness invariant: since lagging
48+ // probes report "no reorg" and known hashes get retried, a genuinely-behind
49+ // backend or chain-shortening reorg relies on this cap. Must stay > 0
50+ // (ApplyMissingBlockRetryOverride enforces it).
5051 MaxStallDuration time.Duration
5152}
5253
@@ -118,6 +119,14 @@ func NewSyncWorkerWithConfig(db *RocksDB, chain bchain.BlockChain, syncWorkers,
118119 if cfg != nil {
119120 effectiveCfg = * cfg
120121 }
122+ // MaxStallDuration is the load-bearing liveness cap (see its doc): the retry
123+ // loops disable the cap when it's <= 0, which would let a chain-shortening
124+ // reorg spin forever. Enforce the invariant structurally here so every caller
125+ // (including tests passing a partial cfg) gets a safe value, not just the
126+ // ApplyMissingBlockRetryOverride path.
127+ if effectiveCfg .MissingBlockRetry .MaxStallDuration <= 0 {
128+ effectiveCfg .MissingBlockRetry .MaxStallDuration = DefaultMissingBlockRetryConfig ().MaxStallDuration
129+ }
121130 return & SyncWorker {
122131 db : db ,
123132 chain : chain ,
@@ -255,44 +264,44 @@ func (w *SyncWorker) resyncIndex(onNewBlock bchain.OnNewBlockFunc, initialSync b
255264 return err
256265 }
257266 if remoteBestHeight < w .startHeight {
258- glog .Error ("resync: error - remote best height " , remoteBestHeight , " less than sync start height " , w .startHeight )
259- return errors .New ("resync: remote best height error" )
260- }
261- if initialSync {
262- if remoteBestHeight - w .startHeight > uint32 (w .syncChunk ) {
263- glog .Infof ("resync: bulk sync of blocks %d-%d, using %d workers" , w .startHeight , remoteBestHeight , w .syncWorkers )
264- // Bulk sync can encounter a disappearing block hash during reorgs.
265- // When that happens, it returns errResync to trigger a full restart.
266- err = w .BulkConnectBlocks (w .startHeight , remoteBestHeight )
267- if err != nil {
268- if stdErrors .Is (err , errResync ) {
269- // block hash changed during parallel sync, restart the full resync
270- return w .resyncIndex (onNewBlock , initialSync )
267+ glog .Warning ("resync: observed remote best height " , remoteBestHeight , " less than sync start height " , w .startHeight , ", falling back to sequential sync" )
268+ } else {
269+ if initialSync {
270+ if remoteBestHeight - w .startHeight > uint32 (w .syncChunk ) {
271+ glog .Infof ("resync: bulk sync of blocks %d-%d, using %d workers" , w .startHeight , remoteBestHeight , w .syncWorkers )
272+ // Bulk sync can encounter a disappearing block hash during reorgs.
273+ // When that happens, it returns errResync to trigger a full restart.
274+ err = w .BulkConnectBlocks (w .startHeight , remoteBestHeight )
275+ if err != nil {
276+ if stdErrors .Is (err , errResync ) {
277+ // block hash changed during parallel sync, restart the full resync
278+ return w .resyncIndex (onNewBlock , initialSync )
279+ }
280+ return err
271281 }
272- return err
282+ // after parallel load finish the sync using standard way,
283+ // new blocks may have been created in the meantime
284+ return w .resyncIndex (onNewBlock , initialSync )
273285 }
274- // after parallel load finish the sync using standard way,
275- // new blocks may have been created in the meantime
276- return w .resyncIndex (onNewBlock , initialSync )
277286 }
278- }
279- if w .chain .GetChainParser ().GetChainType () == bchain .ChainEthereumType {
280- syncWorkers := uint32 (4 )
281- if remoteBestHeight - w .startHeight >= syncWorkers {
282- glog .Infof ("resync: parallel sync of blocks %d-%d, using %d workers" , w .startHeight , remoteBestHeight , syncWorkers )
283- // Parallel sync also returns errResync when a requested hash no longer
284- // exists at its height; restart to realign with the canonical chain.
285- err = w .ParallelConnectBlocks (onNewBlock , w .startHeight , remoteBestHeight , syncWorkers )
286- if err != nil {
287- if stdErrors .Is (err , errResync ) {
288- // block hash changed during parallel sync, restart the full resync
289- return w .resyncIndex (onNewBlock , initialSync )
287+ if w .chain .GetChainParser ().GetChainType () == bchain .ChainEthereumType {
288+ syncWorkers := uint32 (4 )
289+ if remoteBestHeight - w .startHeight >= syncWorkers {
290+ glog .Infof ("resync: parallel sync of blocks %d-%d, using %d workers" , w .startHeight , remoteBestHeight , syncWorkers )
291+ // Parallel sync also returns errResync when a requested hash no longer
292+ // exists at its height; restart to realign with the canonical chain.
293+ err = w .ParallelConnectBlocks (onNewBlock , w .startHeight , remoteBestHeight , syncWorkers )
294+ if err != nil {
295+ if stdErrors .Is (err , errResync ) {
296+ // block hash changed during parallel sync, restart the full resync
297+ return w .resyncIndex (onNewBlock , initialSync )
298+ }
299+ return err
290300 }
291- return err
301+ // after parallel load finish the sync using standard way,
302+ // new blocks may have been created in the meantime
303+ return w .resyncIndex (onNewBlock , initialSync )
292304 }
293- // after parallel load finish the sync using standard way,
294- // new blocks may have been created in the meantime
295- return w .resyncIndex (onNewBlock , initialSync )
296305 }
297306 }
298307 }
@@ -400,21 +409,37 @@ type hashHeight struct {
400409 height uint32
401410}
402411
412+ // sendHashHeight queues hh but stays abort-aware: if a full hch made this a blocking
413+ // send, the coordinator could never read abortCh and sync would wedge. On abort hh is
414+ // intentionally dropped since the round is being torn down anyway.
415+ func (w * SyncWorker ) sendHashHeight (hch chan <- hashHeight , abortCh <- chan error , hh hashHeight ) error {
416+ select {
417+ case hch <- hh :
418+ return nil
419+ case abortErr := <- abortCh :
420+ return abortErr
421+ case <- w .chanOsSignal :
422+ return ErrOperationInterrupted
423+ }
424+ }
425+
403426func (w * SyncWorker ) shouldRestartSyncOnMissingBlock (height uint32 , expectedHash string ) (bool , error ) {
404- // When a block hash disappears at a given height, it usually indicates a
405- // reorg/rollback. Confirm by checking the current tip and block hash.
427+ // When a block hash disappears at a given height, it can indicate a
428+ // reorg/rollback, but on load-balanced EVM RPCs a single lagging backend can
429+ // also report an older tip. Only restart immediately when another probe can
430+ // prove the height exists with a different hash; otherwise let the retry
431+ // loop or wall-clock cap yield control to the outer resync.
406432 bestHeight , err := w .chain .GetBestBlockHeight ()
407433 if err != nil {
408434 return false , err
409435 }
410436 if bestHeight < height {
411- // The tip moved below the requested height, so this block is no longer valid.
412- return true , nil
437+ return false , nil
413438 }
414439 currentHash , err := w .chain .GetBlockHash (height )
415440 if err != nil {
416441 if stdErrors .Is (err , bchain .ErrBlockNotFound ) {
417- return true , nil
442+ return false , nil
418443 }
419444 return false , err
420445 }
@@ -574,7 +599,17 @@ ConnectLoop:
574599 time .Sleep (time .Millisecond * 500 )
575600 continue
576601 }
577- hch <- hashHeight {hash , h }
602+ if err = w .sendHashHeight (hch , abortCh , hashHeight {hash , h }); err != nil {
603+ if stdErrors .Is (err , errResync ) {
604+ glog .Warning ("sync: parallel connect aborted while queueing block hash, restarting sync" )
605+ } else if stdErrors .Is (err , ErrOperationInterrupted ) {
606+ glog .Info ("connectBlocksParallel interrupted at height " , h )
607+ } else {
608+ glog .Error ("sync: parallel connect aborted while queueing block hash, worker error " , err )
609+ }
610+ close (terminating )
611+ break ConnectLoop
612+ }
578613 h ++
579614 }
580615 }
@@ -791,7 +826,7 @@ ConnectLoop:
791826 close (terminating )
792827 break ConnectLoop
793828 case <- w .chanOsSignal :
794- glog .Info ("connectBlocksParallel interrupted at height " , h )
829+ glog .Info ("BulkConnectBlocks interrupted at height " , h )
795830 err = ErrOperationInterrupted
796831 // signal all workers to terminate their loops (error loops are interrupted below)
797832 close (terminating )
@@ -804,7 +839,17 @@ ConnectLoop:
804839 time .Sleep (time .Millisecond * 500 )
805840 continue
806841 }
807- hch <- hashHeight {hash , h }
842+ if err = w .sendHashHeight (hch , abortCh , hashHeight {hash , h }); err != nil {
843+ if stdErrors .Is (err , errResync ) {
844+ glog .Warning ("sync: bulk connect aborted while queueing block hash, restarting sync" )
845+ } else if stdErrors .Is (err , ErrOperationInterrupted ) {
846+ glog .Info ("BulkConnectBlocks interrupted at height " , h )
847+ } else {
848+ glog .Error ("sync: bulk connect aborted while queueing block hash, worker error " , err )
849+ }
850+ close (terminating )
851+ break ConnectLoop
852+ }
808853 if h > 0 && h % 1000 == 0 {
809854 w .metrics .BlockbookBestHeight .Set (float64 (h ))
810855 glog .Info ("connecting block " , h , " " , hash , ", elapsed " , time .Since (start ), " " , w .db .GetAndResetConnectBlockStats ())
@@ -891,7 +936,10 @@ func (w *SyncWorker) getBlockChain(out chan blockResult, done chan struct{}) {
891936 return
892937 }
893938 if height > bestHeight {
894- return
939+ if hash == "" {
940+ return
941+ }
942+ glog .Warningf ("getBlockChain: block %d %s is above observed backend height %d; retrying because the block hash was already observed" , height , hash , bestHeight )
895943 }
896944 }
897945 if gotNotFound {
0 commit comments