Skip to content

Commit fc43170

Browse files
authored
beacon/light: keep retrying checkpoint init if failed (ethereum#33966)
This PR changes the blsync checkpoint init logic so that even if initialization fails with a given server and an error log message is printed, the server returns to its initial state and is allowed to retry initialization after the failure delay period. The previous logic had an `ssDone` server state that put the server into a permanently unusable state once the checkpoint init failed for an apparently permanent reason. This was not the correct behavior, because different servers behave differently when overloaded, and sometimes the response to a permanently missing item is not clearly distinguishable from an overload response. A safer approach is to never assume that anything is permanent and to always give the server a chance to retry. The failure delay formula is also fixed; it is now properly capped at `maxFailureDelay`. The previous formula allowed the delay to grow without bound if a retry was attempted immediately after each delay period.
1 parent 92b4cb2 commit fc43170

2 files changed

Lines changed: 3 additions & 6 deletions

File tree

beacon/light/request/server.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -438,14 +438,11 @@ func (s *serverWithLimits) fail(desc string) {
438438
// failLocked calculates the dynamic failure delay and applies it.
439439
func (s *serverWithLimits) failLocked(desc string) {
440440
log.Debug("Server error", "description", desc)
441-
s.failureDelay *= 2
442441
now := s.clock.Now()
443442
if now > s.failureDelayEnd {
444443
s.failureDelay *= math.Pow(2, -float64(now-s.failureDelayEnd)/float64(maxFailureDelay))
445444
}
446-
if s.failureDelay < float64(minFailureDelay) {
447-
s.failureDelay = float64(minFailureDelay)
448-
}
445+
s.failureDelay = max(min(s.failureDelay*2, float64(maxFailureDelay)), float64(minFailureDelay))
449446
s.failureDelayEnd = now + mclock.AbsTime(s.failureDelay)
450447
s.delay(time.Duration(s.failureDelay))
451448
}

beacon/light/sync/update_sync.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,6 @@ const (
6262
ssNeedParent // cp header slot %32 != 0, need parent to check epoch boundary
6363
ssParentRequested // cp parent header requested
6464
ssPrintStatus // has all necessary info, print log message if init still not successful
65-
ssDone // log message printed, no more action required
6665
)
6766

6867
type serverState struct {
@@ -180,7 +179,8 @@ func (s *CheckpointInit) Process(requester request.Requester, events []request.E
180179
default:
181180
log.Error("blsync: checkpoint not available, but reported as finalized; specified checkpoint hash might be too old", "server", server.Name())
182181
}
183-
s.serverState[server] = serverState{state: ssDone}
182+
s.serverState[server] = serverState{state: ssDefault}
183+
requester.Fail(server, "checkpoint init failed")
184184
}
185185
}
186186

0 commit comments

Comments
 (0)