@@ -46,6 +46,14 @@ type Reactor struct {
4646 eventBus * types.EventBus
4747 rs * cstypes.RoundState
4848
49+ // catchUpLagThreshold and catchUpDebounce drive IsBehind, which lets /status
50+ // report catching_up=true after initial sync when this node has fallen behind
51+ // live peers. behindSince tracks how long the lag condition has held
52+ // continuously, for debouncing.
53+ catchUpLagThreshold int64
54+ catchUpDebounce time.Duration
55+ behindSince time.Time
56+
4957 Metrics * Metrics
5058}
5159
@@ -422,6 +430,97 @@ func (conR *Reactor) WaitSync() bool {
422430 return conR .waitSync
423431}
424432
433+ // IsBehind reports whether this node has stopped keeping up with its peers after
434+ // the initial block-sync has completed. WaitSync only reflects startup block-sync
435+ // and stays false for the rest of the process, so on its own catching_up never
436+ // reflects a node that later falls behind. IsBehind closes that gap from
437+ // peer-reported heights.
438+ //
439+ // It compares heights rather than block-time staleness on purpose: a stale local
440+ // block time can't distinguish a node that is behind its peers (some peer reports a
441+ // greater height) from a network where every node has legitimately stopped at the
442+ // same height (no peer is ahead). Only the former is "catching up"; reporting the
443+ // latter as catching up would be a false positive.
444+ func (conR * Reactor ) IsBehind () bool {
445+ raw := conR .isBehindRaw ()
446+
447+ conR .mtx .Lock ()
448+ defer conR .mtx .Unlock ()
449+ return conR .applyDebounceLocked (raw , time .Now ())
450+ }
451+
452+ // isBehindRaw gathers the local height and peer heights and applies the lag
453+ // decision, without debouncing. It holds no lock while reading peers / round state.
454+ func (conR * Reactor ) isBehindRaw () bool {
455+ peerHeights := collectPeerHeights (conR .Switch .Peers ().List ())
456+
457+ // Sole-validator status is read live from consensus state on every call, so it
458+ // reflects validator-set changes this node has committed into local round state
459+ // without any cached value to update. (A node partitioned before a set change
460+ // can't observe the other side's update, but that only makes it report behind,
461+ // which is correct.)
462+ return conR .evaluateBehind (conR .getRoundState ().Height , peerHeights , conR .conS .isLocalSoleValidator ())
463+ }
464+
465+ // collectPeerHeights returns the gossiped consensus height of every peer that
466+ // carries a consensus PeerState. Peers without one (key absent or wrong type)
467+ // are skipped rather than counted as height 0, so they don't dilute the
468+ // majority calculation in evaluateBehind.
469+ func collectPeerHeights (peers []p2p.Peer ) []int64 {
470+ heights := make ([]int64 , 0 , len (peers ))
471+ for _ , peer := range peers {
472+ ps , ok := peer .Get (types .PeerStateKey ).(* PeerState )
473+ if ! ok {
474+ continue
475+ }
476+ heights = append (heights , ps .GetHeight ())
477+ }
478+ return heights
479+ }
480+
481+ // minCorroboratingPeers is the fewest connected peers required before peer-height
482+ // lag is trusted. Peer heights come from unverified gossip (NewRoundStepMessage), and
483+ // with a single peer that peer is a trivial "majority"; requiring at least two means
484+ // no single peer can drive catching_up on its own. Below this we can't corroborate, so
485+ // the height-lag check abstains (the zero-peer rule still covers the no-peer case).
486+ const minCorroboratingPeers = 2
487+
488+ // evaluateBehind is the pure lag decision. With no peers we can't observe progress,
489+ // so we report behind unless this node can finalize on its own (it's the sole
490+ // validator); a non-validator or a validator in a larger set genuinely needs peers.
491+ // Otherwise we report behind only when a majority of at least minCorroboratingPeers
492+ // connected peers report a height more than catchUpLagThreshold ahead of ours, so an
493+ // inflated height from a single peer (or a minority) can't drive the signal. A zero
494+ // threshold disables the height-lag check; the zero-peer rule still applies.
495+ func (conR * Reactor ) evaluateBehind (myHeight int64 , peerHeights []int64 , isSoleValidator bool ) bool {
496+ if len (peerHeights ) == 0 {
497+ return ! isSoleValidator
498+ }
499+ if conR .catchUpLagThreshold <= 0 || len (peerHeights ) < minCorroboratingPeers {
500+ return false
501+ }
502+ ahead := 0
503+ for _ , h := range peerHeights {
504+ if h - myHeight > conR .catchUpLagThreshold {
505+ ahead ++
506+ }
507+ }
508+ return 2 * ahead > len (peerHeights )
509+ }
510+
511+ // applyDebounceLocked requires the lag condition to hold for catchUpDebounce before
512+ // returning true; recovery to false is immediate. conR.mtx must be held.
513+ func (conR * Reactor ) applyDebounceLocked (raw bool , now time.Time ) bool {
514+ if ! raw {
515+ conR .behindSince = time.Time {}
516+ return false
517+ }
518+ if conR .behindSince .IsZero () {
519+ conR .behindSince = now
520+ }
521+ return now .Sub (conR .behindSince ) >= conR .catchUpDebounce
522+ }
523+
425524//--------------------------------------
426525
427526// subscribeToBroadcastEvents subscribes for new round steps and votes
@@ -1057,6 +1156,14 @@ func ReactorMetrics(metrics *Metrics) ReactorOption {
10571156 return func (conR * Reactor ) { conR .Metrics = metrics }
10581157}
10591158
1159+ // ReactorCatchupConfig configures the IsBehind heuristic that backs catching_up.
1160+ func ReactorCatchupConfig (lagThreshold int64 , debounce time.Duration ) ReactorOption {
1161+ return func (conR * Reactor ) {
1162+ conR .catchUpLagThreshold = lagThreshold
1163+ conR .catchUpDebounce = debounce
1164+ }
1165+ }
1166+
10601167//-----------------------------------------------------------------------------
10611168
10621169var (
0 commit comments