0xPolygon
diff --git a/‎config/config.go‎
Lines changed: 31 additions & 0 deletions b/‎config/config.go‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎config/config_test.go‎
Lines changed: 12 additions & 0 deletions b/‎config/config_test.go‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎config/toml.go‎
Lines changed: 11 additions & 0 deletions b/‎config/toml.go‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎consensus/reactor.go‎
Lines changed: 107 additions & 0 deletions b/‎consensus/reactor.go‎
Lines changed: 107 additions & 0 deletions
@@ -1030,6 +1030,26 @@ type ConsensusConfig struct {
 
 	// BlockTimeTolerance is the maximum allowed difference between the proposed block time and wall-clock time.
 	BlockTimeTolerance time.Duration `mapstructure:"block_time_tolerance"`
+
+	// CatchupLagThreshold is the lag a peer must exceed to count as reporting us behind
+	// (strictly more than this many blocks ahead); catching_up=true requires a majority of
+	// connected peers to be that far ahead. The reactor's WaitSync latch only reflects the
+	// initial block-sync at startup and stays false for the rest of the process, so without
+	// this check a node that later stops keeping up with its peers still reports
+	// catching_up=false. The comparison is against peer-reported heights rather than
+	// block-time staleness, so a network where every node has legitimately stopped at
+	// the same height is not misreported as catching up; peer heights are unverified
+	// gossip, so the lag must be corroborated by a majority of at least two connected
+	// peers, which keeps a single peer from driving the signal.
+	// 0 disables peer-height lag detection only; must be >=2 when enabled to absorb
+	// the normal one-height round skew between synced peers. The separate zero-peer
+	// rule (a node with no peers that is not the sole validator reports catching_up)
+	// always applies, independent of this threshold.
+	CatchupLagThreshold int64 `mapstructure:"catchup_lag_threshold"`
+	// CatchupDebounceDuration is how long the peer-lag condition must hold
+	// continuously before catching_up flips to true, damping flapping at the
+	// threshold boundary. The transition back to false is immediate.
+	CatchupDebounceDuration time.Duration `mapstructure:"catchup_debounce_duration"`
 }
 
 // DefaultConsensusConfig returns a default configuration for the consensus service
@@ -1050,6 +1070,8 @@ func DefaultConsensusConfig() *ConsensusConfig {
 		PeerQueryMaj23SleepDuration: 2000 * time.Millisecond,
 		DoubleSignCheckHeight:       int64(0),
 		BlockTimeTolerance:          60 * time.Second,
+		CatchupLagThreshold:         5,
+		CatchupDebounceDuration:     10 * time.Second,
 	}
 }
 
@@ -1155,6 +1177,15 @@ func (cfg *ConsensusConfig) ValidateBasic() error {
 	if cfg.BlockTimeTolerance <= 0 {
 		return errors.New("block_time_tolerance must be positive")
 	}
+	if cfg.CatchupLagThreshold < 0 {
+		return errors.New("catchup_lag_threshold can't be negative")
+	}
+	if cfg.CatchupLagThreshold == 1 {
+		return errors.New("catchup_lag_threshold must be 0 (disabled) or >=2 (margin beyond the one-height round skew)")
+	}
+	if cfg.CatchupDebounceDuration < 0 {
+		return errors.New("catchup_debounce_duration can't be negative")
+	}
 	return nil
 }
 
 
@@ -179,6 +179,12 @@ func TestConsensusConfig_ValidateBasic(t *testing.T) {
 		"BlockTimeTolerance":                   {func(c *config.ConsensusConfig) { c.BlockTimeTolerance = time.Second }, false},
 		"BlockTimeTolerance zero":              {func(c *config.ConsensusConfig) { c.BlockTimeTolerance = 0 }, true},
 		"BlockTimeTolerance negative":          {func(c *config.ConsensusConfig) { c.BlockTimeTolerance = -1 }, true},
+		"CatchupLagThreshold disabled":         {func(c *config.ConsensusConfig) { c.CatchupLagThreshold = 0 }, false},
+		"CatchupLagThreshold one":              {func(c *config.ConsensusConfig) { c.CatchupLagThreshold = 1 }, true},
+		"CatchupLagThreshold two":              {func(c *config.ConsensusConfig) { c.CatchupLagThreshold = 2 }, false},
+		"CatchupLagThreshold negative":         {func(c *config.ConsensusConfig) { c.CatchupLagThreshold = -1 }, true},
+		"CatchupDebounceDuration zero":         {func(c *config.ConsensusConfig) { c.CatchupDebounceDuration = 0 }, false},
+		"CatchupDebounceDuration negative":     {func(c *config.ConsensusConfig) { c.CatchupDebounceDuration = -1 }, true},
 	}
 	for desc, tc := range testcases {
 		t.Run(desc, func(t *testing.T) {
@@ -195,6 +201,12 @@ func TestConsensusConfig_ValidateBasic(t *testing.T) {
 	}
 }
 
+// TestDefaultConsensusConfigCatchupDefaults pins the catch-up values returned by
+// DefaultConsensusConfig: a lag threshold of 5 and a debounce of 10 seconds.
+func TestDefaultConsensusConfigCatchupDefaults(t *testing.T) {
+	defaults := config.DefaultConsensusConfig()
+
+	assert.Equal(t, int64(5), defaults.CatchupLagThreshold)
+	assert.Equal(t, 10*time.Second, defaults.CatchupDebounceDuration)
+}
+
 func TestInstrumentationConfigValidateBasic(t *testing.T) {
 	cfg := config.TestInstrumentationConfig()
 	assert.NoError(t, cfg.ValidateBasic())
 
@@ -519,6 +519,17 @@ peer_query_maj23_sleep_duration = "{{ .Consensus.PeerQueryMaj23SleepDuration }}"
 # Maximum allowed difference between proposed block time and wall-clock time.
 block_time_tolerance = "{{ .Consensus.BlockTimeTolerance }}"
 
+# After initial block-sync completes, report catching_up=true when a majority of the
+# connected peers (at least two) are more than this many blocks ahead of us (i.e. this
+# node has stopped keeping up). Set to 0 to disable peer-height lag detection only;
+# must be >=2 when enabled to absorb normal round skew. A node with no peers that is
+# not the sole validator still reports catching_up regardless of this setting.
+catchup_lag_threshold = {{ .Consensus.CatchupLagThreshold }}
+
+# How long the peer-lag condition must hold before catching_up flips to true
+# (flap damping). The transition back to false is immediate.
+catchup_debounce_duration = "{{ .Consensus.CatchupDebounceDuration }}"
+
 #######################################################
 ###         Storage Configuration Options           ###
 #######################################################
 
@@ -46,6 +46,14 @@ type Reactor struct {
 	eventBus *types.EventBus
 	rs       *cstypes.RoundState
 
+	// catchUpLagThreshold and catchUpDebounce drive IsBehind, which lets /status
+	// report catching_up=true after initial sync when this node has fallen behind
+	// live peers. behindSince tracks how long the lag condition has held
+	// continuously, for debouncing.
+	catchUpLagThreshold int64
+	catchUpDebounce     time.Duration
+	behindSince         time.Time
+
 	Metrics *Metrics
 }
 
@@ -422,6 +430,97 @@ func (conR *Reactor) WaitSync() bool {
 	return conR.waitSync
 }
 
+// IsBehind reports whether this node currently appears to have fallen behind its
+// peers after the initial block-sync. WaitSync cannot express that: it only covers
+// block-sync at startup and stays false for the rest of the process, so on its own
+// catching_up never reflects a node that later stops keeping up.
+//
+// The result is the undebounced decision from isBehindRaw passed through
+// applyDebounceLocked: true is returned only once the lag has held for the
+// configured debounce period, measured from the first call that observed it, and
+// false is returned as soon as a call observes no lag. Each call updates that
+// debounce bookkeeping, so the answer depends on earlier calls.
+//
+// Heights are compared, rather than the age of the latest local block, on purpose:
+// a stale block time looks the same whether this node is behind (some peer reports
+// a greater height) or every node has legitimately stopped at the same height (no
+// peer is ahead). Only the former is "catching up".
+func (conR *Reactor) IsBehind() bool {
+	// Take the raw sample first; conR.mtx is only needed for the debounce state.
+	lagging := conR.isBehindRaw()
+
+	conR.mtx.Lock()
+	defer conR.mtx.Unlock()
+
+	now := time.Now()
+	return conR.applyDebounceLocked(lagging, now)
+}
+
+// isBehindRaw takes one undebounced sample of the lag decision: it reads the peers'
+// heights, the local height and the sole-validator status, then hands all three to
+// evaluateBehind. It does not acquire conR.mtx itself.
+func (conR *Reactor) isBehindRaw() bool {
+	heights := collectPeerHeights(conR.Switch.Peers().List())
+	localHeight := conR.getRoundState().Height
+
+	// Sole-validator status is queried from consensus state on every call instead of
+	// being cached here, so it follows validator-set changes this node has committed
+	// into local round state. (A node partitioned before a set change can't observe
+	// the other side's update, but that only makes it report behind, which is
+	// correct.)
+	soleValidator := conR.conS.isLocalSoleValidator()
+
+	return conR.evaluateBehind(localHeight, heights, soleValidator)
+}
+
+// collectPeerHeights extracts one height, via PeerState.GetHeight, for each peer
+// whose types.PeerStateKey entry is a *PeerState. A peer whose entry is missing or
+// has another type contributes nothing — it is not recorded as height 0 — so it
+// does not count toward the total that evaluateBehind takes its majority over.
+// The returned slice is never nil and holds at most len(peers) heights.
+func collectPeerHeights(peers []p2p.Peer) []int64 {
+	collected := make([]int64, 0, len(peers))
+	for i := range peers {
+		if state, isPeerState := peers[i].Get(types.PeerStateKey).(*PeerState); isPeerState {
+			collected = append(collected, state.GetHeight())
+		}
+	}
+	return collected
+}
+
+// minCorroboratingPeers is the fewest peer heights evaluateBehind requires before
+// peer-height lag is trusted. Peer heights come from unverified gossip
+// (NewRoundStepMessage), and with a single peer that peer is a trivial "majority";
+// requiring at least two means no single peer can drive catching_up on its own.
+// Below this we can't corroborate, so the height-lag check abstains (the zero-peer
+// rule still covers the no-peer case).
+//
+// The minimum is compared against the heights actually collected, so a connected
+// peer that collectPeerHeights skips (no consensus PeerState) does not count.
+const minCorroboratingPeers = 2
+
+// evaluateBehind is the pure lag decision. With no peers we can't observe progress,
+// so we report behind unless this node can finalize on its own (it's the sole
+// validator); a non-validator or a validator in a larger set genuinely needs peers.
+// Otherwise we report behind only when a majority of at least minCorroboratingPeers
+// connected peers report a height more than catchUpLagThreshold ahead of ours, so an
+// inflated height from a single peer (or a minority) can't drive the signal. A zero
+// threshold disables the height-lag check; the zero-peer rule still applies.
+func (conR *Reactor) evaluateBehind(myHeight int64, peerHeights []int64, isSoleValidator bool) bool {
+	if len(peerHeights) == 0 {
+		return !isSoleValidator
+	}
+	if conR.catchUpLagThreshold <= 0 || len(peerHeights) < minCorroboratingPeers {
+		return false
+	}
+	ahead := 0
+	for _, h := range peerHeights {
+		if h-myHeight > conR.catchUpLagThreshold {
+			ahead++
+		}
+	}
+	return 2*ahead > len(peerHeights)
+}
+
+// applyDebounceLocked turns a raw lag sample into the debounced result. While raw
+// stays true it keeps, in behindSince, the time of the first such sample and
+// returns true once now is at least catchUpDebounce past it. A false sample clears
+// behindSince and returns false immediately. conR.mtx must be held by the caller.
+func (conR *Reactor) applyDebounceLocked(raw bool, now time.Time) bool {
+	if raw {
+		// Start the streak on the first lagging sample; later samples leave it alone.
+		if conR.behindSince.IsZero() {
+			conR.behindSince = now
+		}
+		held := now.Sub(conR.behindSince)
+		return held >= conR.catchUpDebounce
+	}
+
+	// Recovery is immediate and also resets the streak.
+	conR.behindSince = time.Time{}
+	return false
+}
+
 //--------------------------------------
 
 // subscribeToBroadcastEvents subscribes for new round steps and votes
@@ -1057,6 +1156,14 @@ func ReactorMetrics(metrics *Metrics) ReactorOption {
 	return func(conR *Reactor) { conR.Metrics = metrics }
 }
 
+// ReactorCatchupConfig returns an option that sets the lag threshold and debounce
+// duration used by the IsBehind heuristic that backs catching_up. Both values are
+// stored as given; no validation happens here.
+func ReactorCatchupConfig(lagThreshold int64, debounce time.Duration) ReactorOption {
+	return func(reactor *Reactor) {
+		reactor.catchUpLagThreshold, reactor.catchUpDebounce = lagThreshold, debounce
+	}
+}
+
 //-----------------------------------------------------------------------------
 
 var (