From e39478be8ef65234c956e5b963dfbc1a5ceee73c Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Tue, 2 Jun 2026 18:18:47 +0530 Subject: [PATCH 1/6] feat(consensus): report catching_up when fallen behind peers The catching_up field in /status is derived solely from the consensus reactor's WaitSync() latch. WaitSync is set once at startup and cleared the first time the node switches from block-sync to the consensus reactor, after which it stays false for the remainder of the process. As a result the field only ever reflects the initial block-sync and never reflects a node that later stops making progress relative to its peers, so such a node keeps reporting catching_up=false while operators and downstream consumers treat it as fully synced. Add Reactor.IsBehind(), evaluated from peer-reported heights, and OR it into catching_up: CatchingUp = WaitSync() || IsBehind() IsBehind reports true when a connected peer is more than catchup_lag_threshold blocks ahead of our consensus height, or when the connected-peer count is below min_expected_peers. The decision uses peer heights rather than local block-time staleness on purpose: staleness cannot distinguish a node that is behind its peers (some peer is ahead) from a network in which every node has legitimately stopped at the same height (no peer is ahead). Only the former is catching up; reporting the latter as catching up would be a false positive. The lag condition must hold continuously for catchup_debounce_duration before catching_up flips to true, which damps flapping at the threshold boundary; the transition back to false is immediate. New [consensus] config (with defaults): - catchup_lag_threshold = 5 (0 disables; must be >=2 to absorb the one-height round skew between synced peers) - min_expected_peers = 0 (off by default so single-node deployments stay healthy) - catchup_debounce_duration = 10s WaitSync is left untouched: it gates consensus message handling, so IsBehind is an independent read-only signal rather than a re-arming of that latch. The offline inspect server's reactor stub returns IsBehind()=false. Tests cover the lag decision (peer ahead beyond/at/just over the threshold, equal-height, one-height skew, no peer height learned, threshold disabled, single-node zero-peer, and the min-peers guard on/off) and the debounce state machine. Co-Authored-By: Claude Opus 4.8 (1M context) --- config/config.go | 34 +++++++++++++ config/toml.go | 13 +++++ consensus/reactor.go | 82 +++++++++++++++++++++++++++++++ consensus/reactor_catchup_test.go | 81 ++++++++++++++++++++++++++++++ inspect/rpc/rpc.go | 4 ++ node/setup.go | 7 ++- rpc/core/env.go | 1 + rpc/core/status.go | 2 +- 8 files changed, 222 insertions(+), 2 deletions(-) create mode 100644 consensus/reactor_catchup_test.go diff --git a/config/config.go b/config/config.go index 6b5b4faa963..5d7c36efe5a 100644 --- a/config/config.go +++ b/config/config.go @@ -1030,6 +1030,25 @@ type ConsensusConfig struct { // BlockTimeTolerance is the maximum allowed difference between the proposed block time and wall-clock time. BlockTimeTolerance time.Duration `mapstructure:"block_time_tolerance"` + + // CatchupLagThreshold is how many blocks a peer may be ahead of us before we + // report catching_up=true. The reactor's WaitSync latch only reflects the + // initial block-sync at startup and stays false for the rest of the process, so + // without this check a node that later stops keeping up with its peers still + // reports catching_up=false. The comparison is against peer-reported heights + // rather than block-time staleness, so a network where every node has + // legitimately stopped at the same height is not misreported as catching up. + // 0 disables the check; must be >=2 when enabled to absorb the normal one-height + // round skew between synced peers. + CatchupLagThreshold int64 `mapstructure:"catchup_lag_threshold"` + // MinExpectedPeers, when >0, reports catching_up=true while connected peers are + // below this count, since a node that cannot reach enough peers cannot establish + // that it is current. 0 keeps single-node deployments healthy. + MinExpectedPeers int `mapstructure:"min_expected_peers"` + // CatchupDebounceDuration is how long the peer-lag condition must hold + // continuously before catching_up flips to true, damping flapping at the + // threshold boundary. The transition back to false is immediate. + CatchupDebounceDuration time.Duration `mapstructure:"catchup_debounce_duration"` } // DefaultConsensusConfig returns a default configuration for the consensus service @@ -1050,6 +1069,9 @@ func DefaultConsensusConfig() *ConsensusConfig { PeerQueryMaj23SleepDuration: 2000 * time.Millisecond, DoubleSignCheckHeight: int64(0), BlockTimeTolerance: 60 * time.Second, + CatchupLagThreshold: 5, + MinExpectedPeers: 0, + CatchupDebounceDuration: 10 * time.Second, } } @@ -1155,6 +1177,18 @@ func (cfg *ConsensusConfig) ValidateBasic() error { if cfg.BlockTimeTolerance <= 0 { return errors.New("block_time_tolerance must be positive") } + if cfg.CatchupLagThreshold < 0 { + return errors.New("catchup_lag_threshold can't be negative") + } + if cfg.CatchupLagThreshold == 1 { + return errors.New("catchup_lag_threshold must be 0 (disabled) or >=2 to absorb round skew") + } + if cfg.MinExpectedPeers < 0 { + return errors.New("min_expected_peers can't be negative") + } + if cfg.CatchupDebounceDuration < 0 { + return errors.New("catchup_debounce_duration can't be negative") + } return nil } diff --git a/config/toml.go b/config/toml.go index 6bb6fb92a77..575cf4f7b4a 100644 --- a/config/toml.go +++ b/config/toml.go @@ -519,6 +519,19 @@ peer_query_maj23_sleep_duration = "{{ .Consensus.PeerQueryMaj23SleepDuration }}" # Maximum allowed difference between proposed block time and wall-clock time. block_time_tolerance = "{{ .Consensus.BlockTimeTolerance }}" +# After initial block-sync completes, report catching_up=true when a peer is more +# than this many blocks ahead of us (i.e. this node has stopped keeping up). Set to +# 0 to disable; must be >=2 when enabled to absorb normal round skew. +catchup_lag_threshold = {{ .Consensus.CatchupLagThreshold }} + +# When >0, report catching_up=true while connected peers are below this count, since +# the node cannot establish that it is current. Keep at 0 for single-node deployments. +min_expected_peers = {{ .Consensus.MinExpectedPeers }} + +# How long the peer-lag condition must hold before catching_up flips to true +# (flap damping). The transition back to false is immediate. +catchup_debounce_duration = "{{ .Consensus.CatchupDebounceDuration }}" + ####################################################### ### Storage Configuration Options ### ####################################################### diff --git a/consensus/reactor.go b/consensus/reactor.go index ac4c9bc95f9..b09297d5e7e 100644 --- a/consensus/reactor.go +++ b/consensus/reactor.go @@ -46,6 +46,15 @@ type Reactor struct { eventBus *types.EventBus rs *cstypes.RoundState + // catchUpLagThreshold, minExpectedPeers and catchUpDebounce drive IsBehind, + // which lets /status report catching_up=true after initial sync when this node + // has fallen behind live peers. behindSince tracks how long the lag condition + // has held continuously, for debouncing. + catchUpLagThreshold int64 + minExpectedPeers int + catchUpDebounce time.Duration + behindSince time.Time + Metrics *Metrics } @@ -422,6 +431,70 @@ func (conR *Reactor) WaitSync() bool { return conR.waitSync } +// IsBehind reports whether this node has stopped keeping up with its peers after +// the initial block-sync has completed. WaitSync only reflects startup block-sync +// and stays false for the rest of the process, so on its own catching_up never +// reflects a node that later falls behind. IsBehind closes that gap from +// peer-reported heights. +// +// It compares heights rather than block-time staleness on purpose: a stale local +// block time can't distinguish a node that is behind its peers (some peer reports a +// greater height) from a network where every node has legitimately stopped at the +// same height (no peer is ahead). Only the former is "catching up"; reporting the +// latter as catching up would be a false positive. +func (conR *Reactor) IsBehind() bool { + raw := conR.isBehindRaw() + + conR.mtx.Lock() + defer conR.mtx.Unlock() + return conR.applyDebounceLocked(raw, time.Now()) +} + +// isBehindRaw gathers the local and max peer heights and applies the lag decision, +// without debouncing. It holds no lock while reading peers / round state. +func (conR *Reactor) isBehindRaw() bool { + peers := conR.Switch.Peers().List() + + var maxPeerHeight int64 + for _, peer := range peers { + ps, ok := peer.Get(types.PeerStateKey).(*PeerState) + if !ok { + continue + } + if h := ps.GetHeight(); h > maxPeerHeight { + maxPeerHeight = h + } + } + + return conR.evaluateBehind(conR.getRoundState().Height, maxPeerHeight, len(peers)) +} + +// evaluateBehind is the pure lag decision. Too few peers means we can't prove we're +// current (off by default so single-node stays healthy). A zero threshold disables +// the height check. maxPeerHeight == 0 means no peer height learned yet. +func (conR *Reactor) evaluateBehind(myHeight, maxPeerHeight int64, nPeers int) bool { + if conR.minExpectedPeers > 0 && nPeers < conR.minExpectedPeers { + return true + } + if conR.catchUpLagThreshold <= 0 || maxPeerHeight == 0 { + return false + } + return maxPeerHeight-myHeight > conR.catchUpLagThreshold +} + +// applyDebounceLocked requires the lag condition to hold for catchUpDebounce before +// returning true; recovery to false is immediate. conR.mtx must be held. +func (conR *Reactor) applyDebounceLocked(raw bool, now time.Time) bool { + if !raw { + conR.behindSince = time.Time{} + return false + } + if conR.behindSince.IsZero() { + conR.behindSince = now + } + return now.Sub(conR.behindSince) >= conR.catchUpDebounce +} + //-------------------------------------- // subscribeToBroadcastEvents subscribes for new round steps and votes @@ -1057,6 +1130,15 @@ func ReactorMetrics(metrics *Metrics) ReactorOption { return func(conR *Reactor) { conR.Metrics = metrics } } +// ReactorCatchupConfig configures the IsBehind heuristic that backs catching_up. +func ReactorCatchupConfig(lagThreshold int64, minPeers int, debounce time.Duration) ReactorOption { + return func(conR *Reactor) { + conR.catchUpLagThreshold = lagThreshold + conR.minExpectedPeers = minPeers + conR.catchUpDebounce = debounce + } +} + //----------------------------------------------------------------------------- var ( diff --git a/consensus/reactor_catchup_test.go b/consensus/reactor_catchup_test.go new file mode 100644 index 00000000000..407570e8806 --- /dev/null +++ b/consensus/reactor_catchup_test.go @@ -0,0 +1,81 @@ +package consensus + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestEvaluateBehind(t *testing.T) { + tests := []struct { + name string + lagThreshold int64 + minPeers int + myHeight int64 + maxPeerHeight int64 + nPeers int + want bool + }{ + {"peer far ahead", 5, 0, 100, 110, 3, true}, + {"peer ahead within threshold", 5, 0, 100, 105, 3, false}, + {"peer exactly at threshold", 5, 0, 100, 105, 3, false}, + {"peer one over threshold", 5, 0, 100, 106, 3, true}, + {"equal height network halt", 5, 0, 100, 100, 3, false}, + {"round skew peer one ahead", 5, 0, 100, 101, 3, false}, + {"no peer height learned", 5, 0, 100, 0, 3, false}, + {"threshold disabled ignores lag", 0, 0, 100, 999, 3, false}, + {"single node zero peers healthy", 5, 0, 100, 0, 0, false}, + {"min peers guard off with zero peers", 5, 0, 100, 100, 0, false}, + {"min peers guard trips below min", 5, 2, 100, 100, 1, true}, + {"min peers guard satisfied", 5, 2, 100, 100, 2, false}, + {"min peers guard fires even when lag disabled", 0, 2, 100, 0, 0, true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + conR := &Reactor{ + catchUpLagThreshold: tt.lagThreshold, + minExpectedPeers: tt.minPeers, + } + got := conR.evaluateBehind(tt.myHeight, tt.maxPeerHeight, tt.nPeers) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestApplyDebounce(t *testing.T) { + debounce := 10 * time.Second + t0 := time.Now() + + t.Run("not behind resets and returns false", func(t *testing.T) { + conR := &Reactor{catchUpDebounce: debounce, behindSince: t0} + assert.False(t, conR.applyDebounceLocked(false, t0.Add(time.Hour))) + assert.True(t, conR.behindSince.IsZero()) + }) + + t.Run("behind but within debounce stays false", func(t *testing.T) { + conR := &Reactor{catchUpDebounce: debounce} + assert.False(t, conR.applyDebounceLocked(true, t0)) + assert.False(t, conR.applyDebounceLocked(true, t0.Add(5*time.Second))) + }) + + t.Run("behind past debounce flips true", func(t *testing.T) { + conR := &Reactor{catchUpDebounce: debounce} + assert.False(t, conR.applyDebounceLocked(true, t0)) + assert.True(t, conR.applyDebounceLocked(true, t0.Add(debounce))) + }) + + t.Run("transient blip resets the timer", func(t *testing.T) { + conR := &Reactor{catchUpDebounce: debounce} + assert.False(t, conR.applyDebounceLocked(true, t0)) + assert.False(t, conR.applyDebounceLocked(false, t0.Add(5*time.Second))) + // timer restarts; not yet past debounce from the new start. + assert.False(t, conR.applyDebounceLocked(true, t0.Add(6*time.Second))) + assert.True(t, conR.applyDebounceLocked(true, t0.Add(16*time.Second))) + }) + + t.Run("zero debounce flips immediately", func(t *testing.T) { + conR := &Reactor{catchUpDebounce: 0} + assert.True(t, conR.applyDebounceLocked(true, t0)) + }) +} diff --git a/inspect/rpc/rpc.go b/inspect/rpc/rpc.go index 4367ade2d59..85bfa57cbae 100644 --- a/inspect/rpc/rpc.go +++ b/inspect/rpc/rpc.go @@ -86,6 +86,10 @@ func (waitSyncCheckerImpl) WaitSync() bool { return false } +func (waitSyncCheckerImpl) IsBehind() bool { + return false +} + // ListenAndServe listens on the address specified in srv.Addr and handles any // incoming requests over HTTP using the Inspector rpc handler specified on the server. func (srv *Server) ListenAndServe(ctx context.Context) error { diff --git a/node/setup.go b/node/setup.go index df5e316254c..b62f63a7af9 100644 --- a/node/setup.go +++ b/node/setup.go @@ -331,7 +331,12 @@ func createConsensusReactor(config *cfg.Config, if privValidator != nil { consensusState.SetPrivValidator(privValidator) } - consensusReactor := cs.NewReactor(consensusState, waitSync, cs.ReactorMetrics(csMetrics)) + consensusReactor := cs.NewReactor(consensusState, waitSync, cs.ReactorMetrics(csMetrics), + cs.ReactorCatchupConfig( + config.Consensus.CatchupLagThreshold, + config.Consensus.MinExpectedPeers, + config.Consensus.CatchupDebounceDuration, + )) consensusReactor.SetLogger(consensusLogger) // services which will be publishing and/or subscribing for messages (events) // consensusReactor will set it on consensusState and blockExecutor diff --git a/rpc/core/env.go b/rpc/core/env.go index 9dc27b7c131..cbad670ee55 100644 --- a/rpc/core/env.go +++ b/rpc/core/env.go @@ -59,6 +59,7 @@ type peers interface { type consensusReactor interface { WaitSync() bool + IsBehind() bool } // ---------------------------------------------- diff --git a/rpc/core/status.go b/rpc/core/status.go index 5e3d6d1892e..78ffa7e6c7f 100644 --- a/rpc/core/status.go +++ b/rpc/core/status.go @@ -62,7 +62,7 @@ func (env *Environment) Status(*rpctypes.Context) (*ctypes.ResultStatus, error) EarliestAppHash: earliestAppHash, EarliestBlockHeight: earliestBlockHeight, EarliestBlockTime: time.Unix(0, earliestBlockTimeNano), - CatchingUp: env.ConsensusReactor.WaitSync(), + CatchingUp: env.ConsensusReactor.WaitSync() || env.ConsensusReactor.IsBehind(), }, ValidatorInfo: ctypes.ValidatorInfo{ Address: env.PubKey.Address(), From 1c7cc68fb624b09ce3840aca0201bd63a43c8c71 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Thu, 4 Jun 2026 09:44:45 +0530 Subject: [PATCH 2/6] feat(consensus): derive zero-peer catching_up from sole-validator status A node that has finished initial block-sync but later ends up with no peers cannot observe whether the network has advanced past its height. The earlier approach gated this on a `min_expected_peers` config value (default 0), which had to be raised per network to have any effect. That encoded network-specific assumptions in configuration, did not reach already-initialized nodes on a binary upgrade (the key is absent from their config.toml), and could not be exercised on a local devnet whose chain id differs from the production networks. Replace the config knob with a self-determining rule: a node with zero peers reports catching_up=true unless it is the sole validator, in which case it can finalize blocks on its own and needs no peers to make progress. Sole-validator status uses the same local-validator membership test as the signing path (Validators.HasAddress on the private validator's address) and is read live from consensus state on each call, so it follows validator-set changes the node has committed into local round state without any cached value or operator action. A non-validator node has no private validator and so always needs peers, and is reported accordingly. This requires no configuration and is network-agnostic: the behavior is identical on mainnet, testnet, and local devnets, and ships entirely in the binary with no per-network value to deploy. Details: - consensus/reactor.go: evaluateBehind takes an isSoleValidator bool instead of a peer-count threshold; isBehindRaw sources it from State.isLocalSoleValidator. - consensus/state.go: add isLocalSoleValidator, read under the state lock. - config: remove the MinExpectedPeers field, its default, its ValidateBasic check, and the config.toml template entry. ReactorCatchupConfig and its node wiring drop the corresponding argument. - config: clarify that catchup_lag_threshold = 0 disables peer-height lag detection only; the zero-peer rule applies independently of it. - consensus/reactor_catchup_test.go: table updated for the sole-validator input. Co-Authored-By: Claude Opus 4.8 (1M context) --- config/config.go | 14 ++++------- config/toml.go | 8 +++---- consensus/reactor.go | 33 ++++++++++++++----------- consensus/reactor_catchup_test.go | 40 ++++++++++++++----------------- consensus/state.go | 14 +++++++++++ node/setup.go | 1 - 6 files changed, 58 insertions(+), 52 deletions(-) diff --git a/config/config.go b/config/config.go index 5d7c36efe5a..beaa053a149 100644 --- a/config/config.go +++ b/config/config.go @@ -1038,13 +1038,11 @@ type ConsensusConfig struct { // reports catching_up=false. The comparison is against peer-reported heights // rather than block-time staleness, so a network where every node has // legitimately stopped at the same height is not misreported as catching up. - // 0 disables the check; must be >=2 when enabled to absorb the normal one-height - // round skew between synced peers. + // 0 disables peer-height lag detection only; must be >=2 when enabled to absorb + // the normal one-height round skew between synced peers. The separate zero-peer + // rule (a node with no peers in a multi-validator network reports catching_up) + // always applies, independent of this threshold. CatchupLagThreshold int64 `mapstructure:"catchup_lag_threshold"` - // MinExpectedPeers, when >0, reports catching_up=true while connected peers are - // below this count, since a node that cannot reach enough peers cannot establish - // that it is current. 0 keeps single-node deployments healthy. - MinExpectedPeers int `mapstructure:"min_expected_peers"` // CatchupDebounceDuration is how long the peer-lag condition must hold // continuously before catching_up flips to true, damping flapping at the // threshold boundary. The transition back to false is immediate. @@ -1070,7 +1068,6 @@ func DefaultConsensusConfig() *ConsensusConfig { DoubleSignCheckHeight: int64(0), BlockTimeTolerance: 60 * time.Second, CatchupLagThreshold: 5, - MinExpectedPeers: 0, CatchupDebounceDuration: 10 * time.Second, } } @@ -1183,9 +1180,6 @@ func (cfg *ConsensusConfig) ValidateBasic() error { if cfg.CatchupLagThreshold == 1 { return errors.New("catchup_lag_threshold must be 0 (disabled) or >=2 to absorb round skew") } - if cfg.MinExpectedPeers < 0 { - return errors.New("min_expected_peers can't be negative") - } if cfg.CatchupDebounceDuration < 0 { return errors.New("catchup_debounce_duration can't be negative") } diff --git a/config/toml.go b/config/toml.go index 575cf4f7b4a..c0723b21c41 100644 --- a/config/toml.go +++ b/config/toml.go @@ -521,13 +521,11 @@ block_time_tolerance = "{{ .Consensus.BlockTimeTolerance }}" # After initial block-sync completes, report catching_up=true when a peer is more # than this many blocks ahead of us (i.e. this node has stopped keeping up). Set to -# 0 to disable; must be >=2 when enabled to absorb normal round skew. +# 0 to disable peer-height lag detection only; must be >=2 when enabled to absorb +# normal round skew. A node with no peers in a multi-validator network still reports +# catching_up regardless of this setting. catchup_lag_threshold = {{ .Consensus.CatchupLagThreshold }} -# When >0, report catching_up=true while connected peers are below this count, since -# the node cannot establish that it is current. Keep at 0 for single-node deployments. -min_expected_peers = {{ .Consensus.MinExpectedPeers }} - # How long the peer-lag condition must hold before catching_up flips to true # (flap damping). The transition back to false is immediate. catchup_debounce_duration = "{{ .Consensus.CatchupDebounceDuration }}" diff --git a/consensus/reactor.go b/consensus/reactor.go index b09297d5e7e..aad146b3d10 100644 --- a/consensus/reactor.go +++ b/consensus/reactor.go @@ -46,12 +46,11 @@ type Reactor struct { eventBus *types.EventBus rs *cstypes.RoundState - // catchUpLagThreshold, minExpectedPeers and catchUpDebounce drive IsBehind, - // which lets /status report catching_up=true after initial sync when this node - // has fallen behind live peers. behindSince tracks how long the lag condition - // has held continuously, for debouncing. + // catchUpLagThreshold and catchUpDebounce drive IsBehind, which lets /status + // report catching_up=true after initial sync when this node has fallen behind + // live peers. behindSince tracks how long the lag condition has held + // continuously, for debouncing. catchUpLagThreshold int64 - minExpectedPeers int catchUpDebounce time.Duration behindSince time.Time @@ -466,15 +465,22 @@ func (conR *Reactor) isBehindRaw() bool { } } - return conR.evaluateBehind(conR.getRoundState().Height, maxPeerHeight, len(peers)) + // Sole-validator status is read live from consensus state on every call, so it + // reflects validator-set changes this node has committed into local round state + // without any cached value to update. (A node partitioned before a set change + // can't observe the other side's update, but that only makes it report behind, + // which is correct.) + return conR.evaluateBehind(conR.getRoundState().Height, maxPeerHeight, len(peers), conR.conS.isLocalSoleValidator()) } -// evaluateBehind is the pure lag decision. Too few peers means we can't prove we're -// current (off by default so single-node stays healthy). A zero threshold disables -// the height check. maxPeerHeight == 0 means no peer height learned yet. -func (conR *Reactor) evaluateBehind(myHeight, maxPeerHeight int64, nPeers int) bool { - if conR.minExpectedPeers > 0 && nPeers < conR.minExpectedPeers { - return true +// evaluateBehind is the pure lag decision. With no peers we can't observe progress, +// so we report behind unless this node can finalize on its own (it's the sole +// validator); a non-validator or a validator in a larger set genuinely needs peers. +// A zero threshold disables the height-lag check (the zero-peer rule still applies). +// maxPeerHeight == 0 means no peer height learned yet. +func (conR *Reactor) evaluateBehind(myHeight, maxPeerHeight int64, nPeers int, isSoleValidator bool) bool { + if nPeers == 0 { + return !isSoleValidator } if conR.catchUpLagThreshold <= 0 || maxPeerHeight == 0 { return false @@ -1131,10 +1137,9 @@ func ReactorMetrics(metrics *Metrics) ReactorOption { } // ReactorCatchupConfig configures the IsBehind heuristic that backs catching_up. -func ReactorCatchupConfig(lagThreshold int64, minPeers int, debounce time.Duration) ReactorOption { +func ReactorCatchupConfig(lagThreshold int64, debounce time.Duration) ReactorOption { return func(conR *Reactor) { conR.catchUpLagThreshold = lagThreshold - conR.minExpectedPeers = minPeers conR.catchUpDebounce = debounce } } diff --git a/consensus/reactor_catchup_test.go b/consensus/reactor_catchup_test.go index 407570e8806..ea01740e61c 100644 --- a/consensus/reactor_catchup_test.go +++ b/consensus/reactor_catchup_test.go @@ -9,35 +9,31 @@ import ( func TestEvaluateBehind(t *testing.T) { tests := []struct { - name string - lagThreshold int64 - minPeers int - myHeight int64 - maxPeerHeight int64 - nPeers int - want bool + name string + lagThreshold int64 + myHeight int64 + maxPeerHeight int64 + nPeers int + isSoleValidator bool + want bool }{ - {"peer far ahead", 5, 0, 100, 110, 3, true}, - {"peer ahead within threshold", 5, 0, 100, 105, 3, false}, - {"peer exactly at threshold", 5, 0, 100, 105, 3, false}, - {"peer one over threshold", 5, 0, 100, 106, 3, true}, - {"equal height network halt", 5, 0, 100, 100, 3, false}, - {"round skew peer one ahead", 5, 0, 100, 101, 3, false}, - {"no peer height learned", 5, 0, 100, 0, 3, false}, - {"threshold disabled ignores lag", 0, 0, 100, 999, 3, false}, - {"single node zero peers healthy", 5, 0, 100, 0, 0, false}, - {"min peers guard off with zero peers", 5, 0, 100, 100, 0, false}, - {"min peers guard trips below min", 5, 2, 100, 100, 1, true}, - {"min peers guard satisfied", 5, 2, 100, 100, 2, false}, - {"min peers guard fires even when lag disabled", 0, 2, 100, 0, 0, true}, + {"peer far ahead", 5, 100, 110, 3, false, true}, + {"peer ahead within threshold", 5, 100, 105, 3, false, false}, + {"peer one over threshold", 5, 100, 106, 3, false, true}, + {"equal height network halt", 5, 100, 100, 3, false, false}, + {"round skew peer one ahead", 5, 100, 101, 3, false, false}, + {"no peer height learned", 5, 100, 0, 3, false, false}, + {"threshold disabled ignores lag", 0, 100, 999, 3, false, false}, + {"sole validator zero peers healthy", 5, 100, 0, 0, true, false}, + {"non-sole zero peers behind", 5, 100, 0, 0, false, true}, + {"zero peers behind even when lag disabled", 0, 100, 0, 0, false, true}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { conR := &Reactor{ catchUpLagThreshold: tt.lagThreshold, - minExpectedPeers: tt.minPeers, } - got := conR.evaluateBehind(tt.myHeight, tt.maxPeerHeight, tt.nPeers) + got := conR.evaluateBehind(tt.myHeight, tt.maxPeerHeight, tt.nPeers, tt.isSoleValidator) assert.Equal(t, tt.want, got) }) } diff --git a/consensus/state.go b/consensus/state.go index daadbd17e5a..d530ec88bd3 100644 --- a/consensus/state.go +++ b/consensus/state.go @@ -2727,6 +2727,20 @@ func (cs *State) updatePrivValidatorPubKey() error { return nil } +// isLocalSoleValidator reports whether this node is the only validator in the +// current set, i.e. it can finalize blocks on its own and so does not need peers +// to make progress. Uses the same local-validator membership test as the signing +// path (Validators.HasAddress on the private validator's address), so a +// non-validator full node always returns false. +func (cs *State) isLocalSoleValidator() bool { + cs.mtx.RLock() + defer cs.mtx.RUnlock() + if cs.privValidatorPubKey == nil || cs.Validators == nil { + return false + } + return cs.Validators.Size() == 1 && cs.Validators.HasAddress(cs.privValidatorPubKey.Address()) +} + // look back to check existence of the node's consensus votes before joining consensus func (cs *State) checkDoubleSigningRisk(height int64) error { if cs.privValidator != nil && cs.privValidatorPubKey != nil && cs.config.DoubleSignCheckHeight > 0 && height > 0 { diff --git a/node/setup.go b/node/setup.go index b62f63a7af9..558f20ee7b2 100644 --- a/node/setup.go +++ b/node/setup.go @@ -334,7 +334,6 @@ func createConsensusReactor(config *cfg.Config, consensusReactor := cs.NewReactor(consensusState, waitSync, cs.ReactorMetrics(csMetrics), cs.ReactorCatchupConfig( config.Consensus.CatchupLagThreshold, - config.Consensus.MinExpectedPeers, config.Consensus.CatchupDebounceDuration, )) consensusReactor.SetLogger(consensusLogger) From c90aa7fa191e5ac42484fb39d2be6ce54f095ab5 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Fri, 5 Jun 2026 14:49:19 +0530 Subject: [PATCH 3/6] fix(consensus): require a majority of peers to corroborate catching_up lag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IsBehind() previously compared our height to the single highest peer height. Peer heights come from NewRoundStepMessage gossip, which is only shape-checked by ValidateBasic and is not proof a peer has committed or verified blocks at that height, so one peer reporting an inflated height could push catching_up=true on a healthy node. The signal is fail-safe for safety (it can only over-report "catching up", never forge progress or admit invalid blocks), but it feeds readiness/health probes and the execution client's sync gate, so a false positive is an availability/liveness concern. Decide the lag from a majority of connected peers instead of the maximum, and require at least minCorroboratingPeers (2) of them before trusting the signal: with a single peer that peer is a trivial majority, so a lone peer can no longer drive the field. Genuine lag is unaffected — a node that has truly fallen behind sees a majority of its peers ahead. A node with fewer than two peers abstains from the height-lag check; the separate zero-peer / sole-validator rule still covers the no-peer case. The residual is an attacker controlling a majority of a node's direct peers, which is the eclipse threat that already underlies p2p; a verified or voting-power-weighted height would be stronger but amounts to re-deriving block-sync and is out of scope here. Details: - consensus/reactor.go: evaluateBehind takes the per-peer heights and trips only when 2*ahead > len(peerHeights) with len >= minCorroboratingPeers; isBehindRaw collects the per-peer heights. - config: clarify that catchup_lag_threshold gates a majority of at least two peers, and that 0 disables peer-height lag detection only. - consensus/reactor_catchup_test.go: cover single-peer-minority, two-peer split, both-of-two ahead, and lone-peer-cannot-corroborate. Co-Authored-By: Claude Opus 4.8 (1M context) --- config/config.go | 19 ++++++++++------- config/toml.go | 10 ++++----- consensus/reactor.go | 35 +++++++++++++++++++++---------- consensus/reactor_catchup_test.go | 31 +++++++++++++++------------ 4 files changed, 58 insertions(+), 37 deletions(-) diff --git a/config/config.go b/config/config.go index beaa053a149..499076c62d4 100644 --- a/config/config.go +++ b/config/config.go @@ -1031,16 +1031,19 @@ type ConsensusConfig struct { // BlockTimeTolerance is the maximum allowed difference between the proposed block time and wall-clock time. BlockTimeTolerance time.Duration `mapstructure:"block_time_tolerance"` - // CatchupLagThreshold is how many blocks a peer may be ahead of us before we - // report catching_up=true. The reactor's WaitSync latch only reflects the - // initial block-sync at startup and stays false for the rest of the process, so - // without this check a node that later stops keeping up with its peers still - // reports catching_up=false. The comparison is against peer-reported heights - // rather than block-time staleness, so a network where every node has - // legitimately stopped at the same height is not misreported as catching up. + // CatchupLagThreshold is how many blocks ahead of us a peer must be to count as + // reporting us behind; catching_up=true requires a majority of connected peers to + // be that far ahead. The reactor's WaitSync latch only reflects the initial + // block-sync at startup and stays false for the rest of the process, so without + // this check a node that later stops keeping up with its peers still reports + // catching_up=false. The comparison is against peer-reported heights rather than + // block-time staleness, so a network where every node has legitimately stopped at + // the same height is not misreported as catching up; peer heights are unverified + // gossip, so the lag must be corroborated by a majority of at least two connected + // peers, which keeps a single peer from driving the signal. // 0 disables peer-height lag detection only; must be >=2 when enabled to absorb // the normal one-height round skew between synced peers. The separate zero-peer - // rule (a node with no peers in a multi-validator network reports catching_up) + // rule (a node with no peers that is not the sole validator reports catching_up) // always applies, independent of this threshold. CatchupLagThreshold int64 `mapstructure:"catchup_lag_threshold"` // CatchupDebounceDuration is how long the peer-lag condition must hold diff --git a/config/toml.go b/config/toml.go index c0723b21c41..7641b3b9eb4 100644 --- a/config/toml.go +++ b/config/toml.go @@ -519,11 +519,11 @@ peer_query_maj23_sleep_duration = "{{ .Consensus.PeerQueryMaj23SleepDuration }}" # Maximum allowed difference between proposed block time and wall-clock time. block_time_tolerance = "{{ .Consensus.BlockTimeTolerance }}" -# After initial block-sync completes, report catching_up=true when a peer is more -# than this many blocks ahead of us (i.e. this node has stopped keeping up). Set to -# 0 to disable peer-height lag detection only; must be >=2 when enabled to absorb -# normal round skew. A node with no peers in a multi-validator network still reports -# catching_up regardless of this setting. +# After initial block-sync completes, report catching_up=true when a majority of the +# connected peers (at least two) are more than this many blocks ahead of us (i.e. this +# node has stopped keeping up). Set to 0 to disable peer-height lag detection only; +# must be >=2 when enabled to absorb normal round skew. A node with no peers that is +# not the sole validator still reports catching_up regardless of this setting. catchup_lag_threshold = {{ .Consensus.CatchupLagThreshold }} # How long the peer-lag condition must hold before catching_up flips to true diff --git a/consensus/reactor.go b/consensus/reactor.go index aad146b3d10..a8e03be82d2 100644 --- a/consensus/reactor.go +++ b/consensus/reactor.go @@ -454,15 +454,13 @@ func (conR *Reactor) IsBehind() bool { func (conR *Reactor) isBehindRaw() bool { peers := conR.Switch.Peers().List() - var maxPeerHeight int64 + peerHeights := make([]int64, 0, len(peers)) for _, peer := range peers { ps, ok := peer.Get(types.PeerStateKey).(*PeerState) if !ok { continue } - if h := ps.GetHeight(); h > maxPeerHeight { - maxPeerHeight = h - } + peerHeights = append(peerHeights, ps.GetHeight()) } // Sole-validator status is read live from consensus state on every call, so it @@ -470,22 +468,37 @@ func (conR *Reactor) isBehindRaw() bool { // without any cached value to update. (A node partitioned before a set change // can't observe the other side's update, but that only makes it report behind, // which is correct.) - return conR.evaluateBehind(conR.getRoundState().Height, maxPeerHeight, len(peers), conR.conS.isLocalSoleValidator()) + return conR.evaluateBehind(conR.getRoundState().Height, peerHeights, conR.conS.isLocalSoleValidator()) } +// minCorroboratingPeers is the fewest connected peers required before peer-height +// lag is trusted. Peer heights come from unverified gossip (NewRoundStepMessage), and +// with a single peer that peer is a trivial "majority"; requiring at least two means +// no single peer can drive catching_up on its own. Below this we can't corroborate, so +// the height-lag check abstains (the zero-peer rule still covers the no-peer case). +const minCorroboratingPeers = 2 + // evaluateBehind is the pure lag decision. With no peers we can't observe progress, // so we report behind unless this node can finalize on its own (it's the sole // validator); a non-validator or a validator in a larger set genuinely needs peers. -// A zero threshold disables the height-lag check (the zero-peer rule still applies). -// maxPeerHeight == 0 means no peer height learned yet. -func (conR *Reactor) evaluateBehind(myHeight, maxPeerHeight int64, nPeers int, isSoleValidator bool) bool { - if nPeers == 0 { +// Otherwise we report behind only when a majority of at least minCorroboratingPeers +// connected peers report a height more than catchUpLagThreshold ahead of ours, so an +// inflated height from a single peer (or a minority) can't drive the signal. A zero +// threshold disables the height-lag check; the zero-peer rule still applies. +func (conR *Reactor) evaluateBehind(myHeight int64, peerHeights []int64, isSoleValidator bool) bool { + if len(peerHeights) == 0 { return !isSoleValidator } - if conR.catchUpLagThreshold <= 0 || maxPeerHeight == 0 { + if conR.catchUpLagThreshold <= 0 || len(peerHeights) < minCorroboratingPeers { return false } - return maxPeerHeight-myHeight > conR.catchUpLagThreshold + ahead := 0 + for _, h := range peerHeights { + if h-myHeight > conR.catchUpLagThreshold { + ahead++ + } + } + return 2*ahead > len(peerHeights) } // applyDebounceLocked requires the lag condition to hold for catchUpDebounce before diff --git a/consensus/reactor_catchup_test.go b/consensus/reactor_catchup_test.go index ea01740e61c..395ca7808db 100644 --- a/consensus/reactor_catchup_test.go +++ b/consensus/reactor_catchup_test.go @@ -12,28 +12,33 @@ func TestEvaluateBehind(t *testing.T) { name string lagThreshold int64 myHeight int64 - maxPeerHeight int64 - nPeers int + peerHeights []int64 isSoleValidator bool want bool }{ - {"peer far ahead", 5, 100, 110, 3, false, true}, - {"peer ahead within threshold", 5, 100, 105, 3, false, false}, - {"peer one over threshold", 5, 100, 106, 3, false, true}, - {"equal height network halt", 5, 100, 100, 3, false, false}, - {"round skew peer one ahead", 5, 100, 101, 3, false, false}, - {"no peer height learned", 5, 100, 0, 3, false, false}, - {"threshold disabled ignores lag", 0, 100, 999, 3, false, false}, - {"sole validator zero peers healthy", 5, 100, 0, 0, true, false}, - {"non-sole zero peers behind", 5, 100, 0, 0, false, true}, - {"zero peers behind even when lag disabled", 0, 100, 0, 0, false, true}, + {"majority far ahead", 5, 100, []int64{110, 110, 110}, false, true}, + {"single peer ahead is minority", 5, 100, []int64{110, 100, 100}, false, false}, + {"majority of three ahead", 5, 100, []int64{110, 110, 100}, false, true}, + {"two peers split is not majority", 5, 100, []int64{110, 100}, false, false}, + {"both peers ahead", 5, 100, []int64{110, 110}, false, true}, + {"lone peer cannot corroborate", 5, 100, []int64{110}, false, false}, + {"lone peer far ahead still cannot corroborate", 5, 100, []int64{100000}, false, false}, + {"peers within threshold", 5, 100, []int64{105, 105, 105}, false, false}, + {"majority one over threshold", 5, 100, []int64{106, 106, 106}, false, true}, + {"equal height network halt", 5, 100, []int64{100, 100, 100}, false, false}, + {"round skew one ahead", 5, 100, []int64{101, 101, 101}, false, false}, + {"no peer height learned", 5, 100, []int64{0, 0, 0}, false, false}, + {"threshold disabled ignores lag", 0, 100, []int64{999, 999, 999}, false, false}, + {"sole validator zero peers healthy", 5, 100, nil, true, false}, + {"non-sole zero peers behind", 5, 100, nil, false, true}, + {"zero peers behind even when lag disabled", 0, 100, nil, false, true}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { conR := &Reactor{ catchUpLagThreshold: tt.lagThreshold, } - got := conR.evaluateBehind(tt.myHeight, tt.maxPeerHeight, tt.nPeers, tt.isSoleValidator) + got := conR.evaluateBehind(tt.myHeight, tt.peerHeights, tt.isSoleValidator) assert.Equal(t, tt.want, got) }) } From c5e8b23828f9f22e14082fd4d0a9ac825608ba09 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Fri, 5 Jun 2026 15:40:57 +0530 Subject: [PATCH 4/6] test(consensus,config): unit-cover catching_up logic to 100% mutation kill Extends coverage of the IsBehind / catching_up path so the decision logic is pinned by fast, deterministic unit tests rather than relying on the heavy consensus suite. - Extract the per-peer height-gathering loop out of isBehindRaw into a pure collectPeerHeights helper, mirroring the existing evaluateBehind / applyDebounceLocked pattern, so the aggregation loop and the PeerState type assertion are testable without p2p plumbing. - consensus: add TestCollectPeerHeights (normal heights, peer with no PeerState, peer with a wrong-type value at the key), TestIsLocalSoleValidator (sole validator, sole set but not us, larger set, nil key, nil set), and TestReactorCatchupConfig. - config: cover the new catchup_lag_threshold / catchup_debounce_duration validation rules in ValidateBasic and assert their defaults. - inspect/rpc: pin the offline checker (WaitSync / IsBehind report synced). diffguard --base develop mutation testing goes from 78.7% to 100% (47/47, all tiers 100%). Adding the direct ReactorCatchupConfig test also removed a flake: with no direct test, that option setter's mutant kill depended on an unrelated flaky test in the consensus suite and flipped between runs. Fix the stale "max peer heights" comment on isBehindRaw left over from the raw-max implementation; it gathers all peer heights now. Co-Authored-By: Claude Opus 4.8 (1M context) --- config/config_test.go | 12 ++++++ consensus/reactor.go | 31 ++++++++------ consensus/reactor_catchup_test.go | 70 +++++++++++++++++++++++++++++++ inspect/rpc/rpc_test.go | 15 +++++++ 4 files changed, 116 insertions(+), 12 deletions(-) create mode 100644 inspect/rpc/rpc_test.go diff --git a/config/config_test.go b/config/config_test.go index 442a7d1898a..57d5e146cde 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -179,6 +179,12 @@ func TestConsensusConfig_ValidateBasic(t *testing.T) { "BlockTimeTolerance": {func(c *config.ConsensusConfig) { c.BlockTimeTolerance = time.Second }, false}, "BlockTimeTolerance zero": {func(c *config.ConsensusConfig) { c.BlockTimeTolerance = 0 }, true}, "BlockTimeTolerance negative": {func(c *config.ConsensusConfig) { c.BlockTimeTolerance = -1 }, true}, + "CatchupLagThreshold disabled": {func(c *config.ConsensusConfig) { c.CatchupLagThreshold = 0 }, false}, + "CatchupLagThreshold one": {func(c *config.ConsensusConfig) { c.CatchupLagThreshold = 1 }, true}, + "CatchupLagThreshold two": {func(c *config.ConsensusConfig) { c.CatchupLagThreshold = 2 }, false}, + "CatchupLagThreshold negative": {func(c *config.ConsensusConfig) { c.CatchupLagThreshold = -1 }, true}, + "CatchupDebounceDuration zero": {func(c *config.ConsensusConfig) { c.CatchupDebounceDuration = 0 }, false}, + "CatchupDebounceDuration negative": {func(c *config.ConsensusConfig) { c.CatchupDebounceDuration = -1 }, true}, } for desc, tc := range testcases { t.Run(desc, func(t *testing.T) { @@ -195,6 +201,12 @@ func TestConsensusConfig_ValidateBasic(t *testing.T) { } } +func TestDefaultConsensusConfigCatchupDefaults(t *testing.T) { + cfg := config.DefaultConsensusConfig() + assert.Equal(t, int64(5), cfg.CatchupLagThreshold) + assert.Equal(t, 10*time.Second, cfg.CatchupDebounceDuration) +} + func TestInstrumentationConfigValidateBasic(t *testing.T) { cfg := config.TestInstrumentationConfig() assert.NoError(t, cfg.ValidateBasic()) diff --git a/consensus/reactor.go b/consensus/reactor.go index a8e03be82d2..aad05fc5f8f 100644 --- a/consensus/reactor.go +++ b/consensus/reactor.go @@ -449,19 +449,10 @@ func (conR *Reactor) IsBehind() bool { return conR.applyDebounceLocked(raw, time.Now()) } -// isBehindRaw gathers the local and max peer heights and applies the lag decision, -// without debouncing. It holds no lock while reading peers / round state. +// isBehindRaw gathers the local height and peer heights and applies the lag +// decision, without debouncing. It holds no lock while reading peers / round state. func (conR *Reactor) isBehindRaw() bool { - peers := conR.Switch.Peers().List() - - peerHeights := make([]int64, 0, len(peers)) - for _, peer := range peers { - ps, ok := peer.Get(types.PeerStateKey).(*PeerState) - if !ok { - continue - } - peerHeights = append(peerHeights, ps.GetHeight()) - } + peerHeights := collectPeerHeights(conR.Switch.Peers().List()) // Sole-validator status is read live from consensus state on every call, so it // reflects validator-set changes this node has committed into local round state @@ -471,6 +462,22 @@ func (conR *Reactor) isBehindRaw() bool { return conR.evaluateBehind(conR.getRoundState().Height, peerHeights, conR.conS.isLocalSoleValidator()) } +// collectPeerHeights returns the gossiped consensus height of every peer that +// carries a consensus PeerState. Peers without one (key absent or wrong type) +// are skipped rather than counted as height 0, so they don't dilute the +// majority calculation in evaluateBehind. +func collectPeerHeights(peers []p2p.Peer) []int64 { + heights := make([]int64, 0, len(peers)) + for _, peer := range peers { + ps, ok := peer.Get(types.PeerStateKey).(*PeerState) + if !ok { + continue + } + heights = append(heights, ps.GetHeight()) + } + return heights +} + // minCorroboratingPeers is the fewest connected peers required before peer-height // lag is trusted. Peer heights come from unverified gossip (NewRoundStepMessage), and // with a single peer that peer is a trivial "majority"; requiring at least two means diff --git a/consensus/reactor_catchup_test.go b/consensus/reactor_catchup_test.go index 395ca7808db..16c81b61ea1 100644 --- a/consensus/reactor_catchup_test.go +++ b/consensus/reactor_catchup_test.go @@ -5,6 +5,11 @@ import ( "time" "github.com/stretchr/testify/assert" + + "github.com/cometbft/cometbft/crypto" + "github.com/cometbft/cometbft/p2p" + p2pmock "github.com/cometbft/cometbft/p2p/mock" + "github.com/cometbft/cometbft/types" ) func TestEvaluateBehind(t *testing.T) { @@ -44,6 +49,71 @@ func TestEvaluateBehind(t *testing.T) { } } +// peerWithHeight returns a mock peer carrying a consensus PeerState at height h. +func peerWithHeight(h int64) *p2pmock.Peer { + p := p2pmock.NewPeer(nil) + ps := NewPeerState(p) + ps.PRS.Height = h + p.Set(types.PeerStateKey, ps) + return p +} + +func TestCollectPeerHeights(t *testing.T) { + t.Run("no peers yields empty slice", func(t *testing.T) { + assert.Empty(t, collectPeerHeights(nil)) + }) + + t.Run("collects every peer height in order", func(t *testing.T) { + peers := []p2p.Peer{peerWithHeight(100), peerWithHeight(110), peerWithHeight(90)} + assert.Equal(t, []int64{100, 110, 90}, collectPeerHeights(peers)) + }) + + t.Run("skips peer without a PeerState", func(t *testing.T) { + peers := []p2p.Peer{peerWithHeight(100), p2pmock.NewPeer(nil), peerWithHeight(110)} + assert.Equal(t, []int64{100, 110}, collectPeerHeights(peers)) + }) + + t.Run("skips peer with a non-PeerState value at the key", func(t *testing.T) { + bad := p2pmock.NewPeer(nil) + bad.Set(types.PeerStateKey, "not a peer state") + peers := []p2p.Peer{peerWithHeight(100), bad} + assert.Equal(t, []int64{100}, collectPeerHeights(peers)) + }) +} + +func TestIsLocalSoleValidator(t *testing.T) { + val, _ := types.RandValidator(false, 10) + other, _ := types.RandValidator(false, 10) + + tests := []struct { + name string + pubKey crypto.PubKey + valSet *types.ValidatorSet + want bool + }{ + {"sole validator and we are it", val.PubKey, types.NewValidatorSet([]*types.Validator{val}), true}, + {"sole validator but it is not us", other.PubKey, types.NewValidatorSet([]*types.Validator{val}), false}, + {"we are in a larger set", val.PubKey, types.NewValidatorSet([]*types.Validator{val, other}), false}, + {"no local validator key", nil, types.NewValidatorSet([]*types.Validator{val}), false}, + {"no validator set", val.PubKey, nil, false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cs := &State{} + cs.privValidatorPubKey = tt.pubKey + cs.Validators = tt.valSet + assert.Equal(t, tt.want, cs.isLocalSoleValidator()) + }) + } +} + +func TestReactorCatchupConfig(t *testing.T) { + conR := &Reactor{} + ReactorCatchupConfig(7, 3*time.Second)(conR) + assert.Equal(t, int64(7), conR.catchUpLagThreshold) + assert.Equal(t, 3*time.Second, conR.catchUpDebounce) +} + func TestApplyDebounce(t *testing.T) { debounce := 10 * time.Second t0 := time.Now() diff --git a/inspect/rpc/rpc_test.go b/inspect/rpc/rpc_test.go new file mode 100644 index 00000000000..5929392313f --- /dev/null +++ b/inspect/rpc/rpc_test.go @@ -0,0 +1,15 @@ +package rpc + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +// The offline inspect server has no live consensus reactor, so it always reports +// fully synced: never waiting on block-sync and never behind its peers. +func TestWaitSyncCheckerReportsSynced(t *testing.T) { + checker := waitSyncCheckerImpl{} + assert.False(t, checker.WaitSync()) + assert.False(t, checker.IsBehind()) +} From 67da2af7ba7bcf6c978f195c2ac3a6dbe3b66223 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Fri, 5 Jun 2026 15:46:20 +0530 Subject: [PATCH 5/6] test(consensus): integration-test IsBehind across peer topologies Covers the live wiring that the pure-helper tests don't: IsBehind driving a real (non-started) Switch's peer set through collectPeerHeights into evaluateBehind, plus the sole-validator and debounce paths, end to end. Builds a Reactor with mock peers at chosen heights, a round state at a chosen height, and catchUpDebounce=0 so the result is the raw decision with no timing. Cases mirror the operational topologies: - majority of peers ahead -> behind (the partition / fell-behind case), - synced multi-peer network -> not behind (no false positive on a healthy multi-node network), - lone peer ahead -> not behind (cannot corroborate), - no peers and not the sole validator -> behind (isolated node), - no peers but the sole validator -> not behind (finalizes alone). Co-Authored-By: Claude Opus 4.8 (1M context) --- consensus/reactor_catchup_test.go | 53 +++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/consensus/reactor_catchup_test.go b/consensus/reactor_catchup_test.go index 16c81b61ea1..4b7764c9d5a 100644 --- a/consensus/reactor_catchup_test.go +++ b/consensus/reactor_catchup_test.go @@ -5,7 +5,10 @@ import ( "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + cfg "github.com/cometbft/cometbft/config" + cstypes "github.com/cometbft/cometbft/consensus/types" "github.com/cometbft/cometbft/crypto" "github.com/cometbft/cometbft/p2p" p2pmock "github.com/cometbft/cometbft/p2p/mock" @@ -114,6 +117,56 @@ func TestReactorCatchupConfig(t *testing.T) { assert.Equal(t, 3*time.Second, conR.catchUpDebounce) } +// newCatchupReactor wires a Reactor with a non-started Switch carrying mock peers +// at the given heights, a round state at myHeight, and (optionally) a single-validator +// set that this node belongs to. catchUpDebounce is 0 so IsBehind reflects the raw +// decision immediately, keeping the test free of timing. +func newCatchupReactor(t *testing.T, myHeight int64, sole bool, peerHeights ...int64) *Reactor { + t.Helper() + cs := &State{} + if sole { + val, _ := types.RandValidator(false, 10) + cs.Validators = types.NewValidatorSet([]*types.Validator{val}) + cs.privValidatorPubKey = val.PubKey + } + conR := &Reactor{ + conS: cs, + rs: &cstypes.RoundState{Height: myHeight}, + catchUpLagThreshold: 5, + catchUpDebounce: 0, + } + conR.BaseReactor = *p2p.NewBaseReactor("Consensus", conR) + + sw := p2p.NewSwitch(cfg.DefaultP2PConfig(), nil) + peerSet := sw.Peers().(*p2p.PeerSet) + for _, h := range peerHeights { + require.NoError(t, peerSet.Add(peerWithHeight(h))) + } + conR.SetSwitch(sw) + return conR +} + +// TestIsBehindIntegration exercises the full IsBehind path (Switch peers -> +// collectPeerHeights -> evaluateBehind, plus the sole-validator and debounce +// wiring) rather than the pure decision helpers in isolation. +func TestIsBehindIntegration(t *testing.T) { + t.Run("majority of peers ahead reports behind", func(t *testing.T) { + assert.True(t, newCatchupReactor(t, 100, false, 110, 110, 110).IsBehind()) + }) + t.Run("synced multi-peer network is not behind", func(t *testing.T) { + assert.False(t, newCatchupReactor(t, 100, false, 100, 100, 100).IsBehind()) + }) + t.Run("lone peer ahead cannot corroborate", func(t *testing.T) { + assert.False(t, newCatchupReactor(t, 100, false, 110).IsBehind()) + }) + t.Run("no peers and not sole validator is behind", func(t *testing.T) { + assert.True(t, newCatchupReactor(t, 100, false).IsBehind()) + }) + t.Run("no peers but sole validator is not behind", func(t *testing.T) { + assert.False(t, newCatchupReactor(t, 100, true).IsBehind()) + }) +} + func TestApplyDebounce(t *testing.T) { debounce := 10 * time.Second t0 := time.Now() From b7ed08e21d122338b1a7b84b6fedf5cd2e183aa1 Mon Sep 17 00:00:00 2001 From: Pratik Patil Date: Fri, 5 Jun 2026 17:04:17 +0530 Subject: [PATCH 6/6] chore(config): clarify catchup_lag_threshold=1 rejection message Reword the ValidateBasic error for catchup_lag_threshold == 1 from "to absorb round skew" to "(margin beyond the one-height round skew)". Rejecting 1 isn't required to absorb a one-height skew; the point is to keep the smallest enabled threshold above the routine one-height proposer/commit difference between synced peers. Message text only; no logic change. Co-Authored-By: Claude Opus 4.8 (1M context) --- config/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/config.go b/config/config.go index 499076c62d4..27184a3c439 100644 --- a/config/config.go +++ b/config/config.go @@ -1181,7 +1181,7 @@ func (cfg *ConsensusConfig) ValidateBasic() error { return errors.New("catchup_lag_threshold can't be negative") } if cfg.CatchupLagThreshold == 1 { - return errors.New("catchup_lag_threshold must be 0 (disabled) or >=2 to absorb round skew") + return errors.New("catchup_lag_threshold must be 0 (disabled) or >=2 (margin beyond the one-height round skew)") } if cfg.CatchupDebounceDuration < 0 { return errors.New("catchup_debounce_duration can't be negative")