-
Notifications
You must be signed in to change notification settings - Fork 4
[KLC-1920] fix: force not-synced when gossip is ahead of probable highest nonce #69
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from 3 commits
240a6e5
875fbfc
b860d55
e97e103
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -301,10 +301,23 @@ func (boot *baseBootstrap) computeNodeState() { | |
| } else { | ||
| lastNonce = currentHeader.GetNonce() | ||
| lastSlot = currentHeader.GetSlot() | ||
| boot.hasLastBlock = boot.forkDetector.ProbableHighestNonce() <= boot.chainHandler.GetCurrentBlockHeader().GetNonce() | ||
| currentBlockNonce := boot.chainHandler.GetCurrentBlockHeader().GetNonce() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is the refetch of |
||
| probableHighestNonce := boot.forkDetector.ProbableHighestNonce() | ||
| highestNonceReceived := boot.forkDetector.HighestNonceReceived() | ||
| boot.hasLastBlock = probableHighestNonce <= currentBlockNonce | ||
| // KLC-1920: gossip-derived ceiling is the source of truth that | ||
| // probableHighestNonce can lag behind when the BHReceived path is | ||
| // disrupted (peer churn after an election, fallback observer not | ||
| // receiving fetched headers). If gossip reports the network ahead | ||
| // by more than the normal proposal/commit window, the node is not | ||
| // really synced even if probableHighestNonce equals currentBlockNonce. | ||
| if highestNonceReceived > currentBlockNonce+process.BlockFinality { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| boot.hasLastBlock = false | ||
| } | ||
| log.Debug("computeNodeState", | ||
| "probableHighestNonce", boot.forkDetector.ProbableHighestNonce(), | ||
| "currentBlockNonce", boot.chainHandler.GetCurrentBlockHeader().GetNonce(), | ||
| "probableHighestNonce", probableHighestNonce, | ||
| "highestNonceReceived", highestNonceReceived, | ||
| "currentBlockNonce", currentBlockNonce, | ||
| "boot.hasLastBlock", boot.hasLastBlock) | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,141 @@ | ||
| package sync_test | ||
|
|
||
| import ( | ||
| "sync" | ||
| "testing" | ||
| "time" | ||
|
|
||
| commonMock "github.com/klever-io/klever-go/common/mock" | ||
| "github.com/klever-io/klever-go/core" | ||
| consensusMock "github.com/klever-io/klever-go/core/consensus/mock" | ||
| "github.com/klever-io/klever-go/core/process" | ||
| syncpkg "github.com/klever-io/klever-go/core/process/sync" | ||
| "github.com/klever-io/klever-go/data" | ||
| "github.com/klever-io/klever-go/data/block" | ||
| "github.com/stretchr/testify/assert" | ||
| ) | ||
|
|
||
| // klc1920_node_state_test.go covers the new branch in | ||
| // baseBootstrap.computeNodeState: when HighestNonceReceived is more than | ||
| // BlockFinality blocks ahead of currentBlockNonce, hasLastBlock is forced | ||
| // to false so isNodeSynchronized correctly reports the node is behind. | ||
| // | ||
| // Without this branch, a fallback whose BHReceived path is broken (peer | ||
| // churn after election) would have probableHighestNonce == currentBlockNonce | ||
| // and falsely declare itself synced — the production failure mode KLC-1920 | ||
| // and KLC-2389 describe. | ||
|
|
||
| type observableStatusHandler struct { | ||
| mu sync.Mutex | ||
| isSyncing uint64 | ||
| } | ||
|
|
||
| func (o *observableStatusHandler) Increment(_ string) {} | ||
| func (o *observableStatusHandler) AddUint64(_ string, _ uint64) {} | ||
| func (o *observableStatusHandler) Decrement(_ string) {} | ||
| func (o *observableStatusHandler) SetInt64Value(_ string, _ int64) {} | ||
| func (o *observableStatusHandler) SetUInt64Value(key string, value uint64) { | ||
| if key != core.MetricIsSyncing { | ||
| return | ||
| } | ||
| o.mu.Lock() | ||
| o.isSyncing = value | ||
| o.mu.Unlock() | ||
| } | ||
| func (o *observableStatusHandler) SetStringValue(_ string, _ string) {} | ||
| func (o *observableStatusHandler) Close() {} | ||
| func (o *observableStatusHandler) IsInterfaceNil() bool { return o == nil } | ||
|
|
||
| func (o *observableStatusHandler) IsSyncing() uint64 { | ||
| o.mu.Lock() | ||
| defer o.mu.Unlock() | ||
| return o.isSyncing | ||
| } | ||
|
|
||
| func buildKLC1920Bootstrap(probable, highest, currentBlockNonce uint64) (*syncpkg.BaseBootstrap, *observableStatusHandler) { | ||
| forkDetector := &commonMock.ForkDetectorMock{ | ||
| CheckForkCalled: func() *process.ForkInfo { return &process.ForkInfo{} }, | ||
| ProbableHighestNonceCalled: func() uint64 { return probable }, | ||
| HighestNonceReceivedCalled: func() uint64 { return highest }, | ||
| GetHighestFinalBlockNonceCalled: func() uint64 { return 0 }, | ||
| } | ||
|
|
||
| genesisHeader := &block.Block{Header: &block.BlockHeader{Nonce: 0, Slot: 0}} | ||
| currentHeader := &block.Block{Header: &block.BlockHeader{Nonce: currentBlockNonce, Slot: currentBlockNonce}} | ||
|
|
||
| chainHandler := &commonMock.BlockChainMock{ | ||
| GetGenesisHeaderCalled: func() data.HeaderHandler { return genesisHeader }, | ||
| GetCurrentBlockHeaderCalled: func() data.HeaderHandler { return currentHeader }, | ||
| } | ||
|
|
||
| slotManager := &consensusMock.SlotManagerMock{ | ||
| SlotIndex: int64(currentBlockNonce + 5), | ||
| TimeDurationCalled: func() time.Duration { return 0 }, | ||
| BeforeGenesisCalled: func() bool { return true }, // suppress requestHeadersIfSyncIsStuck path | ||
| } | ||
|
|
||
| networkWatcher := &commonMock.MessengerStub{ | ||
| IsConnectedToTheNetworkCalled: func() bool { return true }, | ||
| } | ||
| statusHandler := &observableStatusHandler{} | ||
|
|
||
| boot := syncpkg.NewBaseBootstrapForKLC1920Test( | ||
| forkDetector, | ||
| chainHandler, | ||
| slotManager, | ||
| networkWatcher, | ||
| statusHandler, | ||
| ) | ||
|
|
||
| return boot, statusHandler | ||
| } | ||
|
|
||
| // TestKLC1920_ComputeNodeState_GossipAheadForcesNotSynced is the regression | ||
| // guard for the synced-state gate. Pre-fix: with probable == current the | ||
| // node declared itself synced even when HighestNonceReceived was 20 blocks | ||
| // ahead. Post-fix: any gossip-vs-current gap > BlockFinality forces | ||
| // hasLastBlock=false and isNodeSynchronized=false. | ||
| func TestKLC1920_ComputeNodeState_GossipAheadForcesNotSynced(t *testing.T) { | ||
| t.Parallel() | ||
|
|
||
| // Production failure shape: probable matches current (fork detector | ||
| // thinks it's caught up) but gossip has reported headers 20 ahead. | ||
| boot, statusHandler := buildKLC1920Bootstrap( | ||
| uint64(50), // probable | ||
| uint64(70), // highest received from gossip | ||
| uint64(50), // current block nonce | ||
| ) | ||
|
|
||
| boot.ComputeNodeState() | ||
|
|
||
| assert.False(t, boot.HasLastBlock(), | ||
| "KLC-1920 fix: gossip-ahead gap must force hasLastBlock=false") | ||
| assert.False(t, boot.IsNodeSynchronized(), | ||
| "KLC-1920 fix: node must not declare synced when gossip is ahead") | ||
| assert.Equal(t, uint64(1), statusHandler.IsSyncing(), | ||
| "KLC-1920 fix: klv_is_syncing must report 1 — the production-bug metric was 0 (false-synced)") | ||
| } | ||
|
|
||
| // TestKLC1920_ComputeNodeState_GossipWithinFinalityStaysSynced confirms the | ||
| // gate does NOT spuriously fire during normal proposal rounds where gossip | ||
| // is briefly one block ahead of the just-committed block. | ||
| func TestKLC1920_ComputeNodeState_GossipWithinFinalityStaysSynced(t *testing.T) { | ||
| t.Parallel() | ||
|
|
||
| // Normal cycle: a BHProposed for nonce N+1 has arrived but the block | ||
| // hasn't committed yet. gap = 1 = BlockFinality — must NOT fire. | ||
| boot, statusHandler := buildKLC1920Bootstrap( | ||
| uint64(50), // probable | ||
| uint64(51), // highest received (one proposal ahead — normal) | ||
| uint64(50), // current block nonce | ||
| ) | ||
|
|
||
| boot.ComputeNodeState() | ||
|
|
||
| assert.True(t, boot.HasLastBlock(), | ||
| "normal proposal cycle: gap == BlockFinality must NOT force not-synced") | ||
| assert.True(t, boot.IsNodeSynchronized(), | ||
| "normal proposal cycle: node remains synced; consensus must not be gated") | ||
| assert.Equal(t, uint64(0), statusHandler.IsSyncing(), | ||
| "normal proposal cycle: klv_is_syncing stays 0") | ||
| } | ||
|
coderabbitai[bot] marked this conversation as resolved.
Outdated
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,96 @@ | ||
| package sync_test | ||
|
|
||
| import ( | ||
| "fmt" | ||
| "testing" | ||
| "time" | ||
|
|
||
| "github.com/klever-io/klever-go/common/mock" | ||
| consensusMock "github.com/klever-io/klever-go/core/consensus/mock" | ||
| "github.com/klever-io/klever-go/core/process" | ||
| "github.com/klever-io/klever-go/core/process/sync" | ||
| "github.com/klever-io/klever-go/data/block" | ||
| "github.com/stretchr/testify/assert" | ||
| ) | ||
|
|
||
| // klc1920_repro_test.go pins down the invariant the KLC-1920 fix relies on: | ||
| // under the production failure mode (only BHProposed deliveries arrive, the | ||
| // BHReceived path is broken by peer churn after an election), the fork | ||
| // detector's HighestNonceReceived must advance with gossip while | ||
| // ProbableHighestNonce stays at the last processed nonce. The gap between | ||
| // them is what baseBootstrap.computeNodeState uses to force hasLastBlock=false | ||
| // and prevent the false isNodeSynchronized=true reported in the Slack-thread | ||
| // log at sprint-97/KLC-1920/slack-thread/log.txt. | ||
|
|
||
| func newSlotManagerForRepro(slot int64) *consensusMock.SlotManagerMock { | ||
| return &consensusMock.SlotManagerMock{ | ||
| SlotIndex: slot, | ||
| TimeDurationCalled: func() time.Duration { return 0 }, | ||
| } | ||
| } | ||
|
|
||
| // TestKLC1920_HighestNonceReceivedAdvancesUnderBHProposedOnly is the | ||
| // regression guard for the gossip-ceiling invariant. Production logs showed | ||
| // `setHighestNonceReceived` firing constantly while `forkDetector.AddHeader | ||
| // state=0` (BHReceived) never appeared. This test reproduces exactly that | ||
| // shape and asserts both sides of the gap are observable. | ||
| func TestKLC1920_HighestNonceReceivedAdvancesUnderBHProposedOnly(t *testing.T) { | ||
| t.Parallel() | ||
|
|
||
| bfd, err := sync.NewMetaForkDetector(newSlotManagerForRepro(100), &mock.BlackListHandlerStub{}, 0) | ||
| assert.Nil(t, err) | ||
| assert.NotNil(t, bfd) | ||
|
|
||
| processedHdr := &block.BlockHeader{Nonce: 10, Slot: 10} | ||
| err = bfd.AddHeader(&block.Block{Header: processedHdr}, []byte("processed-10"), process.BHProcessed, nil, nil) | ||
| assert.Nil(t, err) | ||
| assert.Equal(t, uint64(10), bfd.ProbableHighestNonce(), | ||
| "baseline: probable highest after BHProcessed at 10") | ||
| assert.Equal(t, uint64(10), bfd.HighestNonceReceived(), | ||
| "baseline: highest received tracks the same processed nonce") | ||
|
|
||
| for nonce := uint64(11); nonce <= uint64(15); nonce++ { | ||
| hdr := &block.BlockHeader{Nonce: nonce, Slot: nonce} | ||
| hash := []byte(fmt.Sprintf("proposed-%d", nonce)) | ||
| err = bfd.AddHeader(&block.Block{Header: hdr}, hash, process.BHProposed, nil, nil) | ||
| assert.Nil(t, err) | ||
| } | ||
|
|
||
| assert.Equal(t, uint64(15), bfd.HighestNonceReceived(), | ||
| "gossip ceiling must reflect every BHProposed delivery") | ||
| assert.Equal(t, uint64(10), bfd.ProbableHighestNonce(), | ||
| "probableHighestNonce intentionally stays at last processed — BHProposed must not advance it (would break consensus during proposal rounds)") | ||
|
|
||
| gap := bfd.HighestNonceReceived() - bfd.ProbableHighestNonce() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Test pins |
||
| assert.Equal(t, uint64(5), gap, | ||
| "the gap between gossip ceiling and probable is the signal computeNodeState uses to force hasLastBlock=false when it exceeds BlockFinality") | ||
| } | ||
|
|
||
| // TestKLC1920_GapExceedsBlockFinality demonstrates that the gap threshold | ||
| // (HighestNonceReceived - currentBlockNonce > BlockFinality) is the | ||
| // condition the fix watches for. BlockFinality is 1, so any gap >= 2 | ||
| // indicates the node is not really synced. | ||
| func TestKLC1920_GapExceedsBlockFinality(t *testing.T) { | ||
| t.Parallel() | ||
|
|
||
| bfd, err := sync.NewMetaForkDetector(newSlotManagerForRepro(100), &mock.BlackListHandlerStub{}, 0) | ||
| assert.Nil(t, err) | ||
|
|
||
| processedHdr := &block.BlockHeader{Nonce: 50, Slot: 50} | ||
| err = bfd.AddHeader(&block.Block{Header: processedHdr}, []byte("p-50"), process.BHProcessed, nil, nil) | ||
| assert.Nil(t, err) | ||
|
|
||
| for nonce := uint64(51); nonce <= uint64(70); nonce++ { | ||
| hdr := &block.BlockHeader{Nonce: nonce, Slot: nonce} | ||
| hash := []byte(fmt.Sprintf("g-%d", nonce)) | ||
| err = bfd.AddHeader(&block.Block{Header: hdr}, hash, process.BHProposed, nil, nil) | ||
| assert.Nil(t, err) | ||
| } | ||
|
|
||
| currentBlockNonce := uint64(50) | ||
| gossipGap := bfd.HighestNonceReceived() - currentBlockNonce | ||
| assert.Equal(t, uint64(20), gossipGap, | ||
| "matches Slack-log production amplitude (~70-block gap) at scale") | ||
| assert.True(t, gossipGap > uint64(process.BlockFinality), | ||
| "gap exceeds BlockFinality — computeNodeState must declare not-synced") | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.