Commit ed24187

NRG: Don't reset WAL when failing to load last snapshot (#7580)
In most cases we can either install a new snapshot before shutting down, or if not, we can better detect the situation on the next startup.

ref: #7556

Signed-off-by: Neil Twigg <[email protected]>
2 parents 1ad4569 + dca7a87 commit ed24187

File tree

2 files changed: +47 -2 lines changed


server/raft.go

Lines changed: 0 additions & 2 deletions
@@ -2916,8 +2916,6 @@ func (n *raft) sendSnapshotToFollower(subject string) (uint64, error) {
 	if err != nil {
 		// We need to stepdown here when this happens.
 		n.stepdownLocked(noLeader)
-		// We need to reset our state here as well.
-		n.resetWAL()
 		return 0, err
 	}
 	// Go ahead and send the snapshot and peerstate here as first append entry to the catchup follower.
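For context, here is a minimal, self-contained Go sketch of the behaviour this change leaves in place: when the leader fails to load its latest snapshot while trying to catch up a follower, it steps down but keeps its write-ahead log rather than resetting it. This is an illustrative toy model only; the toyRaft type and its methods are hypothetical stand-ins, not the NATS server's actual raft implementation.

// Minimal standalone sketch (not the actual NATS implementation): on a
// snapshot load failure the leader steps down, but it no longer throws
// away its write-ahead log. All types and helpers here are hypothetical
// stand-ins for illustration only.
package main

import (
	"errors"
	"fmt"
)

type toyRaft struct {
	leader  bool
	wal     []string // stand-in for the write-ahead log entries
	snapErr error    // simulate a corrupted/unreadable snapshot
}

func (n *toyRaft) loadLastSnapshot() ([]byte, error) {
	if n.snapErr != nil {
		return nil, n.snapErr
	}
	return []byte("snapshot"), nil
}

// sendSnapshotToFollower mirrors the shape of the error path in the diff:
// step down on failure, keep the WAL intact, and return the error.
func (n *toyRaft) sendSnapshotToFollower() error {
	if _, err := n.loadLastSnapshot(); err != nil {
		n.leader = false // step down; a healthy peer can take over
		// Previously the WAL would have been reset here as well.
		return err
	}
	return nil
}

func main() {
	n := &toyRaft{leader: true, wal: []string{"e1", "e2"}, snapErr: errors.New("corrupt snapshot")}
	err := n.sendSnapshotToFollower()
	fmt.Println("err:", err, "leader:", n.leader, "wal entries kept:", len(n.wal))
}

Keeping the log means the leader's existing entries remain available for a later snapshot, or for detecting the bad snapshot on the next startup, which is the rationale given in the commit message.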

server/raft_test.go

Lines changed: 47 additions & 0 deletions
@@ -3864,6 +3864,53 @@ func TestNRGQuorumAfterLeaderStepdown(t *testing.T) {
 	}
 }
 
+func TestNRGNoLogResetOnCorruptedSendToFollower(t *testing.T) {
+	c := createJetStreamClusterExplicit(t, "R3S", 2)
+	defer c.shutdown()
+
+	rg := c.createRaftGroup("TEST", 2, newStateAdder)
+	rg.waitOnLeader()
+
+	leader := rg.leader().(*stateAdder)
+	follower := rg.nonLeader().(*stateAdder)
+
+	leaderrg := leader.node().(*raft)
+	followerrg := follower.node().(*raft)
+
+	for i := range int64(10) {
+		leader.proposeDelta(i + 1)
+		time.Sleep(50 * time.Millisecond) // ... for multiple AEs.
+	}
+
+	// Snapshot and compact.
+	rg.waitOnTotal(t, 55)
+	leader.snapshot(t)
+
+	// Above snapshot should have resulted in compaction of the WAL.
+	var ss StreamState
+	leaderrg.wal.FastState(&ss)
+	require_Equal(t, ss.Msgs, 0)
+
+	// Stop the follower, we'll flatten their state so they have to
+	// request a snapshot.
+	require_NoError(t, followerrg.wal.Truncate(0))
+	follower.stop()
+
+	// Now we're going to subtly corrupt the snapshot on the leader.
+	stat, err := os.Stat(leaderrg.snapfile)
+	require_NoError(t, err)
+	require_NoError(t, os.Truncate(leaderrg.snapfile, stat.Size()-1))
+
+	// Now we'll bring the follower back. It should request a snapshot
+	// from the leader. Previously this would have caused the leader to
+	// blow away the entire log, but in reality we will probably just
+	// install a new snapshot at some point soon anyway.
+	follower.restart()
+	c.waitOnAllCurrent()
+	leaderrg.wal.FastState(&ss)
+	require_NotEqual(t, ss.LastSeq, 0)
+}
+
 // This is a RaftChainOfBlocks test where a block is proposed and then we wait for all replicas to apply it before
 // proposing the next one.
 // The test may fail if:
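As an aside, the test corrupts the leader's snapshot by shaving one byte off the end of the file. The following standalone sketch shows that technique in isolation using only the Go standard library; the temp-file setup is purely illustrative and not part of the test.

// Standalone illustration of the corruption technique used in the test above:
// trimming one byte off the end of a file with os.Truncate so that a later
// load of that file fails its integrity/size checks. The file created here is
// hypothetical; in the test it is the leader's raft snapshot file.
package main

import (
	"fmt"
	"log"
	"os"
)

func corruptByTruncation(path string) error {
	st, err := os.Stat(path)
	if err != nil {
		return err
	}
	// Drop the final byte, leaving an almost-valid file on disk.
	return os.Truncate(path, st.Size()-1)
}

func main() {
	f, err := os.CreateTemp("", "snap.*.dat")
	if err != nil {
		log.Fatal(err)
	}
	defer os.Remove(f.Name())
	if _, err := f.WriteString("snapshot-bytes"); err != nil {
		log.Fatal(err)
	}
	f.Close()

	if err := corruptByTruncation(f.Name()); err != nil {
		log.Fatal(err)
	}
	st, _ := os.Stat(f.Name())
	fmt.Println("size after truncation:", st.Size()) // one byte shorter
}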

0 commit comments
