Commit ed24187

NRG: Don't reset WAL when failing to load last snapshot (#7580)
In most cases we can either install a new snapshot before shutting down, or if not, we can better detect the situation on the next startup.

ref: #7556

Signed-off-by: Neil Twigg <[email protected]>
2 parents 1ad4569 + dca7a87 commit ed24187

File tree

2 files changed: +47 -2 lines changed


server/raft.go

Lines changed: 0 additions & 2 deletions
@@ -2916,8 +2916,6 @@ func (n *raft) sendSnapshotToFollower(subject string) (uint64, error) {
 	if err != nil {
 		// We need to stepdown here when this happens.
 		n.stepdownLocked(noLeader)
-		// We need to reset our state here as well.
-		n.resetWAL()
 		return 0, err
 	}
 	// Go ahead and send the snapshot and peerstate here as first append entry to the catchup follower.
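For context, here is a minimal, self-contained Go sketch of the behaviour this change leaves in place: when the leader fails to load its latest snapshot while trying to catch up a follower, it steps down but keeps its write-ahead log rather than resetting it. This is an illustrative toy model only; the toyRaft type and its methods are hypothetical stand-ins, not the NATS server's actual raft implementation.

// Minimal standalone sketch (not the actual NATS implementation): on a
// snapshot load failure the leader steps down, but it no longer throws
// away its write-ahead log. All types and helpers here are hypothetical
// stand-ins for illustration only.
package main

import (
	"errors"
	"fmt"
)

type toyRaft struct {
	leader  bool
	wal     []string // stand-in for the write-ahead log entries
	snapErr error    // simulate a corrupted/unreadable snapshot
}

func (n *toyRaft) loadLastSnapshot() ([]byte, error) {
	if n.snapErr != nil {
		return nil, n.snapErr
	}
	return []byte("snapshot"), nil
}

// sendSnapshotToFollower mirrors the shape of the error path in the diff:
// step down on failure, keep the WAL intact, and return the error.
func (n *toyRaft) sendSnapshotToFollower() error {
	if _, err := n.loadLastSnapshot(); err != nil {
		n.leader = false // step down; a healthy peer can take over
		// Previously the WAL would have been reset here as well.
		return err
	}
	return nil
}

func main() {
	n := &toyRaft{leader: true, wal: []string{"e1", "e2"}, snapErr: errors.New("corrupt snapshot")}
	err := n.sendSnapshotToFollower()
	fmt.Println("err:", err, "leader:", n.leader, "wal entries kept:", len(n.wal))
}

Keeping the log means the leader's existing entries remain available for a later snapshot, or for detecting the bad snapshot on the next startup, which is the rationale given in the commit message.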

server/raft_test.go

Lines changed: 47 additions & 0 deletions
@@ -3864,6 +3864,53 @@ func TestNRGQuorumAfterLeaderStepdown(t *testing.T) {
 	}
 }
 
+func TestNRGNoLogResetOnCorruptedSendToFollower(t *testing.T) {
+	c := createJetStreamClusterExplicit(t, "R3S", 2)
+	defer c.shutdown()
+
+	rg := c.createRaftGroup("TEST", 2, newStateAdder)
+	rg.waitOnLeader()
+
+	leader := rg.leader().(*stateAdder)
+	follower := rg.nonLeader().(*stateAdder)
+
+	leaderrg := leader.node().(*raft)
+	followerrg := follower.node().(*raft)
+
+	for i := range int64(10) {
+		leader.proposeDelta(i + 1)
+		time.Sleep(50 * time.Millisecond) // ... for multiple AEs.
+	}
+
+	// Snapshot and compact.
+	rg.waitOnTotal(t, 55)
+	leader.snapshot(t)
+
+	// Above snapshot should have resulted in compaction of the WAL.
+	var ss StreamState
+	leaderrg.wal.FastState(&ss)
+	require_Equal(t, ss.Msgs, 0)
+
+	// Stop the follower, we'll flatten their state so they have to
+	// request a snapshot.
+	require_NoError(t, followerrg.wal.Truncate(0))
+	follower.stop()
+
+	// Now we're going to subtly corrupt the snapshot on the leader.
+	stat, err := os.Stat(leaderrg.snapfile)
+	require_NoError(t, err)
+	require_NoError(t, os.Truncate(leaderrg.snapfile, stat.Size()-1))
+
+	// Now we'll bring the follower back. It should request a snapshot
+	// from the leader. Previously this would have caused the leader to
+	// blow away the entire log, but in reality we will probably just
+	// install a new snapshot at some point soon anyway.
+	follower.restart()
+	c.waitOnAllCurrent()
+	leaderrg.wal.FastState(&ss)
+	require_NotEqual(t, ss.LastSeq, 0)
+}
+
 // This is a RaftChainOfBlocks test where a block is proposed and then we wait for all replicas to apply it before
 // proposing the next one.
 // The test may fail if:
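As an aside, the test corrupts the leader's snapshot by shaving one byte off the end of the file. The following standalone sketch shows that technique in isolation using only the Go standard library; the temp-file setup is purely illustrative and not part of the test.

// Standalone illustration of the corruption technique used in the test above:
// trimming one byte off the end of a file with os.Truncate so that a later
// load of that file fails its integrity/size checks. The file created here is
// hypothetical; in the test it is the leader's raft snapshot file.
package main

import (
	"fmt"
	"log"
	"os"
)

func corruptByTruncation(path string) error {
	st, err := os.Stat(path)
	if err != nil {
		return err
	}
	// Drop the final byte, leaving an almost-valid file on disk.
	return os.Truncate(path, st.Size()-1)
}

func main() {
	f, err := os.CreateTemp("", "snap.*.dat")
	if err != nil {
		log.Fatal(err)
	}
	defer os.Remove(f.Name())
	if _, err := f.WriteString("snapshot-bytes"); err != nil {
		log.Fatal(err)
	}
	f.Close()

	if err := corruptByTruncation(f.Name()); err != nil {
		log.Fatal(err)
	}
	st, _ := os.Stat(f.Name())
	fmt.Println("size after truncation:", st.Size()) // one byte shorter
}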

0 commit comments
