@@ -1975,3 +1975,60 @@ func TestHandleAliveMessage_RelearnsMemberAfterConcurrentPurge(t *testing.T) {
19751975 require .True (t , inID2Member , "member should be present in id2Member after re-learning" )
19761976 require .True (t , inAliveLastTS , "member should be present in aliveLastTS after re-learning" )
19771977}
1978+
1979+ func TestLearnExistingMembers_NilMemberAfterConcurrentPurge (t * testing.T ) {
1980+ // 1) Initialize a discovery instance (use existing helpers like createDiscoveryInstanceWithNoGossip).
1981+ inst := createDiscoveryInstanceWithNoGossip (10000 , "testInst" , nil )
1982+ defer inst .Stop ()
1983+
1984+ // Access the underlying implementation
1985+ d := inst .discoveryImpl ()
1986+
1987+ // 2) Prepare a PKIid and endpoint.
1988+ pkiID := common .PKIidType ("test-pki-id" )
1989+ endpoint := "localhost:1234"
1990+
1991+ // 3) Under lock, insert an entry into aliveLastTS for that PKIid.
1992+ // 4) Do NOT insert the corresponding entry into id2Member (simulate it being purged).
1993+ d .lock .Lock ()
1994+ d .aliveLastTS [string (pkiID )] = & timestamp {
1995+ incTime : time .Now (),
1996+ seqNum : 1 ,
1997+ lastSeen : time .Now (),
1998+ }
1999+ d .lock .Unlock ()
2000+
2001+ // 5) Build a valid AliveMessage and wrap it with protoext.NoopSign.
2002+ aliveMsg := & proto.GossipMessage {
2003+ Tag : proto .GossipMessage_EMPTY ,
2004+ Content : & proto.GossipMessage_AliveMsg {
2005+ AliveMsg : & proto.AliveMessage {
2006+ Membership : & proto.Member {
2007+ PkiId : pkiID ,
2008+ Endpoint : endpoint ,
2009+ },
2010+ Timestamp : & proto.PeerTime {
2011+ IncNum : uint64 (time .Now ().UnixNano ()),
2012+ SeqNum : 2 ,
2013+ },
2014+ },
2015+ },
2016+ }
2017+ signedMsg , err := protoext .NoopSign (aliveMsg )
2018+ require .NoError (t , err )
2019+
2020+ // We invoke learnExistingMembers() directly to deterministically reproduce
2021+ // the inconsistent state where the member is present in aliveLastTS but
2022+ // missing from id2Member.
2023+ //
2024+ // In the real flow, handleAliveMessage first reads state under a read lock,
2025+ // and then learnExistingMembers acquires a write lock. A concurrent purge
2026+ // can remove the member between these two steps, leading to a nil access.
2027+ //
2028+ // Reproducing this via the full handleAliveMessage path would require a
2029+ // timing-dependent race, so we simulate the exact post-condition directly
2030+ // to keep the test deterministic and reliable.
2031+ require .NotPanics (t , func () {
2032+ d .learnExistingMembers ([]* protoext.SignedGossipMessage {signedMsg })
2033+ }, "learnExistingMembers should not panic when member is nil in id2Member" )
2034+ }
0 commit comments