Skip to content

Commit b3a6035

Browse files
fix(network): validate HELLO before the simultaneous-dial index update
`update_ingress_node_id` ran before HELLO validation, so a simultaneous-dial `Replaced` overwrote `node_id_index` to the new ingress; if HELLO then failed validation only the new stream was killed, leaving the old session in the slab but absent from the index — unaddressable by node id. Validate HELLO into a candidate first and commit it (protocols, `node_db`, `had_hello`) only once the session is accepted, so a failed HELLO never touches the index. Surfaced via #3510. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent bcdec2d commit b3a6035

1 file changed

Lines changed: 51 additions & 29 deletions

File tree

crates/network/src/session.rs

Lines changed: 51 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,14 @@ pub struct SessionDataWithDisconnectInfo {
9090
pub token_to_disconnect: Option<(StreamToken, String)>,
9191
}
9292

93+
/// A HELLO validated but not yet committed; `commit_hello` applies it only
94+
/// after the simultaneous-dial tie-break accepts the session.
95+
struct HelloCandidate {
96+
peer_protocols: Vec<ProtocolInfo>,
97+
node_entry: NodeEntry,
98+
pos_public_key: Option<(ConsensusPublicKey, ConsensusVRFPublicKey)>,
99+
}
100+
93101
// id for Hello packet
94102
const PACKET_HELLO: u8 = 0x80;
95103
// id for Disconnect packet
@@ -302,11 +310,16 @@ impl Session {
302310
PACKET_HELLO => {
303311
debug!("Read HELLO in session {:?}", self);
304312
self.metadata.peer_header_version = packet.header_version;
305-
// For ingress session, update the node id in `SessionManager`
313+
314+
// Validate before the tie-break touches the index, so a failed
315+
// HELLO can't orphan a replaced session's index entry.
316+
let rlp = Rlp::new(&packet.data);
317+
let candidate = self.validate_hello(&rlp, host)?;
318+
306319
let token_to_disconnect = self.update_ingress_node_id(host)?;
307320

308-
// Dropped by the tie-break: stop before `Ready` so the loser
309-
// never reaches `on_peer_connected`; the caller kills it.
321+
// Lost the tie-break: stop before `Ready` and commit nothing;
322+
// the caller kills it.
310323
if token_to_disconnect.as_ref().map(|(t, _)| *t)
311324
== Some(self.token())
312325
{
@@ -316,9 +329,7 @@ impl Session {
316329
});
317330
}
318331

319-
// Handle Hello packet to exchange protocols
320-
let rlp = Rlp::new(&packet.data);
321-
let pos_public_key = self.read_hello(&rlp, host)?;
332+
let pos_public_key = self.commit_hello(candidate, host);
322333
Ok(SessionDataWithDisconnectInfo {
323334
session_data: SessionData::Ready { pos_public_key },
324335
token_to_disconnect,
@@ -406,16 +417,12 @@ impl Session {
406417
}
407418
}
408419

409-
/// Read Hello packet to exchange the supported protocols, and set the
410-
/// `had_hello` flag to indicates that session is ready to send/receive
411-
/// protocol packets.
412-
///
413-
/// Besides, the node endpoint of remote peer will be added or updated in
414-
/// node database, which is used to establish outgoing connections.
415-
fn read_hello(
420+
/// Validate a HELLO into a `HelloCandidate` without committing it. The
421+
/// caller commits via `commit_hello` only after the tie-break accepts the
422+
/// session, so a failed HELLO never half-updates the index.
423+
fn validate_hello(
416424
&mut self, rlp: &Rlp, host: &NetworkServiceInner,
417-
) -> Result<Option<(ConsensusPublicKey, ConsensusVRFPublicKey)>, Error>
418-
{
425+
) -> Result<HelloCandidate, Error> {
419426
let remote_network_id: u64 = rlp.val_at(0)?;
420427
if remote_network_id != host.metadata.network_id {
421428
debug!(
@@ -450,8 +457,7 @@ impl Session {
450457
.any(|hc| hc.protocol == c.protocol && hc.version <= c.version)
451458
});
452459

453-
self.metadata.peer_protocols = peer_caps;
454-
if self.metadata.peer_protocols.is_empty() {
460+
if peer_caps.is_empty() {
455461
debug!("No common capabilities with remote peer, peer_node_id = {:?}, session = {:?}", self.metadata.id, self);
456462
return Err(self.send_disconnect(DisconnectReason::UselessPeer));
457463
}
@@ -487,14 +493,11 @@ impl Session {
487493
entry, self
488494
);
489495
return Err(self.send_disconnect(DisconnectReason::IpLimited));
490-
} else {
491-
debug!("Received valid endpoint {:?}, session = {:?}", entry, self);
492-
host.node_db.write().insert_with_token(entry, self.token());
493496
}
497+
debug!("Received valid endpoint {:?}, session = {:?}", entry, self);
494498

495-
self.had_hello = Some(Instant::now());
496-
match rlp.item_count()? {
497-
3 => Ok(None),
499+
let pos_public_key = match rlp.item_count()? {
500+
3 => None,
498501
4 => {
499502
// FIXME(lpl): Verify keys.
500503
let pos_public_key_bytes: Vec<u8> = rlp.val_at(3)?;
@@ -511,13 +514,32 @@ impl Session {
511514
)
512515
.map_err(|e| Error::Decoder(format!("{:?}", e)))?;
513516

514-
Ok(Some((bls_pub_key, vrf_pub_key)))
517+
Some((bls_pub_key, vrf_pub_key))
515518
}
516-
length => Err(Error::Decoder(format!(
517-
"Hello has incorrect rlp length: {:?}",
518-
length
519-
))),
520-
}
519+
length => {
520+
return Err(Error::Decoder(format!(
521+
"Hello has incorrect rlp length: {:?}",
522+
length
523+
)))
524+
}
525+
};
526+
527+
Ok(HelloCandidate {
528+
peer_protocols: peer_caps,
529+
node_entry: entry,
530+
pos_public_key,
531+
})
532+
}
533+
534+
fn commit_hello(
535+
&mut self, candidate: HelloCandidate, host: &NetworkServiceInner,
536+
) -> Option<(ConsensusPublicKey, ConsensusVRFPublicKey)> {
537+
self.metadata.peer_protocols = candidate.peer_protocols;
538+
host.node_db
539+
.write()
540+
.insert_with_token(candidate.node_entry, self.token());
541+
self.had_hello = Some(Instant::now());
542+
candidate.pos_public_key
521543
}
522544

523545
/// Assemble a packet with specified protocol id, packet id and data.

0 commit comments

Comments
 (0)