Skip to content

Commit 6125890

Browse files
lexnvbkchrgithub-actions[bot]
authored
collator-protocol: Remove stale pending collations from the waiting queue (#10906)
This PR removes the stale pending collations from the waiting queue when the peer that advertised the collation disconnects. When the peer reconnects, the peer data is freshly created without any prior information about advertised collations. Then the state-pending collation is picked from the queue. The network request will not be emitted since the `fn fetch_collation` sees no prior advertisement via `peer_data.has_advertised` and returns `Err(FetchError::NotAdvertised)`. To avoid this, remove the stale entries immediately when the peer disconnects. Part of the stabilization of: - #10425 --------- Signed-off-by: Alexandru Vasile <[email protected]> Co-authored-by: Bastian Köcher <[email protected]> Co-authored-by: cmd[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent 03ae1a7 commit 6125890

File tree

4 files changed

+162
-0
lines changed

4 files changed

+162
-0
lines changed

polkadot/node/network/collator-protocol/src/validator_side/collation.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,13 @@ impl Collations {
324324
pub(super) fn queued_for_para(&self, para_id: &ParaId) -> usize {
325325
self.waiting_queue.get(para_id).map(|queue| queue.len()).unwrap_or_default()
326326
}
327+
328+
/// Remove all pending collations for a specific peer from the waiting queue.
329+
pub(super) fn remove_pending_for_peer(&mut self, peer_id: &PeerId) {
330+
for queue in self.waiting_queue.values_mut() {
331+
queue.retain(|(pending, _)| &pending.peer_id != peer_id);
332+
}
333+
}
327334
}
328335

329336
// Any error that can occur when awaiting a collation fetch response.

polkadot/node/network/collator-protocol/src/validator_side/mod.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1695,6 +1695,11 @@ async fn handle_network_msg<Context>(
16951695
},
16961696
PeerDisconnected(peer_id) => {
16971697
state.peer_data.remove(&peer_id);
1698+
// Clean up any pending collations from this peer in all waiting queues.
1699+
// This prevents stale entries when the peer reconnects with empty advertisements.
1700+
for per_relay_parent in state.per_relay_parent.values_mut() {
1701+
per_relay_parent.collations.remove_pending_for_peer(&peer_id);
1702+
}
16981703
state.metrics.note_collator_peer_count(state.peer_data.len());
16991704
},
17001705
NewGossipTopology { .. } => {

polkadot/node/network/collator-protocol/src/validator_side/tests/mod.rs

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1161,3 +1161,137 @@ fn view_change_clears_old_collators() {
11611161
virtual_overseer
11621162
})
11631163
}
1164+
1165+
/// Test that when a peer disconnects, their pending collations are removed from the waiting queue.
1166+
/// This prevents "NotAdvertised" errors when the peer reconnects with empty advertisement state.
1167+
#[test]
1168+
fn peer_disconnect_clears_pending_collations_from_waiting_queue() {
1169+
let mut test_state = TestState::default();
1170+
1171+
test_harness(ReputationAggregator::new(|_| true), HashSet::new(), |test_harness| async move {
1172+
let TestHarness { mut virtual_overseer, .. } = test_harness;
1173+
1174+
let relay_parent = test_state.relay_parent;
1175+
update_view(&mut virtual_overseer, &mut test_state, vec![(relay_parent, 0)]).await;
1176+
1177+
// Connect first collator and have them advertise - this will trigger a fetch.
1178+
let peer_a = PeerId::random();
1179+
let collator_a = test_state.collators[0].clone();
1180+
1181+
connect_and_declare_collator(
1182+
&mut virtual_overseer,
1183+
peer_a,
1184+
collator_a.clone(),
1185+
test_state.chain_ids[0],
1186+
CollationVersion::V1,
1187+
)
1188+
.await;
1189+
1190+
advertise_collation(&mut virtual_overseer, peer_a, relay_parent, None).await;
1191+
1192+
// First collation fetch is initiated.
1193+
let response_channel_a = assert_fetch_collation_request(
1194+
&mut virtual_overseer,
1195+
relay_parent,
1196+
test_state.chain_ids[0],
1197+
None,
1198+
)
1199+
.await;
1200+
1201+
// Connect second collator and have them advertise.
1202+
// Since we're already fetching, this goes into the waiting queue.
1203+
let peer_b = PeerId::random();
1204+
let collator_b = test_state.collators[1].clone();
1205+
1206+
connect_and_declare_collator(
1207+
&mut virtual_overseer,
1208+
peer_b,
1209+
collator_b.clone(),
1210+
test_state.chain_ids[0],
1211+
CollationVersion::V1,
1212+
)
1213+
.await;
1214+
1215+
advertise_collation(&mut virtual_overseer, peer_b, relay_parent, None).await;
1216+
1217+
// Now disconnect peer_b. This should clean up their entry from the waiting queue.
1218+
overseer_send(
1219+
&mut virtual_overseer,
1220+
CollatorProtocolMessage::NetworkBridgeUpdate(NetworkBridgeEvent::PeerDisconnected(
1221+
peer_b,
1222+
)),
1223+
)
1224+
.await;
1225+
1226+
// Peer_b reconnects and declares again (but does NOT re-advertise yet).
1227+
overseer_send(
1228+
&mut virtual_overseer,
1229+
CollatorProtocolMessage::NetworkBridgeUpdate(NetworkBridgeEvent::PeerConnected(
1230+
peer_b,
1231+
ObservedRole::Full,
1232+
CollationVersion::V1.into(),
1233+
None,
1234+
)),
1235+
)
1236+
.await;
1237+
1238+
overseer_send(
1239+
&mut virtual_overseer,
1240+
CollatorProtocolMessage::NetworkBridgeUpdate(NetworkBridgeEvent::PeerMessage(
1241+
peer_b,
1242+
CollationProtocols::V1(protocol_v1::CollatorProtocolMessage::Declare(
1243+
collator_b.public(),
1244+
test_state.chain_ids[0],
1245+
collator_b.sign(&protocol_v1::declare_signature_payload(&peer_b)),
1246+
)),
1247+
)),
1248+
)
1249+
.await;
1250+
1251+
// Complete the first fetch from peer_a.
1252+
let pov = PoV { block_data: BlockData(vec![]) };
1253+
let mut candidate_a =
1254+
dummy_candidate_receipt_bad_sig(dummy_hash(), Some(Default::default()));
1255+
candidate_a.descriptor.para_id = test_state.chain_ids[0];
1256+
candidate_a.descriptor.relay_parent = relay_parent;
1257+
candidate_a.descriptor.persisted_validation_data_hash = dummy_pvd().hash();
1258+
1259+
response_channel_a
1260+
.send(Ok((
1261+
request_v1::CollationFetchingResponse::Collation(
1262+
candidate_a.clone().into(),
1263+
pov.clone(),
1264+
)
1265+
.encode(),
1266+
ProtocolName::from(""),
1267+
)))
1268+
.expect("Sending response should succeed");
1269+
1270+
// This triggers candidate backing.
1271+
assert_candidate_backing_second(
1272+
&mut virtual_overseer,
1273+
relay_parent,
1274+
test_state.chain_ids[0],
1275+
&pov,
1276+
CollationVersion::V1,
1277+
)
1278+
.await;
1279+
1280+
// Ensure the subsystem is polled.
1281+
test_helpers::Yield::new().await;
1282+
1283+
// The key assertion: after completing the first fetch, the subsystem should NOT
1284+
// attempt to fetch from peer_b because their waiting queue entry was cleaned up
1285+
// on disconnect. Without the fix, we would see a fetch request here that would
1286+
// fail with "NotAdvertised" because peer_b's advertisement state was cleared
1287+
// when they disconnected.
1288+
assert!(
1289+
overseer_recv_with_timeout(&mut virtual_overseer, Duration::from_millis(100))
1290+
.await
1291+
.is_none(),
1292+
"There should be no fetch request for peer_b - their entry was cleaned from waiting queue on disconnect"
1293+
);
1294+
1295+
virtual_overseer
1296+
})
1297+
}

prdoc/pr_10906.prdoc

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
title: 'collator-protocol: Remove stale pending collations from the waiting queue'
2+
doc:
3+
- audience: Node Dev
4+
description: |-
5+
This PR removes the stale pending collations from the waiting queue when the peer that advertised the collation disconnects.
6+
7+
When the peer reconnects, the peer data is freshly created without any prior information about advertised collations.
8+
Then the state-pending collation is picked from the queue. The network request will not be emitted since the `fn fetch_collation` sees no prior advertisement via `peer_data.has_advertised` and returns `Err(FetchError::NotAdvertised)`.
9+
10+
To avoid this, remove the stale entries immediately when the peer disconnects.
11+
12+
Part of the stabilization of:
13+
- https://github.com/paritytech/polkadot-sdk/issues/10425
14+
crates:
15+
- name: polkadot-collator-protocol
16+
bump: patch

0 commit comments

Comments
 (0)