Skip to content

Commit a9c09b0

Browse files
collator-protocol: Re-advertise collations when peer authority IDs are updated (#10891)
The collator protocol contained a race-condition which could manifest as "Collation wasn't advertised". A given peer ("A") can connect before the new authority keys are received via `UpdatedAuthorityIds` (nk -- new key). - T0: peer A connects`PeerConnected` - T1: peer A sends its current view `PeerViewChange` - Peer A wants the block N - T2: `validator_group.should_advertise_to`: checks peer A for key nK (the new key) - We don't have this key stored and therefore return `ShouldAdvertiseTo::NotAuthority` - T3: `UpdatedAuthorityIds` arrives with (peer A, [nK]) At this point, we have the collation, peer A wants to collation, we know peer A is an authority but we never send the collation back. Then, the collation will expire with "Collation wasn't advertised". To close the gap, the `UpdatedAuthorityIds` events will trigger a re-advertisement of collations - note: if the advertisement was already sent, the logic does not resend it (achieved in should_advertise_to). Part of the stabilization of: - #10425 --------- Signed-off-by: Alexandru Vasile <[email protected]> Co-authored-by: cmd[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent 81bdc42 commit a9c09b0

File tree

3 files changed

+289
-39
lines changed

3 files changed

+289
-39
lines changed

polkadot/node/network/collator-protocol/src/collator_side/mod.rs

Lines changed: 84 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1276,61 +1276,85 @@ async fn handle_incoming_request<Context>(
12761276
Ok(())
12771277
}
12781278

1279-
/// Peer's view has changed. Send advertisements for new relay parents
1280-
/// if there're any.
1279+
/// Advertises collations for the given relay parents to the specified peer.
1280+
///
1281+
/// Returns a list of unknown relay parents.
12811282
#[overseer::contextbounds(CollatorProtocol, prefix = self::overseer)]
1282-
async fn handle_peer_view_change<Context>(
1283+
async fn advertise_collations_for_relay_parents<Context>(
12831284
ctx: &mut Context,
12841285
state: &mut State,
1285-
peer_id: PeerId,
1286-
view: View,
1287-
) {
1288-
let Some(PeerData { view: current, unknown_heads }) = state.peer_data.get_mut(&peer_id) else {
1289-
return;
1290-
};
1291-
1292-
let added: Vec<Hash> = view.difference(&*current).cloned().collect();
1286+
peer_id: &PeerId,
1287+
relay_parents: impl IntoIterator<Item = Hash>,
1288+
) -> Vec<Hash> {
1289+
let mut unknown_relay_parents = Vec::new();
12931290

1294-
*current = view;
1295-
1296-
for added in added.into_iter() {
1297-
let block_hashes = match state.per_relay_parent.contains_key(&added) {
1291+
for relay_parent in relay_parents {
1292+
let block_hashes = match state.per_relay_parent.contains_key(&relay_parent) {
12981293
true => state
12991294
.implicit_view
13001295
.as_ref()
13011296
.and_then(|implicit_view| {
1302-
implicit_view.known_allowed_relay_parents_under(&added, state.collating_on)
1297+
implicit_view
1298+
.known_allowed_relay_parents_under(&relay_parent, state.collating_on)
13031299
})
13041300
.unwrap_or_default(),
13051301
false => {
1306-
gum::trace!(
1307-
target: LOG_TARGET,
1308-
?peer_id,
1309-
new_leaf = ?added,
1310-
"New leaf in peer's view is unknown",
1311-
);
1312-
1313-
unknown_heads.insert(added, ());
1314-
1302+
unknown_relay_parents.push(relay_parent);
13151303
continue;
13161304
},
13171305
};
13181306

13191307
for block_hash in block_hashes {
1320-
let Some(per_relay_parent) = state.per_relay_parent.get_mut(block_hash) else {
1321-
continue;
1322-
};
1308+
if let Some(per_relay_parent) = state.per_relay_parent.get_mut(block_hash) {
1309+
advertise_collation(
1310+
ctx,
1311+
*block_hash,
1312+
per_relay_parent,
1313+
peer_id,
1314+
&state.peer_ids,
1315+
&mut state.advertisement_timeouts,
1316+
&state.metrics,
1317+
)
1318+
.await;
1319+
}
1320+
}
1321+
}
13231322

1324-
advertise_collation(
1325-
ctx,
1326-
*block_hash,
1327-
per_relay_parent,
1328-
&peer_id,
1329-
&state.peer_ids,
1330-
&mut state.advertisement_timeouts,
1331-
&state.metrics,
1332-
)
1333-
.await;
1323+
unknown_relay_parents
1324+
}
1325+
1326+
/// Peer's view has changed. Send advertisements for new relay parents
1327+
/// if there're any.
1328+
#[overseer::contextbounds(CollatorProtocol, prefix = self::overseer)]
1329+
async fn handle_peer_view_change<Context>(
1330+
ctx: &mut Context,
1331+
state: &mut State,
1332+
peer_id: PeerId,
1333+
view: View,
1334+
) {
1335+
let Some(added) = state.peer_data.get_mut(&peer_id).map(|peer| {
1336+
let diff: Vec<Hash> = view.difference(&peer.view).cloned().collect();
1337+
peer.view = view;
1338+
diff
1339+
}) else {
1340+
return;
1341+
};
1342+
1343+
let unknown_relay_parents =
1344+
advertise_collations_for_relay_parents(ctx, state, &peer_id, added).await;
1345+
1346+
if !unknown_relay_parents.is_empty() {
1347+
gum::trace!(
1348+
target: LOG_TARGET,
1349+
?peer_id,
1350+
new_leaves = ?unknown_relay_parents,
1351+
"New leaves in peer's view are unknown",
1352+
);
1353+
1354+
if let Some(PeerData { unknown_heads, .. }) = state.peer_data.get_mut(&peer_id) {
1355+
for unknown in unknown_relay_parents {
1356+
unknown_heads.insert(unknown, ());
1357+
}
13341358
}
13351359
}
13361360
}
@@ -1443,8 +1467,29 @@ async fn handle_network_msg<Context>(
14431467
UpdatedAuthorityIds(peer_id, authority_ids) => {
14441468
gum::trace!(target: LOG_TARGET, ?peer_id, ?authority_ids, "Updated authority ids");
14451469
if state.peer_data.contains_key(&peer_id) {
1446-
if state.peer_ids.insert(peer_id, authority_ids).is_none() {
1470+
let is_new_peer = state.peer_ids.insert(peer_id, authority_ids).is_none();
1471+
1472+
if is_new_peer {
14471473
declare(ctx, state, &peer_id).await;
1474+
} else {
1475+
// Authority IDs changed for an existing peer. Re-advertise collations
1476+
// for relay parents already in their view, as the previous authority IDs
1477+
// may not have matched our validator groups.
1478+
let relay_parents_in_view: Vec<_> = state
1479+
.peer_data
1480+
.get(&peer_id)
1481+
.map(|data| data.view.iter().cloned().collect())
1482+
.unwrap_or_default();
1483+
1484+
// Unknown relay parents are ignored because they were
1485+
// handled when the peer's view was first processed.
1486+
let _ = advertise_collations_for_relay_parents(
1487+
ctx,
1488+
state,
1489+
&peer_id,
1490+
relay_parents_in_view,
1491+
)
1492+
.await;
14481493
}
14491494
}
14501495
},

polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2042,3 +2042,183 @@ fn connect_advertise_disconnect_three_backing_groups() {
20422042
},
20432043
);
20442044
}
2045+
2046+
/// Test that collations are re-advertised when a peer's authority IDs are updated
2047+
/// to match the validator group (when they previously didn't match).
2048+
///
2049+
/// This tests the fix for a race condition where:
2050+
/// 1. A peer connects with authority IDs that don't match the validator group
2051+
/// 2. A collation is distributed but can't be advertised (no authority ID match)
2052+
/// 3. The peer's view includes the relay parent
2053+
/// 4. UpdatedAuthorityIds arrives with new authority IDs that DO match
2054+
/// 5. The collation should now be advertised
2055+
#[test]
2056+
fn readvertise_collation_on_authority_id_update() {
2057+
let test_state = TestState::default();
2058+
let local_peer_id = test_state.local_peer_id;
2059+
let collator_pair = test_state.collator_pair.clone();
2060+
2061+
test_harness(
2062+
local_peer_id,
2063+
collator_pair,
2064+
ReputationAggregator::new(|_| true),
2065+
|mut test_harness| async move {
2066+
let virtual_overseer = &mut test_harness.virtual_overseer;
2067+
2068+
let peer = test_state.current_group_validator_peer_ids()[0];
2069+
// Use an unrelated authority ID that doesn't match the validator group
2070+
let unrelated_authority_id: AuthorityDiscoveryId = Sr25519Keyring::Eve.public().into();
2071+
// The actual authority ID that matches the validator group
2072+
let matching_authority_id =
2073+
test_state.current_group_validator_authority_ids()[0].clone();
2074+
2075+
overseer_send(virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await;
2076+
2077+
overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id))
2078+
.await;
2079+
2080+
update_view(
2081+
Some(test_state.current_group_validator_authority_ids()),
2082+
&test_state,
2083+
virtual_overseer,
2084+
vec![(test_state.relay_parent, 10)],
2085+
1,
2086+
)
2087+
.await;
2088+
2089+
// Connect peer with an authority ID that does NOT match the validator group
2090+
connect_peer(
2091+
virtual_overseer,
2092+
peer,
2093+
CollationVersion::V2,
2094+
Some(unrelated_authority_id),
2095+
)
2096+
.await;
2097+
2098+
// We still declare to the peer (declaration doesn't depend on validator group match)
2099+
expect_declare_msg(virtual_overseer, &test_state, &peer).await;
2100+
2101+
// Distribute a collation
2102+
let DistributeCollation { candidate, .. } = distribute_collation(
2103+
virtual_overseer,
2104+
test_state.current_group_validator_authority_ids(),
2105+
&test_state,
2106+
test_state.relay_parent,
2107+
CoreIndex(0),
2108+
)
2109+
.await;
2110+
2111+
// Send peer view change - peer is interested in the relay parent
2112+
send_peer_view_change(virtual_overseer, &peer, vec![test_state.relay_parent]).await;
2113+
2114+
// No advertisement should happen because the peer's authority ID (Eve)
2115+
// doesn't match any validator in the group for this collation
2116+
assert!(
2117+
overseer_recv_with_timeout(virtual_overseer, TIMEOUT).await.is_none(),
2118+
"Should not advertise to peer with non-matching authority ID"
2119+
);
2120+
2121+
// Now send UpdatedAuthorityIds with the correct authority ID
2122+
overseer_send(
2123+
virtual_overseer,
2124+
CollatorProtocolMessage::NetworkBridgeUpdate(
2125+
NetworkBridgeEvent::UpdatedAuthorityIds(
2126+
peer,
2127+
HashSet::from([matching_authority_id]),
2128+
),
2129+
),
2130+
)
2131+
.await;
2132+
2133+
// Now the collation should be advertised because the authority ID matches
2134+
expect_advertise_collation_msg(
2135+
virtual_overseer,
2136+
&[peer],
2137+
test_state.relay_parent,
2138+
vec![candidate.hash()],
2139+
)
2140+
.await;
2141+
2142+
test_harness
2143+
},
2144+
)
2145+
}
2146+
2147+
/// Test that UpdatedAuthorityIds for existing peers with unchanged matching authority IDs
2148+
/// doesn't cause duplicate advertisements (idempotency check).
2149+
#[test]
2150+
fn no_duplicate_advertisement_on_authority_id_update() {
2151+
let test_state = TestState::default();
2152+
let local_peer_id = test_state.local_peer_id;
2153+
let collator_pair = test_state.collator_pair.clone();
2154+
2155+
test_harness(
2156+
local_peer_id,
2157+
collator_pair,
2158+
ReputationAggregator::new(|_| true),
2159+
|mut test_harness| async move {
2160+
let virtual_overseer = &mut test_harness.virtual_overseer;
2161+
2162+
let peer = test_state.current_group_validator_peer_ids()[0];
2163+
let validator_id = test_state.current_group_validator_authority_ids()[0].clone();
2164+
2165+
overseer_send(virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await;
2166+
2167+
overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id))
2168+
.await;
2169+
2170+
update_view(
2171+
Some(test_state.current_group_validator_authority_ids()),
2172+
&test_state,
2173+
virtual_overseer,
2174+
vec![(test_state.relay_parent, 10)],
2175+
1,
2176+
)
2177+
.await;
2178+
2179+
// Connect peer with matching authority ID
2180+
connect_peer(virtual_overseer, peer, CollationVersion::V2, Some(validator_id.clone()))
2181+
.await;
2182+
expect_declare_msg(virtual_overseer, &test_state, &peer).await;
2183+
2184+
// Distribute a collation
2185+
let DistributeCollation { candidate, .. } = distribute_collation(
2186+
virtual_overseer,
2187+
test_state.current_group_validator_authority_ids(),
2188+
&test_state,
2189+
test_state.relay_parent,
2190+
CoreIndex(0),
2191+
)
2192+
.await;
2193+
2194+
// Peer view change triggers advertisement
2195+
send_peer_view_change(virtual_overseer, &peer, vec![test_state.relay_parent]).await;
2196+
2197+
// First advertisement
2198+
expect_advertise_collation_msg(
2199+
virtual_overseer,
2200+
&[peer],
2201+
test_state.relay_parent,
2202+
vec![candidate.hash()],
2203+
)
2204+
.await;
2205+
2206+
// Send UpdatedAuthorityIds with the same authority ID
2207+
overseer_send(
2208+
virtual_overseer,
2209+
CollatorProtocolMessage::NetworkBridgeUpdate(
2210+
NetworkBridgeEvent::UpdatedAuthorityIds(peer, HashSet::from([validator_id])),
2211+
),
2212+
)
2213+
.await;
2214+
2215+
// No duplicate advertisement should happen (already advertised to this validator)
2216+
assert!(
2217+
overseer_recv_with_timeout(virtual_overseer, TIMEOUT).await.is_none(),
2218+
"Should not re-advertise to peer that was already advertised to"
2219+
);
2220+
2221+
test_harness
2222+
},
2223+
)
2224+
}

prdoc/pr_10891.prdoc

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
title: 'collator-protocol: Re-advertise collations when peer authority IDs are updated'
2+
doc:
3+
- audience: Node Dev
4+
description: |-
5+
The collator protocol contained a race-condition which could manifest as "Collation wasn't advertised".
6+
7+
A given peer ("A") can connect before the new authority keys are received via `UpdatedAuthorityIds` (nk -- new key).
8+
9+
- T0: peer A connects`PeerConnected`
10+
- T1: peer A sends its current view `PeerViewChange`
11+
- Peer A wants the block N
12+
- T2: `validator_group.should_advertise_to`: checks peer A for key nK (the new key)
13+
- We don't have this key stored and therefore return `ShouldAdvertiseTo::NotAuthority`
14+
- T3: `UpdatedAuthorityIds` arrives with (peer A, [nK])
15+
16+
At this point, we have the collation, peer A wants to collation, we know peer A is an authority but we never send the collation back. Then, the collation will expire with "Collation wasn't advertised".
17+
18+
To close the gap, the `UpdatedAuthorityIds` events will trigger a re-advertisement of collations
19+
- note: if the advertisement was already sent, the logic does not resend it (achieved in should_advertise_to).
20+
21+
Part of the stabilization of:
22+
- https://github.com/paritytech/polkadot-sdk/issues/10425
23+
crates:
24+
- name: polkadot-collator-protocol
25+
bump: patch

0 commit comments

Comments
 (0)