123 changes: 84 additions & 39 deletions polkadot/node/network/collator-protocol/src/collator_side/mod.rs
@@ -1262,61 +1262,85 @@ async fn handle_incoming_request<Context>(
 	Ok(())
 }
 
-/// Peer's view has changed. Send advertisements for new relay parents
-/// if there're any.
+/// Advertises collations for the given relay parents to the specified peer.
+///
+/// Returns a list of unknown relay parents.
 #[overseer::contextbounds(CollatorProtocol, prefix = self::overseer)]
-async fn handle_peer_view_change<Context>(
+async fn advertise_collations_for_relay_parents<Context>(
 	ctx: &mut Context,
 	state: &mut State,
-	peer_id: PeerId,
-	view: View,
-) {
-	let Some(PeerData { view: current, unknown_heads }) = state.peer_data.get_mut(&peer_id) else {
-		return;
-	};
-
-	let added: Vec<Hash> = view.difference(&*current).cloned().collect();
-
-	*current = view;
-
-	for added in added.into_iter() {
-		let block_hashes = match state.per_relay_parent.contains_key(&added) {
+	peer_id: &PeerId,
+	relay_parents: impl IntoIterator<Item = Hash>,
+) -> Vec<Hash> {
+	let mut unknown_relay_parents = Vec::new();
+
+	for relay_parent in relay_parents {
+		let block_hashes = match state.per_relay_parent.contains_key(&relay_parent) {
 			true => state
 				.implicit_view
 				.as_ref()
 				.and_then(|implicit_view| {
-					implicit_view.known_allowed_relay_parents_under(&added, state.collating_on)
+					implicit_view
+						.known_allowed_relay_parents_under(&relay_parent, state.collating_on)
 				})
 				.unwrap_or_default(),
 			false => {
-				gum::trace!(
-					target: LOG_TARGET,
-					?peer_id,
-					new_leaf = ?added,
-					"New leaf in peer's view is unknown",
-				);
-
-				unknown_heads.insert(added, ());
-
+				unknown_relay_parents.push(relay_parent);
				continue;
			},
		};
 
 		for block_hash in block_hashes {
-			let Some(per_relay_parent) = state.per_relay_parent.get_mut(block_hash) else {
-				continue;
-			};
-
-			advertise_collation(
-				ctx,
-				*block_hash,
-				per_relay_parent,
-				&peer_id,
-				&state.peer_ids,
-				&mut state.advertisement_timeouts,
-				&state.metrics,
-			)
-			.await;
+			if let Some(per_relay_parent) = state.per_relay_parent.get_mut(block_hash) {
+				advertise_collation(
+					ctx,
+					*block_hash,
+					per_relay_parent,
+					peer_id,
+					&state.peer_ids,
+					&mut state.advertisement_timeouts,
+					&state.metrics,
+				)
+				.await;
+			}
 		}
 	}
+
+	unknown_relay_parents
 }
+
+/// Peer's view has changed. Send advertisements for new relay parents
+/// if there're any.
+#[overseer::contextbounds(CollatorProtocol, prefix = self::overseer)]
+async fn handle_peer_view_change<Context>(
+	ctx: &mut Context,
+	state: &mut State,
+	peer_id: PeerId,
+	view: View,
+) {
+	let Some(added) = state.peer_data.get_mut(&peer_id).map(|peer| {
+		let diff: Vec<Hash> = view.difference(&peer.view).cloned().collect();
+		peer.view = view;
+		diff
+	}) else {
+		return;
+	};
+
+	let unknown_relay_parents =
+		advertise_collations_for_relay_parents(ctx, state, &peer_id, added).await;
+
+	if !unknown_relay_parents.is_empty() {
+		gum::trace!(
+			target: LOG_TARGET,
+			?peer_id,
+			new_leaves = ?unknown_relay_parents,
+			"New leaves in peer's view are unknown",
+		);
+
+		if let Some(PeerData { unknown_heads, .. }) = state.peer_data.get_mut(&peer_id) {
+			for unknown in unknown_relay_parents {
+				unknown_heads.insert(unknown, ());
+			}
+		}
+	}
+}
@@ -1418,8 +1442,29 @@ async fn handle_network_msg<Context>(
 		UpdatedAuthorityIds(peer_id, authority_ids) => {
 			gum::trace!(target: LOG_TARGET, ?peer_id, ?authority_ids, "Updated authority ids");
 			if state.peer_data.contains_key(&peer_id) {
-				if state.peer_ids.insert(peer_id, authority_ids).is_none() {
+				let is_new_peer = state.peer_ids.insert(peer_id, authority_ids).is_none();
+
+				if is_new_peer {
 					declare(ctx, state, &peer_id).await;
+				} else {
+					// Authority IDs changed for an existing peer. Re-advertise collations
+					// for relay parents already in their view, as the previous authority IDs
+					// may not have matched our validator groups.
+					let relay_parents_in_view: Vec<_> = state
+						.peer_data
+						.get(&peer_id)
+						.map(|data| data.view.iter().cloned().collect())
+						.unwrap_or_default();
+
+					// Unknown relay parents are ignored because they were
+					// handled when the peer's view was first processed.
+					let _ = advertise_collations_for_relay_parents(
+						ctx,
+						state,
+						&peer_id,
+						relay_parents_in_view,
+					)
+					.await;
+				}
 			}
 		},
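
Both handlers now funnel into the same helper: `handle_peer_view_change` parks whatever comes back as unknown, while the `UpdatedAuthorityIds` path replays the peer's stored view and drops the result. A minimal sketch of that control flow, with `PeerId`, `Hash`, and `State` reduced to hypothetical stand-ins (the real overseer context, metrics, and timeout bookkeeping are omitted):

use std::collections::{HashMap, HashSet};

// Hypothetical stand-ins for the real collator-protocol types; only the
// control flow mirrors the change above.
type PeerId = u64;
type Hash = [u8; 32];

#[derive(Default)]
struct State {
	/// Relay parents we are able to advertise for.
	per_relay_parent: HashSet<Hash>,
	/// Heads a peer announced before we knew about them.
	unknown_heads: HashMap<PeerId, Vec<Hash>>,
}

/// Shared helper: advertise for every known relay parent and hand the
/// unknown ones back to the caller.
fn advertise_collations_for_relay_parents(
	state: &mut State,
	peer_id: &PeerId,
	relay_parents: impl IntoIterator<Item = Hash>,
) -> Vec<Hash> {
	let mut unknown = Vec::new();
	for rp in relay_parents {
		if state.per_relay_parent.contains(&rp) {
			println!("advertising {rp:?} to peer {peer_id}");
		} else {
			unknown.push(rp);
		}
	}
	unknown
}

/// Path 1: the peer's view changed; record unknown heads for later.
fn handle_peer_view_change(state: &mut State, peer_id: PeerId, added: Vec<Hash>) {
	let unknown = advertise_collations_for_relay_parents(state, &peer_id, added);
	state.unknown_heads.entry(peer_id).or_default().extend(unknown);
}

/// Path 2: authority IDs changed for a known peer; replay its current
/// view. Unknown heads were already recorded by path 1.
fn handle_updated_authority_ids(state: &mut State, peer_id: PeerId, view_in_state: Vec<Hash>) {
	let _ = advertise_collations_for_relay_parents(state, &peer_id, view_in_state);
}

fn main() {
	let mut state = State::default();
	state.per_relay_parent.insert([1; 32]);
	// View arrives first: [1; 32] is advertised, [2; 32] is parked as unknown.
	handle_peer_view_change(&mut state, 7, vec![[1; 32], [2; 32]]);
	// Authority update later: replaying the view re-runs the advertisement path.
	handle_updated_authority_ids(&mut state, 7, vec![[1; 32]]);
}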
180 changes: 180 additions & 0 deletions polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs
@@ -2025,3 +2025,183 @@ fn connect_advertise_disconnect_three_backing_groups() {
},
);
}

/// Test that collations are re-advertised when a peer's authority IDs are updated
/// to match the validator group (when they previously didn't match).
///
/// This tests the fix for a race condition where:
/// 1. A peer connects with authority IDs that don't match the validator group
/// 2. A collation is distributed but can't be advertised (no authority ID match)
/// 3. The peer's view includes the relay parent
/// 4. UpdatedAuthorityIds arrives with new authority IDs that DO match
/// 5. The collation should now be advertised
#[test]
fn readvertise_collation_on_authority_id_update() {
let test_state = TestState::default();
let local_peer_id = test_state.local_peer_id;
let collator_pair = test_state.collator_pair.clone();

test_harness(
local_peer_id,
collator_pair,
ReputationAggregator::new(|_| true),
|mut test_harness| async move {
let virtual_overseer = &mut test_harness.virtual_overseer;

let peer = test_state.current_group_validator_peer_ids()[0];
// Use an unrelated authority ID that doesn't match the validator group
let unrelated_authority_id: AuthorityDiscoveryId = Sr25519Keyring::Eve.public().into();
// The actual authority ID that matches the validator group
let matching_authority_id =
test_state.current_group_validator_authority_ids()[0].clone();

overseer_send(virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await;

overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id))
.await;

update_view(
Some(test_state.current_group_validator_authority_ids()),
&test_state,
virtual_overseer,
vec![(test_state.relay_parent, 10)],
1,
)
.await;

// Connect peer with an authority ID that does NOT match the validator group
connect_peer(
virtual_overseer,
peer,
CollationVersion::V2,
Some(unrelated_authority_id),
)
.await;

// We still declare to the peer (declaration doesn't depend on validator group match)
expect_declare_msg(virtual_overseer, &test_state, &peer).await;

// Distribute a collation
let DistributeCollation { candidate, .. } = distribute_collation(
virtual_overseer,
test_state.current_group_validator_authority_ids(),
&test_state,
test_state.relay_parent,
CoreIndex(0),
)
.await;

// Send peer view change - peer is interested in the relay parent
send_peer_view_change(virtual_overseer, &peer, vec![test_state.relay_parent]).await;

// No advertisement should happen because the peer's authority ID (Eve)
// doesn't match any validator in the group for this collation
assert!(
overseer_recv_with_timeout(virtual_overseer, TIMEOUT).await.is_none(),
"Should not advertise to peer with non-matching authority ID"
);

// Now send UpdatedAuthorityIds with the correct authority ID
overseer_send(
virtual_overseer,
CollatorProtocolMessage::NetworkBridgeUpdate(
NetworkBridgeEvent::UpdatedAuthorityIds(
peer,
HashSet::from([matching_authority_id]),
),
),
)
.await;

// Now the collation should be advertised because the authority ID matches
expect_advertise_collation_msg(
virtual_overseer,
&[peer],
test_state.relay_parent,
vec![candidate.hash()],
)
.await;

test_harness
},
)
}

/// Test that UpdatedAuthorityIds for existing peers with unchanged matching authority IDs
/// doesn't cause duplicate advertisements (idempotency check).
#[test]
fn no_duplicate_advertisement_on_authority_id_update() {
let test_state = TestState::default();
let local_peer_id = test_state.local_peer_id;
let collator_pair = test_state.collator_pair.clone();

test_harness(
local_peer_id,
collator_pair,
ReputationAggregator::new(|_| true),
|mut test_harness| async move {
let virtual_overseer = &mut test_harness.virtual_overseer;

let peer = test_state.current_group_validator_peer_ids()[0];
let validator_id = test_state.current_group_validator_authority_ids()[0].clone();

overseer_send(virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await;

overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id))
.await;

update_view(
Some(test_state.current_group_validator_authority_ids()),
&test_state,
virtual_overseer,
vec![(test_state.relay_parent, 10)],
1,
)
.await;

// Connect peer with matching authority ID
connect_peer(virtual_overseer, peer, CollationVersion::V2, Some(validator_id.clone()))
.await;
expect_declare_msg(virtual_overseer, &test_state, &peer).await;

// Distribute a collation
let DistributeCollation { candidate, .. } = distribute_collation(
virtual_overseer,
test_state.current_group_validator_authority_ids(),
&test_state,
test_state.relay_parent,
CoreIndex(0),
)
.await;

// Peer view change triggers advertisement
send_peer_view_change(virtual_overseer, &peer, vec![test_state.relay_parent]).await;

// First advertisement
expect_advertise_collation_msg(
virtual_overseer,
&[peer],
test_state.relay_parent,
vec![candidate.hash()],
)
.await;

// Send UpdatedAuthorityIds with the same authority ID
overseer_send(
virtual_overseer,
CollatorProtocolMessage::NetworkBridgeUpdate(
NetworkBridgeEvent::UpdatedAuthorityIds(peer, HashSet::from([validator_id])),
),
)
.await;

// No duplicate advertisement should happen (already advertised to this validator)
assert!(
overseer_recv_with_timeout(virtual_overseer, TIMEOUT).await.is_none(),
"Should not re-advertise to peer that was already advertised to"
);

test_harness
},
)
}
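
The second test leans on the advertisement gate being idempotent: replaying `UpdatedAuthorityIds` with an unchanged key must not produce a second advertisement. Below is a sketch of such a guard under simplified, assumed types; the real check is `should_advertise_to` in the validator-group tracking, and only the `ShouldAdvertiseTo::NotAuthority` outcome is confirmed by the prdoc that follows. The rest is illustrative.

use std::collections::HashSet;

// Hypothetical simplified types; the upstream implementation differs.
type AuthorityId = u64;

enum ShouldAdvertiseTo {
	Yes,
	NotAuthority,
	AlreadyAdvertised,
}

struct ValidatorGroup {
	/// Authority IDs we currently associate with the backing group.
	discovery_ids: HashSet<AuthorityId>,
	/// Authorities we have already advertised this collation to.
	advertised_to: HashSet<AuthorityId>,
}

impl ValidatorGroup {
	fn should_advertise_to(&self, id: &AuthorityId) -> ShouldAdvertiseTo {
		if !self.discovery_ids.contains(id) {
			// The race in the prdoc: the peer's key is not yet known.
			ShouldAdvertiseTo::NotAuthority
		} else if self.advertised_to.contains(id) {
			// Replaying the event with an unchanged key is a no-op.
			ShouldAdvertiseTo::AlreadyAdvertised
		} else {
			ShouldAdvertiseTo::Yes
		}
	}

	fn note_advertised(&mut self, id: AuthorityId) {
		// Record the send so a later replay falls into `AlreadyAdvertised`.
		self.advertised_to.insert(id);
	}
}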
25 changes: 25 additions & 0 deletions prdoc/pr_10891.prdoc
@@ -0,0 +1,25 @@
title: 'collator-protocol: Re-advertise collations when peer authority IDs are updated'
doc:
- audience: Node Dev
  description: |-
    The collator protocol contained a race condition which could manifest as "Collation wasn't advertised".

    A given peer ("A") can connect before its new authority key ("nK") is received via `UpdatedAuthorityIds`:

    - T0: peer A connects (`PeerConnected`)
    - T1: peer A sends its current view (`PeerViewChange`)
      - peer A wants block N
    - T2: `validator_group.should_advertise_to` checks peer A for the key nK
      - we don't have this key stored yet and therefore return `ShouldAdvertiseTo::NotAuthority`
    - T3: `UpdatedAuthorityIds` arrives with (peer A, [nK])

    At this point we have the collation, peer A wants the collation, and we know peer A is an authority, yet we never advertise to it. The collation then expires with "Collation wasn't advertised".

    To close the gap, `UpdatedAuthorityIds` events now trigger a re-advertisement of collations.
    - note: if an advertisement was already sent, the logic does not resend it (achieved in `should_advertise_to`).

    Part of the stabilization of:
    - https://github.com/paritytech/polkadot-sdk/issues/10425
crates:
- name: polkadot-collator-protocol
  bump: patch