diff --git a/beacon_node/beacon_chain/src/block_verification_types.rs b/beacon_node/beacon_chain/src/block_verification_types.rs index 7abaf09e5e0..84011e23ff9 100644 --- a/beacon_node/beacon_chain/src/block_verification_types.rs +++ b/beacon_node/beacon_chain/src/block_verification_types.rs @@ -199,7 +199,7 @@ impl RpcBlock { custody_columns: Vec>, expected_custody_indices: Vec, spec: &ChainSpec, - ) -> Result { + ) -> Result { let block_root = block_root.unwrap_or_else(|| get_block_root(&block)); let custody_columns_count = expected_custody_indices.len(); @@ -209,11 +209,7 @@ impl RpcBlock { custody_columns, spec.number_of_columns as usize, ) - .map_err(|e| { - AvailabilityCheckError::Unexpected(format!( - "custody_columns len exceeds number_of_columns: {e:?}" - )) - })?, + .map_err(|e| format!("custody_columns len exceeds number_of_columns: {e:?}"))?, expected_custody_indices, }; Ok(Self { diff --git a/beacon_node/beacon_chain/src/test_utils.rs b/beacon_node/beacon_chain/src/test_utils.rs index 9aea52bbee6..c4044c2ce7a 100644 --- a/beacon_node/beacon_chain/src/test_utils.rs +++ b/beacon_node/beacon_chain/src/test_utils.rs @@ -2412,7 +2412,8 @@ where columns, expected_custody_indices, &self.spec, - )? + ) + .map_err(BlockError::InternalError)? } else { RpcBlock::new_without_blobs(Some(block_root), block, sampling_column_count) } diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index b36f8cc2154..8300ad4bb89 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -59,6 +59,14 @@ pub struct BlobsByRangeRequestId { pub struct DataColumnsByRangeRequestId { /// Id to identify this attempt at a data_columns_by_range request for `parent_request_id` pub id: Id, + /// The Id of the parent custody by range request that issued this data_columns_by_range request + pub parent_request_id: CustodyByRangeRequestId, +} + +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct CustodyByRangeRequestId { + /// Id to identify this attempt at a meta custody by range request for `parent_request_id` + pub id: Id, /// The Id of the overall By Range request for block components. pub parent_request_id: ComponentsByRangeRequestId, } @@ -221,6 +229,7 @@ macro_rules! 
impl_display { impl_display!(BlocksByRangeRequestId, "{}/{}", id, parent_request_id); impl_display!(BlobsByRangeRequestId, "{}/{}", id, parent_request_id); impl_display!(DataColumnsByRangeRequestId, "{}/{}", id, parent_request_id); +impl_display!(CustodyByRangeRequestId, "{}/{}", id, parent_request_id); impl_display!(ComponentsByRangeRequestId, "{}/{}", id, requester); impl_display!(DataColumnsByRootRequestId, "{}/{}", id, requester); impl_display!(SingleLookupReqId, "{}/Lookup/{}", req_id, lookup_id); @@ -299,14 +308,17 @@ mod tests { fn display_id_data_columns_by_range() { let id = DataColumnsByRangeRequestId { id: 123, - parent_request_id: ComponentsByRangeRequestId { + parent_request_id: CustodyByRangeRequestId { id: 122, - requester: RangeRequestId::RangeSync { - chain_id: 54, - batch_id: Epoch::new(0), + parent_request_id: ComponentsByRangeRequestId { + id: 121, + requester: RangeRequestId::RangeSync { + chain_id: 54, + batch_id: Epoch::new(0), + }, }, }, }; - assert_eq!(format!("{id}"), "123/122/RangeSync/0/54"); + assert_eq!(format!("{id}"), "123/122/121/RangeSync/0/54"); } } diff --git a/beacon_node/lighthouse_network/src/types/globals.rs b/beacon_node/lighthouse_network/src/types/globals.rs index fd99d935890..7fa751a9057 100644 --- a/beacon_node/lighthouse_network/src/types/globals.rs +++ b/beacon_node/lighthouse_network/src/types/globals.rs @@ -245,6 +245,25 @@ impl NetworkGlobals { Self::new_test_globals_with_metadata(trusted_peers, metadata, config, spec) } + pub fn new_test_globals_as_supernode( + trusted_peers: Vec, + config: Arc, + spec: Arc, + is_supernode: bool, + ) -> NetworkGlobals { + let metadata = MetaData::V3(MetaDataV3 { + seq_number: 0, + attnets: Default::default(), + syncnets: Default::default(), + custody_group_count: if is_supernode { + spec.number_of_custody_groups + } else { + spec.custody_requirement + }, + }); + Self::new_test_globals_with_metadata(trusted_peers, metadata, config, spec) + } + pub(crate) fn new_test_globals_with_metadata( trusted_peers: Vec, metadata: MetaData, diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 784bf2fdc1b..5488e019b6a 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -554,7 +554,7 @@ impl NetworkBeaconProcessor { pub fn send_rpc_validate_data_columns( self: &Arc, block_root: Hash256, - data_columns: Vec>>, + data_columns: DataColumnSidecarList, seen_timestamp: Duration, id: SamplingId, ) -> Result<(), Error> { diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 7b5701cc8d2..47810d536e5 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -23,9 +23,10 @@ use lighthouse_network::service::api_types::Id; use lighthouse_network::types::{BackFillState, NetworkGlobals}; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; +use parking_lot::RwLock; use std::collections::{ btree_map::{BTreeMap, Entry}, - HashSet, + HashMap, HashSet, }; use std::sync::Arc; use tracing::{debug, error, info, instrument, warn}; @@ -135,6 +136,8 @@ pub struct BackFillSync { /// This signifies that we are able to attempt to restart a failed chain. restart_failed_sync: bool, + peers: Arc>>, + /// Reference to the beacon chain to obtain initial starting points for the backfill sync. 
beacon_chain: Arc>, @@ -179,6 +182,7 @@ impl BackFillSync { current_processing_batch: None, validated_batches: 0, restart_failed_sync: false, + peers: <_>::default(), beacon_chain, }; @@ -218,14 +222,7 @@ impl BackFillSync { match self.state() { BackFillState::Syncing => {} // already syncing ignore. BackFillState::Paused => { - if self - .network_globals - .peers - .read() - .synced_peers() - .next() - .is_some() - { + if !self.peers.read().is_empty() { // If there are peers to resume with, begin the resume. debug!(start_epoch = ?self.current_start, awaiting_batches = self.batches.len(), processing_target = ?self.processing_target, "Resuming backfill sync"); self.set_state(BackFillState::Syncing); @@ -298,6 +295,14 @@ impl BackFillSync { } } + pub fn add_peer(&mut self, peer_id: PeerId) { + self.peers.write().insert(peer_id); + } + + pub fn peer_disconnected(&mut self, peer_id: &PeerId) { + self.peers.write().remove(peer_id); + } + /// An RPC error has occurred. /// /// If the batch exists it is re-requested. @@ -312,7 +317,6 @@ impl BackFillSync { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer_id: &PeerId, request_id: Id, err: RpcResponseError, ) -> Result<(), BackFillError> { @@ -326,11 +330,18 @@ impl BackFillSync { return Ok(()); } debug!(batch_epoch = %batch_id, error = ?err, "Batch download failed"); - match batch.download_failed(Some(*peer_id)) { + // TODO(das): Is it necessary for the batch to track failed peers? Can we make this + // mechanism compatible with PeerDAS and before PeerDAS? + match batch.download_failed(None) { Err(e) => self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)), - Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { - self.fail_sync(BackFillError::BatchDownloadFailed(batch_id)) - } + Ok(BatchOperationOutcome::Failed { blacklist: _ }) => self.fail_sync(match err { + RpcResponseError::RpcError(_) + | RpcResponseError::VerifyError(_) + | RpcResponseError::InternalError(_) => { + BackFillError::BatchDownloadFailed(batch_id) + } + RpcResponseError::RequestExpired(_) => BackFillError::Paused, + }), Ok(BatchOperationOutcome::Continue) => self.send_batch(network, batch_id), } } else { @@ -914,21 +925,15 @@ impl BackFillSync { batch_id: BatchId, ) -> Result<(), BackFillError> { if let Some(batch) = self.batches.get_mut(&batch_id) { - let synced_peers = self - .network_globals - .peers - .read() - .synced_peers() - .cloned() - .collect::>(); - let request = batch.to_blocks_by_range_request(); let failed_peers = batch.failed_block_peers(); match network.block_components_by_range_request( request, RangeRequestId::BackfillSync { batch_id }, - &synced_peers, + self.peers.clone(), &failed_peers, + // Does not track total requests per peers for now + &HashMap::new(), ) { Ok(request_id) => { // inform the batch about the new request @@ -940,15 +945,6 @@ impl BackFillSync { return Ok(()); } Err(e) => match e { - RpcRequestSendError::NoPeer(no_peer) => { - // If we are here the chain has no more synced peers - info!( - "reason" = format!("insufficient_synced_peers({no_peer:?})"), - "Backfill sync paused" - ); - self.set_state(BackFillState::Paused); - return Err(BackFillError::Paused); - } RpcRequestSendError::InternalError(e) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(%batch_id, error = ?e, %batch,"Could not send batch request"); diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index 8c884f644e1..f676068326b 100644 --- 
a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -494,7 +494,7 @@ impl BlockLookups { let Some(lookup) = self.single_block_lookups.get_mut(&id.lookup_id) else { // We don't have the ability to cancel in-flight RPC requests. So this can happen // if we started this RPC request, and later saw the block/blobs via gossip. - debug!(?id, "Block returned for single block lookup not present"); + debug!(%id, "Block returned for single block lookup not present"); return Err(LookupRequestError::UnknownLookup); }; @@ -507,7 +507,7 @@ impl BlockLookups { Ok((response, peer_group, seen_timestamp)) => { debug!( ?block_root, - ?id, + %id, ?peer_group, ?response_type, "Received lookup download success" @@ -540,7 +540,7 @@ impl BlockLookups { // the peer and the request ID which is linked to this `id` value here. debug!( ?block_root, - ?id, + %id, ?response_type, error = ?e, "Received lookup download failure" @@ -724,7 +724,7 @@ impl BlockLookups { // Collect all peers that sent a column that was invalid. Must // run .unique as a single peer can send multiple invalid // columns. Penalize once to avoid insta-bans - .flat_map(|(index, _)| peer_group.of_index((*index) as usize)) + .flat_map(|(index, _)| peer_group.of_index(&(*index as usize))) .unique() .collect(), _ => peer_group.all().collect(), diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs deleted file mode 100644 index 68f15491256..00000000000 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ /dev/null @@ -1,588 +0,0 @@ -use beacon_chain::{ - block_verification_types::RpcBlock, data_column_verification::CustodyDataColumn, get_block_root, -}; -use lighthouse_network::{ - service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId, - }, - PeerId, -}; -use std::{ - collections::{HashMap, HashSet}, - sync::Arc, -}; -use types::{ - BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, - Hash256, RuntimeVariableList, SignedBeaconBlock, Slot, -}; - -use super::range_sync::BatchPeers; - -pub struct RangeBlockComponentsRequest { - /// Blocks we have received awaiting for their corresponding sidecar. - blocks_request: ByRangeRequest>>>, - /// Sidecars we have received awaiting for their corresponding block. 
- block_data_request: RangeBlockDataRequest, -} - -enum ByRangeRequest { - Active(I), - Complete(T, PeerId), -} - -enum RangeBlockDataRequest { - /// All pre-deneb blocks - NoData, - /// All post-Deneb blocks, regardless of if they have data or not - Blobs(ByRangeRequest>>>), - /// All post-Fulu blocks, regardless of if they have data or not - DataColumns { - requests: HashMap< - DataColumnsByRangeRequestId, - ByRangeRequest>, - >, - expected_column_to_peer: HashMap, - }, -} - -impl RangeBlockComponentsRequest { - pub fn new( - blocks_req_id: BlocksByRangeRequestId, - blobs_req_id: Option, - data_columns: Option<( - Vec, - HashMap, - )>, - ) -> Self { - let block_data_request = if let Some(blobs_req_id) = blobs_req_id { - RangeBlockDataRequest::Blobs(ByRangeRequest::Active(blobs_req_id)) - } else if let Some((requests, expected_column_to_peer)) = data_columns { - RangeBlockDataRequest::DataColumns { - requests: requests - .into_iter() - .map(|id| (id, ByRangeRequest::Active(id))) - .collect(), - expected_column_to_peer, - } - } else { - RangeBlockDataRequest::NoData - }; - - Self { - blocks_request: ByRangeRequest::Active(blocks_req_id), - block_data_request, - } - } - - pub fn add_blocks( - &mut self, - req_id: BlocksByRangeRequestId, - blocks: Vec>>, - peer_id: PeerId, - ) -> Result<(), String> { - self.blocks_request.finish(req_id, blocks, peer_id) - } - - pub fn add_blobs( - &mut self, - req_id: BlobsByRangeRequestId, - blobs: Vec>>, - peer_id: PeerId, - ) -> Result<(), String> { - match &mut self.block_data_request { - RangeBlockDataRequest::NoData => Err("received blobs but expected no data".to_owned()), - RangeBlockDataRequest::Blobs(ref mut req) => req.finish(req_id, blobs, peer_id), - RangeBlockDataRequest::DataColumns { .. } => { - Err("received blobs but expected data columns".to_owned()) - } - } - } - - pub fn add_custody_columns( - &mut self, - req_id: DataColumnsByRangeRequestId, - columns: Vec>>, - peer_id: PeerId, - ) -> Result<(), String> { - match &mut self.block_data_request { - RangeBlockDataRequest::NoData => { - Err("received data columns but expected no data".to_owned()) - } - RangeBlockDataRequest::Blobs(_) => { - Err("received data columns but expected blobs".to_owned()) - } - RangeBlockDataRequest::DataColumns { - ref mut requests, .. 
- } => { - let req = requests - .get_mut(&req_id) - .ok_or(format!("unknown data columns by range req_id {req_id}"))?; - req.finish(req_id, columns, peer_id) - } - } - } - - /// If all internal requests are complete returns a Vec of coupled RpcBlocks - #[allow(clippy::type_complexity)] - pub fn responses( - &self, - spec: &ChainSpec, - ) -> Option>, BatchPeers), String>> { - let Some((blocks, &block_peer)) = self.blocks_request.to_finished() else { - return None; - }; - - match &self.block_data_request { - RangeBlockDataRequest::NoData => Some( - Self::responses_with_blobs(blocks.to_vec(), vec![], spec) - .map(|blocks| (blocks, BatchPeers::new_from_block_peer(block_peer))), - ), - RangeBlockDataRequest::Blobs(request) => { - let Some((blobs, _blob_peer)) = request.to_finished() else { - return None; - }; - Some( - Self::responses_with_blobs(blocks.to_vec(), blobs.to_vec(), spec) - .map(|blocks| (blocks, BatchPeers::new_from_block_peer(block_peer))), - ) - } - RangeBlockDataRequest::DataColumns { - requests, - expected_column_to_peer, - } => { - let mut data_columns = vec![]; - let mut column_peers = HashMap::new(); - for req in requests.values() { - let Some((resp_columns, column_peer)) = req.to_finished() else { - return None; - }; - data_columns.extend(resp_columns.clone()); - for column in resp_columns { - column_peers.insert(column.index, *column_peer); - } - } - - Some( - Self::responses_with_custody_columns( - blocks.to_vec(), - data_columns, - expected_column_to_peer.clone(), - spec, - ) - .map(|blocks| (blocks, BatchPeers::new(block_peer, column_peers))), - ) - } - } - } - - fn responses_with_blobs( - blocks: Vec>>, - blobs: Vec>>, - spec: &ChainSpec, - ) -> Result>, String> { - // There can't be more more blobs than blocks. i.e. sending any blob (empty - // included) for a skipped slot is not permitted. - let mut responses = Vec::with_capacity(blocks.len()); - let mut blob_iter = blobs.into_iter().peekable(); - for block in blocks.into_iter() { - let max_blobs_per_block = spec.max_blobs_per_block(block.epoch()) as usize; - let mut blob_list = Vec::with_capacity(max_blobs_per_block); - while { - let pair_next_blob = blob_iter - .peek() - .map(|sidecar| sidecar.slot() == block.slot()) - .unwrap_or(false); - pair_next_blob - } { - blob_list.push(blob_iter.next().ok_or("Missing next blob".to_string())?); - } - - let mut blobs_buffer = vec![None; max_blobs_per_block]; - for blob in blob_list { - let blob_index = blob.index as usize; - let Some(blob_opt) = blobs_buffer.get_mut(blob_index) else { - return Err("Invalid blob index".to_string()); - }; - if blob_opt.is_some() { - return Err("Repeat blob index".to_string()); - } else { - *blob_opt = Some(blob); - } - } - let blobs = RuntimeVariableList::new( - blobs_buffer.into_iter().flatten().collect::>(), - max_blobs_per_block, - ) - .map_err(|_| "Blobs returned exceeds max length".to_string())?; - responses.push(RpcBlock::new(None, block, Some(blobs)).map_err(|e| format!("{e:?}"))?) - } - - // if accumulated sidecars is not empty, throw an error. 
- if blob_iter.next().is_some() { - return Err("Received sidecars that don't pair well".to_string()); - } - - Ok(responses) - } - - fn responses_with_custody_columns( - blocks: Vec>>, - data_columns: DataColumnSidecarList, - expected_custody_columns: HashMap, - spec: &ChainSpec, - ) -> Result>, String> { - // Group data columns by block_root and index - let mut custody_columns_by_block = HashMap::>>::new(); - let mut block_roots_by_slot = HashMap::>::new(); - let expected_custody_indices = expected_custody_columns.keys().cloned().collect::>(); - - for column in data_columns { - let block_root = column.block_root(); - let index = column.index; - - block_roots_by_slot - .entry(column.slot()) - .or_default() - .insert(block_root); - - // Sanity check before casting to `CustodyDataColumn`. But this should never happen - if !expected_custody_columns.contains_key(&index) { - return Err(format!( - "Received column not in expected custody indices {index}" - )); - } - - custody_columns_by_block - .entry(block_root) - .or_default() - .push(CustodyDataColumn::from_asserted_custody(column)); - } - - // Now iterate all blocks ensuring that the block roots of each block and data column match, - // plus we have columns for our custody requirements - let rpc_blocks = blocks - .into_iter() - .map(|block| { - let block_root = get_block_root(&block); - block_roots_by_slot - .entry(block.slot()) - .or_default() - .insert(block_root); - - let custody_columns = custody_columns_by_block - .remove(&block_root) - .unwrap_or_default(); - - RpcBlock::new_with_custody_columns( - Some(block_root), - block, - custody_columns, - expected_custody_indices.clone(), - spec, - ) - .map_err(|e| format!("{e:?}")) - }) - .collect::, _>>()?; - - // Assert that there are no columns left for other blocks - if !custody_columns_by_block.is_empty() { - let remaining_roots = custody_columns_by_block.keys().collect::>(); - return Err(format!("Not all columns consumed: {remaining_roots:?}")); - } - - for (_slot, block_roots) in block_roots_by_slot { - if block_roots.len() > 1 { - // TODO: Some peer(s) are faulty or malicious. This batch will fail processing but - // we want to send it to the process to better attribute fault. Maybe warn log for - // now and track it in a metric? 
- } - } - - Ok(rpc_blocks) - } -} - -impl ByRangeRequest { - fn finish(&mut self, id: I, data: T, peer_id: PeerId) -> Result<(), String> { - match self { - Self::Active(expected_id) => { - if expected_id != &id { - return Err(format!("unexpected req_id expected {expected_id} got {id}")); - } - *self = Self::Complete(data, peer_id); - Ok(()) - } - Self::Complete(_, _) => Err("request already complete".to_owned()), - } - } - - fn to_finished(&self) -> Option<(&T, &PeerId)> { - match self { - Self::Active(_) => None, - Self::Complete(data, peer_id) => Some((data, peer_id)), - } - } -} - -#[cfg(test)] -mod tests { - use super::RangeBlockComponentsRequest; - use beacon_chain::test_utils::{ - generate_rand_block_and_blobs, generate_rand_block_and_data_columns, test_spec, NumBlobs, - }; - use lighthouse_network::{ - service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - DataColumnsByRangeRequestId, Id, RangeRequestId, - }, - PeerId, - }; - use rand::SeedableRng; - use std::{collections::HashMap, sync::Arc}; - use types::{test_utils::XorShiftRng, Epoch, ForkName, MinimalEthSpec as E, SignedBeaconBlock}; - - fn components_id() -> ComponentsByRangeRequestId { - ComponentsByRangeRequestId { - id: 0, - requester: RangeRequestId::RangeSync { - chain_id: 1, - batch_id: Epoch::new(0), - }, - } - } - - fn blocks_id(parent_request_id: ComponentsByRangeRequestId) -> BlocksByRangeRequestId { - BlocksByRangeRequestId { - id: 1, - parent_request_id, - } - } - - fn blobs_id(parent_request_id: ComponentsByRangeRequestId) -> BlobsByRangeRequestId { - BlobsByRangeRequestId { - id: 1, - parent_request_id, - } - } - - fn columns_id( - id: Id, - parent_request_id: ComponentsByRangeRequestId, - ) -> DataColumnsByRangeRequestId { - DataColumnsByRangeRequestId { - id, - parent_request_id, - } - } - - fn is_finished(info: &RangeBlockComponentsRequest) -> bool { - let spec = test_spec::(); - info.responses(&spec).is_some() - } - - #[test] - fn no_blobs_into_responses() { - let spec = test_spec::(); - let peer = PeerId::random(); - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - generate_rand_block_and_blobs::(ForkName::Base, NumBlobs::None, &mut rng, &spec) - .0 - .into() - }) - .collect::>>>(); - - let blocks_req_id = blocks_id(components_id()); - let mut info = RangeBlockComponentsRequest::::new(blocks_req_id, None, None); - - // Send blocks and complete terminate response - info.add_blocks(blocks_req_id, blocks, peer).unwrap(); - - // Assert response is finished and RpcBlocks can be constructed - info.responses(&test_spec::()).unwrap().unwrap(); - } - - #[test] - fn empty_blobs_into_responses() { - let spec = test_spec::(); - let peer = PeerId::random(); - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - // Always generate some blobs. - generate_rand_block_and_blobs::( - ForkName::Deneb, - NumBlobs::Number(3), - &mut rng, - &spec, - ) - .0 - .into() - }) - .collect::>>>(); - - let components_id = components_id(); - let blocks_req_id = blocks_id(components_id); - let blobs_req_id = blobs_id(components_id); - let mut info = - RangeBlockComponentsRequest::::new(blocks_req_id, Some(blobs_req_id), None); - - // Send blocks and complete terminate response - info.add_blocks(blocks_req_id, blocks, peer).unwrap(); - // Expect no blobs returned - info.add_blobs(blobs_req_id, vec![], peer).unwrap(); - - // Assert response is finished and RpcBlocks can be constructed, even if blobs weren't returned. 
- // This makes sure we don't expect blobs here when they have expired. Checking this logic should - // be hendled elsewhere. - info.responses(&test_spec::()).unwrap().unwrap(); - } - - #[test] - fn rpc_block_with_custody_columns() { - let spec = test_spec::(); - let peer = PeerId::random(); - let expects_custody_columns = [1, 2, 3, 4]; - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - generate_rand_block_and_data_columns::( - ForkName::Fulu, - NumBlobs::Number(1), - &mut rng, - &spec, - ) - }) - .collect::>(); - - let components_id = components_id(); - let blocks_req_id = blocks_id(components_id); - let columns_req_id = expects_custody_columns - .iter() - .enumerate() - .map(|(i, _)| columns_id(i as Id, components_id)) - .collect::>(); - - let column_to_peer = expects_custody_columns - .iter() - .map(|index| (*index, peer)) - .collect::>(); - - let mut info = RangeBlockComponentsRequest::::new( - blocks_req_id, - None, - Some((columns_req_id.clone(), column_to_peer)), - ); - // Send blocks and complete terminate response - info.add_blocks( - blocks_req_id, - blocks.iter().map(|b| b.0.clone().into()).collect(), - peer, - ) - .unwrap(); - // Assert response is not finished - assert!(!is_finished(&info)); - - // Send data columns - for (i, &column_index) in expects_custody_columns.iter().enumerate() { - info.add_custody_columns( - columns_req_id.get(i).copied().unwrap(), - blocks - .iter() - .flat_map(|b| b.1.iter().filter(|d| d.index == column_index).cloned()) - .collect(), - peer, - ) - .unwrap(); - - if i < expects_custody_columns.len() - 1 { - assert!( - !is_finished(&info), - "requested should not be finished at loop {i}" - ); - } - } - - // All completed construct response - info.responses(&spec).unwrap().unwrap(); - } - - #[test] - fn rpc_block_with_custody_columns_batched() { - let spec = test_spec::(); - let peer = PeerId::random(); - let batched_column_requests = [vec![1_u64, 2], vec![3, 4]]; - let expects_custody_columns = batched_column_requests - .iter() - .flatten() - .map(|index| (*index, peer)) - .collect::>(); - let custody_column_request_ids = - (0..batched_column_requests.len() as u32).collect::>(); - let num_of_data_column_requests = custody_column_request_ids.len(); - - let components_id = components_id(); - let blocks_req_id = blocks_id(components_id); - let columns_req_id = batched_column_requests - .iter() - .enumerate() - .map(|(i, _)| columns_id(i as Id, components_id)) - .collect::>(); - - let mut info = RangeBlockComponentsRequest::::new( - blocks_req_id, - None, - Some((columns_req_id.clone(), expects_custody_columns.clone())), - ); - - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - generate_rand_block_and_data_columns::( - ForkName::Fulu, - NumBlobs::Number(1), - &mut rng, - &spec, - ) - }) - .collect::>(); - - // Send blocks and complete terminate response - info.add_blocks( - blocks_req_id, - blocks.iter().map(|b| b.0.clone().into()).collect(), - peer, - ) - .unwrap(); - // Assert response is not finished - assert!(!is_finished(&info)); - - for (i, column_indices) in batched_column_requests.iter().enumerate() { - // Send the set of columns in the same batch request - info.add_custody_columns( - columns_req_id.get(i).copied().unwrap(), - blocks - .iter() - .flat_map(|b| { - b.1.iter() - .filter(|d| column_indices.contains(&d.index)) - .cloned() - }) - .collect::>(), - peer, - ) - .unwrap(); - - if i < num_of_data_column_requests - 1 { - assert!( - !is_finished(&info), - "requested should 
not be finished at loop {i}" - ); - } - } - - // All completed construct response - info.responses(&spec).unwrap().unwrap(); - } -} diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 3c94793941c..dfafc884050 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -36,7 +36,7 @@ use super::backfill_sync::{BackFillSync, ProcessResult, SyncStart}; use super::block_lookups::BlockLookups; use super::network_context::{ - CustodyByRootResult, RangeBlockComponent, RangeRequestId, RpcEvent, SyncNetworkContext, + CustodyRequestResult, RangeBlockComponent, RangeRequestId, RpcEvent, SyncNetworkContext, }; use super::peer_sampling::{Sampling, SamplingConfig, SamplingResult}; use super::peer_sync_info::{remote_sync_type, PeerSyncType}; @@ -58,9 +58,10 @@ use beacon_chain::{ use futures::StreamExt; use lighthouse_network::rpc::RPCError; use lighthouse_network::service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, CustodyRequester, - DataColumnsByRangeRequestId, DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, - SamplingId, SamplingRequester, SingleLookupReqId, SyncRequestId, + BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, + CustodyByRangeRequestId, CustodyRequester, DataColumnsByRangeRequestId, + DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SamplingId, SamplingRequester, + SingleLookupReqId, SyncRequestId, }; use lighthouse_network::types::{NetworkGlobals, SyncState}; use lighthouse_network::PeerId; @@ -336,23 +337,6 @@ impl SyncManager { .collect() } - #[cfg(test)] - pub(crate) fn get_range_sync_chains( - &self, - ) -> Result, &'static str> { - self.range_sync.state() - } - - #[cfg(test)] - pub(crate) fn range_sync_state(&self) -> super::range_sync::SyncChainStatus { - self.range_sync.state() - } - - #[cfg(test)] - pub(crate) fn __range_failed_chains(&mut self) -> Vec { - self.range_sync.__failed_chains() - } - #[cfg(test)] pub(crate) fn get_failed_chains(&mut self) -> Vec { self.block_lookups.get_failed_chains() @@ -377,6 +361,18 @@ impl SyncManager { self.sampling.get_request_status(block_root, index) } + // Leak the full network context to prevent having to add many cfg(test) methods here + #[cfg(test)] + pub(crate) fn network(&mut self) -> &mut SyncNetworkContext { + &mut self.network + } + + // Leak the full range_sync to prevent having to add many cfg(test) methods here + #[cfg(test)] + pub(crate) fn range_sync(&mut self) -> &mut RangeSync { + &mut self.range_sync + } + #[cfg(test)] pub(crate) fn update_execution_engine_state(&mut self, state: EngineState) { self.handle_new_execution_engine_state(state); @@ -417,6 +413,7 @@ impl SyncManager { PeerSyncType::Advanced => { self.range_sync .add_peer(&mut self.network, local, peer_id, remote); + self.backfill_sync.add_peer(peer_id); } PeerSyncType::FullySynced => { // Sync considers this peer close enough to the head to not trigger range sync. @@ -442,6 +439,9 @@ impl SyncManager { for (id, result) in self.network.continue_custody_by_root_requests() { self.on_custody_by_root_result(id, result); } + for (id, result) in self.network.continue_custody_by_range_requests() { + self.on_custody_by_range_result(id, result); + } } /// Trigger range sync for a set of peers that claim to have imported a head unknown to us. 
@@ -531,6 +531,7 @@ impl SyncManager { // Remove peer from all data structures self.range_sync.peer_disconnect(&mut self.network, peer_id); + self.backfill_sync.peer_disconnected(peer_id); self.block_lookups.peer_disconnected(peer_id); // Regardless of the outcome, we update the sync status. @@ -545,6 +546,9 @@ impl SyncManager { for (id, result) in self.network.continue_custody_by_root_requests() { self.on_custody_by_root_result(id, result); } + for (id, result) in self.network.continue_custody_by_range_requests() { + self.on_custody_by_range_result(id, result); + } } /// Updates the syncing state of a peer. @@ -1186,10 +1190,9 @@ impl SyncManager { block: RpcEvent>>, ) { if let Some(resp) = self.network.on_blocks_by_range_response(id, peer_id, block) { - self.on_range_components_response( + self.on_block_components_by_range_response( id.parent_request_id, - peer_id, - RangeBlockComponent::Block(id, resp), + RangeBlockComponent::Block(id, resp, peer_id), ); } } @@ -1201,10 +1204,9 @@ impl SyncManager { blob: RpcEvent>>, ) { if let Some(resp) = self.network.on_blobs_by_range_response(id, peer_id, blob) { - self.on_range_components_response( + self.on_block_components_by_range_response( id.parent_request_id, - peer_id, - RangeBlockComponent::Blob(id, resp), + RangeBlockComponent::Blob(id, resp, peer_id), ); } } @@ -1215,22 +1217,35 @@ impl SyncManager { peer_id: PeerId, data_column: RpcEvent>>, ) { + // data_columns_by_range returns either an Ok list of data columns, or an RpcResponseError if let Some(resp) = self .network .on_data_columns_by_range_response(id, peer_id, data_column) { - self.on_range_components_response( - id.parent_request_id, - peer_id, - RangeBlockComponent::CustodyColumns(id, resp), - ); + // custody_by_range accumulates the results of multiple data_columns_by_range requests + // returning a bigger list of data columns across all the column indices this node has + // to custody + if let Some(result) = self.network.on_custody_by_range_response(id, peer_id, resp) { + self.on_custody_by_range_result(id.parent_request_id, result); + } } } + fn on_custody_by_range_result( + &mut self, + id: CustodyByRangeRequestId, + result: CustodyRequestResult, + ) { + self.on_block_components_by_range_response( + id.parent_request_id, + RangeBlockComponent::CustodyColumns(id, result), + ); + } + fn on_custody_by_root_result( &mut self, requester: CustodyRequester, - response: CustodyByRootResult, + response: CustodyRequestResult, ) { self.block_lookups .on_download_response::>( @@ -1267,18 +1282,16 @@ impl SyncManager { /// Handles receiving a response for a range sync request that should have both blocks and /// blobs. 
- fn on_range_components_response( + fn on_block_components_by_range_response( &mut self, range_request_id: ComponentsByRangeRequestId, - peer_id: PeerId, range_block_component: RangeBlockComponent, ) { - if let Some(resp) = self.network.range_block_component_response( - range_request_id, - peer_id, - range_block_component, - ) { - match resp { + if let Some(result) = self + .network + .on_block_components_by_range_response(range_request_id, range_block_component) + { + match result { Ok((blocks, batch_peers)) => { match range_request_id.requester { RangeRequestId::RangeSync { chain_id, batch_id } => { @@ -1315,7 +1328,6 @@ impl SyncManager { RangeRequestId::RangeSync { chain_id, batch_id } => { self.range_sync.inject_error( &mut self.network, - peer_id, batch_id, chain_id, range_request_id.id, @@ -1327,7 +1339,6 @@ impl SyncManager { match self.backfill_sync.inject_error( &mut self.network, batch_id, - &peer_id, range_request_id.id, e, ) { diff --git a/beacon_node/network/src/sync/mod.rs b/beacon_node/network/src/sync/mod.rs index 0f5fd6fb9f1..97302df04e8 100644 --- a/beacon_node/network/src/sync/mod.rs +++ b/beacon_node/network/src/sync/mod.rs @@ -3,7 +3,6 @@ //! Stores the various syncing methods for the beacon chain. mod backfill_sync; mod block_lookups; -mod block_sidecar_coupling; pub mod manager; mod network_context; mod peer_sampling; diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 50b39fe72ef..5bb277d996c 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1,11 +1,11 @@ //! Provides network functionality for the Syncing thread. This fundamentally wraps a network //! channel and stores a global RPC ID to perform requests. 
-use self::custody::{ActiveCustodyRequest, Error as CustodyRequestError}; +use self::custody_by_range::ActiveCustodyByRangeRequest; +use self::custody_by_root::ActiveCustodyByRootRequest; pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlockRequest}; -use super::block_sidecar_coupling::RangeBlockComponentsRequest; use super::manager::BlockProcessType; -use super::range_sync::{BatchPeers, ByRangeRequestType}; +use super::range_sync::BatchPeers; use super::SyncMessage; use crate::metrics; use crate::network_beacon_processor::NetworkBeaconProcessor; @@ -17,15 +17,17 @@ use crate::sync::block_lookups::SingleLookupId; use crate::sync::network_context::requests::BlobsByRootSingleBlockRequest; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; -use custody::CustodyRequestResult; +pub use block_components_by_range::BlockComponentsByRangeRequest; +#[cfg(test)] +pub use block_components_by_range::BlockComponentsByRangeRequestStep; use fnv::FnvHashMap; use lighthouse_network::rpc::methods::{BlobsByRangeRequest, DataColumnsByRangeRequest}; use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError, RequestType}; pub use lighthouse_network::service::api_types::RangeRequestId; use lighthouse_network::service::api_types::{ AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - CustodyId, CustodyRequester, DataColumnsByRangeRequestId, DataColumnsByRootRequestId, - DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, + CustodyByRangeRequestId, CustodyId, CustodyRequester, DataColumnsByRangeRequestId, + DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, }; use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource}; use parking_lot::RwLock; @@ -36,7 +38,6 @@ use requests::{ }; #[cfg(test)] use slot_clock::SlotClock; -use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::sync::Arc; @@ -47,11 +48,13 @@ use tokio::sync::mpsc; use tracing::{debug, error, span, warn, Level}; use types::blob_sidecar::FixedBlobSidecarList; use types::{ - BlobSidecar, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, ForkContext, - Hash256, SignedBeaconBlock, Slot, + BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, + ForkContext, Hash256, SignedBeaconBlock, SignedBeaconBlockHeader, Slot, }; -pub mod custody; +pub mod block_components_by_range; +pub mod custody_by_range; +pub mod custody_by_root; mod requests; #[derive(Debug)] @@ -73,31 +76,28 @@ impl RpcEvent { pub type RpcResponseResult = Result<(T, Duration), RpcResponseError>; /// Duration = latest seen timestamp of all received data columns -pub type CustodyByRootResult = - Result<(DataColumnSidecarList, PeerGroup, Duration), RpcResponseError>; +pub type RpcResponseBatchResult = Result<(T, PeerGroup, Duration), RpcResponseError>; -#[derive(Debug)] +/// Common result type for `custody_by_root` and `custody_by_range` requests. The peers are part of +/// the `Ok` response since they are not known until the entire request succeeds. 
+pub type CustodyRequestResult = RpcResponseBatchResult>; + +#[derive(Debug, Clone)] pub enum RpcResponseError { RpcError(#[allow(dead_code)] RPCError), VerifyError(LookupVerifyError), - CustodyRequestError(#[allow(dead_code)] CustodyRequestError), - BlockComponentCouplingError(#[allow(dead_code)] String), + RequestExpired(String), + InternalError(#[allow(dead_code)] String), } #[derive(Debug, PartialEq, Eq)] pub enum RpcRequestSendError { - /// No peer available matching the required criteria - NoPeer(NoPeerError), /// These errors should never happen, including unreachable custody errors or network send /// errors. InternalError(String), -} - -/// Type of peer missing that caused a `RpcRequestSendError::NoPeers` -#[derive(Debug, PartialEq, Eq)] -pub enum NoPeerError { - BlockPeer, - CustodyPeer(ColumnIndex), + // If RpcRequestSendError has a single variant `InternalError` it's to signal to downstream + // consumers that sends are expected to be infallible. If this assumption changes in the future, + // add a new variant. } #[derive(Debug, PartialEq, Eq)] @@ -124,31 +124,41 @@ pub struct PeerGroup { /// Peers group by which indexed section of the block component they served. For example: /// - PeerA served = [blob index 0, blob index 2] /// - PeerA served = [blob index 1] - peers: HashMap>, + peers: HashMap, } impl PeerGroup { + pub fn empty() -> Self { + Self { + peers: HashMap::new(), + } + } + /// Return a peer group where a single peer returned all parts of a block component. For /// example, a block has a single component (the block = index 0/1). pub fn from_single(peer: PeerId) -> Self { Self { - peers: HashMap::from_iter([(peer, vec![0])]), + peers: HashMap::from_iter([(0, peer)]), } } - pub fn from_set(peers: HashMap>) -> Self { + pub fn from_set(peer_to_indices: HashMap>) -> Self { + let mut peers = HashMap::new(); + for (peer, indices) in peer_to_indices { + for index in indices { + peers.insert(index, peer); + } + } Self { peers } } pub fn all(&self) -> impl Iterator + '_ { - self.peers.keys() + self.peers.values() } - pub fn of_index(&self, index: usize) -> impl Iterator + '_ { - self.peers.iter().filter_map(move |(peer, indices)| { - if indices.contains(&index) { - Some(peer) - } else { - None - } - }) + pub fn of_index(&self, index: &usize) -> Option<&PeerId> { + self.peers.get(index) + } + + pub fn as_map(&self) -> &HashMap { + &self.peers } } @@ -195,12 +205,15 @@ pub struct SyncNetworkContext { data_columns_by_range_requests: ActiveRequests>, - /// Mapping of active custody column requests for a block root - custody_by_root_requests: FnvHashMap>, + /// Mapping of active custody column by root requests for a block root + custody_by_root_requests: FnvHashMap>, + + /// Mapping of active custody column by range requests + custody_by_range_requests: FnvHashMap>, /// BlocksByRange requests paired with other ByRange requests for data components - components_by_range_requests: - FnvHashMap>, + block_components_by_range_requests: + FnvHashMap>, /// Whether the ee is online. If it's not, we don't allow access to the /// `beacon_processor_send`. 
@@ -219,15 +232,14 @@ pub enum RangeBlockComponent { Block( BlocksByRangeRequestId, RpcResponseResult>>>, + PeerId, ), Blob( BlobsByRangeRequestId, RpcResponseResult>>>, + PeerId, ), - CustodyColumns( - DataColumnsByRangeRequestId, - RpcResponseResult>>>, - ), + CustodyColumns(CustodyByRangeRequestId, CustodyRequestResult), } #[cfg(test)] @@ -283,7 +295,8 @@ impl SyncNetworkContext { blobs_by_range_requests: ActiveRequests::new("blobs_by_range"), data_columns_by_range_requests: ActiveRequests::new("data_columns_by_range"), custody_by_root_requests: <_>::default(), - components_by_range_requests: FnvHashMap::default(), + custody_by_range_requests: <_>::default(), + block_components_by_range_requests: <_>::default(), network_beacon_processor, chain, fork_context, @@ -297,6 +310,14 @@ impl SyncNetworkContext { /// Returns the ids of all the requests made to the given peer_id. pub fn peer_disconnected(&mut self, peer_id: &PeerId) -> Vec { + self.active_requests() + .filter(|(_, request_peer)| *request_peer == peer_id) + .map(|(id, _)| id) + .collect() + } + + /// Returns the ids of all active requests + pub fn active_requests(&mut self) -> impl Iterator { // Note: using destructuring pattern without a default case to make sure we don't forget to // add new request types to this function. Otherwise, lookup sync can break and lookups // will get stuck if a peer disconnects during an active requests. @@ -311,8 +332,9 @@ impl SyncNetworkContext { data_columns_by_range_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, + custody_by_range_requests: _, // components_by_range_requests is a meta request of various _by_range requests - components_by_range_requests: _, + block_components_by_range_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -320,29 +342,23 @@ impl SyncNetworkContext { } = self; let blocks_by_root_ids = blocks_by_root_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|id| SyncRequestId::SingleBlock { id: *id }); + .active_requests() + .map(|(id, peer)| (SyncRequestId::SingleBlock { id: *id }, peer)); let blobs_by_root_ids = blobs_by_root_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|id| SyncRequestId::SingleBlob { id: *id }); + .active_requests() + .map(|(id, peer)| (SyncRequestId::SingleBlob { id: *id }, peer)); let data_column_by_root_ids = data_columns_by_root_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::DataColumnsByRoot(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::DataColumnsByRoot(*id), peer)); let blocks_by_range_ids = blocks_by_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::BlocksByRange(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::BlocksByRange(*id), peer)); let blobs_by_range_ids = blobs_by_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::BlobsByRange(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::BlobsByRange(*id), peer)); let data_column_by_range_ids = data_columns_by_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::DataColumnsByRange(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::DataColumnsByRange(*id), peer)); blocks_by_root_ids .chain(blobs_by_root_ids) @@ -350,6 +366,18 @@ impl SyncNetworkContext { .chain(blocks_by_range_ids) .chain(blobs_by_range_ids) 
.chain(data_column_by_range_ids) + } + + #[cfg(test)] + pub fn active_block_components_by_range_requests( + &self, + ) -> Vec<( + ComponentsByRangeRequestId, + BlockComponentsByRangeRequestStep, + )> { + self.block_components_by_range_requests + .iter() + .map(|(id, req)| (*id, req.state_step())) .collect() } @@ -362,6 +390,10 @@ impl SyncNetworkContext { &self.network_beacon_processor.network_globals } + pub fn spec(&self) -> &ChainSpec { + &self.chain.spec + } + /// Returns the Client type of the peer if known pub fn client_type(&self, peer_id: &PeerId) -> Client { self.network_globals() @@ -414,8 +446,9 @@ impl SyncNetworkContext { data_columns_by_range_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, + custody_by_range_requests: _, // components_by_range_requests is a meta request of various _by_range requests - components_by_range_requests: _, + block_components_by_range_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -445,209 +478,29 @@ impl SyncNetworkContext { &mut self, request: BlocksByRangeRequest, requester: RangeRequestId, - peers: &HashSet, + peers: Arc>>, peers_to_deprioritize: &HashSet, + total_requests_per_peer: &HashMap, ) -> Result { - let batch_epoch = Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()); - let batch_type = self.batch_type(batch_epoch); - - let active_request_count_by_peer = self.active_request_count_by_peer(); - - let Some(block_peer) = peers - .iter() - .map(|peer| { - ( - // If contains -> 1 (order after), not contains -> 0 (order first) - peers_to_deprioritize.contains(peer), - // Prefer peers with less overall requests - active_request_count_by_peer.get(peer).copied().unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, _, peer)| *peer) - else { - // Backfill and forward sync handle this condition gracefully. - // - Backfill sync: will pause waiting for more peers to join - // - Forward sync: can never happen as the chain is dropped when removing the last peer. - return Err(RpcRequestSendError::NoPeer(NoPeerError::BlockPeer)); - }; - - // Attempt to find all required custody peers before sending any request or creating an ID - let columns_by_range_peers_to_request = - if matches!(batch_type, ByRangeRequestType::BlocksAndColumns) { - let column_indexes = self.network_globals().sampling_columns.clone(); - Some(self.select_columns_by_range_peers_to_request( - &column_indexes, - peers, - active_request_count_by_peer, - peers_to_deprioritize, - )?) - } else { - None - }; - - // Create the overall components_by_range request ID before its individual components let id = ComponentsByRangeRequestId { id: self.next_id(), requester, }; - let blocks_req_id = self.send_blocks_by_range_request(block_peer, request.clone(), id)?; - - let blobs_req_id = if matches!(batch_type, ByRangeRequestType::BlocksAndBlobs) { - Some(self.send_blobs_by_range_request( - block_peer, - BlobsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - }, - id, - )?) 
- } else { - None - }; - - let data_column_requests = columns_by_range_peers_to_request - .map(|columns_by_range_peers_to_request| { - let column_to_peer_map = columns_by_range_peers_to_request - .iter() - .flat_map(|(peer_id, columns)| columns.iter().map(|column| (*column, *peer_id))) - .collect::>(); - - let requests = columns_by_range_peers_to_request - .into_iter() - .map(|(peer_id, columns)| { - self.send_data_columns_by_range_request( - peer_id, - DataColumnsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - columns, - }, - id, - ) - }) - .collect::, _>>()?; - - Ok((requests, column_to_peer_map)) - }) - .transpose()?; + let req = BlockComponentsByRangeRequest::new( + id, + request, + peers, + peers_to_deprioritize, + total_requests_per_peer, + self, + )?; - let info = - RangeBlockComponentsRequest::new(blocks_req_id, blobs_req_id, data_column_requests); - self.components_by_range_requests.insert(id, info); + self.block_components_by_range_requests.insert(id, req); Ok(id.id) } - fn select_columns_by_range_peers_to_request( - &self, - custody_indexes: &HashSet, - peers: &HashSet, - active_request_count_by_peer: HashMap, - peers_to_deprioritize: &HashSet, - ) -> Result>, RpcRequestSendError> { - let mut columns_to_request_by_peer = HashMap::>::new(); - - for column_index in custody_indexes { - // Strictly consider peers that are custodials of this column AND are part of this - // syncing chain. If the forward range sync chain has few peers, it's likely that this - // function will not be able to find peers on our custody columns. - let Some(custody_peer) = peers - .iter() - .filter(|peer| { - self.network_globals() - .is_custody_peer_of(*column_index, peer) - }) - .map(|peer| { - ( - // If contains -> 1 (order after), not contains -> 0 (order first) - peers_to_deprioritize.contains(peer), - // Prefer peers with less overall requests - // Also account for requests that are not yet issued tracked in peer_id_to_request_map - // We batch requests to the same peer, so count existance in the - // `columns_to_request_by_peer` as a single 1 request. - active_request_count_by_peer.get(peer).copied().unwrap_or(0) - + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, _, peer)| *peer) - else { - // TODO(das): this will be pretty bad UX. To improve we should: - // - Handle the no peers case gracefully, maybe add some timeout and give a few - // minutes / seconds to the peer manager to locate peers on this subnet before - // abandoing progress on the chain completely. - return Err(RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer( - *column_index, - ))); - }; - - columns_to_request_by_peer - .entry(custody_peer) - .or_default() - .push(*column_index); - } - - Ok(columns_to_request_by_peer) - } - - /// Received a _by_range response for a request that couples blocks and its data - /// - /// `peer_id` is the peer that served this individual RPC _by_range response. 
- #[allow(clippy::type_complexity)] - pub fn range_block_component_response( - &mut self, - id: ComponentsByRangeRequestId, - peer_id: PeerId, - range_block_component: RangeBlockComponent, - ) -> Option>, BatchPeers), RpcResponseError>> { - let Entry::Occupied(mut entry) = self.components_by_range_requests.entry(id) else { - metrics::inc_counter_vec(&metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, &["range_blocks"]); - return None; - }; - - if let Err(e) = { - let request = entry.get_mut(); - match range_block_component { - RangeBlockComponent::Block(req_id, resp) => resp.and_then(|(blocks, _)| { - request - .add_blocks(req_id, blocks, peer_id) - .map_err(RpcResponseError::BlockComponentCouplingError) - }), - RangeBlockComponent::Blob(req_id, resp) => resp.and_then(|(blobs, _)| { - request - .add_blobs(req_id, blobs, peer_id) - .map_err(RpcResponseError::BlockComponentCouplingError) - }), - RangeBlockComponent::CustodyColumns(req_id, resp) => { - resp.and_then(|(custody_columns, _)| { - request - .add_custody_columns(req_id, custody_columns, peer_id) - .map_err(RpcResponseError::BlockComponentCouplingError) - }) - } - } - } { - entry.remove(); - return Some(Err(e)); - } - - if let Some(blocks_result) = entry.get().responses(&self.chain.spec) { - entry.remove(); - // If the request is finished, dequeue everything - Some(blocks_result.map_err(RpcResponseError::BlockComponentCouplingError)) - } else { - None - } - } - /// Request block of `block_root` if necessary by checking: /// - If the da_checker has a pending block from gossip or a previous request /// @@ -853,7 +706,7 @@ impl SyncNetworkContext { } /// Request to send a single `data_columns_by_root` request to the network. - pub fn data_column_lookup_request( + pub fn data_columns_by_root_request( &mut self, requester: DataColumnsByRootRequester, peer_id: PeerId, @@ -951,7 +804,7 @@ impl SyncNetworkContext { ); let requester = CustodyRequester(id); - let mut request = ActiveCustodyRequest::new( + let mut request = ActiveCustodyByRootRequest::new( block_root, CustodyId { requester }, &custody_indexes_to_fetch, @@ -967,25 +820,7 @@ impl SyncNetworkContext { self.custody_by_root_requests.insert(requester, request); Ok(LookupRequestResult::RequestSent(id.req_id)) } - Err(e) => Err(match e { - CustodyRequestError::NoPeer(column_index) => { - RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer(column_index)) - } - // - TooManyFailures: Should never happen, `request` has just been created, it's - // count of download_failures is 0 here - // - BadState: Should never happen, a bad state can only happen when handling a - // network response - // - UnexpectedRequestId: Never happens: this Err is only constructed handling a - // download or processing response - // - SendFailed: Should never happen unless in a bad drop sequence when shutting - // down the node - e @ (CustodyRequestError::TooManyFailures - | CustodyRequestError::BadState { .. } - | CustodyRequestError::UnexpectedRequestId { .. } - | CustodyRequestError::SendFailed { .. 
}) => { - RpcRequestSendError::InternalError(format!("{e:?}")) - } - }), + Err(e) => Err(e.into()), } } @@ -1073,8 +908,8 @@ impl SyncNetworkContext { &mut self, peer_id: PeerId, request: DataColumnsByRangeRequest, - parent_request_id: ComponentsByRangeRequestId, - ) -> Result { + parent_request_id: CustodyByRangeRequestId, + ) -> Result { let id = DataColumnsByRangeRequestId { id: self.next_id(), parent_request_id, @@ -1085,7 +920,7 @@ impl SyncNetworkContext { request: RequestType::DataColumnsByRange(request.clone()), app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), }) - .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; + .map_err(|_| "network send error")?; debug!( method = "DataColumnsByRange", @@ -1108,6 +943,50 @@ impl SyncNetworkContext { Ok(id) } + /// Request to fetch all needed custody columns of a range of slot. This function may not send + /// any request to the network if no columns have to be fetched based on the import state of the + /// node. A custody request is a "super request" that may trigger 0 or more `data_columns_by_range` + /// requests. + pub fn send_custody_by_range_request( + &mut self, + parent_id: ComponentsByRangeRequestId, + blocks_with_data: Vec, + request: BlocksByRangeRequest, + column_indices: Vec, + lookup_peers: Arc>>, + ) -> Result { + let id = CustodyByRangeRequestId { + id: self.next_id(), + parent_request_id: parent_id, + }; + + debug!( + indices = ?column_indices, + %id, + "Starting custody columns by range request" + ); + + let mut request = ActiveCustodyByRangeRequest::new( + id, + request, + blocks_with_data, + &column_indices, + lookup_peers, + ); + + // Note that you can only send, but not handle a response here + match request.continue_requests(self) { + Ok(_) => { + // Ignoring the result of `continue_requests` is okay. A request that has just been + // created cannot return data immediately, it must send some request to the network + // first. And there must exist some request, `custody_indexes_to_fetch` is not empty. + self.custody_by_range_requests.insert(id, request); + Ok(id) + } + Err(e) => Err(e.into()), + } + } + pub fn is_execution_engine_online(&self) -> bool { self.execution_engine_state == EngineState::Online } @@ -1212,40 +1091,12 @@ impl SyncNetworkContext { id } - /// Check whether a batch for this epoch (and only this epoch) should request just blocks or - /// blocks and blobs. - fn batch_type(&self, epoch: types::Epoch) -> ByRangeRequestType { - // Induces a compile time panic if this doesn't hold true. - #[allow(clippy::assertions_on_constants)] - const _: () = assert!( - super::backfill_sync::BACKFILL_EPOCHS_PER_BATCH == 1 - && super::range_sync::EPOCHS_PER_BATCH == 1, - "To deal with alignment with deneb boundaries, batches need to be of just one epoch" - ); - - if self - .chain - .data_availability_checker - .data_columns_required_for_epoch(epoch) - { - ByRangeRequestType::BlocksAndColumns - } else if self - .chain - .data_availability_checker - .blobs_required_for_epoch(epoch) - { - ByRangeRequestType::BlocksAndBlobs - } else { - ByRangeRequestType::Blocks - } - } - /// Attempt to make progress on all custody_by_root requests. Some request may be stale waiting /// for custody peers. Returns a Vec of results as zero or more requests may fail in this /// attempt. 
pub fn continue_custody_by_root_requests( &mut self, - ) -> Vec<(CustodyRequester, CustodyByRootResult)> { + ) -> Vec<(CustodyRequester, CustodyRequestResult)> { let ids = self .custody_by_root_requests .keys() @@ -1259,15 +1110,49 @@ impl SyncNetworkContext { .custody_by_root_requests .remove(&id) .expect("key of hashmap"); - let result = request.continue_requests(self); + let result = request + .continue_requests(self) + .map_err(Into::::into) + .transpose(); self.handle_custody_by_root_result(id, request, result) .map(|result| (id, result)) }) .collect() } + /// Attempt to make progress on all custody_by_range requests. Some request may be stale waiting + /// for custody peers. Returns a Vec of results as zero or more requests may fail in this + /// attempt. + pub fn continue_custody_by_range_requests( + &mut self, + ) -> Vec<(CustodyByRangeRequestId, CustodyRequestResult)> { + let ids = self + .custody_by_range_requests + .keys() + .copied() + .collect::>(); + + // Need to collect ids and results in separate steps to re-borrow self. + ids.into_iter() + .filter_map(|id| { + let mut request = self + .custody_by_range_requests + .remove(&id) + .expect("key of hashmap"); + let result = request + .continue_requests(self) + .map_err(Into::::into) + .transpose(); + self.handle_custody_by_range_result(id, request, result) + .map(|result| (id, result)) + }) + .collect() + } + // Request handlers + /// Processes a single `RpcEvent` blocks_by_root RPC request. + /// Same logic as [`on_blocks_by_range_response`] but it converts a `Vec` into a `Block` pub(crate) fn on_single_block_response( &mut self, id: SingleLookupReqId, @@ -1290,6 +1175,8 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "BlocksByRoot", resp, peer_id, |_| 1) } + /// Processes a single `RpcEvent` blobs_by_root RPC request. + /// Same logic as [`on_blocks_by_range_response`] pub(crate) fn on_single_blob_response( &mut self, id: SingleLookupReqId, @@ -1319,19 +1206,25 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "BlobsByRoot", resp, peer_id, |_| 1) } + /// Processes a single `RpcEvent` for a data_columns_by_root RPC request. + /// Same logic as [`on_blocks_by_range_response`] #[allow(clippy::type_complexity)] pub(crate) fn on_data_columns_by_root_response( &mut self, id: DataColumnsByRootRequestId, peer_id: PeerId, rpc_event: RpcEvent>>, - ) -> Option>>>> { + ) -> Option>> { let resp = self .data_columns_by_root_requests .on_response(id, rpc_event); self.on_rpc_response_result(id, "DataColumnsByRoot", resp, peer_id, |_| 1) } + /// Processes a single `RpcEvent` for a blocks_by_range RPC request. + /// - If the event completes the request, it returns `Some(Ok)` with a vec of blocks + /// - If the event is an error it fails the request and returns `Some(Err)` + /// - else it appends the response chunk to the active request state and returns `None` #[allow(clippy::type_complexity)] pub(crate) fn on_blocks_by_range_response( &mut self, @@ -1343,6 +1236,8 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "BlocksByRange", resp, peer_id, |b| b.len()) } + /// Processes a single `RpcEvent` for a blobs_by_range RPC request. + /// Same logic as [`on_blocks_by_range_response`] #[allow(clippy::type_complexity)] pub(crate) fn on_blobs_by_range_response( &mut self, @@ -1354,6 +1249,8 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "BlobsByRangeRequest", resp, peer_id, |b| b.len()) } + /// Processes a single `RpcEvent` for a data_columns_by_range RPC request. 
+ /// Same logic as [`on_blocks_by_range_response`] #[allow(clippy::type_complexity)] pub(crate) fn on_data_columns_by_range_response( &mut self, @@ -1367,6 +1264,8 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "DataColumnsByRange", resp, peer_id, |d| d.len()) } + /// Common logic for `on_*_response` handlers. Ensures we have consistent logging and metrics + /// and peer reporting for all request types. fn on_rpc_response_result usize>( &mut self, id: I, @@ -1413,8 +1312,8 @@ impl SyncNetworkContext { id: CustodyId, req_id: DataColumnsByRootRequestId, peer_id: PeerId, - resp: RpcResponseResult>>>, - ) -> Option> { + resp: RpcResponseResult>, + ) -> Option> { let span = span!( Level::INFO, "SyncNetworkContext", @@ -1425,12 +1324,17 @@ impl SyncNetworkContext { // Note: need to remove the request to borrow self again below. Otherwise we can't // do nested requests let Some(mut request) = self.custody_by_root_requests.remove(&id.requester) else { - // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Custody column downloaded event for unknown request"); + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["custody_by_root"], + ); return None; }; - let result = request.on_data_column_downloaded(peer_id, req_id, resp, self); + let result = request + .on_data_column_downloaded(peer_id, req_id, resp, self) + .map_err(Into::::into) + .transpose(); self.handle_custody_by_root_result(id.requester, request, result) } @@ -1438,9 +1342,9 @@ impl SyncNetworkContext { fn handle_custody_by_root_result( &mut self, id: CustodyRequester, - request: ActiveCustodyRequest, - result: CustodyRequestResult, - ) -> Option> { + request: ActiveCustodyByRootRequest, + result: Option>, + ) -> Option> { let span = span!( Level::INFO, "SyncNetworkContext", @@ -1448,21 +1352,145 @@ impl SyncNetworkContext { ); let _enter = span.enter(); - let result = result - .map_err(RpcResponseError::CustodyRequestError) + match &result { + Some(Ok((columns, peer_group, _))) => { + debug!(%id, count = columns.len(), peers = ?peer_group, "Custody by root request success, removing") + } + Some(Err(e)) => { + debug!(%id, error = ?e, "Custody by root request failure, removing") + } + None => { + self.custody_by_root_requests.insert(id, request); + } + } + result + } + + /// Insert a downloaded column into an active custody request. Then make progress on the + /// entire request. + /// + /// ### Returns + /// + /// - `Some`: Request completed, won't make more progress. Expect requester to act on the result. + /// - `None`: Request still active, requester should do no action + #[allow(clippy::type_complexity)] + pub fn on_custody_by_range_response( + &mut self, + req_id: DataColumnsByRangeRequestId, + peer_id: PeerId, + resp: RpcResponseResult>, + ) -> Option> { + let custody_by_range_id = req_id.parent_request_id; + + // Note: need to remove the request to borrow self again below. 
Otherwise we can't + // do nested requests + let Some(mut request) = self.custody_by_range_requests.remove(&custody_by_range_id) else { + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["custody_by_range"], + ); + return None; + }; + + let result = request + .on_data_column_downloaded(peer_id, req_id, resp, self) + .map_err(Into::::into) .transpose(); + self.handle_custody_by_range_result(custody_by_range_id, request, result) + } + + fn handle_custody_by_range_result( + &mut self, + id: CustodyByRangeRequestId, + request: ActiveCustodyByRangeRequest, + result: Option>, + ) -> Option> { + match &result { + Some(Ok((columns, _peer_group, _))) => { + // Don't log the peer_group here, it's very long (could be up to 128 peers). If you + // want to trace which peer sent the column at index X, search for the log: + // `Sync RPC request sent method="DataColumnsByRange" ...` + debug!(%id, count = columns.len(), "Custody by range request success, removing") + } + Some(Err(e)) => { + debug!(%id, error = ?e, "Custody by range request failure, removing") + } + None => { + self.custody_by_range_requests.insert(id, request); + } + } + result + } + + /// Processes the result of an `*_by_range` RPC request issued by a + /// block_components_by_range_request. + /// + /// - If the result completes the request, it returns `Some(Ok)` with a vec of coupled RpcBlocks + /// - If the result fails the request, it returns `Some(Err)`. Note that a failed request may + /// not fail the block_components_by_range_request as it implements retries. + /// - else it appends the result to the active request state and returns `None` + #[allow(clippy::type_complexity)] + pub fn on_block_components_by_range_response( + &mut self, + id: ComponentsByRangeRequestId, + range_block_component: RangeBlockComponent, + ) -> Option>, BatchPeers), RpcResponseError>> { + // Note: need to remove the request to borrow self again below. Otherwise we can't + // do nested requests + let Some(mut request) = self.block_components_by_range_requests.remove(&id) else { + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["block_components_by_range"], + ); + return None; + }; + + let result = match range_block_component { + RangeBlockComponent::Block(req_id, resp, peer_id) => resp.and_then(|(blocks, _)| { + request + .on_blocks_by_range_result(req_id, blocks, peer_id, self) + .map_err(Into::::into) + }), + RangeBlockComponent::Blob(req_id, resp, peer_id) => resp.and_then(|(blobs, _)| { + request + .on_blobs_by_range_result(req_id, blobs, peer_id, self) + .map_err(Into::::into) + }), + RangeBlockComponent::CustodyColumns(req_id, resp) => { + resp.and_then(|(custody_columns, peers, _)| { + request + .on_custody_by_range_result(req_id, custody_columns, peers, self) + .map_err(Into::::into) + }) + } + } // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to // an Option first to use in an `if let Some() { act on result }` block. + .transpose(); + match result.as_ref() { - Some(Ok((columns, peer_group, _))) => { - debug!(?id, count = columns.len(), peers = ?peer_group, "Custody request success, removing") + Some(Ok((blocks, peer_group))) => { + let blocks_with_data = blocks + .iter() + .filter(|block| block.as_block().has_data()) + .count(); + // Don't log the peer_group here, it's very long (could be up to 128 peers). 
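The handlers above rely on a recurring pattern: take the active request out of its map so that `self` can be borrowed mutably again while the request issues nested requests, then re-insert it only if it is still in flight. A condensed sketch with stand-in types:

use std::collections::HashMap;

struct Request {
    finished: bool,
}

struct Context {
    requests: HashMap<u64, Request>,
}

impl Request {
    /// Drives the request forward; needs `&mut Context` to send nested requests.
    fn continue_requests(&mut self, cx: &mut Context) -> Option<Result<(), String>> {
        if self.finished {
            Some(Ok(()))
        } else {
            cx.send_nested_request();
            None
        }
    }
}

impl Context {
    fn send_nested_request(&mut self) {}

    fn on_response(&mut self, id: u64) -> Option<Result<(), String>> {
        // Remove the request so `self` can be mutably borrowed again below.
        let mut request = self.requests.remove(&id)?;
        let result = request.continue_requests(self);
        if result.is_none() {
            // Still active: put it back and wait for more responses.
            self.requests.insert(id, request);
        }
        // Completed (Ok or Err): the request stays removed and the result is surfaced.
        result
    }
}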
If you + // want to trace which peer sent the column at index X, search for the log: + // `Sync RPC request sent method="DataColumnsByRange" ...` + debug!( + %id, + blocks = blocks.len(), + blocks_with_data, + block_peer = ?peer_group.block(), + "Block components by range request success, removing" + ) } Some(Err(e)) => { - debug!(?id, error = ?e, "Custody request failure, removing" ) + debug!(%id, error = ?e, "Block components by range request failure, removing" ) } None => { - self.custody_by_root_requests.insert(id, request); + self.block_components_by_range_requests.insert(id, request); } } result @@ -1529,7 +1557,7 @@ impl SyncNetworkContext { .beacon_processor_if_enabled() .ok_or(SendErrorProcessor::ProcessorNotAvailable)?; - debug!(?block_root, ?id, "Sending blobs for processing"); + debug!(?block_root, %id, "Sending blobs for processing"); // Lookup sync event safety: If `beacon_processor.send_rpc_blobs` returns Ok() sync // must receive a single `SyncMessage::BlockComponentProcessed` event with this process type beacon_processor @@ -1600,8 +1628,8 @@ impl SyncNetworkContext { ), ("custody_by_root", self.custody_by_root_requests.len()), ( - "components_by_range", - self.components_by_range_requests.len(), + "block_components_by_range", + self.block_components_by_range_requests.len(), ), ] { metrics::set_gauge_vec(&metrics::SYNC_ACTIVE_NETWORK_REQUESTS, &[id], count as i64); diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs new file mode 100644 index 00000000000..bb981e31543 --- /dev/null +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -0,0 +1,549 @@ +use crate::sync::network_context::{ + PeerGroup, RpcRequestSendError, RpcResponseError, SyncNetworkContext, +}; +use crate::sync::range_sync::BatchPeers; +use beacon_chain::block_verification_types::RpcBlock; +use beacon_chain::data_column_verification::CustodyDataColumn; +use beacon_chain::{get_block_root, BeaconChainTypes}; +use lighthouse_network::rpc::methods::{BlobsByRangeRequest, BlocksByRangeRequest}; +use lighthouse_network::service::api_types::{ + BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, + CustodyByRangeRequestId, +}; +use lighthouse_network::PeerId; +use parking_lot::RwLock; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use types::{ + BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecarList, EthSpec, Hash256, + RuntimeVariableList, SignedBeaconBlock, Slot, +}; + +/// Given a `BlocksByRangeRequest` (a range of slots) fetches all necessary data to return +/// potentially available RpcBlocks. +/// +/// See [`State`] for the set of `*_by_range` it may issue depending on the fork. +pub struct BlockComponentsByRangeRequest { + id: ComponentsByRangeRequestId, + peers: Arc>>, + request: BlocksByRangeRequest, + state: State, +} + +enum State { + Base { + blocks_by_range_request: + ByRangeRequest>>>, + }, + // Two single concurrent requests for block + blobs. As of now we request blocks and blobs to + // the same peer, so we can attribute coupling errors to the same unique peer. + DenebEnabled { + blocks_by_range_request: + ByRangeRequest>>>, + blobs_by_range_request: ByRangeRequest>>>, + }, + // Request blocks first, then columns. Assuming the block peer is honest we can attribute + // custody failures to the peers serving us columns. 
We want to get rid of the honest block + // peer assumption in the future, see https://github.com/sigp/lighthouse/issues/6258 + FuluEnabled(FuluEnabledState), +} + +enum FuluEnabledState { + BlockRequest { + blocks_by_range_request: + ByRangeRequest>>>, + }, + CustodyRequest { + blocks: Vec>>, + block_peer: PeerId, + custody_by_range_request: + ByRangeRequest, PeerGroup>, + }, +} + +enum ByRangeRequest { + /// Active(RequestIndex) + Active(I), + /// Complete(DownloadedData, Peers) + Complete(T, P), +} + +pub type BlockComponentsByRangeRequestResult = + Result>, BatchPeers)>, Error>; + +pub enum Error { + InternalError(String), +} + +impl From for RpcResponseError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcResponseError::InternalError(e), + } + } +} + +impl From for RpcRequestSendError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcRequestSendError::InternalError(e), + } + } +} + +/// Used to typesafe assertions of state in range sync tests +#[cfg(test)] +#[derive(Debug)] +pub enum BlockComponentsByRangeRequestStep { + BlocksRequest, + CustodyRequest, +} + +impl BlockComponentsByRangeRequest { + pub fn new( + id: ComponentsByRangeRequestId, + request: BlocksByRangeRequest, + peers: Arc>>, + peers_to_deprioritize: &HashSet, + total_requests_per_peer: &HashMap, + cx: &mut SyncNetworkContext, + ) -> Result { + // Induces a compile time panic if this doesn't hold true. + #[allow(clippy::assertions_on_constants)] + const _: () = assert!( + super::super::backfill_sync::BACKFILL_EPOCHS_PER_BATCH == 1 + && super::super::range_sync::EPOCHS_PER_BATCH == 1, + "To deal with alignment with deneb boundaries, batches need to be of just one epoch" + ); + // The assertion above ensures each batch is in one single epoch + let batch_epoch = Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()); + let batch_fork = cx.spec().fork_name_at_epoch(batch_epoch); + + // TODO(das): a change of behaviour here is that if the SyncingChain has a single peer we + // will request all blocks for the first 5 epochs to that same single peer. Before we would + // query only idle peers in the syncing chain. + let Some(block_peer) = peers + .read() + .iter() + .map(|peer| { + ( + // If contains -> 1 (order after), not contains -> 0 (order first) + peers_to_deprioritize.contains(peer), + // TODO(das): Should we use active_request_count_by_peer? + // Prefer peers with less overall requests + // active_request_count_by_peer.get(peer).copied().unwrap_or(0), + // Prefer peers with less total cummulative requests, so we fetch data from a + // diverse set of peers + total_requests_per_peer.get(peer).copied().unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, _, peer)| *peer) + else { + // When a peer disconnects and is removed from the SyncingChain peer set, if the set + // reaches zero the SyncingChain is removed. + return Err(RpcRequestSendError::InternalError( + "A batch peer set should never be empty".to_string(), + )); + }; + + let blocks_req_id = cx.send_blocks_by_range_request(block_peer, request.clone(), id)?; + + let state = if batch_fork.fulu_enabled() { + State::FuluEnabled(FuluEnabledState::BlockRequest { + blocks_by_range_request: ByRangeRequest::Active(blocks_req_id), + }) + } else if batch_fork.deneb_enabled() { + // TODO(deneb): is it okay to send blobs_by_range requests outside the DA window? 
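Block peer selection above takes the minimum over an ordering tuple: deprioritized peers sort last, peers with fewer total requests in this chain sort first, and a random value breaks ties. A self-contained sketch of the same trick, with `PeerId` simplified to `u64`:

use std::collections::{HashMap, HashSet};

type PeerId = u64; // stand-in for the libp2p PeerId

/// Pick the block peer: non-deprioritized peers first, then fewest total
/// requests in this chain, then a random tie-break.
fn pick_block_peer(
    peers: &HashSet<PeerId>,
    peers_to_deprioritize: &HashSet<PeerId>,
    total_requests_per_peer: &HashMap<PeerId, usize>,
) -> Option<PeerId> {
    peers
        .iter()
        .map(|peer| {
            (
                // contains -> true (sorts after), not contains -> false (sorts first)
                peers_to_deprioritize.contains(peer),
                total_requests_per_peer.get(peer).copied().unwrap_or(0),
                // Random factor so ties are not always broken by the peer id
                rand::random::<u32>(),
                *peer,
            )
        })
        .min()
        .map(|(_, _, _, peer)| peer)
}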
I + // would like the beacon processor / da_checker to be the one that decides if an + // RpcBlock is valid or not with respect to containing blobs. Having sync not even + // attempt a requests seems like an added limitation. + let blobs_req_id = cx.send_blobs_by_range_request( + block_peer, + BlobsByRangeRequest { + start_slot: *request.start_slot(), + count: *request.count(), + }, + id, + )?; + State::DenebEnabled { + blocks_by_range_request: ByRangeRequest::Active(blocks_req_id), + blobs_by_range_request: ByRangeRequest::Active(blobs_req_id), + } + } else { + State::Base { + blocks_by_range_request: ByRangeRequest::Active(blocks_req_id), + } + }; + + Ok(Self { + id, + peers, + request, + state, + }) + } + + pub fn continue_requests( + &mut self, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { + blocks_by_range_request, + } => { + if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { + let peer_group = BatchPeers::new_from_block_peer(*block_peer); + let rpc_blocks = couple_blocks_base( + blocks.to_vec(), + cx.network_globals().sampling_columns.len(), + ); + Ok(Some((rpc_blocks, peer_group))) + } else { + // Wait for blocks_by_range requests to complete + Ok(None) + } + } + State::DenebEnabled { + blocks_by_range_request, + blobs_by_range_request, + } => { + if let (Some((blocks, block_peer)), Some((blobs, _))) = ( + blocks_by_range_request.to_finished(), + blobs_by_range_request.to_finished(), + ) { + // We use the same block_peer for the blobs request + let peer_group = BatchPeers::new_from_block_peer(*block_peer); + let rpc_blocks = + couple_blocks_deneb(blocks.to_vec(), blobs.to_vec(), cx.spec())?; + Ok(Some((rpc_blocks, peer_group))) + } else { + // Wait for blocks_by_range and blobs_by_range requests to complete + Ok(None) + } + } + State::FuluEnabled(state) => match state { + FuluEnabledState::BlockRequest { + blocks_by_range_request, + } => { + if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { + let blocks_with_data = blocks + .iter() + .filter(|block| block.has_data()) + .map(|block| block.signed_block_header()) + .collect::>(); + + if blocks_with_data.is_empty() { + let custody_column_indices = cx + .network_globals() + .sampling_columns + .clone() + .iter() + .copied() + .collect(); + + // Done, we got blocks and no columns needed + let peer_group = BatchPeers::new_from_block_peer(*block_peer); + let rpc_blocks = couple_blocks_fulu( + blocks.to_vec(), + vec![], + custody_column_indices, + cx.spec(), + )?; + Ok(Some((rpc_blocks, peer_group))) + } else { + let mut column_indices = cx + .network_globals() + .sampling_columns + .clone() + .iter() + .copied() + .collect::>(); + column_indices.sort_unstable(); + + let req_id = cx + .send_custody_by_range_request( + self.id, + blocks_with_data, + self.request.clone(), + column_indices, + self.peers.clone(), + ) + .map_err(|e| match e { + RpcRequestSendError::InternalError(e) => { + Error::InternalError(e) + } + })?; + + *state = FuluEnabledState::CustodyRequest { + blocks: blocks.to_vec(), + block_peer: *block_peer, + custody_by_range_request: ByRangeRequest::Active(req_id), + }; + + // Wait for the new custody_by_range request to complete + Ok(None) + } + } else { + // Wait for the block request to complete + Ok(None) + } + } + FuluEnabledState::CustodyRequest { + blocks, + block_peer, + custody_by_range_request, + } => { + if let Some((columns, column_peers)) = custody_by_range_request.to_finished() { + let 
custody_column_indices = cx + .network_globals() + .sampling_columns + .clone() + .iter() + .copied() + .collect(); + + let peer_group = BatchPeers::new(*block_peer, column_peers.clone()); + let rpc_blocks = couple_blocks_fulu( + blocks.to_vec(), + columns.to_vec(), + custody_column_indices, + cx.spec(), + )?; + Ok(Some((rpc_blocks, peer_group))) + } else { + // Wait for the custody_by_range request to complete + Ok(None) + } + } + }, + } + } + + pub fn on_blocks_by_range_result( + &mut self, + id: BlocksByRangeRequestId, + data: Vec>>, + peer_id: PeerId, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { + blocks_by_range_request, + } + | State::DenebEnabled { + blocks_by_range_request, + .. + } + | State::FuluEnabled(FuluEnabledState::BlockRequest { + blocks_by_range_request, + }) => { + blocks_by_range_request.finish(id, data, peer_id)?; + } + State::FuluEnabled(FuluEnabledState::CustodyRequest { .. }) => { + return Err(Error::InternalError( + "Received blocks_by_range response expecting custody_by_range".to_string(), + )) + } + } + + self.continue_requests(cx) + } + + pub fn on_blobs_by_range_result( + &mut self, + id: BlobsByRangeRequestId, + data: Vec>>, + peer_id: PeerId, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { .. } => { + return Err(Error::InternalError( + "Received blobs_by_range response before Deneb".to_string(), + )) + } + State::DenebEnabled { + blobs_by_range_request, + .. + } => { + blobs_by_range_request.finish(id, data, peer_id)?; + } + State::FuluEnabled(_) => { + return Err(Error::InternalError( + "Received blobs_by_range response after PeerDAS".to_string(), + )) + } + } + + self.continue_requests(cx) + } + + pub fn on_custody_by_range_result( + &mut self, + id: CustodyByRangeRequestId, + data: DataColumnSidecarList, + peers: PeerGroup, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { .. } | State::DenebEnabled { .. } => { + return Err(Error::InternalError( + "Received custody_by_range response before PeerDAS".to_string(), + )) + } + State::FuluEnabled(state) => match state { + FuluEnabledState::BlockRequest { .. } => { + return Err(Error::InternalError( + "Received custody_by_range expecting blocks_by_range".to_string(), + )); + } + FuluEnabledState::CustodyRequest { + custody_by_range_request, + .. + } => { + custody_by_range_request.finish(id, data, peers)?; + } + }, + } + + self.continue_requests(cx) + } + + #[cfg(test)] + pub fn state_step(&self) -> BlockComponentsByRangeRequestStep { + match &self.state { + State::Base { .. } => BlockComponentsByRangeRequestStep::BlocksRequest, + State::DenebEnabled { .. } => BlockComponentsByRangeRequestStep::BlocksRequest, + State::FuluEnabled(state) => match state { + FuluEnabledState::BlockRequest { .. } => { + BlockComponentsByRangeRequestStep::BlocksRequest + } + FuluEnabledState::CustodyRequest { .. 
} => { + BlockComponentsByRangeRequestStep::CustodyRequest + } + }, + } + } +} + +fn couple_blocks_base( + blocks: Vec>>, + custody_columns_count: usize, +) -> Vec> { + blocks + .into_iter() + .map(|block| RpcBlock::new_without_blobs(None, block, custody_columns_count)) + .collect() +} + +fn couple_blocks_deneb( + blocks: Vec>>, + blobs: Vec>>, + spec: &ChainSpec, +) -> Result>, Error> { + let mut blobs_by_block = HashMap::>>>::new(); + for blob in blobs { + let block_root = blob.block_root(); + blobs_by_block.entry(block_root).or_default().push(blob); + } + + // Now collect all blobs that match to the block by block root. BlobsByRange request checks + // the inclusion proof so we know that the commitment is the expected. + // + // BlobsByRange request handler ensures that we don't receive more blobs than possible. + // If the peer serving the request sends us blobs that don't pair well we'll send to the + // processor blocks without expected blobs, resulting in a downscoring event. A serving peer + // could serve fake blobs for blocks that don't have data, but it would gain nothing by it + // wasting theirs and our bandwidth 1:1. Therefore blobs that don't pair well are just ignored. + // + // RpcBlock::new ensures that the count of blobs is consistent with the block + blocks + .into_iter() + .map(|block| { + let block_root = get_block_root(&block); + let max_blobs_per_block = spec.max_blobs_per_block(block.epoch()) as usize; + let blobs = blobs_by_block.remove(&block_root).unwrap_or_default(); + // BlobsByRange request handler enforces that blobs are sorted by index + let blobs = RuntimeVariableList::new(blobs, max_blobs_per_block).map_err(|_| { + Error::InternalError("Blobs returned exceeds max length".to_string()) + })?; + Ok(RpcBlock::new(Some(block_root), block, Some(blobs)) + .expect("TODO: don't do matching here")) + }) + .collect::>, Error>>() +} + +fn couple_blocks_fulu( + blocks: Vec>>, + data_columns: DataColumnSidecarList, + custody_column_indices: Vec, + spec: &ChainSpec, +) -> Result>, Error> { + // Group data columns by block_root and index + let mut custody_columns_by_block = HashMap::>>::new(); + + for column in data_columns { + let block_root = column.block_root(); + + if custody_column_indices.contains(&column.index) { + custody_columns_by_block + .entry(block_root) + .or_default() + // Safe to convert to `CustodyDataColumn`: we have asserted that the index of + // this column is in the set of `expects_custody_columns` and with the expected + // block root, so for the expected epoch of this batch. 
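`couple_blocks_deneb` above buckets the returned blobs by block root and then drains one bucket per block; blobs that match no block are ignored rather than failing the batch. A simplified sketch of that coupling step, with stand-in `Block`/`Blob` types in place of the consensus types:

use std::collections::HashMap;

type BlockRoot = [u8; 32]; // stand-in for Hash256

struct Blob {
    block_root: BlockRoot,
}

struct Block {
    root: BlockRoot,
}

/// Pair every block with the blobs that reference its root; unmatched blobs are dropped.
fn couple(blocks: Vec<Block>, blobs: Vec<Blob>) -> Vec<(Block, Vec<Blob>)> {
    let mut blobs_by_block = HashMap::<BlockRoot, Vec<Blob>>::new();
    for blob in blobs {
        blobs_by_block.entry(blob.block_root).or_default().push(blob);
    }

    blocks
        .into_iter()
        .map(|block| {
            // `remove` so each blob can only be used once.
            let blobs = blobs_by_block.remove(&block.root).unwrap_or_default();
            (block, blobs)
        })
        .collect()
}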
+ .push(CustodyDataColumn::from_asserted_custody(column)); + } + } + + // Now iterate all blocks ensuring that the block roots of each block and data column match, + blocks + .into_iter() + .map(|block| { + let block_root = get_block_root(&block); + let data_columns_with_block_root = custody_columns_by_block + // Remove to only use columns once + .remove(&block_root) + .unwrap_or_default(); + + RpcBlock::new_with_custody_columns( + Some(block_root), + block, + data_columns_with_block_root, + custody_column_indices.clone(), + spec, + ) + .map_err(Error::InternalError) + }) + .collect::, _>>() +} + +impl ByRangeRequest { + fn finish(&mut self, id: I, data: T, peer_id: P) -> Result<(), Error> { + match self { + Self::Active(expected_id) => { + if expected_id != &id { + return Err(Error::InternalError(format!( + "unexpected req_id expected {expected_id} got {id}" + ))); + } + *self = Self::Complete(data, peer_id); + Ok(()) + } + Self::Complete(_, _) => Err(Error::InternalError(format!( + "request already complete {id}" + ))), + } + } + + fn to_finished(&self) -> Option<(&T, &P)> { + match self { + Self::Active(_) => None, + Self::Complete(data, peer_id) => Some((data, peer_id)), + } + } +} diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs new file mode 100644 index 00000000000..ed796155e26 --- /dev/null +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -0,0 +1,429 @@ +use super::custody_by_root::{ColumnRequest, Error}; +use beacon_chain::validator_monitor::timestamp_now; +use beacon_chain::BeaconChainTypes; +use fnv::FnvHashMap; +use lighthouse_network::rpc::{methods::DataColumnsByRangeRequest, BlocksByRangeRequest}; +use lighthouse_network::service::api_types::{ + CustodyByRangeRequestId, DataColumnsByRangeRequestId, +}; +use lighthouse_network::{PeerAction, PeerId}; +use lru_cache::LRUTimeCache; +use parking_lot::RwLock; +use rand::Rng; +use std::collections::HashSet; +use std::time::{Duration, Instant}; +use std::{collections::HashMap, marker::PhantomData, sync::Arc}; +use tracing::{debug, warn}; +use types::{ + data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Hash256, + SignedBeaconBlockHeader, Slot, +}; + +use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; + +const FAILED_PEERS_EXPIRY_SECONDS: u64 = 15; +const REQUEST_EXPIRY_SECONDS: u64 = 300; + +pub struct ActiveCustodyByRangeRequest { + start_time: Instant, + id: CustodyByRangeRequestId, + request: BlocksByRangeRequest, + /// Blocks that we expect peers to serve data columns for + blocks_with_data: Vec, + /// List of column indices this request needs to download to complete successfully + column_requests: FnvHashMap< + ColumnIndex, + ColumnRequest>, + >, + /// Active requests for 1 or more columns each + active_batch_columns_requests: + FnvHashMap, + /// Peers that have recently failed to successfully respond to a columns by root request. + /// Having a LRUTimeCache allows this request to not have to track disconnecting peers. 
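The `ByRangeRequest` helper that closes block_components_by_range.rs is a small two-phase holder: it starts as `Active(req_id)` and can be completed exactly once, rejecting both a mismatched request id and a double completion. A standalone sketch of the same shape (the serving peer is dropped here for brevity):

enum ByRangeRequest<I, T> {
    Active(I),
    Complete(T),
}

impl<I: PartialEq + std::fmt::Debug, T> ByRangeRequest<I, T> {
    fn finish(&mut self, id: I, data: T) -> Result<(), String> {
        match self {
            Self::Active(expected_id) => {
                if *expected_id != id {
                    return Err(format!(
                        "unexpected req_id expected {expected_id:?} got {id:?}"
                    ));
                }
                *self = Self::Complete(data);
                Ok(())
            }
            Self::Complete(_) => Err("request already complete".to_string()),
        }
    }

    fn to_finished(&self) -> Option<&T> {
        match self {
            Self::Active(_) => None,
            Self::Complete(data) => Some(data),
        }
    }
}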
+ failed_peers: LRUTimeCache, + /// Set of peers that claim to have imported this block and their custody columns + lookup_peers: Arc>>, + + _phantom: PhantomData, +} + +struct ActiveBatchColumnsRequest { + indices: Vec, +} + +pub type CustodyByRangeRequestResult = + Result, PeerGroup, Duration)>, Error>; + +enum ColumnResponseError { + NonMatchingColumn { + slot: Slot, + actual_block_root: Hash256, + expected_block_root: Hash256, + }, + MissingColumn(Slot), +} + +impl ActiveCustodyByRangeRequest { + pub(crate) fn new( + id: CustodyByRangeRequestId, + request: BlocksByRangeRequest, + blocks_with_data: Vec, + column_indices: &[ColumnIndex], + lookup_peers: Arc>>, + ) -> Self { + Self { + start_time: Instant::now(), + id, + request, + blocks_with_data, + column_requests: HashMap::from_iter( + column_indices + .iter() + .map(|index| (*index, ColumnRequest::new())), + ), + active_batch_columns_requests: <_>::default(), + failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_EXPIRY_SECONDS)), + lookup_peers, + _phantom: PhantomData, + } + } + + /// Insert a downloaded column into an active custody request. Then make progress on the + /// entire request. + /// + /// ### Returns + /// + /// - `Err`: Custody request has failed and will be dropped + /// - `Ok(Some)`: Custody request has successfully completed and will be dropped + /// - `Ok(None)`: Custody request still active + pub(crate) fn on_data_column_downloaded( + &mut self, + peer_id: PeerId, + req_id: DataColumnsByRangeRequestId, + resp: RpcResponseResult>, + cx: &mut SyncNetworkContext, + ) -> CustodyByRangeRequestResult { + let Some(batch_request) = self.active_batch_columns_requests.get_mut(&req_id) else { + warn!( + id = %self.id, + %req_id, + "Received custody by range response for unrequested index" + ); + return Ok(None); + }; + + match resp { + Ok((data_columns, seen_timestamp)) => { + // Map columns by index as an optimization to not loop the returned list on each + // requested index. The worse case is 128 loops over a 128 item vec + mutation to + // drop the consumed columns. 
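When a data_columns_by_range response arrives, the columns are first keyed by `(column_index, slot)` so each expected block can be checked in O(1): a column must exist for every block that has data, and its block root must match, otherwise the slot is recorded as missing or non-matching. A sketch of that matching step with simplified types:

use std::collections::HashMap;

type ColumnIndex = u64;
type Slot = u64;
type Root = [u8; 32]; // stand-in for Hash256

struct Column {
    block_root: Root,
}

enum ColumnResponseError {
    NonMatchingColumn { slot: Slot },
    MissingColumn(Slot),
}

/// For one requested column index, take a column for every expected (slot, block_root)
/// pair, or report exactly what went wrong for that slot.
fn columns_at_index(
    index: ColumnIndex,
    expected_blocks: &[(Slot, Root)],
    columns_by_key: &mut HashMap<(ColumnIndex, Slot), Column>,
) -> Result<Vec<Column>, ColumnResponseError> {
    expected_blocks
        .iter()
        .map(|(slot, expected_root)| match columns_by_key.remove(&(index, *slot)) {
            // A column exists and it commits to the block we expect at this slot.
            Some(column) if column.block_root == *expected_root => Ok(column),
            // A column exists but it points to a different block.
            Some(_) => Err(ColumnResponseError::NonMatchingColumn { slot: *slot }),
            // The peer custodies this column and claims to be synced, yet sent nothing.
            None => Err(ColumnResponseError::MissingColumn(*slot)),
        })
        .collect()
}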
+ let mut data_columns_by_index = + HashMap::<(ColumnIndex, Slot), Arc>>::new(); + for data_column in data_columns { + data_columns_by_index + .insert((data_column.index, data_column.slot()), data_column); + } + + // Accumulate columns that the peer does not have to issue a single log per request + let mut missing_column_indices = vec![]; + let mut incorrect_column_indices = vec![]; + let mut imported_column_indices = vec![]; + + for index in &batch_request.indices { + let column_request = + self.column_requests + .get_mut(index) + .ok_or(Error::InternalError(format!( + "unknown column_index {index}" + )))?; + + let columns_at_index = self + .blocks_with_data + .iter() + .map(|block| { + let slot = block.message.slot; + if let Some(data_column) = data_columns_by_index.remove(&(*index, slot)) + { + let actual_block_root = + data_column.signed_block_header.message.canonical_root(); + let expected_block_root = block.message.canonical_root(); + if actual_block_root != expected_block_root { + Err(ColumnResponseError::NonMatchingColumn { + slot, + actual_block_root: data_column + .signed_block_header + .message + .canonical_root(), + expected_block_root: block.message.canonical_root(), + }) + } else { + Ok(data_column) + } + } else { + // The following three statements are true: + // - block at `slot` is not missed, and has data + // - peer custodies this column `index` + // - peer claims to be synced to at least `slot` + // + // Then we penalize the faulty peer, mark it as failed and try with + // another. + Err(ColumnResponseError::MissingColumn(slot)) + } + }) + .collect::, _>>(); + + match columns_at_index { + Ok(columns_at_index) => { + column_request.on_download_success( + req_id, + peer_id, + columns_at_index, + seen_timestamp, + )?; + + imported_column_indices.push(index); + } + Err(e) => { + column_request.on_download_error(req_id)?; + + match e { + ColumnResponseError::NonMatchingColumn { + slot, + actual_block_root, + expected_block_root, + } => { + incorrect_column_indices.push(( + index, + slot, + actual_block_root, + expected_block_root, + )); + } + ColumnResponseError::MissingColumn(slot) => { + missing_column_indices.push((index, slot)); + } + } + } + } + } + + // Log `imported_column_indices`, `missing_column_indexes` and + // `incorrect_column_indices` once per request to make the logs less noisy. + if !imported_column_indices.is_empty() { + // TODO(das): this log may be redundant. We already log on DataColumnsByRange + // completed, and on DataColumnsByRange sent we log the column indices + // ``` + // Sync RPC request sent method="DataColumnsByRange" slots=8 epoch=4 columns=[52] peer=16Uiu2HAmEooeoHzHDYS35TSHrJDSfmREecPyFskrLPYm9Gm1EURj id=493/399/10/RangeSync/4/1 + // Sync RPC request completed id=493/399/10/RangeSync/4/1 method="DataColumnsByRange" count=1 + // ``` + // Which can be traced to this custody by range request, and the initial log + debug!( + id = %self.id, + data_columns_by_range_req_id = %req_id, + %peer_id, + count = imported_column_indices.len(), + "Custody by range request download imported columns" + ); + } + + if !incorrect_column_indices.is_empty() { + debug!( + id = %self.id, + data_columns_by_range_req_id = %req_id, + %peer_id, + ?incorrect_column_indices, + "Custody by range peer returned non-matching columns" + ); + + // Returning a non-canonical column is not a permanent fault. We should not + // retry the peer for some time but the peer may return a canonical column in + // the future. 
+ self.failed_peers.insert(peer_id); + cx.report_peer( + peer_id, + PeerAction::MidToleranceError, + "non-matching data column", + ); + } + + if !missing_column_indices.is_empty() { + debug!( + id = %self.id, + data_columns_by_range_req_id = %req_id, + %peer_id, + ?missing_column_indices, + "Custody by range peer claims to not have some data" + ); + + // Not having columns is not a permanent fault. The peer may be backfilling. + self.failed_peers.insert(peer_id); + cx.report_peer(peer_id, PeerAction::MidToleranceError, "custody_failure"); + } + } + Err(err) => { + debug!( + id = %self.id, + %req_id, + %peer_id, + error = ?err, + "Custody by range download error" + ); + + for column_index in &batch_request.indices { + self.column_requests + .get_mut(column_index) + .ok_or(Error::InternalError("unknown column_index".to_owned()))? + .on_download_error_and_mark_failure(req_id, err.clone())?; + } + + // An RpcResponseError is already downscored in network_context + self.failed_peers.insert(peer_id); + } + }; + + self.continue_requests(cx) + } + + pub(crate) fn continue_requests( + &mut self, + cx: &mut SyncNetworkContext, + ) -> CustodyByRangeRequestResult { + if self.column_requests.values().all(|r| r.is_downloaded()) { + // All requests have completed successfully. + let mut peers = HashMap::>::new(); + let mut seen_timestamps = vec![]; + let columns = std::mem::take(&mut self.column_requests) + .into_values() + .map(|request| { + let (peer, data_columns, seen_timestamp) = request.complete()?; + + for data_column in &data_columns { + let columns_by_peer = peers.entry(peer).or_default(); + if !columns_by_peer.contains(&(data_column.index as usize)) { + columns_by_peer.push(data_column.index as usize); + } + } + + seen_timestamps.push(seen_timestamp); + + Ok(data_columns) + }) + .collect::, _>>()? + // Flatten Vec> to Vec + .into_iter() + .flatten() + .collect(); + + let peer_group = PeerGroup::from_set(peers); + let max_seen_timestamp = seen_timestamps.into_iter().max().unwrap_or(timestamp_now()); + return Ok(Some((columns, peer_group, max_seen_timestamp))); + } + + let active_request_count_by_peer = cx.active_request_count_by_peer(); + let mut columns_to_request_by_peer = HashMap::>::new(); + let lookup_peers = self.lookup_peers.read(); + + // Need to: + // - track how many active requests a peer has for load balancing + // - which peers have failures to attempt others + // - which peer returned what to have PeerGroup attributability + + for (column_index, request) in self.column_requests.iter_mut() { + if request.is_awaiting_download() { + if let Some(last_error) = request.too_many_failures() { + return Err(Error::TooManyDownloadErrors(last_error)); + } + + // TODO(das): We should only query peers that are likely to know about this block. + // For by_range requests, only peers in the SyncingChain peer set. Else consider a + // fallback to the peers that are synced up to the epoch we want to query. + let custodial_peers = cx.get_custodial_peers(*column_index); + + // We draw from the total set of peers, but prioritize those peers who we have + // received an attestation / status / block message claiming to have imported the + // lookup. 
The frequency of those messages is low, so drawing only from lookup_peers + // could cause many lookups to take much longer or fail as they don't have enough + // custody peers on a given column + let mut priorized_peers = custodial_peers + .iter() + .filter(|peer| { + // Do not request faulty peers for some time + !self.failed_peers.contains(peer) + }) + .map(|peer| { + ( + // Prioritize peers that claim to know have imported this block + if lookup_peers.contains(peer) { 0 } else { 1 }, + // Prefer peers with fewer requests to load balance across peers. + // We batch requests to the same peer, so count existence in the + // `columns_to_request_by_peer` as a single 1 request. + active_request_count_by_peer.get(peer).copied().unwrap_or(0) + + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::thread_rng().gen::(), + *peer, + ) + }) + .collect::>(); + priorized_peers.sort_unstable(); + + if let Some((_, _, _, peer_id)) = priorized_peers.first() { + columns_to_request_by_peer + .entry(*peer_id) + .or_default() + .push(*column_index); + } else { + // Do not issue requests if there is no custody peer on this column. The request + // will sit idle without making progress. The only way to make to progress is: + // - Add a new peer that custodies the missing columns + // - Call `continue_requests` + // + // Otherwise this request will be dropped and failed after some time. + } + } + } + + for (peer_id, indices) in columns_to_request_by_peer.into_iter() { + let req_id = cx + .send_data_columns_by_range_request( + peer_id, + DataColumnsByRangeRequest { + start_slot: *self.request.start_slot(), + count: *self.request.count(), + columns: indices.clone(), + }, + self.id, + ) + .map_err(|e| Error::InternalError(format!("send failed {e}")))?; + + for column_index in &indices { + let column_request = self + .column_requests + .get_mut(column_index) + // Should never happen: column_index is iterated from column_requests + .ok_or(Error::InternalError(format!( + "Unknown column_request {column_index}" + )))?; + + column_request.on_download_start(req_id)?; + } + + self.active_batch_columns_requests + .insert(req_id, ActiveBatchColumnsRequest { indices }); + } + + if self.start_time.elapsed() > Duration::from_secs(REQUEST_EXPIRY_SECONDS) + && !self.column_requests.values().any(|r| r.is_downloading()) + { + let awaiting_peers_indicies = self + .column_requests + .iter() + .filter(|(_, r)| r.is_awaiting_download()) + .map(|(id, _)| *id) + .collect::>(); + return Err(Error::ExpiredNoCustodyPeers(awaiting_peers_indicies)); + } + + Ok(None) + } +} diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs similarity index 70% rename from beacon_node/network/src/sync/network_context/custody.rs rename to beacon_node/network/src/sync/network_context/custody_by_root.rs index f4d010b881e..1ca2a55a13a 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -1,5 +1,6 @@ use crate::sync::network_context::{ - DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, + DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, RpcRequestSendError, + RpcResponseError, }; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; @@ -12,22 +13,28 @@ use rand::Rng; use std::collections::HashSet; use std::time::{Duration, Instant}; use 
std::{collections::HashMap, marker::PhantomData, sync::Arc}; +use strum::IntoStaticStr; use tracing::{debug, warn}; -use types::EthSpec; -use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, Hash256}; +use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Hash256}; use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContext}; const FAILED_PEERS_CACHE_EXPIRY_SECONDS: u64 = 5; -const MAX_STALE_NO_PEERS_DURATION: Duration = Duration::from_secs(30); - -type DataColumnSidecarList = Vec>>; +const REQUEST_EXPIRY_SECONDS: u64 = 300; +/// TODO(das): Reconsider this retry count, it was choosen as a placeholder value. Each +/// `custody_by_*` request is already retried multiple inside of a lookup or batch +const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; -pub struct ActiveCustodyRequest { +pub struct ActiveCustodyByRootRequest { + start_time: Instant, block_root: Hash256, custody_id: CustodyId, /// List of column indices this request needs to download to complete successfully - column_requests: FnvHashMap>, + #[allow(clippy::type_complexity)] + column_requests: FnvHashMap< + ColumnIndex, + ColumnRequest>>, + >, /// Active requests for 1 or more columns each active_batch_columns_requests: FnvHashMap, @@ -40,29 +47,47 @@ pub struct ActiveCustodyRequest { _phantom: PhantomData, } -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug)] pub enum Error { - SendFailed(&'static str), - TooManyFailures, - BadState(String), - NoPeer(ColumnIndex), - /// Received a download result for a different request id than the in-flight request. - /// There should only exist a single request at a time. Having multiple requests is a bug and - /// can result in undefined state, so it's treated as a hard error and the lookup is dropped. 
- UnexpectedRequestId { - expected_req_id: DataColumnsByRootRequestId, - req_id: DataColumnsByRootRequestId, - }, + InternalError(String), + TooManyDownloadErrors(RpcResponseError), + ExpiredNoCustodyPeers(Vec), +} + +impl From for RpcResponseError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcResponseError::InternalError(e), + Error::TooManyDownloadErrors(e) => e, + Error::ExpiredNoCustodyPeers(indices) => RpcResponseError::RequestExpired(format!( + "Expired waiting for custody peers {indices:?}" + )), + } + } +} + +impl From for RpcRequestSendError { + fn from(e: Error) -> Self { + match e { + Error::TooManyDownloadErrors(_) => { + RpcRequestSendError::InternalError("Download error in request send".to_string()) + } + Error::InternalError(e) => RpcRequestSendError::InternalError(e), + Error::ExpiredNoCustodyPeers(_) => RpcRequestSendError::InternalError( + "Request can not expire when requesting it".to_string(), + ), + } + } } struct ActiveBatchColumnsRequest { indices: Vec, } -pub type CustodyRequestResult = +pub type CustodyByRootRequestResult = Result, PeerGroup, Duration)>, Error>; -impl ActiveCustodyRequest { +impl ActiveCustodyByRootRequest { pub(crate) fn new( block_root: Hash256, custody_id: CustodyId, @@ -70,6 +95,7 @@ impl ActiveCustodyRequest { lookup_peers: Arc>>, ) -> Self { Self { + start_time: Instant::now(), block_root, custody_id, column_requests: HashMap::from_iter( @@ -98,7 +124,7 @@ impl ActiveCustodyRequest { req_id: DataColumnsByRootRequestId, resp: RpcResponseResult>, cx: &mut SyncNetworkContext, - ) -> CustodyRequestResult { + ) -> CustodyByRootRequestResult { let Some(batch_request) = self.active_batch_columns_requests.get_mut(&req_id) else { warn!( block_root = ?self.block_root, @@ -131,7 +157,7 @@ impl ActiveCustodyRequest { let column_request = self .column_requests .get_mut(column_index) - .ok_or(Error::BadState("unknown column_index".to_owned()))?; + .ok_or(Error::InternalError("unknown column_index".to_owned()))?; if let Some(data_column) = data_columns.remove(column_index) { column_request.on_download_success( @@ -182,8 +208,8 @@ impl ActiveCustodyRequest { for column_index in &batch_request.indices { self.column_requests .get_mut(column_index) - .ok_or(Error::BadState("unknown column_index".to_owned()))? - .on_download_error_and_mark_failure(req_id)?; + .ok_or(Error::InternalError("unknown column_index".to_owned()))? + .on_download_error_and_mark_failure(req_id, err.clone())?; } self.failed_peers.insert(peer_id); @@ -196,7 +222,7 @@ impl ActiveCustodyRequest { pub(crate) fn continue_requests( &mut self, cx: &mut SyncNetworkContext, - ) -> CustodyRequestResult { + ) -> CustodyByRootRequestResult { if self.column_requests.values().all(|r| r.is_downloaded()) { // All requests have completed successfully. 
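Per-column failure tracking also changes here: instead of a bare counter, each column keeps the list of errors it has seen, and `too_many_failures` surfaces the most recent one once the attempt limit is crossed so the whole request can fail with `TooManyDownloadErrors(last_error)`. A sketch of that bookkeeping, with a stand-in error type:

/// Stand-in for `RpcResponseError`.
#[derive(Debug, Clone)]
struct DownloadError(String);

const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3;

#[derive(Default)]
struct ColumnFailures {
    download_failures: Vec<DownloadError>,
}

impl ColumnFailures {
    fn on_download_error(&mut self, e: DownloadError) {
        self.download_failures.push(e);
    }

    /// Once the attempt limit is exceeded, return the most recent error so the
    /// whole custody request can fail with it.
    fn too_many_failures(&self) -> Option<DownloadError> {
        if self.download_failures.len() > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS {
            self.download_failures.last().cloned()
        } else {
            None
        }
    }
}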
let mut peers = HashMap::>::new(); @@ -229,9 +255,9 @@ impl ActiveCustodyRequest { // - which peer returned what to have PeerGroup attributability for (column_index, request) in self.column_requests.iter_mut() { - if let Some(wait_duration) = request.is_awaiting_download() { - if request.download_failures > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { - return Err(Error::TooManyFailures); + if request.is_awaiting_download() { + if let Some(last_error) = request.too_many_failures() { + return Err(Error::TooManyDownloadErrors(last_error)); } // TODO(das): When is a fork and only a subset of your peers know about a block, we should @@ -270,21 +296,20 @@ impl ActiveCustodyRequest { .entry(*peer_id) .or_default() .push(*column_index); - } else if wait_duration > MAX_STALE_NO_PEERS_DURATION { - // Allow to request to sit stale in `NotStarted` state for at most - // `MAX_STALE_NO_PEERS_DURATION`, else error and drop the request. Note that - // lookup will naturally retry when other peers send us attestations for - // descendants of this un-available lookup. - return Err(Error::NoPeer(*column_index)); } else { - // Do not issue requests if there is no custody peer on this column + // Do not issue requests if there is no custody peer on this column. The request + // will sit idle without making progress. The only way to make to progress is: + // - Add a new peer that custodies the missing columns + // - Call `continue_requests` + // + // Otherwise this request will be dropped and failed after some time. } } } for (peer_id, indices) in columns_to_request_by_peer.into_iter() { let request_result = cx - .data_column_lookup_request( + .data_columns_by_root_request( DataColumnsByRootRequester::Custody(self.custody_id), peer_id, DataColumnsByRootSingleBlockRequest { @@ -297,7 +322,9 @@ impl ActiveCustodyRequest { // columns. For the rest of peers, don't downscore if columns are missing. lookup_peers.contains(&peer_id), ) - .map_err(Error::SendFailed)?; + .map_err(|e| { + Error::InternalError(format!("Send failed data_columns_by_root {e:?}")) + })?; match request_result { LookupRequestResult::RequestSent(req_id) => { @@ -306,7 +333,7 @@ impl ActiveCustodyRequest { .column_requests .get_mut(column_index) // Should never happen: column_index is iterated from column_requests - .ok_or(Error::BadState("unknown column_index".to_owned()))?; + .ok_or(Error::InternalError("unknown column_index".to_owned()))?; column_request.on_download_start(req_id)?; } @@ -319,117 +346,149 @@ impl ActiveCustodyRequest { } } + if self.start_time.elapsed() > Duration::from_secs(REQUEST_EXPIRY_SECONDS) + && !self.column_requests.values().any(|r| r.is_downloading()) + { + let awaiting_peers_indicies = self + .column_requests + .iter() + .filter(|(_, r)| r.is_awaiting_download()) + .map(|(id, _)| *id) + .collect::>(); + return Err(Error::ExpiredNoCustodyPeers(awaiting_peers_indicies)); + } + Ok(None) } } -/// TODO(das): this attempt count is nested into the existing lookup request count. 
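Both custody request variants now share the same expiry rule: once the request has been alive for more than `REQUEST_EXPIRY_SECONDS` with no column actively downloading, it fails and reports the column indices still waiting for a custody peer. A condensed sketch of the check, with simplified types:

use std::collections::HashMap;
use std::time::{Duration, Instant};

const REQUEST_EXPIRY_SECONDS: u64 = 300;

enum Status {
    NotStarted,
    Downloading,
    Downloaded,
}

/// Returns the column indices still waiting for a custody peer if the request
/// has been idle for too long and should be failed.
fn check_expiry(start_time: Instant, column_requests: &HashMap<u64, Status>) -> Option<Vec<u64>> {
    let any_downloading = column_requests
        .values()
        .any(|status| matches!(status, Status::Downloading));
    if start_time.elapsed() > Duration::from_secs(REQUEST_EXPIRY_SECONDS) && !any_downloading {
        Some(
            column_requests
                .iter()
                .filter(|(_, status)| matches!(status, Status::NotStarted))
                .map(|(index, _)| *index)
                .collect(),
        )
    } else {
        None
    }
}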
-const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; - -struct ColumnRequest { - status: Status, - download_failures: usize, +pub struct ColumnRequest { + status: Status, + download_failures: Vec, } -#[derive(Debug, Clone)] -enum Status { - NotStarted(Instant), - Downloading(DataColumnsByRootRequestId), - Downloaded(PeerId, Arc>, Duration), +#[derive(Debug, Clone, IntoStaticStr)] +pub enum Status { + NotStarted, + Downloading(I), + Downloaded(PeerId, T, Duration), } -impl ColumnRequest { - fn new() -> Self { +impl ColumnRequest { + pub fn new() -> Self { Self { - status: Status::NotStarted(Instant::now()), - download_failures: 0, + status: Status::NotStarted, + download_failures: vec![], } } - fn is_awaiting_download(&self) -> Option { + pub fn is_awaiting_download(&self) -> bool { match self.status { - Status::NotStarted(start_time) => Some(start_time.elapsed()), - Status::Downloading { .. } | Status::Downloaded { .. } => None, + Status::NotStarted => true, + Status::Downloading { .. } | Status::Downloaded { .. } => false, } } - fn is_downloaded(&self) -> bool { + pub fn is_downloading(&self) -> bool { match self.status { - Status::NotStarted { .. } | Status::Downloading { .. } => false, + Status::NotStarted => false, + Status::Downloading { .. } => true, + Status::Downloaded { .. } => false, + } + } + + pub fn is_downloaded(&self) -> bool { + match self.status { + Status::NotStarted | Status::Downloading { .. } => false, Status::Downloaded { .. } => true, } } - fn on_download_start(&mut self, req_id: DataColumnsByRootRequestId) -> Result<(), Error> { + pub fn too_many_failures(&self) -> Option { + if self.download_failures.len() > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { + Some( + self.download_failures + .last() + .cloned() + .expect("download_failures is not empty"), + ) + } else { + None + } + } + + pub fn on_download_start(&mut self, req_id: I) -> Result<(), Error> { match &self.status { - Status::NotStarted { .. 
} => { + Status::NotStarted => { self.status = Status::Downloading(req_id); Ok(()) } - other => Err(Error::BadState(format!( - "bad state on_download_start expected NotStarted got {other:?}" + other => Err(Error::InternalError(format!( + "bad state on_download_start expected NotStarted got {}", + Into::<&'static str>::into(other), ))), } } - fn on_download_error(&mut self, req_id: DataColumnsByRootRequestId) -> Result<(), Error> { + pub fn on_download_error(&mut self, req_id: I) -> Result<(), Error> { match &self.status { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { - return Err(Error::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, - }); + return Err(Error::InternalError(format!( + "Received download result for req_id {req_id} expecting {expected_req_id}" + ))); } - self.status = Status::NotStarted(Instant::now()); + self.status = Status::NotStarted; Ok(()) } - other => Err(Error::BadState(format!( - "bad state on_download_error expected Downloading got {other:?}" + other => Err(Error::InternalError(format!( + "bad state on_download_error expected Downloading got {}", + Into::<&'static str>::into(other), ))), } } - fn on_download_error_and_mark_failure( + pub fn on_download_error_and_mark_failure( &mut self, - req_id: DataColumnsByRootRequestId, + req_id: I, + e: RpcResponseError, ) -> Result<(), Error> { - // TODO(das): Should track which peers don't have data - self.download_failures += 1; + self.download_failures.push(e); self.on_download_error(req_id) } - fn on_download_success( + pub fn on_download_success( &mut self, - req_id: DataColumnsByRootRequestId, + req_id: I, peer_id: PeerId, - data_column: Arc>, + data_column: T, seen_timestamp: Duration, ) -> Result<(), Error> { match &self.status { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { - return Err(Error::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, - }); + return Err(Error::InternalError(format!( + "Received download result for req_id {req_id} expecting {expected_req_id}" + ))); } self.status = Status::Downloaded(peer_id, data_column, seen_timestamp); Ok(()) } - other => Err(Error::BadState(format!( - "bad state on_download_success expected Downloading got {other:?}" + other => Err(Error::InternalError(format!( + "bad state on_download_success expected Downloading got {}", + Into::<&'static str>::into(other), ))), } } - fn complete(self) -> Result<(PeerId, Arc>, Duration), Error> { + pub fn complete(self) -> Result<(PeerId, T, Duration), Error> { match self.status { Status::Downloaded(peer_id, data_column, seen_timestamp) => { Ok((peer_id, data_column, seen_timestamp)) } - other => Err(Error::BadState(format!( - "bad state complete expected Downloaded got {other:?}" + other => Err(Error::InternalError(format!( + "bad state complete expected Downloaded got {}", + Into::<&'static str>::into(other), ))), } } diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index cd70a2e7ebc..8228ea5d9d5 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -26,7 +26,7 @@ mod blocks_by_root; mod data_columns_by_range; mod data_columns_by_root; -#[derive(Debug, PartialEq, Eq, IntoStaticStr)] +#[derive(Debug, Clone, PartialEq, Eq, IntoStaticStr)] pub enum LookupVerifyError { NotEnoughResponsesReturned { actual: usize, @@ -177,12 +177,10 @@ impl ActiveRequests { } } - pub fn 
active_requests_of_peer(&self, peer_id: &PeerId) -> Vec<&K> { + pub fn active_requests(&self) -> impl Iterator { self.requests .iter() - .filter(|(_, request)| &request.peer_id == peer_id) - .map(|(id, _)| id) - .collect() + .map(|(id, request)| (id, &request.peer_id)) } pub fn iter_request_peers(&self) -> impl Iterator + '_ { diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs index 276ede93c12..54ff0c1c735 100644 --- a/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs +++ b/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs @@ -1,13 +1,13 @@ use super::{ActiveRequestItems, LookupVerifyError}; use lighthouse_network::rpc::methods::DataColumnsByRangeRequest; use std::sync::Arc; -use types::{DataColumnSidecar, EthSpec, Slot}; +use types::{DataColumnSidecar, DataColumnSidecarList, EthSpec, Slot}; /// Accumulates results of a data_columns_by_range request. Only returns items after receiving the /// stream termination. pub struct DataColumnsByRangeRequestItems { request: DataColumnsByRangeRequest, - items: Vec>>, + items: DataColumnSidecarList, } impl DataColumnsByRangeRequestItems { diff --git a/beacon_node/network/src/sync/peer_sampling.rs b/beacon_node/network/src/sync/peer_sampling.rs index 59b751787e3..d76c7d2bbc2 100644 --- a/beacon_node/network/src/sync/peer_sampling.rs +++ b/beacon_node/network/src/sync/peer_sampling.rs @@ -98,13 +98,13 @@ impl Sampling { // TODO(das): Should track failed sampling request for some time? Otherwise there's // a risk of a loop with multiple triggers creating the request, then failing, // and repeat. - debug!(?id, "Ignoring duplicate sampling request"); + debug!(%id, "Ignoring duplicate sampling request"); return None; } }; debug!( - ?id, + %id, column_selection = ?request.column_selection(), "Created new sample request" ); @@ -138,7 +138,7 @@ impl Sampling { ) -> Option<(SamplingRequester, SamplingResult)> { let Some(request) = self.requests.get_mut(&id.id) else { // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Sample downloaded event for unknown request"); + debug!(%id, "Sample downloaded event for unknown request"); return None; }; @@ -167,7 +167,7 @@ impl Sampling { ) -> Option<(SamplingRequester, SamplingResult)> { let Some(request) = self.requests.get_mut(&id.id) else { // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Sample verified event for unknown request"); + debug!(%id, "Sample verified event for unknown request"); return None; }; @@ -191,7 +191,7 @@ impl Sampling { ) -> Option<(SamplingRequester, SamplingResult)> { let result = result.transpose(); if let Some(result) = result { - debug!(?id, ?result, "Sampling request completed, removing"); + debug!(%id, ?result, "Sampling request completed, removing"); metrics::inc_counter_vec( &metrics::SAMPLING_REQUEST_RESULT, &[metrics::from_result(&result)], @@ -570,7 +570,7 @@ impl ActiveSamplingRequest { // Send requests. 
let mut sent_request = false; for (peer_id, column_indexes) in column_indexes_to_request { - cx.data_column_lookup_request( + cx.data_columns_by_root_request( DataColumnsByRootRequester::Sampling(SamplingId { id: self.requester_id, sampling_request_id: self.current_sampling_request_id, diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 72598a25405..99ee4fb6be2 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -1,8 +1,10 @@ +use crate::sync::network_context::PeerGroup; use beacon_chain::block_verification_types::RpcBlock; +use itertools::Itertools; use lighthouse_network::rpc::methods::BlocksByRangeRequest; use lighthouse_network::service::api_types::Id; use lighthouse_network::PeerId; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::fmt; use std::hash::{Hash, Hasher}; use std::ops::Sub; @@ -17,29 +19,20 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; /// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; -/// Type of expected batch. -#[derive(Debug, Copy, Clone, Display)] -#[strum(serialize_all = "snake_case")] -pub enum ByRangeRequestType { - BlocksAndColumns, - BlocksAndBlobs, - Blocks, -} - #[derive(Clone, Debug)] pub struct BatchPeers { block_peer: PeerId, - column_peers: HashMap, + column_peers: PeerGroup, } impl BatchPeers { pub fn new_from_block_peer(block_peer: PeerId) -> Self { Self { block_peer, - column_peers: <_>::default(), + column_peers: PeerGroup::empty(), } } - pub fn new(block_peer: PeerId, column_peers: HashMap) -> Self { + pub fn new(block_peer: PeerId, column_peers: PeerGroup) -> Self { Self { block_peer, column_peers, @@ -51,7 +44,13 @@ impl BatchPeers { } pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> { - self.column_peers.get(index) + self.column_peers.of_index(&((*index) as usize)) + } + + pub fn iter_unique_peers(&self) -> impl Iterator { + std::iter::once(&self.block_peer) + .chain(self.column_peers.all()) + .unique() + } } diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index ba809a14ba1..76721ec5aa3 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -10,7 +10,9 @@ use itertools::Itertools; use lighthouse_network::service::api_types::Id; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; -use std::collections::{btree_map::Entry, BTreeMap, HashSet}; +use parking_lot::RwLock; +use std::collections::{btree_map::Entry, BTreeMap, HashMap, HashSet}; +use std::sync::Arc; use strum::IntoStaticStr; use tracing::{debug, instrument, warn}; use types::{Epoch, EthSpec, Hash256, Slot}; @@ -87,9 +89,15 @@ pub struct SyncingChain { batches: BTreeMap>, /// The peers that agree on the `target_head_slot` and `target_head_root` as a canonical chain - /// and thus available to download this chain from, as well as the batches we are currently - /// requesting. - peers: HashSet, + /// and thus available to download this chain from. + /// + /// The total requests made to each peer while it is part of this SyncingChain are tracked + /// separately in `requests_per_peer`. + peers: Arc>>, + + /// Tracks the total requests done to each peer for this SyncingChain.
Forces us to fetch data + /// from all peers to prevent eclipse attacks + requests_per_peer: HashMap, /// Starting epoch of the next batch that needs to be downloaded. to_be_downloaded: BatchId, @@ -122,6 +130,15 @@ pub enum ChainSyncingState { } impl SyncingChain { + /// Leaks the state of all active batches for assertions in tests. + #[cfg(test)] + pub fn batches_state(&self) -> Vec<(BatchId, &BatchState)> { + self.batches + .iter() + .map(|(id, batch)| (*id, batch.state())) + .collect() + } + #[allow(clippy::too_many_arguments)] pub fn new( id: Id, @@ -138,7 +155,8 @@ impl SyncingChain { target_head_slot, target_head_root, batches: BTreeMap::new(), - peers: HashSet::from_iter([peer_id]), + peers: Arc::new(RwLock::new(HashSet::from_iter([peer_id]))), + requests_per_peer: HashMap::from_iter([(peer_id, <_>::default())]), to_be_downloaded: start_epoch, processing_target: start_epoch, optimistic_start: None, @@ -156,7 +174,7 @@ impl SyncingChain { /// Check if the chain has peers from which to process batches. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn available_peers(&self) -> usize { - self.peers.len() + self.peers.read().len() } /// Get the chain's id. @@ -168,7 +186,12 @@ impl SyncingChain { /// Peers currently syncing this chain. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn peers(&self) -> impl Iterator + '_ { - self.peers.iter().cloned() + self.peers + .read() + .iter() + .copied() + .collect::>() + .into_iter() } /// Progress in epochs made by the chain @@ -192,9 +215,10 @@ impl SyncingChain { /// If the peer has active batches, those are considered failed and re-requested. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn remove_peer(&mut self, peer_id: &PeerId) -> ProcessingResult { - self.peers.remove(peer_id); + self.peers.write().remove(peer_id); + self.requests_per_peer.remove(peer_id); - if self.peers.is_empty() { + if self.peers.read().is_empty() { Err(RemoveChain::EmptyPeerPool) } else { Ok(KeepChain) @@ -221,6 +245,12 @@ impl SyncingChain { request_id: Id, blocks: Vec>, ) -> ProcessingResult { + // Account for one more request to this peer + // TODO(das): this code assumes that we do a single request per peer per RpcBlock + for peer in batch_peers.iter_unique_peers() { + *self.requests_per_peer.entry(*peer).or_default() += 1; + } + // check if we have this batch let batch = match self.batches.get_mut(&batch_id) { None => { @@ -400,11 +430,6 @@ impl SyncingChain { self.request_batches(network)?; } } - } else if !self.good_peers_on_sampling_subnets(self.processing_target, network) { - // This is to handle the case where no batch was sent for the current processing - // target when there is no sampling peers available. This is a valid state and should not - // return an error. - return Ok(KeepChain); } else { return Err(RemoveChain::WrongChainState(format!( "Batch not found for current processing target {}", @@ -577,7 +602,7 @@ impl SyncingChain { "Batch failed to download.
Dropping chain scoring peers" ); - for peer in self.peers.drain() { + for peer in self.peers.write().drain() { network.report_peer(peer, penalty, "faulty_chain"); } Err(RemoveChain::ChainFailed { @@ -842,7 +867,8 @@ impl SyncingChain { network: &mut SyncNetworkContext, peer_id: PeerId, ) -> ProcessingResult { - self.peers.insert(peer_id); + self.peers.write().insert(peer_id); + self.requests_per_peer.insert(peer_id, <_>::default()); self.request_batches(network) } @@ -854,7 +880,6 @@ impl SyncingChain { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer_id: &PeerId, request_id: Id, err: RpcResponseError, ) -> ProcessingResult { @@ -869,7 +894,6 @@ impl SyncingChain { debug!( batch_epoch = %batch_id, batch_state = ?batch.state(), - %peer_id, %request_id, ?batch_state, "Batch not expecting block" @@ -880,12 +904,13 @@ impl SyncingChain { batch_epoch = %batch_id, batch_state = ?batch.state(), error = ?err, - %peer_id, %request_id, "Batch download error" ); if let BatchOperationOutcome::Failed { blacklist } = - batch.download_failed(Some(*peer_id))? + // TODO(das): Is it necessary for the batch to track failed peers? Can we make this + // mechanism compatible with PeerDAS and before PeerDAS? + batch.download_failed(None)? { return Err(RemoveChain::ChainFailed { blacklist, @@ -896,7 +921,6 @@ impl SyncingChain { } else { debug!( batch_epoch = %batch_id, - %peer_id, %request_id, batch_state, "Batch not found" @@ -918,25 +942,15 @@ impl SyncingChain { let request = batch.to_blocks_by_range_request(); let failed_peers = batch.failed_block_peers(); - // TODO(das): we should request only from peers that are part of this SyncingChain. - // However, then we hit the NoPeer error frequently which causes the batch to fail and - // the SyncingChain to be dropped. We need to handle this case more gracefully. - let synced_peers = network - .network_globals() - .peers - .read() - .synced_peers() - .cloned() - .collect::>(); - match network.block_components_by_range_request( request, RangeRequestId::RangeSync { chain_id: self.id, batch_id, }, - &synced_peers, + self.peers.clone(), &failed_peers, + &self.requests_per_peer, ) { Ok(request_id) => { // inform the batch about the new request @@ -953,14 +967,7 @@ impl SyncingChain { return Ok(KeepChain); } Err(e) => match e { - // TODO(das): Handle the NoPeer case explicitly and don't drop the batch. For - // sync to work properly it must be okay to have "stalled" batches in - // AwaitingDownload state. Currently it will error with invalid state if - // that happens. Sync manager must periodicatlly prune stalled batches like - // we do for lookup sync. Then we can deprecate the redundant - // `good_peers_on_sampling_subnets` checks. - e - @ (RpcRequestSendError::NoPeer(_) | RpcRequestSendError::InternalError(_)) => { + RpcRequestSendError::InternalError(e) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(%batch_id, error = ?e, "batch_id" = %batch_id, %batch, "Could not send batch request"); // register the failed download and check if the batch can be retried @@ -1019,11 +1026,6 @@ impl SyncingChain { // check if we have the batch for our optimistic start. If not, request it first. // We wait for this batch before requesting any other batches. 
if let Some(epoch) = self.optimistic_start { - if !self.good_peers_on_sampling_subnets(epoch, network) { - debug!("Waiting for peers to be available on sampling column subnets"); - return Ok(KeepChain); - } - if let Entry::Vacant(entry) = self.batches.entry(epoch) { let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH); entry.insert(optimistic_batch); @@ -1046,35 +1048,6 @@ impl SyncingChain { Ok(KeepChain) } - /// Checks all sampling column subnets for peers. Returns `true` if there is at least one peer in - /// every sampling column subnet. - fn good_peers_on_sampling_subnets( - &self, - epoch: Epoch, - network: &SyncNetworkContext, - ) -> bool { - if network.chain.spec.is_peer_das_enabled_for_epoch(epoch) { - // Require peers on all sampling column subnets before sending batches - let peers_on_all_custody_subnets = network - .network_globals() - .sampling_subnets - .iter() - .all(|subnet_id| { - let peer_count = network - .network_globals() - .peers - .read() - .good_custody_subnet_peer(*subnet_id) - .count(); - - peer_count > 0 - }); - peers_on_all_custody_subnets - } else { - true - } - } - /// Creates the next required batch from the chain. If there are no more batches required, /// `false` is returned. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] @@ -1107,15 +1080,6 @@ impl SyncingChain { return None; } - // don't send batch requests until we have peers on sampling subnets - // TODO(das): this is a workaround to avoid sending out excessive block requests because - // block and data column requests are currently coupled. This can be removed once we find a - // way to decouple the requests and do retries individually, see issue #6258. - if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) { - debug!("Waiting for peers to be available on custody column subnets"); - return None; - } - // If no batch needs a retry, attempt to send the batch of the next epoch to download let next_batch_id = self.to_be_downloaded; // this batch could have been included already being an optimistic batch diff --git a/beacon_node/network/src/sync/range_sync/chain_collection.rs b/beacon_node/network/src/sync/range_sync/chain_collection.rs index 9f500c61e0b..454f7c02d15 100644 --- a/beacon_node/network/src/sync/range_sync/chain_collection.rs +++ b/beacon_node/network/src/sync/range_sync/chain_collection.rs @@ -54,6 +54,13 @@ pub struct ChainCollection { } impl ChainCollection { + #[cfg(test)] + pub(crate) fn iter(&self) -> impl Iterator> { + self.finalized_chains + .values() + .chain(self.head_chains.values()) + } + pub fn new(beacon_chain: Arc>) -> Self { ChainCollection { beacon_chain, diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index 1218e0cd09c..225b536d1de 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -9,10 +9,7 @@ mod sync_type; pub use batch::{ BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeers, BatchProcessingResult, BatchState, - ByRangeRequestType, }; pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; -#[cfg(test)] -pub use chain_collection::SyncChainStatus; pub use range::RangeSync; pub use sync_type::RangeSyncType; diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index e2c076484a5..62d18252683 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs 
@@ -46,6 +46,8 @@ use super::BatchPeers; use crate::metrics; use crate::status::ToStatusMessage; use crate::sync::network_context::{RpcResponseError, SyncNetworkContext}; +#[cfg(test)] +use crate::sync::range_sync::BatchState; use crate::sync::BatchProcessResult; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; @@ -100,10 +102,23 @@ where } #[cfg(test)] - pub(crate) fn __failed_chains(&mut self) -> Vec { + pub(crate) fn failed_chains(&mut self) -> Vec { self.failed_chains.keys().copied().collect() } + #[cfg(test)] + pub(crate) fn batches_state(&self) -> Vec<(ChainId, BatchId, &BatchState)> { + self.chains + .iter() + .flat_map(|chain| { + chain + .batches_state() + .into_iter() + .map(|(batch_id, state)| (chain.id(), batch_id, state)) + }) + .collect() + } + #[instrument(parent = None, level = "info", fields(component = "range_sync"), @@ -344,7 +359,6 @@ where pub fn inject_error( &mut self, network: &mut SyncNetworkContext, - peer_id: PeerId, batch_id: BatchId, chain_id: ChainId, request_id: Id, @@ -352,7 +366,7 @@ where ) { // check that this request is pending match self.chains.call_by_id(chain_id, |chain| { - chain.inject_error(network, batch_id, &peer_id, request_id, err) + chain.inject_error(network, batch_id, request_id, err) }) { Ok((removed_chain, sync_type)) => { if let Some((removed_chain, remove_reason)) = removed_chain { diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 84ff1c7e259..f26a467f273 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -36,7 +36,7 @@ use lighthouse_network::{ SamplingRequester, SingleLookupReqId, SyncRequestId, }, types::SyncState, - NetworkConfig, NetworkGlobals, PeerId, + NetworkConfig, NetworkGlobals, PeerId, SyncInfo, }; use slot_clock::{SlotClock, TestingSlotClock}; use tokio::sync::mpsc; @@ -44,8 +44,8 @@ use tracing::info; use types::{ data_column_sidecar::ColumnIndex, test_utils::{SeedableRng, TestRandom, XorShiftRng}, - BeaconState, BeaconStateBase, BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, ForkName, - Hash256, MinimalEthSpec as E, SignedBeaconBlock, Slot, + BeaconState, BeaconStateBase, BlobSidecar, DataColumnSidecar, DataColumnSidecarList, EthSpec, + ForkContext, ForkName, Hash256, MinimalEthSpec as E, SignedBeaconBlock, Slot, }; const D: Duration = Duration::new(0, 0); @@ -54,8 +54,21 @@ const SAMPLING_REQUIRED_SUCCESSES: usize = 2; type DCByRootIds = Vec; type DCByRootId = (SyncRequestId, Vec); +pub enum PeersConfig { + SupernodeAndRandom, + SupernodeOnly, +} + impl TestRig { pub fn test_setup() -> Self { + Self::test_setup_with_options(false) + } + + pub fn test_setup_as_supernode() -> Self { + Self::test_setup_with_options(true) + } + + fn test_setup_with_options(is_supernode: bool) -> Self { // Use `fork_from_env` logic to set correct fork epochs let spec = test_spec::(); @@ -84,10 +97,11 @@ impl TestRig { // TODO(das): make the generation of the ENR use the deterministic rng to have consistent // column assignments let network_config = Arc::new(NetworkConfig::default()); - let globals = Arc::new(NetworkGlobals::new_test_globals( + let globals = Arc::new(NetworkGlobals::new_test_globals_as_supernode( Vec::new(), network_config, chain.spec.clone(), + is_supernode, )); let (beacon_processor, beacon_processor_rx) = NetworkBeaconProcessor::null_for_testing( globals, @@ -116,6 +130,7 @@ impl TestRig { network_rx, network_rx_queue: vec![], sync_rx, + 
sent_blocks_by_range: <_>::default(), rng, network_globals: beacon_processor.network_globals.clone(), sync_manager: SyncManager::new( @@ -204,9 +219,7 @@ impl TestRig { generate_rand_block_and_blobs::(fork_name, num_blobs, rng, &self.spec) } - fn rand_block_and_data_columns( - &mut self, - ) -> (SignedBeaconBlock, Vec>>) { + fn rand_block_and_data_columns(&mut self) -> (SignedBeaconBlock, DataColumnSidecarList) { let num_blobs = NumBlobs::Number(1); generate_rand_block_and_data_columns::( self.fork_name, @@ -247,8 +260,8 @@ impl TestRig { self.sync_manager.active_parent_lookups().len() } - fn active_range_sync_chain(&self) -> (RangeSyncType, Slot, Slot) { - self.sync_manager.get_range_sync_chains().unwrap().unwrap() + fn active_range_sync_chain(&mut self) -> (RangeSyncType, Slot, Slot) { + self.sync_manager.range_sync().state().unwrap().unwrap() } fn assert_single_lookups_count(&self, count: usize) { @@ -358,29 +371,67 @@ impl TestRig { self.expect_empty_network(); } - pub fn new_connected_peer(&mut self) -> PeerId { + // Note: prefer to use `add_connected_peer_testing_only`. This is currently extensively used in + // lookup tests. We should consolidate this "add peer" methods in a future refactor + fn new_connected_peer(&mut self) -> PeerId { + self.add_connected_peer_testing_only(false) + } + + // Note: prefer to use `add_connected_peer_testing_only`. This is currently extensively used in + // lookup tests. We should consolidate this "add peer" methods in a future refactor + fn new_connected_supernode_peer(&mut self) -> PeerId { + self.add_connected_peer_testing_only(true) + } + + /// Add a random connected peer that is not known by the sync module + pub fn add_connected_peer_testing_only(&mut self, supernode: bool) -> PeerId { let key = self.determinstic_key(); let peer_id = self .network_globals .peers .write() - .__add_connected_peer_testing_only(false, &self.harness.spec, key); - self.log(&format!("Added new peer for testing {peer_id:?}")); + .__add_connected_peer_testing_only(supernode, &self.harness.spec, key); + let mut peer_custody_subnets = self + .network_globals + .peers + .read() + .peer_info(&peer_id) + .expect("peer was just added") + .custody_subnets_iter() + .map(|subnet| **subnet) + .collect::>(); + peer_custody_subnets.sort_unstable(); + self.log(&format!( + "Added new peer for testing {peer_id:?} custody subnets {peer_custody_subnets:?}" + )); peer_id } - pub fn new_connected_supernode_peer(&mut self) -> PeerId { - let key = self.determinstic_key(); - self.network_globals - .peers - .write() - .__add_connected_peer_testing_only(true, &self.harness.spec, key) + /// Add a random connected peer + add it to sync with a specific remote Status + pub fn add_sync_peer(&mut self, supernode: bool, remote_info: SyncInfo) -> PeerId { + let peer_id = self.add_connected_peer_testing_only(supernode); + self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); + peer_id } fn determinstic_key(&mut self) -> CombinedKey { k256::ecdsa::SigningKey::random(&mut self.rng).into() } + pub fn add_sync_peers(&mut self, config: PeersConfig, remote_info: SyncInfo) { + match config { + PeersConfig::SupernodeAndRandom => { + for _ in 0..100 { + self.add_sync_peer(false, remote_info.clone()); + } + self.add_sync_peer(true, remote_info); + } + PeersConfig::SupernodeOnly => { + self.add_sync_peer(true, remote_info); + } + } + } + pub fn new_connected_peers_for_peerdas(&mut self) { // Enough sampling peers with few columns for _ in 0..100 { @@ -675,7 +726,7 @@ impl TestRig { fn 
complete_valid_sampling_column_requests( &mut self, ids: DCByRootIds, - data_columns: Vec>>, + data_columns: DataColumnSidecarList, ) { for id in ids { self.log(&format!("return valid data column for {id:?}")); @@ -720,7 +771,7 @@ impl TestRig { fn complete_valid_custody_request( &mut self, ids: DCByRootIds, - data_columns: Vec>>, + data_columns: DataColumnSidecarList, missing_components: bool, ) { let lookup_id = if let SyncRequestId::DataColumnsByRoot(DataColumnsByRootRequestId { @@ -843,6 +894,19 @@ impl TestRig { } } + /// Similar to `pop_received_network_events` but finds matching events without removing them. + pub fn filter_received_network_events) -> Option>( + &mut self, + predicate_transform: F, + ) -> Vec { + self.drain_network_rx(); + + self.network_rx_queue + .iter() + .filter_map(predicate_transform) + .collect() + } + pub fn pop_received_processor_event) -> Option>( &mut self, predicate_transform: F, @@ -1091,6 +1155,16 @@ impl TestRig { } } + pub fn expect_no_penalty_for_anyone(&mut self) { + let downscore_events = self.filter_received_network_events(|ev| match ev { + NetworkMessage::ReportPeer { peer_id, msg, .. } => Some((*peer_id, *msg)), + _ => None, + }); + if !downscore_events.is_empty() { + panic!("Expected no downscoring events but found: {downscore_events:?}"); + } + } + #[track_caller] fn expect_parent_chain_process(&mut self) { match self.beacon_processor_rx.try_recv() { @@ -1126,6 +1200,25 @@ impl TestRig { } } + #[track_caller] + pub fn expect_penalties(&mut self, expected_penalty_msg: &'static str) { + let all_penalties = self.filter_received_network_events(|ev| match ev { + NetworkMessage::ReportPeer { peer_id, msg, .. } => Some((*peer_id, *msg)), + _ => None, + }); + if all_penalties + .iter() + .any(|(_, msg)| *msg != expected_penalty_msg) + { + panic!( + "Expected penalties only of {expected_penalty_msg}, but found {all_penalties:?}" + ); + } + self.log(&format!( + "Found expected penalties {expected_penalty_msg}: {all_penalties:?}" + )); + } + #[track_caller] pub fn expect_penalty(&mut self, peer_id: PeerId, expect_penalty_msg: &'static str) { let penalty_msg = self diff --git a/beacon_node/network/src/sync/tests/mod.rs b/beacon_node/network/src/sync/tests/mod.rs index 3dca4571086..cf8af7f5348 100644 --- a/beacon_node/network/src/sync/tests/mod.rs +++ b/beacon_node/network/src/sync/tests/mod.rs @@ -6,9 +6,12 @@ use beacon_chain::builder::Witness; use beacon_chain::eth1_chain::CachingEth1Backend; use beacon_chain::test_utils::{BeaconChainHarness, EphemeralHarnessType}; use beacon_processor::WorkEvent; +use lighthouse_network::service::api_types::ComponentsByRangeRequestId; use lighthouse_network::NetworkGlobals; +pub use lookups::PeersConfig; use rand_chacha::ChaCha20Rng; use slot_clock::ManualSlotClock; +use std::collections::HashMap; use std::fs::OpenOptions; use std::io::Write; use std::sync::{Arc, Once}; @@ -17,7 +20,7 @@ use tokio::sync::mpsc; use tracing_subscriber::fmt::MakeWriter; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::util::SubscriberInitExt; -use types::{ChainSpec, ForkName, MinimalEthSpec as E}; +use types::{ChainSpec, ForkName, MinimalEthSpec as E, SignedBeaconBlock}; mod lookups; mod range; @@ -69,6 +72,9 @@ struct TestRig { rng: ChaCha20Rng, fork_name: ForkName, spec: Arc, + + // Cache of sent blocks for PeerDAS responses + sent_blocks_by_range: HashMap>>>, } // Environment variable to read if `fork_from_env` feature is enabled. 
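The `requests_per_peer` map added to `SyncingChain` above is what lets range sync spread by-range requests over every peer of the chain rather than leaning on a single peer, which is the eclipse-attack concern noted in its doc comment. As a rough illustration only, and not the PR's actual selection logic, a least-requested-peer pick over that map could look like the sketch below; the helper name `least_requested_peer` and the bare `u64` stand-in for `PeerId` are assumptions made to keep it self-contained.

use std::collections::{HashMap, HashSet};

// Stand-in for lighthouse_network::PeerId, used only to keep this sketch self-contained.
type PeerId = u64;

/// Pick the chain peer that has served the fewest by-range requests so far, so load is
/// rotated across all peers of the SyncingChain instead of repeatedly hitting the same one.
fn least_requested_peer(
    peers: &HashSet<PeerId>,
    requests_per_peer: &HashMap<PeerId, usize>,
) -> Option<PeerId> {
    peers
        .iter()
        .min_by_key(|peer| requests_per_peer.get(*peer).copied().unwrap_or(0))
        .copied()
}

In the diff itself the chain does not pick the peer directly: it hands `self.peers.clone()` and `&self.requests_per_peer` to `block_components_by_range_request` and lets the network context make the choice.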
diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 06dca355e53..642f92ee664 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -2,27 +2,33 @@ use super::*; use crate::network_beacon_processor::ChainSegmentProcessId; use crate::status::ToStatusMessage; use crate::sync::manager::SLOT_IMPORT_TOLERANCE; -use crate::sync::network_context::RangeRequestId; -use crate::sync::range_sync::RangeSyncType; -use crate::sync::SyncMessage; +use crate::sync::network_context::{BlockComponentsByRangeRequestStep, RangeRequestId}; +use crate::sync::range_sync::{BatchId, BatchState, RangeSyncType}; +use crate::sync::{ChainId, SyncMessage}; use beacon_chain::data_column_verification::CustodyDataColumn; -use beacon_chain::test_utils::{AttestationStrategy, BlockStrategy}; +use beacon_chain::test_utils::{test_spec, AttestationStrategy, BlockStrategy}; use beacon_chain::{block_verification_types::RpcBlock, EngineState, NotifyExecutionLayer}; use beacon_processor::WorkType; +use lighthouse_network::discovery::{peer_id_to_node_id, CombinedKey}; use lighthouse_network::rpc::methods::{ BlobsByRangeRequest, DataColumnsByRangeRequest, OldBlocksByRangeRequest, - OldBlocksByRangeRequestV2, }; use lighthouse_network::rpc::{RequestType, StatusMessage}; use lighthouse_network::service::api_types::{ - AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId, - SyncRequestId, + AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, + DataColumnsByRangeRequestId, SyncRequestId, }; -use lighthouse_network::{PeerId, SyncInfo}; +use lighthouse_network::types::SyncState; +use lighthouse_network::{Enr, EnrExt, PeerId, SyncInfo}; +use rand::SeedableRng; +use rand_chacha::ChaCha20Rng; +use std::collections::HashSet; use std::time::Duration; +use types::data_column_custody_group::compute_subnets_for_node; use types::{ - BlobSidecarList, BlockImportSource, Epoch, EthSpec, Hash256, MinimalEthSpec as E, - SignedBeaconBlock, SignedBeaconBlockHash, Slot, + BeaconBlock, BlobSidecarList, BlockImportSource, ColumnIndex, DataColumnSidecar, + DataColumnSubnetId, Epoch, EthSpec, Hash256, KzgCommitment, MinimalEthSpec as E, Signature, + SignedBeaconBlock, SignedBeaconBlockHash, Slot, VariableList, }; const D: Duration = Duration::new(0, 0); @@ -34,10 +40,44 @@ pub(crate) enum DataSidecars { enum ByRangeDataRequestIds { PreDeneb, - PrePeerDAS(BlobsByRangeRequestId, PeerId), - PostPeerDAS(Vec<(DataColumnsByRangeRequestId, PeerId)>), + PrePeerDAS(BlobsByRangeRequestId, PeerId, BlobsByRangeRequest), + PostPeerDAS( + Vec<( + DataColumnsByRangeRequestId, + PeerId, + DataColumnsByRangeRequest, + )>, + ), } +impl ByRangeDataRequestIds { + /// If there's a single active request, returns its peer, else panics + fn peer(&self) -> PeerId { + match self { + Self::PreDeneb => panic!("no requests PreDeneb"), + Self::PrePeerDAS(_, peer, _) => *peer, + Self::PostPeerDAS(reqs) => { + if reqs.len() != 1 { + panic!("Should have 1 PostPeerDAS request"); + } + reqs.first().expect("no PostPeerDAS requests").1 + } + } + } +} + +struct Config { + peers: PeersConfig, +} + +type BlocksByRangeRequestData = (BlocksByRangeRequestId, PeerId, OldBlocksByRangeRequest); + +type DataColumnsByRangeRequestData = ( + DataColumnsByRangeRequestId, + PeerId, + DataColumnsByRangeRequest, +); + /// Sync tests are usually written in the form: /// - Do some action /// - Expect a request to be sent @@ -46,10 +86,11 
@@ enum ByRangeDataRequestIds { /// To make writting tests succint, the machinery in this testing rig automatically identifies /// _which_ request to complete. Picking the right request is critical for tests to pass, so this /// filter allows better expressivity on the criteria to identify the right request. -#[derive(Default, Debug, Clone)] +#[derive(Default, Debug, Clone, Copy)] struct RequestFilter { peer: Option, epoch: Option, + column_index: Option, } impl RequestFilter { @@ -62,13 +103,115 @@ impl RequestFilter { self.epoch = Some(epoch); self } + + fn column_index(mut self, index: u64) -> Self { + self.column_index = Some(index); + self + } + + fn blocks_by_range_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::BlocksByRange(req), + app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), + } if self.matches_blocks_by_range(peer_id, req) => Some((*id, *peer_id, req.clone())), + _ => None, + } + } + + fn data_columns_by_range_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::DataColumnsByRange(req), + app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), + } if self.matches_data_columns_by_range(peer_id, req) => { + Some((*id, *peer_id, req.clone())) + } + _ => None, + } + } + + fn matches_blocks_by_range(&self, peer: &PeerId, req: &OldBlocksByRangeRequest) -> bool { + self.matches_common(peer, *req.start_slot()) + } + + fn matches_blobs_by_range(&self, peer: &PeerId, req: &BlobsByRangeRequest) -> bool { + self.matches_common(peer, req.start_slot) + } + + fn matches_data_columns_by_range( + &self, + peer: &PeerId, + req: &DataColumnsByRangeRequest, + ) -> bool { + if let Some(index) = self.column_index { + if !req.columns.contains(&index) { + return false; + } + } + self.matches_common(peer, req.start_slot) + } + + fn matches_common(&self, peer: &PeerId, start_slot: u64) -> bool { + if let Some(expected_epoch) = self.epoch { + let epoch = Slot::new(start_slot).epoch(E::slots_per_epoch()).as_u64(); + if epoch != expected_epoch { + return false; + } + } + if let Some(expected_peer) = self.peer { + if *peer != expected_peer { + return false; + } + } + true + } } fn filter() -> RequestFilter { RequestFilter::default() } +/// Instruct the testing rig how to complete requests for _by_range requests +#[derive(Debug, Clone, Copy)] +struct CompleteConfig { + block_count: usize, + with_data: bool, + custody_failure_at_index: Option, +} + +impl CompleteConfig { + fn custody_failure_at_index(mut self, index: u64) -> Self { + self.custody_failure_at_index = Some(index); + self + } +} + +fn complete() -> CompleteConfig { + CompleteConfig { + block_count: 1, + with_data: true, + custody_failure_at_index: None, + } +} + impl TestRig { + fn our_custody_indices(&self) -> Vec { + self.network_globals + .sampling_columns + .iter() + .copied() + .collect() + } + /// Produce a head peer with an advanced head fn add_head_peer(&mut self) -> PeerId { self.add_head_peer_with_root(Hash256::random()) @@ -77,7 +220,7 @@ impl TestRig { /// Produce a head peer with an advanced head fn add_head_peer_with_root(&mut self, head_root: Hash256) -> PeerId { let local_info = self.local_info(); - self.add_random_peer(SyncInfo { + self.add_connected_sync_random_peer(SyncInfo { head_root, head_slot: local_info.head_slot + 1 + Slot::new(SLOT_IMPORT_TOLERANCE as u64), ..local_info @@ -93,7 +236,7 @@ impl TestRig { fn 
add_finalized_peer_with_root(&mut self, finalized_root: Hash256) -> PeerId { let local_info = self.local_info(); let finalized_epoch = local_info.finalized_epoch + 2; - self.add_random_peer(SyncInfo { + self.add_connected_sync_random_peer(SyncInfo { finalized_epoch, finalized_root, head_slot: finalized_epoch.start_slot(E::slots_per_epoch()), @@ -128,37 +271,22 @@ impl TestRig { } } - fn add_random_peer_not_supernode(&mut self, remote_info: SyncInfo) -> PeerId { - let peer_id = self.new_connected_peer(); - self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); - peer_id + fn add_connected_sync_peer_not_supernode(&mut self, remote_info: SyncInfo) -> PeerId { + self.add_sync_peer(false, remote_info) } - fn add_random_peer(&mut self, remote_info: SyncInfo) -> PeerId { + fn add_connected_sync_random_peer(&mut self, remote_info: SyncInfo) -> PeerId { // Create valid peer known to network globals // TODO(fulu): Using supernode peers to ensure we have peer across all column // subnets for syncing. Should add tests connecting to full node peers. - let peer_id = self.new_connected_supernode_peer(); - // Send peer to sync - self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); - peer_id - } - - fn add_random_peers(&mut self, remote_info: SyncInfo, count: usize) { - for _ in 0..count { - let peer = self.new_connected_peer(); - self.add_peer(peer, remote_info.clone()); - } + self.add_sync_peer(true, remote_info) } - fn add_peer(&mut self, peer: PeerId, remote_info: SyncInfo) { - self.send_sync_message(SyncMessage::AddPeer(peer, remote_info)); - } - - fn assert_state(&self, state: RangeSyncType) { + fn assert_state(&mut self, state: RangeSyncType) { assert_eq!( self.sync_manager - .range_sync_state() + .range_sync() + .state() .expect("State is ok") .expect("Range should be syncing, there are no chains") .0, @@ -167,15 +295,35 @@ impl TestRig { ); } - fn assert_no_chains_exist(&self) { - if let Some(chain) = self.sync_manager.get_range_sync_chains().unwrap() { + fn get_sync_state(&mut self) -> SyncState { + self.sync_manager.network().network_globals().sync_state() + } + + fn get_batch_states(&mut self) -> Vec<(ChainId, BatchId, &BatchState)> { + self.sync_manager.range_sync().batches_state() + } + + fn assert_sync_state(&mut self, expected_state: SyncState) { + let current_state = self.sync_manager.network().network_globals().sync_state(); + assert_eq!(current_state, expected_state); + } + + fn assert_syncing_finalized(&mut self) { + self.assert_sync_state(SyncState::SyncingFinalized { + start_slot: Slot::new(0), + target_slot: Slot::new(0), + }); + } + + fn assert_no_chains_exist(&mut self) { + if let Some(chain) = self.sync_manager.range_sync().state().unwrap() { panic!("There still exists a chain {chain:?}"); } } fn assert_no_failed_chains(&mut self) { assert_eq!( - self.sync_manager.__range_failed_chains(), + self.sync_manager.range_sync().failed_chains(), Vec::::new(), "Expected no failed chains" ) @@ -191,110 +339,363 @@ impl TestRig { } } + fn expect_no_data_columns_by_range_requests(&mut self, request_filter: RequestFilter) { + let events = self + .filter_received_network_events(|ev| request_filter.data_columns_by_range_requests(ev)); + if !events.is_empty() { + panic!("Expected to not find data_columns_by_range requests {request_filter:?} by found {events:?}") + } + } + + fn expect_active_block_components_by_range_request_on_custody_step(&mut self) { + let requests = self + .sync_manager + .network() + .active_block_components_by_range_requests(); + if 
requests.is_empty() { + panic!("No active block_components_by_range requests"); + } + for (id, step) in requests { + if !matches!(step, BlockComponentsByRangeRequestStep::CustodyRequest) { + panic!("block_components_by_range request {id} is not on CustodyRequest step: {step:?}"); + } + } + } + + fn expect_no_active_block_components_by_range_requests(&mut self) { + let requests = self + .sync_manager + .network() + .active_block_components_by_range_requests(); + if !requests.is_empty() { + panic!("Still active block_components_by_range requests {requests:?}"); + } + } + + fn expect_no_active_rpc_requests(&mut self) { + let requests = self + .sync_manager + .network() + .active_requests() + .collect::>(); + if !requests.is_empty() { + panic!("There are still active RPC requests {requests:?}"); + } + } + + fn expect_all_batches_in_state) -> bool>( + &mut self, + predicate: F, + expected_state: &'static str, + ) { + let batches = self.get_batch_states(); + if batches.is_empty() { + panic!("no batches"); + } + for (chain_id, batch_id, state) in &batches { + if !predicate(state) { + panic!("batch {chain_id} {batch_id} not in state {expected_state}, {state}"); + } + } + } + + fn expect_all_batches_downloading(&mut self) { + self.expect_all_batches_in_state( + |state| matches!(state, BatchState::Downloading { .. }), + "Downloading", + ); + } + + fn expect_all_batches_processing_or_awaiting(&mut self) { + self.expect_all_batches_in_state( + |state| { + matches!( + state, + BatchState::Processing { .. } | BatchState::AwaitingProcessing { .. } + ) + }, + "Processing or AwaitingProcessing", + ); + } + fn update_execution_engine_state(&mut self, state: EngineState) { self.log(&format!("execution engine state updated: {state:?}")); self.sync_manager.update_execution_engine_state(state); } - fn find_blocks_by_range_request( - &mut self, - request_filter: RequestFilter, - ) -> ((BlocksByRangeRequestId, PeerId), ByRangeDataRequestIds) { - let filter_f = |peer: PeerId, start_slot: u64| { - if let Some(expected_epoch) = request_filter.epoch { - let epoch = Slot::new(start_slot).epoch(E::slots_per_epoch()).as_u64(); - if epoch != expected_epoch { - return false; - } - } - if let Some(expected_peer) = request_filter.peer { - if peer != expected_peer { - return false; - } + fn zero_block_at_slot(&mut self, slot: Slot, with_data: bool) -> Arc> { + let mut block = BeaconBlock::empty(&self.spec); + if with_data { + if let Ok(blob_kzg_commitments) = block.body_mut().blob_kzg_commitments_mut() { + blob_kzg_commitments + .push(KzgCommitment([0; 48])) + .expect("pushed to empty kzg commitments"); } - true - }; + } + *block.slot_mut() = slot; + Arc::new(SignedBeaconBlock::from_block(block, Signature::empty())) + } - let block_req = self - .pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id, - request: - RequestType::BlocksByRange(OldBlocksByRangeRequest::V2( - OldBlocksByRangeRequestV2 { start_slot, .. 
}, - )), - app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), - } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), - _ => None, - }) + fn last_sent_blocks_by_range( + &mut self, + id: ComponentsByRangeRequestId, + ) -> Vec>> { + self.sent_blocks_by_range + .get(&id) + .cloned() + .unwrap_or_else(|| panic!("No blocks for ComponentsByRangeRequestId {id}")) + } + + fn send_blocks_by_range_response( + &mut self, + req_id: BlocksByRangeRequestId, + peer_id: PeerId, + blocks: &[Arc>], + ) { + let slots = blocks.iter().map(|block| block.slot()).collect::>(); + self.log(&format!( + "Completing BlocksByRange request {req_id} to {peer_id} with blocks {slots:?}" + )); + + for block in blocks { + self.send_sync_message(SyncMessage::RpcBlock { + sync_request_id: SyncRequestId::BlocksByRange(req_id), + peer_id, + beacon_block: Some(block.clone()), + seen_timestamp: D, + }); + } + self.send_sync_message(SyncMessage::RpcBlock { + sync_request_id: SyncRequestId::BlocksByRange(req_id), + peer_id, + beacon_block: None, + seen_timestamp: D, + }); + + if self + .sent_blocks_by_range + .insert(req_id.parent_request_id, blocks.to_vec()) + .is_some() + { + panic!("Sent two blocks_by_range requests in the same epoch. We need better tracking"); + } + } + + fn send_data_columns_by_range_response( + &mut self, + id: DataColumnsByRangeRequestId, + peer_id: PeerId, + data_columns: &[Arc>], + ) { + let mut ids = data_columns + .iter() + .map(|d| (d.slot().as_u64(), d.index)) + .collect::>(); + ids.sort_unstable(); + self.log(&format!( + "Completing DataColumnsByRange request {id} to {peer_id} with data_columns {ids:?}" + )); + + for data_column in data_columns { + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: SyncRequestId::DataColumnsByRange(id), + peer_id, + data_column: Some(data_column.clone()), + seen_timestamp: D, + }); + } + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: SyncRequestId::DataColumnsByRange(id), + peer_id, + data_column: None, + seen_timestamp: D, + }); + } + + fn pop_blocks_by_range_request( + &mut self, + request_filter: RequestFilter, + ) -> (BlocksByRangeRequestId, PeerId, OldBlocksByRangeRequest) { + self.pop_received_network_event(|ev| request_filter.blocks_by_range_requests(ev)) .unwrap_or_else(|e| { panic!("Should have a BlocksByRange request, filter {request_filter:?}: {e:?}") - }); + }) + } - let by_range_data_requests = if self.after_fulu() { - let mut data_columns_requests = vec![]; - while let Ok(data_columns_request) = self.pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id, - request: - RequestType::DataColumnsByRange(DataColumnsByRangeRequest { - start_slot, .. 
- }), - app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), - } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), - _ => None, - }) { - data_columns_requests.push(data_columns_request); - } + fn pop_data_columns_by_range_requests( + &mut self, + request_filter: RequestFilter, + ) -> Vec<( + DataColumnsByRangeRequestId, + PeerId, + DataColumnsByRangeRequest, + )> { + let mut data_columns_requests = vec![]; + while let Ok(data_columns_request) = + self.pop_received_network_event(|ev| request_filter.data_columns_by_range_requests(ev)) + { + data_columns_requests.push(data_columns_request); + } + data_columns_requests + } + + fn find_data_by_range_request( + &mut self, + request_filter: RequestFilter, + ) -> ByRangeDataRequestIds { + if self.after_fulu() { + let data_columns_requests = self.pop_data_columns_by_range_requests(request_filter); if data_columns_requests.is_empty() { panic!("Found zero DataColumnsByRange requests, filter {request_filter:?}"); } ByRangeDataRequestIds::PostPeerDAS(data_columns_requests) } else if self.after_deneb() { - let (id, peer) = self + let (id, peer, req) = self .pop_received_network_event(|ev| match ev { NetworkMessage::SendRequest { peer_id, - request: RequestType::BlobsByRange(BlobsByRangeRequest { start_slot, .. }), + request: RequestType::BlobsByRange(req), app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRange(id)), - } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), + } if request_filter.matches_blobs_by_range(peer_id, req) => { + Some((*id, *peer_id, req.clone())) + } _ => None, }) .unwrap_or_else(|e| { panic!("Should have a blobs by range request, filter {request_filter:?}: {e:?}") }); - ByRangeDataRequestIds::PrePeerDAS(id, peer) + ByRangeDataRequestIds::PrePeerDAS(id, peer, req) } else { ByRangeDataRequestIds::PreDeneb - }; + } + } - (block_req, by_range_data_requests) + fn find_and_complete_block_components_by_range_request( + &mut self, + request_filter: RequestFilter, + complete_config: CompleteConfig, + ) -> RangeRequestId { + let id = self.find_and_complete_blocks_by_range_request(request_filter, complete_config); + self.find_and_complete_data_by_range_request(request_filter, complete_config); + id } fn find_and_complete_blocks_by_range_request( &mut self, request_filter: RequestFilter, + complete_config: CompleteConfig, ) -> RangeRequestId { - let ((blocks_req_id, block_peer), by_range_data_request_ids) = - self.find_blocks_by_range_request(request_filter); + let (blocks_req_id, block_peer, blocks_req) = + self.pop_blocks_by_range_request(request_filter); - // Complete the request with a single stream termination - self.log(&format!( - "Completing BlocksByRange request {blocks_req_id:?} with empty stream" - )); - self.send_sync_message(SyncMessage::RpcBlock { - sync_request_id: SyncRequestId::BlocksByRange(blocks_req_id), - peer_id: block_peer, - beacon_block: None, - seen_timestamp: D, - }); + let start_slot = Slot::new(*blocks_req.start_slot()); + let blocks = (0..complete_config.block_count) + .map(|i| { + self.zero_block_at_slot(start_slot + Slot::new(i as u64), complete_config.with_data) + }) + .collect::>(); + self.send_blocks_by_range_response(blocks_req_id, block_peer, &blocks); + + blocks_req_id.parent_request_id.requester + } + + fn complete_blocks_by_range_request( + &mut self, + request: BlocksByRangeRequestData, + complete_config: CompleteConfig, + ) -> RangeRequestId { + let (blocks_req_id, block_peer, blocks_req) = request; + let start_slot = 
Slot::new(*blocks_req.start_slot()); + let blocks = (0..complete_config.block_count) + .map(|i| { + self.zero_block_at_slot(start_slot + Slot::new(i as u64), complete_config.with_data) + }) + .collect::>(); + self.send_blocks_by_range_response(blocks_req_id, block_peer, &blocks); + + blocks_req_id.parent_request_id.requester + } + + fn complete_data_columns_by_range_request( + &mut self, + (id, peer_id, req): DataColumnsByRangeRequestData, + complete_config: CompleteConfig, + ) { + // To reply with a valid DataColumnsByRange we need to construct + // DataColumnsByRange for the block root that we requested the block peer, plus + // figure out which exact columns we requested this peer + + let components_by_range_req_id = id.parent_request_id.parent_request_id; + let blocks = self.last_sent_blocks_by_range(components_by_range_req_id); + + let data_columns = blocks + .iter() + .flat_map(|block| { + let kzg_commitments_inclusion_proof = block + .message() + .body() + .kzg_commitments_merkle_proof() + .unwrap(); + let kzg_commitments = block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .clone(); + let signed_block_header = block.signed_block_header(); + + req.columns.iter().filter_map(move |index| { + // Skip column generation if index is marked as failure + if complete_config.custody_failure_at_index == Some(*index) { + return None; + } + + // We need to produce a DataColumn with valid inclusion proof, but can + // be with random KZG proof and data as we won't send it for processing + Some(Arc::new(DataColumnSidecar { + index: *index, + column: VariableList::empty(), + kzg_commitments: kzg_commitments.clone(), + kzg_proofs: VariableList::from(vec![]), + signed_block_header: signed_block_header.clone(), + kzg_commitments_inclusion_proof: kzg_commitments_inclusion_proof.clone(), + })) + }) + }) + .collect::>(); + + // Need to log here because I can't capture &mut self inside the columns iter + if !blocks.is_empty() { + if let Some(index) = complete_config.custody_failure_at_index { + self.log(&format!( + "Forced custody failure at request {id} for peer {peer_id} index {index:?}" + )); + } + } + + self.send_data_columns_by_range_response(id, peer_id, &data_columns); + } + fn find_and_complete_data_by_range_request( + &mut self, + request_filter: RequestFilter, + complete_config: CompleteConfig, + ) { + let by_range_data_request_ids = self.find_data_by_range_request(request_filter); + self.complete_data_by_range_request(by_range_data_request_ids, complete_config); + } + + fn complete_data_by_range_request( + &mut self, + by_range_data_request_ids: ByRangeDataRequestIds, + complete_config: CompleteConfig, + ) { match by_range_data_request_ids { ByRangeDataRequestIds::PreDeneb => {} - ByRangeDataRequestIds::PrePeerDAS(id, peer_id) => { + ByRangeDataRequestIds::PrePeerDAS(id, peer_id, req) => { // Complete the request with a single stream termination self.log(&format!( - "Completing BlobsByRange request {id:?} with empty stream" + "Completing BlobsByRange request {id} {req:?} with empty stream" )); self.send_sync_message(SyncMessage::RpcBlob { sync_request_id: SyncRequestId::BlobsByRange(id), @@ -305,21 +706,89 @@ impl TestRig { } ByRangeDataRequestIds::PostPeerDAS(data_column_req_ids) => { // Complete the request with a single stream termination - for (id, peer_id) in data_column_req_ids { - self.log(&format!( - "Completing DataColumnsByRange request {id:?} with empty stream" - )); - self.send_sync_message(SyncMessage::RpcDataColumn { - sync_request_id: 
SyncRequestId::DataColumnsByRange(id), - peer_id, - data_column: None, - seen_timestamp: D, - }); + for (id, peer_id, req) in data_column_req_ids { + // To reply with a valid DataColumnsByRange we need to construct + // DataColumnsByRange for the block root that we requested the block peer, plus + // figure out which exact columns we requested this peer + + let components_by_range_req_id = id.parent_request_id.parent_request_id; + let blocks = self.last_sent_blocks_by_range(components_by_range_req_id); + + let data_columns = blocks + .iter() + .flat_map(|block| { + let kzg_commitments_inclusion_proof = block + .message() + .body() + .kzg_commitments_merkle_proof() + .unwrap(); + let kzg_commitments = block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .clone(); + let signed_block_header = block.signed_block_header(); + + req.columns.iter().filter_map(move |index| { + // Skip column generation if index is marked as failure + if complete_config.custody_failure_at_index == Some(*index) { + return None; + } + + // We need to produce a DataColumn with valid inclusion proof, but can + // be with random KZG proof and data as we won't send it for processing + Some(Arc::new(DataColumnSidecar { + index: *index, + column: VariableList::empty(), + kzg_commitments: kzg_commitments.clone(), + kzg_proofs: VariableList::from(vec![]), + signed_block_header: signed_block_header.clone(), + kzg_commitments_inclusion_proof: + kzg_commitments_inclusion_proof.clone(), + })) + }) + }) + .collect::>(); + + // Need to log here because I can't capture &mut self inside the columns iter + if !blocks.is_empty() { + if let Some(index) = complete_config.custody_failure_at_index { + self.log(&format!("Forced custody failure at request {id} for peer {peer_id} index {index:?}")); + } + } + + self.send_data_columns_by_range_response(id, peer_id, &data_columns); } } } + } - blocks_req_id.parent_request_id.requester + fn progress_until_no_events( + &mut self, + request_filter: RequestFilter, + complete_config: CompleteConfig, + ) { + loop { + if let Ok(request) = + self.pop_received_network_event(|ev| request_filter.blocks_by_range_requests(ev)) + { + self.complete_blocks_by_range_request(request, complete_config); + continue; + } + + if let Ok(request) = self + .pop_received_network_event(|ev| request_filter.data_columns_by_range_requests(ev)) + { + self.complete_data_columns_by_range_request(request, complete_config); + continue; + } + + let sync_state = self.get_sync_state(); + self.log(&format!("Progressed sync, current state: {:?}", sync_state,)); + + return; + } } fn find_and_complete_processing_chain_segment(&mut self, id: ChainSegmentProcessId) { @@ -344,15 +813,18 @@ impl TestRig { &mut self, last_epoch: u64, request_filter: RequestFilter, + complete_config: CompleteConfig, ) { for epoch in 0..last_epoch { // Note: In this test we can't predict the block peer - let id = - self.find_and_complete_blocks_by_range_request(request_filter.clone().epoch(epoch)); + let id = self.find_and_complete_block_components_by_range_request( + request_filter.epoch(epoch), + complete_config, + ); if let RangeRequestId::RangeSync { batch_id, .. } = id { assert_eq!(batch_id.as_u64(), epoch, "Unexpected batch_id"); } else { - panic!("unexpected RangeRequestId {id:?}"); + panic!("unexpected RangeRequestId {id}"); } let id = match id { @@ -476,14 +948,14 @@ fn head_chain_removed_while_finalized_syncing() { rig.assert_state(RangeSyncType::Head); // Sync should have requested a batch, grab the request. 
- let _ = rig.find_blocks_by_range_request(filter().peer(head_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(head_peer)); // Now get a peer with an advanced finalized epoch. let finalized_peer = rig.add_finalized_peer(); rig.assert_state(RangeSyncType::Finalized); // Sync should have requested a batch, grab the request - let _ = rig.find_blocks_by_range_request(filter().peer(finalized_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(finalized_peer)); // Fail the head chain by disconnecting the peer. rig.peer_disconnected(head_peer); @@ -510,14 +982,14 @@ async fn state_update_while_purging() { rig.assert_state(RangeSyncType::Head); // Sync should have requested a batch, grab the request. - let _ = rig.find_blocks_by_range_request(filter().peer(head_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(head_peer)); // Now get a peer with an advanced finalized epoch. let finalized_peer = rig.add_finalized_peer_with_root(finalized_peer_root); rig.assert_state(RangeSyncType::Finalized); // Sync should have requested a batch, grab the request - let _ = rig.find_blocks_by_range_request(filter().peer(finalized_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(finalized_peer)); // Now the chain knows both chains target roots. rig.remember_block(head_peer_block).await; @@ -536,7 +1008,10 @@ fn pause_and_resume_on_ee_offline() { // make the ee offline rig.update_execution_engine_state(EngineState::Offline); // send the response to the request - rig.find_and_complete_blocks_by_range_request(filter().peer(peer1).epoch(0)); + rig.find_and_complete_block_components_by_range_request( + filter().peer(peer1).epoch(0), + complete(), + ); // the beacon processor shouldn't have received any work rig.expect_empty_processor(); @@ -547,7 +1022,7 @@ fn pause_and_resume_on_ee_offline() { // Don't filter requests and the columns requests may be sent to peer1 or peer2 // We need to filter by epoch, because the previous batch eagerly sent requests for the next // epoch for the other batch. So we can either filter by epoch of by sync type. - rig.find_and_complete_blocks_by_range_request(filter().epoch(0)); + rig.find_and_complete_block_components_by_range_request(filter().epoch(0), complete()); // the beacon processor shouldn't have received any work rig.expect_empty_processor(); // make the beacon processor available again. 
@@ -576,19 +1051,34 @@ fn finalized_sync_enough_global_custody_peers_few_chain_peers() { // Current priorization only sends batches to idle peers, so we need enough peers for each batch // TODO: Test this with a single peer in the chain, it should still work - r.add_random_peers( - remote_info, - (advanced_epochs + EXTRA_SYNCED_EPOCHS) as usize, - ); + r.add_sync_peer(false, remote_info); r.assert_state(RangeSyncType::Finalized); let last_epoch = advanced_epochs + EXTRA_SYNCED_EPOCHS; - r.complete_and_process_range_sync_until(last_epoch, filter()); + r.complete_and_process_range_sync_until(last_epoch, filter(), complete()); } +// Same test with different types of peers: +// - 100 peers +// - 1 supernode +// - perfectly distributed peer ids + #[test] -fn finalized_sync_not_enough_custody_peers_on_start() { - let mut r = TestRig::test_setup(); +fn finalized_sync_not_enough_custody_peers_on_start_supernode_only() { + finalized_sync_not_enough_custody_peers_on_start(Config { + peers: PeersConfig::SupernodeOnly, + }); +} + +#[test] +fn finalized_sync_not_enough_custody_peers_on_start_supernode_and_random() { + finalized_sync_not_enough_custody_peers_on_start(Config { + peers: PeersConfig::SupernodeAndRandom, + }); +} + +fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { + let mut r = TestRig::test_setup_as_supernode(); // Only run post-PeerDAS if !r.fork_name.fulu_enabled() { return; @@ -599,24 +1089,158 @@ // Unikely that the single peer we added has enough columns for us. Tests are determinstic and // this error should never be hit - r.add_random_peer_not_supernode(remote_info.clone()); - r.assert_state(RangeSyncType::Finalized); + r.add_connected_sync_peer_not_supernode(remote_info.clone()); + r.assert_syncing_finalized(); + + // The SyncingChain has a single peer, so it can issue blocks_by_range requests. However, it + // doesn't have enough peers to cover all columns. + r.progress_until_no_events(filter(), complete()); + r.expect_no_active_rpc_requests(); - // Because we don't have enough peers on all columns we haven't sent any request. - // NOTE: There's a small chance that this single peer happens to custody exactly the set we - // expect, in that case the test will fail. Find a way to make the test deterministic. - r.expect_empty_network(); + // Here we have a batch with partially completed block_components_by_range requests. The batch + // should not have failed, we are still syncing, and there are no downscoring events. + r.expect_no_penalty_for_anyone(); + r.expect_active_block_components_by_range_request_on_custody_step(); // Generate enough peers and supernodes to cover all custody columns - r.new_connected_peers_for_peerdas(); + r.add_sync_peers(config.peers, remote_info.clone()); // Note: not necessary to add this peers to the chain, as we draw from the global pool // We still need to add enough peers to trigger batch downloads with idle peers. Same issue as // the test above. - r.add_random_peers( - remote_info, - (advanced_epochs + EXTRA_SYNCED_EPOCHS - 1) as usize, + + r.progress_until_no_events(filter(), complete()); + r.expect_no_active_rpc_requests(); + r.expect_no_active_block_components_by_range_requests(); + // TODO(das): For now these tests don't complete sync. We can't track beacon processor Work + // events from here easily. What we pop from the beacon processor queue is an opaque closure + // without any information. We don't know what batch it is for.
+} + +#[test] +fn finalized_sync_single_custody_peer_failure() { + let mut r = TestRig::test_setup(); + // Only run post-PeerDAS + if !r.fork_name.fulu_enabled() { + return; + } + + let advanced_epochs: u64 = 2; + let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); + let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); + + r.add_sync_peer(true, remote_info.clone()); + r.assert_state(RangeSyncType::Finalized); + + // Progress all blocks_by_range and columns_by_range requests but respond empty for a single + // column index + r.progress_until_no_events( + filter(), + complete().custody_failure_at_index(column_index_to_fail), + ); + r.expect_penalties("custody_failure"); + + // Some peer had a custody failure, but since there's a single peer in the batch we won't issue + // another request yet. + r.expect_no_active_rpc_requests(); + // Ensure that the block components by range request has not failed + r.expect_active_block_components_by_range_request_on_custody_step(); + r.expect_all_batches_downloading(); + + // After adding a new peer we will try to fetch from it + r.add_sync_peer(true, remote_info.clone()); + r.progress_until_no_events( + // Find the requests first to assert that this is the only request that exists + filter().column_index(column_index_to_fail), + // complete this one request without the custody failure now + complete(), ); - let last_epoch = advanced_epochs + EXTRA_SYNCED_EPOCHS; - r.complete_and_process_range_sync_until(last_epoch, filter()); + r.expect_no_active_rpc_requests(); + r.expect_no_active_block_components_by_range_requests(); + r.expect_all_batches_processing_or_awaiting(); +} + +#[test] +fn finalized_sync_permanent_custody_peer_failure() { + let mut r = TestRig::test_setup(); + // Only run post-PeerDAS + if !r.fork_name.fulu_enabled() { + return; + } + + let advanced_epochs: u64 = 2; + let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); + let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); + const PEERS_IN_BATCH: usize = 4; + + for _ in 0..PEERS_IN_BATCH { + r.add_connected_sync_random_peer(remote_info.clone()); + } + r.assert_state(RangeSyncType::Finalized); + + // Some peer had a custody failure at `column_index` so sync should do a single extra request + // for that index and epoch. + r.find_and_complete_block_components_by_range_request( + filter().epoch(0), + complete().custody_failure_at_index(column_index_to_fail), + ); + + let mut requested_peers = HashSet::new(); + + for i in 0..PEERS_IN_BATCH - 1 { + r.log(&format!("Loop {i} of custody failure round")); + + // Some peer had a custody failure at `column_index` so sync should do a single extra request + // for that index and epoch. We want to make sure that the request goes to a different peer + // than the attempts before.
+ let reqs = + r.find_data_by_range_request(filter().epoch(0).column_index(column_index_to_fail)); + let req_peer = reqs.peer(); + if requested_peers.contains(&req_peer) { + panic!("Re-requested the same peer {req_peer} again after a custody failure"); + } + requested_peers.insert(req_peer); + + // Find the requests first to assert that this is the only request that exists + r.expect_no_data_columns_by_range_requests(filter().epoch(0)); + r.complete_data_by_range_request( + reqs, + complete().custody_failure_at_index(column_index_to_fail), + ); + } + + // custody_by_range request is still active waiting for a new peer to connect + r.expect_active_block_components_by_range_request_on_custody_step(); +} + +#[test] +#[ignore] +fn mine_peerids() { + let spec = test_spec::(); + let mut rng = ChaCha20Rng::from_seed([0u8; 32]); + + let expected_subnets = (0..3) + .map(|i| DataColumnSubnetId::new(i as u64)) + .collect::>(); + + for i in 0..usize::MAX { + let key: CombinedKey = k256::ecdsa::SigningKey::random(&mut rng).into(); + let enr = Enr::builder().build(&key).unwrap(); + let peer_id = enr.peer_id(); + // Use default custody groups count + let node_id = peer_id_to_node_id(&peer_id).expect("convert peer_id to node_id"); + let subnets = compute_subnets_for_node(node_id.raw(), spec.custody_requirement, &spec) + .expect("should compute custody subnets"); + if expected_subnets == subnets { + panic!("{:?}", subnets); + } else { + let matches = expected_subnets + .iter() + .filter(|index| subnets.contains(index)) + .count(); + if matches > 0 { + println!("{i} {:?}", matches); + } + } + } } diff --git a/consensus/types/src/signed_beacon_block.rs b/consensus/types/src/signed_beacon_block.rs index 85bed35a19c..de572014edc 100644 --- a/consensus/types/src/signed_beacon_block.rs +++ b/consensus/types/src/signed_beacon_block.rs @@ -321,6 +321,10 @@ impl> SignedBeaconBlock .unwrap_or(0) } + pub fn has_data(&self) -> bool { + self.num_expected_blobs() > 0 + } + /// Used for displaying commitments in logs. pub fn commitments_formatted(&self) -> String { let Ok(commitments) = self.message().body().blob_kzg_commitments() else {