From 9cbfcbfe833a6c8ff9f77bb9773ad44876669803 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Thu, 5 Sep 2024 23:39:52 -0700 Subject: [PATCH 01/16] initial changes --- beacon_node/beacon_processor/src/lib.rs | 7 +- .../src/scheduler/interface.rs | 14 + .../beacon_processor/src/scheduler/mod.rs | 2 + .../src/scheduler/priority_scheduler.rs | 852 ++++++++++++++++++ 4 files changed, 874 insertions(+), 1 deletion(-) create mode 100644 beacon_node/beacon_processor/src/scheduler/interface.rs create mode 100644 beacon_node/beacon_processor/src/scheduler/mod.rs create mode 100644 beacon_node/beacon_processor/src/scheduler/priority_scheduler.rs diff --git a/beacon_node/beacon_processor/src/lib.rs b/beacon_node/beacon_processor/src/lib.rs index f506f0bb94d..20ed890593b 100644 --- a/beacon_node/beacon_processor/src/lib.rs +++ b/beacon_node/beacon_processor/src/lib.rs @@ -37,7 +37,7 @@ //! Whenever the manager receives a notification that a worker has finished a parcel of work, it //! checks the queues to see if there are more parcels of work that can be spawned in a new worker //! task. - +mod scheduler; use crate::work_reprocessing_queue::{ QueuedBackfillBatch, QueuedGossipBlock, ReprocessQueueMessage, }; @@ -878,6 +878,7 @@ impl BeaconProcessor { // receive them back once they are ready (`ready_work_rx`). 
let (ready_work_tx, ready_work_rx) = mpsc::channel::(self.config.max_scheduled_work_queue_len); + // TODO(beacon-processor) reprocess scheduler spawn_reprocess_scheduler( ready_work_tx, work_reprocessing_rx, @@ -906,6 +907,7 @@ impl BeaconProcessor { self.current_workers = self.current_workers.saturating_sub(1); None } + // TODO(beacon-processor) backfill rate limiting is here Some(InboundEvent::WorkEvent(event)) if enable_backfill_rate_limiting => { match QueuedBackfillBatch::try_from(event) { Ok(backfill_batch) => { @@ -987,6 +989,8 @@ impl BeaconProcessor { .map_or(false, |event| event.drop_during_sync); let idle_tx = idle_tx.clone(); + + // TODO(beacon-processor) ordering is defined here match work_event { // There is no new work event, but we are able to spawn a new worker. // @@ -1485,6 +1489,7 @@ impl BeaconProcessor { Ok(()) } + // TODO(beacon-processor) should we move spawn_worker outside of self? /// Spawns a blocking worker thread to process some `Work`. /// /// Sends an message on `idle_tx` when the work is complete and the task is stopping. 
diff --git a/beacon_node/beacon_processor/src/scheduler/interface.rs b/beacon_node/beacon_processor/src/scheduler/interface.rs new file mode 100644 index 00000000000..f68e7f15c4a --- /dev/null +++ b/beacon_node/beacon_processor/src/scheduler/interface.rs @@ -0,0 +1,14 @@ +use types::EthSpec; + +use super::priority_scheduler; + +pub enum SchedulerType { + PriorityScheduler(priority_scheduler::Scheduler), +} + +impl SchedulerType { + // TODO(beacon-processor) make this config driven + pub fn new() {} + + pub fn process_work_event(&self) {} +} diff --git a/beacon_node/beacon_processor/src/scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/mod.rs new file mode 100644 index 00000000000..60cf66022a4 --- /dev/null +++ b/beacon_node/beacon_processor/src/scheduler/mod.rs @@ -0,0 +1,2 @@ +mod interface; +mod priority_scheduler; diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler.rs new file mode 100644 index 00000000000..62da747c7c1 --- /dev/null +++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler.rs @@ -0,0 +1,852 @@ +// The priority scheduler has three major facets +// 1. A priority ordering system +// 2. A backfill rate limiting feature +// 3. 
A retry queue + +use slog::error; +use slot_clock::SlotClock; +use std::{cmp, sync::Arc, time::Duration}; + +use futures::StreamExt; +use lighthouse_metrics::HistogramTimer; +use logging::TimeLatch; +use slog::{crit, debug, trace, warn}; +use tokio::sync::mpsc::{self, Receiver, Sender}; +use types::EthSpec; + +use crate::{ + metrics, work_reprocessing_queue::{spawn_reprocess_scheduler, ReadyWork}, BeaconProcessor, BeaconProcessorQueueLengths, FifoQueue, InboundEvent, InboundEvents, LifoQueue, Work, WorkEvent, NOTHING_TO_DO, WORKER_FREED +}; + +// TODO(beacon-processor) this will be impl specific +pub struct WorkQueues { + pub aggregate_queue: LifoQueue>, + pub aggregate_debounce: TimeLatch, + pub attestation_queue: LifoQueue>, + pub attestation_debounce: TimeLatch, + pub unknown_block_aggregate_queue: LifoQueue>, + pub unknown_block_attestation_queue: LifoQueue>, + pub sync_message_queue: LifoQueue>, + pub sync_contribution_queue: LifoQueue>, + pub gossip_voluntary_exit_queue: FifoQueue>, + pub gossip_proposer_slashing_queue: FifoQueue>, + pub gossip_attester_slashing_queue: FifoQueue>, + pub finality_update_queue: FifoQueue>, + pub optimistic_update_queue: FifoQueue>, + pub unknown_light_client_update_queue: FifoQueue>, + pub unknown_block_sampling_request_queue: FifoQueue>, + pub rpc_block_queue: FifoQueue>, + pub rpc_blob_queue: FifoQueue>, + pub rpc_custody_column_queue: FifoQueue>, + pub rpc_verify_data_column_queue: FifoQueue>, + pub sampling_result_queue: FifoQueue>, + pub chain_segment_queue: FifoQueue>, + pub backfill_chain_segment: FifoQueue>, + pub gossip_block_queue: FifoQueue>, + pub gossip_blob_queue: FifoQueue>, + pub gossip_data_column_queue: FifoQueue>, + pub delayed_block_queue: FifoQueue>, + pub status_queue: FifoQueue>, + pub bbrange_queue: FifoQueue>, + pub bbroots_queue: FifoQueue>, + pub blbroots_queue: FifoQueue>, + pub blbrange_queue: FifoQueue>, + pub dcbroots_queue: FifoQueue>, + pub dcbrange_queue: FifoQueue>, + pub 
gossip_bls_to_execution_change_queue: FifoQueue>, + pub lc_bootstrap_queue: FifoQueue>, + pub lc_optimistic_update_queue: FifoQueue>, + pub lc_finality_update_queue: FifoQueue>, + pub api_request_p0_queue: FifoQueue>, + pub api_request_p1_queue: FifoQueue>, +} + +impl WorkQueues { + pub fn new(queue_lengths: BeaconProcessorQueueLengths) -> Self { + let aggregate_queue = LifoQueue::new(queue_lengths.aggregate_queue); + let aggregate_debounce = TimeLatch::default(); + let attestation_queue = LifoQueue::new(queue_lengths.attestation_queue); + let attestation_debounce = TimeLatch::default(); + let unknown_block_aggregate_queue = + LifoQueue::new(queue_lengths.unknown_block_aggregate_queue); + let unknown_block_attestation_queue = + LifoQueue::new(queue_lengths.unknown_block_attestation_queue); + + let sync_message_queue = LifoQueue::new(queue_lengths.sync_message_queue); + let sync_contribution_queue = LifoQueue::new(queue_lengths.sync_contribution_queue); + + // Using a FIFO queue for voluntary exits since it prevents exit censoring. I don't have + // a strong feeling about queue type for exits. + let gossip_voluntary_exit_queue = + FifoQueue::new(queue_lengths.gossip_voluntary_exit_queue); + + // Using a FIFO queue for slashing to prevent people from flushing their slashings from the + // queues with lots of junk messages. + let gossip_proposer_slashing_queue = + FifoQueue::new(queue_lengths.gossip_proposer_slashing_queue); + let gossip_attester_slashing_queue = + FifoQueue::new(queue_lengths.gossip_attester_slashing_queue); + + // Using a FIFO queue for light client updates to maintain sequence order. 
+ let finality_update_queue = FifoQueue::new(queue_lengths.finality_update_queue); + let optimistic_update_queue = FifoQueue::new(queue_lengths.optimistic_update_queue); + let unknown_light_client_update_queue = + FifoQueue::new(queue_lengths.unknown_light_client_update_queue); + let unknown_block_sampling_request_queue = + FifoQueue::new(queue_lengths.unknown_block_sampling_request_queue); + + // Using a FIFO queue since blocks need to be imported sequentially. + let rpc_block_queue = FifoQueue::new(queue_lengths.rpc_block_queue); + let rpc_blob_queue = FifoQueue::new(queue_lengths.rpc_blob_queue); + let rpc_custody_column_queue = FifoQueue::new(queue_lengths.rpc_custody_column_queue); + let rpc_verify_data_column_queue = + FifoQueue::new(queue_lengths.rpc_verify_data_column_queue); + let sampling_result_queue = FifoQueue::new(queue_lengths.sampling_result_queue); + let chain_segment_queue = FifoQueue::new(queue_lengths.chain_segment_queue); + let backfill_chain_segment = FifoQueue::new(queue_lengths.backfill_chain_segment); + let gossip_block_queue = FifoQueue::new(queue_lengths.gossip_block_queue); + let gossip_blob_queue = FifoQueue::new(queue_lengths.gossip_blob_queue); + let gossip_data_column_queue = FifoQueue::new(queue_lengths.gossip_data_column_queue); + let delayed_block_queue = FifoQueue::new(queue_lengths.delayed_block_queue); + + let status_queue = FifoQueue::new(queue_lengths.status_queue); + let bbrange_queue = FifoQueue::new(queue_lengths.bbrange_queue); + let bbroots_queue = FifoQueue::new(queue_lengths.bbroots_queue); + let blbroots_queue = FifoQueue::new(queue_lengths.blbroots_queue); + let blbrange_queue = FifoQueue::new(queue_lengths.blbrange_queue); + let dcbroots_queue = FifoQueue::new(queue_lengths.dcbroots_queue); + let dcbrange_queue = FifoQueue::new(queue_lengths.dcbrange_queue); + + let gossip_bls_to_execution_change_queue = + FifoQueue::new(queue_lengths.gossip_bls_to_execution_change_queue); + + let lc_bootstrap_queue = 
FifoQueue::new(queue_lengths.lc_bootstrap_queue); + let lc_optimistic_update_queue = + FifoQueue::new(queue_lengths.lc_optimistic_update_queue); + let lc_finality_update_queue = FifoQueue::new(queue_lengths.lc_finality_update_queue); + + let api_request_p0_queue = FifoQueue::new(queue_lengths.api_request_p0_queue); + let api_request_p1_queue = FifoQueue::new(queue_lengths.api_request_p1_queue); + + WorkQueues { + aggregate_queue, + aggregate_debounce, + attestation_queue, + attestation_debounce, + unknown_block_aggregate_queue, + unknown_block_attestation_queue, + sync_message_queue, + sync_contribution_queue, + gossip_voluntary_exit_queue, + gossip_proposer_slashing_queue, + gossip_attester_slashing_queue, + finality_update_queue, + optimistic_update_queue, + unknown_light_client_update_queue, + unknown_block_sampling_request_queue, + rpc_block_queue, + rpc_blob_queue, + rpc_custody_column_queue, + rpc_verify_data_column_queue, + sampling_result_queue, + chain_segment_queue, + backfill_chain_segment, + gossip_block_queue, + gossip_blob_queue, + gossip_data_column_queue, + delayed_block_queue, + status_queue, + bbrange_queue, + bbroots_queue, + blbroots_queue, + blbrange_queue, + dcbroots_queue, + dcbrange_queue, + gossip_bls_to_execution_change_queue, + lc_bootstrap_queue, + lc_optimistic_update_queue, + lc_finality_update_queue, + api_request_p0_queue, + api_request_p1_queue, + } + } +} + +// Backend trait inits a channel, a run function +// A channel trait has send_work, reprocess_work etc. 
+ +pub struct Scheduler { + beacon_processor: BeaconProcessor, + enable_backfill_rate_limiting: bool, + current_workers: usize, + idle_tx: Sender<()>, + idle_rx: Receiver<()>, + work_queues: WorkQueues, +} + +impl Scheduler { + pub async fn process_work_event(&self) {} + + async fn run( + mut self, + mut inbound_events: InboundEvents, + work_journal_tx: Option>, + slot_clock: S, + maximum_gossip_clock_disparity: Duration, + ) -> Result<(), String> { + // Channels for sending work to the re-process scheduler (`work_reprocessing_tx`) and to + // receive them back once they are ready (`ready_work_rx`). + let (ready_work_tx, ready_work_rx) = + mpsc::channel::(self.beacon_processor.config.max_scheduled_work_queue_len); + // TODO(beacon-processor) reprocess scheduler + spawn_reprocess_scheduler( + ready_work_tx, + work_reprocessing_rx, + &self.beacon_processor.executor, + Arc::new(slot_clock), + self.beacon_processor.log.clone(), + maximum_gossip_clock_disparity, + )?; + + let work_event = match inbound_events.next().await { + Some(InboundEvent::WorkerIdle) => { + // TODO(beacon-processor) move current_workers from beacon_processor to self + self.current_workers = self.current_workers.saturating_sub(1); + None + } + Some(InboundEvent::WorkEvent(event)) if self.enable_backfill_rate_limiting => { + // TODO(beacon-processor) is backfill rate limiting going to be the same across all schedulers? 
+ todo!() + } + Some(InboundEvent::WorkEvent(event)) | Some(InboundEvent::ReprocessingWork(event)) => { + Some(event) + } + None => { + debug!( + self.beacon_processor.log, + "Gossip processor stopped"; + "msg" => "stream ended" + ); + // TODO(beacon-processor) this should terminate the whole process + todo!() + } + }; + + let _event_timer = self.increment_metrics(&work_event); + self.worker_journal(&work_event, &work_journal_tx); + + let can_spawn = self.current_workers < self.beacon_processor.config.max_workers; + let drop_during_sync = work_event + .as_ref() + .map_or(false, |event| event.drop_during_sync); + + match work_event { + // There is no new work event, but we are able to spawn a new worker. + // + // We don't check the `work.drop_during_sync` here. We assume that if it made + // it into the queue at any point then we should process it. + None if can_spawn => { + // TODO(beacon-processor) implement the normal priority scheduler here + // also note that these match arms will look similar across all scheduler variants + // so maybe we can pull this function out and get creative with closure usage + self.priority_scheduler(&work_journal_tx); + } + // There is no new work event and we are unable to spawn a new worker. + // + // I cannot see any good reason why this would happen. + None => { + warn!( + self.beacon_processor.log, + "Unexpected gossip processor condition"; + "msg" => "no new work and cannot spawn worker" + ); + } + // The chain is syncing and this event should be dropped during sync. 
+ Some(work_event) + if self + .beacon_processor + .network_globals + .sync_state + .read() + .is_syncing() + && drop_during_sync => + { + let work_id = work_event.work.str_id(); + metrics::inc_counter_vec( + &metrics::BEACON_PROCESSOR_WORK_EVENTS_IGNORED_COUNT, + &[work_id], + ); + trace!( + self.beacon_processor.log, + "Gossip processor skipping work"; + "msg" => "chain is syncing", + "work_id" => work_id + ); + } + + // There is a new work event and the chain is not syncing. Process it or queue + // it. + Some(WorkEvent { work, .. }) => { + self.process_or_queue_work_event(work, can_spawn); + } + } + } + + fn priority_scheduler(&mut self, work_journal_tx: &Option>) { + let idle_tx = self.idle_tx.clone(); + // Check for chain segments first, they're the most efficient way to get + // blocks into the system. + if let Some(item) = self.work_queues.chain_segment_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + // Check sync blocks before gossip blocks, since we've already explicitly + // requested these blocks. + } else if let Some(item) = self.work_queues.rpc_block_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.rpc_blob_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.rpc_custody_column_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + // TODO(das): decide proper prioritization for sampling columns + } else if let Some(item) = self.work_queues.rpc_custody_column_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.rpc_verify_data_column_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.sampling_result_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + // Check delayed blocks before gossip blocks, the gossip blocks might rely + // on the delayed ones. 
+ } else if let Some(item) = self.work_queues.delayed_block_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + // Check gossip blocks before gossip attestations, since a block might be + // required to verify some attestations. + } else if let Some(item) = self.work_queues.gossip_block_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.gossip_blob_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.gossip_data_column_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + // Check the priority 0 API requests after blocks and blobs, but before attestations. + } else if let Some(item) = self.work_queues.api_request_p0_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + // Check the aggregates, *then* the unaggregates since we assume that + // aggregates are more valuable to local validators and effectively give us + // more information with less signature verification time. + } else if self.work_queues.aggregate_queue.len() > 0 { + let batch_size = cmp::min( + self.work_queues.aggregate_queue.len(), + self.beacon_processor.config.max_gossip_aggregate_batch_size, + ); + + if batch_size < 2 { + // One single aggregate is in the queue, process it individually. + if let Some(item) = self.work_queues.aggregate_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } + } else { + // Collect two or more aggregates into a batch, so they can take + // advantage of batch signature verification. + // + // Note: this will convert the `Work::GossipAggregate` item into a + // `Work::GossipAggregateBatch` item. 
+ let mut aggregates = Vec::with_capacity(batch_size); + let mut process_batch_opt = None; + for _ in 0..batch_size { + if let Some(item) = self.work_queues.aggregate_queue.pop() { + match item { + Work::GossipAggregate { + aggregate, + process_individual: _, + process_batch, + } => { + aggregates.push(*aggregate); + if process_batch_opt.is_none() { + process_batch_opt = Some(process_batch); + } + } + _ => { + error!( + self.beacon_processor.log, + "Invalid item in aggregate queue" + ); + } + } + } + } + + if let Some(process_batch) = process_batch_opt { + // Process all aggregates with a single worker. + self.beacon_processor.spawn_worker( + Work::GossipAggregateBatch { + aggregates, + process_batch, + }, + idle_tx, + ) + } else { + // There is no good reason for this to + // happen, it is a serious logic error. + // Since we only form batches when multiple + // work items exist, we should always have a + // work closure at this point. + crit!(self.beacon_processor.log, "Missing aggregate work"); + } + } + // Check the unaggregated attestation queue. + // + // Potentially use batching. + } else if self.work_queues.attestation_queue.len() > 0 { + let batch_size = cmp::min( + self.work_queues.attestation_queue.len(), + self.beacon_processor + .config + .max_gossip_attestation_batch_size, + ); + + if batch_size < 2 { + // One single attestation is in the queue, process it individually. + if let Some(item) = self.work_queues.attestation_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } + } else { + // Collect two or more attestations into a batch, so they can take + // advantage of batch signature verification. + // + // Note: this will convert the `Work::GossipAttestation` item into a + // `Work::GossipAttestationBatch` item. 
+ let mut attestations = Vec::with_capacity(batch_size); + let mut process_batch_opt = None; + for _ in 0..batch_size { + if let Some(item) = self.work_queues.attestation_queue.pop() { + match item { + Work::GossipAttestation { + attestation, + process_individual: _, + process_batch, + } => { + attestations.push(*attestation); + if process_batch_opt.is_none() { + process_batch_opt = Some(process_batch); + } + } + _ => error!( + self.beacon_processor.log, + "Invalid item in attestation queue" + ), + } + } + } + + if let Some(process_batch) = process_batch_opt { + // Process all attestations with a single worker. + self.beacon_processor.spawn_worker( + Work::GossipAttestationBatch { + attestations, + process_batch, + }, + idle_tx, + ) + } else { + // There is no good reason for this to + // happen, it is a serious logic error. + // Since we only form batches when multiple + // work items exist, we should always have a + // work closure at this point. + crit!(self.beacon_processor.log, "Missing attestations work"); + } + } + // Check sync committee messages after attestations as their rewards are lesser + // and they don't influence fork choice. + } else if let Some(item) = self.work_queues.sync_contribution_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.sync_message_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + // Aggregates and unaggregates queued for re-processing are older and we + // care about fresher ones, so check those first. + } else if let Some(item) = self.work_queues.unknown_block_aggregate_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.unknown_block_attestation_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + // Check RPC methods next. 
Status messages are needed for sync so + // prioritize them over syncing requests from other peers (BlocksByRange + // and BlocksByRoot) + } else if let Some(item) = self.work_queues.status_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.bbrange_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.bbroots_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.blbrange_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.blbroots_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.dcbroots_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.dcbrange_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + // Prioritize sampling requests after block syncing requests + } else if let Some(item) = self.work_queues.unknown_block_sampling_request_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + // Check slashings after all other consensus messages so we prioritize + // following head. + // + // Check attester slashings before proposer slashings since they have the + // potential to slash multiple validators at once. + } else if let Some(item) = self.work_queues.gossip_attester_slashing_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.gossip_proposer_slashing_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + // Check exits and address changes late since our validators don't get + // rewards from them. 
+ } else if let Some(item) = self.work_queues.gossip_voluntary_exit_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.gossip_bls_to_execution_change_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + // Check the priority 1 API requests after we've + // processed all the interesting things from the network + // and things required for us to stay in good repute + // with our P2P peers. + } else if let Some(item) = self.work_queues.api_request_p1_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + // Handle backfill sync chain segments. + } else if let Some(item) = self.work_queues.backfill_chain_segment.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + // Handle light client requests. + } else if let Some(item) = self.work_queues.lc_bootstrap_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.lc_optimistic_update_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + } else if let Some(item) = self.work_queues.lc_finality_update_queue.pop() { + self.beacon_processor.spawn_worker(item, idle_tx); + // This statement should always be the final else statement. + } else { + // Let the journal know that a worker is freed and there's nothing else + // for it to do. + if let Some(work_journal_tx) = work_journal_tx { + // We don't care if this message was successfully sent, we only use the journal + // during testing. + let _ = work_journal_tx.try_send(NOTHING_TO_DO); + } + } + } + + // TODO(beacon-processor) this might be able to be moved to a more generalized location + pub fn process_or_queue_work_event(&mut self, work: Work, can_spawn: bool) { + let work_id = work.str_id(); + + match work { + _ if can_spawn => self + .beacon_processor + .spawn_worker(work, self.idle_tx.clone()), + Work::GossipAttestation { .. 
} => self.work_queues.attestation_queue.push(work), + // Attestation batches are formed internally within the + // `BeaconProcessor`, they are not sent from external services. + Work::GossipAttestationBatch { .. } => crit!( + self.beacon_processor.log, + "Unsupported inbound event"; + "type" => "GossipAttestationBatch" + ), + Work::GossipAggregate { .. } => self.work_queues.aggregate_queue.push(work), + // Aggregate batches are formed internally within the `BeaconProcessor`, + // they are not sent from external services. + Work::GossipAggregateBatch { .. } => crit!( + self.beacon_processor.log, + "Unsupported inbound event"; + "type" => "GossipAggregateBatch" + ), + Work::GossipBlock { .. } => { + self.work_queues + .gossip_block_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::GossipBlobSidecar { .. } => { + self.work_queues + .gossip_blob_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::GossipDataColumnSidecar { .. } => self.work_queues.gossip_data_column_queue.push( + work, + work_id, + &self.beacon_processor.log, + ), + Work::DelayedImportBlock { .. } => { + self.work_queues + .delayed_block_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::GossipVoluntaryExit { .. } => self.work_queues.gossip_voluntary_exit_queue.push( + work, + work_id, + &self.beacon_processor.log, + ), + Work::GossipProposerSlashing { .. } => self + .work_queues + .gossip_proposer_slashing_queue + .push(work, work_id, &self.beacon_processor.log), + Work::GossipAttesterSlashing { .. } => self + .work_queues + .gossip_attester_slashing_queue + .push(work, work_id, &self.beacon_processor.log), + Work::GossipSyncSignature { .. } => self.work_queues.sync_message_queue.push(work), + Work::GossipSyncContribution { .. } => { + self.work_queues.sync_contribution_queue.push(work) + } + Work::GossipLightClientFinalityUpdate { .. 
} => self + .work_queues + .finality_update_queue + .push(work, work_id, &self.beacon_processor.log), + Work::GossipLightClientOptimisticUpdate { .. } => self + .work_queues + .optimistic_update_queue + .push(work, work_id, &self.beacon_processor.log), + Work::RpcBlock { .. } | Work::IgnoredRpcBlock { .. } => self + .work_queues + .rpc_block_queue + .push(work, work_id, &self.beacon_processor.log), + Work::RpcBlobs { .. } => { + self.work_queues + .rpc_blob_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::RpcCustodyColumn { .. } => self.work_queues.rpc_custody_column_queue.push( + work, + work_id, + &self.beacon_processor.log, + ), + Work::RpcVerifyDataColumn(_) => self.work_queues.rpc_verify_data_column_queue.push( + work, + work_id, + &self.beacon_processor.log, + ), + Work::SamplingResult(_) => self.work_queues.sampling_result_queue.push( + work, + work_id, + &self.beacon_processor.log, + ), + Work::ChainSegment { .. } => { + self.work_queues + .chain_segment_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::ChainSegmentBackfill { .. } => self.work_queues.backfill_chain_segment.push( + work, + work_id, + &self.beacon_processor.log, + ), + Work::Status { .. } => { + self.work_queues + .status_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::BlocksByRangeRequest { .. } => { + self.work_queues + .bbrange_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::BlocksByRootsRequest { .. } => { + self.work_queues + .bbroots_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::BlobsByRangeRequest { .. } => { + self.work_queues + .blbrange_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::LightClientBootstrapRequest { .. } => { + self.work_queues + .lc_bootstrap_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::LightClientOptimisticUpdateRequest { .. 
} => self + .work_queues + .lc_optimistic_update_queue + .push(work, work_id, &self.beacon_processor.log), + Work::LightClientFinalityUpdateRequest { .. } => self + .work_queues + .lc_finality_update_queue + .push(work, work_id, &self.beacon_processor.log), + Work::UnknownBlockAttestation { .. } => { + self.work_queues.unknown_block_attestation_queue.push(work) + } + Work::UnknownBlockAggregate { .. } => { + self.work_queues.unknown_block_aggregate_queue.push(work) + } + Work::GossipBlsToExecutionChange { .. } => self + .work_queues + .gossip_bls_to_execution_change_queue + .push(work, work_id, &self.beacon_processor.log), + Work::BlobsByRootsRequest { .. } => { + self.work_queues + .blbroots_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::DataColumnsByRootsRequest { .. } => { + self.work_queues + .dcbroots_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::DataColumnsByRangeRequest { .. } => { + self.work_queues + .dcbrange_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::UnknownLightClientOptimisticUpdate { .. } => self + .work_queues + .unknown_light_client_update_queue + .push(work, work_id, &self.beacon_processor.log), + Work::UnknownBlockSamplingRequest { .. } => self + .work_queues + .unknown_block_sampling_request_queue + .push(work, work_id, &self.beacon_processor.log), + Work::ApiRequestP0 { .. } => self.work_queues.api_request_p0_queue.push( + work, + work_id, + &self.beacon_processor.log, + ), + Work::ApiRequestP1 { .. 
} => self.work_queues.api_request_p1_queue.push( + work, + work_id, + &self.beacon_processor.log, + ), + } + + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_WORKERS_ACTIVE_TOTAL, + self.current_workers as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_UNAGGREGATED_ATTESTATION_QUEUE_TOTAL, + self.work_queues.attestation_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_AGGREGATED_ATTESTATION_QUEUE_TOTAL, + self.work_queues.aggregate_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_SYNC_MESSAGE_QUEUE_TOTAL, + self.work_queues.sync_message_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_SYNC_CONTRIBUTION_QUEUE_TOTAL, + self.work_queues.sync_contribution_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_GOSSIP_BLOCK_QUEUE_TOTAL, + self.work_queues.gossip_block_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_GOSSIP_BLOB_QUEUE_TOTAL, + self.work_queues.gossip_blob_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_GOSSIP_DATA_COLUMN_QUEUE_TOTAL, + self.work_queues.gossip_data_column_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_RPC_BLOCK_QUEUE_TOTAL, + self.work_queues.rpc_block_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_RPC_BLOB_QUEUE_TOTAL, + self.work_queues.rpc_blob_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_RPC_CUSTODY_COLUMN_QUEUE_TOTAL, + self.work_queues.rpc_custody_column_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_RPC_VERIFY_DATA_COLUMN_QUEUE_TOTAL, + self.work_queues.rpc_verify_data_column_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_SAMPLING_RESULT_QUEUE_TOTAL, + self.work_queues.sampling_result_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_CHAIN_SEGMENT_QUEUE_TOTAL, + 
self.work_queues.chain_segment_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_QUEUE_TOTAL, + self.work_queues.backfill_chain_segment.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_EXIT_QUEUE_TOTAL, + self.work_queues.gossip_voluntary_exit_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_PROPOSER_SLASHING_QUEUE_TOTAL, + self.work_queues.gossip_proposer_slashing_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_ATTESTER_SLASHING_QUEUE_TOTAL, + self.work_queues.gossip_attester_slashing_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_BLS_TO_EXECUTION_CHANGE_QUEUE_TOTAL, + self.work_queues.gossip_bls_to_execution_change_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_API_REQUEST_P0_QUEUE_TOTAL, + self.work_queues.api_request_p0_queue.len() as i64, + ); + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_API_REQUEST_P1_QUEUE_TOTAL, + self.work_queues.api_request_p1_queue.len() as i64, + ); + + if self.work_queues.aggregate_queue.is_full() + && self.work_queues.aggregate_debounce.elapsed() + { + error!( + self.beacon_processor.log, + "Aggregate attestation queue full"; + "msg" => "the system has insufficient resources for load", + "queue_len" => self.work_queues.aggregate_queue.max_length, + ) + } + + if self.work_queues.attestation_queue.is_full() + && self.work_queues.attestation_debounce.elapsed() + { + error!( + self.beacon_processor.log, + "Attestation queue full"; + "msg" => "the system has insufficient resources for load", + "queue_len" => self.work_queues.attestation_queue.max_length, + ) + } + } + + // TODO(beacon-processor) this can live outside of this struct in a more general location + fn worker_journal( + &self, + work_event: &Option>, + work_journal_tx: &Option>, + ) { + if let Some(work_journal_tx) = work_journal_tx { + let id = work_event + .as_ref() + .map(|event| 
event.work.str_id()) + .unwrap_or(WORKER_FREED); + + // We don't care if this message was successfully sent, we only use the journal + // during testing. + let _ = work_journal_tx.try_send(id); + } + } + + // TODO(beacon-processor) this can live outside of this struct in a more general location + fn increment_metrics(&self, work_event: &Option>) -> Option { + let _event_timer = metrics::start_timer(&metrics::BEACON_PROCESSOR_EVENT_HANDLING_SECONDS); + if let Some(event) = work_event { + metrics::inc_counter_vec( + &metrics::BEACON_PROCESSOR_WORK_EVENTS_RX_COUNT, + &[event.work.str_id()], + ); + } else { + metrics::inc_counter(&metrics::BEACON_PROCESSOR_IDLE_EVENTS_TOTAL); + } + _event_timer + } +} From edc708f839a9017b0be31482482cf4fe85a569c2 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Thu, 19 Sep 2024 12:34:00 -0700 Subject: [PATCH 02/16] more tweaking --- beacon_node/beacon_processor/src/lib.rs | 17 +- .../src/scheduler/interface.rs | 29 +- .../src/scheduler/priority_scheduler.rs | 1718 +++++++++-------- beacon_node/http_api/src/test_utils.rs | 4 - .../gossip_methods.rs | 11 +- .../src/network_beacon_processor/mod.rs | 4 - .../src/network_beacon_processor/tests.rs | 3 - beacon_node/network/src/service/tests.rs | 2 - 8 files changed, 899 insertions(+), 889 deletions(-) diff --git a/beacon_node/beacon_processor/src/lib.rs b/beacon_node/beacon_processor/src/lib.rs index 20ed890593b..71d52442f5c 100644 --- a/beacon_node/beacon_processor/src/lib.rs +++ b/beacon_node/beacon_processor/src/lib.rs @@ -287,22 +287,17 @@ impl Default for BeaconProcessorConfig { pub struct BeaconProcessorChannels { pub beacon_processor_tx: BeaconProcessorSend, pub beacon_processor_rx: mpsc::Receiver>, - pub work_reprocessing_tx: mpsc::Sender, - pub work_reprocessing_rx: mpsc::Receiver, } impl BeaconProcessorChannels { pub fn new(config: &BeaconProcessorConfig) -> Self { let (beacon_processor_tx, beacon_processor_rx) = mpsc::channel(config.max_work_event_queue_len); - let 
(work_reprocessing_tx, work_reprocessing_rx) = - mpsc::channel(config.max_scheduled_work_queue_len); + Self { beacon_processor_tx: BeaconProcessorSend(beacon_processor_tx), beacon_processor_rx, - work_reprocessing_rx, - work_reprocessing_tx, } } } @@ -647,6 +642,7 @@ pub enum Work { LightClientFinalityUpdateRequest(BlockingFn), ApiRequestP0(BlockingOrAsync), ApiRequestP1(BlockingOrAsync), + Reprocess(BlockingFn), } impl fmt::Debug for Work { @@ -796,8 +792,6 @@ impl BeaconProcessor { pub fn spawn_manager( mut self, event_rx: mpsc::Receiver>, - work_reprocessing_tx: mpsc::Sender, - work_reprocessing_rx: mpsc::Receiver, work_journal_tx: Option>, slot_clock: S, maximum_gossip_clock_disparity: Duration, @@ -805,6 +799,8 @@ impl BeaconProcessor { ) -> Result<(), String> { // Used by workers to communicate that they are finished a task. let (idle_tx, idle_rx) = mpsc::channel::<()>(MAX_IDLE_QUEUE_LEN); + + let (work_reprocessing_tx, work_reprocessing_rx) = mpsc::channel(self.config.max_scheduled_work_queue_len); // Using LIFO queues for attestations since validator profits rely upon getting fresh // attestations into blocks. Additionally, later attestations contain more information than @@ -1375,6 +1371,10 @@ impl BeaconProcessor { Work::ApiRequestP1 { .. } => { api_request_p1_queue.push(work, work_id, &self.log) } + Work::Reprocess { .. 
} => { + // TODO(beacon-processor) + todo!() + } } } } @@ -1610,6 +1610,7 @@ impl BeaconProcessor { | Work::LightClientFinalityUpdateRequest(process_fn) => { task_spawner.spawn_blocking(process_fn) } + Work::Reprocess(process_fn) }; } } diff --git a/beacon_node/beacon_processor/src/scheduler/interface.rs b/beacon_node/beacon_processor/src/scheduler/interface.rs index f68e7f15c4a..879f2168585 100644 --- a/beacon_node/beacon_processor/src/scheduler/interface.rs +++ b/beacon_node/beacon_processor/src/scheduler/interface.rs @@ -1,14 +1,23 @@ -use types::EthSpec; +// use tokio::sync::mpsc; +// use types::EthSpec; -use super::priority_scheduler; +// use crate::WorkEvent; -pub enum SchedulerType { - PriorityScheduler(priority_scheduler::Scheduler), -} +// use super::priority_scheduler; -impl SchedulerType { - // TODO(beacon-processor) make this config driven - pub fn new() {} +// pub enum SchedulerType { +// PriorityScheduler(priority_scheduler::Scheduler), +// } - pub fn process_work_event(&self) {} -} +// impl SchedulerType { +// // TODO(beacon-processor) make this config driven +// pub fn run( +// &self, +// event_rx: mpsc::Receiver>, +// work_journal_tx: Option>, +// ) { + +// } + +// pub fn process_work_event(&self) {} +// } diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler.rs index 62da747c7c1..cfdd399ebd4 100644 --- a/beacon_node/beacon_processor/src/scheduler/priority_scheduler.rs +++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler.rs @@ -1,852 +1,866 @@ -// The priority scheduler has three major facets -// 1. A priority ordering system -// 2. A backfill rate limiting feature -// 3. 
A retry queue - -use slog::error; -use slot_clock::SlotClock; -use std::{cmp, sync::Arc, time::Duration}; - -use futures::StreamExt; -use lighthouse_metrics::HistogramTimer; -use logging::TimeLatch; -use slog::{crit, debug, trace, warn}; -use tokio::sync::mpsc::{self, Receiver, Sender}; -use types::EthSpec; - -use crate::{ - metrics, work_reprocessing_queue::{spawn_reprocess_scheduler, ReadyWork}, BeaconProcessor, BeaconProcessorQueueLengths, FifoQueue, InboundEvent, InboundEvents, LifoQueue, Work, WorkEvent, NOTHING_TO_DO, WORKER_FREED -}; - -// TODO(beacon-processor) this will be impl specific -pub struct WorkQueues { - pub aggregate_queue: LifoQueue>, - pub aggregate_debounce: TimeLatch, - pub attestation_queue: LifoQueue>, - pub attestation_debounce: TimeLatch, - pub unknown_block_aggregate_queue: LifoQueue>, - pub unknown_block_attestation_queue: LifoQueue>, - pub sync_message_queue: LifoQueue>, - pub sync_contribution_queue: LifoQueue>, - pub gossip_voluntary_exit_queue: FifoQueue>, - pub gossip_proposer_slashing_queue: FifoQueue>, - pub gossip_attester_slashing_queue: FifoQueue>, - pub finality_update_queue: FifoQueue>, - pub optimistic_update_queue: FifoQueue>, - pub unknown_light_client_update_queue: FifoQueue>, - pub unknown_block_sampling_request_queue: FifoQueue>, - pub rpc_block_queue: FifoQueue>, - pub rpc_blob_queue: FifoQueue>, - pub rpc_custody_column_queue: FifoQueue>, - pub rpc_verify_data_column_queue: FifoQueue>, - pub sampling_result_queue: FifoQueue>, - pub chain_segment_queue: FifoQueue>, - pub backfill_chain_segment: FifoQueue>, - pub gossip_block_queue: FifoQueue>, - pub gossip_blob_queue: FifoQueue>, - pub gossip_data_column_queue: FifoQueue>, - pub delayed_block_queue: FifoQueue>, - pub status_queue: FifoQueue>, - pub bbrange_queue: FifoQueue>, - pub bbroots_queue: FifoQueue>, - pub blbroots_queue: FifoQueue>, - pub blbrange_queue: FifoQueue>, - pub dcbroots_queue: FifoQueue>, - pub dcbrange_queue: FifoQueue>, - pub 
gossip_bls_to_execution_change_queue: FifoQueue>, - pub lc_bootstrap_queue: FifoQueue>, - pub lc_optimistic_update_queue: FifoQueue>, - pub lc_finality_update_queue: FifoQueue>, - pub api_request_p0_queue: FifoQueue>, - pub api_request_p1_queue: FifoQueue>, -} - -impl WorkQueues { - pub fn new(queue_lengths: BeaconProcessorQueueLengths) -> Self { - let aggregate_queue = LifoQueue::new(queue_lengths.aggregate_queue); - let aggregate_debounce = TimeLatch::default(); - let attestation_queue = LifoQueue::new(queue_lengths.attestation_queue); - let attestation_debounce = TimeLatch::default(); - let unknown_block_aggregate_queue = - LifoQueue::new(queue_lengths.unknown_block_aggregate_queue); - let unknown_block_attestation_queue = - LifoQueue::new(queue_lengths.unknown_block_attestation_queue); - - let sync_message_queue = LifoQueue::new(queue_lengths.sync_message_queue); - let sync_contribution_queue = LifoQueue::new(queue_lengths.sync_contribution_queue); - - // Using a FIFO queue for voluntary exits since it prevents exit censoring. I don't have - // a strong feeling about queue type for exits. - let gossip_voluntary_exit_queue = - FifoQueue::new(queue_lengths.gossip_voluntary_exit_queue); - - // Using a FIFO queue for slashing to prevent people from flushing their slashings from the - // queues with lots of junk messages. - let gossip_proposer_slashing_queue = - FifoQueue::new(queue_lengths.gossip_proposer_slashing_queue); - let gossip_attester_slashing_queue = - FifoQueue::new(queue_lengths.gossip_attester_slashing_queue); - - // Using a FIFO queue for light client updates to maintain sequence order. 
- let finality_update_queue = FifoQueue::new(queue_lengths.finality_update_queue); - let optimistic_update_queue = FifoQueue::new(queue_lengths.optimistic_update_queue); - let unknown_light_client_update_queue = - FifoQueue::new(queue_lengths.unknown_light_client_update_queue); - let unknown_block_sampling_request_queue = - FifoQueue::new(queue_lengths.unknown_block_sampling_request_queue); - - // Using a FIFO queue since blocks need to be imported sequentially. - let rpc_block_queue = FifoQueue::new(queue_lengths.rpc_block_queue); - let rpc_blob_queue = FifoQueue::new(queue_lengths.rpc_blob_queue); - let rpc_custody_column_queue = FifoQueue::new(queue_lengths.rpc_custody_column_queue); - let rpc_verify_data_column_queue = - FifoQueue::new(queue_lengths.rpc_verify_data_column_queue); - let sampling_result_queue = FifoQueue::new(queue_lengths.sampling_result_queue); - let chain_segment_queue = FifoQueue::new(queue_lengths.chain_segment_queue); - let backfill_chain_segment = FifoQueue::new(queue_lengths.backfill_chain_segment); - let gossip_block_queue = FifoQueue::new(queue_lengths.gossip_block_queue); - let gossip_blob_queue = FifoQueue::new(queue_lengths.gossip_blob_queue); - let gossip_data_column_queue = FifoQueue::new(queue_lengths.gossip_data_column_queue); - let delayed_block_queue = FifoQueue::new(queue_lengths.delayed_block_queue); - - let status_queue = FifoQueue::new(queue_lengths.status_queue); - let bbrange_queue = FifoQueue::new(queue_lengths.bbrange_queue); - let bbroots_queue = FifoQueue::new(queue_lengths.bbroots_queue); - let blbroots_queue = FifoQueue::new(queue_lengths.blbroots_queue); - let blbrange_queue = FifoQueue::new(queue_lengths.blbrange_queue); - let dcbroots_queue = FifoQueue::new(queue_lengths.dcbroots_queue); - let dcbrange_queue = FifoQueue::new(queue_lengths.dcbrange_queue); - - let gossip_bls_to_execution_change_queue = - FifoQueue::new(queue_lengths.gossip_bls_to_execution_change_queue); - - let lc_bootstrap_queue = 
FifoQueue::new(queue_lengths.lc_bootstrap_queue); - let lc_optimistic_update_queue = - FifoQueue::new(queue_lengths.lc_optimistic_update_queue); - let lc_finality_update_queue = FifoQueue::new(queue_lengths.lc_finality_update_queue); - - let api_request_p0_queue = FifoQueue::new(queue_lengths.api_request_p0_queue); - let api_request_p1_queue = FifoQueue::new(queue_lengths.api_request_p1_queue); - - WorkQueues { - aggregate_queue, - aggregate_debounce, - attestation_queue, - attestation_debounce, - unknown_block_aggregate_queue, - unknown_block_attestation_queue, - sync_message_queue, - sync_contribution_queue, - gossip_voluntary_exit_queue, - gossip_proposer_slashing_queue, - gossip_attester_slashing_queue, - finality_update_queue, - optimistic_update_queue, - unknown_light_client_update_queue, - unknown_block_sampling_request_queue, - rpc_block_queue, - rpc_blob_queue, - rpc_custody_column_queue, - rpc_verify_data_column_queue, - sampling_result_queue, - chain_segment_queue, - backfill_chain_segment, - gossip_block_queue, - gossip_blob_queue, - gossip_data_column_queue, - delayed_block_queue, - status_queue, - bbrange_queue, - bbroots_queue, - blbroots_queue, - blbrange_queue, - dcbroots_queue, - dcbrange_queue, - gossip_bls_to_execution_change_queue, - lc_bootstrap_queue, - lc_optimistic_update_queue, - lc_finality_update_queue, - api_request_p0_queue, - api_request_p1_queue, - } - } -} - -// Backend trait inits a channel, a run function -// A channel trait has send_work, reprocess_work etc. 
- -pub struct Scheduler { - beacon_processor: BeaconProcessor, - enable_backfill_rate_limiting: bool, - current_workers: usize, - idle_tx: Sender<()>, - idle_rx: Receiver<()>, - work_queues: WorkQueues, -} - -impl Scheduler { - pub async fn process_work_event(&self) {} - - async fn run( - mut self, - mut inbound_events: InboundEvents, - work_journal_tx: Option>, - slot_clock: S, - maximum_gossip_clock_disparity: Duration, - ) -> Result<(), String> { - // Channels for sending work to the re-process scheduler (`work_reprocessing_tx`) and to - // receive them back once they are ready (`ready_work_rx`). - let (ready_work_tx, ready_work_rx) = - mpsc::channel::(self.beacon_processor.config.max_scheduled_work_queue_len); - // TODO(beacon-processor) reprocess scheduler - spawn_reprocess_scheduler( - ready_work_tx, - work_reprocessing_rx, - &self.beacon_processor.executor, - Arc::new(slot_clock), - self.beacon_processor.log.clone(), - maximum_gossip_clock_disparity, - )?; - - let work_event = match inbound_events.next().await { - Some(InboundEvent::WorkerIdle) => { - // TODO(beacon-processor) move current_workers from beacon_processor to self - self.current_workers = self.current_workers.saturating_sub(1); - None - } - Some(InboundEvent::WorkEvent(event)) if self.enable_backfill_rate_limiting => { - // TODO(beacon-processor) is backfill rate limiting going to be the same across all schedulers? 
- todo!() - } - Some(InboundEvent::WorkEvent(event)) | Some(InboundEvent::ReprocessingWork(event)) => { - Some(event) - } - None => { - debug!( - self.beacon_processor.log, - "Gossip processor stopped"; - "msg" => "stream ended" - ); - // TODO(beacon-processor) this should terminate the whole process - todo!() - } - }; - - let _event_timer = self.increment_metrics(&work_event); - self.worker_journal(&work_event, &work_journal_tx); - - let can_spawn = self.current_workers < self.beacon_processor.config.max_workers; - let drop_during_sync = work_event - .as_ref() - .map_or(false, |event| event.drop_during_sync); - - match work_event { - // There is no new work event, but we are able to spawn a new worker. - // - // We don't check the `work.drop_during_sync` here. We assume that if it made - // it into the queue at any point then we should process it. - None if can_spawn => { - // TODO(beacon-processor) implement the normal priority scheduler here - // also note that these match arms will look similar across all scheduler variants - // so maybe we can pull this function out and get creative with closure usage - self.priority_scheduler(&work_journal_tx); - } - // There is no new work event and we are unable to spawn a new worker. - // - // I cannot see any good reason why this would happen. - None => { - warn!( - self.beacon_processor.log, - "Unexpected gossip processor condition"; - "msg" => "no new work and cannot spawn worker" - ); - } - // The chain is syncing and this event should be dropped during sync. 
- Some(work_event) - if self - .beacon_processor - .network_globals - .sync_state - .read() - .is_syncing() - && drop_during_sync => - { - let work_id = work_event.work.str_id(); - metrics::inc_counter_vec( - &metrics::BEACON_PROCESSOR_WORK_EVENTS_IGNORED_COUNT, - &[work_id], - ); - trace!( - self.beacon_processor.log, - "Gossip processor skipping work"; - "msg" => "chain is syncing", - "work_id" => work_id - ); - } - - // There is a new work event and the chain is not syncing. Process it or queue - // it. - Some(WorkEvent { work, .. }) => { - self.process_or_queue_work_event(work, can_spawn); - } - } - } - - fn priority_scheduler(&mut self, work_journal_tx: &Option>) { - let idle_tx = self.idle_tx.clone(); - // Check for chain segments first, they're the most efficient way to get - // blocks into the system. - if let Some(item) = self.work_queues.chain_segment_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check sync blocks before gossip blocks, since we've already explicitly - // requested these blocks. - } else if let Some(item) = self.work_queues.rpc_block_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.rpc_blob_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.rpc_custody_column_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // TODO(das): decide proper prioritization for sampling columns - } else if let Some(item) = self.work_queues.rpc_custody_column_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.rpc_verify_data_column_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.sampling_result_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check delayed blocks before gossip blocks, the gossip blocks might rely - // on the delayed ones. 
- } else if let Some(item) = self.work_queues.delayed_block_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check gossip blocks before gossip attestations, since a block might be - // required to verify some attestations. - } else if let Some(item) = self.work_queues.gossip_block_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.gossip_blob_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.gossip_data_column_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check the priority 0 API requests after blocks and blobs, but before attestations. - } else if let Some(item) = self.work_queues.api_request_p0_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check the aggregates, *then* the unaggregates since we assume that - // aggregates are more valuable to local validators and effectively give us - // more information with less signature verification time. - } else if self.work_queues.aggregate_queue.len() > 0 { - let batch_size = cmp::min( - self.work_queues.aggregate_queue.len(), - self.beacon_processor.config.max_gossip_aggregate_batch_size, - ); - - if batch_size < 2 { - // One single aggregate is in the queue, process it individually. - if let Some(item) = self.work_queues.aggregate_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } - } else { - // Collect two or more aggregates into a batch, so they can take - // advantage of batch signature verification. - // - // Note: this will convert the `Work::GossipAggregate` item into a - // `Work::GossipAggregateBatch` item. 
- let mut aggregates = Vec::with_capacity(batch_size); - let mut process_batch_opt = None; - for _ in 0..batch_size { - if let Some(item) = self.work_queues.aggregate_queue.pop() { - match item { - Work::GossipAggregate { - aggregate, - process_individual: _, - process_batch, - } => { - aggregates.push(*aggregate); - if process_batch_opt.is_none() { - process_batch_opt = Some(process_batch); - } - } - _ => { - error!( - self.beacon_processor.log, - "Invalid item in aggregate queue" - ); - } - } - } - } - - if let Some(process_batch) = process_batch_opt { - // Process all aggregates with a single worker. - self.beacon_processor.spawn_worker( - Work::GossipAggregateBatch { - aggregates, - process_batch, - }, - idle_tx, - ) - } else { - // There is no good reason for this to - // happen, it is a serious logic error. - // Since we only form batches when multiple - // work items exist, we should always have a - // work closure at this point. - crit!(self.beacon_processor.log, "Missing aggregate work"); - } - } - // Check the unaggregated attestation queue. - // - // Potentially use batching. - } else if self.work_queues.attestation_queue.len() > 0 { - let batch_size = cmp::min( - self.work_queues.attestation_queue.len(), - self.beacon_processor - .config - .max_gossip_attestation_batch_size, - ); - - if batch_size < 2 { - // One single attestation is in the queue, process it individually. - if let Some(item) = self.work_queues.attestation_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } - } else { - // Collect two or more attestations into a batch, so they can take - // advantage of batch signature verification. - // - // Note: this will convert the `Work::GossipAttestation` item into a - // `Work::GossipAttestationBatch` item. 
- let mut attestations = Vec::with_capacity(batch_size); - let mut process_batch_opt = None; - for _ in 0..batch_size { - if let Some(item) = self.work_queues.attestation_queue.pop() { - match item { - Work::GossipAttestation { - attestation, - process_individual: _, - process_batch, - } => { - attestations.push(*attestation); - if process_batch_opt.is_none() { - process_batch_opt = Some(process_batch); - } - } - _ => error!( - self.beacon_processor.log, - "Invalid item in attestation queue" - ), - } - } - } - - if let Some(process_batch) = process_batch_opt { - // Process all attestations with a single worker. - self.beacon_processor.spawn_worker( - Work::GossipAttestationBatch { - attestations, - process_batch, - }, - idle_tx, - ) - } else { - // There is no good reason for this to - // happen, it is a serious logic error. - // Since we only form batches when multiple - // work items exist, we should always have a - // work closure at this point. - crit!(self.beacon_processor.log, "Missing attestations work"); - } - } - // Check sync committee messages after attestations as their rewards are lesser - // and they don't influence fork choice. - } else if let Some(item) = self.work_queues.sync_contribution_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.sync_message_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Aggregates and unaggregates queued for re-processing are older and we - // care about fresher ones, so check those first. - } else if let Some(item) = self.work_queues.unknown_block_aggregate_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.unknown_block_attestation_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check RPC methods next. 
Status messages are needed for sync so - // prioritize them over syncing requests from other peers (BlocksByRange - // and BlocksByRoot) - } else if let Some(item) = self.work_queues.status_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.bbrange_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.bbroots_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.blbrange_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.blbroots_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.dcbroots_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.dcbrange_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Prioritize sampling requests after block syncing requests - } else if let Some(item) = self.work_queues.unknown_block_sampling_request_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check slashings after all other consensus messages so we prioritize - // following head. - // - // Check attester slashings before proposer slashings since they have the - // potential to slash multiple validators at once. - } else if let Some(item) = self.work_queues.gossip_attester_slashing_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.gossip_proposer_slashing_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check exits and address changes late since our validators don't get - // rewards from them. 
- } else if let Some(item) = self.work_queues.gossip_voluntary_exit_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.gossip_bls_to_execution_change_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check the priority 1 API requests after we've - // processed all the interesting things from the network - // and things required for us to stay in good repute - // with our P2P peers. - } else if let Some(item) = self.work_queues.api_request_p1_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Handle backfill sync chain segments. - } else if let Some(item) = self.work_queues.backfill_chain_segment.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Handle light client requests. - } else if let Some(item) = self.work_queues.lc_bootstrap_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.lc_optimistic_update_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.lc_finality_update_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // This statement should always be the final else statement. - } else { - // Let the journal know that a worker is freed and there's nothing else - // for it to do. - if let Some(work_journal_tx) = work_journal_tx { - // We don't care if this message was successfully sent, we only use the journal - // during testing. - let _ = work_journal_tx.try_send(NOTHING_TO_DO); - } - } - } - - // TODO(beacon-processor) this might be able to be moved to a more generalized location - pub fn process_or_queue_work_event(&mut self, work: Work, can_spawn: bool) { - let work_id = work.str_id(); - - match work { - _ if can_spawn => self - .beacon_processor - .spawn_worker(work, self.idle_tx.clone()), - Work::GossipAttestation { .. 
} => self.work_queues.attestation_queue.push(work), - // Attestation batches are formed internally within the - // `BeaconProcessor`, they are not sent from external services. - Work::GossipAttestationBatch { .. } => crit!( - self.beacon_processor.log, - "Unsupported inbound event"; - "type" => "GossipAttestationBatch" - ), - Work::GossipAggregate { .. } => self.work_queues.aggregate_queue.push(work), - // Aggregate batches are formed internally within the `BeaconProcessor`, - // they are not sent from external services. - Work::GossipAggregateBatch { .. } => crit!( - self.beacon_processor.log, - "Unsupported inbound event"; - "type" => "GossipAggregateBatch" - ), - Work::GossipBlock { .. } => { - self.work_queues - .gossip_block_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::GossipBlobSidecar { .. } => { - self.work_queues - .gossip_blob_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::GossipDataColumnSidecar { .. } => self.work_queues.gossip_data_column_queue.push( - work, - work_id, - &self.beacon_processor.log, - ), - Work::DelayedImportBlock { .. } => { - self.work_queues - .delayed_block_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::GossipVoluntaryExit { .. } => self.work_queues.gossip_voluntary_exit_queue.push( - work, - work_id, - &self.beacon_processor.log, - ), - Work::GossipProposerSlashing { .. } => self - .work_queues - .gossip_proposer_slashing_queue - .push(work, work_id, &self.beacon_processor.log), - Work::GossipAttesterSlashing { .. } => self - .work_queues - .gossip_attester_slashing_queue - .push(work, work_id, &self.beacon_processor.log), - Work::GossipSyncSignature { .. } => self.work_queues.sync_message_queue.push(work), - Work::GossipSyncContribution { .. } => { - self.work_queues.sync_contribution_queue.push(work) - } - Work::GossipLightClientFinalityUpdate { .. 
} => self - .work_queues - .finality_update_queue - .push(work, work_id, &self.beacon_processor.log), - Work::GossipLightClientOptimisticUpdate { .. } => self - .work_queues - .optimistic_update_queue - .push(work, work_id, &self.beacon_processor.log), - Work::RpcBlock { .. } | Work::IgnoredRpcBlock { .. } => self - .work_queues - .rpc_block_queue - .push(work, work_id, &self.beacon_processor.log), - Work::RpcBlobs { .. } => { - self.work_queues - .rpc_blob_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::RpcCustodyColumn { .. } => self.work_queues.rpc_custody_column_queue.push( - work, - work_id, - &self.beacon_processor.log, - ), - Work::RpcVerifyDataColumn(_) => self.work_queues.rpc_verify_data_column_queue.push( - work, - work_id, - &self.beacon_processor.log, - ), - Work::SamplingResult(_) => self.work_queues.sampling_result_queue.push( - work, - work_id, - &self.beacon_processor.log, - ), - Work::ChainSegment { .. } => { - self.work_queues - .chain_segment_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::ChainSegmentBackfill { .. } => self.work_queues.backfill_chain_segment.push( - work, - work_id, - &self.beacon_processor.log, - ), - Work::Status { .. } => { - self.work_queues - .status_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::BlocksByRangeRequest { .. } => { - self.work_queues - .bbrange_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::BlocksByRootsRequest { .. } => { - self.work_queues - .bbroots_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::BlobsByRangeRequest { .. } => { - self.work_queues - .blbrange_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::LightClientBootstrapRequest { .. } => { - self.work_queues - .lc_bootstrap_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::LightClientOptimisticUpdateRequest { .. 
} => self - .work_queues - .lc_optimistic_update_queue - .push(work, work_id, &self.beacon_processor.log), - Work::LightClientFinalityUpdateRequest { .. } => self - .work_queues - .lc_finality_update_queue - .push(work, work_id, &self.beacon_processor.log), - Work::UnknownBlockAttestation { .. } => { - self.work_queues.unknown_block_attestation_queue.push(work) - } - Work::UnknownBlockAggregate { .. } => { - self.work_queues.unknown_block_aggregate_queue.push(work) - } - Work::GossipBlsToExecutionChange { .. } => self - .work_queues - .gossip_bls_to_execution_change_queue - .push(work, work_id, &self.beacon_processor.log), - Work::BlobsByRootsRequest { .. } => { - self.work_queues - .blbroots_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::DataColumnsByRootsRequest { .. } => { - self.work_queues - .dcbroots_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::DataColumnsByRangeRequest { .. } => { - self.work_queues - .dcbrange_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::UnknownLightClientOptimisticUpdate { .. } => self - .work_queues - .unknown_light_client_update_queue - .push(work, work_id, &self.beacon_processor.log), - Work::UnknownBlockSamplingRequest { .. } => self - .work_queues - .unknown_block_sampling_request_queue - .push(work, work_id, &self.beacon_processor.log), - Work::ApiRequestP0 { .. } => self.work_queues.api_request_p0_queue.push( - work, - work_id, - &self.beacon_processor.log, - ), - Work::ApiRequestP1 { .. 
} => self.work_queues.api_request_p1_queue.push( - work, - work_id, - &self.beacon_processor.log, - ), - } - - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_WORKERS_ACTIVE_TOTAL, - self.current_workers as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_UNAGGREGATED_ATTESTATION_QUEUE_TOTAL, - self.work_queues.attestation_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_AGGREGATED_ATTESTATION_QUEUE_TOTAL, - self.work_queues.aggregate_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_SYNC_MESSAGE_QUEUE_TOTAL, - self.work_queues.sync_message_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_SYNC_CONTRIBUTION_QUEUE_TOTAL, - self.work_queues.sync_contribution_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_GOSSIP_BLOCK_QUEUE_TOTAL, - self.work_queues.gossip_block_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_GOSSIP_BLOB_QUEUE_TOTAL, - self.work_queues.gossip_blob_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_GOSSIP_DATA_COLUMN_QUEUE_TOTAL, - self.work_queues.gossip_data_column_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_RPC_BLOCK_QUEUE_TOTAL, - self.work_queues.rpc_block_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_RPC_BLOB_QUEUE_TOTAL, - self.work_queues.rpc_blob_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_RPC_CUSTODY_COLUMN_QUEUE_TOTAL, - self.work_queues.rpc_custody_column_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_RPC_VERIFY_DATA_COLUMN_QUEUE_TOTAL, - self.work_queues.rpc_verify_data_column_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_SAMPLING_RESULT_QUEUE_TOTAL, - self.work_queues.sampling_result_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_CHAIN_SEGMENT_QUEUE_TOTAL, - 
self.work_queues.chain_segment_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_QUEUE_TOTAL, - self.work_queues.backfill_chain_segment.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_EXIT_QUEUE_TOTAL, - self.work_queues.gossip_voluntary_exit_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_PROPOSER_SLASHING_QUEUE_TOTAL, - self.work_queues.gossip_proposer_slashing_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_ATTESTER_SLASHING_QUEUE_TOTAL, - self.work_queues.gossip_attester_slashing_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_BLS_TO_EXECUTION_CHANGE_QUEUE_TOTAL, - self.work_queues.gossip_bls_to_execution_change_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_API_REQUEST_P0_QUEUE_TOTAL, - self.work_queues.api_request_p0_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_API_REQUEST_P1_QUEUE_TOTAL, - self.work_queues.api_request_p1_queue.len() as i64, - ); - - if self.work_queues.aggregate_queue.is_full() - && self.work_queues.aggregate_debounce.elapsed() - { - error!( - self.beacon_processor.log, - "Aggregate attestation queue full"; - "msg" => "the system has insufficient resources for load", - "queue_len" => self.work_queues.aggregate_queue.max_length, - ) - } - - if self.work_queues.attestation_queue.is_full() - && self.work_queues.attestation_debounce.elapsed() - { - error!( - self.beacon_processor.log, - "Attestation queue full"; - "msg" => "the system has insufficient resources for load", - "queue_len" => self.work_queues.attestation_queue.max_length, - ) - } - } - - // TODO(beacon-processor) this can live outside of this struct in a more general location - fn worker_journal( - &self, - work_event: &Option>, - work_journal_tx: &Option>, - ) { - if let Some(work_journal_tx) = work_journal_tx { - let id = work_event - .as_ref() - .map(|event| 
event.work.str_id()) - .unwrap_or(WORKER_FREED); - - // We don't care if this message was successfully sent, we only use the journal - // during testing. - let _ = work_journal_tx.try_send(id); - } - } - - // TODO(beacon-processor) this can live outside of this struct in a more general location - fn increment_metrics(&self, work_event: &Option>) -> Option { - let _event_timer = metrics::start_timer(&metrics::BEACON_PROCESSOR_EVENT_HANDLING_SECONDS); - if let Some(event) = work_event { - metrics::inc_counter_vec( - &metrics::BEACON_PROCESSOR_WORK_EVENTS_RX_COUNT, - &[event.work.str_id()], - ); - } else { - metrics::inc_counter(&metrics::BEACON_PROCESSOR_IDLE_EVENTS_TOTAL); - } - _event_timer - } -} +// // The priority scheduler has three major facets +// // 1. A priority ordering system +// // 2. A backfill rate limiting feature +// // 3. A retry queue + +// use slog::error; +// use slot_clock::SlotClock; +// use std::{cmp, sync::Arc, time::Duration}; + +// use futures::StreamExt; +// use lighthouse_metrics::HistogramTimer; +// use logging::TimeLatch; +// use slog::{crit, debug, trace, warn}; +// use tokio::sync::mpsc::{self, Receiver, Sender}; +// use types::EthSpec; + +// use crate::{ +// metrics, work_reprocessing_queue::{spawn_reprocess_scheduler, ReadyWork}, BeaconProcessor, BeaconProcessorQueueLengths, FifoQueue, InboundEvent, InboundEvents, LifoQueue, Work, WorkEvent, NOTHING_TO_DO, WORKER_FREED +// }; + +// // TODO(beacon-processor) this will be impl specific +// pub struct WorkQueues { +// pub aggregate_queue: LifoQueue>, +// pub aggregate_debounce: TimeLatch, +// pub attestation_queue: LifoQueue>, +// pub attestation_debounce: TimeLatch, +// pub unknown_block_aggregate_queue: LifoQueue>, +// pub unknown_block_attestation_queue: LifoQueue>, +// pub sync_message_queue: LifoQueue>, +// pub sync_contribution_queue: LifoQueue>, +// pub gossip_voluntary_exit_queue: FifoQueue>, +// pub gossip_proposer_slashing_queue: FifoQueue>, +// pub 
gossip_attester_slashing_queue: FifoQueue>, +// pub finality_update_queue: FifoQueue>, +// pub optimistic_update_queue: FifoQueue>, +// pub unknown_light_client_update_queue: FifoQueue>, +// pub unknown_block_sampling_request_queue: FifoQueue>, +// pub rpc_block_queue: FifoQueue>, +// pub rpc_blob_queue: FifoQueue>, +// pub rpc_custody_column_queue: FifoQueue>, +// pub rpc_verify_data_column_queue: FifoQueue>, +// pub sampling_result_queue: FifoQueue>, +// pub chain_segment_queue: FifoQueue>, +// pub backfill_chain_segment: FifoQueue>, +// pub gossip_block_queue: FifoQueue>, +// pub gossip_blob_queue: FifoQueue>, +// pub gossip_data_column_queue: FifoQueue>, +// pub delayed_block_queue: FifoQueue>, +// pub status_queue: FifoQueue>, +// pub bbrange_queue: FifoQueue>, +// pub bbroots_queue: FifoQueue>, +// pub blbroots_queue: FifoQueue>, +// pub blbrange_queue: FifoQueue>, +// pub dcbroots_queue: FifoQueue>, +// pub dcbrange_queue: FifoQueue>, +// pub gossip_bls_to_execution_change_queue: FifoQueue>, +// pub lc_bootstrap_queue: FifoQueue>, +// pub lc_optimistic_update_queue: FifoQueue>, +// pub lc_finality_update_queue: FifoQueue>, +// pub api_request_p0_queue: FifoQueue>, +// pub api_request_p1_queue: FifoQueue>, +// } + +// impl WorkQueues { +// pub fn new(queue_lengths: BeaconProcessorQueueLengths) -> Self { +// let aggregate_queue = LifoQueue::new(queue_lengths.aggregate_queue); +// let aggregate_debounce = TimeLatch::default(); +// let attestation_queue = LifoQueue::new(queue_lengths.attestation_queue); +// let attestation_debounce = TimeLatch::default(); +// let unknown_block_aggregate_queue = +// LifoQueue::new(queue_lengths.unknown_block_aggregate_queue); +// let unknown_block_attestation_queue = +// LifoQueue::new(queue_lengths.unknown_block_attestation_queue); + +// let sync_message_queue = LifoQueue::new(queue_lengths.sync_message_queue); +// let sync_contribution_queue = LifoQueue::new(queue_lengths.sync_contribution_queue); + +// // Using a FIFO queue 
for voluntary exits since it prevents exit censoring. I don't have +// // a strong feeling about queue type for exits. +// let gossip_voluntary_exit_queue = +// FifoQueue::new(queue_lengths.gossip_voluntary_exit_queue); + +// // Using a FIFO queue for slashing to prevent people from flushing their slashings from the +// // queues with lots of junk messages. +// let gossip_proposer_slashing_queue = +// FifoQueue::new(queue_lengths.gossip_proposer_slashing_queue); +// let gossip_attester_slashing_queue = +// FifoQueue::new(queue_lengths.gossip_attester_slashing_queue); + +// // Using a FIFO queue for light client updates to maintain sequence order. +// let finality_update_queue = FifoQueue::new(queue_lengths.finality_update_queue); +// let optimistic_update_queue = FifoQueue::new(queue_lengths.optimistic_update_queue); +// let unknown_light_client_update_queue = +// FifoQueue::new(queue_lengths.unknown_light_client_update_queue); +// let unknown_block_sampling_request_queue = +// FifoQueue::new(queue_lengths.unknown_block_sampling_request_queue); + +// // Using a FIFO queue since blocks need to be imported sequentially. 
+// let rpc_block_queue = FifoQueue::new(queue_lengths.rpc_block_queue); +// let rpc_blob_queue = FifoQueue::new(queue_lengths.rpc_blob_queue); +// let rpc_custody_column_queue = FifoQueue::new(queue_lengths.rpc_custody_column_queue); +// let rpc_verify_data_column_queue = +// FifoQueue::new(queue_lengths.rpc_verify_data_column_queue); +// let sampling_result_queue = FifoQueue::new(queue_lengths.sampling_result_queue); +// let chain_segment_queue = FifoQueue::new(queue_lengths.chain_segment_queue); +// let backfill_chain_segment = FifoQueue::new(queue_lengths.backfill_chain_segment); +// let gossip_block_queue = FifoQueue::new(queue_lengths.gossip_block_queue); +// let gossip_blob_queue = FifoQueue::new(queue_lengths.gossip_blob_queue); +// let gossip_data_column_queue = FifoQueue::new(queue_lengths.gossip_data_column_queue); +// let delayed_block_queue = FifoQueue::new(queue_lengths.delayed_block_queue); + +// let status_queue = FifoQueue::new(queue_lengths.status_queue); +// let bbrange_queue = FifoQueue::new(queue_lengths.bbrange_queue); +// let bbroots_queue = FifoQueue::new(queue_lengths.bbroots_queue); +// let blbroots_queue = FifoQueue::new(queue_lengths.blbroots_queue); +// let blbrange_queue = FifoQueue::new(queue_lengths.blbrange_queue); +// let dcbroots_queue = FifoQueue::new(queue_lengths.dcbroots_queue); +// let dcbrange_queue = FifoQueue::new(queue_lengths.dcbrange_queue); + +// let gossip_bls_to_execution_change_queue = +// FifoQueue::new(queue_lengths.gossip_bls_to_execution_change_queue); + +// let lc_bootstrap_queue = FifoQueue::new(queue_lengths.lc_bootstrap_queue); +// let lc_optimistic_update_queue = +// FifoQueue::new(queue_lengths.lc_optimistic_update_queue); +// let lc_finality_update_queue = FifoQueue::new(queue_lengths.lc_finality_update_queue); + +// let api_request_p0_queue = FifoQueue::new(queue_lengths.api_request_p0_queue); +// let api_request_p1_queue = FifoQueue::new(queue_lengths.api_request_p1_queue); + +// WorkQueues { +// 
aggregate_queue, +// aggregate_debounce, +// attestation_queue, +// attestation_debounce, +// unknown_block_aggregate_queue, +// unknown_block_attestation_queue, +// sync_message_queue, +// sync_contribution_queue, +// gossip_voluntary_exit_queue, +// gossip_proposer_slashing_queue, +// gossip_attester_slashing_queue, +// finality_update_queue, +// optimistic_update_queue, +// unknown_light_client_update_queue, +// unknown_block_sampling_request_queue, +// rpc_block_queue, +// rpc_blob_queue, +// rpc_custody_column_queue, +// rpc_verify_data_column_queue, +// sampling_result_queue, +// chain_segment_queue, +// backfill_chain_segment, +// gossip_block_queue, +// gossip_blob_queue, +// gossip_data_column_queue, +// delayed_block_queue, +// status_queue, +// bbrange_queue, +// bbroots_queue, +// blbroots_queue, +// blbrange_queue, +// dcbroots_queue, +// dcbrange_queue, +// gossip_bls_to_execution_change_queue, +// lc_bootstrap_queue, +// lc_optimistic_update_queue, +// lc_finality_update_queue, +// api_request_p0_queue, +// api_request_p1_queue, +// } +// } +// } + +// // Backend trait inits a channel, a run function +// // A channel trait has send_work, reprocess_work etc. 
+ +// pub struct Scheduler { +// beacon_processor: BeaconProcessor, +// enable_backfill_rate_limiting: bool, +// current_workers: usize, +// idle_tx: Sender<()>, +// idle_rx: Receiver<()>, +// work_reprocessing_tx: Sender<()>, +// work_reprocessing_rx: Receiver<()>, +// work_queues: WorkQueues, +// } + +// impl Scheduler { + +// fn new() -> Self { +// // let (work_reprocessing_tx, work_reprocessing_rx) = +// // mpsc::channel(config.max_scheduled_work_queue_len); +// todo!() +// } + +// pub async fn process_work_event(&self) {} + + +// async fn run( +// mut self, +// mut inbound_events: InboundEvents, +// work_journal_tx: Option>, +// slot_clock: S, +// maximum_gossip_clock_disparity: Duration, +// ) -> Result<(), String> { +// // Channels for sending work to the re-process scheduler (`work_reprocessing_tx`) and to +// // receive them back once they are ready (`ready_work_rx`). +// let (ready_work_tx, ready_work_rx) = +// mpsc::channel::(self.beacon_processor.config.max_scheduled_work_queue_len); +// // TODO(beacon-processor) reprocess scheduler +// spawn_reprocess_scheduler( +// ready_work_tx, +// self.work_reprocessing_rx, +// &self.beacon_processor.executor, +// Arc::new(slot_clock), +// self.beacon_processor.log.clone(), +// maximum_gossip_clock_disparity, +// )?; + +// let work_event = match inbound_events.next().await { +// Some(InboundEvent::WorkerIdle) => { +// // TODO(beacon-processor) move current_workers from beacon_processor to self +// self.current_workers = self.current_workers.saturating_sub(1); +// None +// } +// Some(InboundEvent::WorkEvent(event)) if self.enable_backfill_rate_limiting => { +// // TODO(beacon-processor) is backfill rate limiting going to be the same across all schedulers? 
+// todo!() +// } +// Some(InboundEvent::WorkEvent(event)) | Some(InboundEvent::ReprocessingWork(event)) => { +// Some(event) +// } +// None => { +// debug!( +// self.beacon_processor.log, +// "Gossip processor stopped"; +// "msg" => "stream ended" +// ); +// // TODO(beacon-processor) this should terminate the whole process +// todo!() +// } +// }; + +// let _event_timer = self.increment_metrics(&work_event); +// self.worker_journal(&work_event, &work_journal_tx); + +// let can_spawn = self.current_workers < self.beacon_processor.config.max_workers; +// let drop_during_sync = work_event +// .as_ref() +// .map_or(false, |event| event.drop_during_sync); + +// match work_event { +// // There is no new work event, but we are able to spawn a new worker. +// // +// // We don't check the `work.drop_during_sync` here. We assume that if it made +// // it into the queue at any point then we should process it. +// None if can_spawn => { +// // TODO(beacon-processor) implement the normal priority scheduler here +// // also note that these match arms will look similar across all scheduler variants +// // so maybe we can pull this function out and get creative with closure usage +// self.priority_scheduler(&work_journal_tx); +// todo!() +// } +// // There is no new work event and we are unable to spawn a new worker. +// // +// // I cannot see any good reason why this would happen. +// None => { +// warn!( +// self.beacon_processor.log, +// "Unexpected gossip processor condition"; +// "msg" => "no new work and cannot spawn worker" +// ); +// todo!() +// } +// // The chain is syncing and this event should be dropped during sync. 
+// Some(work_event) +// if self +// .beacon_processor +// .network_globals +// .sync_state +// .read() +// .is_syncing() +// && drop_during_sync => +// { +// let work_id = work_event.work.str_id(); +// metrics::inc_counter_vec( +// &metrics::BEACON_PROCESSOR_WORK_EVENTS_IGNORED_COUNT, +// &[work_id], +// ); +// trace!( +// self.beacon_processor.log, +// "Gossip processor skipping work"; +// "msg" => "chain is syncing", +// "work_id" => work_id +// ); +// todo!() +// } + +// // There is a new work event and the chain is not syncing. Process it or queue +// // it. +// Some(WorkEvent { work, .. }) => { +// self.process_or_queue_work_event(work, can_spawn); +// todo!() +// } +// } +// } + +// fn priority_scheduler(&mut self, work_journal_tx: &Option>) { +// let idle_tx = self.idle_tx.clone(); +// // Check for chain segments first, they're the most efficient way to get +// // blocks into the system. +// if let Some(item) = self.work_queues.chain_segment_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// // Check sync blocks before gossip blocks, since we've already explicitly +// // requested these blocks. 
+// } else if let Some(item) = self.work_queues.rpc_block_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.rpc_blob_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.rpc_custody_column_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// // TODO(das): decide proper prioritization for sampling columns +// } else if let Some(item) = self.work_queues.rpc_custody_column_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.rpc_verify_data_column_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.sampling_result_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// // Check delayed blocks before gossip blocks, the gossip blocks might rely +// // on the delayed ones. +// } else if let Some(item) = self.work_queues.delayed_block_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// // Check gossip blocks before gossip attestations, since a block might be +// // required to verify some attestations. +// } else if let Some(item) = self.work_queues.gossip_block_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.gossip_blob_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.gossip_data_column_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// // Check the priority 0 API requests after blocks and blobs, but before attestations. 
+// } else if let Some(item) = self.work_queues.api_request_p0_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// // Check the aggregates, *then* the unaggregates since we assume that +// // aggregates are more valuable to local validators and effectively give us +// // more information with less signature verification time. +// } else if self.work_queues.aggregate_queue.len() > 0 { +// let batch_size = cmp::min( +// self.work_queues.aggregate_queue.len(), +// self.beacon_processor.config.max_gossip_aggregate_batch_size, +// ); + +// if batch_size < 2 { +// // One single aggregate is in the queue, process it individually. +// if let Some(item) = self.work_queues.aggregate_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } +// } else { +// // Collect two or more aggregates into a batch, so they can take +// // advantage of batch signature verification. +// // +// // Note: this will convert the `Work::GossipAggregate` item into a +// // `Work::GossipAggregateBatch` item. +// let mut aggregates = Vec::with_capacity(batch_size); +// let mut process_batch_opt = None; +// for _ in 0..batch_size { +// if let Some(item) = self.work_queues.aggregate_queue.pop() { +// match item { +// Work::GossipAggregate { +// aggregate, +// process_individual: _, +// process_batch, +// } => { +// aggregates.push(*aggregate); +// if process_batch_opt.is_none() { +// process_batch_opt = Some(process_batch); +// } +// } +// _ => { +// error!( +// self.beacon_processor.log, +// "Invalid item in aggregate queue" +// ); +// } +// } +// } +// } + +// if let Some(process_batch) = process_batch_opt { +// // Process all aggregates with a single worker. +// self.beacon_processor.spawn_worker( +// Work::GossipAggregateBatch { +// aggregates, +// process_batch, +// }, +// idle_tx, +// ) +// } else { +// // There is no good reason for this to +// // happen, it is a serious logic error. 
+// // Since we only form batches when multiple +// // work items exist, we should always have a +// // work closure at this point. +// crit!(self.beacon_processor.log, "Missing aggregate work"); +// } +// } +// // Check the unaggregated attestation queue. +// // +// // Potentially use batching. +// } else if self.work_queues.attestation_queue.len() > 0 { +// let batch_size = cmp::min( +// self.work_queues.attestation_queue.len(), +// self.beacon_processor +// .config +// .max_gossip_attestation_batch_size, +// ); + +// if batch_size < 2 { +// // One single attestation is in the queue, process it individually. +// if let Some(item) = self.work_queues.attestation_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } +// } else { +// // Collect two or more attestations into a batch, so they can take +// // advantage of batch signature verification. +// // +// // Note: this will convert the `Work::GossipAttestation` item into a +// // `Work::GossipAttestationBatch` item. +// let mut attestations = Vec::with_capacity(batch_size); +// let mut process_batch_opt = None; +// for _ in 0..batch_size { +// if let Some(item) = self.work_queues.attestation_queue.pop() { +// match item { +// Work::GossipAttestation { +// attestation, +// process_individual: _, +// process_batch, +// } => { +// attestations.push(*attestation); +// if process_batch_opt.is_none() { +// process_batch_opt = Some(process_batch); +// } +// } +// _ => error!( +// self.beacon_processor.log, +// "Invalid item in attestation queue" +// ), +// } +// } +// } + +// if let Some(process_batch) = process_batch_opt { +// // Process all attestations with a single worker. +// self.beacon_processor.spawn_worker( +// Work::GossipAttestationBatch { +// attestations, +// process_batch, +// }, +// idle_tx, +// ) +// } else { +// // There is no good reason for this to +// // happen, it is a serious logic error. 
+// // Since we only form batches when multiple +// // work items exist, we should always have a +// // work closure at this point. +// crit!(self.beacon_processor.log, "Missing attestations work"); +// } +// } +// // Check sync committee messages after attestations as their rewards are lesser +// // and they don't influence fork choice. +// } else if let Some(item) = self.work_queues.sync_contribution_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.sync_message_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// // Aggregates and unaggregates queued for re-processing are older and we +// // care about fresher ones, so check those first. +// } else if let Some(item) = self.work_queues.unknown_block_aggregate_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.unknown_block_attestation_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// // Check RPC methods next. 
Status messages are needed for sync so +// // prioritize them over syncing requests from other peers (BlocksByRange +// // and BlocksByRoot) +// } else if let Some(item) = self.work_queues.status_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.bbrange_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.bbroots_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.blbrange_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.blbroots_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.dcbroots_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.dcbrange_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// // Prioritize sampling requests after block syncing requests +// } else if let Some(item) = self.work_queues.unknown_block_sampling_request_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// // Check slashings after all other consensus messages so we prioritize +// // following head. +// // +// // Check attester slashings before proposer slashings since they have the +// // potential to slash multiple validators at once. +// } else if let Some(item) = self.work_queues.gossip_attester_slashing_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.gossip_proposer_slashing_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// // Check exits and address changes late since our validators don't get +// // rewards from them. 
+// } else if let Some(item) = self.work_queues.gossip_voluntary_exit_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.gossip_bls_to_execution_change_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// // Check the priority 1 API requests after we've +// // processed all the interesting things from the network +// // and things required for us to stay in good repute +// // with our P2P peers. +// } else if let Some(item) = self.work_queues.api_request_p1_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// // Handle backfill sync chain segments. +// } else if let Some(item) = self.work_queues.backfill_chain_segment.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// // Handle light client requests. +// } else if let Some(item) = self.work_queues.lc_bootstrap_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.lc_optimistic_update_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// } else if let Some(item) = self.work_queues.lc_finality_update_queue.pop() { +// self.beacon_processor.spawn_worker(item, idle_tx); +// // This statement should always be the final else statement. +// } else { +// // Let the journal know that a worker is freed and there's nothing else +// // for it to do. +// if let Some(work_journal_tx) = work_journal_tx { +// // We don't care if this message was successfully sent, we only use the journal +// // during testing. +// let _ = work_journal_tx.try_send(NOTHING_TO_DO); +// } +// } +// } + +// // TODO(beacon-processor) this might be able to be moved to a more generalized location +// pub fn process_or_queue_work_event(&mut self, work: Work, can_spawn: bool) { +// let work_id = work.str_id(); + +// match work { +// _ if can_spawn => self +// .beacon_processor +// .spawn_worker(work, self.idle_tx.clone()), +// Work::GossipAttestation { .. 
} => self.work_queues.attestation_queue.push(work), +// // Attestation batches are formed internally within the +// // `BeaconProcessor`, they are not sent from external services. +// Work::GossipAttestationBatch { .. } => crit!( +// self.beacon_processor.log, +// "Unsupported inbound event"; +// "type" => "GossipAttestationBatch" +// ), +// Work::GossipAggregate { .. } => self.work_queues.aggregate_queue.push(work), +// // Aggregate batches are formed internally within the `BeaconProcessor`, +// // they are not sent from external services. +// Work::GossipAggregateBatch { .. } => crit!( +// self.beacon_processor.log, +// "Unsupported inbound event"; +// "type" => "GossipAggregateBatch" +// ), +// Work::GossipBlock { .. } => { +// self.work_queues +// .gossip_block_queue +// .push(work, work_id, &self.beacon_processor.log) +// } +// Work::GossipBlobSidecar { .. } => { +// self.work_queues +// .gossip_blob_queue +// .push(work, work_id, &self.beacon_processor.log) +// } +// Work::GossipDataColumnSidecar { .. } => self.work_queues.gossip_data_column_queue.push( +// work, +// work_id, +// &self.beacon_processor.log, +// ), +// Work::DelayedImportBlock { .. } => { +// self.work_queues +// .delayed_block_queue +// .push(work, work_id, &self.beacon_processor.log) +// } +// Work::GossipVoluntaryExit { .. } => self.work_queues.gossip_voluntary_exit_queue.push( +// work, +// work_id, +// &self.beacon_processor.log, +// ), +// Work::GossipProposerSlashing { .. } => self +// .work_queues +// .gossip_proposer_slashing_queue +// .push(work, work_id, &self.beacon_processor.log), +// Work::GossipAttesterSlashing { .. } => self +// .work_queues +// .gossip_attester_slashing_queue +// .push(work, work_id, &self.beacon_processor.log), +// Work::GossipSyncSignature { .. } => self.work_queues.sync_message_queue.push(work), +// Work::GossipSyncContribution { .. } => { +// self.work_queues.sync_contribution_queue.push(work) +// } +// Work::GossipLightClientFinalityUpdate { .. 
} => self +// .work_queues +// .finality_update_queue +// .push(work, work_id, &self.beacon_processor.log), +// Work::GossipLightClientOptimisticUpdate { .. } => self +// .work_queues +// .optimistic_update_queue +// .push(work, work_id, &self.beacon_processor.log), +// Work::RpcBlock { .. } | Work::IgnoredRpcBlock { .. } => self +// .work_queues +// .rpc_block_queue +// .push(work, work_id, &self.beacon_processor.log), +// Work::RpcBlobs { .. } => { +// self.work_queues +// .rpc_blob_queue +// .push(work, work_id, &self.beacon_processor.log) +// } +// Work::RpcCustodyColumn { .. } => self.work_queues.rpc_custody_column_queue.push( +// work, +// work_id, +// &self.beacon_processor.log, +// ), +// Work::RpcVerifyDataColumn(_) => self.work_queues.rpc_verify_data_column_queue.push( +// work, +// work_id, +// &self.beacon_processor.log, +// ), +// Work::SamplingResult(_) => self.work_queues.sampling_result_queue.push( +// work, +// work_id, +// &self.beacon_processor.log, +// ), +// Work::ChainSegment { .. } => { +// self.work_queues +// .chain_segment_queue +// .push(work, work_id, &self.beacon_processor.log) +// } +// Work::ChainSegmentBackfill { .. } => self.work_queues.backfill_chain_segment.push( +// work, +// work_id, +// &self.beacon_processor.log, +// ), +// Work::Status { .. } => { +// self.work_queues +// .status_queue +// .push(work, work_id, &self.beacon_processor.log) +// } +// Work::BlocksByRangeRequest { .. } => { +// self.work_queues +// .bbrange_queue +// .push(work, work_id, &self.beacon_processor.log) +// } +// Work::BlocksByRootsRequest { .. } => { +// self.work_queues +// .bbroots_queue +// .push(work, work_id, &self.beacon_processor.log) +// } +// Work::BlobsByRangeRequest { .. } => { +// self.work_queues +// .blbrange_queue +// .push(work, work_id, &self.beacon_processor.log) +// } +// Work::LightClientBootstrapRequest { .. 
} => { +// self.work_queues +// .lc_bootstrap_queue +// .push(work, work_id, &self.beacon_processor.log) +// } +// Work::LightClientOptimisticUpdateRequest { .. } => self +// .work_queues +// .lc_optimistic_update_queue +// .push(work, work_id, &self.beacon_processor.log), +// Work::LightClientFinalityUpdateRequest { .. } => self +// .work_queues +// .lc_finality_update_queue +// .push(work, work_id, &self.beacon_processor.log), +// Work::UnknownBlockAttestation { .. } => { +// self.work_queues.unknown_block_attestation_queue.push(work) +// } +// Work::UnknownBlockAggregate { .. } => { +// self.work_queues.unknown_block_aggregate_queue.push(work) +// } +// Work::GossipBlsToExecutionChange { .. } => self +// .work_queues +// .gossip_bls_to_execution_change_queue +// .push(work, work_id, &self.beacon_processor.log), +// Work::BlobsByRootsRequest { .. } => { +// self.work_queues +// .blbroots_queue +// .push(work, work_id, &self.beacon_processor.log) +// } +// Work::DataColumnsByRootsRequest { .. } => { +// self.work_queues +// .dcbroots_queue +// .push(work, work_id, &self.beacon_processor.log) +// } +// Work::DataColumnsByRangeRequest { .. } => { +// self.work_queues +// .dcbrange_queue +// .push(work, work_id, &self.beacon_processor.log) +// } +// Work::UnknownLightClientOptimisticUpdate { .. } => self +// .work_queues +// .unknown_light_client_update_queue +// .push(work, work_id, &self.beacon_processor.log), +// Work::UnknownBlockSamplingRequest { .. } => self +// .work_queues +// .unknown_block_sampling_request_queue +// .push(work, work_id, &self.beacon_processor.log), +// Work::ApiRequestP0 { .. } => self.work_queues.api_request_p0_queue.push( +// work, +// work_id, +// &self.beacon_processor.log, +// ), +// Work::ApiRequestP1 { .. 
} => self.work_queues.api_request_p1_queue.push( +// work, +// work_id, +// &self.beacon_processor.log, +// ), +// } + +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_WORKERS_ACTIVE_TOTAL, +// self.current_workers as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_UNAGGREGATED_ATTESTATION_QUEUE_TOTAL, +// self.work_queues.attestation_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_AGGREGATED_ATTESTATION_QUEUE_TOTAL, +// self.work_queues.aggregate_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_SYNC_MESSAGE_QUEUE_TOTAL, +// self.work_queues.sync_message_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_SYNC_CONTRIBUTION_QUEUE_TOTAL, +// self.work_queues.sync_contribution_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_GOSSIP_BLOCK_QUEUE_TOTAL, +// self.work_queues.gossip_block_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_GOSSIP_BLOB_QUEUE_TOTAL, +// self.work_queues.gossip_blob_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_GOSSIP_DATA_COLUMN_QUEUE_TOTAL, +// self.work_queues.gossip_data_column_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_RPC_BLOCK_QUEUE_TOTAL, +// self.work_queues.rpc_block_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_RPC_BLOB_QUEUE_TOTAL, +// self.work_queues.rpc_blob_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_RPC_CUSTODY_COLUMN_QUEUE_TOTAL, +// self.work_queues.rpc_custody_column_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_RPC_VERIFY_DATA_COLUMN_QUEUE_TOTAL, +// self.work_queues.rpc_verify_data_column_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_SAMPLING_RESULT_QUEUE_TOTAL, +// self.work_queues.sampling_result_queue.len() as i64, +// 
); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_CHAIN_SEGMENT_QUEUE_TOTAL, +// self.work_queues.chain_segment_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_QUEUE_TOTAL, +// self.work_queues.backfill_chain_segment.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_EXIT_QUEUE_TOTAL, +// self.work_queues.gossip_voluntary_exit_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_PROPOSER_SLASHING_QUEUE_TOTAL, +// self.work_queues.gossip_proposer_slashing_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_ATTESTER_SLASHING_QUEUE_TOTAL, +// self.work_queues.gossip_attester_slashing_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_BLS_TO_EXECUTION_CHANGE_QUEUE_TOTAL, +// self.work_queues.gossip_bls_to_execution_change_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_API_REQUEST_P0_QUEUE_TOTAL, +// self.work_queues.api_request_p0_queue.len() as i64, +// ); +// metrics::set_gauge( +// &metrics::BEACON_PROCESSOR_API_REQUEST_P1_QUEUE_TOTAL, +// self.work_queues.api_request_p1_queue.len() as i64, +// ); + +// if self.work_queues.aggregate_queue.is_full() +// && self.work_queues.aggregate_debounce.elapsed() +// { +// error!( +// self.beacon_processor.log, +// "Aggregate attestation queue full"; +// "msg" => "the system has insufficient resources for load", +// "queue_len" => self.work_queues.aggregate_queue.max_length, +// ) +// } + +// if self.work_queues.attestation_queue.is_full() +// && self.work_queues.attestation_debounce.elapsed() +// { +// error!( +// self.beacon_processor.log, +// "Attestation queue full"; +// "msg" => "the system has insufficient resources for load", +// "queue_len" => self.work_queues.attestation_queue.max_length, +// ) +// } +// } + +// // TODO(beacon-processor) this can live outside of this struct in a more general location +// fn 
worker_journal( +// &self, +// work_event: &Option>, +// work_journal_tx: &Option>, +// ) { +// if let Some(work_journal_tx) = work_journal_tx { +// let id = work_event +// .as_ref() +// .map(|event| event.work.str_id()) +// .unwrap_or(WORKER_FREED); + +// // We don't care if this message was successfully sent, we only use the journal +// // during testing. +// let _ = work_journal_tx.try_send(id); +// } +// } + +// // TODO(beacon-processor) this can live outside of this struct in a more general location +// fn increment_metrics(&self, work_event: &Option>) -> Option { +// let _event_timer = metrics::start_timer(&metrics::BEACON_PROCESSOR_EVENT_HANDLING_SECONDS); +// if let Some(event) = work_event { +// metrics::inc_counter_vec( +// &metrics::BEACON_PROCESSOR_WORK_EVENTS_RX_COUNT, +// &[event.work.str_id()], +// ); +// } else { +// metrics::inc_counter(&metrics::BEACON_PROCESSOR_IDLE_EVENTS_TOTAL); +// } +// _event_timer +// } +// } diff --git a/beacon_node/http_api/src/test_utils.rs b/beacon_node/http_api/src/test_utils.rs index dcd494a880f..529e79999d0 100644 --- a/beacon_node/http_api/src/test_utils.rs +++ b/beacon_node/http_api/src/test_utils.rs @@ -189,8 +189,6 @@ pub async fn create_api_server( let BeaconProcessorChannels { beacon_processor_tx, beacon_processor_rx, - work_reprocessing_tx, - work_reprocessing_rx, } = BeaconProcessorChannels::new(&beacon_processor_config); let beacon_processor_send = beacon_processor_tx; @@ -204,8 +202,6 @@ pub async fn create_api_server( } .spawn_manager( beacon_processor_rx, - work_reprocessing_tx, - work_reprocessing_rx, None, chain.slot_clock.clone(), chain.spec.maximum_gossip_clock_disparity(), diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index d5d83d540a0..ac967592ef9 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs 
@@ -49,8 +49,7 @@ use beacon_processor::{ work_reprocessing_queue::{ QueuedAggregate, QueuedGossipBlock, QueuedLightClientUpdate, QueuedUnaggregate, ReprocessQueueMessage, - }, - DuplicateCache, GossipAggregatePackage, GossipAttestationPackage, + }, BeaconProcessorSend, DuplicateCache, GossipAggregatePackage, GossipAttestationPackage }; /// Set to `true` to introduce stricter penalties for peers who send some types of late consensus @@ -1391,7 +1390,7 @@ impl NetworkBeaconProcessor { let inner_self = self.clone(); let process_fn = Box::pin(async move { - let reprocess_tx = inner_self.reprocess_tx.clone(); + let reprocess_tx = inner_self.beacon_processor_send.clone(); let invalid_block_storage = inner_self.invalid_block_storage.clone(); inner_self .process_gossip_verified_block( @@ -1443,7 +1442,7 @@ impl NetworkBeaconProcessor { self: Arc, peer_id: PeerId, verified_block: GossipVerifiedBlock, - reprocess_tx: mpsc::Sender, + reprocess_tx: BeaconProcessorSend, invalid_block_storage: InvalidBlockStorage, _seen_duration: Duration, ) { @@ -1480,10 +1479,10 @@ impl NetworkBeaconProcessor { metrics::inc_counter(&metrics::BEACON_PROCESSOR_GOSSIP_BLOCK_IMPORTED_TOTAL); if reprocess_tx - .try_send(ReprocessQueueMessage::BlockImported { + .try_send(Work::Reprocess(ReprocessQueueMessage::BlockImported { block_root: *block_root, parent_root: block.message().parent_root(), - }) + })) .is_err() { error!( diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 7f551c544c7..5d905a20c8d 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -54,7 +54,6 @@ pub struct NetworkBeaconProcessor { pub chain: Arc>, pub network_tx: mpsc::UnboundedSender>, pub sync_tx: mpsc::UnboundedSender>, - pub reprocess_tx: mpsc::Sender, pub network_globals: Arc>, pub invalid_block_storage: InvalidBlockStorage, pub executor: TaskExecutor, @@ -786,8 
+785,6 @@ impl NetworkBeaconProcessor> { let BeaconProcessorChannels { beacon_processor_tx, beacon_processor_rx, - work_reprocessing_tx, - work_reprocessing_rx: _work_reprocessing_rx, } = <_>::default(); let (network_tx, _network_rx) = mpsc::unbounded_channel(); @@ -799,7 +796,6 @@ impl NetworkBeaconProcessor> { chain, network_tx, sync_tx, - reprocess_tx: work_reprocessing_tx, network_globals, invalid_block_storage: InvalidBlockStorage::Disabled, executor, diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index 40c69a0baa5..287094cae64 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -190,8 +190,6 @@ impl TestRig { let BeaconProcessorChannels { beacon_processor_tx, beacon_processor_rx, - work_reprocessing_tx, - work_reprocessing_rx, } = BeaconProcessorChannels::new(&beacon_processor_config); let (sync_tx, _sync_rx) = mpsc::unbounded_channel(); @@ -224,7 +222,6 @@ impl TestRig { chain: harness.chain.clone(), network_tx, sync_tx, - reprocess_tx: work_reprocessing_tx.clone(), network_globals: network_globals.clone(), invalid_block_storage: InvalidBlockStorage::Disabled, executor: executor.clone(), diff --git a/beacon_node/network/src/service/tests.rs b/beacon_node/network/src/service/tests.rs index b5731876968..828bd15bceb 100644 --- a/beacon_node/network/src/service/tests.rs +++ b/beacon_node/network/src/service/tests.rs @@ -80,8 +80,6 @@ mod tests { let BeaconProcessorChannels { beacon_processor_tx, beacon_processor_rx: _beacon_processor_rx, - work_reprocessing_tx, - work_reprocessing_rx: _work_reprocessing_rx, } = <_>::default(); let _network_service = NetworkService::start( From 18fb43e0652a61e5363a3545e0b7ab45d1d744eb Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Mon, 30 Sep 2024 10:09:24 -0700 Subject: [PATCH 03/16] modularizing beacon processor --- beacon_node/beacon_processor/src/lib.rs 
| 1453 +++-------------- .../src/scheduler/interface.rs | 29 +- .../beacon_processor/src/scheduler/mod.rs | 2 +- .../src/scheduler/priority_scheduler.rs | 867 ---------- .../src/scheduler/priority_scheduler/mod.rs | 1020 ++++++++++++ .../priority_scheduler/work_queue.rs | 350 ++++ .../work_reprocessing_queue.rs | 1145 +++++++++++++ beacon_node/client/src/builder.rs | 25 +- .../src/compute_light_client_updates.rs | 13 +- beacon_node/http_api/src/lib.rs | 13 +- .../http_api/src/publish_attestations.rs | 20 +- beacon_node/http_api/src/task_spawner.rs | 27 + beacon_node/http_api/src/test_utils.rs | 11 +- .../gossip_methods.rs | 108 +- .../src/network_beacon_processor/mod.rs | 29 +- .../network_beacon_processor/sync_methods.rs | 25 +- beacon_node/network/src/router.rs | 6 +- beacon_node/network/src/service.rs | 6 +- 18 files changed, 2900 insertions(+), 2249 deletions(-) delete mode 100644 beacon_node/beacon_processor/src/scheduler/priority_scheduler.rs create mode 100644 beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs create mode 100644 beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_queue.rs create mode 100644 beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs diff --git a/beacon_node/beacon_processor/src/lib.rs b/beacon_node/beacon_processor/src/lib.rs index 4ad67e30565..8e27354fa1a 100644 --- a/beacon_node/beacon_processor/src/lib.rs +++ b/beacon_node/beacon_processor/src/lib.rs @@ -38,14 +38,13 @@ //! checks the queues to see if there are more parcels of work that can be spawned in a new worker //! task. 
mod scheduler; -use crate::work_reprocessing_queue::{ - QueuedBackfillBatch, QueuedGossipBlock, ReprocessQueueMessage, -}; +use crate::scheduler::interface::SchedulerType; use futures::stream::{Stream, StreamExt}; use futures::task::Poll; use lighthouse_network::{MessageId, NetworkGlobals, PeerId}; use logging::TimeLatch; use parking_lot::Mutex; +use scheduler::interface::Scheduler; use serde::{Deserialize, Serialize}; use slog::{crit, debug, error, trace, warn, Logger}; use slot_clock::SlotClock; @@ -57,6 +56,7 @@ use std::pin::Pin; use std::sync::Arc; use std::task::Context; use std::time::Duration; +use strum::AsRefStr; use strum::IntoStaticStr; use task_executor::TaskExecutor; use tokio::sync::mpsc; @@ -65,14 +65,7 @@ use types::{ Attestation, BeaconState, ChainSpec, Hash256, RelativeEpoch, SignedAggregateAndProof, SubnetId, }; use types::{EthSpec, Slot}; -use work_reprocessing_queue::{ - spawn_reprocess_scheduler, QueuedAggregate, QueuedLightClientUpdate, QueuedRpcBlock, - QueuedUnaggregate, ReadyWork, -}; -use work_reprocessing_queue::{IgnoredRpcBlock, QueuedSamplingRequest}; - mod metrics; -pub mod work_reprocessing_queue; /// The maximum size of the channel for work events to the `BeaconProcessor`. /// @@ -88,115 +81,6 @@ const MAX_IDLE_QUEUE_LEN: usize = 16_384; /// The maximum size of the channel for re-processing work events. const DEFAULT_MAX_SCHEDULED_WORK_QUEUE_LEN: usize = 3 * DEFAULT_MAX_WORK_EVENT_QUEUE_LEN / 4; -/// Over-provision queues based on active validator count by some factor. The beacon chain has -/// strict churns that prevent the validator set size from changing rapidly. By over-provisioning -/// slightly, we don't need to adjust the queues during the lifetime of a process. 
-const ACTIVE_VALIDATOR_COUNT_OVERPROVISION_PERCENT: usize = 110; - -/// Maximum number of queued items that will be stored before dropping them -pub struct BeaconProcessorQueueLengths { - aggregate_queue: usize, - attestation_queue: usize, - unknown_block_aggregate_queue: usize, - unknown_block_attestation_queue: usize, - sync_message_queue: usize, - sync_contribution_queue: usize, - gossip_voluntary_exit_queue: usize, - gossip_proposer_slashing_queue: usize, - gossip_attester_slashing_queue: usize, - finality_update_queue: usize, - optimistic_update_queue: usize, - unknown_light_client_update_queue: usize, - unknown_block_sampling_request_queue: usize, - rpc_block_queue: usize, - rpc_blob_queue: usize, - rpc_custody_column_queue: usize, - rpc_verify_data_column_queue: usize, - sampling_result_queue: usize, - chain_segment_queue: usize, - backfill_chain_segment: usize, - gossip_block_queue: usize, - gossip_blob_queue: usize, - gossip_data_column_queue: usize, - delayed_block_queue: usize, - status_queue: usize, - bbrange_queue: usize, - bbroots_queue: usize, - blbroots_queue: usize, - blbrange_queue: usize, - dcbroots_queue: usize, - dcbrange_queue: usize, - gossip_bls_to_execution_change_queue: usize, - lc_bootstrap_queue: usize, - lc_optimistic_update_queue: usize, - lc_finality_update_queue: usize, - api_request_p0_queue: usize, - api_request_p1_queue: usize, -} - -impl BeaconProcessorQueueLengths { - pub fn from_state( - state: &BeaconState, - spec: &ChainSpec, - ) -> Result { - let active_validator_count = - match state.get_cached_active_validator_indices(RelativeEpoch::Current) { - Ok(indices) => indices.len(), - Err(_) => state - .get_active_validator_indices(state.current_epoch(), spec) - .map_err(|e| format!("Error computing active indices: {:?}", e))? 
- .len(), - }; - let active_validator_count = - (ACTIVE_VALIDATOR_COUNT_OVERPROVISION_PERCENT * active_validator_count) / 100; - let slots_per_epoch = E::slots_per_epoch() as usize; - - Ok(Self { - aggregate_queue: 4096, - unknown_block_aggregate_queue: 1024, - // Capacity for a full slot's worth of attestations if subscribed to all subnets - attestation_queue: active_validator_count / slots_per_epoch, - // Capacity for a full slot's worth of attestations if subscribed to all subnets - unknown_block_attestation_queue: active_validator_count / slots_per_epoch, - sync_message_queue: 2048, - sync_contribution_queue: 1024, - gossip_voluntary_exit_queue: 4096, - gossip_proposer_slashing_queue: 4096, - gossip_attester_slashing_queue: 4096, - finality_update_queue: 1024, - optimistic_update_queue: 1024, - unknown_block_sampling_request_queue: 16384, - unknown_light_client_update_queue: 128, - rpc_block_queue: 1024, - rpc_blob_queue: 1024, - // TODO(das): Placeholder values - rpc_custody_column_queue: 1000, - rpc_verify_data_column_queue: 1000, - sampling_result_queue: 1000, - chain_segment_queue: 64, - backfill_chain_segment: 64, - gossip_block_queue: 1024, - gossip_blob_queue: 1024, - gossip_data_column_queue: 1024, - delayed_block_queue: 1024, - status_queue: 1024, - bbrange_queue: 1024, - bbroots_queue: 1024, - blbroots_queue: 1024, - blbrange_queue: 1024, - // TODO(das): pick proper values - dcbroots_queue: 1024, - dcbrange_queue: 1024, - gossip_bls_to_execution_change_queue: 16384, - lc_bootstrap_queue: 1024, - lc_optimistic_update_queue: 512, - lc_finality_update_queue: 512, - api_request_p0_queue: 1024, - api_request_p1_queue: 1024, - }) - } -} - /// The name of the manager tokio task. 
const MANAGER_TASK_NAME: &str = "beacon_processor_manager"; @@ -254,7 +138,6 @@ impl BeaconProcessorChannels { pub fn new(config: &BeaconProcessorConfig) -> Self { let (beacon_processor_tx, beacon_processor_rx) = mpsc::channel(config.max_work_event_queue_len); - Self { beacon_processor_tx: BeaconProcessorSend(beacon_processor_tx), @@ -269,90 +152,6 @@ impl Default for BeaconProcessorChannels { } } -/// A simple first-in-first-out queue with a maximum length. -struct FifoQueue { - queue: VecDeque, - max_length: usize, -} - -impl FifoQueue { - /// Create a new, empty queue with the given length. - pub fn new(max_length: usize) -> Self { - Self { - queue: VecDeque::default(), - max_length, - } - } - - /// Add a new item to the queue. - /// - /// Drops `item` if the queue is full. - pub fn push(&mut self, item: T, item_desc: &str, log: &Logger) { - if self.queue.len() == self.max_length { - error!( - log, - "Work queue is full"; - "msg" => "the system has insufficient resources for load", - "queue_len" => self.max_length, - "queue" => item_desc, - ) - } else { - self.queue.push_back(item); - } - } - - /// Remove the next item from the queue. - pub fn pop(&mut self) -> Option { - self.queue.pop_front() - } - - /// Returns the current length of the queue. - pub fn len(&self) -> usize { - self.queue.len() - } -} - -/// A simple last-in-first-out queue with a maximum length. -struct LifoQueue { - queue: VecDeque, - max_length: usize, -} - -impl LifoQueue { - /// Create a new, empty queue with the given length. - pub fn new(max_length: usize) -> Self { - Self { - queue: VecDeque::default(), - max_length, - } - } - - /// Add a new item to the front of the queue. - /// - /// If the queue is full, the item at the back of the queue is dropped. - pub fn push(&mut self, item: T) { - if self.queue.len() == self.max_length { - self.queue.pop_back(); - } - self.queue.push_front(item); - } - - /// Remove the next item from the queue. 
- pub fn pop(&mut self) -> Option { - self.queue.pop_front() - } - - /// Returns `true` if the queue is full. - pub fn is_full(&self) -> bool { - self.queue.len() >= self.max_length - } - - /// Returns the current length of the queue. - pub fn len(&self) -> usize { - self.queue.len() - } -} - /// A handle that sends a message on the provided channel to a receiver when it gets dropped. /// /// The receiver task is responsible for removing the provided `entry` from the `DuplicateCache` @@ -402,6 +201,111 @@ impl DuplicateCache { } } +#[derive(IntoStaticStr, PartialEq, Eq, Debug)] +#[strum(serialize_all = "snake_case")] +pub enum WorkType { + GossipAttestation, + UnknownBlockAttestation, + GossipAttestationBatch, + GossipAggregate, + UnknownBlockAggregate, + UnknownLightClientOptimisticUpdate, + UnknownBlockSamplingRequest, + GossipAggregateBatch, + GossipBlock, + GossipBlobSidecar, + GossipDataColumnSidecar, + DelayedImportBlock, + GossipVoluntaryExit, + GossipProposerSlashing, + GossipAttesterSlashing, + GossipSyncSignature, + GossipSyncContribution, + GossipLightClientFinalityUpdate, + GossipLightClientOptimisticUpdate, + RpcBlock, + RpcBlobs, + RpcCustodyColumn, + RpcVerifyDataColumn, + SamplingResult, + IgnoredRpcBlock, + ChainSegment, + ChainSegmentBackfill, + Status, + BlocksByRangeRequest, + BlocksByRootsRequest, + BlobsByRangeRequest, + BlobsByRootsRequest, + DataColumnsByRootsRequest, + DataColumnsByRangeRequest, + GossipBlsToExecutionChange, + LightClientBootstrapRequest, + LightClientOptimisticUpdateRequest, + LightClientFinalityUpdateRequest, + ApiRequestP0, + ApiRequestP1, + Reprocess, +} + +impl Work { + pub fn str_id(&self) -> &'static str { + self.to_type().into() + } + + /// Provides a `&str` that uniquely identifies each enum variant. + pub fn to_type(&self) -> WorkType { + match self { + Work::GossipAttestation { .. } => WorkType::GossipAttestation, + Work::GossipAttestationBatch { .. 
} => WorkType::GossipAttestationBatch, + Work::GossipAggregate { .. } => WorkType::GossipAggregate, + Work::GossipAggregateBatch { .. } => WorkType::GossipAggregateBatch, + Work::GossipBlock(_) => WorkType::GossipBlock, + Work::GossipBlobSidecar(_) => WorkType::GossipBlobSidecar, + Work::GossipDataColumnSidecar(_) => WorkType::GossipDataColumnSidecar, + Work::DelayedImportBlock { .. } => WorkType::DelayedImportBlock, + Work::GossipVoluntaryExit(_) => WorkType::GossipVoluntaryExit, + Work::GossipProposerSlashing(_) => WorkType::GossipProposerSlashing, + Work::GossipAttesterSlashing(_) => WorkType::GossipAttesterSlashing, + Work::GossipSyncSignature(_) => WorkType::GossipSyncSignature, + Work::GossipSyncContribution(_) => WorkType::GossipSyncContribution, + Work::GossipLightClientFinalityUpdate(_) => WorkType::GossipLightClientFinalityUpdate, + Work::GossipLightClientOptimisticUpdate(_) => { + WorkType::GossipLightClientOptimisticUpdate + } + Work::GossipBlsToExecutionChange(_) => WorkType::GossipBlsToExecutionChange, + Work::RpcBlock { .. } => WorkType::RpcBlock, + Work::RpcBlobs { .. } => WorkType::RpcBlobs, + Work::RpcCustodyColumn { .. } => WorkType::RpcCustodyColumn, + Work::RpcVerifyDataColumn { .. } => WorkType::RpcVerifyDataColumn, + Work::SamplingResult { .. } => WorkType::SamplingResult, + Work::IgnoredRpcBlock { .. } => WorkType::IgnoredRpcBlock, + Work::ChainSegment { .. 
} => WorkType::ChainSegment, + Work::ChainSegmentBackfill(_) => WorkType::ChainSegmentBackfill, + Work::Status(_) => WorkType::Status, + Work::BlocksByRangeRequest(_) => WorkType::BlocksByRangeRequest, + Work::BlocksByRootsRequest(_) => WorkType::BlocksByRootsRequest, + Work::BlobsByRangeRequest(_) => WorkType::BlobsByRangeRequest, + Work::BlobsByRootsRequest(_) => WorkType::BlobsByRootsRequest, + Work::DataColumnsByRootsRequest(_) => WorkType::DataColumnsByRootsRequest, + Work::DataColumnsByRangeRequest(_) => WorkType::DataColumnsByRangeRequest, + Work::LightClientBootstrapRequest(_) => WorkType::LightClientBootstrapRequest, + Work::LightClientOptimisticUpdateRequest(_) => { + WorkType::LightClientOptimisticUpdateRequest + } + Work::LightClientFinalityUpdateRequest(_) => WorkType::LightClientFinalityUpdateRequest, + Work::UnknownBlockAttestation { .. } => WorkType::UnknownBlockAttestation, + Work::UnknownBlockAggregate { .. } => WorkType::UnknownBlockAggregate, + Work::UnknownBlockSamplingRequest { .. } => WorkType::UnknownBlockSamplingRequest, + Work::UnknownLightClientOptimisticUpdate { .. } => { + WorkType::UnknownLightClientOptimisticUpdate + } + Work::ApiRequestP0 { .. } => WorkType::ApiRequestP0, + Work::ApiRequestP1 { .. } => WorkType::ApiRequestP1, + Work::Reprocess { .. } => WorkType::Reprocess, + } + } +} + /// An event to be processed by the manager task. 
#[derive(Debug)] pub struct WorkEvent { @@ -421,69 +325,6 @@ impl WorkEvent { } } -impl From for WorkEvent { - fn from(ready_work: ReadyWork) -> Self { - match ready_work { - ReadyWork::Block(QueuedGossipBlock { - beacon_block_slot, - beacon_block_root, - process_fn, - }) => Self { - drop_during_sync: false, - work: Work::DelayedImportBlock { - beacon_block_slot, - beacon_block_root, - process_fn, - }, - }, - ReadyWork::RpcBlock(QueuedRpcBlock { - beacon_block_root: _, - process_fn, - ignore_fn: _, - }) => Self { - drop_during_sync: false, - work: Work::RpcBlock { process_fn }, - }, - ReadyWork::IgnoredRpcBlock(IgnoredRpcBlock { process_fn }) => Self { - drop_during_sync: false, - work: Work::IgnoredRpcBlock { process_fn }, - }, - ReadyWork::Unaggregate(QueuedUnaggregate { - beacon_block_root: _, - process_fn, - }) => Self { - drop_during_sync: true, - work: Work::UnknownBlockAttestation { process_fn }, - }, - ReadyWork::Aggregate(QueuedAggregate { - process_fn, - beacon_block_root: _, - }) => Self { - drop_during_sync: true, - work: Work::UnknownBlockAggregate { process_fn }, - }, - ReadyWork::LightClientUpdate(QueuedLightClientUpdate { - parent_root, - process_fn, - }) => Self { - drop_during_sync: true, - work: Work::UnknownLightClientOptimisticUpdate { - parent_root, - process_fn, - }, - }, - ReadyWork::SamplingRequest(QueuedSamplingRequest { process_fn, .. }) => Self { - drop_during_sync: true, - work: Work::UnknownBlockSamplingRequest { process_fn }, - }, - ReadyWork::BackfillSync(QueuedBackfillBatch(process_fn)) => Self { - drop_during_sync: false, - work: Work::ChainSegmentBackfill(process_fn), - }, - } - } -} - /// Items required to verify a batch of unaggregated gossip attestations. #[derive(Debug)] pub struct GossipAttestationPackage { @@ -532,6 +373,111 @@ pub enum BlockingOrAsync { Async(AsyncFn), } +/// Messages that the scheduler can receive. 
+#[derive(AsRefStr)] +pub enum ReprocessQueueMessage { + /// A block that has been received early and we should queue for later processing. + EarlyBlock(QueuedGossipBlock), + /// A gossip block for hash `X` is being imported, we should queue the rpc block for the same + /// hash until the gossip block is imported. + RpcBlock(QueuedRpcBlock), + /// A block that was successfully processed. We use this to handle attestations updates + /// for unknown blocks. + BlockImported { + block_root: Hash256, + parent_root: Hash256, + }, + /// A new `LightClientOptimisticUpdate` has been produced. We use this to handle light client + /// updates for unknown parent blocks. + NewLightClientOptimisticUpdate { parent_root: Hash256 }, + /// An unaggregated attestation that references an unknown block. + UnknownBlockUnaggregate(QueuedUnaggregate), + /// An aggregated attestation that references an unknown block. + UnknownBlockAggregate(QueuedAggregate), + /// A light client optimistic update that references a parent root that has not been seen as a parent. + UnknownLightClientOptimisticUpdate(QueuedLightClientUpdate), + /// A sampling request that references an unknown block. + UnknownBlockSamplingRequest(QueuedSamplingRequest), + /// A new backfill batch that needs to be scheduled for processing. + BackfillSync(QueuedBackfillBatch), +} + +/// An Attestation for which the corresponding block was not seen while processing, queued for +/// later. +pub struct QueuedUnaggregate { + pub beacon_block_root: Hash256, + pub process_fn: BlockingFn, +} + +/// An aggregated attestation for which the corresponding block was not seen while processing, queued for +/// later. +pub struct QueuedAggregate { + pub beacon_block_root: Hash256, + pub process_fn: BlockingFn, +} + +/// A light client update for which the corresponding parent block was not seen while processing, +/// queued for later. 
+pub struct QueuedLightClientUpdate { + pub parent_root: Hash256, + pub process_fn: BlockingFn, +} + +/// A sampling request for which the corresponding block is not known while processing. +pub struct QueuedSamplingRequest { + pub beacon_block_root: Hash256, + pub process_fn: BlockingFn, +} + +/// A block that arrived early and has been queued for later import. +pub struct QueuedGossipBlock { + pub beacon_block_slot: Slot, + pub beacon_block_root: Hash256, + pub process_fn: AsyncFn, +} + +/// A block that arrived for processing when the same block was being imported over gossip. +/// It is queued for later import. +pub struct QueuedRpcBlock { + pub beacon_block_root: Hash256, + /// Processes/imports the block. + pub process_fn: AsyncFn, + /// Ignores the block. + pub ignore_fn: BlockingFn, +} + +/// A block that arrived for processing when the same block was being imported over gossip. +/// It is queued for later import. +pub struct IgnoredRpcBlock { + pub process_fn: BlockingFn, +} + +/// A backfill batch work that has been queued for processing later. +pub struct QueuedBackfillBatch(pub AsyncFn); + +impl TryFrom> for QueuedBackfillBatch { + type Error = WorkEvent; + + fn try_from(event: WorkEvent) -> Result> { + match event { + WorkEvent { + work: Work::ChainSegmentBackfill(process_fn), + .. + } => Ok(QueuedBackfillBatch(process_fn)), + _ => Err(event), + } + } +} + +impl From for WorkEvent { + fn from(queued_backfill_batch: QueuedBackfillBatch) -> WorkEvent { + WorkEvent { + drop_during_sync: false, + work: Work::ChainSegmentBackfill(queued_backfill_batch.0), + } + } +} + /// Indicates the type of work to be performed and therefore its priority and /// queuing specifics. 
pub enum Work { @@ -608,7 +554,7 @@ pub enum Work { LightClientFinalityUpdateRequest(BlockingFn), ApiRequestP0(BlockingOrAsync), ApiRequestP1(BlockingOrAsync), - Reprocess(BlockingFn), + Reprocess(ReprocessQueueMessage), } impl fmt::Debug for Work { @@ -617,174 +563,6 @@ impl fmt::Debug for Work { } } -#[derive(IntoStaticStr, PartialEq, Eq, Debug)] -#[strum(serialize_all = "snake_case")] -pub enum WorkType { - GossipAttestation, - UnknownBlockAttestation, - GossipAttestationBatch, - GossipAggregate, - UnknownBlockAggregate, - UnknownLightClientOptimisticUpdate, - UnknownBlockSamplingRequest, - GossipAggregateBatch, - GossipBlock, - GossipBlobSidecar, - GossipDataColumnSidecar, - DelayedImportBlock, - GossipVoluntaryExit, - GossipProposerSlashing, - GossipAttesterSlashing, - GossipSyncSignature, - GossipSyncContribution, - GossipLightClientFinalityUpdate, - GossipLightClientOptimisticUpdate, - RpcBlock, - RpcBlobs, - RpcCustodyColumn, - RpcVerifyDataColumn, - SamplingResult, - IgnoredRpcBlock, - ChainSegment, - ChainSegmentBackfill, - Status, - BlocksByRangeRequest, - BlocksByRootsRequest, - BlobsByRangeRequest, - BlobsByRootsRequest, - DataColumnsByRootsRequest, - DataColumnsByRangeRequest, - GossipBlsToExecutionChange, - LightClientBootstrapRequest, - LightClientOptimisticUpdateRequest, - LightClientFinalityUpdateRequest, - ApiRequestP0, - ApiRequestP1, -} - -impl Work { - fn str_id(&self) -> &'static str { - self.to_type().into() - } - - /// Provides a `&str` that uniquely identifies each enum variant. - fn to_type(&self) -> WorkType { - match self { - Work::GossipAttestation { .. } => WorkType::GossipAttestation, - Work::GossipAttestationBatch { .. } => WorkType::GossipAttestationBatch, - Work::GossipAggregate { .. } => WorkType::GossipAggregate, - Work::GossipAggregateBatch { .. 
} => WorkType::GossipAggregateBatch, - Work::GossipBlock(_) => WorkType::GossipBlock, - Work::GossipBlobSidecar(_) => WorkType::GossipBlobSidecar, - Work::GossipDataColumnSidecar(_) => WorkType::GossipDataColumnSidecar, - Work::DelayedImportBlock { .. } => WorkType::DelayedImportBlock, - Work::GossipVoluntaryExit(_) => WorkType::GossipVoluntaryExit, - Work::GossipProposerSlashing(_) => WorkType::GossipProposerSlashing, - Work::GossipAttesterSlashing(_) => WorkType::GossipAttesterSlashing, - Work::GossipSyncSignature(_) => WorkType::GossipSyncSignature, - Work::GossipSyncContribution(_) => WorkType::GossipSyncContribution, - Work::GossipLightClientFinalityUpdate(_) => WorkType::GossipLightClientFinalityUpdate, - Work::GossipLightClientOptimisticUpdate(_) => { - WorkType::GossipLightClientOptimisticUpdate - } - Work::GossipBlsToExecutionChange(_) => WorkType::GossipBlsToExecutionChange, - Work::RpcBlock { .. } => WorkType::RpcBlock, - Work::RpcBlobs { .. } => WorkType::RpcBlobs, - Work::RpcCustodyColumn { .. } => WorkType::RpcCustodyColumn, - Work::RpcVerifyDataColumn { .. } => WorkType::RpcVerifyDataColumn, - Work::SamplingResult { .. } => WorkType::SamplingResult, - Work::IgnoredRpcBlock { .. } => WorkType::IgnoredRpcBlock, - Work::ChainSegment { .. 
} => WorkType::ChainSegment, - Work::ChainSegmentBackfill(_) => WorkType::ChainSegmentBackfill, - Work::Status(_) => WorkType::Status, - Work::BlocksByRangeRequest(_) => WorkType::BlocksByRangeRequest, - Work::BlocksByRootsRequest(_) => WorkType::BlocksByRootsRequest, - Work::BlobsByRangeRequest(_) => WorkType::BlobsByRangeRequest, - Work::BlobsByRootsRequest(_) => WorkType::BlobsByRootsRequest, - Work::DataColumnsByRootsRequest(_) => WorkType::DataColumnsByRootsRequest, - Work::DataColumnsByRangeRequest(_) => WorkType::DataColumnsByRangeRequest, - Work::LightClientBootstrapRequest(_) => WorkType::LightClientBootstrapRequest, - Work::LightClientOptimisticUpdateRequest(_) => { - WorkType::LightClientOptimisticUpdateRequest - } - Work::LightClientFinalityUpdateRequest(_) => WorkType::LightClientFinalityUpdateRequest, - Work::UnknownBlockAttestation { .. } => WorkType::UnknownBlockAttestation, - Work::UnknownBlockAggregate { .. } => WorkType::UnknownBlockAggregate, - Work::UnknownBlockSamplingRequest { .. } => WorkType::UnknownBlockSamplingRequest, - Work::UnknownLightClientOptimisticUpdate { .. } => { - WorkType::UnknownLightClientOptimisticUpdate - } - Work::ApiRequestP0 { .. } => WorkType::ApiRequestP0, - Work::ApiRequestP1 { .. } => WorkType::ApiRequestP1, - } - } -} - -/// Unifies all the messages processed by the `BeaconProcessor`. -enum InboundEvent { - /// A worker has completed a task and is free. - WorkerIdle, - /// There is new work to be done. - WorkEvent(WorkEvent), - /// A work event that was queued for re-processing has become ready. - ReprocessingWork(WorkEvent), -} - -/// Combines the various incoming event streams for the `BeaconProcessor` into a single stream. -/// -/// This struct has a similar purpose to `tokio::select!`, however it allows for more fine-grained -/// control (specifically in the ordering of event processing). -struct InboundEvents { - /// Used by workers when they finish a task. 
- idle_rx: mpsc::Receiver<()>, - /// Used by upstream processes to send new work to the `BeaconProcessor`. - event_rx: mpsc::Receiver>, - /// Used internally for queuing work ready to be re-processed. - reprocess_work_rx: mpsc::Receiver, -} - -impl Stream for InboundEvents { - type Item = InboundEvent; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - // Always check for idle workers before anything else. This allows us to ensure that a big - // stream of new events doesn't suppress the processing of existing events. - match self.idle_rx.poll_recv(cx) { - Poll::Ready(Some(())) => { - return Poll::Ready(Some(InboundEvent::WorkerIdle)); - } - Poll::Ready(None) => { - return Poll::Ready(None); - } - Poll::Pending => {} - } - - // Poll for delayed blocks before polling for new work. It might be the case that a delayed - // block is required to successfully process some new work. - match self.reprocess_work_rx.poll_recv(cx) { - Poll::Ready(Some(ready_work)) => { - return Poll::Ready(Some(InboundEvent::ReprocessingWork(ready_work.into()))); - } - Poll::Ready(None) => { - return Poll::Ready(None); - } - Poll::Pending => {} - } - - match self.event_rx.poll_recv(cx) { - Poll::Ready(Some(event)) => { - return Poll::Ready(Some(InboundEvent::WorkEvent(event))); - } - Poll::Ready(None) => { - return Poll::Ready(None); - } - Poll::Pending => {} - } - - Poll::Pending - } -} - /// A mutli-threaded processor for messages received on the network /// that need to be processed by the `BeaconChain` /// @@ -811,810 +589,15 @@ impl BeaconProcessor { /// events processed by `self`. This should only be used during testing. 
#[allow(clippy::too_many_arguments)] pub fn spawn_manager( - mut self, + self, + beacon_state: &BeaconState, event_rx: mpsc::Receiver>, work_journal_tx: Option>, slot_clock: S, maximum_gossip_clock_disparity: Duration, - queue_lengths: BeaconProcessorQueueLengths, ) -> Result<(), String> { - // Used by workers to communicate that they are finished a task. - let (idle_tx, idle_rx) = mpsc::channel::<()>(MAX_IDLE_QUEUE_LEN); - - let (work_reprocessing_tx, work_reprocessing_rx) = mpsc::channel(self.config.max_scheduled_work_queue_len); - - // Using LIFO queues for attestations since validator profits rely upon getting fresh - // attestations into blocks. Additionally, later attestations contain more information than - // earlier ones, so we consider them more valuable. - let mut aggregate_queue = LifoQueue::new(queue_lengths.aggregate_queue); - let mut aggregate_debounce = TimeLatch::default(); - let mut attestation_queue = LifoQueue::new(queue_lengths.attestation_queue); - let mut attestation_debounce = TimeLatch::default(); - let mut unknown_block_aggregate_queue = - LifoQueue::new(queue_lengths.unknown_block_aggregate_queue); - let mut unknown_block_attestation_queue = - LifoQueue::new(queue_lengths.unknown_block_attestation_queue); - - let mut sync_message_queue = LifoQueue::new(queue_lengths.sync_message_queue); - let mut sync_contribution_queue = LifoQueue::new(queue_lengths.sync_contribution_queue); - - // Using a FIFO queue for voluntary exits since it prevents exit censoring. I don't have - // a strong feeling about queue type for exits. - let mut gossip_voluntary_exit_queue = - FifoQueue::new(queue_lengths.gossip_voluntary_exit_queue); - - // Using a FIFO queue for slashing to prevent people from flushing their slashings from the - // queues with lots of junk messages. 
- let mut gossip_proposer_slashing_queue = - FifoQueue::new(queue_lengths.gossip_proposer_slashing_queue); - let mut gossip_attester_slashing_queue = - FifoQueue::new(queue_lengths.gossip_attester_slashing_queue); - - // Using a FIFO queue for light client updates to maintain sequence order. - let mut finality_update_queue = FifoQueue::new(queue_lengths.finality_update_queue); - let mut optimistic_update_queue = FifoQueue::new(queue_lengths.optimistic_update_queue); - let mut unknown_light_client_update_queue = - FifoQueue::new(queue_lengths.unknown_light_client_update_queue); - let mut unknown_block_sampling_request_queue = - FifoQueue::new(queue_lengths.unknown_block_sampling_request_queue); - - // Using a FIFO queue since blocks need to be imported sequentially. - let mut rpc_block_queue = FifoQueue::new(queue_lengths.rpc_block_queue); - let mut rpc_blob_queue = FifoQueue::new(queue_lengths.rpc_blob_queue); - let mut rpc_custody_column_queue = FifoQueue::new(queue_lengths.rpc_custody_column_queue); - let mut rpc_verify_data_column_queue = - FifoQueue::new(queue_lengths.rpc_verify_data_column_queue); - let mut sampling_result_queue = FifoQueue::new(queue_lengths.sampling_result_queue); - let mut chain_segment_queue = FifoQueue::new(queue_lengths.chain_segment_queue); - let mut backfill_chain_segment = FifoQueue::new(queue_lengths.backfill_chain_segment); - let mut gossip_block_queue = FifoQueue::new(queue_lengths.gossip_block_queue); - let mut gossip_blob_queue = FifoQueue::new(queue_lengths.gossip_blob_queue); - let mut gossip_data_column_queue = FifoQueue::new(queue_lengths.gossip_data_column_queue); - let mut delayed_block_queue = FifoQueue::new(queue_lengths.delayed_block_queue); - - let mut status_queue = FifoQueue::new(queue_lengths.status_queue); - let mut bbrange_queue = FifoQueue::new(queue_lengths.bbrange_queue); - let mut bbroots_queue = FifoQueue::new(queue_lengths.bbroots_queue); - let mut blbroots_queue = 
FifoQueue::new(queue_lengths.blbroots_queue); - let mut blbrange_queue = FifoQueue::new(queue_lengths.blbrange_queue); - let mut dcbroots_queue = FifoQueue::new(queue_lengths.dcbroots_queue); - let mut dcbrange_queue = FifoQueue::new(queue_lengths.dcbrange_queue); - - let mut gossip_bls_to_execution_change_queue = - FifoQueue::new(queue_lengths.gossip_bls_to_execution_change_queue); - - let mut lc_bootstrap_queue = FifoQueue::new(queue_lengths.lc_bootstrap_queue); - let mut lc_optimistic_update_queue = - FifoQueue::new(queue_lengths.lc_optimistic_update_queue); - let mut lc_finality_update_queue = FifoQueue::new(queue_lengths.lc_finality_update_queue); - - let mut api_request_p0_queue = FifoQueue::new(queue_lengths.api_request_p0_queue); - let mut api_request_p1_queue = FifoQueue::new(queue_lengths.api_request_p1_queue); - - // Channels for sending work to the re-process scheduler (`work_reprocessing_tx`) and to - // receive them back once they are ready (`ready_work_rx`). - let (ready_work_tx, ready_work_rx) = - mpsc::channel::(self.config.max_scheduled_work_queue_len); - // TODO(beacon-processor) reprocess scheduler - spawn_reprocess_scheduler( - ready_work_tx, - work_reprocessing_rx, - &self.executor, - Arc::new(slot_clock), - self.log.clone(), - maximum_gossip_clock_disparity, - )?; - - let executor = self.executor.clone(); - - // The manager future will run on the core executor and delegate tasks to worker - // threads on the blocking executor. 
- let manager_future = async move { - let mut inbound_events = InboundEvents { - idle_rx, - event_rx, - reprocess_work_rx: ready_work_rx, - }; - - let enable_backfill_rate_limiting = self.config.enable_backfill_rate_limiting; - - loop { - let work_event = match inbound_events.next().await { - Some(InboundEvent::WorkerIdle) => { - self.current_workers = self.current_workers.saturating_sub(1); - None - } - // TODO(beacon-processor) backfill rate limiting is here - Some(InboundEvent::WorkEvent(event)) if enable_backfill_rate_limiting => { - match QueuedBackfillBatch::try_from(event) { - Ok(backfill_batch) => { - match work_reprocessing_tx - .try_send(ReprocessQueueMessage::BackfillSync(backfill_batch)) - { - Err(e) => { - warn!( - self.log, - "Unable to queue backfill work event. Will try to process now."; - "error" => %e - ); - match e { - TrySendError::Full(reprocess_queue_message) - | TrySendError::Closed(reprocess_queue_message) => { - match reprocess_queue_message { - ReprocessQueueMessage::BackfillSync( - backfill_batch, - ) => Some(backfill_batch.into()), - other => { - crit!( - self.log, - "Unexpected queue message type"; - "message_type" => other.as_ref() - ); - // This is an unhandled exception, drop the message. - continue; - } - } - } - } - } - Ok(..) => { - // backfill work sent to "reprocessing" queue. Process the next event. 
- continue; - } - } - } - Err(event) => Some(event), - } - } - Some(InboundEvent::WorkEvent(event)) - | Some(InboundEvent::ReprocessingWork(event)) => Some(event), - None => { - debug!( - self.log, - "Gossip processor stopped"; - "msg" => "stream ended" - ); - break; - } - }; - - let _event_timer = - metrics::start_timer(&metrics::BEACON_PROCESSOR_EVENT_HANDLING_SECONDS); - if let Some(event) = &work_event { - metrics::inc_counter_vec( - &metrics::BEACON_PROCESSOR_WORK_EVENTS_RX_COUNT, - &[event.work.str_id()], - ); - } else { - metrics::inc_counter(&metrics::BEACON_PROCESSOR_IDLE_EVENTS_TOTAL); - } - - if let Some(work_journal_tx) = &work_journal_tx { - let id = work_event - .as_ref() - .map(|event| event.work.str_id()) - .unwrap_or(WORKER_FREED); - - // We don't care if this message was successfully sent, we only use the journal - // during testing. - let _ = work_journal_tx.try_send(id); - } - - let can_spawn = self.current_workers < self.config.max_workers; - let drop_during_sync = work_event - .as_ref() - .map_or(false, |event| event.drop_during_sync); - - let idle_tx = idle_tx.clone(); - let modified_queue_id = match work_event { - // There is no new work event, but we are able to spawn a new worker. - // - // We don't check the `work.drop_during_sync` here. We assume that if it made - // it into the queue at any point then we should process it. - None if can_spawn => { - // Check for chain segments first, they're the most efficient way to get - // blocks into the system. - let work_event: Option> = if let Some(item) = - chain_segment_queue.pop() - { - Some(item) - // Check sync blocks before gossip blocks, since we've already explicitly - // requested these blocks. 
- } else if let Some(item) = rpc_block_queue.pop() { - Some(item) - } else if let Some(item) = rpc_blob_queue.pop() { - Some(item) - } else if let Some(item) = rpc_custody_column_queue.pop() { - Some(item) - // TODO(das): decide proper prioritization for sampling columns - } else if let Some(item) = rpc_custody_column_queue.pop() { - Some(item) - } else if let Some(item) = rpc_verify_data_column_queue.pop() { - Some(item) - } else if let Some(item) = sampling_result_queue.pop() { - Some(item) - // Check delayed blocks before gossip blocks, the gossip blocks might rely - // on the delayed ones. - } else if let Some(item) = delayed_block_queue.pop() { - Some(item) - // Check gossip blocks before gossip attestations, since a block might be - // required to verify some attestations. - } else if let Some(item) = gossip_block_queue.pop() { - Some(item) - } else if let Some(item) = gossip_blob_queue.pop() { - Some(item) - } else if let Some(item) = gossip_data_column_queue.pop() { - Some(item) - // Check the priority 0 API requests after blocks and blobs, but before attestations. - } else if let Some(item) = api_request_p0_queue.pop() { - Some(item) - // Check the aggregates, *then* the unaggregates since we assume that - // aggregates are more valuable to local validators and effectively give us - // more information with less signature verification time. - } else if aggregate_queue.len() > 0 { - let batch_size = cmp::min( - aggregate_queue.len(), - self.config.max_gossip_aggregate_batch_size, - ); - - if batch_size < 2 { - // One single aggregate is in the queue, process it individually. - aggregate_queue.pop() - } else { - // Collect two or more aggregates into a batch, so they can take - // advantage of batch signature verification. - // - // Note: this will convert the `Work::GossipAggregate` item into a - // `Work::GossipAggregateBatch` item. 
- let mut aggregates = Vec::with_capacity(batch_size); - let mut process_batch_opt = None; - for _ in 0..batch_size { - if let Some(item) = aggregate_queue.pop() { - match item { - Work::GossipAggregate { - aggregate, - process_individual: _, - process_batch, - } => { - aggregates.push(*aggregate); - if process_batch_opt.is_none() { - process_batch_opt = Some(process_batch); - } - } - _ => { - error!(self.log, "Invalid item in aggregate queue"); - } - } - } - } - - if let Some(process_batch) = process_batch_opt { - // Process all aggregates with a single worker. - Some(Work::GossipAggregateBatch { - aggregates, - process_batch, - }) - } else { - // There is no good reason for this to - // happen, it is a serious logic error. - // Since we only form batches when multiple - // work items exist, we should always have a - // work closure at this point. - crit!(self.log, "Missing aggregate work"); - None - } - } - // Check the unaggregated attestation queue. - // - // Potentially use batching. - } else if attestation_queue.len() > 0 { - let batch_size = cmp::min( - attestation_queue.len(), - self.config.max_gossip_attestation_batch_size, - ); - - if batch_size < 2 { - // One single attestation is in the queue, process it individually. - attestation_queue.pop() - } else { - // Collect two or more attestations into a batch, so they can take - // advantage of batch signature verification. - // - // Note: this will convert the `Work::GossipAttestation` item into a - // `Work::GossipAttestationBatch` item. 
- let mut attestations = Vec::with_capacity(batch_size); - let mut process_batch_opt = None; - for _ in 0..batch_size { - if let Some(item) = attestation_queue.pop() { - match item { - Work::GossipAttestation { - attestation, - process_individual: _, - process_batch, - } => { - attestations.push(*attestation); - if process_batch_opt.is_none() { - process_batch_opt = Some(process_batch); - } - } - _ => error!( - self.log, - "Invalid item in attestation queue" - ), - } - } - } - - if let Some(process_batch) = process_batch_opt { - // Process all attestations with a single worker. - Some(Work::GossipAttestationBatch { - attestations, - process_batch, - }) - } else { - // There is no good reason for this to - // happen, it is a serious logic error. - // Since we only form batches when multiple - // work items exist, we should always have a - // work closure at this point. - crit!(self.log, "Missing attestations work"); - None - } - } - // Check sync committee messages after attestations as their rewards are lesser - // and they don't influence fork choice. - } else if let Some(item) = sync_contribution_queue.pop() { - Some(item) - } else if let Some(item) = sync_message_queue.pop() { - Some(item) - // Aggregates and unaggregates queued for re-processing are older and we - // care about fresher ones, so check those first. - } else if let Some(item) = unknown_block_aggregate_queue.pop() { - Some(item) - } else if let Some(item) = unknown_block_attestation_queue.pop() { - Some(item) - // Check RPC methods next. 
Status messages are needed for sync so - // prioritize them over syncing requests from other peers (BlocksByRange - // and BlocksByRoot) - } else if let Some(item) = status_queue.pop() { - Some(item) - } else if let Some(item) = bbrange_queue.pop() { - Some(item) - } else if let Some(item) = bbroots_queue.pop() { - Some(item) - } else if let Some(item) = blbrange_queue.pop() { - Some(item) - } else if let Some(item) = blbroots_queue.pop() { - Some(item) - } else if let Some(item) = dcbroots_queue.pop() { - Some(item) - } else if let Some(item) = dcbrange_queue.pop() { - Some(item) - // Prioritize sampling requests after block syncing requests - } else if let Some(item) = unknown_block_sampling_request_queue.pop() { - Some(item) - // Check slashings after all other consensus messages so we prioritize - // following head. - // - // Check attester slashings before proposer slashings since they have the - // potential to slash multiple validators at once. - } else if let Some(item) = gossip_attester_slashing_queue.pop() { - Some(item) - } else if let Some(item) = gossip_proposer_slashing_queue.pop() { - Some(item) - // Check exits and address changes late since our validators don't get - // rewards from them. - } else if let Some(item) = gossip_voluntary_exit_queue.pop() { - Some(item) - } else if let Some(item) = gossip_bls_to_execution_change_queue.pop() { - Some(item) - // Check the priority 1 API requests after we've - // processed all the interesting things from the network - // and things required for us to stay in good repute - // with our P2P peers. - } else if let Some(item) = api_request_p1_queue.pop() { - Some(item) - // Handle backfill sync chain segments. - } else if let Some(item) = backfill_chain_segment.pop() { - Some(item) - // Handle light client requests. 
- } else if let Some(item) = lc_bootstrap_queue.pop() { - Some(item) - } else if let Some(item) = lc_optimistic_update_queue.pop() { - Some(item) - } else if let Some(item) = lc_finality_update_queue.pop() { - Some(item) - // This statement should always be the final else statement. - } else { - // Let the journal know that a worker is freed and there's nothing else - // for it to do. - if let Some(work_journal_tx) = &work_journal_tx { - // We don't care if this message was successfully sent, we only use the journal - // during testing. - let _ = work_journal_tx.try_send(NOTHING_TO_DO); - } - None - }; - - if let Some(work_event) = work_event { - let work_type = work_event.to_type(); - self.spawn_worker(work_event, idle_tx); - Some(work_type) - } else { - None - } - } - // There is no new work event and we are unable to spawn a new worker. - // - // I cannot see any good reason why this would happen. - None => { - warn!( - self.log, - "Unexpected gossip processor condition"; - "msg" => "no new work and cannot spawn worker" - ); - None - } - // The chain is syncing and this event should be dropped during sync. - Some(work_event) - if self.network_globals.sync_state.read().is_syncing() - && drop_during_sync => - { - let work_id = work_event.work.str_id(); - metrics::inc_counter_vec( - &metrics::BEACON_PROCESSOR_WORK_EVENTS_IGNORED_COUNT, - &[work_id], - ); - trace!( - self.log, - "Gossip processor skipping work"; - "msg" => "chain is syncing", - "work_id" => work_id - ); - None - } - // There is a new work event and the chain is not syncing. Process it or queue - // it. - Some(WorkEvent { work, .. }) => { - let work_id = work.str_id(); - let work_type = work.to_type(); - - match work { - _ if can_spawn => self.spawn_worker(work, idle_tx), - Work::GossipAttestation { .. } => attestation_queue.push(work), - // Attestation batches are formed internally within the - // `BeaconProcessor`, they are not sent from external services. - Work::GossipAttestationBatch { .. 
} => crit!( - self.log, - "Unsupported inbound event"; - "type" => "GossipAttestationBatch" - ), - Work::GossipAggregate { .. } => aggregate_queue.push(work), - // Aggregate batches are formed internally within the `BeaconProcessor`, - // they are not sent from external services. - Work::GossipAggregateBatch { .. } => crit!( - self.log, - "Unsupported inbound event"; - "type" => "GossipAggregateBatch" - ), - Work::GossipBlock { .. } => { - gossip_block_queue.push(work, work_id, &self.log) - } - Work::GossipBlobSidecar { .. } => { - gossip_blob_queue.push(work, work_id, &self.log) - } - Work::GossipDataColumnSidecar { .. } => { - gossip_data_column_queue.push(work, work_id, &self.log) - } - Work::DelayedImportBlock { .. } => { - delayed_block_queue.push(work, work_id, &self.log) - } - Work::GossipVoluntaryExit { .. } => { - gossip_voluntary_exit_queue.push(work, work_id, &self.log) - } - Work::GossipProposerSlashing { .. } => { - gossip_proposer_slashing_queue.push(work, work_id, &self.log) - } - Work::GossipAttesterSlashing { .. } => { - gossip_attester_slashing_queue.push(work, work_id, &self.log) - } - Work::GossipSyncSignature { .. } => sync_message_queue.push(work), - Work::GossipSyncContribution { .. } => { - sync_contribution_queue.push(work) - } - Work::GossipLightClientFinalityUpdate { .. } => { - finality_update_queue.push(work, work_id, &self.log) - } - Work::GossipLightClientOptimisticUpdate { .. } => { - optimistic_update_queue.push(work, work_id, &self.log) - } - Work::RpcBlock { .. } | Work::IgnoredRpcBlock { .. } => { - rpc_block_queue.push(work, work_id, &self.log) - } - Work::RpcBlobs { .. } => rpc_blob_queue.push(work, work_id, &self.log), - Work::RpcCustodyColumn { .. 
} => { - rpc_custody_column_queue.push(work, work_id, &self.log) - } - Work::RpcVerifyDataColumn(_) => { - rpc_verify_data_column_queue.push(work, work_id, &self.log) - } - Work::SamplingResult(_) => { - sampling_result_queue.push(work, work_id, &self.log) - } - Work::ChainSegment { .. } => { - chain_segment_queue.push(work, work_id, &self.log) - } - Work::ChainSegmentBackfill { .. } => { - backfill_chain_segment.push(work, work_id, &self.log) - } - Work::Status { .. } => status_queue.push(work, work_id, &self.log), - Work::BlocksByRangeRequest { .. } => { - bbrange_queue.push(work, work_id, &self.log) - } - Work::BlocksByRootsRequest { .. } => { - bbroots_queue.push(work, work_id, &self.log) - } - Work::BlobsByRangeRequest { .. } => { - blbrange_queue.push(work, work_id, &self.log) - } - Work::LightClientBootstrapRequest { .. } => { - lc_bootstrap_queue.push(work, work_id, &self.log) - } - Work::LightClientOptimisticUpdateRequest { .. } => { - lc_optimistic_update_queue.push(work, work_id, &self.log) - } - Work::LightClientFinalityUpdateRequest { .. } => { - lc_finality_update_queue.push(work, work_id, &self.log) - } - Work::UnknownBlockAttestation { .. } => { - unknown_block_attestation_queue.push(work) - } - Work::UnknownBlockAggregate { .. } => { - unknown_block_aggregate_queue.push(work) - } - Work::GossipBlsToExecutionChange { .. } => { - gossip_bls_to_execution_change_queue.push(work, work_id, &self.log) - } - Work::BlobsByRootsRequest { .. } => { - blbroots_queue.push(work, work_id, &self.log) - } - Work::DataColumnsByRootsRequest { .. } => { - dcbroots_queue.push(work, work_id, &self.log) - } - Work::DataColumnsByRangeRequest { .. } => { - dcbrange_queue.push(work, work_id, &self.log) - } - Work::UnknownLightClientOptimisticUpdate { .. } => { - unknown_light_client_update_queue.push(work, work_id, &self.log) - } - Work::UnknownBlockSamplingRequest { .. 
} => { - unknown_block_sampling_request_queue.push(work, work_id, &self.log) - } - Work::ApiRequestP0 { .. } => { - api_request_p0_queue.push(work, work_id, &self.log) - } - Work::ApiRequestP1 { .. } => { - api_request_p1_queue.push(work, work_id, &self.log) - } - }; - Some(work_type) - } - }; - - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_WORKERS_ACTIVE_TOTAL, - self.current_workers as i64, - ); - - if let Some(modified_queue_id) = modified_queue_id { - let queue_len = match modified_queue_id { - WorkType::GossipAttestation => aggregate_queue.len(), - WorkType::UnknownBlockAttestation => unknown_block_attestation_queue.len(), - WorkType::GossipAttestationBatch => 0, // No queue - WorkType::GossipAggregate => aggregate_queue.len(), - WorkType::UnknownBlockAggregate => unknown_block_aggregate_queue.len(), - WorkType::UnknownLightClientOptimisticUpdate => { - unknown_light_client_update_queue.len() - } - WorkType::UnknownBlockSamplingRequest => { - unknown_block_sampling_request_queue.len() - } - WorkType::GossipAggregateBatch => 0, // No queue - WorkType::GossipBlock => gossip_block_queue.len(), - WorkType::GossipBlobSidecar => gossip_blob_queue.len(), - WorkType::GossipDataColumnSidecar => gossip_data_column_queue.len(), - WorkType::DelayedImportBlock => delayed_block_queue.len(), - WorkType::GossipVoluntaryExit => gossip_voluntary_exit_queue.len(), - WorkType::GossipProposerSlashing => gossip_proposer_slashing_queue.len(), - WorkType::GossipAttesterSlashing => gossip_attester_slashing_queue.len(), - WorkType::GossipSyncSignature => sync_message_queue.len(), - WorkType::GossipSyncContribution => sync_contribution_queue.len(), - WorkType::GossipLightClientFinalityUpdate => finality_update_queue.len(), - WorkType::GossipLightClientOptimisticUpdate => { - optimistic_update_queue.len() - } - WorkType::RpcBlock => rpc_block_queue.len(), - WorkType::RpcBlobs | WorkType::IgnoredRpcBlock => rpc_blob_queue.len(), - WorkType::RpcCustodyColumn => 
rpc_custody_column_queue.len(), - WorkType::RpcVerifyDataColumn => rpc_verify_data_column_queue.len(), - WorkType::SamplingResult => sampling_result_queue.len(), - WorkType::ChainSegment => chain_segment_queue.len(), - WorkType::ChainSegmentBackfill => backfill_chain_segment.len(), - WorkType::Status => status_queue.len(), - WorkType::BlocksByRangeRequest => blbrange_queue.len(), - WorkType::BlocksByRootsRequest => blbroots_queue.len(), - WorkType::BlobsByRangeRequest => bbrange_queue.len(), - WorkType::BlobsByRootsRequest => bbroots_queue.len(), - WorkType::DataColumnsByRootsRequest => dcbroots_queue.len(), - WorkType::DataColumnsByRangeRequest => dcbrange_queue.len(), - WorkType::GossipBlsToExecutionChange => { - gossip_bls_to_execution_change_queue.len() - } - WorkType::LightClientBootstrapRequest => lc_bootstrap_queue.len(), - WorkType::LightClientOptimisticUpdateRequest => { - lc_optimistic_update_queue.len() - } - WorkType::LightClientFinalityUpdateRequest => { - lc_finality_update_queue.len() - } - WorkType::ApiRequestP0 => api_request_p0_queue.len(), - WorkType::ApiRequestP1 => api_request_p1_queue.len(), - }; - metrics::observe_vec( - &metrics::BEACON_PROCESSOR_QUEUE_LENGTH, - &[modified_queue_id.into()], - queue_len as f64, - ); - } - - if aggregate_queue.is_full() && aggregate_debounce.elapsed() { - error!( - self.log, - "Aggregate attestation queue full"; - "msg" => "the system has insufficient resources for load", - "queue_len" => aggregate_queue.max_length, - ) - } - - if attestation_queue.is_full() && attestation_debounce.elapsed() { - error!( - self.log, - "Attestation queue full"; - "msg" => "the system has insufficient resources for load", - "queue_len" => attestation_queue.max_length, - ) - } - } - }; - - // Spawn on the core executor. - executor.spawn(manager_future, MANAGER_TASK_NAME); - Ok(()) - } - - // TODO(beacon-processor) should we move spawn_worker outside of self? - /// Spawns a blocking worker thread to process some `Work`. 
- /// - /// Sends an message on `idle_tx` when the work is complete and the task is stopping. - fn spawn_worker(&mut self, work: Work, idle_tx: mpsc::Sender<()>) { - let work_id = work.str_id(); - let worker_timer = - metrics::start_timer_vec(&metrics::BEACON_PROCESSOR_WORKER_TIME, &[work_id]); - metrics::inc_counter(&metrics::BEACON_PROCESSOR_WORKERS_SPAWNED_TOTAL); - metrics::inc_counter_vec( - &metrics::BEACON_PROCESSOR_WORK_EVENTS_STARTED_COUNT, - &[work.str_id()], - ); - - // Wrap the `idle_tx` in a struct that will fire the idle message whenever it is dropped. - // - // This helps ensure that the worker is always freed in the case of an early exit or panic. - // As such, this instantiation should happen as early in the function as possible. - let send_idle_on_drop = SendOnDrop { - tx: idle_tx, - _worker_timer: worker_timer, - log: self.log.clone(), - }; - - let worker_id = self.current_workers; - self.current_workers = self.current_workers.saturating_add(1); - - let executor = self.executor.clone(); - - trace!( - self.log, - "Spawning beacon processor worker"; - "work" => work_id, - "worker" => worker_id, - ); - - let task_spawner = TaskSpawner { - executor, - send_idle_on_drop, - }; - - match work { - Work::GossipAttestation { - attestation, - process_individual, - process_batch: _, - } => task_spawner.spawn_blocking(move || { - process_individual(*attestation); - }), - Work::GossipAttestationBatch { - attestations, - process_batch, - } => task_spawner.spawn_blocking(move || { - process_batch(attestations); - }), - Work::GossipAggregate { - aggregate, - process_individual, - process_batch: _, - } => task_spawner.spawn_blocking(move || { - process_individual(*aggregate); - }), - Work::GossipAggregateBatch { - aggregates, - process_batch, - } => task_spawner.spawn_blocking(move || { - process_batch(aggregates); - }), - Work::ChainSegment(process_fn) => task_spawner.spawn_async(async move { - process_fn.await; - }), - Work::UnknownBlockAttestation { process_fn 
} - | Work::UnknownBlockAggregate { process_fn } - | Work::UnknownLightClientOptimisticUpdate { process_fn, .. } - | Work::UnknownBlockSamplingRequest { process_fn } => { - task_spawner.spawn_blocking(process_fn) - } - Work::DelayedImportBlock { - beacon_block_slot: _, - beacon_block_root: _, - process_fn, - } => task_spawner.spawn_async(process_fn), - Work::RpcBlock { process_fn } - | Work::RpcBlobs { process_fn } - | Work::RpcCustodyColumn(process_fn) - | Work::RpcVerifyDataColumn(process_fn) - | Work::SamplingResult(process_fn) => task_spawner.spawn_async(process_fn), - Work::IgnoredRpcBlock { process_fn } => task_spawner.spawn_blocking(process_fn), - Work::GossipBlock(work) - | Work::GossipBlobSidecar(work) - | Work::GossipDataColumnSidecar(work) => task_spawner.spawn_async(async move { - work.await; - }), - Work::BlobsByRangeRequest(process_fn) - | Work::BlobsByRootsRequest(process_fn) - | Work::DataColumnsByRootsRequest(process_fn) - | Work::DataColumnsByRangeRequest(process_fn) => { - task_spawner.spawn_blocking(process_fn) - } - Work::BlocksByRangeRequest(work) | Work::BlocksByRootsRequest(work) => { - task_spawner.spawn_async(work) - } - Work::ChainSegmentBackfill(process_fn) => task_spawner.spawn_async(process_fn), - Work::ApiRequestP0(process_fn) | Work::ApiRequestP1(process_fn) => match process_fn { - BlockingOrAsync::Blocking(process_fn) => task_spawner.spawn_blocking(process_fn), - BlockingOrAsync::Async(process_fn) => task_spawner.spawn_async(process_fn), - }, - Work::GossipVoluntaryExit(process_fn) - | Work::GossipProposerSlashing(process_fn) - | Work::GossipAttesterSlashing(process_fn) - | Work::GossipSyncSignature(process_fn) - | Work::GossipSyncContribution(process_fn) - | Work::GossipLightClientFinalityUpdate(process_fn) - | Work::GossipLightClientOptimisticUpdate(process_fn) - | Work::Status(process_fn) - | Work::GossipBlsToExecutionChange(process_fn) - | Work::LightClientBootstrapRequest(process_fn) - | 
Work::LightClientOptimisticUpdateRequest(process_fn) - | Work::LightClientFinalityUpdateRequest(process_fn) => { - task_spawner.spawn_blocking(process_fn) - } - Work::Reprocess(process_fn) - }; + let scheduler = SchedulerType::::new(self, beacon_state, event_rx); + scheduler.run(work_journal_tx, slot_clock, maximum_gossip_clock_disparity) } } diff --git a/beacon_node/beacon_processor/src/scheduler/interface.rs b/beacon_node/beacon_processor/src/scheduler/interface.rs index ffe3efee9f4..59b8567d9ac 100644 --- a/beacon_node/beacon_processor/src/scheduler/interface.rs +++ b/beacon_node/beacon_processor/src/scheduler/interface.rs @@ -2,20 +2,21 @@ use std::time::Duration; use slot_clock::SlotClock; use tokio::sync::mpsc; -use types::EthSpec; +use types::{BeaconState, EthSpec}; -use crate::WorkEvent; +use crate::{BeaconProcessor, WorkEvent}; use super::priority_scheduler; pub trait Scheduler { + fn new(beacon_processor: BeaconProcessor, beacon_state: &BeaconState, event_rx: mpsc::Receiver>) -> Self; + fn run( - &self, - event_rx: mpsc::Receiver>, + self, work_journal_tx: Option>, slot_clock: S, maximum_gossip_clock_disparity: Duration, - ); + ) -> Result<(), String>; } pub enum SchedulerType { @@ -23,14 +24,24 @@ pub enum SchedulerType { } impl Scheduler for SchedulerType { + fn new(beacon_processor: BeaconProcessor, beacon_state: &BeaconState, event_rx: mpsc::Receiver>) -> Self { + SchedulerType::PriorityScheduler(priority_scheduler::Scheduler::new( + beacon_processor, + todo!(), + todo!(), + )) + } // TODO(beacon-processor) make this config driven fn run( - &self, - event_rx: mpsc::Receiver>, + self, work_journal_tx: Option>, slot_clock: S, maximum_gossip_clock_disparity: Duration, - ) { - todo!() + ) -> Result<(), String> { + match self { + SchedulerType::PriorityScheduler(scheduler) => { + scheduler.run(work_journal_tx, slot_clock, maximum_gossip_clock_disparity) + } + } } } diff --git a/beacon_node/beacon_processor/src/scheduler/mod.rs 
b/beacon_node/beacon_processor/src/scheduler/mod.rs index 60cf66022a4..967dc882c19 100644 --- a/beacon_node/beacon_processor/src/scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/mod.rs @@ -1,2 +1,2 @@ -mod interface; +pub mod interface; mod priority_scheduler; diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler.rs deleted file mode 100644 index 9b6d90c5d7b..00000000000 --- a/beacon_node/beacon_processor/src/scheduler/priority_scheduler.rs +++ /dev/null @@ -1,867 +0,0 @@ -// The priority scheduler has three major facets -// 1. A priority ordering system -// 2. A backfill rate limiting feature -// 3. A retry queue - -use slog::error; -use slot_clock::SlotClock; -use std::{cmp, marker::PhantomData, sync::Arc, time::Duration}; - -use futures::StreamExt; -use lighthouse_metrics::HistogramTimer; -use logging::TimeLatch; -use slog::{crit, debug, trace, warn}; -use tokio::sync::mpsc::{self, Receiver, Sender}; -use types::EthSpec; - -use crate::{ - metrics, work_reprocessing_queue::{spawn_reprocess_scheduler, ReadyWork}, BeaconProcessor, BeaconProcessorQueueLengths, FifoQueue, InboundEvent, InboundEvents, LifoQueue, Work, WorkEvent, NOTHING_TO_DO, WORKER_FREED -}; - -// TODO(beacon-processor) this will be impl specific -pub struct WorkQueues { - pub aggregate_queue: LifoQueue>, - pub aggregate_debounce: TimeLatch, - pub attestation_queue: LifoQueue>, - pub attestation_debounce: TimeLatch, - pub unknown_block_aggregate_queue: LifoQueue>, - pub unknown_block_attestation_queue: LifoQueue>, - pub sync_message_queue: LifoQueue>, - pub sync_contribution_queue: LifoQueue>, - pub gossip_voluntary_exit_queue: FifoQueue>, - pub gossip_proposer_slashing_queue: FifoQueue>, - pub gossip_attester_slashing_queue: FifoQueue>, - pub finality_update_queue: FifoQueue>, - pub optimistic_update_queue: FifoQueue>, - pub unknown_light_client_update_queue: FifoQueue>, - pub 
unknown_block_sampling_request_queue: FifoQueue>, - pub rpc_block_queue: FifoQueue>, - pub rpc_blob_queue: FifoQueue>, - pub rpc_custody_column_queue: FifoQueue>, - pub rpc_verify_data_column_queue: FifoQueue>, - pub sampling_result_queue: FifoQueue>, - pub chain_segment_queue: FifoQueue>, - pub backfill_chain_segment: FifoQueue>, - pub gossip_block_queue: FifoQueue>, - pub gossip_blob_queue: FifoQueue>, - pub gossip_data_column_queue: FifoQueue>, - pub delayed_block_queue: FifoQueue>, - pub status_queue: FifoQueue>, - pub bbrange_queue: FifoQueue>, - pub bbroots_queue: FifoQueue>, - pub blbroots_queue: FifoQueue>, - pub blbrange_queue: FifoQueue>, - pub dcbroots_queue: FifoQueue>, - pub dcbrange_queue: FifoQueue>, - pub gossip_bls_to_execution_change_queue: FifoQueue>, - pub lc_bootstrap_queue: FifoQueue>, - pub lc_optimistic_update_queue: FifoQueue>, - pub lc_finality_update_queue: FifoQueue>, - pub api_request_p0_queue: FifoQueue>, - pub api_request_p1_queue: FifoQueue>, -} - -impl WorkQueues { - pub fn new(queue_lengths: BeaconProcessorQueueLengths) -> Self { - let aggregate_queue = LifoQueue::new(queue_lengths.aggregate_queue); - let aggregate_debounce = TimeLatch::default(); - let attestation_queue = LifoQueue::new(queue_lengths.attestation_queue); - let attestation_debounce = TimeLatch::default(); - let unknown_block_aggregate_queue = - LifoQueue::new(queue_lengths.unknown_block_aggregate_queue); - let unknown_block_attestation_queue = - LifoQueue::new(queue_lengths.unknown_block_attestation_queue); - - let sync_message_queue = LifoQueue::new(queue_lengths.sync_message_queue); - let sync_contribution_queue = LifoQueue::new(queue_lengths.sync_contribution_queue); - - // Using a FIFO queue for voluntary exits since it prevents exit censoring. I don't have - // a strong feeling about queue type for exits. 
- let gossip_voluntary_exit_queue = - FifoQueue::new(queue_lengths.gossip_voluntary_exit_queue); - - // Using a FIFO queue for slashing to prevent people from flushing their slashings from the - // queues with lots of junk messages. - let gossip_proposer_slashing_queue = - FifoQueue::new(queue_lengths.gossip_proposer_slashing_queue); - let gossip_attester_slashing_queue = - FifoQueue::new(queue_lengths.gossip_attester_slashing_queue); - - // Using a FIFO queue for light client updates to maintain sequence order. - let finality_update_queue = FifoQueue::new(queue_lengths.finality_update_queue); - let optimistic_update_queue = FifoQueue::new(queue_lengths.optimistic_update_queue); - let unknown_light_client_update_queue = - FifoQueue::new(queue_lengths.unknown_light_client_update_queue); - let unknown_block_sampling_request_queue = - FifoQueue::new(queue_lengths.unknown_block_sampling_request_queue); - - // Using a FIFO queue since blocks need to be imported sequentially. - let rpc_block_queue = FifoQueue::new(queue_lengths.rpc_block_queue); - let rpc_blob_queue = FifoQueue::new(queue_lengths.rpc_blob_queue); - let rpc_custody_column_queue = FifoQueue::new(queue_lengths.rpc_custody_column_queue); - let rpc_verify_data_column_queue = - FifoQueue::new(queue_lengths.rpc_verify_data_column_queue); - let sampling_result_queue = FifoQueue::new(queue_lengths.sampling_result_queue); - let chain_segment_queue = FifoQueue::new(queue_lengths.chain_segment_queue); - let backfill_chain_segment = FifoQueue::new(queue_lengths.backfill_chain_segment); - let gossip_block_queue = FifoQueue::new(queue_lengths.gossip_block_queue); - let gossip_blob_queue = FifoQueue::new(queue_lengths.gossip_blob_queue); - let gossip_data_column_queue = FifoQueue::new(queue_lengths.gossip_data_column_queue); - let delayed_block_queue = FifoQueue::new(queue_lengths.delayed_block_queue); - - let status_queue = FifoQueue::new(queue_lengths.status_queue); - let bbrange_queue = 
FifoQueue::new(queue_lengths.bbrange_queue); - let bbroots_queue = FifoQueue::new(queue_lengths.bbroots_queue); - let blbroots_queue = FifoQueue::new(queue_lengths.blbroots_queue); - let blbrange_queue = FifoQueue::new(queue_lengths.blbrange_queue); - let dcbroots_queue = FifoQueue::new(queue_lengths.dcbroots_queue); - let dcbrange_queue = FifoQueue::new(queue_lengths.dcbrange_queue); - - let gossip_bls_to_execution_change_queue = - FifoQueue::new(queue_lengths.gossip_bls_to_execution_change_queue); - - let lc_bootstrap_queue = FifoQueue::new(queue_lengths.lc_bootstrap_queue); - let lc_optimistic_update_queue = - FifoQueue::new(queue_lengths.lc_optimistic_update_queue); - let lc_finality_update_queue = FifoQueue::new(queue_lengths.lc_finality_update_queue); - - let api_request_p0_queue = FifoQueue::new(queue_lengths.api_request_p0_queue); - let api_request_p1_queue = FifoQueue::new(queue_lengths.api_request_p1_queue); - - WorkQueues { - aggregate_queue, - aggregate_debounce, - attestation_queue, - attestation_debounce, - unknown_block_aggregate_queue, - unknown_block_attestation_queue, - sync_message_queue, - sync_contribution_queue, - gossip_voluntary_exit_queue, - gossip_proposer_slashing_queue, - gossip_attester_slashing_queue, - finality_update_queue, - optimistic_update_queue, - unknown_light_client_update_queue, - unknown_block_sampling_request_queue, - rpc_block_queue, - rpc_blob_queue, - rpc_custody_column_queue, - rpc_verify_data_column_queue, - sampling_result_queue, - chain_segment_queue, - backfill_chain_segment, - gossip_block_queue, - gossip_blob_queue, - gossip_data_column_queue, - delayed_block_queue, - status_queue, - bbrange_queue, - bbroots_queue, - blbroots_queue, - blbrange_queue, - dcbroots_queue, - dcbrange_queue, - gossip_bls_to_execution_change_queue, - lc_bootstrap_queue, - lc_optimistic_update_queue, - lc_finality_update_queue, - api_request_p0_queue, - api_request_p1_queue, - } - } -} - -// Backend trait inits a channel, a run function 
-// A channel trait has send_work, reprocess_work etc. - -pub struct Scheduler { - beacon_processor: BeaconProcessor, - enable_backfill_rate_limiting: bool, - current_workers: usize, - idle_tx: Sender<()>, - idle_rx: Receiver<()>, - work_reprocessing_tx: Sender<()>, - work_reprocessing_rx: Receiver<()>, - work_queues: WorkQueues, - phantom_data: PhantomData -} - -impl Scheduler { - - fn new() -> Self { - // let (work_reprocessing_tx, work_reprocessing_rx) = - // mpsc::channel(config.max_scheduled_work_queue_len); - todo!() - } - - pub async fn process_work_event(&self) {} - - - async fn run( - mut self, - mut inbound_events: InboundEvents, - work_journal_tx: Option>, - slot_clock: S, - maximum_gossip_clock_disparity: Duration, - ) -> Result<(), String> { - // Channels for sending work to the re-process scheduler (`work_reprocessing_tx`) and to - // receive them back once they are ready (`ready_work_rx`). - let (ready_work_tx, ready_work_rx) = - mpsc::channel::(self.beacon_processor.config.max_scheduled_work_queue_len); - // TODO(beacon-processor) reprocess scheduler - spawn_reprocess_scheduler( - ready_work_tx, - self.work_reprocessing_rx, - &self.beacon_processor.executor, - Arc::new(slot_clock), - self.beacon_processor.log.clone(), - maximum_gossip_clock_disparity, - )?; - - let work_event = match inbound_events.next().await { - Some(InboundEvent::WorkerIdle) => { - // TODO(beacon-processor) move current_workers from beacon_processor to self - self.current_workers = self.current_workers.saturating_sub(1); - None - } - Some(InboundEvent::WorkEvent(event)) if self.enable_backfill_rate_limiting => { - // TODO(beacon-processor) is backfill rate limiting going to be the same across all schedulers? 
- todo!() - } - Some(InboundEvent::WorkEvent(event)) | Some(InboundEvent::ReprocessingWork(event)) => { - Some(event) - } - None => { - debug!( - self.beacon_processor.log, - "Gossip processor stopped"; - "msg" => "stream ended" - ); - // TODO(beacon-processor) this should terminate the whole process - todo!() - } - }; - - let _event_timer = self.increment_metrics(&work_event); - self.worker_journal(&work_event, &work_journal_tx); - - let can_spawn = self.current_workers < self.beacon_processor.config.max_workers; - let drop_during_sync = work_event - .as_ref() - .map_or(false, |event| event.drop_during_sync); - - match work_event { - // There is no new work event, but we are able to spawn a new worker. - // - // We don't check the `work.drop_during_sync` here. We assume that if it made - // it into the queue at any point then we should process it. - None if can_spawn => { - // TODO(beacon-processor) implement the normal priority scheduler here - // also note that these match arms will look similar across all scheduler variants - // so maybe we can pull this function out and get creative with closure usage - self.priority_scheduler(&work_journal_tx); - todo!() - } - // There is no new work event and we are unable to spawn a new worker. - // - // I cannot see any good reason why this would happen. - None => { - warn!( - self.beacon_processor.log, - "Unexpected gossip processor condition"; - "msg" => "no new work and cannot spawn worker" - ); - todo!() - } - // The chain is syncing and this event should be dropped during sync. 
- Some(work_event) - if self - .beacon_processor - .network_globals - .sync_state - .read() - .is_syncing() - && drop_during_sync => - { - let work_id = work_event.work.str_id(); - metrics::inc_counter_vec( - &metrics::BEACON_PROCESSOR_WORK_EVENTS_IGNORED_COUNT, - &[work_id], - ); - trace!( - self.beacon_processor.log, - "Gossip processor skipping work"; - "msg" => "chain is syncing", - "work_id" => work_id - ); - todo!() - } - - // There is a new work event and the chain is not syncing. Process it or queue - // it. - Some(WorkEvent { work, .. }) => { - self.process_or_queue_work_event(work, can_spawn); - todo!() - } - } - } - - fn priority_scheduler(&mut self, work_journal_tx: &Option>) { - let idle_tx = self.idle_tx.clone(); - // Check for chain segments first, they're the most efficient way to get - // blocks into the system. - if let Some(item) = self.work_queues.chain_segment_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check sync blocks before gossip blocks, since we've already explicitly - // requested these blocks. 
- } else if let Some(item) = self.work_queues.rpc_block_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.rpc_blob_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.rpc_custody_column_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // TODO(das): decide proper prioritization for sampling columns - } else if let Some(item) = self.work_queues.rpc_custody_column_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.rpc_verify_data_column_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.sampling_result_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check delayed blocks before gossip blocks, the gossip blocks might rely - // on the delayed ones. - } else if let Some(item) = self.work_queues.delayed_block_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check gossip blocks before gossip attestations, since a block might be - // required to verify some attestations. - } else if let Some(item) = self.work_queues.gossip_block_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.gossip_blob_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.gossip_data_column_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check the priority 0 API requests after blocks and blobs, but before attestations. - } else if let Some(item) = self.work_queues.api_request_p0_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check the aggregates, *then* the unaggregates since we assume that - // aggregates are more valuable to local validators and effectively give us - // more information with less signature verification time. 
- } else if self.work_queues.aggregate_queue.len() > 0 { - let batch_size = cmp::min( - self.work_queues.aggregate_queue.len(), - self.beacon_processor.config.max_gossip_aggregate_batch_size, - ); - - if batch_size < 2 { - // One single aggregate is in the queue, process it individually. - if let Some(item) = self.work_queues.aggregate_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } - } else { - // Collect two or more aggregates into a batch, so they can take - // advantage of batch signature verification. - // - // Note: this will convert the `Work::GossipAggregate` item into a - // `Work::GossipAggregateBatch` item. - let mut aggregates = Vec::with_capacity(batch_size); - let mut process_batch_opt = None; - for _ in 0..batch_size { - if let Some(item) = self.work_queues.aggregate_queue.pop() { - match item { - Work::GossipAggregate { - aggregate, - process_individual: _, - process_batch, - } => { - aggregates.push(*aggregate); - if process_batch_opt.is_none() { - process_batch_opt = Some(process_batch); - } - } - _ => { - error!( - self.beacon_processor.log, - "Invalid item in aggregate queue" - ); - } - } - } - } - - if let Some(process_batch) = process_batch_opt { - // Process all aggregates with a single worker. - self.beacon_processor.spawn_worker( - Work::GossipAggregateBatch { - aggregates, - process_batch, - }, - idle_tx, - ) - } else { - // There is no good reason for this to - // happen, it is a serious logic error. - // Since we only form batches when multiple - // work items exist, we should always have a - // work closure at this point. - crit!(self.beacon_processor.log, "Missing aggregate work"); - } - } - // Check the unaggregated attestation queue. - // - // Potentially use batching. 
- } else if self.work_queues.attestation_queue.len() > 0 { - let batch_size = cmp::min( - self.work_queues.attestation_queue.len(), - self.beacon_processor - .config - .max_gossip_attestation_batch_size, - ); - - if batch_size < 2 { - // One single attestation is in the queue, process it individually. - if let Some(item) = self.work_queues.attestation_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } - } else { - // Collect two or more attestations into a batch, so they can take - // advantage of batch signature verification. - // - // Note: this will convert the `Work::GossipAttestation` item into a - // `Work::GossipAttestationBatch` item. - let mut attestations = Vec::with_capacity(batch_size); - let mut process_batch_opt = None; - for _ in 0..batch_size { - if let Some(item) = self.work_queues.attestation_queue.pop() { - match item { - Work::GossipAttestation { - attestation, - process_individual: _, - process_batch, - } => { - attestations.push(*attestation); - if process_batch_opt.is_none() { - process_batch_opt = Some(process_batch); - } - } - _ => error!( - self.beacon_processor.log, - "Invalid item in attestation queue" - ), - } - } - } - - if let Some(process_batch) = process_batch_opt { - // Process all attestations with a single worker. - self.beacon_processor.spawn_worker( - Work::GossipAttestationBatch { - attestations, - process_batch, - }, - idle_tx, - ) - } else { - // There is no good reason for this to - // happen, it is a serious logic error. - // Since we only form batches when multiple - // work items exist, we should always have a - // work closure at this point. - crit!(self.beacon_processor.log, "Missing attestations work"); - } - } - // Check sync committee messages after attestations as their rewards are lesser - // and they don't influence fork choice. 
- } else if let Some(item) = self.work_queues.sync_contribution_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.sync_message_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Aggregates and unaggregates queued for re-processing are older and we - // care about fresher ones, so check those first. - } else if let Some(item) = self.work_queues.unknown_block_aggregate_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.unknown_block_attestation_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check RPC methods next. Status messages are needed for sync so - // prioritize them over syncing requests from other peers (BlocksByRange - // and BlocksByRoot) - } else if let Some(item) = self.work_queues.status_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.bbrange_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.bbroots_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.blbrange_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.blbroots_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.dcbroots_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.dcbrange_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Prioritize sampling requests after block syncing requests - } else if let Some(item) = self.work_queues.unknown_block_sampling_request_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check slashings after all other consensus messages so we prioritize - // following head. 
- // - // Check attester slashings before proposer slashings since they have the - // potential to slash multiple validators at once. - } else if let Some(item) = self.work_queues.gossip_attester_slashing_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.gossip_proposer_slashing_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check exits and address changes late since our validators don't get - // rewards from them. - } else if let Some(item) = self.work_queues.gossip_voluntary_exit_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.gossip_bls_to_execution_change_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Check the priority 1 API requests after we've - // processed all the interesting things from the network - // and things required for us to stay in good repute - // with our P2P peers. - } else if let Some(item) = self.work_queues.api_request_p1_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Handle backfill sync chain segments. - } else if let Some(item) = self.work_queues.backfill_chain_segment.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // Handle light client requests. - } else if let Some(item) = self.work_queues.lc_bootstrap_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.lc_optimistic_update_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - } else if let Some(item) = self.work_queues.lc_finality_update_queue.pop() { - self.beacon_processor.spawn_worker(item, idle_tx); - // This statement should always be the final else statement. - } else { - // Let the journal know that a worker is freed and there's nothing else - // for it to do. 
- if let Some(work_journal_tx) = work_journal_tx { - // We don't care if this message was successfully sent, we only use the journal - // during testing. - let _ = work_journal_tx.try_send(NOTHING_TO_DO); - } - } - } - - // TODO(beacon-processor) this might be able to be moved to a more generalized location - pub fn process_or_queue_work_event(&mut self, work: Work, can_spawn: bool) { - let work_id = work.str_id(); - - match work { - _ if can_spawn => self - .beacon_processor - .spawn_worker(work, self.idle_tx.clone()), - Work::GossipAttestation { .. } => self.work_queues.attestation_queue.push(work), - // Attestation batches are formed internally within the - // `BeaconProcessor`, they are not sent from external services. - Work::GossipAttestationBatch { .. } => crit!( - self.beacon_processor.log, - "Unsupported inbound event"; - "type" => "GossipAttestationBatch" - ), - Work::GossipAggregate { .. } => self.work_queues.aggregate_queue.push(work), - // Aggregate batches are formed internally within the `BeaconProcessor`, - // they are not sent from external services. - Work::GossipAggregateBatch { .. } => crit!( - self.beacon_processor.log, - "Unsupported inbound event"; - "type" => "GossipAggregateBatch" - ), - Work::GossipBlock { .. } => { - self.work_queues - .gossip_block_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::GossipBlobSidecar { .. } => { - self.work_queues - .gossip_blob_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::GossipDataColumnSidecar { .. } => self.work_queues.gossip_data_column_queue.push( - work, - work_id, - &self.beacon_processor.log, - ), - Work::DelayedImportBlock { .. } => { - self.work_queues - .delayed_block_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::GossipVoluntaryExit { .. } => self.work_queues.gossip_voluntary_exit_queue.push( - work, - work_id, - &self.beacon_processor.log, - ), - Work::GossipProposerSlashing { .. 
} => self - .work_queues - .gossip_proposer_slashing_queue - .push(work, work_id, &self.beacon_processor.log), - Work::GossipAttesterSlashing { .. } => self - .work_queues - .gossip_attester_slashing_queue - .push(work, work_id, &self.beacon_processor.log), - Work::GossipSyncSignature { .. } => self.work_queues.sync_message_queue.push(work), - Work::GossipSyncContribution { .. } => { - self.work_queues.sync_contribution_queue.push(work) - } - Work::GossipLightClientFinalityUpdate { .. } => self - .work_queues - .finality_update_queue - .push(work, work_id, &self.beacon_processor.log), - Work::GossipLightClientOptimisticUpdate { .. } => self - .work_queues - .optimistic_update_queue - .push(work, work_id, &self.beacon_processor.log), - Work::RpcBlock { .. } | Work::IgnoredRpcBlock { .. } => self - .work_queues - .rpc_block_queue - .push(work, work_id, &self.beacon_processor.log), - Work::RpcBlobs { .. } => { - self.work_queues - .rpc_blob_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::RpcCustodyColumn { .. } => self.work_queues.rpc_custody_column_queue.push( - work, - work_id, - &self.beacon_processor.log, - ), - Work::RpcVerifyDataColumn(_) => self.work_queues.rpc_verify_data_column_queue.push( - work, - work_id, - &self.beacon_processor.log, - ), - Work::SamplingResult(_) => self.work_queues.sampling_result_queue.push( - work, - work_id, - &self.beacon_processor.log, - ), - Work::ChainSegment { .. } => { - self.work_queues - .chain_segment_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::ChainSegmentBackfill { .. } => self.work_queues.backfill_chain_segment.push( - work, - work_id, - &self.beacon_processor.log, - ), - Work::Status { .. } => { - self.work_queues - .status_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::BlocksByRangeRequest { .. } => { - self.work_queues - .bbrange_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::BlocksByRootsRequest { .. 
} => { - self.work_queues - .bbroots_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::BlobsByRangeRequest { .. } => { - self.work_queues - .blbrange_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::LightClientBootstrapRequest { .. } => { - self.work_queues - .lc_bootstrap_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::LightClientOptimisticUpdateRequest { .. } => self - .work_queues - .lc_optimistic_update_queue - .push(work, work_id, &self.beacon_processor.log), - Work::LightClientFinalityUpdateRequest { .. } => self - .work_queues - .lc_finality_update_queue - .push(work, work_id, &self.beacon_processor.log), - Work::UnknownBlockAttestation { .. } => { - self.work_queues.unknown_block_attestation_queue.push(work) - } - Work::UnknownBlockAggregate { .. } => { - self.work_queues.unknown_block_aggregate_queue.push(work) - } - Work::GossipBlsToExecutionChange { .. } => self - .work_queues - .gossip_bls_to_execution_change_queue - .push(work, work_id, &self.beacon_processor.log), - Work::BlobsByRootsRequest { .. } => { - self.work_queues - .blbroots_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::DataColumnsByRootsRequest { .. } => { - self.work_queues - .dcbroots_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::DataColumnsByRangeRequest { .. } => { - self.work_queues - .dcbrange_queue - .push(work, work_id, &self.beacon_processor.log) - } - Work::UnknownLightClientOptimisticUpdate { .. } => self - .work_queues - .unknown_light_client_update_queue - .push(work, work_id, &self.beacon_processor.log), - Work::UnknownBlockSamplingRequest { .. } => self - .work_queues - .unknown_block_sampling_request_queue - .push(work, work_id, &self.beacon_processor.log), - Work::ApiRequestP0 { .. } => self.work_queues.api_request_p0_queue.push( - work, - work_id, - &self.beacon_processor.log, - ), - Work::ApiRequestP1 { .. 
} => self.work_queues.api_request_p1_queue.push( - work, - work_id, - &self.beacon_processor.log, - ), - } - - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_WORKERS_ACTIVE_TOTAL, - self.current_workers as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_UNAGGREGATED_ATTESTATION_QUEUE_TOTAL, - self.work_queues.attestation_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_AGGREGATED_ATTESTATION_QUEUE_TOTAL, - self.work_queues.aggregate_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_SYNC_MESSAGE_QUEUE_TOTAL, - self.work_queues.sync_message_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_SYNC_CONTRIBUTION_QUEUE_TOTAL, - self.work_queues.sync_contribution_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_GOSSIP_BLOCK_QUEUE_TOTAL, - self.work_queues.gossip_block_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_GOSSIP_BLOB_QUEUE_TOTAL, - self.work_queues.gossip_blob_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_GOSSIP_DATA_COLUMN_QUEUE_TOTAL, - self.work_queues.gossip_data_column_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_RPC_BLOCK_QUEUE_TOTAL, - self.work_queues.rpc_block_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_RPC_BLOB_QUEUE_TOTAL, - self.work_queues.rpc_blob_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_RPC_CUSTODY_COLUMN_QUEUE_TOTAL, - self.work_queues.rpc_custody_column_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_RPC_VERIFY_DATA_COLUMN_QUEUE_TOTAL, - self.work_queues.rpc_verify_data_column_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_SAMPLING_RESULT_QUEUE_TOTAL, - self.work_queues.sampling_result_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_CHAIN_SEGMENT_QUEUE_TOTAL, - 
self.work_queues.chain_segment_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_QUEUE_TOTAL, - self.work_queues.backfill_chain_segment.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_EXIT_QUEUE_TOTAL, - self.work_queues.gossip_voluntary_exit_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_PROPOSER_SLASHING_QUEUE_TOTAL, - self.work_queues.gossip_proposer_slashing_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_ATTESTER_SLASHING_QUEUE_TOTAL, - self.work_queues.gossip_attester_slashing_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_BLS_TO_EXECUTION_CHANGE_QUEUE_TOTAL, - self.work_queues.gossip_bls_to_execution_change_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_API_REQUEST_P0_QUEUE_TOTAL, - self.work_queues.api_request_p0_queue.len() as i64, - ); - metrics::set_gauge( - &metrics::BEACON_PROCESSOR_API_REQUEST_P1_QUEUE_TOTAL, - self.work_queues.api_request_p1_queue.len() as i64, - ); - - if self.work_queues.aggregate_queue.is_full() - && self.work_queues.aggregate_debounce.elapsed() - { - error!( - self.beacon_processor.log, - "Aggregate attestation queue full"; - "msg" => "the system has insufficient resources for load", - "queue_len" => self.work_queues.aggregate_queue.max_length, - ) - } - - if self.work_queues.attestation_queue.is_full() - && self.work_queues.attestation_debounce.elapsed() - { - error!( - self.beacon_processor.log, - "Attestation queue full"; - "msg" => "the system has insufficient resources for load", - "queue_len" => self.work_queues.attestation_queue.max_length, - ) - } - } - - // TODO(beacon-processor) this can live outside of this struct in a more general location - fn worker_journal( - &self, - work_event: &Option>, - work_journal_tx: &Option>, - ) { - if let Some(work_journal_tx) = work_journal_tx { - let id = work_event - .as_ref() - .map(|event| 
event.work.str_id()) - .unwrap_or(WORKER_FREED); - - // We don't care if this message was successfully sent, we only use the journal - // during testing. - let _ = work_journal_tx.try_send(id); - } - } - - // TODO(beacon-processor) this can live outside of this struct in a more general location - fn increment_metrics(&self, work_event: &Option>) -> Option { - let _event_timer = metrics::start_timer(&metrics::BEACON_PROCESSOR_EVENT_HANDLING_SECONDS); - if let Some(event) = work_event { - metrics::inc_counter_vec( - &metrics::BEACON_PROCESSOR_WORK_EVENTS_RX_COUNT, - &[event.work.str_id()], - ); - } else { - metrics::inc_counter(&metrics::BEACON_PROCESSOR_IDLE_EVENTS_TOTAL); - } - _event_timer - } -} diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs new file mode 100644 index 00000000000..070dc533837 --- /dev/null +++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs @@ -0,0 +1,1020 @@ +// The priority scheduler has three major facets +// 1. A priority ordering system +// 2. A backfill rate limiting feature +// 3. 
A retry queue + +mod work_queue; +mod work_reprocessing_queue; + +use futures::stream::{Stream, StreamExt}; +use futures::task::Poll; +use lighthouse_metrics::HistogramTimer; +use slog::error; +use slog::{crit, debug, trace, warn}; +use slot_clock::SlotClock; +use std::borrow::BorrowMut; +use std::pin::Pin; +use std::task::Context; +use std::{cmp, marker::PhantomData, sync::Arc, time::Duration}; +use tokio::sync::mpsc::{self, error::TrySendError, Receiver, Sender}; +use types::{BeaconState, ChainSpec, EthSpec}; +use work_queue::{BeaconProcessorQueueLengths, WorkQueues}; +use work_reprocessing_queue::{spawn_reprocess_scheduler, ReadyWork}; + +use crate::{ + metrics, BeaconProcessor, BeaconProcessorConfig, BlockingOrAsync, QueuedBackfillBatch, + ReprocessQueueMessage, SendOnDrop, TaskSpawner, Work, WorkEvent, WorkType, MAX_IDLE_QUEUE_LEN, + NOTHING_TO_DO, WORKER_FREED, +}; + +/// Unifies all the messages processed by the `BeaconProcessor`. +enum InboundEvent { + /// A worker has completed a task and is free. + WorkerIdle, + /// There is new work to be done. + WorkEvent(WorkEvent), + /// A work event that was queued for re-processing has become ready. + ReprocessingWork(WorkEvent), +} + +/// Combines the various incoming event streams for the `BeaconProcessor` into a single stream. +/// +/// This struct has a similar purpose to `tokio::select!`, however it allows for more fine-grained +/// control (specifically in the ordering of event processing). +struct InboundEvents { + /// Used by workers when they finish a task. + idle_rx: mpsc::Receiver<()>, + /// Used by upstream processes to send new work to the `BeaconProcessor`. + event_rx: mpsc::Receiver>, + /// Used internally for queuing work ready to be re-processed. + reprocess_work_rx: mpsc::Receiver, +} + +struct OutboundEvents { + /// Sends tasks to workers. + idle_tx: mpsc::Sender<()>, + /// Used internally for queuing work ready to be re-processed. 
+ reprocess_work_tx: mpsc::Sender, +} + +impl Stream for InboundEvents { + type Item = InboundEvent; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + // Always check for idle workers before anything else. This allows us to ensure that a big + // stream of new events doesn't suppress the processing of existing events. + match self.idle_rx.poll_recv(cx) { + Poll::Ready(Some(())) => { + return Poll::Ready(Some(InboundEvent::WorkerIdle)); + } + Poll::Ready(None) => { + return Poll::Ready(None); + } + Poll::Pending => {} + } + + // Poll for delayed blocks before polling for new work. It might be the case that a delayed + // block is required to successfully process some new work. + match self.reprocess_work_rx.poll_recv(cx) { + Poll::Ready(Some(ready_work)) => { + return Poll::Ready(Some(InboundEvent::ReprocessingWork(ready_work.into()))); + } + Poll::Ready(None) => { + return Poll::Ready(None); + } + Poll::Pending => {} + } + + match self.event_rx.poll_recv(cx) { + Poll::Ready(Some(event)) => { + return Poll::Ready(Some(InboundEvent::WorkEvent(event))); + } + Poll::Ready(None) => { + return Poll::Ready(None); + } + Poll::Pending => {} + } + + Poll::Pending + } +} + +/// The name of the manager tokio task. +const MANAGER_TASK_NAME: &str = "beacon_processor_manager"; + +/// The name of the worker tokio tasks. +const WORKER_TASK_NAME: &str = "beacon_processor_worker"; + +// TODO(beacon-processor) this will be impl specific + +// Backend trait inits a channel, a run function +// A channel trait has send_work, reprocess_work etc. +pub struct Scheduler { + beacon_processor: BeaconProcessor, + inbound_events: InboundEvents, + outbound_events: OutboundEvents, + work_queues: WorkQueues, + phantom_data: PhantomData, +} + +impl Scheduler { + pub fn new( + beacon_processor: BeaconProcessor, + beacon_state: &BeaconState, + event_rx: mpsc::Receiver>, + spec: Arc, + ) -> Result { + // Used by workers to communicate that they are finished a task. 
+ let (idle_tx, idle_rx) = mpsc::channel::<()>(MAX_IDLE_QUEUE_LEN); + + let queue_lengths = BeaconProcessorQueueLengths::from_state(beacon_state, spec)?; + + // Initialize the worker queues. + let work_queues: WorkQueues = WorkQueues::new(queue_lengths); + + // Channels for sending work to the re-process scheduler (`work_reprocessing_tx`) and to + // receive them back once they are ready (`ready_work_rx`). + let (ready_work_tx, ready_work_rx) = + mpsc::channel::(beacon_processor.config.max_scheduled_work_queue_len); + + let (work_reprocessing_tx, reprocess_work_rx) = + mpsc::channel::(beacon_processor.config.max_scheduled_work_queue_len); + + let inbound_events = InboundEvents { + idle_rx, + event_rx, + reprocess_work_rx: ready_work_rx, + }; + + let outbound_events = OutboundEvents { + idle_tx, + reprocess_work_tx: ready_work_tx + }; + + Self { + beacon_processor, + inbound_events, + outbound_events, + work_queues, + phantom_data: PhantomData + } + } + + pub fn run( + mut self, + work_journal_tx: Option>, + slot_clock: S, + maximum_gossip_clock_disparity: Duration, + ) -> Result<(), String> { + // Channels for sending work to the re-process scheduler (`work_reprocessing_tx`) and to + // receive them back once they are ready (`ready_work_rx`). 
+ let (ready_work_tx, ready_work_rx) = + mpsc::channel::(self.beacon_processor.config.max_scheduled_work_queue_len); + + let (work_reprocessing_tx, work_reprocessing_rx) = mpsc::channel::( + self.beacon_processor.config.max_scheduled_work_queue_len, + ); + + // TODO(beacon-processor) reprocess scheduler + spawn_reprocess_scheduler( + ready_work_tx, + work_reprocessing_rx, + &self.beacon_processor.executor, + Arc::new(slot_clock), + self.beacon_processor.log.clone(), + maximum_gossip_clock_disparity, + )?; + + let executor = self.beacon_processor.executor.clone(); + + let manager_future = async move { + let idle_tx = self.outbound_events.idle_tx.clone(); + loop { + let work_event = match self.inbound_events.next().await { + Some(InboundEvent::WorkerIdle) => { + self.beacon_processor.current_workers = self.beacon_processor.current_workers.saturating_sub(1); + None + } + Some(InboundEvent::WorkEvent(event)) + if self.beacon_processor.config.enable_backfill_rate_limiting => + { + match QueuedBackfillBatch::try_from(event) { + Ok(backfill_batch) => { + match work_reprocessing_tx + .try_send(ReprocessQueueMessage::BackfillSync(backfill_batch)) + { + Err(e) => { + warn!( + self.beacon_processor.log, + "Unable to queue backfill work event. Will try to process now."; + "error" => %e + ); + match e { + TrySendError::Full(reprocess_queue_message) + | TrySendError::Closed(reprocess_queue_message) => { + match reprocess_queue_message { + ReprocessQueueMessage::BackfillSync( + backfill_batch, + ) => Some(backfill_batch.into()), + other => { + crit!( + self.beacon_processor.log, + "Unexpected queue message type"; + "message_type" => other.as_ref() + ); + // This is an unhandled exception, drop the message. + continue; + } + } + } + } + } + Ok(..) => { + // backfill work sent to "reprocessing" queue. Process the next event. 
+ continue; + } + } + } + Err(event) => Some(event), + } + } + Some(InboundEvent::WorkEvent(event)) + | Some(InboundEvent::ReprocessingWork(event)) => Some(event), + None => { + debug!( + self.beacon_processor.log, + "Gossip processor stopped"; + "msg" => "stream ended" + ); + break; + } + }; + + let _event_timer = self.increment_metrics(&work_event); + self.worker_journal(&work_event, &work_journal_tx); + + let can_spawn = self.beacon_processor.current_workers < self.beacon_processor.config.max_workers; + let drop_during_sync = work_event + .as_ref() + .map_or(false, |event| event.drop_during_sync); + + let modified_queue_id = match work_event { + // There is no new work event, but we are able to spawn a new worker. + // We don't check the `work.drop_during_sync` here. We assume that if it made + // it into the queue at any point then we should process it. + None if can_spawn => { + let work_event = self.priority_scheduler(&work_journal_tx); + if let Some(work_event) = work_event { + let work_type = work_event.to_type(); + // TODO(beacon-processor) check self.idle_tx + self.spawn_worker(work_event); + Some(work_type) + } else { + None + } + } + // There is no new work event and we are unable to spawn a new worker. + // + // I cannot see any good reason why this would happen. + None => { + warn!( + self.beacon_processor.log, + "Unexpected gossip processor condition"; + "msg" => "no new work and cannot spawn worker" + ); + None + } + // The chain is syncing and this event should be dropped during sync. 
+ Some(work_event) + if self + .beacon_processor + .network_globals + .sync_state + .read() + .is_syncing() + && drop_during_sync => + { + let work_id = work_event.work.str_id(); + metrics::inc_counter_vec( + &metrics::BEACON_PROCESSOR_WORK_EVENTS_IGNORED_COUNT, + &[work_id], + ); + trace!( + self.beacon_processor.log, + "Gossip processor skipping work"; + "msg" => "chain is syncing", + "work_id" => work_id + ); + None + } + + // There is a new work event and the chain is not syncing. Process it or queue + // it. + Some(WorkEvent { work, .. }) => { + self.process_or_queue_work_event(work, can_spawn) + } + }; + + self.update_queue_metrics(modified_queue_id); + } + }; + + // Spawn on the core executor. + executor.spawn(manager_future, MANAGER_TASK_NAME); + + Ok(()) + } + + fn priority_scheduler( + &mut self, + work_journal_tx: &Option>, + ) -> Option> { + // Check for chain segments first, they're the most efficient way to get + // blocks into the system. + let work_event: Option> = + if let Some(item) = self.work_queues.chain_segment_queue.pop() { + Some(item) + // Check sync blocks before gossip blocks, since we've already explicitly + // requested these blocks. + } else if let Some(item) = self.work_queues.rpc_block_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.rpc_blob_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.rpc_custody_column_queue.pop() { + Some(item) + // TODO(das): decide proper prioritization for sampling columns + } else if let Some(item) = self.work_queues.rpc_custody_column_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.rpc_verify_data_column_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.sampling_result_queue.pop() { + Some(item) + // Check delayed blocks before gossip blocks, the gossip blocks might rely + // on the delayed ones. 
+ } else if let Some(item) = self.work_queues.delayed_block_queue.pop() { + Some(item) + // Check gossip blocks before gossip attestations, since a block might be + // required to verify some attestations. + } else if let Some(item) = self.work_queues.gossip_block_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.gossip_blob_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.gossip_data_column_queue.pop() { + Some(item) + // Check the priority 0 API requests after blocks and blobs, but before attestations. + } else if let Some(item) = self.work_queues.api_request_p0_queue.pop() { + Some(item) + // Check the aggregates, *then* the unaggregates since we assume that + // aggregates are more valuable to local validators and effectively give us + // more information with less signature verification time. + } else if self.work_queues.aggregate_queue.len() > 0 { + let batch_size = cmp::min( + self.work_queues.aggregate_queue.len(), + self.beacon_processor.config.max_gossip_aggregate_batch_size, + ); + + if batch_size < 2 { + // One single aggregate is in the queue, process it individually. + self.work_queues.aggregate_queue.pop() + } else { + // Collect two or more aggregates into a batch, so they can take + // advantage of batch signature verification. + // + // Note: this will convert the `Work::GossipAggregate` item into a + // `Work::GossipAggregateBatch` item. 
+ let mut aggregates = Vec::with_capacity(batch_size); + let mut process_batch_opt = None; + for _ in 0..batch_size { + if let Some(item) = self.work_queues.aggregate_queue.pop() { + match item { + Work::GossipAggregate { + aggregate, + process_individual: _, + process_batch, + } => { + aggregates.push(*aggregate); + if process_batch_opt.is_none() { + process_batch_opt = Some(process_batch); + } + } + _ => { + error!( + self.beacon_processor.log, + "Invalid item in aggregate queue" + ); + } + } + } + } + + if let Some(process_batch) = process_batch_opt { + // Process all aggregates with a single worker. + Some(Work::GossipAggregateBatch { + aggregates, + process_batch, + }) + } else { + // There is no good reason for this to + // happen, it is a serious logic error. + // Since we only form batches when multiple + // work items exist, we should always have a + // work closure at this point. + crit!(self.beacon_processor.log, "Missing aggregate work"); + None + } + } + // Check the unaggregated attestation queue. + // + // Potentially use batching. + } else if self.work_queues.attestation_queue.len() > 0 { + let batch_size = cmp::min( + self.work_queues.attestation_queue.len(), + self.beacon_processor + .config + .max_gossip_attestation_batch_size, + ); + + if batch_size < 2 { + // One single attestation is in the queue, process it individually. + self.work_queues.attestation_queue.pop() + } else { + // Collect two or more attestations into a batch, so they can take + // advantage of batch signature verification. + // + // Note: this will convert the `Work::GossipAttestation` item into a + // `Work::GossipAttestationBatch` item. 
+ let mut attestations = Vec::with_capacity(batch_size); + let mut process_batch_opt = None; + for _ in 0..batch_size { + if let Some(item) = self.work_queues.attestation_queue.pop() { + match item { + Work::GossipAttestation { + attestation, + process_individual: _, + process_batch, + } => { + attestations.push(*attestation); + if process_batch_opt.is_none() { + process_batch_opt = Some(process_batch); + } + } + _ => error!( + self.beacon_processor.log, + "Invalid item in attestation queue" + ), + } + } + } + + if let Some(process_batch) = process_batch_opt { + // Process all attestations with a single worker. + Some(Work::GossipAttestationBatch { + attestations, + process_batch, + }) + } else { + // There is no good reason for this to + // happen, it is a serious logic error. + // Since we only form batches when multiple + // work items exist, we should always have a + // work closure at this point. + crit!(self.beacon_processor.log, "Missing attestations work"); + None + } + } + // Check sync committee messages after attestations as their rewards are lesser + // and they don't influence fork choice. + } else if let Some(item) = self.work_queues.sync_contribution_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.sync_message_queue.pop() { + Some(item) + // Aggregates and unaggregates queued for re-processing are older and we + // care about fresher ones, so check those first. + } else if let Some(item) = self.work_queues.unknown_block_aggregate_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.unknown_block_attestation_queue.pop() { + Some(item) + // Check RPC methods next. 
Status messages are needed for sync so + // prioritize them over syncing requests from other peers (BlocksByRange + // and BlocksByRoot) + } else if let Some(item) = self.work_queues.status_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.bbrange_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.bbroots_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.blbrange_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.blbroots_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.dcbroots_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.dcbrange_queue.pop() { + Some(item) + // Prioritize sampling requests after block syncing requests + } else if let Some(item) = self.work_queues.unknown_block_sampling_request_queue.pop() { + Some(item) + // Check slashings after all other consensus messages so we prioritize + // following head. + // + // Check attester slashings before proposer slashings since they have the + // potential to slash multiple validators at once. + } else if let Some(item) = self.work_queues.gossip_attester_slashing_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.gossip_proposer_slashing_queue.pop() { + Some(item) + // Check exits and address changes late since our validators don't get + // rewards from them. + } else if let Some(item) = self.work_queues.gossip_voluntary_exit_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.gossip_bls_to_execution_change_queue.pop() { + Some(item) + // Check the priority 1 API requests after we've + // processed all the interesting things from the network + // and things required for us to stay in good repute + // with our P2P peers. + } else if let Some(item) = self.work_queues.api_request_p1_queue.pop() { + Some(item) + // Handle backfill sync chain segments. 
+ } else if let Some(item) = self.work_queues.backfill_chain_segment.pop() { + Some(item) + // Handle light client requests. + } else if let Some(item) = self.work_queues.lc_bootstrap_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.lc_optimistic_update_queue.pop() { + Some(item) + } else if let Some(item) = self.work_queues.lc_finality_update_queue.pop() { + Some(item) + // This statement should always be the final else statement. + } else { + // Let the journal know that a worker is freed and there's nothing else + // for it to do. + if let Some(work_journal_tx) = &work_journal_tx { + // We don't care if this message was successfully sent, we only use the journal + // during testing. + let _ = work_journal_tx.try_send(NOTHING_TO_DO); + } + None + }; + + work_event + } + + // TODO(beacon-processor) this might be able to be moved to a more generalized location + pub fn process_or_queue_work_event( + &mut self, + work: Work, + can_spawn: bool, + ) -> Option { + let work_id = work.str_id(); + + let work_type = work.to_type(); + + match work { + _ if can_spawn => self.spawn_worker(work), + Work::GossipAttestation { .. } => self.work_queues.attestation_queue.push(work), + // Attestation batches are formed internally within the + // `BeaconProcessor`, they are not sent from external services. + Work::GossipAttestationBatch { .. } => crit!( + self.beacon_processor.log, + "Unsupported inbound event"; + "type" => "GossipAttestationBatch" + ), + Work::GossipAggregate { .. } => self.work_queues.aggregate_queue.push(work), + // Aggregate batches are formed internally within the `BeaconProcessor`, + // they are not sent from external services. + Work::GossipAggregateBatch { .. } => crit!( + self.beacon_processor.log, + "Unsupported inbound event"; + "type" => "GossipAggregateBatch" + ), + Work::GossipBlock { .. } => { + self.work_queues + .gossip_block_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::GossipBlobSidecar { .. 
} => { + self.work_queues + .gossip_blob_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::GossipDataColumnSidecar { .. } => self.work_queues.gossip_data_column_queue.push( + work, + work_id, + &self.beacon_processor.log, + ), + Work::DelayedImportBlock { .. } => { + self.work_queues + .delayed_block_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::GossipVoluntaryExit { .. } => self.work_queues.gossip_voluntary_exit_queue.push( + work, + work_id, + &self.beacon_processor.log, + ), + Work::GossipProposerSlashing { .. } => self + .work_queues + .gossip_proposer_slashing_queue + .push(work, work_id, &self.beacon_processor.log), + Work::GossipAttesterSlashing { .. } => self + .work_queues + .gossip_attester_slashing_queue + .push(work, work_id, &self.beacon_processor.log), + Work::GossipSyncSignature { .. } => self.work_queues.sync_message_queue.push(work), + Work::GossipSyncContribution { .. } => { + self.work_queues.sync_contribution_queue.push(work) + } + Work::GossipLightClientFinalityUpdate { .. } => self + .work_queues + .finality_update_queue + .push(work, work_id, &self.beacon_processor.log), + Work::GossipLightClientOptimisticUpdate { .. } => self + .work_queues + .optimistic_update_queue + .push(work, work_id, &self.beacon_processor.log), + Work::RpcBlock { .. } | Work::IgnoredRpcBlock { .. } => self + .work_queues + .rpc_block_queue + .push(work, work_id, &self.beacon_processor.log), + Work::RpcBlobs { .. } => { + self.work_queues + .rpc_blob_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::RpcCustodyColumn { .. 
} => self.work_queues.rpc_custody_column_queue.push( + work, + work_id, + &self.beacon_processor.log, + ), + Work::RpcVerifyDataColumn(_) => self.work_queues.rpc_verify_data_column_queue.push( + work, + work_id, + &self.beacon_processor.log, + ), + Work::SamplingResult(_) => self.work_queues.sampling_result_queue.push( + work, + work_id, + &self.beacon_processor.log, + ), + Work::ChainSegment { .. } => { + self.work_queues + .chain_segment_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::ChainSegmentBackfill { .. } => self.work_queues.backfill_chain_segment.push( + work, + work_id, + &self.beacon_processor.log, + ), + Work::Status { .. } => { + self.work_queues + .status_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::BlocksByRangeRequest { .. } => { + self.work_queues + .bbrange_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::BlocksByRootsRequest { .. } => { + self.work_queues + .bbroots_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::BlobsByRangeRequest { .. } => { + self.work_queues + .blbrange_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::LightClientBootstrapRequest { .. } => { + self.work_queues + .lc_bootstrap_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::LightClientOptimisticUpdateRequest { .. } => self + .work_queues + .lc_optimistic_update_queue + .push(work, work_id, &self.beacon_processor.log), + Work::LightClientFinalityUpdateRequest { .. } => self + .work_queues + .lc_finality_update_queue + .push(work, work_id, &self.beacon_processor.log), + Work::UnknownBlockAttestation { .. } => { + self.work_queues.unknown_block_attestation_queue.push(work) + } + Work::UnknownBlockAggregate { .. } => { + self.work_queues.unknown_block_aggregate_queue.push(work) + } + Work::GossipBlsToExecutionChange { .. 
} => self + .work_queues + .gossip_bls_to_execution_change_queue + .push(work, work_id, &self.beacon_processor.log), + Work::BlobsByRootsRequest { .. } => { + self.work_queues + .blbroots_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::DataColumnsByRootsRequest { .. } => { + self.work_queues + .dcbroots_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::DataColumnsByRangeRequest { .. } => { + self.work_queues + .dcbrange_queue + .push(work, work_id, &self.beacon_processor.log) + } + Work::UnknownLightClientOptimisticUpdate { .. } => self + .work_queues + .unknown_light_client_update_queue + .push(work, work_id, &self.beacon_processor.log), + Work::UnknownBlockSamplingRequest { .. } => self + .work_queues + .unknown_block_sampling_request_queue + .push(work, work_id, &self.beacon_processor.log), + Work::ApiRequestP0 { .. } => self.work_queues.api_request_p0_queue.push( + work, + work_id, + &self.beacon_processor.log, + ), + Work::ApiRequestP1 { .. } => self.work_queues.api_request_p1_queue.push( + work, + work_id, + &self.beacon_processor.log, + ), + Work::Reprocess { .. 
} => { + // TODO(beacon-processor) what to do here + todo!() + } + } + Some(work_type) + } + + fn update_queue_metrics(&mut self, modified_queue_id: Option) { + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_WORKERS_ACTIVE_TOTAL, + self.beacon_processor.current_workers as i64, + ); + + if let Some(modified_queue_id) = modified_queue_id { + let queue_len = match modified_queue_id { + WorkType::GossipAttestation => self.work_queues.aggregate_queue.len(), + WorkType::UnknownBlockAttestation => { + self.work_queues.unknown_block_attestation_queue.len() + } + WorkType::GossipAttestationBatch => 0, // No queue + WorkType::GossipAggregate => self.work_queues.aggregate_queue.len(), + WorkType::UnknownBlockAggregate => { + self.work_queues.unknown_block_aggregate_queue.len() + } + WorkType::UnknownLightClientOptimisticUpdate => { + self.work_queues.unknown_light_client_update_queue.len() + } + WorkType::UnknownBlockSamplingRequest => { + self.work_queues.unknown_block_sampling_request_queue.len() + } + WorkType::GossipAggregateBatch => 0, // No queue + WorkType::GossipBlock => self.work_queues.gossip_block_queue.len(), + WorkType::GossipBlobSidecar => self.work_queues.gossip_blob_queue.len(), + WorkType::GossipDataColumnSidecar => { + self.work_queues.gossip_data_column_queue.len() + } + WorkType::DelayedImportBlock => self.work_queues.delayed_block_queue.len(), + WorkType::GossipVoluntaryExit => self.work_queues.gossip_voluntary_exit_queue.len(), + WorkType::GossipProposerSlashing => { + self.work_queues.gossip_proposer_slashing_queue.len() + } + WorkType::GossipAttesterSlashing => { + self.work_queues.gossip_attester_slashing_queue.len() + } + WorkType::GossipSyncSignature => self.work_queues.sync_message_queue.len(), + WorkType::GossipSyncContribution => self.work_queues.sync_contribution_queue.len(), + WorkType::GossipLightClientFinalityUpdate => { + self.work_queues.finality_update_queue.len() + } + WorkType::GossipLightClientOptimisticUpdate => { + 
self.work_queues.optimistic_update_queue.len() + } + WorkType::RpcBlock => self.work_queues.rpc_block_queue.len(), + WorkType::RpcBlobs | WorkType::IgnoredRpcBlock => { + self.work_queues.rpc_blob_queue.len() + } + WorkType::RpcCustodyColumn => self.work_queues.rpc_custody_column_queue.len(), + WorkType::RpcVerifyDataColumn => { + self.work_queues.rpc_verify_data_column_queue.len() + } + WorkType::SamplingResult => self.work_queues.sampling_result_queue.len(), + WorkType::ChainSegment => self.work_queues.chain_segment_queue.len(), + WorkType::ChainSegmentBackfill => self.work_queues.backfill_chain_segment.len(), + WorkType::Status => self.work_queues.status_queue.len(), + WorkType::BlocksByRangeRequest => self.work_queues.blbrange_queue.len(), + WorkType::BlocksByRootsRequest => self.work_queues.blbroots_queue.len(), + WorkType::BlobsByRangeRequest => self.work_queues.bbrange_queue.len(), + WorkType::BlobsByRootsRequest => self.work_queues.bbroots_queue.len(), + WorkType::DataColumnsByRootsRequest => self.work_queues.dcbroots_queue.len(), + WorkType::DataColumnsByRangeRequest => self.work_queues.dcbrange_queue.len(), + WorkType::GossipBlsToExecutionChange => { + self.work_queues.gossip_bls_to_execution_change_queue.len() + } + WorkType::LightClientBootstrapRequest => self.work_queues.lc_bootstrap_queue.len(), + WorkType::LightClientOptimisticUpdateRequest => { + self.work_queues.lc_optimistic_update_queue.len() + } + WorkType::LightClientFinalityUpdateRequest => { + self.work_queues.lc_finality_update_queue.len() + } + WorkType::ApiRequestP0 => self.work_queues.api_request_p0_queue.len(), + WorkType::ApiRequestP1 => self.work_queues.api_request_p1_queue.len(), + WorkType::Reprocess => 0, + }; + metrics::observe_vec( + &metrics::BEACON_PROCESSOR_QUEUE_LENGTH, + &[modified_queue_id.into()], + queue_len as f64, + ); + } + + if self.work_queues.aggregate_queue.is_full() + && self.work_queues.aggregate_debounce.elapsed() + { + error!( + self.beacon_processor.log, + 
"Aggregate attestation queue full"; + "msg" => "the system has insufficient resources for load", + "queue_len" => self.work_queues.aggregate_queue.max_length, + ) + } + + if self.work_queues.attestation_queue.is_full() + && self.work_queues.attestation_debounce.elapsed() + { + error!( + self.beacon_processor.log, + "Attestation queue full"; + "msg" => "the system has insufficient resources for load", + "queue_len" => self.work_queues.attestation_queue.max_length, + ) + } + } + + // TODO(beacon-processor) this can live outside of this struct in a more general location + fn worker_journal( + &self, + work_event: &Option>, + work_journal_tx: &Option>, + ) { + if let Some(work_journal_tx) = work_journal_tx { + let id = work_event + .as_ref() + .map(|event| event.work.str_id()) + .unwrap_or(WORKER_FREED); + + // We don't care if this message was successfully sent, we only use the journal + // during testing. + let _ = work_journal_tx.try_send(id); + } + } + + // TODO(beacon-processor) this can live outside of this struct in a more general location + fn increment_metrics(&self, work_event: &Option>) -> Option { + let _event_timer = metrics::start_timer(&metrics::BEACON_PROCESSOR_EVENT_HANDLING_SECONDS); + if let Some(event) = work_event { + metrics::inc_counter_vec( + &metrics::BEACON_PROCESSOR_WORK_EVENTS_RX_COUNT, + &[event.work.str_id()], + ); + } else { + metrics::inc_counter(&metrics::BEACON_PROCESSOR_IDLE_EVENTS_TOTAL); + } + _event_timer + } + + // TODO(beacon-processor) should we move spawn_worker outside of self? + /// Spawns a blocking worker thread to process some `Work`. + /// + /// Sends an message on `idle_tx` when the work is complete and the task is stopping. 
+    fn spawn_worker(&mut self, work: Work<E>) {
+        let work_id = work.str_id();
+
+        // The timer is moved into `SendOnDrop` below, so it lives exactly as long as that guard.
+        let worker_timer =
+            metrics::start_timer_vec(&metrics::BEACON_PROCESSOR_WORKER_TIME, &[work_id]);
+        metrics::inc_counter(&metrics::BEACON_PROCESSOR_WORKERS_SPAWNED_TOTAL);
+        metrics::inc_counter_vec(
+            &metrics::BEACON_PROCESSOR_WORK_EVENTS_STARTED_COUNT,
+            &[work.str_id()],
+        );
+
+        // Wrap the `idle_tx` in a struct that fires the idle message whenever it is dropped.
+        //
+        // This helps ensure the worker is always freed on an early exit or a panic, which is why
+        // the guard is built as early in the function as possible.
+        let send_idle_on_drop = SendOnDrop {
+            tx: self.outbound_events.idle_tx.clone(),
+            _worker_timer: worker_timer,
+            log: self.beacon_processor.log.clone(),
+        };
+
+        // The worker count before the increment doubles as the id logged for this worker.
+        let worker_id = self.beacon_processor.current_workers;
+        self.beacon_processor.current_workers = worker_id.saturating_add(1);
+
+        trace!(
+            self.beacon_processor.log,
+            "Spawning beacon processor worker";
+            "work" => work_id,
+            "worker" => worker_id,
+        );
+
+        let task_spawner = TaskSpawner {
+            executor: self.beacon_processor.executor.clone(),
+            send_idle_on_drop,
+        };
+
+        // Each arm hands its closure to `spawn_blocking` or its future to `spawn_async`.
+        match work {
+            Work::GossipAttestation {
+                attestation,
+                process_individual,
+                ..
+            } => task_spawner.spawn_blocking(move || {
+                process_individual(*attestation);
+            }),
+            Work::GossipAttestationBatch {
+                attestations,
+                process_batch,
+            } => task_spawner.spawn_blocking(move || {
+                process_batch(attestations);
+            }),
+            Work::GossipAggregate {
+                aggregate,
+                process_individual,
+                ..
+            } => task_spawner.spawn_blocking(move || {
+                process_individual(*aggregate);
+            }),
+            Work::GossipAggregateBatch {
+                aggregates,
+                process_batch,
+            } => task_spawner.spawn_blocking(move || {
+                process_batch(aggregates);
+            }),
+            Work::ChainSegment(segment_fut) => task_spawner.spawn_async(async move {
+                segment_fut.await;
+            }),
+            Work::UnknownBlockAttestation { process_fn }
+            | Work::UnknownBlockAggregate { process_fn }
+            | Work::UnknownLightClientOptimisticUpdate { process_fn, .. }
+            | Work::UnknownBlockSamplingRequest { process_fn } => {
+                task_spawner.spawn_blocking(process_fn)
+            }
+            Work::DelayedImportBlock { process_fn, .. } => task_spawner.spawn_async(process_fn),
+            Work::RpcBlock { process_fn }
+            | Work::RpcBlobs { process_fn }
+            | Work::RpcCustodyColumn(process_fn)
+            | Work::RpcVerifyDataColumn(process_fn)
+            | Work::SamplingResult(process_fn) => task_spawner.spawn_async(process_fn),
+            Work::IgnoredRpcBlock { process_fn } => task_spawner.spawn_blocking(process_fn),
+            Work::GossipBlock(gossip_fut)
+            | Work::GossipBlobSidecar(gossip_fut)
+            | Work::GossipDataColumnSidecar(gossip_fut) => {
+                task_spawner.spawn_async(async move {
+                    gossip_fut.await;
+                })
+            }
+            Work::BlobsByRangeRequest(process_fn)
+            | Work::BlobsByRootsRequest(process_fn)
+            | Work::DataColumnsByRootsRequest(process_fn)
+            | Work::DataColumnsByRangeRequest(process_fn) => {
+                task_spawner.spawn_blocking(process_fn)
+            }
+            Work::BlocksByRangeRequest(request_fut) | Work::BlocksByRootsRequest(request_fut) => {
+                task_spawner.spawn_async(request_fut)
+            }
+            Work::ChainSegmentBackfill(process_fn) => task_spawner.spawn_async(process_fn),
+            // API requests carry their own blocking/async flavour.
+            Work::ApiRequestP0(api_fn) | Work::ApiRequestP1(api_fn) => match api_fn {
+                BlockingOrAsync::Blocking(process_fn) => task_spawner.spawn_blocking(process_fn),
+                BlockingOrAsync::Async(process_fn) => task_spawner.spawn_async(process_fn),
+            },
+            Work::GossipVoluntaryExit(process_fn)
+            | Work::GossipProposerSlashing(process_fn)
+            | Work::GossipAttesterSlashing(process_fn)
+            | Work::GossipSyncSignature(process_fn)
+            | Work::GossipSyncContribution(process_fn)
+            | Work::GossipLightClientFinalityUpdate(process_fn)
+            | Work::GossipLightClientOptimisticUpdate(process_fn)
+            | Work::Status(process_fn)
+            | Work::GossipBlsToExecutionChange(process_fn)
+            | Work::LightClientBootstrapRequest(process_fn)
+            | Work::LightClientOptimisticUpdateRequest(process_fn)
+            | Work::LightClientFinalityUpdateRequest(process_fn) => {
+                task_spawner.spawn_blocking(process_fn)
+            }
+            Work::Reprocess(_reprocess_message) => {
+                // TODO(beacon-processor) send to the reprocess queue
+                todo!()
+            }
+        };
+    }
+}
diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_queue.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_queue.rs
new file mode 100644
index 00000000000..8b294354729
--- /dev/null
+++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_queue.rs
@@ -0,0 +1,350 @@
+use std::collections::VecDeque;
+
+use crate::Work;
+use logging::TimeLatch;
+use slog::{error, Logger};
+use types::{BeaconState, ChainSpec, EthSpec, RelativeEpoch};
+
+/// Over-provision queues based on active validator count by some factor. The beacon chain has
+/// strict churns that prevent the validator set size from changing rapidly. By over-provisioning
+/// slightly, we don't need to adjust the queues during the lifetime of a process.
+const ACTIVE_VALIDATOR_COUNT_OVERPROVISION_PERCENT: usize = 110;
+
+/// A bounded first-in-first-out queue: items leave in the order they arrived, and new items are
+/// rejected once `max_length` items are held.
+pub struct FifoQueue<T> {
+    queue: VecDeque<T>,
+    max_length: usize,
+}
+
+impl<T> FifoQueue<T> {
+    /// Create a new, empty queue that holds at most `max_length` items.
+    pub fn new(max_length: usize) -> Self {
+        Self {
+            queue: VecDeque::new(),
+            max_length,
+        }
+    }
+
+    /// Append `item` to the back of the queue.
+    ///
+    /// If the queue already holds `max_length` items, `item` is dropped and an error naming
+    /// `item_desc` is written to `log`.
+    pub fn push(&mut self, item: T, item_desc: &str, log: &Logger) {
+        if self.queue.len() == self.max_length {
+            error!(
+                log,
+                "Work queue is full";
+                "msg" => "the system has insufficient resources for load",
+                "queue_len" => self.max_length,
+                "queue" => item_desc,
+            );
+            return;
+        }
+
+        self.queue.push_back(item);
+    }
+
+    /// Remove the next item from the queue.
+    pub fn pop(&mut self) -> Option<T> {
+        self.queue.pop_front()
+    }
+
+    /// Returns the current length of the queue.
+    pub fn len(&self) -> usize {
+        self.queue.len()
+    }
+}
+
+/// A simple last-in-first-out queue with a maximum length.
+///
+/// New items go to the front and are popped from the front; when the queue is full the oldest
+/// item (at the back) is discarded to make room.
+pub struct LifoQueue<T> {
+    queue: VecDeque<T>,
+    pub max_length: usize,
+}
+
+impl<T> LifoQueue<T> {
+    /// Create a new, empty queue with the given length.
+    pub fn new(max_length: usize) -> Self {
+        Self {
+            queue: VecDeque::default(),
+            max_length,
+        }
+    }
+
+    /// Add a new item to the front of the queue.
+    ///
+    /// If the queue is full, the item(s) at the back of the queue are dropped so that the queue
+    /// never holds more than `max_length` items after the push.
+    ///
+    /// A queue configured with `max_length == 0` retains only the most recently pushed item.
+    /// Previously the `len == max_length` check never matched again after the first push, so
+    /// such a queue grew without bound.
+    pub fn push(&mut self, item: T) {
+        // Keep at most `max_length - 1` of the newest items (the front of the deque), then add
+        // the new one. `saturating_sub` covers `max_length == 0`.
+        self.queue.truncate(self.max_length.saturating_sub(1));
+        self.queue.push_front(item);
+    }
+
+    /// Remove the next item from the queue.
+    pub fn pop(&mut self) -> Option<T> {
+        self.queue.pop_front()
+    }
+
+    /// Returns `true` if the queue is full.
+    pub fn is_full(&self) -> bool {
+        self.queue.len() >= self.max_length
+    }
+
+    /// Returns the current length of the queue.
+    pub fn len(&self) -> usize {
+        self.queue.len()
+    }
+}
+
+/// Minimum length of the queues that are sized from the active validator count.
+///
+/// Integer division yields 0 for small validator sets, and a zero-length queue cannot hold that
+/// work type. 128 is an arbitrary value >= 1.
+const MIN_QUEUE_LEN: usize = 128;
+
+/// Maximum number of queued items that will be stored before dropping them
+pub struct BeaconProcessorQueueLengths {
+    aggregate_queue: usize,
+    attestation_queue: usize,
+    unknown_block_aggregate_queue: usize,
+    unknown_block_attestation_queue: usize,
+    sync_message_queue: usize,
+    sync_contribution_queue: usize,
+    gossip_voluntary_exit_queue: usize,
+    gossip_proposer_slashing_queue: usize,
+    gossip_attester_slashing_queue: usize,
+    finality_update_queue: usize,
+    optimistic_update_queue: usize,
+    unknown_light_client_update_queue: usize,
+    unknown_block_sampling_request_queue: usize,
+    rpc_block_queue: usize,
+    rpc_blob_queue: usize,
+    rpc_custody_column_queue: usize,
+    rpc_verify_data_column_queue: usize,
+    sampling_result_queue: usize,
+    chain_segment_queue: usize,
+    backfill_chain_segment: usize,
+    gossip_block_queue: usize,
+    gossip_blob_queue: usize,
+    gossip_data_column_queue: usize,
+    delayed_block_queue: usize,
+    status_queue: usize,
+    bbrange_queue: usize,
+    bbroots_queue: usize,
+    blbroots_queue: usize,
+    blbrange_queue: usize,
+    dcbroots_queue: usize,
+    dcbrange_queue: usize,
+    gossip_bls_to_execution_change_queue: usize,
+    lc_bootstrap_queue: usize,
+    lc_optimistic_update_queue: usize,
+    lc_finality_update_queue: usize,
+    api_request_p0_queue: usize,
+    api_request_p1_queue: usize,
+}
+
+impl BeaconProcessorQueueLengths {
+    /// Computes queue lengths from the number of validators active in `state`'s current epoch.
+    ///
+    /// Only the attestation queues scale with the validator count; they are clamped to at least
+    /// `MIN_QUEUE_LEN`. All other lengths are fixed.
+    ///
+    /// # Errors
+    ///
+    /// Returns a description of the failure if the active validator indices are not cached and
+    /// cannot be computed from `state`.
+    pub fn from_state<E: EthSpec>(
+        state: &BeaconState<E>,
+        spec: &ChainSpec,
+    ) -> Result<Self, String> {
+        // Prefer the cached indices; fall back to computing them when the cache lookup fails.
+        let active_validator_count =
+            match state.get_cached_active_validator_indices(RelativeEpoch::Current) {
+                Ok(indices) => indices.len(),
+                Err(_) => state
+                    .get_active_validator_indices(state.current_epoch(), spec)
+                    .map_err(|e| format!("Error computing active indices: {:?}", e))?
+                    .len(),
+            };
+        let active_validator_count =
+            (ACTIVE_VALIDATOR_COUNT_OVERPROVISION_PERCENT * active_validator_count) / 100;
+        let slots_per_epoch = E::slots_per_epoch() as usize;
+        // Capacity for a full slot's worth of attestations if subscribed to all subnets, but
+        // never less than `MIN_QUEUE_LEN` so small validator sets don't get a zero-length queue.
+        let attestations_per_slot = (active_validator_count / slots_per_epoch).max(MIN_QUEUE_LEN);
+
+        Ok(Self {
+            aggregate_queue: 4096,
+            unknown_block_aggregate_queue: 1024,
+            attestation_queue: attestations_per_slot,
+            unknown_block_attestation_queue: attestations_per_slot,
+            sync_message_queue: 2048,
+            sync_contribution_queue: 1024,
+            gossip_voluntary_exit_queue: 4096,
+            gossip_proposer_slashing_queue: 4096,
+            gossip_attester_slashing_queue: 4096,
+            finality_update_queue: 1024,
+            optimistic_update_queue: 1024,
+            unknown_block_sampling_request_queue: 16384,
+            unknown_light_client_update_queue: 128,
+            rpc_block_queue: 1024,
+            rpc_blob_queue: 1024,
+            // TODO(das): Placeholder values
+            rpc_custody_column_queue: 1000,
+            rpc_verify_data_column_queue: 1000,
+            sampling_result_queue: 1000,
+            chain_segment_queue: 64,
+            backfill_chain_segment: 64,
+            gossip_block_queue: 1024,
+            gossip_blob_queue: 1024,
+            gossip_data_column_queue: 1024,
+            delayed_block_queue: 1024,
+            status_queue: 1024,
+            bbrange_queue: 1024,
+            bbroots_queue: 1024,
+            blbroots_queue: 1024,
+            blbrange_queue: 1024,
+            // TODO(das): pick proper values
+            dcbroots_queue: 1024,
+            dcbrange_queue: 1024,
+            gossip_bls_to_execution_change_queue: 16384,
+            lc_bootstrap_queue: 1024,
+            lc_optimistic_update_queue: 512,
+            lc_finality_update_queue: 512,
+            api_request_p0_queue: 1024,
+            api_request_p1_queue: 1024,
+        })
+    }
+}
+
+/// One bounded queue per kind of work, plus the debounce latches for the two attestation queues.
+pub struct WorkQueues<E: EthSpec> {
+    pub aggregate_queue: LifoQueue<Work<E>>,
+    pub aggregate_debounce: TimeLatch,
+    pub attestation_queue: LifoQueue<Work<E>>,
+    pub attestation_debounce: TimeLatch,
+    pub unknown_block_aggregate_queue: LifoQueue<Work<E>>,
+    pub unknown_block_attestation_queue: LifoQueue<Work<E>>,
+    pub sync_message_queue: LifoQueue<Work<E>>,
+    pub sync_contribution_queue: LifoQueue<Work<E>>,
+    pub gossip_voluntary_exit_queue: FifoQueue<Work<E>>,
+    pub gossip_proposer_slashing_queue: FifoQueue<Work<E>>,
+    pub gossip_attester_slashing_queue: FifoQueue<Work<E>>,
+    pub finality_update_queue: FifoQueue<Work<E>>,
+    pub optimistic_update_queue: FifoQueue<Work<E>>,
+    pub unknown_light_client_update_queue: FifoQueue<Work<E>>,
+    pub unknown_block_sampling_request_queue: FifoQueue<Work<E>>,
+    pub rpc_block_queue: FifoQueue<Work<E>>,
+    pub rpc_blob_queue: FifoQueue<Work<E>>,
+    pub rpc_custody_column_queue: FifoQueue<Work<E>>,
+    pub rpc_verify_data_column_queue: FifoQueue<Work<E>>,
+    pub sampling_result_queue: FifoQueue<Work<E>>,
+    pub chain_segment_queue: FifoQueue<Work<E>>,
+    pub backfill_chain_segment: FifoQueue<Work<E>>,
+    pub gossip_block_queue: FifoQueue<Work<E>>,
+    pub gossip_blob_queue: FifoQueue<Work<E>>,
+    pub gossip_data_column_queue: FifoQueue<Work<E>>,
+    pub delayed_block_queue: FifoQueue<Work<E>>,
+    pub status_queue: FifoQueue<Work<E>>,
+    pub bbrange_queue: FifoQueue<Work<E>>,
+    pub bbroots_queue: FifoQueue<Work<E>>,
+    pub blbroots_queue: FifoQueue<Work<E>>,
+    pub blbrange_queue: FifoQueue<Work<E>>,
+    pub dcbroots_queue: FifoQueue<Work<E>>,
+    pub dcbrange_queue: FifoQueue<Work<E>>,
+    pub gossip_bls_to_execution_change_queue: FifoQueue<Work<E>>,
+    pub lc_bootstrap_queue: FifoQueue<Work<E>>,
+    pub lc_optimistic_update_queue: FifoQueue<Work<E>>,
+    pub lc_finality_update_queue: FifoQueue<Work<E>>,
+    pub api_request_p0_queue: FifoQueue<Work<E>>,
+    pub api_request_p1_queue: FifoQueue<Work<E>>,
+}
+
+impl<E: EthSpec> WorkQueues<E> {
+    /// Builds every queue empty, each bounded by the field of the same name in `queue_lengths`.
+    pub fn new(queue_lengths: BeaconProcessorQueueLengths) -> Self {
+        WorkQueues {
+            aggregate_queue: LifoQueue::new(queue_lengths.aggregate_queue),
+            aggregate_debounce: TimeLatch::default(),
+            attestation_queue: LifoQueue::new(queue_lengths.attestation_queue),
+            attestation_debounce: TimeLatch::default(),
+            unknown_block_aggregate_queue: LifoQueue::new(
+                queue_lengths.unknown_block_aggregate_queue,
+            ),
+            unknown_block_attestation_queue: LifoQueue::new(
+                queue_lengths.unknown_block_attestation_queue,
+            ),
+
+            sync_message_queue: LifoQueue::new(queue_lengths.sync_message_queue),
+            sync_contribution_queue: LifoQueue::new(queue_lengths.sync_contribution_queue),
+
+            // Using a FIFO queue for voluntary exits since it prevents exit censoring. I don't
+            // have a strong feeling about queue type for exits.
+            gossip_voluntary_exit_queue: FifoQueue::new(queue_lengths.gossip_voluntary_exit_queue),
+
+            // Using a FIFO queue for slashing to prevent people from flushing their slashings
+            // from the queues with lots of junk messages.
+            gossip_proposer_slashing_queue: FifoQueue::new(
+                queue_lengths.gossip_proposer_slashing_queue,
+            ),
+            gossip_attester_slashing_queue: FifoQueue::new(
+                queue_lengths.gossip_attester_slashing_queue,
+            ),
+
+            // Using a FIFO queue for light client updates to maintain sequence order.
+            finality_update_queue: FifoQueue::new(queue_lengths.finality_update_queue),
+            optimistic_update_queue: FifoQueue::new(queue_lengths.optimistic_update_queue),
+            unknown_light_client_update_queue: FifoQueue::new(
+                queue_lengths.unknown_light_client_update_queue,
+            ),
+            unknown_block_sampling_request_queue: FifoQueue::new(
+                queue_lengths.unknown_block_sampling_request_queue,
+            ),
+
+            // Using a FIFO queue since blocks need to be imported sequentially.
+            rpc_block_queue: FifoQueue::new(queue_lengths.rpc_block_queue),
+            rpc_blob_queue: FifoQueue::new(queue_lengths.rpc_blob_queue),
+            rpc_custody_column_queue: FifoQueue::new(queue_lengths.rpc_custody_column_queue),
+            rpc_verify_data_column_queue: FifoQueue::new(
+                queue_lengths.rpc_verify_data_column_queue,
+            ),
+            sampling_result_queue: FifoQueue::new(queue_lengths.sampling_result_queue),
+            chain_segment_queue: FifoQueue::new(queue_lengths.chain_segment_queue),
+            backfill_chain_segment: FifoQueue::new(queue_lengths.backfill_chain_segment),
+            gossip_block_queue: FifoQueue::new(queue_lengths.gossip_block_queue),
+            gossip_blob_queue: FifoQueue::new(queue_lengths.gossip_blob_queue),
+            gossip_data_column_queue: FifoQueue::new(queue_lengths.gossip_data_column_queue),
+            delayed_block_queue: FifoQueue::new(queue_lengths.delayed_block_queue),
+
+            status_queue: FifoQueue::new(queue_lengths.status_queue),
+            bbrange_queue: FifoQueue::new(queue_lengths.bbrange_queue),
+            bbroots_queue: FifoQueue::new(queue_lengths.bbroots_queue),
+            blbroots_queue: FifoQueue::new(queue_lengths.blbroots_queue),
+            blbrange_queue: FifoQueue::new(queue_lengths.blbrange_queue),
+            dcbroots_queue: FifoQueue::new(queue_lengths.dcbroots_queue),
+            dcbrange_queue: FifoQueue::new(queue_lengths.dcbrange_queue),
+
+            gossip_bls_to_execution_change_queue: FifoQueue::new(
+                queue_lengths.gossip_bls_to_execution_change_queue,
+            ),
+
+            lc_bootstrap_queue: FifoQueue::new(queue_lengths.lc_bootstrap_queue),
+            lc_optimistic_update_queue: FifoQueue::new(queue_lengths.lc_optimistic_update_queue),
+            lc_finality_update_queue: FifoQueue::new(queue_lengths.lc_finality_update_queue),
+
+            api_request_p0_queue: FifoQueue::new(queue_lengths.api_request_p0_queue),
+            api_request_p1_queue: FifoQueue::new(queue_lengths.api_request_p1_queue),
+        }
+    }
+}
diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs
new file mode 100644
index 00000000000..4d2edf142d6
--- /dev/null
+++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs
@@ -0,0 +1,1145 @@
+//! Provides a mechanism which queues work for later processing.
+//!
+//! When the `beacon_processor::Worker` imports a block that is acceptably early (i.e., within the
+//! gossip propagation tolerance) it will send it to this queue where it will be placed in a
+//! `DelayQueue` until the slot arrives. Once the block has been determined to be ready, it will be
+//! sent back out on a channel to be processed by the `BeaconProcessor` again.
+//!
+//! There is the edge-case where the slot arrives before this queue manages to process it. In that
+//! case, the block will be sent off for immediate processing (skipping the `DelayQueue`).
+//!
+//! 
Aggregated and unaggregated attestations that failed verification due to referencing an unknown
+//! block will be re-queued until their block is imported, or until they expire.
+use crate::ReprocessQueueMessage::*;
+use crate::{
+    metrics, IgnoredRpcBlock, QueuedAggregate, QueuedBackfillBatch, QueuedGossipBlock,
+    QueuedLightClientUpdate, QueuedRpcBlock, QueuedSamplingRequest, QueuedUnaggregate,
+    ReprocessQueueMessage,
+};
+use crate::{AsyncFn, BlockingFn, Work, WorkEvent};
+use fnv::FnvHashMap;
+use futures::task::Poll;
+use futures::{Stream, StreamExt};
+use itertools::Itertools;
+use logging::TimeLatch;
+use slog::{crit, debug, error, trace, warn, Logger};
+use slot_clock::SlotClock;
+use std::collections::{HashMap, HashSet};
+use std::future::Future;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::Context;
+use std::time::Duration;
+use strum::AsRefStr;
+use task_executor::TaskExecutor;
+use tokio::sync::mpsc::{self, Receiver, Sender};
+use tokio_util::time::delay_queue::{DelayQueue, Key as DelayKey};
+use types::{EthSpec, Hash256, Slot};
+
+// Name under which `spawn_reprocess_scheduler` spawns the scheduler task on the executor.
+const TASK_NAME: &str = "beacon_processor_reprocess_queue";
+// NOTE(review): per-queue labels; their use sites are outside this excerpt (presumably metric
+// labels; confirm).
+const GOSSIP_BLOCKS: &str = "gossip_blocks";
+const RPC_BLOCKS: &str = "rpc_blocks";
+const ATTESTATIONS: &str = "attestations";
+const LIGHT_CLIENT_UPDATES: &str = "lc_updates";
+
+/// Queue blocks for re-processing with an `ADDITIONAL_QUEUED_BLOCK_DELAY` after the slot starts.
+/// This is to account for any slight drift in the system clock.
+///
+/// `spawn_reprocess_scheduler` refuses to start unless this is strictly less than the maximum
+/// gossip clock disparity it is given.
+pub const ADDITIONAL_QUEUED_BLOCK_DELAY: Duration = Duration::from_millis(5);
+
+/// For how long to queue aggregated and unaggregated attestations for re-processing.
+pub const QUEUED_ATTESTATION_DELAY: Duration = Duration::from_secs(12);
+
+/// For how long to queue light client updates for re-processing.
+pub const QUEUED_LIGHT_CLIENT_UPDATE_DELAY: Duration = Duration::from_secs(12);
+
+/// For how long to queue rpc blocks before sending them back for reprocessing.
+pub const QUEUED_RPC_BLOCK_DELAY: Duration = Duration::from_secs(4);
+
+/// For how long to queue sampling requests for reprocessing.
+pub const QUEUED_SAMPLING_REQUESTS_DELAY: Duration = Duration::from_secs(12);
+
+/// Set an arbitrary upper-bound on the number of queued blocks to avoid DoS attacks. The fact that
+/// we signature-verify blocks before putting them in the queue *should* protect against this, but
+/// it's nice to have extra protection.
+///
+/// The same bound is applied to the early gossip block queue and to the rpc block delay queue.
+const MAXIMUM_QUEUED_BLOCKS: usize = 16;
+
+/// How many attestations we keep before new ones get dropped.
+///
+/// Aggregated and unaggregated attestations share one delay queue, so this bound covers both.
+const MAXIMUM_QUEUED_ATTESTATIONS: usize = 16_384;
+
+/// How many light client updates we keep before new ones get dropped.
+const MAXIMUM_QUEUED_LIGHT_CLIENT_UPDATES: usize = 128;
+
+/// How many sampling requests we queue before new ones get dropped.
+/// TODO(das): choose a sensible value
+const MAXIMUM_QUEUED_SAMPLING_REQUESTS: usize = 16_384;
+
+// Process backfill batch 50%, 60%, 80% through each slot.
+//
+// Each entry is a `(numerator, denominator)` fraction of a slot.
+//
+// Note: use caution to set these fractions in a way that won't cause panic-y
+// arithmetic.
+pub const BACKFILL_SCHEDULE_IN_SLOT: [(u32, u32); 3] = [
+    // One half: 6s on mainnet, 2.5s on Gnosis.
+    (1, 2),
+    // Three fifths: 7.2s on mainnet, 3s on Gnosis.
+    (3, 5),
+    // Four fifths: 9.6s on mainnet, 4s on Gnosis.
+    (4, 5),
+];
+
+/// Events sent by the scheduler once they are ready for re-processing.
+pub enum ReadyWork {
+    /// An early gossip block that is now ready for import.
+    Block(QueuedGossipBlock),
+    /// An rpc block to retry.
+    RpcBlock(QueuedRpcBlock),
+    /// An rpc block that should be ignored rather than processed.
+    IgnoredRpcBlock(IgnoredRpcBlock),
+    /// An unaggregated attestation ready for re-processing.
+    Unaggregate(QueuedUnaggregate),
+    /// An aggregated attestation ready for re-processing.
+    Aggregate(QueuedAggregate),
+    /// A light client update ready for re-processing.
+    LightClientUpdate(QueuedLightClientUpdate),
+    /// A sampling request ready for re-processing.
+    SamplingRequest(QueuedSamplingRequest),
+    /// A backfill batch ready for processing.
+    BackfillSync(QueuedBackfillBatch),
+}
+
+impl<E: EthSpec> From<ReadyWork> for WorkEvent<E> {
+    /// Converts a ready item into the `WorkEvent` handed back to the beacon processor.
+    ///
+    /// Block and backfill work is never dropped during sync; attestation, light client update
+    /// and sampling request work is.
+    fn from(ready_work: ReadyWork) -> Self {
+        let (drop_during_sync, work) = match ready_work {
+            ReadyWork::Block(QueuedGossipBlock {
+                beacon_block_slot,
+                beacon_block_root,
+                process_fn,
+            }) => (
+                false,
+                Work::DelayedImportBlock {
+                    beacon_block_slot,
+                    beacon_block_root,
+                    process_fn,
+                },
+            ),
+            ReadyWork::RpcBlock(QueuedRpcBlock { process_fn, .. }) => {
+                (false, Work::RpcBlock { process_fn })
+            }
+            ReadyWork::IgnoredRpcBlock(IgnoredRpcBlock { process_fn }) => {
+                (false, Work::IgnoredRpcBlock { process_fn })
+            }
+            ReadyWork::Unaggregate(QueuedUnaggregate { process_fn, .. }) => {
+                (true, Work::UnknownBlockAttestation { process_fn })
+            }
+            ReadyWork::Aggregate(QueuedAggregate { process_fn, .. }) => {
+                (true, Work::UnknownBlockAggregate { process_fn })
+            }
+            ReadyWork::LightClientUpdate(QueuedLightClientUpdate {
+                parent_root,
+                process_fn,
+            }) => (
+                true,
+                Work::UnknownLightClientOptimisticUpdate {
+                    parent_root,
+                    process_fn,
+                },
+            ),
+            ReadyWork::SamplingRequest(QueuedSamplingRequest { process_fn, .. }) => {
+                (true, Work::UnknownBlockSamplingRequest { process_fn })
+            }
+            ReadyWork::BackfillSync(QueuedBackfillBatch(process_fn)) => {
+                (false, Work::ChainSegmentBackfill(process_fn))
+            }
+        };
+
+        Self {
+            drop_during_sync,
+            work,
+        }
+    }
+}
+
+/// Unifies the different messages processed by the block delay queue.
+enum InboundEvent { + /// A gossip block that was queued for later processing and is ready for import. + ReadyGossipBlock(QueuedGossipBlock), + /// A rpc block that was queued because the same gossip block was being imported + /// will now be retried for import. + ReadyRpcBlock(QueuedRpcBlock), + /// An aggregated or unaggregated attestation is ready for re-processing. + ReadyAttestation(QueuedAttestationId), + /// A light client update that is ready for re-processing. + ReadyLightClientUpdate(QueuedLightClientUpdateId), + /// A backfill batch that was queued is ready for processing. + ReadyBackfillSync(QueuedBackfillBatch), + /// A message sent to the `ReprocessQueue` + Msg(ReprocessQueueMessage), +} + +/// Manages scheduling works that need to be later re-processed. +struct ReprocessQueue { + /// Receiver of messages relevant to schedule works for reprocessing. + work_reprocessing_rx: Receiver, + /// Sender of works once they become ready + ready_work_tx: Sender, + + /* Queues */ + /// Queue to manage scheduled early blocks. + gossip_block_delay_queue: DelayQueue, + /// Queue to manage scheduled early blocks. + rpc_block_delay_queue: DelayQueue, + /// Queue to manage scheduled attestations. + attestations_delay_queue: DelayQueue, + /// Queue to manage scheduled light client updates. + lc_updates_delay_queue: DelayQueue, + /// Queue to manage scheduled sampling requests + sampling_requests_delay_queue: DelayQueue, + + /* Queued items */ + /// Queued blocks. + queued_gossip_block_roots: HashSet, + /// Queued aggregated attestations. + queued_aggregates: FnvHashMap, + /// Queued attestations. + queued_unaggregates: FnvHashMap, + /// Attestations (aggregated and unaggregated) per root. + awaiting_attestations_per_root: HashMap>, + /// Queued Light Client Updates. + queued_lc_updates: FnvHashMap, + /// Light Client Updates per parent_root. + awaiting_lc_updates_per_parent_root: HashMap>, + /// Queued sampling requests. 
+ queued_sampling_requests: FnvHashMap, + /// Sampling requests per block root. + awaiting_sampling_requests_per_block_root: HashMap>, + /// Queued backfill batches + queued_backfill_batches: Vec, + + /* Aux */ + /// Next attestation id, used for both aggregated and unaggregated attestations + next_attestation: usize, + next_lc_update: usize, + next_sampling_request_update: usize, + early_block_debounce: TimeLatch, + rpc_block_debounce: TimeLatch, + attestation_delay_debounce: TimeLatch, + lc_update_delay_debounce: TimeLatch, + sampling_request_delay_debounce: TimeLatch, + next_backfill_batch_event: Option>>, + slot_clock: Arc, +} + +pub type QueuedLightClientUpdateId = usize; +pub type QueuedSamplingRequestId = usize; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum QueuedAttestationId { + Aggregate(usize), + Unaggregate(usize), +} + +impl QueuedAggregate { + pub fn beacon_block_root(&self) -> &Hash256 { + &self.beacon_block_root + } +} + +impl QueuedUnaggregate { + pub fn beacon_block_root(&self) -> &Hash256 { + &self.beacon_block_root + } +} + +impl Stream for ReprocessQueue { + type Item = InboundEvent; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + // NOTE: implementing `Stream` is not necessary but allows to maintain the future selection + // order fine-grained and separate from the logic of handling each message, which is nice. + + // Poll for expired blocks *before* we try to process new blocks. + // + // The sequential nature of blockchains means it is generally better to try and import all + // existing blocks before new ones. + match self.gossip_block_delay_queue.poll_expired(cx) { + Poll::Ready(Some(queued_block)) => { + return Poll::Ready(Some(InboundEvent::ReadyGossipBlock( + queued_block.into_inner(), + ))); + } + // `Poll::Ready(None)` means that there are no more entries in the delay queue and we + // will continue to get this result until something else is added into the queue. 
+ Poll::Ready(None) | Poll::Pending => (), + } + + match self.rpc_block_delay_queue.poll_expired(cx) { + Poll::Ready(Some(queued_block)) => { + return Poll::Ready(Some(InboundEvent::ReadyRpcBlock(queued_block.into_inner()))); + } + // `Poll::Ready(None)` means that there are no more entries in the delay queue and we + // will continue to get this result until something else is added into the queue. + Poll::Ready(None) | Poll::Pending => (), + } + + match self.attestations_delay_queue.poll_expired(cx) { + Poll::Ready(Some(attestation_id)) => { + return Poll::Ready(Some(InboundEvent::ReadyAttestation( + attestation_id.into_inner(), + ))); + } + // `Poll::Ready(None)` means that there are no more entries in the delay queue and we + // will continue to get this result until something else is added into the queue. + Poll::Ready(None) | Poll::Pending => (), + } + + match self.lc_updates_delay_queue.poll_expired(cx) { + Poll::Ready(Some(lc_id)) => { + return Poll::Ready(Some(InboundEvent::ReadyLightClientUpdate( + lc_id.into_inner(), + ))); + } + // `Poll::Ready(None)` means that there are no more entries in the delay queue and we + // will continue to get this result until something else is added into the queue. + Poll::Ready(None) | Poll::Pending => (), + } + + if let Some(next_backfill_batch_event) = self.next_backfill_batch_event.as_mut() { + match next_backfill_batch_event.as_mut().poll(cx) { + Poll::Ready(_) => { + let maybe_batch = self.queued_backfill_batches.pop(); + self.recompute_next_backfill_batch_event(); + + if let Some(batch) = maybe_batch { + return Poll::Ready(Some(InboundEvent::ReadyBackfillSync(batch))); + } + } + Poll::Pending => (), + } + } + + // Last empty the messages channel. 
+ match self.work_reprocessing_rx.poll_recv(cx) { + Poll::Ready(Some(message)) => return Poll::Ready(Some(InboundEvent::Msg(message))), + Poll::Ready(None) | Poll::Pending => {} + } + + Poll::Pending + } +} + +/// Starts the job that manages scheduling works that need re-processing. The returned `Sender` +/// gives the communicating channel to receive those works. Once a work is ready, it is sent back +/// via `ready_work_tx`. +pub fn spawn_reprocess_scheduler( + ready_work_tx: Sender, + work_reprocessing_rx: Receiver, + executor: &TaskExecutor, + slot_clock: Arc, + log: Logger, + maximum_gossip_clock_disparity: Duration, +) -> Result<(), String> { + // Sanity check + if ADDITIONAL_QUEUED_BLOCK_DELAY >= maximum_gossip_clock_disparity { + return Err("The block delay and gossip disparity don't match.".to_string()); + } + let mut queue = ReprocessQueue::new(ready_work_tx, work_reprocessing_rx, slot_clock); + + executor.spawn( + async move { + while let Some(msg) = queue.next().await { + queue.handle_message(msg, &log); + } + + debug!( + log, + "Re-process queue stopped"; + "msg" => "shutting down" + ); + }, + TASK_NAME, + ); + Ok(()) +} + +impl ReprocessQueue { + fn new( + ready_work_tx: Sender, + work_reprocessing_rx: Receiver, + slot_clock: Arc, + ) -> Self { + ReprocessQueue { + work_reprocessing_rx, + ready_work_tx, + gossip_block_delay_queue: DelayQueue::new(), + rpc_block_delay_queue: DelayQueue::new(), + attestations_delay_queue: DelayQueue::new(), + lc_updates_delay_queue: DelayQueue::new(), + sampling_requests_delay_queue: <_>::default(), + queued_gossip_block_roots: HashSet::new(), + queued_lc_updates: FnvHashMap::default(), + queued_aggregates: FnvHashMap::default(), + queued_unaggregates: FnvHashMap::default(), + queued_sampling_requests: <_>::default(), + awaiting_attestations_per_root: HashMap::new(), + awaiting_lc_updates_per_parent_root: HashMap::new(), + awaiting_sampling_requests_per_block_root: <_>::default(), + queued_backfill_batches: Vec::new(), 
+ next_attestation: 0, + next_lc_update: 0, + next_sampling_request_update: 0, + early_block_debounce: TimeLatch::default(), + rpc_block_debounce: TimeLatch::default(), + attestation_delay_debounce: TimeLatch::default(), + lc_update_delay_debounce: TimeLatch::default(), + sampling_request_delay_debounce: <_>::default(), + next_backfill_batch_event: None, + slot_clock, + } + } + + fn handle_message(&mut self, msg: InboundEvent, log: &Logger) { + match msg { + // Some block has been indicated as "early" and should be processed when the + // appropriate slot arrives. + InboundEvent::Msg(EarlyBlock(early_block)) => { + let block_slot = early_block.beacon_block_slot; + let block_root = early_block.beacon_block_root; + + // Don't add the same block to the queue twice. This prevents DoS attacks. + if self.queued_gossip_block_roots.contains(&block_root) { + return; + } + + if let Some(duration_till_slot) = self.slot_clock.duration_to_slot(block_slot) { + // Check to ensure this won't over-fill the queue. + if self.queued_gossip_block_roots.len() >= MAXIMUM_QUEUED_BLOCKS { + if self.early_block_debounce.elapsed() { + warn!( + log, + "Early blocks queue is full"; + "queue_size" => MAXIMUM_QUEUED_BLOCKS, + "msg" => "check system clock" + ); + } + // Drop the block. + return; + } + + self.queued_gossip_block_roots.insert(block_root); + // Queue the block until the start of the appropriate slot, plus + // `ADDITIONAL_QUEUED_BLOCK_DELAY`. + self.gossip_block_delay_queue.insert( + early_block, + duration_till_slot + ADDITIONAL_QUEUED_BLOCK_DELAY, + ); + } else { + // If there is no duration till the next slot, check to see if the slot + // has already arrived. If it has already arrived, send it out for + // immediate processing. + // + // If we can't read the slot or the slot hasn't arrived, simply drop the + // block. 
+ // + // This logic is slightly awkward since `SlotClock::duration_to_slot` + // doesn't distinguish between a slot that has already arrived and an + // error reading the slot clock. + if let Some(now) = self.slot_clock.now() { + if block_slot <= now + && self + .ready_work_tx + .try_send(ReadyWork::Block(early_block)) + .is_err() + { + error!( + log, + "Failed to send block"; + ); + } + } + } + } + // A rpc block arrived for processing at the same time when a gossip block + // for the same block hash is being imported. We wait for `QUEUED_RPC_BLOCK_DELAY` + // and then send the rpc block back for processing assuming the gossip import + // has completed by then. + InboundEvent::Msg(RpcBlock(rpc_block)) => { + // Check to ensure this won't over-fill the queue. + if self.rpc_block_delay_queue.len() >= MAXIMUM_QUEUED_BLOCKS { + if self.rpc_block_debounce.elapsed() { + warn!( + log, + "RPC blocks queue is full"; + "queue_size" => MAXIMUM_QUEUED_BLOCKS, + "msg" => "check system clock" + ); + } + // Return the block to the beacon processor signalling to + // ignore processing for this block + if self + .ready_work_tx + .try_send(ReadyWork::IgnoredRpcBlock(IgnoredRpcBlock { + process_fn: rpc_block.ignore_fn, + })) + .is_err() + { + error!( + log, + "Failed to send rpc block to beacon processor"; + ); + } + return; + } + + // Queue the block for 1/3rd of a slot + self.rpc_block_delay_queue + .insert(rpc_block, QUEUED_RPC_BLOCK_DELAY); + } + InboundEvent::ReadyRpcBlock(queued_rpc_block) => { + debug!( + log, + "Sending rpc block for reprocessing"; + "block_root" => %queued_rpc_block.beacon_block_root + ); + if self + .ready_work_tx + .try_send(ReadyWork::RpcBlock(queued_rpc_block)) + .is_err() + { + error!( + log, + "Failed to send rpc block to beacon processor"; + ); + } + } + InboundEvent::Msg(UnknownBlockAggregate(queued_aggregate)) => { + if self.attestations_delay_queue.len() >= MAXIMUM_QUEUED_ATTESTATIONS { + if self.attestation_delay_debounce.elapsed() { + error!( + 
log, + "Aggregate attestation delay queue is full"; + "queue_size" => MAXIMUM_QUEUED_ATTESTATIONS, + "msg" => "check system clock" + ); + } + // Drop the attestation. + return; + } + + let att_id = QueuedAttestationId::Aggregate(self.next_attestation); + + // Register the delay. + let delay_key = self + .attestations_delay_queue + .insert(att_id, QUEUED_ATTESTATION_DELAY); + + // Register this attestation for the corresponding root. + self.awaiting_attestations_per_root + .entry(*queued_aggregate.beacon_block_root()) + .or_default() + .push(att_id); + + // Store the attestation and its info. + self.queued_aggregates + .insert(self.next_attestation, (queued_aggregate, delay_key)); + + self.next_attestation += 1; + } + InboundEvent::Msg(UnknownBlockUnaggregate(queued_unaggregate)) => { + if self.attestations_delay_queue.len() >= MAXIMUM_QUEUED_ATTESTATIONS { + if self.attestation_delay_debounce.elapsed() { + error!( + log, + "Attestation delay queue is full"; + "queue_size" => MAXIMUM_QUEUED_ATTESTATIONS, + "msg" => "check system clock" + ); + } + // Drop the attestation. + return; + } + + let att_id = QueuedAttestationId::Unaggregate(self.next_attestation); + + // Register the delay. + let delay_key = self + .attestations_delay_queue + .insert(att_id, QUEUED_ATTESTATION_DELAY); + + // Register this attestation for the corresponding root. + self.awaiting_attestations_per_root + .entry(*queued_unaggregate.beacon_block_root()) + .or_default() + .push(att_id); + + // Store the attestation and its info. 
+ self.queued_unaggregates + .insert(self.next_attestation, (queued_unaggregate, delay_key)); + + self.next_attestation += 1; + } + InboundEvent::Msg(UnknownLightClientOptimisticUpdate( + queued_light_client_optimistic_update, + )) => { + if self.lc_updates_delay_queue.len() >= MAXIMUM_QUEUED_LIGHT_CLIENT_UPDATES { + if self.lc_update_delay_debounce.elapsed() { + error!( + log, + "Light client updates delay queue is full"; + "queue_size" => MAXIMUM_QUEUED_LIGHT_CLIENT_UPDATES, + "msg" => "check system clock" + ); + } + // Drop the light client update. + return; + } + + let lc_id: QueuedLightClientUpdateId = self.next_lc_update; + + // Register the delay. + let delay_key = self + .lc_updates_delay_queue + .insert(lc_id, QUEUED_LIGHT_CLIENT_UPDATE_DELAY); + + // Register the light client update for the corresponding root. + self.awaiting_lc_updates_per_parent_root + .entry(queued_light_client_optimistic_update.parent_root) + .or_default() + .push(lc_id); + + // Store the light client update and its info. + self.queued_lc_updates.insert( + self.next_lc_update, + (queued_light_client_optimistic_update, delay_key), + ); + + self.next_lc_update += 1; + } + InboundEvent::Msg(UnknownBlockSamplingRequest(queued_sampling_request)) => { + if self.sampling_requests_delay_queue.len() >= MAXIMUM_QUEUED_SAMPLING_REQUESTS { + if self.sampling_request_delay_debounce.elapsed() { + error!( + log, + "Sampling requests delay queue is full"; + "queue_size" => MAXIMUM_QUEUED_SAMPLING_REQUESTS, + ); + } + // Drop the inbound message. + return; + } + + let id: QueuedSamplingRequestId = self.next_sampling_request_update; + self.next_sampling_request_update += 1; + + // Register the delay. 
+ let delay_key = self + .sampling_requests_delay_queue + .insert(id, QUEUED_SAMPLING_REQUESTS_DELAY); + + self.awaiting_sampling_requests_per_block_root + .entry(queued_sampling_request.beacon_block_root) + .or_default() + .push(id); + + self.queued_sampling_requests + .insert(id, (queued_sampling_request, delay_key)); + } + InboundEvent::Msg(BlockImported { + block_root, + parent_root, + }) => { + // Unqueue the attestations we have for this root, if any. + if let Some(queued_ids) = self.awaiting_attestations_per_root.remove(&block_root) { + let mut sent_count = 0; + let mut failed_to_send_count = 0; + + for id in queued_ids { + metrics::inc_counter( + &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_MATCHED_ATTESTATIONS, + ); + + if let Some((work, delay_key)) = match id { + QueuedAttestationId::Aggregate(id) => self + .queued_aggregates + .remove(&id) + .map(|(aggregate, delay_key)| { + (ReadyWork::Aggregate(aggregate), delay_key) + }), + QueuedAttestationId::Unaggregate(id) => self + .queued_unaggregates + .remove(&id) + .map(|(unaggregate, delay_key)| { + (ReadyWork::Unaggregate(unaggregate), delay_key) + }), + } { + // Remove the delay. + self.attestations_delay_queue.remove(&delay_key); + + // Send the work. + if self.ready_work_tx.try_send(work).is_err() { + failed_to_send_count += 1; + } else { + sent_count += 1; + } + } else { + // There is a mismatch between the attestation ids registered for this + // root and the queued attestations. This should never happen. + error!( + log, + "Unknown queued attestation for block root"; + "block_root" => ?block_root, + "att_id" => ?id, + ); + } + } + + if failed_to_send_count > 0 { + error!( + log, + "Ignored scheduled attestation(s) for block"; + "hint" => "system may be overloaded", + "parent_root" => ?parent_root, + "block_root" => ?block_root, + "failed_count" => failed_to_send_count, + "sent_count" => sent_count, + ); + } + } + // Unqueue the sampling requests we have for this root, if any. 
+ if let Some(queued_ids) = self + .awaiting_sampling_requests_per_block_root + .remove(&block_root) + { + let mut sent_count = 0; + let mut failed_to_send_count = 0; + + for id in queued_ids { + metrics::inc_counter( + &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_MATCHED_SAMPLING_REQUESTS, + ); + + if let Some((queued, delay_key)) = self.queued_sampling_requests.remove(&id) + { + // Remove the delay. + self.sampling_requests_delay_queue.remove(&delay_key); + + // Send the work. + let work = ReadyWork::SamplingRequest(queued); + + if self.ready_work_tx.try_send(work).is_err() { + failed_to_send_count += 1; + } else { + sent_count += 1; + } + } else { + // This should never happen. + error!(log, "Unknown sampling request for block root"; "block_root" => ?block_root, "id" => ?id); + } + } + + if failed_to_send_count > 0 { + error!( + log, + "Ignored scheduled sampling requests for block"; + "hint" => "system may be overloaded", + "block_root" => ?block_root, + "failed_count" => failed_to_send_count, + "sent_count" => sent_count, + ); + } + } + } + InboundEvent::Msg(NewLightClientOptimisticUpdate { parent_root }) => { + // Unqueue the light client optimistic updates we have for this root, if any. 
+ if let Some(queued_lc_id) = self + .awaiting_lc_updates_per_parent_root + .remove(&parent_root) + { + debug!( + log, + "Dequeuing light client optimistic updates"; + "parent_root" => %parent_root, + "count" => queued_lc_id.len(), + ); + + for lc_id in queued_lc_id { + metrics::inc_counter( + &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_MATCHED_OPTIMISTIC_UPDATES, + ); + if let Some((work, delay_key)) = self.queued_lc_updates.remove(&lc_id).map( + |(light_client_optimistic_update, delay_key)| { + ( + ReadyWork::LightClientUpdate(light_client_optimistic_update), + delay_key, + ) + }, + ) { + // Remove the delay + self.lc_updates_delay_queue.remove(&delay_key); + + // Send the work + match self.ready_work_tx.try_send(work) { + Ok(_) => trace!( + log, + "reprocessing light client update sent"; + ), + Err(_) => error!( + log, + "Failed to send scheduled light client update"; + ), + } + } else { + // There is a mismatch between the light client update ids registered for this + // root and the queued light client updates. This should never happen. + error!( + log, + "Unknown queued light client update for parent root"; + "parent_root" => ?parent_root, + "lc_id" => ?lc_id, + ); + } + } + } + } + InboundEvent::Msg(BackfillSync(queued_backfill_batch)) => { + self.queued_backfill_batches + .insert(0, queued_backfill_batch); + // only recompute if there is no `next_backfill_batch_event` already scheduled + if self.next_backfill_batch_event.is_none() { + self.recompute_next_backfill_batch_event(); + } + } + // A block that was queued for later processing is now ready to be processed. + InboundEvent::ReadyGossipBlock(ready_block) => { + let block_root = ready_block.beacon_block_root; + + if !self.queued_gossip_block_roots.remove(&block_root) { + // Log an error to alert that we've made a bad assumption about how this + // program works, but still process the block anyway. 
+ error!( + log, + "Unknown block in delay queue"; + "block_root" => ?block_root + ); + } + + if self + .ready_work_tx + .try_send(ReadyWork::Block(ready_block)) + .is_err() + { + error!( + log, + "Failed to pop queued block"; + ); + } + } + InboundEvent::ReadyAttestation(queued_id) => { + metrics::inc_counter( + &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_EXPIRED_ATTESTATIONS, + ); + + if let Some((root, work)) = match queued_id { + QueuedAttestationId::Aggregate(id) => { + self.queued_aggregates + .remove(&id) + .map(|(aggregate, _delay_key)| { + ( + *aggregate.beacon_block_root(), + ReadyWork::Aggregate(aggregate), + ) + }) + } + QueuedAttestationId::Unaggregate(id) => self + .queued_unaggregates + .remove(&id) + .map(|(unaggregate, _delay_key)| { + ( + *unaggregate.beacon_block_root(), + ReadyWork::Unaggregate(unaggregate), + ) + }), + } { + if self.ready_work_tx.try_send(work).is_err() { + error!( + log, + "Ignored scheduled attestation"; + "hint" => "system may be overloaded", + "beacon_block_root" => ?root + ); + } + + if let Some(queued_atts) = self.awaiting_attestations_per_root.get_mut(&root) { + if let Some(index) = queued_atts.iter().position(|&id| id == queued_id) { + queued_atts.swap_remove(index); + } + } + } + } + InboundEvent::ReadyLightClientUpdate(queued_id) => { + metrics::inc_counter( + &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_EXPIRED_OPTIMISTIC_UPDATES, + ); + + if let Some((parent_root, work)) = self.queued_lc_updates.remove(&queued_id).map( + |(queued_lc_update, _delay_key)| { + ( + queued_lc_update.parent_root, + ReadyWork::LightClientUpdate(queued_lc_update), + ) + }, + ) { + if self.ready_work_tx.try_send(work).is_err() { + error!( + log, + "Failed to send scheduled light client optimistic update"; + ); + } + + if let Some(queued_lc_updates) = self + .awaiting_lc_updates_per_parent_root + .get_mut(&parent_root) + { + if let Some(index) = + queued_lc_updates.iter().position(|&id| id == queued_id) + { + 
queued_lc_updates.swap_remove(index); + } + } + } + } + InboundEvent::ReadyBackfillSync(queued_backfill_batch) => { + let millis_from_slot_start = self + .slot_clock + .millis_from_current_slot_start() + .map_or("null".to_string(), |duration| { + duration.as_millis().to_string() + }); + + debug!( + log, + "Sending scheduled backfill work"; + "millis_from_slot_start" => millis_from_slot_start + ); + + match self + .ready_work_tx + .try_send(ReadyWork::BackfillSync(queued_backfill_batch)) + { + // The message was sent successfully. + Ok(()) => (), + // The message was not sent, recover it from the returned `Err`. + Err(mpsc::error::TrySendError::Full(ReadyWork::BackfillSync(batch))) + | Err(mpsc::error::TrySendError::Closed(ReadyWork::BackfillSync(batch))) => { + error!( + log, + "Failed to send scheduled backfill work"; + "info" => "sending work back to queue" + ); + self.queued_backfill_batches.insert(0, batch); + + // only recompute if there is no `next_backfill_batch_event` already scheduled + if self.next_backfill_batch_event.is_none() { + self.recompute_next_backfill_batch_event(); + } + } + // The message was not sent and we didn't get the correct + // return result. This is a logic error. 
+ _ => crit!( + log, + "Unexpected return from try_send error"; + ), + } + } + } + + metrics::set_gauge_vec( + &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_TOTAL, + &[GOSSIP_BLOCKS], + self.gossip_block_delay_queue.len() as i64, + ); + metrics::set_gauge_vec( + &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_TOTAL, + &[RPC_BLOCKS], + self.rpc_block_delay_queue.len() as i64, + ); + metrics::set_gauge_vec( + &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_TOTAL, + &[ATTESTATIONS], + self.attestations_delay_queue.len() as i64, + ); + metrics::set_gauge_vec( + &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_TOTAL, + &[LIGHT_CLIENT_UPDATES], + self.lc_updates_delay_queue.len() as i64, + ); + } + + fn recompute_next_backfill_batch_event(&mut self) { + // only recompute the `next_backfill_batch_event` if there are backfill batches in the queue + if !self.queued_backfill_batches.is_empty() { + self.next_backfill_batch_event = Some(Box::pin(tokio::time::sleep( + ReprocessQueue::::duration_until_next_backfill_batch_event(&self.slot_clock), + ))); + } else { + self.next_backfill_batch_event = None + } + } + + /// Returns duration until the next scheduled processing time. The schedule ensure that backfill + /// processing is done in windows of time that aren't critical + fn duration_until_next_backfill_batch_event(slot_clock: &S) -> Duration { + let slot_duration = slot_clock.slot_duration(); + slot_clock + .millis_from_current_slot_start() + .and_then(|duration_from_slot_start| { + BACKFILL_SCHEDULE_IN_SLOT + .into_iter() + // Convert fractions to seconds from slot start. 
+ .map(|(multiplier, divisor)| (slot_duration / divisor) * multiplier) + .find_or_first(|&event_duration_from_slot_start| { + event_duration_from_slot_start > duration_from_slot_start + }) + .map(|next_event_time| { + if duration_from_slot_start >= next_event_time { + // event is in the next slot, add duration to next slot + let duration_to_next_slot = slot_duration - duration_from_slot_start; + duration_to_next_slot + next_event_time + } else { + next_event_time - duration_from_slot_start + } + }) + }) + // If we can't read the slot clock, just wait another slot. + .unwrap_or(slot_duration) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use logging::test_logger; + use slot_clock::{ManualSlotClock, TestingSlotClock}; + use std::ops::Add; + use std::sync::Arc; + use task_executor::test_utils::TestRuntime; + + #[test] + fn backfill_processing_schedule_calculation() { + let slot_duration = Duration::from_secs(12); + let slot_clock = TestingSlotClock::new(Slot::new(0), Duration::from_secs(0), slot_duration); + let current_slot_start = slot_clock.start_of(Slot::new(100)).unwrap(); + slot_clock.set_current_time(current_slot_start); + + let event_times = BACKFILL_SCHEDULE_IN_SLOT + .map(|(multiplier, divisor)| (slot_duration / divisor) * multiplier); + + for &event_duration_from_slot_start in event_times.iter() { + let duration_to_next_event = + ReprocessQueue::::duration_until_next_backfill_batch_event( + &slot_clock, + ); + + let current_time = slot_clock.millis_from_current_slot_start().unwrap(); + + assert_eq!( + duration_to_next_event, + event_duration_from_slot_start - current_time + ); + + slot_clock.set_current_time(current_slot_start + event_duration_from_slot_start) + } + + // check for next event beyond the current slot + let duration_to_next_slot = slot_clock.duration_to_next_slot().unwrap(); + let duration_to_next_event = + ReprocessQueue::::duration_until_next_backfill_batch_event( + &slot_clock, + ); + assert_eq!( + duration_to_next_event, + 
duration_to_next_slot + event_times[0] + ); + } + + // Regression test for issue #5504. + // See: https://github.com/sigp/lighthouse/issues/5504#issuecomment-2050930045 + #[tokio::test] + async fn backfill_schedule_failed_should_reschedule() { + let runtime = TestRuntime::default(); + let log = test_logger(); + let (work_reprocessing_tx, work_reprocessing_rx) = mpsc::channel(1); + let (ready_work_tx, mut ready_work_rx) = mpsc::channel(1); + let slot_duration = 12; + let slot_clock = Arc::new(testing_slot_clock(slot_duration)); + + spawn_reprocess_scheduler( + ready_work_tx.clone(), + work_reprocessing_rx, + &runtime.task_executor, + slot_clock.clone(), + log, + Duration::from_millis(500), + ) + .unwrap(); + + // Pause time so it only advances manually + tokio::time::pause(); + + // Send some random work to `ready_work_tx` to fill up the capacity first. + ready_work_tx + .try_send(ReadyWork::IgnoredRpcBlock(IgnoredRpcBlock { + process_fn: Box::new(|| {}), + })) + .unwrap(); + + // Now queue a backfill sync batch. + work_reprocessing_tx + .try_send(ReprocessQueueMessage::BackfillSync(QueuedBackfillBatch( + Box::pin(async {}), + ))) + .unwrap(); + tokio::task::yield_now().await; + + // Advance the time by more than 1/2 the slot to trigger a scheduled backfill batch to be sent. + // This should fail as the `ready_work` channel is at capacity, and it should be rescheduled. + let duration_to_next_event = + ReprocessQueue::duration_until_next_backfill_batch_event(slot_clock.as_ref()); + let one_ms = Duration::from_millis(1); + advance_time(&slot_clock, duration_to_next_event.add(one_ms)).await; + + // Now drain the `ready_work` channel. + assert!(matches!( + ready_work_rx.try_recv(), + Ok(ReadyWork::IgnoredRpcBlock { .. }) + )); + assert!(ready_work_rx.try_recv().is_err()); + + // Advance time again, and assert that the re-scheduled batch is successfully sent. 
+ let duration_to_next_event = + ReprocessQueue::duration_until_next_backfill_batch_event(slot_clock.as_ref()); + advance_time(&slot_clock, duration_to_next_event.add(one_ms)).await; + assert!(matches!( + ready_work_rx.try_recv(), + Ok(ReadyWork::BackfillSync { .. }) + )); + } + + /// Advances slot clock and test clock time by the same duration. + async fn advance_time(slot_clock: &ManualSlotClock, duration: Duration) { + slot_clock.advance_time(duration); + tokio::time::advance(duration).await; + // NOTE: The `tokio::time::advance` fn actually calls `yield_now()` after advancing the + // clock. Why do we need an extra `yield_now`? + tokio::task::yield_now().await; + } + + fn testing_slot_clock(slot_duration: u64) -> ManualSlotClock { + TestingSlotClock::new( + Slot::new(0), + Duration::from_secs(0), + Duration::from_secs(slot_duration), + ) + } +} diff --git a/beacon_node/client/src/builder.rs b/beacon_node/client/src/builder.rs index 2fe482d4d27..e7e356b88cf 100644 --- a/beacon_node/client/src/builder.rs +++ b/beacon_node/client/src/builder.rs @@ -20,7 +20,7 @@ use beacon_chain::{ }; use beacon_chain::{Kzg, LightClientProducerEvent}; use beacon_processor::{BeaconProcessor, BeaconProcessorChannels}; -use beacon_processor::{BeaconProcessorConfig, BeaconProcessorQueueLengths}; +use beacon_processor::BeaconProcessorConfig; use environment::RuntimeContext; use eth1::{Config as Eth1Config, Service as Eth1Service}; use eth2::{ @@ -573,7 +573,6 @@ where network_senders: None, network_globals: None, beacon_processor_send: None, - beacon_processor_reprocess_send: None, eth1_service: Some(genesis_service.eth1_service.clone()), log: context.log().clone(), sse_logging_components: runtime_context.sse_logging_components.clone(), @@ -668,7 +667,6 @@ where context.executor, libp2p_registry.as_mut(), beacon_processor_channels.beacon_processor_tx.clone(), - beacon_processor_channels.work_reprocessing_tx.clone(), ) .await .map_err(|e| format!("Failed to start network: {:?}", e))?; 
@@ -808,9 +806,6 @@ where network_globals: self.network_globals.clone(), eth1_service: self.eth1_service.clone(), beacon_processor_send: Some(beacon_processor_channels.beacon_processor_tx.clone()), - beacon_processor_reprocess_send: Some( - beacon_processor_channels.work_reprocessing_tx.clone(), - ), sse_logging_components: runtime_context.sse_logging_components.clone(), log: log.clone(), }); @@ -869,6 +864,11 @@ where if let Some(beacon_chain) = self.beacon_chain.as_ref() { if let Some(network_globals) = &self.network_globals { let beacon_processor_context = runtime_context.service_context("bproc".into()); + let beacon_state = &beacon_chain + .canonical_head + .cached_head() + .snapshot + .beacon_state; BeaconProcessor { network_globals: network_globals.clone(), executor: beacon_processor_context.executor.clone(), @@ -877,20 +877,11 @@ where log: beacon_processor_context.log().clone(), } .spawn_manager( + beacon_state, beacon_processor_channels.beacon_processor_rx, - beacon_processor_channels.work_reprocessing_tx.clone(), - beacon_processor_channels.work_reprocessing_rx, None, beacon_chain.slot_clock.clone(), beacon_chain.spec.maximum_gossip_clock_disparity(), - BeaconProcessorQueueLengths::from_state( - &beacon_chain - .canonical_head - .cached_head() - .snapshot - .beacon_state, - &beacon_chain.spec, - )?, )?; } @@ -960,7 +951,7 @@ where compute_light_client_updates( &inner_chain, light_client_server_rv, - beacon_processor_channels.work_reprocessing_tx, + beacon_processor_channels.beacon_processor_tx, &log, ) .await diff --git a/beacon_node/client/src/compute_light_client_updates.rs b/beacon_node/client/src/compute_light_client_updates.rs index 1eb977d4213..4947148a06b 100644 --- a/beacon_node/client/src/compute_light_client_updates.rs +++ b/beacon_node/client/src/compute_light_client_updates.rs @@ -1,9 +1,8 @@ use beacon_chain::{BeaconChain, BeaconChainTypes, LightClientProducerEvent}; -use beacon_processor::work_reprocessing_queue::ReprocessQueueMessage; +use 
beacon_processor::{BeaconProcessorSend, ReprocessQueueMessage, Work, WorkEvent}; use futures::channel::mpsc::Receiver; use futures::StreamExt; use slog::{error, Logger}; -use tokio::sync::mpsc::Sender; // Each `LightClientProducerEvent` is ~200 bytes. With the light_client server producing only recent // updates it is okay to drop some events in case of overloading. In normal network conditions @@ -14,7 +13,7 @@ pub(crate) const LIGHT_CLIENT_SERVER_CHANNEL_CAPACITY: usize = 32; pub async fn compute_light_client_updates( chain: &BeaconChain, mut light_client_server_rv: Receiver>, - reprocess_tx: Sender, + beacon_processor_send: BeaconProcessorSend, log: &Logger, ) { // Should only receive events for recent blocks, import_block filters by blocks close to clock. @@ -32,7 +31,13 @@ pub async fn compute_light_client_updates( }); let msg = ReprocessQueueMessage::NewLightClientOptimisticUpdate { parent_root }; - if reprocess_tx.try_send(msg).is_err() { + if beacon_processor_send + .try_send(WorkEvent { + drop_during_sync: true, + work: Work::Reprocess(msg), + }) + .is_err() + { error!(log, "Failed to inform light client update"; "parent_root" => %parent_root) }; } diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index ffcfda46803..c326b6e63ae 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -39,7 +39,7 @@ use beacon_chain::{ validator_monitor::timestamp_now, AttestationError as AttnError, BeaconChain, BeaconChainError, BeaconChainTypes, WhenSlotSkipped, }; -use beacon_processor::{work_reprocessing_queue::ReprocessQueueMessage, BeaconProcessorSend}; +use beacon_processor::BeaconProcessorSend; pub use block_id::BlockId; use builder_states::get_next_withdrawals; use bytes::Bytes; @@ -132,7 +132,6 @@ pub struct Context { pub network_senders: Option>, pub network_globals: Option>>, pub beacon_processor_send: Option>, - pub beacon_processor_reprocess_send: Option>, pub eth1_service: Option, pub 
sse_logging_components: Option, pub log: Logger, @@ -536,12 +535,6 @@ pub fn serve( .filter(|_| config.enable_beacon_processor); let task_spawner_filter = warp::any().map(move || TaskSpawner::new(beacon_processor_send.clone())); - let beacon_processor_reprocess_send = ctx - .beacon_processor_reprocess_send - .clone() - .filter(|_| config.enable_beacon_processor); - let reprocess_send_filter = warp::any().map(move || beacon_processor_reprocess_send.clone()); - let duplicate_block_status_code = ctx.config.duplicate_block_status_code; /* @@ -1840,7 +1833,6 @@ pub fn serve( .and(warp::path::end()) .and(warp_utils::json::json()) .and(network_tx_filter.clone()) - .and(reprocess_send_filter) .and(log_filter.clone()) .then( // V1 and V2 are identical except V2 has a consensus version header in the request. @@ -1851,14 +1843,13 @@ pub fn serve( chain: Arc>, attestations: Vec>, network_tx: UnboundedSender>, - reprocess_tx: Option>, log: Logger| async move { let result = crate::publish_attestations::publish_attestations( task_spawner, chain, attestations, network_tx, - reprocess_tx, + true, // TODO(beacon-processor) always allow reprorcess? 
log, ) .await diff --git a/beacon_node/http_api/src/publish_attestations.rs b/beacon_node/http_api/src/publish_attestations.rs index 00654765325..6f999a31c2e 100644 --- a/beacon_node/http_api/src/publish_attestations.rs +++ b/beacon_node/http_api/src/publish_attestations.rs @@ -39,17 +39,14 @@ use beacon_chain::{ validator_monitor::timestamp_now, AttestationError, BeaconChain, BeaconChainError, BeaconChainTypes, }; -use beacon_processor::work_reprocessing_queue::{QueuedUnaggregate, ReprocessQueueMessage}; +use beacon_processor::{QueuedUnaggregate, ReprocessQueueMessage, Work, WorkEvent}; use eth2::types::Failure; use lighthouse_network::PubsubMessage; use network::NetworkMessage; use slog::{debug, error, warn, Logger}; use std::sync::Arc; use std::time::Duration; -use tokio::sync::{ - mpsc::{Sender, UnboundedSender}, - oneshot, -}; +use tokio::sync::{mpsc::UnboundedSender, oneshot}; use types::Attestation; // Error variants are only used in `Debug` and considered `dead_code` by the compiler. @@ -134,7 +131,7 @@ pub async fn publish_attestations( chain: Arc>, attestations: Vec>, network_tx: UnboundedSender>, - reprocess_send: Option>, + allow_reprocess: bool, log: Logger, ) -> Result<(), warp::Rejection> { // Collect metadata about attestations which we'll use to report failures. We need to @@ -148,6 +145,7 @@ pub async fn publish_attestations( let seen_timestamp = timestamp_now(); let inner_log = log.clone(); let mut prelim_results = task_spawner + .clone() .blocking_task(Priority::P0, move || { Ok(attestations .into_iter() @@ -163,7 +161,7 @@ pub async fn publish_attestations( Err(Error::Validation(AttestationError::UnknownHeadBlock { beacon_block_root, })) => { - let Some(reprocess_tx) = &reprocess_send else { + if !allow_reprocess { return PublishAttestationResult::Failure(Error::ReprocessDisabled); }; // Re-process. 
@@ -189,7 +187,13 @@ pub async fn publish_attestations( beacon_block_root, process_fn: Box::new(reprocess_fn), }); - if reprocess_tx.try_send(reprocess_msg).is_err() { + if task_spawner + .try_send(WorkEvent { + drop_during_sync: false, + work: Work::Reprocess(reprocess_msg), + }) + .is_err() + { PublishAttestationResult::Failure(Error::ReprocessFull) } else { PublishAttestationResult::Reprocessing(rx) diff --git a/beacon_node/http_api/src/task_spawner.rs b/beacon_node/http_api/src/task_spawner.rs index a679b294f65..71795102932 100644 --- a/beacon_node/http_api/src/task_spawner.rs +++ b/beacon_node/http_api/src/task_spawner.rs @@ -30,6 +30,7 @@ impl Priority { } /// Spawns tasks on the `BeaconProcessor` or directly on the tokio executor. +#[derive(Clone)] pub struct TaskSpawner { /// Used to send tasks to the `BeaconProcessor`. The tokio executor will be /// used if this is `None`. @@ -155,6 +156,32 @@ impl TaskSpawner { .and_then(|x| x) } } + + pub fn try_send(&self, work_event: WorkEvent) -> Result<(), warp::Rejection> { + if let Some(beacon_processor_send) = &self.beacon_processor_send { + let error_message = match beacon_processor_send.try_send(work_event) { + Ok(()) => None, + Err(TrySendError::Full(_)) => { + Some("The task was dropped. The server is overloaded.") + } + Err(TrySendError::Closed(_)) => { + Some("The task was dropped. The server is shutting down.") + } + }; + + if let Some(error_message) = error_message { + return Err(warp_utils::reject::custom_server_error( + error_message.to_string(), + )); + }; + + return Ok(()); + } else { + return Err(warp_utils::reject::custom_server_error( + "The beacon processor is unavailable".to_string(), + )); + }; + } } /// Send a task to the beacon processor and await execution. 
diff --git a/beacon_node/http_api/src/test_utils.rs b/beacon_node/http_api/src/test_utils.rs index f042175efa3..13bb79cf8b7 100644 --- a/beacon_node/http_api/src/test_utils.rs +++ b/beacon_node/http_api/src/test_utils.rs @@ -3,9 +3,7 @@ use beacon_chain::{ test_utils::{BeaconChainHarness, BoxedMutator, Builder, EphemeralHarnessType}, BeaconChain, BeaconChainTypes, }; -use beacon_processor::{ - BeaconProcessor, BeaconProcessorChannels, BeaconProcessorConfig, BeaconProcessorQueueLengths, -}; +use beacon_processor::{BeaconProcessor, BeaconProcessorChannels, BeaconProcessorConfig}; use directory::DEFAULT_ROOT_DIR; use eth2::{BeaconNodeHttpClient, Timeouts}; use lighthouse_network::{ @@ -206,7 +204,6 @@ pub async fn create_api_server_with_config( } = BeaconProcessorChannels::new(&beacon_processor_config); let beacon_processor_send = beacon_processor_tx; - let reprocess_send = work_reprocessing_tx.clone(); BeaconProcessor { network_globals: network_globals.clone(), executor: test_runtime.task_executor.clone(), @@ -219,11 +216,6 @@ pub async fn create_api_server_with_config( None, chain.slot_clock.clone(), chain.spec.maximum_gossip_clock_disparity(), - BeaconProcessorQueueLengths::from_state( - &chain.canonical_head.cached_head().snapshot.beacon_state, - &chain.spec, - ) - .unwrap(), ) .unwrap(); @@ -241,7 +233,6 @@ pub async fn create_api_server_with_config( network_senders: Some(network_senders), network_globals: Some(network_globals), beacon_processor_send: Some(beacon_processor_send), - beacon_processor_reprocess_send: Some(reprocess_send), eth1_service: Some(eth1_service), sse_logging_components: None, log, diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index 643b755a949..014022de2f1 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -46,10 +46,9 @@ use types::{ 
}; use beacon_processor::{ - work_reprocessing_queue::{ - QueuedAggregate, QueuedGossipBlock, QueuedLightClientUpdate, QueuedUnaggregate, - ReprocessQueueMessage, - }, BeaconProcessorSend, DuplicateCache, GossipAggregatePackage, GossipAttestationPackage + DuplicateCache, GossipAggregatePackage, GossipAttestationPackage, QueuedAggregate, + QueuedGossipBlock, QueuedLightClientUpdate, QueuedUnaggregate, ReprocessQueueMessage, Work, + WorkEvent, }; /// Set to `true` to introduce stricter penalties for peers who send some types of late consensus @@ -226,7 +225,7 @@ impl NetworkBeaconProcessor { attestation: Box>, subnet_id: SubnetId, should_import: bool, - reprocess_tx: Option>, + allow_reprocess: bool, seen_timestamp: Duration, ) { let result = match self @@ -245,7 +244,6 @@ impl NetworkBeaconProcessor { message_id, peer_id, subnet_id, - reprocess_tx, should_import, seen_timestamp, ); @@ -254,7 +252,7 @@ impl NetworkBeaconProcessor { pub fn process_gossip_attestation_batch( self: Arc, packages: Vec>, - reprocess_tx: Option>, + allow_reprocess: bool, ) { let attestations_and_subnets = packages .iter() @@ -312,7 +310,6 @@ impl NetworkBeaconProcessor { package.message_id, package.peer_id, package.subnet_id, - reprocess_tx.clone(), package.should_import, package.seen_timestamp, ); @@ -321,6 +318,7 @@ impl NetworkBeaconProcessor { // Clippy warning is is ignored since the arguments are all of a different type (i.e., they // cant' be mixed-up) and creating a struct would result in more complexity. 
+ // TODO(beacon-processor) disable reprocessing flag #[allow(clippy::too_many_arguments)] fn process_gossip_attestation_result( self: &Arc, @@ -328,7 +326,6 @@ impl NetworkBeaconProcessor { message_id: MessageId, peer_id: PeerId, subnet_id: SubnetId, - reprocess_tx: Option>, should_import: bool, seen_timestamp: Duration, ) { @@ -415,8 +412,8 @@ impl NetworkBeaconProcessor { should_import, seen_timestamp, }, - reprocess_tx, error, + true, // TODO(beacon-processor) enable or disbleretry? seen_timestamp, ); } @@ -435,7 +432,7 @@ impl NetworkBeaconProcessor { message_id: MessageId, peer_id: PeerId, aggregate: Box>, - reprocess_tx: Option>, + allow_reprocess: bool, seen_timestamp: Duration, ) { let beacon_block_root = aggregate.message().aggregate().data().beacon_block_root; @@ -459,7 +456,7 @@ impl NetworkBeaconProcessor { beacon_block_root, message_id, peer_id, - reprocess_tx, + allow_reprocess, seen_timestamp, ); } @@ -467,7 +464,7 @@ impl NetworkBeaconProcessor { pub fn process_gossip_aggregate_batch( self: Arc, packages: Vec>, - reprocess_tx: Option>, + allow_reprocess: bool, ) { let aggregates = packages.iter().map(|package| package.aggregate.as_ref()); @@ -523,7 +520,7 @@ impl NetworkBeaconProcessor { package.beacon_block_root, package.message_id, package.peer_id, - reprocess_tx.clone(), + allow_reprocess, package.seen_timestamp, ); } @@ -535,7 +532,7 @@ impl NetworkBeaconProcessor { beacon_block_root: Hash256, message_id: MessageId, peer_id: PeerId, - reprocess_tx: Option>, + allow_reprocess: bool, seen_timestamp: Duration, ) { match result { @@ -617,8 +614,8 @@ impl NetworkBeaconProcessor { attestation: signed_aggregate, seen_timestamp, }, - reprocess_tx, error, + allow_reprocess, seen_timestamp, ); } @@ -1092,20 +1089,12 @@ impl NetworkBeaconProcessor { peer_id: PeerId, peer_client: Client, block: Arc>, - reprocess_tx: mpsc::Sender, duplicate_cache: DuplicateCache, invalid_block_storage: InvalidBlockStorage, seen_duration: Duration, ) { if let 
Some(gossip_verified_block) = self - .process_gossip_unverified_block( - message_id, - peer_id, - peer_client, - block, - reprocess_tx.clone(), - seen_duration, - ) + .process_gossip_unverified_block(message_id, peer_id, peer_client, block, seen_duration) .await { let block_root = gossip_verified_block.block_root; @@ -1114,7 +1103,6 @@ impl NetworkBeaconProcessor { self.process_gossip_verified_block( peer_id, gossip_verified_block, - reprocess_tx, invalid_block_storage, seen_duration, ) @@ -1141,7 +1129,6 @@ impl NetworkBeaconProcessor { peer_id: PeerId, peer_client: Client, block: Arc>, - reprocess_tx: mpsc::Sender, seen_duration: Duration, ) -> Option> { let block_delay = @@ -1396,24 +1383,28 @@ impl NetworkBeaconProcessor { let inner_self = self.clone(); let process_fn = Box::pin(async move { - let reprocess_tx = inner_self.beacon_processor_send.clone(); let invalid_block_storage = inner_self.invalid_block_storage.clone(); inner_self .process_gossip_verified_block( peer_id, verified_block, - reprocess_tx, invalid_block_storage, seen_duration, ) .await; }); - if reprocess_tx - .try_send(ReprocessQueueMessage::EarlyBlock(QueuedGossipBlock { - beacon_block_slot: block_slot, - beacon_block_root: block_root, - process_fn, - })) + if self + .beacon_processor_send + .try_send(WorkEvent { + drop_during_sync: false, + work: Work::Reprocess(ReprocessQueueMessage::EarlyBlock( + QueuedGossipBlock { + beacon_block_slot: block_slot, + beacon_block_root: block_root, + process_fn, + }, + )), + }) .is_err() { error!( @@ -1448,7 +1439,6 @@ impl NetworkBeaconProcessor { self: Arc, peer_id: PeerId, verified_block: GossipVerifiedBlock, - reprocess_tx: BeaconProcessorSend, invalid_block_storage: InvalidBlockStorage, _seen_duration: Duration, ) { @@ -1484,11 +1474,15 @@ impl NetworkBeaconProcessor { Ok(AvailabilityProcessingStatus::Imported(block_root)) => { metrics::inc_counter(&metrics::BEACON_PROCESSOR_GOSSIP_BLOCK_IMPORTED_TOTAL); - if reprocess_tx - 
.try_send(Work::Reprocess(ReprocessQueueMessage::BlockImported { - block_root: *block_root, - parent_root: block.message().parent_root(), - })) + if self + .beacon_processor_send + .try_send(WorkEvent { + drop_during_sync: false, + work: Work::Reprocess(ReprocessQueueMessage::BlockImported { + block_root: *block_root, + parent_root: block.message().parent_root(), + }), + }) .is_err() { error!( @@ -2041,7 +2035,7 @@ impl NetworkBeaconProcessor { message_id: MessageId, peer_id: PeerId, light_client_optimistic_update: LightClientOptimisticUpdate, - reprocess_tx: Option>, + allow_reprocess: bool, seen_timestamp: Duration, ) { match self.chain.verify_optimistic_update_for_gossip( @@ -2071,7 +2065,7 @@ impl NetworkBeaconProcessor { "parent_root" => ?parent_root ); - if let Some(sender) = reprocess_tx { + if allow_reprocess { let processor = self.clone(); let msg = ReprocessQueueMessage::UnknownLightClientOptimisticUpdate( QueuedLightClientUpdate { @@ -2081,14 +2075,21 @@ impl NetworkBeaconProcessor { message_id, peer_id, light_client_optimistic_update, - None, // Do not reprocess this message again. + false, // Do not reprocess this message again. 
seen_timestamp, ) }), }, ); - if sender.try_send(msg).is_err() { + if self + .beacon_processor_send + .try_send(WorkEvent { + drop_during_sync: true, + work: Work::Reprocess(msg), + }) + .is_err() + { error!( self.log, "Failed to send optimistic update for re-processing"; @@ -2165,8 +2166,8 @@ impl NetworkBeaconProcessor { peer_id: PeerId, message_id: MessageId, failed_att: FailedAtt, - reprocess_tx: Option>, error: AttnError, + allow_reprocess: bool, seen_timestamp: Duration, ) { let beacon_block_root = failed_att.beacon_block_root(); @@ -2399,7 +2400,7 @@ impl NetworkBeaconProcessor { "peer_id" => %peer_id, "block" => ?beacon_block_root ); - if let Some(sender) = reprocess_tx { + if allow_reprocess { // We don't know the block, get the sync manager to handle the block lookup, and // send the attestation to be scheduled for re-processing. self.sync_tx @@ -2430,7 +2431,7 @@ impl NetworkBeaconProcessor { message_id, peer_id, attestation, - None, // Do not allow this attestation to be re-processed beyond this point. + false, // Do not allow this attestation to be re-processed beyond this point. seen_timestamp, ) }), @@ -2455,7 +2456,7 @@ impl NetworkBeaconProcessor { attestation, subnet_id, should_import, - None, // Do not allow this attestation to be re-processed beyond this point. + false, // Do not allow this attestation to be re-processed beyond this point. 
seen_timestamp, ) }), @@ -2463,7 +2464,14 @@ impl NetworkBeaconProcessor { } }; - if sender.try_send(msg).is_err() { + if self + .beacon_processor_send + .try_send(WorkEvent { + drop_during_sync: false, + work: Work::Reprocess(msg), + }) + .is_err() + { error!( self.log, "Failed to send attestation for re-processing"; diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 5d905a20c8d..ab030db8266 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -5,9 +5,8 @@ use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{builder::Witness, eth1_chain::CachingEth1Backend, BeaconChain}; use beacon_chain::{BeaconChainTypes, NotifyExecutionLayer}; use beacon_processor::{ - work_reprocessing_queue::ReprocessQueueMessage, BeaconProcessorChannels, BeaconProcessorSend, - DuplicateCache, GossipAggregatePackage, GossipAttestationPackage, Work, - WorkEvent as BeaconWorkEvent, + BeaconProcessorChannels, BeaconProcessorSend, DuplicateCache, GossipAggregatePackage, + GossipAttestationPackage, ReprocessQueueMessage, Work, WorkEvent as BeaconWorkEvent, }; use lighthouse_network::rpc::methods::{ BlobsByRangeRequest, BlobsByRootRequest, DataColumnsByRangeRequest, DataColumnsByRootRequest, @@ -80,24 +79,21 @@ impl NetworkBeaconProcessor { // Define a closure for processing individual attestations. let processor = self.clone(); let process_individual = move |package: GossipAttestationPackage| { - let reprocess_tx = processor.reprocess_tx.clone(); processor.process_gossip_attestation( package.message_id, package.peer_id, package.attestation, package.subnet_id, package.should_import, - Some(reprocess_tx), + true, package.seen_timestamp, ) }; // Define a closure for processing batches of attestations. 
let processor = self.clone(); - let process_batch = move |attestations| { - let reprocess_tx = processor.reprocess_tx.clone(); - processor.process_gossip_attestation_batch(attestations, Some(reprocess_tx)) - }; + let process_batch = + move |attestations| processor.process_gossip_attestation_batch(attestations, true); self.try_send(BeaconWorkEvent { drop_during_sync: true, @@ -127,22 +123,20 @@ impl NetworkBeaconProcessor { // Define a closure for processing individual attestations. let processor = self.clone(); let process_individual = move |package: GossipAggregatePackage| { - let reprocess_tx = processor.reprocess_tx.clone(); processor.process_gossip_aggregate( package.message_id, package.peer_id, package.aggregate, - Some(reprocess_tx), + true, package.seen_timestamp, ) }; // Define a closure for processing batches of attestations. let processor = self.clone(); - let process_batch = move |aggregates| { - let reprocess_tx = processor.reprocess_tx.clone(); - processor.process_gossip_aggregate_batch(aggregates, Some(reprocess_tx)) - }; + // TODO(beacon-processor) allow reprocess? 
+ let process_batch = + move |aggregates| processor.process_gossip_aggregate_batch(aggregates, true); let beacon_block_root = aggregate.message().aggregate().data().beacon_block_root; self.try_send(BeaconWorkEvent { @@ -172,7 +166,6 @@ impl NetworkBeaconProcessor { ) -> Result<(), Error> { let processor = self.clone(); let process_fn = async move { - let reprocess_tx = processor.reprocess_tx.clone(); let invalid_block_storage = processor.invalid_block_storage.clone(); let duplicate_cache = processor.duplicate_cache.clone(); processor @@ -181,7 +174,6 @@ impl NetworkBeaconProcessor { peer_id, peer_client, block, - reprocess_tx, duplicate_cache, invalid_block_storage, seen_timestamp, @@ -374,12 +366,11 @@ impl NetworkBeaconProcessor { ) -> Result<(), Error> { let processor = self.clone(); let process_fn = move || { - let reprocess_tx = processor.reprocess_tx.clone(); processor.process_gossip_optimistic_update( message_id, peer_id, light_client_optimistic_update, - Some(reprocess_tx), + true, seen_timestamp, ) }; diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index dcad6160b34..8bb15133f3b 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -14,15 +14,13 @@ use beacon_chain::{ BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError, NotifyExecutionLayer, }; use beacon_processor::{ - work_reprocessing_queue::{QueuedRpcBlock, ReprocessQueueMessage}, - AsyncFn, BlockingFn, DuplicateCache, + AsyncFn, BlockingFn, DuplicateCache, QueuedRpcBlock, ReprocessQueueMessage, Work, WorkEvent, }; use lighthouse_network::PeerAction; use slog::{debug, error, info, warn}; use std::sync::Arc; use std::time::Duration; use store::KzgCommitment; -use tokio::sync::mpsc; use types::beacon_block_body::format_kzg_commitments; use types::blob_sidecar::FixedBlobSidecarList; use 
types::{BlockImportSource, DataColumnSidecar, DataColumnSidecarList, Epoch, Hash256}; @@ -57,14 +55,12 @@ impl NetworkBeaconProcessor { process_type: BlockProcessType, ) -> AsyncFn { let process_fn = async move { - let reprocess_tx = self.reprocess_tx.clone(); let duplicate_cache = self.duplicate_cache.clone(); self.process_rpc_block( block_root, block, seen_timestamp, process_type, - reprocess_tx, duplicate_cache, ) .await; @@ -106,7 +102,6 @@ impl NetworkBeaconProcessor { block: RpcBlock, seen_timestamp: Duration, process_type: BlockProcessType, - reprocess_tx: mpsc::Sender, duplicate_cache: DuplicateCache, ) { // Check if the block is already being imported through another source @@ -132,7 +127,14 @@ impl NetworkBeaconProcessor { ignore_fn, }); - if reprocess_tx.try_send(reprocess_msg).is_err() { + if self + .beacon_processor_send + .try_send(WorkEvent { + drop_during_sync: false, + work: Work::Reprocess(reprocess_msg), + }) + .is_err() + { error!(self.log, "Failed to inform block import"; "source" => "rpc", "block_root" => %block_root) }; return; @@ -174,7 +176,14 @@ impl NetworkBeaconProcessor { block_root: hash, parent_root, }; - if reprocess_tx.try_send(reprocess_msg).is_err() { + if self + .beacon_processor_send + .try_send(WorkEvent { + drop_during_sync: false, + work: Work::Reprocess(reprocess_msg), + }) + .is_err() + { error!(self.log, "Failed to inform block import"; "source" => "rpc", "block_root" => %hash) }; self.chain.block_times_cache.write().set_time_observed( diff --git a/beacon_node/network/src/router.rs b/beacon_node/network/src/router.rs index 26c1d14f020..85ce1e9e82a 100644 --- a/beacon_node/network/src/router.rs +++ b/beacon_node/network/src/router.rs @@ -11,9 +11,7 @@ use crate::service::NetworkMessage; use crate::status::status_message; use crate::sync::SyncMessage; use beacon_chain::{BeaconChain, BeaconChainTypes}; -use beacon_processor::{ - work_reprocessing_queue::ReprocessQueueMessage, BeaconProcessorSend, DuplicateCache, -}; +use 
beacon_processor::{BeaconProcessorSend, DuplicateCache}; use futures::prelude::*; use lighthouse_network::rpc::*; use lighthouse_network::{ @@ -88,7 +86,6 @@ impl Router { executor: task_executor::TaskExecutor, invalid_block_storage: InvalidBlockStorage, beacon_processor_send: BeaconProcessorSend, - beacon_processor_reprocess_tx: mpsc::Sender, log: slog::Logger, ) -> error::Result>> { let message_handler_log = log.new(o!("service"=> "router")); @@ -106,7 +103,6 @@ impl Router { chain: beacon_chain.clone(), network_tx: network_send.clone(), sync_tx: sync_send.clone(), - reprocess_tx: beacon_processor_reprocess_tx, network_globals: network_globals.clone(), invalid_block_storage, executor: executor.clone(), diff --git a/beacon_node/network/src/service.rs b/beacon_node/network/src/service.rs index 150402a7ab2..686b0eae4b5 100644 --- a/beacon_node/network/src/service.rs +++ b/beacon_node/network/src/service.rs @@ -9,7 +9,7 @@ use crate::{ NetworkConfig, }; use beacon_chain::{BeaconChain, BeaconChainTypes}; -use beacon_processor::{work_reprocessing_queue::ReprocessQueueMessage, BeaconProcessorSend}; +use beacon_processor::BeaconProcessorSend; use futures::channel::mpsc::Sender; use futures::future::OptionFuture; use futures::prelude::*; @@ -209,7 +209,6 @@ impl NetworkService { executor: task_executor::TaskExecutor, libp2p_registry: Option<&'_ mut Registry>, beacon_processor_send: BeaconProcessorSend, - beacon_processor_reprocess_tx: mpsc::Sender, ) -> error::Result<( NetworkService, Arc>, @@ -310,7 +309,6 @@ impl NetworkService { executor.clone(), invalid_block_storage, beacon_processor_send, - beacon_processor_reprocess_tx, network_log.clone(), )?; @@ -372,7 +370,6 @@ impl NetworkService { executor: task_executor::TaskExecutor, libp2p_registry: Option<&'_ mut Registry>, beacon_processor_send: BeaconProcessorSend, - beacon_processor_reprocess_tx: mpsc::Sender, ) -> error::Result<(Arc>, NetworkSenders)> { let (network_service, network_globals, network_senders) = 
Self::build( beacon_chain, @@ -380,7 +377,6 @@ impl NetworkService { executor.clone(), libp2p_registry, beacon_processor_send, - beacon_processor_reprocess_tx, ) .await?; From 33594cd0d2e53a96d138c1400d0178ec6c3e1573 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Mon, 30 Sep 2024 12:17:16 -0700 Subject: [PATCH 04/16] disable reporcess for now --- beacon_node/beacon_processor/src/lib.rs | 7 +- .../src/scheduler/interface.rs | 15 ++-- .../src/scheduler/priority_scheduler/mod.rs | 73 +++++++++---------- .../work_reprocessing_queue.rs | 10 +-- beacon_node/client/src/builder.rs | 2 +- beacon_node/http_api/src/test_utils.rs | 9 ++- .../http_api/tests/interactive_tests.rs | 12 +-- .../gossip_methods.rs | 6 +- .../src/network_beacon_processor/mod.rs | 2 +- 9 files changed, 71 insertions(+), 65 deletions(-) diff --git a/beacon_node/beacon_processor/src/lib.rs b/beacon_node/beacon_processor/src/lib.rs index 8e27354fa1a..75dee0c5c2f 100644 --- a/beacon_node/beacon_processor/src/lib.rs +++ b/beacon_node/beacon_processor/src/lib.rs @@ -355,6 +355,7 @@ impl BeaconProcessorSend { match self.0.try_send(message) { Ok(res) => Ok(res), Err(e) => { + println!("{e}"); metrics::inc_counter_vec( &metrics::BEACON_PROCESSOR_SEND_ERROR_PER_WORK_TYPE, &[work_type.into()], @@ -594,10 +595,10 @@ impl BeaconProcessor { event_rx: mpsc::Receiver>, work_journal_tx: Option>, slot_clock: S, - maximum_gossip_clock_disparity: Duration, + spec: &ChainSpec, ) -> Result<(), String> { - let scheduler = SchedulerType::::new(self, beacon_state, event_rx); - scheduler.run(work_journal_tx, slot_clock, maximum_gossip_clock_disparity) + let scheduler = SchedulerType::::new(self, beacon_state, event_rx, spec)?; + scheduler.run(work_journal_tx, slot_clock, spec.maximum_gossip_clock_disparity()) } } diff --git a/beacon_node/beacon_processor/src/scheduler/interface.rs b/beacon_node/beacon_processor/src/scheduler/interface.rs index 59b8567d9ac..eab592facca 100644 --- 
a/beacon_node/beacon_processor/src/scheduler/interface.rs +++ b/beacon_node/beacon_processor/src/scheduler/interface.rs @@ -2,14 +2,14 @@ use std::time::Duration; use slot_clock::SlotClock; use tokio::sync::mpsc; -use types::{BeaconState, EthSpec}; +use types::{BeaconState, ChainSpec, EthSpec}; use crate::{BeaconProcessor, WorkEvent}; use super::priority_scheduler; pub trait Scheduler { - fn new(beacon_processor: BeaconProcessor, beacon_state: &BeaconState, event_rx: mpsc::Receiver>) -> Self; + fn new(beacon_processor: BeaconProcessor, beacon_state: &BeaconState, event_rx: mpsc::Receiver>, spec: &ChainSpec) -> Result, String>; fn run( self, @@ -24,12 +24,13 @@ pub enum SchedulerType { } impl Scheduler for SchedulerType { - fn new(beacon_processor: BeaconProcessor, beacon_state: &BeaconState, event_rx: mpsc::Receiver>) -> Self { - SchedulerType::PriorityScheduler(priority_scheduler::Scheduler::new( + fn new(beacon_processor: BeaconProcessor, beacon_state: &BeaconState, event_rx: mpsc::Receiver>, spec: &ChainSpec) -> Result, String> { + Ok(Box::new(SchedulerType::PriorityScheduler(priority_scheduler::Scheduler::new( beacon_processor, - todo!(), - todo!(), - )) + beacon_state, + event_rx, + spec + )?))) } // TODO(beacon-processor) make this config driven fn run( diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs index 070dc533837..f9e3d455ffb 100644 --- a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs @@ -12,17 +12,16 @@ use lighthouse_metrics::HistogramTimer; use slog::error; use slog::{crit, debug, trace, warn}; use slot_clock::SlotClock; -use std::borrow::BorrowMut; use std::pin::Pin; use std::task::Context; use std::{cmp, marker::PhantomData, sync::Arc, time::Duration}; -use tokio::sync::mpsc::{self, error::TrySendError, Receiver, Sender}; +use 
tokio::sync::mpsc::{self, error::TrySendError, Sender}; use types::{BeaconState, ChainSpec, EthSpec}; use work_queue::{BeaconProcessorQueueLengths, WorkQueues}; use work_reprocessing_queue::{spawn_reprocess_scheduler, ReadyWork}; use crate::{ - metrics, BeaconProcessor, BeaconProcessorConfig, BlockingOrAsync, QueuedBackfillBatch, + metrics, BeaconProcessor, BlockingOrAsync, QueuedBackfillBatch, ReprocessQueueMessage, SendOnDrop, TaskSpawner, Work, WorkEvent, WorkType, MAX_IDLE_QUEUE_LEN, NOTHING_TO_DO, WORKER_FREED, }; @@ -47,14 +46,16 @@ struct InboundEvents { /// Used by upstream processes to send new work to the `BeaconProcessor`. event_rx: mpsc::Receiver>, /// Used internally for queuing work ready to be re-processed. - reprocess_work_rx: mpsc::Receiver, + ready_work_rx: mpsc::Receiver, + reprocess_work_rx: mpsc::Receiver, } struct OutboundEvents { /// Sends tasks to workers. idle_tx: mpsc::Sender<()>, /// Used internally for queuing work ready to be re-processed. - reprocess_work_tx: mpsc::Sender, + reprocess_work_tx: mpsc::Sender, + ready_work_tx: mpsc::Sender, } impl Stream for InboundEvents { @@ -77,7 +78,11 @@ impl Stream for InboundEvents { // block is required to successfully process some new work. match self.reprocess_work_rx.poll_recv(cx) { Poll::Ready(Some(ready_work)) => { - return Poll::Ready(Some(InboundEvent::ReprocessingWork(ready_work.into()))); + return Poll::Ready(Some(InboundEvent::ReprocessingWork( + WorkEvent { + drop_during_sync: false, + work: Work::Reprocess(ready_work) + }))); } Poll::Ready(None) => { return Poll::Ready(None); @@ -122,7 +127,7 @@ impl Scheduler { beacon_processor: BeaconProcessor, beacon_state: &BeaconState, event_rx: mpsc::Receiver>, - spec: Arc, + spec: &ChainSpec, ) -> Result { // Used by workers to communicate that they are finished a task. 
let (idle_tx, idle_rx) = mpsc::channel::<()>(MAX_IDLE_QUEUE_LEN); @@ -137,27 +142,29 @@ impl Scheduler { let (ready_work_tx, ready_work_rx) = mpsc::channel::(beacon_processor.config.max_scheduled_work_queue_len); - let (work_reprocessing_tx, reprocess_work_rx) = + let (reprocess_work_tx, reprocess_work_rx) = mpsc::channel::(beacon_processor.config.max_scheduled_work_queue_len); let inbound_events = InboundEvents { idle_rx, event_rx, - reprocess_work_rx: ready_work_rx, + ready_work_rx, + reprocess_work_rx }; let outbound_events = OutboundEvents { idle_tx, - reprocess_work_tx: ready_work_tx + reprocess_work_tx, + ready_work_tx, }; - Self { + Ok(Self { beacon_processor, inbound_events, outbound_events, work_queues, phantom_data: PhantomData - } + }) } pub fn run( @@ -166,29 +173,18 @@ impl Scheduler { slot_clock: S, maximum_gossip_clock_disparity: Duration, ) -> Result<(), String> { - // Channels for sending work to the re-process scheduler (`work_reprocessing_tx`) and to - // receive them back once they are ready (`ready_work_rx`). 
- let (ready_work_tx, ready_work_rx) = - mpsc::channel::(self.beacon_processor.config.max_scheduled_work_queue_len); - - let (work_reprocessing_tx, work_reprocessing_rx) = mpsc::channel::( - self.beacon_processor.config.max_scheduled_work_queue_len, - ); - // TODO(beacon-processor) reprocess scheduler - spawn_reprocess_scheduler( - ready_work_tx, - work_reprocessing_rx, - &self.beacon_processor.executor, - Arc::new(slot_clock), - self.beacon_processor.log.clone(), - maximum_gossip_clock_disparity, - )?; + // spawn_reprocess_scheduler( + // self.outbound_events.ready_work_tx.clone(), + // self.inbound_events.reprocess_work_rx, + // &self.beacon_processor.executor, + // Arc::new(slot_clock), + // self.beacon_processor.log.clone(), + // maximum_gossip_clock_disparity, + // )?; let executor = self.beacon_processor.executor.clone(); - let manager_future = async move { - let idle_tx = self.outbound_events.idle_tx.clone(); loop { let work_event = match self.inbound_events.next().await { Some(InboundEvent::WorkerIdle) => { @@ -200,7 +196,7 @@ impl Scheduler { { match QueuedBackfillBatch::try_from(event) { Ok(backfill_batch) => { - match work_reprocessing_tx + match self.outbound_events.reprocess_work_tx .try_send(ReprocessQueueMessage::BackfillSync(backfill_batch)) { Err(e) => { @@ -571,6 +567,10 @@ impl Scheduler { let work_type = work.to_type(); match work { + Work::Reprocess(work_event) => { + // TODO(beacon-processor) LOG ERROR + let _ = self.outbound_events.reprocess_work_tx.try_send(work_event); + } _ if can_spawn => self.spawn_worker(work), Work::GossipAttestation { .. } => self.work_queues.attestation_queue.push(work), // Attestation batches are formed internally within the @@ -743,10 +743,6 @@ impl Scheduler { work_id, &self.beacon_processor.log, ), - Work::Reprocess { .. 
} => { - // TODO(beacon-processor) what to do here - todo!() - } } Some(work_type) } @@ -1011,9 +1007,8 @@ impl Scheduler { | Work::LightClientFinalityUpdateRequest(process_fn) => { task_spawner.spawn_blocking(process_fn) } - Work::Reprocess(reprocess_message) => { - // TODO(beacon-processor) send to the reprocess queue - todo!() + Work::Reprocess(_) => { + () } }; } diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs index 4d2edf142d6..93c164caa0b 100644 --- a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs @@ -10,12 +10,7 @@ //! //! Aggregated and unaggregated attestations that failed verification due to referencing an unknown //! block will be re-queued until their block is imported, or until they expire. -use crate::ReprocessQueueMessage::*; -use crate::{ - metrics, IgnoredRpcBlock, QueuedAggregate, QueuedBackfillBatch, QueuedGossipBlock, - QueuedLightClientUpdate, QueuedRpcBlock, QueuedSamplingRequest, QueuedUnaggregate, - ReprocessQueueMessage, -}; +use crate::{metrics, IgnoredRpcBlock, QueuedAggregate, QueuedBackfillBatch, QueuedGossipBlock, QueuedLightClientUpdate, QueuedRpcBlock, QueuedSamplingRequest, QueuedUnaggregate, ReprocessQueueMessage}; use crate::{AsyncFn, BlockingFn, Work, WorkEvent}; use fnv::FnvHashMap; use futures::task::Poll; @@ -35,6 +30,7 @@ use task_executor::TaskExecutor; use tokio::sync::mpsc::{self, Receiver, Sender}; use tokio_util::time::delay_queue::{DelayQueue, Key as DelayKey}; use types::{EthSpec, Hash256, Slot}; +use crate::ReprocessQueueMessage::*; const TASK_NAME: &str = "beacon_processor_reprocess_queue"; const GOSSIP_BLOCKS: &str = "gossip_blocks"; @@ -86,6 +82,7 @@ pub const BACKFILL_SCHEDULE_IN_SLOT: [(u32, u32); 3] = [ (4, 5), ]; + /// Events sent by 
the scheduler once they are ready for re-processing. pub enum ReadyWork { Block(QueuedGossipBlock), @@ -98,6 +95,7 @@ pub enum ReadyWork { BackfillSync(QueuedBackfillBatch), } + impl From for WorkEvent { fn from(ready_work: ReadyWork) -> Self { match ready_work { diff --git a/beacon_node/client/src/builder.rs b/beacon_node/client/src/builder.rs index e7e356b88cf..dfae648e159 100644 --- a/beacon_node/client/src/builder.rs +++ b/beacon_node/client/src/builder.rs @@ -881,7 +881,7 @@ where beacon_processor_channels.beacon_processor_rx, None, beacon_chain.slot_clock.clone(), - beacon_chain.spec.maximum_gossip_clock_disparity(), + &beacon_chain.spec )?; } diff --git a/beacon_node/http_api/src/test_utils.rs b/beacon_node/http_api/src/test_utils.rs index 13bb79cf8b7..b5fc01a7495 100644 --- a/beacon_node/http_api/src/test_utils.rs +++ b/beacon_node/http_api/src/test_utils.rs @@ -203,6 +203,12 @@ pub async fn create_api_server_with_config( beacon_processor_rx, } = BeaconProcessorChannels::new(&beacon_processor_config); + let beacon_state = &chain + .canonical_head + .cached_head() + .snapshot + .beacon_state; + let beacon_processor_send = beacon_processor_tx; BeaconProcessor { network_globals: network_globals.clone(), @@ -212,10 +218,11 @@ pub async fn create_api_server_with_config( log: log.clone(), } .spawn_manager( + beacon_state, beacon_processor_rx, None, chain.slot_clock.clone(), - chain.spec.maximum_gossip_clock_disparity(), + &chain.spec, ) .unwrap(); diff --git a/beacon_node/http_api/tests/interactive_tests.rs b/beacon_node/http_api/tests/interactive_tests.rs index c3ed3347821..0f13035921f 100644 --- a/beacon_node/http_api/tests/interactive_tests.rs +++ b/beacon_node/http_api/tests/interactive_tests.rs @@ -4,7 +4,7 @@ use beacon_chain::{ test_utils::{AttestationStrategy, BlockStrategy, LightClientStrategy, SyncCommitteeStrategy}, ChainConfig, }; -use beacon_processor::work_reprocessing_queue::ReprocessQueueMessage; +use beacon_processor::ReprocessQueueMessage; use 
eth2::types::ProduceBlockV3Response; use eth2::types::{DepositContractData, StateId}; use execution_layer::{ForkchoiceState, PayloadAttributes}; @@ -21,6 +21,7 @@ use types::{ Address, Epoch, EthSpec, ExecPayload, ExecutionBlockHash, FixedBytesExtended, ForkName, Hash256, MainnetEthSpec, MinimalEthSpec, ProposerPreparationData, Slot, Uint256, }; +use beacon_processor::{WorkEvent, Work}; type E = MainnetEthSpec; @@ -916,14 +917,15 @@ async fn queue_attestations_from_http() { .unwrap(); tester .ctx - .beacon_processor_reprocess_send + .beacon_processor_send .as_ref() .unwrap() - .send(ReprocessQueueMessage::BlockImported { + .try_send(WorkEvent { + drop_during_sync: false, + work: Work::Reprocess(ReprocessQueueMessage::BlockImported { block_root, parent_root, - }) - .await + })}) .unwrap(); attestation_future.await.unwrap(); diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index 014022de2f1..dd42be2490d 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -35,7 +35,6 @@ use std::path::PathBuf; use std::sync::Arc; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; use store::hot_cold_store::HotColdDBError; -use tokio::sync::mpsc; use types::{ beacon_block::BlockImportSource, Attestation, AttestationRef, AttesterSlashing, BlobSidecar, DataColumnSidecar, DataColumnSubnetId, EthSpec, Hash256, IndexedAttestation, @@ -246,6 +245,7 @@ impl NetworkBeaconProcessor { subnet_id, should_import, seen_timestamp, + allow_reprocess, ); } @@ -312,6 +312,7 @@ impl NetworkBeaconProcessor { package.subnet_id, package.should_import, package.seen_timestamp, + true ); } } @@ -328,6 +329,7 @@ impl NetworkBeaconProcessor { subnet_id: SubnetId, should_import: bool, seen_timestamp: Duration, + allow_reprocess: bool, ) { match result { Ok(verified_attestation) => { @@ -413,7 +415,7 @@ 
impl NetworkBeaconProcessor { seen_timestamp, }, error, - true, // TODO(beacon-processor) enable or disbleretry? + allow_reprocess, seen_timestamp, ); } diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index ab030db8266..e145a28c49f 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -6,7 +6,7 @@ use beacon_chain::{builder::Witness, eth1_chain::CachingEth1Backend, BeaconChain use beacon_chain::{BeaconChainTypes, NotifyExecutionLayer}; use beacon_processor::{ BeaconProcessorChannels, BeaconProcessorSend, DuplicateCache, GossipAggregatePackage, - GossipAttestationPackage, ReprocessQueueMessage, Work, WorkEvent as BeaconWorkEvent, + GossipAttestationPackage, Work, WorkEvent as BeaconWorkEvent, }; use lighthouse_network::rpc::methods::{ BlobsByRangeRequest, BlobsByRootRequest, DataColumnsByRangeRequest, DataColumnsByRootRequest, From 9de1322750f2143d54c93711d81fb2ab8735328c Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Mon, 30 Sep 2024 14:21:21 -0700 Subject: [PATCH 05/16] reintroduce reprocessing queue --- beacon_node/beacon_processor/src/lib.rs | 24 ++-- .../src/scheduler/interface.rs | 32 +++-- .../src/scheduler/priority_scheduler/mod.rs | 133 ++++++++---------- .../work_reprocessing_queue.rs | 16 ++- beacon_node/client/src/builder.rs | 6 +- beacon_node/http_api/src/task_spawner.rs | 8 +- beacon_node/http_api/src/test_utils.rs | 6 +- .../http_api/tests/interactive_tests.rs | 9 +- .../gossip_methods.rs | 2 +- 9 files changed, 115 insertions(+), 121 deletions(-) diff --git a/beacon_node/beacon_processor/src/lib.rs b/beacon_node/beacon_processor/src/lib.rs index 75dee0c5c2f..0a3ba519de8 100644 --- a/beacon_node/beacon_processor/src/lib.rs +++ b/beacon_node/beacon_processor/src/lib.rs @@ -39,31 +39,25 @@ //! task. 
mod scheduler; use crate::scheduler::interface::SchedulerType; -use futures::stream::{Stream, StreamExt}; -use futures::task::Poll; use lighthouse_network::{MessageId, NetworkGlobals, PeerId}; -use logging::TimeLatch; use parking_lot::Mutex; use scheduler::interface::Scheduler; use serde::{Deserialize, Serialize}; -use slog::{crit, debug, error, trace, warn, Logger}; +use slog::{warn, Logger}; use slot_clock::SlotClock; use std::cmp; -use std::collections::{HashSet, VecDeque}; +use std::collections::HashSet; use std::fmt; use std::future::Future; use std::pin::Pin; use std::sync::Arc; -use std::task::Context; use std::time::Duration; use strum::AsRefStr; use strum::IntoStaticStr; use task_executor::TaskExecutor; use tokio::sync::mpsc; use tokio::sync::mpsc::error::TrySendError; -use types::{ - Attestation, BeaconState, ChainSpec, Hash256, RelativeEpoch, SignedAggregateAndProof, SubnetId, -}; +use types::{Attestation, BeaconState, ChainSpec, Hash256, SignedAggregateAndProof, SubnetId}; use types::{EthSpec, Slot}; mod metrics; @@ -81,9 +75,6 @@ const MAX_IDLE_QUEUE_LEN: usize = 16_384; /// The maximum size of the channel for re-processing work events. const DEFAULT_MAX_SCHEDULED_WORK_QUEUE_LEN: usize = 3 * DEFAULT_MAX_WORK_EVENT_QUEUE_LEN / 4; -/// The name of the manager tokio task. -const MANAGER_TASK_NAME: &str = "beacon_processor_manager"; - /// The name of the worker tokio tasks. 
const WORKER_TASK_NAME: &str = "beacon_processor_worker"; @@ -597,8 +588,13 @@ impl BeaconProcessor { slot_clock: S, spec: &ChainSpec, ) -> Result<(), String> { - let scheduler = SchedulerType::::new(self, beacon_state, event_rx, spec)?; - scheduler.run(work_journal_tx, slot_clock, spec.maximum_gossip_clock_disparity()) + let scheduler = SchedulerType::::new(self, beacon_state, spec)?; + scheduler.run( + event_rx, + work_journal_tx, + slot_clock, + spec.maximum_gossip_clock_disparity(), + ) } } diff --git a/beacon_node/beacon_processor/src/scheduler/interface.rs b/beacon_node/beacon_processor/src/scheduler/interface.rs index eab592facca..48f4c358646 100644 --- a/beacon_node/beacon_processor/src/scheduler/interface.rs +++ b/beacon_node/beacon_processor/src/scheduler/interface.rs @@ -9,10 +9,15 @@ use crate::{BeaconProcessor, WorkEvent}; use super::priority_scheduler; pub trait Scheduler { - fn new(beacon_processor: BeaconProcessor, beacon_state: &BeaconState, event_rx: mpsc::Receiver>, spec: &ChainSpec) -> Result, String>; + fn new( + beacon_processor: BeaconProcessor, + beacon_state: &BeaconState, + spec: &ChainSpec, + ) -> Result, String>; fn run( self, + event_rx: mpsc::Receiver>, work_journal_tx: Option>, slot_clock: S, maximum_gossip_clock_disparity: Duration, @@ -24,25 +29,30 @@ pub enum SchedulerType { } impl Scheduler for SchedulerType { - fn new(beacon_processor: BeaconProcessor, beacon_state: &BeaconState, event_rx: mpsc::Receiver>, spec: &ChainSpec) -> Result, String> { - Ok(Box::new(SchedulerType::PriorityScheduler(priority_scheduler::Scheduler::new( - beacon_processor, - beacon_state, - event_rx, - spec - )?))) + fn new( + beacon_processor: BeaconProcessor, + beacon_state: &BeaconState, + spec: &ChainSpec, + ) -> Result, String> { + Ok(Box::new(SchedulerType::PriorityScheduler( + priority_scheduler::Scheduler::new(beacon_processor, beacon_state, spec)?, + ))) } // TODO(beacon-processor) make this config driven fn run( self, + event_rx: mpsc::Receiver>, 
work_journal_tx: Option>, slot_clock: S, maximum_gossip_clock_disparity: Duration, ) -> Result<(), String> { match self { - SchedulerType::PriorityScheduler(scheduler) => { - scheduler.run(work_journal_tx, slot_clock, maximum_gossip_clock_disparity) - } + SchedulerType::PriorityScheduler(scheduler) => scheduler.run( + event_rx, + work_journal_tx, + slot_clock, + maximum_gossip_clock_disparity, + ), } } } diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs index f9e3d455ffb..29dba27f055 100644 --- a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs @@ -21,9 +21,9 @@ use work_queue::{BeaconProcessorQueueLengths, WorkQueues}; use work_reprocessing_queue::{spawn_reprocess_scheduler, ReadyWork}; use crate::{ - metrics, BeaconProcessor, BlockingOrAsync, QueuedBackfillBatch, - ReprocessQueueMessage, SendOnDrop, TaskSpawner, Work, WorkEvent, WorkType, MAX_IDLE_QUEUE_LEN, - NOTHING_TO_DO, WORKER_FREED, + metrics, BeaconProcessor, BlockingOrAsync, QueuedBackfillBatch, ReprocessQueueMessage, + SendOnDrop, TaskSpawner, Work, WorkEvent, WorkType, MAX_IDLE_QUEUE_LEN, NOTHING_TO_DO, + WORKER_FREED, }; /// Unifies all the messages processed by the `BeaconProcessor`. @@ -45,17 +45,7 @@ struct InboundEvents { idle_rx: mpsc::Receiver<()>, /// Used by upstream processes to send new work to the `BeaconProcessor`. event_rx: mpsc::Receiver>, - /// Used internally for queuing work ready to be re-processed. ready_work_rx: mpsc::Receiver, - reprocess_work_rx: mpsc::Receiver, -} - -struct OutboundEvents { - /// Sends tasks to workers. - idle_tx: mpsc::Sender<()>, - /// Used internally for queuing work ready to be re-processed. 
- reprocess_work_tx: mpsc::Sender, - ready_work_tx: mpsc::Sender, } impl Stream for InboundEvents { @@ -76,13 +66,9 @@ impl Stream for InboundEvents { // Poll for delayed blocks before polling for new work. It might be the case that a delayed // block is required to successfully process some new work. - match self.reprocess_work_rx.poll_recv(cx) { + match self.ready_work_rx.poll_recv(cx) { Poll::Ready(Some(ready_work)) => { - return Poll::Ready(Some(InboundEvent::ReprocessingWork( - WorkEvent { - drop_during_sync: false, - work: Work::Reprocess(ready_work) - }))); + return Poll::Ready(Some(InboundEvent::ReprocessingWork(ready_work.into()))); } Poll::Ready(None) => { return Poll::Ready(None); @@ -107,17 +93,12 @@ impl Stream for InboundEvents { /// The name of the manager tokio task. const MANAGER_TASK_NAME: &str = "beacon_processor_manager"; -/// The name of the worker tokio tasks. -const WORKER_TASK_NAME: &str = "beacon_processor_worker"; - // TODO(beacon-processor) this will be impl specific // Backend trait inits a channel, a run function // A channel trait has send_work, reprocess_work etc. pub struct Scheduler { beacon_processor: BeaconProcessor, - inbound_events: InboundEvents, - outbound_events: OutboundEvents, work_queues: WorkQueues, phantom_data: PhantomData, } @@ -126,69 +107,72 @@ impl Scheduler { pub fn new( beacon_processor: BeaconProcessor, beacon_state: &BeaconState, - event_rx: mpsc::Receiver>, spec: &ChainSpec, ) -> Result { // Used by workers to communicate that they are finished a task. - let (idle_tx, idle_rx) = mpsc::channel::<()>(MAX_IDLE_QUEUE_LEN); let queue_lengths = BeaconProcessorQueueLengths::from_state(beacon_state, spec)?; // Initialize the worker queues. let work_queues: WorkQueues = WorkQueues::new(queue_lengths); - // Channels for sending work to the re-process scheduler (`work_reprocessing_tx`) and to - // receive them back once they are ready (`ready_work_rx`). 
- let (ready_work_tx, ready_work_rx) = - mpsc::channel::(beacon_processor.config.max_scheduled_work_queue_len); - - let (reprocess_work_tx, reprocess_work_rx) = - mpsc::channel::(beacon_processor.config.max_scheduled_work_queue_len); + // // Channels for sending work to the re-process scheduler (`work_reprocessing_tx`) and to + // // receive them back once they are ready (`ready_work_rx`). - let inbound_events = InboundEvents { - idle_rx, - event_rx, - ready_work_rx, - reprocess_work_rx - }; + // let inbound_events = InboundEvents { + // idle_rx, + // event_rx, + // }; - let outbound_events = OutboundEvents { - idle_tx, - reprocess_work_tx, - ready_work_tx, - }; + // let outbound_events = OutboundEvents { + // idle_tx, + // }; Ok(Self { beacon_processor, - inbound_events, - outbound_events, work_queues, - phantom_data: PhantomData + phantom_data: PhantomData, }) } pub fn run( mut self, + event_rx: mpsc::Receiver>, work_journal_tx: Option>, slot_clock: S, maximum_gossip_clock_disparity: Duration, ) -> Result<(), String> { - // TODO(beacon-processor) reprocess scheduler - // spawn_reprocess_scheduler( - // self.outbound_events.ready_work_tx.clone(), - // self.inbound_events.reprocess_work_rx, - // &self.beacon_processor.executor, - // Arc::new(slot_clock), - // self.beacon_processor.log.clone(), - // maximum_gossip_clock_disparity, - // )?; + let (idle_tx, idle_rx) = mpsc::channel::<()>(MAX_IDLE_QUEUE_LEN); + + let (ready_work_tx, ready_work_rx) = + mpsc::channel::(self.beacon_processor.config.max_scheduled_work_queue_len); + + let (reprocess_work_tx, reprocess_work_rx) = mpsc::channel::( + self.beacon_processor.config.max_scheduled_work_queue_len, + ); + + let mut inbound_events = InboundEvents { + idle_rx, + event_rx, + ready_work_rx, + }; + + spawn_reprocess_scheduler( + ready_work_tx, + reprocess_work_rx, + &self.beacon_processor.executor, + Arc::new(slot_clock), + self.beacon_processor.log.clone(), + maximum_gossip_clock_disparity, + )?; let executor = 
self.beacon_processor.executor.clone(); let manager_future = async move { loop { - let work_event = match self.inbound_events.next().await { + let work_event = match inbound_events.next().await { Some(InboundEvent::WorkerIdle) => { - self.beacon_processor.current_workers = self.beacon_processor.current_workers.saturating_sub(1); + self.beacon_processor.current_workers = + self.beacon_processor.current_workers.saturating_sub(1); None } Some(InboundEvent::WorkEvent(event)) @@ -196,7 +180,7 @@ impl Scheduler { { match QueuedBackfillBatch::try_from(event) { Ok(backfill_batch) => { - match self.outbound_events.reprocess_work_tx + match reprocess_work_tx .try_send(ReprocessQueueMessage::BackfillSync(backfill_batch)) { Err(e) => { @@ -249,7 +233,8 @@ impl Scheduler { let _event_timer = self.increment_metrics(&work_event); self.worker_journal(&work_event, &work_journal_tx); - let can_spawn = self.beacon_processor.current_workers < self.beacon_processor.config.max_workers; + let can_spawn = self.beacon_processor.current_workers + < self.beacon_processor.config.max_workers; let drop_during_sync = work_event .as_ref() .map_or(false, |event| event.drop_during_sync); @@ -263,7 +248,7 @@ impl Scheduler { if let Some(work_event) = work_event { let work_type = work_event.to_type(); // TODO(beacon-processor) check self.idle_tx - self.spawn_worker(work_event); + self.spawn_worker(idle_tx.clone(), work_event); Some(work_type) } else { None @@ -306,9 +291,12 @@ impl Scheduler { // There is a new work event and the chain is not syncing. Process it or queue // it. - Some(WorkEvent { work, .. }) => { - self.process_or_queue_work_event(work, can_spawn) - } + Some(WorkEvent { work, .. 
}) => self.process_or_queue_work_event( + &reprocess_work_tx, + idle_tx.clone(), + work, + can_spawn, + ), }; self.update_queue_metrics(modified_queue_id); @@ -559,6 +547,8 @@ impl Scheduler { // TODO(beacon-processor) this might be able to be moved to a more generalized location pub fn process_or_queue_work_event( &mut self, + reprocess_work_tx: &Sender, + idle_tx: Sender<()>, work: Work, can_spawn: bool, ) -> Option { @@ -569,9 +559,9 @@ impl Scheduler { match work { Work::Reprocess(work_event) => { // TODO(beacon-processor) LOG ERROR - let _ = self.outbound_events.reprocess_work_tx.try_send(work_event); + let _ = reprocess_work_tx.try_send(work_event); } - _ if can_spawn => self.spawn_worker(work), + _ if can_spawn => self.spawn_worker(idle_tx.clone(), work), Work::GossipAttestation { .. } => self.work_queues.attestation_queue.push(work), // Attestation batches are formed internally within the // `BeaconProcessor`, they are not sent from external services. @@ -890,7 +880,7 @@ impl Scheduler { /// Spawns a blocking worker thread to process some `Work`. /// /// Sends an message on `idle_tx` when the work is complete and the task is stopping. - fn spawn_worker(&mut self, work: Work) { + fn spawn_worker(&mut self, idle_tx: Sender<()>, work: Work) { let work_id = work.str_id(); let worker_timer = metrics::start_timer_vec(&metrics::BEACON_PROCESSOR_WORKER_TIME, &[work_id]); @@ -905,13 +895,14 @@ impl Scheduler { // This helps ensure that the worker is always freed in the case of an early exit or panic. // As such, this instantiation should happen as early in the function as possible. 
let send_idle_on_drop = SendOnDrop { - tx: self.outbound_events.idle_tx.clone(), + tx: idle_tx, _worker_timer: worker_timer, log: self.beacon_processor.log.clone(), }; let worker_id = self.beacon_processor.current_workers; - self.beacon_processor.current_workers = self.beacon_processor.current_workers.saturating_add(1); + self.beacon_processor.current_workers = + self.beacon_processor.current_workers.saturating_add(1); let executor = self.beacon_processor.executor.clone(); @@ -1007,9 +998,7 @@ impl Scheduler { | Work::LightClientFinalityUpdateRequest(process_fn) => { task_spawner.spawn_blocking(process_fn) } - Work::Reprocess(_) => { - () - } + Work::Reprocess(_) => (), }; } } diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs index 93c164caa0b..d9b1b3b472e 100644 --- a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs @@ -10,8 +10,13 @@ //! //! Aggregated and unaggregated attestations that failed verification due to referencing an unknown //! block will be re-queued until their block is imported, or until they expire. 
-use crate::{metrics, IgnoredRpcBlock, QueuedAggregate, QueuedBackfillBatch, QueuedGossipBlock, QueuedLightClientUpdate, QueuedRpcBlock, QueuedSamplingRequest, QueuedUnaggregate, ReprocessQueueMessage}; -use crate::{AsyncFn, BlockingFn, Work, WorkEvent}; +use crate::ReprocessQueueMessage::*; +use crate::{ + metrics, IgnoredRpcBlock, QueuedAggregate, QueuedBackfillBatch, QueuedGossipBlock, + QueuedLightClientUpdate, QueuedRpcBlock, QueuedSamplingRequest, QueuedUnaggregate, + ReprocessQueueMessage, +}; +use crate::{Work, WorkEvent}; use fnv::FnvHashMap; use futures::task::Poll; use futures::{Stream, StreamExt}; @@ -25,12 +30,10 @@ use std::pin::Pin; use std::sync::Arc; use std::task::Context; use std::time::Duration; -use strum::AsRefStr; use task_executor::TaskExecutor; use tokio::sync::mpsc::{self, Receiver, Sender}; use tokio_util::time::delay_queue::{DelayQueue, Key as DelayKey}; -use types::{EthSpec, Hash256, Slot}; -use crate::ReprocessQueueMessage::*; +use types::{EthSpec, Hash256}; const TASK_NAME: &str = "beacon_processor_reprocess_queue"; const GOSSIP_BLOCKS: &str = "gossip_blocks"; @@ -82,7 +85,6 @@ pub const BACKFILL_SCHEDULE_IN_SLOT: [(u32, u32); 3] = [ (4, 5), ]; - /// Events sent by the scheduler once they are ready for re-processing. 
pub enum ReadyWork { Block(QueuedGossipBlock), @@ -95,7 +97,6 @@ pub enum ReadyWork { BackfillSync(QueuedBackfillBatch), } - impl From for WorkEvent { fn from(ready_work: ReadyWork) -> Self { match ready_work { @@ -1022,6 +1023,7 @@ mod tests { use std::ops::Add; use std::sync::Arc; use task_executor::test_utils::TestRuntime; + use types::Slot; #[test] fn backfill_processing_schedule_calculation() { diff --git a/beacon_node/client/src/builder.rs b/beacon_node/client/src/builder.rs index dfae648e159..cf833efa5b5 100644 --- a/beacon_node/client/src/builder.rs +++ b/beacon_node/client/src/builder.rs @@ -19,8 +19,8 @@ use beacon_chain::{ BeaconChain, BeaconChainTypes, Eth1ChainBackend, MigratorConfig, ServerSentEventHandler, }; use beacon_chain::{Kzg, LightClientProducerEvent}; -use beacon_processor::{BeaconProcessor, BeaconProcessorChannels}; use beacon_processor::BeaconProcessorConfig; +use beacon_processor::{BeaconProcessor, BeaconProcessorChannels}; use environment::RuntimeContext; use eth1::{Config as Eth1Config, Service as Eth1Service}; use eth2::{ @@ -864,7 +864,7 @@ where if let Some(beacon_chain) = self.beacon_chain.as_ref() { if let Some(network_globals) = &self.network_globals { let beacon_processor_context = runtime_context.service_context("bproc".into()); - let beacon_state = &beacon_chain + let beacon_state = &beacon_chain .canonical_head .cached_head() .snapshot @@ -881,7 +881,7 @@ where beacon_processor_channels.beacon_processor_rx, None, beacon_chain.slot_clock.clone(), - &beacon_chain.spec + &beacon_chain.spec, )?; } diff --git a/beacon_node/http_api/src/task_spawner.rs b/beacon_node/http_api/src/task_spawner.rs index 71795102932..834cd29971f 100644 --- a/beacon_node/http_api/src/task_spawner.rs +++ b/beacon_node/http_api/src/task_spawner.rs @@ -175,12 +175,12 @@ impl TaskSpawner { )); }; - return Ok(()); + Ok(()) } else { - return Err(warp_utils::reject::custom_server_error( + Err(warp_utils::reject::custom_server_error( "The beacon processor is 
unavailable".to_string(), - )); - }; + )) + } } } diff --git a/beacon_node/http_api/src/test_utils.rs b/beacon_node/http_api/src/test_utils.rs index b5fc01a7495..f0f2108bdbb 100644 --- a/beacon_node/http_api/src/test_utils.rs +++ b/beacon_node/http_api/src/test_utils.rs @@ -203,11 +203,7 @@ pub async fn create_api_server_with_config( beacon_processor_rx, } = BeaconProcessorChannels::new(&beacon_processor_config); - let beacon_state = &chain - .canonical_head - .cached_head() - .snapshot - .beacon_state; + let beacon_state = &chain.canonical_head.cached_head().snapshot.beacon_state; let beacon_processor_send = beacon_processor_tx; BeaconProcessor { diff --git a/beacon_node/http_api/tests/interactive_tests.rs b/beacon_node/http_api/tests/interactive_tests.rs index 0f13035921f..6dd8776f228 100644 --- a/beacon_node/http_api/tests/interactive_tests.rs +++ b/beacon_node/http_api/tests/interactive_tests.rs @@ -5,6 +5,7 @@ use beacon_chain::{ ChainConfig, }; use beacon_processor::ReprocessQueueMessage; +use beacon_processor::{Work, WorkEvent}; use eth2::types::ProduceBlockV3Response; use eth2::types::{DepositContractData, StateId}; use execution_layer::{ForkchoiceState, PayloadAttributes}; @@ -21,7 +22,6 @@ use types::{ Address, Epoch, EthSpec, ExecPayload, ExecutionBlockHash, FixedBytesExtended, ForkName, Hash256, MainnetEthSpec, MinimalEthSpec, ProposerPreparationData, Slot, Uint256, }; -use beacon_processor::{WorkEvent, Work}; type E = MainnetEthSpec; @@ -923,9 +923,10 @@ async fn queue_attestations_from_http() { .try_send(WorkEvent { drop_during_sync: false, work: Work::Reprocess(ReprocessQueueMessage::BlockImported { - block_root, - parent_root, - })}) + block_root, + parent_root, + }), + }) .unwrap(); attestation_future.await.unwrap(); diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index dd42be2490d..7fc3402f2f0 100644 --- 
a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -312,7 +312,7 @@ impl NetworkBeaconProcessor { package.subnet_id, package.should_import, package.seen_timestamp, - true + allow_reprocess, ); } } From e9f17d9f421f51c0a9a1064ac9efa605a8121c0a Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Mon, 30 Sep 2024 14:49:58 -0700 Subject: [PATCH 06/16] fix test failure --- .../src/scheduler/priority_scheduler/mod.rs | 18 ++++------------ .../src/network_beacon_processor/tests.rs | 21 +++++++++++-------- beacon_node/network/src/service/tests.rs | 2 -- 3 files changed, 16 insertions(+), 25 deletions(-) diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs index 29dba27f055..2a673117b5a 100644 --- a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs @@ -116,18 +116,6 @@ impl Scheduler { // Initialize the worker queues. let work_queues: WorkQueues = WorkQueues::new(queue_lengths); - // // Channels for sending work to the re-process scheduler (`work_reprocessing_tx`) and to - // // receive them back once they are ready (`ready_work_rx`). - - // let inbound_events = InboundEvents { - // idle_rx, - // event_rx, - // }; - - // let outbound_events = OutboundEvents { - // idle_tx, - // }; - Ok(Self { beacon_processor, work_queues, @@ -857,8 +845,10 @@ impl Scheduler { .unwrap_or(WORKER_FREED); // We don't care if this message was successfully sent, we only use the journal - // during testing. - let _ = work_journal_tx.try_send(id); + // during testing. We also ignore reprocess messages to ensure our test cases can pass. 
+ if id != "reprocess" { + let _ = work_journal_tx.try_send(id); + } } } diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index bd8463ca0b1..ef897f6778e 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -13,7 +13,7 @@ use beacon_chain::test_utils::{ test_spec, AttestationStrategy, BeaconChainHarness, BlockStrategy, EphemeralHarnessType, }; use beacon_chain::{BeaconChain, WhenSlotSkipped}; -use beacon_processor::{work_reprocessing_queue::*, *}; +use beacon_processor::*; use lighthouse_network::discovery::ConnectionId; use lighthouse_network::rpc::methods::BlobsByRangeRequest; use lighthouse_network::rpc::SubstreamId; @@ -48,6 +48,13 @@ const SEQ_NUMBER: u64 = 0; /// The default time to wait for `BeaconProcessor` events. const STANDARD_TIMEOUT: Duration = Duration::from_secs(10); +// TODO(beacon-processor) import these two instead of defining +/// For how long to queue rpc blocks before sending them back for reprocessing. +pub const QUEUED_RPC_BLOCK_DELAY: Duration = Duration::from_secs(4); + +/// For how long to queue aggregated and unaggregated attestations for re-processing. +pub const QUEUED_ATTESTATION_DELAY: Duration = Duration::from_secs(12); + /// Provides utilities for testing the `BeaconProcessor`. 
struct TestRig { chain: Arc>, @@ -232,6 +239,8 @@ impl TestRig { }; let network_beacon_processor = Arc::new(network_beacon_processor); + let beacon_state = &chain.canonical_head.cached_head().snapshot.beacon_state; + let beacon_processor = BeaconProcessor { network_globals, executor, @@ -240,17 +249,11 @@ impl TestRig { log: log.clone(), } .spawn_manager( + beacon_state, beacon_processor_rx, - work_reprocessing_tx, - work_reprocessing_rx, Some(work_journal_tx), harness.chain.slot_clock.clone(), - chain.spec.maximum_gossip_clock_disparity(), - BeaconProcessorQueueLengths::from_state( - &chain.canonical_head.cached_head().snapshot.beacon_state, - &chain.spec, - ) - .unwrap(), + &chain.spec, ); assert!(beacon_processor.is_ok()); diff --git a/beacon_node/network/src/service/tests.rs b/beacon_node/network/src/service/tests.rs index 0e91fcb7797..47136d4f916 100644 --- a/beacon_node/network/src/service/tests.rs +++ b/beacon_node/network/src/service/tests.rs @@ -89,7 +89,6 @@ mod tests { executor, None, beacon_processor_tx, - work_reprocessing_tx, ) .await .unwrap(); @@ -158,7 +157,6 @@ mod tests { executor.clone(), None, beacon_processor_channels.beacon_processor_tx, - beacon_processor_channels.work_reprocessing_tx, ) .await .unwrap() From a887096a72337b66e09e7f6a5426165d68f3ecc5 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Mon, 30 Sep 2024 16:36:47 -0700 Subject: [PATCH 07/16] resolve TODOs --- .../src/scheduler/priority_scheduler/mod.rs | 13 +++++++------ .../priority_scheduler/work_reprocessing_queue.rs | 8 ++++---- .../src/network_beacon_processor/gossip_methods.rs | 1 - .../network/src/network_beacon_processor/mod.rs | 1 - 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs index 2a673117b5a..b50fc049cc1 100644 --- a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs +++ 
b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs @@ -93,8 +93,6 @@ impl Stream for InboundEvents { /// The name of the manager tokio task. const MANAGER_TASK_NAME: &str = "beacon_processor_manager"; -// TODO(beacon-processor) this will be impl specific - // Backend trait inits a channel, a run function // A channel trait has send_work, reprocess_work etc. pub struct Scheduler { @@ -235,7 +233,6 @@ impl Scheduler { let work_event = self.priority_scheduler(&work_journal_tx); if let Some(work_event) = work_event { let work_type = work_event.to_type(); - // TODO(beacon-processor) check self.idle_tx self.spawn_worker(idle_tx.clone(), work_event); Some(work_type) } else { @@ -532,7 +529,6 @@ impl Scheduler { work_event } - // TODO(beacon-processor) this might be able to be moved to a more generalized location pub fn process_or_queue_work_event( &mut self, reprocess_work_tx: &Sender, @@ -546,8 +542,13 @@ impl Scheduler { match work { Work::Reprocess(work_event) => { - // TODO(beacon-processor) LOG ERROR - let _ = reprocess_work_tx.try_send(work_event); + if let Err(e) = reprocess_work_tx.try_send(work_event) { + error!( + self.beacon_processor.log, + "Failed to reprocess work event"; + "error" => %e + ) + } } _ if can_spawn => self.spawn_worker(idle_tx.clone(), work), Work::GossipAttestation { .. 
} => self.work_queues.attestation_queue.push(work), diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs index d9b1b3b472e..2459c002830 100644 --- a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs @@ -700,8 +700,8 @@ impl ReprocessQueue { "hint" => "system may be overloaded", "parent_root" => ?parent_root, "block_root" => ?block_root, - "failed_count" => failed_to_send_count, - "sent_count" => sent_count, + "failed_count" => %failed_to_send_count, + "sent_count" => %sent_count, ); } } @@ -743,8 +743,8 @@ impl ReprocessQueue { "Ignored scheduled sampling requests for block"; "hint" => "system may be overloaded", "block_root" => ?block_root, - "failed_count" => failed_to_send_count, - "sent_count" => sent_count, + "failed_count" => %failed_to_send_count, + "sent_count" => %sent_count, ); } } diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index 7fc3402f2f0..a9bd03feabd 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -319,7 +319,6 @@ impl NetworkBeaconProcessor { // Clippy warning is is ignored since the arguments are all of a different type (i.e., they // cant' be mixed-up) and creating a struct would result in more complexity. 
- // TODO(beacon-processor) disable reprocessing flag #[allow(clippy::too_many_arguments)] fn process_gossip_attestation_result( self: &Arc, diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index e145a28c49f..82aba0b5143 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -134,7 +134,6 @@ impl NetworkBeaconProcessor { // Define a closure for processing batches of attestations. let processor = self.clone(); - // TODO(beacon-processor) allow reprocess? let process_batch = move |aggregates| processor.process_gossip_aggregate_batch(aggregates, true); From 9e0980ed5ff25f7317eab36bb96539962bc38894 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Mon, 30 Sep 2024 16:36:55 -0700 Subject: [PATCH 08/16] fmt --- .../beacon_processor/src/scheduler/priority_scheduler/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs index b50fc049cc1..d4bda3a1bdd 100644 --- a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs @@ -544,7 +544,7 @@ impl Scheduler { Work::Reprocess(work_event) => { if let Err(e) = reprocess_work_tx.try_send(work_event) { error!( - self.beacon_processor.log, + self.beacon_processor.log, "Failed to reprocess work event"; "error" => %e ) From 9b6f2e4c79136f63f2b1cf3c99ea15f23b99c7e1 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Tue, 29 Oct 2024 15:25:34 -0700 Subject: [PATCH 09/16] earliest deadline first boilerplate --- .../earliest_deadline_queue.rs | 13 +++++++++---- .../mod.rs | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) rename beacon_node/beacon_processor/src/scheduler/{earliest_deadline_first => 
earliest_deadline_scheduler}/earliest_deadline_queue.rs (86%) rename beacon_node/beacon_processor/src/scheduler/{earliest_deadline_first => earliest_deadline_scheduler}/mod.rs (99%) diff --git a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_first/earliest_deadline_queue.rs b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs similarity index 86% rename from beacon_node/beacon_processor/src/scheduler/earliest_deadline_first/earliest_deadline_queue.rs rename to beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs index c4eb8e32f7e..6e49161d613 100644 --- a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_first/earliest_deadline_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs @@ -5,7 +5,7 @@ use std::{ use types::EthSpec; -use crate::{WorkEvent, Work}; +use crate::{Work, WorkEvent}; pub struct WorkQueue { min_heap: BinaryHeap>>, @@ -16,11 +16,16 @@ pub struct QueueItem { pub work_event: WorkEvent, } -impl QueueItem { +impl QueueItem { pub fn new(work: Work) -> Self { + let work_event = WorkEvent { + drop_during_sync: false, + work, + }; + Self { - work_event: todo!(), - deadline: 0 + work_event, + deadline: 0, } } } diff --git a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_first/mod.rs b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs similarity index 99% rename from beacon_node/beacon_processor/src/scheduler/earliest_deadline_first/mod.rs rename to beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs index e943ccb82ac..a29f3cb320e 100644 --- a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_first/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs @@ -201,7 +201,7 @@ impl Scheduler { let work_id = work.str_id(); let work_type = work.to_type(); - + match work { _ if 
can_spawn => spawn_worker(&mut self.beacon_processor, idle_tx.clone(), work), _ => { From efb444161791902f0a90170222638fb22c7706eb Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Wed, 30 Oct 2024 13:59:00 -0700 Subject: [PATCH 10/16] deadline scheduling boilerplate --- .../earliest_deadline_queue.rs | 167 ++++++++++++++++-- .../earliest_deadline_scheduler/mod.rs | 47 ++++- .../src/scheduler/interface.rs | 9 +- .../src/scheduler/priority_scheduler/mod.rs | 2 +- 4 files changed, 196 insertions(+), 29 deletions(-) diff --git a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs index 6e49161d613..77c8f576c5e 100644 --- a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs @@ -1,67 +1,187 @@ use std::{ - cmp::{Ordering, Reverse}, + cmp::{max, Reverse}, collections::BinaryHeap, + marker::PhantomData, + time::Duration, }; -use types::EthSpec; +use slot_clock::SlotClock; +use types::{EthSpec, Slot}; use crate::{Work, WorkEvent}; -pub struct WorkQueue { - min_heap: BinaryHeap>>, +pub struct WorkQueue { + min_heap: BinaryHeap>>, } -pub struct QueueItem { - deadline: u64, +pub struct QueueItem { + deadline: Duration, pub work_event: WorkEvent, + phantom_data: PhantomData, } -impl QueueItem { - pub fn new(work: Work) -> Self { +impl QueueItem { + pub fn new(work: Work, slot_clock: &S) -> Option { + let Some(deadline) = QueueItem::calculate_deadline(&work, slot_clock) else { + return None; + }; + let work_event = WorkEvent { drop_during_sync: false, work, }; - Self { + Some(Self { work_event, - deadline: 0, - } + deadline, + phantom_data: PhantomData, + }) + } + + fn calculate_deadline(work: &Work, slot_clock: &S) -> Option { + let Some(current_time) = slot_clock.now_duration() 
else { + return None; + }; + let deadline = match work { + Work::GossipAttestation { attestation, .. } => { + let attestation_slot = attestation.attestation.data().slot; + + let Some(start_of_attestation_slot) = slot_clock.start_of(attestation_slot) else { + return None; + }; + + let arrival_time_with_buffer = current_time.saturating_add(Duration::from_secs(1)); + let four_seconds_into_slot = + start_of_attestation_slot.saturating_add(Duration::from_secs(4)); + Some(max(four_seconds_into_slot, arrival_time_with_buffer)) + } + Work::GossipAttestationBatch { attestations, .. } => { + let Some(attestation) = attestations.first() else { + return None; + }; + + let attestation_slot = attestation.attestation.data().slot; + + let Some(start_of_attestation_slot) = slot_clock.start_of(attestation_slot) else { + return None; + }; + + let four_seconds_into_slot = + start_of_attestation_slot.saturating_add(Duration::from_secs(4)); + + let arrival_time_with_buffer = current_time.saturating_add(Duration::from_secs(1)); + Some(max(four_seconds_into_slot, arrival_time_with_buffer)) + } + Work::GossipAggregate { aggregate, .. } => { + let attestation_slot = aggregate.aggregate.message().aggregate().data().slot; + let Some(start_of_next_slot) = slot_clock.start_of(attestation_slot + Slot::new(1)) + else { + return None; + }; + let arrival_time_with_buffer = current_time.saturating_add(Duration::from_secs(1)); + + Some(max(start_of_next_slot, arrival_time_with_buffer)) + } + Work::UnknownBlockAggregate { .. } => Some(current_time), + Work::GossipAggregateBatch { aggregates, .. 
} => { + let Some(aggregate) = aggregates.first() else { + return None; + }; + + let attestation_slot = aggregate.aggregate.message().aggregate().data().slot; + let Some(start_of_next_slot) = slot_clock.start_of(attestation_slot + Slot::new(1)) + else { + return None; + }; + let arrival_time_with_buffer = current_time.saturating_add(Duration::from_secs(1)); + + Some(max(start_of_next_slot, arrival_time_with_buffer)) + } + Work::GossipBlock(_) => Some(current_time), + Work::DelayedImportBlock { .. } => Some(current_time), + Work::GossipVoluntaryExit(_) => { + Some(current_time.saturating_add(Duration::from_secs(4))) + } + Work::UnknownLightClientOptimisticUpdate { .. } => { + Some(current_time.saturating_add(Duration::from_secs(4))) + } + Work::UnknownBlockAttestation { .. } + | Work::UnknownBlockSamplingRequest { .. } + | Work::GossipBlobSidecar(_) + | Work::GossipDataColumnSidecar(_) + | Work::GossipProposerSlashing(_) + | Work::GossipAttesterSlashing(_) + | Work::GossipSyncSignature(_) + | Work::GossipSyncContribution(_) + | Work::RpcBlobs { .. } + | Work::RpcCustodyColumn { .. } + | Work::RpcVerifyDataColumn { .. } => { + Some(current_time.saturating_add(Duration::from_secs(1))) + } + Work::GossipLightClientFinalityUpdate(..) 
=> { + Some(current_time.saturating_add(Duration::from_secs(4))) + } + Work::GossipLightClientOptimisticUpdate(fn_once) => todo!(), + Work::RpcBlock { process_fn } => todo!(), + (pin) => todo!(), + (pin) => todo!(), + Work::SamplingResult(pin) => todo!(), + Work::IgnoredRpcBlock { process_fn } => todo!(), + Work::ChainSegment(pin) => todo!(), + Work::ChainSegmentBackfill(pin) => todo!(), + Work::Status(fn_once) => todo!(), + Work::BlocksByRangeRequest(pin) => todo!(), + Work::BlocksByRootsRequest(pin) => todo!(), + Work::BlobsByRangeRequest(fn_once) => todo!(), + Work::BlobsByRootsRequest(fn_once) => todo!(), + Work::DataColumnsByRootsRequest(fn_once) => todo!(), + Work::DataColumnsByRangeRequest(fn_once) => todo!(), + Work::GossipBlsToExecutionChange(fn_once) => todo!(), + Work::LightClientBootstrapRequest(fn_once) => todo!(), + Work::LightClientOptimisticUpdateRequest(fn_once) => todo!(), + Work::LightClientFinalityUpdateRequest(fn_once) => todo!(), + Work::LightClientUpdatesByRangeRequest(fn_once) => todo!(), + Work::ApiRequestP0(blocking_or_async) => todo!(), + Work::ApiRequestP1(blocking_or_async) => todo!(), + Work::Reprocess(reprocess_queue_message) => todo!(), + }; + + deadline } } -impl std::cmp::Eq for QueueItem {} +impl std::cmp::Eq for QueueItem {} -impl PartialEq for QueueItem { +impl PartialEq for QueueItem { fn eq(&self, other: &Self) -> bool { self.deadline == other.deadline } } -impl PartialOrd for QueueItem { +impl PartialOrd for QueueItem { fn partial_cmp(&self, other: &Self) -> Option { self.deadline.partial_cmp(&other.deadline) } } -impl Ord for QueueItem { +impl Ord for QueueItem { fn cmp(&self, other: &Self) -> std::cmp::Ordering { self.deadline.cmp(&other.deadline) } } -impl WorkQueue { +impl WorkQueue { pub fn new() -> Self { WorkQueue { min_heap: BinaryHeap::new(), } } - pub fn insert(&mut self, queue_item: QueueItem) { + pub fn insert(&mut self, queue_item: QueueItem) { self.min_heap.push(Reverse(queue_item)) } - pub fn pop(&mut self) -> 
Option> { + pub fn pop(&mut self) -> Option> { if let Some(queue_item) = self.min_heap.pop() { Some(queue_item.0) } else { @@ -69,7 +189,16 @@ impl WorkQueue { } } - fn peek(&self) -> Option<&Reverse>> { + fn peek(&self) -> Option<&Reverse>> { self.min_heap.peek() } + + pub fn len(&self) -> usize { + self.min_heap.len() + } + + // TODO do we want an is_full method? should there be a concept of full? + pub fn is_full(&self) -> bool { + todo!() + } } diff --git a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs index a29f3cb320e..1882484254d 100644 --- a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs @@ -3,6 +3,7 @@ use std::task::Poll; use earliest_deadline_queue::{QueueItem, WorkQueue}; use futures::{Stream, StreamExt}; use slog::{debug, trace, warn}; +use slot_clock::SlotClock; use tokio::sync::mpsc::{self, Sender}; use types::EthSpec; @@ -14,9 +15,12 @@ use super::spawn_worker; mod earliest_deadline_queue; -pub struct Scheduler { +/// The name of the manager tokio task. +const MANAGER_TASK_NAME: &str = "earliest_deadline_first_scheduler"; + +pub struct Scheduler { beacon_processor: BeaconProcessor, - work_queue: WorkQueue, + work_queue: WorkQueue, } struct InboundEvents { @@ -67,7 +71,7 @@ impl Stream for InboundEvents { } } -impl Scheduler { +impl Scheduler { pub fn new(beacon_processor: BeaconProcessor) -> Self { let work_queue = WorkQueue::new(); Scheduler { @@ -80,6 +84,7 @@ impl Scheduler { mut self, event_rx: mpsc::Receiver>, work_journal_tx: Option>, + slot_clock: S, ) -> Result<(), String> { let (idle_tx, idle_rx) = mpsc::channel::<()>(MAX_IDLE_QUEUE_LEN); @@ -162,16 +167,40 @@ impl Scheduler { } // There is a new work event and the chain is not syncing. Process it or queue // it. - Some(WorkEvent { work, .. 
}) => { - self.process_or_queue_work_event(idle_tx.clone(), work, can_spawn) - } + Some(WorkEvent { work, .. }) => self.process_or_queue_work_event( + idle_tx.clone(), + work, + &slot_clock, + can_spawn, + ), }; + + self.update_metrics(modified_queue_id); } }; + // Spawn on the core executor. + executor.spawn(manager_future, MANAGER_TASK_NAME); + Ok(()) } + fn update_metrics(&mut self, modified_queue_id: Option) { + metrics::set_gauge( + &metrics::BEACON_PROCESSOR_WORKERS_ACTIVE_TOTAL, + self.beacon_processor.current_workers as i64, + ); + + if let Some(modified_queue_id) = modified_queue_id { + metrics::observe_vec( + &metrics::BEACON_PROCESSOR_QUEUE_LENGTH, + &[modified_queue_id.into()], + self.work_queue.len() as f64, + ); + } + // TODO check if is_full? + } + fn earliest_deadline_first_scheduler( &mut self, work_journal_tx: &Option>, @@ -196,6 +225,7 @@ impl Scheduler { &mut self, idle_tx: Sender<()>, work: Work, + slot_clock: &S, can_spawn: bool, ) -> Option { let work_id = work.str_id(); @@ -205,7 +235,10 @@ impl Scheduler { match work { _ if can_spawn => spawn_worker(&mut self.beacon_processor, idle_tx.clone(), work), _ => { - self.work_queue.insert(QueueItem::new(work)); + let Some(queue_item) = QueueItem::new(work, slot_clock) else { + return None; + }; + self.work_queue.insert(queue_item); } } diff --git a/beacon_node/beacon_processor/src/scheduler/interface.rs b/beacon_node/beacon_processor/src/scheduler/interface.rs index 48f4c358646..f9ee82d1deb 100644 --- a/beacon_node/beacon_processor/src/scheduler/interface.rs +++ b/beacon_node/beacon_processor/src/scheduler/interface.rs @@ -6,7 +6,7 @@ use types::{BeaconState, ChainSpec, EthSpec}; use crate::{BeaconProcessor, WorkEvent}; -use super::priority_scheduler; +use super::{earliest_deadline_scheduler, priority_scheduler}; pub trait Scheduler { fn new( @@ -26,9 +26,11 @@ pub trait Scheduler { pub enum SchedulerType { PriorityScheduler(priority_scheduler::Scheduler), + 
EarliestDeadlineScheduler(earliest_deadline_scheduler::Scheduler), } impl Scheduler for SchedulerType { + // TODO(beacon-processor) make this config driven fn new( beacon_processor: BeaconProcessor, beacon_state: &BeaconState, @@ -38,7 +40,7 @@ impl Scheduler for SchedulerType priority_scheduler::Scheduler::new(beacon_processor, beacon_state, spec)?, ))) } - // TODO(beacon-processor) make this config driven + fn run( self, event_rx: mpsc::Receiver>, @@ -53,6 +55,9 @@ impl Scheduler for SchedulerType slot_clock, maximum_gossip_clock_disparity, ), + SchedulerType::EarliestDeadlineScheduler(scheduler) => { + scheduler.run(event_rx, work_journal_tx, slot_clock) + } } } } diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs index 459a86d7e7a..849bdb1302d 100644 --- a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs @@ -91,7 +91,7 @@ impl Stream for InboundEvents { } /// The name of the manager tokio task. -const MANAGER_TASK_NAME: &str = "beacon_processor_manager"; +const MANAGER_TASK_NAME: &str = "priority_scheduler"; // Backend trait inits a channel, a run function // A channel trait has send_work, reprocess_work etc. 
From 0ff2d149c5b809950f873377842afd06b7dfbdcb Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Thu, 31 Oct 2024 16:29:11 -0700 Subject: [PATCH 11/16] futher generalize --- beacon_node/beacon_processor/src/lib.rs | 8 + .../earliest_deadline_queue.rs | 179 +++++++++------- .../earliest_deadline_scheduler/mod.rs | 191 +++++++++++------- .../src/scheduler/interface.rs | 22 +- .../beacon_processor/src/scheduler/mod.rs | 96 ++++++++- .../src/scheduler/priority_scheduler/mod.rs | 99 +-------- .../priority_scheduler/work_queue.rs | 23 ++- .../work_reprocessing_queue.rs | 0 lighthouse/tests/beacon_node.rs | 4 +- 9 files changed, 364 insertions(+), 258 deletions(-) rename beacon_node/beacon_processor/src/scheduler/{priority_scheduler => }/work_reprocessing_queue.rs (100%) diff --git a/beacon_node/beacon_processor/src/lib.rs b/beacon_node/beacon_processor/src/lib.rs index 232c018a890..e444ec425f9 100644 --- a/beacon_node/beacon_processor/src/lib.rs +++ b/beacon_node/beacon_processor/src/lib.rs @@ -104,6 +104,7 @@ pub struct BeaconProcessorConfig { pub max_gossip_attestation_batch_size: usize, pub max_gossip_aggregate_batch_size: usize, pub enable_backfill_rate_limiting: bool, + pub beacon_processor_type: BeaconProcessorType, } impl Default for BeaconProcessorConfig { @@ -115,6 +116,7 @@ impl Default for BeaconProcessorConfig { max_gossip_attestation_batch_size: DEFAULT_MAX_GOSSIP_ATTESTATION_BATCH_SIZE, max_gossip_aggregate_batch_size: DEFAULT_MAX_GOSSIP_AGGREGATE_BATCH_SIZE, enable_backfill_rate_limiting: true, + beacon_processor_type: BeaconProcessorType::EarliestDeadline, } } } @@ -143,6 +145,12 @@ impl Default for BeaconProcessorChannels { } } +#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)] +pub enum BeaconProcessorType { + Priority, + EarliestDeadline, +} + /// A handle that sends a message on the provided channel to a receiver when it gets dropped. 
/// /// The receiver task is responsible for removing the provided `entry` from the `DuplicateCache` diff --git a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs index 77c8f576c5e..045bb976125 100644 --- a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs @@ -8,7 +8,7 @@ use std::{ use slot_clock::SlotClock; use types::{EthSpec, Slot}; -use crate::{Work, WorkEvent}; +use crate::{ReprocessQueueMessage, Work, WorkEvent}; pub struct WorkQueue { min_heap: BinaryHeap>>, @@ -42,68 +42,30 @@ impl QueueItem { let Some(current_time) = slot_clock.now_duration() else { return None; }; + println!("work: {:?}", work); let deadline = match work { Work::GossipAttestation { attestation, .. } => { let attestation_slot = attestation.attestation.data().slot; - - let Some(start_of_attestation_slot) = slot_clock.start_of(attestation_slot) else { - return None; - }; - - let arrival_time_with_buffer = current_time.saturating_add(Duration::from_secs(1)); - let four_seconds_into_slot = - start_of_attestation_slot.saturating_add(Duration::from_secs(4)); - Some(max(four_seconds_into_slot, arrival_time_with_buffer)) + Self::calculate_unaggregated_attestation_deadline(attestation_slot, slot_clock) } Work::GossipAttestationBatch { attestations, .. 
} => { let Some(attestation) = attestations.first() else { return None; }; - let attestation_slot = attestation.attestation.data().slot; - - let Some(start_of_attestation_slot) = slot_clock.start_of(attestation_slot) else { - return None; - }; - - let four_seconds_into_slot = - start_of_attestation_slot.saturating_add(Duration::from_secs(4)); - - let arrival_time_with_buffer = current_time.saturating_add(Duration::from_secs(1)); - Some(max(four_seconds_into_slot, arrival_time_with_buffer)) + Self::calculate_unaggregated_attestation_deadline(attestation_slot, slot_clock) } Work::GossipAggregate { aggregate, .. } => { let attestation_slot = aggregate.aggregate.message().aggregate().data().slot; - let Some(start_of_next_slot) = slot_clock.start_of(attestation_slot + Slot::new(1)) - else { - return None; - }; - let arrival_time_with_buffer = current_time.saturating_add(Duration::from_secs(1)); - - Some(max(start_of_next_slot, arrival_time_with_buffer)) + Self::calculate_aggregate_attestation_deadline(attestation_slot, slot_clock) } - Work::UnknownBlockAggregate { .. } => Some(current_time), Work::GossipAggregateBatch { aggregates, .. } => { let Some(aggregate) = aggregates.first() else { return None; }; let attestation_slot = aggregate.aggregate.message().aggregate().data().slot; - let Some(start_of_next_slot) = slot_clock.start_of(attestation_slot + Slot::new(1)) - else { - return None; - }; - let arrival_time_with_buffer = current_time.saturating_add(Duration::from_secs(1)); - - Some(max(start_of_next_slot, arrival_time_with_buffer)) - } - Work::GossipBlock(_) => Some(current_time), - Work::DelayedImportBlock { .. } => Some(current_time), - Work::GossipVoluntaryExit(_) => { - Some(current_time.saturating_add(Duration::from_secs(4))) - } - Work::UnknownLightClientOptimisticUpdate { .. } => { - Some(current_time.saturating_add(Duration::from_secs(4))) + Self::calculate_aggregate_attestation_deadline(attestation_slot, slot_clock) } Work::UnknownBlockAttestation { .. 
} | Work::UnknownBlockSamplingRequest { .. } @@ -115,39 +77,110 @@ impl QueueItem { | Work::GossipSyncContribution(_) | Work::RpcBlobs { .. } | Work::RpcCustodyColumn { .. } - | Work::RpcVerifyDataColumn { .. } => { + | Work::RpcVerifyDataColumn { .. } + | Work::SamplingResult(_) + | Work::BlocksByRangeRequest(_) + | Work::BlocksByRootsRequest(_) + | Work::BlobsByRangeRequest(_) + | Work::BlobsByRootsRequest(_) + | Work::DataColumnsByRootsRequest(_) + | Work::DataColumnsByRangeRequest(_) + | Work::GossipBlsToExecutionChange(_) => { Some(current_time.saturating_add(Duration::from_secs(1))) } - Work::GossipLightClientFinalityUpdate(..) => { - Some(current_time.saturating_add(Duration::from_secs(4))) + Work::UnknownLightClientOptimisticUpdate { .. } + | Work::GossipVoluntaryExit(_) + | Work::GossipLightClientFinalityUpdate(_) + | Work::GossipLightClientOptimisticUpdate(_) + | Work::Status(_) + | Work::LightClientBootstrapRequest(_) + | Work::LightClientOptimisticUpdateRequest(_) + | Work::LightClientFinalityUpdateRequest(_) + | Work::LightClientUpdatesByRangeRequest(_) + | Work::ApiRequestP0(_) + | Work::ApiRequestP1(_) => Some(current_time.saturating_add(Duration::from_secs(4))), + Work::RpcBlock { .. } + | Work::IgnoredRpcBlock { .. } + | Work::ChainSegment(_) + | Work::ChainSegmentBackfill(_) + | Work::UnknownBlockAggregate { .. } + | Work::GossipBlock(_) + | Work::DelayedImportBlock { .. 
} => Some(current_time), + Work::Reprocess(reprocess_queue_message) => { + Self::calculate_reprocess_deadline(reprocess_queue_message, slot_clock) } - Work::GossipLightClientOptimisticUpdate(fn_once) => todo!(), - Work::RpcBlock { process_fn } => todo!(), - (pin) => todo!(), - (pin) => todo!(), - Work::SamplingResult(pin) => todo!(), - Work::IgnoredRpcBlock { process_fn } => todo!(), - Work::ChainSegment(pin) => todo!(), - Work::ChainSegmentBackfill(pin) => todo!(), - Work::Status(fn_once) => todo!(), - Work::BlocksByRangeRequest(pin) => todo!(), - Work::BlocksByRootsRequest(pin) => todo!(), - Work::BlobsByRangeRequest(fn_once) => todo!(), - Work::BlobsByRootsRequest(fn_once) => todo!(), - Work::DataColumnsByRootsRequest(fn_once) => todo!(), - Work::DataColumnsByRangeRequest(fn_once) => todo!(), - Work::GossipBlsToExecutionChange(fn_once) => todo!(), - Work::LightClientBootstrapRequest(fn_once) => todo!(), - Work::LightClientOptimisticUpdateRequest(fn_once) => todo!(), - Work::LightClientFinalityUpdateRequest(fn_once) => todo!(), - Work::LightClientUpdatesByRangeRequest(fn_once) => todo!(), - Work::ApiRequestP0(blocking_or_async) => todo!(), - Work::ApiRequestP1(blocking_or_async) => todo!(), - Work::Reprocess(reprocess_queue_message) => todo!(), }; + println!("deadline {:?}", deadline); + deadline } + + /// An unaggregated attestation should be scheduled to be processed no later than within four seconds of the start of the current slot + /// or within a second of its arrival time if received later than the four second deadline. 
+ fn calculate_unaggregated_attestation_deadline( + attestation_slot: Slot, + slot_clock: &S, + ) -> Option { + let Some(current_time) = slot_clock.now_duration() else { + return None; + }; + + let Some(start_of_attestation_slot) = slot_clock.start_of(attestation_slot) else { + return None; + }; + + let four_seconds_into_slot = + start_of_attestation_slot.saturating_add(Duration::from_secs(4)); + + let arrival_time_with_buffer = current_time.saturating_add(Duration::from_secs(1)); + Some(max(four_seconds_into_slot, arrival_time_with_buffer)) + } + + /// An aggregation attestation should be scheduled to be processed no later than the start of the next slot + /// or within a second of its arrival time if received later than the start of the next slot. + fn calculate_aggregate_attestation_deadline( + attestation_slot: Slot, + slot_clock: &S, + ) -> Option { + let Some(current_time) = slot_clock.now_duration() else { + return None; + }; + + let Some(start_of_next_slot) = slot_clock.start_of(attestation_slot + Slot::new(1)) else { + return None; + }; + let arrival_time_with_buffer = current_time.saturating_add(Duration::from_secs(1)); + + Some(max(start_of_next_slot, arrival_time_with_buffer)) + } + + fn calculate_reprocess_deadline( + reprocess_queue_message: &ReprocessQueueMessage, + slot_clock: &S, + ) -> Option { + let Some(current_time) = slot_clock.now_duration() else { + return None; + }; + + println!("reprocessing"); + + match reprocess_queue_message { + ReprocessQueueMessage::EarlyBlock(_) + | ReprocessQueueMessage::RpcBlock(_) + | ReprocessQueueMessage::BlockImported { .. } + | ReprocessQueueMessage::UnknownBlockUnaggregate(_) + | ReprocessQueueMessage::UnknownBlockAggregate(_) => Some(current_time), + ReprocessQueueMessage::NewLightClientOptimisticUpdate { .. 
} + | ReprocessQueueMessage::UnknownLightClientOptimisticUpdate(_) + | ReprocessQueueMessage::UnknownBlockSamplingRequest(_) => { + Some(current_time.saturating_add(Duration::from_secs(1))) + } + ReprocessQueueMessage::BackfillSync(_) => { + Some(current_time.saturating_add(Duration::from_secs(4))) + } + } + } } impl std::cmp::Eq for QueueItem {} @@ -160,7 +193,7 @@ impl PartialEq for QueueItem { impl PartialOrd for QueueItem { fn partial_cmp(&self, other: &Self) -> Option { - self.deadline.partial_cmp(&other.deadline) + Some(self.cmp(other)) } } @@ -189,7 +222,7 @@ impl WorkQueue { } } - fn peek(&self) -> Option<&Reverse>> { + fn _peek(&self) -> Option<&Reverse>> { self.min_heap.peek() } @@ -198,7 +231,7 @@ impl WorkQueue { } // TODO do we want an is_full method? should there be a concept of full? - pub fn is_full(&self) -> bool { + pub fn _is_full(&self) -> bool { todo!() } } diff --git a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs index 1882484254d..260bc10d825 100644 --- a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs @@ -1,9 +1,11 @@ -use std::task::Poll; +use std::{sync::Arc, time::Duration}; +use crate::{scheduler::InboundEvents, QueuedBackfillBatch, ReprocessQueueMessage}; use earliest_deadline_queue::{QueueItem, WorkQueue}; -use futures::{Stream, StreamExt}; -use slog::{debug, trace, warn}; +use futures::stream::StreamExt; +use slog::{crit, debug, error, trace, warn}; use slot_clock::SlotClock; +use tokio::sync::mpsc::error::TrySendError; use tokio::sync::mpsc::{self, Sender}; use types::EthSpec; @@ -11,7 +13,11 @@ use crate::{ metrics, BeaconProcessor, Work, WorkEvent, WorkType, MAX_IDLE_QUEUE_LEN, NOTHING_TO_DO, }; -use super::spawn_worker; +use super::{ + spawn_worker, + work_reprocessing_queue::{spawn_reprocess_scheduler, 
ReadyWork}, + worker_journal, InboundEvent, +}; mod earliest_deadline_queue; @@ -23,54 +29,6 @@ pub struct Scheduler { work_queue: WorkQueue, } -struct InboundEvents { - /// Used by workers when they finish a task. - idle_rx: mpsc::Receiver<()>, - /// Used by upstream processes to send new work to the `BeaconProcessor`. - event_rx: mpsc::Receiver>, -} - -/// Unifies all the messages processed by the `BeaconProcessor`. -enum InboundEvent { - /// A worker has completed a task and is free. - WorkerIdle, - /// There is new work to be done. - WorkEvent(WorkEvent), -} - -impl Stream for InboundEvents { - type Item = InboundEvent; - - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { - // Always check for idle workers before anything else. This allows us to ensure that a big - // stream of new events doesn't suppress the processing of existing events. - match self.idle_rx.poll_recv(cx) { - Poll::Ready(Some(())) => { - return Poll::Ready(Some(InboundEvent::WorkerIdle)); - } - Poll::Ready(None) => { - return Poll::Ready(None); - } - Poll::Pending => {} - } - - match self.event_rx.poll_recv(cx) { - Poll::Ready(Some(event)) => { - return Poll::Ready(Some(InboundEvent::WorkEvent(event))); - } - Poll::Ready(None) => { - return Poll::Ready(None); - } - Poll::Pending => {} - } - - Poll::Pending - } -} - impl Scheduler { pub fn new(beacon_processor: BeaconProcessor) -> Self { let work_queue = WorkQueue::new(); @@ -85,12 +43,33 @@ impl Scheduler { event_rx: mpsc::Receiver>, work_journal_tx: Option>, slot_clock: S, + maximum_gossip_clock_disparity: Duration, ) -> Result<(), String> { let (idle_tx, idle_rx) = mpsc::channel::<()>(MAX_IDLE_QUEUE_LEN); + let (ready_work_tx, ready_work_rx) = + mpsc::channel::(self.beacon_processor.config.max_scheduled_work_queue_len); + + let (reprocess_work_tx, reprocess_work_rx) = mpsc::channel::( + self.beacon_processor.config.max_scheduled_work_queue_len, + ); + let executor = 
self.beacon_processor.executor.clone(); - let mut inbound_events = InboundEvents { event_rx, idle_rx }; + let mut inbound_events = InboundEvents { + idle_rx, + event_rx, + ready_work_rx, + }; + + spawn_reprocess_scheduler( + ready_work_tx, + reprocess_work_rx, + &self.beacon_processor.executor, + Arc::new(slot_clock.clone()), + self.beacon_processor.log.clone(), + maximum_gossip_clock_disparity, + )?; let manager_future = async move { loop { @@ -100,7 +79,51 @@ impl Scheduler { self.beacon_processor.current_workers.saturating_sub(1); None } - Some(InboundEvent::WorkEvent(event)) => Some(event), + Some(InboundEvent::WorkEvent(event)) + if self.beacon_processor.config.enable_backfill_rate_limiting => + { + match QueuedBackfillBatch::try_from(event) { + Ok(backfill_batch) => { + match reprocess_work_tx + .try_send(ReprocessQueueMessage::BackfillSync(backfill_batch)) + { + Err(e) => { + warn!( + self.beacon_processor.log, + "Unable to queue backfill work event. Will try to process now."; + "error" => %e + ); + match e { + TrySendError::Full(reprocess_queue_message) + | TrySendError::Closed(reprocess_queue_message) => { + match reprocess_queue_message { + ReprocessQueueMessage::BackfillSync( + backfill_batch, + ) => Some(backfill_batch.into()), + other => { + crit!( + self.beacon_processor.log, + "Unexpected queue message type"; + "message_type" => other.as_ref() + ); + // This is an unhandled exception, drop the message. + continue; + } + } + } + } + } + Ok(..) => { + // backfill work sent to "reprocessing" queue. Process the next event. 
+ continue; + } + } + } + Err(event) => Some(event), + } + } + Some(InboundEvent::WorkEvent(event)) + | Some(InboundEvent::ReprocessingWork(event)) => Some(event), None => { debug!( self.beacon_processor.log, @@ -113,6 +136,8 @@ impl Scheduler { let can_spawn = self.beacon_processor.current_workers < self.beacon_processor.config.max_workers; + + worker_journal(&work_event, &work_journal_tx); let drop_during_sync = work_event .as_ref() .map_or(false, |event| event.drop_during_sync); @@ -122,12 +147,20 @@ impl Scheduler { // We don't check the `work.drop_during_sync` here. We assume that if it made // it into the queue at any point then we should process it. None if can_spawn => { - let work_event = self.earliest_deadline_first_scheduler(&work_journal_tx); + let work_event = self.earliest_deadline_first_scheduler(); if let Some(work_event) = work_event { let work_type = work_event.to_type(); + println!("weve spawned a worker here"); spawn_worker(&mut self.beacon_processor, idle_tx.clone(), work_event); Some(work_type) } else { + // Let the journal know that a worker is freed and there's nothing else + // for it to do. + if let Some(work_journal_tx) = &work_journal_tx { + // We don't care if this message was successfully sent, we only use the journal + // during testing. + let _ = work_journal_tx.try_send(NOTHING_TO_DO); + } None } } @@ -167,12 +200,16 @@ impl Scheduler { } // There is a new work event and the chain is not syncing. Process it or queue // it. - Some(WorkEvent { work, .. }) => self.process_or_queue_work_event( - idle_tx.clone(), - work, - &slot_clock, - can_spawn, - ), + Some(WorkEvent { work, .. }) => { + println!("work: {:?}", work); + self.process_or_queue_work_event( + &reprocess_work_tx, + idle_tx.clone(), + work, + &slot_clock, + can_spawn, + ) + } }; self.update_metrics(modified_queue_id); @@ -201,47 +238,49 @@ impl Scheduler { // TODO check if is_full? 
} - fn earliest_deadline_first_scheduler( - &mut self, - work_journal_tx: &Option>, - ) -> Option> { + fn earliest_deadline_first_scheduler(&mut self) -> Option> { let queue_item = self.work_queue.pop(); if let Some(queue_item) = queue_item { Some(queue_item.work_event.work) } else { - // Let the journal know that a worker is freed and there's nothing else - // for it to do. - if let Some(work_journal_tx) = &work_journal_tx { - // We don't care if this message was successfully sent, we only use the journal - // during testing. - let _ = work_journal_tx.try_send(NOTHING_TO_DO); - } None } } pub fn process_or_queue_work_event( &mut self, + reprocess_work_tx: &Sender, idle_tx: Sender<()>, work: Work, slot_clock: &S, can_spawn: bool, ) -> Option { - let work_id = work.str_id(); - let work_type = work.to_type(); match work { - _ if can_spawn => spawn_worker(&mut self.beacon_processor, idle_tx.clone(), work), + Work::Reprocess(work_event) => { + if let Err(e) = reprocess_work_tx.try_send(work_event) { + error!( + self.beacon_processor.log, + "Failed to reprocess work event"; + "error" => %e + ) + } + } + _ if can_spawn => { + println!("spawning"); + spawn_worker(&mut self.beacon_processor, idle_tx.clone(), work) + } _ => { let Some(queue_item) = QueueItem::new(work, slot_clock) else { return None; }; + println!("queue"); self.work_queue.insert(queue_item); } } - todo!() + Some(work_type) } } diff --git a/beacon_node/beacon_processor/src/scheduler/interface.rs b/beacon_node/beacon_processor/src/scheduler/interface.rs index f9ee82d1deb..8c6911de3d2 100644 --- a/beacon_node/beacon_processor/src/scheduler/interface.rs +++ b/beacon_node/beacon_processor/src/scheduler/interface.rs @@ -36,9 +36,16 @@ impl Scheduler for SchedulerType beacon_state: &BeaconState, spec: &ChainSpec, ) -> Result, String> { - Ok(Box::new(SchedulerType::PriorityScheduler( - priority_scheduler::Scheduler::new(beacon_processor, beacon_state, spec)?, - ))) + match 
beacon_processor.config.beacon_processor_type { + crate::BeaconProcessorType::Priority => Ok(Box::new(SchedulerType::PriorityScheduler( + priority_scheduler::Scheduler::new(beacon_processor, beacon_state, spec)?, + ))), + crate::BeaconProcessorType::EarliestDeadline => { + Ok(Box::new(SchedulerType::EarliestDeadlineScheduler( + earliest_deadline_scheduler::Scheduler::new(beacon_processor), + ))) + } + } } fn run( @@ -55,9 +62,12 @@ impl Scheduler for SchedulerType slot_clock, maximum_gossip_clock_disparity, ), - SchedulerType::EarliestDeadlineScheduler(scheduler) => { - scheduler.run(event_rx, work_journal_tx, slot_clock) - } + SchedulerType::EarliestDeadlineScheduler(scheduler) => scheduler.run( + event_rx, + work_journal_tx, + slot_clock, + maximum_gossip_clock_disparity, + ), } } } diff --git a/beacon_node/beacon_processor/src/scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/mod.rs index cb2a17fb787..25827d407cc 100644 --- a/beacon_node/beacon_processor/src/scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/mod.rs @@ -1,13 +1,85 @@ +use std::task::Poll; + +use futures::stream::Stream; use slog::trace; +use std::pin::Pin; +use std::task::Context; +use tokio::sync::mpsc; use tokio::sync::mpsc::Sender; use types::EthSpec; +use work_reprocessing_queue::ReadyWork; -use crate::metrics; +use crate::{metrics, WorkEvent, WORKER_FREED}; use crate::{BeaconProcessor, BlockingOrAsync, SendOnDrop, TaskSpawner, Work}; mod earliest_deadline_scheduler; pub mod interface; mod priority_scheduler; +pub mod work_reprocessing_queue; + +/// Unifies all the messages processed by the `BeaconProcessor`. +enum InboundEvent { + /// A worker has completed a task and is free. + WorkerIdle, + /// There is new work to be done. + WorkEvent(WorkEvent), + /// A work event that was queued for re-processing has become ready. + ReprocessingWork(WorkEvent), +} + +/// Combines the various incoming event streams for the `BeaconProcessor` into a single stream. 
+/// +/// This struct has a similar purpose to `tokio::select!`, however it allows for more fine-grained +/// control (specifically in the ordering of event processing). +struct InboundEvents { + /// Used by workers when they finish a task. + idle_rx: mpsc::Receiver<()>, + /// Used by upstream processes to send new work to the `BeaconProcessor`. + event_rx: mpsc::Receiver>, + ready_work_rx: mpsc::Receiver, +} + +impl Stream for InboundEvents { + type Item = InboundEvent; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + // Always check for idle workers before anything else. This allows us to ensure that a big + // stream of new events doesn't suppress the processing of existing events. + match self.idle_rx.poll_recv(cx) { + Poll::Ready(Some(())) => { + return Poll::Ready(Some(InboundEvent::WorkerIdle)); + } + Poll::Ready(None) => { + return Poll::Ready(None); + } + Poll::Pending => {} + } + + // Poll for delayed blocks before polling for new work. It might be the case that a delayed + // block is required to successfully process some new work. + match self.ready_work_rx.poll_recv(cx) { + Poll::Ready(Some(ready_work)) => { + return Poll::Ready(Some(InboundEvent::ReprocessingWork(ready_work.into()))); + } + Poll::Ready(None) => { + return Poll::Ready(None); + } + Poll::Pending => {} + } + + match self.event_rx.poll_recv(cx) { + Poll::Ready(Some(event)) => { + return Poll::Ready(Some(InboundEvent::WorkEvent(event))); + } + Poll::Ready(None) => { + return Poll::Ready(None); + } + Poll::Pending => {} + } + + Poll::Pending + } +} /// Spawns a blocking worker thread to process some `Work`. 
/// @@ -52,6 +124,8 @@ pub fn spawn_worker( send_idle_on_drop, }; + println!("spawing work {:?}", work); + match work { Work::GossipAttestation { attestation, @@ -131,6 +205,24 @@ pub fn spawn_worker( | Work::LightClientUpdatesByRangeRequest(process_fn) => { task_spawner.spawn_blocking(process_fn) } - Work::Reprocess(_) => (), + Work::Reprocess(_) => {} }; } + +pub fn worker_journal( + work_event: &Option>, + work_journal_tx: &Option>, +) { + if let Some(work_journal_tx) = work_journal_tx { + let id = work_event + .as_ref() + .map(|event| event.work.str_id()) + .unwrap_or(WORKER_FREED); + + // We don't care if this message was successfully sent, we only use the journal + // during testing. We also ignore reprocess messages to ensure our test cases can pass. + if id != "reprocess" { + let _ = work_journal_tx.try_send(id); + } + } +} diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs index 849bdb1302d..1c7330dd6e3 100644 --- a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs @@ -4,91 +4,24 @@ // 3. 
A retry queue mod work_queue; -mod work_reprocessing_queue; -use futures::stream::{Stream, StreamExt}; -use futures::task::Poll; +use crate::scheduler::work_reprocessing_queue::{spawn_reprocess_scheduler, ReadyWork}; +use crate::scheduler::InboundEvents; +use futures::stream::StreamExt; use slog::error; use slog::{crit, debug, trace, warn}; use slot_clock::SlotClock; -use std::pin::Pin; -use std::task::Context; use std::{cmp, marker::PhantomData, sync::Arc, time::Duration}; use tokio::sync::mpsc::{self, error::TrySendError, Sender}; use types::{BeaconState, ChainSpec, EthSpec}; use work_queue::{BeaconProcessorQueueLengths, WorkQueues}; -use work_reprocessing_queue::{spawn_reprocess_scheduler, ReadyWork}; use crate::{ metrics, BeaconProcessor, QueuedBackfillBatch, ReprocessQueueMessage, Work, WorkEvent, - WorkType, MAX_IDLE_QUEUE_LEN, NOTHING_TO_DO, WORKER_FREED, + WorkType, MAX_IDLE_QUEUE_LEN, NOTHING_TO_DO, }; -use super::spawn_worker; - -/// Unifies all the messages processed by the `BeaconProcessor`. -enum InboundEvent { - /// A worker has completed a task and is free. - WorkerIdle, - /// There is new work to be done. - WorkEvent(WorkEvent), - /// A work event that was queued for re-processing has become ready. - ReprocessingWork(WorkEvent), -} - -/// Combines the various incoming event streams for the `BeaconProcessor` into a single stream. -/// -/// This struct has a similar purpose to `tokio::select!`, however it allows for more fine-grained -/// control (specifically in the ordering of event processing). -struct InboundEvents { - /// Used by workers when they finish a task. - idle_rx: mpsc::Receiver<()>, - /// Used by upstream processes to send new work to the `BeaconProcessor`. - event_rx: mpsc::Receiver>, - ready_work_rx: mpsc::Receiver, -} - -impl Stream for InboundEvents { - type Item = InboundEvent; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - // Always check for idle workers before anything else. 
This allows us to ensure that a big - // stream of new events doesn't suppress the processing of existing events. - match self.idle_rx.poll_recv(cx) { - Poll::Ready(Some(())) => { - return Poll::Ready(Some(InboundEvent::WorkerIdle)); - } - Poll::Ready(None) => { - return Poll::Ready(None); - } - Poll::Pending => {} - } - - // Poll for delayed blocks before polling for new work. It might be the case that a delayed - // block is required to successfully process some new work. - match self.ready_work_rx.poll_recv(cx) { - Poll::Ready(Some(ready_work)) => { - return Poll::Ready(Some(InboundEvent::ReprocessingWork(ready_work.into()))); - } - Poll::Ready(None) => { - return Poll::Ready(None); - } - Poll::Pending => {} - } - - match self.event_rx.poll_recv(cx) { - Poll::Ready(Some(event)) => { - return Poll::Ready(Some(InboundEvent::WorkEvent(event))); - } - Poll::Ready(None) => { - return Poll::Ready(None); - } - Poll::Pending => {} - } - - Poll::Pending - } -} +use super::{spawn_worker, worker_journal, InboundEvent}; /// The name of the manager tokio task. const MANAGER_TASK_NAME: &str = "priority_scheduler"; @@ -217,7 +150,7 @@ impl Scheduler { }; let _event_timer = self.increment_metrics(&work_event); - self.worker_journal(&work_event, &work_journal_tx); + worker_journal(&work_event, &work_journal_tx); let can_spawn = self.beacon_processor.current_workers < self.beacon_processor.config.max_workers; @@ -840,26 +773,6 @@ impl Scheduler { } } - // TODO(beacon-processor) this can live outside of this struct in a more general location - fn worker_journal( - &self, - work_event: &Option>, - work_journal_tx: &Option>, - ) { - if let Some(work_journal_tx) = work_journal_tx { - let id = work_event - .as_ref() - .map(|event| event.work.str_id()) - .unwrap_or(WORKER_FREED); - - // We don't care if this message was successfully sent, we only use the journal - // during testing. We also ignore reprocess messages to ensure our test cases can pass. 
- if id != "reprocess" { - let _ = work_journal_tx.try_send(id); - } - } - } - // TODO(beacon-processor) this can live outside of this struct in a more general location fn increment_metrics( &self, diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_queue.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_queue.rs index f7b93199d4e..9053e1c48de 100644 --- a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_queue.rs @@ -10,6 +10,11 @@ use types::{BeaconState, ChainSpec, EthSpec, RelativeEpoch}; /// slightly, we don't need to adjust the queues during the lifetime of a process. const ACTIVE_VALIDATOR_COUNT_OVERPROVISION_PERCENT: usize = 110; +/// Minimum size of dynamically sized queues. Due to integer division we don't want 0 length queues +/// as the processor won't process that message type. 128 is an arbitrary value value >= 1 that +/// seems reasonable. +const MIN_QUEUE_LEN: usize = 128; + /// A simple first-in-first-out queue with a maximum length. 
pub struct FifoQueue { queue: VecDeque, @@ -153,13 +158,22 @@ impl BeaconProcessorQueueLengths { (ACTIVE_VALIDATOR_COUNT_OVERPROVISION_PERCENT * active_validator_count) / 100; let slots_per_epoch = E::slots_per_epoch() as usize; + println!("activate val count {}", active_validator_count); + println!("slots per epoch {}", slots_per_epoch); + Ok(Self { aggregate_queue: 4096, unknown_block_aggregate_queue: 1024, // Capacity for a full slot's worth of attestations if subscribed to all subnets - attestation_queue: active_validator_count / slots_per_epoch, + attestation_queue: std::cmp::max( + active_validator_count / slots_per_epoch, + MIN_QUEUE_LEN, + ), // Capacity for a full slot's worth of attestations if subscribed to all subnets - unknown_block_attestation_queue: active_validator_count / slots_per_epoch, + unknown_block_attestation_queue: std::cmp::max( + active_validator_count / slots_per_epoch, + MIN_QUEUE_LEN, + ), sync_message_queue: 2048, sync_contribution_queue: 1024, gossip_voluntary_exit_queue: 4096, @@ -357,11 +371,6 @@ impl WorkQueues { #[cfg(test)] mod tests { - /// Minimum size of dynamically sized queues. Due to integer division we don't want 0 length queues - /// as the processor won't process that message type. 128 is an arbitrary value value >= 1 that - /// seems reasonable. 
- const MIN_QUEUE_LEN: usize = 128; - use super::*; use types::{BeaconState, ChainSpec, Eth1Data, ForkName, MainnetEthSpec}; diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs similarity index 100% rename from beacon_node/beacon_processor/src/scheduler/priority_scheduler/work_reprocessing_queue.rs rename to beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs diff --git a/lighthouse/tests/beacon_node.rs b/lighthouse/tests/beacon_node.rs index ac7ddcdbd98..ce10b9e171f 100644 --- a/lighthouse/tests/beacon_node.rs +++ b/lighthouse/tests/beacon_node.rs @@ -7,6 +7,7 @@ use beacon_node::beacon_chain::chain_config::{ }; use beacon_node::beacon_chain::graffiti_calculator::GraffitiOrigin; use beacon_processor::BeaconProcessorConfig; +use beacon_processor::BeaconProcessorType; use eth1::Eth1Endpoint; use lighthouse_network::PeerId; use lighthouse_version; @@ -2579,7 +2580,8 @@ fn beacon_processor() { max_scheduled_work_queue_len: 3, max_gossip_attestation_batch_size: 4, max_gossip_aggregate_batch_size: 5, - enable_backfill_rate_limiting: false + enable_backfill_rate_limiting: false, + beacon_processor_type: BeaconProcessorType::EarliestDeadline, } ) }); From e2bf025cd7040cffeb27bff89796f724f34bfb2a Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Thu, 31 Oct 2024 19:08:12 -0700 Subject: [PATCH 12/16] get next work event in a separate fn --- .../earliest_deadline_scheduler/mod.rs | 74 +++------------- .../beacon_processor/src/scheduler/mod.rs | 84 ++++++++++++++++++- .../src/scheduler/priority_scheduler/mod.rs | 77 +++-------------- 3 files changed, 104 insertions(+), 131 deletions(-) diff --git a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs index 260bc10d825..fd567771650 100644 --- 
a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs @@ -1,11 +1,9 @@ use std::{sync::Arc, time::Duration}; -use crate::{scheduler::InboundEvents, QueuedBackfillBatch, ReprocessQueueMessage}; +use crate::{scheduler::InboundEvents, ReprocessQueueMessage}; use earliest_deadline_queue::{QueueItem, WorkQueue}; -use futures::stream::StreamExt; -use slog::{crit, debug, error, trace, warn}; +use slog::{error, trace, warn}; use slot_clock::SlotClock; -use tokio::sync::mpsc::error::TrySendError; use tokio::sync::mpsc::{self, Sender}; use types::EthSpec; @@ -16,7 +14,7 @@ use crate::{ use super::{ spawn_worker, work_reprocessing_queue::{spawn_reprocess_scheduler, ReadyWork}, - worker_journal, InboundEvent, + worker_journal, NextWorkEvent, }; mod earliest_deadline_queue; @@ -73,65 +71,13 @@ impl Scheduler { let manager_future = async move { loop { - let work_event = match inbound_events.next().await { - Some(InboundEvent::WorkerIdle) => { - self.beacon_processor.current_workers = - self.beacon_processor.current_workers.saturating_sub(1); - None - } - Some(InboundEvent::WorkEvent(event)) - if self.beacon_processor.config.enable_backfill_rate_limiting => - { - match QueuedBackfillBatch::try_from(event) { - Ok(backfill_batch) => { - match reprocess_work_tx - .try_send(ReprocessQueueMessage::BackfillSync(backfill_batch)) - { - Err(e) => { - warn!( - self.beacon_processor.log, - "Unable to queue backfill work event. 
Will try to process now."; - "error" => %e - ); - match e { - TrySendError::Full(reprocess_queue_message) - | TrySendError::Closed(reprocess_queue_message) => { - match reprocess_queue_message { - ReprocessQueueMessage::BackfillSync( - backfill_batch, - ) => Some(backfill_batch.into()), - other => { - crit!( - self.beacon_processor.log, - "Unexpected queue message type"; - "message_type" => other.as_ref() - ); - // This is an unhandled exception, drop the message. - continue; - } - } - } - } - } - Ok(..) => { - // backfill work sent to "reprocessing" queue. Process the next event. - continue; - } - } - } - Err(event) => Some(event), - } - } - Some(InboundEvent::WorkEvent(event)) - | Some(InboundEvent::ReprocessingWork(event)) => Some(event), - None => { - debug!( - self.beacon_processor.log, - "Gossip processor stopped"; - "msg" => "stream ended" - ); - break; - } + let work_event = match inbound_events + .next_work_event(&reprocess_work_tx, &mut self.beacon_processor) + .await + { + NextWorkEvent::WorkEvent(work_event) => work_event, + NextWorkEvent::Continue => continue, + NextWorkEvent::Break => break, }; let can_spawn = self.beacon_processor.current_workers diff --git a/beacon_node/beacon_processor/src/scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/mod.rs index 25827d407cc..391ac0b25f8 100644 --- a/beacon_node/beacon_processor/src/scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/mod.rs @@ -1,15 +1,17 @@ use std::task::Poll; use futures::stream::Stream; -use slog::trace; +use futures::StreamExt; +use slog::{crit, debug, trace, warn}; use std::pin::Pin; use std::task::Context; use tokio::sync::mpsc; +use tokio::sync::mpsc::error::TrySendError; use tokio::sync::mpsc::Sender; use types::EthSpec; use work_reprocessing_queue::ReadyWork; -use crate::{metrics, WorkEvent, WORKER_FREED}; +use crate::{metrics, QueuedBackfillBatch, ReprocessQueueMessage, WorkEvent, WORKER_FREED}; use crate::{BeaconProcessor, BlockingOrAsync, SendOnDrop, 
TaskSpawner, Work}; mod earliest_deadline_scheduler; @@ -81,6 +83,84 @@ impl Stream for InboundEvents { } } +pub enum NextWorkEvent { + WorkEvent(Option>), + Continue, + Break, +} + +impl InboundEvents { + pub async fn next_work_event( + &mut self, + reprocess_work_tx: &Sender, + beacon_processor: &mut BeaconProcessor, + ) -> NextWorkEvent { + match self.next().await { + Some(InboundEvent::WorkerIdle) => { + beacon_processor.current_workers = + beacon_processor.current_workers.saturating_sub(1); + NextWorkEvent::WorkEvent(None) + } + Some(InboundEvent::WorkEvent(event)) + if beacon_processor.config.enable_backfill_rate_limiting => + { + match QueuedBackfillBatch::try_from(event) { + Ok(backfill_batch) => { + match reprocess_work_tx + .try_send(ReprocessQueueMessage::BackfillSync(backfill_batch)) + { + Err(e) => { + warn!( + beacon_processor.log, + "Unable to queue backfill work event. Will try to process now."; + "error" => %e + ); + match e { + TrySendError::Full(reprocess_queue_message) + | TrySendError::Closed(reprocess_queue_message) => { + match reprocess_queue_message { + ReprocessQueueMessage::BackfillSync(backfill_batch) => { + NextWorkEvent::WorkEvent(Some( + backfill_batch.into(), + )) + } + other => { + crit!( + beacon_processor.log, + "Unexpected queue message type"; + "message_type" => other.as_ref() + ); + // This is an unhandled exception, drop the message. + NextWorkEvent::Continue + } + } + } + } + } + Ok(..) => { + // backfill work sent to "reprocessing" queue. Process the next event. + NextWorkEvent::Continue + } + } + } + Err(event) => NextWorkEvent::WorkEvent(Some(event)), + } + } + Some(InboundEvent::WorkEvent(event)) | Some(InboundEvent::ReprocessingWork(event)) => { + NextWorkEvent::WorkEvent(Some(event)) + } + None => { + debug!( + beacon_processor.log, + "Gossip processor stopped"; + "msg" => "stream ended" + ); + NextWorkEvent::Break + } + } + } +} + /// Spawns a blocking worker thread to process some `Work`. 
/// /// Sends an message on `idle_tx` when the work is complete and the task is stopping. diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs index 1c7330dd6e3..92764658985 100644 --- a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs @@ -7,21 +7,20 @@ mod work_queue; use crate::scheduler::work_reprocessing_queue::{spawn_reprocess_scheduler, ReadyWork}; use crate::scheduler::InboundEvents; -use futures::stream::StreamExt; use slog::error; -use slog::{crit, debug, trace, warn}; +use slog::{crit, trace, warn}; use slot_clock::SlotClock; use std::{cmp, marker::PhantomData, sync::Arc, time::Duration}; -use tokio::sync::mpsc::{self, error::TrySendError, Sender}; +use tokio::sync::mpsc::{self, Sender}; use types::{BeaconState, ChainSpec, EthSpec}; use work_queue::{BeaconProcessorQueueLengths, WorkQueues}; use crate::{ - metrics, BeaconProcessor, QueuedBackfillBatch, ReprocessQueueMessage, Work, WorkEvent, - WorkType, MAX_IDLE_QUEUE_LEN, NOTHING_TO_DO, + metrics, BeaconProcessor, ReprocessQueueMessage, Work, WorkEvent, WorkType, MAX_IDLE_QUEUE_LEN, + NOTHING_TO_DO, }; -use super::{spawn_worker, worker_journal, InboundEvent}; +use super::{spawn_worker, worker_journal, NextWorkEvent}; /// The name of the manager tokio task. 
const MANAGER_TASK_NAME: &str = "priority_scheduler"; @@ -88,65 +87,13 @@ impl Scheduler { let executor = self.beacon_processor.executor.clone(); let manager_future = async move { loop { - let work_event = match inbound_events.next().await { - Some(InboundEvent::WorkerIdle) => { - self.beacon_processor.current_workers = - self.beacon_processor.current_workers.saturating_sub(1); - None - } - Some(InboundEvent::WorkEvent(event)) - if self.beacon_processor.config.enable_backfill_rate_limiting => - { - match QueuedBackfillBatch::try_from(event) { - Ok(backfill_batch) => { - match reprocess_work_tx - .try_send(ReprocessQueueMessage::BackfillSync(backfill_batch)) - { - Err(e) => { - warn!( - self.beacon_processor.log, - "Unable to queue backfill work event. Will try to process now."; - "error" => %e - ); - match e { - TrySendError::Full(reprocess_queue_message) - | TrySendError::Closed(reprocess_queue_message) => { - match reprocess_queue_message { - ReprocessQueueMessage::BackfillSync( - backfill_batch, - ) => Some(backfill_batch.into()), - other => { - crit!( - self.beacon_processor.log, - "Unexpected queue message type"; - "message_type" => other.as_ref() - ); - // This is an unhandled exception, drop the message. - continue; - } - } - } - } - } - Ok(..) => { - // backfill work sent to "reprocessing" queue. Process the next event. 
- continue; - } - } - } - Err(event) => Some(event), - } - } - Some(InboundEvent::WorkEvent(event)) - | Some(InboundEvent::ReprocessingWork(event)) => Some(event), - None => { - debug!( - self.beacon_processor.log, - "Gossip processor stopped"; - "msg" => "stream ended" - ); - break; - } + let work_event = match inbound_events + .next_work_event(&reprocess_work_tx, &mut self.beacon_processor) + .await + { + NextWorkEvent::WorkEvent(work_event) => work_event, + NextWorkEvent::Continue => continue, + NextWorkEvent::Break => break, }; let _event_timer = self.increment_metrics(&work_event); From 949b6e2629858200fe83087877e53980bff91966 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Thu, 21 Nov 2024 12:32:44 +0700 Subject: [PATCH 13/16] add priority work logic --- beacon_node/beacon_chain/src/beacon_chain.rs | 10 +++ beacon_node/beacon_processor/src/lib.rs | 17 +++++ .../earliest_deadline_queue.rs | 12 ++-- .../earliest_deadline_scheduler/mod.rs | 72 +++++++++---------- .../beacon_processor/src/scheduler/mod.rs | 2 + .../src/scheduler/priority_scheduler/mod.rs | 28 +++++--- beacon_node/http_api/src/test_utils.rs | 8 ++- 7 files changed, 92 insertions(+), 57 deletions(-) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index f8dfbc55155..d5dc3a862a3 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -7048,6 +7048,16 @@ impl BeaconChain { reqresp_pre_import_cache_len: self.reqresp_pre_import_cache.read().len(), } } + + /// Returns true if `parent_root` is equal to either the cached head snapshot's block root + /// or, in the case of a one slot re-org, is equal to the cached head snapshot's parent + /// block root. 
+ pub fn is_canonical(&self, parent_root: Hash256) -> bool { + let head_snapshot = self.canonical_head.cached_head().snapshot; + + head_snapshot.beacon_block_root == parent_root + || head_snapshot.beacon_block.parent_root() == parent_root + } } impl Drop for BeaconChain { diff --git a/beacon_node/beacon_processor/src/lib.rs b/beacon_node/beacon_processor/src/lib.rs index e444ec425f9..f004eff7285 100644 --- a/beacon_node/beacon_processor/src/lib.rs +++ b/beacon_node/beacon_processor/src/lib.rs @@ -245,6 +245,8 @@ pub enum WorkType { ApiRequestP0, ApiRequestP1, Reprocess, + GossipCanonicalBlock, + RpcCanonicalBlock, } impl Work { @@ -305,6 +307,8 @@ impl Work { Work::ApiRequestP0 { .. } => WorkType::ApiRequestP0, Work::ApiRequestP1 { .. } => WorkType::ApiRequestP1, Work::Reprocess { .. } => WorkType::Reprocess, + Work::RpcCanonicalBlock { .. } => WorkType::RpcCanonicalBlock, + Work::GossipCanonicalBlock(_) => WorkType::GossipCanonicalBlock, } } } @@ -560,6 +564,10 @@ pub enum Work { ApiRequestP0(BlockingOrAsync), ApiRequestP1(BlockingOrAsync), Reprocess(ReprocessQueueMessage), + GossipCanonicalBlock(AsyncFn), + RpcCanonicalBlock { + process_fn: AsyncFn, + }, } impl fmt::Debug for Work { @@ -568,6 +576,15 @@ impl fmt::Debug for Work { } } +impl Work { + pub fn is_priority_work(&self) -> bool { + matches!( + self, + Work::GossipCanonicalBlock(..) | Work::RpcCanonicalBlock { .. 
} + ) + } +} + /// A mutli-threaded processor for messages received on the network /// that need to be processed by the `BeaconChain` /// diff --git a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs index 045bb976125..8a6f66027c8 100644 --- a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs @@ -21,16 +21,11 @@ pub struct QueueItem { } impl QueueItem { - pub fn new(work: Work, slot_clock: &S) -> Option { - let Some(deadline) = QueueItem::calculate_deadline(&work, slot_clock) else { + pub fn new(work_event: WorkEvent, slot_clock: &S) -> Option { + let Some(deadline) = QueueItem::calculate_deadline(&work_event.work, slot_clock) else { return None; }; - let work_event = WorkEvent { - drop_during_sync: false, - work, - }; - Some(Self { work_event, deadline, @@ -109,6 +104,9 @@ impl QueueItem { Work::Reprocess(reprocess_queue_message) => { Self::calculate_reprocess_deadline(reprocess_queue_message, slot_clock) } + Work::GossipCanonicalBlock(_) | Work::RpcCanonicalBlock { .. } => { + Some(Duration::from_secs(0)) + } }; println!("deadline {:?}", deadline); diff --git a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs index fd567771650..4e5035027b6 100644 --- a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs @@ -93,12 +93,13 @@ impl Scheduler { // We don't check the `work.drop_during_sync` here. We assume that if it made // it into the queue at any point then we should process it. 
None if can_spawn => { - let work_event = self.earliest_deadline_first_scheduler(); - if let Some(work_event) = work_event { - let work_type = work_event.to_type(); - println!("weve spawned a worker here"); - spawn_worker(&mut self.beacon_processor, idle_tx.clone(), work_event); - Some(work_type) + if let Some(queue_item) = self.work_queue.pop() { + self.process_or_queue_item( + &reprocess_work_tx, + &idle_tx, + queue_item, + can_spawn, + ) } else { // Let the journal know that a worker is freed and there's nothing else // for it to do. @@ -146,15 +147,17 @@ impl Scheduler { } // There is a new work event and the chain is not syncing. Process it or queue // it. - Some(WorkEvent { work, .. }) => { - println!("work: {:?}", work); - self.process_or_queue_work_event( - &reprocess_work_tx, - idle_tx.clone(), - work, - &slot_clock, - can_spawn, - ) + Some(work_event) => { + if let Some(queue_item) = QueueItem::new(work_event, &slot_clock) { + self.process_or_queue_item( + &reprocess_work_tx, + &idle_tx, + queue_item, + can_spawn, + ) + } else { + None + } } }; @@ -184,27 +187,19 @@ impl Scheduler { // TODO check if is_full? 
} - fn earliest_deadline_first_scheduler(&mut self) -> Option> { - let queue_item = self.work_queue.pop(); - - if let Some(queue_item) = queue_item { - Some(queue_item.work_event.work) - } else { - None - } - } - - pub fn process_or_queue_work_event( + pub fn process_or_queue_item( &mut self, reprocess_work_tx: &Sender, - idle_tx: Sender<()>, - work: Work, - slot_clock: &S, + idle_tx: &Sender<()>, + queue_item: QueueItem, can_spawn: bool, ) -> Option { - let work_type = work.to_type(); + let work_type = queue_item.work_event.work_type(); + + let workers_available = + self.beacon_processor.config.max_workers - self.beacon_processor.current_workers; - match work { + match queue_item.work_event.work { Work::Reprocess(work_event) => { if let Err(e) = reprocess_work_tx.try_send(work_event) { error!( @@ -215,14 +210,17 @@ impl Scheduler { } } _ if can_spawn => { - println!("spawning"); - spawn_worker(&mut self.beacon_processor, idle_tx.clone(), work) + if queue_item.work_event.work.is_priority_work() || workers_available > 1 { + spawn_worker( + &mut self.beacon_processor, + idle_tx.clone(), + queue_item.work_event.work, + ) + } else { + self.work_queue.insert(queue_item); + } } _ => { - let Some(queue_item) = QueueItem::new(work, slot_clock) else { - return None; - }; - println!("queue"); self.work_queue.insert(queue_item); } } diff --git a/beacon_node/beacon_processor/src/scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/mod.rs index 391ac0b25f8..85f023e40ed 100644 --- a/beacon_node/beacon_processor/src/scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/mod.rs @@ -248,12 +248,14 @@ pub fn spawn_worker( process_fn, } => task_spawner.spawn_async(process_fn), Work::RpcBlock { process_fn } + | Work::RpcCanonicalBlock { process_fn } | Work::RpcBlobs { process_fn } | Work::RpcCustodyColumn(process_fn) | Work::RpcVerifyDataColumn(process_fn) | Work::SamplingResult(process_fn) => task_spawner.spawn_async(process_fn), Work::IgnoredRpcBlock { process_fn 
} => task_spawner.spawn_blocking(process_fn), Work::GossipBlock(work) + | Work::GossipCanonicalBlock(work) | Work::GossipBlobSidecar(work) | Work::GossipDataColumnSidecar(work) => task_spawner.spawn_async(async move { work.await; diff --git a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs index 92764658985..7b1d3b709b1 100644 --- a/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/priority_scheduler/mod.rs @@ -447,11 +447,10 @@ impl Scheduler { "Unsupported inbound event"; "type" => "GossipAggregateBatch" ), - Work::GossipBlock { .. } => { - self.work_queues - .gossip_block_queue - .push(work, work_id, &self.beacon_processor.log) - } + Work::GossipBlock { .. } | Work::GossipCanonicalBlock { .. } => self + .work_queues + .gossip_block_queue + .push(work, work_id, &self.beacon_processor.log), Work::GossipBlobSidecar { .. } => { self.work_queues .gossip_blob_queue @@ -492,10 +491,13 @@ impl Scheduler { .work_queues .optimistic_update_queue .push(work, work_id, &self.beacon_processor.log), - Work::RpcBlock { .. } | Work::IgnoredRpcBlock { .. } => self - .work_queues - .rpc_block_queue - .push(work, work_id, &self.beacon_processor.log), + Work::RpcBlock { .. } + | Work::IgnoredRpcBlock { .. } + | Work::RpcCanonicalBlock { .. } => { + self.work_queues + .rpc_block_queue + .push(work, work_id, &self.beacon_processor.log) + } Work::RpcBlobs { .. 
} => { self.work_queues .rpc_blob_queue @@ -634,7 +636,9 @@ impl Scheduler { self.work_queues.unknown_block_sampling_request_queue.len() } WorkType::GossipAggregateBatch => 0, // No queue - WorkType::GossipBlock => self.work_queues.gossip_block_queue.len(), + WorkType::GossipBlock | WorkType::GossipCanonicalBlock => { + self.work_queues.gossip_block_queue.len() + } WorkType::GossipBlobSidecar => self.work_queues.gossip_blob_queue.len(), WorkType::GossipDataColumnSidecar => { self.work_queues.gossip_data_column_queue.len() @@ -655,7 +659,9 @@ impl Scheduler { WorkType::GossipLightClientOptimisticUpdate => { self.work_queues.optimistic_update_queue.len() } - WorkType::RpcBlock => self.work_queues.rpc_block_queue.len(), + WorkType::RpcBlock | WorkType::RpcCanonicalBlock => { + self.work_queues.rpc_block_queue.len() + } WorkType::RpcBlobs | WorkType::IgnoredRpcBlock => { self.work_queues.rpc_blob_queue.len() } diff --git a/beacon_node/http_api/src/test_utils.rs b/beacon_node/http_api/src/test_utils.rs index f0f2108bdbb..d26ffed5ee4 100644 --- a/beacon_node/http_api/src/test_utils.rs +++ b/beacon_node/http_api/src/test_utils.rs @@ -194,8 +194,12 @@ pub async fn create_api_server_with_config( // The number of workers must be greater than one. Tests which use the // builder workflow sometimes require an internal HTTP request in order // to fulfill an already in-flight HTTP request, therefore having only - // one worker will result in a deadlock. - max_workers: 2, + // one worker will result in a deadlock. Since the introduction of the + // earliest deadline priority queue, the beacon processor now requires more + // than one worker to be available for "non-priority" work events. 
Keeping the + // number of workers to a value greater than two prevents test failures due to + // timeouts + max_workers: 3, ..BeaconProcessorConfig::default() }; let BeaconProcessorChannels { From 7741e08e99a10ae04d9d60d3277e59c4be92f388 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Wed, 27 Nov 2024 23:31:46 +0700 Subject: [PATCH 14/16] make things a bit more generic --- .../earliest_deadline_queue.rs | 12 ++++++------ .../src/scheduler/earliest_deadline_scheduler/mod.rs | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs index 8a6f66027c8..4609f281957 100644 --- a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/earliest_deadline_queue.rs @@ -10,8 +10,8 @@ use types::{EthSpec, Slot}; use crate::{ReprocessQueueMessage, Work, WorkEvent}; -pub struct WorkQueue { - min_heap: BinaryHeap>>, +pub struct WorkQueue { + min_heap: BinaryHeap>, } pub struct QueueItem { @@ -201,18 +201,18 @@ impl Ord for QueueItem { } } -impl WorkQueue { +impl WorkQueue { pub fn new() -> Self { WorkQueue { min_heap: BinaryHeap::new(), } } - pub fn insert(&mut self, queue_item: QueueItem) { + pub fn insert(&mut self, queue_item: Q) { self.min_heap.push(Reverse(queue_item)) } - pub fn pop(&mut self) -> Option> { + pub fn pop(&mut self) -> Option { if let Some(queue_item) = self.min_heap.pop() { Some(queue_item.0) } else { @@ -220,7 +220,7 @@ impl WorkQueue { } } - fn _peek(&self) -> Option<&Reverse>> { + fn _peek(&self) -> Option<&Reverse> { self.min_heap.peek() } diff --git a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs index 
4e5035027b6..bb28e577602 100644 --- a/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs +++ b/beacon_node/beacon_processor/src/scheduler/earliest_deadline_scheduler/mod.rs @@ -24,7 +24,7 @@ const MANAGER_TASK_NAME: &str = "earliest_deadline_first_scheduler"; pub struct Scheduler { beacon_processor: BeaconProcessor, - work_queue: WorkQueue, + work_queue: WorkQueue>, } impl Scheduler { From 7259efbe56810e53014b16963bd3036be8f7da48 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Sat, 5 Apr 2025 12:44:16 -0700 Subject: [PATCH 15/16] clean up merge --- beacon_node/network/src/service/tests.rs | 73 +----------------------- 1 file changed, 1 insertion(+), 72 deletions(-) diff --git a/beacon_node/network/src/service/tests.rs b/beacon_node/network/src/service/tests.rs index 00aa609ee0a..c882c0ebcd3 100644 --- a/beacon_node/network/src/service/tests.rs +++ b/beacon_node/network/src/service/tests.rs @@ -120,78 +120,7 @@ fn test_removing_topic_weight_on_old_topics() { Arc::downgrade(&runtime), exit, shutdown_tx, - "test-removing-topic-weight-on-old-topics".to_string(), - ); - - let mut config = NetworkConfig::default(); - config.set_ipv4_listening_address(std::net::Ipv4Addr::UNSPECIFIED, 21214, 21214, 21215); - config.discv5_config.table_filter = |_| true; // Do not ignore local IPs - config.upnp_enabled = false; - let config = Arc::new(config); - - let BeaconProcessorChannels { - beacon_processor_tx, - beacon_processor_rx: _beacon_processor_rx, - } = <_>::default(); - - let _network_service = NetworkService::start( - beacon_chain.clone(), - config, - executor, - None, - beacon_processor_tx, - ) - .await - .unwrap(); - drop(signal); - }); - - let raw_runtime = Arc::try_unwrap(runtime).unwrap(); - raw_runtime.shutdown_timeout(tokio::time::Duration::from_secs(300)); - - // Load the persisted dht from the store - let persisted_enrs = load_dht(store); - assert!( - persisted_enrs.contains(&enrs[0]), - "should have persisted the first ENR to store" - 
); - assert!( - persisted_enrs.contains(&enrs[1]), - "should have persisted the second ENR to store" - ); -} - -// Test removing topic weight on old topics when a fork happens. -#[test] -fn test_removing_topic_weight_on_old_topics() { - let runtime = Arc::new(Runtime::new().unwrap()); - - // Capella spec - let mut spec = MinimalEthSpec::default_spec(); - spec.altair_fork_epoch = Some(Epoch::new(0)); - spec.bellatrix_fork_epoch = Some(Epoch::new(0)); - spec.capella_fork_epoch = Some(Epoch::new(1)); - - // Build beacon chain. - let beacon_chain = BeaconChainHarness::builder(MinimalEthSpec) - .spec(spec.clone().into()) - .deterministic_keypairs(8) - .fresh_ephemeral_store() - .mock_execution_layer() - .build() - .chain; - let (next_fork_name, _) = beacon_chain.duration_to_next_fork().expect("next fork"); - assert_eq!(next_fork_name, ForkName::Capella); - - // Build network service. - let (mut network_service, network_globals, _network_senders) = runtime.block_on(async { - let (_, exit) = async_channel::bounded(1); - let (shutdown_tx, _) = futures::channel::mpsc::channel(1); - let executor = task_executor::TaskExecutor::new( - Arc::downgrade(&runtime), - exit, - get_logger(false), - shutdown_tx, + "test-removing-topic-weight-on-old-topics" ); let mut config = NetworkConfig::default(); From 09ebd65d541f890c6f246b48be887984cce2e486 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Sat, 5 Apr 2025 13:16:53 -0700 Subject: [PATCH 16/16] linting --- beacon_node/network/src/network_beacon_processor/tests.rs | 1 - beacon_node/network/src/service/tests.rs | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index e1f1fce3214..ee0696427fc 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -14,7 +14,6 @@ use beacon_chain::test_utils::{ }; use 
beacon_chain::{BeaconChain, WhenSlotSkipped}; use beacon_processor::*; -use lighthouse_network::discovery::ConnectionId; use lighthouse_network::rpc::methods::{BlobsByRangeRequest, MetaDataV3}; use lighthouse_network::rpc::InboundRequestId; use lighthouse_network::{ diff --git a/beacon_node/network/src/service/tests.rs b/beacon_node/network/src/service/tests.rs index c882c0ebcd3..b841a2698e0 100644 --- a/beacon_node/network/src/service/tests.rs +++ b/beacon_node/network/src/service/tests.rs @@ -58,8 +58,6 @@ fn test_dht_persistence() { let BeaconProcessorChannels { beacon_processor_tx, beacon_processor_rx: _beacon_processor_rx, - work_reprocessing_tx, - work_reprocessing_rx: _work_reprocessing_rx, } = <_>::default(); let _network_service = NetworkService::start( @@ -68,7 +66,6 @@ fn test_dht_persistence() { executor, None, beacon_processor_tx, - work_reprocessing_tx, ) .await .unwrap(); @@ -120,7 +117,7 @@ fn test_removing_topic_weight_on_old_topics() { Arc::downgrade(&runtime), exit, shutdown_tx, - "test-removing-topic-weight-on-old-topics" + "test-removing-topic-weight-on-old-topics".to_string() ); let mut config = NetworkConfig::default();