diff --git a/consensus/fuzz/src/lib.rs b/consensus/fuzz/src/lib.rs index b75755b579a..88f5ec4bd33 100644 --- a/consensus/fuzz/src/lib.rs +++ b/consensus/fuzz/src/lib.rs @@ -375,7 +375,7 @@ where propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: application::Certifier::Sometimes, + should_certify: application::Certifier::Always, }; let (actor, application) = application::Application::new(context.with_label("application"), app_cfg); @@ -609,7 +609,7 @@ fn run_with_twin_mutator(input: FuzzInput) { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: application::Certifier::Sometimes, + should_certify: application::Certifier::Always, }; let (actor, application) = application::Application::new(primary_context.with_label("application"), app_cfg); diff --git a/consensus/src/lib.rs b/consensus/src/lib.rs index 2282d12ee86..3bb9663c7e4 100644 --- a/consensus/src/lib.rs +++ b/consensus/src/lib.rs @@ -99,6 +99,13 @@ stability_scope!(BETA, cfg(not(target_arch = "wasm32")) { /// If it is possible to generate a payload, the Digest should be returned over the provided /// channel. If it is not possible to generate a payload, the channel can be dropped. If construction /// takes too long, the consensus engine may drop the provided proposal. + /// + /// Returning a payload from `propose` commits the local proposer to verifying + /// the same `(context, payload)`. + /// + /// For [`CertifiableAutomaton`] implementations, returning a payload from + /// `propose` also commits the local proposer to certifying that same + /// `(round, payload)` if it later becomes notarized. fn propose( &mut self, context: Self::Context, @@ -134,18 +141,12 @@ stability_scope!(BETA, cfg(not(target_arch = "wasm32")) { /// Determine whether a verified payload is safe to commit. 
/// /// The round parameter identifies which consensus round is being certified, allowing - /// applications to associate certification with the correct verification context. - /// - /// Note: In applications where payloads incorporate the round number (recommended), - /// each round will have a unique payload digest. However, the same payload may appear - /// in multiple rounds when re-proposing notarized blocks at epoch boundaries or in - /// integrations where payloads are round-agnostic. - /// - /// This is particularly useful for applications that employ erasure coding, which - /// can override this method to delay or prevent finalization until they have - /// reconstructed and validated the full block (e.g., after receiving enough shards). + /// applications to associate certification with the correct verification context. The + /// same payload may appear in multiple rounds, so implementations must key any state + /// on `(round, payload)` rather than `payload` alone. /// - /// Like [`Automaton::verify`], certification is single-shot for the given + /// As with [`Automaton::verify`], payloads produced by [`Automaton::propose`] are certifiable by construction. + /// Also as with [`Automaton::verify`], certification is single-shot for the given /// `(round, payload)`. Once the returned channel resolves or closes, consensus treats /// certification as concluded and will not retry the same request. /// @@ -193,7 +194,7 @@ stability_scope!(BETA, cfg(not(target_arch = "wasm32")) { /// treat every broadcast identically can set this to `()`. type Plan: Send; - /// Broadcast a payload to the given recipients. + /// Broadcast a payload according to the given plan. 
fn broadcast( &mut self, payload: Self::Digest, diff --git a/consensus/src/marshal/application/validation.rs b/consensus/src/marshal/application/validation.rs index 46df606ffa5..82b2697da6c 100644 --- a/consensus/src/marshal/application/validation.rs +++ b/consensus/src/marshal/application/validation.rs @@ -3,13 +3,37 @@ //! This module centralizes pure invariant checks shared across marshal verification //! and certification flows. -use crate::types::{Epoch, Epocher, Height, Round}; -use commonware_utils::sync::Mutex; -use std::sync::Arc; +use crate::{ + marshal::core::{Mailbox, Variant}, + types::{Epoch, Epocher, Height, Round}, +}; +use commonware_cryptography::certificate::Scheme; -/// Cache for the last block built during proposal, shared between the -/// proposer task and the broadcast path. -pub(crate) type LastBuilt = Arc>>; +/// Which stage of verification a block has reached. +/// +/// This is used to determine which marshal cache a block should be stored in. +#[derive(Clone, Copy, Debug)] +pub(crate) enum Stage { + /// The block has been verified (store in `verified_blocks`). + Verified, + /// The block has been certified (store in `notarized_blocks`). + Certified, +} + +impl Stage { + /// Store `block` in the marshal cache for the provided stage. + pub(crate) async fn store( + self, + marshal: &mut Mailbox, + round: Round, + block: V::Block, + ) -> bool { + match self { + Self::Verified => marshal.verified(round, block).await, + Self::Certified => marshal.certified(round, block).await, + } + } +} /// Returns true if the block is at an epoch boundary (last block in its epoch). 
#[inline] diff --git a/consensus/src/marshal/application/verification_tasks.rs b/consensus/src/marshal/application/verification_tasks.rs index 98df16a77ab..7c85b0502e5 100644 --- a/consensus/src/marshal/application/verification_tasks.rs +++ b/consensus/src/marshal/application/verification_tasks.rs @@ -59,3 +59,114 @@ where .retain(|(task_round, _), _| task_round > finalized_round); } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::{Epoch, View}; + use commonware_cryptography::{sha256::Digest as Sha256Digest, Hasher, Sha256}; + + type D = Sha256Digest; + + fn round(view: u64) -> Round { + Round::new(Epoch::zero(), View::new(view)) + } + + fn pending_task() -> oneshot::Receiver { + let (_tx, rx) = oneshot::channel(); + rx + } + + #[test] + fn test_insert_and_take_returns_task() { + let tasks = VerificationTasks::::new(); + let digest = Sha256::hash(b"block"); + tasks.insert(round(1), digest, pending_task()); + + assert!(tasks.take(round(1), digest).is_some()); + assert!( + tasks.take(round(1), digest).is_none(), + "taking twice should yield None" + ); + } + + #[test] + fn test_take_absent_key_is_none() { + let tasks = VerificationTasks::::new(); + assert!(tasks.take(round(1), Sha256::hash(b"missing")).is_none()); + } + + #[test] + fn test_take_distinguishes_rounds_and_digests() { + let tasks = VerificationTasks::::new(); + let digest_a = Sha256::hash(b"a"); + let digest_b = Sha256::hash(b"b"); + tasks.insert(round(1), digest_a, pending_task()); + tasks.insert(round(2), digest_a, pending_task()); + tasks.insert(round(1), digest_b, pending_task()); + + assert!(tasks.take(round(1), digest_a).is_some()); + assert!(tasks.take(round(2), digest_a).is_some()); + assert!(tasks.take(round(1), digest_b).is_some()); + } + + #[test] + fn test_retain_after_drops_at_and_below_boundary() { + let tasks = VerificationTasks::::new(); + let digest = Sha256::hash(b"block"); + tasks.insert(round(1), digest, pending_task()); + tasks.insert(round(2), digest, 
pending_task()); + tasks.insert(round(3), digest, pending_task()); + + tasks.retain_after(&round(2)); + + assert!( + tasks.take(round(1), digest).is_none(), + "tasks strictly below boundary should be dropped" + ); + assert!( + tasks.take(round(2), digest).is_none(), + "tasks at boundary should be dropped" + ); + assert!( + tasks.take(round(3), digest).is_some(), + "tasks strictly above boundary should be retained" + ); + } + + #[test] + fn test_retain_after_spans_epochs() { + let tasks = VerificationTasks::::new(); + let digest = Sha256::hash(b"block"); + let early = Round::new(Epoch::zero(), View::new(100)); + let late = Round::new(Epoch::new(1), View::zero()); + tasks.insert(early, digest, pending_task()); + tasks.insert(late, digest, pending_task()); + + tasks.retain_after(&early); + + assert!( + tasks.take(early, digest).is_none(), + "task at boundary must be dropped" + ); + assert!( + tasks.take(late, digest).is_some(), + "task in later epoch must outlive an earlier boundary" + ); + } + + #[test] + fn test_retain_after_empty_map_is_noop() { + let tasks = VerificationTasks::::new(); + tasks.retain_after(&round(5)); + assert!(tasks.take(round(5), Sha256::hash(b"x")).is_none()); + } + + #[test] + fn test_default_matches_new() { + let default = as Default>::default(); + let digest = Sha256::hash(b"block"); + default.insert(round(1), digest, pending_task()); + assert!(default.take(round(1), digest).is_some()); + } +} diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index ef04a4146bb..9b54ea55d9a 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -82,9 +82,7 @@ use crate::{ marshal::{ ancestry::AncestorStream, application::{ - validation::{ - is_inferred_reproposal_at_certify, is_valid_reproposal_at_verify, LastBuilt, - }, + validation::{is_inferred_reproposal_at_certify, is_valid_reproposal_at_verify, Stage}, verification_tasks::VerificationTasks, }, coding::{ 
@@ -106,6 +104,7 @@ use commonware_cryptography::{ Committable, Digestible, Hasher, }; use commonware_macros::select; +use commonware_p2p::Recipients; use commonware_parallel::Strategy; use commonware_runtime::{ telemetry::metrics::histogram::{Buckets, Timed}, @@ -116,7 +115,6 @@ use commonware_utils::{ fallible::OneshotExt, oneshot::{self, error::RecvError}, }, - sync::Mutex, NZU16, }; use futures::future::{ready, try_join, Either, Ready}; @@ -183,7 +181,6 @@ where scheme_provider: Z, epocher: ES, strategy: S, - last_built: LastBuilt>, verification_tasks: VerificationTasks, cached_genesis: Arc)>>, @@ -266,7 +263,6 @@ where scheme_provider, strategy, epocher, - last_built: Arc::new(Mutex::new(None)), verification_tasks: VerificationTasks::new(), cached_genesis: Arc::new(OnceLock::new()), @@ -299,6 +295,7 @@ where consensus_context: Context::PublicKey>, commitment: Commitment, prefetched_block: Option>, + stage: Stage, ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); @@ -424,9 +421,9 @@ where is_valid = validity_request => is_valid, }; timer.observe(); - if application_valid { - // The block is only persisted at this point. - marshal.verified(round, block).await; + if application_valid && !stage.store(&mut marshal, round, block).await { + debug!(?round, "marshal unable to accept block"); + return; } tx.send_lossy(application_valid); }); @@ -490,15 +487,15 @@ where /// boundary block to avoid creating blocks that would be invalidated by the epoch transition. /// /// The proposal operation is spawned in a background task and returns a receiver that will - /// contain the proposed block's digest when ready. The built block is cached for later - /// broadcasting. + /// contain the proposed block's commitment when ready. The built block is persisted via + /// [`core::Mailbox::verified`] before the commitment is delivered, so consensus can rely + /// on the block surviving restart. 
async fn propose( &mut self, consensus_context: Context::PublicKey>, ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); - let last_built = self.last_built.clone(); let epocher = self.epocher.clone(); let strategy = self.strategy.clone(); let cached_genesis = self.cached_genesis.clone(); @@ -528,6 +525,42 @@ where .with_label("propose") .with_attribute("round", consensus_context.round) .spawn(move |runtime_context| async move { + // On leader recovery, marshal may already hold a verified block + // for this round (persisted before voting in consensus). + // + // Building a fresh block would land on the same prunable + // archive index and be silently dropped, so the stored block + // is the only proposal we can broadcast for this round. + // + // The recovered block is safe to reuse only if its embedded + // context matches the context simplex just recovered. + // Otherwise the cached block was built against a different + // parent and cannot be broadcast under the current header, so + // drop the receiver and let the voter nullify the view via + // timeout. 
+ if let Some(block) = marshal.get_verified(consensus_context.round).await { + let block_context = block.context(); + if block_context != consensus_context { + debug!( + round = ?consensus_context.round, + ?consensus_context, + ?block_context, + "skipping proposal: cached verified block context no longer matches" + ); + return; + } + let commitment = block.commitment(); + let round = consensus_context.round; + let success = tx.send_lossy(commitment); + debug!( + ?round, + ?commitment, + success, + "reused verified block from marshal on leader recovery" + ); + return; + } + let (parent_view, parent_commitment) = consensus_context.parent; let parent_request = fetch_parent( parent_commitment, @@ -569,14 +602,18 @@ where .expect("current epoch should exist"); if parent.height() == last_in_epoch { let commitment = parent.commitment(); - { - let mut lock = last_built.lock(); - *lock = Some((consensus_context.round, parent)); + let round = consensus_context.round; + if !marshal.verified(round, parent).await { + debug!( + ?round, + ?commitment, + "marshal rejected re-proposed boundary block" + ); + return; } - let success = tx.send_lossy(commitment); debug!( - round = ?consensus_context.round, + ?round, ?commitment, success, "re-proposed parent block at epoch boundary" @@ -618,18 +655,13 @@ where erasure_timer.observe(); let commitment = coded_block.commitment(); - { - let mut lock = last_built.lock(); - *lock = Some((consensus_context.round, coded_block)); + let round = consensus_context.round; + if !marshal.verified(round, coded_block).await { + debug!(?round, ?commitment, "marshal rejected proposed block"); + return; } - let success = tx.send_lossy(commitment); - debug!( - round = ?consensus_context.round, - ?commitment, - success, - "proposed new block" - ); + debug!(?round, ?commitment, success, "proposed new block"); }); rx } @@ -758,9 +790,12 @@ where return; } - // Valid re-proposal. 
Notify the marshal and complete the + // Valid re-proposal: notify the marshal and complete the // verification task for `certify`. - marshal.verified(round, block).await; + if !marshal.verified(round, block).await { + debug!(?round, "marshal unable to accept block"); + return; + } task_tx.send_lossy(true); tx.send_lossy(true); }); @@ -779,7 +814,7 @@ where // Kick off deferred verification early to hide verification latency behind // shard validity checks and network latency for collecting votes. let round = consensus_context.round; - let task = self.deferred_verify(consensus_context, payload, None); + let task = self.deferred_verify(consensus_context, payload, None, Stage::Verified); self.verification_tasks.insert(round, payload, task); match scheme.me() { @@ -895,10 +930,13 @@ where round, ); if is_reproposal { - // NOTE: It is possible that, during crash recovery, we call - // `marshal.verified` twice for the same block. That function is - // idempotent, so this is safe. - marshaled.marshal.verified(round, block).await; + // Certifier holds a notarization for this block, so route + // the write to the notarized cache. `certified` is + // idempotent, so crash-recovery double-invocation is safe. + if !marshaled.marshal.certified(round, block).await { + debug!(?round, "marshal unable to accept block"); + return; + } tx.send_lossy(true); return; } @@ -914,7 +952,12 @@ where // Use the block's embedded context for verification, passing the // prefetched block to avoid fetching it again inside deferred_verify. 
- let verify_rx = marshaled.deferred_verify(embedded_context, payload, Some(block)); + let verify_rx = marshaled.deferred_verify( + embedded_context, + payload, + Some(block), + Stage::Certified, + ); if let Ok(result) = verify_rx.await { tx.send_lossy(result); } @@ -943,36 +986,16 @@ where type Plan = Plan; async fn broadcast(&mut self, commitment: Self::Digest, plan: Self::Plan) { - match plan { - Plan::Propose => { - let Some((round, block)) = self.last_built.lock().take() else { - warn!("missing block to broadcast"); - return; - }; - if block.commitment() != commitment { - warn!( - round = %round, - commitment = %block.commitment(), - height = %block.height(), - "skipping requested broadcast of block with mismatched commitment" - ); - return; - } - debug!( - round = %round, - commitment = %block.commitment(), - height = %block.height(), - "requested broadcast of built block" - ); - self.shards.proposed(round, block).await; - } - Plan::Forward { .. } => { - // Coding variant does not support targeted forwarding; - // peers reconstruct blocks from erasure-coded shards. - // - // TODO(#3389): Support checked data forwarding for PhasedScheme. - } - } + // Coding variant does not support targeted forwarding; + // peers reconstruct blocks from erasure-coded shards. + // + // TODO(#3389): Support checked data forwarding for PhasedScheme. + let Plan::Propose { round } = plan else { + return; + }; + self.marshal + .forward(round, commitment, Recipients::All) + .await; } } @@ -1052,7 +1075,7 @@ where } /// Constructs the [`Commitment`] for the genesis block. 
-fn genesis_coding_commitment(block: &B) -> Commitment { +pub(super) fn genesis_coding_commitment(block: &B) -> Commitment { Commitment::from(( block.digest(), block.digest(), diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index 5ec8cd89032..35c6d728074 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -66,6 +66,7 @@ mod tests { use crate::{ marshal::{ coding::{ + marshaled::genesis_coding_commitment, types::{coding_config_for_participants, CodedBlock}, Marshaled, MarshaledConfig, }, @@ -136,6 +137,26 @@ mod tests { } } + #[test_group("slow")] + #[test_traced("WARN")] + fn test_coding_hailstorm_restarts() { + for seed in 0..2 { + let r1 = harness::hailstorm::(seed, 4, 4, 1, LINK); + let r2 = harness::hailstorm::(seed, 4, 4, 1, LINK); + assert_eq!(r1, r2); + } + } + + #[test_group("slow")] + #[test_traced("WARN")] + fn test_coding_hailstorm_multi_restarts() { + for seed in 0..2 { + let r1 = harness::hailstorm::(seed, 4, 4, 2, LINK); + let r2 = harness::hailstorm::(seed, 4, 4, 2, LINK); + assert_eq!(r1, r2); + } + } + #[test_traced("WARN")] fn test_coding_ack_pipeline_backlog() { harness::ack_pipeline_backlog::(); @@ -146,6 +167,26 @@ mod tests { harness::ack_pipeline_backlog_persists_on_restart::(); } + #[test_traced("WARN")] + fn test_coding_proposed_success_implies_recoverable_after_restart() { + harness::proposed_success_implies_recoverable_after_restart::(0..16); + } + + #[test_traced("WARN")] + fn test_coding_verified_success_implies_recoverable_after_restart() { + harness::verified_success_implies_recoverable_after_restart::(0..16); + } + + #[test_traced("WARN")] + fn test_coding_certified_success_implies_recoverable_after_restart() { + harness::certified_success_implies_recoverable_after_restart::(0..16); + } + + #[test_traced("WARN")] + fn test_coding_delivery_visibility_implies_recoverable_after_restart() { + 
harness::delivery_visibility_implies_recoverable_after_restart::(0..16); + } + #[test_traced("WARN")] fn test_coding_sync_height_floor() { harness::sync_height_floor::(); @@ -221,6 +262,109 @@ mod tests { harness::finalize_same_height_different_views::(); } + #[test_traced("WARN")] + fn test_coding_certify_persists_equivocated_block() { + harness::certify_persists_equivocated_block::(); + } + + #[test_traced("WARN")] + fn test_coding_certify_at_later_view_survives_earlier_view_pruning() { + harness::certify_at_later_view_survives_earlier_view_pruning::(); + } + + /// Finalizing a descendant must not height-prune the shard-engine buffer before + /// `try_repair_gaps` has consumed buffer-only ancestors. + /// + /// Places parent (height 1) and descendant (height 2) in the shard engine's + /// reconstructed-block cache via `proposed()`, then reports a finalization + /// for the descendant only. + #[test_traced("WARN")] + fn test_coding_store_finalization_does_not_prune_buffer_before_repair() { + let runner = deterministic::Runner::timed(Duration::from_secs(60)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let setup = CodingHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + participants[0].clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let mut handle = harness::ValidatorHandle:: { + mailbox: setup.mailbox, + extra: setup.extra, + }; + + // Build a 2-block chain: parent at height 1, descendant at height 2. 
+ let parent_block = CodingHarness::make_test_block( + Sha256::hash(b""), + CodingHarness::genesis_parent_commitment(NUM_VALIDATORS as u16), + Height::new(1), + 1, + NUM_VALIDATORS as u16, + ); + let parent_digest = CodingHarness::digest(&parent_block); + let parent_commitment = CodingHarness::commitment(&parent_block); + + let descendant_block = CodingHarness::make_test_block( + parent_digest, + parent_commitment, + Height::new(2), + 2, + NUM_VALIDATORS as u16, + ); + let descendant_commitment = CodingHarness::commitment(&descendant_block); + + // Seed the shard engine's reconstructed-block cache with both blocks. + CodingHarness::propose( + &mut handle, + Round::new(Epoch::new(0), View::new(1)), + &parent_block, + ) + .await; + CodingHarness::propose( + &mut handle, + Round::new(Epoch::new(0), View::new(2)), + &descendant_block, + ) + .await; + + // Report finalization for the descendant only. The parent has no + // finalization certificate: it must be archived by walking the + // parent link from the descendant and sourcing the block from the + // shard-engine buffer. + let descendant_proposal = Proposal { + round: Round::new(Epoch::new(0), View::new(2)), + parent: View::new(1), + payload: descendant_commitment, + }; + let descendant_finalization = + CodingHarness::make_finalization(descendant_proposal, &schemes, QUORUM); + CodingHarness::report_finalization(&mut handle.mailbox, descendant_finalization).await; + + // Wait until the descendant is archived: that proves finalization processing + // has completed, at which point the parent must already have been repaired + // from the shard buffer. 
+ while handle.mailbox.get_block(Height::new(2)).await.is_none() { + context.sleep(Duration::from_millis(10)).await; + } + + let parent = handle.mailbox.get_block(Height::new(1)).await; + assert!( + parent.is_some(), + "parent must be archived from shard buffer before height-prune evicts it" + ); + }); + } + #[test_traced("WARN")] fn test_coding_init_processed_height() { harness::init_processed_height::(); @@ -291,7 +435,6 @@ mod tests { let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); shards - .clone() .proposed(Round::new(Epoch::new(0), View::new(1)), coded_parent) .await; @@ -305,7 +448,7 @@ mod tests { let block_a = make_coding_block(context_a.clone(), parent_digest, Height::new(2), 200); let coded_block_a = CodedBlock::new(block_a.clone(), coding_config, &Sequential); let commitment_a = coded_block_a.commitment(); - shards.clone().proposed(round_a, coded_block_a).await; + shards.proposed(round_a, coded_block_a).await; // Block B at view 10 (height 2, different block same height - could happen with // different proposers or re-proposals) @@ -318,7 +461,7 @@ mod tests { let block_b = make_coding_block(context_b.clone(), parent_digest, Height::new(2), 300); let coded_block_b = CodedBlock::new(block_b.clone(), coding_config, &Sequential); let commitment_b = coded_block_b.commitment(); - shards.clone().proposed(round_b, coded_block_b).await; + shards.proposed(round_b, coded_block_b).await; context.sleep(Duration::from_millis(10)).await; @@ -417,7 +560,7 @@ mod tests { let block = make_coding_block(ctx.clone(), parent, Height::new(i), i * 100); let coded_block = CodedBlock::new(block.clone(), coding_config, &Sequential); last_commitment = coded_block.commitment(); - shards.clone().proposed(round, coded_block).await; + shards.proposed(round, coded_block).await; parent = block.digest(); last_view = View::new(i); } @@ -439,10 +582,7 @@ mod tests { let coded_boundary = 
CodedBlock::new(boundary_block.clone(), coding_config, &Sequential); let boundary_commitment = coded_boundary.commitment(); - shards - .clone() - .proposed(boundary_round, coded_boundary) - .await; + shards.proposed(boundary_round, coded_boundary).await; context.sleep(Duration::from_millis(10)).await; @@ -504,7 +644,6 @@ mod tests { // Make the non-boundary block available shards - .clone() .proposed(non_boundary_round, coded_non_boundary) .await; @@ -632,7 +771,6 @@ mod tests { let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); shards - .clone() .proposed(Round::new(Epoch::zero(), View::new(1)), coded_parent) .await; @@ -993,7 +1131,6 @@ mod tests { let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); shards - .clone() .proposed(Round::new(Epoch::zero(), View::new(19)), coded_parent) .await; @@ -1007,7 +1144,6 @@ mod tests { let coded_block = CodedBlock::new(block.clone(), coding_config, &Sequential); let block_commitment = coded_block.commitment(); shards - .clone() .proposed(Round::new(Epoch::new(1), View::new(20)), coded_block) .await; @@ -1107,7 +1243,6 @@ mod tests { let coded_parent = CodedBlock::new(honest_parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); shards - .clone() .proposed(Round::new(Epoch::new(1), View::new(21)), coded_parent) .await; @@ -1129,10 +1264,7 @@ mod tests { let coded_malicious = CodedBlock::new(malicious_block.clone(), coding_config, &Sequential); let malicious_commitment = coded_malicious.commitment(); - shards - .clone() - .proposed(byzantine_round, coded_malicious) - .await; + shards.proposed(byzantine_round, coded_malicious).await; // Small delay to ensure broadcast is processed context.sleep(Duration::from_millis(10)).await; @@ -1177,10 +1309,7 @@ mod tests { let coded_malicious2 = CodedBlock::new(malicious_block2.clone(), 
coding_config, &Sequential); let malicious_commitment2 = coded_malicious2.commitment(); - shards - .clone() - .proposed(byzantine_round2, coded_malicious2) - .await; + shards.proposed(byzantine_round2, coded_malicious2).await; // Small delay to ensure broadcast is processed context.sleep(Duration::from_millis(10)).await; @@ -1269,7 +1398,7 @@ mod tests { let parent = make_coding_block(parent_ctx, genesis.digest(), Height::new(1), 100); let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); - shards.clone().proposed(parent_round, coded_parent).await; + shards.proposed(parent_round, coded_parent).await; // Create child at height 2. let child_round = Round::new(Epoch::zero(), View::new(2)); @@ -1281,7 +1410,7 @@ mod tests { let child = make_coding_block(child_ctx, parent.digest(), Height::new(2), 200); let coded_child = CodedBlock::new(child, coding_config, &Sequential); let child_commitment = coded_child.commitment(); - shards.clone().proposed(child_round, coded_child).await; + shards.proposed(child_round, coded_child).await; context.sleep(Duration::from_millis(10)).await; @@ -1390,7 +1519,7 @@ mod tests { let parent = make_coding_block(parent_context, genesis.digest(), Height::new(1), 100); let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); - shards.clone().proposed(parent_round, coded_parent).await; + shards.proposed(parent_round, coded_parent).await; // 3) Publish a valid child so optimistic verify can succeed. 
let round = Round::new(Epoch::zero(), View::new(2)); @@ -1403,7 +1532,7 @@ mod tests { make_coding_block(verify_context.clone(), parent.digest(), Height::new(2), 200); let coded_block = CodedBlock::new(block, coding_config, &Sequential); let commitment = coded_block.commitment(); - shards.clone().proposed(round, coded_block).await; + shards.proposed(round, coded_block).await; context.sleep(Duration::from_millis(10)).await; @@ -1498,7 +1627,7 @@ mod tests { // Validator 1 proposes coded_block_b (same inner block, different coding). // This stores it in v1's shard engine and actor cache. - v1_mailbox.proposed(round1, coded_block_b.clone()).await; + assert!(v1_mailbox.verified(round1, coded_block_b.clone()).await); context.sleep(Duration::from_millis(100)).await; // Create finalization referencing commitment_a (the "correct" commitment). @@ -1595,4 +1724,423 @@ mod tests { assert!(rx.await.is_err()); }); } + + /// Regression: a validator must not vote finalize on a block that is not + /// durably persisted. `certify` resolves true ⟹ block is on disk for + /// this validator. We assert this by aborting the marshal actor the + /// instant `certify` returns true; without the persist-before-certify + /// fix, the actor may have only had the `Verified` message enqueued (not + /// processed), and the block is lost on restart even though the validator + /// would have proceeded to broadcast a finalize vote. + #[test_traced("WARN")] + fn test_marshaled_certify_persists_block_before_resolving() { + for seed in 0u64..16 { + certify_persists_block_before_resolving_at(seed); + } + } + + fn certify_persists_block_before_resolving_at(seed: u64) { + let runner = deterministic::Runner::new( + deterministic::Config::new() + .with_seed(seed) + .with_timeout(Some(Duration::from_secs(60))), + ); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let coding_config = coding_config_for_participants(NUM_VALIDATORS as u16); + + let setup = CodingHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let shards = setup.extra; + let marshal_actor_handle = setup.actor_handle; + + let genesis_ctx = CodingCtx { + round: Round::zero(), + leader: default_leader(), + parent: (View::zero(), genesis_commitment()), + }; + let genesis = make_coding_block(genesis_ctx, Sha256::hash(b""), Height::zero(), 0); + + // Push parent (height 1) and child (height 2) into the shards + // engine. These are reconstructable but NOT durably persisted. + let parent_round = Round::new(Epoch::zero(), View::new(1)); + let parent_ctx = CodingCtx { + round: parent_round, + leader: default_leader(), + parent: (View::zero(), genesis_commitment()), + }; + let parent = make_coding_block(parent_ctx, genesis.digest(), Height::new(1), 100); + let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); + let parent_commitment = coded_parent.commitment(); + shards.proposed(parent_round, coded_parent).await; + + let child_round = Round::new(Epoch::zero(), View::new(2)); + let child_ctx = CodingCtx { + round: child_round, + leader: me.clone(), + parent: (View::new(1), parent_commitment), + }; + let child = make_coding_block(child_ctx.clone(), parent.digest(), Height::new(2), 200); + let coded_child = CodedBlock::new(child.clone(), coding_config, &Sequential); + let child_commitment = coded_child.commitment(); + let child_digest = coded_child.digest(); + shards.proposed(child_round, coded_child).await; + + context.sleep(Duration::from_millis(10)).await; + + let mock_app: 
MockVerifyingApp = MockVerifyingApp::new(genesis); + let cfg = MarshaledConfig { + application: mock_app, + marshal: marshal.clone(), + shards: shards.clone(), + scheme_provider: ConstantProvider::new(schemes[0].clone()), + epocher: FixedEpocher::new(BLOCKS_PER_EPOCH), + strategy: Sequential, + }; + let mut marshaled = Marshaled::new(context.clone(), cfg); + + // Optimistic verify - returns shard validity (true). + let shard_validity = marshaled + .verify(child_ctx, child_commitment) + .await + .await + .expect("verify result missing"); + assert!(shard_validity, "shard validity should pass"); + + // Certify - this is the safety gate before finalize voting. + let certify_result = marshaled + .certify(child_round, child_commitment) + .await + .await + .expect("certify result missing"); + assert!(certify_result, "certify should succeed"); + + // Abort marshal immediately after certify returns to prove the + // block is already persisted at that point. + marshal_actor_handle.abort(); + drop(marshaled); + drop(marshal); + drop(shards); + + // Restart from the same partition. The block must be durably + // persisted - otherwise the validator would have voted finalize + // for a block it cannot serve from local storage. + let setup2 = CodingHarness::setup_validator( + context.with_label("validator_0_restart"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal2 = setup2.mailbox; + + let post_restart = marshal2.get_block(&child_digest).await; + assert!( + post_restart.is_some(), + "certify resolved true ⟹ block must be durably persisted" + ); + }); + } + + /// Regression: a proposer must be able to recover its own block after a + /// crash that occurs immediately after `Marshaled::propose()` returns a + /// commitment. `propose` is responsible for persisting the block via + /// `marshal.verified`, so the block must survive restart even if + /// `Relay::broadcast` never runs or marshal aborts in between. 
+ #[test_traced("WARN")] + fn test_marshaled_proposed_block_persists_across_restart() { + let runner = deterministic::Runner::timed(Duration::from_secs(60)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let coding_config = coding_config_for_participants(NUM_VALIDATORS as u16); + + let setup = CodingHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let shards = setup.extra; + let marshal_actor_handle = setup.actor_handle; + + let genesis_ctx = CodingCtx { + round: Round::zero(), + leader: default_leader(), + parent: (View::zero(), genesis_commitment()), + }; + let genesis = make_coding_block(genesis_ctx, Sha256::hash(b""), Height::zero(), 0); + let genesis_parent_commitment = genesis_coding_commitment::(&genesis); + + // Build the block we want propose() to return. Its embedded context + // uses the proper genesis commitment so fetch_parent matches the + // cached genesis without going through the marshal subscription. 
+ let propose_round = Round::new(Epoch::zero(), View::new(1)); + let propose_context = CodingCtx { + round: propose_round, + leader: me.clone(), + parent: (View::zero(), genesis_parent_commitment), + }; + let block_to_propose = make_coding_block( + propose_context.clone(), + genesis.digest(), + Height::new(1), + 100, + ); + let block_digest = block_to_propose.digest(); + let expected_commitment = CodedBlock::<_, ReedSolomon, Sha256>::new( + block_to_propose.clone(), + coding_config, + &Sequential, + ) + .commitment(); + + let mock_app: MockVerifyingApp = + MockVerifyingApp::new(genesis).with_propose_result(block_to_propose); + let cfg = MarshaledConfig { + application: mock_app, + marshal: marshal.clone(), + shards: shards.clone(), + scheme_provider: ConstantProvider::new(schemes[0].clone()), + epocher: FixedEpocher::new(BLOCKS_PER_EPOCH), + strategy: Sequential, + }; + let mut marshaled = Marshaled::new(context.clone(), cfg); + + // Drive the leader-side propose path. `propose` must persist the + // block before returning the commitment. + let commitment = marshaled + .propose(propose_context) + .await + .await + .expect("propose should produce a commitment"); + assert_eq!(commitment, expected_commitment); + + // Abort marshal immediately after propose returns; the propose + // path must already have persisted the block. + marshal_actor_handle.abort(); + drop(marshaled); + drop(marshal); + drop(shards); + + let setup2 = CodingHarness::setup_validator( + context.with_label("validator_0_restart"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal2 = setup2.mailbox; + + // The proposer must recover its own block after restart. Without + // the broadcast-path persistence fix, the block lived only in the + // shards engine's in-memory cache and is now gone. 
+ let post_restart = marshal2.get_block(&block_digest).await; + assert!( + post_restart.is_some(), + "proposer should recover its own block after restart" + ); + }); + } + + /// Regression: if marshal already holds a verified block for a round + /// (say, persisted by a pre-crash propose whose notarize vote never + /// reached the journal), a restarted leader's `propose` must return + /// that block's commitment instead of rebuilding. Otherwise the + /// new block lands on the same view index in the prunable archive, + /// gets silently dropped (`skip_if_index_exists=true`), and the + /// leader's notarize targets a commitment no peer can serve. + #[test_traced("WARN")] + fn test_propose_reuses_verified_block_on_restart() { + let runner = deterministic::Runner::timed(Duration::from_secs(60)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let coding_config = coding_config_for_participants(NUM_VALIDATORS as u16); + + let setup = CodingHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let shards = setup.extra; + + let genesis_ctx = CodingCtx { + round: Round::zero(), + leader: default_leader(), + parent: (View::zero(), genesis_commitment()), + }; + let genesis = make_coding_block(genesis_ctx, Sha256::hash(b""), Height::zero(), 0); + let genesis_parent_commitment = genesis_coding_commitment::(&genesis); + + let round = Round::new(Epoch::zero(), View::new(1)); + let ctx = CodingCtx { + round, + leader: me.clone(), + parent: (View::zero(), genesis_parent_commitment), + }; + + // Seed block A in marshal's verified cache for `round`. 
+ let block_a = make_coding_block(ctx.clone(), genesis.digest(), Height::new(1), 100); + let coded_a: CodedBlock<_, ReedSolomon, Sha256> = + CodedBlock::new(block_a.clone(), coding_config, &Sequential); + let commitment_a = coded_a.commitment(); + assert!(marshal.verified(round, coded_a).await); + + // After restart, a fresh application would build a different + // block for the same round. + let block_b = make_coding_block(ctx.clone(), genesis.digest(), Height::new(1), 200); + let coded_b: CodedBlock<_, ReedSolomon, Sha256> = + CodedBlock::new(block_b.clone(), coding_config, &Sequential); + let commitment_b = coded_b.commitment(); + assert_ne!( + commitment_a, commitment_b, + "test requires distinct commitments" + ); + + let mock_app: MockVerifyingApp = + MockVerifyingApp::new(genesis).with_propose_result(block_b); + let cfg = MarshaledConfig { + application: mock_app, + marshal: marshal.clone(), + shards: shards.clone(), + scheme_provider: ConstantProvider::new(schemes[0].clone()), + epocher: FixedEpocher::new(BLOCKS_PER_EPOCH), + strategy: Sequential, + }; + let mut marshaled = Marshaled::new(context.clone(), cfg); + + let commitment = marshaled + .propose(ctx) + .await + .await + .expect("propose must return a commitment"); + assert_eq!( + commitment, commitment_a, + "propose must reuse the block marshal already persisted for this round" + ); + }); + } + + /// Regression: if a pre-crash leader persisted a verified block for a + /// round but the simplex `Notarize` never reached the journal, replay + /// can recover a `consensus_context` whose parent differs from the one + /// the cached block was built against. The restarted leader must then + /// drop the receiver so the voter nullifies the view via + /// `MissingProposal`, rather than broadcasting the stale cached block + /// under a header that peers will reject. 
+ #[test_traced("WARN")] + fn test_propose_skips_when_verified_block_context_changed() { + let runner = deterministic::Runner::timed(Duration::from_secs(60)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let coding_config = coding_config_for_participants(NUM_VALIDATORS as u16); + + let setup = CodingHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let shards = setup.extra; + + let genesis_ctx = CodingCtx { + round: Round::zero(), + leader: default_leader(), + parent: (View::zero(), genesis_commitment()), + }; + let genesis = make_coding_block(genesis_ctx, Sha256::hash(b""), Height::zero(), 0); + let genesis_parent_commitment = genesis_coding_commitment::(&genesis); + + // Stash a stale block built against genesis as its parent at round V=2. + let round = Round::new(Epoch::zero(), View::new(2)); + let stale_ctx = CodingCtx { + round, + leader: me.clone(), + parent: (View::zero(), genesis_parent_commitment), + }; + let stale_block = make_coding_block(stale_ctx, genesis.digest(), Height::new(1), 100); + let stale_coded: CodedBlock<_, ReedSolomon, Sha256> = + CodedBlock::new(stale_block, coding_config, &Sequential); + assert!(marshal.verified(round, stale_coded).await); + + // Simulate a replay where parent selection now points to a + // different parent commitment than the cached block was built for. 
+ let new_parent_commitment = Commitment::from(( + Sha256::hash(b"different-parent-block"), + Sha256::hash(b"different-parent-inner"), + Sha256::hash(b"different-parent-ctx"), + coding_config, + )); + let new_ctx = CodingCtx { + round, + leader: me.clone(), + parent: (View::new(1), new_parent_commitment), + }; + + let mock_app: MockVerifyingApp = MockVerifyingApp::new(genesis); + let cfg = MarshaledConfig { + application: mock_app, + marshal: marshal.clone(), + shards: shards.clone(), + scheme_provider: ConstantProvider::new(schemes[0].clone()), + epocher: FixedEpocher::new(BLOCKS_PER_EPOCH), + strategy: Sequential, + }; + let mut marshaled = Marshaled::new(context.clone(), cfg); + + let commitment_rx = marshaled.propose(new_ctx).await; + assert!( + commitment_rx.await.is_err(), + "propose must drop the receiver when the cached block's context no longer matches" + ); + }); + } } diff --git a/consensus/src/marshal/coding/variant.rs b/consensus/src/marshal/coding/variant.rs index cd939dffe74..49ca3df89e4 100644 --- a/consensus/src/marshal/coding/variant.rs +++ b/consensus/src/marshal/coding/variant.rs @@ -100,6 +100,7 @@ where } async fn send(&self, round: Round, block: CodedBlock, _recipients: Recipients

) { + // Targeted forwarding is not supported by the coding variant. self.proposed(round, block).await; } } diff --git a/consensus/src/marshal/core/actor.rs b/consensus/src/marshal/core/actor.rs index 54597e252e4..90cb2cd8fe4 100644 --- a/consensus/src/marshal/core/actor.rs +++ b/consensus/src/marshal/core/actor.rs @@ -52,7 +52,7 @@ use std::{ pin::Pin, sync::Arc, }; -use tracing::{debug, error, info, warn}; +use tracing::{debug, error, warn}; /// The key used to store the last processed height in the metadata store. const LATEST_KEY: U64 = U64::new(0xFF); @@ -453,7 +453,7 @@ where result = self.pending_acks.current() => { // Start with the ack that woke this `select_loop!` arm. let mut pending = Some(self.pending_acks.complete_current(result)); - loop { + let last_acked_commitment = loop { let (height, commitment, result) = pending.take().expect("pending ack must exist"); match result { @@ -471,11 +471,11 @@ where // Opportunistically drain any additional already-ready acks so we // can persist one metadata sync for the whole batch below. - let Some(next) = self.pending_acks.pop_ready() else { - break; - }; - pending = Some(next); - } + match self.pending_acks.pop_ready() { + Some(next) => pending = Some(next), + None => break commitment, + } + }; // Persist buffered processed-height updates once after draining all ready acks. if let Err(e) = self.application_metadata.sync().await { @@ -483,12 +483,15 @@ where return; } + // Inform the buffer of the last acknowledged commitment (anything below this is safe to prune). 
+ buffer.finalized(last_acked_commitment).await; + // Fill the pipeline self.try_dispatch_blocks(&mut application).await; }, // Handle consensus inputs before backfill or resolver traffic Some(message) = self.mailbox.recv() else { - info!("mailbox closed, shutting down"); + debug!("mailbox closed, shutting down"); break; } => { match message { @@ -518,17 +521,20 @@ where }; response.send_lossy(info); } - Message::Proposed { round, block } => { - self.cache_verified(round, block.digest(), block.clone()) - .await; - buffer.send(round, block, Recipients::All).await; + Message::GetVerified { round, response } => { + let block = self + .cache + .get_verified(round) + .await + .map(Into::into); + response.send_lossy(block); } Message::Forward { round, commitment, - peers, + recipients, } => { - if peers.is_empty() { + if matches!(&recipients, Recipients::Some(peers) if peers.is_empty()) { continue; } let Some(block) = self.find_block_by_commitment(&buffer, commitment).await @@ -536,10 +542,23 @@ where debug!(?commitment, "block not found for forwarding"); continue; }; - buffer.send(round, block, Recipients::Some(peers)).await; + buffer.send(round, block, recipients).await; } - Message::Verified { round, block } => { + Message::Verified { round, block, ack } => { + // If the round has already been pruned by tip advancement, + // `cache_verified` is a no-op because the round is below + // the retention floor (and no longer is required by consensus + // to make progress). self.cache_verified(round, block.digest(), block).await; + ack.send_lossy(()); + } + Message::Certified { round, block, ack } => { + // If the round has already been pruned by tip advancement, + // `cache_block` is a no-op because the round is below + // the retention floor (and no longer is required by consensus + // to make progress). 
+ self.cache_block(round, block.digest(), block).await; + ack.send_lossy(()); } Message::Notarization { notarization } => { let round = notarization.round(); @@ -586,13 +605,13 @@ where block, Some(finalization), &mut application, - &mut buffer, ) .await { self.try_repair_gaps(&mut buffer, &mut resolver, &mut application) .await; self.sync_finalized().await; + self.try_dispatch_blocks(&mut application).await; debug!(?round, %height, "finalized block stored"); } } else { @@ -727,7 +746,7 @@ where }, // Handle resolver messages last (batched up to max_repair, sync once) Some(message) = resolver_rx.recv() else { - info!("handler closed, shutting down"); + debug!("handler closed, shutting down"); return; } => { // Drain up to max_repair messages: blocks handled immediately, @@ -755,7 +774,6 @@ where response, &mut delivers, &mut application, - &mut buffer, ) .await; } @@ -764,7 +782,7 @@ where // Batch verify and process all delivers. needs_sync |= self - .verify_delivered(delivers, &mut application, &mut buffer) + .verify_delivered(delivers, &mut application) .await; // Attempt to fill gaps before handling produce requests (so we @@ -777,6 +795,7 @@ where // durability). if needs_sync { self.sync_finalized().await; + self.try_dispatch_blocks(&mut application).await; } // Handle produce requests in parallel. @@ -919,14 +938,13 @@ where /// immediately. Finalized/Notarized delivers are parsed and structurally /// validated, then collected into `delivers` for batch certificate verification. /// Returns true if finalization archives were written and need syncing. 
- async fn handle_deliver>( + async fn handle_deliver( &mut self, key: Request, value: Bytes, response: oneshot::Sender, delivers: &mut Vec>, application: &mut impl Reporter>, - buffer: &mut Buf, ) -> bool { match key { Request::Block(commitment) => { @@ -945,7 +963,7 @@ where let digest = block.digest(); let finalization = self.cache.get_finalization_for(digest).await; let wrote = self - .store_finalization(height, digest, block, finalization, application, buffer) + .store_finalization(height, digest, block, finalization, application) .await; debug!(?digest, %height, "received block"); response.send_lossy(true); // if a valid block is received, we should still send true (even if it was stale) @@ -1041,11 +1059,10 @@ where /// Batch verify pending certificates and process valid items. Returns true /// if finalization archives were written and need syncing. - async fn verify_delivered>( + async fn verify_delivered( &mut self, mut delivers: Vec>, application: &mut impl Reporter>, - buffer: &mut Buf, ) -> bool { if delivers.is_empty() { return false; @@ -1128,14 +1145,7 @@ where debug!(?round, %height, "received finalization"); wrote |= self - .store_finalization( - height, - digest, - block, - Some(finalization), - application, - buffer, - ) + .store_finalization(height, digest, block, Some(finalization), application) .await; } PendingVerification::Notarized { @@ -1165,7 +1175,6 @@ where block.clone(), Some(finalization), application, - buffer, ) .await; } @@ -1225,6 +1234,10 @@ where /// arrive and [`Self::handle_block_processed`] calls /// [`Self::update_processed_height`]. /// + /// Callers must only invoke this after [`Self::sync_finalized`] has made any + /// preceding finalized-archive writes durable. In other words, anything fed + /// to the application from this method is already durably persisted in marshal. + /// /// Acks are processed in FIFO order so `last_processed_height` always /// advances sequentially. 
/// @@ -1238,8 +1251,8 @@ where /// ```text /// Iteration N (caller): /// store_finalization -> Archive::put (buffered) - /// try_dispatch_blocks -> sends blocks to app, enqueues pending acks /// sync_finalized -> archive durable + /// try_dispatch_blocks -> sends blocks to app, enqueues pending acks /// /// Iteration M (ack handler, M > N): /// handle_block_processed -> update_processed_height -> metadata buffered @@ -1343,8 +1356,10 @@ where /// /// Must be called within the same `select_loop!` arm as any preceding /// [`Self::store_finalization`] / [`Self::try_repair_gaps`] writes, before yielding back - /// to the loop. This ensures archives are durable before the ack handler - /// advances `last_processed_height`. See [`Self::try_dispatch_blocks`] for details. + /// to the loop. This is the durability barrier for application delivery: + /// [`Self::try_dispatch_blocks`] must run only after this sync completes. + /// It also ensures archives are durable before the ack handler advances + /// `last_processed_height`. See [`Self::try_dispatch_blocks`] for details. async fn sync_finalized(&mut self) { if let Err(e) = try_join!( async { @@ -1394,21 +1409,23 @@ where /// Add a finalized block, and optionally a finalization, to the archive. /// - /// After persisting the block, attempt to dispatch the next contiguous block to the application. + /// After persisting the block, the caller must sync finalized archives + /// before dispatching the next contiguous block to the application. The + /// buffered archive writes from this method are not a sufficient durability + /// guarantee for downstream application state transitions on their own. /// /// Writes are buffered and not synced. The caller must call /// [sync_finalized](Self::sync_finalized) before yielding to the /// `select_loop!` so that archive data is durable before the ack handler /// advances `last_processed_height`. See [`Self::try_dispatch_blocks`] for the /// crash safety invariant. 
- async fn store_finalization>( + async fn store_finalization( &mut self, height: Height, digest: ::Digest, block: V::Block, finalization: Option>, application: &mut impl Reporter>, - buffer: &mut Buf, ) -> bool { // Blocks below the last processed height are not useful to us, so we ignore them (this // has the nice byproduct of ensuring we don't call a backing store with a block below the @@ -1425,7 +1442,6 @@ where self.notify_subscribers(&block); // Convert block to storage format - let commitment = V::commitment(&block); let stored: V::StoredBlock = block.into(); let round = finalization.as_ref().map(|f| f.round()); @@ -1450,14 +1466,12 @@ where panic!("failed to finalize: {e}"); } - // Update metrics, buffer, and application + // Update metrics and application if let Some(round) = round.filter(|_| height > self.tip) { application.report(Update::Tip(round, height, digest)).await; self.tip = height; let _ = self.finalized_height.try_set(height.get()); } - buffer.finalized(commitment).await; - self.try_dispatch_blocks(application).await; true } @@ -1580,7 +1594,6 @@ where block, Some(finalization), application, - buffer, ) .await; } else { @@ -1626,7 +1639,6 @@ where block.clone(), finalization, application, - buffer, ) .await; debug!(height = %block.height(), "repaired block"); diff --git a/consensus/src/marshal/core/cache.rs b/consensus/src/marshal/core/cache.rs index 9124d5b04f5..3d668dfcedc 100644 --- a/consensus/src/marshal/core/cache.rs +++ b/consensus/src/marshal/core/cache.rs @@ -371,6 +371,16 @@ where .expect("failed to get notarization") } + /// Get the block previously persisted in the verified archive for `round`. + pub(crate) async fn get_verified(&self, round: Round) -> Option { + let cache = self.caches.get(&round.epoch())?; + cache + .verified_blocks + .get(Identifier::Index(round.view().get())) + .await + .expect("failed to get verified block") + } + /// Get a finalization from the prunable archive by block digest. 
/// /// SAFETY: For blocks/certificates admitted by marshal verification, a block digest diff --git a/consensus/src/marshal/core/mailbox.rs b/consensus/src/marshal/core/mailbox.rs index 9c2ff0b8728..01ff946c2f3 100644 --- a/consensus/src/marshal/core/mailbox.rs +++ b/consensus/src/marshal/core/mailbox.rs @@ -9,6 +9,7 @@ use crate::{ Reporter, }; use commonware_cryptography::{certificate::Scheme, Digestible}; +use commonware_p2p::Recipients; use commonware_utils::{ channel::{fallible::AsyncFallibleExt, mpsc, oneshot}, vec::NonEmptyVec, @@ -85,21 +86,21 @@ pub(crate) enum Message { /// A channel to send the retrieved block. response: oneshot::Sender, }, - /// A request to broadcast a proposed block to peers. - Proposed { - /// The round in which the block was proposed. + /// A request to retrieve the verified block previously persisted for `round`. + GetVerified { + /// The round to query. round: Round, - /// The block to broadcast. - block: V::Block, + /// A channel to send the retrieved block, if any. + response: oneshot::Sender>, }, - /// A request to forward a block to a set of peers. + /// A request to forward a block to a set of recipients. Forward { /// The round in which the block was proposed. round: Round, /// The commitment of the block to forward. commitment: V::Commitment, - /// The peers to forward the block to. - peers: Vec, + /// The recipients to forward the block to. + recipients: Recipients, }, /// A notification that a block has been verified by the application. Verified { @@ -107,6 +108,17 @@ pub(crate) enum Message { round: Round, /// The verified block. block: V::Block, + /// A channel signaled once the block is durably stored. + ack: oneshot::Sender<()>, + }, + /// A notification that a block has been certified by the application. + Certified { + /// The round in which the block was certified. + round: Round, + /// The certified block. + block: V::Block, + /// A channel signaled once the block is durably stored. 
+ ack: oneshot::Sender<()>, }, /// Sets the sync starting point (advances if higher than current). /// @@ -283,18 +295,32 @@ impl Mailbox { .map(|block| AncestorStream::new(self.clone(), [V::into_inner(block)])) } - /// Requests that a proposed block is sent to peers. - pub async fn proposed(&self, round: Round, block: V::Block) { + /// Returns the verified block previously persisted for `round`, if any. + pub async fn get_verified(&self, round: Round) -> Option { self.sender - .send_lossy(Message::Proposed { round, block }) - .await; + .request(|response| Message::GetVerified { round, response }) + .await + .flatten() } - /// Notifies the actor that a block has been verified. - pub async fn verified(&self, round: Round, block: V::Block) { + /// Notifies the actor that a block has been verified, awaiting the actor's + /// confirmation that the block has been durably persisted before returning. + #[must_use = "callers must consider block durability before proceeding"] + pub async fn verified(&self, round: Round, block: V::Block) -> bool { self.sender - .send_lossy(Message::Verified { round, block }) - .await; + .request(|ack| Message::Verified { round, block, ack }) + .await + .is_some() + } + + /// Notifies the actor that a block has been certified, awaiting the actor's + /// confirmation that the block has been durably persisted before returning. + #[must_use = "callers must consider block durability before proceeding"] + pub async fn certified(&self, round: Round, block: V::Block) -> bool { + self.sender + .request(|ack| Message::Certified { round, block, ack }) + .await + .is_some() } /// Sets the sync starting point (advances if higher than current). @@ -321,13 +347,18 @@ impl Mailbox { self.sender.send_lossy(Message::Prune { height }).await; } - /// Forward a block to a set of peers. - pub async fn forward(&self, round: Round, commitment: V::Commitment, peers: Vec) { + /// Forward a block to a set of recipients. 
+ pub async fn forward( + &self, + round: Round, + commitment: V::Commitment, + recipients: Recipients, + ) { self.sender .send_lossy(Message::Forward { round, commitment, - peers, + recipients, }) .await; } diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index 19aef8c2b16..0749d7e060c 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -40,7 +40,7 @@ use commonware_storage::{ archive::{immutable, prunable}, translator::EightCap, }; -use commonware_utils::{vec::NonEmptyVec, NZUsize, NZU16, NZU64}; +use commonware_utils::{test_rng_seeded, vec::NonEmptyVec, NZUsize, NZU16, NZU64}; use futures::StreamExt; use rand::{ seq::{IteratorRandom, SliceRandom}, @@ -165,6 +165,7 @@ pub struct ValidatorSetup { pub mailbox: Mailbox, pub extra: H::ValidatorExtra, pub height: Height, + pub actor_handle: commonware_runtime::Handle<()>, } /// Per-validator handle for test operations. @@ -258,6 +259,13 @@ pub trait TestHarness: 'static + Sized { all_handles: &mut [ValidatorHandle], ) -> impl Future + Send; + /// Mark a block as certified. + fn certify( + handle: &mut ValidatorHandle, + round: Round, + block: &Self::TestBlock, + ) -> impl Future + Send; + /// Create a finalization certificate. 
fn make_finalization( proposal: Proposal, @@ -312,6 +320,1200 @@ pub trait TestHarness: 'static + Sized { ) -> impl Future + Send; } +fn contract_runner(seed: u64) -> deterministic::Runner { + deterministic::Runner::new( + deterministic::Config::new() + .with_seed(seed) + .with_timeout(Some(Duration::from_secs(30))), + ) +} + +fn restart_cycles_for_seed(seed: u64) -> usize { + let mut rng = test_rng_seeded(seed); + rng.gen_range(2..=4) +} + +struct HailstormValidator { + application: Application, + handle: ValidatorHandle, + actor_handle: commonware_runtime::Handle<()>, +} + +type CanonicalEntry = (Height, D, Finalization::Commitment>); +type CanonicalChain = Vec>; + +struct HailstormState<'a, H: TestHarness> { + validators: &'a mut [Option>], + canonical: &'a mut CanonicalChain, + parent: &'a mut D, + parent_commitment: &'a mut H::Commitment, + participants: &'a [K], + schemes: &'a [S], +} + +fn active_validator_indices( + validators: &[Option>], +) -> Vec { + validators + .iter() + .enumerate() + .filter_map(|(idx, validator)| validator.as_ref().map(|_| idx)) + .collect() +} + +async fn wait_for_validator_height( + context: &mut deterministic::Context, + validator: &HailstormValidator, + height: Height, + expected_digest: D, + expected_finalization: &Finalization, + label: &str, +) { + loop { + let block = validator.handle.mailbox.get_block(height).await; + let finalization = validator.handle.mailbox.get_finalization(height).await; + if let (Some(block), Some(finalization)) = (block, finalization) { + assert_eq!( + block.digest(), + expected_digest, + "{label}: wrong block digest at height {}", + height.get() + ); + assert_eq!( + finalization.round(), + expected_finalization.round(), + "{label}: wrong finalization round at height {}", + height.get() + ); + assert_eq!( + finalization.proposal.payload, + expected_finalization.proposal.payload, + "{label}: wrong finalization payload at height {}", + height.get() + ); + break; + } + 
context.sleep(Duration::from_millis(10)).await; + } +} + +async fn assert_validator_matches_canonical( + validator: &HailstormValidator, + canonical: &[CanonicalEntry], + label: &str, +) { + let delivered = validator.application.blocks(); + for (height, block) in delivered { + let (_, expected_digest, _) = canonical + .iter() + .find(|(expected_height, _, _)| *expected_height == height) + .unwrap_or_else(|| { + panic!( + "{label}: unexpected delivered block at height {}", + height.get() + ) + }); + assert_eq!( + block.digest(), + *expected_digest, + "{label}: application delivered wrong digest at height {}", + height.get() + ); + } + + if let Some((height, digest)) = validator.application.tip() { + let (_, expected_digest, _) = canonical + .iter() + .find(|(expected_height, _, _)| *expected_height == height) + .unwrap_or_else(|| { + panic!( + "{label}: unexpected delivered tip at height {}", + height.get() + ) + }); + assert_eq!( + digest, + *expected_digest, + "{label}: application reported wrong tip digest at height {}", + height.get() + ); + } + + for (height, expected_digest, expected_finalization) in canonical { + let stored_block = validator + .handle + .mailbox + .get_block(*height) + .await + .unwrap_or_else(|| { + panic!( + "{label}: missing finalized block at height {}", + height.get() + ) + }); + assert_eq!( + stored_block.digest(), + *expected_digest, + "{label}: stored wrong block digest at height {}", + height.get() + ); + + let stored_finalization = validator + .handle + .mailbox + .get_finalization(*height) + .await + .unwrap_or_else(|| panic!("{label}: missing finalization at height {}", height.get())); + assert_eq!( + stored_finalization.round(), + expected_finalization.round(), + "{label}: stored wrong finalization round at height {}", + height.get() + ); + assert_eq!( + stored_finalization.proposal.payload, + expected_finalization.proposal.payload, + "{label}: stored wrong finalization payload at height {}", + height.get() + ); + } + + if let 
Some((height, digest, _)) = canonical.last() { + assert_eq!( + validator.handle.mailbox.get_info(Identifier::Latest).await, + Some((*height, *digest)), + "{label}: latest info should match the canonical tip", + ); + } +} + +async fn assert_active_validators_match_canonical( + validators: &[Option>], + canonical: &[CanonicalEntry], +) { + for idx in active_validator_indices(validators) { + let validator = validators[idx] + .as_ref() + .expect("active validator should be present"); + assert_validator_matches_canonical(validator, canonical, &format!("validator_{idx}")).await; + } +} + +/// A height that has been driven through propose + verify but has not yet had +/// its finalization reported to the validators. +struct PendingHailstormHeight { + height: Height, + expected_digest: D, + finalization: Finalization, + next_parent: D, + next_parent_commitment: H::Commitment, +} + +/// Drives one height through the propose and verify phases without reporting +/// finalization. The returned pending height must be committed via +/// [`finalize_hailstorm_height`] to advance the canonical chain. 
+async fn drive_hailstorm_height_up_to_verify( + height_value: u64, + context: &mut deterministic::Context, + state: &mut HailstormState<'_, H>, +) -> PendingHailstormHeight { + let height = Height::new(height_value); + let active = active_validator_indices(state.validators); + let proposer_idx = active[context.gen_range(0..active.len())]; + let verifier_count = usize::min(QUORUM as usize, active.len()); + let verifier_indices = active + .iter() + .copied() + .filter(|idx| *idx != proposer_idx) + .choose_multiple(context, verifier_count.saturating_sub(1)); + let block = H::make_test_block( + *state.parent, + *state.parent_commitment, + height, + height_value, + state.participants.len() as u16, + ); + let round = Round::new(Epoch::zero(), View::new(height_value)); + let proposal = Proposal { + round, + parent: height + .previous() + .map(|previous| View::new(previous.get())) + .unwrap_or(View::zero()), + payload: H::commitment(&block), + }; + let expected_digest = H::digest(&block); + let finalization = H::make_finalization(proposal.clone(), state.schemes, QUORUM); + + { + let proposer = state.validators[proposer_idx] + .as_mut() + .expect("proposer should be active"); + H::propose(&mut proposer.handle, round, &block).await; + H::report_notarization( + &mut proposer.handle.mailbox, + H::make_notarization(proposal, state.schemes, QUORUM), + ) + .await; + } + + for verifier_idx in verifier_indices.iter().copied() { + let verifier = state.validators[verifier_idx] + .as_mut() + .expect("verifier should be active"); + H::verify(&mut verifier.handle, round, &block, &mut []).await; + } + + PendingHailstormHeight { + height, + expected_digest, + finalization, + next_parent: expected_digest, + next_parent_commitment: H::commitment(&block), + } +} + +/// Reports the finalization for a previously-driven pending height to every +/// currently-active validator, waits for them to reach the height, and updates +/// the canonical chain. 
+async fn finalize_hailstorm_height( + pending: PendingHailstormHeight, + context: &mut deterministic::Context, + state: &mut HailstormState<'_, H>, +) { + let PendingHailstormHeight { + height, + expected_digest, + finalization, + next_parent, + next_parent_commitment, + } = pending; + + for idx in active_validator_indices(state.validators) { + let validator = state.validators[idx] + .as_mut() + .expect("validator should remain active"); + H::report_finalization(&mut validator.handle.mailbox, finalization.clone()).await; + } + + state + .canonical + .push((height, expected_digest, finalization)); + *state.parent = next_parent; + *state.parent_commitment = next_parent_commitment; + + let (_, _, expected_finalization) = state + .canonical + .last() + .expect("canonical chain should contain the new height"); + for idx in active_validator_indices(state.validators) { + let validator = state.validators[idx] + .as_ref() + .expect("validator should be active"); + wait_for_validator_height( + context, + validator, + height, + expected_digest, + expected_finalization, + &format!("validator_{idx}"), + ) + .await; + } +} + +async fn advance_hailstorm_to( + target: u64, + context: &mut deterministic::Context, + state: &mut HailstormState<'_, H>, +) { + for height_value in (state.canonical.len() as u64 + 1)..=target { + let pending = drive_hailstorm_height_up_to_verify(height_value, context, state).await; + finalize_hailstorm_height(pending, context, state).await; + } + + assert_active_validators_match_canonical(state.validators, state.canonical).await; +} + +/// Stress marshal with repeated validator crashes and recoveries while a +/// canonical finalized chain continues to advance. 
+pub fn hailstorm( + seed: u64, + shutdowns: usize, + interval: u64, + max_down: usize, + link: Link, +) -> String { + let runner = deterministic::Runner::new( + deterministic::Config::new() + .with_seed(seed) + .with_timeout(Some(H::finalize_timeout())), + ); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(3), participants.clone()) + .await; + setup_network_links(&mut oracle, &participants, link.clone()).await; + + let mut validators = Vec::new(); + for (idx, validator) in participants.iter().enumerate() { + let setup = H::setup_validator( + context.with_label(&format!("validator_{idx}")), + &mut oracle, + validator.clone(), + ConstantProvider::new(schemes[idx].clone()), + ) + .await; + validators.push(Some(HailstormValidator:: { + application: setup.application, + handle: ValidatorHandle { + mailbox: setup.mailbox, + extra: setup.extra, + }, + actor_handle: setup.actor_handle, + })); + } + + let mut canonical = CanonicalChain::::new(); + let mut parent = Sha256::hash(b""); + let mut parent_commitment = H::genesis_parent_commitment(participants.len() as u16); + let mut target_height = 0u64; + let max_interval = interval.max(1); + let max_down = max_down.max(1); + + for shutdown_idx in 0..shutdowns { + let leadup = context.gen_range(1..=max_interval); + target_height += leadup; + + // Pick validators to crash and compute how far the advance should + // run before aborting them. `crash_after == leadup` fires the + // crash after every new height has fully finalized; any smaller + // value lands mid-cycle, after `verified` / `certified` have + // returned for the post-crash height but before finalization is + // reported for it. 
+ let active_pre = active_validator_indices(&validators); + let down_limit = usize::min(max_down, active_pre.len().saturating_sub(1)); + let down_count = context.gen_range(1..=down_limit.max(1)); + let mut selected = active_pre + .iter() + .copied() + .choose_multiple(&mut context, down_count); + selected.sort_unstable(); + let crash_after = context.gen_range(0..=leadup); + let persisted_height = target_height - leadup + crash_after; + + { + let mut state = HailstormState { + validators: &mut validators, + canonical: &mut canonical, + parent: &mut parent, + parent_commitment: &mut parent_commitment, + participants: &participants, + schemes: &schemes, + }; + advance_hailstorm_to(persisted_height, &mut context, &mut state).await; + } + + // Crash mid-advance: drive propose + verify for the next height + // and abort the selected validators before reporting + // finalization. If `crash_after == leadup` the crash instead + // happens after the last height's finalization because there is + // no pending height to drive (`pending` is `None`). 
+ let pending = if persisted_height < target_height { + let mut state = HailstormState { + validators: &mut validators, + canonical: &mut canonical, + parent: &mut parent, + parent_commitment: &mut parent_commitment, + participants: &participants, + schemes: &schemes, + }; + Some( + drive_hailstorm_height_up_to_verify( + persisted_height + 1, + &mut context, + &mut state, + ) + .await, + ) + } else { + None + }; + + for idx in selected.iter().copied() { + let crashed = validators[idx] + .take() + .expect("selected validator should be active"); + crashed.actor_handle.abort(); + let _ = crashed.actor_handle.await; + } + + if let Some(pending) = pending { + let mut state = HailstormState { + validators: &mut validators, + canonical: &mut canonical, + parent: &mut parent, + parent_commitment: &mut parent_commitment, + participants: &participants, + schemes: &schemes, + }; + finalize_hailstorm_height(pending, &mut context, &mut state).await; + } + + info!( + seed, + shutdown_idx, + ?selected, + down_count, + persisted_height, + leadup, + crash_after, + "marshal hailstorm shutdown" + ); + + let downtime = context.gen_range(1..=max_interval); + target_height += downtime; + let mut state = HailstormState { + validators: &mut validators, + canonical: &mut canonical, + parent: &mut parent, + parent_commitment: &mut parent_commitment, + participants: &participants, + schemes: &schemes, + }; + advance_hailstorm_to(target_height, &mut context, &mut state).await; + + for idx in selected.iter().copied() { + let restarted = H::setup_validator( + context.with_label(&format!("validator_{idx}_restart_{shutdown_idx}")), + &mut oracle, + participants[idx].clone(), + ConstantProvider::new(schemes[idx].clone()), + ) + .await; + assert_eq!( + restarted.height, + Height::new(persisted_height), + "validator {idx} should recover its persisted finalized height before replay" + ); + + let mut restarted = HailstormValidator:: { + application: restarted.application, + handle: ValidatorHandle { + 
mailbox: restarted.mailbox, + extra: restarted.extra, + }, + actor_handle: restarted.actor_handle, + }; + for (_, _, finalization) in canonical.iter().skip(persisted_height as usize) { + H::report_finalization(&mut restarted.handle.mailbox, finalization.clone()) + .await; + } + validators[idx] = Some(restarted); + } + + for idx in selected.iter().copied() { + let validator = validators[idx] + .as_ref() + .expect("restarted validator should be active"); + for (height, digest, finalization) in canonical.iter() { + wait_for_validator_height( + &mut context, + validator, + *height, + *digest, + finalization, + &format!("validator_{idx}_restarted"), + ) + .await; + } + } + assert_active_validators_match_canonical(&validators, &canonical).await; + info!( + seed, + shutdown_idx, + ?selected, + target_height, + downtime, + "marshal hailstorm recovered" + ); + } + + context.auditor().state() + }) +} + +/// Contract: once `H::propose(...)` returns, the proposed block survives an immediate +/// crash and repeated recoveries (checked via `get_verified(round)`). +pub fn proposed_success_implies_recoverable_after_restart( + seeds: impl IntoIterator, +) { + for seed in seeds { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::( + &mut test_rng_seeded(seed), + NAMESPACE, + NUM_VALIDATORS, + ); + + let me = participants[0].clone(); + let provider = ConstantProvider::new(schemes[0].clone()); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = H::make_test_block( + Sha256::hash(b""), + H::genesis_parent_commitment(NUM_VALIDATORS as u16), + Height::new(1), + 100, + NUM_VALIDATORS as u16, + ); + let digest = H::digest(&block); + let recovery_cycles = restart_cycles_for_seed(seed); + + let (_, mut checkpoint) = contract_runner(seed).start_and_recover({ + let participants = participants.clone(); + let me = me.clone(); + let provider = provider.clone(); + let block = block.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let setup = H::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + provider.clone(), + ) + .await; + let mut handle = ValidatorHandle:: { + mailbox: setup.mailbox, + extra: setup.extra, + }; + H::propose(&mut handle, round, &block).await; + } + }); + + for cycle in 0..recovery_cycles { + let ((), next_checkpoint) = + deterministic::Runner::from(checkpoint).start_and_recover({ + let participants = participants.clone(); + let me = me.clone(); + let provider = provider.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let restarted = H::setup_validator( + context.with_label(&format!("validator_0_restart_{cycle}")), + &mut oracle, + me.clone(), + provider.clone(), + ) + .await; + let recovered = + restarted + .mailbox + .get_verified(round) + .await + .unwrap_or_else(|| { + panic!( + "marshal.verified() returning true must imply \ + get_verified(round) recovers the block after restart \ + (seed={seed}, cycle={cycle})" + ) + }); + assert_eq!( + recovered.digest(), + 
digest, + "get_verified(round) must return the proposed block \ + (seed={seed}, cycle={cycle})" + ); + assert!( + restarted.mailbox.get_block(&digest).await.is_some(), + "get_block(&digest) must also recover the proposed block \ + (seed={seed}, cycle={cycle})" + ); + } + }); + checkpoint = next_checkpoint; + } + } +} + +/// Contract: `marshal.verified(...)=true` means the block survives an +/// immediate crash and repeated recoveries. +pub fn verified_success_implies_recoverable_after_restart( + seeds: impl IntoIterator, +) { + for seed in seeds { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::( + &mut test_rng_seeded(seed), + NAMESPACE, + NUM_VALIDATORS, + ); + + let me = participants[0].clone(); + let provider = ConstantProvider::new(schemes[0].clone()); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = H::make_test_block( + Sha256::hash(b""), + H::genesis_parent_commitment(NUM_VALIDATORS as u16), + Height::new(1), + 100, + NUM_VALIDATORS as u16, + ); + let digest = H::digest(&block); + let recovery_cycles = restart_cycles_for_seed(seed); + + let (_, mut checkpoint) = contract_runner(seed).start_and_recover({ + let participants = participants.clone(); + let me = me.clone(); + let provider = provider.clone(); + let block = block.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let setup = H::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + provider.clone(), + ) + .await; + let mut handle = ValidatorHandle:: { + mailbox: setup.mailbox, + extra: setup.extra, + }; + let mut peers: [ValidatorHandle; 0] = []; + H::verify(&mut handle, round, &block, &mut peers).await; + } + }); + + for cycle in 0..recovery_cycles { + let ((), next_checkpoint) = + deterministic::Runner::from(checkpoint).start_and_recover({ + let participants = participants.clone(); + let me = 
me.clone(); + let provider = provider.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let restarted = H::setup_validator( + context.with_label(&format!("validator_0_restart_{cycle}")), + &mut oracle, + me.clone(), + provider.clone(), + ) + .await; + let recovered = + restarted + .mailbox + .get_verified(round) + .await + .unwrap_or_else(|| { + panic!( + "marshal.verified() returning true must imply \ + get_verified(round) recovers the block after restart \ + (seed={seed}, cycle={cycle})" + ) + }); + assert_eq!( + recovered.digest(), + digest, + "get_verified(round) must return the verified block \ + (seed={seed}, cycle={cycle})" + ); + assert!( + restarted.mailbox.get_block(&digest).await.is_some(), + "get_block(&digest) must also recover the verified block \ + (seed={seed}, cycle={cycle})" + ); + } + }); + checkpoint = next_checkpoint; + } + } +} + +/// Contract: `marshal.certified(...)=true` means the block survives an +/// immediate crash and repeated recoveries. +/// +/// Complements [`verified_success_implies_recoverable_after_restart`] by +/// exercising the `Message::Certified -> cache_block -> put_sync` handshake. +/// A regression that acked before syncing the notarized cache would surface +/// here as a missing block after restart. +pub fn certified_success_implies_recoverable_after_restart( + seeds: impl IntoIterator, +) { + for seed in seeds { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::( + &mut test_rng_seeded(seed), + NAMESPACE, + NUM_VALIDATORS, + ); + + let me = participants[0].clone(); + let provider = ConstantProvider::new(schemes[0].clone()); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = H::make_test_block( + Sha256::hash(b""), + H::genesis_parent_commitment(NUM_VALIDATORS as u16), + Height::new(1), + 100, + NUM_VALIDATORS as u16, + ); + let digest = H::digest(&block); + let recovery_cycles = restart_cycles_for_seed(seed); + + let (_, mut checkpoint) = contract_runner(seed).start_and_recover({ + let participants = participants.clone(); + let me = me.clone(); + let provider = provider.clone(); + let block = block.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let setup = H::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + provider.clone(), + ) + .await; + let mut handle = ValidatorHandle:: { + mailbox: setup.mailbox, + extra: setup.extra, + }; + assert!( + H::certify(&mut handle, round, &block).await, + "certify must ack" + ); + } + }); + + for cycle in 0..recovery_cycles { + let ((), next_checkpoint) = + deterministic::Runner::from(checkpoint).start_and_recover({ + let participants = participants.clone(); + let me = me.clone(); + let provider = provider.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let restarted = H::setup_validator( + context.with_label(&format!("validator_0_restart_{cycle}")), + &mut oracle, + me.clone(), + provider.clone(), + ) + .await; + let recovered = + restarted + .mailbox + .get_block(&digest) + .await + .unwrap_or_else(|| { + panic!( + "marshal.certified() returning true must imply \ + get_block(&digest) recovers the block after restart \ + (seed={seed}, cycle={cycle})" + ) + }); + 
assert_eq!( + recovered.digest(), + digest, + "get_block(&digest) must return the certified block \ + (seed={seed}, cycle={cycle})" + ); + } + }); + checkpoint = next_checkpoint; + } + } +} + +/// Regression: when the same block is verified at an earlier view and later +/// certified at a much later view (epoch-boundary reproposal), both writes +/// must land so retention can prune the earlier view without losing the +/// block. A naive "skip the sibling write if the block's digest is already +/// present in the other archive" optimization is unsafe because the two +/// archives prune per-view on the same boundary: if the block lives only in +/// `verified_blocks[V_early]` and never gets written to +/// `notarized_blocks[V_late]`, advancing retention past V_early drops the +/// block even though V_late is still within the window. +pub fn certify_at_later_view_survives_earlier_view_pruning() { + let runner = deterministic::Runner::timed(Duration::from_secs(60)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + let setup = H::setup_validator( + context.with_label("validator_0"), + &mut oracle, + participants[0].clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let application = setup.application; + let mut handle = ValidatorHandle:: { + mailbox: setup.mailbox, + extra: setup.extra, + }; + + // A repeated block that we will verify at an early view and certify + // at a later view. Its height is intentionally well beyond the chain + // we'll drive below, so it never enters the finalized archive via + // gap repair and lives solely in the prunable caches. 
+ let repeated = H::make_test_block( + Sha256::hash(b""), + H::genesis_parent_commitment(NUM_VALIDATORS as u16), + Height::new(5_000), + 9_999, + NUM_VALIDATORS as u16, + ); + let repeated_digest = H::digest(&repeated); + + // Negative control: a verify-only block at a distinct early view. + // Placing `orphan` at V=2 (instead of V=1, where `repeated` already + // occupies the verified index) guarantees the write actually lands in + // `verified_blocks[V=2]` rather than being silently dropped as a + // duplicate index. Because it is never certified, it lives solely in + // that verified entry and must disappear once retention pruning + // advances past V=2. Asserting it is gone (after asserting it was + // present before pruning) confirms the prune actually fires at the + // expected floor. + let orphan = H::make_test_block( + Sha256::hash(b"orphan"), + H::genesis_parent_commitment(NUM_VALIDATORS as u16), + Height::new(6_000), + 9_998, + NUM_VALIDATORS as u16, + ); + let orphan_digest = H::digest(&orphan); + + // Verify `repeated` at V=1, then certify at V=25 (reproposal-style gap). + // The chain below starts at V=3 to avoid overwriting V=1 (`repeated`) + // or V=2 (`orphan`) in the verified archive (which drops subsequent + // writes at an existing view). + let v_early = Round::new(Epoch::zero(), View::new(1)); + let v_orphan = Round::new(Epoch::zero(), View::new(2)); + let v_late = Round::new(Epoch::zero(), View::new(25)); + let mut peers: [ValidatorHandle; 0] = []; + H::verify(&mut handle, v_early, &repeated, &mut peers).await; + assert!( + H::certify(&mut handle, v_late, &repeated).await, + "certify must ack" + ); + + // Verify `orphan` at its own distinct view V=2 (no certify). 
+ H::verify(&mut handle, v_orphan, &orphan, &mut peers).await; + assert!( + handle.mailbox.get_block(&orphan_digest).await.is_some(), + "negative control assumes `orphan` is present before pruning; \ + if it is not, the V=2 write was dropped and the post-prune \ + assertion would pass vacuously" + ); + + // Drive the finalized chain forward to advance `last_processed_round` + // past V=2's retention boundary but not past V=25's. With + // view_retention_timeout=10 and prunable_items_per_section=10, the + // prune floor snaps down to the section boundary and evicts V=1 and + // V=2 while leaving V=25 intact. + const CHAIN_LEN: u64 = 21; + let mut parent = Sha256::hash(b""); + let mut parent_commitment = H::genesis_parent_commitment(NUM_VALIDATORS as u16); + for i in 1..=CHAIN_LEN { + let block = H::make_test_block( + parent, + parent_commitment, + Height::new(i), + i, + NUM_VALIDATORS as u16, + ); + let digest = H::digest(&block); + let commitment = H::commitment(&block); + let round = Round::new(Epoch::zero(), View::new(i + 2)); + H::propose(&mut handle, round, &block).await; + let proposal = Proposal { + round, + parent: View::new(i), + payload: commitment, + }; + let finalization = H::make_finalization(proposal, &schemes, QUORUM); + H::report_finalization(&mut handle.mailbox, finalization).await; + parent = digest; + parent_commitment = commitment; + } + while (application.blocks().len() as u64) < CHAIN_LEN { + context.sleep(Duration::from_millis(10)).await; + } + context.sleep(Duration::from_millis(100)).await; + + // Negative control: the verify-only orphan at V=2 must be gone, which + // proves retention pruning actually evicted the early-view entries at + // the expected floor. + assert!( + handle.mailbox.get_block(&orphan_digest).await.is_none(), + "verify-only block at V=2 must be evicted by retention pruning" + ); + + // The repeated block must still be retrievable: verified_blocks[V=1] + // has been pruned, but notarized_blocks[V=25] still holds it. 
+ let recovered = handle.mailbox.get_block(&repeated_digest).await; + assert!( + recovered.is_some(), + "block certified at V=25 must survive retention pruning of V=1" + ); + assert_eq!(recovered.unwrap().digest(), repeated_digest); + }); +} + +/// Regression: when a leader equivocates, a validator may verify one block +/// (A) and then certify a different block (B) at the same round. `verified()` +/// and `certified()` must write to distinct archives so both blocks are +/// retained and retrievable; otherwise the second write collides on the same +/// prunable-archive index (`skip_if_index_exists=true`) and is silently +/// dropped despite the mailbox returning success. +pub fn certify_persists_equivocated_block() { + let runner = deterministic::Runner::timed(Duration::from_secs(60)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + let setup = H::setup_validator( + context.with_label("validator_0"), + &mut oracle, + participants[0].clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let mut handle = ValidatorHandle:: { + mailbox: setup.mailbox, + extra: setup.extra, + }; + + let round = Round::new(Epoch::zero(), View::new(1)); + let parent = Sha256::hash(b""); + let parent_commitment = H::genesis_parent_commitment(NUM_VALIDATORS as u16); + + // Two distinct blocks at the same height/round (leader equivocation): + // distinct timestamps yield distinct digests. 
+ let block_a = H::make_test_block( + parent, + parent_commitment, + Height::new(1), + 1, + NUM_VALIDATORS as u16, + ); + let digest_a = H::digest(&block_a); + let block_b = H::make_test_block( + parent, + parent_commitment, + Height::new(1), + 2, + NUM_VALIDATORS as u16, + ); + let digest_b = H::digest(&block_b); + assert_ne!(digest_a, digest_b, "test requires distinct digests"); + + let mut peers: [ValidatorHandle; 0] = []; + H::verify(&mut handle, round, &block_a, &mut peers).await; + assert!( + H::certify(&mut handle, round, &block_b).await, + "certified must ack" + ); + + let got_a = handle.mailbox.get_block(&digest_a).await; + assert!( + got_a.is_some(), + "verified block A must be persisted in verified_blocks" + ); + assert_eq!(got_a.unwrap().digest(), digest_a); + let got_b = handle.mailbox.get_block(&digest_b).await; + assert!( + got_b.is_some(), + "certified block B must be persisted despite a verify at the same round" + ); + assert_eq!(got_b.unwrap().digest(), digest_b); + }); +} + +/// Contract: once marshal has delivered a finalized block to the application, +/// that finalized block and its certificate must already be durable. +pub fn delivery_visibility_implies_recoverable_after_restart( + seeds: impl IntoIterator, +) { + for seed in seeds { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::( + &mut test_rng_seeded(seed), + NAMESPACE, + NUM_VALIDATORS, + ); + + let me = participants[0].clone(); + let provider = ConstantProvider::new(schemes[0].clone()); + let application = Application::::manual_ack(); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = H::make_test_block( + Sha256::hash(b""), + H::genesis_parent_commitment(NUM_VALIDATORS as u16), + Height::new(1), + 100, + NUM_VALIDATORS as u16, + ); + let finalization = H::make_finalization( + Proposal::new(round, View::zero(), H::commitment(&block)), + &schemes, + QUORUM, + ); + let recovery_cycles = restart_cycles_for_seed(seed); + + let (_, mut checkpoint) = contract_runner(seed).start_and_recover({ + let participants = participants.clone(); + let me = me.clone(); + let provider = provider.clone(); + let application = application.clone(); + let block = block.clone(); + let finalization = finalization.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let setup = H::setup_validator_with( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + provider.clone(), + NZUsize!(1), + application.clone(), + ) + .await; + let mut mailbox = setup.mailbox; + let mut handle = ValidatorHandle:: { + mailbox: mailbox.clone(), + extra: setup.extra, + }; + let mut peers: [ValidatorHandle; 0] = []; + H::verify(&mut handle, round, &block, &mut peers).await; + H::report_finalization(&mut mailbox, finalization.clone()).await; + + let height = application.acknowledged().await; + assert_eq!( + height, + Height::new(1), + "expected the first delivered finalized block to become visible at height 1 \ + before restart (seed={seed})" + ); + } + }); + + for cycle in 0..recovery_cycles { + let expected_round = finalization.round(); + let ((), next_checkpoint) = + deterministic::Runner::from(checkpoint).start_and_recover({ + let participants = 
participants.clone(); + let me = me.clone(); + let provider = provider.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let restarted = H::setup_validator( + context.with_label(&format!("validator_0_restart_{cycle}")), + &mut oracle, + me.clone(), + provider.clone(), + ) + .await; + let recovered = restarted.mailbox.get_block(Height::new(1)).await.expect( + "delivered finalized block must be recoverable after restart \ + (seed={seed}, cycle={cycle})", + ); + assert_eq!( + recovered.height(), + Height::new(1), + "restart should recover the delivered finalized block by height \ + (seed={seed}, cycle={cycle})" + ); + assert_eq!( + restarted + .mailbox + .get_finalization(Height::new(1)) + .await + .expect( + "delivered finalization must be recoverable after restart \ + (seed={seed}, cycle={cycle})", + ) + .round(), + expected_round, + "restart should recover the delivered finalization by height \ + (seed={seed}, cycle={cycle})" + ); + } + }); + checkpoint = next_checkpoint; + } + } +} + // ============================================================================= // Standard Harness Implementation // ============================================================================= @@ -323,7 +1525,7 @@ impl TestHarness for StandardHarness { type ApplicationBlock = B; type Variant = Standard; type TestBlock = B; - type ValidatorExtra = (); + type ValidatorExtra = buffered::Mailbox; type Commitment = D; async fn setup_validator( @@ -482,13 +1684,14 @@ impl TestHarness for StandardHarness { config, ) .await; - actor.start(application.clone(), buffer, resolver); + let actor_handle = actor.start(application.clone(), buffer.clone(), resolver); ValidatorSetup { application, mailbox, - extra: (), + extra: buffer, height, + actor_handle, } } @@ -519,7 +1722,7 @@ impl TestHarness for StandardHarness { } async fn propose(handle: &mut ValidatorHandle, round: Round, 
block: &B) { - handle.mailbox.proposed(round, block.clone()).await; + assert!(handle.mailbox.verified(round, block.clone()).await); } async fn verify( @@ -528,7 +1731,11 @@ impl TestHarness for StandardHarness { block: &B, _all_handles: &mut [ValidatorHandle], ) { - handle.mailbox.verified(round, block.clone()).await; + assert!(handle.mailbox.verified(round, block.clone()).await); + } + + async fn certify(handle: &mut ValidatorHandle, round: Round, block: &B) -> bool { + handle.mailbox.certified(round, block.clone()).await } fn make_finalization(proposal: Proposal, schemes: &[S], quorum: u32) -> Finalization { @@ -668,13 +1875,13 @@ impl TestHarness for StandardHarness { ) .await; let application = Application::::default(); - actor.start(application.clone(), buffer, resolver); + actor.start(application.clone(), buffer.clone(), resolver); - (mailbox, (), application) + (mailbox, buffer, application) } async fn verify_for_prune(handle: &mut ValidatorHandle, round: Round, block: &B) { - handle.mailbox.verified(round, block.clone()).await; + assert!(handle.mailbox.verified(round, block.clone()).await); } } @@ -700,6 +1907,7 @@ impl TestHarness for InlineHarness { mailbox: setup.mailbox, extra: setup.extra, height: setup.height, + actor_handle: setup.actor_handle, } } @@ -725,6 +1933,7 @@ impl TestHarness for InlineHarness { mailbox: setup.mailbox, extra: setup.extra, height: setup.height, + actor_handle: setup.actor_handle, } } @@ -764,7 +1973,7 @@ impl TestHarness for InlineHarness { StandardHarness::propose( &mut ValidatorHandle:: { mailbox: handle.mailbox.clone(), - extra: handle.extra, + extra: handle.extra.clone(), }, round, block, @@ -781,7 +1990,7 @@ impl TestHarness for InlineHarness { StandardHarness::verify( &mut ValidatorHandle:: { mailbox: handle.mailbox.clone(), - extra: handle.extra, + extra: handle.extra.clone(), }, round, block, @@ -790,6 +1999,22 @@ impl TestHarness for InlineHarness { .await; } + async fn certify( + handle: &mut ValidatorHandle, + 
round: Round, + block: &Self::TestBlock, + ) -> bool { + StandardHarness::certify( + &mut ValidatorHandle:: { + mailbox: handle.mailbox.clone(), + extra: handle.extra.clone(), + }, + round, + block, + ) + .await + } + fn make_finalization( proposal: Proposal, schemes: &[S], @@ -855,7 +2080,7 @@ impl TestHarness for InlineHarness { StandardHarness::verify_for_prune( &mut ValidatorHandle:: { mailbox: handle.mailbox.clone(), - extra: handle.extra, + extra: handle.extra.clone(), }, round, block, @@ -886,6 +2111,7 @@ impl TestHarness for DeferredHarness { mailbox: setup.mailbox, extra: setup.extra, height: setup.height, + actor_handle: setup.actor_handle, } } @@ -911,6 +2137,7 @@ impl TestHarness for DeferredHarness { mailbox: setup.mailbox, extra: setup.extra, height: setup.height, + actor_handle: setup.actor_handle, } } @@ -950,7 +2177,7 @@ impl TestHarness for DeferredHarness { InlineHarness::propose( &mut ValidatorHandle:: { mailbox: handle.mailbox.clone(), - extra: handle.extra, + extra: handle.extra.clone(), }, round, block, @@ -967,7 +2194,7 @@ impl TestHarness for DeferredHarness { InlineHarness::verify( &mut ValidatorHandle:: { mailbox: handle.mailbox.clone(), - extra: handle.extra, + extra: handle.extra.clone(), }, round, block, @@ -976,6 +2203,22 @@ impl TestHarness for DeferredHarness { .await; } + async fn certify( + handle: &mut ValidatorHandle, + round: Round, + block: &Self::TestBlock, + ) -> bool { + InlineHarness::certify( + &mut ValidatorHandle:: { + mailbox: handle.mailbox.clone(), + extra: handle.extra.clone(), + }, + round, + block, + ) + .await + } + fn make_finalization( proposal: Proposal, schemes: &[S], @@ -1041,7 +2284,7 @@ impl TestHarness for DeferredHarness { InlineHarness::verify_for_prune( &mut ValidatorHandle:: { mailbox: handle.mailbox.clone(), - extra: handle.extra, + extra: handle.extra.clone(), }, round, block, @@ -1250,13 +2493,14 @@ impl TestHarness for CodingHarness { config, ) .await; - actor.start(application.clone(), 
shard_mailbox.clone(), resolver); + let actor_handle = actor.start(application.clone(), shard_mailbox.clone(), resolver); ValidatorSetup { application, mailbox, extra: shard_mailbox, height, + actor_handle, } } @@ -1302,7 +2546,7 @@ impl TestHarness for CodingHarness { round: Round, block: &CodedBlock, Sha256>, ) { - handle.mailbox.proposed(round, block.clone()).await; + assert!(handle.mailbox.verified(round, block.clone()).await); } async fn verify( @@ -1311,7 +2555,15 @@ impl TestHarness for CodingHarness { block: &CodedBlock, Sha256>, _all_handles: &mut [ValidatorHandle], ) { - handle.mailbox.verified(round, block.clone()).await; + assert!(handle.mailbox.verified(round, block.clone()).await); + } + + async fn certify( + handle: &mut ValidatorHandle, + round: Round, + block: &CodedBlock, Sha256>, + ) -> bool { + handle.mailbox.certified(round, block.clone()).await } fn make_finalization( @@ -1474,7 +2726,7 @@ impl TestHarness for CodingHarness { round: Round, block: &CodedBlock, Sha256>, ) { - handle.mailbox.verified(round, block.clone()).await; + assert!(handle.mailbox.verified(round, block.clone()).await); } } diff --git a/consensus/src/marshal/mocks/verifying.rs b/consensus/src/marshal/mocks/verifying.rs index 0b392eb0c2e..2a8b44902ae 100644 --- a/consensus/src/marshal/mocks/verifying.rs +++ b/consensus/src/marshal/mocks/verifying.rs @@ -9,17 +9,24 @@ use crate::{ CertifiableBlock, Epochable, }; use commonware_runtime::deterministic; +use commonware_utils::{ + channel::{fallible::OneshotExt, oneshot}, + sync::Mutex, +}; +use std::{marker::PhantomData, sync::Arc}; /// A mock application that implements `VerifyingApplication` for testing. /// /// This mock: /// - Returns the provided genesis block from `genesis()` -/// - Returns `None` from `propose()` (never proposes) +/// - Returns the configured block (if any) from `propose()` /// - Returns a configurable result from `verify()` #[derive(Clone)] pub struct MockVerifyingApp { /// The genesis block to return. 
pub genesis: B, + /// The block returned by `propose`. If `None`, `propose` returns `None`. + pub propose_result: Option, /// The result returned by `verify`. pub verify_result: bool, _phantom: std::marker::PhantomData, @@ -30,6 +37,7 @@ impl MockVerifyingApp { pub fn new(genesis: B) -> Self { Self { genesis, + propose_result: None, verify_result: true, _phantom: std::marker::PhantomData, } @@ -39,10 +47,17 @@ impl MockVerifyingApp { pub fn with_verify_result(genesis: B, verify_result: bool) -> Self { Self { genesis, + propose_result: None, verify_result, _phantom: std::marker::PhantomData, } } + + /// Configure the block returned by `propose`. + pub fn with_propose_result(mut self, block: B) -> Self { + self.propose_result = Some(block); + self + } } impl crate::Application for MockVerifyingApp @@ -64,7 +79,7 @@ where _context: (deterministic::Context, Self::Context), _ancestry: AncestorStream, ) -> Option { - None + self.propose_result.clone() } } @@ -82,3 +97,80 @@ where self.verify_result } } + +/// A verifying mock application whose `verify()` signals `started` on entry and +/// blocks until `release` is received. Used to deterministically control when +/// the application verdict races with marshal shutdown. +#[derive(Clone)] +pub struct GatedVerifyingApp { + genesis: B, + started: Arc>>>, + release: Arc>>>, + _phantom: PhantomData, +} + +impl GatedVerifyingApp { + /// Returns the gated app, a `started` receiver fired when `verify()` is entered, + /// and a `release` sender that unblocks `verify()` once signaled. 
+ pub fn new(genesis: B) -> (Self, oneshot::Receiver<()>, oneshot::Sender<()>) { + let (started_tx, started_rx) = oneshot::channel(); + let (release_tx, release_rx) = oneshot::channel(); + ( + Self { + genesis, + started: Arc::new(Mutex::new(Some(started_tx))), + release: Arc::new(Mutex::new(Some(release_rx))), + _phantom: PhantomData, + }, + started_rx, + release_tx, + ) + } +} + +impl crate::Application for GatedVerifyingApp +where + B: CertifiableBlock + Clone + Send + Sync + 'static, + B::Context: Epochable + Clone + Send + Sync + 'static, + S: commonware_cryptography::certificate::Scheme + Clone + Send + Sync + 'static, +{ + type Block = B; + type Context = B::Context; + type SigningScheme = S; + + async fn genesis(&mut self) -> Self::Block { + self.genesis.clone() + } + + async fn propose>( + &mut self, + _context: (deterministic::Context, Self::Context), + _ancestry: AncestorStream, + ) -> Option { + None + } +} + +impl crate::VerifyingApplication for GatedVerifyingApp +where + B: CertifiableBlock + Clone + Send + Sync + 'static, + B::Context: Epochable + Clone + Send + Sync + 'static, + S: commonware_cryptography::certificate::Scheme + Clone + Send + Sync + 'static, +{ + async fn verify>( + &mut self, + _context: (deterministic::Context, Self::Context), + _ancestry: AncestorStream, + ) -> bool { + if let Some(started) = self.started.lock().take() { + started.send_lossy(()); + } + let release = self + .release + .lock() + .take() + .expect("release receiver missing"); + let _ = release.await; + true + } +} diff --git a/consensus/src/marshal/mod.rs b/consensus/src/marshal/mod.rs index 3a2401fb377..71546bcac82 100644 --- a/consensus/src/marshal/mod.rs +++ b/consensus/src/marshal/mod.rs @@ -139,5 +139,9 @@ pub enum Update { /// /// Because the [Acknowledgement] is clonable, the application can pass [Update] to multiple consumers /// (and marshal will only consider the block delivered once all consumers have acknowledged it). 
+ /// + /// Marshal only emits a block after it has durably persisted that block. This ensures applications + /// that make stateful changes based on a block in other locations can access the same block on restart (often + /// some logic on startup attempts an infallible read of the last processed block). Block(B, A), } diff --git a/consensus/src/marshal/resolver/handler.rs b/consensus/src/marshal/resolver/handler.rs index df54ec15ab9..f0ffdcc0910 100644 --- a/consensus/src/marshal/resolver/handler.rs +++ b/consensus/src/marshal/resolver/handler.rs @@ -342,6 +342,16 @@ mod tests { assert_eq!(decoded, Request::Notarized { round }); } + #[test] + fn test_subject_decode_rejects_invalid_enum_tag() { + let bad = [3u8]; + let mut buf = bad.as_ref(); + assert!(matches!( + Request::::read(&mut buf), + Err(CodecError::InvalidEnum(3)) + )); + } + #[test] fn test_subject_hash() { use std::collections::HashSet; diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index f16d18f23e7..d9b29d067aa 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -74,7 +74,7 @@ use crate::{ marshal::{ ancestry::AncestorStream, application::{ - validation::{is_inferred_reproposal_at_certify, LastBuilt}, + validation::{is_inferred_reproposal_at_certify, Stage}, verification_tasks::VerificationTasks, }, core::Mailbox, @@ -93,17 +93,15 @@ use crate::{ }; use commonware_cryptography::{certificate::Scheme, Digestible}; use commonware_macros::select; +use commonware_p2p::Recipients; use commonware_runtime::{ telemetry::metrics::histogram::{Buckets, Timed}, Clock, Metrics, Spawner, }; -use commonware_utils::{ - channel::{fallible::OneshotExt, oneshot}, - sync::Mutex, -}; +use commonware_utils::channel::{fallible::OneshotExt, oneshot}; use rand::Rng; use std::sync::Arc; -use tracing::{debug, warn}; +use tracing::debug; /// An [`Application`] adapter that handles epoch transitions and validates 
block ancestry. /// @@ -146,7 +144,6 @@ where application: A, marshal: Mailbox>, epocher: ES, - last_built: LastBuilt, verification_tasks: VerificationTasks<::Digest>, build_duration: Timed, @@ -182,7 +179,6 @@ where application, marshal, epocher, - last_built: Arc::new(Mutex::new(None)), verification_tasks: VerificationTasks::new(), build_duration, @@ -203,6 +199,7 @@ where &mut self, context: ::Context, block: B, + stage: Stage, ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); @@ -226,6 +223,7 @@ where &mut application, &mut marshal, &mut tx, + stage, ) .await { @@ -291,15 +289,15 @@ where /// boundary block to avoid creating blocks that would be invalidated by the epoch transition. /// /// The proposal operation is spawned in a background task and returns a receiver that will - /// contain the proposed block's digest when ready. The built block is cached for later - /// broadcasting. + /// contain the proposed block's digest when ready. The built block is persisted via + /// [`Mailbox::verified`] before the digest is delivered, so consensus can rely on the + /// block surviving restart. async fn propose( &mut self, consensus_context: Context, ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); - let last_built = self.last_built.clone(); let epocher = self.epocher.clone(); // Metrics @@ -310,6 +308,41 @@ where .with_label("propose") .with_attribute("round", consensus_context.round) .spawn(move |runtime_context| async move { + // On leader recovery, marshal may already hold a verified block + // for this round (persisted by a pre-crash propose whose + // notarize vote never reached the journal). + // + // Building a fresh block would land on the same prunable archive + // index and be silently dropped, so the stored block is the only proposal + // we can broadcast for this round. 
+ // + // The recovered block is safe to reuse only if its embedded + // context matches the context simplex just recovered. Otherwise the + // cached block was built against a different parent and cannot be + // broadcast under the current header, so drop the receiver + // and let the voter nullify the view via timeout. + if let Some(block) = marshal.get_verified(consensus_context.round).await { + let block_context = block.context(); + if block_context != consensus_context { + debug!( + round = ?consensus_context.round, + ?consensus_context, + ?block_context, + "skipping proposal: cached verified block context no longer matches" + ); + return; + } + let digest = block.digest(); + let success = tx.send_lossy(digest); + debug!( + round = ?consensus_context.round, + ?digest, + success, + "reused verified block from marshal on leader recovery" + ); + return; + } + let (parent_view, parent_digest) = consensus_context.parent; let parent_request = fetch_parent( parent_digest, @@ -348,11 +381,14 @@ where .expect("current epoch should exist"); if parent.height() == last_in_epoch { let digest = parent.digest(); - { - let mut lock = last_built.lock(); - *lock = Some((consensus_context.round, parent)); + if !marshal.verified(consensus_context.round, parent).await { + debug!( + round = ?consensus_context.round, + ?digest, + "marshal rejected re-proposed boundary block" + ); + return; } - let success = tx.send_lossy(digest); debug!( round = ?consensus_context.round, @@ -393,11 +429,14 @@ where build_timer.observe(); let digest = built_block.digest(); - { - let mut lock = last_built.lock(); - *lock = Some((consensus_context.round, built_block)); + if !marshal.verified(consensus_context.round, built_block).await { + debug!( + round = ?consensus_context.round, + ?digest, + "marshal rejected proposed block" + ); + return; } - let success = tx.send_lossy(digest); debug!( round = ?consensus_context.round, @@ -451,7 +490,7 @@ where // Re-proposals return early and skip normal 
parent/height checks // because they were already verified when originally proposed and // parent-child checks would fail by construction when parent == block. - let block = match precheck_epoch_and_reproposal( + let Some(decision) = precheck_epoch_and_reproposal( &marshaled.epocher, &mut marshal, &context, @@ -459,10 +498,14 @@ where block, ) .await - { + else { + return; + }; + let block = match decision { Decision::Complete(valid) => { if valid { - // Valid re-proposal. Create a completed verification task for `certify`. + // A valid re-proposal needs no further ancestry validation, but + // `certify` still expects a completed verification task. let round = context.round; let (task_tx, task_rx) = oneshot::channel(); task_tx.send_lossy(true); @@ -496,7 +539,7 @@ where // Begin the rest of the verification process asynchronously. let round = context.round; - let task = marshaled.deferred_verify(context, block); + let task = marshaled.deferred_verify(context, block, Stage::Verified); marshaled.verification_tasks.insert(round, digest, task); tx.send_lossy(true); @@ -580,14 +623,19 @@ where round, ); if is_reproposal { - // NOTE: It is possible that, during crash recovery, we call `marshal.verified` - // twice for the same block. That function is idempotent, so this is safe. - marshaled.marshal.verified(round, block).await; + // Certifier holds a notarization for this block, so route + // the write to the notarized cache. `certified` is + // idempotent, so crash-recovery double-invocation is safe. 
+ if !marshaled.marshal.certified(round, block).await { + debug!(?round, "marshal unable to accept block"); + return; + } tx.send_lossy(true); return; } - let verify_rx = marshaled.deferred_verify(embedded_context, block); + let verify_rx = + marshaled.deferred_verify(embedded_context, block, Stage::Certified); if let Ok(result) = verify_rx.await { tx.send_lossy(result); } @@ -609,33 +657,11 @@ where type Plan = Plan; async fn broadcast(&mut self, digest: Self::Digest, plan: Plan) { - match plan { - Plan::Propose => { - let Some((round, block)) = self.last_built.lock().take() else { - warn!("missing block to broadcast"); - return; - }; - if block.digest() != digest { - warn!( - round = %round, - digest = %block.digest(), - height = %block.height(), - "skipping requested broadcast of block with mismatched digest" - ); - return; - } - debug!( - round = %round, - digest = %block.digest(), - height = %block.height(), - "requested broadcast of built block" - ); - self.marshal.proposed(round, block).await; - } - Plan::Forward { round, peers } => { - self.marshal.forward(round, digest, peers).await; - } - } + let (round, recipients) = match plan { + Plan::Propose { round } => (round, Recipients::All), + Plan::Forward { round, recipients } => (round, recipients), + }; + self.marshal.forward(round, digest, recipients).await; } } @@ -669,12 +695,13 @@ mod tests { default_leader, make_raw_block, setup_network_with_participants, Ctx, StandardHarness, TestHarness, B, BLOCKS_PER_EPOCH, NAMESPACE, NUM_VALIDATORS, S, V, }, - verifying::MockVerifyingApp, + verifying::{GatedVerifyingApp, MockVerifyingApp}, }, simplex::scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::{Epoch, Epocher, FixedEpocher, Height, Round, View}, Automaton, CertifiableAutomaton, }; + use commonware_broadcast::Broadcaster; use commonware_cryptography::{ certificate::{mocks::Fixture, ConstantProvider}, sha256::Sha256, @@ -682,7 +709,7 @@ mod tests { }; use commonware_macros::{select, 
test_traced}; use commonware_runtime::{deterministic, Clock, Metrics, Runner}; - use commonware_utils::NZUsize; + use commonware_utils::{channel::fallible::OneshotExt, NZUsize}; use std::time::Duration; #[test_traced("INFO")] @@ -722,10 +749,11 @@ mod tests { // Create parent block at height 1 let parent = make_raw_block(genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - marshal - .clone() - .proposed(Round::new(Epoch::new(0), View::new(1)), parent.clone()) - .await; + assert!( + marshal + .verified(Round::new(Epoch::new(0), View::new(1)), parent.clone()) + .await + ); // Block A at view 5 (height 2) let round_a = Round::new(Epoch::new(0), View::new(5)); @@ -736,7 +764,7 @@ mod tests { }; let block_a = B::new::(context_a.clone(), parent_digest, Height::new(2), 200); let commitment_a = block_a.digest(); - marshal.clone().proposed(round_a, block_a.clone()).await; + assert!(marshal.verified(round_a, block_a.clone()).await); // Block B at view 10 (height 2, different block same height) let round_b = Round::new(Epoch::new(0), View::new(10)); @@ -747,7 +775,7 @@ mod tests { }; let block_b = B::new::(context_b.clone(), parent_digest, Height::new(2), 300); let commitment_b = block_b.digest(); - marshal.clone().proposed(round_b, block_b.clone()).await; + assert!(marshal.verified(round_b, block_b.clone()).await); context.sleep(Duration::from_millis(10)).await; @@ -854,10 +882,12 @@ mod tests { let parent = B::new::(parent_ctx.clone(), genesis.digest(), Height::new(19), 1000); let parent_digest = parent.digest(); - marshal - .clone() - .proposed(Round::new(Epoch::zero(), View::new(19)), parent.clone()) - .await; + assert!( + marshal + .clone() + .verified(Round::new(Epoch::zero(), View::new(19)), parent.clone()) + .await + ); // Create a block at height 20 (first block in epoch 1, which is NOT supported) let unsupported_round = Round::new(Epoch::new(1), View::new(20)); @@ -873,10 +903,12 @@ mod tests { 2000, ); let block_commitment = 
block.digest(); - marshal - .clone() - .proposed(unsupported_round, block.clone()) - .await; + assert!( + marshal + .clone() + .verified(unsupported_round, block.clone()) + .await + ); context.sleep(Duration::from_millis(10)).await; @@ -943,10 +975,12 @@ mod tests { }; let parent = B::new::(parent_ctx, genesis.digest(), Height::new(1), 100); let parent_commitment = parent.digest(); - marshal - .clone() - .proposed(Round::new(Epoch::zero(), View::new(1)), parent.clone()) - .await; + assert!( + marshal + .clone() + .verified(Round::new(Epoch::zero(), View::new(1)), parent.clone()) + .await + ); // Build a block with context A (embedded in the block). let round_a = Round::new(Epoch::zero(), View::new(2)); @@ -957,7 +991,7 @@ mod tests { }; let block_a = B::new::(context_a, parent.digest(), Height::new(2), 200); let commitment_a = block_a.digest(); - marshal.clone().proposed(round_a, block_a).await; + assert!(marshal.verified(round_a, block_a).await); context.sleep(Duration::from_millis(10)).await; @@ -983,4 +1017,229 @@ mod tests { } }) } + + /// Regression: `certify` resolving true drives the finalize vote, so it must imply + /// the block is durably persisted. In deferred mode `verify()` spawns the + /// `deferred_verify` background task and `certify()` returns that same receiver; the + /// persistence ack happens inside `verify_with_parent` after `app.verify` returns. + /// + /// The gated app holds `app.verify()` open until the test releases it, so we can + /// abort the marshal actor deterministically after the optimistic path has run but + /// before the persistence-ack path runs. With the ack in place `verified()` returns + /// false once the actor is gone, `verify_with_parent` returns `None`, and the tx is + /// dropped unresolved; we assert the certify receiver errors. 
+ #[test_traced("WARN")] + fn test_deferred_certify_does_not_bypass_failed_verify_persistence() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + + let setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let buffer = setup.extra; + let marshal_actor_handle = setup.actor_handle; + + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + let (mock_app, verify_started, release_verify): (GatedVerifyingApp, _, _) = + GatedVerifyingApp::new(genesis.clone()); + let mut marshaled = Deferred::new( + context.clone(), + mock_app, + marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + // Seed parent and child via the buffer (in-memory only) so + // `deferred_verify` can fetch them without going through the + // persisted marshal path. 
+ let parent = make_raw_block(genesis.digest(), Height::new(1), 100); + let parent_digest = parent.digest(); + + let child_round = Round::new(Epoch::zero(), View::new(2)); + let child_ctx = Ctx { + round: child_round, + leader: me, + parent: (View::new(1), parent_digest), + }; + let child = B::new::(child_ctx.clone(), parent_digest, Height::new(2), 200); + let child_digest = child.digest(); + + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), parent) + .await + .await + .expect("buffer broadcast for parent should ack"); + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), child) + .await + .await + .expect("buffer broadcast for child should ack"); + + // Kick off the optimistic verify, which spawns `deferred_verify`. + // Its gated `app.verify` blocks until we release it, giving us a + // deterministic window to abort the marshal actor. + let _optimistic_rx = marshaled.verify(child_ctx, child_digest).await; + let certify_rx = marshaled.certify(child_round, child_digest).await; + verify_started + .await + .expect("verify should reach application before marshal abort"); + marshal_actor_handle.abort(); + release_verify.send_lossy(()); + + select! { + result = certify_rx => { + assert!( + result.is_err(), + "certify must not resolve after marshal.verified loses its persistence ack" + ); + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("certify should terminate after marshal abort"); + }, + } + }); + } + + /// Regression: when marshal holds a verified block for a round from a + /// pre-crash propose, a restarted leader's `propose` must return that + /// block's digest instead of asking the application to build afresh. + /// See `standard::inline::tests::test_propose_reuses_verified_block_on_restart`. 
+ #[test_traced("WARN")] + fn test_propose_reuses_verified_block_on_restart() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + let round = Round::new(Epoch::zero(), View::new(1)); + let ctx = Ctx { + round, + leader: me.clone(), + parent: (View::zero(), genesis.digest()), + }; + let block_a = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 100); + let digest_a = block_a.digest(); + assert!(marshal.verified(round, block_a.clone()).await); + + let block_b = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 200); + let digest_b = block_b.digest(); + assert_ne!(digest_a, digest_b, "test requires distinct digests"); + + let mock_app: MockVerifyingApp = + MockVerifyingApp::new(genesis.clone()).with_propose_result(block_b); + let mut marshaled = Deferred::new( + context.clone(), + mock_app, + marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + let digest_rx = marshaled.propose(ctx).await; + let digest = digest_rx.await.expect("propose must return a digest"); + assert_eq!( + digest, digest_a, + "propose must reuse the block marshal already persisted for this round" + ); + }); + } + + /// Regression: if a pre-crash leader persisted a verified block for a + /// round but the simplex `Notarize` never reached the journal, replay + /// can recover a `consensus_context` whose parent differs from the one + /// the cached block was built 
against (e.g. a late certification of an + /// older view changes the parent selected by `State::find_parent`). + /// In that case the restarted leader must not broadcast the stale + /// cached block; it must drop the receiver so the voter nullifies the + /// view via `MissingProposal`. + #[test_traced("WARN")] + fn test_propose_skips_when_verified_block_context_changed() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + + // Stash a stale block built against genesis as its parent at round V=2. + let round = Round::new(Epoch::zero(), View::new(2)); + let stale_ctx = Ctx { + round, + leader: me.clone(), + parent: (View::zero(), genesis.digest()), + }; + let stale_block = B::new::(stale_ctx, genesis.digest(), Height::new(1), 100); + assert!(marshal.verified(round, stale_block).await); + + // Simulate a replay where parent selection now points to a + // different parent view than the cached block was built for. 
+ let new_parent_digest = Sha256::hash(b"late-certified-parent"); + let new_ctx = Ctx { + round, + leader: me.clone(), + parent: (View::new(1), new_parent_digest), + }; + + let mock_app: MockVerifyingApp = MockVerifyingApp::new(genesis.clone()); + let mut marshaled = Deferred::new( + context.clone(), + mock_app, + marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + let digest_rx = marshaled.propose(new_ctx).await; + assert!( + digest_rx.await.is_err(), + "propose must drop the receiver when the cached block's context no longer matches" + ); + }); + } } diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 6c7cc8da2ae..bc1e6f63a64 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -45,7 +45,7 @@ use crate::{ marshal::{ ancestry::AncestorStream, - application::validation::LastBuilt, + application::validation::Stage, core::Mailbox, standard::{ validation::{ @@ -62,6 +62,7 @@ use crate::{ }; use commonware_cryptography::certificate::Scheme; use commonware_macros::select; +use commonware_p2p::Recipients; use commonware_runtime::{ telemetry::metrics::histogram::{Buckets, Timed}, Clock, Metrics, Spawner, @@ -73,7 +74,7 @@ use commonware_utils::{ use prometheus_client::metrics::histogram::Histogram; use rand::Rng; use std::{collections::BTreeSet, sync::Arc}; -use tracing::{debug, warn}; +use tracing::debug; /// Tracks `(round, digest)` pairs for which `verify` has already fetched the /// block, so `certify` can return immediately without re-subscribing to marshal. @@ -141,7 +142,6 @@ where application: A, marshal: Mailbox>, epocher: ES, - last_built: LastBuilt, available_blocks: AvailableBlocks, build_duration: Timed, @@ -162,8 +162,7 @@ where { /// Creates a new inline-verification wrapper. /// - /// Registers a `build_duration` histogram for proposal latency and initializes - /// the shared "last built block" cache used by [`Relay::broadcast`]. 
+ /// Registers a `build_duration` histogram for proposal latency. pub fn new(context: E, application: A, marshal: Mailbox>, epocher: ES) -> Self { let build_histogram = Histogram::new(Buckets::LOCAL); context.register( @@ -178,7 +177,6 @@ where application, marshal, epocher, - last_built: Arc::new(Mutex::new(None)), available_blocks: Arc::new(Mutex::new(BTreeSet::new())), build_duration, } @@ -224,15 +222,15 @@ where /// Proposes a new block or re-proposes an epoch boundary block. /// /// Proposal runs in a spawned task and returns a receiver for the resulting digest. - /// Built/re-proposed blocks are cached in `last_built` so relay can broadcast - /// exactly what was proposed. + /// The built block is persisted via [`Mailbox::verified`] before the digest is + /// delivered, so a digest received from `propose()` implies the block is + /// recoverable after restart. async fn propose( &mut self, consensus_context: Context, ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); - let last_built = self.last_built.clone(); let epocher = self.epocher.clone(); let build_duration = self.build_duration.clone(); @@ -241,6 +239,28 @@ where .with_label("propose") .with_attribute("round", consensus_context.round) .spawn(move |runtime_context| async move { + // On leader recovery, marshal may already hold a verified block + // for this round (persisted by a pre-crash propose whose + // notarize vote never reached the journal). + // + // The parent context recovered by simplex may differ from the one + // the cached block was built against, so the stored block is not safe to reuse + // and building a fresh block would land on the same prunable + // archive index and be silently dropped. + // + // Skip this view and let the voter nullify it via timeout. 
+ if marshal + .get_verified(consensus_context.round) + .await + .is_some() + { + debug!( + round = ?consensus_context.round, + "skipping proposal: verified block already exists for round on restart" + ); + return; + } + let (parent_view, parent_digest) = consensus_context.parent; let parent_request = fetch_parent( parent_digest, @@ -277,11 +297,14 @@ where .expect("current epoch should exist"); if parent.height() == last_in_epoch { let digest = parent.digest(); - { - let mut lock = last_built.lock(); - *lock = Some((consensus_context.round, parent)); + if !marshal.verified(consensus_context.round, parent).await { + debug!( + round = ?consensus_context.round, + ?digest, + "marshal rejected re-proposed boundary block" + ); + return; } - let success = tx.send_lossy(digest); debug!( round = ?consensus_context.round, @@ -322,9 +345,13 @@ where build_timer.observe(); let digest = built_block.digest(); - { - let mut lock = last_built.lock(); - *lock = Some((consensus_context.round, built_block)); + if !marshal.verified(consensus_context.round, built_block).await { + debug!( + round = ?consensus_context.round, + ?digest, + "marshal rejected proposed block" + ); + return; } let success = tx.send_lossy(digest); debug!( @@ -362,7 +389,6 @@ where .with_label("inline_verify") .with_attribute("round", context.round) .spawn(move |runtime_context| async move { - // If block can be fetched, mark it as available. 
let block_request = marshal .subscribe_by_digest(Some(context.round), digest) .await; @@ -371,7 +397,6 @@ where else { return; }; - available_blocks.lock().insert((context.round, digest)); // Shared pre-checks: // - Blocks are invalid if they are not in the expected epoch and are @@ -380,18 +405,17 @@ where // - Re-proposals skip normal parent/height checks because: // 1) the block was already verified when originally proposed // 2) parent-child checks would fail by construction when parent == block - let block = match precheck_epoch_and_reproposal( - &epocher, - &mut marshal, - &context, - digest, - block, - ) - .await - { + let Some(decision) = + precheck_epoch_and_reproposal(&epocher, &mut marshal, &context, digest, block) + .await + else { + return; + }; + let block = match decision { Decision::Complete(valid) => { - // `Complete` means either an immediate reject or a valid - // re-proposal accepted without further ancestry checks. + if valid { + available_blocks.lock().insert((context.round, digest)); + } tx.send_lossy(valid); return; } @@ -403,6 +427,7 @@ where // // The helper returns `None` when work should stop early (for example, // receiver closed or parent unavailable). + let round = context.round; let application_valid = match verify_with_parent( runtime_context, context, @@ -410,12 +435,16 @@ where &mut application, &mut marshal, &mut tx, + Stage::Verified, ) .await { Some(valid) => valid, None => return, }; + if application_valid { + available_blocks.lock().insert((round, digest)); + } tx.send_lossy(application_valid); }); rx @@ -437,7 +466,17 @@ where ES: Epocher, { async fn certify(&mut self, round: Round, digest: Self::Digest) -> oneshot::Receiver { - // If block was already seen, return immediately. + // Verify has already run for this (round, digest) and its + // success was recorded in `available_blocks`. 
`verify` does not mark a + // round available until `marshal.verified(round, block)` has returned, + // and that call blocks on `put_sync` of the block into the round's + // verified cache. Because the verified and notarized caches share the + // same pruning schedule (both advance together to `min_view`), the + // block is already durable for this round and re-persisting it into + // the notarized cache would be a redundant `put_sync`. The slow path + // below persists through the notarized cache because in that case + // verify has not run locally and the block may be held only in the + // broadcast buffer, which is not durable. if self.available_blocks.lock().contains(&(round, digest)) { let (tx, rx) = oneshot::channel(); tx.send_lossy(true); @@ -445,18 +484,25 @@ where } // Otherwise, subscribe to marshal for block availability. - // - // TODO(#3393): Avoid fetching the block just to check if it's available. let block_rx = self.marshal.subscribe_by_digest(Some(round), digest).await; + let marshal = self.marshal.clone(); let (mut tx, rx) = oneshot::channel(); self.context .with_label("inline_certify") .with_attribute("round", round) .spawn(move |_| async move { - if await_block_subscription(&mut tx, block_rx, &digest, "certification") - .await - .is_some() - { + let Some(block) = + await_block_subscription(&mut tx, block_rx, &digest, "certification").await + else { + return; + }; + + // `certify` resolving true drives the finalize vote, so mere + // buffered availability is not sufficient here. Persist the + // block through marshal before signaling success. The caller + // holds a notarization for this block, so route it into the + // notarized cache directly rather than the verified cache. 
+ if marshal.certified(round, block).await { tx.send_lossy(true); } }); @@ -481,27 +527,11 @@ where type Plan = Plan; async fn broadcast(&mut self, digest: Self::Digest, plan: Plan) { - match plan { - Plan::Propose => { - let Some((round, block)) = self.last_built.lock().take() else { - warn!("missing block to broadcast"); - return; - }; - if block.digest() != digest { - warn!( - round = %round, - digest = %block.digest(), - height = %block.height(), - "skipping requested broadcast of block with mismatched digest" - ); - return; - } - self.marshal.proposed(round, block).await; - } - Plan::Forward { round, peers } => { - self.marshal.forward(round, digest, peers).await; - } - } + let (round, recipients) = match plan { + Plan::Propose { round } => (round, Recipients::All), + Plan::Forward { round, recipients } => (round, recipients), + }; + self.marshal.forward(round, digest, recipients).await; } } @@ -536,12 +566,13 @@ mod tests { default_leader, make_raw_block, setup_network_with_participants, Ctx, StandardHarness, TestHarness, B, BLOCKS_PER_EPOCH, NAMESPACE, NUM_VALIDATORS, S, V, }, - verifying::MockVerifyingApp, + verifying::{GatedVerifyingApp, MockVerifyingApp}, }, simplex::{scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::Context}, types::{Epoch, FixedEpocher, Height, Round, View}, Automaton, Block, CertifiableAutomaton, Relay, VerifyingApplication, }; + use commonware_broadcast::Broadcaster; use commonware_cryptography::{ certificate::{mocks::Fixture, ConstantProvider, Scheme}, sha256::Sha256, @@ -549,7 +580,7 @@ mod tests { }; use commonware_macros::{select, test_traced}; use commonware_runtime::{deterministic, Clock, Metrics, Runner, Spawner}; - use commonware_utils::NZUsize; + use commonware_utils::{channel::fallible::OneshotExt, NZUsize}; use rand::Rng; use std::time::Duration; @@ -618,7 +649,7 @@ mod tests { }; let parent = B::new::(parent_ctx, genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - 
marshal.clone().proposed(parent_round, parent).await; + assert!(marshal.verified(parent_round, parent).await); let round = Round::new(Epoch::zero(), View::new(2)); let verify_context = Ctx { @@ -629,7 +660,7 @@ mod tests { let block = B::new::(verify_context.clone(), parent_digest, Height::new(2), 200); let digest = block.digest(); - marshal.clone().proposed(round, block).await; + assert!(marshal.verified(round, block).await); // Complete verify first so the block is already available locally. let verify_rx = inline.verify(verify_context, digest).await; @@ -696,7 +727,7 @@ mod tests { }; let parent = B::new::(parent_ctx, genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - marshal.clone().proposed(parent_round, parent).await; + assert!(marshal.verified(parent_round, parent).await); let round = Round::new(Epoch::zero(), View::new(2)); let verify_context = Ctx { @@ -707,7 +738,7 @@ mod tests { let block = B::new::(verify_context.clone(), parent_digest, Height::new(2), 200); let digest = block.digest(); - marshal.clone().proposed(round, block).await; + assert!(marshal.verified(round, block).await); // Certify should still resolve by waiting on marshal block availability directly. let certify_rx = inline.certify(round, digest).await; @@ -725,4 +756,503 @@ mod tests { } }); } + + #[test_traced("INFO")] + fn test_certify_reproposal_uses_available_blocks_after_verify() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let marshal_actor_handle = setup.actor_handle; + + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + let mock_app: MockVerifyingApp = MockVerifyingApp::new(genesis.clone()); + let mut inline = Inline::new( + context.clone(), + mock_app, + marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + let boundary_height = Height::new(BLOCKS_PER_EPOCH.get() - 1); + let boundary_round = Round::new(Epoch::zero(), View::new(boundary_height.get())); + let boundary_block = B::new::( + Ctx { + round: boundary_round, + leader: default_leader(), + parent: (View::zero(), genesis.digest()), + }, + genesis.digest(), + boundary_height, + 1900, + ); + let boundary_digest = boundary_block.digest(); + assert!(marshal.verified(boundary_round, boundary_block).await); + + let reproposal_round = Round::new(Epoch::zero(), View::new(boundary_height.get() + 1)); + let reproposal_context = Ctx { + round: reproposal_round, + leader: me, + parent: (View::new(boundary_height.get()), boundary_digest), + }; + + let verify_rx = inline.verify(reproposal_context, boundary_digest).await; + assert!( + verify_rx.await.unwrap(), + "verify should accept a valid boundary re-proposal" + ); + + marshal_actor_handle.abort(); + drop(marshal); + context.sleep(Duration::from_millis(1)).await; + + let certify_rx = inline.certify(reproposal_round, boundary_digest).await; + select! 
{ + result = certify_rx => { + assert!( + result.unwrap(), + "certify should use the available_blocks fast path for verified re-proposals" + ); + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("certify should not depend on marshal after verify cached a re-proposal"); + }, + } + }); + } + + /// Regression: in inline mode, `verify` itself returns true after running + /// app verification. That return value drives the notarize vote, so it + /// must imply "block is durably persisted" -- otherwise a crash between + /// vote and persistence leaves the validator having voted for a block it + /// cannot serve. + /// + /// As with the deferred-mode test, the parent and child are seeded via + /// the buffered broadcast layer (in-memory only), bypassing + /// `marshal.proposed` which would already persist them. + #[test_traced("WARN")] + fn test_inline_verify_persists_block_before_resolving() { + for seed in 0u64..16 { + inline_verify_persists_block_before_resolving_at(seed); + } + } + + fn inline_verify_persists_block_before_resolving_at(seed: u64) { + let runner = deterministic::Runner::new( + deterministic::Config::new() + .with_seed(seed) + .with_timeout(Some(Duration::from_secs(60))), + ); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + + let setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let buffer = setup.extra; + let actor_handle = setup.actor_handle; + + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + let mock_app: MockVerifyingApp = MockVerifyingApp::new(genesis.clone()); + + let mut inline = Inline::new( + context.clone(), + mock_app, + marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + // Build parent (height 1) and child (height 2). Seed both into + // the buffered broadcast cache (in-memory only). + let parent = make_raw_block(genesis.digest(), Height::new(1), 100); + let parent_digest = parent.digest(); + + let child_round = Round::new(Epoch::zero(), View::new(2)); + let child_ctx = Ctx { + round: child_round, + leader: me.clone(), + parent: (View::new(1), parent_digest), + }; + let child = B::new::(child_ctx.clone(), parent_digest, Height::new(2), 200); + let child_digest = child.digest(); + + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), parent.clone()) + .await + .await + .expect("buffer broadcast for parent should ack"); + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), child.clone()) + .await + .await + .expect("buffer broadcast for child should ack"); + + // Inline verify runs full validation inline and returns true only + // after `marshal.verified` is enqueued. With the persistence-ack + // fix, that enqueue blocks until put_sync completes. 
+ let verify_result = inline + .verify(child_ctx, child_digest) + .await + .await + .expect("verify result missing"); + assert!(verify_result, "inline verify should pass"); + + // Abort the marshal actor synchronously, with no + // intervening await. If verify returned true but the actor had + // only enqueued (not processed) the `Verified` message, this + // abort kills the actor before persistence completes. + actor_handle.abort(); + drop(inline); + drop(marshal); + drop(buffer); + + // Restart from the same partition. The block must be durably + // persisted - otherwise the validator would have voted notarize + // for a block it cannot serve from local storage. + let setup2 = StandardHarness::setup_validator( + context.with_label("validator_0_restart"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal2 = setup2.mailbox; + + let post_restart = marshal2.get_block(&child_digest).await; + assert!( + post_restart.is_some(), + "verify resolved true so block must be durably persisted (seed={seed})" + ); + }); + } + + /// Regression: `certify` resolving true drives the finalize vote in inline + /// mode, so it must imply the block is durably persisted even when the + /// certify path subscribed before `verify()` finished. + #[test_traced("WARN")] + fn test_inline_certify_persists_block_before_resolving() { + for seed in 0u64..16 { + inline_certify_persists_block_before_resolving_at(seed); + } + } + + fn inline_certify_persists_block_before_resolving_at(seed: u64) { + let runner = deterministic::Runner::new( + deterministic::Config::new() + .with_seed(seed) + .with_timeout(Some(Duration::from_secs(60))), + ); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + + let setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let buffer = setup.extra; + let actor_handle = setup.actor_handle; + + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + let mock_app: MockVerifyingApp = MockVerifyingApp::new(genesis.clone()); + let mut inline = Inline::new( + context.clone(), + mock_app, + marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + let parent = make_raw_block(genesis.digest(), Height::new(1), 100); + let parent_digest = parent.digest(); + + let child_round = Round::new(Epoch::zero(), View::new(2)); + let child_ctx = Ctx { + round: child_round, + leader: me.clone(), + parent: (View::new(1), parent_digest), + }; + let child = B::new::(child_ctx.clone(), parent_digest, Height::new(2), 200); + let child_digest = child.digest(); + + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), parent.clone()) + .await + .await + .expect("buffer broadcast for parent should ack"); + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), child.clone()) + .await + .await + .expect("buffer broadcast for child should ack"); + + let verify_rx = inline.verify(child_ctx, child_digest).await; + let certify_result = inline + .certify(child_round, child_digest) + .await + .await + .expect("certify result missing"); + assert!(certify_result, "certify should succeed"); + + actor_handle.abort(); + drop(verify_rx); + drop(inline); + drop(marshal); + drop(buffer); + + let setup2 = StandardHarness::setup_validator( + context.with_label("validator_0_restart"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) 
+ .await; + let marshal2 = setup2.mailbox; + + let post_restart = marshal2.get_block(&child_digest).await; + assert!( + post_restart.is_some(), + "certify resolved true so block must be durably persisted (seed={seed})" + ); + }); + } + + #[test_traced("WARN")] + fn test_inline_certify_does_not_bypass_failed_verify_persistence() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + + let setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let buffer = setup.extra; + let marshal_actor_handle = setup.actor_handle; + + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + let (mock_app, verify_started, release_verify): (GatedVerifyingApp, _, _) = + GatedVerifyingApp::new(genesis.clone()); + let mut inline = Inline::new( + context.clone(), + mock_app, + marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + let parent = make_raw_block(genesis.digest(), Height::new(1), 100); + let parent_digest = parent.digest(); + + let child_round = Round::new(Epoch::zero(), View::new(2)); + let child_ctx = Ctx { + round: child_round, + leader: me.clone(), + parent: (View::new(1), parent_digest), + }; + let child = B::new::(child_ctx.clone(), parent_digest, Height::new(2), 200); + let child_digest = child.digest(); + + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), parent) + .await + .await + .expect("buffer broadcast for parent should ack"); + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), child) + .await + .await + .expect("buffer broadcast for 
child should ack"); + + let verify_rx = inline.verify(child_ctx, child_digest).await; + verify_started + .await + .expect("verify should reach application before marshal abort"); + marshal_actor_handle.abort(); + release_verify.send_lossy(()); + + select! { + result = verify_rx => { + assert!( + result.is_err(), + "verify must not resolve after marshal.verified loses its persistence ack" + ); + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("verify should terminate after marshal abort"); + }, + } + + let certify_rx = inline.certify(child_round, child_digest).await; + select! { + result = certify_rx => { + assert!( + result.is_err(), + "certify must not bypass failed verify persistence via stale availability" + ); + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("certify should terminate after marshal abort"); + }, + } + + drop(inline); + drop(marshal); + drop(buffer); + + let setup2 = StandardHarness::setup_validator( + context.with_label("validator_0_restart"), + &mut oracle, + me, + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal2 = setup2.mailbox; + + let post_restart = marshal2.get_block(&child_digest).await; + assert!( + post_restart.is_none(), + "failed marshal.verified ack must not leave a durably recoverable block" + ); + }); + } + + /// Regression: if marshal persisted a verified block for a round before + /// a crash (via a prior `propose` call) but the simplex notarize artifact + /// never reached the journal, the restarted leader must skip proposing + /// for that round. The cached block was built against a parent context + /// that replay may have changed, so reusing it can broadcast a proposal + /// whose payload no longer matches the recovered header. Building a + /// fresh block would also be unsafe because the prunable archive silently + /// drops the second write at the same view index. Dropping the receiver + /// lets the voter nullify the view via `MissingProposal`. 
+ #[test_traced("WARN")] + fn test_propose_skips_when_verified_block_exists_on_restart() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let round = Round::new(Epoch::zero(), View::new(1)); + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + let ctx = Ctx { + round, + leader: me.clone(), + parent: (View::zero(), genesis.digest()), + }; + + // Pre-crash: seed `verified_blocks[V=1]` through the live mailbox, + // mirroring an aborted pre-crash `Inline::propose` that persisted + // its verified block before the voter could journal a notarize. + let pre_setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let pre_marshal = pre_setup.mailbox; + let pre_actor = pre_setup.actor_handle; + let pre_extra = pre_setup.extra; + let pre_application = pre_setup.application; + + let stale_block = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 100); + assert!(pre_marshal.verified(round, stale_block).await); + + // Simulate a crash: abort the actor and drop every handle so the + // storage partition is fully released before reopening. + pre_actor.abort(); + drop(pre_marshal); + drop(pre_extra); + drop(pre_application); + + // Post-crash: reopen the same partition. The verified block must + // be recovered from storage during archive restore so that + // `Message::GetVerified` on the new mailbox observes it. 
+ let post_setup = StandardHarness::setup_validator( + context.with_label("validator_0_restart"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let post_marshal = post_setup.mailbox; + + let fresh_block = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 200); + let mock_app: MockVerifyingApp = + MockVerifyingApp::new(genesis.clone()).with_propose_result(fresh_block); + let mut inline = Inline::new( + context.clone(), + mock_app, + post_marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + let digest_rx = inline.propose(ctx).await; + assert!( + digest_rx.await.is_err(), + "propose must drop the receiver so the voter nullifies the round via timeout" + ); + }); + } } diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index 9162c1dcba9..01935371232 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -57,14 +57,14 @@ mod tests { verifying::MockVerifyingApp, }, resolver::handler, - Identifier, + Identifier, Update, }, simplex::{ scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::{Finalization, Proposal}, }, types::{Epoch, Epocher, FixedEpocher, Height, Round, View, ViewDelta}, - Automaton, CertifiableAutomaton, Heightable, + Automaton, CertifiableAutomaton, Heightable, Reporter, }; use bytes::Bytes; use commonware_broadcast::buffered; @@ -74,8 +74,11 @@ mod tests { sha256::Sha256, Digestible, Hasher as _, }; - use commonware_macros::{test_group, test_traced}; - use commonware_p2p::simulated::{self, Network}; + use commonware_macros::{select, test_group, test_traced}; + use commonware_p2p::{ + simulated::{self, Network}, + Recipients, + }; use commonware_parallel::Sequential; use commonware_resolver::Resolver; use commonware_runtime::{ @@ -87,12 +90,14 @@ mod tests { translator::{EightCap, TwoCap}, }; use commonware_utils::{ - channel::{mpsc, oneshot}, + channel::{fallible::OneshotExt, mpsc, oneshot}, + 
sync::Mutex, vec::NonEmptyVec, NZUsize, NZU16, NZU64, }; use std::{ num::{NonZeroU32, NonZeroU64, NonZeroUsize}, + sync::Arc, time::Duration, }; @@ -106,6 +111,18 @@ mod tests { assert_eq!(r1, r2); } + fn assert_hailstorm_deterministic(seed: u64) { + let r1 = harness::hailstorm::(seed, 4, 4, 1, LINK); + let r2 = harness::hailstorm::(seed, 4, 4, 1, LINK); + assert_eq!(r1, r2); + } + + fn assert_hailstorm_multi_deterministic(seed: u64) { + let r1 = harness::hailstorm::(seed, 4, 4, 2, LINK); + let r2 = harness::hailstorm::(seed, 4, 4, 2, LINK); + assert_eq!(r1, r2); + } + #[test_group("slow")] #[test_traced("WARN")] fn test_standard_finalize_good_links() { @@ -142,6 +159,24 @@ mod tests { } } + #[test_group("slow")] + #[test_traced("WARN")] + fn test_standard_hailstorm_restarts() { + for seed in 0..2 { + assert_hailstorm_deterministic::(seed); + assert_hailstorm_deterministic::(seed); + } + } + + #[test_group("slow")] + #[test_traced("WARN")] + fn test_standard_hailstorm_multi_restarts() { + for seed in 0..2 { + assert_hailstorm_multi_deterministic::(seed); + assert_hailstorm_multi_deterministic::(seed); + } + } + #[test_traced("WARN")] fn test_standard_ack_pipeline_backlog() { harness::ack_pipeline_backlog::(); @@ -154,6 +189,42 @@ mod tests { harness::ack_pipeline_backlog_persists_on_restart::(); } + #[test_traced("WARN")] + fn test_standard_proposed_success_implies_recoverable_after_restart() { + harness::proposed_success_implies_recoverable_after_restart::(0..16); + harness::proposed_success_implies_recoverable_after_restart::(0..16); + } + + #[test_traced("WARN")] + fn test_standard_verified_success_implies_recoverable_after_restart() { + harness::verified_success_implies_recoverable_after_restart::(0..16); + harness::verified_success_implies_recoverable_after_restart::(0..16); + } + + #[test_traced("WARN")] + fn test_standard_certify_persists_equivocated_block() { + harness::certify_persists_equivocated_block::(); + 
harness::certify_persists_equivocated_block::(); + } + + #[test_traced("WARN")] + fn test_standard_certified_success_implies_recoverable_after_restart() { + harness::certified_success_implies_recoverable_after_restart::(0..16); + harness::certified_success_implies_recoverable_after_restart::(0..16); + } + + #[test_traced("WARN")] + fn test_standard_certify_at_later_view_survives_earlier_view_pruning() { + harness::certify_at_later_view_survives_earlier_view_pruning::(); + harness::certify_at_later_view_survives_earlier_view_pruning::(); + } + + #[test_traced("WARN")] + fn test_standard_delivery_visibility_implies_recoverable_after_restart() { + harness::delivery_visibility_implies_recoverable_after_restart::(0..16); + harness::delivery_visibility_implies_recoverable_after_restart::(0..16); + } + #[test_traced("WARN")] fn test_standard_sync_height_floor() { harness::sync_height_floor::(); @@ -412,12 +483,16 @@ mod tests { ) .await .mailbox; - peer_mailbox - .proposed(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) - .await; - peer_mailbox - .proposed(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) - .await; + assert!( + peer_mailbox + .verified(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) + .await + ); + assert!( + peer_mailbox + .verified(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) + .await + ); StandardHarness::report_finalization(&mut peer_mailbox, finalization_two.clone()).await; context.sleep(Duration::from_millis(200)).await; @@ -505,15 +580,21 @@ mod tests { ) .await .mailbox; - peer_mailbox - .proposed(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) - .await; - peer_mailbox - .proposed(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) - .await; - peer_mailbox - .proposed(Round::new(Epoch::zero(), View::new(3)), block_three.clone()) - .await; + assert!( + peer_mailbox + .verified(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) + .await + ); + assert!( + peer_mailbox + 
.verified(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) + .await + ); + assert!( + peer_mailbox + .verified(Round::new(Epoch::zero(), View::new(3)), block_three.clone()) + .await + ); StandardHarness::report_finalization(&mut peer_mailbox, finalization_two.clone()).await; StandardHarness::report_finalization(&mut peer_mailbox, finalization_three.clone()) .await; @@ -691,12 +772,14 @@ mod tests { .await .mailbox; for (i, block) in blocks.iter().enumerate() { - peer_mailbox - .proposed( - Round::new(Epoch::zero(), View::new(block.height().get())), - (*block).clone(), - ) - .await; + assert!( + peer_mailbox + .verified( + Round::new(Epoch::zero(), View::new(block.height().get())), + (*block).clone(), + ) + .await + ); StandardHarness::report_finalization(&mut peer_mailbox, finalizations[i].clone()) .await; } @@ -1120,10 +1203,12 @@ mod tests { 1900, ); let boundary_digest = boundary_block.digest(); - marshal - .clone() - .proposed(boundary_round, boundary_block.clone()) - .await; + assert!( + marshal + .clone() + .verified(boundary_round, boundary_block.clone()) + .await + ); context.sleep(Duration::from_millis(10)).await; @@ -1190,10 +1275,12 @@ mod tests { 1900, ); let boundary_digest = boundary_block.digest(); - marshal - .clone() - .proposed(boundary_round, boundary_block) - .await; + assert!( + marshal + .clone() + .verified(boundary_round, boundary_block) + .await + ); context.sleep(Duration::from_millis(10)).await; @@ -1227,10 +1314,12 @@ mod tests { 1000, ); let non_boundary_digest = non_boundary_block.digest(); - marshal - .clone() - .proposed(non_boundary_round, non_boundary_block) - .await; + assert!( + marshal + .clone() + .verified(non_boundary_round, non_boundary_block) + .await + ); context.sleep(Duration::from_millis(10)).await; @@ -1330,10 +1419,12 @@ mod tests { 200, ); let malformed_digest = malformed_block.digest(); - marshal - .clone() - .proposed(malformed_round, malformed_block) - .await; + assert!( + marshal + .clone() + 
.verified(malformed_round, malformed_block) + .await + ); context.sleep(Duration::from_millis(10)).await; @@ -1371,7 +1462,7 @@ mod tests { let parent = B::new::(parent_context, genesis.digest(), Height::new(1), 300); let parent_digest = parent.digest(); - marshal.clone().proposed(parent_round, parent).await; + assert!(marshal.verified(parent_round, parent).await); let mismatch_round = Round::new(Epoch::zero(), View::new(3)); let mismatched_context = Ctx { @@ -1386,10 +1477,12 @@ mod tests { 400, ); let mismatched_digest = mismatched_block.digest(); - marshal - .clone() - .proposed(mismatch_round, mismatched_block) - .await; + assert!( + marshal + .clone() + .verified(mismatch_round, mismatched_block) + .await + ); context.sleep(Duration::from_millis(10)).await; @@ -1462,7 +1555,7 @@ mod tests { }; let parent = B::new::(parent_context, genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - marshal.clone().proposed(parent_round, parent).await; + assert!(marshal.verified(parent_round, parent).await); // 2) Publish a valid child; only application-level verification should fail. let round = Round::new(Epoch::zero(), View::new(2)); @@ -1473,7 +1566,7 @@ mod tests { }; let block = B::new::(verify_context.clone(), parent_digest, Height::new(2), 200); let digest = block.digest(); - marshal.clone().proposed(round, block).await; + assert!(marshal.verified(round, block).await); context.sleep(Duration::from_millis(10)).await; @@ -1505,33 +1598,267 @@ mod tests { } } - /// A no-op resolver used by tests that drive the marshal actor's - /// resolver_rx channel directly. Outbound fetches/cancellations are dropped. + /// Recorded `send` call on the [`RecordingBuffer`]. + type BufferSend = (Round, B, Recipients); + + /// A buffer that records each `send` invocation; other methods are no-ops. 
#[derive(Clone, Default)] - struct NoopResolver; + struct RecordingBuffer { + sends: Arc>>, + } + + impl RecordingBuffer { + fn sends(&self) -> Vec { + self.sends.lock().clone() + } + } + + impl crate::marshal::core::Buffer> for RecordingBuffer { + type PublicKey = PublicKey; + type CachedBlock = B; - impl Resolver for NoopResolver { + async fn find_by_digest(&self, _digest: D) -> Option { + None + } + + async fn find_by_commitment(&self, _commitment: D) -> Option { + None + } + + async fn subscribe_by_digest(&self, _digest: D) -> oneshot::Receiver { + let (_sender, receiver) = oneshot::channel(); + receiver + } + + async fn subscribe_by_commitment( + &self, + _commitment: D, + ) -> oneshot::Receiver { + let (_sender, receiver) = oneshot::channel(); + receiver + } + + async fn finalized(&self, _commitment: D) {} + + async fn send(&self, round: Round, block: B, recipients: Recipients) { + self.sends.lock().push((round, block, recipients)); + } + } + + /// Recorded `fetch_targeted` call on the [`RecordingResolver`]. + type TargetedFetch = (handler::Request, NonEmptyVec); + + /// A resolver that records each `fetch_targeted` invocation; other + /// methods are no-ops. + /// + /// `_keepalive` optionally retains a resolver-message sender so the + /// actor's corresponding receiver stays alive when nothing else owns it. 
+ #[derive(Clone, Default)] + struct RecordingResolver { + targeted: Arc>>, + _keepalive: Option>>, + } + + impl RecordingResolver { + fn holding(sender: mpsc::Sender>) -> Self { + Self { + targeted: Arc::new(Mutex::new(Vec::new())), + _keepalive: Some(sender), + } + } + + fn targeted(&self) -> Vec { + self.targeted.lock().clone() + } + + fn targeted_is_empty(&self) -> bool { + self.targeted.lock().is_empty() + } + } + + impl Resolver for RecordingResolver { type Key = handler::Request; type PublicKey = PublicKey; async fn fetch(&mut self, _key: Self::Key) {} async fn fetch_all(&mut self, _keys: Vec) {} - async fn fetch_targeted( - &mut self, - _key: Self::Key, - _targets: NonEmptyVec, - ) { + async fn fetch_targeted(&mut self, key: Self::Key, targets: NonEmptyVec) { + self.targeted.lock().push((key, targets)); } async fn fetch_all_targeted( &mut self, - _requests: Vec<(Self::Key, NonEmptyVec)>, + requests: Vec<(Self::Key, NonEmptyVec)>, ) { + self.targeted.lock().extend(requests); } async fn cancel(&mut self, _key: Self::Key) {} async fn clear(&mut self) {} async fn retain(&mut self, _predicate: impl Fn(&Self::Key) -> bool + Send + 'static) {} } + /// Poll `cond` on a 10ms tick until it returns true, panicking on timeout. + async fn wait_until bool>( + context: &deterministic::Context, + deadline: Duration, + label: &str, + mut cond: F, + ) { + let start = context.current(); + while !cond() { + if context.current().duration_since(start).unwrap_or_default() > deadline { + panic!("{label} did not hold within {deadline:?}"); + } + context.sleep(Duration::from_millis(10)).await; + } + } + + /// A reporter that blocks inside `Update::Block` so tests can abort marshal + /// exactly when application delivery starts. 
+ #[derive(Clone)] + struct GatedBlockReporter { + started: Arc>>>, + release: Arc>>>, + } + + impl GatedBlockReporter { + fn new() -> (Self, oneshot::Receiver, oneshot::Sender<()>) { + let (started_tx, started_rx) = oneshot::channel(); + let (release_tx, release_rx) = oneshot::channel(); + ( + Self { + started: Arc::new(Mutex::new(Some(started_tx))), + release: Arc::new(Mutex::new(Some(release_rx))), + }, + started_rx, + release_tx, + ) + } + } + + impl Reporter for GatedBlockReporter { + type Activity = Update; + + async fn report(&mut self, activity: Self::Activity) { + match activity { + Update::Block(block, _ack) => { + if let Some(started) = self.started.lock().take() { + started.send_lossy(block.height()); + } + let release = self.release.lock().take(); + if let Some(release) = release { + let _ = release.await; + } + } + Update::Tip(_, _, _) => {} + } + } + } + + async fn start_standard_actor( + context: deterministic::Context, + partition_prefix: &str, + provider: ConstantProvider, + application: R, + buffer: Buf, + ) -> ( + Mailbox>, + Buf, + RecordingResolver, + commonware_runtime::Handle<()>, + ) + where + R: Reporter>, + Buf: crate::marshal::core::Buffer, PublicKey = PublicKey, CachedBlock = B> + + Clone, + { + let config = Config { + provider, + epocher: FixedEpocher::new(BLOCKS_PER_EPOCH), + mailbox_size: 100, + view_retention_timeout: ViewDelta::new(10), + max_repair: NZUsize!(10), + max_pending_acks: NZUsize!(1), + block_codec_config: (), + partition_prefix: partition_prefix.to_string(), + prunable_items_per_section: NZU64!(10), + replay_buffer: NZUsize!(1024), + key_write_buffer: NZUsize!(1024), + value_write_buffer: NZUsize!(1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + strategy: Sequential, + }; + let finalizations_by_height = immutable::Archive::init( + context.with_label("finalizations_by_height"), + immutable::Config { + metadata_partition: format!("{partition_prefix}-finalizations-by-height-metadata"), + 
freezer_table_partition: format!( + "{partition_prefix}-finalizations-by-height-freezer-table" + ), + freezer_table_initial_size: 64, + freezer_table_resize_frequency: 10, + freezer_table_resize_chunk_size: 10, + freezer_key_partition: format!( + "{partition_prefix}-finalizations-by-height-freezer-key" + ), + freezer_key_page_cache: config.page_cache.clone(), + freezer_value_partition: format!( + "{partition_prefix}-finalizations-by-height-freezer-value" + ), + freezer_value_target_size: 1024, + freezer_value_compression: None, + ordinal_partition: format!("{partition_prefix}-finalizations-by-height-ordinal"), + items_per_section: NZU64!(10), + codec_config: S::certificate_codec_config_unbounded(), + replay_buffer: config.replay_buffer, + freezer_key_write_buffer: config.key_write_buffer, + freezer_value_write_buffer: config.value_write_buffer, + ordinal_write_buffer: config.key_write_buffer, + }, + ) + .await + .expect("failed to initialize finalizations by height archive"); + let finalized_blocks = immutable::Archive::init( + context.with_label("finalized_blocks"), + immutable::Config { + metadata_partition: format!("{partition_prefix}-finalized_blocks-metadata"), + freezer_table_partition: format!( + "{partition_prefix}-finalized_blocks-freezer-table" + ), + freezer_table_initial_size: 64, + freezer_table_resize_frequency: 10, + freezer_table_resize_chunk_size: 10, + freezer_key_partition: format!("{partition_prefix}-finalized_blocks-freezer-key"), + freezer_key_page_cache: config.page_cache.clone(), + freezer_value_partition: format!( + "{partition_prefix}-finalized_blocks-freezer-value" + ), + freezer_value_target_size: 1024, + freezer_value_compression: None, + ordinal_partition: format!("{partition_prefix}-finalized_blocks-ordinal"), + items_per_section: NZU64!(10), + codec_config: config.block_codec_config, + replay_buffer: config.replay_buffer, + freezer_key_write_buffer: config.key_write_buffer, + freezer_value_write_buffer: config.value_write_buffer, + 
ordinal_write_buffer: config.key_write_buffer, + }, + ) + .await + .expect("failed to initialize finalized blocks archive"); + let (actor, mailbox, _) = Actor::init( + context.clone(), + finalizations_by_height, + finalized_blocks, + config, + ) + .await; + let (resolver_tx, resolver_rx) = mpsc::channel(100); + let resolver = RecordingResolver::holding(resolver_tx); + let actor_handle = + actor.start(application, buffer.clone(), (resolver_rx, resolver.clone())); + (mailbox, buffer, resolver, actor_handle) + } + /// When the provider has no verifier for an epoch, in-flight deliveries /// for that epoch must be acknowledged (`true`) so the serving peer is /// not blamed, rather than rejected (`false`). @@ -1634,7 +1961,7 @@ mod tests { actor.start( Application::::default(), buffer, - (resolver_rx, NoopResolver), + (resolver_rx, RecordingResolver::default()), ); // Inject a Finalized delivery with garbage payload. The @@ -1669,6 +1996,95 @@ mod tests { }); } + /// Regression: application delivery of a finalized block must only happen + /// after the finalized archives are durably synced. Otherwise a crash in + /// the delivery callback can expose a block to another subsystem that then + /// persists derived state ahead of marshal's height-indexed finalization. + #[test_traced("WARN")] + fn test_standard_dispatches_finalized_blocks_after_sync() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + let partition_prefix = format!("validator-{me}"); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = make_raw_block(Sha256::hash(b""), Height::new(1), 100); + let finalization = StandardHarness::make_finalization( + Proposal::new(round, View::zero(), block.digest()), + &schemes, + QUORUM, + ); + + let (application, started, release) = GatedBlockReporter::new(); + let (mut mailbox, _buffer, _resolver, actor_handle) = start_standard_actor( + context.with_label("validator_0"), + &partition_prefix, + ConstantProvider::new(schemes[0].clone()), + application, + RecordingBuffer::default(), + ) + .await; + + assert!( + mailbox.verified(round, block.clone()).await, + "verified block should persist to the cache" + ); + StandardHarness::report_finalization(&mut mailbox, finalization.clone()).await; + + select! { + height = started => { + assert_eq!( + height.expect("delivery signal missing"), + Height::new(1), + "application should observe the first finalized block" + ); + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("application should observe block delivery promptly"); + }, + } + + actor_handle.abort(); + let _ = release.send_lossy(()); + drop(mailbox); + + // Yield once so the aborted actor drops its storage handles before restart. 
+ context.sleep(Duration::from_millis(1)).await; + + let (mailbox, _buffer, _resolver, _actor_handle) = start_standard_actor( + context.with_label("validator_0_restart"), + &partition_prefix, + ConstantProvider::new(schemes[0].clone()), + Application::::manual_ack(), + RecordingBuffer::default(), + ) + .await; + + let recovered = mailbox + .get_block(Height::new(1)) + .await + .expect("finalized block must be durable before delivery"); + assert_eq!( + recovered.digest(), + block.digest(), + "restart should recover the delivered finalized block by height" + ); + assert_eq!( + mailbox + .get_finalization(Height::new(1)) + .await + .expect("finalization must be durable before delivery") + .round(), + round, + "restart should recover the delivered finalization by height" + ); + }); + } + /// Parse the `processed_height` gauge value from a prometheus-encoded /// metrics dump produced by `Metrics::encode`. Looks for any line of the /// form `processed_height `. @@ -1801,4 +2217,272 @@ mod tests { } }); } + + /// `Forward` for an unknown commitment must early-return without + /// dispatching, even when peers are provided. + #[test_traced("WARN")] + fn test_standard_forward_unknown_block_is_noop() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + let round = Round::new(Epoch::zero(), View::new(1)); + let unknown = Sha256::hash(b"unknown-block"); + + let (mailbox, buffer, _resolver, _actor_handle) = start_standard_actor( + context.with_label("validator_0"), + &format!("forward-unknown-{me}"), + ConstantProvider::new(schemes[0].clone()), + Application::::manual_ack(), + RecordingBuffer::default(), + ) + .await; + + mailbox + .forward( + round, + unknown, + Recipients::Some(vec![participants[1].clone()]), + ) + .await; + context.sleep(Duration::from_millis(50)).await; + + assert!( + buffer.sends().is_empty(), + "forward for an unknown block must not dispatch" + ); + }); + } + + /// `Forward` for a block that marshal has cached must dispatch that block + /// to exactly the provided peer set via the buffer. + #[test_traced("WARN")] + fn test_standard_forward_cached_block_sends_to_peers() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = make_raw_block(Sha256::hash(b""), Height::new(1), 100); + let digest = block.digest(); + + let (mailbox, buffer, _resolver, _actor_handle) = start_standard_actor( + context.with_label("validator_0"), + &format!("forward-cached-{me}"), + ConstantProvider::new(schemes[0].clone()), + Application::::manual_ack(), + RecordingBuffer::default(), + ) + .await; + + assert!(mailbox.verified(round, block.clone()).await); + + let targets = vec![participants[1].clone(), participants[2].clone()]; + mailbox + .forward(round, digest, Recipients::Some(targets.clone())) + .await; + + wait_until(&context, Duration::from_secs(5), "buffer.send", || { + !buffer.sends.lock().is_empty() + }) + .await; + + let sends = buffer.sends(); + assert_eq!(sends.len(), 1); + let (sent_round, sent_block, sent_recipients) = &sends[0]; + assert_eq!(*sent_round, round); + assert_eq!(sent_block.digest(), digest); + match sent_recipients { + Recipients::Some(peers) => assert_eq!(peers, &targets), + other => panic!("expected Recipients::Some, got {other:?}"), + } + }); + } + + /// `HintFinalized` at or below the floor must be a no-op: marshal must + /// not fire a targeted resolver fetch since the hint is stale. + #[test_traced("WARN")] + fn test_standard_hint_finalized_below_floor_is_noop() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + + let (mailbox, _buffer, resolver, _actor_handle) = start_standard_actor( + context.with_label("validator_0"), + &format!("hint-below-floor-{me}"), + ConstantProvider::new(schemes[0].clone()), + Application::::manual_ack(), + RecordingBuffer::default(), + ) + .await; + + // Raise the floor above the hint we are about to send. + mailbox.set_floor(Height::new(10)).await; + context.sleep(Duration::from_millis(50)).await; + + mailbox + .hint_finalized(Height::new(5), NonEmptyVec::new(participants[1].clone())) + .await; + context.sleep(Duration::from_millis(50)).await; + + assert!( + resolver.targeted_is_empty(), + "hint at or below floor must not fetch" + ); + }); + } + + /// `HintFinalized` for a height whose finalization is already durable must + /// be a no-op: marshal already has everything needed and must not + /// initiate a redundant fetch. + #[test_traced("WARN")] + fn test_standard_hint_finalized_skips_when_already_finalized() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = make_raw_block(Sha256::hash(b""), Height::new(1), 100); + let finalization = StandardHarness::make_finalization( + Proposal::new(round, View::zero(), block.digest()), + &schemes, + QUORUM, + ); + + let (mut mailbox, _buffer, resolver, _actor_handle) = start_standard_actor( + context.with_label("validator_0"), + &format!("hint-already-final-{me}"), + ConstantProvider::new(schemes[0].clone()), + Application::::manual_ack(), + RecordingBuffer::default(), + ) + .await; + + assert!(mailbox.verified(round, block.clone()).await); + StandardHarness::report_finalization(&mut mailbox, finalization).await; + + // Wait until marshal has durably stored the finalization. + while mailbox.get_finalization(Height::new(1)).await.is_none() { + context.sleep(Duration::from_millis(10)).await; + } + + mailbox + .hint_finalized(Height::new(1), NonEmptyVec::new(participants[1].clone())) + .await; + context.sleep(Duration::from_millis(50)).await; + + assert!( + resolver.targeted_is_empty(), + "hint for a locally-finalized height must not fetch" + ); + }); + } + + /// `HintFinalized` above the floor for a not-yet-finalized height must + /// trigger exactly one targeted fetch via the resolver. + #[test_traced("WARN")] + fn test_standard_hint_finalized_emits_targeted_fetch() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + + let (mailbox, _buffer, resolver, _actor_handle) = start_standard_actor( + context.with_label("validator_0"), + &format!("hint-targets-{me}"), + ConstantProvider::new(schemes[0].clone()), + Application::::manual_ack(), + RecordingBuffer::default(), + ) + .await; + + let target = participants[1].clone(); + mailbox + .hint_finalized(Height::new(7), NonEmptyVec::new(target.clone())) + .await; + + wait_until(&context, Duration::from_secs(5), "fetch_targeted", || { + !resolver.targeted.lock().is_empty() + }) + .await; + + let targeted = resolver.targeted(); + assert_eq!(targeted.len(), 1); + let (request, targets) = &targeted[0]; + assert_eq!( + request, + &handler::Request::Finalized { + height: Height::new(7) + } + ); + assert_eq!(&targets[..], &[target]); + }); + } + + /// `Prune` for a height above the floor must be rejected (warn + continue) + /// and must not advance the floor or alter the finalized archive contents. + #[test_traced("WARN")] + fn test_standard_prune_above_floor_is_rejected() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = make_raw_block(Sha256::hash(b""), Height::new(1), 100); + let finalization = StandardHarness::make_finalization( + Proposal::new(round, View::zero(), block.digest()), + &schemes, + QUORUM, + ); + + let (mut mailbox, _buffer, _resolver, _actor_handle) = start_standard_actor( + context.with_label("validator_0"), + &format!("prune-above-floor-{me}"), + ConstantProvider::new(schemes[0].clone()), + Application::::manual_ack(), + RecordingBuffer::default(), + ) + .await; + + assert!(mailbox.verified(round, block.clone()).await); + StandardHarness::report_finalization(&mut mailbox, finalization).await; + + while mailbox.get_finalization(Height::new(1)).await.is_none() { + context.sleep(Duration::from_millis(10)).await; + } + + // Prune above the floor must be a no-op, not an error. + mailbox.prune(Height::new(100)).await; + context.sleep(Duration::from_millis(50)).await; + + // The finalized block and its finalization must still be retrievable. 
+ assert!(mailbox.get_block(Height::new(1)).await.is_some()); + assert!(mailbox.get_finalization(Height::new(1)).await.is_some()); + }); + } } diff --git a/consensus/src/marshal/standard/validation.rs b/consensus/src/marshal/standard/validation.rs index d53a3211514..673f8e418a5 100644 --- a/consensus/src/marshal/standard/validation.rs +++ b/consensus/src/marshal/standard/validation.rs @@ -2,7 +2,7 @@ use crate::{ marshal::{ ancestry::AncestorStream, application::validation::{ - has_contiguous_height, is_block_in_expected_epoch, is_valid_reproposal_at_verify, + has_contiguous_height, is_block_in_expected_epoch, is_valid_reproposal_at_verify, Stage, }, core::Mailbox, standard::Standard, @@ -72,7 +72,7 @@ pub(super) async fn precheck_epoch_and_reproposal( context: &Context, digest: B::Digest, block: B, -) -> Decision +) -> Option> where ES: Epocher, S: Scheme, @@ -84,7 +84,7 @@ where height = %block.height(), "block height not in expected epoch" ); - return Decision::Complete(false); + return Some(Decision::Complete(false)); } // Re-proposals are signaled by `digest == context.parent.1`. @@ -97,14 +97,16 @@ where height = %block.height(), "re-proposal is not at epoch boundary" ); - return Decision::Complete(false); + return Some(Decision::Complete(false)); } - marshal.verified(context.round, block).await; - return Decision::Complete(true); + if !marshal.verified(context.round, block).await { + return None; + } + return Some(Decision::Complete(true)); } - Decision::Continue(block) + Some(Decision::Continue(block)) } /// Runs the shared non-reproposal verification flow. 
@@ -123,6 +125,7 @@ pub(super) async fn verify_with_parent( application: &mut A, marshal: &mut Mailbox>, tx: &mut oneshot::Sender, + stage: Stage, ) -> Option where E: Rng + Spawner + Metrics + Clock, @@ -199,8 +202,9 @@ where valid = validity_request => valid, }; - if application_valid { - marshal.verified(context.round, block).await; + if application_valid && !stage.store(marshal, context.round, block).await { + debug!(round = ?context.round, "marshal unable to accept block"); + return None; } Some(application_valid) } diff --git a/consensus/src/simplex/actors/batcher/actor.rs b/consensus/src/simplex/actors/batcher/actor.rs index d517e712fe3..46c61ee4cec 100644 --- a/consensus/src/simplex/actors/batcher/actor.rs +++ b/consensus/src/simplex/actors/batcher/actor.rs @@ -14,7 +14,7 @@ use crate::{ }; use commonware_cryptography::Digest; use commonware_macros::select_loop; -use commonware_p2p::{utils::codec::WrappedReceiver, Blocker, Receiver}; +use commonware_p2p::{utils::codec::WrappedReceiver, Blocker, Receiver, Recipients}; use commonware_parallel::Strategy; use commonware_runtime::{ spawn_cell, @@ -249,7 +249,7 @@ where proposal.payload, Plan::Forward { round: proposal.round, - peers, + recipients: Recipients::Some(peers), }, ) .await; diff --git a/consensus/src/simplex/actors/batcher/mod.rs b/consensus/src/simplex/actors/batcher/mod.rs index 733c8963847..6c52e1eab36 100644 --- a/consensus/src/simplex/actors/batcher/mod.rs +++ b/consensus/src/simplex/actors/batcher/mod.rs @@ -98,7 +98,11 @@ mod tests { type Plan = Plan; async fn broadcast(&mut self, payload: Sha256Digest, plan: Self::Plan) { - if let Plan::Forward { round, peers } = plan { + if let Plan::Forward { + round, + recipients: Recipients::Some(peers), + } = plan + { self.broadcasts.lock().push((payload, round, peers)); } } diff --git a/consensus/src/simplex/actors/voter/actor.rs b/consensus/src/simplex/actors/voter/actor.rs index 88fe24d841c..73ed8e9387a 100644 --- 
a/consensus/src/simplex/actors/voter/actor.rs +++ b/consensus/src/simplex/actors/voter/actor.rs @@ -32,7 +32,10 @@ use commonware_utils::{ futures::AbortablePool, }; use core::{future::Future, panic}; -use futures::{pin_mut, StreamExt}; +use futures::{ + future::{ready, Either}, + pin_mut, StreamExt, +}; use prometheus_client::metrics::{counter::Counter, family::Family, histogram::Histogram}; use rand_core::CryptoRngCore; use std::{ @@ -792,6 +795,14 @@ impl< .await; } } + + // We deliberately avoid re-seeding the batcher with our + // own votes (or the votes of other peers) on replay. We assume that + // whatever view we were in during shutdown is no longer the latest + // and we'll quickly jump ahead to a new view. + // + // If this is not the case (cluster-wide shutdown), we will recover + // when timing out. } } self.journal = Some(journal); @@ -850,12 +861,17 @@ impl< } // Attempt to certify any views that we have notarizations for. - for proposal in self.state.certify_candidates() { + for (proposal, is_local) in self.state.certify_candidates() { let round = proposal.round; let view = round.view(); debug!(%view, "attempting certification"); - let receiver = self.automaton.certify(round, proposal.payload).await; - let handle = certify_pool.push(async move { (round, receiver.await) }); + let result = if is_local { + Either::Left(ready(Ok(true))) + } else { + let receiver = self.automaton.certify(round, proposal.payload).await; + Either::Right(receiver) + }; + let handle = certify_pool.push(async move { (round, result.await) }); self.state.set_certify_handle(view, handle); } @@ -872,14 +888,6 @@ impl< }, on_stopped => { debug!("context shutdown, stopping voter"); - - // Sync and drop journal - self.journal - .take() - .unwrap() - .sync_all() - .await - .expect("unable to sync journal"); }, _ = self.context.sleep_until(timeout) => { // Process the timeout @@ -918,8 +926,15 @@ impl< } view = self.state.current_view(); - // Notify application of proposal - 
self.relay.broadcast(proposed, Plan::Propose).await; + // Notify application of proposal. + self.relay + .broadcast( + proposed, + Plan::Propose { + round: context.round, + }, + ) + .await; }, (context, verified) = verify_wait => { // Clear verify waiter @@ -1091,5 +1106,13 @@ impl< } }, } + + // Sync and drop the journal + self.journal + .take() + .expect("journal missing on voter exit") + .sync_all() + .await + .expect("unable to sync journal"); } } diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index b83cad7cf4c..fa5339b26de 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -181,6 +181,41 @@ mod tests { leader_timeout: Duration, certification_timeout: Duration, timeout_retry: Duration, + ) -> ( + Mailbox, + mpsc::Receiver>, + mpsc::Receiver>, + Arc>, + mocks::reporter::Reporter, + ) + where + S: Scheme, + L: ElectorConfig, + { + setup_voter_with_certifier( + context, + oracle, + participants, + schemes, + elector, + leader_timeout, + certification_timeout, + timeout_retry, + mocks::application::Certifier::Always, + ) + .await + } + + #[allow(clippy::too_many_arguments)] + async fn setup_voter_with_certifier( + context: &mut deterministic::Context, + oracle: &commonware_p2p::simulated::Oracle, + participants: &[S::PublicKey], + schemes: &[S], + elector: L, + leader_timeout: Duration, + certification_timeout: Duration, + timeout_retry: Duration, should_certify: mocks::application::Certifier, ) -> ( Mailbox, @@ -352,7 +387,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -586,7 +621,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), 
- should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), app_config); @@ -852,7 +887,6 @@ mod tests { Duration::from_millis(500), Duration::from_secs(1000), Duration::from_secs(1000), - mocks::application::Certifier::Sometimes, ) .await; @@ -973,7 +1007,6 @@ mod tests { Duration::from_millis(500), Duration::from_secs(1000), Duration::from_secs(1000), - mocks::application::Certifier::Sometimes, ) .await; @@ -1109,7 +1142,6 @@ mod tests { Duration::from_millis(500), Duration::from_secs(1000), Duration::from_secs(1000), - mocks::application::Certifier::Sometimes, ) .await; @@ -1238,7 +1270,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (100_000.0, 0.0), // Very slow verification certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -1625,7 +1657,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -2036,7 +2068,6 @@ mod tests { Duration::from_millis(500), Duration::from_secs(1000), Duration::from_secs(1000), - mocks::application::Certifier::Sometimes, ) .await; @@ -2137,7 +2168,6 @@ mod tests { Duration::from_millis(500), Duration::from_secs(1000), Duration::from_secs(1000), - mocks::application::Certifier::Sometimes, ) .await; @@ -3056,7 +3086,6 @@ mod tests { Duration::from_secs(10), Duration::from_secs(10), Duration::from_mins(60), - mocks::application::Certifier::Sometimes, ) .await; @@ -3508,7 +3537,7 @@ mod tests { 
propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Custom(Box::new(move |d| { + should_certify: mocks::application::Certifier::Custom(Box::new(move |_, d| { tracker.lock().push(d); true })), @@ -3644,7 +3673,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Custom(Box::new(move |d| { + should_certify: mocks::application::Certifier::Custom(Box::new(move |_, d| { tracker.lock().push(d); true })), @@ -4153,6 +4182,294 @@ mod tests { no_self_propose_or_verify_after_restart(secp256r1::fixture); } + /// Regression: a leader that crashes after calling `automaton.propose` but + /// before journaling its local `Notarize` must, on restart, issue at most a + /// single `automaton.propose` call for the leader-owned view and exit that + /// view via `Vote::Nullify` instead of retrying proposals through the live + /// run loop. + fn nullify_after_crash_in_propose_window(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"nullify_after_crash_in_propose_window".to_vec(); + let partition = "nullify_after_crash_in_propose_window".to_string(); + let executor = deterministic::Runner::timed(Duration::from_secs(30)); + executor.start(|mut context| async move { + // Set up the simulated network. + let Fixture { + participants, + schemes, + .. + } = fixture(&mut context, &namespace, n); + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + + // RoundRobin with epoch=333, n=5: view 2 -> leader=Participant::new(0) = us. 
+ let target_view = View::new(2); + let me = participants[0].clone(); + let elector = RoundRobin::::default(); + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + let relay = Arc::new(mocks::relay::Relay::new()); + + // Pre-crash: stall every propose response. The leader calls + // `automaton.propose`, the mock parks the request unanswered, and nothing + // is journaled. An observer records that the pre-crash leader + // actually got as far as requesting a proposal so the test knows + // the abort happens inside the propose window rather than before + // the voter even became leader. + let pre_propose_calls: Arc>> = Arc::new(Mutex::new(Vec::new())); + let pre_propose_tracker = pre_propose_calls.clone(); + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Always, + }; + let (mut app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + // Stall (not drop) so the voter's receiver stays open indefinitely. + // Dropping the sender would fire `MissingProposal` and journal a + // `Nullify` before we can abort, which would in turn cause replay + // to skip the propose path entirely post-restart. + app_actor.set_stall_proposals(true); + app_actor.set_propose_observer(Box::new(move |ctx| { + pre_propose_tracker.lock().push(ctx.view()); + })); + app_actor.start(); + + // Build and start the pre-crash voter. `leader_timeout` is long + // enough that the voter won't auto-nullify before we abort, + // guaranteeing the journal contains no `Nullify` either. 
+ let voter_cfg = Config { + scheme: schemes[0].clone(), + elector: elector.clone(), + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter: reporter.clone(), + partition: partition.clone(), + epoch: Epoch::new(333), + mailbox_size: 128, + leader_timeout: Duration::from_secs(600), + certification_timeout: Duration::from_secs(600), + timeout_retry: Duration::from_secs(600), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = Actor::new(context.with_label("voter"), voter_cfg); + let (resolver_sender, _) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + let handle = voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + // Wait for startup, then advance into the leader-owned view. + loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Update { response, .. } => { + response.send(None).unwrap(); + break; + } + batcher::Message::Constructed(_) => {} + } + } + advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; + + // Wait for the pre-crash voter to call `automaton.propose` for + // the leader-owned view. The observer fires before the mock parks + // the response sender, so seeing this entry confirms the voter + // entered the propose window and is now blocked on a response + // that will never arrive. 
Driving the runtime forward with a + // short `context.sleep` lets the voter and application tasks + // progress to their next await points without consuming batcher + // messages we still need for later assertions. + for _ in 0..100 { + if pre_propose_calls.lock().contains(&target_view) { + break; + } + context.sleep(Duration::from_millis(10)).await; + } + assert!( + pre_propose_calls.lock().contains(&target_view), + "pre-crash voter must reach the propose window for the leader-owned view" + ); + + // Crash: abort the voter. Because `propose` never returned, no + // `Notarize` (or any other artifact for the target view) reached + // the journal. + handle.abort(); + + // Post-restart: install a fresh application that instead drops + // `propose` responses. This mirrors the marshal's post-restart + // behavior when `get_verified` sees a cached block for the round + // and deliberately drops the tx, forcing the voter to nullify + // the view rather than reuse the stale block. A propose observer + // on this application is the assertion anchor: it must record + // exactly one call for the target view. 
+ let post_propose_calls: Arc>> = Arc::new(Mutex::new(Vec::new())); + let post_propose_tracker = post_propose_calls.clone(); + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Always, + }; + let (mut app_actor, application) = + mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); + app_actor.set_drop_proposals(true); + app_actor.set_propose_observer(Box::new(move |ctx| { + post_propose_tracker.lock().push(ctx.view()); + })); + app_actor.start(); + + // Build and start the post-restart voter on the same partition + // with a short `leader_timeout` so the nullify path fires promptly + // once the restarted voter has had a chance to issue its single + // (dropped) propose request. + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter, + partition, + epoch: Epoch::new(333), + mailbox_size: 128, + leader_timeout: Duration::from_millis(500), + certification_timeout: Duration::from_secs(600), + timeout_retry: Duration::from_secs(600), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, _mailbox) = Actor::new(context.with_label("voter_restarted"), voter_cfg); + let (resolver_sender, _) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(2, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(3, TEST_QUOTA) + .await + .unwrap(); + voter.start( + batcher::Mailbox::new(batcher_sender), + 
resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + // Wait for replay to complete and confirm we re-entered the + // leader-owned target view. Journal replay saw no notarize for + // this view, so the slot starts empty and the voter will call + // `automaton.propose` from scratch. + loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Update { + current, + leader, + response, + .. + } => { + response.send(None).unwrap(); + assert_eq!(current, target_view); + assert_eq!(leader, Participant::new(0)); + break; + } + batcher::Message::Constructed(_) => {} + } + } + + // Wait for the leader-timeout nullify. This also proves the + // run loop stayed responsive after the dropped propose request: + // the voter did not livelock trying to re-propose, it reached the + // timeout path and emitted the nullify vote. + loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == target_view => + { + break; + } + batcher::Message::Constructed(Vote::Notarize(notarize)) + if notarize.view() == target_view => + { + panic!( + "restarted voter must not emit a new Notarize for the \ + leader-owned view; its stale verified block could \ + still be cached in marshal" + ); + } + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + batcher::Message::Constructed(_) => {} + } + } + + // Assert the core restart invariant: the restarted voter issued + // `automaton.propose` exactly once for the target view and then + // nullified instead of retrying. 
+ let proposed = post_propose_calls.lock(); + let target_call_count = proposed.iter().filter(|v| **v == target_view).count(); + assert_eq!( + target_call_count, 1, + "restarted voter must call automaton.propose exactly once for the \ + leader-owned view before nullifying (observed: {proposed:?})" + ); + }); + } + + #[test_traced] + fn test_nullify_after_crash_in_propose_window() { + nullify_after_crash_in_propose_window(bls12381_threshold_vrf::fixture::); + nullify_after_crash_in_propose_window(bls12381_threshold_vrf::fixture::); + nullify_after_crash_in_propose_window(bls12381_multisig::fixture::); + nullify_after_crash_in_propose_window(bls12381_multisig::fixture::); + nullify_after_crash_in_propose_window(ed25519::fixture); + nullify_after_crash_in_propose_window(secp256r1::fixture); + } + /// After restart, a proposal we already voted on must not be re-verified /// when it is re-delivered to the voter (e.g. via the automaton after /// peer vote aggregation reconstructs it). @@ -4430,67 +4747,79 @@ mod tests { no_self_verify_after_restart(secp256r1::fixture); } - /// Test that in-flight certification requests are cancelled when finalization occurs. + /// When the voter is the leader of a view and later reconstructs a + /// notarization for the proposal it built locally, it must not ask the + /// automaton to certify that same proposal again. /// - /// 1. Use a very long certify latency to ensure certification is in-flight. - /// 2. Send a notarization to trigger certification. - /// 3. Send a finalization for the same view before certification completes. - /// 4. Verify that no Certified message is sent to the resolver. - fn certification_cancelled_on_finalization(mut fixture: F) + /// This is enforced in `actor::run` by short-circuiting certification only + /// when the round carries explicit local proposal evidence, not merely + /// because `leader == me`. 
The test asserts the end-to-end invariant on the + /// live path: a `Finalize` is emitted for the leader-owned view without the + /// certify observer firing for that view. + fn no_self_certify_when_proposing(mut fixture: F) where S: Scheme, F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, - L: ElectorConfig, { let n = 5; let quorum = quorum(n); - let namespace = b"consensus".to_vec(); - let executor = deterministic::Runner::timed(Duration::from_secs(30)); + let namespace = b"no_self_certify_when_proposing".to_vec(); + let partition = "no_self_certify_when_proposing".to_string(); + let executor = deterministic::Runner::timed(Duration::from_secs(10)); executor.start(|mut context| async move { + // Set up the simulated network. let Fixture { participants, schemes, .. } = fixture(&mut context, &namespace, n); - - // Create simulated network let oracle = start_test_network_with_peers(context.clone(), participants.clone(), true).await; + // RoundRobin with epoch=333, n=5: view 2 -> leader=Participant::new(0) = us. + let target_view = View::new(2); let me = participants[0].clone(); - let elector = L::default(); - let reporter_config = mocks::reporter::Config { + let elector = RoundRobin::::default(); + let reporter_cfg = mocks::reporter::Config { participants: participants.clone().try_into().unwrap(), scheme: schemes[0].clone(), elector: elector.clone(), }; let reporter = - mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_config); + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); let relay = Arc::new(mocks::relay::Relay::new()); - let application_cfg = mocks::application::Config { + // Install a certify observer to detect any spurious certify call for + // the leader-owned view. 
+ let certify_calls: Arc>> = Arc::new(Mutex::new(Vec::new())); + let certify_tracker = certify_calls.clone(); + let app_cfg = mocks::application::Config { hasher: Sha256::default(), relay: relay.clone(), me: me.clone(), propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), - certify_latency: (2_000.0, 0.0), // 2 seconds - should_certify: mocks::application::Certifier::Always, + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Custom(Box::new( + move |round, _| { + certify_tracker.lock().push(round.view()); + true + }, + )), }; - let (actor, application) = mocks::application::Application::new( - context.with_label("application"), - application_cfg, - ); - actor.start(); + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + app_actor.start(); - let cfg = Config { + // Build and start the voter wired to the observing application. + let voter_cfg = Config { scheme: schemes[0].clone(), elector, blocker: oracle.control(me.clone()), automaton: application.clone(), relay: application.clone(), - reporter: reporter.clone(), - partition: "cert_cancel_test".to_string(), + reporter, + partition, epoch: Epoch::new(333), mailbox_size: 128, leader_timeout: Duration::from_secs(5), @@ -4501,190 +4830,169 @@ mod tests { write_buffer: NZUsize!(1024 * 1024), page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), }; - let (actor, mut mailbox) = Actor::new(context.clone(), cfg); - - let (resolver_sender, mut resolver_receiver) = mpsc::channel(10); - let resolver = resolver::Mailbox::new(resolver_sender); - - let (batcher_sender, mut batcher_receiver) = mpsc::channel(1024); - let batcher = batcher::Mailbox::new(batcher_sender); - - let (vote_sender, _vote_receiver) = oracle + let (voter, mut mailbox) = Actor::new(context.with_label("voter"), voter_cfg); + let (resolver_sender, _) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, 
_) = oracle .control(me.clone()) .register(0, TEST_QUOTA) .await .unwrap(); - let (certificate_sender, _certificate_receiver) = oracle + let (cert_sender, _) = oracle .control(me.clone()) .register(1, TEST_QUOTA) .await .unwrap(); + voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); - actor.start(batcher, resolver, vote_sender, certificate_sender); - - // Wait for initial batcher notification - if let batcher::Message::Update { response, .. } = - batcher_receiver.recv().await.unwrap() - { - response.send(None).unwrap(); - } - - // Send a notarization for view 5 to trigger certification - let view5 = View::new(5); - let digest5 = Sha256::hash(b"payload_to_certify"); - let proposal5 = - Proposal::new(Round::new(Epoch::new(333), view5), View::new(0), digest5); - - // Broadcast payload - let contents = (proposal5.round, Sha256::hash(b"genesis"), 42u64).encode(); - relay.broadcast(&me, (digest5, contents)); - - // Send proposal to verify - mailbox.proposal(proposal5.clone()).await; - - // Send notarization - let (_, notarization) = build_notarization(&schemes, &proposal5, quorum); - mailbox - .recovered(Certificate::Notarization(notarization)) - .await; - - // Wait for certification to start (it will be slow due to latency) - context.sleep(Duration::from_millis(100)).await; - - // Send finalization for view 5 before certification completes - let (_, finalization) = build_finalization(&schemes, &proposal5, quorum); - mailbox - .recovered(Certificate::Finalization(finalization)) - .await; - - // Wait for finalization to be processed + // Wait for startup, then advance to the leader-owned view. loop { - if let batcher::Message::Update { - finalized, - response, - .. - } = batcher_receiver.recv().await.unwrap() - { - response.send(None).unwrap(); - if finalized >= view5 { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Update { response, .. 
} => { + response.send(None).unwrap(); break; } + batcher::Message::Constructed(_) => {} } } + advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; - // Wait for resolver finalization message (skip other certificates) + // Capture the leader's local notarize so we can resolve the matching + // notarization back into the voter to drive certification. + let proposal = loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Constructed(Vote::Notarize(notarize)) + if notarize.view() == target_view => + { + break notarize.proposal; + } + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + batcher::Message::Constructed(_) => {} + } + }; + let (_, notarization) = build_notarization(&schemes, &proposal, quorum); + mailbox + .resolved(Certificate::Notarization(notarization)) + .await; + + // A finalize for the leader-owned view proves the voter certified its + // own proposal without consulting the automaton. loop { - let msg = resolver_receiver - .recv() - .await - .expect("expected resolver msg"); - match msg { - MailboxMessage::Certificate(Certificate::Finalization(f)) => { - assert_eq!(f.view(), view5); + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Constructed(Vote::Finalize(finalize)) + if finalize.view() == target_view => + { + assert_eq!(finalize.proposal, proposal); break; } - MailboxMessage::Certificate(_) => continue, - MailboxMessage::Certified { .. } => { - panic!("unexpected Certified message before finalization processed") + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == target_view => + { + panic!( + "leader-owned proposal should certify locally instead of nullifying view {target_view}" + ); } + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + batcher::Message::Constructed(_) => {} } } - // Wait longer than certify_latency (2s) to verify certification was cancelled. 
- // If certification wasn't cancelled, it would complete and send a Certified message. - let certified_received = select! { - msg = resolver_receiver.recv() => { - matches!(msg, Some(MailboxMessage::Certified { .. })) - }, - _ = context.sleep(Duration::from_secs(4)) => false, - }; - + // Assert the live invariant: the certify observer never fired for + // the leader-owned proposal we built ourselves. + let certified = certify_calls.lock(); assert!( - !certified_received, - "Certified message should NOT have been sent - certification should be cancelled" + !certified.contains(&target_view), + "voter must not certify its own leader-built proposal (observed: {certified:?})" ); }); } #[test_traced] - fn test_certification_cancelled_on_finalization() { - certification_cancelled_on_finalization::<_, _, Random>( - bls12381_threshold_vrf::fixture::, - ); - certification_cancelled_on_finalization::<_, _, Random>( - bls12381_threshold_vrf::fixture::, - ); - certification_cancelled_on_finalization::<_, _, RoundRobin>( - bls12381_multisig::fixture::, - ); - certification_cancelled_on_finalization::<_, _, RoundRobin>( - bls12381_multisig::fixture::, - ); - certification_cancelled_on_finalization::<_, _, RoundRobin>(ed25519::fixture); - certification_cancelled_on_finalization::<_, _, RoundRobin>(secp256r1::fixture); + fn test_no_self_certify_when_proposing() { + no_self_certify_when_proposing(bls12381_threshold_vrf::fixture::); + no_self_certify_when_proposing(bls12381_threshold_vrf::fixture::); + no_self_certify_when_proposing(bls12381_multisig::fixture::); + no_self_certify_when_proposing(bls12381_multisig::fixture::); + no_self_certify_when_proposing(ed25519::fixture); + no_self_certify_when_proposing(secp256r1::fixture); } - /// Test that in-flight certification is still reported to resolver after nullification. 
+ /// Restart analogue of `no_self_certify_when_proposing`: after the voter has + /// proposed and journaled a local notarize as leader, restarting must + /// recover that local proposal evidence and continue to bypass automaton + /// certification once the corresponding notarization is resolved. /// - /// 1. Use a long certify latency so certification remains in-flight. - /// 2. Send notarization to trigger certification. - /// 3. Send nullification for the same view before certification completes. - /// 4. Verify that a Certified message is still sent to resolver when certification completes. - fn certification_still_reports_to_resolver_after_nullification(mut fixture: F) + /// The replayed local notarize is what distinguishes this case from merely + /// observing a leader-owned proposal certificate during catch-up. + fn no_self_certify_after_restart(mut fixture: F) where S: Scheme, F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, - L: ElectorConfig, { let n = 5; let quorum = quorum(n); - let namespace = b"consensus".to_vec(); - let executor = deterministic::Runner::timed(Duration::from_secs(30)); + let namespace = b"no_self_certify_after_restart".to_vec(); + let partition = "no_self_certify_after_restart".to_string(); + let executor = deterministic::Runner::timed(Duration::from_secs(20)); executor.start(|mut context| async move { + // Set up the simulated network. let Fixture { participants, schemes, .. } = fixture(&mut context, &namespace, n); - - // Create simulated network let oracle = start_test_network_with_peers(context.clone(), participants.clone(), true).await; + // RoundRobin with epoch=333, n=5: view 2 -> leader=Participant::new(0) = us. 
+ let target_view = View::new(2); let me = participants[0].clone(); - let elector = L::default(); - let reporter_config = mocks::reporter::Config { + let elector = RoundRobin::::default(); + let reporter_cfg = mocks::reporter::Config { participants: participants.clone().try_into().unwrap(), scheme: schemes[0].clone(), elector: elector.clone(), }; let reporter = - mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_config); + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); let relay = Arc::new(mocks::relay::Relay::new()); - let application_cfg = mocks::application::Config { + // Pre-restart: plain application (no observers) so the voter can + // cleanly propose and journal its own notarize vote for view 2. + let app_cfg = mocks::application::Config { hasher: Sha256::default(), relay: relay.clone(), me: me.clone(), propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), - certify_latency: (2_000.0, 0.0), // 2 seconds + certify_latency: (1.0, 0.0), should_certify: mocks::application::Certifier::Always, }; - let (actor, application) = mocks::application::Application::new( - context.with_label("application"), - application_cfg, - ); - actor.start(); + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + app_actor.start(); - let cfg = Config { + // Build and start the pre-restart voter. 
+ let voter_cfg = Config { scheme: schemes[0].clone(), - elector, + elector: elector.clone(), blocker: oracle.control(me.clone()), automaton: application.clone(), relay: application.clone(), reporter: reporter.clone(), - partition: "cert_after_nullification_test".to_string(), + partition: partition.clone(), epoch: Epoch::new(333), mailbox_size: 128, leader_timeout: Duration::from_secs(5), @@ -4695,148 +5003,314 @@ mod tests { write_buffer: NZUsize!(1024 * 1024), page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), }; - let (actor, mut mailbox) = Actor::new(context.clone(), cfg); - - let (resolver_sender, mut resolver_receiver) = mpsc::channel(10); - let resolver = resolver::Mailbox::new(resolver_sender); - - let (batcher_sender, mut batcher_receiver) = mpsc::channel(1024); - let batcher = batcher::Mailbox::new(batcher_sender); - - let (vote_sender, _vote_receiver) = oracle + let (voter, mut mailbox) = Actor::new(context.with_label("voter"), voter_cfg); + let (resolver_sender, _) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle .control(me.clone()) .register(0, TEST_QUOTA) .await .unwrap(); - let (certificate_sender, _certificate_receiver) = oracle + let (cert_sender, _) = oracle .control(me.clone()) .register(1, TEST_QUOTA) .await .unwrap(); + let handle = voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); - actor.start(batcher, resolver, vote_sender, certificate_sender); - - // Wait for initial batcher notification - if let batcher::Message::Update { response, .. } = - batcher_receiver.recv().await.unwrap() - { - response.send(None).unwrap(); + // Wait for startup, then advance to the leader-owned view. + loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Update { response, .. 
} => { + response.send(None).unwrap(); + break; + } + batcher::Message::Constructed(_) => {} + } } + advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; - // Send a notarization for view 5 to trigger certification - let view5 = View::new(5); - let digest5 = Sha256::hash(b"payload_to_certify"); - let proposal5 = - Proposal::new(Round::new(Epoch::new(333), view5), View::new(0), digest5); - - // Broadcast payload - let contents = (proposal5.round, Sha256::hash(b"genesis"), 42u64).encode(); - relay.broadcast(&me, (digest5, contents)); + // Wait for the voter to emit and journal its own notarize for the + // leader-owned view. The captured proposal is reused post-restart + // to drive certification. + let proposal = loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Constructed(Vote::Notarize(notarize)) + if notarize.view() == target_view => + { + break notarize.proposal; + } + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + batcher::Message::Constructed(_) => {} + } + }; - // Send proposal and notarization - mailbox.proposal(proposal5.clone()).await; - let (_, notarization) = build_notarization(&schemes, &proposal5, quorum); - mailbox - .recovered(Certificate::Notarization(notarization)) - .await; + // Restart: abort the voter and construct a fresh application with a + // certify observer to catch any spurious certify call for the + // leader-owned view post-replay. 
+ handle.abort(); + let certify_calls: Arc>> = Arc::new(Mutex::new(Vec::new())); + let certify_tracker = certify_calls.clone(); + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Custom(Box::new( + move |round, _| { + certify_tracker.lock().push(round.view()); + true + }, + )), + }; + let (app_actor, application) = mocks::application::Application::new( + context.with_label("app_restarted"), + app_cfg, + ); + app_actor.start(); - // Wait for certification to start (it will be slow due to latency) - context.sleep(Duration::from_millis(100)).await; + // Build and start the post-restart voter against the same journal partition. + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter, + partition, + epoch: Epoch::new(333), + mailbox_size: 128, + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = + Actor::new(context.with_label("voter_restarted"), voter_cfg); + let (resolver_sender, _) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(2, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(3, TEST_QUOTA) + .await + .unwrap(); + voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); - // Send nullification for the same 
view before certification completes - let (_, nullification) = - build_nullification(&schemes, Round::new(Epoch::new(333), view5), quorum); + // Wait for replay to complete; confirm we re-entered the leader-owned view. + loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Update { + current, + leader, + response, + .. + } => { + response.send(None).unwrap(); + assert_eq!(current, target_view); + assert_eq!(leader, Participant::new(0)); + break; + } + batcher::Message::Constructed(_) => {} + } + } + + // Resolve the matching notarization to drive certification on the + // restarted voter. + let (_, notarization) = build_notarization(&schemes, &proposal, quorum); mailbox - .recovered(Certificate::Nullification(nullification)) + .resolved(Certificate::Notarization(notarization)) .await; - // Even after nullification, late certification should still be forwarded to resolver. - let reported = loop { - select! { - msg = resolver_receiver.recv() => match msg.unwrap() { - MailboxMessage::Certified { view, success } if view == view5 => - break Some(success), - MailboxMessage::Certified { .. } | MailboxMessage::Certificate(_) => {} - }, - msg = batcher_receiver.recv() => { - if let batcher::Message::Update { response, .. } = msg.unwrap() { - response.send(None).unwrap(); - } - }, - _ = context.sleep(Duration::from_secs(6)) => { - break None; - }, + // A finalize for the leader-owned view proves the voter recovered + // the local certification shortcut after replay. + loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Constructed(Vote::Finalize(finalize)) + if finalize.view() == target_view => + { + assert_eq!(finalize.proposal, proposal); + break; + } + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == target_view => + { + panic!( + "leader-owned recovered proposal should certify locally instead of nullifying view {target_view}" + ); + } + batcher::Message::Update { response, .. 
} => response.send(None).unwrap(), + batcher::Message::Constructed(_) => {} } - }; + } - assert_eq!( - reported, - Some(true), - "expected resolver to receive successful certification after nullification" + // Assert the restart invariant: certify did not fire for the + // leader-owned view whose journaled local notarize replay restored + // the local proposal evidence. + let certified = certify_calls.lock(); + assert!( + !certified.contains(&target_view), + "voter must not certify its own leader-built proposal after restart (observed: {certified:?})" ); }); } #[test_traced] - fn test_certification_still_reports_to_resolver_after_nullification() { - certification_still_reports_to_resolver_after_nullification::<_, _, Random>( - bls12381_threshold_vrf::fixture::, - ); - certification_still_reports_to_resolver_after_nullification::<_, _, Random>( - bls12381_threshold_vrf::fixture::, - ); - certification_still_reports_to_resolver_after_nullification::<_, _, RoundRobin>( - bls12381_multisig::fixture::, - ); - certification_still_reports_to_resolver_after_nullification::<_, _, RoundRobin>( - bls12381_multisig::fixture::, - ); - certification_still_reports_to_resolver_after_nullification::<_, _, RoundRobin>( - ed25519::fixture, - ); - certification_still_reports_to_resolver_after_nullification::<_, _, RoundRobin>( - secp256r1::fixture, - ); + fn test_no_self_certify_after_restart() { + no_self_certify_after_restart(bls12381_threshold_vrf::fixture::); + no_self_certify_after_restart(bls12381_threshold_vrf::fixture::); + no_self_certify_after_restart(bls12381_multisig::fixture::); + no_self_certify_after_restart(bls12381_multisig::fixture::); + no_self_certify_after_restart(ed25519::fixture); + no_self_certify_after_restart(secp256r1::fixture); } - /// Regression: a notarization arriving after nullification for the same view - /// should still trigger certification. 
- fn late_notarization_after_nullification_still_certifies(mut fixture: F) + /// Regression: when an elected leader receives an external notarization + /// for a proposal it did *not* build locally, it must invoke + /// `automaton.certify` before finalizing the view. The + /// `is_local=true` shortcut in `actor::run` must only short-circuit when + /// the slot carries explicit local proposal evidence; an + /// externally-recovered proposal on a leader-owned view produces + /// `is_local=false`, which requires consulting the automaton. + fn certify_observer_fires_for_external_leader_proposal(mut fixture: F) where S: Scheme, F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, { let n = 5; let quorum = quorum(n); - let namespace = b"late_notarization_after_nullification".to_vec(); - let executor = deterministic::Runner::timed(Duration::from_secs(30)); + let namespace = b"certify_observer_fires_for_external_leader_proposal".to_vec(); + let partition = "certify_observer_fires_for_external_leader_proposal".to_string(); + let executor = deterministic::Runner::timed(Duration::from_secs(20)); executor.start(|mut context| async move { - // Create simulated network. - // Build participants and voter. + // Set up the simulated network. let Fixture { participants, schemes, .. } = fixture(&mut context, &namespace, n); - - // Create simulated network let oracle = start_test_network_with_peers(context.clone(), participants.clone(), true).await; - let (mut mailbox, mut batcher_receiver, mut resolver_receiver, _, _) = setup_voter( - &mut context, - &oracle, - &participants, - &schemes, - RoundRobin::::default(), - Duration::from_secs(5), - Duration::from_secs(5), - Duration::from_secs(5), - mocks::application::Certifier::Always, - ) - .await; - // Move into a concrete current view. - let target_view = View::new(3); + // RoundRobin with epoch=333, n=5: view 2 -> leader=Participant::new(0) = us. 
+ let target_view = View::new(2); + let target_epoch = Epoch::new(333); + let me = participants[0].clone(); + let elector = RoundRobin::::default(); + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + let relay = Arc::new(mocks::relay::Relay::new()); + + // Stall the propose response so the slot is never populated with + // a locally-built proposal. The slot stays empty (proposal=None, + // status=None) while the voter's internal flag `requested_build` + // is true, exactly the state in which an externally-recovered + // proposal lands with `is_local=false` at the leader. + // + // The certify observer records every `automaton.certify` call so + // the final assertion can confirm the `is_local=false` code path + // ran instead of being short-circuited. + let certify_calls: Arc>> = Arc::new(Mutex::new(Vec::new())); + let certify_tracker = certify_calls.clone(); + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Custom(Box::new(move |round, _| { + certify_tracker.lock().push(round.view()); + true + })), + }; + let (mut app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + app_actor.set_stall_proposals(true); + app_actor.start(); + + // Build and start the voter. Use long `leader_timeout` so the + // stalled proposal does not trigger a nullify before the + // conflicting notarization reaches the voter. 
+ let voter_cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter, + partition, + epoch: target_epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(600), + certification_timeout: Duration::from_secs(600), + timeout_retry: Duration::from_secs(600), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = Actor::new(context.with_label("voter"), voter_cfg); + let (resolver_sender, _) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + // Wait for startup, then advance into the leader-owned view. + loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Update { response, .. } => { + response.send(None).unwrap(); + break; + } + batcher::Message::Constructed(_) => {} + } + } advance_to_view( &mut mailbox, &mut batcher_receiver, @@ -4846,80 +5320,1130 @@ mod tests { ) .await; - // Nullify current view first. - let (_, nullification) = - build_nullification(&schemes, Round::new(Epoch::new(333), target_view), quorum); + // Craft a proposal the voter could not have built locally + // (distinct payload) and build its notarization from all validator + // schemes. The notarization is well-formed; quorum-worth of signers + // cover the proposal so it will pass `add_notarization`. 
+ let foreign_payload = Sha256::hash(b"foreign_leader_owned_proposal"); + let foreign_proposal = Proposal::new( + Round::new(target_epoch, target_view), + target_view.previous().unwrap_or(View::zero()), + foreign_payload, + ); + let (_, foreign_notarization) = build_notarization(&schemes, &foreign_proposal, quorum); + + // Deliver the foreign notarization. This seeds the voter's slot + // with a proposal it never built, producing `is_local=false` on + // the certification candidate. mailbox - .resolved(Certificate::Nullification(nullification)) + .resolved(Certificate::Notarization(foreign_notarization)) .await; - // Then provide notarization for that same view. - let proposal = Proposal::new( - Round::new(Epoch::new(333), target_view), + // Wait for a `Finalize` on the leader-owned view. Observing + // finalize proves the certify callback both fired and resolved + // successfully. Any `Nullify` here would mean the voter never + // reached the certification branch (for example because + // `is_local=true` incorrectly short-circuited it). + loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Constructed(Vote::Finalize(finalize)) + if finalize.view() == target_view => + { + assert_eq!(finalize.proposal, foreign_proposal); + break; + } + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == target_view => + { + panic!( + "leader-owned view with an externally-recovered proposal \ + must certify via the automaton instead of nullifying \ + view {target_view}" + ); + } + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + batcher::Message::Constructed(_) => {} + } + } + + // Assert the `is_local=false` invariant: the certify callback + // fired for the leader-owned view. Without the fix under test, + // a `leader == me`-only shortcut would skip the call and this + // assertion would fail. 
+ let certified = certify_calls.lock(); + assert!( + certified.contains(&target_view), + "voter must invoke automaton.certify for an externally-recovered \ + leader-owned proposal (observed: {certified:?})" + ); + }); + } + + #[test_traced] + fn test_certify_observer_fires_for_external_leader_proposal() { + certify_observer_fires_for_external_leader_proposal( + bls12381_threshold_vrf::fixture::, + ); + certify_observer_fires_for_external_leader_proposal( + bls12381_threshold_vrf::fixture::, + ); + certify_observer_fires_for_external_leader_proposal(bls12381_multisig::fixture::); + certify_observer_fires_for_external_leader_proposal( + bls12381_multisig::fixture::, + ); + certify_observer_fires_for_external_leader_proposal(ed25519::fixture); + certify_observer_fires_for_external_leader_proposal(secp256r1::fixture); + } + + /// Test that in-flight certification requests are cancelled when finalization occurs. + /// + /// 1. Use a very long certify latency to ensure certification is in-flight. + /// 2. Send a notarization to trigger certification. + /// 3. Send a finalization for the same view before certification completes. + /// 4. Verify that no Certified message is sent to the resolver. + fn certification_cancelled_on_finalization(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + L: ElectorConfig, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"consensus".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(30)); + executor.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = fixture(&mut context, &namespace, n); + + // Create simulated network + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + + let me = participants[0].clone(); + let elector = L::default(); + let reporter_config = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_config); + let relay = Arc::new(mocks::relay::Relay::new()); + + let application_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (2_000.0, 0.0), // 2 seconds + should_certify: mocks::application::Certifier::Always, + }; + let (actor, application) = mocks::application::Application::new( + context.with_label("application"), + application_cfg, + ); + actor.start(); + + let cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter: reporter.clone(), + partition: "cert_cancel_test".to_string(), + epoch: Epoch::new(333), + mailbox_size: 128, + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (actor, mut mailbox) = Actor::new(context.clone(), cfg); + + let (resolver_sender, mut resolver_receiver) = mpsc::channel(10); + let resolver = resolver::Mailbox::new(resolver_sender); + + let (batcher_sender, mut batcher_receiver) = mpsc::channel(1024); + let batcher = batcher::Mailbox::new(batcher_sender); + + let (vote_sender, _vote_receiver) = oracle + 
.control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let (certificate_sender, _certificate_receiver) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + + actor.start(batcher, resolver, vote_sender, certificate_sender); + + // Wait for initial batcher notification + if let batcher::Message::Update { response, .. } = + batcher_receiver.recv().await.unwrap() + { + response.send(None).unwrap(); + } + + // Send a notarization for view 5 to trigger certification + let view5 = View::new(5); + let digest5 = Sha256::hash(b"payload_to_certify"); + let proposal5 = + Proposal::new(Round::new(Epoch::new(333), view5), View::new(0), digest5); + + // Broadcast payload + let contents = (proposal5.round, Sha256::hash(b"genesis"), 42u64).encode(); + relay.broadcast(&me, (digest5, contents)); + + // Send proposal to verify + mailbox.proposal(proposal5.clone()).await; + + // Send notarization + let (_, notarization) = build_notarization(&schemes, &proposal5, quorum); + mailbox + .recovered(Certificate::Notarization(notarization)) + .await; + + // Wait for certification to start (it will be slow due to latency) + context.sleep(Duration::from_millis(100)).await; + + // Send finalization for view 5 before certification completes + let (_, finalization) = build_finalization(&schemes, &proposal5, quorum); + mailbox + .recovered(Certificate::Finalization(finalization)) + .await; + + // Wait for finalization to be processed + loop { + if let batcher::Message::Update { + finalized, + response, + .. 
+ } = batcher_receiver.recv().await.unwrap() + { + response.send(None).unwrap(); + if finalized >= view5 { + break; + } + } + } + + // Wait for resolver finalization message (skip other certificates) + loop { + let msg = resolver_receiver + .recv() + .await + .expect("expected resolver msg"); + match msg { + MailboxMessage::Certificate(Certificate::Finalization(f)) => { + assert_eq!(f.view(), view5); + break; + } + MailboxMessage::Certificate(_) => continue, + MailboxMessage::Certified { .. } => { + panic!("unexpected Certified message before finalization processed") + } + } + } + + // Wait longer than certify_latency (2s) to verify certification was cancelled. + // If certification wasn't cancelled, it would complete and send a Certified message. + let certified_received = select! { + msg = resolver_receiver.recv() => { + matches!(msg, Some(MailboxMessage::Certified { .. })) + }, + _ = context.sleep(Duration::from_secs(4)) => false, + }; + + assert!( + !certified_received, + "Certified message should NOT have been sent - certification should be cancelled" + ); + }); + } + + #[test_traced] + fn test_certification_cancelled_on_finalization() { + certification_cancelled_on_finalization::<_, _, Random>( + bls12381_threshold_vrf::fixture::, + ); + certification_cancelled_on_finalization::<_, _, Random>( + bls12381_threshold_vrf::fixture::, + ); + certification_cancelled_on_finalization::<_, _, RoundRobin>( + bls12381_multisig::fixture::, + ); + certification_cancelled_on_finalization::<_, _, RoundRobin>( + bls12381_multisig::fixture::, + ); + certification_cancelled_on_finalization::<_, _, RoundRobin>(ed25519::fixture); + certification_cancelled_on_finalization::<_, _, RoundRobin>(secp256r1::fixture); + } + + /// Test that in-flight certification is still reported to resolver after nullification. + /// + /// 1. Use a long certify latency so certification remains in-flight. + /// 2. Send notarization to trigger certification. + /// 3. 
Send nullification for the same view before certification completes. + /// 4. Verify that a Certified message is still sent to resolver when certification completes. + fn certification_still_reports_to_resolver_after_nullification(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + L: ElectorConfig, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"consensus".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(30)); + executor.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = fixture(&mut context, &namespace, n); + + // Create simulated network + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + + let me = participants[0].clone(); + let elector = L::default(); + let reporter_config = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_config); + let relay = Arc::new(mocks::relay::Relay::new()); + + let application_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (2_000.0, 0.0), // 2 seconds + should_certify: mocks::application::Certifier::Always, + }; + let (actor, application) = mocks::application::Application::new( + context.with_label("application"), + application_cfg, + ); + actor.start(); + + let cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter: reporter.clone(), + partition: "cert_after_nullification_test".to_string(), + epoch: Epoch::new(333), + mailbox_size: 128, + leader_timeout: Duration::from_secs(5), + certification_timeout: 
Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (actor, mut mailbox) = Actor::new(context.clone(), cfg); + + let (resolver_sender, mut resolver_receiver) = mpsc::channel(10); + let resolver = resolver::Mailbox::new(resolver_sender); + + let (batcher_sender, mut batcher_receiver) = mpsc::channel(1024); + let batcher = batcher::Mailbox::new(batcher_sender); + + let (vote_sender, _vote_receiver) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let (certificate_sender, _certificate_receiver) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + + actor.start(batcher, resolver, vote_sender, certificate_sender); + + // Wait for initial batcher notification + if let batcher::Message::Update { response, .. } = + batcher_receiver.recv().await.unwrap() + { + response.send(None).unwrap(); + } + + // Send a notarization for view 5 to trigger certification + let view5 = View::new(5); + let digest5 = Sha256::hash(b"payload_to_certify"); + let proposal5 = + Proposal::new(Round::new(Epoch::new(333), view5), View::new(0), digest5); + + // Broadcast payload + let contents = (proposal5.round, Sha256::hash(b"genesis"), 42u64).encode(); + relay.broadcast(&me, (digest5, contents)); + + // Send proposal and notarization + mailbox.proposal(proposal5.clone()).await; + let (_, notarization) = build_notarization(&schemes, &proposal5, quorum); + mailbox + .recovered(Certificate::Notarization(notarization)) + .await; + + // Wait for certification to start (it will be slow due to latency) + context.sleep(Duration::from_millis(100)).await; + + // Send nullification for the same view before certification completes + let (_, nullification) = + build_nullification(&schemes, Round::new(Epoch::new(333), view5), quorum); 
+ mailbox + .recovered(Certificate::Nullification(nullification)) + .await; + + // Even after nullification, late certification should still be forwarded to resolver. + let reported = loop { + select! { + msg = resolver_receiver.recv() => match msg.unwrap() { + MailboxMessage::Certified { view, success } if view == view5 => + break Some(success), + MailboxMessage::Certified { .. } | MailboxMessage::Certificate(_) => {} + }, + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { response, .. } = msg.unwrap() { + response.send(None).unwrap(); + } + }, + _ = context.sleep(Duration::from_secs(6)) => { + break None; + }, + } + }; + + assert_eq!( + reported, + Some(true), + "expected resolver to receive successful certification after nullification" + ); + }); + } + + #[test_traced] + fn test_certification_still_reports_to_resolver_after_nullification() { + certification_still_reports_to_resolver_after_nullification::<_, _, Random>( + bls12381_threshold_vrf::fixture::, + ); + certification_still_reports_to_resolver_after_nullification::<_, _, Random>( + bls12381_threshold_vrf::fixture::, + ); + certification_still_reports_to_resolver_after_nullification::<_, _, RoundRobin>( + bls12381_multisig::fixture::, + ); + certification_still_reports_to_resolver_after_nullification::<_, _, RoundRobin>( + bls12381_multisig::fixture::, + ); + certification_still_reports_to_resolver_after_nullification::<_, _, RoundRobin>( + ed25519::fixture, + ); + certification_still_reports_to_resolver_after_nullification::<_, _, RoundRobin>( + secp256r1::fixture, + ); + } + + /// Regression: a notarization arriving after nullification for the same view + /// should still trigger certification. 
+ fn late_notarization_after_nullification_still_certifies(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"late_notarization_after_nullification".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(30)); + executor.start(|mut context| async move { + // Create simulated network. + // Build participants and voter. + let Fixture { + participants, + schemes, + .. + } = fixture(&mut context, &namespace, n); + + // Create simulated network + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + let (mut mailbox, mut batcher_receiver, mut resolver_receiver, _, _) = setup_voter( + &mut context, + &oracle, + &participants, + &schemes, + RoundRobin::::default(), + Duration::from_secs(5), + Duration::from_secs(5), + Duration::from_secs(5), + ) + .await; + + // Move into a concrete current view. + let target_view = View::new(3); + advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; + + // Nullify current view first. + let (_, nullification) = + build_nullification(&schemes, Round::new(Epoch::new(333), target_view), quorum); + mailbox + .resolved(Certificate::Nullification(nullification)) + .await; + + // Then provide notarization for that same view. + let proposal = Proposal::new( + Round::new(Epoch::new(333), target_view), + target_view.previous().unwrap(), + Sha256::hash(b"late_notarization_after_nullification"), + ); + let (_, notarization) = build_notarization(&schemes, &proposal, quorum); + mailbox + .resolved(Certificate::Notarization(notarization)) + .await; + + let certified = loop { + select! { + msg = resolver_receiver.recv() => match msg.unwrap() { + MailboxMessage::Certified { view, success } if view == target_view => { + break Some(success); + } + MailboxMessage::Certified { .. 
} | MailboxMessage::Certificate(_) => {} + }, + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { response, .. } = msg.unwrap() { + response.send(None).unwrap(); + } + }, + _ = context.sleep(Duration::from_secs(6)) => break None, + } + }; + + assert_eq!( + certified, + Some(true), + "expected notarization after nullification to still trigger certification" + ); + }); + } + + #[test_traced] + fn test_late_notarization_after_nullification_still_certifies() { + late_notarization_after_nullification_still_certifies::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + late_notarization_after_nullification_still_certifies::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + late_notarization_after_nullification_still_certifies::<_, _>( + bls12381_multisig::fixture::, + ); + late_notarization_after_nullification_still_certifies::<_, _>( + bls12381_multisig::fixture::, + ); + late_notarization_after_nullification_still_certifies::<_, _>(ed25519::fixture); + late_notarization_after_nullification_still_certifies::<_, _>(secp256r1::fixture); + } + + /// Tests certification after: timeout -> receive notarization -> certify. + /// This test does NOT send a notarize vote first (we timeout before receiving a proposal). + fn certification_after_timeout(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"certification_after_timeout".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(60)); + executor.start(|mut context| async move { + // Get participants + let Fixture { + participants, + schemes, + .. 
+ } = fixture(&mut context, &namespace, n); + + // Create simulated network + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + + // Setup application mock and voter + let elector = RoundRobin::::default(); + let built_elector: RoundRobinElector = elector + .clone() + .build(&participants.clone().try_into().unwrap()); + let (mut mailbox, mut batcher_receiver, _, _, _) = setup_voter( + &mut context, + &oracle, + &participants, + &schemes, + elector, + Duration::from_secs(10), + Duration::from_secs(10), + Duration::from_secs(100), + ) + .await; + + // Advance to view 3 where we're a follower. + // With RoundRobin, epoch=333, n=5: leader = (333 + view) % 5 + // View 3: leader = 1 (not us) + let target_view = View::new(3); + advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; + assert_ne!( + built_elector.elect(Round::new(Epoch::new(333), target_view), None), + Participant::new(0), + "we should not be leader at view 3" + ); + + // Wait for timeout (nullify vote) WITHOUT sending notarize first + loop { + select! { + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Nullify(n)) + if n.view() == target_view => + break, + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + _ => {} + }, + _ = context.sleep(Duration::from_secs(15)) => { + panic!("expected nullify vote"); + }, + } + } + + // Send notarization certificate (simulating delayed network delivery) + let proposal = Proposal::new( + Round::new(Epoch::new(333), target_view), target_view.previous().unwrap(), - Sha256::hash(b"late_notarization_after_nullification"), + Sha256::hash(b"timeout_test"), + ); + let (_, notarization) = build_notarization(&schemes, &proposal, quorum); + mailbox + .recovered(Certificate::Notarization(notarization)) + .await; + + // Verify view advances + let advanced = loop { + select! 
{ + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { + current, response, .. + } = msg.unwrap() + { + response.send(None).unwrap(); + if current > target_view { + break true; + } + } + }, + _ = context.sleep(Duration::from_secs(5)) => { + break false; + }, + } + }; + assert!( + advanced, + "view should advance after certification (timeout case)" + ); + }); + } + + #[test_traced] + fn test_certification_after_timeout() { + certification_after_timeout::<_, _>(bls12381_threshold_vrf::fixture::); + certification_after_timeout::<_, _>(bls12381_threshold_vrf::fixture::); + certification_after_timeout::<_, _>(bls12381_multisig::fixture::); + certification_after_timeout::<_, _>(bls12381_multisig::fixture::); + certification_after_timeout::<_, _>(ed25519::fixture); + certification_after_timeout::<_, _>(secp256r1::fixture); + } + + /// Tests certification after: notarize -> timeout -> receive notarization -> certify. + /// This test runs when we are NOT the leader (receiving proposal from another participant). + fn certification_after_notarize_timeout_as_follower(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"certification_after_notarize_timeout_as_follower".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(60)); + executor.start(|mut context| async move { + // Get participants + let Fixture { + participants, + schemes, + .. 
+ } = fixture(&mut context, &namespace, n); + + // Create simulated network + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + + // Setup application mock and voter + let elector = RoundRobin::::default(); + let built_elector: RoundRobinElector = elector + .clone() + .build(&participants.clone().try_into().unwrap()); + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( + &mut context, + &oracle, + &participants, + &schemes, + elector, + Duration::from_secs(10), + Duration::from_secs(10), + Duration::from_secs(100), + ) + .await; + + // Advance to view 3 where we're a follower. + // With RoundRobin, epoch=333, n=5: leader = (333 + view) % 5 + // View 3: leader = 1 (not us) + let target_view = View::new(3); + let parent_payload = advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; + assert_ne!( + built_elector.elect(Round::new(Epoch::new(333), target_view), None), + Participant::new(0), + "we should not be leader at view 3" + ); + + // Create and send proposal as if from the leader (participant 1) + let proposal = Proposal::new( + Round::new(Epoch::new(333), target_view), + target_view.previous().unwrap(), + Sha256::hash(b"follower_test"), + ); + let leader = participants[1].clone(); + let contents = (proposal.round, parent_payload, 0u64).encode(); + relay.broadcast(&leader, (proposal.payload, contents)); + mailbox.proposal(proposal.clone()).await; + + // Wait for notarize vote + loop { + select! { + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Notarize(n)) + if n.view() == target_view => + break, + batcher::Message::Update { response, .. 
} => response.send(None).unwrap(), + _ => {} + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("expected notarize vote"); + }, + } + } + + // Trigger timeout + context.sleep(Duration::from_secs(11)).await; + + // Wait for nullify vote + loop { + select! { + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Nullify(n)) + if n.view() == target_view => + break, + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + _ => {} + }, + _ = context.sleep(Duration::from_secs(1)) => { + panic!("expected nullify vote"); + }, + } + } + + // Send notarization certificate + let (_, notarization) = build_notarization(&schemes, &proposal, quorum); + mailbox + .recovered(Certificate::Notarization(notarization)) + .await; + + // Verify view advances + let advanced = loop { + select! { + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { + current, response, .. + } = msg.unwrap() + { + response.send(None).unwrap(); + if current > target_view { + break true; + } + } + }, + _ = context.sleep(Duration::from_secs(5)) => { + break false; + }, + } + }; + assert!( + advanced, + "view should advance after certification (follower case)" + ); + }); + } + + #[test_traced] + fn test_certification_after_notarize_timeout_as_follower() { + certification_after_notarize_timeout_as_follower::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + certification_after_notarize_timeout_as_follower::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + certification_after_notarize_timeout_as_follower::<_, _>( + bls12381_multisig::fixture::, + ); + certification_after_notarize_timeout_as_follower::<_, _>( + bls12381_multisig::fixture::, + ); + certification_after_notarize_timeout_as_follower::<_, _>(ed25519::fixture); + certification_after_notarize_timeout_as_follower::<_, _>(secp256r1::fixture); + } + + /// Tests certification after: notarize -> timeout -> receive notarization -> certify. 
+ /// This test runs when we ARE the leader (proposing ourselves). + fn certification_after_notarize_timeout_as_leader(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"certification_after_notarize_timeout_as_leader".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(60)); + executor.start(|mut context| async move { + // Get participants + let Fixture { + participants, + schemes, + .. + } = fixture(&mut context, &namespace, n); + + // Create simulated network + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + + // Setup application mock and voter + let elector = RoundRobin::::default(); + let built_elector: RoundRobinElector = elector + .clone() + .build(&participants.clone().try_into().unwrap()); + let (mut mailbox, mut batcher_receiver, _, _, _) = setup_voter( + &mut context, + &oracle, + &participants, + &schemes, + elector, + Duration::from_secs(10), + Duration::from_secs(10), + Duration::from_secs(100), + ) + .await; + + // Advance to view 2 where we ARE the leader. + // With RoundRobin, epoch=333, n=5: leader = (333 + view) % 5 + // View 2: leader = 0 (us) + let target_view = View::new(2); + advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; + assert_eq!( + built_elector.elect(Round::new(Epoch::new(333), target_view), None), + Participant::new(0), + "we should be leader at view 2" ); + + // As leader, wait for our own notarize vote (automaton will propose) + let proposal = loop { + select! { + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Notarize(n)) + if n.view() == target_view => + { + break n.proposal.clone(); + } + batcher::Message::Update { response, .. 
} => response.send(None).unwrap(), + _ => {} + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("expected notarize vote as leader"); + }, + } + }; + + // Trigger timeout + context.sleep(Duration::from_secs(11)).await; + + // Wait for nullify vote + loop { + select! { + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Nullify(n)) + if n.view() == target_view => + break, + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + _ => {} + }, + _ = context.sleep(Duration::from_secs(1)) => { + panic!("expected nullify vote"); + }, + } + } + + // Send notarization certificate (as if other participants formed it) let (_, notarization) = build_notarization(&schemes, &proposal, quorum); mailbox - .resolved(Certificate::Notarization(notarization)) + .recovered(Certificate::Notarization(notarization)) .await; - let certified = loop { + // Verify view advances + let advanced = loop { select! { - msg = resolver_receiver.recv() => match msg.unwrap() { - MailboxMessage::Certified { view, success } if view == target_view => { - break Some(success); + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { + current, response, .. + } = msg.unwrap() + { + response.send(None).unwrap(); + if current > target_view { + break true; + } } - MailboxMessage::Certified { .. 
} | MailboxMessage::Certificate(_) => {} }, + _ = context.sleep(Duration::from_secs(5)) => { + break false; + }, + } + }; + assert!( + advanced, + "view should advance after certification (leader case)" + ); + }); + } + + #[test_traced] + fn test_certification_after_notarize_timeout_as_leader() { + certification_after_notarize_timeout_as_leader::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + certification_after_notarize_timeout_as_leader::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + certification_after_notarize_timeout_as_leader::<_, _>( + bls12381_multisig::fixture::, + ); + certification_after_notarize_timeout_as_leader::<_, _>( + bls12381_multisig::fixture::, + ); + certification_after_notarize_timeout_as_leader::<_, _>(ed25519::fixture); + certification_after_notarize_timeout_as_leader::<_, _>(secp256r1::fixture); + } + + /// Tests that when certification returns a cancelled receiver, the voter doesn't hang + /// and continues to make progress (via voting to nullify the view that could not be certified). + fn cancelled_certification_does_not_hang(mut fixture: F, traces: TraceStorage) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"consensus".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(10)); + executor.start(|mut context| async move { + // Get participants + let Fixture { + participants, + schemes, + .. 
+ } = fixture(&mut context, &namespace, n); + + // Create simulated network + let oracle = start_test_network_with_peers( + context.clone(), + participants.clone(), + true, + ) + .await; + + let elector = RoundRobin::::default(); + + // Set up voter with Certifier::Cancel + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( + &mut context, + &oracle, + &participants, + &schemes, + elector, + Duration::from_millis(500), + Duration::from_millis(500), + Duration::from_mins(60), + mocks::application::Certifier::Cancel, + ) + .await; + + // Advance to view 3 where we're a follower. + // With RoundRobin, epoch=333, n=5: leader = (333 + view) % 5 + // View 3: leader = 1 (not us) + let target_view = View::new(3); + let parent_payload = advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; + + // Broadcast the payload contents so verification can complete. + let proposal = Proposal::new( + Round::new(Epoch::new(333), target_view), + target_view.previous().unwrap(), + Sha256::hash(b"test_proposal"), + ); + let leader = participants[1].clone(); + let contents = (proposal.round, parent_payload, 0u64).encode(); + relay + .broadcast(&leader, (proposal.payload, contents)); + mailbox.proposal(proposal.clone()).await; + + // Build and send notarization so the voter tries to certify + let (_, notarization) = build_notarization(&schemes, &proposal, quorum); + mailbox + .resolved(Certificate::Notarization(notarization)) + .await; + + // Certification will be cancelled, so the voter should eventually timeout + // and emit a nullify vote. + loop { + select! { msg = batcher_receiver.recv() => { - if let batcher::Message::Update { response, .. } = msg.unwrap() { - response.send(None).unwrap(); + match msg.unwrap() { + batcher::Message::Constructed(Vote::Nullify(nullify)) if nullify.view() == target_view => { + break; + } + batcher::Message::Update { response, .. 
} => response.send(None).unwrap(), + _ => {} } }, - _ = context.sleep(Duration::from_secs(6)) => break None, + _ = context.sleep(Duration::from_secs(5)) => { + panic!( + "voter should emit nullify for view {target_view} despite cancelled certification", + ); + }, } - }; + } - assert_eq!( - certified, - Some(true), - "expected notarization after nullification to still trigger certification" - ); + // Verify the "failed to certify proposal" log was emitted with the correct round + let expected_round = format!("Round {{ epoch: Epoch(333), view: View({target_view}) }}"); + traces + .get_by_level(Level::DEBUG) + .expect_event(|event| { + event.metadata.content == "failed to certify proposal" + && event + .metadata + .fields + .iter() + .any(|(name, value)| name == "err" && value == "RecvError(())") + && event + .metadata + .fields + .iter() + .any(|(name, value)| name == "round" && value == &expected_round) + }) + .unwrap(); }); } - #[test_traced] - fn test_late_notarization_after_nullification_still_certifies() { - late_notarization_after_nullification_still_certifies::<_, _>( + #[test_collect_traces] + fn test_cancelled_certification_does_not_hang(traces: TraceStorage) { + cancelled_certification_does_not_hang( bls12381_threshold_vrf::fixture::, + traces.clone(), ); - late_notarization_after_nullification_still_certifies::<_, _>( + cancelled_certification_does_not_hang( bls12381_threshold_vrf::fixture::, + traces.clone(), ); - late_notarization_after_nullification_still_certifies::<_, _>( + cancelled_certification_does_not_hang( bls12381_multisig::fixture::, + traces.clone(), ); - late_notarization_after_nullification_still_certifies::<_, _>( + cancelled_certification_does_not_hang( bls12381_multisig::fixture::, + traces.clone(), ); - late_notarization_after_nullification_still_certifies::<_, _>(ed25519::fixture); - late_notarization_after_nullification_still_certifies::<_, _>(secp256r1::fixture); + cancelled_certification_does_not_hang(ed25519::fixture, 
traces.clone()); + cancelled_certification_does_not_hang(secp256r1::fixture, traces); } - /// Tests certification after: timeout -> receive notarization -> certify. - /// This test does NOT send a notarize vote first (we timeout before receiving a proposal). - fn certification_after_timeout(mut fixture: F) + /// Regression: a canceled certification attempt must not be persisted as failure. + /// + /// We first trigger a canceled certify receiver, restart the voter, and then require: + /// 1. successful certification for the same view from replayed notarization state, and + /// 2. no immediate timeout/nullify for that view after restart. + fn cancelled_certification_recertifies_after_restart(mut fixture: F) where S: Scheme, F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, { let n = 5; let quorum = quorum(n); - let namespace = b"certification_after_timeout".to_vec(); - let executor = deterministic::Runner::timed(Duration::from_secs(60)); + let namespace = b"cancelled_cert_restart_recertify".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(20)); executor.start(|mut context| async move { - // Get participants let Fixture { participants, schemes, @@ -4927,32 +6451,89 @@ mod tests { } = fixture(&mut context, &namespace, n); // Create simulated network - let oracle = - start_test_network_with_peers(context.clone(), participants.clone(), true).await; - - // Setup application mock and voter - let elector = RoundRobin::::default(); - let built_elector: RoundRobinElector = elector - .clone() - .build(&participants.clone().try_into().unwrap()); - let (mut mailbox, mut batcher_receiver, _, _, _) = setup_voter( - &mut context, - &oracle, - &participants, - &schemes, - elector, - Duration::from_secs(10), - Duration::from_secs(10), - Duration::from_secs(100), - mocks::application::Certifier::Always, + let oracle = start_test_network_with_peers( + context.clone(), + participants.clone(), + true, ) .await; - // Advance to view 3 where we're a 
follower. - // With RoundRobin, epoch=333, n=5: leader = (333 + view) % 5 - // View 3: leader = 1 (not us) + let me = participants[0].clone(); + let elector = RoundRobin::::default(); + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + let relay = Arc::new(mocks::relay::Relay::new()); + + let partition = "cancelled_certification_recertifies_after_restart".to_string(); + let epoch = Epoch::new(333); + + // First run: certification receiver gets cancelled. + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Cancel, + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app_cancel"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector: elector.clone(), + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter: reporter.clone(), + partition: partition.clone(), + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = Actor::new(context.with_label("voter_cancel"), voter_cfg); + + let (resolver_sender, _resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + 
.await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + + let handle = voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + if let batcher::Message::Update { response, .. } = + batcher_receiver.recv().await.unwrap() + { + response.send(None).unwrap(); + } + let target_view = View::new(3); - advance_to_view( + let parent_payload = advance_to_view( &mut mailbox, &mut batcher_receiver, &schemes, @@ -4960,85 +6541,190 @@ mod tests { target_view, ) .await; - assert_ne!( - built_elector.elect(Round::new(Epoch::new(333), target_view), None), - Participant::new(0), - "we should not be leader at view 3" + + let proposal = Proposal::new( + Round::new(epoch, target_view), + target_view.previous().unwrap(), + Sha256::hash(b"restart_recertify_payload"), + ); + let leader = participants[1].clone(); + let contents = (proposal.round, parent_payload, 0u64).encode(); + relay.broadcast(&leader, (proposal.payload, contents)); + mailbox.proposal(proposal.clone()).await; + + let (_, notarization) = build_notarization(&schemes, &proposal, quorum); + mailbox + .resolved(Certificate::Notarization(notarization)) + .await; + + // Give the canceled certification attempt time to run before restart. + context.sleep(Duration::from_millis(200)).await; + + // Sanity check: canceled certification should not have advanced this view yet. + let advanced_before_restart = select! { + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { + current, response, .. + } = msg.unwrap() + { + response.send(None).unwrap(); + current > target_view + } else { + false + } + }, + _ = context.sleep(Duration::from_millis(200)) => false, + }; + assert!( + !advanced_before_restart, + "view should not advance before restart when certification receiver is canceled" + ); + + // Restart voter. 
+ handle.abort(); + + // Second run: certification should succeed from replayed state. + // Use a longer certify latency so there is a real window where an + // incorrect immediate nullify could fire after restart. + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (2_000.0, 0.0), // 2 seconds + should_certify: mocks::application::Certifier::Always, + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter: reporter.clone(), + partition, + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, _mailbox) = Actor::new(context.with_label("voter_restarted"), voter_cfg); + + let (resolver_sender, mut resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(2, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(3, TEST_QUOTA) + .await + .unwrap(); + + voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, ); - // Wait for timeout (nullify vote) WITHOUT sending notarize first - loop { - select! 
{ - msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Nullify(n)) - if n.view() == target_view => - break, - batcher::Message::Update { response, .. } => response.send(None).unwrap(), - _ => {} - }, - _ = context.sleep(Duration::from_secs(15)) => { - panic!("expected nullify vote"); - }, - } + if let batcher::Message::Update { response, .. } = + batcher_receiver.recv().await.unwrap() + { + response.send(None).unwrap(); } - // Send notarization certificate (simulating delayed network delivery) - let proposal = Proposal::new( - Round::new(Epoch::new(333), target_view), - target_view.previous().unwrap(), - Sha256::hash(b"timeout_test"), - ); - let (_, notarization) = build_notarization(&schemes, &proposal, quorum); - mailbox - .recovered(Certificate::Notarization(notarization)) - .await; - - // Verify view advances - let advanced = loop { + loop { select! { + msg = resolver_receiver.recv() => match msg.unwrap() { + MailboxMessage::Certified { view, success } if view == target_view => { + assert!(success, "expected successful certification after restart for canceled certification view"); + break; + } + MailboxMessage::Certified { .. } | MailboxMessage::Certificate(_) => {} + }, msg = batcher_receiver.recv() => { - if let batcher::Message::Update { - current, response, .. - } = msg.unwrap() - { - response.send(None).unwrap(); - if current > target_view { - break true; + match msg.unwrap() { + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == target_view => + { + panic!("unexpected immediate nullify for view {target_view} after restart"); + } + batcher::Message::Update { response, .. 
} => { + response.send(None).unwrap(); } + _ => {} } }, _ = context.sleep(Duration::from_secs(5)) => { - break false; + panic!( + "timed out waiting for successful certification for restarted view {target_view}" + ); }, } }; + + // Give reporter a moment to ingest any late events and ensure no nullify artifacts + // were emitted for the restarted target view. + context.sleep(Duration::from_millis(100)).await; assert!( - advanced, - "view should advance after certification (timeout case)" + !reporter.nullifies.lock().contains_key(&target_view), + "did not expect nullify votes for restarted view {target_view}" + ); + assert!( + !reporter.nullifications.lock().contains_key(&target_view), + "did not expect nullification certificate for restarted view {target_view}" ); }); } #[test_traced] - fn test_certification_after_timeout() { - certification_after_timeout::<_, _>(bls12381_threshold_vrf::fixture::); - certification_after_timeout::<_, _>(bls12381_threshold_vrf::fixture::); - certification_after_timeout::<_, _>(bls12381_multisig::fixture::); - certification_after_timeout::<_, _>(bls12381_multisig::fixture::); - certification_after_timeout::<_, _>(ed25519::fixture); - certification_after_timeout::<_, _>(secp256r1::fixture); + fn test_cancelled_certification_recertifies_after_restart() { + cancelled_certification_recertifies_after_restart::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + cancelled_certification_recertifies_after_restart::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + cancelled_certification_recertifies_after_restart::<_, _>( + bls12381_multisig::fixture::, + ); + cancelled_certification_recertifies_after_restart::<_, _>( + bls12381_multisig::fixture::, + ); + cancelled_certification_recertifies_after_restart::<_, _>(ed25519::fixture); + cancelled_certification_recertifies_after_restart::<_, _>(secp256r1::fixture); } - /// Tests certification after: notarize -> timeout -> receive notarization -> certify. 
- /// This test runs when we are NOT the leader (receiving proposal from another participant). - fn certification_after_notarize_timeout_as_follower(mut fixture: F) + /// Demonstrates that validators in future views cannot retroactively help + /// stuck validators escape via nullification. + /// + /// This test extends the previous scenario to show that: + /// 1. A stuck validator (view 3) cannot be rescued by notarizations from future views + /// 2. The only escape route is a finalization certificate (which requires Byzantine cooperation) + /// + /// Once the f+1 honest validators certify view 3 and advance to view 4, + /// they can only vote to nullify view 4 (their current view) without equivocating. + /// The `timeout` function only votes to nullify `self.view` (current view). + fn only_finalization_rescues_validator(mut fixture: F) where S: Scheme, F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, { - let n = 5; + let n = 4; let quorum = quorum(n); - let namespace = b"certification_after_notarize_timeout_as_follower".to_vec(); + let namespace = b"future_notarization_no_rescue".to_vec(); let executor = deterministic::Runner::timed(Duration::from_secs(60)); executor.start(|mut context| async move { // Get participants @@ -5052,96 +6738,136 @@ mod tests { let oracle = start_test_network_with_peers(context.clone(), participants.clone(), true).await; - // Setup application mock and voter + // Setup voter with Certifier::Cancel to simulate missing verification context. 
let elector = RoundRobin::::default(); - let built_elector: RoundRobinElector = elector - .clone() - .build(&participants.clone().try_into().unwrap()); - let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( &mut context, &oracle, &participants, &schemes, - elector, - Duration::from_secs(10), - Duration::from_secs(10), - Duration::from_secs(100), - mocks::application::Certifier::Always, + elector.clone(), + Duration::from_secs(2), + Duration::from_secs(3), + Duration::from_secs(1), + mocks::application::Certifier::Cancel, ) .await; - // Advance to view 3 where we're a follower. - // With RoundRobin, epoch=333, n=5: leader = (333 + view) % 5 - // View 3: leader = 1 (not us) - let target_view = View::new(3); + // Advance to view 4 so the stuck round is not leader-owned by this validator. + let view_4 = View::new(4); let parent_payload = advance_to_view( &mut mailbox, &mut batcher_receiver, &schemes, quorum, - target_view, + view_4, ) .await; - assert_ne!( - built_elector.elect(Round::new(Epoch::new(333), target_view), None), - Participant::new(0), - "we should not be leader at view 3" - ); - // Create and send proposal as if from the leader (participant 1) - let proposal = Proposal::new( - Round::new(Epoch::new(333), target_view), - target_view.previous().unwrap(), - Sha256::hash(b"follower_test"), + let proposal_4 = Proposal::new( + Round::new(Epoch::new(333), view_4), + view_4.previous().unwrap(), + Sha256::hash(b"view_4_proposal"), ); let leader = participants[1].clone(); - let contents = (proposal.round, parent_payload, 0u64).encode(); - relay.broadcast(&leader, (proposal.payload, contents)); - mailbox.proposal(proposal.clone()).await; + let contents = (proposal_4.round, parent_payload, 0u64).encode(); + relay.broadcast(&leader, (proposal_4.payload, contents)); + mailbox.proposal(proposal_4.clone()).await; - // Wait for notarize vote + let (_, notarization_4) = 
build_notarization(&schemes, &proposal_4, quorum); + mailbox + .resolved(Certificate::Notarization(notarization_4)) + .await; + + // Wait for the first nullify vote (confirms stuck state) loop { select! { msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Notarize(n)) - if n.view() == target_view => + batcher::Message::Constructed(Vote::Nullify(n)) if n.view() == view_4 => break, batcher::Message::Update { response, .. } => response.send(None).unwrap(), _ => {} }, - _ = context.sleep(Duration::from_secs(5)) => { - panic!("expected notarize vote"); + _ = context.sleep(Duration::from_secs(10)) => { + panic!("expected nullify vote for view 4"); }, } } - // Trigger timeout - context.sleep(Duration::from_secs(11)).await; + // Now simulate what the "advanced" validators (f+1 honest with context) are doing: + // They certified view 4 and advanced to view 5, where they're making progress. + // Send a notarization for view 5 to the stuck validator. + let view_5 = View::new(5); + let proposal_5 = Proposal::new( + Round::new(Epoch::new(333), view_5), + view_4, // Parent is view 4 (certified by the advanced validators) + Sha256::hash(b"view_5_proposal"), + ); + let (_, notarization_5) = build_notarization(&schemes, &proposal_5, quorum); - // Wait for nullify vote - loop { + // Send the view 5 notarization to the stuck validator + mailbox + .resolved(Certificate::Notarization(notarization_5)) + .await; + + // The stuck validator should still not advance. + // + // Receiving a notarization for view 5 doesn't help because: + // 1. add_notarization() does not call enter_view() - it only adds to certification_candidates + // 2. To advance past view 4, the validator needs EITHER: + // a. Certification of view 4 to succeed (impossible - no context) + // b. A nullification certificate for view 4 (impossible - only f votes) + // c. A finalization certificate (requires Byzantine to vote finalize) + let advanced = loop { select! 
{ - msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Nullify(n)) - if n.view() == target_view => - break, - batcher::Message::Update { response, .. } => response.send(None).unwrap(), - _ => {} + msg = batcher_receiver.recv() => { + match msg.unwrap() { + batcher::Message::Update { + current, response, .. + } => { + response.send(None).unwrap(); + if current > view_4 { + break true; + } + } + batcher::Message::Constructed(Vote::Nullify(n)) => { + // Still voting nullify for view 4 - expected + assert_eq!( + n.view(), + view_4, + "should only vote nullify for stuck view" + ); + } + _ => {} + } }, - _ = context.sleep(Duration::from_secs(1)) => { - panic!("expected nullify vote"); + _ = context.sleep(Duration::from_secs(5)) => { + break false; }, } - } + }; - // Send notarization certificate - let (_, notarization) = build_notarization(&schemes, &proposal, quorum); + assert!( + !advanced, + "receiving a notarization for view 5 should NOT rescue the stuck validator - \ + they still can't certify view 4 (no context) and can't form a nullification \ + (not enough votes). The f+1 honest validators who advanced to view 5 cannot \ + retroactively help because they can only vote nullify for their current view (5), \ + not for view 4." + ); + + // HOWEVER: A finalization certificate WOULD rescue the stuck validator. + // If the Byzantine validators eventually cooperate and vote finalize, + // the finalization would abort the stuck certification and advance the view. + // + // Let's demonstrate this escape route works (if Byzantine cooperate): + let (_, finalization_5) = build_finalization(&schemes, &proposal_5, quorum); mailbox - .recovered(Certificate::Notarization(notarization)) + .resolved(Certificate::Finalization(finalization_5)) .await; - // Verify view advances - let advanced = loop { + // Now the validator SHOULD advance (finalization aborts stuck certification) + let rescued = loop { select! 
{ msg = batcher_receiver.recv() => { if let batcher::Message::Update { @@ -5149,7 +6875,7 @@ mod tests { } = msg.unwrap() { response.send(None).unwrap(); - if current > target_view { + if current > view_5 { break true; } } @@ -5159,42 +6885,50 @@ mod tests { }, } }; + assert!( - advanced, - "view should advance after certification (follower case)" + rescued, + "a finalization certificate SHOULD rescue the stuck validator - \ + this is the ONLY escape route, but it requires Byzantine cooperation \ + (they must vote finalize). If Byzantine permanently withhold finalize votes, \ + the stuck validators are permanently excluded from consensus." ); }); } #[test_traced] - fn test_certification_after_notarize_timeout_as_follower() { - certification_after_notarize_timeout_as_follower::<_, _>( - bls12381_threshold_vrf::fixture::, - ); - certification_after_notarize_timeout_as_follower::<_, _>( - bls12381_threshold_vrf::fixture::, - ); - certification_after_notarize_timeout_as_follower::<_, _>( - bls12381_multisig::fixture::, - ); - certification_after_notarize_timeout_as_follower::<_, _>( - bls12381_multisig::fixture::, - ); - certification_after_notarize_timeout_as_follower::<_, _>(ed25519::fixture); - certification_after_notarize_timeout_as_follower::<_, _>(secp256r1::fixture); + fn test_only_finalization_rescues_validator() { + only_finalization_rescues_validator::<_, _>(bls12381_threshold_vrf::fixture::); + only_finalization_rescues_validator::<_, _>(bls12381_threshold_vrf::fixture::); + only_finalization_rescues_validator::<_, _>(bls12381_multisig::fixture::); + only_finalization_rescues_validator::<_, _>(bls12381_multisig::fixture::); + only_finalization_rescues_validator::<_, _>(ed25519::fixture); + only_finalization_rescues_validator::<_, _>(secp256r1::fixture); } - /// Tests certification after: notarize -> timeout -> receive notarization -> certify. - /// This test runs when we ARE the leader (proposing ourselves). 
- fn certification_after_notarize_timeout_as_leader(mut fixture: F) + /// Tests that when certification explicitly fails (returns false), the voter: + /// 1. Can vote nullify even after having voted notarize + /// 2. Will emit a nullify vote immediately after certification failure + /// + /// This simulates the coding marshal scenario where: + /// - verify() returns true (shard validity passes) + /// - Voter votes notarize + /// - Notarization forms + /// - certify() returns false (block context mismatch discovered during deferred_verify) + /// - Voter should vote nullify to attempt to advance + /// + /// The liveness concern is: if only f honest validators can vote nullify (the ones who + /// never saw the shard/never verified), then nullification quorum (2f+1) cannot form + /// since the f+1 honest who voted notarize need to also vote nullify. + fn certification_failure_allows_nullify_after_notarize(mut fixture: F) where S: Scheme, F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, { let n = 5; let quorum = quorum(n); - let namespace = b"certification_after_notarize_timeout_as_leader".to_vec(); - let executor = deterministic::Runner::timed(Duration::from_secs(60)); + let namespace = b"cert_fail_nullify".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(10)); executor.start(|mut context| async move { // Get participants let Fixture { @@ -5204,32 +6938,33 @@ mod tests { } = fixture(&mut context, &namespace, n); // Create simulated network - let oracle = - start_test_network_with_peers(context.clone(), participants.clone(), true).await; + let oracle = start_test_network_with_peers( + context.clone(), + participants.clone(), + true, + ) + .await; - // Setup application mock and voter let elector = RoundRobin::::default(); - let built_elector: RoundRobinElector = elector - .clone() - .build(&participants.clone().try_into().unwrap()); - let (mut mailbox, mut batcher_receiver, _, _, _) = setup_voter( + + // Set up voter with 
Certifier::Custom that always returns false + // This simulates coding marshal's deferred_verify finding context mismatch + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( &mut context, &oracle, &participants, &schemes, elector, - Duration::from_secs(10), - Duration::from_secs(10), + Duration::from_secs(100), // Long timeout to prove nullify comes from cert failure + Duration::from_secs(100), Duration::from_secs(100), - mocks::application::Certifier::Always, + mocks::application::Certifier::Custom(Box::new(|_, _| false)), ) .await; - // Advance to view 2 where we ARE the leader. - // With RoundRobin, epoch=333, n=5: leader = (333 + view) % 5 - // View 2: leader = 0 (us) - let target_view = View::new(2); - advance_to_view( + // Advance to view 3 where we're a follower. + let target_view = View::new(3); + let parent_payload = advance_to_view( &mut mailbox, &mut batcher_receiver, &schemes, @@ -5237,109 +6972,100 @@ mod tests { target_view, ) .await; - assert_eq!( - built_elector.elect(Round::new(Epoch::new(333), target_view), None), - Participant::new(0), - "we should be leader at view 2" - ); - - // As leader, wait for our own notarize vote (automaton will propose) - let proposal = loop { - select! { - msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Notarize(n)) - if n.view() == target_view => - { - break n.proposal.clone(); - } - batcher::Message::Update { response, .. } => response.send(None).unwrap(), - _ => {} - }, - _ = context.sleep(Duration::from_secs(5)) => { - panic!("expected notarize vote as leader"); - }, - } - }; - // Trigger timeout - context.sleep(Duration::from_secs(11)).await; + // Broadcast the payload contents so verification can complete. 
+ let proposal = Proposal::new( + Round::new(Epoch::new(333), target_view), + target_view.previous().unwrap(), + Sha256::hash(b"test_proposal"), + ); + let leader = participants[1].clone(); + let contents = (proposal.round, parent_payload, 0u64).encode(); + relay.broadcast(&leader, (proposal.payload, contents)); + mailbox.proposal(proposal.clone()).await; - // Wait for nullify vote + // Wait for notarize vote first (verification passes) loop { select! { msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Nullify(n)) - if n.view() == target_view => - break, + batcher::Message::Constructed(Vote::Notarize(n)) if n.view() == target_view => { + break; + } batcher::Message::Update { response, .. } => response.send(None).unwrap(), _ => {} }, - _ = context.sleep(Duration::from_secs(1)) => { - panic!("expected nullify vote"); + _ = context.sleep(Duration::from_secs(2)) => { + panic!("expected notarize vote for view {target_view}"); }, } } - // Send notarization certificate (as if other participants formed it) + // Build and send notarization so the voter tries to certify let (_, notarization) = build_notarization(&schemes, &proposal, quorum); mailbox - .recovered(Certificate::Notarization(notarization)) + .resolved(Certificate::Notarization(notarization)) .await; - // Verify view advances - let advanced = loop { + // Certification will fail (returns false), so the voter should emit a nullify vote. + // This must happen quickly (not after 100s timeout) to prove it's from cert failure. + loop { select! { - msg = batcher_receiver.recv() => { - if let batcher::Message::Update { - current, response, .. 
- } = msg.unwrap() - { - response.send(None).unwrap(); - if current > target_view { - break true; - } + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Nullify(nullify)) if nullify.view() == target_view => { + // Successfully voted nullify after having voted notarize + break; } + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + _ => {} }, _ = context.sleep(Duration::from_secs(5)) => { - break false; + panic!( + "voter should emit nullify for view {target_view} after certification failure, \ + even though it already voted notarize" + ); }, } - }; - assert!( - advanced, - "view should advance after certification (leader case)" - ); + } }); } #[test_traced] - fn test_certification_after_notarize_timeout_as_leader() { - certification_after_notarize_timeout_as_leader::<_, _>( + fn test_certification_failure_allows_nullify_after_notarize() { + certification_failure_allows_nullify_after_notarize::<_, _>( bls12381_threshold_vrf::fixture::, ); - certification_after_notarize_timeout_as_leader::<_, _>( + certification_failure_allows_nullify_after_notarize::<_, _>( bls12381_threshold_vrf::fixture::, ); - certification_after_notarize_timeout_as_leader::<_, _>( + certification_failure_allows_nullify_after_notarize::<_, _>( bls12381_multisig::fixture::, ); - certification_after_notarize_timeout_as_leader::<_, _>( + certification_failure_allows_nullify_after_notarize::<_, _>( bls12381_multisig::fixture::, ); - certification_after_notarize_timeout_as_leader::<_, _>(ed25519::fixture); - certification_after_notarize_timeout_as_leader::<_, _>(secp256r1::fixture); + certification_failure_allows_nullify_after_notarize::<_, _>(ed25519::fixture); + certification_failure_allows_nullify_after_notarize::<_, _>(secp256r1::fixture); } - /// Tests that when certification returns a cancelled receiver, the voter doesn't hang - /// and continues to make progress (via voting to nullify the view that could not be certified). 
- fn cancelled_certification_does_not_hang(mut fixture: F, traces: TraceStorage) + /// Verify that a voter recovers via timeout when certification hangs indefinitely. + /// + /// This simulates the scenario where a notarization forms but the block is + /// unrecoverable (e.g., proposer is dead and shard gossip didn't deliver enough + /// shards for reconstruction). In this case, `certify()` subscribes to the block + /// but the subscription never resolves. The voter must rely on the view timeout + /// to emit a nullify vote and advance the chain. + /// + /// Unlike `Cancel` mode (where the certify receiver errors immediately), `Pending` + /// mode holds the certify sender alive so the future never completes, forcing the + /// voter to recover purely through its timeout mechanism. + fn pending_certification_nullifies_on_timeout(mut fixture: F) where S: Scheme, F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, { let n = 5; let quorum = quorum(n); - let namespace = b"consensus".to_vec(); + let namespace = b"pending_cert_nullify".to_vec(); let executor = deterministic::Runner::timed(Duration::from_secs(10)); executor.start(|mut context| async move { // Get participants @@ -5350,32 +7076,26 @@ mod tests { } = fixture(&mut context, &namespace, n); // Create simulated network - let oracle = start_test_network_with_peers( - context.clone(), - participants.clone(), - true, - ) - .await; + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; let elector = RoundRobin::::default(); - // Set up voter with Certifier::Cancel - let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( + // Set up voter with Certifier::Pending (certify hangs indefinitely). 
+ let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( &mut context, &oracle, &participants, &schemes, elector, - Duration::from_millis(500), - Duration::from_millis(500), + Duration::from_secs(3), + Duration::from_secs(4), Duration::from_mins(60), - mocks::application::Certifier::Cancel, + mocks::application::Certifier::Pending, ) .await; // Advance to view 3 where we're a follower. - // With RoundRobin, epoch=333, n=5: leader = (333 + view) % 5 - // View 3: leader = 1 (not us) let target_view = View::new(3); let parent_payload = advance_to_view( &mut mailbox, @@ -5394,94 +7114,88 @@ mod tests { ); let leader = participants[1].clone(); let contents = (proposal.round, parent_payload, 0u64).encode(); - relay - .broadcast(&leader, (proposal.payload, contents)); + relay.broadcast(&leader, (proposal.payload, contents)); mailbox.proposal(proposal.clone()).await; - // Build and send notarization so the voter tries to certify + // Wait for notarize vote (verification passes). + loop { + select! { + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Notarize(n)) + if n.view() == target_view => + { + break; + } + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + _ => {} + }, + _ = context.sleep(Duration::from_secs(2)) => { + panic!("expected notarize vote for view {target_view}"); + }, + } + } + + // Build and send notarization so the voter tries to certify. let (_, notarization) = build_notarization(&schemes, &proposal, quorum); mailbox .resolved(Certificate::Notarization(notarization)) .await; - // Certification will be cancelled, so the voter should eventually timeout - // and emit a nullify vote. + // Certification hangs (sender held alive, receiver pending). The voter + // must recover via the view timeout and emit a nullify vote. loop { select! 
{ - msg = batcher_receiver.recv() => { - match msg.unwrap() { - batcher::Message::Constructed(Vote::Nullify(nullify)) if nullify.view() == target_view => { - break; - } - batcher::Message::Update { response, .. } => response.send(None).unwrap(), - _ => {} + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == target_view => + { + // Timeout fired and voter emitted nullify despite + // certification being indefinitely pending. + break; } + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + _ => {} }, - _ = context.sleep(Duration::from_secs(5)) => { + _ = context.sleep(Duration::from_secs(8)) => { panic!( - "voter should emit nullify for view {target_view} despite cancelled certification", + "voter should emit nullify for view {target_view} via timeout \ + when certification hangs indefinitely", ); }, } } - - // Verify the "failed to certify proposal" log was emitted with the correct round - let expected_round = format!("Round {{ epoch: Epoch(333), view: View({target_view}) }}"); - traces - .get_by_level(Level::DEBUG) - .expect_event(|event| { - event.metadata.content == "failed to certify proposal" - && event - .metadata - .fields - .iter() - .any(|(name, value)| name == "err" && value == "RecvError(())") - && event - .metadata - .fields - .iter() - .any(|(name, value)| name == "round" && value == &expected_round) - }) - .unwrap(); }); } - #[test_collect_traces] - fn test_cancelled_certification_does_not_hang(traces: TraceStorage) { - cancelled_certification_does_not_hang( + #[test_traced] + fn test_pending_certification_nullifies_on_timeout() { + pending_certification_nullifies_on_timeout::<_, _>( bls12381_threshold_vrf::fixture::, - traces.clone(), ); - cancelled_certification_does_not_hang( + pending_certification_nullifies_on_timeout::<_, _>( bls12381_threshold_vrf::fixture::, - traces.clone(), - ); - cancelled_certification_does_not_hang( - 
bls12381_multisig::fixture::, - traces.clone(), - ); - cancelled_certification_does_not_hang( - bls12381_multisig::fixture::, - traces.clone(), ); - cancelled_certification_does_not_hang(ed25519::fixture, traces.clone()); - cancelled_certification_does_not_hang(secp256r1::fixture, traces); + pending_certification_nullifies_on_timeout::<_, _>(bls12381_multisig::fixture::); + pending_certification_nullifies_on_timeout::<_, _>(bls12381_multisig::fixture::); + pending_certification_nullifies_on_timeout::<_, _>(ed25519::fixture); + pending_certification_nullifies_on_timeout::<_, _>(secp256r1::fixture); } - /// Regression: a canceled certification attempt must not be persisted as failure. + /// Regression: once a proposal is received, leader timeout must no longer fire for that view. /// - /// We first trigger a canceled certify receiver, restart the voter, and then require: - /// 1. successful certification for the same view from replayed notarization state, and - /// 2. no immediate timeout/nullify for that view after restart. - fn cancelled_certification_recertifies_after_restart(mut fixture: F) + /// We require: + /// 1. No nullify before `certification_timeout` even though `leader_timeout` has elapsed. + /// 2. Nullify eventually arrives only after `certification_timeout` when no + /// certificate progress occurs. 
+ fn proposal_clears_leader_timeout_before_certification_timeout(mut fixture: F) where S: Scheme, F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, { let n = 5; let quorum = quorum(n); - let namespace = b"cancelled_cert_restart_recertify".to_vec(); - let executor = deterministic::Runner::timed(Duration::from_secs(20)); + let namespace = b"proposal_clears_leader_timeout".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(15)); executor.start(|mut context| async move { let Fixture { participants, @@ -5497,80 +7211,20 @@ mod tests { ) .await; - let me = participants[0].clone(); let elector = RoundRobin::::default(); - let reporter_cfg = mocks::reporter::Config { - participants: participants.clone().try_into().unwrap(), - scheme: schemes[0].clone(), - elector: elector.clone(), - }; - let reporter = - mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); - let relay = Arc::new(mocks::relay::Relay::new()); - - let partition = "cancelled_certification_recertifies_after_restart".to_string(); - let epoch = Epoch::new(333); - - // First run: certification receiver gets cancelled. 
- let app_cfg = mocks::application::Config { - hasher: Sha256::default(), - relay: relay.clone(), - me: me.clone(), - propose_latency: (1.0, 0.0), - verify_latency: (1.0, 0.0), - certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Cancel, - }; - let (app_actor, application) = - mocks::application::Application::new(context.with_label("app_cancel"), app_cfg); - app_actor.start(); - - let voter_cfg = Config { - scheme: schemes[0].clone(), - elector: elector.clone(), - blocker: oracle.control(me.clone()), - automaton: application.clone(), - relay: application.clone(), - reporter: reporter.clone(), - partition: partition.clone(), - epoch, - mailbox_size: 128, - leader_timeout: Duration::from_secs(5), - certification_timeout: Duration::from_secs(5), - timeout_retry: Duration::from_mins(60), - activity_timeout: ViewDelta::new(10), - replay_buffer: NZUsize!(1024 * 1024), - write_buffer: NZUsize!(1024 * 1024), - page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), - }; - let (voter, mut mailbox) = Actor::new(context.with_label("voter_cancel"), voter_cfg); - - let (resolver_sender, _resolver_receiver) = mpsc::channel(8); - let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); - let (vote_sender, _) = oracle - .control(me.clone()) - .register(0, TEST_QUOTA) - .await - .unwrap(); - let (cert_sender, _) = oracle - .control(me.clone()) - .register(1, TEST_QUOTA) - .await - .unwrap(); - - let handle = voter.start( - batcher::Mailbox::new(batcher_sender), - resolver::Mailbox::new(resolver_sender), - vote_sender, - cert_sender, - ); - - if let batcher::Message::Update { response, .. } = - batcher_receiver.recv().await.unwrap() - { - response.send(None).unwrap(); - } + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( + &mut context, + &oracle, + &participants, + &schemes, + elector, + Duration::from_secs(1), + Duration::from_secs(5), + Duration::from_mins(60), + ) + .await; + // Advance to a follower view. 
let target_view = View::new(3); let parent_payload = advance_to_view( &mut mailbox, @@ -5581,192 +7235,111 @@ mod tests { ) .await; + // Submit proposal quickly so leader timeout is cleared. let proposal = Proposal::new( - Round::new(epoch, target_view), + Round::new(Epoch::new(333), target_view), target_view.previous().unwrap(), - Sha256::hash(b"restart_recertify_payload"), + Sha256::hash(b"proposal_clears_leader_timeout"), ); let leader = participants[1].clone(); let contents = (proposal.round, parent_payload, 0u64).encode(); relay.broadcast(&leader, (proposal.payload, contents)); mailbox.proposal(proposal.clone()).await; - let (_, notarization) = build_notarization(&schemes, &proposal, quorum); - mailbox - .resolved(Certificate::Notarization(notarization)) - .await; - - // Give the canceled certification attempt time to run before restart. - context.sleep(Duration::from_millis(200)).await; - - // Sanity check: canceled certification should not have advanced this view yet. - let advanced_before_restart = select! { - msg = batcher_receiver.recv() => { - if let batcher::Message::Update { - current, response, .. - } = msg.unwrap() - { - response.send(None).unwrap(); - current > target_view - } else { - false - } - }, - _ = context.sleep(Duration::from_millis(200)) => false, - }; - assert!( - !advanced_before_restart, - "view should not advance before restart when certification receiver is canceled" - ); - - // Restart voter. - handle.abort(); - - // Second run: certification should succeed from replayed state. - // Use a longer certify latency so there is a real window where an - // incorrect immediate nullify could fire after restart. 
- let app_cfg = mocks::application::Config { - hasher: Sha256::default(), - relay: relay.clone(), - me: me.clone(), - propose_latency: (1.0, 0.0), - verify_latency: (1.0, 0.0), - certify_latency: (2_000.0, 0.0), // 2 seconds - should_certify: mocks::application::Certifier::Always, - }; - let (app_actor, application) = - mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); - app_actor.start(); - - let voter_cfg = Config { - scheme: schemes[0].clone(), - elector, - blocker: oracle.control(me.clone()), - automaton: application.clone(), - relay: application.clone(), - reporter: reporter.clone(), - partition, - epoch, - mailbox_size: 128, - leader_timeout: Duration::from_secs(5), - certification_timeout: Duration::from_secs(5), - timeout_retry: Duration::from_mins(60), - activity_timeout: ViewDelta::new(10), - replay_buffer: NZUsize!(1024 * 1024), - write_buffer: NZUsize!(1024 * 1024), - page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), - }; - let (voter, _mailbox) = Actor::new(context.with_label("voter_restarted"), voter_cfg); - - let (resolver_sender, mut resolver_receiver) = mpsc::channel(8); - let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); - let (vote_sender, _) = oracle - .control(me.clone()) - .register(2, TEST_QUOTA) - .await - .unwrap(); - let (cert_sender, _) = oracle - .control(me.clone()) - .register(3, TEST_QUOTA) - .await - .unwrap(); - - voter.start( - batcher::Mailbox::new(batcher_sender), - resolver::Mailbox::new(resolver_sender), - vote_sender, - cert_sender, - ); - - if let batcher::Message::Update { response, .. } = - batcher_receiver.recv().await.unwrap() - { - response.send(None).unwrap(); - } - + // Ensure proposal verification path ran. loop { select! 
{ - msg = resolver_receiver.recv() => match msg.unwrap() { - MailboxMessage::Certified { view, success } if view == target_view => { - assert!(success, "expected successful certification after restart for canceled certification view"); + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Notarize(v)) if v.view() == target_view => { break; } - MailboxMessage::Certified { .. } | MailboxMessage::Certificate(_) => {} + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + _ => {} }, - msg = batcher_receiver.recv() => { - match msg.unwrap() { - batcher::Message::Constructed(Vote::Nullify(nullify)) - if nullify.view() == target_view => - { - panic!("unexpected immediate nullify for view {target_view} after restart"); - } - batcher::Message::Update { response, .. } => { - response.send(None).unwrap(); - } - _ => {} + _ = context.sleep(Duration::from_secs(2)) => { + panic!("expected notarize vote for view {target_view}"); + }, + } + } + + // `leader_timeout` is 1s and `certification_timeout` is 5s. We should not + // see nullify in this 2s window after proposal handling, even though + // leader timeout has elapsed. + let no_nullify_deadline = context.current() + Duration::from_secs(2); + loop { + select! { + _ = context.sleep_until(no_nullify_deadline) => break, + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == target_view => + { + panic!( + "received nullify for view {target_view} before certification timeout" + ); + } + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + _ => {} + } + } + } + + // After certification timeout elapses, timeout recovery must emit nullify. + loop { + select! { + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == target_view => + { + break; } + batcher::Message::Update { response, .. 
} => response.send(None).unwrap(), + _ => {} }, - _ = context.sleep(Duration::from_secs(5)) => { + _ = context.sleep(Duration::from_secs(6)) => { panic!( - "timed out waiting for successful certification for restarted view {target_view}" + "expected nullify for view {target_view} after certification timeout" ); }, } - }; - - // Give reporter a moment to ingest any late events and ensure no nullify artifacts - // were emitted for the restarted target view. - context.sleep(Duration::from_millis(100)).await; - assert!( - !reporter.nullifies.lock().contains_key(&target_view), - "did not expect nullify votes for restarted view {target_view}" - ); - assert!( - !reporter.nullifications.lock().contains_key(&target_view), - "did not expect nullification certificate for restarted view {target_view}" - ); + } }); } #[test_traced] - fn test_cancelled_certification_recertifies_after_restart() { - cancelled_certification_recertifies_after_restart::<_, _>( + fn test_proposal_clears_leader_timeout_before_certification_timeout() { + proposal_clears_leader_timeout_before_certification_timeout::<_, _>( bls12381_threshold_vrf::fixture::, ); - cancelled_certification_recertifies_after_restart::<_, _>( + proposal_clears_leader_timeout_before_certification_timeout::<_, _>( bls12381_threshold_vrf::fixture::, ); - cancelled_certification_recertifies_after_restart::<_, _>( + proposal_clears_leader_timeout_before_certification_timeout::<_, _>( bls12381_multisig::fixture::, ); - cancelled_certification_recertifies_after_restart::<_, _>( + proposal_clears_leader_timeout_before_certification_timeout::<_, _>( bls12381_multisig::fixture::, ); - cancelled_certification_recertifies_after_restart::<_, _>(ed25519::fixture); - cancelled_certification_recertifies_after_restart::<_, _>(secp256r1::fixture); + proposal_clears_leader_timeout_before_certification_timeout::<_, _>(ed25519::fixture); + proposal_clears_leader_timeout_before_certification_timeout::<_, _>(secp256r1::fixture); } - /// Demonstrates 
that validators in future views cannot retroactively help - /// stuck validators escape via nullification. - /// - /// This test extends the previous scenario to show that: - /// 1. A stuck validator (view 3) cannot be rescued by notarizations from future views - /// 2. The only escape route is a finalization certificate (which requires Byzantine cooperation) + /// Regression: proposals recovered from notarization certificates must clear the + /// current view's leader timeout without emitting a local notarize vote. /// - /// Once the f+1 honest validators certify view 3 and advance to view 4, - /// they can only vote to nullify view 4 (their current view) without equivocating. - /// The `timeout` function only votes to nullify `self.view` (current view). - fn only_finalization_rescues_validator(mut fixture: F) + /// We require: + /// 1. No nullify before `certification_timeout` even though `leader_timeout` has elapsed. + /// 2. Nullify eventually arrives only after `certification_timeout` when certification + /// remains pending. 
+ fn recovered_proposal_clears_leader_timeout_before_certification_timeout(mut fixture: F) where S: Scheme, F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, { - let n = 4; + let n = 5; let quorum = quorum(n); - let namespace = b"future_notarization_no_rescue".to_vec(); - let executor = deterministic::Runner::timed(Duration::from_secs(60)); + let namespace = b"recovered_proposal_clears_leader_timeout".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(15)); executor.start(|mut context| async move { - // Get participants let Fixture { participants, schemes, @@ -5774,202 +7347,131 @@ mod tests { } = fixture(&mut context, &namespace, n); // Create simulated network - let oracle = - start_test_network_with_peers(context.clone(), participants.clone(), true).await; + let oracle = start_test_network_with_peers( + context.clone(), + participants.clone(), + true, + ) + .await; - // Setup voter with Certifier::Cancel to simulate missing verification context. let elector = RoundRobin::::default(); - let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( + let (mut mailbox, mut batcher_receiver, _, _, _) = setup_voter_with_certifier( &mut context, &oracle, &participants, &schemes, - elector.clone(), - Duration::from_secs(2), - Duration::from_secs(3), + elector, Duration::from_secs(1), - mocks::application::Certifier::Cancel, + Duration::from_secs(5), + Duration::from_mins(60), + mocks::application::Certifier::Pending, ) .await; - // Advance to view 3 - let view_3 = View::new(3); - let parent_payload = advance_to_view( + // Advance to a follower view. + let target_view = View::new(3); + advance_to_view( &mut mailbox, &mut batcher_receiver, &schemes, quorum, - view_3, + target_view, ) .await; - let proposal_3 = Proposal::new( - Round::new(Epoch::new(333), view_3), - view_3.previous().unwrap(), - Sha256::hash(b"view_3_proposal"), + // Recover a notarization that carries the proposal for this view. 
+ let proposal = Proposal::new( + Round::new(Epoch::new(333), target_view), + target_view.previous().unwrap(), + Sha256::hash(b"recovered_proposal_clears_leader_timeout"), ); - let leader = participants[1].clone(); - let contents = (proposal_3.round, parent_payload, 0u64).encode(); - relay.broadcast(&leader, (proposal_3.payload, contents)); - mailbox.proposal(proposal_3.clone()).await; - - let (_, notarization_3) = build_notarization(&schemes, &proposal_3, quorum); + let (_, notarization) = build_notarization(&schemes, &proposal, quorum); mailbox - .resolved(Certificate::Notarization(notarization_3)) + .recovered(Certificate::Notarization(notarization)) .await; - // Wait for the first nullify vote (confirms stuck state) + // `leader_timeout` is 1s and `certification_timeout` is 5s. We should not + // emit a notarize vote or nullify in this 2s window after certificate handling, + // even though leader timeout has elapsed. + let quiet_deadline = context.current() + Duration::from_secs(2); loop { select! { + _ = context.sleep_until(quiet_deadline) => break, msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Nullify(n)) if n.view() == view_3 => - break, + batcher::Message::Constructed(Vote::Notarize(v)) if v.view() == target_view => { + panic!( + "unexpected notarize for view {target_view} from recovered certificate" + ); + } + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == target_view => + { + panic!( + "received nullify for view {target_view} before certification timeout after recovered certificate" + ); + } batcher::Message::Update { response, .. } => response.send(None).unwrap(), _ => {} - }, - _ = context.sleep(Duration::from_secs(10)) => { - panic!("expected nullify vote for view 3"); - }, + } } } - // Now simulate what the "advanced" validators (f+1 honest with context) are doing: - // They certified view 3 and advanced to view 4, where they're making progress. 
- // Send a notarization for view 4 to the stuck validator. - let view_4 = View::new(4); - let proposal_4 = Proposal::new( - Round::new(Epoch::new(333), view_4), - view_3, // Parent is view 3 (certified by the advanced validators) - Sha256::hash(b"view_4_proposal"), - ); - let (_, notarization_4) = build_notarization(&schemes, &proposal_4, quorum); - - // Send the view 4 notarization to the stuck validator - mailbox - .resolved(Certificate::Notarization(notarization_4)) - .await; - - // The stuck validator should still not advance. - // - // Receiving a notarization for view 4 doesn't help because: - // 1. add_notarization() does not call enter_view() - it only adds to certification_candidates - // 2. To advance past view 3, the validator needs EITHER: - // a. Certification of view 3 to succeed (impossible - no context) - // b. A nullification certificate for view 3 (impossible - only f votes) - // c. A finalization certificate (requires Byzantine to vote finalize) - let advanced = loop { - select! { - msg = batcher_receiver.recv() => { - match msg.unwrap() { - batcher::Message::Update { - current, response, .. - } => { - response.send(None).unwrap(); - if current > view_3 { - break true; - } - } - batcher::Message::Constructed(Vote::Nullify(n)) => { - // Still voting nullify for view 3 - expected - assert_eq!( - n.view(), - view_3, - "should only vote nullify for stuck view" - ); - } - _ => {} - } - }, - _ = context.sleep(Duration::from_secs(5)) => { - break false; - }, - } - }; - - assert!( - !advanced, - "receiving a notarization for view 4 should NOT rescue the stuck validator - \ - they still can't certify view 3 (no context) and can't form a nullification \ - (not enough votes). The f+1 honest validators who advanced to view 4 cannot \ - retroactively help because they can only vote nullify for their current view (4), \ - not for view 3." - ); - - // HOWEVER: A finalization certificate WOULD rescue the stuck validator. 
- // If the Byzantine validators eventually cooperate and vote finalize, - // the finalization would abort the stuck certification and advance the view. - // - // Let's demonstrate this escape route works (if Byzantine cooperate): - let (_, finalization_4) = build_finalization(&schemes, &proposal_4, quorum); - mailbox - .resolved(Certificate::Finalization(finalization_4)) - .await; - - // Now the validator SHOULD advance (finalization aborts stuck certification) - let rescued = loop { + // After certification timeout elapses, timeout recovery must emit nullify. + loop { select! { - msg = batcher_receiver.recv() => { - if let batcher::Message::Update { - current, response, .. - } = msg.unwrap() + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == target_view => { - response.send(None).unwrap(); - if current > view_4 { - break true; - } + break; } + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + _ => {} }, - _ = context.sleep(Duration::from_secs(5)) => { - break false; + _ = context.sleep(Duration::from_secs(6)) => { + panic!( + "expected nullify for view {target_view} after certification timeout with recovered certificate" + ); }, } - }; - - assert!( - rescued, - "a finalization certificate SHOULD rescue the stuck validator - \ - this is the ONLY escape route, but it requires Byzantine cooperation \ - (they must vote finalize). If Byzantine permanently withhold finalize votes, \ - the stuck validators are permanently excluded from consensus." 
- ); + } }); } #[test_traced] - fn test_only_finalization_rescues_validator() { - only_finalization_rescues_validator::<_, _>(bls12381_threshold_vrf::fixture::); - only_finalization_rescues_validator::<_, _>(bls12381_threshold_vrf::fixture::); - only_finalization_rescues_validator::<_, _>(bls12381_multisig::fixture::); - only_finalization_rescues_validator::<_, _>(bls12381_multisig::fixture::); - only_finalization_rescues_validator::<_, _>(ed25519::fixture); - only_finalization_rescues_validator::<_, _>(secp256r1::fixture); + fn test_recovered_proposal_clears_leader_timeout_before_certification_timeout() { + recovered_proposal_clears_leader_timeout_before_certification_timeout::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + recovered_proposal_clears_leader_timeout_before_certification_timeout::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + recovered_proposal_clears_leader_timeout_before_certification_timeout::<_, _>( + bls12381_multisig::fixture::, + ); + recovered_proposal_clears_leader_timeout_before_certification_timeout::<_, _>( + bls12381_multisig::fixture::, + ); + recovered_proposal_clears_leader_timeout_before_certification_timeout::<_, _>( + ed25519::fixture, + ); + recovered_proposal_clears_leader_timeout_before_certification_timeout::<_, _>( + secp256r1::fixture, + ); } - /// Tests that when certification explicitly fails (returns false), the voter: - /// 1. Can vote nullify even after having voted notarize - /// 2. 
Will emit a nullify vote immediately after certification failure - /// - /// This simulates the coding marshal scenario where: - /// - verify() returns true (shard validity passes) - /// - Voter votes notarize - /// - Notarization forms - /// - certify() returns false (block context mismatch discovered during deferred_verify) - /// - Voter should vote nullify to attempt to advance - /// - /// The liveness concern is: if only f honest validators can vote nullify (the ones who - /// never saw the shard/never verified), then nullification quorum (2f+1) cannot form - /// since the f+1 honest who voted notarize need to also vote nullify. - fn certification_failure_allows_nullify_after_notarize(mut fixture: F) + /// Regression: after a timed-out view is nullified and the voter advances, + /// the next view must start with a fresh leader timeout. + fn next_view_gets_fresh_timeout_after_prior_view_nullifies(mut fixture: F) where S: Scheme, F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, { let n = 5; let quorum = quorum(n); - let namespace = b"cert_fail_nullify".to_vec(); - let executor = deterministic::Runner::timed(Duration::from_secs(10)); + let namespace = b"next_view_gets_fresh_timeout_after_prior_view_nullifies".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(15)); executor.start(|mut context| async move { - // Get participants let Fixture { participants, schemes, @@ -5984,130 +7486,142 @@ mod tests { ) .await; - let elector = RoundRobin::::default(); - - // Set up voter with Certifier::Custom that always returns false - // This simulates coding marshal's deferred_verify finding context mismatch - let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( + let (mut mailbox, mut batcher_receiver, _, _, _) = setup_voter( &mut context, &oracle, &participants, &schemes, - elector, - Duration::from_secs(100), // Long timeout to prove nullify comes from cert failure - Duration::from_secs(100), - Duration::from_secs(100), - 
mocks::application::Certifier::Custom(Box::new(|_| false)), - ) - .await; - - // Advance to view 3 where we're a follower. - let target_view = View::new(3); - let parent_payload = advance_to_view( - &mut mailbox, - &mut batcher_receiver, - &schemes, - quorum, - target_view, + RoundRobin::::default(), + Duration::from_secs(1), + Duration::from_secs(5), + Duration::from_mins(60), ) .await; - // Broadcast the payload contents so verification can complete. - let proposal = Proposal::new( - Round::new(Epoch::new(333), target_view), - target_view.previous().unwrap(), - Sha256::hash(b"test_proposal"), - ); - let leader = participants[1].clone(); - let contents = (proposal.round, parent_payload, 0u64).encode(); - relay.broadcast(&leader, (proposal.payload, contents)); - mailbox.proposal(proposal.clone()).await; + // Wait for the initial view 1 batcher update. + loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Update { + current, response, .. + } => { + response.send(None).unwrap(); + if current == View::new(1) { + break; + } + } + batcher::Message::Constructed(_) => {} + } + } - // Wait for notarize vote first (verification passes) + // Allow view 1 to time out and emit a nullify vote. loop { select! { msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Notarize(n)) if n.view() == target_view => { + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == View::new(1) => + { break; } batcher::Message::Update { response, .. } => response.send(None).unwrap(), _ => {} }, _ = context.sleep(Duration::from_secs(2)) => { - panic!("expected notarize vote for view {target_view}"); + panic!("expected nullify for view 1"); }, } } - // Build and send notarization so the voter tries to certify - let (_, notarization) = build_notarization(&schemes, &proposal, quorum); + // Deliver a nullification certificate for view 1 so the voter enters view 2. 
+ let (_, nullification) = + build_nullification(&schemes, Round::new(Epoch::new(333), View::new(1)), quorum); mailbox - .resolved(Certificate::Notarization(notarization)) + .resolved(Certificate::Nullification(nullification)) .await; - // Certification will fail (returns false), so the voter should emit a nullify vote. - // This must happen quickly (not after 100s timeout) to prove it's from cert failure. loop { select! { msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Nullify(nullify)) if nullify.view() == target_view => { - // Successfully voted nullify after having voted notarize - break; + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == View::new(2) => + { + panic!( + "received nullify for view 2 before its fresh leader timeout elapsed" + ); + } + batcher::Message::Update { + current, response, .. + } => { + response.send(None).unwrap(); + if current == View::new(2) { + break; + } } - batcher::Message::Update { response, .. } => response.send(None).unwrap(), _ => {} }, - _ = context.sleep(Duration::from_secs(5)) => { - panic!( - "voter should emit nullify for view {target_view} after certification failure, \ - even though it already voted notarize" - ); + _ = context.sleep(Duration::from_secs(2)) => { + panic!("expected voter to advance to view 2"); }, } } + + // The old view timed out, but the new view should still get its own leader timeout + // rather than immediately nullifying on entry. + let quiet_deadline = context.current() + Duration::from_millis(500); + loop { + select! { + _ = context.sleep_until(quiet_deadline) => break, + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == View::new(2) => + { + panic!( + "received nullify for view 2 before its fresh leader timeout elapsed" + ); + } + batcher::Message::Update { response, .. 
} => response.send(None).unwrap(), + _ => {} + } + } + } }); } #[test_traced] - fn test_certification_failure_allows_nullify_after_notarize() { - certification_failure_allows_nullify_after_notarize::<_, _>( + fn test_next_view_gets_fresh_timeout_after_prior_view_nullifies() { + next_view_gets_fresh_timeout_after_prior_view_nullifies::<_, _>( bls12381_threshold_vrf::fixture::, ); - certification_failure_allows_nullify_after_notarize::<_, _>( + next_view_gets_fresh_timeout_after_prior_view_nullifies::<_, _>( bls12381_threshold_vrf::fixture::, ); - certification_failure_allows_nullify_after_notarize::<_, _>( + next_view_gets_fresh_timeout_after_prior_view_nullifies::<_, _>( bls12381_multisig::fixture::, ); - certification_failure_allows_nullify_after_notarize::<_, _>( + next_view_gets_fresh_timeout_after_prior_view_nullifies::<_, _>( bls12381_multisig::fixture::, ); - certification_failure_allows_nullify_after_notarize::<_, _>(ed25519::fixture); - certification_failure_allows_nullify_after_notarize::<_, _>(secp256r1::fixture); + next_view_gets_fresh_timeout_after_prior_view_nullifies::<_, _>(ed25519::fixture); + next_view_gets_fresh_timeout_after_prior_view_nullifies::<_, _>(secp256r1::fixture); } - /// Verify that a voter recovers via timeout when certification hangs indefinitely. - /// - /// This simulates the scenario where a notarization forms but the block is - /// unrecoverable (e.g., proposer is dead and shard gossip didn't deliver enough - /// shards for reconstruction). In this case, `certify()` subscribes to the block - /// but the subscription never resolves. The voter must rely on the view timeout - /// to emit a nullify vote and advance the chain. + /// Regression: the first view should make progress without timing out when peers are online. 
/// - /// Unlike `Cancel` mode (where the certify receiver errors immediately), `Pending` - /// mode holds the certify sender alive so the future never completes, forcing the - /// voter to recover purely through its timeout mechanism. - fn pending_certification_nullifies_on_timeout(mut fixture: F) + /// We require: + /// 1. No `nullify(1)` is emitted while quorum certificates arrive promptly. + /// 2. The voter emits `notarize(1)`. + /// 3. After successful certification, the voter emits `finalize(1)` before + /// advancing to view 2. + fn first_view_progress_without_timeout(mut fixture: F) where S: Scheme, F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + L: ElectorConfig, { let n = 5; let quorum = quorum(n); - let namespace = b"pending_cert_nullify".to_vec(); - let executor = deterministic::Runner::timed(Duration::from_secs(10)); + let namespace = b"first_view_progress_without_timeout".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(15)); executor.start(|mut context| async move { - // Get participants let Fixture { participants, schemes, @@ -6118,153 +7632,264 @@ mod tests { let oracle = start_test_network_with_peers(context.clone(), participants.clone(), true).await; - let elector = RoundRobin::::default(); + let elector = L::default(); + let first_round = Round::new(Epoch::new(333), View::new(1)); + let leader_idx = elector + .clone() + .build(schemes[0].participants()) + .elect(first_round, None); + let leader = participants[usize::from(leader_idx)].clone(); - // Set up voter with Certifier::Pending (certify hangs indefinitely). 
- let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( + let (mut mailbox, mut batcher_receiver, _, relay, reporter) = setup_voter( &mut context, &oracle, &participants, &schemes, elector, - Duration::from_secs(3), - Duration::from_secs(4), + Duration::from_secs(1), + Duration::from_secs(5), Duration::from_mins(60), - mocks::application::Certifier::Pending, ) .await; - // Advance to view 3 where we're a follower. - let target_view = View::new(3); - let parent_payload = advance_to_view( - &mut mailbox, - &mut batcher_receiver, - &schemes, - quorum, - target_view, - ) - .await; + // Wait for initial batcher notification. + let message = batcher_receiver.recv().await.unwrap(); + match message { + batcher::Message::Update { + current, + finalized, + response, + .. + } => { + assert_eq!(current, View::new(1)); + assert_eq!(finalized, View::new(0)); + response.send(None).unwrap(); + } + _ => panic!("unexpected batcher message"), + } - // Broadcast the payload contents so verification can complete. + // Build a valid first-view proposal (parent is genesis at view 0). + let mut hasher = Sha256::default(); + hasher.update(&(bytes::Bytes::from_static(b"genesis"), Epoch::new(333)).encode()); + let genesis = hasher.finalize(); let proposal = Proposal::new( - Round::new(Epoch::new(333), target_view), - target_view.previous().unwrap(), - Sha256::hash(b"test_proposal"), + first_round, + View::zero(), + Sha256::hash(b"first_view_progress_without_timeout"), ); - let leader = participants[1].clone(); - let contents = (proposal.round, parent_payload, 0u64).encode(); + let contents = (proposal.round, genesis, 0u64).encode(); relay.broadcast(&leader, (proposal.payload, contents)); mailbox.proposal(proposal.clone()).await; - // Wait for notarize vote (verification passes). + // The voter should notarize view 1 and must not nullify it. loop { select! 
{ msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Notarize(n)) - if n.view() == target_view => + batcher::Message::Constructed(Vote::Notarize(notarize)) + if notarize.view() == View::new(1) => { break; } + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == View::new(1) => + { + panic!("unexpected nullify for view 1 while peers are online"); + } batcher::Message::Update { response, .. } => response.send(None).unwrap(), _ => {} }, _ = context.sleep(Duration::from_secs(2)) => { - panic!("expected notarize vote for view {target_view}"); + panic!("expected notarize for view 1"); + }, + } + } + + // Deliver quorum notarization and ensure we finalize + advance to view 2 without nullify. + let (_, notarization) = build_notarization(&schemes, &proposal, quorum); + mailbox + .resolved(Certificate::Notarization(notarization)) + .await; + + let deadline = context.current() + Duration::from_secs(3); + let reached_view2 = loop { + select! { + _ = context.sleep_until(deadline) => break false, + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Finalize(finalize)) + if finalize.view() == View::new(1) => + { + break false; + } + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == View::new(1) => + { + panic!("unexpected nullify for view 1 while peers are online"); + } + batcher::Message::Update { + current, response, .. + } => { + response.send(None).unwrap(); + if current >= View::new(2) { + break true; + } + } + _ => {} }, } - } - - // Build and send notarization so the voter tries to certify. - let (_, notarization) = build_notarization(&schemes, &proposal, quorum); - mailbox - .resolved(Certificate::Notarization(notarization)) - .await; + }; + assert!(!reached_view2, "view advanced before finalize for view 1"); - // Certification hangs (sender held alive, receiver pending). 
The voter - // must recover via the view timeout and emit a nullify vote. - loop { + let reached_view2 = loop { select! { + _ = context.sleep_until(deadline) => break false, msg = batcher_receiver.recv() => match msg.unwrap() { batcher::Message::Constructed(Vote::Nullify(nullify)) - if nullify.view() == target_view => + if nullify.view() == View::new(1) => { - // Timeout fired and voter emitted nullify despite - // certification being indefinitely pending. - break; + panic!("unexpected nullify for view 1 while peers are online"); + } + batcher::Message::Update { + current, response, .. + } => { + response.send(None).unwrap(); + if current >= View::new(2) { + break true; + } } - batcher::Message::Update { response, .. } => response.send(None).unwrap(), _ => {} }, - _ = context.sleep(Duration::from_secs(8)) => { - panic!( - "voter should emit nullify for view {target_view} via timeout \ - when certification hangs indefinitely", - ); - }, } - } + }; + assert!(reached_view2, "expected progress to view 2 from view 1"); + + // Give the reporter a moment to receive any late events and verify no first-view nullify artifacts. 
+ context.sleep(Duration::from_millis(50)).await; + assert!( + !reporter.nullifies.lock().contains_key(&View::new(1)), + "did not expect nullify votes for view 1" + ); + assert!( + !reporter.nullifications.lock().contains_key(&View::new(1)), + "did not expect a nullification certificate for view 1" + ); }); } #[test_traced] - fn test_pending_certification_nullifies_on_timeout() { - pending_certification_nullifies_on_timeout::<_, _>( + fn test_first_view_progress_without_timeout() { + first_view_progress_without_timeout::<_, _, Random>( bls12381_threshold_vrf::fixture::, ); - pending_certification_nullifies_on_timeout::<_, _>( + first_view_progress_without_timeout::<_, _, Random>( bls12381_threshold_vrf::fixture::, ); - pending_certification_nullifies_on_timeout::<_, _>(bls12381_multisig::fixture::); - pending_certification_nullifies_on_timeout::<_, _>(bls12381_multisig::fixture::); - pending_certification_nullifies_on_timeout::<_, _>(ed25519::fixture); - pending_certification_nullifies_on_timeout::<_, _>(secp256r1::fixture); + first_view_progress_without_timeout::<_, _, RoundRobin>( + bls12381_multisig::fixture::, + ); + first_view_progress_without_timeout::<_, _, RoundRobin>( + bls12381_multisig::fixture::, + ); + first_view_progress_without_timeout::<_, _, RoundRobin>(ed25519::fixture); + first_view_progress_without_timeout::<_, _, RoundRobin>(secp256r1::fixture); } - /// Regression: once a proposal is received, leader timeout must no longer fire for that view. + /// Tests that a successful certification is correctly replayed from the journal + /// after a restart. /// - /// We require: - /// 1. No nullify before `certification_timeout` even though `leader_timeout` has elapsed. - /// 2. Nullify eventually arrives only after `certification_timeout` when no - /// certificate progress occurs. - fn proposal_clears_leader_timeout_before_certification_timeout(mut fixture: F) + /// 1. First run: follower certifies a view successfully, which is persisted to journal. 
+ /// 2. Abort the voter. + /// 3. Second run: voter replays journal and processes the Artifact::Certification entry, + /// advancing past the certified view without re-certifying. + fn successful_certification_replayed_after_restart(mut fixture: F) where S: Scheme, F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, { let n = 5; let quorum = quorum(n); - let namespace = b"proposal_clears_leader_timeout".to_vec(); - let executor = deterministic::Runner::timed(Duration::from_secs(15)); + let namespace = b"successful_cert_replay".to_vec(); + let partition = "successful_cert_replay".to_string(); + let executor = deterministic::Runner::timed(Duration::from_secs(20)); executor.start(|mut context| async move { let Fixture { participants, schemes, .. } = fixture(&mut context, &namespace, n); + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; - // Create simulated network - let oracle = start_test_network_with_peers( - context.clone(), - participants.clone(), - true, - ) - .await; - + let me = participants[0].clone(); let elector = RoundRobin::::default(); - let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( - &mut context, - &oracle, - &participants, - &schemes, - elector, - Duration::from_secs(1), - Duration::from_secs(5), - Duration::from_mins(60), - mocks::application::Certifier::Always, - ) - .await; + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + let relay = Arc::new(mocks::relay::Relay::new()); + let epoch = Epoch::new(333); - // Advance to a follower view. + // First run: certify a follower view successfully. 
+ let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Always, + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector: elector.clone(), + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter: reporter.clone(), + partition: partition.clone(), + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = Actor::new(context.with_label("voter"), voter_cfg); + let (resolver_sender, mut resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + let handle = voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + if let batcher::Message::Update { response, .. } = + batcher_receiver.recv().await.unwrap() + { + response.send(None).unwrap(); + } + + // Advance to follower view 3 (leader = participant 1). let target_view = View::new(3); let parent_payload = advance_to_view( &mut mailbox, @@ -6275,142 +7900,268 @@ mod tests { ) .await; - // Submit proposal quickly so leader timeout is cleared. 
+ // Send proposal + payload so verification passes. let proposal = Proposal::new( - Round::new(Epoch::new(333), target_view), + Round::new(epoch, target_view), target_view.previous().unwrap(), - Sha256::hash(b"proposal_clears_leader_timeout"), + Sha256::hash(b"cert_replay_payload"), ); let leader = participants[1].clone(); let contents = (proposal.round, parent_payload, 0u64).encode(); relay.broadcast(&leader, (proposal.payload, contents)); mailbox.proposal(proposal.clone()).await; - // Ensure proposal verification path ran. + // Send notarization to trigger certification. + let (_, notarization) = build_notarization(&schemes, &proposal, quorum); + mailbox + .resolved(Certificate::Notarization(notarization)) + .await; + + // Wait for certification to complete (view advances past target_view). loop { select! { - msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Notarize(v)) if v.view() == target_view => { + msg = resolver_receiver.recv() => match msg.unwrap() { + MailboxMessage::Certified { view, success } if view == target_view => { + assert!(success, "expected successful certification"); break; } - batcher::Message::Update { response, .. } => response.send(None).unwrap(), _ => {} }, - _ = context.sleep(Duration::from_secs(2)) => { - panic!("expected notarize vote for view {target_view}"); + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { response, .. } = msg.unwrap() { + response.send(None).unwrap(); + } + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("timed out waiting for certification in first run"); }, } } - // `leader_timeout` is 1s and `certification_timeout` is 5s. We should not - // see nullify in this 2s window after proposal handling, even though - // leader timeout has elapsed. - let no_nullify_deadline = context.current() + Duration::from_secs(2); - loop { - select! 
{ - _ = context.sleep_until(no_nullify_deadline) => break, - msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Nullify(nullify)) - if nullify.view() == target_view => - { - panic!( - "received nullify for view {target_view} before certification timeout" - ); - } - batcher::Message::Update { response, .. } => response.send(None).unwrap(), - _ => {} - } + // Drain any pending batcher messages so the view has advanced. + context.sleep(Duration::from_millis(50)).await; + while let Some(msg) = batcher_receiver.recv().now_or_never().flatten() { + if let batcher::Message::Update { response, .. } = msg { + response.send(None).unwrap(); } } - // After certification timeout elapses, timeout recovery must emit nullify. + // Abort first voter. + handle.abort(); + + // Second run: replay should process Artifact::Certification from journal. + let certify_calls: Arc>> = Arc::new(Mutex::new(Vec::new())); + let certify_tracker = certify_calls.clone(); + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Custom(Box::new( + move |round, _| { + certify_tracker.lock().push(round.view()); + true + }, + )), + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter, + partition, + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: 
CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, _mailbox) = + Actor::new(context.with_label("voter_restarted"), voter_cfg); + let (resolver_sender, mut resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(2, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(3, TEST_QUOTA) + .await + .unwrap(); + voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + // Wait for replay to complete and verify the voter advanced past + // target_view (certification was replayed from journal). + let mut replayed_certified = false; loop { select! { - msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Nullify(nullify)) - if nullify.view() == target_view => - { - break; + msg = resolver_receiver.recv() => match msg.unwrap() { + MailboxMessage::Certified { view, success } if view == target_view => { + assert!(success, "replayed certification should be successful"); + replayed_certified = true; } - batcher::Message::Update { response, .. } => response.send(None).unwrap(), _ => {} }, - _ = context.sleep(Duration::from_secs(6)) => { - panic!( - "expected nullify for view {target_view} after certification timeout" - ); + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { + current, response, .. 
+ } = msg.unwrap() + { + response.send(None).unwrap(); + if current > target_view { + break; + } + } + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("timed out waiting for restarted voter to advance past view {target_view}"); }, } } + + assert!( + replayed_certified, + "resolver should receive Certified during replay for view {target_view}" + ); + + // The voter should NOT have called certify on the automaton for + // target_view (it was replayed from journal). + let certified = certify_calls.lock(); + assert!( + !certified.contains(&target_view), + "voter should not re-certify view {target_view} during replay (observed: {certified:?})" + ); }); } #[test_traced] - fn test_proposal_clears_leader_timeout_before_certification_timeout() { - proposal_clears_leader_timeout_before_certification_timeout::<_, _>( + fn test_successful_certification_replayed_after_restart() { + successful_certification_replayed_after_restart( bls12381_threshold_vrf::fixture::, ); - proposal_clears_leader_timeout_before_certification_timeout::<_, _>( + successful_certification_replayed_after_restart( bls12381_threshold_vrf::fixture::, ); - proposal_clears_leader_timeout_before_certification_timeout::<_, _>( - bls12381_multisig::fixture::, - ); - proposal_clears_leader_timeout_before_certification_timeout::<_, _>( - bls12381_multisig::fixture::, - ); - proposal_clears_leader_timeout_before_certification_timeout::<_, _>(ed25519::fixture); - proposal_clears_leader_timeout_before_certification_timeout::<_, _>(secp256r1::fixture); + successful_certification_replayed_after_restart(bls12381_multisig::fixture::); + successful_certification_replayed_after_restart(bls12381_multisig::fixture::); + successful_certification_replayed_after_restart(ed25519::fixture); + successful_certification_replayed_after_restart(secp256r1::fixture); } - /// Regression: proposals recovered from notarization certificates must clear the - /// current view's leader timeout without emitting a local notarize 
vote. - /// - /// We require: - /// 1. No nullify before `certification_timeout` even though `leader_timeout` has elapsed. - /// 2. Nullify eventually arrives only after `certification_timeout` when certification - /// remains pending. - fn recovered_proposal_clears_leader_timeout_before_certification_timeout(mut fixture: F) + /// Tests that a failed certification (certify returns false) is correctly replayed + /// from the journal after a restart. The replayed failure should trigger a timeout + /// for the view (not re-certify or advance). + fn failed_certification_replayed_after_restart(mut fixture: F) where S: Scheme, F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, { let n = 5; let quorum = quorum(n); - let namespace = b"recovered_proposal_clears_leader_timeout".to_vec(); - let executor = deterministic::Runner::timed(Duration::from_secs(15)); + let namespace = b"failed_cert_replay".to_vec(); + let partition = "failed_cert_replay".to_string(); + let executor = deterministic::Runner::timed(Duration::from_secs(20)); executor.start(|mut context| async move { let Fixture { participants, schemes, .. 
} = fixture(&mut context, &namespace, n); + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; - // Create simulated network - let oracle = start_test_network_with_peers( - context.clone(), - participants.clone(), - true, - ) - .await; - + let me = participants[0].clone(); let elector = RoundRobin::::default(); - let (mut mailbox, mut batcher_receiver, _, _, _) = setup_voter( - &mut context, - &oracle, - &participants, - &schemes, - elector, - Duration::from_secs(1), - Duration::from_secs(5), - Duration::from_mins(60), - mocks::application::Certifier::Pending, - ) - .await; + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + let relay = Arc::new(mocks::relay::Relay::new()); + let epoch = Epoch::new(333); - // Advance to a follower view. + // First run: certify fails (returns false). 
+ let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Custom(Box::new(|_, _| false)), + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector: elector.clone(), + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter: reporter.clone(), + partition: partition.clone(), + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = Actor::new(context.with_label("voter"), voter_cfg); + let (resolver_sender, mut resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + let handle = voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + if let batcher::Message::Update { response, .. } = + batcher_receiver.recv().await.unwrap() + { + response.send(None).unwrap(); + } + + // Advance to follower view 3. 
let target_view = View::new(3); - advance_to_view( + let parent_payload = advance_to_view( &mut mailbox, &mut batcher_receiver, &schemes, @@ -6419,419 +8170,606 @@ mod tests { ) .await; - // Recover a notarization that carries the proposal for this view. + // Send proposal + payload. let proposal = Proposal::new( - Round::new(Epoch::new(333), target_view), + Round::new(epoch, target_view), target_view.previous().unwrap(), - Sha256::hash(b"recovered_proposal_clears_leader_timeout"), + Sha256::hash(b"failed_cert_replay_payload"), ); + let leader = participants[1].clone(); + let contents = (proposal.round, parent_payload, 0u64).encode(); + relay.broadcast(&leader, (proposal.payload, contents)); + mailbox.proposal(proposal.clone()).await; + + // Send notarization to trigger certification. let (_, notarization) = build_notarization(&schemes, &proposal, quorum); mailbox - .recovered(Certificate::Notarization(notarization)) + .resolved(Certificate::Notarization(notarization)) .await; - // `leader_timeout` is 1s and `certification_timeout` is 5s. We should not - // emit a notarize vote or nullify in this 2s window after certificate handling, - // even though leader timeout has elapsed. - let quiet_deadline = context.current() + Duration::from_secs(2); + // Wait for failed certification result to be reported to resolver. loop { select! 
{ - _ = context.sleep_until(quiet_deadline) => break, - msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Notarize(v)) if v.view() == target_view => { - panic!( - "unexpected notarize for view {target_view} from recovered certificate" - ); - } - batcher::Message::Constructed(Vote::Nullify(nullify)) - if nullify.view() == target_view => - { - panic!( - "received nullify for view {target_view} before certification timeout after recovered certificate" - ); + msg = resolver_receiver.recv() => match msg.unwrap() { + MailboxMessage::Certified { view, success } if view == target_view => { + assert!(!success, "expected failed certification"); + break; } - batcher::Message::Update { response, .. } => response.send(None).unwrap(), _ => {} - } + }, + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { response, .. } = msg.unwrap() { + response.send(None).unwrap(); + } + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("timed out waiting for failed certification in first run"); + }, } } - // After certification timeout elapses, timeout recovery must emit nullify. - loop { - select! { - msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Nullify(nullify)) - if nullify.view() == target_view => - { - break; + // Let the journal sync. + context.sleep(Duration::from_millis(50)).await; + while let Some(msg) = batcher_receiver.recv().now_or_never().flatten() { + if let batcher::Message::Update { response, .. } = msg { + response.send(None).unwrap(); + } + } + + // Abort first voter. + handle.abort(); + + // Second run: replay should process Artifact::Certification(false) from journal. 
+ let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Always, + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter, + partition, + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, _mailbox) = Actor::new(context.with_label("voter_restarted"), voter_cfg); + let (resolver_sender, mut resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(2, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(3, TEST_QUOTA) + .await + .unwrap(); + voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + // The replayed failed certification should be reported to resolver + // and the voter should NOT advance past target_view. + let mut replayed_certified = false; + loop { + select! { + msg = resolver_receiver.recv() => match msg.unwrap() { + MailboxMessage::Certified { view, success } if view == target_view => { + assert!(!success, "replayed certification should be a failure"); + replayed_certified = true; } - batcher::Message::Update { response, .. 
} => response.send(None).unwrap(), _ => {} }, - _ = context.sleep(Duration::from_secs(6)) => { - panic!( - "expected nullify for view {target_view} after certification timeout with recovered certificate" - ); + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { + current, response, .. + } = msg.unwrap() + { + response.send(None).unwrap(); + // After replay, should be at target_view (not past it). + if current == target_view && replayed_certified { + break; + } + } + }, + _ = context.sleep(Duration::from_secs(5)) => { + if replayed_certified { + break; + } + panic!("timed out waiting for replayed failed certification"); }, } } + + assert!( + replayed_certified, + "resolver should receive Certified(false) during replay for view {target_view}" + ); }); } #[test_traced] - fn test_recovered_proposal_clears_leader_timeout_before_certification_timeout() { - recovered_proposal_clears_leader_timeout_before_certification_timeout::<_, _>( - bls12381_threshold_vrf::fixture::, - ); - recovered_proposal_clears_leader_timeout_before_certification_timeout::<_, _>( - bls12381_threshold_vrf::fixture::, - ); - recovered_proposal_clears_leader_timeout_before_certification_timeout::<_, _>( - bls12381_multisig::fixture::, - ); - recovered_proposal_clears_leader_timeout_before_certification_timeout::<_, _>( - bls12381_multisig::fixture::, - ); - recovered_proposal_clears_leader_timeout_before_certification_timeout::<_, _>( - ed25519::fixture, - ); - recovered_proposal_clears_leader_timeout_before_certification_timeout::<_, _>( - secp256r1::fixture, - ); + fn test_failed_certification_replayed_after_restart() { + failed_certification_replayed_after_restart(bls12381_threshold_vrf::fixture::); + failed_certification_replayed_after_restart(bls12381_threshold_vrf::fixture::); + failed_certification_replayed_after_restart(bls12381_multisig::fixture::); + failed_certification_replayed_after_restart(bls12381_multisig::fixture::); + 
failed_certification_replayed_after_restart(ed25519::fixture); + failed_certification_replayed_after_restart(secp256r1::fixture); } - /// Regression: after a timed-out view is nullified and the voter advances, - /// the next view must start with a fresh leader timeout. - fn next_view_gets_fresh_timeout_after_prior_view_nullifies(mut fixture: F) + /// Tests that nullify votes and nullification certificates are correctly + /// replayed from the journal after a restart. + /// + /// 1. First run: follower times out, votes nullify, receives nullification + /// certificate. All persisted to journal. + /// 2. Abort the voter. + /// 3. Second run: voter replays journal and processes Artifact::Nullify and + /// Artifact::Nullification entries. The resolver receives the nullification + /// and the voter re-enters the same view (since it was never finalized). + fn nullify_and_nullification_replayed_after_restart(mut fixture: F) where S: Scheme, F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, { let n = 5; let quorum = quorum(n); - let namespace = b"next_view_gets_fresh_timeout_after_prior_view_nullifies".to_vec(); - let executor = deterministic::Runner::timed(Duration::from_secs(15)); + let namespace = b"nullify_nullification_replay".to_vec(); + let partition = "nullify_nullification_replay".to_string(); + let executor = deterministic::Runner::timed(Duration::from_secs(30)); executor.start(|mut context| async move { let Fixture { participants, schemes, .. 
} = fixture(&mut context, &namespace, n); + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; - // Create simulated network - let oracle = start_test_network_with_peers( - context.clone(), - participants.clone(), - true, - ) - .await; + let me = participants[0].clone(); + let elector = RoundRobin::::default(); + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + let relay = Arc::new(mocks::relay::Relay::new()); + let epoch = Epoch::new(333); - let (mut mailbox, mut batcher_receiver, _, _, _) = setup_voter( - &mut context, - &oracle, - &participants, + // First run: trigger timeout and nullification. + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Always, + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector: elector.clone(), + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter: reporter.clone(), + partition: partition.clone(), + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(1), + certification_timeout: Duration::from_secs(1), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = Actor::new(context.with_label("voter"), voter_cfg); + let (resolver_sender, 
_resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + let handle = voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + if let batcher::Message::Update { response, .. } = + batcher_receiver.recv().await.unwrap() + { + response.send(None).unwrap(); + } + + // Advance to follower view 3. + let target_view = View::new(3); + advance_to_view( + &mut mailbox, + &mut batcher_receiver, &schemes, - RoundRobin::::default(), - Duration::from_secs(1), - Duration::from_secs(5), - Duration::from_mins(60), - mocks::application::Certifier::Always, + quorum, + target_view, ) .await; - // Wait for the initial view 1 batcher update. - loop { - match batcher_receiver.recv().await.unwrap() { - batcher::Message::Update { - current, response, .. - } => { - response.send(None).unwrap(); - if current == View::new(1) { - break; - } - } - batcher::Message::Constructed(_) => {} - } - } - - // Allow view 1 to time out and emit a nullify vote. + // Wait for the timeout-driven nullify vote. loop { select! { msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Nullify(nullify)) - if nullify.view() == View::new(1) => + batcher::Message::Constructed(Vote::Nullify(n)) + if n.view() == target_view => { break; } batcher::Message::Update { response, .. } => response.send(None).unwrap(), _ => {} }, - _ = context.sleep(Duration::from_secs(2)) => { - panic!("expected nullify for view 1"); + _ = context.sleep(Duration::from_secs(5)) => { + panic!("expected nullify vote for view {target_view}"); }, } } - // Deliver a nullification certificate for view 1 so the voter enters view 2. 
+ // Send a nullification certificate for this view. let (_, nullification) = - build_nullification(&schemes, Round::new(Epoch::new(333), View::new(1)), quorum); + build_nullification(&schemes, Round::new(epoch, target_view), quorum); mailbox .resolved(Certificate::Nullification(nullification)) .await; + // Wait for the voter to process the nullification (advances to next view). loop { select! { - msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Nullify(nullify)) - if nullify.view() == View::new(2) => - { - panic!( - "received nullify for view 2 before its fresh leader timeout elapsed" - ); - } - batcher::Message::Update { + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { current, response, .. - } => { + } = msg.unwrap() + { response.send(None).unwrap(); - if current == View::new(2) { + if current > target_view { break; } } - _ => {} }, - _ = context.sleep(Duration::from_secs(2)) => { - panic!("expected voter to advance to view 2"); + _ = context.sleep(Duration::from_secs(5)) => { + panic!("timed out waiting for view advance after nullification"); }, } } - // The old view timed out, but the new view should still get its own leader timeout - // rather than immediately nullifying on entry. - let quiet_deadline = context.current() + Duration::from_millis(500); + // Let journal sync. + context.sleep(Duration::from_millis(50)).await; + while let Some(msg) = batcher_receiver.recv().now_or_never().flatten() { + if let batcher::Message::Update { response, .. } = msg { + response.send(None).unwrap(); + } + } + + // Abort first voter. + handle.abort(); + + // Second run: replay should process Artifact::Nullify and + // Artifact::Nullification from journal. 
+ let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Always, + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter, + partition, + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(1), + certification_timeout: Duration::from_secs(1), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, _mailbox) = Actor::new(context.with_label("voter_restarted"), voter_cfg); + let (resolver_sender, mut resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(2, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(3, TEST_QUOTA) + .await + .unwrap(); + voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + // Verify: resolver receives the replayed nullification. + let mut replayed_nullification = false; loop { select! 
{ - _ = context.sleep_until(quiet_deadline) => break, - msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Nullify(nullify)) - if nullify.view() == View::new(2) => - { - panic!( - "received nullify for view 2 before its fresh leader timeout elapsed" - ); + msg = resolver_receiver.recv() => match msg.unwrap() { + MailboxMessage::Certificate(Certificate::Nullification(n)) + if n.view() == target_view => + { + replayed_nullification = true; } - batcher::Message::Update { response, .. } => response.send(None).unwrap(), _ => {} - } + }, + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { + current, response, .. + } = msg.unwrap() + { + response.send(None).unwrap(); + if current > target_view && replayed_nullification { + break; + } + } + }, + _ = context.sleep(Duration::from_secs(5)) => { + if replayed_nullification { + break; + } + panic!("timed out waiting for nullification replay"); + }, } } + + assert!( + replayed_nullification, + "resolver should receive nullification during replay for view {target_view}" + ); }); } #[test_traced] - fn test_next_view_gets_fresh_timeout_after_prior_view_nullifies() { - next_view_gets_fresh_timeout_after_prior_view_nullifies::<_, _>( + fn test_nullify_and_nullification_replayed_after_restart() { + nullify_and_nullification_replayed_after_restart( bls12381_threshold_vrf::fixture::, ); - next_view_gets_fresh_timeout_after_prior_view_nullifies::<_, _>( + nullify_and_nullification_replayed_after_restart( bls12381_threshold_vrf::fixture::, ); - next_view_gets_fresh_timeout_after_prior_view_nullifies::<_, _>( - bls12381_multisig::fixture::, - ); - next_view_gets_fresh_timeout_after_prior_view_nullifies::<_, _>( - bls12381_multisig::fixture::, - ); - next_view_gets_fresh_timeout_after_prior_view_nullifies::<_, _>(ed25519::fixture); - next_view_gets_fresh_timeout_after_prior_view_nullifies::<_, _>(secp256r1::fixture); + 
nullify_and_nullification_replayed_after_restart(bls12381_multisig::fixture::); + nullify_and_nullification_replayed_after_restart(bls12381_multisig::fixture::); + nullify_and_nullification_replayed_after_restart(ed25519::fixture); + nullify_and_nullification_replayed_after_restart(secp256r1::fixture); } - /// Regression: the first view should make progress without timing out when peers are online. + /// Tests that when the batcher signals a timeout reason on view update, + /// the voter immediately triggers a timeout for the current view. /// - /// We require: - /// 1. No `nullify(1)` is emitted while quorum certificates arrive promptly. - /// 2. The voter emits `notarize(1)`. - /// 3. After successful certification, the voter emits `finalize(1)` before - /// advancing to view 2. - fn first_view_progress_without_timeout(mut fixture: F) + /// This covers the path where `batcher.update()` returns `Some(TimeoutReason)` + /// (e.g., because the leader is inactive or has already nullified the view). + fn batcher_update_triggers_timeout(mut fixture: F) where S: Scheme, F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, - L: ElectorConfig, { let n = 5; let quorum = quorum(n); - let namespace = b"first_view_progress_without_timeout".to_vec(); - let executor = deterministic::Runner::timed(Duration::from_secs(15)); + let namespace = b"batcher_update_timeout".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(30)); executor.start(|mut context| async move { let Fixture { participants, schemes, .. 
} = fixture(&mut context, &namespace, n); - - // Create simulated network let oracle = start_test_network_with_peers(context.clone(), participants.clone(), true).await; - let elector = L::default(); - let first_round = Round::new(Epoch::new(333), View::new(1)); - let leader_idx = elector - .clone() - .build(schemes[0].participants()) - .elect(first_round, None); - let leader = participants[usize::from(leader_idx)].clone(); + let me = participants[0].clone(); + let elector = RoundRobin::::default(); + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + let relay = Arc::new(mocks::relay::Relay::new()); - let (mut mailbox, mut batcher_receiver, _, relay, reporter) = setup_voter( - &mut context, - &oracle, - &participants, - &schemes, + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Always, + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), elector, - Duration::from_secs(1), - Duration::from_secs(5), - Duration::from_mins(60), - mocks::application::Certifier::Always, - ) - .await; + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter, + partition: format!("batcher_timeout_test_{me}"), + epoch: Epoch::new(333), + mailbox_size: 128, + leader_timeout: Duration::from_secs(100), + certification_timeout: Duration::from_secs(100), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + 
write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = Actor::new(context.clone(), voter_cfg); - // Wait for initial batcher notification. - let message = batcher_receiver.recv().await.unwrap(); - match message { - batcher::Message::Update { - current, - finalized, - response, - .. - } => { - assert_eq!(current, View::new(1)); - assert_eq!(finalized, View::new(0)); - response.send(None).unwrap(); - } - _ => panic!("unexpected batcher message"), + let (resolver_sender, _resolver_receiver) = mpsc::channel(10); + let resolver = resolver::Mailbox::new(resolver_sender); + + let (batcher_sender, mut batcher_receiver) = mpsc::channel(1024); + let batcher = batcher::Mailbox::new(batcher_sender); + + let (vote_sender, _vote_receiver) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let (certificate_sender, _certificate_receiver) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + + voter.start(batcher, resolver, vote_sender, certificate_sender); + + // Consume initial Update. + if let batcher::Message::Update { response, .. } = + batcher_receiver.recv().await.unwrap() + { + response.send(None).unwrap(); } - // Build a valid first-view proposal (parent is genesis at view 0). - let mut hasher = Sha256::default(); - hasher.update(&(bytes::Bytes::from_static(b"genesis"), Epoch::new(333)).encode()); - let genesis = hasher.finalize(); + // Advance to follower view 3 using finalization. + let target_view = View::new(3); + advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; + + // Certify view 3 to advance to view 4. 
let proposal = Proposal::new( - first_round, - View::zero(), - Sha256::hash(b"first_view_progress_without_timeout"), + Round::new(Epoch::new(333), target_view), + target_view.previous().unwrap(), + Sha256::hash(b"batcher_timeout_view3"), ); - let contents = (proposal.round, genesis, 0u64).encode(); + let leader = participants[1].clone(); + let contents = (proposal.round, Sha256::hash(b"genesis"), 0u64).encode(); relay.broadcast(&leader, (proposal.payload, contents)); mailbox.proposal(proposal.clone()).await; - - // The voter should notarize view 1 and must not nullify it. - loop { - select! { - msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Notarize(notarize)) - if notarize.view() == View::new(1) => - { - break; - } - batcher::Message::Constructed(Vote::Nullify(nullify)) - if nullify.view() == View::new(1) => - { - panic!("unexpected nullify for view 1 while peers are online"); - } - batcher::Message::Update { response, .. } => response.send(None).unwrap(), - _ => {} - }, - _ = context.sleep(Duration::from_secs(2)) => { - panic!("expected notarize for view 1"); - }, - } - } - - // Deliver quorum notarization and ensure we finalize + advance to view 2 without nullify. let (_, notarization) = build_notarization(&schemes, &proposal, quorum); mailbox .resolved(Certificate::Notarization(notarization)) .await; - let deadline = context.current() + Duration::from_secs(3); - let reached_view2 = loop { + // Wait for the Update for view 4 and respond with a timeout reason + // to simulate batcher signaling that the leader should be skipped. + loop { select! 
{ - _ = context.sleep_until(deadline) => break false, msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Finalize(finalize)) - if finalize.view() == View::new(1) => - { - break false; - } - batcher::Message::Constructed(Vote::Nullify(nullify)) - if nullify.view() == View::new(1) => - { - panic!("unexpected nullify for view 1 while peers are online"); - } batcher::Message::Update { - current, response, .. - } => { + current, + response, + .. + } if current > target_view => { + // Signal leader inactivity to trigger the timeout path. + response.send(Some(TimeoutReason::Inactivity)).unwrap(); + break; + } + batcher::Message::Update { response, .. } => { response.send(None).unwrap(); - if current >= View::new(2) { - break true; - } } _ => {} }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("expected Update for view > {target_view}"); + }, } - }; - assert!(!reached_view2, "view advanced before finalize for view 1"); + } - let reached_view2 = loop { + // The voter should emit a nullify vote for view 4 quickly (not + // after the 100s leader timeout) because the batcher signaled + // immediate timeout. + let next_view = target_view.next(); + loop { select! { - _ = context.sleep_until(deadline) => break false, msg = batcher_receiver.recv() => match msg.unwrap() { batcher::Message::Constructed(Vote::Nullify(nullify)) - if nullify.view() == View::new(1) => + if nullify.view() == next_view => { - panic!("unexpected nullify for view 1 while peers are online"); + break; } - batcher::Message::Update { - current, response, .. - } => { + batcher::Message::Update { response, .. 
} => { response.send(None).unwrap(); - if current >= View::new(2) { - break true; - } } _ => {} }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!( + "expected nullify for view {next_view} triggered by batcher timeout" + ); + }, } - }; - assert!(reached_view2, "expected progress to view 2 from view 1"); - - // Give the reporter a moment to receive any late events and verify no first-view nullify artifacts. - context.sleep(Duration::from_millis(50)).await; - assert!( - !reporter.nullifies.lock().contains_key(&View::new(1)), - "did not expect nullify votes for view 1" - ); - assert!( - !reporter.nullifications.lock().contains_key(&View::new(1)), - "did not expect a nullification certificate for view 1" - ); + } }); } #[test_traced] - fn test_first_view_progress_without_timeout() { - first_view_progress_without_timeout::<_, _, Random>( - bls12381_threshold_vrf::fixture::, - ); - first_view_progress_without_timeout::<_, _, Random>( - bls12381_threshold_vrf::fixture::, - ); - first_view_progress_without_timeout::<_, _, RoundRobin>( - bls12381_multisig::fixture::, - ); - first_view_progress_without_timeout::<_, _, RoundRobin>( - bls12381_multisig::fixture::, - ); - first_view_progress_without_timeout::<_, _, RoundRobin>(ed25519::fixture); - first_view_progress_without_timeout::<_, _, RoundRobin>(secp256r1::fixture); + fn test_batcher_update_triggers_timeout() { + batcher_update_triggers_timeout(bls12381_threshold_vrf::fixture::); + batcher_update_triggers_timeout(bls12381_threshold_vrf::fixture::); + batcher_update_triggers_timeout(bls12381_multisig::fixture::); + batcher_update_triggers_timeout(bls12381_multisig::fixture::); + batcher_update_triggers_timeout(ed25519::fixture); + batcher_update_triggers_timeout(secp256r1::fixture); } } diff --git a/consensus/src/simplex/actors/voter/round.rs b/consensus/src/simplex/actors/voter/round.rs index db32df0a8d6..6f9dab5c1a6 100644 --- a/consensus/src/simplex/actors/voter/round.rs +++ 
b/consensus/src/simplex/actors/voter/round.rs @@ -134,7 +134,11 @@ impl Round { } /// Attempt to certify this round's proposal. - pub fn try_certify(&mut self) -> Option> { + /// + /// Returns the proposal along with whether the local participant previously + /// proposed it, in which case certification can be inferred once a + /// notarization exists. + pub fn try_certify(&mut self) -> Option<(Proposal, bool)> { let notarization = self.notarization.as_ref()?; match self.certify { CertifyState::Ready => {} @@ -155,7 +159,7 @@ impl Round { &proposal, ¬arization.proposal, "slot proposal must match notarization proposal" ); - Some(proposal) + Some((proposal, self.proposal.is_local())) } /// Sets the handle for the certification request. @@ -473,7 +477,7 @@ impl Round { /// Returns a proposal candidate for notarization if we're ready to vote. /// /// Marks that we've broadcast our notarize vote to prevent duplicates. - pub fn construct_notarize(&mut self) -> Option<&Proposal> { + pub const fn construct_notarize(&mut self) -> Option<&Proposal> { // Ensure we haven't already broadcast a notarize vote or nullify vote. if self.broadcast_notarize || self.broadcast_nullify { return None; @@ -486,7 +490,7 @@ impl Round { // This check prevents us from voting for a proposal if we have observed equivocation (where // the proposal would be set to ProposalStatus::Equivocated) or if verification hasn't // completed yet. - if self.proposal.status() != ProposalStatus::Verified { + if !matches!(self.proposal.status(), ProposalStatus::Verified(_)) { return None; } @@ -533,10 +537,25 @@ impl Round { "replaying notarize from another signer" ); - // While we may not be the leader here, we still call - // built because the effect is the same (there is a proposal - // and it is verified). - self.proposal.built(notarize.proposal.clone()); + // Replaying our local notarize restores a verified proposal and + // the fact that we already voted. 
Only leader-owned rounds gain + // the local certification shortcut from this replay; follower + // rounds also journal local notarize votes over other leaders' + // proposals. + // + // This relies on journal replay remaining append-ordered. By the + // time we replay a local vote for round `v`, the earlier + // certificate for `v - 1` has already replayed and seeded this + // round's leader. + if self + .leader + .as_ref() + .is_some_and(|leader| self.is_signer(leader.idx)) + { + self.proposal.built(notarize.proposal.clone()); + } else { + self.proposal.notarized(notarize.proposal.clone()); + } self.broadcast_notarize = true; } Artifact::Nullify(nullify) => { @@ -886,7 +905,9 @@ mod tests { fn replayed_local_notarize_restores_verified_proposal_state() { let mut rng = test_rng(); let namespace = b"ns"; - let Fixture { schemes, .. } = ed25519::fixture(&mut rng, namespace, 4); + let Fixture { + schemes, verifier, .. + } = ed25519::fixture(&mut rng, namespace, 4); let local_scheme = schemes[0].clone(); // Create a proposal where we (participant 0) are the leader. @@ -902,7 +923,7 @@ mod tests { // Proposal should be restored as verified (we are the leader). assert_eq!(round.proposal.proposal(), Some(&proposal)); - assert_eq!(round.proposal.status(), ProposalStatus::Verified); + assert_eq!(round.proposal.status(), ProposalStatus::Verified(true)); assert!(round.broadcast_notarize); // No verification request should be emitted. 
@@ -910,6 +931,24 @@ mod tests { !round.try_verify(), "leader-owned replay should not request verification again" ); + + let notarization_votes: Vec<_> = schemes + .iter() + .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) + .collect(); + let notarization = + Notarization::from_notarizes(&verifier, notarization_votes.iter(), &Sequential) + .unwrap(); + let (added, equivocator) = round.add_notarization(notarization); + assert!(added); + assert!(equivocator.is_none()); + + let (candidate, is_local) = round.try_certify().expect("certify candidate"); + assert_eq!(candidate, proposal); + assert!( + is_local, + "local notarize replay should restore local certification" + ); } #[test] @@ -986,8 +1025,10 @@ mod tests { let (added, _) = round.add_notarization(notarization); assert!(added); - // First try_certify should succeed - assert!(round.try_certify().is_some()); + // First try_certify should succeed, but not via the local shortcut. + let (candidate, is_local) = round.try_certify().expect("certify candidate"); + assert_eq!(candidate, proposal); + assert!(!is_local); // Set a certify handle then mark as certified let mut pool = AbortablePool::<()>::default(); @@ -999,6 +1040,42 @@ mod tests { assert!(round.try_certify().is_none()); } + #[test] + fn try_certify_marks_locally_proposed_candidate() { + let mut rng = test_rng(); + let namespace = b"ns"; + let Fixture { + schemes, verifier, .. 
+ } = ed25519::fixture(&mut rng, namespace, 4); + let local_scheme = schemes[0].clone(); + + let now = SystemTime::UNIX_EPOCH; + let round_info = Rnd::new(Epoch::new(1), View::new(1)); + let proposal = Proposal::new(round_info, View::new(0), Sha256Digest::from([7u8; 32])); + + let mut round = Round::new(local_scheme, round_info, now); + round.set_leader(Participant::new(0)); + assert!(round.proposed(proposal.clone())); + + let notarization_votes: Vec<_> = schemes + .iter() + .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) + .collect(); + let notarization = + Notarization::from_notarizes(&verifier, notarization_votes.iter(), &Sequential) + .unwrap(); + let (added, equivocator) = round.add_notarization(notarization); + assert!(added); + assert!(equivocator.is_none()); + + let (candidate, is_local) = round.try_certify().expect("certify candidate"); + assert_eq!(candidate, proposal); + assert!( + is_local, + "locally proposed payload should carry local certify permission" + ); + } + #[test] fn try_certify_blocked_when_handle_exists() { let mut rng = test_rng(); @@ -1028,8 +1105,10 @@ mod tests { let (added, _) = round.add_notarization(notarization); assert!(added); - // First try_certify should succeed - assert!(round.try_certify().is_some()); + // First try_certify should succeed, but not via the local shortcut. 
+ let (candidate, is_local) = round.try_certify().expect("certify candidate"); + assert_eq!(candidate, proposal); + assert!(!is_local); // Set a certify handle (simulating in-flight certification) let mut pool = AbortablePool::<()>::default(); @@ -1098,7 +1177,7 @@ mod tests { let proposal = Proposal::new(round_info, View::new(0), Sha256Digest::from([1u8; 32])); let mut round = Round::new(local_scheme, round_info, now); - round.set_leader(Participant::new(0)); + round.set_leader(Participant::new(1)); // Don't set proposal yet // Add notarization (which includes the proposal in the certificate) @@ -1112,9 +1191,12 @@ mod tests { let (added, _) = round.add_notarization(notarization); assert!(added); - // Has notarization and proposal came from certificate - // try_certify returns the proposal from the certificate - assert!(round.try_certify().is_some()); + // Has notarization and proposal came from certificate. + // Certification should go through the automaton because the proposal was + // not built locally. + let (candidate, is_local) = round.try_certify().expect("certify candidate"); + assert_eq!(candidate, proposal); + assert!(!is_local); } #[test] diff --git a/consensus/src/simplex/actors/voter/slot.rs b/consensus/src/simplex/actors/voter/slot.rs index f8a7d1db8e3..99b5574e4a0 100644 --- a/consensus/src/simplex/actors/voter/slot.rs +++ b/consensus/src/simplex/actors/voter/slot.rs @@ -1,6 +1,6 @@ use crate::simplex::types::Proposal; use commonware_cryptography::Digest; -use tracing::debug; +use tracing::warn; /// Proposal verification status within a round. #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] @@ -8,7 +8,7 @@ pub enum Status { #[default] None, Unverified, - Verified, + Verified(bool), Equivocated, } @@ -65,6 +65,11 @@ where self.proposal.is_some() && self.status != Status::Equivocated } + /// Returns whether the current proposal was built locally. 
+ pub const fn is_local(&self) -> bool { + matches!(self.status, Status::Verified(true)) + } + pub const fn should_build(&self) -> bool { !self.requested_build && self.proposal.is_none() } @@ -73,28 +78,40 @@ where self.requested_build = true; } - /// Records the proposal in this slot and flips the build/verify flags. + /// Records a proposal that has already been verified. /// - /// If the slot is already populated, we ignore the proposal. - pub fn built(&mut self, proposal: Proposal) { + /// Additional observations of the same proposal are ignored here. + /// Conflicting proposals are handled separately as equivocation. + fn verified(&mut self, proposal: Proposal, local: bool) { if let Some(existing) = &self.proposal { // This can happen if we receive a certificate for a conflicting proposal. Normally, // we would ignore this case but it is required to support [Twins](https://arxiv.org/abs/2004.10617) testing. - debug!( + warn!( ?existing, ?proposal, - "ignoring local proposal because slot already populated" + ?local, + "ignoring verified proposal because slot already populated" ); return; } // Otherwise, we record the proposal and flip the build/verify flags. self.proposal = Some(proposal); - self.status = Status::Verified; + self.status = Status::Verified(local); self.requested_build = true; self.requested_verify = true; } + /// Records a proposal built locally by this participant. + pub fn built(&mut self, proposal: Proposal) { + self.verified(proposal, true); + } + + /// Records a proposal we verified and voted for, but did not build locally. 
+ pub fn notarized(&mut self, proposal: Proposal) { + self.verified(proposal, false); + } + pub const fn request_verify(&mut self) -> bool { if self.requested_verify { return false; @@ -107,7 +124,7 @@ where if self.status != Status::Unverified { return false; } - self.status = Status::Verified; + self.status = Status::Verified(false); true } @@ -185,9 +202,10 @@ mod tests { Some(stored) => assert_eq!(stored, &proposal), None => panic!("proposal missing after recording"), } - assert_eq!(slot.status(), Status::Verified); + assert_eq!(slot.status(), Status::Verified(true)); assert!(!slot.should_build()); assert!(!slot.request_verify()); + assert!(slot.is_local()); } #[test] @@ -199,9 +217,10 @@ mod tests { slot.built(proposal.clone()); assert_eq!(slot.proposal(), Some(&proposal)); - assert_eq!(slot.status(), Status::Verified); + assert_eq!(slot.status(), Status::Verified(true)); assert!(!slot.should_build()); assert!(!slot.request_verify()); + assert!(slot.is_local()); } #[test] @@ -214,8 +233,9 @@ mod tests { slot.built(proposal.clone()); assert!(!slot.should_build()); - assert_eq!(slot.status(), Status::Verified); + assert_eq!(slot.status(), Status::Verified(true)); assert_eq!(slot.proposal(), Some(&proposal)); + assert!(slot.is_local()); } #[test] @@ -227,10 +247,12 @@ mod tests { assert!(matches!(slot.update(&proposal, false), Change::New)); assert!(matches!(slot.update(&proposal, true), Change::Unchanged)); assert_eq!(slot.status(), Status::Unverified); + assert!(!slot.is_local()); assert!(slot.mark_verified()); assert!(matches!(slot.update(&proposal, true), Change::Unchanged)); - assert_eq!(slot.status(), Status::Verified); + assert_eq!(slot.status(), Status::Verified(false)); + assert!(!slot.is_local()); } #[test] @@ -251,6 +273,7 @@ mod tests { } assert_eq!(slot.status(), Status::Equivocated); assert_eq!(slot.proposal(), Some(&proposal_a)); + assert!(!slot.is_local()); } #[test] @@ -268,6 +291,7 @@ mod tests { assert!(matches!(slot.update(&compromised, true), 
Change::New)); assert_eq!(slot.status(), Status::Unverified); assert_eq!(slot.proposal(), Some(&compromised)); + assert!(!slot.is_local()); // Once we finally finish proposing our honest payload, the slot should just // ignore it (the equivocation was already detected when the certificate @@ -275,6 +299,7 @@ mod tests { slot.built(honest); assert_eq!(slot.status(), Status::Unverified); assert_eq!(slot.proposal(), Some(&compromised)); + assert!(!slot.is_local()); } #[test] @@ -298,6 +323,7 @@ mod tests { other => panic!("expected equivocation, got {other:?}"), } assert_eq!(slot.status(), Status::Equivocated); + assert!(!slot.is_local()); // Verifier completion arriving afterwards must be ignored. assert!(!slot.mark_verified()); assert!(matches!(slot.update(&conflicting, true), Change::Skipped)); @@ -321,6 +347,7 @@ mod tests { assert_eq!(slot.status(), Status::Equivocated); assert_eq!(slot.proposal(), Some(&proposal_b)); assert!(!slot.should_build()); + assert!(!slot.is_local()); } #[test] @@ -337,6 +364,7 @@ mod tests { )); assert!(matches!(slot.update(&proposal_b, true), Change::Skipped)); assert_eq!(slot.status(), Status::Equivocated); + assert!(!slot.is_local()); } #[test] @@ -353,6 +381,7 @@ mod tests { // gating even before the follower-side verify path runs. assert!(matches!(slot.update(&proposal_a, true), Change::New)); assert!(slot.has_unequivocated_proposal()); + assert!(!slot.is_local()); // A conflicting proposal immediately revokes that property. assert!(matches!( diff --git a/consensus/src/simplex/actors/voter/state.rs b/consensus/src/simplex/actors/voter/state.rs index d13c1d416f7..2bb2b9dd201 100644 --- a/consensus/src/simplex/actors/voter/state.rs +++ b/consensus/src/simplex/actors/voter/state.rs @@ -584,8 +584,15 @@ impl, L: ElectorConfig, D: D self.outstanding_certifications.insert(view); } - /// Takes all certification candidates and returns proposals ready for certification. 
- pub fn certify_candidates(&mut self) -> Vec> { + /// Takes all certification candidates and returns proposals ready for + /// certification, along with whether the proposal was built locally. + /// + /// Certification may be inferred only when we have explicit evidence that we + /// proposed this exact payload for the round, either in the current process + /// or via replay of our durable local vote. In certain cases for Byzantine nodes, + /// it is possible that a certificate is received for a proposal that we did not propose (although + /// we are the leader). + pub fn certify_candidates(&mut self) -> Vec<(Proposal, bool)> { let candidates = take(&mut self.certification_candidates); candidates .into_iter() @@ -593,7 +600,8 @@ impl, L: ElectorConfig, D: D if view <= self.last_finalized { return None; } - self.views.get_mut(&view)?.try_certify() + let candidate = self.views.get_mut(&view)?.try_certify()?; + Some(candidate) }) .collect() } @@ -1765,7 +1773,9 @@ mod tests { let runtime = deterministic::Runner::default(); runtime.start(|mut context| async move { let namespace = b"ns".to_vec(); - let Fixture { schemes, .. } = ed25519::fixture(&mut context, &namespace, 4); + let Fixture { + schemes, verifier, .. + } = ed25519::fixture(&mut context, &namespace, 4); let epoch = Epoch::new(2); let view = View::new(2); @@ -1810,6 +1820,77 @@ mod tests { // No verification request should be emitted (leader-owned). 
assert!(state.try_verify().is_none()); + + let notarization_votes: Vec<_> = schemes + .iter() + .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) + .collect(); + let notarization = + Notarization::from_notarizes(&verifier, notarization_votes.iter(), &Sequential) + .expect("notarization"); + let (added, _) = state.add_notarization(notarization); + assert!(added); + + let candidates = state.certify_candidates(); + assert_eq!(candidates.len(), 1); + assert_eq!(candidates[0].0.round.view(), view); + assert!(candidates[0].1); + }); + } + + #[test] + fn certify_external_candidates_for_leader_controlled_views() { + let runtime = deterministic::Runner::default(); + runtime.start(|mut context| async move { + let namespace = b"ns".to_vec(); + let Fixture { + schemes, verifier, .. + } = ed25519::fixture(&mut context, &namespace, 4); + + let epoch = Epoch::new(2); + let view = View::new(2); + let proposal = Proposal::new( + Rnd::new(epoch, view), + View::new(1), + Sha256Digest::from([43u8; 32]), + ); + + let mut state = State::new( + context, + Config { + scheme: schemes[0].clone(), + elector: ::default(), + epoch, + activity_timeout: ViewDelta::new(5), + leader_timeout: Duration::from_secs(1), + certification_timeout: Duration::from_secs(2), + timeout_retry: Duration::from_secs(3), + }, + ); + state.set_genesis(test_genesis()); + assert!(state.enter_view(view)); + state.set_leader(view, None); + assert_eq!(state.leader_index(view), Some(Participant::new(0))); + + let notarize_votes: Vec<_> = schemes + .iter() + .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) + .collect(); + let notarization = + Notarization::from_notarizes(&verifier, notarize_votes.iter(), &Sequential) + .expect("notarization"); + let (added, equivocator) = state.add_notarization(notarization); + assert!(added); + assert!(equivocator.is_none()); + + let candidates = state.certify_candidates(); + assert_eq!(candidates.len(), 1); + let (candidate, is_local) = &candidates[0]; + 
assert_eq!(*candidate, proposal); + assert!( + !*is_local, + "leader-owned recovered proposal must not inherit local certification" + ); }); } @@ -1942,6 +2023,7 @@ mod tests { // All 6 views should be candidates let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 6); + assert!(candidates.iter().all(|(_, is_local)| !is_local)); // Set certify handles for views 3, 4, 5, 7 (NOT 6 or 8) for i in [3u64, 4, 5, 7] { @@ -1981,7 +2063,8 @@ mod tests { state.add_notarization(make_notarization(View::new(9))); let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); - assert_eq!(candidates[0].round.view(), View::new(9)); + assert_eq!(candidates[0].0.round.view(), View::new(9)); + assert!(!candidates[0].1); // Set handle for view 9, add view 10 let handle9 = pool.push(futures::future::pending()); @@ -1991,7 +2074,8 @@ mod tests { // View 10 returned (view 9 has handle) let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); - assert_eq!(candidates[0].round.view(), View::new(10)); + assert_eq!(candidates[0].0.round.view(), View::new(10)); + assert!(!candidates[0].1); // Finalize view 9 - aborts view 9's handle state.add_finalization(make_finalization(View::new(9))); @@ -2001,7 +2085,81 @@ mod tests { state.add_notarization(make_notarization(View::new(11))); let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); - assert_eq!(candidates[0].round.view(), View::new(11)); + assert_eq!(candidates[0].0.round.view(), View::new(11)); + assert!(!candidates[0].1); + }); + } + + #[test] + fn certify_candidates_skips_views_at_or_below_last_finalized() { + let runtime = deterministic::Runner::default(); + runtime.start(|mut context| async move { + let namespace = b"ns".to_vec(); + let Fixture { + schemes, verifier, .. 
+ } = ed25519::fixture(&mut context, &namespace, 4); + + let cfg = Config { + scheme: schemes[0].clone(), + elector: ::default(), + epoch: Epoch::new(1), + activity_timeout: ViewDelta::new(10), + leader_timeout: Duration::from_secs(1), + certification_timeout: Duration::from_secs(2), + timeout_retry: Duration::from_secs(3), + }; + let mut state = State::new(context, cfg); + state.set_genesis(test_genesis()); + + let make_notarization = |view: View| { + let proposal = Proposal::new( + Rnd::new(Epoch::new(1), view), + GENESIS_VIEW, + Sha256Digest::from([view.get() as u8; 32]), + ); + let votes: Vec<_> = schemes + .iter() + .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) + .collect(); + Notarization::from_notarizes(&verifier, votes.iter(), &Sequential).unwrap() + }; + + let make_finalization = |view: View| { + let proposal = Proposal::new( + Rnd::new(Epoch::new(1), view), + GENESIS_VIEW, + Sha256Digest::from([view.get() as u8; 32]), + ); + let votes: Vec<_> = schemes + .iter() + .map(|scheme| Finalize::sign(scheme, proposal.clone()).unwrap()) + .collect(); + Finalization::from_finalizes(&verifier, votes.iter(), &Sequential).unwrap() + }; + + let stale_view = View::new(2); + let live_view = View::new(3); + + state.add_notarization(make_notarization(stale_view)); + state.add_notarization(make_notarization(live_view)); + state.add_finalization(make_finalization(stale_view)); + + // Reinsert a stale candidate to exercise the defensive finalized-view guard. + state.certification_candidates.insert(stale_view); + assert_eq!(state.last_finalized(), stale_view); + + // The stale round still looks certifiable without the finalized-view filter. 
+ assert!(state + .views + .get_mut(&stale_view) + .expect("stale round must exist") + .try_certify() + .is_some()); + + let candidates = state.certify_candidates(); + assert_eq!(candidates.len(), 1); + assert_eq!(candidates[0].0.round.view(), live_view); + assert!(!candidates[0].1); }); } @@ -2056,7 +2214,8 @@ mod tests { let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); - assert_eq!(candidates[0].round.view(), view); + assert_eq!(candidates[0].0.round.view(), view); + assert!(!candidates[0].1); }); } @@ -2100,7 +2259,8 @@ mod tests { let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); - assert_eq!(candidates[0].round.view(), view); + assert_eq!(candidates[0].0.round.view(), view); + assert!(!candidates[0].1); let mut pool = AbortablePool::<()>::default(); let handle = pool.push(futures::future::pending()); diff --git a/consensus/src/simplex/engine.rs b/consensus/src/simplex/engine.rs index 1b609032908..22edfd73b58 100644 --- a/consensus/src/simplex/engine.rs +++ b/consensus/src/simplex/engine.rs @@ -225,20 +225,20 @@ impl< certificate_sender, ); - // Wait for the resolver or voter to finish + // If any task completes, the engine should stop let mut shutdown = self.context.stopped(); select! 
{ _ = &mut shutdown => { debug!("context shutdown, stopping engine"); }, - _ = &mut voter_task => { - panic!("voter should not finish"); + voter = &mut voter_task => { + debug!(?voter, "voter stopped, shutting down engine"); }, - _ = &mut batcher_task => { - panic!("batcher should not finish"); + batcher = &mut batcher_task => { + debug!(?batcher, "batcher stopped, shutting down engine"); }, - _ = &mut resolver_task => { - panic!("resolver should not finish"); + resolver = &mut resolver_task => { + debug!(?resolver, "resolver stopped, shutting down engine"); }, } } diff --git a/consensus/src/simplex/mocks/application.rs b/consensus/src/simplex/mocks/application.rs index 7b45aa9484f..76e292d33c8 100644 --- a/consensus/src/simplex/mocks/application.rs +++ b/consensus/src/simplex/mocks/application.rs @@ -139,11 +139,8 @@ type VerifyObserver = pub enum Certifier { /// Always certify. Always, - /// Certify sometimes, but not always. The behavior is to certify pseudorandomly - /// (but deterministically) 82% of the time, depending on the last byte of the payload. - Sometimes, - /// A custom predicate function. - Custom(Box bool + Send + 'static>), + /// A custom predicate function that receives the round and payload digest. + Custom(Box bool + Send + 'static>), /// Drop the sender without responding, causing the receiver to be cancelled. /// This simulates scenarios where the automaton cannot determine certification /// (e.g., missing verification context in Marshaled). @@ -170,7 +167,6 @@ pub struct Config { pub certify_latency: Latency, /// Predicate to determine whether a payload should be certified. - /// Returning true means certify, false means reject. pub should_certify: Certifier, } @@ -190,6 +186,7 @@ pub struct Application { fail_verification: bool, drop_proposals: bool, + stall_proposals: bool, drop_verifications: bool, should_certify: Certifier, @@ -206,6 +203,10 @@ pub struct Application { /// of a leader-owned proposal). 
verify_observer: Option>, + /// Senders held alive to simulate proposals that hang indefinitely + /// (used when `stall_proposals` is set). + pending_proposes: Vec>, + /// Senders held alive to simulate certifications that hang indefinitely /// (used by [`Certifier::Pending`]). pending_certifications: Vec>, @@ -240,6 +241,7 @@ impl Application fail_verification: false, drop_proposals: false, + stall_proposals: false, drop_verifications: false, should_certify: cfg.should_certify, @@ -247,6 +249,7 @@ impl Application verified: HashSet::new(), propose_observer: None, verify_observer: None, + pending_proposes: Vec::new(), pending_certifications: Vec::new(), }, Mailbox::new(sender), @@ -261,6 +264,14 @@ impl Application self.drop_proposals = drop; } + /// When set, `Message::Propose` requests are held open indefinitely: the + /// response sender is parked in `pending_proposes`, keeping the oneshot + /// alive so the caller's `receiver` never resolves. This simulates a + /// propose that is still in flight at the moment the voter crashes. 
+ pub const fn set_stall_proposals(&mut self, stall: bool) { + self.stall_proposals = stall; + } + pub const fn set_drop_verifications(&mut self, drop: bool) { self.drop_verifications = drop; } @@ -366,7 +377,12 @@ impl Application true } - async fn certify(&mut self, payload: H::Digest, _contents: Bytes) -> Option { + async fn certify( + &mut self, + round: Round, + payload: H::Digest, + _contents: Bytes, + ) -> Option { // Simulate the certify latency let duration = self.certify_latency.sample(&mut self.context); self.context @@ -376,8 +392,7 @@ impl Application // Use configured predicate to determine certification match &self.should_certify { Certifier::Always => Some(true), - Certifier::Sometimes => Some((payload.as_ref().last().copied().unwrap_or(0) % 11) < 9), - Certifier::Custom(func) => Some(func(payload)), + Certifier::Custom(func) => Some(func(round, payload)), Certifier::Cancel | Certifier::Pending => None, } } @@ -416,6 +431,10 @@ impl Application if let Some(observer) = &self.propose_observer { observer(context.clone()); } + if self.stall_proposals { + self.pending_proposes.push(response); + continue; + } if self.drop_proposals { continue; } @@ -444,12 +463,12 @@ impl Application } } Message::Certify { - round: _, + round, payload, response, } => { let contents = seen.get(&payload).cloned().unwrap_or_default(); - if let Some(certified) = self.certify(payload, contents).await { + if let Some(certified) = self.certify(round, payload, contents).await { response.send_lossy(certified); } else if matches!(self.should_certify, Certifier::Pending) { // Hold the sender alive so the receiver never resolves. diff --git a/consensus/src/simplex/mod.rs b/consensus/src/simplex/mod.rs index 8c3e08768a4..84c016727ef 100644 --- a/consensus/src/simplex/mod.rs +++ b/consensus/src/simplex/mod.rs @@ -314,72 +314,71 @@ //! Before sending a message, the `Journal` sync is invoked to prevent inadvertent Byzantine behavior //! 
on restart (especially in the case of unclean shutdown). -use crate::types::Round; -use commonware_cryptography::PublicKey; - pub mod elector; pub mod scheme; pub mod types; cfg_if::cfg_if! { if #[cfg(not(target_arch = "wasm32"))] { + use crate::types::{Round, View, ViewDelta}; + use commonware_cryptography::PublicKey; + use commonware_p2p::Recipients; + mod actors; pub mod config; pub use config::{Config, ForwardingPolicy}; mod engine; pub use engine::Engine; mod metrics; - } -} -#[cfg(any(test, feature = "mocks"))] -pub mod mocks; - -#[cfg(not(target_arch = "wasm32"))] -use crate::types::{View, ViewDelta}; + /// The minimum view we are tracking both in-memory and on-disk. + pub(crate) const fn min_active(activity_timeout: ViewDelta, last_finalized: View) -> View { + last_finalized.saturating_sub(activity_timeout) + } -/// The minimum view we are tracking both in-memory and on-disk. -#[cfg(not(target_arch = "wasm32"))] -pub(crate) const fn min_active(activity_timeout: ViewDelta, last_finalized: View) -> View { - last_finalized.saturating_sub(activity_timeout) -} + /// Whether or not a view is interesting to us. This is a function + /// of both `min_active` and whether or not the view is too far + /// in the future (based on the view we are currently in). + pub(crate) fn interesting( + activity_timeout: ViewDelta, + last_finalized: View, + current: View, + pending: View, + allow_future: bool, + ) -> bool { + // If the view is genesis, skip it, genesis doesn't have votes + if pending.is_zero() { + return false; + } + if pending < min_active(activity_timeout, last_finalized) { + return false; + } + if !allow_future && pending > current.next() { + return false; + } + true + } -/// Whether or not a view is interesting to us. This is a function -/// of both `min_active` and whether or not the view is too far -/// in the future (based on the view we are currently in). 
-#[cfg(not(target_arch = "wasm32"))] -pub(crate) fn interesting( - activity_timeout: ViewDelta, - last_finalized: View, - current: View, - pending: View, - allow_future: bool, -) -> bool { - // If the view is genesis, skip it, genesis doesn't have votes - if pending.is_zero() { - return false; - } - if pending < min_active(activity_timeout, last_finalized) { - return false; - } - if !allow_future && pending > current.next() { - return false; + /// Describes how a payload should be broadcast to the network. + pub enum Plan { + /// Initial broadcast of a newly proposed block to all participants. + Propose { + /// The round in which the block was proposed. + round: Round, + }, + /// Forward a block to a specific set of peers. + Forward { + /// The round in which the forwarded block was proposed. + round: Round, + /// The recipients to forward the block to. + recipients: Recipients

, + }, + } } - true } -/// Describes how a payload should be broadcast to the network. -pub enum Plan { - /// Initial broadcast of a newly proposed block to all participants. - Propose, - /// Forward a block to a specific set of peers. - Forward { - /// The round in which the forwarded block was proposed. - round: Round, - /// The peers to forward the block to. - peers: Vec

, - }, -} +#[cfg(any(test, feature = "mocks"))] +pub mod mocks; /// Convenience alias for [`N3f1::quorum`]. #[cfg(test)] @@ -394,7 +393,7 @@ mod tests { use super::*; use crate::{ simplex::{ - elector::{Config as Elector, Random, RoundRobin}, + elector::{Config as Elector, Elector as ElectorTrait, Random, RoundRobin}, mocks::{ scheme as scheme_mocks, twins::{self, Elector as TwinsElector}, @@ -414,7 +413,7 @@ mod tests { Nullification as TNullification, Nullify as TNullify, Proposal, Vote, }, }, - types::{Epoch, Round}, + types::{Epoch, Participant, Round}, Monitor, Viewable, }; use commonware_codec::{Decode, DecodeExt, Encode}; @@ -770,7 +769,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -951,6 +950,155 @@ mod tests { all_online::<_, _, RoundRobin>(secp256r1::fixture); } + /// A dishonest leader (validator 0) proposes payloads that all honest peers + /// refuse to certify. + /// + /// All n validators use the honest Application, but every peer's certifier + /// rejects proposals from views where validator 0 is the elected leader. + /// When validator 0 IS the leader, it short-circuits certification locally + /// (it built the proposal) and votes finalize, but every other peer + /// rejects via the Custom predicate and nullifies. The lone finalize vote + /// cannot form a certificate (quorum=4). The nullification cert (4 honest + /// peers) advances everyone. + /// + /// When an honest validator leads, all peers (including validator 0) + /// certify normally and finalize. The cluster makes progress on honest + /// leader views and nullifies dishonest leader views. 
+ fn dishonest_leader_certification_rejected(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + RoundRobin: Elector, + { + let n = 5; + let required_containers = View::new(50); + let activity_timeout = ViewDelta::new(10); + let skip_timeout = ViewDelta::new(5); + let namespace = b"consensus".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(300)); + executor.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = fixture(&mut context, &namespace, n); + let mut oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + let mut registrations = register_validators(&mut oracle, &participants).await; + + let link = Link { + latency: Duration::from_millis(10), + jitter: Duration::from_millis(1), + success_rate: 1.0, + }; + link_validators(&mut oracle, &participants, Action::Link(link), None).await; + + let elector = RoundRobin::default(); + let participants_set: Set = participants.clone().try_into().unwrap(); + let built_elector = elector.clone().build(&participants_set); + let relay = Arc::new(mocks::relay::Relay::new()); + let mut reporters = Vec::new(); + let mut engine_handlers = Vec::new(); + let dishonest = Participant::new(0); + for (idx, validator) in participants.iter().enumerate() { + let context = context.with_label(&format!("validator_{}", *validator)); + let reporter_config = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[idx].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_config); + reporters.push(reporter.clone()); + + let application_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: validator.clone(), + propose_latency: (10.0, 5.0), + verify_latency: (10.0, 5.0), + certify_latency: (10.0, 5.0), + should_certify: 
mocks::application::Certifier::Custom(Box::new({ + let built_elector_clone = built_elector.clone(); + move |round, _| built_elector_clone.elect(round, None) != dishonest + })), + }; + let (actor, application) = mocks::application::Application::new( + context.with_label("application"), + application_cfg, + ); + actor.start(); + + let blocker = oracle.control(validator.clone()); + let cfg = config::Config { + scheme: schemes[idx].clone(), + elector: elector.clone(), + blocker, + automaton: application.clone(), + relay: application.clone(), + reporter: reporter.clone(), + strategy: Sequential, + partition: validator.to_string(), + mailbox_size: 1024, + epoch: Epoch::new(333), + leader_timeout: Duration::from_secs(1), + certification_timeout: Duration::from_secs(2), + timeout_retry: Duration::from_secs(10), + fetch_timeout: Duration::from_secs(1), + activity_timeout, + skip_timeout, + fetch_concurrent: 4, + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + forwarding: ForwardingPolicy::Disabled, + }; + let engine = Engine::new(context.with_label("engine"), cfg); + let (pending, recovered, resolver) = registrations + .remove(validator) + .expect("validator should be registered"); + engine_handlers.push(engine.start(pending, recovered, resolver)); + } + + let mut finalizers = Vec::new(); + for reporter in reporters.iter_mut() { + let (mut latest, mut monitor) = reporter.subscribe().await; + finalizers.push(context.with_label("finalizer").spawn(move |_| async move { + while latest < required_containers { + latest = monitor.recv().await.expect("event missing"); + } + })); + } + join_all(finalizers).await; + + for reporter in reporters.iter() { + reporter.assert_no_faults(); + reporter.assert_no_invalid(); + } + }); + } + + #[test_group("slow")] + #[test_traced] + fn test_dishonest_leader_certification_rejected() { + dishonest_leader_certification_rejected::<_, _>( + 
bls12381_threshold_vrf::fixture::, + ); + dishonest_leader_certification_rejected::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + dishonest_leader_certification_rejected::<_, _>( + bls12381_threshold_std::fixture::, + ); + dishonest_leader_certification_rejected::<_, _>( + bls12381_threshold_std::fixture::, + ); + dishonest_leader_certification_rejected::<_, _>(bls12381_multisig::fixture::); + dishonest_leader_certification_rejected::<_, _>(bls12381_multisig::fixture::); + dishonest_leader_certification_rejected::<_, _>(ed25519::fixture); + dishonest_leader_certification_rejected::<_, _>(secp256r1::fixture); + } + fn observer(mut fixture: F) where S: Scheme, @@ -1031,7 +1179,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1187,7 +1335,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1368,7 +1516,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1489,7 +1637,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = 
mocks::application::Application::new( context.with_label("application"), @@ -1623,7 +1771,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1844,7 +1992,7 @@ mod tests { propose_latency: (10_000.0, 0.0), verify_latency: (10_000.0, 5.0), certify_latency: (10_000.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, } } else { mocks::application::Config { @@ -1854,7 +2002,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, } }; let (actor, application) = mocks::application::Application::new( @@ -2020,7 +2168,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2221,7 +2369,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2418,7 +2566,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = 
mocks::application::Application::new( context.with_label("application"), @@ -2670,7 +2818,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2846,7 +2994,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3014,7 +3162,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3206,7 +3354,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3367,7 +3515,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3458,7 +3606,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: 
mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3682,7 +3830,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3835,7 +3983,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4005,7 +4153,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4141,7 +4289,7 @@ mod tests { propose_latency: (100.0, 50.0), verify_latency: (50.0, 40.0), certify_latency: (50.0, 40.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4304,7 +4452,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4523,7 +4671,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: 
mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4876,7 +5024,7 @@ mod tests { propose_latency: (250.0, 50.0), // ensure we process certificates first verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label(&format!("application_{}", *validator)), @@ -5081,7 +5229,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5221,7 +5369,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5317,7 +5465,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5810,7 +5958,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5879,7 
+6027,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), diff --git a/p2p/src/authenticated/discovery/network.rs b/p2p/src/authenticated/discovery/network.rs index 69a3cef9c43..d8408b794c8 100644 --- a/p2p/src/authenticated/discovery/network.rs +++ b/p2p/src/authenticated/discovery/network.rs @@ -201,26 +201,26 @@ impl< let mut shutdown = self.context.stopped(); - // Wait for first actor to exit + // If any task completes, the network should stop info!("network started"); select! { _ = &mut shutdown => { debug!("context shutdown, stopping network"); }, tracker = &mut tracker_task => { - panic!("tracker exited unexpectedly: {tracker:?}"); + debug!(?tracker, "tracker stopped, shutting down network"); }, router = &mut router_task => { - panic!("router exited unexpectedly: {router:?}"); + debug!(?router, "router stopped, shutting down network"); }, spawner = &mut spawner_task => { - panic!("spawner exited unexpectedly: {spawner:?}"); + debug!(?spawner, "spawner stopped, shutting down network"); }, listener = &mut listener_task => { - panic!("listener exited unexpectedly: {listener:?}"); + debug!(?listener, "listener stopped, shutting down network"); }, dialer = &mut dialer_task => { - panic!("dialer exited unexpectedly: {dialer:?}"); + debug!(?dialer, "dialer stopped, shutting down network"); }, } } diff --git a/p2p/src/authenticated/lookup/network.rs b/p2p/src/authenticated/lookup/network.rs index ddb54c633a4..79a78200c15 100644 --- a/p2p/src/authenticated/lookup/network.rs +++ b/p2p/src/authenticated/lookup/network.rs @@ -194,26 +194,26 @@ impl< let mut shutdown = self.context.stopped(); - // Wait for first actor to exit + // If any task completes, the network should stop info!("network started"); 
select! { _ = &mut shutdown => { debug!("context shutdown, stopping network"); }, tracker = &mut tracker_task => { - panic!("tracker exited unexpectedly: {tracker:?}"); + debug!(?tracker, "tracker stopped, shutting down network"); }, router = &mut router_task => { - panic!("router exited unexpectedly: {router:?}"); + debug!(?router, "router stopped, shutting down network"); }, spawner = &mut spawner_task => { - panic!("spawner exited unexpectedly: {spawner:?}"); + debug!(?spawner, "spawner stopped, shutting down network"); }, listener = &mut listener_task => { - panic!("listener exited unexpectedly: {listener:?}"); + debug!(?listener, "listener stopped, shutting down network"); }, dialer = &mut dialer_task => { - panic!("dialer exited unexpectedly: {dialer:?}"); + debug!(?dialer, "dialer stopped, shutting down network"); }, } } diff --git a/storage/src/archive/immutable/mod.rs b/storage/src/archive/immutable/mod.rs index 6db7410f25f..d64090f8da4 100644 --- a/storage/src/archive/immutable/mod.rs +++ b/storage/src/archive/immutable/mod.rs @@ -5,9 +5,9 @@ //! //! # Uniqueness //! -//! [Archive] assumes all stored indexes and keys are unique. If the same key is associated with -//! multiple `indices`, there is no guarantee which value will be returned. If the key is written to -//! an existing `index`, [Archive] will return an error. +//! [Archive] assumes all stored indices are unique. Writing to an occupied index is a no-op. +//! If the same key is associated with multiple indices, there is no guarantee which value will +//! be returned. //! //! # Compression //! diff --git a/storage/src/archive/mod.rs b/storage/src/archive/mod.rs index 8955af09c5b..03e4e7caea0 100644 --- a/storage/src/archive/mod.rs +++ b/storage/src/archive/mod.rs @@ -1,7 +1,10 @@ //! A write-once key-value store for ordered data. //! -//! [Archive] is a key-value store designed for workloads where all data is written only once and is -//! uniquely associated with both an `index` and a `key`. +//! 
[Archive] is a key-value store designed for workloads where data is written only once and each +//! item is addressed by both an `index` and a `key`. Workloads with unique indices should use [Archive] +//! and workloads with overlapping indices should use [MultiArchive] (allows all items with the same index +//! to be retrieved). The same key may be stored at multiple indices in either case, and a key lookup may +//! return any of the associated values. use commonware_codec::Codec; use commonware_utils::Array; @@ -39,7 +42,7 @@ pub enum Error { RecordTooLarge, } -/// A write-once key-value store where each key is associated with a unique index. +/// A write-once key-value store addressed by both an index and a key. pub trait Archive: Send { /// The type of the key. type Key: Array; @@ -47,10 +50,12 @@ pub trait Archive: Send { /// The type of the value. type Value: Codec + Send; - /// Store an item in [Archive]. Both indices and keys are assumed to both be globally unique. + /// Store an item in [Archive]. /// - /// If the index already exists, put does nothing and returns. If the same key is stored multiple times - /// at different indices (not recommended), any value associated with the key may be returned. + /// Indices are unique: if the index already exists, put does nothing and returns. Duplicate + /// indices can be stored via [MultiArchive::put_multi]. Keys need not be unique: the same key + /// may be stored at multiple indices, and a subsequent [Archive::get] or [Archive::has] call + /// with an [Identifier::Key] identifier may return any of the values associated with that key. fn put( &mut self, index: u64, @@ -122,8 +127,7 @@ pub trait Archive: Send { /// /// Unlike [Archive::put], which is a no-op when the index already exists, /// [MultiArchive::put_multi] allows storing additional `(key, value)` pairs -/// at an existing index. As with [Archive::put], keys are assumed to be globally -/// unique, but duplicate keys are not rejected. 
+/// at an existing index. pub trait MultiArchive: Archive { /// Retrieve all values stored at the given index. /// @@ -383,6 +387,74 @@ mod tests { }); } + async fn test_duplicate_key_cross_index_impl( + mut archive: impl Archive, Value = i32>, + ) { + // Store the same key at two different indices; distinct values only so + // the test can observe which entry wins a key lookup. + let key = test_key("dupe-xindex"); + archive.put(2, key.clone(), 20).await.expect("put(2)"); + archive.put(5, key.clone(), 50).await.expect("put(5)"); + + // Both indices must resolve individually. + assert_eq!( + archive.get(Identifier::Index(2)).await.unwrap(), + Some(20), + "Index(2) must resolve to the value stored at 2" + ); + assert_eq!( + archive.get(Identifier::Index(5)).await.unwrap(), + Some(50), + "Index(5) must resolve to the value stored at 5" + ); + + // Key lookup may return either value per the contract; just assert it + // returns one of them and that `has` reports presence. + let got = archive + .get(Identifier::Key(&key)) + .await + .unwrap() + .expect("key lookup must find at least one entry"); + assert!(got == 20 || got == 50, "unexpected value: {got}"); + assert!(archive.has(Identifier::Key(&key)).await.unwrap()); + } + + #[test_traced] + fn test_duplicate_key_cross_index_prunable_no_compression() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let archive = create_prunable(context, None).await; + test_duplicate_key_cross_index_impl(archive).await; + }); + } + + #[test_traced] + fn test_duplicate_key_cross_index_prunable_compression() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let archive = create_prunable(context, Some(3)).await; + test_duplicate_key_cross_index_impl(archive).await; + }); + } + + #[test_traced] + fn test_duplicate_key_cross_index_immutable_no_compression() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + 
let archive = create_immutable(context, None).await; + test_duplicate_key_cross_index_impl(archive).await; + }); + } + + #[test_traced] + fn test_duplicate_key_cross_index_immutable_compression() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let archive = create_immutable(context, Some(3)).await; + test_duplicate_key_cross_index_impl(archive).await; + }); + } + #[test_traced] fn test_duplicate_key_immutable_compression() { let executor = deterministic::Runner::default(); diff --git a/storage/src/archive/prunable/mod.rs b/storage/src/archive/prunable/mod.rs index 1025ab66735..6e2c01465e2 100644 --- a/storage/src/archive/prunable/mod.rs +++ b/storage/src/archive/prunable/mod.rs @@ -30,9 +30,13 @@ //! //! # Uniqueness //! -//! [Archive] assumes all stored indexes and keys are unique. If the same key is associated with -//! multiple `indices`, there is no guarantee which value will be returned. If the key is written to -//! an existing `index`, [Archive] will return an error. +//! Indices are unique for [Archive] and writing to an occupied index is a no-op. Duplicate +//! indices can be stored via [`crate::archive::MultiArchive::put_multi`]. +//! +//! Keys may be stored at multiple indices with either put variant. A lookup by +//! [`crate::archive::Identifier::Key`] may return any of the values at that key. Entries +//! whose index has been pruned are never returned or reported as present, so a key matching +//! both a pruned and a non-pruned entry resolves to the non-pruned entry. //! //! ## Conflicts //! @@ -681,6 +685,63 @@ mod tests { assert_eq!(state1, state2); } + /// Regression: when the same key is stored at multiple indices and the + /// earlier index is pruned, a subsequent `get`/`has` by key must resolve + /// to the surviving, non-pruned entry rather than report the pruned one. 
+ /// Callers such as consensus's marshal cache rely on this to retain a + /// reproposal of the same block at a later index even after the + /// earlier index's retention window closes. + #[test_traced] + fn test_archive_key_lookup_skips_pruned_duplicates() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = Config { + translator: FourCap, + key_partition: "test-index".into(), + key_page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: "test-value".into(), + codec_config: (), + compression: None, + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER), + items_per_section: NZU64!(1), + }; + let mut archive = Archive::init(context.clone(), cfg) + .await + .expect("Failed to initialize archive"); + + // Same key stored at two different indices. Distinct values only + // to make it observable which entry wins; a real caller would + // store the same value (e.g. the same block) at both indices. + let key = test_key("dupe-key"); + archive.put(2, key.clone(), 20).await.unwrap(); + archive.put(5, key.clone(), 50).await.unwrap(); + + // Before pruning, either entry is a permitted answer per the + // trait contract. The implementation happens to return the + // earlier index, but we only assert a value is present. + assert!(archive.get(Identifier::Key(&key)).await.unwrap().is_some()); + assert!(archive.has(Identifier::Key(&key)).await.unwrap()); + + // Prune the earlier index (section 2). The later index must be + // the sole surviving answer. + archive.prune(3).await.unwrap(); + let got = archive.get(Identifier::Key(&key)).await.unwrap(); + assert_eq!( + got, + Some(50), + "key lookup must skip the pruned entry and return the surviving one" + ); + assert!(archive.has(Identifier::Key(&key)).await.unwrap()); + + // Prune past the later index too — now nothing survives. 
+ archive.prune(6).await.unwrap(); + assert_eq!(archive.get(Identifier::Key(&key)).await.unwrap(), None); + assert!(!archive.has(Identifier::Key(&key)).await.unwrap()); + }); + } + #[test_traced] fn test_get_all_after_prune() { let executor = deterministic::Runner::default();