From 62941033c535f48e7452829b4b8c501875ba0c09 Mon Sep 17 00:00:00 2001 From: clabby Date: Mon, 6 Apr 2026 18:16:34 -0400 Subject: [PATCH 001/107] [consensus/simplex] Prevent certification of our own proposals --- consensus/src/simplex/actors/voter/actor.rs | 14 ++- consensus/src/simplex/actors/voter/mod.rs | 127 ++++++++++++++++++++ consensus/src/simplex/actors/voter/round.rs | 38 ++++++ consensus/src/simplex/actors/voter/slot.rs | 18 +++ consensus/src/simplex/actors/voter/state.rs | 7 ++ 5 files changed, 201 insertions(+), 3 deletions(-) diff --git a/consensus/src/simplex/actors/voter/actor.rs b/consensus/src/simplex/actors/voter/actor.rs index 55ce28c554a..053759c539a 100644 --- a/consensus/src/simplex/actors/voter/actor.rs +++ b/consensus/src/simplex/actors/voter/actor.rs @@ -32,7 +32,10 @@ use commonware_utils::{ futures::AbortablePool, }; use core::{future::Future, panic}; -use futures::{pin_mut, StreamExt}; +use futures::{ + future::{ready, Either}, + pin_mut, StreamExt, +}; use prometheus_client::metrics::{counter::Counter, family::Family, histogram::Histogram}; use rand_core::CryptoRngCore; use std::{ @@ -855,8 +858,13 @@ impl< let round = proposal.round; let view = round.view(); debug!(%view, "attempting certification"); - let receiver = self.automaton.certify(round, proposal.payload).await; - let handle = certify_pool.push(async move { (round, receiver.await) }); + let result = if self.state.proposed_locally(view) { + Either::Left(ready(Ok(true))) + } else { + let receiver = self.automaton.certify(round, proposal.payload).await; + Either::Right(receiver) + }; + let handle = certify_pool.push(async move { (round, result.await) }); self.state.set_certify_handle(view, handle); } diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index 0e8c06a2209..52ee025cc2c 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -4758,6 +4758,133 @@ mod tests { 
certification_after_notarize_timeout_as_leader::<_, _>(secp256r1::fixture); } + /// A leader should certify its own proposal after notarization without asking + /// the application certifier again. + fn local_proposal_certifies_without_application_certify(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"local_proposal_certifies_without_application_certify".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(30)); + executor.start(|mut context| async move { + let (network, oracle) = Network::new( + context.with_label("network"), + NConfig { + max_size: 1024 * 1024, + disconnect_on_block: true, + tracked_peer_sets: None, + }, + ); + network.start(); + + let Fixture { + participants, + schemes, + .. + } = fixture(&mut context, &namespace, n); + + let elector = RoundRobin::::default(); + let built_elector: RoundRobinElector = elector + .clone() + .build(&participants.clone().try_into().unwrap()); + let (mut mailbox, mut batcher_receiver, _, _, _) = setup_voter( + &mut context, + &oracle, + &participants, + &schemes, + elector, + Duration::from_secs(100), + Duration::from_secs(100), + Duration::from_secs(100), + mocks::application::Certifier::Custom(Box::new(|_| false)), + ) + .await; + + let target_view = View::new(2); + advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; + assert_eq!( + built_elector.elect(Round::new(Epoch::new(333), target_view), None), + Participant::new(0), + "we should be leader at view 2" + ); + + let proposal = loop { + select! { + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Notarize(notarize)) + if notarize.view() == target_view => + { + break notarize.proposal.clone(); + } + batcher::Message::Update { response, .. 
} => response.send(None).unwrap(), + _ => {} + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("expected local notarize vote for view {target_view}"); + }, + } + }; + + let (_, notarization) = build_notarization(&schemes, &proposal, quorum); + mailbox + .resolved(Certificate::Notarization(notarization)) + .await; + + loop { + select! { + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Finalize(finalize)) + if finalize.view() == target_view => + { + assert_eq!(finalize.proposal, proposal); + break; + } + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == target_view => + { + panic!( + "local proposal should certify immediately instead of nullifying view {target_view}" + ); + } + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + _ => {} + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("expected finalize vote for locally proposed view {target_view}"); + }, + } + } + }); + } + + #[test_traced] + fn test_local_proposal_certifies_without_application_certify() { + local_proposal_certifies_without_application_certify::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + local_proposal_certifies_without_application_certify::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + local_proposal_certifies_without_application_certify::<_, _>( + bls12381_multisig::fixture::, + ); + local_proposal_certifies_without_application_certify::<_, _>( + bls12381_multisig::fixture::, + ); + local_proposal_certifies_without_application_certify::<_, _>(ed25519::fixture); + local_proposal_certifies_without_application_certify::<_, _>(secp256r1::fixture); + } + /// Tests that when certification returns a cancelled receiver, the voter doesn't hang /// and continues to make progress (via voting to nullify the view that could not be certified). 
fn cancelled_certification_does_not_hang(mut fixture: F, traces: TraceStorage) diff --git a/consensus/src/simplex/actors/voter/round.rs b/consensus/src/simplex/actors/voter/round.rs index 6457cfd8f1b..d21d57ea6d3 100644 --- a/consensus/src/simplex/actors/voter/round.rs +++ b/consensus/src/simplex/actors/voter/round.rs @@ -219,6 +219,11 @@ impl Round { matches!(self.certify, CertifyState::Certified(true)) } + /// Returns whether this round's proposal was built by the local participant. + pub const fn proposed_locally(&self) -> bool { + self.proposal.proposed_locally() + } + /// Returns true if certification was aborted due to finalization. #[cfg(test)] pub const fn is_certify_aborted(&self) -> bool { @@ -1083,6 +1088,39 @@ mod tests { // Has notarization and proposal came from certificate // try_certify returns the proposal from the certificate assert!(round.try_certify().is_some()); + assert!(!round.proposed_locally()); + } + + #[test] + fn locally_built_proposals_stay_marked_local_for_certification() { + let mut rng = test_rng(); + let namespace = b"ns"; + let Fixture { + schemes, verifier, .. 
+ } = ed25519::fixture(&mut rng, namespace, 4); + let local_scheme = schemes[0].clone(); + + let now = SystemTime::UNIX_EPOCH; + let round_info = Rnd::new(Epoch::new(1), View::new(1)); + let proposal = Proposal::new(round_info, View::new(0), Sha256Digest::from([7u8; 32])); + + let mut round = Round::new(local_scheme, round_info, now); + round.set_leader(Participant::new(0)); + assert!(round.proposed(proposal.clone())); + assert!(round.proposed_locally()); + + let notarization_votes: Vec<_> = schemes + .iter() + .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) + .collect(); + let notarization = + Notarization::from_notarizes(&verifier, notarization_votes.iter(), &Sequential) + .unwrap(); + let (added, _) = round.add_notarization(notarization); + assert!(added); + + assert!(round.try_certify().is_some()); + assert!(round.proposed_locally()); } #[test] diff --git a/consensus/src/simplex/actors/voter/slot.rs b/consensus/src/simplex/actors/voter/slot.rs index f8a7d1db8e3..b219d1f072c 100644 --- a/consensus/src/simplex/actors/voter/slot.rs +++ b/consensus/src/simplex/actors/voter/slot.rs @@ -37,6 +37,7 @@ where status: Status, requested_build: bool, requested_verify: bool, + proposed_locally: bool, } impl Slot @@ -49,6 +50,7 @@ where status: Status::None, requested_build: false, requested_verify: false, + proposed_locally: false, } } @@ -60,6 +62,11 @@ where self.status } + /// Returns whether the tracked proposal was built by the local participant. + pub const fn proposed_locally(&self) -> bool { + self.proposed_locally + } + /// Returns whether the slot contains a concrete proposal and no equivocation. 
pub fn has_unequivocated_proposal(&self) -> bool { self.proposal.is_some() && self.status != Status::Equivocated @@ -93,6 +100,7 @@ where self.status = Status::Verified; self.requested_build = true; self.requested_verify = true; + self.proposed_locally = true; } pub const fn request_verify(&mut self) -> bool { @@ -125,6 +133,7 @@ where None => { self.proposal = Some(proposal.clone()); self.status = Status::Unverified; + self.proposed_locally = false; Change::New } Some(existing) if existing == proposal => Change::Unchanged, @@ -137,6 +146,7 @@ where self.proposal = Some(retained.clone()); self.requested_build = true; self.requested_verify = true; + self.proposed_locally = false; } else { // If this isn't a certificate, we keep the proposal as-is. (retained, dropped) = (dropped, retained); @@ -186,6 +196,7 @@ mod tests { None => panic!("proposal missing after recording"), } assert_eq!(slot.status(), Status::Verified); + assert!(slot.proposed_locally()); assert!(!slot.should_build()); assert!(!slot.request_verify()); } @@ -200,6 +211,7 @@ mod tests { assert_eq!(slot.proposal(), Some(&proposal)); assert_eq!(slot.status(), Status::Verified); + assert!(slot.proposed_locally()); assert!(!slot.should_build()); assert!(!slot.request_verify()); } @@ -215,6 +227,7 @@ mod tests { assert!(!slot.should_build()); assert_eq!(slot.status(), Status::Verified); + assert!(slot.proposed_locally()); assert_eq!(slot.proposal(), Some(&proposal)); } @@ -251,6 +264,7 @@ mod tests { } assert_eq!(slot.status(), Status::Equivocated); assert_eq!(slot.proposal(), Some(&proposal_a)); + assert!(!slot.proposed_locally()); } #[test] @@ -268,6 +282,7 @@ mod tests { assert!(matches!(slot.update(&compromised, true), Change::New)); assert_eq!(slot.status(), Status::Unverified); assert_eq!(slot.proposal(), Some(&compromised)); + assert!(!slot.proposed_locally()); // Once we finally finish proposing our honest payload, the slot should just // ignore it (the equivocation was already detected when the 
certificate @@ -275,6 +290,7 @@ mod tests { slot.built(honest); assert_eq!(slot.status(), Status::Unverified); assert_eq!(slot.proposal(), Some(&compromised)); + assert!(!slot.proposed_locally()); } #[test] @@ -298,6 +314,7 @@ mod tests { other => panic!("expected equivocation, got {other:?}"), } assert_eq!(slot.status(), Status::Equivocated); + assert!(!slot.proposed_locally()); // Verifier completion arriving afterwards must be ignored. assert!(!slot.mark_verified()); assert!(matches!(slot.update(&conflicting, true), Change::Skipped)); @@ -320,6 +337,7 @@ mod tests { } assert_eq!(slot.status(), Status::Equivocated); assert_eq!(slot.proposal(), Some(&proposal_b)); + assert!(!slot.proposed_locally()); assert!(!slot.should_build()); } diff --git a/consensus/src/simplex/actors/voter/state.rs b/consensus/src/simplex/actors/voter/state.rs index d0f8ee96adb..2f61aa96c6e 100644 --- a/consensus/src/simplex/actors/voter/state.rs +++ b/consensus/src/simplex/actors/voter/state.rs @@ -575,6 +575,13 @@ impl, L: ElectorConfig, D: D .unwrap_or(false) } + /// Returns whether this view's proposal was built by the local participant. + pub fn proposed_locally(&self, view: View) -> bool { + self.views + .get(&view) + .is_some_and(|round| round.proposed_locally()) + } + /// Store the abort handle for an in-flight certification request. 
pub fn set_certify_handle(&mut self, view: View, handle: Aborter) { let Some(round) = self.views.get_mut(&view) else { From 131aaed33c5d3fad42ea58a8d753d8912515b8e7 Mon Sep 17 00:00:00 2001 From: clabby Date: Mon, 6 Apr 2026 19:16:06 -0400 Subject: [PATCH 002/107] cover restart recovery case --- consensus/src/simplex/actors/voter/mod.rs | 245 ++++++++++++++++++++ consensus/src/simplex/actors/voter/round.rs | 43 +++- consensus/src/simplex/actors/voter/slot.rs | 5 + 3 files changed, 291 insertions(+), 2 deletions(-) diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index 52ee025cc2c..e8e4887adff 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -3819,6 +3819,251 @@ mod tests { no_recertification_after_replay::<_, _, RoundRobin>(secp256r1::fixture); } + /// After restart, a recovered notarization for a leader-owned round should still + /// skip application certification. + fn local_recovered_notarization_skips_certify_after_restart(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"local_recovered_notarization_skips_certify_after_restart".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(20)); + executor.start(|mut context| async move { + let (network, oracle) = Network::new( + context.with_label("network"), + NConfig { + max_size: 1024 * 1024, + disconnect_on_block: true, + tracked_peer_sets: None, + }, + ); + network.start(); + + let Fixture { + participants, + schemes, + .. 
+ } = fixture(&mut context, &namespace, n); + + let me = participants[0].clone(); + let elector = RoundRobin::::default(); + let built_elector: RoundRobinElector = elector + .clone() + .build(&participants.clone().try_into().unwrap()); + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + let relay = Arc::new(mocks::relay::Relay::new()); + let partition = "local_recovered_notarization_skips_certify_after_restart".to_string(); + let epoch = Epoch::new(333); + + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Always, + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app_initial"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector: elector.clone(), + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter: reporter.clone(), + partition: partition.clone(), + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = Actor::new(context.with_label("voter_initial"), voter_cfg); + + let (resolver_sender, _resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) 
+ .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + + let handle = voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + if let batcher::Message::Update { response, .. } = + batcher_receiver.recv().await.unwrap() + { + response.send(None).unwrap(); + } + + let target_view = View::new(2); + advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; + assert_eq!( + built_elector.elect(Round::new(epoch, target_view), None), + Participant::new(0), + "we should be leader at view 2" + ); + + handle.abort(); + + let certify_calls: Arc>> = Arc::new(Mutex::new(Vec::new())); + let tracker = certify_calls.clone(); + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Custom(Box::new(move |digest| { + tracker.lock().push(digest); + false + })), + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter, + partition, + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(100), + certification_timeout: Duration::from_secs(100), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = Actor::new(context.with_label("voter_restarted"), voter_cfg); + + let (resolver_sender, _resolver_receiver) 
= mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(2, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(3, TEST_QUOTA) + .await + .unwrap(); + + voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + if let batcher::Message::Update { response, .. } = + batcher_receiver.recv().await.unwrap() + { + response.send(None).unwrap(); + } + + let proposal = Proposal::new( + Round::new(epoch, target_view), + target_view.previous().unwrap(), + Sha256::hash(b"recovered_local_leader_proposal"), + ); + let (_, notarization) = build_notarization(&schemes, &proposal, quorum); + mailbox + .recovered(Certificate::Notarization(notarization)) + .await; + + loop { + select! { + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Finalize(finalize)) + if finalize.view() == target_view => + { + assert_eq!(finalize.proposal, proposal); + break; + } + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == target_view => + { + panic!( + "leader-owned recovered proposal should finalize instead of nullifying view {target_view}" + ); + } + batcher::Message::Update { response, .. 
} => response.send(None).unwrap(), + _ => {} + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("expected finalize for recovered leader proposal at view {target_view}"); + }, + } + } + + assert_eq!( + certify_calls.lock().len(), + 0, + "application certify should not run for a recovered leader proposal", + ); + }); + } + + #[test_traced] + fn test_local_recovered_notarization_skips_certify_after_restart() { + local_recovered_notarization_skips_certify_after_restart::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + local_recovered_notarization_skips_certify_after_restart::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + local_recovered_notarization_skips_certify_after_restart::<_, _>( + bls12381_multisig::fixture::, + ); + local_recovered_notarization_skips_certify_after_restart::<_, _>( + bls12381_multisig::fixture::, + ); + local_recovered_notarization_skips_certify_after_restart::<_, _>(ed25519::fixture); + local_recovered_notarization_skips_certify_after_restart::<_, _>(secp256r1::fixture); + } + /// Test that in-flight certification requests are cancelled when finalization occurs. /// /// 1. Use a very long certify latency to ensure certification is in-flight. diff --git a/consensus/src/simplex/actors/voter/round.rs b/consensus/src/simplex/actors/voter/round.rs index d21d57ea6d3..c835d7887bf 100644 --- a/consensus/src/simplex/actors/voter/round.rs +++ b/consensus/src/simplex/actors/voter/round.rs @@ -360,14 +360,23 @@ impl Round { /// /// Returns the leader's public key if equivocation is detected (conflicting proposals). 
pub fn add_recovered_proposal(&mut self, proposal: Proposal) -> Option { + let proposed_locally = self + .leader() + .as_ref() + .is_some_and(|leader| self.is_signer(leader.idx)); match self.proposal.update(&proposal, true) { ProposalChange::New => { + self.proposal.set_proposed_locally(proposed_locally); debug!(?proposal, "setting proposal from certificate"); self.leader_deadline = None; None } - ProposalChange::Unchanged => None, + ProposalChange::Unchanged => { + self.proposal.set_proposed_locally(proposed_locally); + None + } ProposalChange::Equivocated { dropped, retained } => { + self.proposal.set_proposed_locally(proposed_locally); // Receiving a certificate for a conflicting proposal means the // leader signed two different payloads for the same (epoch, // view). @@ -1071,7 +1080,7 @@ mod tests { let proposal = Proposal::new(round_info, View::new(0), Sha256Digest::from([1u8; 32])); let mut round = Round::new(local_scheme, round_info, now); - round.set_leader(Participant::new(0)); + round.set_leader(Participant::new(1)); // Don't set proposal yet // Add notarization (which includes the proposal in the certificate) @@ -1091,6 +1100,36 @@ mod tests { assert!(!round.proposed_locally()); } + #[test] + fn recovered_local_leader_proposal_stays_local() { + let mut rng = test_rng(); + let namespace = b"ns"; + let Fixture { + schemes, verifier, .. 
+ } = ed25519::fixture(&mut rng, namespace, 4); + let local_scheme = schemes[0].clone(); + + let now = SystemTime::UNIX_EPOCH; + let round_info = Rnd::new(Epoch::new(1), View::new(1)); + let proposal = Proposal::new(round_info, View::new(0), Sha256Digest::from([8u8; 32])); + + let mut round = Round::new(local_scheme, round_info, now); + round.set_leader(Participant::new(0)); + + let notarization_votes: Vec<_> = schemes + .iter() + .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) + .collect(); + let notarization = + Notarization::from_notarizes(&verifier, notarization_votes.iter(), &Sequential) + .unwrap(); + let (added, _) = round.add_notarization(notarization); + assert!(added); + + assert!(round.proposed_locally()); + assert!(round.try_certify().is_some()); + } + #[test] fn locally_built_proposals_stay_marked_local_for_certification() { let mut rng = test_rng(); diff --git a/consensus/src/simplex/actors/voter/slot.rs b/consensus/src/simplex/actors/voter/slot.rs index b219d1f072c..3e0ef916b85 100644 --- a/consensus/src/simplex/actors/voter/slot.rs +++ b/consensus/src/simplex/actors/voter/slot.rs @@ -67,6 +67,11 @@ where self.proposed_locally } + /// Updates whether the tracked proposal belongs to the local participant. + pub const fn set_proposed_locally(&mut self, proposed_locally: bool) { + self.proposed_locally = proposed_locally; + } + /// Returns whether the slot contains a concrete proposal and no equivocation. 
pub fn has_unequivocated_proposal(&self) -> bool { self.proposal.is_some() && self.status != Status::Equivocated From b8ceef0e22fca2b74c0187d181f6c2a0bbc19567 Mon Sep 17 00:00:00 2001 From: clabby Date: Mon, 6 Apr 2026 19:47:47 -0400 Subject: [PATCH 003/107] comment --- consensus/src/simplex/actors/voter/actor.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/consensus/src/simplex/actors/voter/actor.rs b/consensus/src/simplex/actors/voter/actor.rs index 053759c539a..29739005f01 100644 --- a/consensus/src/simplex/actors/voter/actor.rs +++ b/consensus/src/simplex/actors/voter/actor.rs @@ -859,6 +859,9 @@ impl< let view = round.view(); debug!(%view, "attempting certification"); let result = if self.state.proposed_locally(view) { + // The proposer should always be willing to certify their own + // proposals. Reaching out to the automaton is unnecessary and + // creates duplicate work. Either::Left(ready(Ok(true))) } else { let receiver = self.automaton.certify(round, proposal.payload).await; From 37fe7e65208544c6de951bf56fb056dac3c1d5be Mon Sep 17 00:00:00 2001 From: clabby Date: Mon, 6 Apr 2026 19:47:47 -0400 Subject: [PATCH 004/107] don't inherit locally proposed status for conflicting certs --- consensus/src/simplex/actors/voter/round.rs | 36 ++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/consensus/src/simplex/actors/voter/round.rs b/consensus/src/simplex/actors/voter/round.rs index c835d7887bf..f1f8c48e224 100644 --- a/consensus/src/simplex/actors/voter/round.rs +++ b/consensus/src/simplex/actors/voter/round.rs @@ -376,7 +376,7 @@ impl Round { None } ProposalChange::Equivocated { dropped, retained } => { - self.proposal.set_proposed_locally(proposed_locally); + self.proposal.set_proposed_locally(false); // Receiving a certificate for a conflicting proposal means the // leader signed two different payloads for the same (epoch, // view). 
@@ -1162,6 +1162,40 @@ mod tests { assert!(round.proposed_locally()); } + #[test] + fn recovered_conflicting_certificate_clears_local_marker() { + let mut rng = test_rng(); + let namespace = b"ns"; + let Fixture { + schemes, verifier, .. + } = ed25519::fixture(&mut rng, namespace, 4); + let local_scheme = schemes[0].clone(); + + let now = SystemTime::UNIX_EPOCH; + let round_info = Rnd::new(Epoch::new(1), View::new(1)); + let local_proposal = Proposal::new(round_info, View::new(0), Sha256Digest::from([5u8; 32])); + let conflicting = Proposal::new(round_info, View::new(0), Sha256Digest::from([6u8; 32])); + + let mut round = Round::new(local_scheme, round_info, now); + round.set_leader(Participant::new(0)); + assert!(round.proposed(local_proposal)); + assert!(round.proposed_locally()); + + let notarization_votes: Vec<_> = schemes + .iter() + .map(|scheme| Notarize::sign(scheme, conflicting.clone()).unwrap()) + .collect(); + let notarization = + Notarization::from_notarizes(&verifier, notarization_votes.iter(), &Sequential) + .unwrap(); + let (added, equivocator) = round.add_notarization(notarization); + assert!(added); + assert!(equivocator.is_some()); + + assert_eq!(round.proposal.status(), ProposalStatus::Equivocated); + assert!(!round.proposed_locally()); + } + #[test] fn certified_after_abort_handles_race_condition() { let mut rng = test_rng(); From 01900255f08e052027fee7424cf03c4cc3150a44 Mon Sep 17 00:00:00 2001 From: clabby Date: Mon, 6 Apr 2026 20:24:56 -0400 Subject: [PATCH 005/107] regress follower replay local marker --- consensus/src/simplex/actors/voter/round.rs | 22 +++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/consensus/src/simplex/actors/voter/round.rs b/consensus/src/simplex/actors/voter/round.rs index f1f8c48e224..b37fa67c9e7 100644 --- a/consensus/src/simplex/actors/voter/round.rs +++ b/consensus/src/simplex/actors/voter/round.rs @@ -1130,6 +1130,28 @@ mod tests { assert!(round.try_certify().is_some()); } + #[test] + fn 
replayed_notarize_does_not_mark_follower_view_local() { + let mut rng = test_rng(); + let namespace = b"ns"; + let Fixture { schemes, .. } = ed25519::fixture(&mut rng, namespace, 4); + let local_scheme = schemes[0].clone(); + + let now = SystemTime::UNIX_EPOCH; + let round_info = Rnd::new(Epoch::new(1), View::new(1)); + let proposal = Proposal::new(round_info, View::new(0), Sha256Digest::from([9u8; 32])); + + let mut round = Round::new(local_scheme.clone(), round_info, now); + round.set_leader(Participant::new(1)); + + let local_vote = Notarize::sign(&local_scheme, proposal.clone()).unwrap(); + round.replay(&Artifact::Notarize(local_vote)); + + assert_eq!(round.proposal(), Some(&proposal)); + assert_eq!(round.proposal.status(), ProposalStatus::Verified); + assert!(!round.proposed_locally()); + } + #[test] fn locally_built_proposals_stay_marked_local_for_certification() { let mut rng = test_rng(); From b66224f89e1c2237ff590d51df1593ff6f1f1fba Mon Sep 17 00:00:00 2001 From: clabby Date: Mon, 6 Apr 2026 20:26:14 -0400 Subject: [PATCH 006/107] mark proposed_locally by leadership --- consensus/src/simplex/actors/voter/round.rs | 68 +++++++++++++++------ consensus/src/simplex/actors/voter/slot.rs | 22 +++---- consensus/src/simplex/actors/voter/state.rs | 2 +- 3 files changed, 62 insertions(+), 30 deletions(-) diff --git a/consensus/src/simplex/actors/voter/round.rs b/consensus/src/simplex/actors/voter/round.rs index b37fa67c9e7..bfce5ff0378 100644 --- a/consensus/src/simplex/actors/voter/round.rs +++ b/consensus/src/simplex/actors/voter/round.rs @@ -181,6 +181,13 @@ impl Round { self.scheme.me().is_some_and(|me| me == signer) } + /// Returns true if the leader is the local participant. + fn is_leader(&self) -> bool { + self.leader() + .as_ref() + .is_some_and(|leader| self.is_signer(leader.idx)) + } + /// Removes the leader and certification deadlines so timeouts stop firing. 
pub const fn clear_deadlines(&mut self) { self.leader_deadline = None; @@ -197,6 +204,9 @@ impl Round { .expect("leader index comes from elector, must be within bounds"); debug!(round=?self.round, %leader, ?key, "leader elected"); self.leader = Some(Leader { idx: leader, key }); + if self.proposal().is_some() { + self.proposal.set_proposed_locally(self.is_leader()); + } } /// Returns the notarization certificate if we already reconstructed one. @@ -219,7 +229,7 @@ impl Round { matches!(self.certify, CertifyState::Certified(true)) } - /// Returns whether this round's proposal was built by the local participant. + /// Returns whether this round belongs to a view led by the local participant. pub const fn proposed_locally(&self) -> bool { self.proposal.proposed_locally() } @@ -240,7 +250,8 @@ impl Round { if self.broadcast_nullify { return false; } - self.proposal.built(proposal); + self.proposal + .set_verified_proposal(proposal, self.is_leader()); self.leader_deadline = None; true } @@ -360,23 +371,15 @@ impl Round { /// /// Returns the leader's public key if equivocation is detected (conflicting proposals). pub fn add_recovered_proposal(&mut self, proposal: Proposal) -> Option { - let proposed_locally = self - .leader() - .as_ref() - .is_some_and(|leader| self.is_signer(leader.idx)); + self.proposal.set_proposed_locally(self.is_leader()); match self.proposal.update(&proposal, true) { ProposalChange::New => { - self.proposal.set_proposed_locally(proposed_locally); debug!(?proposal, "setting proposal from certificate"); self.leader_deadline = None; None } - ProposalChange::Unchanged => { - self.proposal.set_proposed_locally(proposed_locally); - None - } + ProposalChange::Unchanged => None, ProposalChange::Equivocated { dropped, retained } => { - self.proposal.set_proposed_locally(false); // Receiving a certificate for a conflicting proposal means the // leader signed two different payloads for the same (epoch, // view). 
@@ -547,10 +550,8 @@ impl Round { "replaying notarize from another signer" ); - // While we may not be the leader here, we still call - // built because the effect is the same (there is a proposal - // and it is verified). - self.proposal.built(notarize.proposal.clone()); + self.proposal + .set_verified_proposal(notarize.proposal.clone(), self.is_leader()); self.broadcast_notarize = true; } Artifact::Nullify(nullify) => { @@ -1130,6 +1131,37 @@ mod tests { assert!(round.try_certify().is_some()); } + #[test] + fn delayed_leader_assignment_marks_recovered_proposal_local() { + let mut rng = test_rng(); + let namespace = b"ns"; + let Fixture { + schemes, verifier, .. + } = ed25519::fixture(&mut rng, namespace, 4); + let local_scheme = schemes[0].clone(); + + let now = SystemTime::UNIX_EPOCH; + let round_info = Rnd::new(Epoch::new(1), View::new(1)); + let proposal = Proposal::new(round_info, View::new(0), Sha256Digest::from([10u8; 32])); + + let mut round = Round::new(local_scheme, round_info, now); + + let notarization_votes: Vec<_> = schemes + .iter() + .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) + .collect(); + let notarization = + Notarization::from_notarizes(&verifier, notarization_votes.iter(), &Sequential) + .unwrap(); + let (added, _) = round.add_notarization(notarization); + assert!(added); + assert!(!round.proposed_locally()); + + round.set_leader(Participant::new(0)); + + assert!(round.proposed_locally()); + } + #[test] fn replayed_notarize_does_not_mark_follower_view_local() { let mut rng = test_rng(); @@ -1185,7 +1217,7 @@ mod tests { } #[test] - fn recovered_conflicting_certificate_clears_local_marker() { + fn recovered_conflicting_certificate_keeps_leader_local_marker() { let mut rng = test_rng(); let namespace = b"ns"; let Fixture { @@ -1215,7 +1247,7 @@ mod tests { assert!(equivocator.is_some()); assert_eq!(round.proposal.status(), ProposalStatus::Equivocated); - assert!(!round.proposed_locally()); + 
assert!(round.proposed_locally()); } #[test] diff --git a/consensus/src/simplex/actors/voter/slot.rs b/consensus/src/simplex/actors/voter/slot.rs index 3e0ef916b85..aec3457dc86 100644 --- a/consensus/src/simplex/actors/voter/slot.rs +++ b/consensus/src/simplex/actors/voter/slot.rs @@ -62,12 +62,12 @@ where self.status } - /// Returns whether the tracked proposal was built by the local participant. + /// Returns whether the tracked proposal belongs to a locally led view. pub const fn proposed_locally(&self) -> bool { self.proposed_locally } - /// Updates whether the tracked proposal belongs to the local participant. + /// Updates whether the tracked proposal belongs to a locally led view. pub const fn set_proposed_locally(&mut self, proposed_locally: bool) { self.proposed_locally = proposed_locally; } @@ -85,10 +85,10 @@ where self.requested_build = true; } - /// Records the proposal in this slot and flips the build/verify flags. + /// Records a verified proposal in this slot and updates its locality marker. /// /// If the slot is already populated, we ignore the proposal. - pub fn built(&mut self, proposal: Proposal) { + pub fn set_verified_proposal(&mut self, proposal: Proposal, proposed_locally: bool) { if let Some(existing) = &self.proposal { // This can happen if we receive a certificate for a conflicting proposal. Normally, // we would ignore this case but it is required to support [Twins](https://arxiv.org/abs/2004.10617) testing. 
@@ -105,7 +105,7 @@ where self.status = Status::Verified; self.requested_build = true; self.requested_verify = true; - self.proposed_locally = true; + self.proposed_locally = proposed_locally; } pub const fn request_verify(&mut self) -> bool { @@ -183,7 +183,7 @@ mod tests { let mut slot = Slot::::new(); let round = Rnd::new(Epoch::new(7), View::new(3)); let proposal = Proposal::new(round, View::new(2), Sha256Digest::from([1u8; 32])); - slot.built(proposal); + slot.set_verified_proposal(proposal, true); assert!(!slot.should_build()); } @@ -194,7 +194,7 @@ mod tests { let round = Rnd::new(Epoch::new(9), View::new(1)); let proposal = Proposal::new(round, View::new(0), Sha256Digest::from([2u8; 32])); - slot.built(proposal.clone()); + slot.set_verified_proposal(proposal.clone(), true); match slot.proposal() { Some(stored) => assert_eq!(stored, &proposal), @@ -212,7 +212,7 @@ mod tests { let round = Rnd::new(Epoch::new(1), View::new(2)); let proposal = Proposal::new(round, View::new(1), Sha256Digest::from([10u8; 32])); - slot.built(proposal.clone()); + slot.set_verified_proposal(proposal.clone(), true); assert_eq!(slot.proposal(), Some(&proposal)); assert_eq!(slot.status(), Status::Verified); @@ -227,8 +227,8 @@ mod tests { let round = Rnd::new(Epoch::new(17), View::new(6)); let proposal = Proposal::new(round, View::new(5), Sha256Digest::from([11u8; 32])); - slot.built(proposal.clone()); - slot.built(proposal.clone()); + slot.set_verified_proposal(proposal.clone(), true); + slot.set_verified_proposal(proposal.clone(), true); assert!(!slot.should_build()); assert_eq!(slot.status(), Status::Verified); @@ -292,7 +292,7 @@ mod tests { // Once we finally finish proposing our honest payload, the slot should just // ignore it (the equivocation was already detected when the certificate // arrived). 
- slot.built(honest); + slot.set_verified_proposal(honest, true); assert_eq!(slot.status(), Status::Unverified); assert_eq!(slot.proposal(), Some(&compromised)); assert!(!slot.proposed_locally()); diff --git a/consensus/src/simplex/actors/voter/state.rs b/consensus/src/simplex/actors/voter/state.rs index 2f61aa96c6e..d2a83404013 100644 --- a/consensus/src/simplex/actors/voter/state.rs +++ b/consensus/src/simplex/actors/voter/state.rs @@ -575,7 +575,7 @@ impl, L: ElectorConfig, D: D .unwrap_or(false) } - /// Returns whether this view's proposal was built by the local participant. + /// Returns whether this view belongs to a round led by the local participant. pub fn proposed_locally(&self, view: View) -> bool { self.views .get(&view) From 9590313d8e86c75fbfea73aca77059f08278093c Mon Sep 17 00:00:00 2001 From: clabby Date: Mon, 6 Apr 2026 20:32:43 -0400 Subject: [PATCH 007/107] update `test_only_finalization_rescues_validator` --- consensus/src/simplex/actors/voter/mod.rs | 78 ++++++++++----------- consensus/src/simplex/actors/voter/round.rs | 8 ++- consensus/src/simplex/mod.rs | 4 +- 3 files changed, 47 insertions(+), 43 deletions(-) diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index e8e4887adff..dc192d75a6f 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -5604,70 +5604,70 @@ mod tests { ) .await; - // Advance to view 3 - let view_3 = View::new(3); + // Advance to view 4 so the stuck round is not leader-owned by this validator. 
+ let view_4 = View::new(4); let parent_payload = advance_to_view( &mut mailbox, &mut batcher_receiver, &schemes, quorum, - view_3, + view_4, ) .await; - let proposal_3 = Proposal::new( - Round::new(Epoch::new(333), view_3), - view_3.previous().unwrap(), - Sha256::hash(b"view_3_proposal"), + let proposal_4 = Proposal::new( + Round::new(Epoch::new(333), view_4), + view_4.previous().unwrap(), + Sha256::hash(b"view_4_proposal"), ); let leader = participants[1].clone(); - let contents = (proposal_3.round, parent_payload, 0u64).encode(); - relay.broadcast(&leader, (proposal_3.payload, contents)); - mailbox.proposal(proposal_3.clone()).await; + let contents = (proposal_4.round, parent_payload, 0u64).encode(); + relay.broadcast(&leader, (proposal_4.payload, contents)); + mailbox.proposal(proposal_4.clone()).await; - let (_, notarization_3) = build_notarization(&schemes, &proposal_3, quorum); + let (_, notarization_4) = build_notarization(&schemes, &proposal_4, quorum); mailbox - .resolved(Certificate::Notarization(notarization_3)) + .resolved(Certificate::Notarization(notarization_4)) .await; // Wait for the first nullify vote (confirms stuck state) loop { select! { msg = batcher_receiver.recv() => match msg.unwrap() { - batcher::Message::Constructed(Vote::Nullify(n)) if n.view() == view_3 => + batcher::Message::Constructed(Vote::Nullify(n)) if n.view() == view_4 => break, batcher::Message::Update { response, .. } => response.send(None).unwrap(), _ => {} }, _ = context.sleep(Duration::from_secs(10)) => { - panic!("expected nullify vote for view 3"); + panic!("expected nullify vote for view 4"); }, } } // Now simulate what the "advanced" validators (f+1 honest with context) are doing: - // They certified view 3 and advanced to view 4, where they're making progress. - // Send a notarization for view 4 to the stuck validator. 
- let view_4 = View::new(4); - let proposal_4 = Proposal::new( - Round::new(Epoch::new(333), view_4), - view_3, // Parent is view 3 (certified by the advanced validators) - Sha256::hash(b"view_4_proposal"), + // They certified view 4 and advanced to view 5, where they're making progress. + // Send a notarization for view 5 to the stuck validator. + let view_5 = View::new(5); + let proposal_5 = Proposal::new( + Round::new(Epoch::new(333), view_5), + view_4, // Parent is view 4 (certified by the advanced validators) + Sha256::hash(b"view_5_proposal"), ); - let (_, notarization_4) = build_notarization(&schemes, &proposal_4, quorum); + let (_, notarization_5) = build_notarization(&schemes, &proposal_5, quorum); - // Send the view 4 notarization to the stuck validator + // Send the view 5 notarization to the stuck validator mailbox - .resolved(Certificate::Notarization(notarization_4)) + .resolved(Certificate::Notarization(notarization_5)) .await; // The stuck validator should still not advance. // - // Receiving a notarization for view 4 doesn't help because: + // Receiving a notarization for view 5 doesn't help because: // 1. add_notarization() does not call enter_view() - it only adds to certification_candidates - // 2. To advance past view 3, the validator needs EITHER: - // a. Certification of view 3 to succeed (impossible - no context) - // b. A nullification certificate for view 3 (impossible - only f votes) + // 2. To advance past view 4, the validator needs EITHER: + // a. Certification of view 4 to succeed (impossible - no context) + // b. A nullification certificate for view 4 (impossible - only f votes) // c. A finalization certificate (requires Byzantine to vote finalize) let advanced = loop { select! { @@ -5677,15 +5677,15 @@ mod tests { current, response, .. 
} => { response.send(None).unwrap(); - if current > view_3 { + if current > view_4 { break true; } } batcher::Message::Constructed(Vote::Nullify(n)) => { - // Still voting nullify for view 3 - expected + // Still voting nullify for view 4 - expected assert_eq!( n.view(), - view_3, + view_4, "should only vote nullify for stuck view" ); } @@ -5700,11 +5700,11 @@ mod tests { assert!( !advanced, - "receiving a notarization for view 4 should NOT rescue the stuck validator - \ - they still can't certify view 3 (no context) and can't form a nullification \ - (not enough votes). The f+1 honest validators who advanced to view 4 cannot \ - retroactively help because they can only vote nullify for their current view (4), \ - not for view 3." + "receiving a notarization for view 5 should NOT rescue the stuck validator - \ + they still can't certify view 4 (no context) and can't form a nullification \ + (not enough votes). The f+1 honest validators who advanced to view 5 cannot \ + retroactively help because they can only vote nullify for their current view (5), \ + not for view 4." ); // HOWEVER: A finalization certificate WOULD rescue the stuck validator. @@ -5712,9 +5712,9 @@ mod tests { // the finalization would abort the stuck certification and advance the view. 
// // Let's demonstrate this escape route works (if Byzantine cooperate): - let (_, finalization_4) = build_finalization(&schemes, &proposal_4, quorum); + let (_, finalization_5) = build_finalization(&schemes, &proposal_5, quorum); mailbox - .resolved(Certificate::Finalization(finalization_4)) + .resolved(Certificate::Finalization(finalization_5)) .await; // Now the validator SHOULD advance (finalization aborts stuck certification) @@ -5726,7 +5726,7 @@ mod tests { } = msg.unwrap() { response.send(None).unwrap(); - if current > view_4 { + if current > view_5 { break true; } } diff --git a/consensus/src/simplex/actors/voter/round.rs b/consensus/src/simplex/actors/voter/round.rs index bfce5ff0378..3aea341a345 100644 --- a/consensus/src/simplex/actors/voter/round.rs +++ b/consensus/src/simplex/actors/voter/round.rs @@ -371,8 +371,8 @@ impl Round { /// /// Returns the leader's public key if equivocation is detected (conflicting proposals). pub fn add_recovered_proposal(&mut self, proposal: Proposal) -> Option { - self.proposal.set_proposed_locally(self.is_leader()); - match self.proposal.update(&proposal, true) { + let proposed_locally = self.is_leader(); + let leader = match self.proposal.update(&proposal, true) { ProposalChange::New => { debug!(?proposal, "setting proposal from certificate"); self.leader_deadline = None; @@ -393,7 +393,9 @@ impl Round { equivocator } ProposalChange::Skipped => None, - } + }; + self.proposal.set_proposed_locally(proposed_locally); + leader } /// Adds a verified notarization certificate to the round. diff --git a/consensus/src/simplex/mod.rs b/consensus/src/simplex/mod.rs index 2a9ead0cbe2..9eb478e2849 100644 --- a/consensus/src/simplex/mod.rs +++ b/consensus/src/simplex/mod.rs @@ -4683,7 +4683,9 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + // This test only exercises reporter filtering. 
Keep certification + // uniform so leader-owned views do not diverge from follower views. + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), From 1f9010203123e3468d815b20befa094e9eeb5706 Mon Sep 17 00:00:00 2001 From: clabby Date: Mon, 6 Apr 2026 20:57:21 -0400 Subject: [PATCH 008/107] update remaining tests --- consensus/src/simplex/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/consensus/src/simplex/mod.rs b/consensus/src/simplex/mod.rs index 9eb478e2849..7fb592074e6 100644 --- a/consensus/src/simplex/mod.rs +++ b/consensus/src/simplex/mod.rs @@ -734,7 +734,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1165,7 +1165,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2725,7 +2725,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3463,7 +3463,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = 
mocks::application::Application::new( context.with_label("application"), @@ -3554,7 +3554,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), From aaecc6e3f28c27e3ab414e532377e63c01be9e2c Mon Sep 17 00:00:00 2001 From: clabby Date: Mon, 6 Apr 2026 21:16:45 -0400 Subject: [PATCH 009/107] remove `Certifier::Sometimes` --- consensus/fuzz/src/lib.rs | 4 +- consensus/src/simplex/actors/voter/mod.rs | 20 +++++----- consensus/src/simplex/mocks/application.rs | 4 -- consensus/src/simplex/mod.rs | 46 +++++++++++----------- 4 files changed, 35 insertions(+), 39 deletions(-) diff --git a/consensus/fuzz/src/lib.rs b/consensus/fuzz/src/lib.rs index c40c15de390..157415cba44 100644 --- a/consensus/fuzz/src/lib.rs +++ b/consensus/fuzz/src/lib.rs @@ -374,7 +374,7 @@ where propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: application::Certifier::Sometimes, + should_certify: application::Certifier::Always, }; let (actor, application) = application::Application::new(context.with_label("application"), app_cfg); @@ -608,7 +608,7 @@ fn run_with_twin_mutator(input: FuzzInput) { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: application::Certifier::Sometimes, + should_certify: application::Certifier::Always, }; let (actor, application) = application::Application::new(primary_context.with_label("application"), app_cfg); diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index dc192d75a6f..232439748dd 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -337,7 +337,7 @@ mod tests { propose_latency: (10.0, 
5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -578,7 +578,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), app_config); @@ -851,7 +851,7 @@ mod tests { Duration::from_millis(500), Duration::from_secs(1000), Duration::from_secs(1000), - mocks::application::Certifier::Sometimes, + mocks::application::Certifier::Always, ) .await; @@ -979,7 +979,7 @@ mod tests { Duration::from_millis(500), Duration::from_secs(1000), Duration::from_secs(1000), - mocks::application::Certifier::Sometimes, + mocks::application::Certifier::Always, ) .await; @@ -1121,7 +1121,7 @@ mod tests { Duration::from_millis(500), Duration::from_secs(1000), Duration::from_secs(1000), - mocks::application::Certifier::Sometimes, + mocks::application::Certifier::Always, ) .await; @@ -1256,7 +1256,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (100_000.0, 0.0), // Very slow verification certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -1657,7 +1657,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -2077,7 
+2077,7 @@ mod tests { Duration::from_millis(500), Duration::from_secs(1000), Duration::from_secs(1000), - mocks::application::Certifier::Sometimes, + mocks::application::Certifier::Always, ) .await; @@ -2185,7 +2185,7 @@ mod tests { Duration::from_millis(500), Duration::from_secs(1000), Duration::from_secs(1000), - mocks::application::Certifier::Sometimes, + mocks::application::Certifier::Always, ) .await; @@ -3135,7 +3135,7 @@ mod tests { Duration::from_secs(10), Duration::from_secs(10), Duration::from_mins(60), - mocks::application::Certifier::Sometimes, + mocks::application::Certifier::Always, ) .await; diff --git a/consensus/src/simplex/mocks/application.rs b/consensus/src/simplex/mocks/application.rs index 12e0d933ff1..638b7198898 100644 --- a/consensus/src/simplex/mocks/application.rs +++ b/consensus/src/simplex/mocks/application.rs @@ -130,9 +130,6 @@ type Latency = (f64, f64); pub enum Certifier { /// Always certify. Always, - /// Certify sometimes, but not always. The behavior is to certify pseudorandomly - /// (but deterministically) 82% of the time, depending on the last byte of the payload. - Sometimes, /// A custom predicate function. Custom(Box bool + Send + 'static>), /// Drop the sender without responding, causing the receiver to be cancelled. 
@@ -348,7 +345,6 @@ impl Application // Use configured predicate to determine certification match &self.should_certify { Certifier::Always => Some(true), - Certifier::Sometimes => Some((payload.as_ref().last().copied().unwrap_or(0) % 11) < 9), Certifier::Custom(func) => Some(func(payload)), Certifier::Cancel | Certifier::Pending => None, } diff --git a/consensus/src/simplex/mod.rs b/consensus/src/simplex/mod.rs index 7fb592074e6..9a4e428616c 100644 --- a/consensus/src/simplex/mod.rs +++ b/consensus/src/simplex/mod.rs @@ -1000,7 +1000,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1357,7 +1357,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1478,7 +1478,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1623,7 +1623,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1855,7 +1855,7 @@ mod tests { propose_latency: (10_000.0, 0.0), verify_latency: (10_000.0, 5.0), 
certify_latency: (10_000.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, } } else { mocks::application::Config { @@ -1865,7 +1865,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, } }; let (actor, application) = mocks::application::Application::new( @@ -2042,7 +2042,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2254,7 +2254,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2462,7 +2462,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2912,7 +2912,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3088,7 +3088,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - 
should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3291,7 +3291,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3789,7 +3789,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3953,7 +3953,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4134,7 +4134,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4281,7 +4281,7 @@ mod tests { propose_latency: (100.0, 50.0), verify_latency: (50.0, 40.0), certify_latency: (50.0, 40.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4455,7 +4455,7 @@ mod tests { propose_latency: (1.0, 0.0), 
verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5036,7 +5036,7 @@ mod tests { propose_latency: (250.0, 50.0), // ensure we process certificates first verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label(&format!("application_{}", *validator)), @@ -5252,7 +5252,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5403,7 +5403,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5499,7 +5499,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5996,7 +5996,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = 
mocks::application::Application::new( context.with_label("application"), @@ -6065,7 +6065,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), From fc7193180188547d550b156febc3eab0686dfaed Mon Sep 17 00:00:00 2001 From: clabby Date: Mon, 6 Apr 2026 23:14:59 -0400 Subject: [PATCH 010/107] remove additional state tracking until we have a better soln We currently track if the proposal was local as its own flag, derived from the leader. Right now this doesn't buy us much. The original intention was to enable a more fool-proof setup where the proposer's node would keep track even in the event of crashes, but for now, this re-centers the PR on a solution that relies on the leader being known. --- consensus/src/simplex/actors/voter/actor.rs | 11 +- consensus/src/simplex/actors/voter/round.rs | 162 +------------------- consensus/src/simplex/actors/voter/slot.rs | 39 +---- consensus/src/simplex/actors/voter/state.rs | 7 - 4 files changed, 23 insertions(+), 196 deletions(-) diff --git a/consensus/src/simplex/actors/voter/actor.rs b/consensus/src/simplex/actors/voter/actor.rs index 29739005f01..f595d82c617 100644 --- a/consensus/src/simplex/actors/voter/actor.rs +++ b/consensus/src/simplex/actors/voter/actor.rs @@ -858,10 +858,13 @@ impl< let round = proposal.round; let view = round.view(); debug!(%view, "attempting certification"); - let result = if self.state.proposed_locally(view) { - // The proposer should always be willing to certify their own - // proposals. Reaching out to the automaton is unnecessary and - // creates duplicate work.
+ let leader_is_local = self + .state + .leader_index(view) + .is_some_and(|leader| self.state.is_me(leader)); + let result = if leader_is_local { + // Once we know the local participant led this view, reaching out to the + // automaton is unnecessary and creates duplicate work. Either::Left(ready(Ok(true))) } else { let receiver = self.automaton.certify(round, proposal.payload).await; diff --git a/consensus/src/simplex/actors/voter/round.rs b/consensus/src/simplex/actors/voter/round.rs index 3aea341a345..3d777790daf 100644 --- a/consensus/src/simplex/actors/voter/round.rs +++ b/consensus/src/simplex/actors/voter/round.rs @@ -181,13 +181,6 @@ impl Round { self.scheme.me().is_some_and(|me| me == signer) } - /// Returns true if the leader is the local participant. - fn is_leader(&self) -> bool { - self.leader() - .as_ref() - .is_some_and(|leader| self.is_signer(leader.idx)) - } - /// Removes the leader and certification deadlines so timeouts stop firing. pub const fn clear_deadlines(&mut self) { self.leader_deadline = None; @@ -204,9 +197,6 @@ impl Round { .expect("leader index comes from elector, must be within bounds"); debug!(round=?self.round, %leader, ?key, "leader elected"); self.leader = Some(Leader { idx: leader, key }); - if self.proposal().is_some() { - self.proposal.set_proposed_locally(self.is_leader()); - } } /// Returns the notarization certificate if we already reconstructed one. @@ -229,11 +219,6 @@ impl Round { matches!(self.certify, CertifyState::Certified(true)) } - /// Returns whether this round belongs to a view led by the local participant. - pub const fn proposed_locally(&self) -> bool { - self.proposal.proposed_locally() - } - /// Returns true if certification was aborted due to finalization. 
#[cfg(test)] pub const fn is_certify_aborted(&self) -> bool { @@ -250,8 +235,7 @@ impl Round { if self.broadcast_nullify { return false; } - self.proposal - .set_verified_proposal(proposal, self.is_leader()); + self.proposal.built(proposal); self.leader_deadline = None; true } @@ -371,8 +355,7 @@ impl Round { /// /// Returns the leader's public key if equivocation is detected (conflicting proposals). pub fn add_recovered_proposal(&mut self, proposal: Proposal) -> Option { - let proposed_locally = self.is_leader(); - let leader = match self.proposal.update(&proposal, true) { + match self.proposal.update(&proposal, true) { ProposalChange::New => { debug!(?proposal, "setting proposal from certificate"); self.leader_deadline = None; @@ -393,9 +376,7 @@ impl Round { equivocator } ProposalChange::Skipped => None, - }; - self.proposal.set_proposed_locally(proposed_locally); - leader + } } /// Adds a verified notarization certificate to the round. @@ -552,8 +533,10 @@ impl Round { "replaying notarize from another signer" ); - self.proposal - .set_verified_proposal(notarize.proposal.clone(), self.is_leader()); + // While we may not be the leader here, we still call + // built because the effect is the same (there is a proposal + // and it is verified). + self.proposal.built(notarize.proposal.clone()); self.broadcast_notarize = true; } Artifact::Nullify(nullify) => { @@ -1100,72 +1083,10 @@ mod tests { // Has notarization and proposal came from certificate // try_certify returns the proposal from the certificate assert!(round.try_certify().is_some()); - assert!(!round.proposed_locally()); - } - - #[test] - fn recovered_local_leader_proposal_stays_local() { - let mut rng = test_rng(); - let namespace = b"ns"; - let Fixture { - schemes, verifier, .. 
- } = ed25519::fixture(&mut rng, namespace, 4); - let local_scheme = schemes[0].clone(); - - let now = SystemTime::UNIX_EPOCH; - let round_info = Rnd::new(Epoch::new(1), View::new(1)); - let proposal = Proposal::new(round_info, View::new(0), Sha256Digest::from([8u8; 32])); - - let mut round = Round::new(local_scheme, round_info, now); - round.set_leader(Participant::new(0)); - - let notarization_votes: Vec<_> = schemes - .iter() - .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) - .collect(); - let notarization = - Notarization::from_notarizes(&verifier, notarization_votes.iter(), &Sequential) - .unwrap(); - let (added, _) = round.add_notarization(notarization); - assert!(added); - - assert!(round.proposed_locally()); - assert!(round.try_certify().is_some()); - } - - #[test] - fn delayed_leader_assignment_marks_recovered_proposal_local() { - let mut rng = test_rng(); - let namespace = b"ns"; - let Fixture { - schemes, verifier, .. - } = ed25519::fixture(&mut rng, namespace, 4); - let local_scheme = schemes[0].clone(); - - let now = SystemTime::UNIX_EPOCH; - let round_info = Rnd::new(Epoch::new(1), View::new(1)); - let proposal = Proposal::new(round_info, View::new(0), Sha256Digest::from([10u8; 32])); - - let mut round = Round::new(local_scheme, round_info, now); - - let notarization_votes: Vec<_> = schemes - .iter() - .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) - .collect(); - let notarization = - Notarization::from_notarizes(&verifier, notarization_votes.iter(), &Sequential) - .unwrap(); - let (added, _) = round.add_notarization(notarization); - assert!(added); - assert!(!round.proposed_locally()); - - round.set_leader(Participant::new(0)); - - assert!(round.proposed_locally()); } #[test] - fn replayed_notarize_does_not_mark_follower_view_local() { + fn replayed_notarize_keeps_proposal_verified() { let mut rng = test_rng(); let namespace = b"ns"; let Fixture { schemes, .. 
} = ed25519::fixture(&mut rng, namespace, 4); @@ -1183,73 +1104,6 @@ mod tests { assert_eq!(round.proposal(), Some(&proposal)); assert_eq!(round.proposal.status(), ProposalStatus::Verified); - assert!(!round.proposed_locally()); - } - - #[test] - fn locally_built_proposals_stay_marked_local_for_certification() { - let mut rng = test_rng(); - let namespace = b"ns"; - let Fixture { - schemes, verifier, .. - } = ed25519::fixture(&mut rng, namespace, 4); - let local_scheme = schemes[0].clone(); - - let now = SystemTime::UNIX_EPOCH; - let round_info = Rnd::new(Epoch::new(1), View::new(1)); - let proposal = Proposal::new(round_info, View::new(0), Sha256Digest::from([7u8; 32])); - - let mut round = Round::new(local_scheme, round_info, now); - round.set_leader(Participant::new(0)); - assert!(round.proposed(proposal.clone())); - assert!(round.proposed_locally()); - - let notarization_votes: Vec<_> = schemes - .iter() - .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) - .collect(); - let notarization = - Notarization::from_notarizes(&verifier, notarization_votes.iter(), &Sequential) - .unwrap(); - let (added, _) = round.add_notarization(notarization); - assert!(added); - - assert!(round.try_certify().is_some()); - assert!(round.proposed_locally()); - } - - #[test] - fn recovered_conflicting_certificate_keeps_leader_local_marker() { - let mut rng = test_rng(); - let namespace = b"ns"; - let Fixture { - schemes, verifier, .. 
- } = ed25519::fixture(&mut rng, namespace, 4); - let local_scheme = schemes[0].clone(); - - let now = SystemTime::UNIX_EPOCH; - let round_info = Rnd::new(Epoch::new(1), View::new(1)); - let local_proposal = Proposal::new(round_info, View::new(0), Sha256Digest::from([5u8; 32])); - let conflicting = Proposal::new(round_info, View::new(0), Sha256Digest::from([6u8; 32])); - - let mut round = Round::new(local_scheme, round_info, now); - round.set_leader(Participant::new(0)); - assert!(round.proposed(local_proposal)); - assert!(round.proposed_locally()); - - let notarization_votes: Vec<_> = schemes - .iter() - .map(|scheme| Notarize::sign(scheme, conflicting.clone()).unwrap()) - .collect(); - let notarization = - Notarization::from_notarizes(&verifier, notarization_votes.iter(), &Sequential) - .unwrap(); - let (added, equivocator) = round.add_notarization(notarization); - assert!(added); - assert!(equivocator.is_some()); - - assert_eq!(round.proposal.status(), ProposalStatus::Equivocated); - assert!(round.proposed_locally()); } #[test] diff --git a/consensus/src/simplex/actors/voter/slot.rs b/consensus/src/simplex/actors/voter/slot.rs index aec3457dc86..f8a7d1db8e3 100644 --- a/consensus/src/simplex/actors/voter/slot.rs +++ b/consensus/src/simplex/actors/voter/slot.rs @@ -37,7 +37,6 @@ where status: Status, requested_build: bool, requested_verify: bool, - proposed_locally: bool, } impl Slot @@ -50,7 +49,6 @@ where status: Status::None, requested_build: false, requested_verify: false, - proposed_locally: false, } } @@ -62,16 +60,6 @@ where self.status } - /// Returns whether the tracked proposal belongs to a locally led view. - pub const fn proposed_locally(&self) -> bool { - self.proposed_locally - } - - /// Updates whether the tracked proposal belongs to a locally led view. 
- pub const fn set_proposed_locally(&mut self, proposed_locally: bool) { - self.proposed_locally = proposed_locally; - } - /// Returns whether the slot contains a concrete proposal and no equivocation. pub fn has_unequivocated_proposal(&self) -> bool { self.proposal.is_some() && self.status != Status::Equivocated @@ -85,10 +73,10 @@ where self.requested_build = true; } - /// Records a verified proposal in this slot and updates its locality marker. + /// Records the proposal in this slot and flips the build/verify flags. /// /// If the slot is already populated, we ignore the proposal. - pub fn set_verified_proposal(&mut self, proposal: Proposal, proposed_locally: bool) { + pub fn built(&mut self, proposal: Proposal) { if let Some(existing) = &self.proposal { // This can happen if we receive a certificate for a conflicting proposal. Normally, // we would ignore this case but it is required to support [Twins](https://arxiv.org/abs/2004.10617) testing. @@ -105,7 +93,6 @@ where self.status = Status::Verified; self.requested_build = true; self.requested_verify = true; - self.proposed_locally = proposed_locally; } pub const fn request_verify(&mut self) -> bool { @@ -138,7 +125,6 @@ where None => { self.proposal = Some(proposal.clone()); self.status = Status::Unverified; - self.proposed_locally = false; Change::New } Some(existing) if existing == proposal => Change::Unchanged, @@ -151,7 +137,6 @@ where self.proposal = Some(retained.clone()); self.requested_build = true; self.requested_verify = true; - self.proposed_locally = false; } else { // If this isn't a certificate, we keep the proposal as-is. 
(retained, dropped) = (dropped, retained); @@ -183,7 +168,7 @@ mod tests { let mut slot = Slot::::new(); let round = Rnd::new(Epoch::new(7), View::new(3)); let proposal = Proposal::new(round, View::new(2), Sha256Digest::from([1u8; 32])); - slot.set_verified_proposal(proposal, true); + slot.built(proposal); assert!(!slot.should_build()); } @@ -194,14 +179,13 @@ mod tests { let round = Rnd::new(Epoch::new(9), View::new(1)); let proposal = Proposal::new(round, View::new(0), Sha256Digest::from([2u8; 32])); - slot.set_verified_proposal(proposal.clone(), true); + slot.built(proposal.clone()); match slot.proposal() { Some(stored) => assert_eq!(stored, &proposal), None => panic!("proposal missing after recording"), } assert_eq!(slot.status(), Status::Verified); - assert!(slot.proposed_locally()); assert!(!slot.should_build()); assert!(!slot.request_verify()); } @@ -212,11 +196,10 @@ mod tests { let round = Rnd::new(Epoch::new(1), View::new(2)); let proposal = Proposal::new(round, View::new(1), Sha256Digest::from([10u8; 32])); - slot.set_verified_proposal(proposal.clone(), true); + slot.built(proposal.clone()); assert_eq!(slot.proposal(), Some(&proposal)); assert_eq!(slot.status(), Status::Verified); - assert!(slot.proposed_locally()); assert!(!slot.should_build()); assert!(!slot.request_verify()); } @@ -227,12 +210,11 @@ mod tests { let round = Rnd::new(Epoch::new(17), View::new(6)); let proposal = Proposal::new(round, View::new(5), Sha256Digest::from([11u8; 32])); - slot.set_verified_proposal(proposal.clone(), true); - slot.set_verified_proposal(proposal.clone(), true); + slot.built(proposal.clone()); + slot.built(proposal.clone()); assert!(!slot.should_build()); assert_eq!(slot.status(), Status::Verified); - assert!(slot.proposed_locally()); assert_eq!(slot.proposal(), Some(&proposal)); } @@ -269,7 +251,6 @@ mod tests { } assert_eq!(slot.status(), Status::Equivocated); assert_eq!(slot.proposal(), Some(&proposal_a)); - assert!(!slot.proposed_locally()); } #[test] @@ 
-287,15 +268,13 @@ mod tests { assert!(matches!(slot.update(&compromised, true), Change::New)); assert_eq!(slot.status(), Status::Unverified); assert_eq!(slot.proposal(), Some(&compromised)); - assert!(!slot.proposed_locally()); // Once we finally finish proposing our honest payload, the slot should just // ignore it (the equivocation was already detected when the certificate // arrived). - slot.set_verified_proposal(honest, true); + slot.built(honest); assert_eq!(slot.status(), Status::Unverified); assert_eq!(slot.proposal(), Some(&compromised)); - assert!(!slot.proposed_locally()); } #[test] @@ -319,7 +298,6 @@ mod tests { other => panic!("expected equivocation, got {other:?}"), } assert_eq!(slot.status(), Status::Equivocated); - assert!(!slot.proposed_locally()); // Verifier completion arriving afterwards must be ignored. assert!(!slot.mark_verified()); assert!(matches!(slot.update(&conflicting, true), Change::Skipped)); @@ -342,7 +320,6 @@ mod tests { } assert_eq!(slot.status(), Status::Equivocated); assert_eq!(slot.proposal(), Some(&proposal_b)); - assert!(!slot.proposed_locally()); assert!(!slot.should_build()); } diff --git a/consensus/src/simplex/actors/voter/state.rs b/consensus/src/simplex/actors/voter/state.rs index d2a83404013..d0f8ee96adb 100644 --- a/consensus/src/simplex/actors/voter/state.rs +++ b/consensus/src/simplex/actors/voter/state.rs @@ -575,13 +575,6 @@ impl, L: ElectorConfig, D: D .unwrap_or(false) } - /// Returns whether this view belongs to a round led by the local participant. - pub fn proposed_locally(&self, view: View) -> bool { - self.views - .get(&view) - .is_some_and(|round| round.proposed_locally()) - } - /// Store the abort handle for an in-flight certification request. 
pub fn set_certify_handle(&mut self, view: View, handle: Aborter) { let Some(round) = self.views.get_mut(&view) else { From 31cdab57de0451bbd6b58a453d28891d735addf1 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 13 Apr 2026 14:04:03 -0700 Subject: [PATCH 011/107] revert to sometimes --- consensus/fuzz/src/lib.rs | 4 +- consensus/src/simplex/mocks/application.rs | 4 ++ consensus/src/simplex/mod.rs | 58 +++++++++++----------- 3 files changed, 35 insertions(+), 31 deletions(-) diff --git a/consensus/fuzz/src/lib.rs b/consensus/fuzz/src/lib.rs index 88f5ec4bd33..b75755b579a 100644 --- a/consensus/fuzz/src/lib.rs +++ b/consensus/fuzz/src/lib.rs @@ -375,7 +375,7 @@ where propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: application::Certifier::Always, + should_certify: application::Certifier::Sometimes, }; let (actor, application) = application::Application::new(context.with_label("application"), app_cfg); @@ -609,7 +609,7 @@ fn run_with_twin_mutator(input: FuzzInput) { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: application::Certifier::Always, + should_certify: application::Certifier::Sometimes, }; let (actor, application) = application::Application::new(primary_context.with_label("application"), app_cfg); diff --git a/consensus/src/simplex/mocks/application.rs b/consensus/src/simplex/mocks/application.rs index c79927893d0..c709e8551db 100644 --- a/consensus/src/simplex/mocks/application.rs +++ b/consensus/src/simplex/mocks/application.rs @@ -144,6 +144,9 @@ type CertifyObserver = Box::Digest) + Send + 'sta pub enum Certifier { /// Always certify. Always, + /// Certify sometimes, but not always. The behavior is to certify pseudorandomly + /// (but deterministically) 82% of the time, depending on the last byte of the payload. + Sometimes, /// A custom predicate function. 
Custom(Box bool + Send + 'static>), /// Drop the sender without responding, causing the receiver to be cancelled. @@ -388,6 +391,7 @@ impl Application // Use configured predicate to determine certification match &self.should_certify { Certifier::Always => Some(true), + Certifier::Sometimes => Some((payload.as_ref().last().copied().unwrap_or(0) % 11) < 9), Certifier::Custom(func) => Some(func(payload)), Certifier::Cancel | Certifier::Pending => None, } diff --git a/consensus/src/simplex/mod.rs b/consensus/src/simplex/mod.rs index 2d1ab09df7d..39a116577ba 100644 --- a/consensus/src/simplex/mod.rs +++ b/consensus/src/simplex/mod.rs @@ -770,7 +770,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1031,7 +1031,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1187,7 +1187,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1368,7 +1368,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( 
context.with_label("application"), @@ -1489,7 +1489,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1623,7 +1623,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1844,7 +1844,7 @@ mod tests { propose_latency: (10_000.0, 0.0), verify_latency: (10_000.0, 5.0), certify_latency: (10_000.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, } } else { mocks::application::Config { @@ -1854,7 +1854,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, } }; let (actor, application) = mocks::application::Application::new( @@ -2020,7 +2020,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2221,7 +2221,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( 
context.with_label("application"), @@ -2418,7 +2418,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2670,7 +2670,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2846,7 +2846,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3014,7 +3014,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3206,7 +3206,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3367,7 +3367,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let 
(actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3458,7 +3458,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3682,7 +3682,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3835,7 +3835,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4005,7 +4005,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4141,7 +4141,7 @@ mod tests { propose_latency: (100.0, 50.0), verify_latency: (50.0, 40.0), certify_latency: (50.0, 40.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4304,7 +4304,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, + 
should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4525,7 +4525,7 @@ mod tests { certify_latency: (10.0, 5.0), // This test only exercises reporter filtering. Keep certification // uniform so leader-owned views do not diverge from follower views. - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4878,7 +4878,7 @@ mod tests { propose_latency: (250.0, 50.0), // ensure we process certificates first verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label(&format!("application_{}", *validator)), @@ -5083,7 +5083,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5223,7 +5223,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5319,7 +5319,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( 
context.with_label("application"), @@ -5812,7 +5812,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5881,7 +5881,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), From a18a0b8604a7dc66f4e226b84e77e9749d500ad2 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 13 Apr 2026 14:06:30 -0700 Subject: [PATCH 012/107] remove duplicate --- consensus/src/simplex/actors/voter/round.rs | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/consensus/src/simplex/actors/voter/round.rs b/consensus/src/simplex/actors/voter/round.rs index aaeebabef0f..c20a1d20871 100644 --- a/consensus/src/simplex/actors/voter/round.rs +++ b/consensus/src/simplex/actors/voter/round.rs @@ -1117,27 +1117,6 @@ mod tests { assert!(round.try_certify().is_some()); } - #[test] - fn replayed_notarize_keeps_proposal_verified() { - let mut rng = test_rng(); - let namespace = b"ns"; - let Fixture { schemes, .. 
} = ed25519::fixture(&mut rng, namespace, 4); - let local_scheme = schemes[0].clone(); - - let now = SystemTime::UNIX_EPOCH; - let round_info = Rnd::new(Epoch::new(1), View::new(1)); - let proposal = Proposal::new(round_info, View::new(0), Sha256Digest::from([9u8; 32])); - - let mut round = Round::new(local_scheme.clone(), round_info, now); - round.set_leader(Participant::new(1)); - - let local_vote = Notarize::sign(&local_scheme, proposal.clone()).unwrap(); - round.replay(&Artifact::Notarize(local_vote)); - - assert_eq!(round.proposal(), Some(&proposal)); - assert_eq!(round.proposal.status(), ProposalStatus::Verified); - } - #[test] fn certified_after_abort_handles_race_condition() { let mut rng = test_rng(); From 2008ca7ba8cd52bd74a949334ca3603a3af53cbd Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 13 Apr 2026 22:00:09 -0700 Subject: [PATCH 013/107] progress --- consensus/fuzz/src/lib.rs | 4 +- consensus/src/simplex/actors/voter/mod.rs | 111 ++++++----- consensus/src/simplex/mocks/application.rs | 36 ++-- consensus/src/simplex/mod.rs | 206 ++++++++++++++++----- 4 files changed, 235 insertions(+), 122 deletions(-) diff --git a/consensus/fuzz/src/lib.rs b/consensus/fuzz/src/lib.rs index b75755b579a..3646fa05771 100644 --- a/consensus/fuzz/src/lib.rs +++ b/consensus/fuzz/src/lib.rs @@ -375,7 +375,6 @@ where propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: application::Certifier::Sometimes, }; let (actor, application) = application::Application::new(context.with_label("application"), app_cfg); @@ -609,8 +608,7 @@ fn run_with_twin_mutator(input: FuzzInput) { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: application::Certifier::Sometimes, - }; + }; let (actor, application) = application::Application::new(primary_context.with_label("application"), app_cfg); actor.start(); diff --git a/consensus/src/simplex/actors/voter/mod.rs 
b/consensus/src/simplex/actors/voter/mod.rs index 9ea639391c4..ccf90b1501e 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -181,7 +181,41 @@ mod tests { leader_timeout: Duration, certification_timeout: Duration, timeout_retry: Duration, - should_certify: mocks::application::Certifier, + ) -> ( + Mailbox, + mpsc::Receiver>, + mpsc::Receiver>, + Arc>, + mocks::reporter::Reporter, + ) + where + S: Scheme, + L: ElectorConfig, + { + setup_voter_with_certifier( + context, + oracle, + participants, + schemes, + elector, + leader_timeout, + certification_timeout, + timeout_retry, + mocks::application::Certifier::Always, + ) + .await + } + + async fn setup_voter_with_certifier( + context: &mut deterministic::Context, + oracle: &commonware_p2p::simulated::Oracle, + participants: &[S::PublicKey], + schemes: &[S], + elector: L, + leader_timeout: Duration, + certification_timeout: Duration, + timeout_retry: Duration, + certifier: mocks::application::Certifier, ) -> ( Mailbox, mpsc::Receiver>, @@ -210,10 +244,10 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify, }; - let (actor, application) = + let (mut actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); + actor.set_certifier(certifier); actor.start(); let voter_cfg = Config { @@ -352,7 +386,6 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -586,7 +619,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), app_config); @@ -852,7 +884,6 @@ mod 
tests { Duration::from_millis(500), Duration::from_secs(1000), Duration::from_secs(1000), - mocks::application::Certifier::Always, ) .await; @@ -973,7 +1004,6 @@ mod tests { Duration::from_millis(500), Duration::from_secs(1000), Duration::from_secs(1000), - mocks::application::Certifier::Always, ) .await; @@ -1109,7 +1139,6 @@ mod tests { Duration::from_millis(500), Duration::from_secs(1000), Duration::from_secs(1000), - mocks::application::Certifier::Always, ) .await; @@ -1238,7 +1267,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (100_000.0, 0.0), // Very slow verification certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -1417,7 +1445,6 @@ mod tests { propose_latency: (50.0, 10.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -1625,7 +1652,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -1855,7 +1881,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -2036,7 +2061,6 @@ mod tests { Duration::from_millis(500), Duration::from_secs(1000), Duration::from_secs(1000), - mocks::application::Certifier::Always, ) .await; @@ -2137,7 +2161,6 @@ mod tests { Duration::from_millis(500), Duration::from_secs(1000), Duration::from_secs(1000), - mocks::application::Certifier::Always, ) .await; @@ 
-2260,7 +2283,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (10.0, 0.0), // 10ms verification latency certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (mut actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -2478,7 +2500,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -2683,7 +2704,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -2857,7 +2877,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -3056,7 +3075,6 @@ mod tests { Duration::from_secs(10), Duration::from_secs(10), Duration::from_mins(60), - mocks::application::Certifier::Always, ) .await; @@ -3215,7 +3233,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -3508,13 +3525,13 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Custom(Box::new(move |d| { - tracker.lock().push(d); - true - })), }; - let (actor, application) = + let (mut actor, application) = 
mocks::application::Application::new(context.with_label("app"), app_cfg); + actor.set_certifier(mocks::application::Certifier::Custom(Box::new(move |_, d| { + tracker.lock().push(d); + true + }))); actor.start(); let voter_cfg = Config { @@ -3644,13 +3661,13 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Custom(Box::new(move |d| { - tracker.lock().push(d); - true - })), }; - let (actor, application) = + let (mut actor, application) = mocks::application::Application::new(context.with_label("app2"), app_cfg); + actor.set_certifier(mocks::application::Certifier::Custom(Box::new(move |_, d| { + tracker.lock().push(d); + true + }))); actor.start(); let voter_cfg = Config { @@ -3782,7 +3799,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -3946,7 +3962,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -4039,7 +4054,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new( context.with_label("app_restarted"), @@ -4208,7 +4222,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -4314,7 +4327,6 @@ mod tests { propose_latency: (1.0, 0.0), 
verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new( context.with_label("app_restarted"), @@ -4483,7 +4495,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -4660,7 +4671,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -4752,7 +4762,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new( context.with_label("app_restarted"), @@ -4916,7 +4925,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (2_000.0, 0.0), // 2 seconds - should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5110,7 +5118,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (2_000.0, 0.0), // 2 seconds - should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5272,7 +5279,6 @@ mod tests { Duration::from_secs(5), Duration::from_secs(5), Duration::from_secs(5), - mocks::application::Certifier::Always, ) .await; @@ -5385,7 +5391,6 @@ mod tests { Duration::from_secs(10), Duration::from_secs(10), Duration::from_secs(100), - mocks::application::Certifier::Always, ) 
.await; @@ -5507,7 +5512,6 @@ mod tests { Duration::from_secs(10), Duration::from_secs(10), Duration::from_secs(100), - mocks::application::Certifier::Always, ) .await; @@ -5662,7 +5666,6 @@ mod tests { Duration::from_secs(10), Duration::from_secs(10), Duration::from_secs(100), - mocks::application::Certifier::Always, ) .await; @@ -5801,7 +5804,7 @@ mod tests { let elector = RoundRobin::::default(); // Set up voter with Certifier::Cancel - let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( &mut context, &oracle, &participants, @@ -5960,10 +5963,10 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Cancel, }; - let (app_actor, application) = + let (mut app_actor, application) = mocks::application::Application::new(context.with_label("app_cancel"), app_cfg); + app_actor.set_certifier(mocks::application::Certifier::Cancel); app_actor.start(); let voter_cfg = Config { @@ -6073,7 +6076,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (2_000.0, 0.0), // 2 seconds - should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); @@ -6220,7 +6222,7 @@ mod tests { // Setup voter with Certifier::Cancel to simulate missing verification context. 
let elector = RoundRobin::::default(); - let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( &mut context, &oracle, &participants, @@ -6429,7 +6431,7 @@ mod tests { // Set up voter with Certifier::Custom that always returns false // This simulates coding marshal's deferred_verify finding context mismatch - let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( &mut context, &oracle, &participants, @@ -6438,7 +6440,7 @@ mod tests { Duration::from_secs(100), // Long timeout to prove nullify comes from cert failure Duration::from_secs(100), Duration::from_secs(100), - mocks::application::Certifier::Custom(Box::new(|_| false)), + mocks::application::Certifier::Custom(Box::new(|_, _| false)), ) .await; @@ -6562,7 +6564,7 @@ mod tests { let elector = RoundRobin::::default(); // Set up voter with Certifier::Pending (certify hangs indefinitely). 
- let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( &mut context, &oracle, &participants, @@ -6701,7 +6703,6 @@ mod tests { Duration::from_secs(1), Duration::from_secs(5), Duration::from_mins(60), - mocks::application::Certifier::Always, ) .await; @@ -6836,7 +6837,7 @@ mod tests { .await; let elector = RoundRobin::::default(); - let (mut mailbox, mut batcher_receiver, _, _, _) = setup_voter( + let (mut mailbox, mut batcher_receiver, _, _, _) = setup_voter_with_certifier( &mut context, &oracle, &participants, @@ -6976,7 +6977,6 @@ mod tests { Duration::from_secs(1), Duration::from_secs(5), Duration::from_mins(60), - mocks::application::Certifier::Always, ) .await; @@ -7131,7 +7131,6 @@ mod tests { Duration::from_secs(1), Duration::from_secs(5), Duration::from_mins(60), - mocks::application::Certifier::Always, ) .await; diff --git a/consensus/src/simplex/mocks/application.rs b/consensus/src/simplex/mocks/application.rs index c709e8551db..eebddc5a3eb 100644 --- a/consensus/src/simplex/mocks/application.rs +++ b/consensus/src/simplex/mocks/application.rs @@ -141,14 +141,17 @@ type CertifyObserver = Box::Digest) + Send + 'sta /// Predicate to determine whether a payload should be certified. /// Returning true means certify, false means reject. +/// +/// Honest applications in an honest cluster always certify: a leader has the +/// certify record for its own proposal (it just built it) and an honest +/// follower has the context it verified against. The non-`Always` variants +/// model realistic runtime failure modes (missing context, late data) or are +/// used as test escape hatches. pub enum Certifier { - /// Always certify. + /// Always certify. Default for honest applications. Always, - /// Certify sometimes, but not always. The behavior is to certify pseudorandomly - /// (but deterministically) 82% of the time, depending on the last byte of the payload. 
- Sometimes, - /// A custom predicate function. - Custom(Box bool + Send + 'static>), + /// A custom predicate function that receives the round and payload digest. + Custom(Box bool + Send + 'static>), /// Drop the sender without responding, causing the receiver to be cancelled. /// This simulates scenarios where the automaton cannot determine certification /// (e.g., missing verification context in Marshaled). @@ -173,10 +176,6 @@ pub struct Config { pub propose_latency: Latency, pub verify_latency: Latency, pub certify_latency: Latency, - - /// Predicate to determine whether a payload should be certified. - /// Returning true means certify, false means reject. - pub should_certify: Certifier, } pub struct Application { @@ -251,7 +250,7 @@ impl Application fail_verification: false, drop_proposals: false, drop_verifications: false, - should_certify: cfg.should_certify, + should_certify: Certifier::Always, pending: HashMap::new(), verified: HashSet::new(), @@ -288,6 +287,14 @@ impl Application self.certify_observer = Some(observer); } + /// Override the certifier used by this application. Must be called before + /// [`start`]. Honest applications default to [`Certifier::Always`]; tests + /// that need to model missing context (`Cancel`), a hanging certify + /// (`Pending`), or a custom predicate set it here. + pub fn set_certifier(&mut self, certifier: Certifier) { + self.should_certify = certifier; + } + #[cfg(not(feature = "mocks"))] fn panic(&self, msg: &str) -> ! 
{ panic!("[{:?}] {}", self.me, msg); @@ -381,7 +388,7 @@ impl Application true } - async fn certify(&mut self, payload: H::Digest, _contents: Bytes) -> Option { + async fn certify(&mut self, round: Round, payload: H::Digest, _contents: Bytes) -> Option { // Simulate the certify latency let duration = self.certify_latency.sample(&mut self.context); self.context @@ -391,8 +398,7 @@ impl Application // Use configured predicate to determine certification match &self.should_certify { Certifier::Always => Some(true), - Certifier::Sometimes => Some((payload.as_ref().last().copied().unwrap_or(0) % 11) < 9), - Certifier::Custom(func) => Some(func(payload)), + Certifier::Custom(func) => Some(func(round, payload)), Certifier::Cancel | Certifier::Pending => None, } } @@ -467,7 +473,7 @@ impl Application observer(round, payload); } let contents = seen.get(&payload).cloned().unwrap_or_default(); - if let Some(certified) = self.certify(payload, contents).await { + if let Some(certified) = self.certify(round, payload, contents).await { response.send_lossy(certified); } else if matches!(self.should_certify, Certifier::Pending) { // Hold the sender alive so the receiver never resolves. 
diff --git a/consensus/src/simplex/mod.rs b/consensus/src/simplex/mod.rs index 39a116577ba..8da3bf18730 100644 --- a/consensus/src/simplex/mod.rs +++ b/consensus/src/simplex/mod.rs @@ -394,7 +394,7 @@ mod tests { use super::*; use crate::{ simplex::{ - elector::{Config as Elector, Random, RoundRobin}, + elector::{Config as Elector, Elector as ElectorTrait, Random, RoundRobin}, mocks::{ scheme as scheme_mocks, twins::{self, Elector as TwinsElector}, @@ -414,7 +414,7 @@ mod tests { Nullification as TNullification, Nullify as TNullify, Proposal, Vote, }, }, - types::{Epoch, Round}, + types::{Epoch, Participant, Round}, Monitor, Viewable, }; use commonware_codec::{Decode, DecodeExt, Encode}; @@ -770,8 +770,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -951,6 +950,145 @@ mod tests { all_online::<_, _, RoundRobin>(secp256r1::fixture); } + /// Integration test where a dishonest leader (validator 0) proposes + /// payloads that all honest peers refuse to certify (simulating a leader + /// that withholds shard data needed for block reconstruction). + /// + /// All n validators use the honest Application, but every peer's certifier + /// rejects proposals from views where validator 0 is the elected leader. + /// When validator 0 IS the leader, it short-circuits certification locally + /// (it built the proposal) and votes finalize, but every other peer + /// rejects via the Custom predicate and nullifies. The lone finalize vote + /// cannot form a certificate (quorum=4). The nullification cert (4 honest + /// peers) advances everyone. + /// + /// When an honest validator leads, all peers (including validator 0) + /// certify normally and finalize. 
The cluster makes progress on honest + /// leader views and nullifies dishonest leader views. + fn dishonest_leader_certification_rejected(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + RoundRobin: Elector, + { + let n = 5; + let required_containers = View::new(50); + let activity_timeout = ViewDelta::new(10); + let skip_timeout = ViewDelta::new(5); + let namespace = b"consensus".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(300)); + executor.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = fixture(&mut context, &namespace, n); + let mut oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + let mut registrations = register_validators(&mut oracle, &participants).await; + + let link = Link { + latency: Duration::from_millis(10), + jitter: Duration::from_millis(1), + success_rate: 1.0, + }; + link_validators(&mut oracle, &participants, Action::Link(link), None).await; + + let elector = RoundRobin::default(); + let participants_set: Set = participants.clone().try_into().unwrap(); + let built_elector = elector.clone().build(&participants_set); + let relay = Arc::new(mocks::relay::Relay::new()); + let mut reporters = Vec::new(); + let mut engine_handlers = Vec::new(); + let dishonest = Participant::new(0); + for (idx, validator) in participants.iter().enumerate() { + let context = context.with_label(&format!("validator_{}", *validator)); + let reporter_config = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[idx].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_config); + reporters.push(reporter.clone()); + + let application_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: validator.clone(), + propose_latency: (10.0, 5.0), + 
verify_latency: (10.0, 5.0), + certify_latency: (10.0, 5.0), + }; + let (mut actor, application) = mocks::application::Application::new( + context.with_label("application"), + application_cfg, + ); + + // Every peer rejects certification for views led by the + // dishonest validator (simulates a leader that withheld + // the shard data honest peers need for reconstruction). + let built_elector_clone = built_elector.clone(); + actor.set_certifier(mocks::application::Certifier::Custom(Box::new( + move |round, _| built_elector_clone.elect(round, None) != dishonest, + ))); + actor.start(); + + let blocker = oracle.control(validator.clone()); + let cfg = config::Config { + scheme: schemes[idx].clone(), + elector: elector.clone(), + blocker, + automaton: application.clone(), + relay: application.clone(), + reporter: reporter.clone(), + strategy: Sequential, + partition: validator.to_string(), + mailbox_size: 1024, + epoch: Epoch::new(333), + leader_timeout: Duration::from_secs(1), + certification_timeout: Duration::from_secs(2), + timeout_retry: Duration::from_secs(10), + fetch_timeout: Duration::from_secs(1), + activity_timeout, + skip_timeout, + fetch_concurrent: 4, + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + forwarding: ForwardingPolicy::Disabled, + }; + let engine = Engine::new(context.with_label("engine"), cfg); + let (pending, recovered, resolver) = registrations + .remove(validator) + .expect("validator should be registered"); + engine_handlers.push(engine.start(pending, recovered, resolver)); + } + + let mut finalizers = Vec::new(); + for reporter in reporters.iter_mut() { + let (mut latest, mut monitor) = reporter.subscribe().await; + finalizers.push(context.with_label("finalizer").spawn(move |_| async move { + while latest < required_containers { + latest = monitor.recv().await.expect("event missing"); + } + })); + } + join_all(finalizers).await; + + 
for reporter in reporters.iter() { + reporter.assert_no_faults(); + reporter.assert_no_invalid(); + } + }); + } + + #[test_group("slow")] + #[test_traced] + fn test_dishonest_leader_certification_rejected() { + dishonest_leader_certification_rejected::<_, _>(ed25519::fixture); + } + fn observer(mut fixture: F) where S: Scheme, @@ -1031,8 +1169,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -1187,7 +1324,6 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1368,8 +1504,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -1489,7 +1624,6 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1623,8 +1757,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -1844,8 +1977,7 @@ mod tests { propose_latency: (10_000.0, 0.0), verify_latency: (10_000.0, 5.0), certify_latency: (10_000.0, 5.0), - should_certify: 
mocks::application::Certifier::Sometimes, - } + } } else { mocks::application::Config { hasher: Sha256::default(), @@ -1854,8 +1986,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, - } + } }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2020,8 +2151,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -2221,8 +2351,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -2418,8 +2547,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -2670,7 +2798,6 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2846,8 +2973,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -3014,8 +3140,7 @@ mod tests { propose_latency: (10.0, 5.0), 
verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -3206,7 +3331,6 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3367,7 +3491,6 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3458,7 +3581,6 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3682,7 +3804,6 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3835,7 +3956,6 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4005,7 +4125,6 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4141,8 +4260,7 @@ mod tests { 
propose_latency: (100.0, 50.0), verify_latency: (50.0, 40.0), certify_latency: (50.0, 40.0), - should_certify: mocks::application::Certifier::Sometimes, - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -4304,7 +4422,6 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4525,8 +4642,7 @@ mod tests { certify_latency: (10.0, 5.0), // This test only exercises reporter filtering. Keep certification // uniform so leader-owned views do not diverge from follower views. - should_certify: mocks::application::Certifier::Sometimes, - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -4878,7 +4994,6 @@ mod tests { propose_latency: (250.0, 50.0), // ensure we process certificates first verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label(&format!("application_{}", *validator)), @@ -5083,8 +5198,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -5223,8 +5337,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -5319,8 +5432,7 @@ mod tests { propose_latency: (10.0, 5.0), 
verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -5812,8 +5924,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -5881,7 +5992,6 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: mocks::application::Certifier::Sometimes, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), From 0095e38d1fe59273ba324c61b7e0187c8140b6f6 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 13 Apr 2026 23:19:51 -0700 Subject: [PATCH 014/107] fix flaky test --- runtime/src/lib.rs | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/runtime/src/lib.rs b/runtime/src/lib.rs index 7760597a9b3..c02a2d40f9c 100644 --- a/runtime/src/lib.rs +++ b/runtime/src/lib.rs @@ -2501,7 +2501,19 @@ mod tests { assert_eq!(handle.await.expect("task failed"), expected as u64); } - let buffer = context.encode(); + // handle.await resolves when the task's output is ready, but + // the running-gauge decrement fires on task-struct drop which + // may lag slightly. Retry encode() to let the runtime finish + // cleanup. + let running_value = "runtime_tasks_running{name=\"deferred_verify\",kind=\"Task\",execution=\"Shared\"} 0"; + let mut buffer = context.encode(); + for _ in 0..50 { + if buffer.contains(running_value) { + break; + } + std::thread::sleep(std::time::Duration::from_millis(10)); + buffer = context.encode(); + } // Count occurrences of each runtime task metric for our label. 
If // attributes were incorrectly folded into the task family key, we @@ -2537,7 +2549,6 @@ mod tests { buffer.contains(&spawned_value), "expected accumulated spawned counter `{spawned_value}`, got: {buffer}", ); - let running_value = "runtime_tasks_running{name=\"deferred_verify\",kind=\"Task\",execution=\"Shared\"} 0"; assert!( buffer.contains(running_value), "expected running gauge to return to 0, got: {buffer}", @@ -2596,7 +2607,16 @@ mod tests { assert_eq!(handle.await.expect("task failed"), expected as u64); } - let buffer = context.encode(); + // Same race as above: gauge decrement lags handle completion. + let running_value = "runtime_tasks_running{name=\"deferred_verify\",kind=\"Task\",execution=\"Shared\"} 0"; + let mut buffer = context.encode(); + for _ in 0..50 { + if buffer.contains(running_value) { + break; + } + std::thread::sleep(std::time::Duration::from_millis(10)); + buffer = context.encode(); + } let spawned_lines = buffer .lines() @@ -2628,7 +2648,6 @@ mod tests { buffer.contains(&spawned_value), "expected accumulated spawned counter `{spawned_value}`, got: {buffer}", ); - let running_value = "runtime_tasks_running{name=\"deferred_verify\",kind=\"Task\",execution=\"Shared\"} 0"; assert!( buffer.contains(running_value), "expected running gauge to return to 0, got: {buffer}", From 98cc1c71ff28a56f306c8dd0e9ad86e528a63e4b Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 13 Apr 2026 23:21:26 -0700 Subject: [PATCH 015/107] fmt --- consensus/fuzz/src/lib.rs | 2 +- consensus/src/simplex/actors/voter/mod.rs | 20 ++++++++----- consensus/src/simplex/mocks/application.rs | 7 ++++- consensus/src/simplex/mod.rs | 34 +++++++++++----------- 4 files changed, 36 insertions(+), 27 deletions(-) diff --git a/consensus/fuzz/src/lib.rs b/consensus/fuzz/src/lib.rs index 3646fa05771..6eabfa4f509 100644 --- a/consensus/fuzz/src/lib.rs +++ b/consensus/fuzz/src/lib.rs @@ -608,7 +608,7 @@ fn run_with_twin_mutator(input: FuzzInput) { propose_latency: (10.0, 5.0), 
verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - }; + }; let (actor, application) = application::Application::new(primary_context.with_label("application"), app_cfg); actor.start(); diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index ccf90b1501e..18c535b6d48 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -3528,10 +3528,12 @@ mod tests { }; let (mut actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); - actor.set_certifier(mocks::application::Certifier::Custom(Box::new(move |_, d| { - tracker.lock().push(d); - true - }))); + actor.set_certifier(mocks::application::Certifier::Custom(Box::new( + move |_, d| { + tracker.lock().push(d); + true + }, + ))); actor.start(); let voter_cfg = Config { @@ -3664,10 +3666,12 @@ mod tests { }; let (mut actor, application) = mocks::application::Application::new(context.with_label("app2"), app_cfg); - actor.set_certifier(mocks::application::Certifier::Custom(Box::new(move |_, d| { - tracker.lock().push(d); - true - }))); + actor.set_certifier(mocks::application::Certifier::Custom(Box::new( + move |_, d| { + tracker.lock().push(d); + true + }, + ))); actor.start(); let voter_cfg = Config { diff --git a/consensus/src/simplex/mocks/application.rs b/consensus/src/simplex/mocks/application.rs index eebddc5a3eb..9386d72f08c 100644 --- a/consensus/src/simplex/mocks/application.rs +++ b/consensus/src/simplex/mocks/application.rs @@ -388,7 +388,12 @@ impl Application true } - async fn certify(&mut self, round: Round, payload: H::Digest, _contents: Bytes) -> Option { + async fn certify( + &mut self, + round: Round, + payload: H::Digest, + _contents: Bytes, + ) -> Option { // Simulate the certify latency let duration = self.certify_latency.sample(&mut self.context); self.context diff --git a/consensus/src/simplex/mod.rs b/consensus/src/simplex/mod.rs index 
8da3bf18730..941392f9508 100644 --- a/consensus/src/simplex/mod.rs +++ b/consensus/src/simplex/mod.rs @@ -770,7 +770,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -1169,7 +1169,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -1504,7 +1504,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -1757,7 +1757,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -1977,7 +1977,7 @@ mod tests { propose_latency: (10_000.0, 0.0), verify_latency: (10_000.0, 5.0), certify_latency: (10_000.0, 5.0), - } + } } else { mocks::application::Config { hasher: Sha256::default(), @@ -1986,7 +1986,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - } + } }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2151,7 +2151,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -2351,7 +2351,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - }; + }; let (actor, application) = mocks::application::Application::new( 
context.with_label("application"), application_cfg, @@ -2547,7 +2547,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -2973,7 +2973,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -3140,7 +3140,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -4260,7 +4260,7 @@ mod tests { propose_latency: (100.0, 50.0), verify_latency: (50.0, 40.0), certify_latency: (50.0, 40.0), - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -4642,7 +4642,7 @@ mod tests { certify_latency: (10.0, 5.0), // This test only exercises reporter filtering. Keep certification // uniform so leader-owned views do not diverge from follower views. 
- }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -5198,7 +5198,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -5337,7 +5337,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -5432,7 +5432,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, @@ -5924,7 +5924,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - }; + }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, From 9705ef0758ffa41f89d15288913fb1ef7f10b778 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 13 Apr 2026 23:26:40 -0700 Subject: [PATCH 016/107] nits --- consensus/src/simplex/actors/voter/actor.rs | 6 +----- consensus/src/simplex/actors/voter/state.rs | 23 ++++++++++++++------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/consensus/src/simplex/actors/voter/actor.rs b/consensus/src/simplex/actors/voter/actor.rs index f595d82c617..46192a02547 100644 --- a/consensus/src/simplex/actors/voter/actor.rs +++ b/consensus/src/simplex/actors/voter/actor.rs @@ -854,14 +854,10 @@ impl< } // Attempt to certify any views that we have notarizations for. 
- for proposal in self.state.certify_candidates() { + for (proposal, leader_is_local) in self.state.certify_candidates() { let round = proposal.round; let view = round.view(); debug!(%view, "attempting certification"); - let leader_is_local = self - .state - .leader_index(view) - .is_some_and(|leader| self.state.is_me(leader)); let result = if leader_is_local { // Once we know the local participant led this view, reaching out to the // automaton is unnecessary and creates duplicate work. diff --git a/consensus/src/simplex/actors/voter/state.rs b/consensus/src/simplex/actors/voter/state.rs index d13c1d416f7..e390b299c75 100644 --- a/consensus/src/simplex/actors/voter/state.rs +++ b/consensus/src/simplex/actors/voter/state.rs @@ -584,8 +584,10 @@ impl, L: ElectorConfig, D: D self.outstanding_certifications.insert(view); } - /// Takes all certification candidates and returns proposals ready for certification. - pub fn certify_candidates(&mut self) -> Vec> { + /// Takes all certification candidates and returns proposals ready for + /// certification, along with whether the local participant is the leader + /// of each view (used to short-circuit certification for own proposals). 
+ pub fn certify_candidates(&mut self) -> Vec<(Proposal, bool)> { let candidates = take(&mut self.certification_candidates); candidates .into_iter() @@ -593,7 +595,12 @@ impl, L: ElectorConfig, D: D if view <= self.last_finalized { return None; } - self.views.get_mut(&view)?.try_certify() + let round = self.views.get_mut(&view)?; + let leader_is_local = round + .leader() + .is_some_and(|leader| self.scheme.me().is_some_and(|me| me == leader.idx)); + let proposal = round.try_certify()?; + Some((proposal, leader_is_local)) }) .collect() } @@ -1981,7 +1988,7 @@ mod tests { state.add_notarization(make_notarization(View::new(9))); let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); - assert_eq!(candidates[0].round.view(), View::new(9)); + assert_eq!(candidates[0].0.round.view(), View::new(9)); // Set handle for view 9, add view 10 let handle9 = pool.push(futures::future::pending()); @@ -1991,7 +1998,7 @@ mod tests { // View 10 returned (view 9 has handle) let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); - assert_eq!(candidates[0].round.view(), View::new(10)); + assert_eq!(candidates[0].0.round.view(), View::new(10)); // Finalize view 9 - aborts view 9's handle state.add_finalization(make_finalization(View::new(9))); @@ -2001,7 +2008,7 @@ mod tests { state.add_notarization(make_notarization(View::new(11))); let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); - assert_eq!(candidates[0].round.view(), View::new(11)); + assert_eq!(candidates[0].0.round.view(), View::new(11)); }); } @@ -2056,7 +2063,7 @@ mod tests { let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); - assert_eq!(candidates[0].round.view(), view); + assert_eq!(candidates[0].0.round.view(), view); }); } @@ -2100,7 +2107,7 @@ mod tests { let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); - assert_eq!(candidates[0].round.view(), view); + assert_eq!(candidates[0].0.round.view(), 
view); let mut pool = AbortablePool::<()>::default(); let handle = pool.push(futures::future::pending()); From eca119818f704b7cf67ee5412b921c9109427a0d Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 13 Apr 2026 23:30:30 -0700 Subject: [PATCH 017/107] fix running --- runtime/src/lib.rs | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/runtime/src/lib.rs b/runtime/src/lib.rs index c02a2d40f9c..09d54c172f4 100644 --- a/runtime/src/lib.rs +++ b/runtime/src/lib.rs @@ -2484,7 +2484,7 @@ mod tests { /// `runtime_tasks_spawned` / `runtime_tasks_running`. fn test_metrics_spawn_attribute_cardinality(runner: R) where - R::Context: Spawner + Metrics, + R::Context: Clock + Spawner + Metrics, { runner.start(|context| async move { const ROUNDS: u64 = 128; @@ -2503,16 +2503,15 @@ mod tests { // handle.await resolves when the task's output is ready, but // the running-gauge decrement fires on task-struct drop which - // may lag slightly. Retry encode() to let the runtime finish - // cleanup. + // may lag slightly. Loop until the runtime finishes cleanup. let running_value = "runtime_tasks_running{name=\"deferred_verify\",kind=\"Task\",execution=\"Shared\"} 0"; - let mut buffer = context.encode(); - for _ in 0..50 { + let mut buffer; + loop { + buffer = context.encode(); if buffer.contains(running_value) { break; } - std::thread::sleep(std::time::Duration::from_millis(10)); - buffer = context.encode(); + context.sleep(Duration::from_millis(10)).await; } // Count occurrences of each runtime task metric for our label. If @@ -2549,10 +2548,6 @@ mod tests { buffer.contains(&spawned_value), "expected accumulated spawned counter `{spawned_value}`, got: {buffer}", ); - assert!( - buffer.contains(running_value), - "expected running gauge to return to 0, got: {buffer}", - ); // The per-round attribute must not surface on task metrics (the // task `Label` does not include context attributes). 
@@ -2588,7 +2583,7 @@ mod tests { /// those entries nor create additional ones. fn test_metrics_spawn_scope_cardinality(runner: R) where - R::Context: Spawner + Metrics, + R::Context: Clock + Spawner + Metrics, { runner.start(|context| async move { const ROUNDS: u64 = 128; @@ -2609,13 +2604,13 @@ mod tests { // Same race as above: gauge decrement lags handle completion. let running_value = "runtime_tasks_running{name=\"deferred_verify\",kind=\"Task\",execution=\"Shared\"} 0"; - let mut buffer = context.encode(); - for _ in 0..50 { + let mut buffer; + loop { + buffer = context.encode(); if buffer.contains(running_value) { break; } - std::thread::sleep(std::time::Duration::from_millis(10)); - buffer = context.encode(); + context.sleep(Duration::from_millis(10)).await; } let spawned_lines = buffer @@ -2648,10 +2643,6 @@ mod tests { buffer.contains(&spawned_value), "expected accumulated spawned counter `{spawned_value}`, got: {buffer}", ); - assert!( - buffer.contains(running_value), - "expected running gauge to return to 0, got: {buffer}", - ); }); } From 7c6b409688468ae98f46968942fe9f0e1f366323 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 13 Apr 2026 23:32:01 -0700 Subject: [PATCH 018/107] nit --- runtime/src/lib.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/runtime/src/lib.rs b/runtime/src/lib.rs index 09d54c172f4..036891b7964 100644 --- a/runtime/src/lib.rs +++ b/runtime/src/lib.rs @@ -2602,7 +2602,9 @@ mod tests { assert_eq!(handle.await.expect("task failed"), expected as u64); } - // Same race as above: gauge decrement lags handle completion. + // handle.await resolves when the task's output is ready, but + // the running-gauge decrement fires on task-struct drop which + // may lag slightly. Loop until the runtime finishes cleanup. 
let running_value = "runtime_tasks_running{name=\"deferred_verify\",kind=\"Task\",execution=\"Shared\"} 0"; let mut buffer; loop { From 1ac6d3f0b7d80cc348f7ed0eb6cbe412e64f0c38 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 13 Apr 2026 23:35:42 -0700 Subject: [PATCH 019/107] nits --- consensus/src/simplex/actors/voter/mod.rs | 18 ++++++++++++------ consensus/src/simplex/mocks/application.rs | 18 ------------------ 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index 18c535b6d48..d2c6f044982 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -4502,9 +4502,12 @@ mod tests { }; let (mut app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); - app_actor.set_certify_observer(Box::new(move |round, _| { - certify_tracker.lock().push(round.view()) - })); + app_actor.set_certifier(mocks::application::Certifier::Custom(Box::new( + move |round, _| { + certify_tracker.lock().push(round.view()); + true + }, + ))); app_actor.start(); // Build and start the voter wired to the observing application. @@ -4771,9 +4774,12 @@ mod tests { context.with_label("app_restarted"), app_cfg, ); - app_actor.set_certify_observer(Box::new(move |round, _| { - certify_tracker.lock().push(round.view()) - })); + app_actor.set_certifier(mocks::application::Certifier::Custom(Box::new( + move |round, _| { + certify_tracker.lock().push(round.view()); + true + }, + ))); app_actor.start(); // Build and start the post-restart voter against the same journal partition. 
diff --git a/consensus/src/simplex/mocks/application.rs b/consensus/src/simplex/mocks/application.rs index 9386d72f08c..a88ca0f6f2e 100644 --- a/consensus/src/simplex/mocks/application.rs +++ b/consensus/src/simplex/mocks/application.rs @@ -134,11 +134,6 @@ type ProposeObserver = Box::Digest, P>) + Sen type VerifyObserver = Box::Digest, P>, ::Digest) + Send + 'static>; -/// Observer invoked on every `Message::Certify` request. Used by tests to -/// detect spurious certification calls (e.g. a leader being asked to certify -/// its own proposal). -type CertifyObserver = Box::Digest) + Send + 'static>; - /// Predicate to determine whether a payload should be certified. /// Returning true means certify, false means reject. /// @@ -210,11 +205,6 @@ pub struct Application { /// of a leader-owned proposal). verify_observer: Option>, - /// Invoked on every `Message::Certify` request received by the application. - /// Used by tests to detect spurious certification requests (e.g. a leader - /// being asked to certify its own proposal). - certify_observer: Option>, - /// Senders held alive to simulate certifications that hang indefinitely /// (used by [`Certifier::Pending`]). pending_certifications: Vec>, @@ -256,7 +246,6 @@ impl Application verified: HashSet::new(), propose_observer: None, verify_observer: None, - certify_observer: None, pending_certifications: Vec::new(), }, Mailbox::new(sender), @@ -283,10 +272,6 @@ impl Application self.verify_observer = Some(observer); } - pub fn set_certify_observer(&mut self, observer: CertifyObserver) { - self.certify_observer = Some(observer); - } - /// Override the certifier used by this application. Must be called before /// [`start`]. 
Honest applications default to [`Certifier::Always`]; tests /// that need to model missing context (`Cancel`), a hanging certify @@ -474,9 +459,6 @@ impl Application payload, response, } => { - if let Some(observer) = &self.certify_observer { - observer(round, payload); - } let contents = seen.get(&payload).cloned().unwrap_or_default(); if let Some(certified) = self.certify(round, payload, contents).await { response.send_lossy(certified); From a4cd3b876c17d5e265f997ba1a28f835367dc1b4 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 13 Apr 2026 23:36:28 -0700 Subject: [PATCH 020/107] nit --- consensus/src/simplex/actors/voter/actor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/consensus/src/simplex/actors/voter/actor.rs b/consensus/src/simplex/actors/voter/actor.rs index 46192a02547..12bb1b4780b 100644 --- a/consensus/src/simplex/actors/voter/actor.rs +++ b/consensus/src/simplex/actors/voter/actor.rs @@ -860,7 +860,7 @@ impl< debug!(%view, "attempting certification"); let result = if leader_is_local { // Once we know the local participant led this view, reaching out to the - // automaton is unnecessary and creates duplicate work. + // automaton is unnecessary. 
Either::Left(ready(Ok(true))) } else { let receiver = self.automaton.certify(round, proposal.payload).await; From c006297b455de1a3720f61126ea575549e1db509 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 13 Apr 2026 23:46:27 -0700 Subject: [PATCH 021/107] nit --- consensus/fuzz/src/lib.rs | 2 + consensus/src/simplex/actors/voter/mod.rs | 80 +++++++++++++--------- consensus/src/simplex/mocks/application.rs | 18 ++--- consensus/src/simplex/mod.rs | 43 +++++++++--- 4 files changed, 90 insertions(+), 53 deletions(-) diff --git a/consensus/fuzz/src/lib.rs b/consensus/fuzz/src/lib.rs index 6eabfa4f509..450b888f877 100644 --- a/consensus/fuzz/src/lib.rs +++ b/consensus/fuzz/src/lib.rs @@ -375,6 +375,7 @@ where propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: application::Certifier::Always, }; let (actor, application) = application::Application::new(context.with_label("application"), app_cfg); @@ -608,6 +609,7 @@ fn run_with_twin_mutator(input: FuzzInput) { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: application::Certifier::Always, }; let (actor, application) = application::Application::new(primary_context.with_label("application"), app_cfg); diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index d2c6f044982..c5d44d19583 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -244,10 +244,10 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier, }; - let (mut actor, application) = + let (actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); - actor.set_certifier(certifier); actor.start(); let voter_cfg = Config { @@ -386,6 +386,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: 
mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -619,6 +620,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), app_config); @@ -1267,6 +1269,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (100_000.0, 0.0), // Very slow verification certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -1445,6 +1448,7 @@ mod tests { propose_latency: (50.0, 10.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -1652,6 +1656,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -1881,6 +1886,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -2283,6 +2289,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (10.0, 0.0), // 10ms verification latency certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (mut actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -2500,6 +2507,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), 
certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -2704,6 +2712,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -2877,6 +2886,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -3233,6 +3243,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -3525,15 +3536,13 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - }; - let (mut actor, application) = - mocks::application::Application::new(context.with_label("app"), app_cfg); - actor.set_certifier(mocks::application::Certifier::Custom(Box::new( - move |_, d| { + certifier: mocks::application::Certifier::Custom(Box::new(move |_, d| { tracker.lock().push(d); true - }, - ))); + })), + }; + let (actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); actor.start(); let voter_cfg = Config { @@ -3663,15 +3672,13 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - }; - let (mut actor, application) = - mocks::application::Application::new(context.with_label("app2"), app_cfg); - actor.set_certifier(mocks::application::Certifier::Custom(Box::new( - move |_, d| { + certifier: 
mocks::application::Certifier::Custom(Box::new(move |_, d| { tracker.lock().push(d); true - }, - ))); + })), + }; + let (actor, application) = + mocks::application::Application::new(context.with_label("app2"), app_cfg); actor.start(); let voter_cfg = Config { @@ -3803,6 +3810,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -3966,6 +3974,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -4058,6 +4067,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new( context.with_label("app_restarted"), @@ -4226,6 +4236,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -4331,6 +4342,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new( context.with_label("app_restarted"), @@ -4499,15 +4511,15 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Custom(Box::new( + move |round, _| { + certify_tracker.lock().push(round.view()); + true + }, + )), }; - let (mut app_actor, application) = + let (app_actor, 
application) = mocks::application::Application::new(context.with_label("app"), app_cfg); - app_actor.set_certifier(mocks::application::Certifier::Custom(Box::new( - move |round, _| { - certify_tracker.lock().push(round.view()); - true - }, - ))); app_actor.start(); // Build and start the voter wired to the observing application. @@ -4678,6 +4690,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -4769,17 +4782,17 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Custom(Box::new( + move |round, _| { + certify_tracker.lock().push(round.view()); + true + }, + )), }; - let (mut app_actor, application) = mocks::application::Application::new( + let (app_actor, application) = mocks::application::Application::new( context.with_label("app_restarted"), app_cfg, ); - app_actor.set_certifier(mocks::application::Certifier::Custom(Box::new( - move |round, _| { - certify_tracker.lock().push(round.view()); - true - }, - ))); app_actor.start(); // Build and start the post-restart voter against the same journal partition. 
@@ -4935,6 +4948,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (2_000.0, 0.0), // 2 seconds + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5128,6 +5142,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (2_000.0, 0.0), // 2 seconds + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5973,10 +5988,10 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Cancel, }; - let (mut app_actor, application) = + let (app_actor, application) = mocks::application::Application::new(context.with_label("app_cancel"), app_cfg); - app_actor.set_certifier(mocks::application::Certifier::Cancel); app_actor.start(); let voter_cfg = Config { @@ -6086,6 +6101,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (2_000.0, 0.0), // 2 seconds + certifier: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); diff --git a/consensus/src/simplex/mocks/application.rs b/consensus/src/simplex/mocks/application.rs index a88ca0f6f2e..0c28ec08374 100644 --- a/consensus/src/simplex/mocks/application.rs +++ b/consensus/src/simplex/mocks/application.rs @@ -171,6 +171,8 @@ pub struct Config { pub propose_latency: Latency, pub verify_latency: Latency, pub certify_latency: Latency, + + pub certifier: Certifier, } pub struct Application { @@ -190,7 +192,7 @@ pub struct Application { fail_verification: bool, drop_proposals: bool, drop_verifications: bool, - should_certify: Certifier, + certifier: Certifier, pending: HashMap, @@ -240,7 +242,7 @@ impl Application fail_verification: 
false, drop_proposals: false, drop_verifications: false, - should_certify: Certifier::Always, + certifier: cfg.certifier, pending: HashMap::new(), verified: HashSet::new(), @@ -272,14 +274,6 @@ impl Application self.verify_observer = Some(observer); } - /// Override the certifier used by this application. Must be called before - /// [`start`]. Honest applications default to [`Certifier::Always`]; tests - /// that need to model missing context (`Cancel`), a hanging certify - /// (`Pending`), or a custom predicate set it here. - pub fn set_certifier(&mut self, certifier: Certifier) { - self.should_certify = certifier; - } - #[cfg(not(feature = "mocks"))] fn panic(&self, msg: &str) -> ! { panic!("[{:?}] {}", self.me, msg); @@ -386,7 +380,7 @@ impl Application .await; // Use configured predicate to determine certification - match &self.should_certify { + match &self.certifier { Certifier::Always => Some(true), Certifier::Custom(func) => Some(func(round, payload)), Certifier::Cancel | Certifier::Pending => None, @@ -462,7 +456,7 @@ impl Application let contents = seen.get(&payload).cloned().unwrap_or_default(); if let Some(certified) = self.certify(round, payload, contents).await { response.send_lossy(certified); - } else if matches!(self.should_certify, Certifier::Pending) { + } else if matches!(self.certifier, Certifier::Pending) { // Hold the sender alive so the receiver never resolves. // This simulates a certify that hangs indefinitely (e.g., // block never arrives for reconstruction). 
diff --git a/consensus/src/simplex/mod.rs b/consensus/src/simplex/mod.rs index 941392f9508..229e5f2428d 100644 --- a/consensus/src/simplex/mod.rs +++ b/consensus/src/simplex/mod.rs @@ -770,6 +770,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1019,19 +1020,15 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Custom(Box::new({ + let built_elector_clone = built_elector.clone(); + move |round, _| built_elector_clone.elect(round, None) != dishonest + })), }; - let (mut actor, application) = mocks::application::Application::new( + let (actor, application) = mocks::application::Application::new( context.with_label("application"), application_cfg, ); - - // Every peer rejects certification for views led by the - // dishonest validator (simulates a leader that withheld - // the shard data honest peers need for reconstruction). 
- let built_elector_clone = built_elector.clone(); - actor.set_certifier(mocks::application::Certifier::Custom(Box::new( - move |round, _| built_elector_clone.elect(round, None) != dishonest, - ))); actor.start(); let blocker = oracle.control(validator.clone()); @@ -1169,6 +1166,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1324,6 +1322,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1504,6 +1503,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1624,6 +1624,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1757,6 +1758,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1977,6 +1979,7 @@ mod tests { propose_latency: (10_000.0, 0.0), verify_latency: (10_000.0, 5.0), certify_latency: (10_000.0, 5.0), + certifier: mocks::application::Certifier::Always, } } else { mocks::application::Config { @@ -1986,6 +1989,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 
5.0), + certifier: mocks::application::Certifier::Always, } }; let (actor, application) = mocks::application::Application::new( @@ -2151,6 +2155,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2351,6 +2356,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2547,6 +2553,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2798,6 +2805,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2973,6 +2981,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3140,6 +3149,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3331,6 +3341,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) 
= mocks::application::Application::new( context.with_label("application"), @@ -3491,6 +3502,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3581,6 +3593,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3804,6 +3817,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3956,6 +3970,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4125,6 +4140,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4260,6 +4276,7 @@ mod tests { propose_latency: (100.0, 50.0), verify_latency: (50.0, 40.0), certify_latency: (50.0, 40.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4422,6 +4439,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( 
context.with_label("application"), @@ -4642,6 +4660,7 @@ mod tests { certify_latency: (10.0, 5.0), // This test only exercises reporter filtering. Keep certification // uniform so leader-owned views do not diverge from follower views. + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4994,6 +5013,7 @@ mod tests { propose_latency: (250.0, 50.0), // ensure we process certificates first verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label(&format!("application_{}", *validator)), @@ -5198,6 +5218,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5337,6 +5358,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5432,6 +5454,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5924,6 +5947,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5992,6 +6016,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), + certifier: 
mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), From ab3b402f2f2efa5de15776e3d61dacbdb801db1c Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 13 Apr 2026 23:53:07 -0700 Subject: [PATCH 022/107] nit --- consensus/src/simplex/actors/voter/actor.rs | 4 ++-- consensus/src/simplex/actors/voter/state.rs | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/consensus/src/simplex/actors/voter/actor.rs b/consensus/src/simplex/actors/voter/actor.rs index 12bb1b4780b..900215d267f 100644 --- a/consensus/src/simplex/actors/voter/actor.rs +++ b/consensus/src/simplex/actors/voter/actor.rs @@ -854,11 +854,11 @@ impl< } // Attempt to certify any views that we have notarizations for. - for (proposal, leader_is_local) in self.state.certify_candidates() { + for (proposal, am_leader) in self.state.certify_candidates() { let round = proposal.round; let view = round.view(); debug!(%view, "attempting certification"); - let result = if leader_is_local { + let result = if am_leader { // Once we know the local participant led this view, reaching out to the // automaton is unnecessary. Either::Left(ready(Ok(true))) diff --git a/consensus/src/simplex/actors/voter/state.rs b/consensus/src/simplex/actors/voter/state.rs index e390b299c75..9da3b04929c 100644 --- a/consensus/src/simplex/actors/voter/state.rs +++ b/consensus/src/simplex/actors/voter/state.rs @@ -589,6 +589,7 @@ impl, L: ElectorConfig, D: D /// of each view (used to short-circuit certification for own proposals). 
pub fn certify_candidates(&mut self) -> Vec<(Proposal, bool)> { let candidates = take(&mut self.certification_candidates); + let me = self.scheme.me(); candidates .into_iter() .filter_map(|view| { @@ -596,11 +597,11 @@ impl, L: ElectorConfig, D: D return None; } let round = self.views.get_mut(&view)?; - let leader_is_local = round + let am_leader = round .leader() - .is_some_and(|leader| self.scheme.me().is_some_and(|me| me == leader.idx)); + .is_some_and(|leader| me.is_some_and(|me| me == leader.idx)); let proposal = round.try_certify()?; - Some((proposal, leader_is_local)) + Some((proposal, am_leader)) }) .collect() } From 02e007087c90d4cab29cb0c29dc79f37347d863f Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Tue, 14 Apr 2026 00:14:50 -0700 Subject: [PATCH 023/107] fix lint --- consensus/src/simplex/actors/voter/mod.rs | 224 ++++++++++++---------- 1 file changed, 124 insertions(+), 100 deletions(-) diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index c5d44d19583..db4ca2d6f66 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -170,52 +170,21 @@ mod tests { (votes, certificate) } - /// Helper to set up a voter actor for tests. 
- #[allow(clippy::too_many_arguments)] - async fn setup_voter( - context: &mut deterministic::Context, - oracle: &commonware_p2p::simulated::Oracle, - participants: &[S::PublicKey], - schemes: &[S], - elector: L, + struct VoterCfg { leader_timeout: Duration, certification_timeout: Duration, timeout_retry: Duration, - ) -> ( - Mailbox, - mpsc::Receiver>, - mpsc::Receiver>, - Arc>, - mocks::reporter::Reporter, - ) - where - S: Scheme, - L: ElectorConfig, - { - setup_voter_with_certifier( - context, - oracle, - participants, - schemes, - elector, - leader_timeout, - certification_timeout, - timeout_retry, - mocks::application::Certifier::Always, - ) - .await + certifier: mocks::application::Certifier, } - async fn setup_voter_with_certifier( + /// Helper to set up a voter actor for tests. + async fn setup_voter( context: &mut deterministic::Context, oracle: &commonware_p2p::simulated::Oracle, participants: &[S::PublicKey], schemes: &[S], elector: L, - leader_timeout: Duration, - certification_timeout: Duration, - timeout_retry: Duration, - certifier: mocks::application::Certifier, + cfg: VoterCfg, ) -> ( Mailbox, mpsc::Receiver>, @@ -227,6 +196,12 @@ mod tests { S: Scheme, L: ElectorConfig, { + let VoterCfg { + leader_timeout, + certification_timeout, + timeout_retry, + certifier, + } = cfg; let signing = schemes[0].clone(); let me = participants[0].clone(); let reporter_cfg = mocks::reporter::Config { @@ -883,9 +858,12 @@ mod tests { &participants, &schemes, elector, - Duration::from_millis(500), - Duration::from_secs(1000), - Duration::from_secs(1000), + VoterCfg { + leader_timeout: Duration::from_millis(500), + certification_timeout: Duration::from_secs(1000), + timeout_retry: Duration::from_secs(1000), + certifier: mocks::application::Certifier::Always, + }, ) .await; @@ -1003,9 +981,12 @@ mod tests { &participants, &schemes, elector, - Duration::from_millis(500), - Duration::from_secs(1000), - Duration::from_secs(1000), + VoterCfg { + leader_timeout: 
Duration::from_millis(500), + certification_timeout: Duration::from_secs(1000), + timeout_retry: Duration::from_secs(1000), + certifier: mocks::application::Certifier::Always, + }, ) .await; @@ -1138,9 +1119,12 @@ mod tests { &participants, &schemes, elector, - Duration::from_millis(500), - Duration::from_secs(1000), - Duration::from_secs(1000), + VoterCfg { + leader_timeout: Duration::from_millis(500), + certification_timeout: Duration::from_secs(1000), + timeout_retry: Duration::from_secs(1000), + certifier: mocks::application::Certifier::Always, + }, ) .await; @@ -2064,9 +2048,12 @@ mod tests { &participants, &schemes, elector, - Duration::from_millis(500), - Duration::from_secs(1000), - Duration::from_secs(1000), + VoterCfg { + leader_timeout: Duration::from_millis(500), + certification_timeout: Duration::from_secs(1000), + timeout_retry: Duration::from_secs(1000), + certifier: mocks::application::Certifier::Always, + }, ) .await; @@ -2164,9 +2151,12 @@ mod tests { &participants, &schemes, elector, - Duration::from_millis(500), - Duration::from_secs(1000), - Duration::from_secs(1000), + VoterCfg { + leader_timeout: Duration::from_millis(500), + certification_timeout: Duration::from_secs(1000), + timeout_retry: Duration::from_secs(1000), + certifier: mocks::application::Certifier::Always, + }, ) .await; @@ -3082,9 +3072,12 @@ mod tests { &participants, &schemes, L::default(), - Duration::from_secs(10), - Duration::from_secs(10), - Duration::from_mins(60), + VoterCfg { + leader_timeout: Duration::from_secs(10), + certification_timeout: Duration::from_secs(10), + timeout_retry: Duration::from_mins(60), + certifier: mocks::application::Certifier::Always, + }, ) .await; @@ -5301,9 +5294,12 @@ mod tests { &participants, &schemes, RoundRobin::::default(), - Duration::from_secs(5), - Duration::from_secs(5), - Duration::from_secs(5), + VoterCfg { + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: 
Duration::from_secs(5), + certifier: mocks::application::Certifier::Always, + }, ) .await; @@ -5413,9 +5409,12 @@ mod tests { &participants, &schemes, elector, - Duration::from_secs(10), - Duration::from_secs(10), - Duration::from_secs(100), + VoterCfg { + leader_timeout: Duration::from_secs(10), + certification_timeout: Duration::from_secs(10), + timeout_retry: Duration::from_secs(100), + certifier: mocks::application::Certifier::Always, + }, ) .await; @@ -5534,9 +5533,12 @@ mod tests { &participants, &schemes, elector, - Duration::from_secs(10), - Duration::from_secs(10), - Duration::from_secs(100), + VoterCfg { + leader_timeout: Duration::from_secs(10), + certification_timeout: Duration::from_secs(10), + timeout_retry: Duration::from_secs(100), + certifier: mocks::application::Certifier::Always, + }, ) .await; @@ -5688,9 +5690,12 @@ mod tests { &participants, &schemes, elector, - Duration::from_secs(10), - Duration::from_secs(10), - Duration::from_secs(100), + VoterCfg { + leader_timeout: Duration::from_secs(10), + certification_timeout: Duration::from_secs(10), + timeout_retry: Duration::from_secs(100), + certifier: mocks::application::Certifier::Always, + }, ) .await; @@ -5829,16 +5834,18 @@ mod tests { let elector = RoundRobin::::default(); // Set up voter with Certifier::Cancel - let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( &mut context, &oracle, &participants, &schemes, elector, - Duration::from_millis(500), - Duration::from_millis(500), - Duration::from_mins(60), - mocks::application::Certifier::Cancel, + VoterCfg { + leader_timeout: Duration::from_millis(500), + certification_timeout: Duration::from_millis(500), + timeout_retry: Duration::from_mins(60), + certifier: mocks::application::Certifier::Cancel, + }, ) .await; @@ -6248,16 +6255,18 @@ mod tests { // Setup voter with Certifier::Cancel to simulate missing verification context. 
let elector = RoundRobin::::default(); - let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( &mut context, &oracle, &participants, &schemes, elector.clone(), - Duration::from_secs(2), - Duration::from_secs(3), - Duration::from_secs(1), - mocks::application::Certifier::Cancel, + VoterCfg { + leader_timeout: Duration::from_secs(2), + certification_timeout: Duration::from_secs(3), + timeout_retry: Duration::from_secs(1), + certifier: mocks::application::Certifier::Cancel, + }, ) .await; @@ -6457,16 +6466,18 @@ mod tests { // Set up voter with Certifier::Custom that always returns false // This simulates coding marshal's deferred_verify finding context mismatch - let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( &mut context, &oracle, &participants, &schemes, elector, - Duration::from_secs(100), // Long timeout to prove nullify comes from cert failure - Duration::from_secs(100), - Duration::from_secs(100), - mocks::application::Certifier::Custom(Box::new(|_, _| false)), + VoterCfg { + leader_timeout: Duration::from_secs(100), + certification_timeout: Duration::from_secs(100), + timeout_retry: Duration::from_secs(100), + certifier: mocks::application::Certifier::Custom(Box::new(|_, _| false)), + }, ) .await; @@ -6590,16 +6601,18 @@ mod tests { let elector = RoundRobin::::default(); // Set up voter with Certifier::Pending (certify hangs indefinitely). 
- let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( &mut context, &oracle, &participants, &schemes, elector, - Duration::from_secs(3), - Duration::from_secs(4), - Duration::from_mins(60), - mocks::application::Certifier::Pending, + VoterCfg { + leader_timeout: Duration::from_secs(3), + certification_timeout: Duration::from_secs(4), + timeout_retry: Duration::from_mins(60), + certifier: mocks::application::Certifier::Pending, + }, ) .await; @@ -6726,9 +6739,12 @@ mod tests { &participants, &schemes, elector, - Duration::from_secs(1), - Duration::from_secs(5), - Duration::from_mins(60), + VoterCfg { + leader_timeout: Duration::from_secs(1), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + certifier: mocks::application::Certifier::Always, + }, ) .await; @@ -6863,16 +6879,18 @@ mod tests { .await; let elector = RoundRobin::::default(); - let (mut mailbox, mut batcher_receiver, _, _, _) = setup_voter_with_certifier( + let (mut mailbox, mut batcher_receiver, _, _, _) = setup_voter( &mut context, &oracle, &participants, &schemes, elector, - Duration::from_secs(1), - Duration::from_secs(5), - Duration::from_mins(60), - mocks::application::Certifier::Pending, + VoterCfg { + leader_timeout: Duration::from_secs(1), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + certifier: mocks::application::Certifier::Pending, + }, ) .await; @@ -7000,9 +7018,12 @@ mod tests { &participants, &schemes, RoundRobin::::default(), - Duration::from_secs(1), - Duration::from_secs(5), - Duration::from_mins(60), + VoterCfg { + leader_timeout: Duration::from_secs(1), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + certifier: mocks::application::Certifier::Always, + }, ) .await; @@ -7154,9 +7175,12 @@ mod tests { &participants, &schemes, elector, - 
Duration::from_secs(1), - Duration::from_secs(5), - Duration::from_mins(60), + VoterCfg { + leader_timeout: Duration::from_secs(1), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + certifier: mocks::application::Certifier::Always, + }, ) .await; From 2bf14e00765a1171461aa37c38af3bf99e2c9ae0 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Tue, 14 Apr 2026 00:23:03 -0700 Subject: [PATCH 024/107] nits --- consensus/src/simplex/mocks/application.rs | 8 +------- consensus/src/simplex/mod.rs | 22 +++++++++++++++++----- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/consensus/src/simplex/mocks/application.rs b/consensus/src/simplex/mocks/application.rs index 0c28ec08374..659292b4005 100644 --- a/consensus/src/simplex/mocks/application.rs +++ b/consensus/src/simplex/mocks/application.rs @@ -136,14 +136,8 @@ type VerifyObserver = /// Predicate to determine whether a payload should be certified. /// Returning true means certify, false means reject. -/// -/// Honest applications in an honest cluster always certify: a leader has the -/// certify record for its own proposal (it just built it) and an honest -/// follower has the context it verified against. The non-`Always` variants -/// model realistic runtime failure modes (missing context, late data) or are -/// used as test escape hatches. pub enum Certifier { - /// Always certify. Default for honest applications. + /// Always certify. Always, /// A custom predicate function that receives the round and payload digest. 
Custom(Box bool + Send + 'static>), diff --git a/consensus/src/simplex/mod.rs b/consensus/src/simplex/mod.rs index 229e5f2428d..28ce22b5c6d 100644 --- a/consensus/src/simplex/mod.rs +++ b/consensus/src/simplex/mod.rs @@ -951,9 +951,8 @@ mod tests { all_online::<_, _, RoundRobin>(secp256r1::fixture); } - /// Integration test where a dishonest leader (validator 0) proposes - /// payloads that all honest peers refuse to certify (simulating a leader - /// that withholds shard data needed for block reconstruction). + /// A dishonest leader (validator 0) proposes payloads that all honest peers + /// refuse to certify. /// /// All n validators use the honest Application, but every peer's certifier /// rejects proposals from views where validator 0 is the elected leader. @@ -1083,7 +1082,22 @@ mod tests { #[test_group("slow")] #[test_traced] fn test_dishonest_leader_certification_rejected() { + dishonest_leader_certification_rejected::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + dishonest_leader_certification_rejected::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + dishonest_leader_certification_rejected::<_, _>( + bls12381_threshold_std::fixture::, + ); + dishonest_leader_certification_rejected::<_, _>( + bls12381_threshold_std::fixture::, + ); + dishonest_leader_certification_rejected::<_, _>(bls12381_multisig::fixture::); + dishonest_leader_certification_rejected::<_, _>(bls12381_multisig::fixture::); dishonest_leader_certification_rejected::<_, _>(ed25519::fixture); + dishonest_leader_certification_rejected::<_, _>(secp256r1::fixture); } fn observer(mut fixture: F) @@ -4658,8 +4672,6 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - // This test only exercises reporter filtering. Keep certification - // uniform so leader-owned views do not diverge from follower views. 
certifier: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( From 33e146c6f9dc92fd6ec9b3f37f207ae4c9d7e649 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Tue, 14 Apr 2026 00:26:25 -0700 Subject: [PATCH 025/107] nit --- consensus/src/simplex/actors/voter/mod.rs | 225 ++++++++++------------ 1 file changed, 101 insertions(+), 124 deletions(-) diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index db4ca2d6f66..00059673cd0 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -170,21 +170,17 @@ mod tests { (votes, certificate) } - struct VoterCfg { - leader_timeout: Duration, - certification_timeout: Duration, - timeout_retry: Duration, - certifier: mocks::application::Certifier, - } - /// Helper to set up a voter actor for tests. + #[allow(clippy::too_many_arguments)] async fn setup_voter( context: &mut deterministic::Context, oracle: &commonware_p2p::simulated::Oracle, participants: &[S::PublicKey], schemes: &[S], elector: L, - cfg: VoterCfg, + leader_timeout: Duration, + certification_timeout: Duration, + timeout_retry: Duration, ) -> ( Mailbox, mpsc::Receiver>, @@ -196,12 +192,42 @@ mod tests { S: Scheme, L: ElectorConfig, { - let VoterCfg { + setup_voter_with_certifier( + context, + oracle, + participants, + schemes, + elector, leader_timeout, certification_timeout, timeout_retry, - certifier, - } = cfg; + mocks::application::Certifier::Always, + ) + .await + } + + #[allow(clippy::too_many_arguments)] + async fn setup_voter_with_certifier( + context: &mut deterministic::Context, + oracle: &commonware_p2p::simulated::Oracle, + participants: &[S::PublicKey], + schemes: &[S], + elector: L, + leader_timeout: Duration, + certification_timeout: Duration, + timeout_retry: Duration, + certifier: mocks::application::Certifier, + ) -> ( + Mailbox, + mpsc::Receiver>, + mpsc::Receiver>, + Arc>, + 
mocks::reporter::Reporter, + ) + where + S: Scheme, + L: ElectorConfig, + { let signing = schemes[0].clone(); let me = participants[0].clone(); let reporter_cfg = mocks::reporter::Config { @@ -858,12 +884,9 @@ mod tests { &participants, &schemes, elector, - VoterCfg { - leader_timeout: Duration::from_millis(500), - certification_timeout: Duration::from_secs(1000), - timeout_retry: Duration::from_secs(1000), - certifier: mocks::application::Certifier::Always, - }, + Duration::from_millis(500), + Duration::from_secs(1000), + Duration::from_secs(1000), ) .await; @@ -981,12 +1004,9 @@ mod tests { &participants, &schemes, elector, - VoterCfg { - leader_timeout: Duration::from_millis(500), - certification_timeout: Duration::from_secs(1000), - timeout_retry: Duration::from_secs(1000), - certifier: mocks::application::Certifier::Always, - }, + Duration::from_millis(500), + Duration::from_secs(1000), + Duration::from_secs(1000), ) .await; @@ -1119,12 +1139,9 @@ mod tests { &participants, &schemes, elector, - VoterCfg { - leader_timeout: Duration::from_millis(500), - certification_timeout: Duration::from_secs(1000), - timeout_retry: Duration::from_secs(1000), - certifier: mocks::application::Certifier::Always, - }, + Duration::from_millis(500), + Duration::from_secs(1000), + Duration::from_secs(1000), ) .await; @@ -2048,12 +2065,9 @@ mod tests { &participants, &schemes, elector, - VoterCfg { - leader_timeout: Duration::from_millis(500), - certification_timeout: Duration::from_secs(1000), - timeout_retry: Duration::from_secs(1000), - certifier: mocks::application::Certifier::Always, - }, + Duration::from_millis(500), + Duration::from_secs(1000), + Duration::from_secs(1000), ) .await; @@ -2151,12 +2165,9 @@ mod tests { &participants, &schemes, elector, - VoterCfg { - leader_timeout: Duration::from_millis(500), - certification_timeout: Duration::from_secs(1000), - timeout_retry: Duration::from_secs(1000), - certifier: mocks::application::Certifier::Always, - }, + 
Duration::from_millis(500), + Duration::from_secs(1000), + Duration::from_secs(1000), ) .await; @@ -3072,12 +3083,9 @@ mod tests { &participants, &schemes, L::default(), - VoterCfg { - leader_timeout: Duration::from_secs(10), - certification_timeout: Duration::from_secs(10), - timeout_retry: Duration::from_mins(60), - certifier: mocks::application::Certifier::Always, - }, + Duration::from_secs(10), + Duration::from_secs(10), + Duration::from_mins(60), ) .await; @@ -5294,12 +5302,9 @@ mod tests { &participants, &schemes, RoundRobin::::default(), - VoterCfg { - leader_timeout: Duration::from_secs(5), - certification_timeout: Duration::from_secs(5), - timeout_retry: Duration::from_secs(5), - certifier: mocks::application::Certifier::Always, - }, + Duration::from_secs(5), + Duration::from_secs(5), + Duration::from_secs(5), ) .await; @@ -5409,12 +5414,9 @@ mod tests { &participants, &schemes, elector, - VoterCfg { - leader_timeout: Duration::from_secs(10), - certification_timeout: Duration::from_secs(10), - timeout_retry: Duration::from_secs(100), - certifier: mocks::application::Certifier::Always, - }, + Duration::from_secs(10), + Duration::from_secs(10), + Duration::from_secs(100), ) .await; @@ -5533,12 +5535,9 @@ mod tests { &participants, &schemes, elector, - VoterCfg { - leader_timeout: Duration::from_secs(10), - certification_timeout: Duration::from_secs(10), - timeout_retry: Duration::from_secs(100), - certifier: mocks::application::Certifier::Always, - }, + Duration::from_secs(10), + Duration::from_secs(10), + Duration::from_secs(100), ) .await; @@ -5690,12 +5689,9 @@ mod tests { &participants, &schemes, elector, - VoterCfg { - leader_timeout: Duration::from_secs(10), - certification_timeout: Duration::from_secs(10), - timeout_retry: Duration::from_secs(100), - certifier: mocks::application::Certifier::Always, - }, + Duration::from_secs(10), + Duration::from_secs(10), + Duration::from_secs(100), ) .await; @@ -5834,18 +5830,16 @@ mod tests { let elector = 
RoundRobin::::default(); // Set up voter with Certifier::Cancel - let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( &mut context, &oracle, &participants, &schemes, elector, - VoterCfg { - leader_timeout: Duration::from_millis(500), - certification_timeout: Duration::from_millis(500), - timeout_retry: Duration::from_mins(60), - certifier: mocks::application::Certifier::Cancel, - }, + Duration::from_millis(500), + Duration::from_millis(500), + Duration::from_mins(60), + mocks::application::Certifier::Cancel, ) .await; @@ -6255,18 +6249,16 @@ mod tests { // Setup voter with Certifier::Cancel to simulate missing verification context. let elector = RoundRobin::::default(); - let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( &mut context, &oracle, &participants, &schemes, elector.clone(), - VoterCfg { - leader_timeout: Duration::from_secs(2), - certification_timeout: Duration::from_secs(3), - timeout_retry: Duration::from_secs(1), - certifier: mocks::application::Certifier::Cancel, - }, + Duration::from_secs(2), + Duration::from_secs(3), + Duration::from_secs(1), + mocks::application::Certifier::Cancel, ) .await; @@ -6466,18 +6458,16 @@ mod tests { // Set up voter with Certifier::Custom that always returns false // This simulates coding marshal's deferred_verify finding context mismatch - let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( &mut context, &oracle, &participants, &schemes, elector, - VoterCfg { - leader_timeout: Duration::from_secs(100), - certification_timeout: Duration::from_secs(100), - timeout_retry: Duration::from_secs(100), - certifier: mocks::application::Certifier::Custom(Box::new(|_, _| false)), - }, + Duration::from_secs(100), // Long timeout to 
prove nullify comes from cert failure + Duration::from_secs(100), + Duration::from_secs(100), + mocks::application::Certifier::Custom(Box::new(|_, _| false)), ) .await; @@ -6601,18 +6591,16 @@ mod tests { let elector = RoundRobin::::default(); // Set up voter with Certifier::Pending (certify hangs indefinitely). - let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter( + let (mut mailbox, mut batcher_receiver, _, relay, _) = setup_voter_with_certifier( &mut context, &oracle, &participants, &schemes, elector, - VoterCfg { - leader_timeout: Duration::from_secs(3), - certification_timeout: Duration::from_secs(4), - timeout_retry: Duration::from_mins(60), - certifier: mocks::application::Certifier::Pending, - }, + Duration::from_secs(3), + Duration::from_secs(4), + Duration::from_mins(60), + mocks::application::Certifier::Pending, ) .await; @@ -6739,12 +6727,9 @@ mod tests { &participants, &schemes, elector, - VoterCfg { - leader_timeout: Duration::from_secs(1), - certification_timeout: Duration::from_secs(5), - timeout_retry: Duration::from_mins(60), - certifier: mocks::application::Certifier::Always, - }, + Duration::from_secs(1), + Duration::from_secs(5), + Duration::from_mins(60), ) .await; @@ -6879,18 +6864,16 @@ mod tests { .await; let elector = RoundRobin::::default(); - let (mut mailbox, mut batcher_receiver, _, _, _) = setup_voter( + let (mut mailbox, mut batcher_receiver, _, _, _) = setup_voter_with_certifier( &mut context, &oracle, &participants, &schemes, elector, - VoterCfg { - leader_timeout: Duration::from_secs(1), - certification_timeout: Duration::from_secs(5), - timeout_retry: Duration::from_mins(60), - certifier: mocks::application::Certifier::Pending, - }, + Duration::from_secs(1), + Duration::from_secs(5), + Duration::from_mins(60), + mocks::application::Certifier::Pending, ) .await; @@ -7018,12 +7001,9 @@ mod tests { &participants, &schemes, RoundRobin::::default(), - VoterCfg { - leader_timeout: Duration::from_secs(1), - 
certification_timeout: Duration::from_secs(5), - timeout_retry: Duration::from_mins(60), - certifier: mocks::application::Certifier::Always, - }, + Duration::from_secs(1), + Duration::from_secs(5), + Duration::from_mins(60), ) .await; @@ -7175,12 +7155,9 @@ mod tests { &participants, &schemes, elector, - VoterCfg { - leader_timeout: Duration::from_secs(1), - certification_timeout: Duration::from_secs(5), - timeout_retry: Duration::from_mins(60), - certifier: mocks::application::Certifier::Always, - }, + Duration::from_secs(1), + Duration::from_secs(5), + Duration::from_mins(60), ) .await; From e537338b2b764c96b30dae101167824fb4f0751b Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Tue, 14 Apr 2026 21:31:56 -0700 Subject: [PATCH 026/107] increase coverage --- consensus/src/simplex/actors/voter/mod.rs | 529 ++++++++++++++++++++++ 1 file changed, 529 insertions(+) diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index 00059673cd0..97af575b89e 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -7301,4 +7301,533 @@ mod tests { first_view_progress_without_timeout::<_, _, RoundRobin>(ed25519::fixture); first_view_progress_without_timeout::<_, _, RoundRobin>(secp256r1::fixture); } + + /// Tests that a successful certification is correctly replayed from the journal + /// after a restart. + /// + /// 1. First run: follower certifies a view successfully, which is persisted to journal. + /// 2. Abort the voter. + /// 3. Second run: voter replays journal and processes the Artifact::Certification entry, + /// advancing past the certified view without re-certifying. 
+ fn successful_certification_replayed_after_restart(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"successful_cert_replay".to_vec(); + let partition = "successful_cert_replay".to_string(); + let executor = deterministic::Runner::timed(Duration::from_secs(20)); + executor.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = fixture(&mut context, &namespace, n); + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + + let me = participants[0].clone(); + let elector = RoundRobin::::default(); + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + let relay = Arc::new(mocks::relay::Relay::new()); + let epoch = Epoch::new(333); + + // First run: certify a follower view successfully. 
+ let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector: elector.clone(), + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter: reporter.clone(), + partition: partition.clone(), + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = Actor::new(context.with_label("voter"), voter_cfg); + let (resolver_sender, mut resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + let handle = voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + if let batcher::Message::Update { response, .. } = + batcher_receiver.recv().await.unwrap() + { + response.send(None).unwrap(); + } + + // Advance to follower view 3 (leader = participant 1). + let target_view = View::new(3); + let parent_payload = advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; + + // Send proposal + payload so verification passes. 
+ let proposal = Proposal::new( + Round::new(epoch, target_view), + target_view.previous().unwrap(), + Sha256::hash(b"cert_replay_payload"), + ); + let leader = participants[1].clone(); + let contents = (proposal.round, parent_payload, 0u64).encode(); + relay.broadcast(&leader, (proposal.payload, contents)); + mailbox.proposal(proposal.clone()).await; + + // Send notarization to trigger certification. + let (_, notarization) = build_notarization(&schemes, &proposal, quorum); + mailbox + .resolved(Certificate::Notarization(notarization)) + .await; + + // Wait for certification to complete (view advances past target_view). + loop { + select! { + msg = resolver_receiver.recv() => match msg.unwrap() { + MailboxMessage::Certified { view, success } if view == target_view => { + assert!(success, "expected successful certification"); + break; + } + _ => {} + }, + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { response, .. } = msg.unwrap() { + response.send(None).unwrap(); + } + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("timed out waiting for certification in first run"); + }, + } + } + + // Drain any pending batcher messages so the view has advanced. + context.sleep(Duration::from_millis(50)).await; + while let Some(msg) = batcher_receiver.recv().now_or_never().flatten() { + if let batcher::Message::Update { response, .. } = msg { + response.send(None).unwrap(); + } + } + + // Abort first voter. + handle.abort(); + + // Second run: replay should process Artifact::Certification from journal. 
+ let certify_calls: Arc>> = Arc::new(Mutex::new(Vec::new())); + let certify_tracker = certify_calls.clone(); + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Custom(Box::new( + move |round, _| { + certify_tracker.lock().push(round.view()); + true + }, + )), + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter, + partition, + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, _mailbox) = + Actor::new(context.with_label("voter_restarted"), voter_cfg); + let (resolver_sender, mut resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(2, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(3, TEST_QUOTA) + .await + .unwrap(); + voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + // Wait for replay to complete and verify the voter advanced past + // target_view (certification was replayed from journal). + let mut replayed_certified = false; + loop { + select! 
{ + msg = resolver_receiver.recv() => match msg.unwrap() { + MailboxMessage::Certified { view, success } if view == target_view => { + assert!(success, "replayed certification should be successful"); + replayed_certified = true; + } + _ => {} + }, + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { + current, response, .. + } = msg.unwrap() + { + response.send(None).unwrap(); + if current > target_view { + break; + } + } + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("timed out waiting for restarted voter to advance past view {target_view}"); + }, + } + } + + assert!( + replayed_certified, + "resolver should receive Certified during replay for view {target_view}" + ); + + // The voter should NOT have called certify on the automaton for + // target_view (it was replayed from journal). + let certified = certify_calls.lock(); + assert!( + !certified.contains(&target_view), + "voter should not re-certify view {target_view} during replay (observed: {certified:?})" + ); + }); + } + + #[test_traced] + fn test_successful_certification_replayed_after_restart() { + successful_certification_replayed_after_restart( + bls12381_threshold_vrf::fixture::, + ); + successful_certification_replayed_after_restart( + bls12381_threshold_vrf::fixture::, + ); + successful_certification_replayed_after_restart(bls12381_multisig::fixture::); + successful_certification_replayed_after_restart(bls12381_multisig::fixture::); + successful_certification_replayed_after_restart(ed25519::fixture); + successful_certification_replayed_after_restart(secp256r1::fixture); + } + + /// Tests that a failed certification (certify returns false) is correctly replayed + /// from the journal after a restart. The replayed failure should trigger a timeout + /// for the view (not re-certify or advance). 
+ fn failed_certification_replayed_after_restart(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"failed_cert_replay".to_vec(); + let partition = "failed_cert_replay".to_string(); + let executor = deterministic::Runner::timed(Duration::from_secs(20)); + executor.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = fixture(&mut context, &namespace, n); + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + + let me = participants[0].clone(); + let elector = RoundRobin::::default(); + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + let relay = Arc::new(mocks::relay::Relay::new()); + let epoch = Epoch::new(333); + + // First run: certify fails (returns false). 
+ let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Custom(Box::new(|_, _| false)), + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector: elector.clone(), + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter: reporter.clone(), + partition: partition.clone(), + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = Actor::new(context.with_label("voter"), voter_cfg); + let (resolver_sender, mut resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + let handle = voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + if let batcher::Message::Update { response, .. } = + batcher_receiver.recv().await.unwrap() + { + response.send(None).unwrap(); + } + + // Advance to follower view 3. + let target_view = View::new(3); + let parent_payload = advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; + + // Send proposal + payload. 
+ let proposal = Proposal::new( + Round::new(epoch, target_view), + target_view.previous().unwrap(), + Sha256::hash(b"failed_cert_replay_payload"), + ); + let leader = participants[1].clone(); + let contents = (proposal.round, parent_payload, 0u64).encode(); + relay.broadcast(&leader, (proposal.payload, contents)); + mailbox.proposal(proposal.clone()).await; + + // Send notarization to trigger certification. + let (_, notarization) = build_notarization(&schemes, &proposal, quorum); + mailbox + .resolved(Certificate::Notarization(notarization)) + .await; + + // Wait for failed certification result to be reported to resolver. + loop { + select! { + msg = resolver_receiver.recv() => match msg.unwrap() { + MailboxMessage::Certified { view, success } if view == target_view => { + assert!(!success, "expected failed certification"); + break; + } + _ => {} + }, + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { response, .. } = msg.unwrap() { + response.send(None).unwrap(); + } + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("timed out waiting for failed certification in first run"); + }, + } + } + + // Let the journal sync. + context.sleep(Duration::from_millis(50)).await; + while let Some(msg) = batcher_receiver.recv().now_or_never().flatten() { + if let batcher::Message::Update { response, .. } = msg { + response.send(None).unwrap(); + } + } + + // Abort first voter. + handle.abort(); + + // Second run: replay should process Artifact::Certification(false) from journal. 
+ let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter, + partition, + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, _mailbox) = + Actor::new(context.with_label("voter_restarted"), voter_cfg); + let (resolver_sender, mut resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(2, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(3, TEST_QUOTA) + .await + .unwrap(); + voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + // The replayed failed certification should be reported to resolver + // and the voter should NOT advance past target_view. + let mut replayed_certified = false; + loop { + select! 
{ + msg = resolver_receiver.recv() => match msg.unwrap() { + MailboxMessage::Certified { view, success } if view == target_view => { + assert!(!success, "replayed certification should be a failure"); + replayed_certified = true; + } + _ => {} + }, + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { + current, response, .. + } = msg.unwrap() + { + response.send(None).unwrap(); + // After replay, should be at target_view (not past it). + if current == target_view && replayed_certified { + break; + } + } + }, + _ = context.sleep(Duration::from_secs(5)) => { + if replayed_certified { + break; + } + panic!("timed out waiting for replayed failed certification"); + }, + } + } + + assert!( + replayed_certified, + "resolver should receive Certified(false) during replay for view {target_view}" + ); + }); + } + + #[test_traced] + fn test_failed_certification_replayed_after_restart() { + failed_certification_replayed_after_restart(bls12381_threshold_vrf::fixture::); + failed_certification_replayed_after_restart(bls12381_threshold_vrf::fixture::); + failed_certification_replayed_after_restart(bls12381_multisig::fixture::); + failed_certification_replayed_after_restart(bls12381_multisig::fixture::); + failed_certification_replayed_after_restart(ed25519::fixture); + failed_certification_replayed_after_restart(secp256r1::fixture); + } } From 8d88a95004c70cdcc5086c628490de1a97e749df Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Tue, 14 Apr 2026 21:39:53 -0700 Subject: [PATCH 027/107] add more coverage --- consensus/src/simplex/actors/voter/mod.rs | 453 ++++++++++++++++++++++ 1 file changed, 453 insertions(+) diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index 97af575b89e..ea2890c0a2c 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -7830,4 +7830,457 @@ mod tests { failed_certification_replayed_after_restart(ed25519::fixture); 
failed_certification_replayed_after_restart(secp256r1::fixture); } + + /// Tests that nullify votes and nullification certificates are correctly + /// replayed from the journal after a restart. + /// + /// 1. First run: follower times out, votes nullify, receives nullification + /// certificate. All persisted to journal. + /// 2. Abort the voter. + /// 3. Second run: voter replays journal and processes Artifact::Nullify and + /// Artifact::Nullification entries. The resolver receives the nullification + /// and the voter is expected to move past the nullified view, as in the first run. + fn nullify_and_nullification_replayed_after_restart(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"nullify_nullification_replay".to_vec(); + let partition = "nullify_nullification_replay".to_string(); + let executor = deterministic::Runner::timed(Duration::from_secs(30)); + executor.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = fixture(&mut context, &namespace, n); + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + + let me = participants[0].clone(); + let elector = RoundRobin::::default(); + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + let relay = Arc::new(mocks::relay::Relay::new()); + let epoch = Epoch::new(333); + + // First run: trigger timeout and nullification.
+ let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector: elector.clone(), + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter: reporter.clone(), + partition: partition.clone(), + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(1), + certification_timeout: Duration::from_secs(1), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = Actor::new(context.with_label("voter"), voter_cfg); + let (resolver_sender, _resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + let handle = voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + if let batcher::Message::Update { response, .. } = + batcher_receiver.recv().await.unwrap() + { + response.send(None).unwrap(); + } + + // Advance to follower view 3. + let target_view = View::new(3); + advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; + + // Wait for the timeout-driven nullify vote. + loop { + select! 
{ + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Nullify(n)) + if n.view() == target_view => + { + break; + } + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + _ => {} + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("expected nullify vote for view {target_view}"); + }, + } + } + + // Send a nullification certificate for this view. + let (_, nullification) = + build_nullification(&schemes, Round::new(epoch, target_view), quorum); + mailbox + .resolved(Certificate::Nullification(nullification)) + .await; + + // Wait for the voter to process the nullification (advances to next view). + loop { + select! { + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { + current, response, .. + } = msg.unwrap() + { + response.send(None).unwrap(); + if current > target_view { + break; + } + } + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("timed out waiting for view advance after nullification"); + }, + } + } + + // Let journal sync. + context.sleep(Duration::from_millis(50)).await; + while let Some(msg) = batcher_receiver.recv().now_or_never().flatten() { + if let batcher::Message::Update { response, .. } = msg { + response.send(None).unwrap(); + } + } + + // Abort first voter. + handle.abort(); + + // Second run: replay should process Artifact::Nullify and + // Artifact::Nullification from journal. 
+ let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter, + partition, + epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(1), + certification_timeout: Duration::from_secs(1), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, _mailbox) = + Actor::new(context.with_label("voter_restarted"), voter_cfg); + let (resolver_sender, mut resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(2, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(3, TEST_QUOTA) + .await + .unwrap(); + voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + // Verify: resolver receives the replayed nullification. + let mut replayed_nullification = false; + loop { + select! { + msg = resolver_receiver.recv() => match msg.unwrap() { + MailboxMessage::Certificate(Certificate::Nullification(n)) + if n.view() == target_view => + { + replayed_nullification = true; + } + _ => {} + }, + msg = batcher_receiver.recv() => { + if let batcher::Message::Update { + current, response, .. 
+ } = msg.unwrap() + { + response.send(None).unwrap(); + if current > target_view && replayed_nullification { + break; + } + } + }, + _ = context.sleep(Duration::from_secs(5)) => { + if replayed_nullification { + break; + } + panic!("timed out waiting for nullification replay"); + }, + } + } + + assert!( + replayed_nullification, + "resolver should receive nullification during replay for view {target_view}" + ); + }); + } + + #[test_traced] + fn test_nullify_and_nullification_replayed_after_restart() { + nullify_and_nullification_replayed_after_restart( + bls12381_threshold_vrf::fixture::, + ); + nullify_and_nullification_replayed_after_restart( + bls12381_threshold_vrf::fixture::, + ); + nullify_and_nullification_replayed_after_restart(bls12381_multisig::fixture::); + nullify_and_nullification_replayed_after_restart(bls12381_multisig::fixture::); + nullify_and_nullification_replayed_after_restart(ed25519::fixture); + nullify_and_nullification_replayed_after_restart(secp256r1::fixture); + } + + /// Tests that when the batcher signals a timeout reason on view update, + /// the voter immediately triggers a timeout for the current view. + /// + /// This covers the path where `batcher.update()` returns `Some(TimeoutReason)` + /// (e.g., because the leader is inactive or has already nullified the view). + fn batcher_update_triggers_timeout(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"batcher_update_timeout".to_vec(); + let executor = deterministic::Runner::timed(Duration::from_secs(30)); + executor.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = fixture(&mut context, &namespace, n); + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + + let me = participants[0].clone(); + let elector = RoundRobin::::default(); + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + let relay = Arc::new(mocks::relay::Relay::new()); + + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + certifier: mocks::application::Certifier::Always, + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + app_actor.start(); + + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter, + partition: format!("batcher_timeout_test_{me}"), + epoch: Epoch::new(333), + mailbox_size: 128, + leader_timeout: Duration::from_secs(100), + certification_timeout: Duration::from_secs(100), + timeout_retry: Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = Actor::new(context.clone(), voter_cfg); + + let (resolver_sender, _resolver_receiver) = mpsc::channel(10); + let resolver = resolver::Mailbox::new(resolver_sender); + + let (batcher_sender, mut batcher_receiver) = mpsc::channel(1024); + let batcher = batcher::Mailbox::new(batcher_sender); + + let (vote_sender, _vote_receiver) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let 
(certificate_sender, _certificate_receiver) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + + voter.start(batcher, resolver, vote_sender, certificate_sender); + + // Consume initial Update. + if let batcher::Message::Update { response, .. } = + batcher_receiver.recv().await.unwrap() + { + response.send(None).unwrap(); + } + + // Advance to follower view 3 using finalization. + let target_view = View::new(3); + advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; + + // Certify view 3 to advance to view 4. + let proposal = Proposal::new( + Round::new(Epoch::new(333), target_view), + target_view.previous().unwrap(), + Sha256::hash(b"batcher_timeout_view3"), + ); + let leader = participants[1].clone(); + let contents = (proposal.round, Sha256::hash(b"genesis"), 0u64).encode(); + relay.broadcast(&leader, (proposal.payload, contents)); + mailbox.proposal(proposal.clone()).await; + let (_, notarization) = build_notarization(&schemes, &proposal, quorum); + mailbox + .resolved(Certificate::Notarization(notarization)) + .await; + + // Wait for the Update for view 4 and respond with a timeout reason + // to simulate batcher signaling that the leader should be skipped. + loop { + select! { + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Update { + current, + response, + .. + } if current > target_view => { + // Signal leader inactivity to trigger the timeout path. + response.send(Some(TimeoutReason::Inactivity)).unwrap(); + break; + } + batcher::Message::Update { response, .. } => { + response.send(None).unwrap(); + } + _ => {} + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("expected Update for view > {target_view}"); + }, + } + } + + // The voter should emit a nullify vote for view 4 quickly (not + // after the 100s leader timeout) because the batcher signaled + // immediate timeout. + let next_view = target_view.next(); + loop { + select! 
{ + msg = batcher_receiver.recv() => match msg.unwrap() { + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == next_view => + { + break; + } + batcher::Message::Update { response, .. } => { + response.send(None).unwrap(); + } + _ => {} + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!( + "expected nullify for view {next_view} triggered by batcher timeout" + ); + }, + } + } + }); + } + + #[test_traced] + fn test_batcher_update_triggers_timeout() { + batcher_update_triggers_timeout(bls12381_threshold_vrf::fixture::); + batcher_update_triggers_timeout(bls12381_threshold_vrf::fixture::); + batcher_update_triggers_timeout(bls12381_multisig::fixture::); + batcher_update_triggers_timeout(bls12381_multisig::fixture::); + batcher_update_triggers_timeout(ed25519::fixture); + batcher_update_triggers_timeout(secp256r1::fixture); + } } From cb55e8a287d28956c84d84985847fa82b262b039 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Tue, 14 Apr 2026 22:17:32 -0700 Subject: [PATCH 028/107] push --- consensus/src/simplex/actors/voter/mod.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index ea2890c0a2c..2ef007204de 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -7760,8 +7760,7 @@ mod tests { write_buffer: NZUsize!(1024 * 1024), page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), }; - let (voter, _mailbox) = - Actor::new(context.with_label("voter_restarted"), voter_cfg); + let (voter, _mailbox) = Actor::new(context.with_label("voter_restarted"), voter_cfg); let (resolver_sender, mut resolver_receiver) = mpsc::channel(8); let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); let (vote_sender, _) = oracle @@ -8029,8 +8028,7 @@ mod tests { write_buffer: NZUsize!(1024 * 1024), page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), 
}; - let (voter, _mailbox) = - Actor::new(context.with_label("voter_restarted"), voter_cfg); + let (voter, _mailbox) = Actor::new(context.with_label("voter_restarted"), voter_cfg); let (resolver_sender, mut resolver_receiver) = mpsc::channel(8); let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); let (vote_sender, _) = oracle From 19d0db1cf64709928462c0ef6bfcda80f69d7228 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Wed, 15 Apr 2026 07:59:50 -0700 Subject: [PATCH 029/107] cover last_finalized --- consensus/src/simplex/actors/voter/state.rs | 78 +++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/consensus/src/simplex/actors/voter/state.rs b/consensus/src/simplex/actors/voter/state.rs index 9da3b04929c..aac6870212f 100644 --- a/consensus/src/simplex/actors/voter/state.rs +++ b/consensus/src/simplex/actors/voter/state.rs @@ -2013,6 +2013,84 @@ mod tests { }); } + #[test] + fn certify_candidates_skips_views_at_or_below_last_finalized() { + let runtime = deterministic::Runner::default(); + runtime.start(|mut context| async move { + let namespace = b"ns".to_vec(); + let Fixture { + schemes, verifier, .. 
+ } = ed25519::fixture(&mut context, &namespace, 4); + + let cfg = Config { + scheme: schemes[0].clone(), + elector: ::default(), + epoch: Epoch::new(1), + activity_timeout: ViewDelta::new(10), + leader_timeout: Duration::from_secs(1), + certification_timeout: Duration::from_secs(2), + timeout_retry: Duration::from_secs(3), + }; + let mut state = State::new(context, cfg); + state.set_genesis(test_genesis()); + + let make_notarization = |view: View| { + let proposal = Proposal::new( + Rnd::new(Epoch::new(1), view), + GENESIS_VIEW, + Sha256Digest::from([view.get() as u8; 32]), + ); + let votes: Vec<_> = schemes + .iter() + .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) + .collect(); + Notarization::from_notarizes(&verifier, votes.iter(), &Sequential).unwrap() + }; + + let make_finalization = |view: View| { + let proposal = Proposal::new( + Rnd::new(Epoch::new(1), view), + GENESIS_VIEW, + Sha256Digest::from([view.get() as u8; 32]), + ); + let votes: Vec<_> = schemes + .iter() + .map(|scheme| Finalize::sign(scheme, proposal.clone()).unwrap()) + .collect(); + Finalization::from_finalizes(&verifier, votes.iter(), &Sequential).unwrap() + }; + + let stale_view = View::new(2); + let live_view = View::new(3); + + state.add_notarization(make_notarization(stale_view)); + state.add_notarization(make_notarization(live_view)); + state.add_finalization(make_finalization(stale_view)); + + // Reinsert a stale candidate to exercise the defensive finalized-view guard. + state.certification_candidates.insert(stale_view); + assert_eq!(state.last_finalized(), stale_view); + + // The stale round still looks certifiable without the finalized-view filter. 
+ assert!( + state + .views + .get_mut(&stale_view) + .expect("stale round must exist") + .try_certify() + .is_some() + ); + + let expected_am_leader = state + .leader_index(live_view) + .is_some_and(|leader| state.is_me(leader)); + let candidates = state.certify_candidates(); + assert_eq!(candidates.len(), 1); + assert_eq!(candidates[0].0.round.view(), live_view); + assert_eq!(candidates[0].1, expected_am_leader); + }); + } + #[test] fn nullification_keeps_notarization_as_certification_candidate() { let runtime = deterministic::Runner::default(); From 15ed6fdd68fcca61bdc1d33b43307317d31d5835 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Wed, 15 Apr 2026 08:21:48 -0700 Subject: [PATCH 030/107] fmt --- consensus/src/simplex/actors/voter/state.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/consensus/src/simplex/actors/voter/state.rs b/consensus/src/simplex/actors/voter/state.rs index aac6870212f..5fc5dba84cd 100644 --- a/consensus/src/simplex/actors/voter/state.rs +++ b/consensus/src/simplex/actors/voter/state.rs @@ -2072,14 +2072,12 @@ mod tests { assert_eq!(state.last_finalized(), stale_view); // The stale round still looks certifiable without the finalized-view filter. 
- assert!( - state - .views - .get_mut(&stale_view) - .expect("stale round must exist") - .try_certify() - .is_some() - ); + assert!(state + .views + .get_mut(&stale_view) + .expect("stale round must exist") + .try_certify() + .is_some()); let expected_am_leader = state .leader_index(live_view) From 714ece1944ecaccb5275f98226ce8354fb9d019a Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Wed, 15 Apr 2026 14:17:11 -0700 Subject: [PATCH 031/107] spike on comments --- consensus/src/simplex/actors/voter/actor.rs | 14 ++++++++++++-- consensus/src/simplex/actors/voter/state.rs | 10 ++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/consensus/src/simplex/actors/voter/actor.rs b/consensus/src/simplex/actors/voter/actor.rs index 900215d267f..081b8d2de73 100644 --- a/consensus/src/simplex/actors/voter/actor.rs +++ b/consensus/src/simplex/actors/voter/actor.rs @@ -859,8 +859,18 @@ impl< let view = round.view(); debug!(%view, "attempting certification"); let result = if am_leader { - // Once we know the local participant led this view, reaching out to the - // automaton is unnecessary. + // We led this view, so the proposal is ours and certification is trivially + // true. Skipping the automaton call avoids a redundant round-trip. + // + // INVARIANT: `am_leader` implies we proposed. To propose, we must have + // entered the view (via `enter_view`), which always sets the round's + // leader. So when `state::certify_candidates` reports `am_leader = true`, + // we have provably proposed for this view. The converse case where the + // round's leader is unknown (e.g. notarization arrived via resolver + // before the prior view's certificate set this view's leader) is reported + // as `am_leader = false` and falls through to `automaton.certify`, which + // is correct: we never entered the view, so we never proposed and don't + // hold the block locally. 
Either::Left(ready(Ok(true))) } else { let receiver = self.automaton.certify(round, proposal.payload).await; diff --git a/consensus/src/simplex/actors/voter/state.rs b/consensus/src/simplex/actors/voter/state.rs index 5fc5dba84cd..a32509b525b 100644 --- a/consensus/src/simplex/actors/voter/state.rs +++ b/consensus/src/simplex/actors/voter/state.rs @@ -587,6 +587,16 @@ impl, L: ElectorConfig, D: D /// Takes all certification candidates and returns proposals ready for /// certification, along with whether the local participant is the leader /// of each view (used to short-circuit certification for own proposals). + /// + /// The `am_leader` flag is `true` only when the round's leader has been + /// set AND matches the local participant. A round's leader is always set + /// before we can propose into it (proposing requires `enter_view`, which + /// sets the leader), so `am_leader = true` provably implies we proposed + /// the block. If the leader is unknown for this view (e.g. notarization + /// arrived via resolver/replay before the prior view's certificate set + /// this view's leader), `am_leader` is `false`: we never entered the + /// view, so the proposal cannot be ours and the caller must fall back to + /// the automaton to certify. 
pub fn certify_candidates(&mut self) -> Vec<(Proposal, bool)> { let candidates = take(&mut self.certification_candidates); let me = self.scheme.me(); From 6953ebb937ec236f14b44bc30fbf5faf7c1057d3 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Wed, 15 Apr 2026 17:13:02 -0700 Subject: [PATCH 032/107] spike on reliable storage --- consensus/src/marshal/coding/marshaled.rs | 8 +- consensus/src/marshal/coding/mod.rs | 121 +++++++++++++++++++++- consensus/src/marshal/core/actor.rs | 13 ++- consensus/src/marshal/core/mailbox.rs | 38 +++++-- consensus/src/marshal/mocks/harness.rs | 4 + consensus/src/marshal/mocks/verifying.rs | 14 ++- 6 files changed, 184 insertions(+), 14 deletions(-) diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index ef04a4146bb..c08d07021aa 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -964,7 +964,13 @@ where height = %block.height(), "requested broadcast of built block" ); - self.shards.proposed(round, block).await; + // Route through marshal so the proposer's own block is durably + // persisted before (or alongside) shard broadcast. The marshal + // actor caches the verified block and forwards to the shards + // engine via the Buffer impl. Without this, the block lives + // only in the shards in-memory cache and a crash before any + // verify-driven persistence loses it. + self.marshal.proposed(round, block).await; } Plan::Forward { .. 
} => { // Coding variant does not support targeted forwarding; diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index 5ec8cd89032..cb0856610d0 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -66,7 +66,7 @@ mod tests { use crate::{ marshal::{ coding::{ - types::{coding_config_for_participants, CodedBlock}, + types::{coding_config_for_participants, hash_context, CodedBlock}, Marshaled, MarshaledConfig, }, mocks::{ @@ -79,9 +79,9 @@ mod tests { verifying::MockVerifyingApp, }, }, - simplex::{scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::Proposal}, + simplex::{scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::Proposal, Plan}, types::{coding::Commitment, Epoch, Epocher, FixedEpocher, Height, Round, View}, - Automaton, CertifiableAutomaton, + Automaton, CertifiableAutomaton, CertifiableBlock, Relay, }; use commonware_codec::FixedSize; use commonware_coding::ReedSolomon; @@ -1595,4 +1595,119 @@ mod tests { assert!(rx.await.is_err()); }); } + + /// Regression: a proposer must be able to recover its own block after a + /// crash that occurs between `Marshaled::propose()` + `Relay::broadcast(Plan::Propose)` + /// and any verify-driven persistence. Without persisting on the broadcast + /// path, the block lives only in the in-memory shards cache and is lost + /// across restart. + #[test_traced("WARN")] + fn test_marshaled_proposed_block_persists_across_restart() { + let runner = deterministic::Runner::timed(Duration::from_secs(60)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let coding_config = coding_config_for_participants(NUM_VALIDATORS as u16); + + let setup = CodingHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let shards = setup.extra; + + let genesis_ctx = CodingCtx { + round: Round::zero(), + leader: default_leader(), + parent: (View::zero(), genesis_commitment()), + }; + let genesis = make_coding_block(genesis_ctx, Sha256::hash(b""), Height::zero(), 0); + + // Compute the genesis commitment as Marshaled would (mirrors the + // private `genesis_coding_commitment` helper in marshaled.rs). + let genesis_parent_commitment = Commitment::from(( + genesis.digest(), + genesis.digest(), + hash_context::(&genesis.context()), + harness::GENESIS_CODING_CONFIG, + )); + + // Build the block we want propose() to return. Its embedded context + // uses the proper genesis commitment so fetch_parent matches the + // cached genesis without going through the marshal subscription. 
+ let propose_round = Round::new(Epoch::zero(), View::new(1)); + let propose_context = CodingCtx { + round: propose_round, + leader: me.clone(), + parent: (View::zero(), genesis_parent_commitment), + }; + let block_to_propose = + make_coding_block(propose_context.clone(), genesis.digest(), Height::new(1), 100); + let block_digest = block_to_propose.digest(); + let expected_commitment = + CodedBlock::<_, ReedSolomon, Sha256>::new( + block_to_propose.clone(), + coding_config, + &Sequential, + ) + .commitment(); + + let mock_app: MockVerifyingApp = + MockVerifyingApp::new(genesis).with_propose_result(block_to_propose); + let cfg = MarshaledConfig { + application: mock_app, + marshal: marshal.clone(), + shards: shards.clone(), + scheme_provider: ConstantProvider::new(schemes[0].clone()), + epocher: FixedEpocher::new(BLOCKS_PER_EPOCH), + strategy: Sequential, + }; + let mut marshaled = Marshaled::new(context.clone(), cfg); + + // Drive the full leader-side propose + broadcast path. + let commitment = marshaled + .propose(propose_context) + .await + .await + .expect("propose should produce a commitment"); + assert_eq!(commitment, expected_commitment); + marshaled.broadcast(commitment, Plan::Propose).await; + + // Crash immediately. `broadcast` already awaited the actor's + // persistence ack via `marshal.proposed`, so no extra sleep is + // needed - the block must be on disk by now. + drop(marshaled); + drop(marshal); + drop(shards); + + let setup2 = CodingHarness::setup_validator( + context.with_label("validator_0_restart"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal2 = setup2.mailbox; + + // The proposer must recover its own block after restart. Without + // the broadcast-path persistence fix, the block lived only in the + // shards engine's in-memory cache and is now gone. 
+ let post_restart = marshal2.get_block(&block_digest).await; + assert!( + post_restart.is_some(), + "proposer should recover its own block after restart" + ); + }); + } } diff --git a/consensus/src/marshal/core/actor.rs b/consensus/src/marshal/core/actor.rs index e5ff4c0704b..4b29fc80acf 100644 --- a/consensus/src/marshal/core/actor.rs +++ b/consensus/src/marshal/core/actor.rs @@ -518,9 +518,13 @@ where }; response.send_lossy(info); } - Message::Proposed { round, block } => { + Message::Proposed { round, block, ack } => { + // Persist before acknowledging so the caller can rely on + // "voted ⟹ persisted" before broadcasting their own + // notarize vote on this proposal. self.cache_verified(round, block.digest(), block.clone()) .await; + ack.send_lossy(()); buffer.send(round, block, Recipients::All).await; } Message::Forward { @@ -538,8 +542,13 @@ where }; buffer.send(round, block, Recipients::Some(peers)).await; } - Message::Verified { round, block } => { + Message::Verified { round, block, ack } => { + // Persist before acknowledging so the caller (typically + // `Marshaled::deferred_verify`) can rely on + // "verify-ack ⟹ persisted", which in turn lets + // `certify` resolve true only after disk persistence. self.cache_verified(round, block.digest(), block).await; + ack.send_lossy(()); } Message::Notarization { notarization } => { let round = notarization.round(); diff --git a/consensus/src/marshal/core/mailbox.rs b/consensus/src/marshal/core/mailbox.rs index 9c2ff0b8728..414394756fa 100644 --- a/consensus/src/marshal/core/mailbox.rs +++ b/consensus/src/marshal/core/mailbox.rs @@ -86,11 +86,17 @@ pub(crate) enum Message { response: oneshot::Sender, }, /// A request to broadcast a proposed block to peers. + /// + /// The `ack` is signaled after the block has been durably persisted + /// (`cache.put_verified` returns) so callers can establish + /// "voted ⟹ persisted" before broadcasting their own notarize vote. 
Proposed { /// The round in which the block was proposed. round: Round, /// The block to broadcast. block: V::Block, + /// A channel signaled once the block is durably stored. + ack: oneshot::Sender<()>, }, /// A request to forward a block to a set of peers. Forward { @@ -102,11 +108,17 @@ pub(crate) enum Message { peers: Vec, }, /// A notification that a block has been verified by the application. + /// + /// The `ack` is signaled after the block has been durably persisted + /// (`cache.put_verified` returns) so callers can establish + /// "voted ⟹ persisted" before resolving consensus's certify task. Verified { /// The round in which the block was verified. round: Round, /// The verified block. block: V::Block, + /// A channel signaled once the block is durably stored. + ack: oneshot::Sender<()>, }, /// Sets the sync starting point (advances if higher than current). /// @@ -283,17 +295,31 @@ impl Mailbox { .map(|block| AncestorStream::new(self.clone(), [V::into_inner(block)])) } - /// Requests that a proposed block is sent to peers. + /// Requests that a proposed block is sent to peers, awaiting the actor's + /// confirmation that the block has been durably persisted before returning. + /// + /// This is a safety boundary: it ensures the proposer cannot vote notarize + /// on its own proposal before the block exists on disk. Returns silently + /// if the actor has shut down (block is then unrecoverable from this node, + /// matching the existing post-shutdown contract for fire-and-forget calls). pub async fn proposed(&self, round: Round, block: V::Block) { - self.sender - .send_lossy(Message::Proposed { round, block }) + let _ = self + .sender + .request(|ack| Message::Proposed { round, block, ack }) .await; } - /// Notifies the actor that a block has been verified. + /// Notifies the actor that a block has been verified, awaiting the actor's + /// confirmation that the block has been durably persisted before returning. 
+ /// + /// This is a safety boundary: it ensures consensus's certify task cannot + /// resolve true (and thus cannot drive a finalize vote) before the block + /// exists on disk for this validator. Returns silently if the actor has + /// shut down. pub async fn verified(&self, round: Round, block: V::Block) { - self.sender - .send_lossy(Message::Verified { round, block }) + let _ = self + .sender + .request(|ack| Message::Verified { round, block, ack }) .await; } diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index 19aef8c2b16..a17f0838375 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -165,6 +165,10 @@ pub struct ValidatorSetup { pub mailbox: Mailbox, pub extra: H::ValidatorExtra, pub height: Height, + /// Handle to the marshal actor task. Tests can call `.abort()` to simulate + /// an actor crash and verify that prior `marshal.verified()`/`proposed()` + /// calls have already been durably persisted. + pub actor_handle: commonware_runtime::Handle<()>, } /// Per-validator handle for test operations. diff --git a/consensus/src/marshal/mocks/verifying.rs b/consensus/src/marshal/mocks/verifying.rs index 0b392eb0c2e..ea01956e693 100644 --- a/consensus/src/marshal/mocks/verifying.rs +++ b/consensus/src/marshal/mocks/verifying.rs @@ -14,7 +14,7 @@ use commonware_runtime::deterministic; /// /// This mock: /// - Returns the provided genesis block from `genesis()` -/// - Returns `None` from `propose()` (never proposes) +/// - Returns the configured block (if any) from `propose()` /// - Returns a configurable result from `verify()` #[derive(Clone)] pub struct MockVerifyingApp { @@ -22,6 +22,8 @@ pub struct MockVerifyingApp { pub genesis: B, /// The result returned by `verify`. pub verify_result: bool, + /// The block returned by `propose`. If `None`, `propose` returns `None`. 
+ pub propose_result: Option, _phantom: std::marker::PhantomData, } @@ -31,6 +33,7 @@ impl MockVerifyingApp { Self { genesis, verify_result: true, + propose_result: None, _phantom: std::marker::PhantomData, } } @@ -40,9 +43,16 @@ impl MockVerifyingApp { Self { genesis, verify_result, + propose_result: None, _phantom: std::marker::PhantomData, } } + + /// Configure the block returned by `propose`. + pub fn with_propose_result(mut self, block: B) -> Self { + self.propose_result = Some(block); + self + } } impl crate::Application for MockVerifyingApp @@ -64,7 +74,7 @@ where _context: (deterministic::Context, Self::Context), _ancestry: AncestorStream, ) -> Option { - None + self.propose_result.clone() } } From abb69a213029b560339b60a34182e0c527c6eeb6 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Wed, 15 Apr 2026 17:21:22 -0700 Subject: [PATCH 033/107] more tests --- consensus/src/marshal/coding/mod.rs | 143 ++++++++++++++++++++++++- consensus/src/marshal/mocks/harness.rs | 10 +- 2 files changed, 148 insertions(+), 5 deletions(-) diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index cb0856610d0..3e9a67c4d0f 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -1596,6 +1596,140 @@ mod tests { }); } + /// Regression: a validator must not vote finalize on a block that is not + /// durably persisted. `certify` resolves true ⟹ block is on disk for + /// this validator. We assert this by aborting the marshal actor the + /// instant `certify` returns true; without the persist-before-certify + /// fix, the actor may have only had the `Verified` message enqueued (not + /// processed), and the block is lost on restart even though the validator + /// would have proceeded to broadcast a finalize vote. 
+ #[test_traced("WARN")] + fn test_marshaled_certify_persists_block_before_resolving() { + for seed in 0u64..16 { + certify_persists_block_before_resolving_at(seed); + } + } + + fn certify_persists_block_before_resolving_at(seed: u64) { + let runner = deterministic::Runner::new( + deterministic::Config::new() + .with_seed(seed) + .with_timeout(Some(Duration::from_secs(60))), + ); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let coding_config = coding_config_for_participants(NUM_VALIDATORS as u16); + + let setup = CodingHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let shards = setup.extra; + let actor_handle = setup.actor_handle; + + let genesis_ctx = CodingCtx { + round: Round::zero(), + leader: default_leader(), + parent: (View::zero(), genesis_commitment()), + }; + let genesis = make_coding_block(genesis_ctx, Sha256::hash(b""), Height::zero(), 0); + + // Push parent (height 1) and child (height 2) into the shards + // engine. These are reconstructable but NOT durably persisted. 
+ let parent_round = Round::new(Epoch::zero(), View::new(1)); + let parent_ctx = CodingCtx { + round: parent_round, + leader: default_leader(), + parent: (View::zero(), genesis_commitment()), + }; + let parent = make_coding_block(parent_ctx, genesis.digest(), Height::new(1), 100); + let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); + let parent_commitment = coded_parent.commitment(); + shards.clone().proposed(parent_round, coded_parent).await; + + let child_round = Round::new(Epoch::zero(), View::new(2)); + let child_ctx = CodingCtx { + round: child_round, + leader: me.clone(), + parent: (View::new(1), parent_commitment), + }; + let child = make_coding_block(child_ctx.clone(), parent.digest(), Height::new(2), 200); + let coded_child = CodedBlock::new(child.clone(), coding_config, &Sequential); + let child_commitment = coded_child.commitment(); + let child_digest = coded_child.digest(); + shards.clone().proposed(child_round, coded_child).await; + + context.sleep(Duration::from_millis(10)).await; + + let mock_app: MockVerifyingApp = MockVerifyingApp::new(genesis); + let cfg = MarshaledConfig { + application: mock_app, + marshal: marshal.clone(), + shards: shards.clone(), + scheme_provider: ConstantProvider::new(schemes[0].clone()), + epocher: FixedEpocher::new(BLOCKS_PER_EPOCH), + strategy: Sequential, + }; + let mut marshaled = Marshaled::new(context.clone(), cfg); + + // Optimistic verify - returns shard validity (true). + let shard_validity = marshaled + .verify(child_ctx, child_commitment) + .await + .await + .expect("verify result missing"); + assert!(shard_validity, "shard validity should pass"); + + // Certify - this is the safety gate before finalize voting. + let certify_result = marshaled + .certify(child_round, child_commitment) + .await + .await + .expect("certify result missing"); + assert!(certify_result, "certify should succeed"); + + // CRITICAL: abort the marshal actor synchronously, with no + // intervening await. 
If certify returned true but the actor had + // only enqueued (not processed) the `Verified` message, this + // abort kills the actor before persistence completes. + actor_handle.abort(); + drop(marshaled); + drop(marshal); + drop(shards); + + // Restart from the same partition. The block must be durably + // persisted - otherwise the validator would have voted finalize + // for a block it cannot serve from local storage. + let setup2 = CodingHarness::setup_validator( + context.with_label("validator_0_restart"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal2 = setup2.mailbox; + + let post_restart = marshal2.get_block(&child_digest).await; + assert!( + post_restart.is_some(), + "certify resolved true ⟹ block must be durably persisted" + ); + }); + } + /// Regression: a proposer must be able to recover its own block after a /// crash that occurs between `Marshaled::propose()` + `Relay::broadcast(Plan::Propose)` /// and any verify-driven persistence. Without persisting on the broadcast @@ -1626,6 +1760,7 @@ mod tests { .await; let marshal = setup.mailbox; let shards = setup.extra; + let actor_handle = setup.actor_handle; let genesis_ctx = CodingCtx { round: Round::zero(), @@ -1684,9 +1819,11 @@ mod tests { assert_eq!(commitment, expected_commitment); marshaled.broadcast(commitment, Plan::Propose).await; - // Crash immediately. `broadcast` already awaited the actor's - // persistence ack via `marshal.proposed`, so no extra sleep is - // needed - the block must be on disk by now. + // CRITICAL: abort the marshal actor synchronously so it cannot + // drain pending messages. `broadcast` must have already awaited + // the actor's persistence ack via `marshal.proposed`, so no extra + // sleep is needed - the block must be on disk by now. 
+ actor_handle.abort(); drop(marshaled); drop(marshal); drop(shards); diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index a17f0838375..13ef8ceb2f4 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -486,13 +486,14 @@ impl TestHarness for StandardHarness { config, ) .await; - actor.start(application.clone(), buffer, resolver); + let actor_handle = actor.start(application.clone(), buffer, resolver); ValidatorSetup { application, mailbox, extra: (), height, + actor_handle, } } @@ -704,6 +705,7 @@ impl TestHarness for InlineHarness { mailbox: setup.mailbox, extra: setup.extra, height: setup.height, + actor_handle: setup.actor_handle, } } @@ -729,6 +731,7 @@ impl TestHarness for InlineHarness { mailbox: setup.mailbox, extra: setup.extra, height: setup.height, + actor_handle: setup.actor_handle, } } @@ -890,6 +893,7 @@ impl TestHarness for DeferredHarness { mailbox: setup.mailbox, extra: setup.extra, height: setup.height, + actor_handle: setup.actor_handle, } } @@ -915,6 +919,7 @@ impl TestHarness for DeferredHarness { mailbox: setup.mailbox, extra: setup.extra, height: setup.height, + actor_handle: setup.actor_handle, } } @@ -1254,13 +1259,14 @@ impl TestHarness for CodingHarness { config, ) .await; - actor.start(application.clone(), shard_mailbox.clone(), resolver); + let actor_handle = actor.start(application.clone(), shard_mailbox.clone(), resolver); ValidatorSetup { application, mailbox, extra: shard_mailbox, height, + actor_handle, } } From 962507b3dfcd1f85295fe9ea14649895a14b06f2 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Wed, 15 Apr 2026 17:43:10 -0700 Subject: [PATCH 034/107] spike --- consensus/src/marshal/core/mailbox.rs | 33 ++++-- consensus/src/marshal/mocks/harness.rs | 25 ++-- consensus/src/marshal/standard/deferred.rs | 128 +++++++++++++++++++++ consensus/src/marshal/standard/inline.rs | 120 +++++++++++++++++++ 4 files changed, 284 insertions(+), 22 
deletions(-) diff --git a/consensus/src/marshal/core/mailbox.rs b/consensus/src/marshal/core/mailbox.rs index 414394756fa..02899f7b63d 100644 --- a/consensus/src/marshal/core/mailbox.rs +++ b/consensus/src/marshal/core/mailbox.rs @@ -299,14 +299,19 @@ impl Mailbox { /// confirmation that the block has been durably persisted before returning. /// /// This is a safety boundary: it ensures the proposer cannot vote notarize - /// on its own proposal before the block exists on disk. Returns silently - /// if the actor has shut down (block is then unrecoverable from this node, - /// matching the existing post-shutdown contract for fire-and-forget calls). + /// on its own proposal before the block exists on disk. + /// + /// # Panics + /// + /// Panics if the marshal actor has shut down before acknowledging + /// persistence. Returning silently here would let the caller proceed to + /// vote on a block that is not durably stored, which violates the + /// "voted ⟹ persisted" invariant the rest of the system relies on. pub async fn proposed(&self, round: Round, block: V::Block) { - let _ = self - .sender + self.sender .request(|ack| Message::Proposed { round, block, ack }) - .await; + .await + .expect("marshal actor dropped before acknowledging proposed block persistence"); } /// Notifies the actor that a block has been verified, awaiting the actor's @@ -314,13 +319,19 @@ impl Mailbox { /// /// This is a safety boundary: it ensures consensus's certify task cannot /// resolve true (and thus cannot drive a finalize vote) before the block - /// exists on disk for this validator. Returns silently if the actor has - /// shut down. + /// exists on disk for this validator. + /// + /// # Panics + /// + /// Panics if the marshal actor has shut down before acknowledging + /// persistence. Returning silently here would let `certify` resolve true + /// (driving a finalize vote) on a block that is not durably stored, + /// violating the "voted ⟹ persisted" invariant. 
pub async fn verified(&self, round: Round, block: V::Block) { - let _ = self - .sender + self.sender .request(|ack| Message::Verified { round, block, ack }) - .await; + .await + .expect("marshal actor dropped before acknowledging verified block persistence"); } /// Sets the sync starting point (advances if higher than current). diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index 13ef8ceb2f4..a5d5b04adb1 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -327,7 +327,10 @@ impl TestHarness for StandardHarness { type ApplicationBlock = B; type Variant = Standard; type TestBlock = B; - type ValidatorExtra = (); + /// Exposes the buffered broadcast mailbox so tests can seed in-memory + /// blocks directly (mirroring how a peer's broadcast would land in the + /// local buffer). + type ValidatorExtra = buffered::Mailbox; type Commitment = D; async fn setup_validator( @@ -486,12 +489,12 @@ impl TestHarness for StandardHarness { config, ) .await; - let actor_handle = actor.start(application.clone(), buffer, resolver); + let actor_handle = actor.start(application.clone(), buffer.clone(), resolver); ValidatorSetup { application, mailbox, - extra: (), + extra: buffer, height, actor_handle, } @@ -673,9 +676,9 @@ impl TestHarness for StandardHarness { ) .await; let application = Application::::default(); - actor.start(application.clone(), buffer, resolver); + actor.start(application.clone(), buffer.clone(), resolver); - (mailbox, (), application) + (mailbox, buffer, application) } async fn verify_for_prune(handle: &mut ValidatorHandle, round: Round, block: &B) { @@ -771,7 +774,7 @@ impl TestHarness for InlineHarness { StandardHarness::propose( &mut ValidatorHandle:: { mailbox: handle.mailbox.clone(), - extra: handle.extra, + extra: handle.extra.clone(), }, round, block, @@ -788,7 +791,7 @@ impl TestHarness for InlineHarness { StandardHarness::verify( &mut ValidatorHandle:: { mailbox: 
handle.mailbox.clone(), - extra: handle.extra, + extra: handle.extra.clone(), }, round, block, @@ -862,7 +865,7 @@ impl TestHarness for InlineHarness { StandardHarness::verify_for_prune( &mut ValidatorHandle:: { mailbox: handle.mailbox.clone(), - extra: handle.extra, + extra: handle.extra.clone(), }, round, block, @@ -959,7 +962,7 @@ impl TestHarness for DeferredHarness { InlineHarness::propose( &mut ValidatorHandle:: { mailbox: handle.mailbox.clone(), - extra: handle.extra, + extra: handle.extra.clone(), }, round, block, @@ -976,7 +979,7 @@ impl TestHarness for DeferredHarness { InlineHarness::verify( &mut ValidatorHandle:: { mailbox: handle.mailbox.clone(), - extra: handle.extra, + extra: handle.extra.clone(), }, round, block, @@ -1050,7 +1053,7 @@ impl TestHarness for DeferredHarness { InlineHarness::verify_for_prune( &mut ValidatorHandle:: { mailbox: handle.mailbox.clone(), - extra: handle.extra, + extra: handle.extra.clone(), }, round, block, diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index f16d18f23e7..5e0559c59cf 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -983,4 +983,132 @@ mod tests { } }) } + + /// Regression: a validator must not vote finalize on a block that is not + /// durably persisted. `certify` resolves true ⟹ block is on disk. + /// + /// To exercise the race we have to seed the parent and child via the + /// buffered broadcast layer (in-memory only) instead of `marshal.proposed`, + /// which already persists. Otherwise `marshal.verified` is just a no-op + /// re-write and the test cannot catch the pre-fix race. 
+ #[test_traced("WARN")] + fn test_certify_persists_block_before_resolving() { + for seed in 0u64..16 { + certify_persists_block_before_resolving_at(seed); + } + } + + fn certify_persists_block_before_resolving_at(seed: u64) { + use commonware_broadcast::Broadcaster; + let runner = deterministic::Runner::new( + deterministic::Config::new() + .with_seed(seed) + .with_timeout(Some(Duration::from_secs(60))), + ); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + + let setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let buffer = setup.extra; + let actor_handle = setup.actor_handle; + + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + let mock_app: MockVerifyingApp = MockVerifyingApp::new(genesis.clone()); + + let mut marshaled = Deferred::new( + context.clone(), + mock_app, + marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + // Build parent (height 1) and child (height 2). Seed both into + // the buffered broadcast cache (in-memory only), bypassing + // `marshal.proposed` which would already persist them. 
+ let parent = make_raw_block(genesis.digest(), Height::new(1), 100); + let parent_digest = parent.digest(); + + let child_round = Round::new(Epoch::zero(), View::new(2)); + let child_ctx = Ctx { + round: child_round, + leader: me.clone(), + parent: (View::new(1), parent_digest), + }; + let child = B::new::(child_ctx.clone(), parent_digest, Height::new(2), 200); + let child_digest = child.digest(); + + // Broadcast to no peers - this only inserts into the local + // buffer cache (mirrors the pre-fix in-memory-only state). + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), parent.clone()) + .await + .await + .expect("buffer broadcast for parent should ack"); + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), child.clone()) + .await + .await + .expect("buffer broadcast for child should ack"); + + // Optimistic verify: returns true after parent/child fetch from + // the buffer + ancestry validation + app verify. + let optimistic = marshaled + .verify(child_ctx, child_digest) + .await + .await + .expect("verify result missing"); + assert!(optimistic, "optimistic verify should pass"); + + // Certify - this is the safety gate before finalize voting. + let certify_result = marshaled + .certify(child_round, child_digest) + .await + .await + .expect("certify result missing"); + assert!(certify_result, "certify should succeed"); + + // CRITICAL: abort the marshal actor synchronously, with no + // intervening await. If certify returned true but the actor had + // only enqueued (not processed) the `Verified` message, this + // abort kills the actor before persistence completes. + actor_handle.abort(); + drop(marshaled); + drop(marshal); + drop(buffer); + + // Restart from the same partition. The block must be durably + // persisted - otherwise the validator would have voted finalize + // for a block it cannot serve from local storage. 
+ let setup2 = StandardHarness::setup_validator( + context.with_label("validator_0_restart"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal2 = setup2.mailbox; + + let post_restart = marshal2.get_block(&child_digest).await; + assert!( + post_restart.is_some(), + "certify resolved true ⟹ block must be durably persisted (seed={seed})" + ); + }); + } } diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 6c7cc8da2ae..0f6d0b1f803 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -725,4 +725,124 @@ mod tests { } }); } + + /// Regression: in inline mode, `verify` itself returns true after running + /// app verification. That return value drives the notarize vote, so it + /// must imply "block is durably persisted" -- otherwise a crash between + /// vote and persistence leaves the validator having voted for a block it + /// cannot serve. + /// + /// As with the deferred-mode test, the parent and child are seeded via + /// the buffered broadcast layer (in-memory only), bypassing + /// `marshal.proposed` which would already persist them. + #[test_traced("WARN")] + fn test_inline_verify_persists_block_before_resolving() { + for seed in 0u64..16 { + inline_verify_persists_block_before_resolving_at(seed); + } + } + + fn inline_verify_persists_block_before_resolving_at(seed: u64) { + use commonware_broadcast::Broadcaster; + let runner = deterministic::Runner::new( + deterministic::Config::new() + .with_seed(seed) + .with_timeout(Some(Duration::from_secs(60))), + ); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + + let setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let buffer = setup.extra; + let actor_handle = setup.actor_handle; + + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + let mock_app: MockVerifyingApp = MockVerifyingApp::new(genesis.clone()); + + let mut inline = Inline::new( + context.clone(), + mock_app, + marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + // Build parent (height 1) and child (height 2). Seed both into + // the buffered broadcast cache (in-memory only). + let parent = make_raw_block(genesis.digest(), Height::new(1), 100); + let parent_digest = parent.digest(); + + let child_round = Round::new(Epoch::zero(), View::new(2)); + let child_ctx = Ctx { + round: child_round, + leader: me.clone(), + parent: (View::new(1), parent_digest), + }; + let child = B::new::(child_ctx.clone(), parent_digest, Height::new(2), 200); + let child_digest = child.digest(); + + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), parent.clone()) + .await + .await + .expect("buffer broadcast for parent should ack"); + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), child.clone()) + .await + .await + .expect("buffer broadcast for child should ack"); + + // Inline verify runs full validation inline and returns true only + // after `marshal.verified` is enqueued. With the persistence-ack + // fix, that enqueue blocks until put_sync completes. 
+ let verify_result = inline + .verify(child_ctx, child_digest) + .await + .await + .expect("verify result missing"); + assert!(verify_result, "inline verify should pass"); + + // CRITICAL: abort the marshal actor synchronously, with no + // intervening await. If verify returned true but the actor had + // only enqueued (not processed) the `Verified` message, this + // abort kills the actor before persistence completes. + actor_handle.abort(); + drop(inline); + drop(marshal); + drop(buffer); + + // Restart from the same partition. The block must be durably + // persisted - otherwise the validator would have voted notarize + // for a block it cannot serve from local storage. + let setup2 = StandardHarness::setup_validator( + context.with_label("validator_0_restart"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal2 = setup2.mailbox; + + let post_restart = marshal2.get_block(&child_digest).await; + assert!( + post_restart.is_some(), + "verify resolved true ⟹ block must be durably persisted (seed={seed})" + ); + }); + } } From e2955a1ee7e85cacf2a2a30cdbbdfd5b2fdbad74 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Wed, 15 Apr 2026 17:49:22 -0700 Subject: [PATCH 035/107] spike --- consensus/src/marshal/coding/marshaled.rs | 60 ++++++++++++++---- consensus/src/marshal/core/mailbox.rs | 67 ++++++++++++++------ consensus/src/marshal/standard/deferred.rs | 24 ++++++- consensus/src/marshal/standard/inline.rs | 13 +++- consensus/src/marshal/standard/validation.rs | 26 ++++++-- 5 files changed, 149 insertions(+), 41 deletions(-) diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index c08d07021aa..39d3431a394 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -425,8 +425,17 @@ where }; timer.observe(); if application_valid { - // The block is only persisted at this point. 
- marshal.verified(round, block).await; + // The block is only persisted at this point. If the marshal + // actor is gone, do NOT signal certify-true: the block was + // not durably stored and consensus must not finalize-vote + // on it. + if marshal.verified(round, block).await.is_err() { + debug!( + ?round, + "marshal unavailable during verified ack; skipping certify resolution" + ); + return; + } } tx.send_lossy(application_valid); }); @@ -759,8 +768,17 @@ where } // Valid re-proposal. Notify the marshal and complete the - // verification task for `certify`. - marshal.verified(round, block).await; + // verification task for `certify`. If marshal is gone, do + // not signal certify-true: the block was not durably + // stored. + if marshal.verified(round, block).await.is_err() { + debug!( + ?round, + "marshal unavailable during re-proposal verified ack; \ + skipping certify resolution" + ); + return; + } task_tx.send_lossy(true); tx.send_lossy(true); }); @@ -897,8 +915,16 @@ where if is_reproposal { // NOTE: It is possible that, during crash recovery, we call // `marshal.verified` twice for the same block. That function is - // idempotent, so this is safe. - marshaled.marshal.verified(round, block).await; + // idempotent, so this is safe. If marshal is gone, do not + // signal certify-true: the block was not durably stored. + if marshaled.marshal.verified(round, block).await.is_err() { + debug!( + ?round, + "marshal unavailable during certify re-proposal verified ack; \ + skipping certify resolution" + ); + return; + } tx.send_lossy(true); return; } @@ -965,12 +991,22 @@ where "requested broadcast of built block" ); // Route through marshal so the proposer's own block is durably - // persisted before (or alongside) shard broadcast. The marshal - // actor caches the verified block and forwards to the shards - // engine via the Buffer impl. 
Without this, the block lives - // only in the shards in-memory cache and a crash before any - // verify-driven persistence loses it. - self.marshal.proposed(round, block).await; + // persisted before shard broadcast. The marshal actor caches + // the verified block and forwards to the shards engine via + // the Buffer impl. Without this, the block would live only in + // the shards in-memory cache and a crash before any + // verify-driven persistence would lose it. + // + // If marshal is unavailable (graceful shutdown), log and + // skip: consensus is also being torn down and the local vote + // for this proposal must not proceed without persistence. + if self.marshal.proposed(round, block).await.is_err() { + warn!( + ?round, + ?commitment, + "marshal unavailable during proposed broadcast; block not persisted" + ); + } } Plan::Forward { .. } => { // Coding variant does not support targeted forwarding; diff --git a/consensus/src/marshal/core/mailbox.rs b/consensus/src/marshal/core/mailbox.rs index 02899f7b63d..c4ac020dded 100644 --- a/consensus/src/marshal/core/mailbox.rs +++ b/consensus/src/marshal/core/mailbox.rs @@ -13,6 +13,22 @@ use commonware_utils::{ channel::{fallible::AsyncFallibleExt, mpsc, oneshot}, vec::NonEmptyVec, }; +use std::fmt; + +/// Returned by [Mailbox::verified] / [Mailbox::proposed] when the marshal actor +/// is no longer running (typical during graceful shutdown). Callers MUST +/// treat this as "persistence not confirmed" and avoid signaling consensus +/// that the block is good for voting. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct MarshalUnavailable; + +impl fmt::Display for MarshalUnavailable { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("marshal actor unavailable; persistence not confirmed") + } +} + +impl std::error::Error for MarshalUnavailable {} /// Messages sent to the marshal [Actor](super::Actor). 
/// @@ -298,40 +314,49 @@ impl Mailbox { /// Requests that a proposed block is sent to peers, awaiting the actor's /// confirmation that the block has been durably persisted before returning. /// - /// This is a safety boundary: it ensures the proposer cannot vote notarize - /// on its own proposal before the block exists on disk. - /// - /// # Panics + /// This is a safety boundary: callers may rely on `Ok(())` meaning the + /// block exists on disk. They MUST NOT proceed past `Err(_)` to a place + /// where they would vote on the block, since persistence was not + /// confirmed. /// - /// Panics if the marshal actor has shut down before acknowledging - /// persistence. Returning silently here would let the caller proceed to - /// vote on a block that is not durably stored, which violates the - /// "voted ⟹ persisted" invariant the rest of the system relies on. - pub async fn proposed(&self, round: Round, block: V::Block) { + /// Returns `Err(MarshalUnavailable)` if the marshal actor has shut down + /// (typical during graceful shutdown, where the spawned task should just + /// exit silently). + #[must_use = "callers must not proceed to vote on an unpersisted block"] + pub async fn proposed( + &self, + round: Round, + block: V::Block, + ) -> Result<(), MarshalUnavailable> { self.sender .request(|ack| Message::Proposed { round, block, ack }) .await - .expect("marshal actor dropped before acknowledging proposed block persistence"); + .map(|()| ()) + .ok_or(MarshalUnavailable) } /// Notifies the actor that a block has been verified, awaiting the actor's /// confirmation that the block has been durably persisted before returning. /// - /// This is a safety boundary: it ensures consensus's certify task cannot - /// resolve true (and thus cannot drive a finalize vote) before the block - /// exists on disk for this validator. - /// - /// # Panics + /// This is a safety boundary: callers may rely on `Ok(())` meaning the + /// block exists on disk. 
They MUST NOT proceed past `Err(_)` to a place + /// where they would resolve consensus's certify task as true (which + /// would drive a finalize vote) since persistence was not confirmed. /// - /// Panics if the marshal actor has shut down before acknowledging - /// persistence. Returning silently here would let `certify` resolve true - /// (driving a finalize vote) on a block that is not durably stored, - /// violating the "voted ⟹ persisted" invariant. - pub async fn verified(&self, round: Round, block: V::Block) { + /// Returns `Err(MarshalUnavailable)` if the marshal actor has shut down + /// (typical during graceful shutdown, where the spawned task should just + /// exit silently). + #[must_use = "callers must not proceed to certify true on an unpersisted block"] + pub async fn verified( + &self, + round: Round, + block: V::Block, + ) -> Result<(), MarshalUnavailable> { self.sender .request(|ack| Message::Verified { round, block, ack }) .await - .expect("marshal actor dropped before acknowledging verified block persistence"); + .map(|()| ()) + .ok_or(MarshalUnavailable) } /// Sets the sync starting point (advances if higher than current). diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index 5e0559c59cf..16c271309c0 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -474,6 +474,11 @@ where return; } Decision::Continue(block) => block, + Decision::Aborted => { + // Persistence not confirmed (marshal shut down). Exit + // silently rather than signal a verdict to consensus. + return; + } }; // Before casting a notarize vote, ensure the block's embedded context matches @@ -582,7 +587,16 @@ where if is_reproposal { // NOTE: It is possible that, during crash recovery, we call `marshal.verified` // twice for the same block. That function is idempotent, so this is safe. 
- marshaled.marshal.verified(round, block).await; + // If marshal is gone, do not signal certify-true: the block was not durably + // stored. + if marshaled.marshal.verified(round, block).await.is_err() { + debug!( + ?round, + "marshal unavailable during certify re-proposal verified ack; \ + skipping certify resolution" + ); + return; + } tx.send_lossy(true); return; } @@ -630,7 +644,13 @@ where height = %block.height(), "requested broadcast of built block" ); - self.marshal.proposed(round, block).await; + if self.marshal.proposed(round, block).await.is_err() { + warn!( + ?round, + ?digest, + "marshal unavailable during proposed broadcast; block not persisted" + ); + } } Plan::Forward { round, peers } => { self.marshal.forward(round, digest, peers).await; diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 0f6d0b1f803..6d9091f748c 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -396,6 +396,11 @@ where return; } Decision::Continue(block) => block, + Decision::Aborted => { + // Persistence not confirmed (marshal shut down). Exit + // silently rather than signal a verdict to consensus. + return; + } }; // Non-reproposal path: fetch expected parent, validate ancestry, then @@ -496,7 +501,13 @@ where ); return; } - self.marshal.proposed(round, block).await; + if self.marshal.proposed(round, block).await.is_err() { + warn!( + ?round, + ?digest, + "marshal unavailable during proposed broadcast; block not persisted" + ); + } } Plan::Forward { round, peers } => { self.marshal.forward(round, digest, peers).await; diff --git a/consensus/src/marshal/standard/validation.rs b/consensus/src/marshal/standard/validation.rs index d53a3211514..f315bf561fb 100644 --- a/consensus/src/marshal/standard/validation.rs +++ b/consensus/src/marshal/standard/validation.rs @@ -51,11 +51,14 @@ where /// Result of the shared epoch / re-proposal pre-check step. 
/// -/// `Complete(valid)` indicates verification can terminate immediately with `valid`. -/// `Continue(block)` indicates full parent + application verification should continue. +/// - `Complete(valid)`: verification can terminate immediately with `valid`. +/// - `Continue(block)`: full parent + application verification should continue. +/// - `Aborted`: persistence could not be confirmed (marshal actor gone) and +/// the caller MUST exit silently rather than signal a verdict to consensus. pub(super) enum Decision { Complete(bool), Continue(B), + Aborted, } /// Performs shared pre-checks used by both inline and deferred verification paths. @@ -100,7 +103,13 @@ where return Decision::Complete(false); } - marshal.verified(context.round, block).await; + if marshal.verified(context.round, block).await.is_err() { + debug!( + round = ?context.round, + "marshal unavailable during re-proposal verified ack; aborting verify" + ); + return Decision::Aborted; + } return Decision::Complete(true); } @@ -199,8 +208,15 @@ where valid = validity_request => valid, }; - if application_valid { - marshal.verified(context.round, block).await; + if application_valid && marshal.verified(context.round, block).await.is_err() { + debug!( + round = ?context.round, + "marshal unavailable during verified ack; aborting verify" + ); + // Persistence not confirmed: caller MUST NOT signal a positive + // verdict to consensus. Returning `None` causes verify to exit + // silently without firing tx. 
+ return None; } Some(application_valid) } From 852257268b0c1323ca4ca2dad08114f6c936d6e6 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Wed, 15 Apr 2026 17:56:28 -0700 Subject: [PATCH 036/107] cleanup --- consensus/src/marshal/coding/mod.rs | 20 ++++++++-------- consensus/src/marshal/mocks/harness.rs | 12 +++++----- consensus/src/marshal/standard/deferred.rs | 14 +++++------ consensus/src/marshal/standard/inline.rs | 8 +++---- consensus/src/marshal/standard/mod.rs | 28 +++++++++++----------- 5 files changed, 41 insertions(+), 41 deletions(-) diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index 3e9a67c4d0f..6b295cdedaf 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -305,7 +305,7 @@ mod tests { let block_a = make_coding_block(context_a.clone(), parent_digest, Height::new(2), 200); let coded_block_a = CodedBlock::new(block_a.clone(), coding_config, &Sequential); let commitment_a = coded_block_a.commitment(); - shards.clone().proposed(round_a, coded_block_a).await; + let _ = shards.clone().proposed(round_a, coded_block_a).await; // Block B at view 10 (height 2, different block same height - could happen with // different proposers or re-proposals) @@ -318,7 +318,7 @@ mod tests { let block_b = make_coding_block(context_b.clone(), parent_digest, Height::new(2), 300); let coded_block_b = CodedBlock::new(block_b.clone(), coding_config, &Sequential); let commitment_b = coded_block_b.commitment(); - shards.clone().proposed(round_b, coded_block_b).await; + let _ = shards.clone().proposed(round_b, coded_block_b).await; context.sleep(Duration::from_millis(10)).await; @@ -417,7 +417,7 @@ mod tests { let block = make_coding_block(ctx.clone(), parent, Height::new(i), i * 100); let coded_block = CodedBlock::new(block.clone(), coding_config, &Sequential); last_commitment = coded_block.commitment(); - shards.clone().proposed(round, coded_block).await; + let _ = shards.clone().proposed(round, 
coded_block).await; parent = block.digest(); last_view = View::new(i); } @@ -1269,7 +1269,7 @@ mod tests { let parent = make_coding_block(parent_ctx, genesis.digest(), Height::new(1), 100); let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); - shards.clone().proposed(parent_round, coded_parent).await; + let _ = shards.clone().proposed(parent_round, coded_parent).await; // Create child at height 2. let child_round = Round::new(Epoch::zero(), View::new(2)); @@ -1281,7 +1281,7 @@ mod tests { let child = make_coding_block(child_ctx, parent.digest(), Height::new(2), 200); let coded_child = CodedBlock::new(child, coding_config, &Sequential); let child_commitment = coded_child.commitment(); - shards.clone().proposed(child_round, coded_child).await; + let _ = shards.clone().proposed(child_round, coded_child).await; context.sleep(Duration::from_millis(10)).await; @@ -1390,7 +1390,7 @@ mod tests { let parent = make_coding_block(parent_context, genesis.digest(), Height::new(1), 100); let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); - shards.clone().proposed(parent_round, coded_parent).await; + let _ = shards.clone().proposed(parent_round, coded_parent).await; // 3) Publish a valid child so optimistic verify can succeed. let round = Round::new(Epoch::zero(), View::new(2)); @@ -1403,7 +1403,7 @@ mod tests { make_coding_block(verify_context.clone(), parent.digest(), Height::new(2), 200); let coded_block = CodedBlock::new(block, coding_config, &Sequential); let commitment = coded_block.commitment(); - shards.clone().proposed(round, coded_block).await; + let _ = shards.clone().proposed(round, coded_block).await; context.sleep(Duration::from_millis(10)).await; @@ -1498,7 +1498,7 @@ mod tests { // Validator 1 proposes coded_block_b (same inner block, different coding). // This stores it in v1's shard engine and actor cache. 
- v1_mailbox.proposed(round1, coded_block_b.clone()).await; + let _ = v1_mailbox.proposed(round1, coded_block_b.clone()).await; context.sleep(Duration::from_millis(100)).await; // Create finalization referencing commitment_a (the "correct" commitment). @@ -1658,7 +1658,7 @@ mod tests { let parent = make_coding_block(parent_ctx, genesis.digest(), Height::new(1), 100); let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); - shards.clone().proposed(parent_round, coded_parent).await; + let _ = shards.clone().proposed(parent_round, coded_parent).await; let child_round = Round::new(Epoch::zero(), View::new(2)); let child_ctx = CodingCtx { @@ -1670,7 +1670,7 @@ mod tests { let coded_child = CodedBlock::new(child.clone(), coding_config, &Sequential); let child_commitment = coded_child.commitment(); let child_digest = coded_child.digest(); - shards.clone().proposed(child_round, coded_child).await; + let _ = shards.clone().proposed(child_round, coded_child).await; context.sleep(Duration::from_millis(10)).await; diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index a5d5b04adb1..fb05a65f823 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -527,7 +527,7 @@ impl TestHarness for StandardHarness { } async fn propose(handle: &mut ValidatorHandle, round: Round, block: &B) { - handle.mailbox.proposed(round, block.clone()).await; + let _ = handle.mailbox.proposed(round, block.clone()).await; } async fn verify( @@ -536,7 +536,7 @@ impl TestHarness for StandardHarness { block: &B, _all_handles: &mut [ValidatorHandle], ) { - handle.mailbox.verified(round, block.clone()).await; + let _ = handle.mailbox.verified(round, block.clone()).await; } fn make_finalization(proposal: Proposal, schemes: &[S], quorum: u32) -> Finalization { @@ -682,7 +682,7 @@ impl TestHarness for StandardHarness { } async fn 
verify_for_prune(handle: &mut ValidatorHandle, round: Round, block: &B) { - handle.mailbox.verified(round, block.clone()).await; + let _ = handle.mailbox.verified(round, block.clone()).await; } } @@ -1315,7 +1315,7 @@ impl TestHarness for CodingHarness { round: Round, block: &CodedBlock, Sha256>, ) { - handle.mailbox.proposed(round, block.clone()).await; + let _ = handle.mailbox.proposed(round, block.clone()).await; } async fn verify( @@ -1324,7 +1324,7 @@ impl TestHarness for CodingHarness { block: &CodedBlock, Sha256>, _all_handles: &mut [ValidatorHandle], ) { - handle.mailbox.verified(round, block.clone()).await; + let _ = handle.mailbox.verified(round, block.clone()).await; } fn make_finalization( @@ -1487,7 +1487,7 @@ impl TestHarness for CodingHarness { round: Round, block: &CodedBlock, Sha256>, ) { - handle.mailbox.verified(round, block.clone()).await; + let _ = handle.mailbox.verified(round, block.clone()).await; } } diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index 16c271309c0..d6f5e34a6b7 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -742,7 +742,7 @@ mod tests { // Create parent block at height 1 let parent = make_raw_block(genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - marshal + let _ = marshal .clone() .proposed(Round::new(Epoch::new(0), View::new(1)), parent.clone()) .await; @@ -756,7 +756,7 @@ mod tests { }; let block_a = B::new::(context_a.clone(), parent_digest, Height::new(2), 200); let commitment_a = block_a.digest(); - marshal.clone().proposed(round_a, block_a.clone()).await; + let _ = marshal.clone().proposed(round_a, block_a.clone()).await; // Block B at view 10 (height 2, different block same height) let round_b = Round::new(Epoch::new(0), View::new(10)); @@ -767,7 +767,7 @@ mod tests { }; let block_b = B::new::(context_b.clone(), parent_digest, Height::new(2), 300); let commitment_b = 
block_b.digest(); - marshal.clone().proposed(round_b, block_b.clone()).await; + let _ = marshal.clone().proposed(round_b, block_b.clone()).await; context.sleep(Duration::from_millis(10)).await; @@ -874,7 +874,7 @@ mod tests { let parent = B::new::(parent_ctx.clone(), genesis.digest(), Height::new(19), 1000); let parent_digest = parent.digest(); - marshal + let _ = marshal .clone() .proposed(Round::new(Epoch::zero(), View::new(19)), parent.clone()) .await; @@ -893,7 +893,7 @@ mod tests { 2000, ); let block_commitment = block.digest(); - marshal + let _ = marshal .clone() .proposed(unsupported_round, block.clone()) .await; @@ -963,7 +963,7 @@ mod tests { }; let parent = B::new::(parent_ctx, genesis.digest(), Height::new(1), 100); let parent_commitment = parent.digest(); - marshal + let _ = marshal .clone() .proposed(Round::new(Epoch::zero(), View::new(1)), parent.clone()) .await; @@ -977,7 +977,7 @@ mod tests { }; let block_a = B::new::(context_a, parent.digest(), Height::new(2), 200); let commitment_a = block_a.digest(); - marshal.clone().proposed(round_a, block_a).await; + let _ = marshal.clone().proposed(round_a, block_a).await; context.sleep(Duration::from_millis(10)).await; diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 6d9091f748c..c2ba36fbbcf 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -629,7 +629,7 @@ mod tests { }; let parent = B::new::(parent_ctx, genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - marshal.clone().proposed(parent_round, parent).await; + let _ = marshal.clone().proposed(parent_round, parent).await; let round = Round::new(Epoch::zero(), View::new(2)); let verify_context = Ctx { @@ -640,7 +640,7 @@ mod tests { let block = B::new::(verify_context.clone(), parent_digest, Height::new(2), 200); let digest = block.digest(); - marshal.clone().proposed(round, block).await; + let _ = 
marshal.clone().proposed(round, block).await; // Complete verify first so the block is already available locally. let verify_rx = inline.verify(verify_context, digest).await; @@ -707,7 +707,7 @@ mod tests { }; let parent = B::new::(parent_ctx, genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - marshal.clone().proposed(parent_round, parent).await; + let _ = marshal.clone().proposed(parent_round, parent).await; let round = Round::new(Epoch::zero(), View::new(2)); let verify_context = Ctx { @@ -718,7 +718,7 @@ mod tests { let block = B::new::(verify_context.clone(), parent_digest, Height::new(2), 200); let digest = block.digest(); - marshal.clone().proposed(round, block).await; + let _ = marshal.clone().proposed(round, block).await; // Certify should still resolve by waiting on marshal block availability directly. let certify_rx = inline.certify(round, digest).await; diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index 9162c1dcba9..cdf4a83e6b0 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -412,10 +412,10 @@ mod tests { ) .await .mailbox; - peer_mailbox + let _ = peer_mailbox .proposed(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) .await; - peer_mailbox + let _ = peer_mailbox .proposed(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) .await; StandardHarness::report_finalization(&mut peer_mailbox, finalization_two.clone()).await; @@ -505,13 +505,13 @@ mod tests { ) .await .mailbox; - peer_mailbox + let _ = peer_mailbox .proposed(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) .await; - peer_mailbox + let _ = peer_mailbox .proposed(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) .await; - peer_mailbox + let _ = peer_mailbox .proposed(Round::new(Epoch::zero(), View::new(3)), block_three.clone()) .await; StandardHarness::report_finalization(&mut peer_mailbox, finalization_two.clone()).await; @@ -691,7 
+691,7 @@ mod tests { .await .mailbox; for (i, block) in blocks.iter().enumerate() { - peer_mailbox + let _ = peer_mailbox .proposed( Round::new(Epoch::zero(), View::new(block.height().get())), (*block).clone(), @@ -1120,7 +1120,7 @@ mod tests { 1900, ); let boundary_digest = boundary_block.digest(); - marshal + let _ = marshal .clone() .proposed(boundary_round, boundary_block.clone()) .await; @@ -1190,7 +1190,7 @@ mod tests { 1900, ); let boundary_digest = boundary_block.digest(); - marshal + let _ = marshal .clone() .proposed(boundary_round, boundary_block) .await; @@ -1227,7 +1227,7 @@ mod tests { 1000, ); let non_boundary_digest = non_boundary_block.digest(); - marshal + let _ = marshal .clone() .proposed(non_boundary_round, non_boundary_block) .await; @@ -1330,7 +1330,7 @@ mod tests { 200, ); let malformed_digest = malformed_block.digest(); - marshal + let _ = marshal .clone() .proposed(malformed_round, malformed_block) .await; @@ -1371,7 +1371,7 @@ mod tests { let parent = B::new::(parent_context, genesis.digest(), Height::new(1), 300); let parent_digest = parent.digest(); - marshal.clone().proposed(parent_round, parent).await; + let _ = marshal.clone().proposed(parent_round, parent).await; let mismatch_round = Round::new(Epoch::zero(), View::new(3)); let mismatched_context = Ctx { @@ -1386,7 +1386,7 @@ mod tests { 400, ); let mismatched_digest = mismatched_block.digest(); - marshal + let _ = marshal .clone() .proposed(mismatch_round, mismatched_block) .await; @@ -1462,7 +1462,7 @@ mod tests { }; let parent = B::new::(parent_context, genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - marshal.clone().proposed(parent_round, parent).await; + let _ = marshal.clone().proposed(parent_round, parent).await; // 2) Publish a valid child; only application-level verification should fail. 
let round = Round::new(Epoch::zero(), View::new(2)); @@ -1473,7 +1473,7 @@ mod tests { }; let block = B::new::(verify_context.clone(), parent_digest, Height::new(2), 200); let digest = block.digest(); - marshal.clone().proposed(round, block).await; + let _ = marshal.clone().proposed(round, block).await; context.sleep(Duration::from_millis(10)).await; From 7c9fa8ccf7b9e20aabd61aac592a0d14ec6f5f26 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Wed, 15 Apr 2026 21:59:19 -0700 Subject: [PATCH 037/107] fmt --- consensus/src/marshal/coding/mod.rs | 25 +++++++++++++++---------- consensus/src/marshal/core/mailbox.rs | 12 ++---------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index 6b295cdedaf..61820bce51c 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -79,7 +79,9 @@ mod tests { verifying::MockVerifyingApp, }, }, - simplex::{scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::Proposal, Plan}, + simplex::{ + scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::Proposal, Plan, + }, types::{coding::Commitment, Epoch, Epocher, FixedEpocher, Height, Round, View}, Automaton, CertifiableAutomaton, CertifiableBlock, Relay, }; @@ -1787,16 +1789,19 @@ mod tests { leader: me.clone(), parent: (View::zero(), genesis_parent_commitment), }; - let block_to_propose = - make_coding_block(propose_context.clone(), genesis.digest(), Height::new(1), 100); + let block_to_propose = make_coding_block( + propose_context.clone(), + genesis.digest(), + Height::new(1), + 100, + ); let block_digest = block_to_propose.digest(); - let expected_commitment = - CodedBlock::<_, ReedSolomon, Sha256>::new( - block_to_propose.clone(), - coding_config, - &Sequential, - ) - .commitment(); + let expected_commitment = CodedBlock::<_, ReedSolomon, Sha256>::new( + block_to_propose.clone(), + coding_config, + &Sequential, + ) + .commitment(); let 
mock_app: MockVerifyingApp = MockVerifyingApp::new(genesis).with_propose_result(block_to_propose); diff --git a/consensus/src/marshal/core/mailbox.rs b/consensus/src/marshal/core/mailbox.rs index c4ac020dded..8b7422f7f3c 100644 --- a/consensus/src/marshal/core/mailbox.rs +++ b/consensus/src/marshal/core/mailbox.rs @@ -323,11 +323,7 @@ impl Mailbox { /// (typical during graceful shutdown, where the spawned task should just /// exit silently). #[must_use = "callers must not proceed to vote on an unpersisted block"] - pub async fn proposed( - &self, - round: Round, - block: V::Block, - ) -> Result<(), MarshalUnavailable> { + pub async fn proposed(&self, round: Round, block: V::Block) -> Result<(), MarshalUnavailable> { self.sender .request(|ack| Message::Proposed { round, block, ack }) .await @@ -347,11 +343,7 @@ impl Mailbox { /// (typical during graceful shutdown, where the spawned task should just /// exit silently). #[must_use = "callers must not proceed to certify true on an unpersisted block"] - pub async fn verified( - &self, - round: Round, - block: V::Block, - ) -> Result<(), MarshalUnavailable> { + pub async fn verified(&self, round: Round, block: V::Block) -> Result<(), MarshalUnavailable> { self.sender .request(|ack| Message::Verified { round, block, ack }) .await From 0db37a154ff99d7738c3ba200875607cc321a991 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Wed, 15 Apr 2026 22:14:31 -0700 Subject: [PATCH 038/107] simplify --- consensus/src/marshal/coding/marshaled.rs | 8 ++-- consensus/src/marshal/core/mailbox.rs | 50 +++++--------------- consensus/src/marshal/standard/deferred.rs | 4 +- consensus/src/marshal/standard/inline.rs | 2 +- consensus/src/marshal/standard/validation.rs | 4 +- 5 files changed, 21 insertions(+), 47 deletions(-) diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index 39d3431a394..8bbe8525763 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ 
b/consensus/src/marshal/coding/marshaled.rs @@ -429,7 +429,7 @@ where // actor is gone, do NOT signal certify-true: the block was // not durably stored and consensus must not finalize-vote // on it. - if marshal.verified(round, block).await.is_err() { + if !marshal.verified(round, block).await { debug!( ?round, "marshal unavailable during verified ack; skipping certify resolution" @@ -771,7 +771,7 @@ where // verification task for `certify`. If marshal is gone, do // not signal certify-true: the block was not durably // stored. - if marshal.verified(round, block).await.is_err() { + if !marshal.verified(round, block).await { debug!( ?round, "marshal unavailable during re-proposal verified ack; \ @@ -917,7 +917,7 @@ where // `marshal.verified` twice for the same block. That function is // idempotent, so this is safe. If marshal is gone, do not // signal certify-true: the block was not durably stored. - if marshaled.marshal.verified(round, block).await.is_err() { + if !marshaled.marshal.verified(round, block).await { debug!( ?round, "marshal unavailable during certify re-proposal verified ack; \ @@ -1000,7 +1000,7 @@ where // If marshal is unavailable (graceful shutdown), log and // skip: consensus is also being torn down and the local vote // for this proposal must not proceed without persistence. - if self.marshal.proposed(round, block).await.is_err() { + if !self.marshal.proposed(round, block).await { warn!( ?round, ?commitment, diff --git a/consensus/src/marshal/core/mailbox.rs b/consensus/src/marshal/core/mailbox.rs index 8b7422f7f3c..6df3b08314a 100644 --- a/consensus/src/marshal/core/mailbox.rs +++ b/consensus/src/marshal/core/mailbox.rs @@ -13,22 +13,6 @@ use commonware_utils::{ channel::{fallible::AsyncFallibleExt, mpsc, oneshot}, vec::NonEmptyVec, }; -use std::fmt; - -/// Returned by [Mailbox::verified] / [Mailbox::proposed] when the marshal actor -/// is no longer running (typical during graceful shutdown). 
Callers MUST -/// treat this as "persistence not confirmed" and avoid signaling consensus -/// that the block is good for voting. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct MarshalUnavailable; - -impl fmt::Display for MarshalUnavailable { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("marshal actor unavailable; persistence not confirmed") - } -} - -impl std::error::Error for MarshalUnavailable {} /// Messages sent to the marshal [Actor](super::Actor). /// @@ -314,41 +298,31 @@ impl Mailbox { /// Requests that a proposed block is sent to peers, awaiting the actor's /// confirmation that the block has been durably persisted before returning. /// - /// This is a safety boundary: callers may rely on `Ok(())` meaning the - /// block exists on disk. They MUST NOT proceed past `Err(_)` to a place - /// where they would vote on the block, since persistence was not - /// confirmed. - /// - /// Returns `Err(MarshalUnavailable)` if the marshal actor has shut down - /// (typical during graceful shutdown, where the spawned task should just - /// exit silently). + /// Returns `true` once the actor has completed `put_sync`. Returns `false` + /// if the marshal actor has shut down before acknowledging (typical during + /// graceful shutdown). Callers MUST NOT proceed to vote when this returns + /// `false` -- the block is not durably stored. #[must_use = "callers must not proceed to vote on an unpersisted block"] - pub async fn proposed(&self, round: Round, block: V::Block) -> Result<(), MarshalUnavailable> { + pub async fn proposed(&self, round: Round, block: V::Block) -> bool { self.sender .request(|ack| Message::Proposed { round, block, ack }) .await - .map(|()| ()) - .ok_or(MarshalUnavailable) + .is_some() } /// Notifies the actor that a block has been verified, awaiting the actor's /// confirmation that the block has been durably persisted before returning. 
/// - /// This is a safety boundary: callers may rely on `Ok(())` meaning the - /// block exists on disk. They MUST NOT proceed past `Err(_)` to a place - /// where they would resolve consensus's certify task as true (which - /// would drive a finalize vote) since persistence was not confirmed. - /// - /// Returns `Err(MarshalUnavailable)` if the marshal actor has shut down - /// (typical during graceful shutdown, where the spawned task should just - /// exit silently). + /// Returns `true` once the actor has completed `put_sync`. Returns `false` + /// if the marshal actor has shut down before acknowledging. Callers MUST + /// NOT proceed to resolve consensus's certify task as `true` (which would + /// drive a finalize vote) when this returns `false`. #[must_use = "callers must not proceed to certify true on an unpersisted block"] - pub async fn verified(&self, round: Round, block: V::Block) -> Result<(), MarshalUnavailable> { + pub async fn verified(&self, round: Round, block: V::Block) -> bool { self.sender .request(|ack| Message::Verified { round, block, ack }) .await - .map(|()| ()) - .ok_or(MarshalUnavailable) + .is_some() } /// Sets the sync starting point (advances if higher than current). diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index d6f5e34a6b7..a3d194770db 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -589,7 +589,7 @@ where // twice for the same block. That function is idempotent, so this is safe. // If marshal is gone, do not signal certify-true: the block was not durably // stored. 
- if marshaled.marshal.verified(round, block).await.is_err() { + if !marshaled.marshal.verified(round, block).await { debug!( ?round, "marshal unavailable during certify re-proposal verified ack; \ @@ -644,7 +644,7 @@ where height = %block.height(), "requested broadcast of built block" ); - if self.marshal.proposed(round, block).await.is_err() { + if !self.marshal.proposed(round, block).await { warn!( ?round, ?digest, diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index c2ba36fbbcf..4e83a3dee5a 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -501,7 +501,7 @@ where ); return; } - if self.marshal.proposed(round, block).await.is_err() { + if !self.marshal.proposed(round, block).await { warn!( ?round, ?digest, diff --git a/consensus/src/marshal/standard/validation.rs b/consensus/src/marshal/standard/validation.rs index f315bf561fb..5f9772e8167 100644 --- a/consensus/src/marshal/standard/validation.rs +++ b/consensus/src/marshal/standard/validation.rs @@ -103,7 +103,7 @@ where return Decision::Complete(false); } - if marshal.verified(context.round, block).await.is_err() { + if !marshal.verified(context.round, block).await { debug!( round = ?context.round, "marshal unavailable during re-proposal verified ack; aborting verify" @@ -208,7 +208,7 @@ where valid = validity_request => valid, }; - if application_valid && marshal.verified(context.round, block).await.is_err() { + if application_valid && !marshal.verified(context.round, block).await { debug!( round = ?context.round, "marshal unavailable during verified ack; aborting verify" From 09d76b5f139f36549c24ce3f1d4408d181d11a86 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Wed, 15 Apr 2026 22:22:56 -0700 Subject: [PATCH 039/107] cleanup --- consensus/src/marshal/standard/deferred.rs | 14 +++++------- consensus/src/marshal/standard/inline.rs | 14 +++++------- consensus/src/marshal/standard/validation.rs | 23 
+++++++++----------- 3 files changed, 20 insertions(+), 31 deletions(-) diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index a3d194770db..457f66a6d50 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -451,7 +451,7 @@ where // Re-proposals return early and skip normal parent/height checks // because they were already verified when originally proposed and // parent-child checks would fail by construction when parent == block. - let block = match precheck_epoch_and_reproposal( + let Some(decision) = precheck_epoch_and_reproposal( &marshaled.epocher, &mut marshal, &context, @@ -459,7 +459,10 @@ where block, ) .await - { + else { + return; + }; + let block = match decision { Decision::Complete(valid) => { if valid { // Valid re-proposal. Create a completed verification task for `certify`. @@ -468,17 +471,10 @@ where task_tx.send_lossy(true); marshaled.verification_tasks.insert(round, digest, task_rx); } - // `Complete` means either immediate rejection or successful - // re-proposal handling with no further ancestry validation. tx.send_lossy(valid); return; } Decision::Continue(block) => block, - Decision::Aborted => { - // Persistence not confirmed (marshal shut down). Exit - // silently rather than signal a verdict to consensus. 
- return; - } }; // Before casting a notarize vote, ensure the block's embedded context matches diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 4e83a3dee5a..2df777daf3e 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -380,7 +380,7 @@ where // - Re-proposals skip normal parent/height checks because: // 1) the block was already verified when originally proposed // 2) parent-child checks would fail by construction when parent == block - let block = match precheck_epoch_and_reproposal( + let Some(decision) = precheck_epoch_and_reproposal( &epocher, &mut marshal, &context, @@ -388,19 +388,15 @@ where block, ) .await - { + else { + return; + }; + let block = match decision { Decision::Complete(valid) => { - // `Complete` means either an immediate reject or a valid - // re-proposal accepted without further ancestry checks. tx.send_lossy(valid); return; } Decision::Continue(block) => block, - Decision::Aborted => { - // Persistence not confirmed (marshal shut down). Exit - // silently rather than signal a verdict to consensus. - return; - } }; // Non-reproposal path: fetch expected parent, validate ancestry, then diff --git a/consensus/src/marshal/standard/validation.rs b/consensus/src/marshal/standard/validation.rs index 5f9772e8167..a7d9023aac7 100644 --- a/consensus/src/marshal/standard/validation.rs +++ b/consensus/src/marshal/standard/validation.rs @@ -53,12 +53,13 @@ where /// /// - `Complete(valid)`: verification can terminate immediately with `valid`. /// - `Continue(block)`: full parent + application verification should continue. -/// - `Aborted`: persistence could not be confirmed (marshal actor gone) and -/// the caller MUST exit silently rather than signal a verdict to consensus. 
+/// +/// The function returns `Option>`: `None` means the marshal actor +/// shut down during persistence and the caller must exit silently (consistent +/// with the `Option` convention used by [`verify_with_parent`]). pub(super) enum Decision { Complete(bool), Continue(B), - Aborted, } /// Performs shared pre-checks used by both inline and deferred verification paths. @@ -75,7 +76,7 @@ pub(super) async fn precheck_epoch_and_reproposal( context: &Context, digest: B::Digest, block: B, -) -> Decision +) -> Option> where ES: Epocher, S: Scheme, @@ -87,7 +88,7 @@ where height = %block.height(), "block height not in expected epoch" ); - return Decision::Complete(false); + return Some(Decision::Complete(false)); } // Re-proposals are signaled by `digest == context.parent.1`. @@ -100,20 +101,16 @@ where height = %block.height(), "re-proposal is not at epoch boundary" ); - return Decision::Complete(false); + return Some(Decision::Complete(false)); } if !marshal.verified(context.round, block).await { - debug!( - round = ?context.round, - "marshal unavailable during re-proposal verified ack; aborting verify" - ); - return Decision::Aborted; + return None; } - return Decision::Complete(true); + return Some(Decision::Complete(true)); } - Decision::Continue(block) + Some(Decision::Continue(block)) } /// Runs the shared non-reproposal verification flow. 
From 1f081f93d44912cac324050436cd975b4a4bf26d Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Wed, 15 Apr 2026 22:34:23 -0700 Subject: [PATCH 040/107] more persist --- consensus/src/marshal/coding/marshaled.rs | 8 ++++++++ consensus/src/marshal/standard/deferred.rs | 9 +++++++++ consensus/src/marshal/standard/inline.rs | 8 ++++++++ 3 files changed, 25 insertions(+) diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index 8bbe8525763..4aa96bbec52 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -627,6 +627,14 @@ where erasure_timer.observe(); let commitment = coded_block.commitment(); + + // Persist before returning the commitment to consensus. Once + // consensus receives it, the proposer will vote, so the block + // must be on disk before that point. + if !marshal.verified(consensus_context.round, coded_block.clone()).await { + return; + } + { let mut lock = last_built.lock(); *lock = Some((consensus_context.round, coded_block)); diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index 457f66a6d50..a4fbfba8ec7 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -393,6 +393,14 @@ where build_timer.observe(); let digest = built_block.digest(); + + // Persist before returning the digest to consensus. Once + // consensus receives the digest it will vote, so the block + // must be on disk before that point. 
+ if !marshal.verified(consensus_context.round, built_block.clone()).await { + return; + } + { let mut lock = last_built.lock(); *lock = Some((consensus_context.round, built_block)); @@ -646,6 +654,7 @@ where ?digest, "marshal unavailable during proposed broadcast; block not persisted" ); + return; } } Plan::Forward { round, peers } => { diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 2df777daf3e..11806b8fb27 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -322,6 +322,14 @@ where build_timer.observe(); let digest = built_block.digest(); + + // Persist before returning the digest to consensus. Once + // consensus receives the digest it will vote, so the block + // must be on disk before that point. + if !marshal.verified(consensus_context.round, built_block.clone()).await { + return; + } + { let mut lock = last_built.lock(); *lock = Some((consensus_context.round, built_block)); From 5a9b2b0ed77fe4d18b8d31050879e65ae7d9533c Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 08:04:37 -0700 Subject: [PATCH 041/107] fix --- .../src/marshal/application/validation.rs | 6 --- consensus/src/marshal/coding/marshaled.rs | 48 ++++++------------- consensus/src/marshal/standard/deferred.rs | 25 +++------- consensus/src/marshal/standard/inline.rs | 27 +++++------ 4 files changed, 32 insertions(+), 74 deletions(-) diff --git a/consensus/src/marshal/application/validation.rs b/consensus/src/marshal/application/validation.rs index 46df606ffa5..70d501757a2 100644 --- a/consensus/src/marshal/application/validation.rs +++ b/consensus/src/marshal/application/validation.rs @@ -4,12 +4,6 @@ //! and certification flows. use crate::types::{Epoch, Epocher, Height, Round}; -use commonware_utils::sync::Mutex; -use std::sync::Arc; - -/// Cache for the last block built during proposal, shared between the -/// proposer task and the broadcast path. 
-pub(crate) type LastBuilt = Arc>>; /// Returns true if the block is at an epoch boundary (last block in its epoch). #[inline] diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index 4aa96bbec52..f51b1141077 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -82,9 +82,7 @@ use crate::{ marshal::{ ancestry::AncestorStream, application::{ - validation::{ - is_inferred_reproposal_at_certify, is_valid_reproposal_at_verify, LastBuilt, - }, + validation::{is_inferred_reproposal_at_certify, is_valid_reproposal_at_verify}, verification_tasks::VerificationTasks, }, coding::{ @@ -125,6 +123,8 @@ use rand::Rng; use std::sync::{Arc, OnceLock}; use tracing::{debug, warn}; +type LastBuilt = Arc)>>>; + /// The [`CodingConfig`] used for genesis blocks. These blocks are never broadcasted in /// the proposal phase, and thus the configuration is irrelevant. const GENESIS_CODING_CONFIG: CodingConfig = CodingConfig { @@ -183,9 +183,9 @@ where scheme_provider: Z, epocher: ES, strategy: S, - last_built: LastBuilt>, verification_tasks: VerificationTasks, cached_genesis: Arc)>>, + last_built: LastBuilt, build_duration: Timed, verify_duration: Timed, @@ -266,9 +266,9 @@ where scheme_provider, strategy, epocher, - last_built: Arc::new(Mutex::new(None)), verification_tasks: VerificationTasks::new(), cached_genesis: Arc::new(OnceLock::new()), + last_built: Arc::new(Mutex::new(None)), build_duration, verify_duration, @@ -507,10 +507,10 @@ where ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); - let last_built = self.last_built.clone(); let epocher = self.epocher.clone(); let strategy = self.strategy.clone(); let cached_genesis = self.cached_genesis.clone(); + let last_built = self.last_built.clone(); // If there's no scheme for the current epoch, we cannot verify the proposal. // Send back a receiver with a dropped sender. 
@@ -578,14 +578,15 @@ where .expect("current epoch should exist"); if parent.height() == last_in_epoch { let commitment = parent.commitment(); + let round = consensus_context.round; { let mut lock = last_built.lock(); - *lock = Some((consensus_context.round, parent)); + *lock = Some((round, parent)); } let success = tx.send_lossy(commitment); debug!( - round = ?consensus_context.round, + ?round, ?commitment, success, "re-proposed parent block at epoch boundary" @@ -627,22 +628,16 @@ where erasure_timer.observe(); let commitment = coded_block.commitment(); - - // Persist before returning the commitment to consensus. Once - // consensus receives it, the proposer will vote, so the block - // must be on disk before that point. - if !marshal.verified(consensus_context.round, coded_block.clone()).await { - return; - } + let round = consensus_context.round; { let mut lock = last_built.lock(); - *lock = Some((consensus_context.round, coded_block)); + *lock = Some((round, coded_block)); } let success = tx.send_lossy(commitment); debug!( - round = ?consensus_context.round, + ?round, ?commitment, success, "proposed new block" @@ -991,29 +986,14 @@ where "skipping requested broadcast of block with mismatched commitment" ); return; - } - debug!( - round = %round, - commitment = %block.commitment(), - height = %block.height(), - "requested broadcast of built block" - ); - // Route through marshal so the proposer's own block is durably - // persisted before shard broadcast. The marshal actor caches - // the verified block and forwards to the shards engine via - // the Buffer impl. Without this, the block would live only in - // the shards in-memory cache and a crash before any - // verify-driven persistence would lose it. - // - // If marshal is unavailable (graceful shutdown), log and - // skip: consensus is also being torn down and the local vote - // for this proposal must not proceed without persistence. 
+ }; if !self.marshal.proposed(round, block).await { warn!( ?round, ?commitment, "marshal unavailable during proposed broadcast; block not persisted" ); + return; } } Plan::Forward { .. } => { diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index a4fbfba8ec7..256f2983382 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -74,7 +74,7 @@ use crate::{ marshal::{ ancestry::AncestorStream, application::{ - validation::{is_inferred_reproposal_at_certify, LastBuilt}, + validation::is_inferred_reproposal_at_certify, verification_tasks::VerificationTasks, }, core::Mailbox, @@ -105,6 +105,8 @@ use rand::Rng; use std::sync::Arc; use tracing::{debug, warn}; +type LastBuilt = Arc>>; + /// An [`Application`] adapter that handles epoch transitions and validates block ancestry. /// /// This wrapper intercepts consensus operations to enforce epoch boundaries and validate @@ -146,8 +148,8 @@ where application: A, marshal: Mailbox>, epocher: ES, - last_built: LastBuilt, verification_tasks: VerificationTasks<::Digest>, + last_built: LastBuilt, build_duration: Timed, } @@ -182,8 +184,8 @@ where application, marshal, epocher, - last_built: Arc::new(Mutex::new(None)), verification_tasks: VerificationTasks::new(), + last_built: Arc::new(Mutex::new(None)), build_duration, } @@ -299,8 +301,8 @@ where ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); - let last_built = self.last_built.clone(); let epocher = self.epocher.clone(); + let last_built = self.last_built.clone(); // Metrics let build_duration = self.build_duration.clone(); @@ -394,13 +396,6 @@ where let digest = built_block.digest(); - // Persist before returning the digest to consensus. Once - // consensus receives the digest it will vote, so the block - // must be on disk before that point. 
- if !marshal.verified(consensus_context.round, built_block.clone()).await { - return; - } - { let mut lock = last_built.lock(); *lock = Some((consensus_context.round, built_block)); @@ -641,13 +636,7 @@ where "skipping requested broadcast of block with mismatched digest" ); return; - } - debug!( - round = %round, - digest = %block.digest(), - height = %block.height(), - "requested broadcast of built block" - ); + }; if !self.marshal.proposed(round, block).await { warn!( ?round, diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 11806b8fb27..77715515d17 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -45,7 +45,6 @@ use crate::{ marshal::{ ancestry::AncestorStream, - application::validation::LastBuilt, core::Mailbox, standard::{ validation::{ @@ -75,6 +74,8 @@ use rand::Rng; use std::{collections::BTreeSet, sync::Arc}; use tracing::{debug, warn}; +type LastBuilt = Arc>>; + /// Tracks `(round, digest)` pairs for which `verify` has already fetched the /// block, so `certify` can return immediately without re-subscribing to marshal. type AvailableBlocks = Arc>>; @@ -141,8 +142,8 @@ where application: A, marshal: Mailbox>, epocher: ES, - last_built: LastBuilt, available_blocks: AvailableBlocks, + last_built: LastBuilt, build_duration: Timed, } @@ -162,8 +163,7 @@ where { /// Creates a new inline-verification wrapper. /// - /// Registers a `build_duration` histogram for proposal latency and initializes - /// the shared "last built block" cache used by [`Relay::broadcast`]. + /// Registers a `build_duration` histogram for proposal latency. 
pub fn new(context: E, application: A, marshal: Mailbox>, epocher: ES) -> Self { let build_histogram = Histogram::new(Buckets::LOCAL); context.register( @@ -178,8 +178,8 @@ where application, marshal, epocher, - last_built: Arc::new(Mutex::new(None)), available_blocks: Arc::new(Mutex::new(BTreeSet::new())), + last_built: Arc::new(Mutex::new(None)), build_duration, } } @@ -224,16 +224,16 @@ where /// Proposes a new block or re-proposes an epoch boundary block. /// /// Proposal runs in a spawned task and returns a receiver for the resulting digest. - /// Built/re-proposed blocks are cached in `last_built` so relay can broadcast - /// exactly what was proposed. + /// Blocks are persisted and broadcast via `marshal.proposed()` before the digest + /// is returned to consensus. async fn propose( &mut self, consensus_context: Context, ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); - let last_built = self.last_built.clone(); let epocher = self.epocher.clone(); + let last_built = self.last_built.clone(); let build_duration = self.build_duration.clone(); let (mut tx, rx) = oneshot::channel(); @@ -323,17 +323,11 @@ where let digest = built_block.digest(); - // Persist before returning the digest to consensus. Once - // consensus receives the digest it will vote, so the block - // must be on disk before that point. 
- if !marshal.verified(consensus_context.round, built_block.clone()).await { - return; - } - { let mut lock = last_built.lock(); *lock = Some((consensus_context.round, built_block)); } + let success = tx.send_lossy(digest); debug!( round = ?consensus_context.round, @@ -504,13 +498,14 @@ where "skipping requested broadcast of block with mismatched digest" ); return; - } + }; if !self.marshal.proposed(round, block).await { warn!( ?round, ?digest, "marshal unavailable during proposed broadcast; block not persisted" ); + return; } } Plan::Forward { round, peers } => { From 83f555296885a5d5fa02cf96ec3a0c8a9186c797 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 10:58:46 -0700 Subject: [PATCH 042/107] restore alias --- consensus/src/marshal/application/validation.rs | 6 ++++++ consensus/src/marshal/coding/marshaled.rs | 8 ++++---- consensus/src/marshal/standard/deferred.rs | 4 +--- consensus/src/marshal/standard/inline.rs | 3 +-- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/consensus/src/marshal/application/validation.rs b/consensus/src/marshal/application/validation.rs index 70d501757a2..46df606ffa5 100644 --- a/consensus/src/marshal/application/validation.rs +++ b/consensus/src/marshal/application/validation.rs @@ -4,6 +4,12 @@ //! and certification flows. use crate::types::{Epoch, Epocher, Height, Round}; +use commonware_utils::sync::Mutex; +use std::sync::Arc; + +/// Cache for the last block built during proposal, shared between the +/// proposer task and the broadcast path. +pub(crate) type LastBuilt = Arc>>; /// Returns true if the block is at an epoch boundary (last block in its epoch). 
#[inline] diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index f51b1141077..d5363e7653d 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -82,7 +82,9 @@ use crate::{ marshal::{ ancestry::AncestorStream, application::{ - validation::{is_inferred_reproposal_at_certify, is_valid_reproposal_at_verify}, + validation::{ + is_inferred_reproposal_at_certify, is_valid_reproposal_at_verify, LastBuilt, + }, verification_tasks::VerificationTasks, }, coding::{ @@ -123,8 +125,6 @@ use rand::Rng; use std::sync::{Arc, OnceLock}; use tracing::{debug, warn}; -type LastBuilt = Arc)>>>; - /// The [`CodingConfig`] used for genesis blocks. These blocks are never broadcasted in /// the proposal phase, and thus the configuration is irrelevant. const GENESIS_CODING_CONFIG: CodingConfig = CodingConfig { @@ -185,7 +185,7 @@ where strategy: S, verification_tasks: VerificationTasks, cached_genesis: Arc)>>, - last_built: LastBuilt, + last_built: LastBuilt>, build_duration: Timed, verify_duration: Timed, diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index 256f2983382..9f995a6d38d 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -74,7 +74,7 @@ use crate::{ marshal::{ ancestry::AncestorStream, application::{ - validation::is_inferred_reproposal_at_certify, + validation::{is_inferred_reproposal_at_certify, LastBuilt}, verification_tasks::VerificationTasks, }, core::Mailbox, @@ -105,8 +105,6 @@ use rand::Rng; use std::sync::Arc; use tracing::{debug, warn}; -type LastBuilt = Arc>>; - /// An [`Application`] adapter that handles epoch transitions and validates block ancestry. 
/// /// This wrapper intercepts consensus operations to enforce epoch boundaries and validate diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 77715515d17..ab330334259 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -45,6 +45,7 @@ use crate::{ marshal::{ ancestry::AncestorStream, + application::validation::LastBuilt, core::Mailbox, standard::{ validation::{ @@ -74,8 +75,6 @@ use rand::Rng; use std::{collections::BTreeSet, sync::Arc}; use tracing::{debug, warn}; -type LastBuilt = Arc>>; - /// Tracks `(round, digest)` pairs for which `verify` has already fetched the /// block, so `certify` can return immediately without re-subscribing to marshal. type AvailableBlocks = Arc>>; From 268787615f3c676832b171c2fb5dbd632d79e46c Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 11:44:03 -0700 Subject: [PATCH 043/107] add assertions --- consensus/src/marshal/coding/mod.rs | 20 ++++---- .../src/marshal/coding/shards/mailbox.rs | 7 ++- consensus/src/marshal/coding/variant.rs | 10 +++- consensus/src/marshal/mocks/harness.rs | 12 ++--- consensus/src/marshal/standard/deferred.rs | 22 ++++---- consensus/src/marshal/standard/inline.rs | 8 +-- consensus/src/marshal/standard/mod.rs | 50 +++++++++---------- 7 files changed, 70 insertions(+), 59 deletions(-) diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index 61820bce51c..b37b1b07743 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -307,7 +307,7 @@ mod tests { let block_a = make_coding_block(context_a.clone(), parent_digest, Height::new(2), 200); let coded_block_a = CodedBlock::new(block_a.clone(), coding_config, &Sequential); let commitment_a = coded_block_a.commitment(); - let _ = shards.clone().proposed(round_a, coded_block_a).await; + assert!(shards.clone().proposed(round_a, coded_block_a).await); // Block B at view 10 
(height 2, different block same height - could happen with // different proposers or re-proposals) @@ -320,7 +320,7 @@ mod tests { let block_b = make_coding_block(context_b.clone(), parent_digest, Height::new(2), 300); let coded_block_b = CodedBlock::new(block_b.clone(), coding_config, &Sequential); let commitment_b = coded_block_b.commitment(); - let _ = shards.clone().proposed(round_b, coded_block_b).await; + assert!(shards.clone().proposed(round_b, coded_block_b).await); context.sleep(Duration::from_millis(10)).await; @@ -419,7 +419,7 @@ mod tests { let block = make_coding_block(ctx.clone(), parent, Height::new(i), i * 100); let coded_block = CodedBlock::new(block.clone(), coding_config, &Sequential); last_commitment = coded_block.commitment(); - let _ = shards.clone().proposed(round, coded_block).await; + assert!(shards.clone().proposed(round, coded_block).await); parent = block.digest(); last_view = View::new(i); } @@ -1271,7 +1271,7 @@ mod tests { let parent = make_coding_block(parent_ctx, genesis.digest(), Height::new(1), 100); let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); - let _ = shards.clone().proposed(parent_round, coded_parent).await; + assert!(shards.clone().proposed(parent_round, coded_parent).await); // Create child at height 2. 
let child_round = Round::new(Epoch::zero(), View::new(2)); @@ -1283,7 +1283,7 @@ mod tests { let child = make_coding_block(child_ctx, parent.digest(), Height::new(2), 200); let coded_child = CodedBlock::new(child, coding_config, &Sequential); let child_commitment = coded_child.commitment(); - let _ = shards.clone().proposed(child_round, coded_child).await; + assert!(shards.clone().proposed(child_round, coded_child).await); context.sleep(Duration::from_millis(10)).await; @@ -1392,7 +1392,7 @@ mod tests { let parent = make_coding_block(parent_context, genesis.digest(), Height::new(1), 100); let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); - let _ = shards.clone().proposed(parent_round, coded_parent).await; + assert!(shards.clone().proposed(parent_round, coded_parent).await); // 3) Publish a valid child so optimistic verify can succeed. let round = Round::new(Epoch::zero(), View::new(2)); @@ -1405,7 +1405,7 @@ mod tests { make_coding_block(verify_context.clone(), parent.digest(), Height::new(2), 200); let coded_block = CodedBlock::new(block, coding_config, &Sequential); let commitment = coded_block.commitment(); - let _ = shards.clone().proposed(round, coded_block).await; + assert!(shards.clone().proposed(round, coded_block).await); context.sleep(Duration::from_millis(10)).await; @@ -1500,7 +1500,7 @@ mod tests { // Validator 1 proposes coded_block_b (same inner block, different coding). // This stores it in v1's shard engine and actor cache. - let _ = v1_mailbox.proposed(round1, coded_block_b.clone()).await; + assert!(v1_mailbox.proposed(round1, coded_block_b.clone()).await); context.sleep(Duration::from_millis(100)).await; // Create finalization referencing commitment_a (the "correct" commitment). 
@@ -1660,7 +1660,7 @@ mod tests { let parent = make_coding_block(parent_ctx, genesis.digest(), Height::new(1), 100); let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); - let _ = shards.clone().proposed(parent_round, coded_parent).await; + assert!(shards.clone().proposed(parent_round, coded_parent).await); let child_round = Round::new(Epoch::zero(), View::new(2)); let child_ctx = CodingCtx { @@ -1672,7 +1672,7 @@ mod tests { let coded_child = CodedBlock::new(child.clone(), coding_config, &Sequential); let child_commitment = coded_child.commitment(); let child_digest = coded_child.digest(); - let _ = shards.clone().proposed(child_round, coded_child).await; + assert!(shards.clone().proposed(child_round, coded_child).await); context.sleep(Duration::from_millis(10)).await; diff --git a/consensus/src/marshal/coding/shards/mailbox.rs b/consensus/src/marshal/coding/shards/mailbox.rs index aff81e826b5..78bdbef54dd 100644 --- a/consensus/src/marshal/coding/shards/mailbox.rs +++ b/consensus/src/marshal/coding/shards/mailbox.rs @@ -116,9 +116,12 @@ where } /// Broadcast a proposed erasure coded block's shards to the participants. - pub async fn proposed(&self, round: Round, block: CodedBlock) { + /// + /// Returns `true` if the message was enqueued, `false` if the shard engine + /// has shut down. + pub async fn proposed(&self, round: Round, block: CodedBlock) -> bool { let msg = Message::Proposed { block, round }; - self.sender.send_lossy(msg).await; + self.sender.send_lossy(msg).await } /// Inform the engine of an externally proposed [`Commitment`]. 
diff --git a/consensus/src/marshal/coding/variant.rs b/consensus/src/marshal/coding/variant.rs index cd939dffe74..f88d2632fd8 100644 --- a/consensus/src/marshal/coding/variant.rs +++ b/consensus/src/marshal/coding/variant.rs @@ -15,6 +15,7 @@ use commonware_cryptography::{Committable, Digestible, Hasher, PublicKey}; use commonware_p2p::Recipients; use commonware_utils::channel::oneshot; use std::sync::Arc; +use tracing::warn; /// The coding variant of Marshal, which uses erasure coding for block dissemination. /// @@ -100,6 +101,13 @@ where } async fn send(&self, round: Round, block: CodedBlock, _recipients: Recipients

) { - self.proposed(round, block).await; + let commitment = block.commitment(); + if !self.proposed(round, block).await { + warn!( + ?round, + ?commitment, + "shards engine unavailable; block persisted but not broadcast" + ); + } } } diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index fb05a65f823..1f55939661b 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -527,7 +527,7 @@ impl TestHarness for StandardHarness { } async fn propose(handle: &mut ValidatorHandle, round: Round, block: &B) { - let _ = handle.mailbox.proposed(round, block.clone()).await; + assert!(handle.mailbox.proposed(round, block.clone()).await); } async fn verify( @@ -536,7 +536,7 @@ impl TestHarness for StandardHarness { block: &B, _all_handles: &mut [ValidatorHandle], ) { - let _ = handle.mailbox.verified(round, block.clone()).await; + assert!(handle.mailbox.verified(round, block.clone()).await); } fn make_finalization(proposal: Proposal, schemes: &[S], quorum: u32) -> Finalization { @@ -682,7 +682,7 @@ impl TestHarness for StandardHarness { } async fn verify_for_prune(handle: &mut ValidatorHandle, round: Round, block: &B) { - let _ = handle.mailbox.verified(round, block.clone()).await; + assert!(handle.mailbox.verified(round, block.clone()).await); } } @@ -1315,7 +1315,7 @@ impl TestHarness for CodingHarness { round: Round, block: &CodedBlock, Sha256>, ) { - let _ = handle.mailbox.proposed(round, block.clone()).await; + assert!(handle.mailbox.proposed(round, block.clone()).await); } async fn verify( @@ -1324,7 +1324,7 @@ impl TestHarness for CodingHarness { block: &CodedBlock, Sha256>, _all_handles: &mut [ValidatorHandle], ) { - let _ = handle.mailbox.verified(round, block.clone()).await; + assert!(handle.mailbox.verified(round, block.clone()).await); } fn make_finalization( @@ -1487,7 +1487,7 @@ impl TestHarness for CodingHarness { round: Round, block: &CodedBlock, Sha256>, ) { - let _ = 
handle.mailbox.verified(round, block.clone()).await; + assert!(handle.mailbox.verified(round, block.clone()).await); } } diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index 9f995a6d38d..8b480b4bf92 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -734,10 +734,10 @@ mod tests { // Create parent block at height 1 let parent = make_raw_block(genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - let _ = marshal + assert!(marshal .clone() .proposed(Round::new(Epoch::new(0), View::new(1)), parent.clone()) - .await; + .await); // Block A at view 5 (height 2) let round_a = Round::new(Epoch::new(0), View::new(5)); @@ -748,7 +748,7 @@ mod tests { }; let block_a = B::new::(context_a.clone(), parent_digest, Height::new(2), 200); let commitment_a = block_a.digest(); - let _ = marshal.clone().proposed(round_a, block_a.clone()).await; + assert!(marshal.clone().proposed(round_a, block_a.clone()).await); // Block B at view 10 (height 2, different block same height) let round_b = Round::new(Epoch::new(0), View::new(10)); @@ -759,7 +759,7 @@ mod tests { }; let block_b = B::new::(context_b.clone(), parent_digest, Height::new(2), 300); let commitment_b = block_b.digest(); - let _ = marshal.clone().proposed(round_b, block_b.clone()).await; + assert!(marshal.clone().proposed(round_b, block_b.clone()).await); context.sleep(Duration::from_millis(10)).await; @@ -866,10 +866,10 @@ mod tests { let parent = B::new::(parent_ctx.clone(), genesis.digest(), Height::new(19), 1000); let parent_digest = parent.digest(); - let _ = marshal + assert!(marshal .clone() .proposed(Round::new(Epoch::zero(), View::new(19)), parent.clone()) - .await; + .await); // Create a block at height 20 (first block in epoch 1, which is NOT supported) let unsupported_round = Round::new(Epoch::new(1), View::new(20)); @@ -885,10 +885,10 @@ mod tests { 2000, ); let block_commitment = 
block.digest(); - let _ = marshal + assert!(marshal .clone() .proposed(unsupported_round, block.clone()) - .await; + .await); context.sleep(Duration::from_millis(10)).await; @@ -955,10 +955,10 @@ mod tests { }; let parent = B::new::(parent_ctx, genesis.digest(), Height::new(1), 100); let parent_commitment = parent.digest(); - let _ = marshal + assert!(marshal .clone() .proposed(Round::new(Epoch::zero(), View::new(1)), parent.clone()) - .await; + .await); // Build a block with context A (embedded in the block). let round_a = Round::new(Epoch::zero(), View::new(2)); @@ -969,7 +969,7 @@ mod tests { }; let block_a = B::new::(context_a, parent.digest(), Height::new(2), 200); let commitment_a = block_a.digest(); - let _ = marshal.clone().proposed(round_a, block_a).await; + assert!(marshal.clone().proposed(round_a, block_a).await); context.sleep(Duration::from_millis(10)).await; diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index ab330334259..5511d17c083 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -627,7 +627,7 @@ mod tests { }; let parent = B::new::(parent_ctx, genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - let _ = marshal.clone().proposed(parent_round, parent).await; + assert!(marshal.clone().proposed(parent_round, parent).await); let round = Round::new(Epoch::zero(), View::new(2)); let verify_context = Ctx { @@ -638,7 +638,7 @@ mod tests { let block = B::new::(verify_context.clone(), parent_digest, Height::new(2), 200); let digest = block.digest(); - let _ = marshal.clone().proposed(round, block).await; + assert!(marshal.clone().proposed(round, block).await); // Complete verify first so the block is already available locally. 
let verify_rx = inline.verify(verify_context, digest).await; @@ -705,7 +705,7 @@ mod tests { }; let parent = B::new::(parent_ctx, genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - let _ = marshal.clone().proposed(parent_round, parent).await; + assert!(marshal.clone().proposed(parent_round, parent).await); let round = Round::new(Epoch::zero(), View::new(2)); let verify_context = Ctx { @@ -716,7 +716,7 @@ mod tests { let block = B::new::(verify_context.clone(), parent_digest, Height::new(2), 200); let digest = block.digest(); - let _ = marshal.clone().proposed(round, block).await; + assert!(marshal.clone().proposed(round, block).await); // Certify should still resolve by waiting on marshal block availability directly. let certify_rx = inline.certify(round, digest).await; diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index cdf4a83e6b0..1f6f7e8e78a 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -412,12 +412,12 @@ mod tests { ) .await .mailbox; - let _ = peer_mailbox + assert!(peer_mailbox .proposed(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) - .await; - let _ = peer_mailbox + .await); + assert!(peer_mailbox .proposed(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) - .await; + .await); StandardHarness::report_finalization(&mut peer_mailbox, finalization_two.clone()).await; context.sleep(Duration::from_millis(200)).await; @@ -505,15 +505,15 @@ mod tests { ) .await .mailbox; - let _ = peer_mailbox + assert!(peer_mailbox .proposed(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) - .await; - let _ = peer_mailbox + .await); + assert!(peer_mailbox .proposed(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) - .await; - let _ = peer_mailbox + .await); + assert!(peer_mailbox .proposed(Round::new(Epoch::zero(), View::new(3)), block_three.clone()) - .await; + .await); StandardHarness::report_finalization(&mut 
peer_mailbox, finalization_two.clone()).await; StandardHarness::report_finalization(&mut peer_mailbox, finalization_three.clone()) .await; @@ -691,12 +691,12 @@ mod tests { .await .mailbox; for (i, block) in blocks.iter().enumerate() { - let _ = peer_mailbox + assert!(peer_mailbox .proposed( Round::new(Epoch::zero(), View::new(block.height().get())), (*block).clone(), ) - .await; + .await); StandardHarness::report_finalization(&mut peer_mailbox, finalizations[i].clone()) .await; } @@ -1120,10 +1120,10 @@ mod tests { 1900, ); let boundary_digest = boundary_block.digest(); - let _ = marshal + assert!(marshal .clone() .proposed(boundary_round, boundary_block.clone()) - .await; + .await); context.sleep(Duration::from_millis(10)).await; @@ -1190,10 +1190,10 @@ mod tests { 1900, ); let boundary_digest = boundary_block.digest(); - let _ = marshal + assert!(marshal .clone() .proposed(boundary_round, boundary_block) - .await; + .await); context.sleep(Duration::from_millis(10)).await; @@ -1227,10 +1227,10 @@ mod tests { 1000, ); let non_boundary_digest = non_boundary_block.digest(); - let _ = marshal + assert!(marshal .clone() .proposed(non_boundary_round, non_boundary_block) - .await; + .await); context.sleep(Duration::from_millis(10)).await; @@ -1330,10 +1330,10 @@ mod tests { 200, ); let malformed_digest = malformed_block.digest(); - let _ = marshal + assert!(marshal .clone() .proposed(malformed_round, malformed_block) - .await; + .await); context.sleep(Duration::from_millis(10)).await; @@ -1371,7 +1371,7 @@ mod tests { let parent = B::new::(parent_context, genesis.digest(), Height::new(1), 300); let parent_digest = parent.digest(); - let _ = marshal.clone().proposed(parent_round, parent).await; + assert!(marshal.clone().proposed(parent_round, parent).await); let mismatch_round = Round::new(Epoch::zero(), View::new(3)); let mismatched_context = Ctx { @@ -1386,10 +1386,10 @@ mod tests { 400, ); let mismatched_digest = mismatched_block.digest(); - let _ = marshal + 
assert!(marshal .clone() .proposed(mismatch_round, mismatched_block) - .await; + .await); context.sleep(Duration::from_millis(10)).await; @@ -1462,7 +1462,7 @@ mod tests { }; let parent = B::new::(parent_context, genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - let _ = marshal.clone().proposed(parent_round, parent).await; + assert!(marshal.clone().proposed(parent_round, parent).await); // 2) Publish a valid child; only application-level verification should fail. let round = Round::new(Epoch::zero(), View::new(2)); @@ -1473,7 +1473,7 @@ mod tests { }; let block = B::new::(verify_context.clone(), parent_digest, Height::new(2), 200); let digest = block.digest(); - let _ = marshal.clone().proposed(round, block).await; + assert!(marshal.clone().proposed(round, block).await); context.sleep(Duration::from_millis(10)).await; From 2bd4ba13bf8165d9274622c45d3a4b533abb9abe Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 11:50:46 -0700 Subject: [PATCH 044/107] excessive comments (to trim) --- consensus/src/simplex/actors/voter/actor.rs | 30 +++++++++++++-------- consensus/src/simplex/actors/voter/state.rs | 20 ++++++++------ 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/consensus/src/simplex/actors/voter/actor.rs b/consensus/src/simplex/actors/voter/actor.rs index 081b8d2de73..d646d52572d 100644 --- a/consensus/src/simplex/actors/voter/actor.rs +++ b/consensus/src/simplex/actors/voter/actor.rs @@ -859,18 +859,26 @@ impl< let view = round.view(); debug!(%view, "attempting certification"); let result = if am_leader { - // We led this view, so the proposal is ours and certification is trivially - // true. Skipping the automaton call avoids a redundant round-trip. + // The elected leader of this view is us, so certification is + // trivially true. Skipping the automaton avoids a redundant + // round-trip. // - // INVARIANT: `am_leader` implies we proposed. 
To propose, we must have - // entered the view (via `enter_view`), which always sets the round's - // leader. So when `state::certify_candidates` reports `am_leader = true`, - // we have provably proposed for this view. The converse case where the - // round's leader is unknown (e.g. notarization arrived via resolver - // before the prior view's certificate set this view's leader) is reported - // as `am_leader = false` and falls through to `automaton.certify`, which - // is correct: we never entered the view, so we never proposed and don't - // hold the block locally. + // `am_leader` can only be `true` when the round's leader has + // been set AND matches the local participant. The leader for + // view V is set by processing V-1's notarization (which calls + // `set_leader(V, ...)`). In normal operation this happens + // before we enter V and propose, so `am_leader = true` means + // we proposed. + // + // During catch-up (resolver delivers notarizations out of + // order), `am_leader` can also be `true` for a view we never + // entered -- if V-1's notarization arrived and set V's leader + // before V's notarization added V as a candidate. A + // notarization for our view can only exist if we (or a clone + // holding our key) proposed, so accepting the network's 2f+1 + // commitment is safe. If V's notarization arrives without + // V-1's, the leader is unknown, `am_leader = false`, and we + // correctly fall through to the automaton. Either::Left(ready(Ok(true))) } else { let receiver = self.automaton.certify(round, proposal.payload).await; diff --git a/consensus/src/simplex/actors/voter/state.rs b/consensus/src/simplex/actors/voter/state.rs index a32509b525b..c9df5343330 100644 --- a/consensus/src/simplex/actors/voter/state.rs +++ b/consensus/src/simplex/actors/voter/state.rs @@ -589,14 +589,18 @@ impl, L: ElectorConfig, D: D /// of each view (used to short-circuit certification for own proposals). 
/// /// The `am_leader` flag is `true` only when the round's leader has been - /// set AND matches the local participant. A round's leader is always set - /// before we can propose into it (proposing requires `enter_view`, which - /// sets the leader), so `am_leader = true` provably implies we proposed - /// the block. If the leader is unknown for this view (e.g. notarization - /// arrived via resolver/replay before the prior view's certificate set - /// this view's leader), `am_leader` is `false`: we never entered the - /// view, so the proposal cannot be ours and the caller must fall back to - /// the automaton to certify. + /// set AND matches the local participant. The leader for view V is set + /// by processing V-1's notarization (`add_notarization(V-1)` calls + /// `set_leader(V, ...)`). In normal operation this precedes entering V + /// and proposing, so `am_leader = true` means we proposed the block. + /// + /// During catch-up, `am_leader` can be `true` for a view we never + /// entered: if V-1's notarization set V's leader before V's notarization + /// added V as a candidate. A notarization where we are the elected + /// leader can only exist if we (or a clone holding our key) proposed, + /// so accepting the network's 2f+1 commitment is correct. If V's + /// notarization arrives without V-1's, the leader is unknown and + /// `am_leader` is `false`, falling through to the automaton. 
pub fn certify_candidates(&mut self) -> Vec<(Proposal, bool)> { let candidates = take(&mut self.certification_candidates); let me = self.scheme.me(); From 17976d4a937e627d9bb842b6ef8194226f1d3c03 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 13:18:52 -0700 Subject: [PATCH 045/107] more cleanup --- consensus/src/lib.rs | 12 +++ consensus/src/simplex/actors/voter/actor.rs | 29 ++--- consensus/src/simplex/actors/voter/mod.rs | 39 ++++--- consensus/src/simplex/actors/voter/round.rs | 106 +++++++++++++++--- consensus/src/simplex/actors/voter/slot.rs | 56 ++++++++-- consensus/src/simplex/actors/voter/state.rs | 113 +++++++++++++++----- 6 files changed, 261 insertions(+), 94 deletions(-) diff --git a/consensus/src/lib.rs b/consensus/src/lib.rs index 2282d12ee86..d52fb624162 100644 --- a/consensus/src/lib.rs +++ b/consensus/src/lib.rs @@ -99,6 +99,14 @@ stability_scope!(BETA, cfg(not(target_arch = "wasm32")) { /// If it is possible to generate a payload, the Digest should be returned over the provided /// channel. If it is not possible to generate a payload, the channel can be dropped. If construction /// takes too long, the consensus engine may drop the provided proposal. + /// + /// For [`CertifiableAutomaton`] implementations, returning a payload from + /// `propose` also commits the local proposer to certifying that same + /// `(round, payload)` if it later becomes notarized. Consensus engines + /// may therefore treat durable local evidence of proposal construction + /// (for example replay of a local vote on a leader-owned round) as + /// sufficient to bypass a later `certify` callback for that exact + /// proposal. 
fn propose( &mut self, context: Self::Context, @@ -144,6 +152,10 @@ stability_scope!(BETA, cfg(not(target_arch = "wasm32")) { /// This is particularly useful for applications that employ erasure coding, which /// can override this method to delay or prevent finalization until they have /// reconstructed and validated the full block (e.g., after receiving enough shards). + /// Payloads produced locally by [`Automaton::propose`] are the exception: + /// the proposer must treat them as certifiable-by-construction for that + /// same round, allowing consensus to skip `certify` once it has durable + /// local evidence that the proposal originated here. /// /// Like [`Automaton::verify`], certification is single-shot for the given /// `(round, payload)`. Once the returned channel resolves or closes, consensus treats diff --git a/consensus/src/simplex/actors/voter/actor.rs b/consensus/src/simplex/actors/voter/actor.rs index d646d52572d..e79a687e0d9 100644 --- a/consensus/src/simplex/actors/voter/actor.rs +++ b/consensus/src/simplex/actors/voter/actor.rs @@ -854,31 +854,16 @@ impl< } // Attempt to certify any views that we have notarizations for. - for (proposal, am_leader) in self.state.certify_candidates() { + for (proposal, is_local) in self.state.certify_candidates() { let round = proposal.round; let view = round.view(); debug!(%view, "attempting certification"); - let result = if am_leader { - // The elected leader of this view is us, so certification is - // trivially true. Skipping the automaton avoids a redundant - // round-trip. - // - // `am_leader` can only be `true` when the round's leader has - // been set AND matches the local participant. The leader for - // view V is set by processing V-1's notarization (which calls - // `set_leader(V, ...)`). In normal operation this happens - // before we enter V and propose, so `am_leader = true` means - // we proposed. 
- // - // During catch-up (resolver delivers notarizations out of - // order), `am_leader` can also be `true` for a view we never - // entered -- if V-1's notarization arrived and set V's leader - // before V's notarization added V as a candidate. A - // notarization for our view can only exist if we (or a clone - // holding our key) proposed, so accepting the network's 2f+1 - // commitment is safe. If V's notarization arrives without - // V-1's, the leader is unknown, `am_leader = false`, and we - // correctly fall through to the automaton. + let result = if is_local { + // Locally proposed payloads are certifiable-by-construction + // for their proposer. We only apply this shortcut when we + // have explicit local evidence for the exact proposal, + // either from this process or from replaying our durable + // local notarize vote. Either::Left(ready(Ok(true))) } else { let receiver = self.automaton.certify(round, proposal.payload).await; diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index 2ef007204de..40ce9d915f7 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -4459,15 +4459,15 @@ mod tests { no_self_verify_after_restart(secp256r1::fixture); } - /// When the voter is the leader of a view and that view's notarization is - /// resolved (e.g. assembled locally from peer votes), it must not ask the - /// automaton to certify its own proposal: the leader already produced and - /// verified the payload locally. + /// When the voter is the leader of a view and later reconstructs a + /// notarization for the proposal it built locally, it must not ask the + /// automaton to certify that same proposal again. /// - /// This is enforced in `actor::run` where we short-circuit `automaton.certify` - /// for leader-owned views. 
The test asserts the end-to-end live invariant: a - /// `Finalize` is emitted for the leader-owned view (proving certification - /// completed) without ever invoking the certify observer for that view. + /// This is enforced in `actor::run` by short-circuiting certification only + /// when the round carries explicit local proposal evidence, not merely + /// because `leader == me`. The test asserts the end-to-end invariant on the + /// live path: a `Finalize` is emitted for the leader-owned view without the + /// certify observer firing for that view. fn no_self_certify_when_proposing(mut fixture: F) where S: Scheme, @@ -4621,8 +4621,8 @@ mod tests { } } - // Assert the live invariant: the certify observer never fired for the - // leader-owned view. + // Assert the live invariant: the certify observer never fired for + // the leader-owned proposal we built ourselves. let certified = certify_calls.lock(); assert!( !certified.contains(&target_view), @@ -4642,13 +4642,12 @@ mod tests { } /// Restart analogue of `no_self_certify_when_proposing`: after the voter has - /// proposed and journaled a local notarize as leader, restarting must not - /// cause it to consult the automaton when certifying its own proposal once - /// the corresponding notarization is resolved post-restart. + /// proposed and journaled a local notarize as leader, restarting must + /// recover that local proposal evidence and continue to bypass automaton + /// certification once the corresponding notarization is resolved. /// - /// Replay of the journaled local notarize restores the leader's proposal as - /// `Verified`; the leader-owned short-circuit in `actor::run` then bypasses - /// the automaton when certification runs. + /// The replayed local notarize is what distinguishes this case from merely + /// observing a leader-owned proposal certificate during catch-up. 
fn no_self_certify_after_restart(mut fixture: F) where S: Scheme, @@ -4861,8 +4860,8 @@ mod tests { .resolved(Certificate::Notarization(notarization)) .await; - // A finalize for the leader-owned view proves the voter certified its - // own proposal post-restart without consulting the automaton. + // A finalize for the leader-owned view proves the voter recovered + // the local certification shortcut after replay. loop { match batcher_receiver.recv().await.unwrap() { batcher::Message::Constructed(Vote::Finalize(finalize)) @@ -4884,8 +4883,8 @@ mod tests { } // Assert the restart invariant: certify did not fire for the - // leader-owned view whose journaled notarize replay should have - // restored the slot's proposal state. + // leader-owned view whose journaled local notarize replay restored + // the local proposal evidence. let certified = certify_calls.lock(); assert!( !certified.contains(&target_view), diff --git a/consensus/src/simplex/actors/voter/round.rs b/consensus/src/simplex/actors/voter/round.rs index c20a1d20871..ec90123cb5d 100644 --- a/consensus/src/simplex/actors/voter/round.rs +++ b/consensus/src/simplex/actors/voter/round.rs @@ -134,7 +134,11 @@ impl Round { } /// Attempt to certify this round's proposal. - pub fn try_certify(&mut self) -> Option> { + /// + /// Returns the proposal along with whether the local participant previously + /// proposed it, in which case certification can be inferred once a + /// notarization exists. + pub fn try_certify(&mut self) -> Option<(Proposal, bool)> { let notarization = self.notarization.as_ref()?; match self.certify { CertifyState::Ready => {} @@ -155,7 +159,7 @@ impl Round { &proposal, &notarization.proposal, "slot proposal must match notarization proposal" ); - Some(proposal) + Some((proposal, self.proposal.is_local())) } /// Sets the handle for the certification request. 
@@ -486,7 +490,7 @@ impl Round { // This check prevents us from voting for a proposal if we have observed equivocation (where // the proposal would be set to ProposalStatus::Equivocated) or if verification hasn't // completed yet. - if self.proposal.status() != ProposalStatus::Verified { + if !matches!(self.proposal.status(), ProposalStatus::Verified(_)) { return None; } @@ -533,10 +537,20 @@ impl Round { "replaying notarize from another signer" ); - // While we may not be the leader here, we still call - // built because the effect is the same (there is a proposal - // and it is verified). - self.proposal.built(notarize.proposal.clone()); + // Replaying our local notarize restores a verified proposal and + // the fact that we already voted. Only leader-owned rounds gain + // the local certification shortcut from this replay; follower + // rounds also journal local notarize votes over other leaders' + // proposals. + if self + .leader + .as_ref() + .is_some_and(|leader| self.is_signer(leader.idx)) + { + self.proposal.built(notarize.proposal.clone()); + } else { + self.proposal.notarized(notarize.proposal.clone()); + } self.broadcast_notarize = true; } Artifact::Nullify(nullify) => { @@ -886,7 +900,9 @@ mod tests { fn replayed_local_notarize_restores_verified_proposal_state() { let mut rng = test_rng(); let namespace = b"ns"; - let Fixture { schemes, .. } = ed25519::fixture(&mut rng, namespace, 4); + let Fixture { + schemes, verifier, .. + } = ed25519::fixture(&mut rng, namespace, 4); let local_scheme = schemes[0].clone(); // Create a proposal where we (participant 0) are the leader. @@ -902,7 +918,10 @@ mod tests { // Proposal should be restored as verified (we are the leader). assert_eq!(round.proposal.proposal(), Some(&proposal)); - assert_eq!(round.proposal.status(), ProposalStatus::Verified); + assert_eq!( + round.proposal.status(), + ProposalStatus::Verified(true) + ); assert!(round.broadcast_notarize); // No verification request should be emitted. 
@@ -910,6 +929,21 @@ mod tests { !round.try_verify(), "leader-owned replay should not request verification again" ); + + let notarization_votes: Vec<_> = schemes + .iter() + .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) + .collect(); + let notarization = + Notarization::from_notarizes(&verifier, notarization_votes.iter(), &Sequential) + .unwrap(); + let (added, equivocator) = round.add_notarization(notarization); + assert!(added); + assert!(equivocator.is_none()); + + let (candidate, is_local) = round.try_certify().expect("certify candidate"); + assert_eq!(candidate, proposal); + assert!(is_local, "local notarize replay should restore local certification"); } #[test] @@ -986,8 +1020,10 @@ mod tests { let (added, _) = round.add_notarization(notarization); assert!(added); - // First try_certify should succeed - assert!(round.try_certify().is_some()); + // First try_certify should succeed, but not via the local shortcut. + let (candidate, is_local) = round.try_certify().expect("certify candidate"); + assert_eq!(candidate, proposal); + assert!(!is_local); // Set a certify handle then mark as certified let mut pool = AbortablePool::<()>::default(); @@ -999,6 +1035,39 @@ mod tests { assert!(round.try_certify().is_none()); } + #[test] + fn try_certify_marks_locally_proposed_candidate() { + let mut rng = test_rng(); + let namespace = b"ns"; + let Fixture { + schemes, verifier, .. 
+ } = ed25519::fixture(&mut rng, namespace, 4); + let local_scheme = schemes[0].clone(); + + let now = SystemTime::UNIX_EPOCH; + let round_info = Rnd::new(Epoch::new(1), View::new(1)); + let proposal = Proposal::new(round_info, View::new(0), Sha256Digest::from([7u8; 32])); + + let mut round = Round::new(local_scheme, round_info, now); + round.set_leader(Participant::new(0)); + assert!(round.proposed(proposal.clone())); + + let notarization_votes: Vec<_> = schemes + .iter() + .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) + .collect(); + let notarization = + Notarization::from_notarizes(&verifier, notarization_votes.iter(), &Sequential) + .unwrap(); + let (added, equivocator) = round.add_notarization(notarization); + assert!(added); + assert!(equivocator.is_none()); + + let (candidate, is_local) = round.try_certify().expect("certify candidate"); + assert_eq!(candidate, proposal); + assert!(is_local, "locally proposed payload should carry local certify permission"); + } + #[test] fn try_certify_blocked_when_handle_exists() { let mut rng = test_rng(); @@ -1028,8 +1097,10 @@ mod tests { let (added, _) = round.add_notarization(notarization); assert!(added); - // First try_certify should succeed - assert!(round.try_certify().is_some()); + // First try_certify should succeed, but not via the local shortcut. + let (candidate, is_local) = round.try_certify().expect("certify candidate"); + assert_eq!(candidate, proposal); + assert!(!is_local); // Set a certify handle (simulating in-flight certification) let mut pool = AbortablePool::<()>::default(); @@ -1112,9 +1183,12 @@ mod tests { let (added, _) = round.add_notarization(notarization); assert!(added); - // Has notarization and proposal came from certificate - // try_certify returns the proposal from the certificate - assert!(round.try_certify().is_some()); + // Has notarization and proposal came from certificate. 
+ // Certification should go through the automaton because the proposal was + // not built locally. + let (candidate, is_local) = round.try_certify().expect("certify candidate"); + assert_eq!(candidate, proposal); + assert!(!is_local); } #[test] diff --git a/consensus/src/simplex/actors/voter/slot.rs b/consensus/src/simplex/actors/voter/slot.rs index f8a7d1db8e3..9fc929f84ce 100644 --- a/consensus/src/simplex/actors/voter/slot.rs +++ b/consensus/src/simplex/actors/voter/slot.rs @@ -8,7 +8,7 @@ pub enum Status { #[default] None, Unverified, - Verified, + Verified(bool), Equivocated, } @@ -65,6 +65,11 @@ where self.proposal.is_some() && self.status != Status::Equivocated } + /// Returns whether the current proposal was built locally and remains usable. + pub fn is_local(&self) -> bool { + matches!(self.status, Status::Verified(true)) + } + pub const fn should_build(&self) -> bool { !self.requested_build && self.proposal.is_none() } @@ -73,11 +78,20 @@ where self.requested_build = true; } - /// Records the proposal in this slot and flips the build/verify flags. + /// Records a proposal that has already been verified. /// - /// If the slot is already populated, we ignore the proposal. - pub fn built(&mut self, proposal: Proposal) { + /// If the slot already contains the same proposal, we refresh it to + /// verified state. Conflicting proposals are ignored. + fn verified(&mut self, proposal: Proposal, local: bool) { + let verified = Status::Verified(local); if let Some(existing) = &self.proposal { + if existing == &proposal && self.status != Status::Equivocated { + self.status = verified; + self.requested_build = true; + self.requested_verify = true; + return; + } + // This can happen if we receive a certificate for a conflicting proposal. Normally, // we would ignore this case but it is required to support [Twins](https://arxiv.org/abs/2004.10617) testing. debug!( @@ -90,11 +104,21 @@ where // Otherwise, we record the proposal and flip the build/verify flags. 
self.proposal = Some(proposal); - self.status = Status::Verified; + self.status = verified; self.requested_build = true; self.requested_verify = true; } + /// Records a proposal built locally by this participant. + pub fn built(&mut self, proposal: Proposal) { + self.verified(proposal, true); + } + + /// Records a proposal we verified and voted for, but did not build locally. + pub fn notarized(&mut self, proposal: Proposal) { + self.verified(proposal, false); + } + pub const fn request_verify(&mut self) -> bool { if self.requested_verify { return false; @@ -107,7 +131,7 @@ where if self.status != Status::Unverified { return false; } - self.status = Status::Verified; + self.status = Status::Verified(false); true } @@ -185,9 +209,10 @@ mod tests { Some(stored) => assert_eq!(stored, &proposal), None => panic!("proposal missing after recording"), } - assert_eq!(slot.status(), Status::Verified); + assert_eq!(slot.status(), Status::Verified(true)); assert!(!slot.should_build()); assert!(!slot.request_verify()); + assert!(slot.is_local()); } #[test] @@ -199,9 +224,10 @@ mod tests { slot.built(proposal.clone()); assert_eq!(slot.proposal(), Some(&proposal)); - assert_eq!(slot.status(), Status::Verified); + assert_eq!(slot.status(), Status::Verified(true)); assert!(!slot.should_build()); assert!(!slot.request_verify()); + assert!(slot.is_local()); } #[test] @@ -214,8 +240,9 @@ mod tests { slot.built(proposal.clone()); assert!(!slot.should_build()); - assert_eq!(slot.status(), Status::Verified); + assert_eq!(slot.status(), Status::Verified(true)); assert_eq!(slot.proposal(), Some(&proposal)); + assert!(slot.is_local()); } #[test] @@ -227,10 +254,12 @@ mod tests { assert!(matches!(slot.update(&proposal, false), Change::New)); assert!(matches!(slot.update(&proposal, true), Change::Unchanged)); assert_eq!(slot.status(), Status::Unverified); + assert!(!slot.is_local()); assert!(slot.mark_verified()); assert!(matches!(slot.update(&proposal, true), Change::Unchanged)); - 
assert_eq!(slot.status(), Status::Verified); + assert_eq!(slot.status(), Status::Verified(false)); + assert!(!slot.is_local()); } #[test] @@ -251,6 +280,7 @@ mod tests { } assert_eq!(slot.status(), Status::Equivocated); assert_eq!(slot.proposal(), Some(&proposal_a)); + assert!(!slot.is_local()); } #[test] @@ -268,6 +298,7 @@ mod tests { assert!(matches!(slot.update(&compromised, true), Change::New)); assert_eq!(slot.status(), Status::Unverified); assert_eq!(slot.proposal(), Some(&compromised)); + assert!(!slot.is_local()); // Once we finally finish proposing our honest payload, the slot should just // ignore it (the equivocation was already detected when the certificate @@ -275,6 +306,7 @@ mod tests { slot.built(honest); assert_eq!(slot.status(), Status::Unverified); assert_eq!(slot.proposal(), Some(&compromised)); + assert!(!slot.is_local()); } #[test] @@ -298,6 +330,7 @@ mod tests { other => panic!("expected equivocation, got {other:?}"), } assert_eq!(slot.status(), Status::Equivocated); + assert!(!slot.is_local()); // Verifier completion arriving afterwards must be ignored. assert!(!slot.mark_verified()); assert!(matches!(slot.update(&conflicting, true), Change::Skipped)); @@ -321,6 +354,7 @@ mod tests { assert_eq!(slot.status(), Status::Equivocated); assert_eq!(slot.proposal(), Some(&proposal_b)); assert!(!slot.should_build()); + assert!(!slot.is_local()); } #[test] @@ -337,6 +371,7 @@ mod tests { )); assert!(matches!(slot.update(&proposal_b, true), Change::Skipped)); assert_eq!(slot.status(), Status::Equivocated); + assert!(!slot.is_local()); } #[test] @@ -353,6 +388,7 @@ mod tests { // gating even before the follower-side verify path runs. assert!(matches!(slot.update(&proposal_a, true), Change::New)); assert!(slot.has_unequivocated_proposal()); + assert!(!slot.is_local()); // A conflicting proposal immediately revokes that property. 
assert!(matches!( diff --git a/consensus/src/simplex/actors/voter/state.rs b/consensus/src/simplex/actors/voter/state.rs index c9df5343330..5af52436653 100644 --- a/consensus/src/simplex/actors/voter/state.rs +++ b/consensus/src/simplex/actors/voter/state.rs @@ -585,25 +585,14 @@ impl, L: ElectorConfig, D: D } /// Takes all certification candidates and returns proposals ready for - /// certification, along with whether the local participant is the leader - /// of each view (used to short-circuit certification for own proposals). + /// certification, along with whether the proposal was built locally. /// - /// The `am_leader` flag is `true` only when the round's leader has been - /// set AND matches the local participant. The leader for view V is set - /// by processing V-1's notarization (`add_notarization(V-1)` calls - /// `set_leader(V, ...)`). In normal operation this precedes entering V - /// and proposing, so `am_leader = true` means we proposed the block. - /// - /// During catch-up, `am_leader` can be `true` for a view we never - /// entered: if V-1's notarization set V's leader before V's notarization - /// added V as a candidate. A notarization where we are the elected - /// leader can only exist if we (or a clone holding our key) proposed, - /// so accepting the network's 2f+1 commitment is correct. If V's - /// notarization arrives without V-1's, the leader is unknown and - /// `am_leader` is `false`, falling through to the automaton. + /// Certification may be inferred only when we have explicit evidence that we + /// proposed this exact payload for the round, either in the current process + /// or via replay of our durable local vote. Leader identity alone is not + /// sufficient during catch-up and recovery. 
pub fn certify_candidates(&mut self) -> Vec<(Proposal, bool)> { let candidates = take(&mut self.certification_candidates); - let me = self.scheme.me(); candidates .into_iter() .filter_map(|view| { @@ -611,11 +600,8 @@ impl, L: ElectorConfig, D: D return None; } let round = self.views.get_mut(&view)?; - let am_leader = round - .leader() - .is_some_and(|leader| me.is_some_and(|me| me == leader.idx)); - let proposal = round.try_certify()?; - Some((proposal, am_leader)) + let candidate = round.try_certify()?; + Some(candidate) }) .collect() } @@ -1787,7 +1773,9 @@ mod tests { let runtime = deterministic::Runner::default(); runtime.start(|mut context| async move { let namespace = b"ns".to_vec(); - let Fixture { schemes, .. } = ed25519::fixture(&mut context, &namespace, 4); + let Fixture { + schemes, verifier, .. + } = ed25519::fixture(&mut context, &namespace, 4); let epoch = Epoch::new(2); let view = View::new(2); @@ -1832,6 +1820,76 @@ mod tests { // No verification request should be emitted (leader-owned). assert!(state.try_verify().is_none()); + + let notarization_votes: Vec<_> = schemes + .iter() + .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) + .collect(); + let notarization = + Notarization::from_notarizes(&verifier, notarization_votes.iter(), &Sequential) + .expect("notarization"); + let (added, _) = state.add_notarization(notarization); + assert!(added); + + let candidates = state.certify_candidates(); + assert_eq!(candidates.len(), 1); + assert_eq!(candidates[0].0.round.view(), view); + assert!(candidates[0].1); + }); + } + + #[test] + fn certify_candidates_do_not_short_circuit_leader_owned_recovered_proposals() { + let runtime = deterministic::Runner::default(); + runtime.start(|mut context| async move { + let namespace = b"ns".to_vec(); + let Fixture { + schemes, verifier, .. 
+ } = ed25519::fixture(&mut context, &namespace, 4); + + let epoch = Epoch::new(2); + let view = View::new(2); + let proposal = Proposal::new( + Rnd::new(epoch, view), + View::new(1), + Sha256Digest::from([43u8; 32]), + ); + + let mut state = State::new( + context, + Config { + scheme: schemes[0].clone(), + elector: ::default(), + epoch, + activity_timeout: ViewDelta::new(5), + leader_timeout: Duration::from_secs(1), + certification_timeout: Duration::from_secs(2), + timeout_retry: Duration::from_secs(3), + }, + ); + state.set_genesis(test_genesis()); + assert!(state.enter_view(view)); + state.set_leader(view, None); + assert_eq!(state.leader_index(view), Some(Participant::new(0))); + + let notarize_votes: Vec<_> = schemes + .iter() + .map(|scheme| Notarize::sign(scheme, proposal.clone()).unwrap()) + .collect(); + let notarization = + Notarization::from_notarizes(&verifier, notarize_votes.iter(), &Sequential) + .expect("notarization"); + let (added, equivocator) = state.add_notarization(notarization); + assert!(added); + assert!(equivocator.is_none()); + + let candidates = state.certify_candidates(); + assert_eq!(candidates.len(), 1); + assert_eq!(candidates[0].0, proposal); + assert!( + !candidates[0].1, + "leader-owned recovered proposal must not inherit local certification" + ); }); } @@ -1964,6 +2022,7 @@ mod tests { // All 6 views should be candidates let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 6); + assert!(candidates.iter().all(|(_, is_local)| !is_local)); // Set certify handles for views 3, 4, 5, 7 (NOT 6 or 8) for i in [3u64, 4, 5, 7] { @@ -2004,6 +2063,7 @@ mod tests { let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); assert_eq!(candidates[0].0.round.view(), View::new(9)); + assert!(!candidates[0].1); // Set handle for view 9, add view 10 let handle9 = pool.push(futures::future::pending()); @@ -2014,6 +2074,7 @@ mod tests { let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 
1); assert_eq!(candidates[0].0.round.view(), View::new(10)); + assert!(!candidates[0].1); // Finalize view 9 - aborts view 9's handle state.add_finalization(make_finalization(View::new(9))); @@ -2024,6 +2085,7 @@ mod tests { let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); assert_eq!(candidates[0].0.round.view(), View::new(11)); + assert!(!candidates[0].1); }); } @@ -2093,13 +2155,10 @@ mod tests { .try_certify() .is_some()); - let expected_am_leader = state - .leader_index(live_view) - .is_some_and(|leader| state.is_me(leader)); let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); assert_eq!(candidates[0].0.round.view(), live_view); - assert_eq!(candidates[0].1, expected_am_leader); + assert!(!candidates[0].1); }); } @@ -2155,6 +2214,7 @@ mod tests { let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); assert_eq!(candidates[0].0.round.view(), view); + assert!(!candidates[0].1); }); } @@ -2199,6 +2259,7 @@ mod tests { let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); assert_eq!(candidates[0].0.round.view(), view); + assert!(!candidates[0].1); let mut pool = AbortablePool::<()>::default(); let handle = pool.push(futures::future::pending()); From af62c7802ecfa070a89ac16c95ea3f62565b3efd Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 13:26:45 -0700 Subject: [PATCH 046/107] more docs --- consensus/src/simplex/actors/voter/round.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/consensus/src/simplex/actors/voter/round.rs b/consensus/src/simplex/actors/voter/round.rs index ec90123cb5d..6c8e8fe3a39 100644 --- a/consensus/src/simplex/actors/voter/round.rs +++ b/consensus/src/simplex/actors/voter/round.rs @@ -542,6 +542,11 @@ impl Round { // the local certification shortcut from this replay; follower // rounds also journal local notarize votes over other leaders' // proposals. 
+ // + // This relies on journal replay remaining append-ordered. By the + // time we replay a local vote for round `v`, the earlier + // certificate for `v - 1` has already replayed and seeded this + // round's leader. if self .leader .as_ref() From ac26165c8e268bd41d71a03405281af92aea10b2 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 14:10:48 -0700 Subject: [PATCH 047/107] fmt --- consensus/src/lib.rs | 10 +- consensus/src/marshal/coding/marshaled.rs | 17 +- consensus/src/marshal/standard/deferred.rs | 50 +++-- consensus/src/marshal/standard/inline.rs | 21 +-- consensus/src/marshal/standard/mod.rs | 104 ++++++----- .../src/ordered_broadcast/mocks/automaton.rs | 3 +- consensus/src/simplex/actors/batcher/mod.rs | 3 +- consensus/src/simplex/actors/voter/actor.rs | 24 ++- consensus/src/simplex/actors/voter/mod.rs | 172 ++++++++++++++++++ consensus/src/simplex/actors/voter/round.rs | 15 +- consensus/src/simplex/mocks/application.rs | 4 +- 11 files changed, 320 insertions(+), 103 deletions(-) diff --git a/consensus/src/lib.rs b/consensus/src/lib.rs index d52fb624162..0aeb6264592 100644 --- a/consensus/src/lib.rs +++ b/consensus/src/lib.rs @@ -206,11 +206,19 @@ stability_scope!(BETA, cfg(not(target_arch = "wasm32")) { type Plan: Send; /// Broadcast a payload to the given recipients. + /// + /// Returns `true` when the relay accepted the payload for the requested + /// broadcast plan. Returns `false` when the relay could not complete the + /// handoff. + /// + /// For proposal broadcasts, returning `false` is fatal for the current + /// consensus attempt: callers must not proceed as though the payload were + /// available to the network. fn broadcast( &mut self, payload: Self::Digest, plan: Self::Plan, - ) -> impl Future + Send; + ) -> impl Future + Send; } /// Reporter is the interface responsible for reporting activity to some external actor. 
diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index d5363e7653d..6a84276c9cc 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -636,12 +636,7 @@ where } let success = tx.send_lossy(commitment); - debug!( - ?round, - ?commitment, - success, - "proposed new block" - ); + debug!(?round, ?commitment, success, "proposed new block"); }); rx } @@ -971,12 +966,12 @@ where type PublicKey = ::PublicKey; type Plan = Plan; - async fn broadcast(&mut self, commitment: Self::Digest, plan: Self::Plan) { + async fn broadcast(&mut self, commitment: Self::Digest, plan: Self::Plan) -> bool { match plan { Plan::Propose => { let Some((round, block)) = self.last_built.lock().take() else { warn!("missing block to broadcast"); - return; + return false; }; if block.commitment() != commitment { warn!( @@ -985,7 +980,7 @@ where height = %block.height(), "skipping requested broadcast of block with mismatched commitment" ); - return; + return false; }; if !self.marshal.proposed(round, block).await { warn!( @@ -993,14 +988,16 @@ where ?commitment, "marshal unavailable during proposed broadcast; block not persisted" ); - return; + return false; } + true } Plan::Forward { .. } => { // Coding variant does not support targeted forwarding; // peers reconstruct blocks from erasure-coded shards. // // TODO(#3389): Support checked data forwarding for PhasedScheme. 
+ true } } } diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index 8b480b4bf92..11305d8bdf1 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -619,12 +619,12 @@ where type PublicKey = S::PublicKey; type Plan = Plan; - async fn broadcast(&mut self, digest: Self::Digest, plan: Plan) { + async fn broadcast(&mut self, digest: Self::Digest, plan: Plan) -> bool { match plan { Plan::Propose => { let Some((round, block)) = self.last_built.lock().take() else { warn!("missing block to broadcast"); - return; + return false; }; if block.digest() != digest { warn!( @@ -633,7 +633,7 @@ where height = %block.height(), "skipping requested broadcast of block with mismatched digest" ); - return; + return false; }; if !self.marshal.proposed(round, block).await { warn!( @@ -641,11 +641,13 @@ where ?digest, "marshal unavailable during proposed broadcast; block not persisted" ); - return; + return false; } + true } Plan::Forward { round, peers } => { self.marshal.forward(round, digest, peers).await; + true } } } @@ -734,10 +736,12 @@ mod tests { // Create parent block at height 1 let parent = make_raw_block(genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - assert!(marshal - .clone() - .proposed(Round::new(Epoch::new(0), View::new(1)), parent.clone()) - .await); + assert!( + marshal + .clone() + .proposed(Round::new(Epoch::new(0), View::new(1)), parent.clone()) + .await + ); // Block A at view 5 (height 2) let round_a = Round::new(Epoch::new(0), View::new(5)); @@ -866,10 +870,12 @@ mod tests { let parent = B::new::(parent_ctx.clone(), genesis.digest(), Height::new(19), 1000); let parent_digest = parent.digest(); - assert!(marshal - .clone() - .proposed(Round::new(Epoch::zero(), View::new(19)), parent.clone()) - .await); + assert!( + marshal + .clone() + .proposed(Round::new(Epoch::zero(), View::new(19)), parent.clone()) + .await + ); // Create a 
block at height 20 (first block in epoch 1, which is NOT supported) let unsupported_round = Round::new(Epoch::new(1), View::new(20)); @@ -885,10 +891,12 @@ mod tests { 2000, ); let block_commitment = block.digest(); - assert!(marshal - .clone() - .proposed(unsupported_round, block.clone()) - .await); + assert!( + marshal + .clone() + .proposed(unsupported_round, block.clone()) + .await + ); context.sleep(Duration::from_millis(10)).await; @@ -955,10 +963,12 @@ mod tests { }; let parent = B::new::(parent_ctx, genesis.digest(), Height::new(1), 100); let parent_commitment = parent.digest(); - assert!(marshal - .clone() - .proposed(Round::new(Epoch::zero(), View::new(1)), parent.clone()) - .await); + assert!( + marshal + .clone() + .proposed(Round::new(Epoch::zero(), View::new(1)), parent.clone()) + .await + ); // Build a block with context A (embedded in the block). let round_a = Round::new(Epoch::zero(), View::new(2)); diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 5511d17c083..01fb1a4d71e 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -381,14 +381,9 @@ where // - Re-proposals skip normal parent/height checks because: // 1) the block was already verified when originally proposed // 2) parent-child checks would fail by construction when parent == block - let Some(decision) = precheck_epoch_and_reproposal( - &epocher, - &mut marshal, - &context, - digest, - block, - ) - .await + let Some(decision) = + precheck_epoch_and_reproposal(&epocher, &mut marshal, &context, digest, block) + .await else { return; }; @@ -482,12 +477,12 @@ where type PublicKey = S::PublicKey; type Plan = Plan; - async fn broadcast(&mut self, digest: Self::Digest, plan: Plan) { + async fn broadcast(&mut self, digest: Self::Digest, plan: Plan) -> bool { match plan { Plan::Propose => { let Some((round, block)) = self.last_built.lock().take() else { warn!("missing block to broadcast"); - 
return; + return false; }; if block.digest() != digest { warn!( @@ -496,7 +491,7 @@ where height = %block.height(), "skipping requested broadcast of block with mismatched digest" ); - return; + return false; }; if !self.marshal.proposed(round, block).await { warn!( @@ -504,11 +499,13 @@ where ?digest, "marshal unavailable during proposed broadcast; block not persisted" ); - return; + return false; } + true } Plan::Forward { round, peers } => { self.marshal.forward(round, digest, peers).await; + true } } } diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index 1f6f7e8e78a..6bbc89d13b0 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -412,12 +412,16 @@ mod tests { ) .await .mailbox; - assert!(peer_mailbox - .proposed(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) - .await); - assert!(peer_mailbox - .proposed(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) - .await); + assert!( + peer_mailbox + .proposed(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) + .await + ); + assert!( + peer_mailbox + .proposed(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) + .await + ); StandardHarness::report_finalization(&mut peer_mailbox, finalization_two.clone()).await; context.sleep(Duration::from_millis(200)).await; @@ -505,15 +509,21 @@ mod tests { ) .await .mailbox; - assert!(peer_mailbox - .proposed(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) - .await); - assert!(peer_mailbox - .proposed(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) - .await); - assert!(peer_mailbox - .proposed(Round::new(Epoch::zero(), View::new(3)), block_three.clone()) - .await); + assert!( + peer_mailbox + .proposed(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) + .await + ); + assert!( + peer_mailbox + .proposed(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) + .await + ); + assert!( + peer_mailbox + 
.proposed(Round::new(Epoch::zero(), View::new(3)), block_three.clone()) + .await + ); StandardHarness::report_finalization(&mut peer_mailbox, finalization_two.clone()).await; StandardHarness::report_finalization(&mut peer_mailbox, finalization_three.clone()) .await; @@ -691,12 +701,14 @@ mod tests { .await .mailbox; for (i, block) in blocks.iter().enumerate() { - assert!(peer_mailbox - .proposed( - Round::new(Epoch::zero(), View::new(block.height().get())), - (*block).clone(), - ) - .await); + assert!( + peer_mailbox + .proposed( + Round::new(Epoch::zero(), View::new(block.height().get())), + (*block).clone(), + ) + .await + ); StandardHarness::report_finalization(&mut peer_mailbox, finalizations[i].clone()) .await; } @@ -1120,10 +1132,12 @@ mod tests { 1900, ); let boundary_digest = boundary_block.digest(); - assert!(marshal - .clone() - .proposed(boundary_round, boundary_block.clone()) - .await); + assert!( + marshal + .clone() + .proposed(boundary_round, boundary_block.clone()) + .await + ); context.sleep(Duration::from_millis(10)).await; @@ -1190,10 +1204,12 @@ mod tests { 1900, ); let boundary_digest = boundary_block.digest(); - assert!(marshal - .clone() - .proposed(boundary_round, boundary_block) - .await); + assert!( + marshal + .clone() + .proposed(boundary_round, boundary_block) + .await + ); context.sleep(Duration::from_millis(10)).await; @@ -1227,10 +1243,12 @@ mod tests { 1000, ); let non_boundary_digest = non_boundary_block.digest(); - assert!(marshal - .clone() - .proposed(non_boundary_round, non_boundary_block) - .await); + assert!( + marshal + .clone() + .proposed(non_boundary_round, non_boundary_block) + .await + ); context.sleep(Duration::from_millis(10)).await; @@ -1330,10 +1348,12 @@ mod tests { 200, ); let malformed_digest = malformed_block.digest(); - assert!(marshal - .clone() - .proposed(malformed_round, malformed_block) - .await); + assert!( + marshal + .clone() + .proposed(malformed_round, malformed_block) + .await + ); 
context.sleep(Duration::from_millis(10)).await; @@ -1386,10 +1406,12 @@ mod tests { 400, ); let mismatched_digest = mismatched_block.digest(); - assert!(marshal - .clone() - .proposed(mismatch_round, mismatched_block) - .await); + assert!( + marshal + .clone() + .proposed(mismatch_round, mismatched_block) + .await + ); context.sleep(Duration::from_millis(10)).await; diff --git a/consensus/src/ordered_broadcast/mocks/automaton.rs b/consensus/src/ordered_broadcast/mocks/automaton.rs index 806846670cc..576971dbb7d 100644 --- a/consensus/src/ordered_broadcast/mocks/automaton.rs +++ b/consensus/src/ordered_broadcast/mocks/automaton.rs @@ -68,7 +68,8 @@ impl R for Automaton

{ type Plan = (); type PublicKey = P; - async fn broadcast(&mut self, payload: Self::Digest, _plan: ()) { + async fn broadcast(&mut self, payload: Self::Digest, _plan: ()) -> bool { trace!(?payload, "broadcast"); + true } } diff --git a/consensus/src/simplex/actors/batcher/mod.rs b/consensus/src/simplex/actors/batcher/mod.rs index 733c8963847..5cf77ded322 100644 --- a/consensus/src/simplex/actors/batcher/mod.rs +++ b/consensus/src/simplex/actors/batcher/mod.rs @@ -97,10 +97,11 @@ mod tests { type PublicKey = PublicKey; type Plan = Plan; - async fn broadcast(&mut self, payload: Sha256Digest, plan: Self::Plan) { + async fn broadcast(&mut self, payload: Sha256Digest, plan: Self::Plan) -> bool { if let Plan::Forward { round, peers } = plan { self.broadcasts.lock().push((payload, round, peers)); } + true } } diff --git a/consensus/src/simplex/actors/voter/actor.rs b/consensus/src/simplex/actors/voter/actor.rs index e79a687e0d9..c0b5b87b1ac 100644 --- a/consensus/src/simplex/actors/voter/actor.rs +++ b/consensus/src/simplex/actors/voter/actor.rs @@ -828,6 +828,7 @@ impl< let mut pending_verify: Option, bool>> = None; let mut certify_pool: AbortablePool<(Rnd, Result)> = Default::default(); + let mut stopped_after_broadcast_failure = false; select_loop! 
{ self.context, on_start => { @@ -886,14 +887,6 @@ impl< }, on_stopped => { debug!("context shutdown, stopping voter"); - - // Sync and drop journal - self.journal - .take() - .unwrap() - .sync_all() - .await - .expect("unable to sync journal"); }, _ = self.context.sleep_until(timeout) => { // Process the timeout @@ -933,7 +926,14 @@ impl< view = self.state.current_view(); // Notify application of proposal - self.relay.broadcast(proposed, Plan::Propose).await; + if !self.relay.broadcast(proposed, Plan::Propose).await { + warn!( + round = ?context.round, + "failed to broadcast proposed payload, stopping voter" + ); + stopped_after_broadcast_failure = true; + break; + } }, (context, verified) = verify_wait => { // Clear verify waiter @@ -1105,5 +1105,11 @@ impl< } }, } + if stopped_after_broadcast_failure { + debug!("stopped voter after failed proposal broadcast"); + } + if let Some(journal) = self.journal.take() { + journal.sync_all().await.expect("unable to sync journal"); + } } } diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index 40ce9d915f7..ef1e4859e45 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -97,6 +97,33 @@ mod tests { const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10); const TEST_QUOTA: Quota = Quota::per_second(NonZeroU32::MAX); + #[derive(Clone, Default)] + struct FailingRelay { + proposes: Arc>>, + } + + impl FailingRelay { + fn new() -> Self { + Self::default() + } + } + + impl crate::Relay for FailingRelay { + type Digest = Sha256Digest; + type PublicKey = PublicKey; + type Plan = Plan; + + async fn broadcast(&mut self, payload: Sha256Digest, plan: Self::Plan) -> bool { + match plan { + Plan::Propose => { + self.proposes.lock().push(payload); + false + } + Plan::Forward { .. 
} => true, + } + } + } + async fn start_test_network_with_peers( context: deterministic::Context, peers: I, @@ -119,6 +146,151 @@ mod tests { oracle } + fn propose_broadcast_failure_stops_before_notarize(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + { + let namespace = b"propose_broadcast_failure_stops_before_notarize".to_vec(); + let partition = "propose_broadcast_failure_stops_before_notarize".to_string(); + let executor = deterministic::Runner::timed(Duration::from_secs(10)); + executor.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = fixture(&mut context, &namespace, 5); + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + + let me = participants[0].clone(); + let elector = RoundRobin::::default(); + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + + let app_relay = Arc::new(mocks::relay::Relay::new()); + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: app_relay, + me: me.clone(), + propose_latency: (0.0, 0.0), + verify_latency: (0.0, 0.0), + certify_latency: (0.0, 0.0), + certifier: mocks::application::Certifier::Always, + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + app_actor.start(); + + let relay = FailingRelay::new(); + let propose_attempts = relay.proposes.clone(); + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay, + reporter, + partition, + epoch: Epoch::new(4), + mailbox_size: 128, + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: 
Duration::from_mins(60), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, _mailbox) = Actor::new(context.with_label("voter"), voter_cfg); + let (resolver_sender, _resolver_receiver) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + let handle = voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Update { + current, + leader, + response, + .. + } => { + assert_eq!(current, View::new(1)); + let _ = leader; + response.send(None).unwrap(); + } + _ => panic!("unexpected initial batcher message"), + } + + select! { + result = handle => { + result.expect("voter should stop cleanly after failed propose broadcast"); + }, + _ = context.sleep(Duration::from_secs(1)) => { + panic!("timed out waiting for voter to stop after failed propose broadcast"); + } + } + + assert_eq!( + propose_attempts.lock().len(), + 1, + "expected exactly one failed propose broadcast attempt" + ); + + while let Some(message) = batcher_receiver.recv().now_or_never().flatten() { + match message { + batcher::Message::Constructed(Vote::Notarize(notarize)) => { + panic!( + "unexpected notarize for view {} after failed propose broadcast", + notarize.view() + ); + } + batcher::Message::Update { response, .. 
} => { + response.send(None).unwrap(); + } + batcher::Message::Constructed(_) => {} + } + } + }); + } + + #[test_traced] + fn test_propose_broadcast_failure_stops_before_notarize() { + propose_broadcast_failure_stops_before_notarize::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + propose_broadcast_failure_stops_before_notarize::<_, _>( + bls12381_threshold_vrf::fixture::, + ); + propose_broadcast_failure_stops_before_notarize::<_, _>( + bls12381_multisig::fixture::, + ); + propose_broadcast_failure_stops_before_notarize::<_, _>( + bls12381_multisig::fixture::, + ); + propose_broadcast_failure_stops_before_notarize::<_, _>(ed25519::fixture); + propose_broadcast_failure_stops_before_notarize::<_, _>(secp256r1::fixture); + } + fn build_notarization>( schemes: &[S], proposal: &Proposal, diff --git a/consensus/src/simplex/actors/voter/round.rs b/consensus/src/simplex/actors/voter/round.rs index 6c8e8fe3a39..e0101583ce8 100644 --- a/consensus/src/simplex/actors/voter/round.rs +++ b/consensus/src/simplex/actors/voter/round.rs @@ -923,10 +923,7 @@ mod tests { // Proposal should be restored as verified (we are the leader). assert_eq!(round.proposal.proposal(), Some(&proposal)); - assert_eq!( - round.proposal.status(), - ProposalStatus::Verified(true) - ); + assert_eq!(round.proposal.status(), ProposalStatus::Verified(true)); assert!(round.broadcast_notarize); // No verification request should be emitted. 
@@ -948,7 +945,10 @@ mod tests { let (candidate, is_local) = round.try_certify().expect("certify candidate"); assert_eq!(candidate, proposal); - assert!(is_local, "local notarize replay should restore local certification"); + assert!( + is_local, + "local notarize replay should restore local certification" + ); } #[test] @@ -1070,7 +1070,10 @@ mod tests { let (candidate, is_local) = round.try_certify().expect("certify candidate"); assert_eq!(candidate, proposal); - assert!(is_local, "locally proposed payload should carry local certify permission"); + assert!( + is_local, + "locally proposed payload should carry local certify permission" + ); } #[test] diff --git a/consensus/src/simplex/mocks/application.rs b/consensus/src/simplex/mocks/application.rs index 659292b4005..29940427172 100644 --- a/consensus/src/simplex/mocks/application.rs +++ b/consensus/src/simplex/mocks/application.rs @@ -116,8 +116,8 @@ impl Re for Mailbox { type PublicKey = P; type Plan = Plan

; - async fn broadcast(&mut self, payload: Self::Digest, _plan: Plan

) { - self.sender.send_lossy(Message::Broadcast { payload }).await; + async fn broadcast(&mut self, payload: Self::Digest, _plan: Plan

) -> bool { + self.sender.send_lossy(Message::Broadcast { payload }).await } } From 5ce301e98bcc86f5dd245803d9f3c2cfac94f5b1 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 14:46:46 -0700 Subject: [PATCH 048/107] nits --- consensus/src/simplex/actors/voter/slot.rs | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/consensus/src/simplex/actors/voter/slot.rs b/consensus/src/simplex/actors/voter/slot.rs index 9fc929f84ce..1127777ec2f 100644 --- a/consensus/src/simplex/actors/voter/slot.rs +++ b/consensus/src/simplex/actors/voter/slot.rs @@ -80,31 +80,23 @@ where /// Records a proposal that has already been verified. /// - /// If the slot already contains the same proposal, we refresh it to - /// verified state. Conflicting proposals are ignored. + /// Additional observations of the same proposal are ignored here. + /// Conflicting proposals are handled separately as equivocation. fn verified(&mut self, proposal: Proposal, local: bool) { - let verified = Status::Verified(local); if let Some(existing) = &self.proposal { - if existing == &proposal && self.status != Status::Equivocated { - self.status = verified; - self.requested_build = true; - self.requested_verify = true; - return; - } - // This can happen if we receive a certificate for a conflicting proposal. Normally, // we would ignore this case but it is required to support [Twins](https://arxiv.org/abs/2004.10617) testing. debug!( ?existing, ?proposal, - "ignoring local proposal because slot already populated" + "ignoring verified proposal because slot already populated" ); return; } // Otherwise, we record the proposal and flip the build/verify flags. 
self.proposal = Some(proposal); - self.status = verified; + self.status = Status::Verified(local); self.requested_build = true; self.requested_verify = true; } From 84ce90839ad5597e63d72f3b3227c71e76b64ceb Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 14:58:13 -0700 Subject: [PATCH 049/107] progress --- examples/bridge/src/application/ingress.rs | 3 ++- examples/log/src/application/ingress.rs | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/bridge/src/application/ingress.rs b/examples/bridge/src/application/ingress.rs index 6881ad3f127..c1abebaf54f 100644 --- a/examples/bridge/src/application/ingress.rs +++ b/examples/bridge/src/application/ingress.rs @@ -96,11 +96,12 @@ impl Re for Mailbox { type PublicKey = PublicKey; type Plan = Plan; - async fn broadcast(&mut self, _: Self::Digest, _: Self::Plan) { + async fn broadcast(&mut self, _: Self::Digest, _: Self::Plan) -> bool { // We don't broadcast our raw messages to other peers. // // If we were building an EVM blockchain, for example, we'd // send the block to other peers here. + true } } diff --git a/examples/log/src/application/ingress.rs b/examples/log/src/application/ingress.rs index 17bb94244ab..064fdd7a095 100644 --- a/examples/log/src/application/ingress.rs +++ b/examples/log/src/application/ingress.rs @@ -85,10 +85,11 @@ impl Re for Mailbox { type PublicKey = PublicKey; type Plan = Plan; - async fn broadcast(&mut self, _: Self::Digest, _: Self::Plan) { + async fn broadcast(&mut self, _: Self::Digest, _: Self::Plan) -> bool { // We don't broadcast our raw messages to other peers. // // If we were building an EVM blockchain, for example, we'd // send the block to other peers here. 
+ true } } From 21cf103ae71c51b1bd116089eaab9c37296e9246 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 15:08:13 -0700 Subject: [PATCH 050/107] nits --- consensus/src/marshal/coding/marshaled.rs | 4 +- consensus/src/marshal/coding/mod.rs | 20 +-- consensus/src/marshal/mocks/verifying.rs | 8 +- consensus/src/marshal/standard/deferred.rs | 9 +- consensus/src/marshal/standard/inline.rs | 180 ++++++++++++++++++-- consensus/src/simplex/actors/voter/mod.rs | 68 ++++---- consensus/src/simplex/actors/voter/round.rs | 2 +- consensus/src/simplex/actors/voter/slot.rs | 2 +- consensus/src/simplex/actors/voter/state.rs | 5 +- consensus/src/simplex/mocks/application.rs | 10 +- consensus/src/simplex/mod.rs | 60 +++---- 11 files changed, 263 insertions(+), 105 deletions(-) diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index 6a84276c9cc..94f52f1deaf 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -183,9 +183,9 @@ where scheme_provider: Z, epocher: ES, strategy: S, + last_built: LastBuilt>, verification_tasks: VerificationTasks, cached_genesis: Arc)>>, - last_built: LastBuilt>, build_duration: Timed, verify_duration: Timed, @@ -629,7 +629,6 @@ where let commitment = coded_block.commitment(); let round = consensus_context.round; - { let mut lock = last_built.lock(); *lock = Some((round, coded_block)); @@ -990,6 +989,7 @@ where ); return false; } + debug!(?round, ?commitment, "requested broadcast of built block"); true } Plan::Forward { .. 
} => { diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index b37b1b07743..8b40a357b33 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -1640,7 +1640,7 @@ mod tests { .await; let marshal = setup.mailbox; let shards = setup.extra; - let actor_handle = setup.actor_handle; + let marshal_actor_handle = setup.actor_handle; let genesis_ctx = CodingCtx { round: Round::zero(), @@ -1703,11 +1703,9 @@ mod tests { .expect("certify result missing"); assert!(certify_result, "certify should succeed"); - // CRITICAL: abort the marshal actor synchronously, with no - // intervening await. If certify returned true but the actor had - // only enqueued (not processed) the `Verified` message, this - // abort kills the actor before persistence completes. - actor_handle.abort(); + // Abort marshal immediately after certify returns to prove the + // block is already persisted at that point. + marshal_actor_handle.abort(); drop(marshaled); drop(marshal); drop(shards); @@ -1762,7 +1760,7 @@ mod tests { .await; let marshal = setup.mailbox; let shards = setup.extra; - let actor_handle = setup.actor_handle; + let marshal_actor_handle = setup.actor_handle; let genesis_ctx = CodingCtx { round: Round::zero(), @@ -1824,11 +1822,9 @@ mod tests { assert_eq!(commitment, expected_commitment); marshaled.broadcast(commitment, Plan::Propose).await; - // CRITICAL: abort the marshal actor synchronously so it cannot - // drain pending messages. `broadcast` must have already awaited - // the actor's persistence ack via `marshal.proposed`, so no extra - // sleep is needed - the block must be on disk by now. - actor_handle.abort(); + // Abort marshal immediately after broadcast returns; the propose + // path must already have persisted the block. 
+ marshal_actor_handle.abort(); drop(marshaled); drop(marshal); drop(shards); diff --git a/consensus/src/marshal/mocks/verifying.rs b/consensus/src/marshal/mocks/verifying.rs index ea01956e693..fb665a81cfc 100644 --- a/consensus/src/marshal/mocks/verifying.rs +++ b/consensus/src/marshal/mocks/verifying.rs @@ -20,10 +20,10 @@ use commonware_runtime::deterministic; pub struct MockVerifyingApp { /// The genesis block to return. pub genesis: B, - /// The result returned by `verify`. - pub verify_result: bool, /// The block returned by `propose`. If `None`, `propose` returns `None`. pub propose_result: Option, + /// The result returned by `verify`. + pub verify_result: bool, _phantom: std::marker::PhantomData, } @@ -32,8 +32,8 @@ impl MockVerifyingApp { pub fn new(genesis: B) -> Self { Self { genesis, - verify_result: true, propose_result: None, + verify_result: true, _phantom: std::marker::PhantomData, } } @@ -42,8 +42,8 @@ impl MockVerifyingApp { pub fn with_verify_result(genesis: B, verify_result: bool) -> Self { Self { genesis, - verify_result, propose_result: None, + verify_result, _phantom: std::marker::PhantomData, } } diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index 11305d8bdf1..1e39bbeb310 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -146,8 +146,8 @@ where application: A, marshal: Mailbox>, epocher: ES, - verification_tasks: VerificationTasks<::Digest>, last_built: LastBuilt, + verification_tasks: VerificationTasks<::Digest>, build_duration: Timed, } @@ -182,8 +182,8 @@ where application, marshal, epocher, - verification_tasks: VerificationTasks::new(), last_built: Arc::new(Mutex::new(None)), + verification_tasks: VerificationTasks::new(), build_duration, } @@ -393,7 +393,6 @@ where build_timer.observe(); let digest = built_block.digest(); - { let mut lock = last_built.lock(); *lock = Some((consensus_context.round, built_block)); @@ -466,7 
+465,8 @@ where let block = match decision { Decision::Complete(valid) => { if valid { - // Valid re-proposal. Create a completed verification task for `certify`. + // A valid re-proposal needs no further ancestry validation, but + // `certify` still expects a completed verification task. let round = context.round; let (task_tx, task_rx) = oneshot::channel(); task_tx.send_lossy(true); @@ -643,6 +643,7 @@ where ); return false; } + debug!(?round, ?digest, "requested broadcast of built block"); true } Plan::Forward { round, peers } => { diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 01fb1a4d71e..6a5a7a4b9d4 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -141,8 +141,8 @@ where application: A, marshal: Mailbox>, epocher: ES, - available_blocks: AvailableBlocks, last_built: LastBuilt, + available_blocks: AvailableBlocks, build_duration: Timed, } @@ -177,8 +177,8 @@ where application, marshal, epocher, - available_blocks: Arc::new(Mutex::new(BTreeSet::new())), last_built: Arc::new(Mutex::new(None)), + available_blocks: Arc::new(Mutex::new(BTreeSet::new())), build_duration, } } @@ -363,7 +363,6 @@ where .with_label("inline_verify") .with_attribute("round", context.round) .spawn(move |runtime_context| async move { - // If block can be fetched, mark it as available. 
let block_request = marshal .subscribe_by_digest(Some(context.round), digest) .await; @@ -372,7 +371,6 @@ where else { return; }; - available_blocks.lock().insert((context.round, digest)); // Shared pre-checks: // - Blocks are invalid if they are not in the expected epoch and are @@ -413,6 +411,9 @@ where Some(valid) => valid, None => return, }; + if application_valid { + available_blocks.lock().insert((context.round, digest)); + } tx.send_lossy(application_valid); }); rx @@ -555,9 +556,12 @@ mod tests { }; use commonware_macros::{select, test_traced}; use commonware_runtime::{deterministic, Clock, Metrics, Runner, Spawner}; - use commonware_utils::NZUsize; + use commonware_utils::{ + channel::{fallible::OneshotExt, oneshot}, + NZUsize, + }; use rand::Rng; - use std::time::Duration; + use std::{sync::Arc, time::Duration}; // Compile-time assertion only: inline standard wrapper must not require `CertifiableBlock`. #[allow(dead_code)] @@ -624,7 +628,7 @@ mod tests { }; let parent = B::new::(parent_ctx, genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - assert!(marshal.clone().proposed(parent_round, parent).await); + assert!(marshal.proposed(parent_round, parent).await); let round = Round::new(Epoch::zero(), View::new(2)); let verify_context = Ctx { @@ -635,7 +639,7 @@ mod tests { let block = B::new::(verify_context.clone(), parent_digest, Height::new(2), 200); let digest = block.digest(); - assert!(marshal.clone().proposed(round, block).await); + assert!(marshal.proposed(round, block).await); // Complete verify first so the block is already available locally. 
let verify_rx = inline.verify(verify_context, digest).await; @@ -702,7 +706,7 @@ mod tests { }; let parent = B::new::(parent_ctx, genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - assert!(marshal.clone().proposed(parent_round, parent).await); + assert!(marshal.proposed(parent_round, parent).await); let round = Round::new(Epoch::zero(), View::new(2)); let verify_context = Ctx { @@ -713,7 +717,7 @@ mod tests { let block = B::new::(verify_context.clone(), parent_digest, Height::new(2), 200); let digest = block.digest(); - assert!(marshal.clone().proposed(round, block).await); + assert!(marshal.proposed(round, block).await); // Certify should still resolve by waiting on marshal block availability directly. let certify_rx = inline.certify(round, digest).await; @@ -851,4 +855,160 @@ mod tests { ); }); } + + #[derive(Clone)] + struct GatedVerifyingApp { + genesis: B, + started: Arc>>>, + release: Arc>>>, + } + + impl GatedVerifyingApp { + fn new(genesis: B) -> (Self, oneshot::Receiver<()>, oneshot::Sender<()>) { + let (started_tx, started_rx) = oneshot::channel(); + let (release_tx, release_rx) = oneshot::channel(); + ( + Self { + genesis, + started: Arc::new(std::sync::Mutex::new(Some(started_tx))), + release: Arc::new(std::sync::Mutex::new(Some(release_rx))), + }, + started_rx, + release_tx, + ) + } + } + + impl crate::Application for GatedVerifyingApp { + type Block = B; + type Context = Ctx; + type SigningScheme = S; + + async fn genesis(&mut self) -> Self::Block { + self.genesis.clone() + } + + async fn propose>( + &mut self, + _context: (deterministic::Context, Self::Context), + _ancestry: crate::marshal::ancestry::AncestorStream, + ) -> Option { + None + } + } + + impl crate::VerifyingApplication for GatedVerifyingApp { + async fn verify>( + &mut self, + _context: (deterministic::Context, Self::Context), + _ancestry: crate::marshal::ancestry::AncestorStream, + ) -> bool { + if let Some(started) = self.started.lock().unwrap().take() { + 
started.send_lossy(()); + } + let release = self + .release + .lock() + .unwrap() + .take() + .expect("release receiver missing"); + let _ = release.await; + true + } + } + + #[test_traced("WARN")] + fn test_inline_certify_does_not_bypass_failed_verify_persistence() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + use commonware_broadcast::Broadcaster; + + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + + let setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let buffer = setup.extra; + let marshal_actor_handle = setup.actor_handle; + + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + let (mock_app, verify_started, release_verify) = + GatedVerifyingApp::new(genesis.clone()); + let mut inline = Inline::new( + context.clone(), + mock_app, + marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + let parent = make_raw_block(genesis.digest(), Height::new(1), 100); + let parent_digest = parent.digest(); + + let child_round = Round::new(Epoch::zero(), View::new(2)); + let child_ctx = Ctx { + round: child_round, + leader: me, + parent: (View::new(1), parent_digest), + }; + let child = B::new::(child_ctx.clone(), parent_digest, Height::new(2), 200); + let child_digest = child.digest(); + + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), parent) + .await + .await + .expect("buffer broadcast for parent should ack"); + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), child) + .await + .await + .expect("buffer broadcast for child should ack"); + + let verify_rx = 
inline.verify(child_ctx, child_digest).await; + verify_started + .await + .expect("verify should reach application before marshal abort"); + marshal_actor_handle.abort(); + release_verify.send_lossy(()); + + select! { + result = verify_rx => { + assert!( + result.is_err(), + "verify must not resolve after marshal.verified loses its persistence ack" + ); + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("verify should terminate after marshal abort"); + }, + } + + let certify_rx = inline.certify(child_round, child_digest).await; + select! { + result = certify_rx => { + assert!( + result.is_err(), + "certify must not bypass failed verify persistence via stale availability" + ); + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("certify should terminate after marshal abort"); + }, + } + }); + } } diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index ef1e4859e45..370b27e7dab 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -181,7 +181,7 @@ mod tests { propose_latency: (0.0, 0.0), verify_latency: (0.0, 0.0), certify_latency: (0.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -388,7 +388,7 @@ mod tests { leader_timeout: Duration, certification_timeout: Duration, timeout_retry: Duration, - certifier: mocks::application::Certifier, + should_certify: mocks::application::Certifier, ) -> ( Mailbox, mpsc::Receiver>, @@ -559,7 +559,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -793,7 +793,7 @@ mod 
tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), app_config); @@ -1442,7 +1442,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (100_000.0, 0.0), // Very slow verification certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -1621,7 +1621,7 @@ mod tests { propose_latency: (50.0, 10.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -1829,7 +1829,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -2059,7 +2059,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -2462,7 +2462,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (10.0, 0.0), // 10ms verification latency certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (mut actor, application) = 
mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -2680,7 +2680,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -2885,7 +2885,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -3059,7 +3059,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -3416,7 +3416,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); @@ -3709,7 +3709,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Custom(Box::new(move |_, d| { + should_certify: mocks::application::Certifier::Custom(Box::new(move |_, d| { tracker.lock().push(d); true })), @@ -3845,7 +3845,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Custom(Box::new(move |_, d| { + should_certify: 
mocks::application::Certifier::Custom(Box::new(move |_, d| { tracker.lock().push(d); true })), @@ -3983,7 +3983,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -4147,7 +4147,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -4240,7 +4240,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new( context.with_label("app_restarted"), @@ -4409,7 +4409,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -4515,7 +4515,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (mut app_actor, application) = mocks::application::Application::new( context.with_label("app_restarted"), @@ -4684,7 +4684,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Custom(Box::new( + should_certify: 
mocks::application::Certifier::Custom(Box::new( move |round, _| { certify_tracker.lock().push(round.view()); true @@ -4862,7 +4862,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -4954,7 +4954,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Custom(Box::new( + should_certify: mocks::application::Certifier::Custom(Box::new( move |round, _| { certify_tracker.lock().push(round.view()); true @@ -5120,7 +5120,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (2_000.0, 0.0), // 2 seconds - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5314,7 +5314,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (2_000.0, 0.0), // 2 seconds - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -6160,7 +6160,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Cancel, + should_certify: mocks::application::Certifier::Cancel, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app_cancel"), app_cfg); @@ -6273,7 +6273,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (2_000.0, 0.0), // 2 seconds - certifier: mocks::application::Certifier::Always, + 
should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); @@ -7519,7 +7519,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -7639,7 +7639,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Custom(Box::new( + should_certify: mocks::application::Certifier::Custom(Box::new( move |round, _| { certify_tracker.lock().push(round.view()); true @@ -7789,7 +7789,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Custom(Box::new(|_, _| false)), + should_certify: mocks::application::Certifier::Custom(Box::new(|_, _| false)), }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -7907,7 +7907,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); @@ -8049,7 +8049,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); @@ -8175,7 +8175,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: 
(1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); @@ -8313,7 +8313,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (app_actor, application) = mocks::application::Application::new(context.with_label("app"), app_cfg); diff --git a/consensus/src/simplex/actors/voter/round.rs b/consensus/src/simplex/actors/voter/round.rs index e0101583ce8..6f9dab5c1a6 100644 --- a/consensus/src/simplex/actors/voter/round.rs +++ b/consensus/src/simplex/actors/voter/round.rs @@ -477,7 +477,7 @@ impl Round { /// Returns a proposal candidate for notarization if we're ready to vote. /// /// Marks that we've broadcast our notarize vote to prevent duplicates. - pub fn construct_notarize(&mut self) -> Option<&Proposal> { + pub const fn construct_notarize(&mut self) -> Option<&Proposal> { // Ensure we haven't already broadcast a notarize vote or nullify vote. if self.broadcast_notarize || self.broadcast_nullify { return None; diff --git a/consensus/src/simplex/actors/voter/slot.rs b/consensus/src/simplex/actors/voter/slot.rs index 1127777ec2f..d7522eef2e9 100644 --- a/consensus/src/simplex/actors/voter/slot.rs +++ b/consensus/src/simplex/actors/voter/slot.rs @@ -66,7 +66,7 @@ where } /// Returns whether the current proposal was built locally and remains usable. 
- pub fn is_local(&self) -> bool { + pub const fn is_local(&self) -> bool { matches!(self.status, Status::Verified(true)) } diff --git a/consensus/src/simplex/actors/voter/state.rs b/consensus/src/simplex/actors/voter/state.rs index 5af52436653..8818aa276c2 100644 --- a/consensus/src/simplex/actors/voter/state.rs +++ b/consensus/src/simplex/actors/voter/state.rs @@ -1885,9 +1885,10 @@ mod tests { let candidates = state.certify_candidates(); assert_eq!(candidates.len(), 1); - assert_eq!(candidates[0].0, proposal); + let (candidate, is_local) = &candidates[0]; + assert_eq!(*candidate, proposal); assert!( - !candidates[0].1, + !*is_local, "leader-owned recovered proposal must not inherit local certification" ); }); diff --git a/consensus/src/simplex/mocks/application.rs b/consensus/src/simplex/mocks/application.rs index 29940427172..b4cc480167c 100644 --- a/consensus/src/simplex/mocks/application.rs +++ b/consensus/src/simplex/mocks/application.rs @@ -166,7 +166,7 @@ pub struct Config { pub verify_latency: Latency, pub certify_latency: Latency, - pub certifier: Certifier, + pub should_certify: Certifier, } pub struct Application { @@ -186,7 +186,7 @@ pub struct Application { fail_verification: bool, drop_proposals: bool, drop_verifications: bool, - certifier: Certifier, + should_certify: Certifier, pending: HashMap, @@ -236,7 +236,7 @@ impl Application fail_verification: false, drop_proposals: false, drop_verifications: false, - certifier: cfg.certifier, + should_certify: cfg.should_certify, pending: HashMap::new(), verified: HashSet::new(), @@ -374,7 +374,7 @@ impl Application .await; // Use configured predicate to determine certification - match &self.certifier { + match &self.should_certify { Certifier::Always => Some(true), Certifier::Custom(func) => Some(func(round, payload)), Certifier::Cancel | Certifier::Pending => None, @@ -450,7 +450,7 @@ impl Application let contents = seen.get(&payload).cloned().unwrap_or_default(); if let Some(certified) = 
self.certify(round, payload, contents).await { response.send_lossy(certified); - } else if matches!(self.certifier, Certifier::Pending) { + } else if matches!(self.should_certify, Certifier::Pending) { // Hold the sender alive so the receiver never resolves. // This simulates a certify that hangs indefinitely (e.g., // block never arrives for reconstruction). diff --git a/consensus/src/simplex/mod.rs b/consensus/src/simplex/mod.rs index 28ce22b5c6d..4a6738e04cd 100644 --- a/consensus/src/simplex/mod.rs +++ b/consensus/src/simplex/mod.rs @@ -770,7 +770,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1019,7 +1019,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Custom(Box::new({ + should_certify: mocks::application::Certifier::Custom(Box::new({ let built_elector_clone = built_elector.clone(); move |round, _| built_elector_clone.elect(round, None) != dishonest })), @@ -1180,7 +1180,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1336,7 +1336,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1517,7 +1517,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: 
(10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1638,7 +1638,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1772,7 +1772,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -1993,7 +1993,7 @@ mod tests { propose_latency: (10_000.0, 0.0), verify_latency: (10_000.0, 5.0), certify_latency: (10_000.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, } } else { mocks::application::Config { @@ -2003,7 +2003,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, } }; let (actor, application) = mocks::application::Application::new( @@ -2169,7 +2169,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2370,7 +2370,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: 
mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2567,7 +2567,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2819,7 +2819,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -2995,7 +2995,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3163,7 +3163,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3355,7 +3355,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3516,7 +3516,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - 
certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3607,7 +3607,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3831,7 +3831,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -3984,7 +3984,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4154,7 +4154,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4290,7 +4290,7 @@ mod tests { propose_latency: (100.0, 50.0), verify_latency: (50.0, 40.0), certify_latency: (50.0, 40.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4453,7 +4453,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: 
(1.0, 0.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -4672,7 +4672,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5025,7 +5025,7 @@ mod tests { propose_latency: (250.0, 50.0), // ensure we process certificates first verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label(&format!("application_{}", *validator)), @@ -5230,7 +5230,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5370,7 +5370,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5466,7 +5466,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -5959,7 +5959,7 @@ mod tests { 
propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), @@ -6028,7 +6028,7 @@ mod tests { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: mocks::application::Certifier::Always, + should_certify: mocks::application::Certifier::Always, }; let (actor, application) = mocks::application::Application::new( context.with_label("application"), From fb3588217b89047c264675fb32eb3f1a43f4cd78 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 15:14:57 -0700 Subject: [PATCH 051/107] minimize --- consensus/src/marshal/coding/mod.rs | 20 +++++++++---------- .../src/marshal/coding/shards/mailbox.rs | 7 ++----- consensus/src/marshal/coding/variant.rs | 10 +--------- 3 files changed, 13 insertions(+), 24 deletions(-) diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index 8b40a357b33..dd663aa96d5 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -307,7 +307,7 @@ mod tests { let block_a = make_coding_block(context_a.clone(), parent_digest, Height::new(2), 200); let coded_block_a = CodedBlock::new(block_a.clone(), coding_config, &Sequential); let commitment_a = coded_block_a.commitment(); - assert!(shards.clone().proposed(round_a, coded_block_a).await); + shards.clone().proposed(round_a, coded_block_a).await; // Block B at view 10 (height 2, different block same height - could happen with // different proposers or re-proposals) @@ -320,7 +320,7 @@ mod tests { let block_b = make_coding_block(context_b.clone(), parent_digest, Height::new(2), 300); let coded_block_b = CodedBlock::new(block_b.clone(), coding_config, &Sequential); let commitment_b = coded_block_b.commitment(); - 
assert!(shards.clone().proposed(round_b, coded_block_b).await); + shards.clone().proposed(round_b, coded_block_b).await; context.sleep(Duration::from_millis(10)).await; @@ -419,7 +419,7 @@ mod tests { let block = make_coding_block(ctx.clone(), parent, Height::new(i), i * 100); let coded_block = CodedBlock::new(block.clone(), coding_config, &Sequential); last_commitment = coded_block.commitment(); - assert!(shards.clone().proposed(round, coded_block).await); + shards.clone().proposed(round, coded_block).await; parent = block.digest(); last_view = View::new(i); } @@ -1271,7 +1271,7 @@ mod tests { let parent = make_coding_block(parent_ctx, genesis.digest(), Height::new(1), 100); let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); - assert!(shards.clone().proposed(parent_round, coded_parent).await); + shards.clone().proposed(parent_round, coded_parent).await; // Create child at height 2. let child_round = Round::new(Epoch::zero(), View::new(2)); @@ -1283,7 +1283,7 @@ mod tests { let child = make_coding_block(child_ctx, parent.digest(), Height::new(2), 200); let coded_child = CodedBlock::new(child, coding_config, &Sequential); let child_commitment = coded_child.commitment(); - assert!(shards.clone().proposed(child_round, coded_child).await); + shards.clone().proposed(child_round, coded_child).await; context.sleep(Duration::from_millis(10)).await; @@ -1392,7 +1392,7 @@ mod tests { let parent = make_coding_block(parent_context, genesis.digest(), Height::new(1), 100); let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); - assert!(shards.clone().proposed(parent_round, coded_parent).await); + shards.clone().proposed(parent_round, coded_parent).await; // 3) Publish a valid child so optimistic verify can succeed. 
let round = Round::new(Epoch::zero(), View::new(2)); @@ -1405,7 +1405,7 @@ mod tests { make_coding_block(verify_context.clone(), parent.digest(), Height::new(2), 200); let coded_block = CodedBlock::new(block, coding_config, &Sequential); let commitment = coded_block.commitment(); - assert!(shards.clone().proposed(round, coded_block).await); + shards.clone().proposed(round, coded_block).await; context.sleep(Duration::from_millis(10)).await; @@ -1500,7 +1500,7 @@ mod tests { // Validator 1 proposes coded_block_b (same inner block, different coding). // This stores it in v1's shard engine and actor cache. - assert!(v1_mailbox.proposed(round1, coded_block_b.clone()).await); + v1_mailbox.proposed(round1, coded_block_b.clone()).await; context.sleep(Duration::from_millis(100)).await; // Create finalization referencing commitment_a (the "correct" commitment). @@ -1660,7 +1660,7 @@ mod tests { let parent = make_coding_block(parent_ctx, genesis.digest(), Height::new(1), 100); let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); - assert!(shards.clone().proposed(parent_round, coded_parent).await); + shards.clone().proposed(parent_round, coded_parent).await; let child_round = Round::new(Epoch::zero(), View::new(2)); let child_ctx = CodingCtx { @@ -1672,7 +1672,7 @@ mod tests { let coded_child = CodedBlock::new(child.clone(), coding_config, &Sequential); let child_commitment = coded_child.commitment(); let child_digest = coded_child.digest(); - assert!(shards.clone().proposed(child_round, coded_child).await); + shards.clone().proposed(child_round, coded_child).await; context.sleep(Duration::from_millis(10)).await; diff --git a/consensus/src/marshal/coding/shards/mailbox.rs b/consensus/src/marshal/coding/shards/mailbox.rs index 78bdbef54dd..aff81e826b5 100644 --- a/consensus/src/marshal/coding/shards/mailbox.rs +++ b/consensus/src/marshal/coding/shards/mailbox.rs @@ -116,12 +116,9 @@ where } /// 
Broadcast a proposed erasure coded block's shards to the participants. - /// - /// Returns `true` if the message was enqueued, `false` if the shard engine - /// has shut down. - pub async fn proposed(&self, round: Round, block: CodedBlock) -> bool { + pub async fn proposed(&self, round: Round, block: CodedBlock) { let msg = Message::Proposed { block, round }; - self.sender.send_lossy(msg).await + self.sender.send_lossy(msg).await; } /// Inform the engine of an externally proposed [`Commitment`]. diff --git a/consensus/src/marshal/coding/variant.rs b/consensus/src/marshal/coding/variant.rs index f88d2632fd8..cd939dffe74 100644 --- a/consensus/src/marshal/coding/variant.rs +++ b/consensus/src/marshal/coding/variant.rs @@ -15,7 +15,6 @@ use commonware_cryptography::{Committable, Digestible, Hasher, PublicKey}; use commonware_p2p::Recipients; use commonware_utils::channel::oneshot; use std::sync::Arc; -use tracing::warn; /// The coding variant of Marshal, which uses erasure coding for block dissemination. /// @@ -101,13 +100,6 @@ where } async fn send(&self, round: Round, block: CodedBlock, _recipients: Recipients

) { - let commitment = block.commitment(); - if !self.proposed(round, block).await { - warn!( - ?round, - ?commitment, - "shards engine unavailable; block persisted but not broadcast" - ); - } + self.proposed(round, block).await; } } From c837ce694d50d1ea8d2f91bf83a4ddd3634887c3 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 15:22:23 -0700 Subject: [PATCH 052/107] nit --- consensus/src/marshal/standard/inline.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 6a5a7a4b9d4..e946fdb36aa 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -398,6 +398,7 @@ where // // The helper returns `None` when work should stop early (for example, // receiver closed or parent unavailable). + let round = context.round; let application_valid = match verify_with_parent( runtime_context, context, @@ -412,7 +413,7 @@ where None => return, }; if application_valid { - available_blocks.lock().insert((context.round, digest)); + available_blocks.lock().insert((round, digest)); } tx.send_lossy(application_valid); }); From 460c09b4091ee011bd15565cef03798ee283a92e Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 15:26:34 -0700 Subject: [PATCH 053/107] field name --- consensus/fuzz/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/consensus/fuzz/src/lib.rs b/consensus/fuzz/src/lib.rs index 450b888f877..88f5ec4bd33 100644 --- a/consensus/fuzz/src/lib.rs +++ b/consensus/fuzz/src/lib.rs @@ -375,7 +375,7 @@ where propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: application::Certifier::Always, + should_certify: application::Certifier::Always, }; let (actor, application) = application::Application::new(context.with_label("application"), app_cfg); @@ -609,7 +609,7 @@ fn run_with_twin_mutator(input: FuzzInput) { 
propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: application::Certifier::Always, + should_certify: application::Certifier::Always, }; let (actor, application) = application::Application::new(primary_context.with_label("application"), app_cfg); From 42dda4282efdf1b0cb94617d66edc74f4dabaa3a Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 15:33:37 -0700 Subject: [PATCH 054/107] nits --- consensus/src/simplex/actors/voter/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index 370b27e7dab..5347e9f7be4 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -417,7 +417,7 @@ mod tests { propose_latency: (1.0, 0.0), verify_latency: (1.0, 0.0), certify_latency: (1.0, 0.0), - certifier, + should_certify, }; let (actor, application) = mocks::application::Application::new(context.with_label("app"), application_cfg); From f8804e8d69ce21216da9040c1678d91e1514f5dd Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 15:38:51 -0700 Subject: [PATCH 055/107] nit --- consensus/src/marshal/coding/mod.rs | 2 +- consensus/src/marshal/standard/inline.rs | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index dd663aa96d5..002b3fcdf86 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -1500,7 +1500,7 @@ mod tests { // Validator 1 proposes coded_block_b (same inner block, different coding). // This stores it in v1's shard engine and actor cache. - v1_mailbox.proposed(round1, coded_block_b.clone()).await; + assert!(v1_mailbox.proposed(round1, coded_block_b.clone()).await); context.sleep(Duration::from_millis(100)).await; // Create finalization referencing commitment_a (the "correct" commitment). 
diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index e946fdb36aa..581f842c51b 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -559,6 +559,7 @@ mod tests { use commonware_runtime::{deterministic, Clock, Metrics, Runner, Spawner}; use commonware_utils::{ channel::{fallible::OneshotExt, oneshot}, + sync::Mutex, NZUsize, }; use rand::Rng; @@ -860,8 +861,8 @@ mod tests { #[derive(Clone)] struct GatedVerifyingApp { genesis: B, - started: Arc>>>, - release: Arc>>>, + started: Arc>>>, + release: Arc>>>, } impl GatedVerifyingApp { @@ -871,8 +872,8 @@ mod tests { ( Self { genesis, - started: Arc::new(std::sync::Mutex::new(Some(started_tx))), - release: Arc::new(std::sync::Mutex::new(Some(release_rx))), + started: Arc::new(Mutex::new(Some(started_tx))), + release: Arc::new(Mutex::new(Some(release_rx))), }, started_rx, release_tx, @@ -904,13 +905,12 @@ mod tests { _context: (deterministic::Context, Self::Context), _ancestry: crate::marshal::ancestry::AncestorStream, ) -> bool { - if let Some(started) = self.started.lock().unwrap().take() { + if let Some(started) = self.started.lock().take() { started.send_lossy(()); } let release = self .release .lock() - .unwrap() .take() .expect("release receiver missing"); let _ = release.await; From e578913ee390d99b69e9f413c1e1b13e54661d1f Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 15:56:03 -0700 Subject: [PATCH 056/107] nit --- consensus/src/simplex/mocks/application.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/consensus/src/simplex/mocks/application.rs b/consensus/src/simplex/mocks/application.rs index b4cc480167c..c56de38098d 100644 --- a/consensus/src/simplex/mocks/application.rs +++ b/consensus/src/simplex/mocks/application.rs @@ -134,8 +134,7 @@ type ProposeObserver = Box::Digest, P>) + Sen type VerifyObserver = Box::Digest, P>, ::Digest) + Send + 'static>; -/// Predicate to 
determine whether a payload should be certified. -/// Returning true means certify, false means reject. +/// How the mock application should respond to certify requests. pub enum Certifier { /// Always certify. Always, From d0434f74ee307837830bf2fe71387552952f6504 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 16:20:30 -0700 Subject: [PATCH 057/107] cleanup --- consensus/fuzz/src/lib.rs | 4 +- consensus/src/marshal/mocks/verifying.rs | 82 +++++++++++++++ consensus/src/marshal/standard/deferred.rs | 116 ++++++++------------- consensus/src/marshal/standard/inline.rs | 12 +-- consensus/src/simplex/mocks/application.rs | 3 +- 5 files changed, 135 insertions(+), 82 deletions(-) diff --git a/consensus/fuzz/src/lib.rs b/consensus/fuzz/src/lib.rs index 88f5ec4bd33..450b888f877 100644 --- a/consensus/fuzz/src/lib.rs +++ b/consensus/fuzz/src/lib.rs @@ -375,7 +375,7 @@ where propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: application::Certifier::Always, + certifier: application::Certifier::Always, }; let (actor, application) = application::Application::new(context.with_label("application"), app_cfg); @@ -609,7 +609,7 @@ fn run_with_twin_mutator(input: FuzzInput) { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - should_certify: application::Certifier::Always, + certifier: application::Certifier::Always, }; let (actor, application) = application::Application::new(primary_context.with_label("application"), app_cfg); diff --git a/consensus/src/marshal/mocks/verifying.rs b/consensus/src/marshal/mocks/verifying.rs index fb665a81cfc..2a8b44902ae 100644 --- a/consensus/src/marshal/mocks/verifying.rs +++ b/consensus/src/marshal/mocks/verifying.rs @@ -9,6 +9,11 @@ use crate::{ CertifiableBlock, Epochable, }; use commonware_runtime::deterministic; +use commonware_utils::{ + channel::{fallible::OneshotExt, oneshot}, + sync::Mutex, +}; +use 
std::{marker::PhantomData, sync::Arc}; /// A mock application that implements `VerifyingApplication` for testing. /// @@ -92,3 +97,80 @@ where self.verify_result } } + +/// A verifying mock application whose `verify()` signals `started` on entry and +/// blocks until `release` is received. Used to deterministically control when +/// the application verdict races with marshal shutdown. +#[derive(Clone)] +pub struct GatedVerifyingApp { + genesis: B, + started: Arc>>>, + release: Arc>>>, + _phantom: PhantomData, +} + +impl GatedVerifyingApp { + /// Returns the gated app, a `started` receiver fired when `verify()` is entered, + /// and a `release` sender that unblocks `verify()` once signaled. + pub fn new(genesis: B) -> (Self, oneshot::Receiver<()>, oneshot::Sender<()>) { + let (started_tx, started_rx) = oneshot::channel(); + let (release_tx, release_rx) = oneshot::channel(); + ( + Self { + genesis, + started: Arc::new(Mutex::new(Some(started_tx))), + release: Arc::new(Mutex::new(Some(release_rx))), + _phantom: PhantomData, + }, + started_rx, + release_tx, + ) + } +} + +impl crate::Application for GatedVerifyingApp +where + B: CertifiableBlock + Clone + Send + Sync + 'static, + B::Context: Epochable + Clone + Send + Sync + 'static, + S: commonware_cryptography::certificate::Scheme + Clone + Send + Sync + 'static, +{ + type Block = B; + type Context = B::Context; + type SigningScheme = S; + + async fn genesis(&mut self) -> Self::Block { + self.genesis.clone() + } + + async fn propose>( + &mut self, + _context: (deterministic::Context, Self::Context), + _ancestry: AncestorStream, + ) -> Option { + None + } +} + +impl crate::VerifyingApplication for GatedVerifyingApp +where + B: CertifiableBlock + Clone + Send + Sync + 'static, + B::Context: Epochable + Clone + Send + Sync + 'static, + S: commonware_cryptography::certificate::Scheme + Clone + Send + Sync + 'static, +{ + async fn verify>( + &mut self, + _context: (deterministic::Context, Self::Context), + _ancestry: 
AncestorStream, + ) -> bool { + if let Some(started) = self.started.lock().take() { + started.send_lossy(()); + } + let release = self + .release + .lock() + .take() + .expect("release receiver missing"); + let _ = release.await; + true + } +} diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index 1e39bbeb310..2a38f2f4871 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -684,12 +684,13 @@ mod tests { default_leader, make_raw_block, setup_network_with_participants, Ctx, StandardHarness, TestHarness, B, BLOCKS_PER_EPOCH, NAMESPACE, NUM_VALIDATORS, S, V, }, - verifying::MockVerifyingApp, + verifying::{GatedVerifyingApp, MockVerifyingApp}, }, simplex::scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::{Epoch, Epocher, FixedEpocher, Height, Round, View}, Automaton, CertifiableAutomaton, }; + use commonware_broadcast::Broadcaster; use commonware_cryptography::{ certificate::{mocks::Fixture, ConstantProvider}, sha256::Sha256, @@ -697,7 +698,7 @@ mod tests { }; use commonware_macros::{select, test_traced}; use commonware_runtime::{deterministic, Clock, Metrics, Runner}; - use commonware_utils::NZUsize; + use commonware_utils::{channel::fallible::OneshotExt, NZUsize}; use std::time::Duration; #[test_traced("INFO")] @@ -1007,27 +1008,19 @@ mod tests { }) } - /// Regression: a validator must not vote finalize on a block that is not - /// durably persisted. `certify` resolves true ⟹ block is on disk. + /// Regression: `certify` resolving true drives the finalize vote, so it must imply + /// the block is durably persisted. In deferred mode `verify()` spawns the + /// `deferred_verify` background task and `certify()` returns that same receiver; the + /// persistence ack happens inside `verify_with_parent` after `app.verify` returns. 
/// - /// To exercise the race we have to seed the parent and child via the - /// buffered broadcast layer (in-memory only) instead of `marshal.proposed`, - /// which already persists. Otherwise `marshal.verified` is just a no-op - /// re-write and the test cannot catch the pre-fix race. + /// The gated app holds `app.verify()` open until the test releases it, so we can + /// abort the marshal actor deterministically after the optimistic path has run but + /// before the persistence-ack path runs. With the ack in place `verified()` returns + /// false once the actor is gone, `verify_with_parent` returns `None`, and the tx is + /// dropped unresolved; we assert the certify receiver errors. #[test_traced("WARN")] - fn test_certify_persists_block_before_resolving() { - for seed in 0u64..16 { - certify_persists_block_before_resolving_at(seed); - } - } - - fn certify_persists_block_before_resolving_at(seed: u64) { - use commonware_broadcast::Broadcaster; - let runner = deterministic::Runner::new( - deterministic::Config::new() - .with_seed(seed) - .with_timeout(Some(Duration::from_secs(60))), - ); + fn test_deferred_certify_does_not_bypass_failed_verify_persistence() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); runner.start(|mut context| async move { let Fixture { participants, @@ -1049,11 +1042,11 @@ mod tests { .await; let marshal = setup.mailbox; let buffer = setup.extra; - let actor_handle = setup.actor_handle; + let marshal_actor_handle = setup.actor_handle; let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); - let mock_app: MockVerifyingApp = MockVerifyingApp::new(genesis.clone()); - + let (mock_app, verify_started, release_verify): (GatedVerifyingApp, _, _) = + GatedVerifyingApp::new(genesis.clone()); let mut marshaled = Deferred::new( context.clone(), mock_app, @@ -1061,77 +1054,54 @@ mod tests { FixedEpocher::new(BLOCKS_PER_EPOCH), ); - // Build parent (height 1) and child (height 2). 
Seed both into - // the buffered broadcast cache (in-memory only), bypassing - // `marshal.proposed` which would already persist them. + // Seed parent and child via the buffer (in-memory only) so + // `deferred_verify` can fetch them without going through the + // persisted marshal path. let parent = make_raw_block(genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); let child_round = Round::new(Epoch::zero(), View::new(2)); let child_ctx = Ctx { round: child_round, - leader: me.clone(), + leader: me, parent: (View::new(1), parent_digest), }; let child = B::new::(child_ctx.clone(), parent_digest, Height::new(2), 200); let child_digest = child.digest(); - // Broadcast to no peers - this only inserts into the local - // buffer cache (mirrors the pre-fix in-memory-only state). buffer - .broadcast(commonware_p2p::Recipients::Some(vec![]), parent.clone()) + .broadcast(commonware_p2p::Recipients::Some(vec![]), parent) .await .await .expect("buffer broadcast for parent should ack"); buffer - .broadcast(commonware_p2p::Recipients::Some(vec![]), child.clone()) + .broadcast(commonware_p2p::Recipients::Some(vec![]), child) .await .await .expect("buffer broadcast for child should ack"); - // Optimistic verify: returns true after parent/child fetch from - // the buffer + ancestry validation + app verify. - let optimistic = marshaled - .verify(child_ctx, child_digest) - .await - .await - .expect("verify result missing"); - assert!(optimistic, "optimistic verify should pass"); - - // Certify - this is the safety gate before finalize voting. - let certify_result = marshaled - .certify(child_round, child_digest) - .await + // Kick off the optimistic verify, which spawns `deferred_verify`. + // Its gated `app.verify` blocks until we release it, giving us a + // deterministic window to abort the marshal actor. 
+ let _optimistic_rx = marshaled.verify(child_ctx, child_digest).await; + let certify_rx = marshaled.certify(child_round, child_digest).await; + verify_started .await - .expect("certify result missing"); - assert!(certify_result, "certify should succeed"); - - // CRITICAL: abort the marshal actor synchronously, with no - // intervening await. If certify returned true but the actor had - // only enqueued (not processed) the `Verified` message, this - // abort kills the actor before persistence completes. - actor_handle.abort(); - drop(marshaled); - drop(marshal); - drop(buffer); - - // Restart from the same partition. The block must be durably - // persisted - otherwise the validator would have voted finalize - // for a block it cannot serve from local storage. - let setup2 = StandardHarness::setup_validator( - context.with_label("validator_0_restart"), - &mut oracle, - me.clone(), - ConstantProvider::new(schemes[0].clone()), - ) - .await; - let marshal2 = setup2.mailbox; + .expect("verify should reach application before marshal abort"); + marshal_actor_handle.abort(); + release_verify.send_lossy(()); - let post_restart = marshal2.get_block(&child_digest).await; - assert!( - post_restart.is_some(), - "certify resolved true ⟹ block must be durably persisted (seed={seed})" - ); + select! 
{ + result = certify_rx => { + assert!( + result.is_err(), + "certify must not resolve after marshal.verified loses its persistence ack" + ); + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("certify should terminate after marshal abort"); + }, + } }); } } diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 581f842c51b..e946fdb36aa 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -559,7 +559,6 @@ mod tests { use commonware_runtime::{deterministic, Clock, Metrics, Runner, Spawner}; use commonware_utils::{ channel::{fallible::OneshotExt, oneshot}, - sync::Mutex, NZUsize, }; use rand::Rng; @@ -861,8 +860,8 @@ mod tests { #[derive(Clone)] struct GatedVerifyingApp { genesis: B, - started: Arc>>>, - release: Arc>>>, + started: Arc>>>, + release: Arc>>>, } impl GatedVerifyingApp { @@ -872,8 +871,8 @@ mod tests { ( Self { genesis, - started: Arc::new(Mutex::new(Some(started_tx))), - release: Arc::new(Mutex::new(Some(release_rx))), + started: Arc::new(std::sync::Mutex::new(Some(started_tx))), + release: Arc::new(std::sync::Mutex::new(Some(release_rx))), }, started_rx, release_tx, @@ -905,12 +904,13 @@ mod tests { _context: (deterministic::Context, Self::Context), _ancestry: crate::marshal::ancestry::AncestorStream, ) -> bool { - if let Some(started) = self.started.lock().take() { + if let Some(started) = self.started.lock().unwrap().take() { started.send_lossy(()); } let release = self .release .lock() + .unwrap() .take() .expect("release receiver missing"); let _ = release.await; diff --git a/consensus/src/simplex/mocks/application.rs b/consensus/src/simplex/mocks/application.rs index c56de38098d..b4cc480167c 100644 --- a/consensus/src/simplex/mocks/application.rs +++ b/consensus/src/simplex/mocks/application.rs @@ -134,7 +134,8 @@ type ProposeObserver = Box::Digest, P>) + Sen type VerifyObserver = Box::Digest, P>, ::Digest) + Send + 'static>; -/// How 
the mock application should respond to certify requests. +/// Predicate to determine whether a payload should be certified. +/// Returning true means certify, false means reject. pub enum Certifier { /// Always certify. Always, From 8991e3bc7d1f9fb21cf82ffa239a672b9c38858e Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 22:05:23 -0700 Subject: [PATCH 058/107] fix issue --- consensus/src/marshal/core/actor.rs | 20 +- consensus/src/marshal/mod.rs | 4 + consensus/src/marshal/standard/mod.rs | 287 +++++++++++++++++++++++++- 3 files changed, 299 insertions(+), 12 deletions(-) diff --git a/consensus/src/marshal/core/actor.rs b/consensus/src/marshal/core/actor.rs index 4b29fc80acf..7c3fabaf14c 100644 --- a/consensus/src/marshal/core/actor.rs +++ b/consensus/src/marshal/core/actor.rs @@ -602,6 +602,7 @@ where self.try_repair_gaps(&mut buffer, &mut resolver, &mut application) .await; self.sync_finalized().await; + self.try_dispatch_blocks(&mut application).await; debug!(?round, %height, "finalized block stored"); } } else { @@ -786,6 +787,7 @@ where // durability). if needs_sync { self.sync_finalized().await; + self.try_dispatch_blocks(&mut application).await; } // Handle produce requests in parallel. @@ -1234,6 +1236,10 @@ where /// arrive and [`Self::handle_block_processed`] calls /// [`Self::update_processed_height`]. /// + /// Callers must only invoke this after [`Self::sync_finalized`] has made any + /// preceding finalized-archive writes durable. In other words, anything fed + /// to the application from this method is already durably persisted in marshal. + /// /// Acks are processed in FIFO order so `last_processed_height` always /// advances sequentially. 
/// @@ -1247,8 +1253,8 @@ where /// ```text /// Iteration N (caller): /// store_finalization -> Archive::put (buffered) - /// try_dispatch_blocks -> sends blocks to app, enqueues pending acks /// sync_finalized -> archive durable + /// try_dispatch_blocks -> sends blocks to app, enqueues pending acks /// /// Iteration M (ack handler, M > N): /// handle_block_processed -> update_processed_height -> metadata buffered @@ -1352,8 +1358,10 @@ where /// /// Must be called within the same `select_loop!` arm as any preceding /// [`Self::store_finalization`] / [`Self::try_repair_gaps`] writes, before yielding back - /// to the loop. This ensures archives are durable before the ack handler - /// advances `last_processed_height`. See [`Self::try_dispatch_blocks`] for details. + /// to the loop. This is the durability barrier for application delivery: + /// [`Self::try_dispatch_blocks`] must run only after this sync completes. + /// It also ensures archives are durable before the ack handler advances + /// `last_processed_height`. See [`Self::try_dispatch_blocks`] for details. async fn sync_finalized(&mut self) { if let Err(e) = try_join!( async { @@ -1403,7 +1411,10 @@ where /// Add a finalized block, and optionally a finalization, to the archive. /// - /// After persisting the block, attempt to dispatch the next contiguous block to the application. + /// After persisting the block, the caller must sync finalized archives + /// before dispatching the next contiguous block to the application. The + /// buffered archive writes from this method are not a sufficient durability + /// guarantee for downstream application state transitions on their own. /// /// Writes are buffered and not synced. 
The caller must call /// [sync_finalized](Self::sync_finalized) before yielding to the @@ -1466,7 +1477,6 @@ where let _ = self.finalized_height.try_set(height.get()); } buffer.finalized(commitment).await; - self.try_dispatch_blocks(application).await; true } diff --git a/consensus/src/marshal/mod.rs b/consensus/src/marshal/mod.rs index 3a2401fb377..3e4c8ac025f 100644 --- a/consensus/src/marshal/mod.rs +++ b/consensus/src/marshal/mod.rs @@ -133,6 +133,10 @@ pub enum Update { Tip(Round, Height, B::Digest), /// A new finalized block and an [Acknowledgement] for the application to signal once processed. /// + /// Marshal only emits this after it has durably persisted the delivered block locally. + /// For blocks flowing through the normal finalization path, the corresponding + /// height-indexed finalization metadata is also durably synced before delivery. + /// /// To ensure all blocks are delivered at least once, marshal waits to mark a block as delivered /// until the application explicitly acknowledges the update. If the [Acknowledgement] is dropped before /// handling, marshal will exit (assuming the application is shutting down). 
diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index 6bbc89d13b0..8230ed6eac1 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -57,14 +57,14 @@ mod tests { verifying::MockVerifyingApp, }, resolver::handler, - Identifier, + Identifier, Update, }, simplex::{ scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::{Finalization, Proposal}, }, types::{Epoch, Epocher, FixedEpocher, Height, Round, View, ViewDelta}, - Automaton, CertifiableAutomaton, Heightable, + Automaton, CertifiableAutomaton, Heightable, Reporter, }; use bytes::Bytes; use commonware_broadcast::buffered; @@ -74,8 +74,11 @@ mod tests { sha256::Sha256, Digestible, Hasher as _, }; - use commonware_macros::{test_group, test_traced}; - use commonware_p2p::simulated::{self, Network}; + use commonware_macros::{select, test_group, test_traced}; + use commonware_p2p::{ + simulated::{self, Network}, + Recipients, + }; use commonware_parallel::Sequential; use commonware_resolver::Resolver; use commonware_runtime::{ @@ -87,12 +90,13 @@ mod tests { translator::{EightCap, TwoCap}, }; use commonware_utils::{ - channel::{mpsc, oneshot}, + channel::{fallible::OneshotExt, mpsc, oneshot}, vec::NonEmptyVec, NZUsize, NZU16, NZU64, }; use std::{ num::{NonZeroU32, NonZeroU64, NonZeroUsize}, + sync::{Arc, Mutex}, time::Duration, }; @@ -1530,7 +1534,17 @@ mod tests { /// A no-op resolver used by tests that drive the marshal actor's /// resolver_rx channel directly. Outbound fetches/cancellations are dropped. 
#[derive(Clone, Default)] - struct NoopResolver; + struct NoopResolver { + _keepalive: Option>>, + } + + impl NoopResolver { + fn holding(sender: mpsc::Sender>) -> Self { + Self { + _keepalive: Some(sender), + } + } + } impl Resolver for NoopResolver { type Key = handler::Request; @@ -1554,6 +1568,178 @@ mod tests { async fn retain(&mut self, _predicate: impl Fn(&Self::Key) -> bool + Send + 'static) {} } + /// A no-op buffer used by tests that do not need marshal's dissemination path. + #[derive(Clone, Default)] + struct NoopBuffer; + + impl crate::marshal::core::Buffer> for NoopBuffer { + type PublicKey = PublicKey; + type CachedBlock = B; + + async fn find_by_digest(&self, _digest: D) -> Option { + None + } + + async fn find_by_commitment(&self, _commitment: D) -> Option { + None + } + + async fn subscribe_by_digest(&self, _digest: D) -> oneshot::Receiver { + let (_sender, receiver) = oneshot::channel(); + receiver + } + + async fn subscribe_by_commitment( + &self, + _commitment: D, + ) -> oneshot::Receiver { + let (_sender, receiver) = oneshot::channel(); + receiver + } + + async fn finalized(&self, _commitment: D) {} + + async fn send(&self, _round: Round, _block: B, _recipients: Recipients) {} + } + + /// A reporter that blocks inside `Update::Block` so tests can abort marshal + /// exactly when application delivery starts. 
+ #[derive(Clone)] + struct GatedBlockReporter { + started: Arc>>>, + release: Arc>>>, + } + + impl GatedBlockReporter { + fn new() -> (Self, oneshot::Receiver, oneshot::Sender<()>) { + let (started_tx, started_rx) = oneshot::channel(); + let (release_tx, release_rx) = oneshot::channel(); + ( + Self { + started: Arc::new(Mutex::new(Some(started_tx))), + release: Arc::new(Mutex::new(Some(release_rx))), + }, + started_rx, + release_tx, + ) + } + } + + impl Reporter for GatedBlockReporter { + type Activity = Update; + + async fn report(&mut self, activity: Self::Activity) { + match activity { + Update::Block(block, _ack) => { + if let Some(started) = self.started.lock().unwrap().take() { + started.send_lossy(block.height()); + } + let release = self.release.lock().unwrap().take(); + if let Some(release) = release { + let _ = release.await; + } + } + Update::Tip(_, _, _) => {} + } + } + } + + async fn start_standard_actor>>( + context: deterministic::Context, + partition_prefix: &str, + provider: ConstantProvider, + application: R, + ) -> (Mailbox>, commonware_runtime::Handle<()>) { + let config = Config { + provider, + epocher: FixedEpocher::new(BLOCKS_PER_EPOCH), + mailbox_size: 100, + view_retention_timeout: ViewDelta::new(10), + max_repair: NZUsize!(10), + max_pending_acks: NZUsize!(1), + block_codec_config: (), + partition_prefix: partition_prefix.to_string(), + prunable_items_per_section: NZU64!(10), + replay_buffer: NZUsize!(1024), + key_write_buffer: NZUsize!(1024), + value_write_buffer: NZUsize!(1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + strategy: Sequential, + }; + let finalizations_by_height = immutable::Archive::init( + context.with_label("finalizations_by_height"), + immutable::Config { + metadata_partition: format!("{partition_prefix}-finalizations-by-height-metadata"), + freezer_table_partition: format!( + "{partition_prefix}-finalizations-by-height-freezer-table" + ), + freezer_table_initial_size: 64, + 
freezer_table_resize_frequency: 10, + freezer_table_resize_chunk_size: 10, + freezer_key_partition: format!( + "{partition_prefix}-finalizations-by-height-freezer-key" + ), + freezer_key_page_cache: config.page_cache.clone(), + freezer_value_partition: format!( + "{partition_prefix}-finalizations-by-height-freezer-value" + ), + freezer_value_target_size: 1024, + freezer_value_compression: None, + ordinal_partition: format!("{partition_prefix}-finalizations-by-height-ordinal"), + items_per_section: NZU64!(10), + codec_config: S::certificate_codec_config_unbounded(), + replay_buffer: config.replay_buffer, + freezer_key_write_buffer: config.key_write_buffer, + freezer_value_write_buffer: config.value_write_buffer, + ordinal_write_buffer: config.key_write_buffer, + }, + ) + .await + .expect("failed to initialize finalizations by height archive"); + let finalized_blocks = immutable::Archive::init( + context.with_label("finalized_blocks"), + immutable::Config { + metadata_partition: format!("{partition_prefix}-finalized_blocks-metadata"), + freezer_table_partition: format!( + "{partition_prefix}-finalized_blocks-freezer-table" + ), + freezer_table_initial_size: 64, + freezer_table_resize_frequency: 10, + freezer_table_resize_chunk_size: 10, + freezer_key_partition: format!("{partition_prefix}-finalized_blocks-freezer-key"), + freezer_key_page_cache: config.page_cache.clone(), + freezer_value_partition: format!( + "{partition_prefix}-finalized_blocks-freezer-value" + ), + freezer_value_target_size: 1024, + freezer_value_compression: None, + ordinal_partition: format!("{partition_prefix}-finalized_blocks-ordinal"), + items_per_section: NZU64!(10), + codec_config: config.block_codec_config, + replay_buffer: config.replay_buffer, + freezer_key_write_buffer: config.key_write_buffer, + freezer_value_write_buffer: config.value_write_buffer, + ordinal_write_buffer: config.key_write_buffer, + }, + ) + .await + .expect("failed to initialize finalized blocks archive"); + let 
(actor, mailbox, _) = Actor::init( + context.clone(), + finalizations_by_height, + finalized_blocks, + config, + ) + .await; + let (resolver_tx, resolver_rx) = mpsc::channel(100); + let actor_handle = actor.start( + application, + NoopBuffer, + (resolver_rx, NoopResolver::holding(resolver_tx)), + ); + (mailbox, actor_handle) + } + /// When the provider has no verifier for an epoch, in-flight deliveries /// for that epoch must be acknowledged (`true`) so the serving peer is /// not blamed, rather than rejected (`false`). @@ -1656,7 +1842,7 @@ mod tests { actor.start( Application::::default(), buffer, - (resolver_rx, NoopResolver), + (resolver_rx, NoopResolver::default()), ); // Inject a Finalized delivery with garbage payload. The @@ -1691,6 +1877,93 @@ mod tests { }); } + /// Regression: application delivery of a finalized block must only happen + /// after the finalized archives are durably synced. Otherwise a crash in + /// the delivery callback can expose a block to another subsystem that then + /// persists derived state ahead of marshal's height-indexed finalization. + #[test_traced("WARN")] + fn test_standard_dispatches_finalized_blocks_after_sync() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + let partition_prefix = format!("validator-{me}"); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = make_raw_block(Sha256::hash(b""), Height::new(1), 100); + let finalization = StandardHarness::make_finalization( + Proposal::new(round, View::zero(), block.digest()), + &schemes, + QUORUM, + ); + + let (application, started, release) = GatedBlockReporter::new(); + let (mut mailbox, actor_handle) = start_standard_actor( + context.with_label("validator_0"), + &partition_prefix, + ConstantProvider::new(schemes[0].clone()), + application, + ) + .await; + + assert!( + mailbox.verified(round, block.clone()).await, + "verified block should persist to the cache" + ); + StandardHarness::report_finalization(&mut mailbox, finalization.clone()).await; + + select! { + height = started => { + assert_eq!( + height.expect("delivery signal missing"), + Height::new(1), + "application should observe the first finalized block" + ); + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("application should observe block delivery promptly"); + }, + } + + actor_handle.abort(); + let _ = release.send_lossy(()); + drop(mailbox); + + // Yield once so the aborted actor drops its storage handles before restart. 
+ context.sleep(Duration::from_millis(1)).await; + + let (mailbox, _actor_handle) = start_standard_actor( + context.with_label("validator_0_restart"), + &partition_prefix, + ConstantProvider::new(schemes[0].clone()), + Application::::manual_ack(), + ) + .await; + + let recovered = mailbox + .get_block(Height::new(1)) + .await + .expect("finalized block must be durable before delivery"); + assert_eq!( + recovered.digest(), + block.digest(), + "restart should recover the delivered finalized block by height" + ); + assert_eq!( + mailbox + .get_finalization(Height::new(1)) + .await + .expect("finalization must be durable before delivery") + .round(), + round, + "restart should recover the delivered finalization by height" + ); + }); + } + /// Parse the `processed_height` gauge value from a prometheus-encoded /// metrics dump produced by `Metrics::encode`. Looks for any line of the /// form `processed_height `. From 6bbe825cd4f0b93272be15b7c3fa8b17ac7cbb6c Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 22:09:55 -0700 Subject: [PATCH 059/107] nit --- consensus/src/marshal/coding/mod.rs | 5 +- consensus/src/marshal/standard/inline.rs | 83 ++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index 002b3fcdf86..71896e63847 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -1820,7 +1820,10 @@ mod tests { .await .expect("propose should produce a commitment"); assert_eq!(commitment, expected_commitment); - marshaled.broadcast(commitment, Plan::Propose).await; + assert!( + marshaled.broadcast(commitment, Plan::Propose).await, + "broadcast should persist the proposed block before returning" + ); // Abort marshal immediately after broadcast returns; the propose // path must already have persisted the block. 
diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index e946fdb36aa..def98f2aabc 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -387,6 +387,9 @@ where }; let block = match decision { Decision::Complete(valid) => { + if valid { + available_blocks.lock().insert((context.round, digest)); + } tx.send_lossy(valid); return; } @@ -737,6 +740,86 @@ mod tests { }); } + #[test_traced("INFO")] + fn test_certify_reproposal_uses_available_blocks_after_verify() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let marshal_actor_handle = setup.actor_handle; + + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + let mock_app: MockVerifyingApp = MockVerifyingApp::new(genesis.clone()); + let mut inline = Inline::new( + context.clone(), + mock_app, + marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + let boundary_height = Height::new(BLOCKS_PER_EPOCH.get() - 1); + let boundary_round = Round::new(Epoch::zero(), View::new(boundary_height.get())); + let boundary_block = B::new::( + Ctx { + round: boundary_round, + leader: default_leader(), + parent: (View::zero(), genesis.digest()), + }, + genesis.digest(), + boundary_height, + 1900, + ); + let boundary_digest = boundary_block.digest(); + assert!(marshal.proposed(boundary_round, boundary_block).await); + + let reproposal_round = Round::new(Epoch::zero(), 
View::new(boundary_height.get() + 1)); + let reproposal_context = Ctx { + round: reproposal_round, + leader: me, + parent: (View::new(boundary_height.get()), boundary_digest), + }; + + let verify_rx = inline.verify(reproposal_context, boundary_digest).await; + assert!( + verify_rx.await.unwrap(), + "verify should accept a valid boundary re-proposal" + ); + + marshal_actor_handle.abort(); + drop(marshal); + context.sleep(Duration::from_millis(1)).await; + + let certify_rx = inline.certify(reproposal_round, boundary_digest).await; + select! { + result = certify_rx => { + assert!( + result.unwrap(), + "certify should use the available_blocks fast path for verified re-proposals" + ); + }, + _ = context.sleep(Duration::from_secs(5)) => { + panic!("certify should not depend on marshal after verify cached a re-proposal"); + }, + } + }); + } + /// Regression: in inline mode, `verify` itself returns true after running /// app verification. That return value drives the notarize vote, so it /// must imply "block is durably persisted" -- otherwise a crash between From cbc1aed5dc47a1b254583502d4db10e38e5040a7 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 22:18:26 -0700 Subject: [PATCH 060/107] nits --- consensus/fuzz/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/consensus/fuzz/src/lib.rs b/consensus/fuzz/src/lib.rs index 450b888f877..88f5ec4bd33 100644 --- a/consensus/fuzz/src/lib.rs +++ b/consensus/fuzz/src/lib.rs @@ -375,7 +375,7 @@ where propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: application::Certifier::Always, + should_certify: application::Certifier::Always, }; let (actor, application) = application::Application::new(context.with_label("application"), app_cfg); @@ -609,7 +609,7 @@ fn run_with_twin_mutator(input: FuzzInput) { propose_latency: (10.0, 5.0), verify_latency: (10.0, 5.0), certify_latency: (10.0, 5.0), - certifier: application::Certifier::Always, 
+ should_certify: application::Certifier::Always, }; let (actor, application) = application::Application::new(primary_context.with_label("application"), app_cfg); From 2a0711ccb2ecce558b086c4551bbf06aba5ee568 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 22:43:33 -0700 Subject: [PATCH 061/107] add restart tests --- consensus/src/marshal/standard/inline.rs | 33 +++++++++++++++---- consensus/src/marshal/standard/mod.rs | 7 ++-- consensus/src/simplex/actors/voter/mod.rs | 39 +++++++++++++++++++---- 3 files changed, 63 insertions(+), 16 deletions(-) diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index def98f2aabc..cd7e4016f4f 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -562,6 +562,7 @@ mod tests { use commonware_runtime::{deterministic, Clock, Metrics, Runner, Spawner}; use commonware_utils::{ channel::{fallible::OneshotExt, oneshot}, + sync::Mutex, NZUsize, }; use rand::Rng; @@ -943,8 +944,8 @@ mod tests { #[derive(Clone)] struct GatedVerifyingApp { genesis: B, - started: Arc>>>, - release: Arc>>>, + started: Arc>>>, + release: Arc>>>, } impl GatedVerifyingApp { @@ -954,8 +955,8 @@ mod tests { ( Self { genesis, - started: Arc::new(std::sync::Mutex::new(Some(started_tx))), - release: Arc::new(std::sync::Mutex::new(Some(release_rx))), + started: Arc::new(Mutex::new(Some(started_tx))), + release: Arc::new(Mutex::new(Some(release_rx))), }, started_rx, release_tx, @@ -987,13 +988,12 @@ mod tests { _context: (deterministic::Context, Self::Context), _ancestry: crate::marshal::ancestry::AncestorStream, ) -> bool { - if let Some(started) = self.started.lock().unwrap().take() { + if let Some(started) = self.started.lock().take() { started.send_lossy(()); } let release = self .release .lock() - .unwrap() .take() .expect("release receiver missing"); let _ = release.await; @@ -1045,7 +1045,7 @@ mod tests { let child_round = Round::new(Epoch::zero(), 
View::new(2)); let child_ctx = Ctx { round: child_round, - leader: me, + leader: me.clone(), parent: (View::new(1), parent_digest), }; let child = B::new::(child_ctx.clone(), parent_digest, Height::new(2), 200); @@ -1093,6 +1093,25 @@ mod tests { panic!("certify should terminate after marshal abort"); }, } + + drop(inline); + drop(marshal); + drop(buffer); + + let setup2 = StandardHarness::setup_validator( + context.with_label("validator_0_restart"), + &mut oracle, + me, + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal2 = setup2.mailbox; + + let post_restart = marshal2.get_block(&child_digest).await; + assert!( + post_restart.is_none(), + "failed marshal.verified ack must not leave a durably recoverable block" + ); }); } } diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index 8230ed6eac1..b6c7d2269ea 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -91,12 +91,13 @@ mod tests { }; use commonware_utils::{ channel::{fallible::OneshotExt, mpsc, oneshot}, + sync::Mutex, vec::NonEmptyVec, NZUsize, NZU16, NZU64, }; use std::{ num::{NonZeroU32, NonZeroU64, NonZeroUsize}, - sync::{Arc, Mutex}, + sync::Arc, time::Duration, }; @@ -1631,10 +1632,10 @@ mod tests { async fn report(&mut self, activity: Self::Activity) { match activity { Update::Block(block, _ack) => { - if let Some(started) = self.started.lock().unwrap().take() { + if let Some(started) = self.started.lock().take() { started.send_lossy(block.height()); } - let release = self.release.lock().unwrap().take(); + let release = self.release.lock().take(); if let Some(release) = release { let _ = release.await; } diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index 5347e9f7be4..90832f86572 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -63,8 +63,8 @@ mod tests { secp256r1, Scheme, }, types::{ - 
Certificate, Finalization, Finalize, Notarization, Notarize, Nullification, - Nullify, Proposal, Vote, + Artifact, Certificate, Finalization, Finalize, Notarization, Notarize, + Nullification, Nullify, Proposal, Vote, }, }, types::{Participant, Round, View}, @@ -84,8 +84,9 @@ mod tests { use commonware_runtime::{ deterministic, telemetry::traces::collector::TraceStorage, Clock, Metrics, Quota, Runner, }; + use commonware_storage::journal::segmented::variable::{Config as JConfig, Journal}; use commonware_utils::{channel::mpsc, sync::Mutex, NZUsize, NZU16}; - use futures::FutureExt; + use futures::{pin_mut, FutureExt, StreamExt}; use std::{ num::{NonZeroU16, NonZeroU32}, sync::Arc, @@ -170,8 +171,10 @@ mod tests { scheme: schemes[0].clone(), elector: elector.clone(), }; - let reporter = - mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + let reporter = mocks::reporter::Reporter::new( + context.with_label("reporter"), + reporter_cfg.clone(), + ); let app_relay = Arc::new(mocks::relay::Relay::new()); let app_cfg = mocks::application::Config { @@ -196,7 +199,7 @@ mod tests { automaton: application.clone(), relay, reporter, - partition, + partition: partition.clone(), epoch: Epoch::new(4), mailbox_size: 128, leader_timeout: Duration::from_secs(5), @@ -270,6 +273,30 @@ mod tests { batcher::Message::Constructed(_) => {} } } + + let journal = Journal::<_, Artifact>::init( + context.with_label("journal_check"), + JConfig { + partition, + compression: None, + codec_config: schemes[0].certificate_codec_config(), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + write_buffer: NZUsize!(1024 * 1024), + }, + ) + .await + .expect("unable to open voter journal"); + let stream = journal + .replay(0, 0, NZUsize!(1024 * 1024)) + .await + .expect("unable to replay voter journal"); + pin_mut!(stream); + if let Some(entry) = stream.next().await { + let (_, _, _, artifact) = entry.expect("unable to decode voter journal artifact"); + 
panic!( + "failed propose broadcast must not leave durable vote remnants, found {artifact:?}" + ); + } }); } From a373ee504eec1a821260b6acd8ccafeb93235c42 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 23:03:43 -0700 Subject: [PATCH 062/107] nits --- consensus/src/marshal/coding/marshaled.rs | 32 ++++----- consensus/src/marshal/coding/mod.rs | 15 ++--- consensus/src/marshal/core/actor.rs | 7 -- consensus/src/marshal/standard/inline.rs | 72 ++------------------- consensus/src/simplex/actors/voter/actor.rs | 19 ++---- consensus/src/simplex/actors/voter/mod.rs | 8 +-- 6 files changed, 26 insertions(+), 127 deletions(-) diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index 94f52f1deaf..1cdf3749ea5 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -424,18 +424,12 @@ where is_valid = validity_request => is_valid, }; timer.observe(); - if application_valid { - // The block is only persisted at this point. If the marshal - // actor is gone, do NOT signal certify-true: the block was - // not durably stored and consensus must not finalize-vote - // on it. - if !marshal.verified(round, block).await { - debug!( - ?round, - "marshal unavailable during verified ack; skipping certify resolution" - ); - return; - } + if application_valid && !marshal.verified(round, block).await { + debug!( + ?round, + "marshal unavailable during verified ack; skipping certify resolution" + ); + return; } tx.send_lossy(application_valid); }); @@ -764,10 +758,8 @@ where return; } - // Valid re-proposal. Notify the marshal and complete the - // verification task for `certify`. If marshal is gone, do - // not signal certify-true: the block was not durably - // stored. + // Valid re-proposal: notify the marshal and complete the + // verification task for `certify`. 
if !marshal.verified(round, block).await { debug!( ?round, @@ -910,10 +902,8 @@ where round, ); if is_reproposal { - // NOTE: It is possible that, during crash recovery, we call - // `marshal.verified` twice for the same block. That function is - // idempotent, so this is safe. If marshal is gone, do not - // signal certify-true: the block was not durably stored. + // During crash recovery we may call `marshal.verified` twice for + // the same block; the call is idempotent. if !marshaled.marshal.verified(round, block).await { debug!( ?round, @@ -1079,7 +1069,7 @@ where } /// Constructs the [`Commitment`] for the genesis block. -fn genesis_coding_commitment(block: &B) -> Commitment { +pub(super) fn genesis_coding_commitment(block: &B) -> Commitment { Commitment::from(( block.digest(), block.digest(), diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index 71896e63847..68798a58f64 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -66,7 +66,8 @@ mod tests { use crate::{ marshal::{ coding::{ - types::{coding_config_for_participants, hash_context, CodedBlock}, + marshaled::genesis_coding_commitment, + types::{coding_config_for_participants, CodedBlock}, Marshaled, MarshaledConfig, }, mocks::{ @@ -83,7 +84,7 @@ mod tests { scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::Proposal, Plan, }, types::{coding::Commitment, Epoch, Epocher, FixedEpocher, Height, Round, View}, - Automaton, CertifiableAutomaton, CertifiableBlock, Relay, + Automaton, CertifiableAutomaton, Relay, }; use commonware_codec::FixedSize; use commonware_coding::ReedSolomon; @@ -1768,15 +1769,7 @@ mod tests { parent: (View::zero(), genesis_commitment()), }; let genesis = make_coding_block(genesis_ctx, Sha256::hash(b""), Height::zero(), 0); - - // Compute the genesis commitment as Marshaled would (mirrors the - // private `genesis_coding_commitment` helper in marshaled.rs). 
- let genesis_parent_commitment = Commitment::from(( - genesis.digest(), - genesis.digest(), - hash_context::(&genesis.context()), - harness::GENESIS_CODING_CONFIG, - )); + let genesis_parent_commitment = genesis_coding_commitment::(&genesis); // Build the block we want propose() to return. Its embedded context // uses the proper genesis commitment so fetch_parent matches the diff --git a/consensus/src/marshal/core/actor.rs b/consensus/src/marshal/core/actor.rs index 7c3fabaf14c..2d940ee6470 100644 --- a/consensus/src/marshal/core/actor.rs +++ b/consensus/src/marshal/core/actor.rs @@ -519,9 +519,6 @@ where response.send_lossy(info); } Message::Proposed { round, block, ack } => { - // Persist before acknowledging so the caller can rely on - // "voted ⟹ persisted" before broadcasting their own - // notarize vote on this proposal. self.cache_verified(round, block.digest(), block.clone()) .await; ack.send_lossy(()); @@ -543,10 +540,6 @@ where buffer.send(round, block, Recipients::Some(peers)).await; } Message::Verified { round, block, ack } => { - // Persist before acknowledging so the caller (typically - // `Marshaled::deferred_verify`) can rely on - // "verify-ack ⟹ persisted", which in turn lets - // `certify` resolve true only after disk persistence. 
self.cache_verified(round, block.digest(), block).await; ack.send_lossy(()); } diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index cd7e4016f4f..dcb7e645256 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -547,7 +547,7 @@ mod tests { default_leader, make_raw_block, setup_network_with_participants, Ctx, StandardHarness, TestHarness, B, BLOCKS_PER_EPOCH, NAMESPACE, NUM_VALIDATORS, S, V, }, - verifying::MockVerifyingApp, + verifying::{GatedVerifyingApp, MockVerifyingApp}, }, simplex::{scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::Context}, types::{Epoch, FixedEpocher, Height, Round, View}, @@ -560,13 +560,9 @@ mod tests { }; use commonware_macros::{select, test_traced}; use commonware_runtime::{deterministic, Clock, Metrics, Runner, Spawner}; - use commonware_utils::{ - channel::{fallible::OneshotExt, oneshot}, - sync::Mutex, - NZUsize, - }; + use commonware_utils::{channel::fallible::OneshotExt, NZUsize}; use rand::Rng; - use std::{sync::Arc, time::Duration}; + use std::time::Duration; // Compile-time assertion only: inline standard wrapper must not require `CertifiableBlock`. 
#[allow(dead_code)] @@ -941,66 +937,6 @@ mod tests { }); } - #[derive(Clone)] - struct GatedVerifyingApp { - genesis: B, - started: Arc>>>, - release: Arc>>>, - } - - impl GatedVerifyingApp { - fn new(genesis: B) -> (Self, oneshot::Receiver<()>, oneshot::Sender<()>) { - let (started_tx, started_rx) = oneshot::channel(); - let (release_tx, release_rx) = oneshot::channel(); - ( - Self { - genesis, - started: Arc::new(Mutex::new(Some(started_tx))), - release: Arc::new(Mutex::new(Some(release_rx))), - }, - started_rx, - release_tx, - ) - } - } - - impl crate::Application for GatedVerifyingApp { - type Block = B; - type Context = Ctx; - type SigningScheme = S; - - async fn genesis(&mut self) -> Self::Block { - self.genesis.clone() - } - - async fn propose>( - &mut self, - _context: (deterministic::Context, Self::Context), - _ancestry: crate::marshal::ancestry::AncestorStream, - ) -> Option { - None - } - } - - impl crate::VerifyingApplication for GatedVerifyingApp { - async fn verify>( - &mut self, - _context: (deterministic::Context, Self::Context), - _ancestry: crate::marshal::ancestry::AncestorStream, - ) -> bool { - if let Some(started) = self.started.lock().take() { - started.send_lossy(()); - } - let release = self - .release - .lock() - .take() - .expect("release receiver missing"); - let _ = release.await; - true - } - } - #[test_traced("WARN")] fn test_inline_certify_does_not_bypass_failed_verify_persistence() { let runner = deterministic::Runner::timed(Duration::from_secs(30)); @@ -1030,7 +966,7 @@ mod tests { let marshal_actor_handle = setup.actor_handle; let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); - let (mock_app, verify_started, release_verify) = + let (mock_app, verify_started, release_verify): (GatedVerifyingApp, _, _) = GatedVerifyingApp::new(genesis.clone()); let mut inline = Inline::new( context.clone(), diff --git a/consensus/src/simplex/actors/voter/actor.rs b/consensus/src/simplex/actors/voter/actor.rs index 
c0b5b87b1ac..3fc0b7e6a1e 100644 --- a/consensus/src/simplex/actors/voter/actor.rs +++ b/consensus/src/simplex/actors/voter/actor.rs @@ -828,7 +828,6 @@ impl< let mut pending_verify: Option, bool>> = None; let mut certify_pool: AbortablePool<(Rnd, Result)> = Default::default(); - let mut stopped_after_broadcast_failure = false; select_loop! { self.context, on_start => { @@ -860,11 +859,6 @@ impl< let view = round.view(); debug!(%view, "attempting certification"); let result = if is_local { - // Locally proposed payloads are certifiable-by-construction - // for their proposer. We only apply this shortcut when we - // have explicit local evidence for the exact proposal, - // either from this process or from replaying our durable - // local notarize vote. Either::Left(ready(Ok(true))) } else { let receiver = self.automaton.certify(round, proposal.payload).await; @@ -931,7 +925,6 @@ impl< round = ?context.round, "failed to broadcast proposed payload, stopping voter" ); - stopped_after_broadcast_failure = true; break; } }, @@ -1105,11 +1098,11 @@ impl< } }, } - if stopped_after_broadcast_failure { - debug!("stopped voter after failed proposal broadcast"); - } - if let Some(journal) = self.journal.take() { - journal.sync_all().await.expect("unable to sync journal"); - } + self.journal + .take() + .expect("journal missing on voter exit") + .sync_all() + .await + .expect("unable to sync journal"); } } diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index 90832f86572..87f25a34584 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -103,12 +103,6 @@ mod tests { proposes: Arc>>, } - impl FailingRelay { - fn new() -> Self { - Self::default() - } - } - impl crate::Relay for FailingRelay { type Digest = Sha256Digest; type PublicKey = PublicKey; @@ -190,7 +184,7 @@ mod tests { mocks::application::Application::new(context.with_label("app"), app_cfg); app_actor.start(); - let 
relay = FailingRelay::new(); + let relay = FailingRelay::default(); let propose_attempts = relay.proposes.clone(); let voter_cfg = Config { scheme: schemes[0].clone(), From e6ebde143fcfcf8713ba64faaaf0bfb413c833c5 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 23:24:57 -0700 Subject: [PATCH 063/107] persist early --- consensus/src/marshal/core/actor.rs | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/consensus/src/marshal/core/actor.rs b/consensus/src/marshal/core/actor.rs index 2d940ee6470..066e8768f88 100644 --- a/consensus/src/marshal/core/actor.rs +++ b/consensus/src/marshal/core/actor.rs @@ -1326,25 +1326,36 @@ where // -------------------- Prunable Storage -------------------- /// Add a verified block to the prunable archive. + /// + /// Persists the block durably before waking subscribers so that any waiter (e.g. + /// `Inline::certify`) that resolves on this notification is guaranteed to observe + /// a block that survives a crash. Reversing the order would let a certify waiter + /// emit a finalize vote for a block that is not yet on disk. async fn cache_verified( &mut self, round: Round, digest: ::Digest, block: V::Block, ) { + self.cache + .put_verified(round, digest, block.clone().into()) + .await; self.notify_subscribers(&block); - self.cache.put_verified(round, digest, block.into()).await; } /// Add a notarized block to the prunable archive. + /// + /// Persists before notifying for the same reason as [`Self::cache_verified`]. async fn cache_block( &mut self, round: Round, digest: ::Digest, block: V::Block, ) { + self.cache + .put_block(round, digest, block.clone().into()) + .await; self.notify_subscribers(&block); - self.cache.put_block(round, digest, block.into()).await; } /// Sync both finalization archives to durable storage. 
From b0c8a49a13532366fb6bbc093832310b6516f629 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Thu, 16 Apr 2026 23:46:51 -0700 Subject: [PATCH 064/107] revert change --- consensus/src/marshal/core/actor.rs | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/consensus/src/marshal/core/actor.rs b/consensus/src/marshal/core/actor.rs index 066e8768f88..2d940ee6470 100644 --- a/consensus/src/marshal/core/actor.rs +++ b/consensus/src/marshal/core/actor.rs @@ -1326,36 +1326,25 @@ where // -------------------- Prunable Storage -------------------- /// Add a verified block to the prunable archive. - /// - /// Persists the block durably before waking subscribers so that any waiter (e.g. - /// `Inline::certify`) that resolves on this notification is guaranteed to observe - /// a block that survives a crash. Reversing the order would let a certify waiter - /// emit a finalize vote for a block that is not yet on disk. async fn cache_verified( &mut self, round: Round, digest: ::Digest, block: V::Block, ) { - self.cache - .put_verified(round, digest, block.clone().into()) - .await; self.notify_subscribers(&block); + self.cache.put_verified(round, digest, block.into()).await; } /// Add a notarized block to the prunable archive. - /// - /// Persists before notifying for the same reason as [`Self::cache_verified`]. async fn cache_block( &mut self, round: Round, digest: ::Digest, block: V::Block, ) { - self.cache - .put_block(round, digest, block.clone().into()) - .await; self.notify_subscribers(&block); + self.cache.put_block(round, digest, block.into()).await; } /// Sync both finalization archives to durable storage. 
From 7bd8b4f72236c8d4b79018b5123a8ce91aba3285 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 00:10:05 -0700 Subject: [PATCH 065/107] nit --- consensus/src/marshal/standard/inline.rs | 118 ++++++++++++++++++++++- 1 file changed, 114 insertions(+), 4 deletions(-) diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index dcb7e645256..9f6d9a10034 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -450,15 +450,22 @@ where // // TODO(#3393): Avoid fetching the block just to check if it's available. let block_rx = self.marshal.subscribe_by_digest(Some(round), digest).await; + let marshal = self.marshal.clone(); let (mut tx, rx) = oneshot::channel(); self.context .with_label("inline_certify") .with_attribute("round", round) .spawn(move |_| async move { - if await_block_subscription(&mut tx, block_rx, &digest, "certification") - .await - .is_some() - { + let Some(block) = + await_block_subscription(&mut tx, block_rx, &digest, "certification").await + else { + return; + }; + + // `certify` resolving true drives the finalize vote, so mere + // buffered availability is not sufficient here. Persist the + // block through marshal before signaling success. + if marshal.verified(round, block).await { tx.send_lossy(true); } }); @@ -937,6 +944,109 @@ mod tests { }); } + /// Regression: `certify` resolving true drives the finalize vote in inline + /// mode, so it must imply the block is durably persisted even when the + /// certify path subscribed before `verify()` finished. 
+ #[test_traced("WARN")] + fn test_inline_certify_persists_block_before_resolving() { + for seed in 0u64..16 { + inline_certify_persists_block_before_resolving_at(seed); + } + } + + fn inline_certify_persists_block_before_resolving_at(seed: u64) { + use commonware_broadcast::Broadcaster; + let runner = deterministic::Runner::new( + deterministic::Config::new() + .with_seed(seed) + .with_timeout(Some(Duration::from_secs(60))), + ); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + + let setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let buffer = setup.extra; + let actor_handle = setup.actor_handle; + + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + let mock_app: MockVerifyingApp = MockVerifyingApp::new(genesis.clone()); + let mut inline = Inline::new( + context.clone(), + mock_app, + marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + let parent = make_raw_block(genesis.digest(), Height::new(1), 100); + let parent_digest = parent.digest(); + + let child_round = Round::new(Epoch::zero(), View::new(2)); + let child_ctx = Ctx { + round: child_round, + leader: me.clone(), + parent: (View::new(1), parent_digest), + }; + let child = B::new::(child_ctx.clone(), parent_digest, Height::new(2), 200); + let child_digest = child.digest(); + + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), parent.clone()) + .await + .await + .expect("buffer broadcast for parent should ack"); + buffer + .broadcast(commonware_p2p::Recipients::Some(vec![]), child.clone()) + .await + .await + .expect("buffer broadcast for child 
should ack"); + + let verify_rx = inline.verify(child_ctx, child_digest).await; + let certify_result = inline + .certify(child_round, child_digest) + .await + .await + .expect("certify result missing"); + assert!(certify_result, "certify should succeed"); + + actor_handle.abort(); + drop(verify_rx); + drop(inline); + drop(marshal); + drop(buffer); + + let setup2 = StandardHarness::setup_validator( + context.with_label("validator_0_restart"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal2 = setup2.mailbox; + + let post_restart = marshal2.get_block(&child_digest).await; + assert!( + post_restart.is_some(), + "certify resolved true ⟹ block must be durably persisted (seed={seed})" + ); + }); + } + #[test_traced("WARN")] fn test_inline_certify_does_not_bypass_failed_verify_persistence() { let runner = deterministic::Runner::timed(Duration::from_secs(30)); From ab8e60ab7a7c6f2b8775703192a78d6be25da14e Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 11:13:25 -0700 Subject: [PATCH 066/107] more tests --- consensus/src/marshal/coding/mod.rs | 15 ++ consensus/src/marshal/mocks/harness.rs | 314 ++++++++++++++++++++++++- consensus/src/marshal/standard/mod.rs | 18 ++ 3 files changed, 346 insertions(+), 1 deletion(-) diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index 68798a58f64..434afd63a2a 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -149,6 +149,21 @@ mod tests { harness::ack_pipeline_backlog_persists_on_restart::(); } + #[test_traced("WARN")] + fn test_coding_proposed_success_implies_recoverable_after_restart() { + harness::proposed_success_implies_recoverable_after_restart::(); + } + + #[test_traced("WARN")] + fn test_coding_verified_success_implies_recoverable_after_restart() { + harness::verified_success_implies_recoverable_after_restart::(); + } + + #[test_traced("WARN")] + fn 
test_coding_delivery_visibility_implies_recoverable_after_restart() { + harness::delivery_visibility_implies_recoverable_after_restart::(); + } + #[test_traced("WARN")] fn test_coding_sync_height_floor() { harness::sync_height_floor::(); diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index 1f55939661b..d7bcdec2ecf 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -40,7 +40,7 @@ use commonware_storage::{ archive::{immutable, prunable}, translator::EightCap, }; -use commonware_utils::{vec::NonEmptyVec, NZUsize, NZU16, NZU64}; +use commonware_utils::{test_rng_seeded, vec::NonEmptyVec, NZUsize, NZU16, NZU64}; use futures::StreamExt; use rand::{ seq::{IteratorRandom, SliceRandom}, @@ -316,6 +316,318 @@ pub trait TestHarness: 'static + Sized { ) -> impl Future + Send; } +fn contract_runner(seed: u64) -> deterministic::Runner { + deterministic::Runner::new( + deterministic::Config::new() + .with_seed(seed) + .with_timeout(Some(Duration::from_secs(30))), + ) +} + +fn restart_cycles_for_seed(seed: u64) -> usize { + let mut rng = test_rng_seeded(seed); + rng.gen_range(2..=4) +} + +/// Contract: `marshal.proposed(...)=true` means the block survives an +/// immediate crash and repeated recoveries. +pub fn proposed_success_implies_recoverable_after_restart() { + for seed in 0u64..16 { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::( + &mut test_rng_seeded(seed), + NAMESPACE, + NUM_VALIDATORS, + ); + + let me = participants[0].clone(); + let provider = ConstantProvider::new(schemes[0].clone()); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = H::make_test_block( + Sha256::hash(b""), + H::genesis_parent_commitment(NUM_VALIDATORS as u16), + Height::new(1), + 100, + NUM_VALIDATORS as u16, + ); + let digest = H::digest(&block); + let recovery_cycles = restart_cycles_for_seed(seed); + + let (_, mut checkpoint) = contract_runner(seed).start_and_recover({ + let participants = participants.clone(); + let me = me.clone(); + let provider = provider.clone(); + let block = block.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let setup = H::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + provider.clone(), + ) + .await; + let mut handle = ValidatorHandle:: { + mailbox: setup.mailbox, + extra: setup.extra, + }; + H::propose(&mut handle, round, &block).await; + } + }); + + for cycle in 0..recovery_cycles { + let ((), next_checkpoint) = + deterministic::Runner::from(checkpoint).start_and_recover({ + let participants = participants.clone(); + let me = me.clone(); + let provider = provider.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let restarted = H::setup_validator( + context.with_label(&format!("validator_0_restart_{cycle}")), + &mut oracle, + me.clone(), + provider.clone(), + ) + .await; + assert!( + restarted.mailbox.get_block(&digest).await.is_some(), + "marshal.proposed() returning true must imply the block is recoverable \ + after restart (seed={seed}, cycle={cycle})" + ); + } + }); + checkpoint = next_checkpoint; + } + } +} + +/// Contract: `marshal.verified(...)=true` 
means the block survives an +/// immediate crash and repeated recoveries. +pub fn verified_success_implies_recoverable_after_restart() { + for seed in 0u64..16 { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::( + &mut test_rng_seeded(seed), + NAMESPACE, + NUM_VALIDATORS, + ); + + let me = participants[0].clone(); + let provider = ConstantProvider::new(schemes[0].clone()); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = H::make_test_block( + Sha256::hash(b""), + H::genesis_parent_commitment(NUM_VALIDATORS as u16), + Height::new(1), + 100, + NUM_VALIDATORS as u16, + ); + let digest = H::digest(&block); + let recovery_cycles = restart_cycles_for_seed(seed); + + let (_, mut checkpoint) = contract_runner(seed).start_and_recover({ + let participants = participants.clone(); + let me = me.clone(); + let provider = provider.clone(); + let block = block.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let setup = H::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + provider.clone(), + ) + .await; + let mut handle = ValidatorHandle:: { + mailbox: setup.mailbox, + extra: setup.extra, + }; + let mut peers: [ValidatorHandle; 0] = []; + H::verify(&mut handle, round, &block, &mut peers).await; + } + }); + + for cycle in 0..recovery_cycles { + let ((), next_checkpoint) = + deterministic::Runner::from(checkpoint).start_and_recover({ + let participants = participants.clone(); + let me = me.clone(); + let provider = provider.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let restarted = H::setup_validator( + context.with_label(&format!("validator_0_restart_{cycle}")), + &mut oracle, + me.clone(), + provider.clone(), + ) + .await; + assert!( + 
restarted.mailbox.get_block(&digest).await.is_some(), + "marshal.verified() returning true must imply the block is recoverable \ + after restart (seed={seed}, cycle={cycle})" + ); + } + }); + checkpoint = next_checkpoint; + } + } +} + +/// Contract: once marshal has delivered a finalized block to the application, +/// that finalized block and its certificate must already be durable. +pub fn delivery_visibility_implies_recoverable_after_restart() { + for seed in 0u64..16 { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::( + &mut test_rng_seeded(seed), + NAMESPACE, + NUM_VALIDATORS, + ); + + let me = participants[0].clone(); + let provider = ConstantProvider::new(schemes[0].clone()); + let application = Application::::manual_ack(); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = H::make_test_block( + Sha256::hash(b""), + H::genesis_parent_commitment(NUM_VALIDATORS as u16), + Height::new(1), + 100, + NUM_VALIDATORS as u16, + ); + let finalization = H::make_finalization( + Proposal::new(round, View::zero(), H::commitment(&block)), + &schemes, + QUORUM, + ); + let recovery_cycles = restart_cycles_for_seed(seed); + + let (_, mut checkpoint) = contract_runner(seed).start_and_recover({ + let participants = participants.clone(); + let me = me.clone(); + let provider = provider.clone(); + let application = application.clone(); + let block = block.clone(); + let finalization = finalization.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let setup = H::setup_validator_with( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + provider.clone(), + NZUsize!(1), + application.clone(), + ) + .await; + let mut mailbox = setup.mailbox; + let mut handle = ValidatorHandle:: { + mailbox: mailbox.clone(), + extra: setup.extra, + }; + let mut peers: [ValidatorHandle; 0] = []; + H::verify(&mut handle, 
round, &block, &mut peers).await; + H::report_finalization(&mut mailbox, finalization.clone()).await; + + let height = application.acknowledged().await; + assert_eq!( + height, + Height::new(1), + "expected the first delivered finalized block to become visible at height 1 \ + before restart (seed={seed})" + ); + } + }); + + for cycle in 0..recovery_cycles { + let expected_round = finalization.round(); + let ((), next_checkpoint) = + deterministic::Runner::from(checkpoint).start_and_recover({ + let participants = participants.clone(); + let me = me.clone(); + let provider = provider.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let restarted = H::setup_validator( + context.with_label(&format!("validator_0_restart_{cycle}")), + &mut oracle, + me.clone(), + provider.clone(), + ) + .await; + let recovered = restarted.mailbox.get_block(Height::new(1)).await.unwrap_or_else(|| panic!( + "delivered finalized block must be recoverable after restart \ + (seed={seed}, cycle={cycle})", + )); // panic! interpolates seed/cycle; `expect` would print the braces verbatim + assert_eq!( + recovered.height(), + Height::new(1), + "restart should recover the delivered finalized block by height \ + (seed={seed}, cycle={cycle})" + ); + assert_eq!( + restarted + .mailbox + .get_finalization(Height::new(1)) + .await + .unwrap_or_else(|| panic!( + "delivered finalization must be recoverable after restart \ + (seed={seed}, cycle={cycle})", + )) // same reason: keep seed/cycle in the failure message + .round(), + expected_round, + "restart should recover the delivered finalization by height \ + (seed={seed}, cycle={cycle})" + ); + } + }); + checkpoint = next_checkpoint; + } + } +} + // ============================================================================= // Standard Harness Implementation // ============================================================================= diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index b6c7d2269ea..88d304f05da 100644 --- 
a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -159,6 +159,24 @@ mod tests { harness::ack_pipeline_backlog_persists_on_restart::(); } + #[test_traced("WARN")] + fn test_standard_proposed_success_implies_recoverable_after_restart() { + harness::proposed_success_implies_recoverable_after_restart::(); + harness::proposed_success_implies_recoverable_after_restart::(); + } + + #[test_traced("WARN")] + fn test_standard_verified_success_implies_recoverable_after_restart() { + harness::verified_success_implies_recoverable_after_restart::(); + harness::verified_success_implies_recoverable_after_restart::(); + } + + #[test_traced("WARN")] + fn test_standard_delivery_visibility_implies_recoverable_after_restart() { + harness::delivery_visibility_implies_recoverable_after_restart::(); + harness::delivery_visibility_implies_recoverable_after_restart::(); + } + #[test_traced("WARN")] fn test_standard_sync_height_floor() { harness::sync_height_floor::(); From 37854e3eddfb929ff00522c3e2d22c6f2ec2febe Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 11:29:56 -0700 Subject: [PATCH 067/107] more tests --- consensus/src/marshal/coding/mod.rs | 10 + consensus/src/marshal/mocks/harness.rs | 378 +++++++++++++++++++++++++ consensus/src/marshal/standard/mod.rs | 15 + 3 files changed, 403 insertions(+) diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index 434afd63a2a..602d35fdfc9 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -139,6 +139,16 @@ mod tests { } } + #[test_group("slow")] + #[test_traced("WARN")] + fn test_coding_hailstorm_restarts() { + for seed in 0..2 { + let r1 = harness::hailstorm::(seed, 4, 4, LINK); + let r2 = harness::hailstorm::(seed, 4, 4, LINK); + assert_eq!(r1, r2); + } + } + #[test_traced("WARN")] fn test_coding_ack_pipeline_backlog() { harness::ack_pipeline_backlog::(); diff --git a/consensus/src/marshal/mocks/harness.rs 
b/consensus/src/marshal/mocks/harness.rs index d7bcdec2ecf..8183fc2b3e9 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -329,6 +329,384 @@ fn restart_cycles_for_seed(seed: u64) -> usize { rng.gen_range(2..=4) } +struct HailstormValidator { + application: Application, + handle: ValidatorHandle, + actor_handle: commonware_runtime::Handle<()>, +} + +fn active_validator_indices( + validators: &[Option>], +) -> Vec { + validators + .iter() + .enumerate() + .filter_map(|(idx, validator)| validator.as_ref().map(|_| idx)) + .collect() +} + +async fn wait_for_validator_height( + context: &mut deterministic::Context, + validator: &HailstormValidator, + height: Height, + expected_digest: D, + expected_finalization: &Finalization, + label: &str, +) { + loop { + let block = validator.handle.mailbox.get_block(height).await; + let finalization = validator.handle.mailbox.get_finalization(height).await; + if let (Some(block), Some(finalization)) = (block, finalization) { + assert_eq!( + block.digest(), + expected_digest, + "{label}: wrong block digest at height {}", + height.get() + ); + assert_eq!( + finalization.round(), + expected_finalization.round(), + "{label}: wrong finalization round at height {}", + height.get() + ); + assert_eq!( + finalization.proposal.payload, + expected_finalization.proposal.payload, + "{label}: wrong finalization payload at height {}", + height.get() + ); + break; + } + context.sleep(Duration::from_millis(10)).await; + } +} + +async fn assert_validator_matches_canonical( + validator: &HailstormValidator, + canonical: &[(Height, D, Finalization)], + label: &str, +) { + let delivered = validator.application.blocks(); + for (height, block) in delivered { + let (_, expected_digest, _) = canonical + .iter() + .find(|(expected_height, _, _)| *expected_height == height) + .unwrap_or_else(|| { + panic!( + "{label}: unexpected delivered block at height {}", + height.get() + ) + }); + assert_eq!( + 
block.digest(), + *expected_digest, + "{label}: application delivered wrong digest at height {}", + height.get() + ); + } + + if let Some((height, digest)) = validator.application.tip() { + let (_, expected_digest, _) = canonical + .iter() + .find(|(expected_height, _, _)| *expected_height == height) + .unwrap_or_else(|| { + panic!( + "{label}: unexpected delivered tip at height {}", + height.get() + ) + }); + assert_eq!( + digest, + *expected_digest, + "{label}: application reported wrong tip digest at height {}", + height.get() + ); + } + + for (height, expected_digest, expected_finalization) in canonical { + let stored_block = validator + .handle + .mailbox + .get_block(*height) + .await + .unwrap_or_else(|| { + panic!( + "{label}: missing finalized block at height {}", + height.get() + ) + }); + assert_eq!( + stored_block.digest(), + *expected_digest, + "{label}: stored wrong block digest at height {}", + height.get() + ); + + let stored_finalization = validator + .handle + .mailbox + .get_finalization(*height) + .await + .unwrap_or_else(|| panic!("{label}: missing finalization at height {}", height.get())); + assert_eq!( + stored_finalization.round(), + expected_finalization.round(), + "{label}: stored wrong finalization round at height {}", + height.get() + ); + assert_eq!( + stored_finalization.proposal.payload, + expected_finalization.proposal.payload, + "{label}: stored wrong finalization payload at height {}", + height.get() + ); + } + + if let Some((height, digest, _)) = canonical.last() { + assert_eq!( + validator.handle.mailbox.get_info(Identifier::Latest).await, + Some((*height, *digest)), + "{label}: latest info should match the canonical tip", + ); + } +} + +async fn assert_active_validators_match_canonical( + validators: &[Option>], + canonical: &[(Height, D, Finalization)], +) { + for idx in active_validator_indices(validators) { + let validator = validators[idx] + .as_ref() + .expect("active validator should be present"); + 
assert_validator_matches_canonical(validator, canonical, &format!("validator_{idx}")).await; + } +} + +async fn advance_hailstorm_to( + target: u64, + context: &mut deterministic::Context, + validators: &mut [Option>], + canonical: &mut Vec<(Height, D, Finalization)>, + parent: &mut D, + parent_commitment: &mut H::Commitment, + participants: &[K], + schemes: &[S], + propagation_delay: Duration, +) { + for height_value in (canonical.len() as u64 + 1)..=target { + let height = Height::new(height_value); + let active = active_validator_indices(validators); + let proposer_idx = active[context.gen_range(0..active.len())]; + let block = H::make_test_block( + *parent, + parent_commitment.clone(), + height, + height_value, + participants.len() as u16, + ); + let round = Round::new(Epoch::zero(), View::new(height_value)); + let proposal = Proposal { + round, + parent: height + .previous() + .map(|previous| View::new(previous.get())) + .unwrap_or(View::zero()), + payload: H::commitment(&block), + }; + let expected_digest = H::digest(&block); + let finalization = H::make_finalization(proposal.clone(), schemes, QUORUM); + + { + let proposer = validators[proposer_idx] + .as_mut() + .expect("proposer should be active"); + H::propose(&mut proposer.handle, round, &block).await; + H::verify(&mut proposer.handle, round, &block, &mut []).await; + H::report_notarization( + &mut proposer.handle.mailbox, + H::make_notarization(proposal, schemes, QUORUM), + ) + .await; + } + + context.sleep(propagation_delay).await; + + for idx in active_validator_indices(validators) { + let validator = validators[idx] + .as_mut() + .expect("validator should remain active"); + H::report_finalization(&mut validator.handle.mailbox, finalization.clone()).await; + } + + canonical.push((height, expected_digest, finalization)); + *parent = expected_digest; + *parent_commitment = H::commitment(&block); + + let (_, _, expected_finalization) = canonical + .last() + .expect("canonical chain should contain the new 
height"); + for idx in active_validator_indices(validators) { + let validator = validators[idx] + .as_ref() + .expect("validator should be active"); + wait_for_validator_height( + context, + validator, + height, + expected_digest, + expected_finalization, + &format!("validator_{idx}"), + ) + .await; + } + } + + assert_active_validators_match_canonical(validators, canonical).await; +} + +/// Stress marshal with repeated validator crashes and recoveries while a +/// canonical finalized chain continues to advance. +pub fn hailstorm(seed: u64, shutdowns: usize, interval: u64, link: Link) -> String { + let runner = deterministic::Runner::new( + deterministic::Config::new() + .with_seed(seed) + .with_timeout(Some(H::finalize_timeout())), + ); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(3), participants.clone()) + .await; + let propagation_delay = link.latency; + setup_network_links(&mut oracle, &participants, link).await; + + let mut validators = Vec::new(); + for (idx, validator) in participants.iter().enumerate() { + let setup = H::setup_validator( + context.with_label(&format!("validator_{idx}")), + &mut oracle, + validator.clone(), + ConstantProvider::new(schemes[idx].clone()), + ) + .await; + validators.push(Some(HailstormValidator:: { + application: setup.application, + handle: ValidatorHandle { + mailbox: setup.mailbox, + extra: setup.extra, + }, + actor_handle: setup.actor_handle, + })); + } + + let mut canonical = Vec::<(Height, D, Finalization)>::new(); + let mut parent = Sha256::hash(b""); + let mut parent_commitment = H::genesis_parent_commitment(participants.len() as u16); + let mut target_height = 0u64; + + for shutdown_idx in 0..shutdowns { + target_height += interval; + advance_hailstorm_to( + target_height, + &mut context, + &mut validators, + &mut 
canonical, + &mut parent, + &mut parent_commitment, + &participants, + &schemes, + propagation_delay, + ) + .await; + + let active = active_validator_indices(&validators); + let selected = active[context.gen_range(0..active.len())]; + let persisted_height = target_height; + let crashed = validators[selected] + .take() + .expect("selected validator should be active"); + crashed.actor_handle.abort(); + let _ = crashed.actor_handle.await; + info!( + seed, + shutdown_idx, selected, persisted_height, "marshal hailstorm shutdown" + ); + + target_height += interval; + advance_hailstorm_to( + target_height, + &mut context, + &mut validators, + &mut canonical, + &mut parent, + &mut parent_commitment, + &participants, + &schemes, + propagation_delay, + ) + .await; + + let restarted = H::setup_validator( + context.with_label(&format!("validator_{selected}_restart_{shutdown_idx}")), + &mut oracle, + participants[selected].clone(), + ConstantProvider::new(schemes[selected].clone()), + ) + .await; + assert_eq!( + restarted.height, + Height::new(persisted_height), + "validator {selected} should recover its persisted finalized height before replay" + ); + + let mut restarted = HailstormValidator:: { + application: restarted.application, + handle: ValidatorHandle { + mailbox: restarted.mailbox, + extra: restarted.extra, + }, + actor_handle: restarted.actor_handle, + }; + for (_, _, finalization) in canonical.iter().skip(persisted_height as usize) { + H::report_finalization(&mut restarted.handle.mailbox, finalization.clone()).await; + } + validators[selected] = Some(restarted); + + let expected_digest = canonical + .last() + .map(|(_, digest, _)| *digest) + .expect("canonical chain should be non-empty"); + let expected_finalization = canonical + .last() + .map(|(_, _, finalization)| finalization.clone()) + .expect("canonical chain should be non-empty"); + wait_for_validator_height( + &mut context, + validators[selected] + .as_ref() + .expect("restarted validator should be active"), 
+ Height::new(target_height), + expected_digest, + &expected_finalization, + &format!("validator_{selected}_restarted"), + ) + .await; + assert_active_validators_match_canonical(&validators, &canonical).await; + info!( + seed, + shutdown_idx, selected, target_height, "marshal hailstorm recovered" + ); + } + + context.auditor().state() + }) +} + /// Contract: `marshal.proposed(...)=true` means the block survives an /// immediate crash and repeated recoveries. pub fn proposed_success_implies_recoverable_after_restart() { diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index 88d304f05da..330ec379fb9 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -111,6 +111,12 @@ mod tests { assert_eq!(r1, r2); } + fn assert_hailstorm_deterministic(seed: u64) { + let r1 = harness::hailstorm::(seed, 4, 4, LINK); + let r2 = harness::hailstorm::(seed, 4, 4, LINK); + assert_eq!(r1, r2); + } + #[test_group("slow")] #[test_traced("WARN")] fn test_standard_finalize_good_links() { @@ -147,6 +153,15 @@ mod tests { } } + #[test_group("slow")] + #[test_traced("WARN")] + fn test_standard_hailstorm_restarts() { + for seed in 0..2 { + assert_hailstorm_deterministic::(seed); + assert_hailstorm_deterministic::(seed); + } + } + #[test_traced("WARN")] fn test_standard_ack_pipeline_backlog() { harness::ack_pipeline_backlog::(); From dff52e2974d2772006d4ce3050af571ec0d6f0d5 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 11:47:53 -0700 Subject: [PATCH 068/107] extend --- consensus/src/marshal/mocks/harness.rs | 35 +++++++++++++++++++++----- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index 8183fc2b3e9..0297fade063 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -498,6 +498,12 @@ async fn advance_hailstorm_to( let height = 
Height::new(height_value); let active = active_validator_indices(validators); let proposer_idx = active[context.gen_range(0..active.len())]; + let verifier_count = usize::min(QUORUM as usize, active.len()); + let verifier_indices = active + .iter() + .copied() + .filter(|idx| *idx != proposer_idx) + .choose_multiple(context, verifier_count.saturating_sub(1)); let block = H::make_test_block( *parent, parent_commitment.clone(), @@ -522,7 +528,6 @@ async fn advance_hailstorm_to( .as_mut() .expect("proposer should be active"); H::propose(&mut proposer.handle, round, &block).await; - H::verify(&mut proposer.handle, round, &block, &mut []).await; H::report_notarization( &mut proposer.handle.mailbox, H::make_notarization(proposal, schemes, QUORUM), @@ -530,6 +535,13 @@ async fn advance_hailstorm_to( .await; } + for verifier_idx in verifier_indices.iter().copied() { + let verifier = validators[verifier_idx] + .as_mut() + .expect("verifier should be active"); + H::verify(&mut verifier.handle, round, &block, &mut []).await; + } + context.sleep(propagation_delay).await; for idx in active_validator_indices(validators) { @@ -583,7 +595,7 @@ pub fn hailstorm(seed: u64, shutdowns: usize, interval: u64, lin setup_network_with_participants(context.clone(), NZUsize!(3), participants.clone()) .await; let propagation_delay = link.latency; - setup_network_links(&mut oracle, &participants, link).await; + setup_network_links(&mut oracle, &participants, link.clone()).await; let mut validators = Vec::new(); for (idx, validator) in participants.iter().enumerate() { @@ -608,9 +620,11 @@ pub fn hailstorm(seed: u64, shutdowns: usize, interval: u64, lin let mut parent = Sha256::hash(b""); let mut parent_commitment = H::genesis_parent_commitment(participants.len() as u16); let mut target_height = 0u64; + let max_interval = interval.max(1); for shutdown_idx in 0..shutdowns { - target_height += interval; + let leadup = context.gen_range(1..=max_interval); + target_height += leadup; 
advance_hailstorm_to( target_height, &mut context, @@ -634,10 +648,15 @@ pub fn hailstorm(seed: u64, shutdowns: usize, interval: u64, lin let _ = crashed.actor_handle.await; info!( seed, - shutdown_idx, selected, persisted_height, "marshal hailstorm shutdown" + shutdown_idx, + selected, + persisted_height, + leadup, + "marshal hailstorm shutdown" ); - target_height += interval; + let downtime = context.gen_range(1..=max_interval); + target_height += downtime; advance_hailstorm_to( target_height, &mut context, @@ -699,7 +718,11 @@ pub fn hailstorm(seed: u64, shutdowns: usize, interval: u64, lin assert_active_validators_match_canonical(&validators, &canonical).await; info!( seed, - shutdown_idx, selected, target_height, "marshal hailstorm recovered" + shutdown_idx, + selected, + target_height, + downtime, + "marshal hailstorm recovered" ); } From cf1e2b9880ce28ea8c8dbf11dfe3c4246d9f7285 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 12:27:51 -0700 Subject: [PATCH 069/107] progress --- consensus/src/marshal/coding/mod.rs | 14 ++- consensus/src/marshal/mocks/harness.rs | 119 ++++++++++++++----------- consensus/src/marshal/standard/mod.rs | 19 +++- 3 files changed, 97 insertions(+), 55 deletions(-) diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index 602d35fdfc9..f0b1a028743 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -143,8 +143,18 @@ mod tests { #[test_traced("WARN")] fn test_coding_hailstorm_restarts() { for seed in 0..2 { - let r1 = harness::hailstorm::(seed, 4, 4, LINK); - let r2 = harness::hailstorm::(seed, 4, 4, LINK); + let r1 = harness::hailstorm::(seed, 4, 4, 1, LINK); + let r2 = harness::hailstorm::(seed, 4, 4, 1, LINK); + assert_eq!(r1, r2); + } + } + + #[test_group("slow")] + #[test_traced("WARN")] + fn test_coding_hailstorm_multi_restarts() { + for seed in 0..2 { + let r1 = harness::hailstorm::(seed, 4, 4, 2, LINK); + let r2 = 
harness::hailstorm::(seed, 4, 4, 2, LINK); assert_eq!(r1, r2); } } diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index 0297fade063..9ad60869b8c 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -579,7 +579,13 @@ async fn advance_hailstorm_to( /// Stress marshal with repeated validator crashes and recoveries while a /// canonical finalized chain continues to advance. -pub fn hailstorm(seed: u64, shutdowns: usize, interval: u64, link: Link) -> String { +pub fn hailstorm( + seed: u64, + shutdowns: usize, + interval: u64, + max_down: usize, + link: Link, +) -> String { let runner = deterministic::Runner::new( deterministic::Config::new() .with_seed(seed) @@ -621,6 +627,7 @@ pub fn hailstorm(seed: u64, shutdowns: usize, interval: u64, lin let mut parent_commitment = H::genesis_parent_commitment(participants.len() as u16); let mut target_height = 0u64; let max_interval = interval.max(1); + let max_down = max_down.max(1); for shutdown_idx in 0..shutdowns { let leadup = context.gen_range(1..=max_interval); @@ -639,17 +646,27 @@ pub fn hailstorm(seed: u64, shutdowns: usize, interval: u64, lin .await; let active = active_validator_indices(&validators); - let selected = active[context.gen_range(0..active.len())]; + let down_limit = usize::min(max_down, active.len().saturating_sub(1)); + let down_count = down_limit.max(1); + let down_count = context.gen_range(1..=down_count); + let mut selected = active + .iter() + .copied() + .choose_multiple(&mut context, down_count); + selected.sort_unstable(); let persisted_height = target_height; - let crashed = validators[selected] - .take() - .expect("selected validator should be active"); - crashed.actor_handle.abort(); - let _ = crashed.actor_handle.await; + for idx in selected.iter().copied() { + let crashed = validators[idx] + .take() + .expect("selected validator should be active"); + crashed.actor_handle.abort(); + let _ = 
crashed.actor_handle.await; + } info!( seed, shutdown_idx, - selected, + ?selected, + down_count, persisted_height, leadup, "marshal hailstorm shutdown" @@ -670,56 +687,56 @@ pub fn hailstorm(seed: u64, shutdowns: usize, interval: u64, lin ) .await; - let restarted = H::setup_validator( - context.with_label(&format!("validator_{selected}_restart_{shutdown_idx}")), - &mut oracle, - participants[selected].clone(), - ConstantProvider::new(schemes[selected].clone()), - ) - .await; - assert_eq!( - restarted.height, - Height::new(persisted_height), - "validator {selected} should recover its persisted finalized height before replay" - ); + for idx in selected.iter().copied() { + let restarted = H::setup_validator( + context.with_label(&format!("validator_{idx}_restart_{shutdown_idx}")), + &mut oracle, + participants[idx].clone(), + ConstantProvider::new(schemes[idx].clone()), + ) + .await; + assert_eq!( + restarted.height, + Height::new(persisted_height), + "validator {idx} should recover its persisted finalized height before replay" + ); - let mut restarted = HailstormValidator:: { - application: restarted.application, - handle: ValidatorHandle { - mailbox: restarted.mailbox, - extra: restarted.extra, - }, - actor_handle: restarted.actor_handle, - }; - for (_, _, finalization) in canonical.iter().skip(persisted_height as usize) { - H::report_finalization(&mut restarted.handle.mailbox, finalization.clone()).await; + let mut restarted = HailstormValidator:: { + application: restarted.application, + handle: ValidatorHandle { + mailbox: restarted.mailbox, + extra: restarted.extra, + }, + actor_handle: restarted.actor_handle, + }; + for (_, _, finalization) in canonical.iter().skip(persisted_height as usize) { + H::report_finalization(&mut restarted.handle.mailbox, finalization.clone()) + .await; + } + validators[idx] = Some(restarted); } - validators[selected] = Some(restarted); - - let expected_digest = canonical - .last() - .map(|(_, digest, _)| *digest) - 
.expect("canonical chain should be non-empty"); - let expected_finalization = canonical - .last() - .map(|(_, _, finalization)| finalization.clone()) - .expect("canonical chain should be non-empty"); - wait_for_validator_height( - &mut context, - validators[selected] + + for idx in selected.iter().copied() { + let validator = validators[idx] .as_ref() - .expect("restarted validator should be active"), - Height::new(target_height), - expected_digest, - &expected_finalization, - &format!("validator_{selected}_restarted"), - ) - .await; + .expect("restarted validator should be active"); + for (height, digest, finalization) in canonical.iter() { + wait_for_validator_height( + &mut context, + validator, + *height, + *digest, + finalization, + &format!("validator_{idx}_restarted"), + ) + .await; + } + } assert_active_validators_match_canonical(&validators, &canonical).await; info!( seed, shutdown_idx, - selected, + ?selected, target_height, downtime, "marshal hailstorm recovered" diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index 330ec379fb9..512921602c0 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -112,8 +112,14 @@ mod tests { } fn assert_hailstorm_deterministic(seed: u64) { - let r1 = harness::hailstorm::(seed, 4, 4, LINK); - let r2 = harness::hailstorm::(seed, 4, 4, LINK); + let r1 = harness::hailstorm::(seed, 4, 4, 1, LINK); + let r2 = harness::hailstorm::(seed, 4, 4, 1, LINK); + assert_eq!(r1, r2); + } + + fn assert_hailstorm_multi_deterministic(seed: u64) { + let r1 = harness::hailstorm::(seed, 4, 4, 2, LINK); + let r2 = harness::hailstorm::(seed, 4, 4, 2, LINK); assert_eq!(r1, r2); } @@ -162,6 +168,15 @@ mod tests { } } + #[test_group("slow")] + #[test_traced("WARN")] + fn test_standard_hailstorm_multi_restarts() { + for seed in 0..2 { + assert_hailstorm_multi_deterministic::(seed); + assert_hailstorm_multi_deterministic::(seed); + } + } + #[test_traced("WARN")] fn 
test_standard_ack_pipeline_backlog() { harness::ack_pipeline_backlog::(); From fd60d042069d6483246efb9c8656beea1b86544e Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 13:19:08 -0700 Subject: [PATCH 070/107] add more regression tests --- consensus/src/marshal/standard/mod.rs | 78 +++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 4 deletions(-) diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index 512921602c0..b355fb7bcbe 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -1618,8 +1618,13 @@ mod tests { } /// A no-op buffer used by tests that do not need marshal's dissemination path. + /// + /// Subscription senders are retained so marshal-local subscriptions stay live + /// until the actor drops the buffer. #[derive(Clone, Default)] - struct NoopBuffer; + struct NoopBuffer { + subscriptions: Arc>>>, + } impl crate::marshal::core::Buffer> for NoopBuffer { type PublicKey = PublicKey; @@ -1634,7 +1639,8 @@ mod tests { } async fn subscribe_by_digest(&self, _digest: D) -> oneshot::Receiver { - let (_sender, receiver) = oneshot::channel(); + let (sender, receiver) = oneshot::channel(); + self.subscriptions.lock().push(sender); receiver } @@ -1642,7 +1648,8 @@ mod tests { &self, _commitment: D, ) -> oneshot::Receiver { - let (_sender, receiver) = oneshot::channel(); + let (sender, receiver) = oneshot::channel(); + self.subscriptions.lock().push(sender); receiver } @@ -1783,7 +1790,7 @@ mod tests { let (resolver_tx, resolver_rx) = mpsc::channel(100); let actor_handle = actor.start( application, - NoopBuffer, + NoopBuffer::default(), (resolver_rx, NoopResolver::holding(resolver_tx)), ); (mailbox, actor_handle) @@ -2013,6 +2020,69 @@ mod tests { }); } + /// Regression: a block subscription must not resolve until the corresponding + /// verified-block write has completed successfully. 
Otherwise a waiter can + /// act on a block that marshal immediately loses to a fatal storage error. + #[test_traced("WARN")] + fn test_standard_verified_write_failure_does_not_notify_subscribers() { + let runner = deterministic::Runner::new( + deterministic::Config::new() + .with_timeout(Some(Duration::from_secs(30))) + .with_catch_panics(true), + ); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + let partition_prefix = format!("verified-write-failure-{me}"); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = make_raw_block(Sha256::hash(b""), Height::new(1), 100); + let digest = block.digest(); + + let (mailbox, actor_handle) = start_standard_actor( + context.with_label("validator_0"), + &partition_prefix, + ConstantProvider::new(schemes[0].clone()), + Application::::default(), + ) + .await; + + let subscription = mailbox.subscribe_by_digest(Some(round), digest).await; + + *context.storage_fault_config().write() = + deterministic::FaultConfig::default().write(1.0); + + assert!( + !mailbox.verified(round, block.clone()).await, + "verified should fail when marshal cannot persist the block" + ); + assert!( + actor_handle.await.is_err(), + "fatal verified write failure should terminate the marshal actor" + ); + assert!( + subscription.await.is_err(), + "subscription must not resolve before verified persistence succeeds" + ); + + *context.storage_fault_config().write() = deterministic::FaultConfig::default(); + let (mailbox, _actor_handle) = start_standard_actor( + context.with_label("validator_0_restart"), + &partition_prefix, + ConstantProvider::new(schemes[0].clone()), + Application::::default(), + ) + .await; + assert!( + mailbox.get_block(&digest).await.is_none(), + "failed verified write must not survive restart" + ); + }); + } + /// Parse the `processed_height` gauge value from a 
prometheus-encoded /// metrics dump produced by `Metrics::encode`. Looks for any line of the /// form `processed_height `. From 0c13c2183af1eceafc0ebfa4af013439949757d9 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 13:44:03 -0700 Subject: [PATCH 071/107] remove too strict test --- consensus/src/marshal/standard/mod.rs | 78 ++------------------------- 1 file changed, 4 insertions(+), 74 deletions(-) diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index b355fb7bcbe..512921602c0 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -1618,13 +1618,8 @@ mod tests { } /// A no-op buffer used by tests that do not need marshal's dissemination path. - /// - /// Subscription senders are retained so marshal-local subscriptions stay live - /// until the actor drops the buffer. #[derive(Clone, Default)] - struct NoopBuffer { - subscriptions: Arc>>>, - } + struct NoopBuffer; impl crate::marshal::core::Buffer> for NoopBuffer { type PublicKey = PublicKey; @@ -1639,8 +1634,7 @@ mod tests { } async fn subscribe_by_digest(&self, _digest: D) -> oneshot::Receiver { - let (sender, receiver) = oneshot::channel(); - self.subscriptions.lock().push(sender); + let (_sender, receiver) = oneshot::channel(); receiver } @@ -1648,8 +1642,7 @@ mod tests { &self, _commitment: D, ) -> oneshot::Receiver { - let (sender, receiver) = oneshot::channel(); - self.subscriptions.lock().push(sender); + let (_sender, receiver) = oneshot::channel(); receiver } @@ -1790,7 +1783,7 @@ mod tests { let (resolver_tx, resolver_rx) = mpsc::channel(100); let actor_handle = actor.start( application, - NoopBuffer::default(), + NoopBuffer, (resolver_rx, NoopResolver::holding(resolver_tx)), ); (mailbox, actor_handle) @@ -2020,69 +2013,6 @@ mod tests { }); } - /// Regression: a block subscription must not resolve until the corresponding - /// verified-block write has completed successfully. 
Otherwise a waiter can - /// act on a block that marshal immediately loses to a fatal storage error. - #[test_traced("WARN")] - fn test_standard_verified_write_failure_does_not_notify_subscribers() { - let runner = deterministic::Runner::new( - deterministic::Config::new() - .with_timeout(Some(Duration::from_secs(30))) - .with_catch_panics(true), - ); - runner.start(|mut context| async move { - let Fixture { - participants, - schemes, - .. - } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); - let me = participants[0].clone(); - let partition_prefix = format!("verified-write-failure-{me}"); - let round = Round::new(Epoch::zero(), View::new(1)); - let block = make_raw_block(Sha256::hash(b""), Height::new(1), 100); - let digest = block.digest(); - - let (mailbox, actor_handle) = start_standard_actor( - context.with_label("validator_0"), - &partition_prefix, - ConstantProvider::new(schemes[0].clone()), - Application::::default(), - ) - .await; - - let subscription = mailbox.subscribe_by_digest(Some(round), digest).await; - - *context.storage_fault_config().write() = - deterministic::FaultConfig::default().write(1.0); - - assert!( - !mailbox.verified(round, block.clone()).await, - "verified should fail when marshal cannot persist the block" - ); - assert!( - actor_handle.await.is_err(), - "fatal verified write failure should terminate the marshal actor" - ); - assert!( - subscription.await.is_err(), - "subscription must not resolve before verified persistence succeeds" - ); - - *context.storage_fault_config().write() = deterministic::FaultConfig::default(); - let (mailbox, _actor_handle) = start_standard_actor( - context.with_label("validator_0_restart"), - &partition_prefix, - ConstantProvider::new(schemes[0].clone()), - Application::::default(), - ) - .await; - assert!( - mailbox.get_block(&digest).await.is_none(), - "failed verified write must not survive restart" - ); - }); - } - /// Parse the `processed_height` gauge value from a 
prometheus-encoded /// metrics dump produced by `Metrics::encode`. Looks for any line of the /// form `processed_height `. From cbe722b955e85bc4bb893bb101fdf1c9df331618 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 13:57:26 -0700 Subject: [PATCH 072/107] cleanup --- consensus/src/marshal/mocks/harness.rs | 106 +++++++++++++------------ 1 file changed, 55 insertions(+), 51 deletions(-) diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index 9ad60869b8c..deb8473b5cf 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -335,6 +335,19 @@ struct HailstormValidator { actor_handle: commonware_runtime::Handle<()>, } +type CanonicalEntry = (Height, D, Finalization::Commitment>); +type CanonicalChain = Vec>; + +struct HailstormState<'a, H: TestHarness> { + validators: &'a mut [Option>], + canonical: &'a mut CanonicalChain, + parent: &'a mut D, + parent_commitment: &'a mut H::Commitment, + participants: &'a [K], + schemes: &'a [S], + propagation_delay: Duration, +} + fn active_validator_indices( validators: &[Option>], ) -> Vec { @@ -383,7 +396,7 @@ async fn wait_for_validator_height( async fn assert_validator_matches_canonical( validator: &HailstormValidator, - canonical: &[(Height, D, Finalization)], + canonical: &[CanonicalEntry], label: &str, ) { let delivered = validator.application.blocks(); @@ -473,7 +486,7 @@ async fn assert_validator_matches_canonical( async fn assert_active_validators_match_canonical( validators: &[Option>], - canonical: &[(Height, D, Finalization)], + canonical: &[CanonicalEntry], ) { for idx in active_validator_indices(validators) { let validator = validators[idx] @@ -486,17 +499,11 @@ async fn assert_active_validators_match_canonical( async fn advance_hailstorm_to( target: u64, context: &mut deterministic::Context, - validators: &mut [Option>], - canonical: &mut Vec<(Height, D, Finalization)>, - parent: &mut D, - parent_commitment: 
&mut H::Commitment, - participants: &[K], - schemes: &[S], - propagation_delay: Duration, + state: &mut HailstormState<'_, H>, ) { - for height_value in (canonical.len() as u64 + 1)..=target { + for height_value in (state.canonical.len() as u64 + 1)..=target { let height = Height::new(height_value); - let active = active_validator_indices(validators); + let active = active_validator_indices(state.validators); let proposer_idx = active[context.gen_range(0..active.len())]; let verifier_count = usize::min(QUORUM as usize, active.len()); let verifier_indices = active @@ -505,11 +512,11 @@ async fn advance_hailstorm_to( .filter(|idx| *idx != proposer_idx) .choose_multiple(context, verifier_count.saturating_sub(1)); let block = H::make_test_block( - *parent, - parent_commitment.clone(), + *state.parent, + *state.parent_commitment, height, height_value, - participants.len() as u16, + state.participants.len() as u16, ); let round = Round::new(Epoch::zero(), View::new(height_value)); let proposal = Proposal { @@ -521,45 +528,46 @@ async fn advance_hailstorm_to( payload: H::commitment(&block), }; let expected_digest = H::digest(&block); - let finalization = H::make_finalization(proposal.clone(), schemes, QUORUM); + let finalization = H::make_finalization(proposal.clone(), state.schemes, QUORUM); { - let proposer = validators[proposer_idx] + let proposer = state.validators[proposer_idx] .as_mut() .expect("proposer should be active"); H::propose(&mut proposer.handle, round, &block).await; H::report_notarization( &mut proposer.handle.mailbox, - H::make_notarization(proposal, schemes, QUORUM), + H::make_notarization(proposal, state.schemes, QUORUM), ) .await; } for verifier_idx in verifier_indices.iter().copied() { - let verifier = validators[verifier_idx] + let verifier = state.validators[verifier_idx] .as_mut() .expect("verifier should be active"); H::verify(&mut verifier.handle, round, &block, &mut []).await; } - context.sleep(propagation_delay).await; + 
context.sleep(state.propagation_delay).await; - for idx in active_validator_indices(validators) { - let validator = validators[idx] + for idx in active_validator_indices(state.validators) { + let validator = state.validators[idx] .as_mut() .expect("validator should remain active"); H::report_finalization(&mut validator.handle.mailbox, finalization.clone()).await; } - canonical.push((height, expected_digest, finalization)); - *parent = expected_digest; - *parent_commitment = H::commitment(&block); + state.canonical.push((height, expected_digest, finalization)); + *state.parent = expected_digest; + *state.parent_commitment = H::commitment(&block); - let (_, _, expected_finalization) = canonical + let (_, _, expected_finalization) = state + .canonical .last() .expect("canonical chain should contain the new height"); - for idx in active_validator_indices(validators) { - let validator = validators[idx] + for idx in active_validator_indices(state.validators) { + let validator = state.validators[idx] .as_ref() .expect("validator should be active"); wait_for_validator_height( @@ -574,7 +582,7 @@ async fn advance_hailstorm_to( } } - assert_active_validators_match_canonical(validators, canonical).await; + assert_active_validators_match_canonical(state.validators, state.canonical).await; } /// Stress marshal with repeated validator crashes and recoveries while a @@ -622,7 +630,7 @@ pub fn hailstorm( })); } - let mut canonical = Vec::<(Height, D, Finalization)>::new(); + let mut canonical = CanonicalChain::::new(); let mut parent = Sha256::hash(b""); let mut parent_commitment = H::genesis_parent_commitment(participants.len() as u16); let mut target_height = 0u64; @@ -632,18 +640,16 @@ pub fn hailstorm( for shutdown_idx in 0..shutdowns { let leadup = context.gen_range(1..=max_interval); target_height += leadup; - advance_hailstorm_to( - target_height, - &mut context, - &mut validators, - &mut canonical, - &mut parent, - &mut parent_commitment, - &participants, - &schemes, + let 
mut state = HailstormState { + validators: &mut validators, + canonical: &mut canonical, + parent: &mut parent, + parent_commitment: &mut parent_commitment, + participants: &participants, + schemes: &schemes, propagation_delay, - ) - .await; + }; + advance_hailstorm_to(target_height, &mut context, &mut state).await; let active = active_validator_indices(&validators); let down_limit = usize::min(max_down, active.len().saturating_sub(1)); @@ -674,18 +680,16 @@ pub fn hailstorm( let downtime = context.gen_range(1..=max_interval); target_height += downtime; - advance_hailstorm_to( - target_height, - &mut context, - &mut validators, - &mut canonical, - &mut parent, - &mut parent_commitment, - &participants, - &schemes, + let mut state = HailstormState { + validators: &mut validators, + canonical: &mut canonical, + parent: &mut parent, + parent_commitment: &mut parent_commitment, + participants: &participants, + schemes: &schemes, propagation_delay, - ) - .await; + }; + advance_hailstorm_to(target_height, &mut context, &mut state).await; for idx in selected.iter().copied() { let restarted = H::setup_validator( From 5fed8c49d8b417e34d4e7039e910da14ff80eb95 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:09:17 -0700 Subject: [PATCH 073/107] make fix-fmt --- consensus/src/marshal/coding/marshaled.rs | 6 ++--- consensus/src/marshal/core/actor.rs | 18 ++++++++++++++ consensus/src/marshal/core/mailbox.rs | 29 ++++++++++++++++++++++ consensus/src/marshal/mocks/harness.rs | 4 ++- consensus/src/marshal/standard/deferred.rs | 6 ++--- consensus/src/marshal/standard/inline.rs | 2 +- 6 files changed, 57 insertions(+), 8 deletions(-) diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index 1cdf3749ea5..f178bd50577 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -902,12 +902,12 @@ where round, ); if is_reproposal { - // During crash recovery we may call 
`marshal.verified` twice for + // During crash recovery we may call `marshal.persist` twice for // the same block; the call is idempotent. - if !marshaled.marshal.verified(round, block).await { + if !marshaled.marshal.persist(round, block).await { debug!( ?round, - "marshal unavailable during certify re-proposal verified ack; \ + "marshal unavailable during certify re-proposal persist ack; \ skipping certify resolution" ); return; diff --git a/consensus/src/marshal/core/actor.rs b/consensus/src/marshal/core/actor.rs index 757cc9556ba..f8c3609ee83 100644 --- a/consensus/src/marshal/core/actor.rs +++ b/consensus/src/marshal/core/actor.rs @@ -543,6 +543,10 @@ where self.cache_verified(round, block.digest(), block).await; ack.send_lossy(()); } + Message::Persist { round, block, ack } => { + self.persist_block(round, block.digest(), block).await; + ack.send_lossy(()); + } Message::Notarization { notarization } => { let round = notarization.round(); let commitment = notarization.proposal.payload; @@ -1347,6 +1351,20 @@ where self.cache.put_block(round, digest, block.into()).await; } + /// Ensure a block is durably present in local storage without asserting it + /// belongs in the verified cache. + async fn persist_block( + &mut self, + round: Round, + digest: ::Digest, + block: V::Block, + ) { + if self.find_block_in_storage(digest).await.is_some() { + return; + } + self.cache_block(round, digest, block).await; + } + /// Sync both finalization archives to durable storage. /// /// Must be called within the same `select_loop!` arm as any preceding diff --git a/consensus/src/marshal/core/mailbox.rs b/consensus/src/marshal/core/mailbox.rs index 6df3b08314a..d5fc5090bef 100644 --- a/consensus/src/marshal/core/mailbox.rs +++ b/consensus/src/marshal/core/mailbox.rs @@ -120,6 +120,20 @@ pub(crate) enum Message { /// A channel signaled once the block is durably stored. 
ack: oneshot::Sender<()>, }, + /// A request to make a block durable locally without asserting it was + /// locally application-verified. + /// + /// The `ack` is signaled after the block is durably present in marshal's + /// local storage, either because it was already present or because marshal + /// persisted it during this request. + Persist { + /// The round in which the block was notarized. + round: Round, + /// The block to durably retain. + block: V::Block, + /// A channel signaled once the block is durably stored. + ack: oneshot::Sender<()>, + }, /// Sets the sync starting point (advances if higher than current). /// /// Marshal will sync and deliver blocks starting at `floor + 1`. Data below @@ -325,6 +339,21 @@ impl Mailbox { .is_some() } + /// Requests that marshal durably retain a block locally, without claiming + /// that the caller just performed application verification. + /// + /// Returns `true` once the actor has confirmed the block is durably present + /// in local storage. Returns `false` if the marshal actor shut down before + /// acknowledging. Callers MUST NOT treat `false` as sufficient to proceed + /// with any action that requires local crash recovery of the block. + #[must_use = "callers must not proceed on an unpersisted block"] + pub async fn persist(&self, round: Round, block: V::Block) -> bool { + self.sender + .request(|ack| Message::Persist { round, block, ack }) + .await + .is_some() + } + /// Sets the sync starting point (advances if higher than current). /// /// Marshal will sync and deliver blocks starting at `floor + 1`. 
Data below diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index deb8473b5cf..a550742e853 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -558,7 +558,9 @@ async fn advance_hailstorm_to( H::report_finalization(&mut validator.handle.mailbox, finalization.clone()).await; } - state.canonical.push((height, expected_digest, finalization)); + state + .canonical + .push((height, expected_digest, finalization)); *state.parent = expected_digest; *state.parent_commitment = H::commitment(&block); diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index 2a38f2f4871..f80f16184d7 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -582,14 +582,14 @@ where round, ); if is_reproposal { - // NOTE: It is possible that, during crash recovery, we call `marshal.verified` + // NOTE: It is possible that, during crash recovery, we call `marshal.persist` // twice for the same block. That function is idempotent, so this is safe. // If marshal is gone, do not signal certify-true: the block was not durably // stored. - if !marshaled.marshal.verified(round, block).await { + if !marshaled.marshal.persist(round, block).await { debug!( ?round, - "marshal unavailable during certify re-proposal verified ack; \ + "marshal unavailable during certify re-proposal persist ack; \ skipping certify resolution" ); return; diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 9f6d9a10034..fe6865f663b 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -465,7 +465,7 @@ where // `certify` resolving true drives the finalize vote, so mere // buffered availability is not sufficient here. Persist the // block through marshal before signaling success. 
- if marshal.verified(round, block).await { + if marshal.persist(round, block).await { tx.send_lossy(true); } }); From 30d7952aa6c879600d9473b05d0adf44b727f6c1 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:11:42 -0700 Subject: [PATCH 074/107] put order back --- consensus/src/marshal/coding/marshaled.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index f178bd50577..61fd60fe561 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -266,9 +266,9 @@ where scheme_provider, strategy, epocher, + last_built: Arc::new(Mutex::new(None)), verification_tasks: VerificationTasks::new(), cached_genesis: Arc::new(OnceLock::new()), - last_built: Arc::new(Mutex::new(None)), build_duration, verify_duration, @@ -503,8 +503,8 @@ where let mut application = self.application.clone(); let epocher = self.epocher.clone(); let strategy = self.strategy.clone(); - let cached_genesis = self.cached_genesis.clone(); let last_built = self.last_built.clone(); + let cached_genesis = self.cached_genesis.clone(); // If there's no scheme for the current epoch, we cannot verify the proposal. // Send back a receiver with a dropped sender. 
From 880811c60bb6ee2c8df5f4d144f26a4855c97c53 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:12:59 -0700 Subject: [PATCH 075/107] move back --- consensus/src/marshal/coding/marshaled.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index 61fd60fe561..9015bc12111 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -501,9 +501,9 @@ where ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); + let last_built = self.last_built.clone(); let epocher = self.epocher.clone(); let strategy = self.strategy.clone(); - let last_built = self.last_built.clone(); let cached_genesis = self.cached_genesis.clone(); // If there's no scheme for the current epoch, we cannot verify the proposal. From f38383dad9b2616ca97e3f9ce874081d65b2844b Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:15:08 -0700 Subject: [PATCH 076/107] keep height --- consensus/src/marshal/coding/marshaled.rs | 4 +++- consensus/src/marshal/standard/deferred.rs | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index 9015bc12111..7ac5c859345 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -971,15 +971,17 @@ where ); return false; }; + let height = block.height(); if !self.marshal.proposed(round, block).await { warn!( ?round, ?commitment, + %height, "marshal unavailable during proposed broadcast; block not persisted" ); return false; } - debug!(?round, ?commitment, "requested broadcast of built block"); + debug!(?round, ?commitment, %height, "requested broadcast of built block"); true } Plan::Forward { .. 
} => { diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index f80f16184d7..b2d0eebef60 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -635,15 +635,17 @@ where ); return false; }; + let height = block.height(); if !self.marshal.proposed(round, block).await { warn!( ?round, ?digest, + %height, "marshal unavailable during proposed broadcast; block not persisted" ); return false; } - debug!(?round, ?digest, "requested broadcast of built block"); + debug!(?round, ?digest, %height, "requested broadcast of built block"); true } Plan::Forward { round, peers } => { From 389d7dd79698f5469dcc9a9cd95c5e759375372e Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:22:57 -0700 Subject: [PATCH 077/107] revert special treatment --- consensus/src/marshal/coding/marshaled.rs | 6 ++--- consensus/src/marshal/core/actor.rs | 18 -------------- consensus/src/marshal/core/mailbox.rs | 29 ---------------------- consensus/src/marshal/standard/deferred.rs | 6 ++--- consensus/src/marshal/standard/inline.rs | 2 +- 5 files changed, 7 insertions(+), 54 deletions(-) diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index 7ac5c859345..8044985053f 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -902,12 +902,12 @@ where round, ); if is_reproposal { - // During crash recovery we may call `marshal.persist` twice for + // During crash recovery we may call `marshal.verified` twice for // the same block; the call is idempotent. 
- if !marshaled.marshal.persist(round, block).await { + if !marshaled.marshal.verified(round, block).await { debug!( ?round, - "marshal unavailable during certify re-proposal persist ack; \ + "marshal unavailable during certify re-proposal verified ack; \ skipping certify resolution" ); return; diff --git a/consensus/src/marshal/core/actor.rs b/consensus/src/marshal/core/actor.rs index f8c3609ee83..757cc9556ba 100644 --- a/consensus/src/marshal/core/actor.rs +++ b/consensus/src/marshal/core/actor.rs @@ -543,10 +543,6 @@ where self.cache_verified(round, block.digest(), block).await; ack.send_lossy(()); } - Message::Persist { round, block, ack } => { - self.persist_block(round, block.digest(), block).await; - ack.send_lossy(()); - } Message::Notarization { notarization } => { let round = notarization.round(); let commitment = notarization.proposal.payload; @@ -1351,20 +1347,6 @@ where self.cache.put_block(round, digest, block.into()).await; } - /// Ensure a block is durably present in local storage without asserting it - /// belongs in the verified cache. - async fn persist_block( - &mut self, - round: Round, - digest: ::Digest, - block: V::Block, - ) { - if self.find_block_in_storage(digest).await.is_some() { - return; - } - self.cache_block(round, digest, block).await; - } - /// Sync both finalization archives to durable storage. /// /// Must be called within the same `select_loop!` arm as any preceding diff --git a/consensus/src/marshal/core/mailbox.rs b/consensus/src/marshal/core/mailbox.rs index d5fc5090bef..6df3b08314a 100644 --- a/consensus/src/marshal/core/mailbox.rs +++ b/consensus/src/marshal/core/mailbox.rs @@ -120,20 +120,6 @@ pub(crate) enum Message { /// A channel signaled once the block is durably stored. ack: oneshot::Sender<()>, }, - /// A request to make a block durable locally without asserting it was - /// locally application-verified. 
- /// - /// The `ack` is signaled after the block is durably present in marshal's - /// local storage, either because it was already present or because marshal - /// persisted it during this request. - Persist { - /// The round in which the block was notarized. - round: Round, - /// The block to durably retain. - block: V::Block, - /// A channel signaled once the block is durably stored. - ack: oneshot::Sender<()>, - }, /// Sets the sync starting point (advances if higher than current). /// /// Marshal will sync and deliver blocks starting at `floor + 1`. Data below @@ -339,21 +325,6 @@ impl Mailbox { .is_some() } - /// Requests that marshal durably retain a block locally, without claiming - /// that the caller just performed application verification. - /// - /// Returns `true` once the actor has confirmed the block is durably present - /// in local storage. Returns `false` if the marshal actor shut down before - /// acknowledging. Callers MUST NOT treat `false` as sufficient to proceed - /// with any action that requires local crash recovery of the block. - #[must_use = "callers must not proceed on an unpersisted block"] - pub async fn persist(&self, round: Round, block: V::Block) -> bool { - self.sender - .request(|ack| Message::Persist { round, block, ack }) - .await - .is_some() - } - /// Sets the sync starting point (advances if higher than current). /// /// Marshal will sync and deliver blocks starting at `floor + 1`. Data below diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index b2d0eebef60..d44e0c37951 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -582,14 +582,14 @@ where round, ); if is_reproposal { - // NOTE: It is possible that, during crash recovery, we call `marshal.persist` + // NOTE: It is possible that, during crash recovery, we call `marshal.verified` // twice for the same block. That function is idempotent, so this is safe. 
// If marshal is gone, do not signal certify-true: the block was not durably // stored. - if !marshaled.marshal.persist(round, block).await { + if !marshaled.marshal.verified(round, block).await { debug!( ?round, - "marshal unavailable during certify re-proposal persist ack; \ + "marshal unavailable during certify re-proposal verified ack; \ skipping certify resolution" ); return; diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index fe6865f663b..9f6d9a10034 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -465,7 +465,7 @@ where // `certify` resolving true drives the finalize vote, so mere // buffered availability is not sufficient here. Persist the // block through marshal before signaling success. - if marshal.persist(round, block).await { + if marshal.verified(round, block).await { tx.send_lossy(true); } }); From 6e3d7835496a8f518bd5c718a8443ef59f0f8257 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:26:52 -0700 Subject: [PATCH 078/107] shorten --- consensus/src/marshal/coding/marshaled.rs | 24 ++++---------------- consensus/src/marshal/standard/deferred.rs | 13 ++--------- consensus/src/marshal/standard/inline.rs | 6 +---- consensus/src/marshal/standard/validation.rs | 5 +--- 4 files changed, 8 insertions(+), 40 deletions(-) diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index 8044985053f..c993e9b5d9e 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -425,10 +425,7 @@ where }; timer.observe(); if application_valid && !marshal.verified(round, block).await { - debug!( - ?round, - "marshal unavailable during verified ack; skipping certify resolution" - ); + debug!(?round, "marshal unable to accept block"); return; } tx.send_lossy(application_valid); @@ -761,11 +758,7 @@ where // Valid re-proposal: notify the marshal and complete 
the // verification task for `certify`. if !marshal.verified(round, block).await { - debug!( - ?round, - "marshal unavailable during re-proposal verified ack; \ - skipping certify resolution" - ); + debug!(?round, "marshal unable to accept block"); return; } task_tx.send_lossy(true); @@ -905,11 +898,7 @@ where // During crash recovery we may call `marshal.verified` twice for // the same block; the call is idempotent. if !marshaled.marshal.verified(round, block).await { - debug!( - ?round, - "marshal unavailable during certify re-proposal verified ack; \ - skipping certify resolution" - ); + debug!(?round, "marshal unable to accept block"); return; } tx.send_lossy(true); @@ -973,12 +962,7 @@ where }; let height = block.height(); if !self.marshal.proposed(round, block).await { - warn!( - ?round, - ?commitment, - %height, - "marshal unavailable during proposed broadcast; block not persisted" - ); + warn!(?round, ?commitment, %height, "marshal unable to accept block"); return false; } debug!(?round, ?commitment, %height, "requested broadcast of built block"); diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index d44e0c37951..35c6eb44503 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -587,11 +587,7 @@ where // If marshal is gone, do not signal certify-true: the block was not durably // stored. 
if !marshaled.marshal.verified(round, block).await { - debug!( - ?round, - "marshal unavailable during certify re-proposal verified ack; \ - skipping certify resolution" - ); + debug!(?round, "marshal unable to accept block"); return; } tx.send_lossy(true); @@ -637,12 +633,7 @@ where }; let height = block.height(); if !self.marshal.proposed(round, block).await { - warn!( - ?round, - ?digest, - %height, - "marshal unavailable during proposed broadcast; block not persisted" - ); + warn!(?round, ?digest, %height, "marshal unable to accept block"); return false; } debug!(?round, ?digest, %height, "requested broadcast of built block"); diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 9f6d9a10034..29516e73ccb 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -506,11 +506,7 @@ where return false; }; if !self.marshal.proposed(round, block).await { - warn!( - ?round, - ?digest, - "marshal unavailable during proposed broadcast; block not persisted" - ); + warn!(?round, ?digest, "marshal unable to accept block"); return false; } true diff --git a/consensus/src/marshal/standard/validation.rs b/consensus/src/marshal/standard/validation.rs index a7d9023aac7..ede79bde511 100644 --- a/consensus/src/marshal/standard/validation.rs +++ b/consensus/src/marshal/standard/validation.rs @@ -206,10 +206,7 @@ where }; if application_valid && !marshal.verified(context.round, block).await { - debug!( - round = ?context.round, - "marshal unavailable during verified ack; aborting verify" - ); + debug!(round = ?context.round, "marshal unable to accept block"); // Persistence not confirmed: caller MUST NOT signal a positive // verdict to consensus. Returning `None` causes verify to exit // silently without firing tx. 
From bb17f6ce0afe3511e327f7b1d521146770b18a5e Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:28:42 -0700 Subject: [PATCH 079/107] nits --- consensus/src/marshal/coding/marshaled.rs | 2 +- consensus/src/marshal/standard/deferred.rs | 2 +- consensus/src/marshal/standard/inline.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index c993e9b5d9e..cc3d1def0ff 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -948,7 +948,7 @@ where match plan { Plan::Propose => { let Some((round, block)) = self.last_built.lock().take() else { - warn!("missing block to broadcast"); + warn!(?commitment, "missing block to broadcast"); return false; }; if block.commitment() != commitment { diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index 35c6eb44503..e5cdbe747ec 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -619,7 +619,7 @@ where match plan { Plan::Propose => { let Some((round, block)) = self.last_built.lock().take() else { - warn!("missing block to broadcast"); + warn!(?digest, "missing block to broadcast"); return false; }; if block.digest() != digest { diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 29516e73ccb..5df9c189baf 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -493,7 +493,7 @@ where match plan { Plan::Propose => { let Some((round, block)) = self.last_built.lock().take() else { - warn!("missing block to broadcast"); + warn!(?digest, "missing block to broadcast"); return false; }; if block.digest() != digest { From 363cba813f7b94e795b279854759a54c6ec30eae Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:31:49 -0700 Subject: [PATCH 080/107] 
minimize --- consensus/src/marshal/core/mailbox.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/consensus/src/marshal/core/mailbox.rs b/consensus/src/marshal/core/mailbox.rs index 6df3b08314a..f168af91536 100644 --- a/consensus/src/marshal/core/mailbox.rs +++ b/consensus/src/marshal/core/mailbox.rs @@ -86,10 +86,6 @@ pub(crate) enum Message { response: oneshot::Sender, }, /// A request to broadcast a proposed block to peers. - /// - /// The `ack` is signaled after the block has been durably persisted - /// (`cache.put_verified` returns) so callers can establish - /// "voted ⟹ persisted" before broadcasting their own notarize vote. Proposed { /// The round in which the block was proposed. round: Round, @@ -108,10 +104,6 @@ pub(crate) enum Message { peers: Vec, }, /// A notification that a block has been verified by the application. - /// - /// The `ack` is signaled after the block has been durably persisted - /// (`cache.put_verified` returns) so callers can establish - /// "voted ⟹ persisted" before resolving consensus's certify task. Verified { /// The round in which the block was verified. 
round: Round, From 3de180a4db6fe69526a0f0c563f1745cd3489e8e Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:32:25 -0700 Subject: [PATCH 081/107] nit --- consensus/src/marshal/standard/deferred.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index e5cdbe747ec..03e677e7174 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -299,8 +299,8 @@ where ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); - let epocher = self.epocher.clone(); let last_built = self.last_built.clone(); + let epocher = self.epocher.clone(); // Metrics let build_duration = self.build_duration.clone(); From b514391333d3c192d04c205a5ce0588034a01418 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:34:15 -0700 Subject: [PATCH 082/107] nit --- consensus/src/marshal/standard/deferred.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index 03e677e7174..295f95b7217 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -472,6 +472,8 @@ where task_tx.send_lossy(true); marshaled.verification_tasks.insert(round, digest, task_rx); } + // `Complete` means either immediate rejection or successful + // re-proposal handling with no further ancestry validation. tx.send_lossy(valid); return; } @@ -582,10 +584,8 @@ where round, ); if is_reproposal { - // NOTE: It is possible that, during crash recovery, we call `marshal.verified` + // It is possible that, during crash recovery, we call `marshal.verified` // twice for the same block. That function is idempotent, so this is safe. - // If marshal is gone, do not signal certify-true: the block was not durably - // stored. 
if !marshaled.marshal.verified(round, block).await { debug!(?round, "marshal unable to accept block"); return; From 6fcf396c16d7d3dcba785b602e82671e32ad832b Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:36:02 -0700 Subject: [PATCH 083/107] add must use --- consensus/src/marshal/core/mailbox.rs | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/consensus/src/marshal/core/mailbox.rs b/consensus/src/marshal/core/mailbox.rs index f168af91536..2042b623be6 100644 --- a/consensus/src/marshal/core/mailbox.rs +++ b/consensus/src/marshal/core/mailbox.rs @@ -289,12 +289,7 @@ impl Mailbox { /// Requests that a proposed block is sent to peers, awaiting the actor's /// confirmation that the block has been durably persisted before returning. - /// - /// Returns `true` once the actor has completed `put_sync`. Returns `false` - /// if the marshal actor has shut down before acknowledging (typical during - /// graceful shutdown). Callers MUST NOT proceed to vote when this returns - /// `false` -- the block is not durably stored. - #[must_use = "callers must not proceed to vote on an unpersisted block"] + #[must_use = "callers must consider block durability before proceeding"] pub async fn proposed(&self, round: Round, block: V::Block) -> bool { self.sender .request(|ack| Message::Proposed { round, block, ack }) @@ -304,12 +299,7 @@ impl Mailbox { /// Notifies the actor that a block has been verified, awaiting the actor's /// confirmation that the block has been durably persisted before returning. - /// - /// Returns `true` once the actor has completed `put_sync`. Returns `false` - /// if the marshal actor has shut down before acknowledging. Callers MUST - /// NOT proceed to resolve consensus's certify task as `true` (which would - /// drive a finalize vote) when this returns `false`. 
- #[must_use = "callers must not proceed to certify true on an unpersisted block"] + #[must_use = "callers must consider block durability before proceeding"] pub async fn verified(&self, round: Round, block: V::Block) -> bool { self.sender .request(|ack| Message::Verified { round, block, ack }) From 3c9f09b4a133d899ca6c05e879e13c4e9a364372 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:37:19 -0700 Subject: [PATCH 084/107] remove --- consensus/src/marshal/standard/validation.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/consensus/src/marshal/standard/validation.rs b/consensus/src/marshal/standard/validation.rs index ede79bde511..67d0cf1e67f 100644 --- a/consensus/src/marshal/standard/validation.rs +++ b/consensus/src/marshal/standard/validation.rs @@ -207,9 +207,6 @@ where if application_valid && !marshal.verified(context.round, block).await { debug!(round = ?context.round, "marshal unable to accept block"); - // Persistence not confirmed: caller MUST NOT signal a positive - // verdict to consensus. Returning `None` causes verify to exit - // silently without firing tx. return None; } Some(application_valid) From b846b2c96cd601e34e7600a7b57dc60ffea874bb Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:49:25 -0700 Subject: [PATCH 085/107] nits --- consensus/src/marshal/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/consensus/src/marshal/mod.rs b/consensus/src/marshal/mod.rs index 3e4c8ac025f..71546bcac82 100644 --- a/consensus/src/marshal/mod.rs +++ b/consensus/src/marshal/mod.rs @@ -133,15 +133,15 @@ pub enum Update { Tip(Round, Height, B::Digest), /// A new finalized block and an [Acknowledgement] for the application to signal once processed. /// - /// Marshal only emits this after it has durably persisted the delivered block locally. 
- /// For blocks flowing through the normal finalization path, the corresponding - /// height-indexed finalization metadata is also durably synced before delivery. - /// /// To ensure all blocks are delivered at least once, marshal waits to mark a block as delivered /// until the application explicitly acknowledges the update. If the [Acknowledgement] is dropped before /// handling, marshal will exit (assuming the application is shutting down). /// /// Because the [Acknowledgement] is clonable, the application can pass [Update] to multiple consumers /// (and marshal will only consider the block delivered once all consumers have acknowledged it). + /// + /// Marshal only emits a block after it has durably persisted that block. This ensures applications + /// that make stateful changes based on a block in other locations can access the same block on restart (often + /// some logic on startup attempts an infallible read of the last processed block). Block(B, A), } From 51af0b82c6cd412f0fbf2090bd613d334399c356 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:51:41 -0700 Subject: [PATCH 086/107] nit --- consensus/src/lib.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/consensus/src/lib.rs b/consensus/src/lib.rs index 0aeb6264592..85c4cec8d59 100644 --- a/consensus/src/lib.rs +++ b/consensus/src/lib.rs @@ -210,10 +210,6 @@ stability_scope!(BETA, cfg(not(target_arch = "wasm32")) { /// Returns `true` when the relay accepted the payload for the requested /// broadcast plan. Returns `false` when the relay could not complete the /// handoff. - /// - /// For proposal broadcasts, returning `false` is fatal for the current - /// consensus attempt: callers must not proceed as though the payload were - /// available to the network.
fn broadcast( &mut self, payload: Self::Digest, From 8d5d8cd4e40c915a9b30992d913077a0e4ce65fa Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:53:38 -0700 Subject: [PATCH 087/107] nit --- consensus/src/lib.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/consensus/src/lib.rs b/consensus/src/lib.rs index 85c4cec8d59..06ac1d68e28 100644 --- a/consensus/src/lib.rs +++ b/consensus/src/lib.rs @@ -152,12 +152,9 @@ stability_scope!(BETA, cfg(not(target_arch = "wasm32")) { /// This is particularly useful for applications that employ erasure coding, which /// can override this method to delay or prevent finalization until they have /// reconstructed and validated the full block (e.g., after receiving enough shards). - /// Payloads produced locally by [`Automaton::propose`] are the exception: - /// the proposer must treat them as certifiable-by-construction for that - /// same round, allowing consensus to skip `certify` once it has durable - /// local evidence that the proposal originated here. /// - /// Like [`Automaton::verify`], certification is single-shot for the given + /// Like [`Automaton::verify`], payloads produced by [`Automaton::propose`] are certifiable-by-construction. + /// Also like [`Automaton::verify`], certification is single-shot for the given /// `(round, payload)`. Once the returned channel resolves or closes, consensus treats /// certification as concluded and will not retry the same request. /// From da822e05769ca80d1fe2366336cd3f1e8ed24a3e Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 14:55:56 -0700 Subject: [PATCH 088/107] nits --- consensus/src/lib.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/consensus/src/lib.rs b/consensus/src/lib.rs index 06ac1d68e28..0926c0cfd72 100644 --- a/consensus/src/lib.rs +++ b/consensus/src/lib.rs @@ -100,13 +100,12 @@ stability_scope!(BETA, cfg(not(target_arch = "wasm32")) { /// channel. 
If it is not possible to generate a payload, the channel can be dropped. If construction /// takes too long, the consensus engine may drop the provided proposal. /// + /// Returning a payload from `propose` commits the local proposer to verifying + /// the same `(context, payload)`. + /// /// For [`CertifiableAutomaton`] implementations, returning a payload from /// `propose` also commits the local proposer to certifying that same - /// `(round, payload)` if it later becomes notarized. Consensus engines - /// may therefore treat durable local evidence of proposal construction - /// (for example replay of a local vote on a leader-owned round) as - /// sufficient to bypass a later `certify` callback for that exact - /// proposal. + /// `(round, payload)` if it later becomes notarized. fn propose( &mut self, context: Self::Context, From cc6840d463a792bdc9704a42f21fe4f13a6ac0c7 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 15:06:11 -0700 Subject: [PATCH 089/107] fix --- consensus/src/marshal/coding/mod.rs | 6 +++--- consensus/src/marshal/mocks/harness.rs | 18 ++++++++++++------ consensus/src/marshal/standard/mod.rs | 12 ++++++------ consensus/src/marshal/standard/validation.rs | 8 ++------ 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index f0b1a028743..4b83aa9765a 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -171,17 +171,17 @@ mod tests { #[test_traced("WARN")] fn test_coding_proposed_success_implies_recoverable_after_restart() { - harness::proposed_success_implies_recoverable_after_restart::(); + harness::proposed_success_implies_recoverable_after_restart::(0..16); } #[test_traced("WARN")] fn test_coding_verified_success_implies_recoverable_after_restart() { - harness::verified_success_implies_recoverable_after_restart::(); + harness::verified_success_implies_recoverable_after_restart::(0..16); } 
#[test_traced("WARN")] fn test_coding_delivery_visibility_implies_recoverable_after_restart() { - harness::delivery_visibility_implies_recoverable_after_restart::(); + harness::delivery_visibility_implies_recoverable_after_restart::(0..16); } #[test_traced("WARN")] diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index a550742e853..9e0a52b3123 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -755,8 +755,10 @@ pub fn hailstorm( /// Contract: `marshal.proposed(...)=true` means the block survives an /// immediate crash and repeated recoveries. -pub fn proposed_success_implies_recoverable_after_restart() { - for seed in 0u64..16 { +pub fn proposed_success_implies_recoverable_after_restart( + seeds: impl IntoIterator, +) { + for seed in seeds { let Fixture { participants, schemes, @@ -841,8 +843,10 @@ pub fn proposed_success_implies_recoverable_after_restart() { /// Contract: `marshal.verified(...)=true` means the block survives an /// immediate crash and repeated recoveries. -pub fn verified_success_implies_recoverable_after_restart() { - for seed in 0u64..16 { +pub fn verified_success_implies_recoverable_after_restart( + seeds: impl IntoIterator, +) { + for seed in seeds { let Fixture { participants, schemes, @@ -928,8 +932,10 @@ pub fn verified_success_implies_recoverable_after_restart() { /// Contract: once marshal has delivered a finalized block to the application, /// that finalized block and its certificate must already be durable. 
-pub fn delivery_visibility_implies_recoverable_after_restart() { - for seed in 0u64..16 { +pub fn delivery_visibility_implies_recoverable_after_restart( + seeds: impl IntoIterator, +) { + for seed in seeds { let Fixture { participants, schemes, diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index 512921602c0..11bb51ea0c0 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -191,20 +191,20 @@ mod tests { #[test_traced("WARN")] fn test_standard_proposed_success_implies_recoverable_after_restart() { - harness::proposed_success_implies_recoverable_after_restart::(); - harness::proposed_success_implies_recoverable_after_restart::(); + harness::proposed_success_implies_recoverable_after_restart::(0..16); + harness::proposed_success_implies_recoverable_after_restart::(0..16); } #[test_traced("WARN")] fn test_standard_verified_success_implies_recoverable_after_restart() { - harness::verified_success_implies_recoverable_after_restart::(); - harness::verified_success_implies_recoverable_after_restart::(); + harness::verified_success_implies_recoverable_after_restart::(0..16); + harness::verified_success_implies_recoverable_after_restart::(0..16); } #[test_traced("WARN")] fn test_standard_delivery_visibility_implies_recoverable_after_restart() { - harness::delivery_visibility_implies_recoverable_after_restart::(); - harness::delivery_visibility_implies_recoverable_after_restart::(); + harness::delivery_visibility_implies_recoverable_after_restart::(0..16); + harness::delivery_visibility_implies_recoverable_after_restart::(0..16); } #[test_traced("WARN")] diff --git a/consensus/src/marshal/standard/validation.rs b/consensus/src/marshal/standard/validation.rs index 67d0cf1e67f..31d96901ef8 100644 --- a/consensus/src/marshal/standard/validation.rs +++ b/consensus/src/marshal/standard/validation.rs @@ -51,12 +51,8 @@ where /// Result of the shared epoch / re-proposal pre-check step. 
/// -/// - `Complete(valid)`: verification can terminate immediately with `valid`. -/// - `Continue(block)`: full parent + application verification should continue. -/// -/// The function returns `Option>`: `None` means the marshal actor -/// shut down during persistence and the caller must exit silently (consistent -/// with the `Option` convention used by [`verify_with_parent`]). +/// `Complete(valid)` indicates verification can terminate immediately with `valid`. +/// `Continue(block)` indicates full parent + application verification should continue. pub(super) enum Decision { Complete(bool), Continue(B), From ea243743dd2a5eb2fe6ba0be2aea845fbf690876 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 15:07:31 -0700 Subject: [PATCH 090/107] nit --- consensus/src/simplex/actors/voter/slot.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/consensus/src/simplex/actors/voter/slot.rs b/consensus/src/simplex/actors/voter/slot.rs index d7522eef2e9..b9fbe6d3d2c 100644 --- a/consensus/src/simplex/actors/voter/slot.rs +++ b/consensus/src/simplex/actors/voter/slot.rs @@ -1,6 +1,6 @@ use crate::simplex::types::Proposal; use commonware_cryptography::Digest; -use tracing::debug; +use tracing::warn; /// Proposal verification status within a round. #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] @@ -86,9 +86,10 @@ where if let Some(existing) = &self.proposal { // This can happen if we receive a certificate for a conflicting proposal. Normally, // we would ignore this case but it is required to support [Twins](https://arxiv.org/abs/2004.10617) testing. 
- debug!( + warn!( ?existing, ?proposal, + ?local, "ignoring verified proposal because slot already populated" ); return; From f5a288875ab4a61a00a43ebd52d97ac82fe9eb74 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 15:08:29 -0700 Subject: [PATCH 091/107] local --- consensus/src/simplex/actors/voter/slot.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/consensus/src/simplex/actors/voter/slot.rs b/consensus/src/simplex/actors/voter/slot.rs index b9fbe6d3d2c..99b5574e4a0 100644 --- a/consensus/src/simplex/actors/voter/slot.rs +++ b/consensus/src/simplex/actors/voter/slot.rs @@ -65,7 +65,7 @@ where self.proposal.is_some() && self.status != Status::Equivocated } - /// Returns whether the current proposal was built locally and remains usable. + /// Returns whether the current proposal was built locally. pub const fn is_local(&self) -> bool { matches!(self.status, Status::Verified(true)) } From 7e5754b836bdb9cf176cea1d748bde8fffb6f302 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 15:09:54 -0700 Subject: [PATCH 092/107] nit --- consensus/src/simplex/mocks/application.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/consensus/src/simplex/mocks/application.rs b/consensus/src/simplex/mocks/application.rs index 65c2a872726..8602933211e 100644 --- a/consensus/src/simplex/mocks/application.rs +++ b/consensus/src/simplex/mocks/application.rs @@ -166,6 +166,7 @@ pub struct Config { pub verify_latency: Latency, pub certify_latency: Latency, + /// Predicate to determine whether a payload should be certified. 
pub should_certify: Certifier, } From 458ba150ec4f73b8f91da295e125381207e6e573 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 15:16:55 -0700 Subject: [PATCH 093/107] cleanup test --- consensus/src/simplex/actors/voter/state.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/consensus/src/simplex/actors/voter/state.rs b/consensus/src/simplex/actors/voter/state.rs index 8818aa276c2..2bb2b9dd201 100644 --- a/consensus/src/simplex/actors/voter/state.rs +++ b/consensus/src/simplex/actors/voter/state.rs @@ -589,8 +589,9 @@ impl, L: ElectorConfig, D: D /// /// Certification may be inferred only when we have explicit evidence that we /// proposed this exact payload for the round, either in the current process - /// or via replay of our durable local vote. Leader identity alone is not - /// sufficient during catch-up and recovery. + /// or via replay of our durable local vote. In certain cases for Byzantine nodes, + /// it is possible that a certificate is received for a proposal that we did not propose (although + /// we are the leader). 
pub fn certify_candidates(&mut self) -> Vec<(Proposal, bool)> { let candidates = take(&mut self.certification_candidates); candidates @@ -599,8 +600,7 @@ impl, L: ElectorConfig, D: D if view <= self.last_finalized { return None; } - let round = self.views.get_mut(&view)?; - let candidate = round.try_certify()?; + let candidate = self.views.get_mut(&view)?.try_certify()?; Some(candidate) }) .collect() @@ -1839,7 +1839,7 @@ mod tests { } #[test] - fn certify_candidates_do_not_short_circuit_leader_owned_recovered_proposals() { + fn certify_external_candidates_for_leader_controlled_views() { let runtime = deterministic::Runner::default(); runtime.start(|mut context| async move { let namespace = b"ns".to_vec(); From 81a2930c2cb26a11c8ecee577eef28a3f0d3f2b5 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 15:22:29 -0700 Subject: [PATCH 094/107] nit --- consensus/src/simplex/actors/voter/actor.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/consensus/src/simplex/actors/voter/actor.rs b/consensus/src/simplex/actors/voter/actor.rs index 47baf9a6293..55e66fff273 100644 --- a/consensus/src/simplex/actors/voter/actor.rs +++ b/consensus/src/simplex/actors/voter/actor.rs @@ -1097,6 +1097,8 @@ impl< } }, } + + // Sync and drop the journal self.journal .take() .expect("journal missing on voter exit") From 1d8c1d38b9eddeafc49b47ac5159a1b2cca423d8 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 15:30:14 -0700 Subject: [PATCH 095/107] nits --- consensus/src/marshal/standard/inline.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 5df9c189baf..fe3c695b136 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -162,7 +162,8 @@ where { /// Creates a new inline-verification wrapper. /// - /// Registers a `build_duration` histogram for proposal latency. 
+ /// Registers a `build_duration` histogram for proposal latency and initializes + /// the shared "last built block" cache used by [`Relay::broadcast`]. pub fn new(context: E, application: A, marshal: Mailbox>, epocher: ES) -> Self { let build_histogram = Histogram::new(Buckets::LOCAL); context.register( @@ -231,8 +232,8 @@ where ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); - let epocher = self.epocher.clone(); let last_built = self.last_built.clone(); + let epocher = self.epocher.clone(); let build_duration = self.build_duration.clone(); let (mut tx, rx) = oneshot::channel(); @@ -321,12 +322,10 @@ where build_timer.observe(); let digest = built_block.digest(); - { let mut lock = last_built.lock(); *lock = Some((consensus_context.round, built_block)); } - let success = tx.send_lossy(digest); debug!( round = ?consensus_context.round, @@ -447,8 +446,6 @@ where } // Otherwise, subscribe to marshal for block availability. - // - // TODO(#3393): Avoid fetching the block just to check if it's available. let block_rx = self.marshal.subscribe_by_digest(Some(round), digest).await; let marshal = self.marshal.clone(); let (mut tx, rx) = oneshot::channel(); From c5b4b56a564d7d7c0e7d5358ff43b968d458de98 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 15:31:06 -0700 Subject: [PATCH 096/107] nits --- consensus/src/marshal/mocks/harness.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index 9e0a52b3123..18c9e0d06e4 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -165,9 +165,6 @@ pub struct ValidatorSetup { pub mailbox: Mailbox, pub extra: H::ValidatorExtra, pub height: Height, - /// Handle to the marshal actor task. 
Tests can call `.abort()` to simulate - /// an actor crash and verify that prior `marshal.verified()`/`proposed()` - /// calls have already been durably persisted. pub actor_handle: commonware_runtime::Handle<()>, } @@ -1069,9 +1066,6 @@ impl TestHarness for StandardHarness { type ApplicationBlock = B; type Variant = Standard; type TestBlock = B; - /// Exposes the buffered broadcast mailbox so tests can seed in-memory - /// blocks directly (mirroring how a peer's broadcast would land in the - /// local buffer). type ValidatorExtra = buffered::Mailbox; type Commitment = D; From 94542dd01bf1fbf01f4df9984833a1d8855f4c48 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Fri, 17 Apr 2026 16:49:05 -0700 Subject: [PATCH 097/107] make consistent --- consensus/src/marshal/core/actor.rs | 2 +- consensus/src/marshal/standard/mod.rs | 150 +++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 3 deletions(-) diff --git a/consensus/src/marshal/core/actor.rs b/consensus/src/marshal/core/actor.rs index 757cc9556ba..1581358b7e0 100644 --- a/consensus/src/marshal/core/actor.rs +++ b/consensus/src/marshal/core/actor.rs @@ -521,8 +521,8 @@ where Message::Proposed { round, block, ack } => { self.cache_verified(round, block.digest(), block.clone()) .await; - ack.send_lossy(()); buffer.send(round, block, Recipients::All).await; + ack.send_lossy(()); } Message::Forward { round, diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index 11bb51ea0c0..bcd882d0ee2 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -82,7 +82,7 @@ mod tests { use commonware_parallel::Sequential; use commonware_resolver::Resolver; use commonware_runtime::{ - buffer::paged::CacheRef, deterministic, Clock, Metrics, Quota, Runner, + buffer::paged::CacheRef, deterministic, Clock, Metrics, Quota, Runner, Spawner, }; use commonware_storage::{ archive::{immutable, prunable, Archive as _}, @@ -1651,6 +1651,67 @@ mod tests { 
async fn send(&self, _round: Round, _block: B, _recipients: Recipients) {} } + /// A buffer whose `send` blocks until released, and signals when entered. + /// Used to verify `proposed` only resolves after `buffer.send` completes. + #[derive(Clone)] + struct GatingBuffer { + send_entered: Arc>>>, + release: Arc>>>, + } + + impl GatingBuffer { + fn new() -> (Self, oneshot::Receiver<()>, oneshot::Sender<()>) { + let (entered_tx, entered_rx) = oneshot::channel(); + let (release_tx, release_rx) = oneshot::channel(); + ( + Self { + send_entered: Arc::new(Mutex::new(Some(entered_tx))), + release: Arc::new(Mutex::new(Some(release_rx))), + }, + entered_rx, + release_tx, + ) + } + } + + impl crate::marshal::core::Buffer> for GatingBuffer { + type PublicKey = PublicKey; + type CachedBlock = B; + + async fn find_by_digest(&self, _digest: D) -> Option { + None + } + + async fn find_by_commitment(&self, _commitment: D) -> Option { + None + } + + async fn subscribe_by_digest(&self, _digest: D) -> oneshot::Receiver { + let (_sender, receiver) = oneshot::channel(); + receiver + } + + async fn subscribe_by_commitment( + &self, + _commitment: D, + ) -> oneshot::Receiver { + let (_sender, receiver) = oneshot::channel(); + receiver + } + + async fn finalized(&self, _commitment: D) {} + + async fn send(&self, _round: Round, _block: B, _recipients: Recipients) { + if let Some(entered) = self.send_entered.lock().take() { + entered.send_lossy(()); + } + let release = self.release.lock().take(); + if let Some(release) = release { + let _ = release.await; + } + } + } + /// A reporter that blocks inside `Update::Block` so tests can abort marshal /// exactly when application delivery starts. 
#[derive(Clone)] @@ -1699,6 +1760,27 @@ mod tests { provider: ConstantProvider, application: R, ) -> (Mailbox>, commonware_runtime::Handle<()>) { + start_standard_actor_with_buffer( + context, + partition_prefix, + provider, + application, + NoopBuffer, + ) + .await + } + + async fn start_standard_actor_with_buffer( + context: deterministic::Context, + partition_prefix: &str, + provider: ConstantProvider, + application: R, + buffer: Buf, + ) -> (Mailbox>, commonware_runtime::Handle<()>) + where + R: Reporter>, + Buf: crate::marshal::core::Buffer, PublicKey = PublicKey, CachedBlock = B>, + { let config = Config { provider, epocher: FixedEpocher::new(BLOCKS_PER_EPOCH), @@ -1783,12 +1865,76 @@ mod tests { let (resolver_tx, resolver_rx) = mpsc::channel(100); let actor_handle = actor.start( application, - NoopBuffer, + buffer, (resolver_rx, NoopResolver::holding(resolver_tx)), ); (mailbox, actor_handle) } + /// Regression: `marshal.proposed` must not ack until the block has been + /// handed off to the provided buffer. + #[test_traced("WARN")] + fn test_standard_proposed_waits_for_buffer_send() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + let partition_prefix = format!("proposed-waits-buffer-{me}"); + + let (buffer, send_entered, release) = GatingBuffer::new(); + let (mailbox, actor_handle) = start_standard_actor_with_buffer( + context.with_label("validator_0"), + &partition_prefix, + ConstantProvider::new(schemes[0].clone()), + Application::::manual_ack(), + buffer, + ) + .await; + + let round = Round::new(Epoch::zero(), View::new(1)); + let block = make_raw_block(Sha256::hash(b""), Height::new(1), 100); + + // Drive `proposed` from a spawned task so we can observe its state + // from the main task via a completion channel. 
+ let (done_tx, done_rx) = oneshot::channel(); + context + .with_label("proposed_caller") + .spawn(move |_| async move { + let ok = mailbox.proposed(round, block).await; + done_tx.send_lossy(ok); + }); + + // Wait for the marshal actor to enter `buffer.send`. + send_entered + .await + .expect("buffer.send should be entered after cache_verified"); + + // With the buffer held in `send`, `proposed` must remain pending. + // Poll it against a generous timer; the timer should always win. + futures::pin_mut!(done_rx); + select! { + _ = context.sleep(Duration::from_millis(500)) => {}, + _ = &mut done_rx => { + panic!("proposed returned before buffer.send released"); + }, + } + + // Releasing the gate lets `send` complete; `proposed` must then ack. + release.send_lossy(()); + let ok = select! { + result = &mut done_rx => result.expect("proposed channel closed"), + _ = context.sleep(Duration::from_secs(5)) => { + panic!("proposed did not complete after buffer release"); + }, + }; + assert!(ok, "proposed should return true after durable dispatch"); + }); + } + /// When the provider has no verifier for an epoch, in-flight deliveries /// for that epoch must be acknowledged (`true`) so the serving peer is /// not blamed, rather than rejected (`false`). 
From 0b1f28bf6e6a0817f94d6400f42f9a499ebd9804 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Sat, 18 Apr 2026 16:03:49 -0700 Subject: [PATCH 098/107] fix --- consensus/src/marshal/standard/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index bcd882d0ee2..1aa7bb27003 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -1886,7 +1886,7 @@ mod tests { let partition_prefix = format!("proposed-waits-buffer-{me}"); let (buffer, send_entered, release) = GatingBuffer::new(); - let (mailbox, actor_handle) = start_standard_actor_with_buffer( + let (mailbox, _actor_handle) = start_standard_actor_with_buffer( context.with_label("validator_0"), &partition_prefix, ConstantProvider::new(schemes[0].clone()), From a2e88d4dd7ee9a3605c6ad1c73a531d97072ec84 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Sun, 19 Apr 2026 22:23:46 -0700 Subject: [PATCH 099/107] Address Feedback (#3628) --- .../src/marshal/application/validation.rs | 32 +- .../marshal/application/verification_tasks.rs | 111 +++++ consensus/src/marshal/coding/marshaled.rs | 21 +- consensus/src/marshal/coding/mod.rs | 103 ++++ consensus/src/marshal/core/actor.rs | 56 +-- consensus/src/marshal/core/mailbox.rs | 19 + consensus/src/marshal/mocks/harness.rs | 252 ++++++++++ consensus/src/marshal/resolver/handler.rs | 10 + consensus/src/marshal/standard/deferred.rs | 16 +- consensus/src/marshal/standard/inline.rs | 16 +- consensus/src/marshal/standard/mod.rs | 439 +++++++++++++++--- consensus/src/marshal/standard/validation.rs | 5 +- consensus/src/simplex/actors/voter/mod.rs | 103 ++++ consensus/src/simplex/engine.rs | 14 +- p2p/src/authenticated/discovery/network.rs | 12 +- p2p/src/authenticated/lookup/network.rs | 12 +- storage/src/archive/immutable/mod.rs | 6 +- storage/src/archive/mod.rs | 88 +++- storage/src/archive/prunable/mod.rs | 67 ++- 19 files changed, 1229 
insertions(+), 153 deletions(-) diff --git a/consensus/src/marshal/application/validation.rs b/consensus/src/marshal/application/validation.rs index 46df606ffa5..e2e8c5e2b07 100644 --- a/consensus/src/marshal/application/validation.rs +++ b/consensus/src/marshal/application/validation.rs @@ -3,7 +3,11 @@ //! This module centralizes pure invariant checks shared across marshal verification //! and certification flows. -use crate::types::{Epoch, Epocher, Height, Round}; +use crate::{ + marshal::core::{Mailbox, Variant}, + types::{Epoch, Epocher, Height, Round}, +}; +use commonware_cryptography::certificate::Scheme; use commonware_utils::sync::Mutex; use std::sync::Arc; @@ -11,6 +15,32 @@ use std::sync::Arc; /// proposer task and the broadcast path. pub(crate) type LastBuilt = Arc>>; +/// Which stage of verification a block has reached. +/// +/// This is used to determine which marshal cache a block should be stored in. +#[derive(Clone, Copy, Debug)] +pub(crate) enum Stage { + /// The block has been verified (store in `verified_blocks`). + Verified, + /// The block has been certified (store in `notarized_blocks`). + Certified, +} + +impl Stage { + /// Store `block` in the marshal cache for the provided stage. + pub(crate) async fn store( + self, + marshal: &mut Mailbox, + round: Round, + block: V::Block, + ) -> bool { + match self { + Self::Verified => marshal.verified(round, block).await, + Self::Certified => marshal.certified(round, block).await, + } + } +} + /// Returns true if the block is at an epoch boundary (last block in its epoch). 
#[inline] fn is_at_epoch_boundary(epocher: &ES, block_height: Height, epoch: Epoch) -> bool { diff --git a/consensus/src/marshal/application/verification_tasks.rs b/consensus/src/marshal/application/verification_tasks.rs index 98df16a77ab..7c85b0502e5 100644 --- a/consensus/src/marshal/application/verification_tasks.rs +++ b/consensus/src/marshal/application/verification_tasks.rs @@ -59,3 +59,114 @@ where .retain(|(task_round, _), _| task_round > finalized_round); } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::{Epoch, View}; + use commonware_cryptography::{sha256::Digest as Sha256Digest, Hasher, Sha256}; + + type D = Sha256Digest; + + fn round(view: u64) -> Round { + Round::new(Epoch::zero(), View::new(view)) + } + + fn pending_task() -> oneshot::Receiver { + let (_tx, rx) = oneshot::channel(); + rx + } + + #[test] + fn test_insert_and_take_returns_task() { + let tasks = VerificationTasks::::new(); + let digest = Sha256::hash(b"block"); + tasks.insert(round(1), digest, pending_task()); + + assert!(tasks.take(round(1), digest).is_some()); + assert!( + tasks.take(round(1), digest).is_none(), + "taking twice should yield None" + ); + } + + #[test] + fn test_take_absent_key_is_none() { + let tasks = VerificationTasks::::new(); + assert!(tasks.take(round(1), Sha256::hash(b"missing")).is_none()); + } + + #[test] + fn test_take_distinguishes_rounds_and_digests() { + let tasks = VerificationTasks::::new(); + let digest_a = Sha256::hash(b"a"); + let digest_b = Sha256::hash(b"b"); + tasks.insert(round(1), digest_a, pending_task()); + tasks.insert(round(2), digest_a, pending_task()); + tasks.insert(round(1), digest_b, pending_task()); + + assert!(tasks.take(round(1), digest_a).is_some()); + assert!(tasks.take(round(2), digest_a).is_some()); + assert!(tasks.take(round(1), digest_b).is_some()); + } + + #[test] + fn test_retain_after_drops_at_and_below_boundary() { + let tasks = VerificationTasks::::new(); + let digest = Sha256::hash(b"block"); + 
tasks.insert(round(1), digest, pending_task()); + tasks.insert(round(2), digest, pending_task()); + tasks.insert(round(3), digest, pending_task()); + + tasks.retain_after(&round(2)); + + assert!( + tasks.take(round(1), digest).is_none(), + "tasks strictly below boundary should be dropped" + ); + assert!( + tasks.take(round(2), digest).is_none(), + "tasks at boundary should be dropped" + ); + assert!( + tasks.take(round(3), digest).is_some(), + "tasks strictly above boundary should be retained" + ); + } + + #[test] + fn test_retain_after_spans_epochs() { + let tasks = VerificationTasks::::new(); + let digest = Sha256::hash(b"block"); + let early = Round::new(Epoch::zero(), View::new(100)); + let late = Round::new(Epoch::new(1), View::zero()); + tasks.insert(early, digest, pending_task()); + tasks.insert(late, digest, pending_task()); + + tasks.retain_after(&early); + + assert!( + tasks.take(early, digest).is_none(), + "task at boundary must be dropped" + ); + assert!( + tasks.take(late, digest).is_some(), + "task in later epoch must outlive an earlier boundary" + ); + } + + #[test] + fn test_retain_after_empty_map_is_noop() { + let tasks = VerificationTasks::::new(); + tasks.retain_after(&round(5)); + assert!(tasks.take(round(5), Sha256::hash(b"x")).is_none()); + } + + #[test] + fn test_default_matches_new() { + let default = as Default>::default(); + let digest = Sha256::hash(b"block"); + default.insert(round(1), digest, pending_task()); + assert!(default.take(round(1), digest).is_some()); + } +} diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index cc3d1def0ff..dcd0018d711 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -83,7 +83,7 @@ use crate::{ ancestry::AncestorStream, application::{ validation::{ - is_inferred_reproposal_at_certify, is_valid_reproposal_at_verify, LastBuilt, + is_inferred_reproposal_at_certify, is_valid_reproposal_at_verify, LastBuilt, 
Stage, }, verification_tasks::VerificationTasks, }, @@ -299,6 +299,7 @@ where consensus_context: Context::PublicKey>, commitment: Commitment, prefetched_block: Option>, + stage: Stage, ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); @@ -424,7 +425,7 @@ where is_valid = validity_request => is_valid, }; timer.observe(); - if application_valid && !marshal.verified(round, block).await { + if application_valid && !stage.store(&mut marshal, round, block).await { debug!(?round, "marshal unable to accept block"); return; } @@ -779,7 +780,7 @@ where // Kick off deferred verification early to hide verification latency behind // shard validity checks and network latency for collecting votes. let round = consensus_context.round; - let task = self.deferred_verify(consensus_context, payload, None); + let task = self.deferred_verify(consensus_context, payload, None, Stage::Verified); self.verification_tasks.insert(round, payload, task); match scheme.me() { @@ -895,9 +896,10 @@ where round, ); if is_reproposal { - // During crash recovery we may call `marshal.verified` twice for - // the same block; the call is idempotent. - if !marshaled.marshal.verified(round, block).await { + // Certifier holds a notarization for this block, so route + // the write to the notarized cache. `certified` is + // idempotent, so crash-recovery double-invocation is safe. + if !marshaled.marshal.certified(round, block).await { debug!(?round, "marshal unable to accept block"); return; } @@ -916,7 +918,12 @@ where // Use the block's embedded context for verification, passing the // prefetched block to avoid fetching it again inside deferred_verify. 
- let verify_rx = marshaled.deferred_verify(embedded_context, payload, Some(block)); + let verify_rx = marshaled.deferred_verify( + embedded_context, + payload, + Some(block), + Stage::Certified, + ); if let Ok(result) = verify_rx.await { tx.send_lossy(result); } diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index 4b83aa9765a..e69e8f23d8b 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -259,6 +259,109 @@ mod tests { harness::finalize_same_height_different_views::(); } + #[test_traced("WARN")] + fn test_coding_certify_persists_equivocated_block() { + harness::certify_persists_equivocated_block::(); + } + + #[test_traced("WARN")] + fn test_coding_certify_at_later_view_survives_earlier_view_pruning() { + harness::certify_at_later_view_survives_earlier_view_pruning::(); + } + + /// Finalizing a descendant must not height-prune the shard-engine buffer before + /// `try_repair_gaps` has consumed buffer-only ancestors. + /// + /// Places parent (height 1) and descendant (height 2) in the shard engine's + /// reconstructed-block cache via `proposed()`, then reports a finalization + /// for the descendant only. + #[test_traced("WARN")] + fn test_coding_store_finalization_does_not_prune_buffer_before_repair() { + let runner = deterministic::Runner::timed(Duration::from_secs(60)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let setup = CodingHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + participants[0].clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let mut handle = harness::ValidatorHandle:: { + mailbox: setup.mailbox, + extra: setup.extra, + }; + + // Build a 2-block chain: parent at height 1, descendant at height 2. + let parent_block = CodingHarness::make_test_block( + Sha256::hash(b""), + CodingHarness::genesis_parent_commitment(NUM_VALIDATORS as u16), + Height::new(1), + 1, + NUM_VALIDATORS as u16, + ); + let parent_digest = CodingHarness::digest(&parent_block); + let parent_commitment = CodingHarness::commitment(&parent_block); + + let descendant_block = CodingHarness::make_test_block( + parent_digest, + parent_commitment, + Height::new(2), + 2, + NUM_VALIDATORS as u16, + ); + let descendant_commitment = CodingHarness::commitment(&descendant_block); + + // Seed the shard engine's reconstructed-block cache with both blocks. + CodingHarness::propose( + &mut handle, + Round::new(Epoch::new(0), View::new(1)), + &parent_block, + ) + .await; + CodingHarness::propose( + &mut handle, + Round::new(Epoch::new(0), View::new(2)), + &descendant_block, + ) + .await; + + // Report finalization for the descendant only. The parent has no + // finalization certificate: it must be archived by walking the + // parent link from the descendant and sourcing the block from the + // shard-engine buffer. 
+ let descendant_proposal = Proposal { + round: Round::new(Epoch::new(0), View::new(2)), + parent: View::new(1), + payload: descendant_commitment, + }; + let descendant_finalization = + CodingHarness::make_finalization(descendant_proposal, &schemes, QUORUM); + CodingHarness::report_finalization(&mut handle.mailbox, descendant_finalization).await; + + // Wait until the descendant is archived: that proves finalization processing + // has completed, at which point the parent must already have been repaired + // from the shard buffer. + while handle.mailbox.get_block(Height::new(2)).await.is_none() { + context.sleep(Duration::from_millis(10)).await; + } + + let parent = handle.mailbox.get_block(Height::new(1)).await; + assert!( + parent.is_some(), + "parent must be archived from shard buffer before height-prune evicts it" + ); + }); + } + #[test_traced("WARN")] fn test_coding_init_processed_height() { harness::init_processed_height::(); diff --git a/consensus/src/marshal/core/actor.rs b/consensus/src/marshal/core/actor.rs index 1581358b7e0..7d1fa31f1d8 100644 --- a/consensus/src/marshal/core/actor.rs +++ b/consensus/src/marshal/core/actor.rs @@ -52,7 +52,7 @@ use std::{ pin::Pin, sync::Arc, }; -use tracing::{debug, error, info, warn}; +use tracing::{debug, error, warn}; /// The key used to store the last processed height in the metadata store. const LATEST_KEY: U64 = U64::new(0xFF); @@ -453,7 +453,7 @@ where result = self.pending_acks.current() => { // Start with the ack that woke this `select_loop!` arm. let mut pending = Some(self.pending_acks.complete_current(result)); - loop { + let last_acked_commitment = loop { let (height, commitment, result) = pending.take().expect("pending ack must exist"); match result { @@ -471,11 +471,11 @@ where // Opportunistically drain any additional already-ready acks so we // can persist one metadata sync for the whole batch below. 
- let Some(next) = self.pending_acks.pop_ready() else { - break; - }; - pending = Some(next); - } + match self.pending_acks.pop_ready() { + Some(next) => pending = Some(next), + None => break commitment, + } + }; // Persist buffered processed-height updates once after draining all ready acks. if let Err(e) = self.application_metadata.sync().await { @@ -483,12 +483,15 @@ where return; } + // Inform the buffer of the last acknowledged commitment (anything below this is safe to prune). + buffer.finalized(last_acked_commitment).await; + // Fill the pipeline self.try_dispatch_blocks(&mut application).await; }, // Handle consensus inputs before backfill or resolver traffic Some(message) = self.mailbox.recv() else { - info!("mailbox closed, shutting down"); + debug!("mailbox closed, shutting down"); break; } => { match message { @@ -543,6 +546,10 @@ where self.cache_verified(round, block.digest(), block).await; ack.send_lossy(()); } + Message::Certified { round, block, ack } => { + self.cache_block(round, block.digest(), block).await; + ack.send_lossy(()); + } Message::Notarization { notarization } => { let round = notarization.round(); let commitment = notarization.proposal.payload; @@ -588,7 +595,6 @@ where block, Some(finalization), &mut application, - &mut buffer, ) .await { @@ -730,7 +736,7 @@ where }, // Handle resolver messages last (batched up to max_repair, sync once) Some(message) = resolver_rx.recv() else { - info!("handler closed, shutting down"); + debug!("handler closed, shutting down"); return; } => { // Drain up to max_repair messages: blocks handled immediately, @@ -758,7 +764,6 @@ where response, &mut delivers, &mut application, - &mut buffer, ) .await; } @@ -767,7 +772,7 @@ where // Batch verify and process all delivers. 
needs_sync |= self - .verify_delivered(delivers, &mut application, &mut buffer) + .verify_delivered(delivers, &mut application) .await; // Attempt to fill gaps before handling produce requests (so we @@ -923,14 +928,13 @@ where /// immediately. Finalized/Notarized delivers are parsed and structurally /// validated, then collected into `delivers` for batch certificate verification. /// Returns true if finalization archives were written and need syncing. - async fn handle_deliver>( + async fn handle_deliver( &mut self, key: Request, value: Bytes, response: oneshot::Sender, delivers: &mut Vec>, application: &mut impl Reporter>, - buffer: &mut Buf, ) -> bool { match key { Request::Block(commitment) => { @@ -949,7 +953,7 @@ where let digest = block.digest(); let finalization = self.cache.get_finalization_for(digest).await; let wrote = self - .store_finalization(height, digest, block, finalization, application, buffer) + .store_finalization(height, digest, block, finalization, application) .await; debug!(?digest, %height, "received block"); response.send_lossy(true); // if a valid block is received, we should still send true (even if it was stale) @@ -1045,11 +1049,10 @@ where /// Batch verify pending certificates and process valid items. Returns true /// if finalization archives were written and need syncing. 
- async fn verify_delivered>( + async fn verify_delivered( &mut self, mut delivers: Vec>, application: &mut impl Reporter>, - buffer: &mut Buf, ) -> bool { if delivers.is_empty() { return false; @@ -1132,14 +1135,7 @@ where debug!(?round, %height, "received finalization"); wrote |= self - .store_finalization( - height, - digest, - block, - Some(finalization), - application, - buffer, - ) + .store_finalization(height, digest, block, Some(finalization), application) .await; } PendingVerification::Notarized { @@ -1169,7 +1165,6 @@ where block.clone(), Some(finalization), application, - buffer, ) .await; } @@ -1414,14 +1409,13 @@ where /// `select_loop!` so that archive data is durable before the ack handler /// advances `last_processed_height`. See [`Self::try_dispatch_blocks`] for the /// crash safety invariant. - async fn store_finalization>( + async fn store_finalization( &mut self, height: Height, digest: ::Digest, block: V::Block, finalization: Option>, application: &mut impl Reporter>, - buffer: &mut Buf, ) -> bool { // Blocks below the last processed height are not useful to us, so we ignore them (this // has the nice byproduct of ensuring we don't call a backing store with a block below the @@ -1438,7 +1432,6 @@ where self.notify_subscribers(&block); // Convert block to storage format - let commitment = V::commitment(&block); let stored: V::StoredBlock = block.into(); let round = finalization.as_ref().map(|f| f.round()); @@ -1463,13 +1456,12 @@ where panic!("failed to finalize: {e}"); } - // Update metrics, buffer, and application + // Update metrics and application if let Some(round) = round.filter(|_| height > self.tip) { application.report(Update::Tip(round, height, digest)).await; self.tip = height; let _ = self.finalized_height.try_set(height.get()); } - buffer.finalized(commitment).await; true } @@ -1592,7 +1584,6 @@ where block, Some(finalization), application, - buffer, ) .await; } else { @@ -1638,7 +1629,6 @@ where block.clone(), finalization, 
application, - buffer, ) .await; debug!(height = %block.height(), "repaired block"); diff --git a/consensus/src/marshal/core/mailbox.rs b/consensus/src/marshal/core/mailbox.rs index 2042b623be6..3dd41b1cdb8 100644 --- a/consensus/src/marshal/core/mailbox.rs +++ b/consensus/src/marshal/core/mailbox.rs @@ -112,6 +112,15 @@ pub(crate) enum Message { /// A channel signaled once the block is durably stored. ack: oneshot::Sender<()>, }, + /// A notification that a block has been certified by the application. + Certified { + /// The round in which the block was certified. + round: Round, + /// The certified block. + block: V::Block, + /// A channel signaled once the block is durably stored. + ack: oneshot::Sender<()>, + }, /// Sets the sync starting point (advances if higher than current). /// /// Marshal will sync and deliver blocks starting at `floor + 1`. Data below @@ -307,6 +316,16 @@ impl Mailbox { .is_some() } + /// Notifies the actor that a block has been certified, awaiting the actor's + /// confirmation that the block has been durably persisted before returning. + #[must_use = "callers must consider block durability before proceeding"] + pub async fn certified(&self, round: Round, block: V::Block) -> bool { + self.sender + .request(|ack| Message::Certified { round, block, ack }) + .await + .is_some() + } + /// Sets the sync starting point (advances if higher than current). /// /// Marshal will sync and deliver blocks starting at `floor + 1`. Data below diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index 18c9e0d06e4..5480677bc42 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -259,6 +259,13 @@ pub trait TestHarness: 'static + Sized { all_handles: &mut [ValidatorHandle], ) -> impl Future + Send; + /// Mark a block as certified. 
+ fn certify( + handle: &mut ValidatorHandle, + round: Round, + block: &Self::TestBlock, + ) -> impl Future + Send; + /// Create a finalization certificate. fn make_finalization( proposal: Proposal, @@ -927,6 +934,207 @@ pub fn verified_success_implies_recoverable_after_restart( } } +/// Regression: when the same block is verified at an earlier view and later +/// certified at a much later view (epoch-boundary reproposal), both writes +/// must land so retention can prune the earlier view without losing the +/// block. A naive "skip the sibling write if the block's digest is already +/// present in the other archive" optimization is unsafe because the two +/// archives prune per-view on the same boundary: if the block lives only in +/// `verified_blocks[V_early]` and never gets written to +/// `notarized_blocks[V_late]`, advancing retention past V_early drops the +/// block even though V_late is still within the window. +pub fn certify_at_later_view_survives_earlier_view_pruning() { + let runner = deterministic::Runner::timed(Duration::from_secs(60)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + let setup = H::setup_validator( + context.with_label("validator_0"), + &mut oracle, + participants[0].clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let application = setup.application; + let mut handle = ValidatorHandle:: { + mailbox: setup.mailbox, + extra: setup.extra, + }; + + // A repeated block that we will verify at an early view and certify + // at a later view. Its height is intentionally well beyond the chain + // we'll drive below, so it never enters the finalized archive via + // gap repair and lives solely in the prunable caches. 
+ let repeated = H::make_test_block( + Sha256::hash(b""), + H::genesis_parent_commitment(NUM_VALIDATORS as u16), + Height::new(5_000), + 9_999, + NUM_VALIDATORS as u16, + ); + let repeated_digest = H::digest(&repeated); + + // Negative control: a verify-only block at the same early view. Because + // it is never certified, it lives solely in `verified_blocks[V=1]` and + // must disappear once retention pruning advances past V=1. Asserting it + // is gone confirms the prune actually fires at the expected floor, so + // the `repeated` survivor assertion below is genuinely load-bearing. + let orphan = H::make_test_block( + Sha256::hash(b"orphan"), + H::genesis_parent_commitment(NUM_VALIDATORS as u16), + Height::new(6_000), + 9_998, + NUM_VALIDATORS as u16, + ); + let orphan_digest = H::digest(&orphan); + + // Verify `repeated` at V=1, then certify at V=25 (reproposal-style gap). + let v_early = Round::new(Epoch::zero(), View::new(1)); + let v_late = Round::new(Epoch::zero(), View::new(25)); + let mut peers: [ValidatorHandle; 0] = []; + H::verify(&mut handle, v_early, &repeated, &mut peers).await; + assert!( + H::certify(&mut handle, v_late, &repeated).await, + "certify must ack" + ); + + // Verify `orphan` at V=1 only (no certify). + H::verify(&mut handle, v_early, &orphan, &mut peers).await; + + // Drive the finalized chain forward to advance `last_processed_round` + // past V=1's retention boundary but not past V=25's. With + // view_retention_timeout=10 and prunable_items_per_section=10, + // processing views 1..=21 leaves `oldest_allowed=10` in both prunable + // archives — V=1 is dropped, V=25 is retained. 
+ const CHAIN_LEN: u64 = 21; + let mut parent = Sha256::hash(b""); + let mut parent_commitment = H::genesis_parent_commitment(NUM_VALIDATORS as u16); + for i in 1..=CHAIN_LEN { + let block = H::make_test_block( + parent, + parent_commitment, + Height::new(i), + i, + NUM_VALIDATORS as u16, + ); + let digest = H::digest(&block); + let commitment = H::commitment(&block); + let round = Round::new(Epoch::zero(), View::new(i)); + H::propose(&mut handle, round, &block).await; + let proposal = Proposal { + round, + parent: View::new(i - 1), + payload: commitment, + }; + let finalization = H::make_finalization(proposal, &schemes, QUORUM); + H::report_finalization(&mut handle.mailbox, finalization).await; + parent = digest; + parent_commitment = commitment; + } + while (application.blocks().len() as u64) < CHAIN_LEN { + context.sleep(Duration::from_millis(10)).await; + } + context.sleep(Duration::from_millis(100)).await; + + // Negative control: the verify-only orphan at V=1 must be gone, which + // proves retention pruning actually evicted V=1 at the expected floor. + assert!( + handle.mailbox.get_block(&orphan_digest).await.is_none(), + "verify-only block at V=1 must be evicted by retention pruning" + ); + + // The repeated block must still be retrievable: verified_blocks[V=1] + // has been pruned, but notarized_blocks[V=25] still holds it. + let recovered = handle.mailbox.get_block(&repeated_digest).await; + assert!( + recovered.is_some(), + "block certified at V=25 must survive retention pruning of V=1" + ); + assert_eq!(recovered.unwrap().digest(), repeated_digest); + }); +} + +/// Regression: when a leader equivocates, a validator may verify one block +/// (A) and then certify a different block (B) at the same round. 
`verified()` +/// and `certified()` must write to distinct archives so both blocks are +/// retained and retrievable; otherwise the second write collides on the same +/// prunable-archive index (`skip_if_index_exists=true`) and is silently +/// dropped despite the mailbox returning success. +pub fn certify_persists_equivocated_block() { + let runner = deterministic::Runner::timed(Duration::from_secs(60)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + let setup = H::setup_validator( + context.with_label("validator_0"), + &mut oracle, + participants[0].clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let mut handle = ValidatorHandle:: { + mailbox: setup.mailbox, + extra: setup.extra, + }; + + let round = Round::new(Epoch::zero(), View::new(1)); + let parent = Sha256::hash(b""); + let parent_commitment = H::genesis_parent_commitment(NUM_VALIDATORS as u16); + + // Two distinct blocks at the same height/round (leader equivocation): + // distinct timestamps yield distinct digests. 
+ let block_a = H::make_test_block( + parent, + parent_commitment, + Height::new(1), + 1, + NUM_VALIDATORS as u16, + ); + let digest_a = H::digest(&block_a); + let block_b = H::make_test_block( + parent, + parent_commitment, + Height::new(1), + 2, + NUM_VALIDATORS as u16, + ); + let digest_b = H::digest(&block_b); + assert_ne!(digest_a, digest_b, "test requires distinct digests"); + + let mut peers: [ValidatorHandle; 0] = []; + H::verify(&mut handle, round, &block_a, &mut peers).await; + assert!( + H::certify(&mut handle, round, &block_b).await, + "certified must ack" + ); + + let got_a = handle.mailbox.get_block(&digest_a).await; + assert!( + got_a.is_some(), + "verified block A must be persisted in verified_blocks" + ); + assert_eq!(got_a.unwrap().digest(), digest_a); + let got_b = handle.mailbox.get_block(&digest_b).await; + assert!( + got_b.is_some(), + "certified block B must be persisted despite a verify at the same round" + ); + assert_eq!(got_b.unwrap().digest(), digest_b); + }); +} + /// Contract: once marshal has delivered a finalized block to the application, /// that finalized block and its certificate must already be durable. 
pub fn delivery_visibility_implies_recoverable_after_restart( @@ -1275,6 +1483,10 @@ impl TestHarness for StandardHarness { assert!(handle.mailbox.verified(round, block.clone()).await); } + async fn certify(handle: &mut ValidatorHandle, round: Round, block: &B) -> bool { + handle.mailbox.certified(round, block.clone()).await + } + fn make_finalization(proposal: Proposal, schemes: &[S], quorum: u32) -> Finalization { let finalizes: Vec<_> = schemes .iter() @@ -1536,6 +1748,22 @@ impl TestHarness for InlineHarness { .await; } + async fn certify( + handle: &mut ValidatorHandle, + round: Round, + block: &Self::TestBlock, + ) -> bool { + StandardHarness::certify( + &mut ValidatorHandle:: { + mailbox: handle.mailbox.clone(), + extra: handle.extra.clone(), + }, + round, + block, + ) + .await + } + fn make_finalization( proposal: Proposal, schemes: &[S], @@ -1724,6 +1952,22 @@ impl TestHarness for DeferredHarness { .await; } + async fn certify( + handle: &mut ValidatorHandle, + round: Round, + block: &Self::TestBlock, + ) -> bool { + InlineHarness::certify( + &mut ValidatorHandle:: { + mailbox: handle.mailbox.clone(), + extra: handle.extra.clone(), + }, + round, + block, + ) + .await + } + fn make_finalization( proposal: Proposal, schemes: &[S], @@ -2063,6 +2307,14 @@ impl TestHarness for CodingHarness { assert!(handle.mailbox.verified(round, block.clone()).await); } + async fn certify( + handle: &mut ValidatorHandle, + round: Round, + block: &CodedBlock, Sha256>, + ) -> bool { + handle.mailbox.certified(round, block.clone()).await + } + fn make_finalization( proposal: Proposal, schemes: &[S], diff --git a/consensus/src/marshal/resolver/handler.rs b/consensus/src/marshal/resolver/handler.rs index df54ec15ab9..f0ffdcc0910 100644 --- a/consensus/src/marshal/resolver/handler.rs +++ b/consensus/src/marshal/resolver/handler.rs @@ -342,6 +342,16 @@ mod tests { assert_eq!(decoded, Request::Notarized { round }); } + #[test] + fn test_subject_decode_rejects_invalid_enum_tag() { + 
let bad = [3u8]; + let mut buf = bad.as_ref(); + assert!(matches!( + Request::::read(&mut buf), + Err(CodecError::InvalidEnum(3)) + )); + } + #[test] fn test_subject_hash() { use std::collections::HashSet; diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index 295f95b7217..396a86a0f19 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -74,7 +74,7 @@ use crate::{ marshal::{ ancestry::AncestorStream, application::{ - validation::{is_inferred_reproposal_at_certify, LastBuilt}, + validation::{is_inferred_reproposal_at_certify, LastBuilt, Stage}, verification_tasks::VerificationTasks, }, core::Mailbox, @@ -203,6 +203,7 @@ where &mut self, context: ::Context, block: B, + stage: Stage, ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); @@ -226,6 +227,7 @@ where &mut application, &mut marshal, &mut tx, + stage, ) .await { @@ -500,7 +502,7 @@ where // Begin the rest of the verification process asynchronously. let round = context.round; - let task = marshaled.deferred_verify(context, block); + let task = marshaled.deferred_verify(context, block, Stage::Verified); marshaled.verification_tasks.insert(round, digest, task); tx.send_lossy(true); @@ -584,9 +586,10 @@ where round, ); if is_reproposal { - // It is possible that, during crash recovery, we call `marshal.verified` - // twice for the same block. That function is idempotent, so this is safe. - if !marshaled.marshal.verified(round, block).await { + // Certifier holds a notarization for this block, so route + // the write to the notarized cache. `certified` is + // idempotent, so crash-recovery double-invocation is safe. 
+ if !marshaled.marshal.certified(round, block).await { debug!(?round, "marshal unable to accept block"); return; } @@ -594,7 +597,8 @@ where return; } - let verify_rx = marshaled.deferred_verify(embedded_context, block); + let verify_rx = + marshaled.deferred_verify(embedded_context, block, Stage::Certified); if let Ok(result) = verify_rx.await { tx.send_lossy(result); } diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index fe3c695b136..87be653eea8 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -45,7 +45,7 @@ use crate::{ marshal::{ ancestry::AncestorStream, - application::validation::LastBuilt, + application::validation::{LastBuilt, Stage}, core::Mailbox, standard::{ validation::{ @@ -224,8 +224,11 @@ where /// Proposes a new block or re-proposes an epoch boundary block. /// /// Proposal runs in a spawned task and returns a receiver for the resulting digest. - /// Blocks are persisted and broadcast via `marshal.proposed()` before the digest - /// is returned to consensus. + /// The built block is cached in memory (`last_built`) for the subsequent + /// `Relay::broadcast(Plan::Propose)` call, which invokes `marshal.proposed()` + /// as the durable persistence boundary before consensus continues. Receiving + /// a digest from `propose()` alone does not mean the block is recoverable + /// after restart. async fn propose( &mut self, consensus_context: Context, @@ -408,6 +411,7 @@ where &mut application, &mut marshal, &mut tx, + Stage::Verified, ) .await { @@ -461,8 +465,10 @@ where // `certify` resolving true drives the finalize vote, so mere // buffered availability is not sufficient here. Persist the - // block through marshal before signaling success. - if marshal.verified(round, block).await { + // block through marshal before signaling success. 
The caller + // holds a notarization for this block, so route it into the + // notarized cache directly rather than the verified cache. + if marshal.certified(round, block).await { tx.send_lossy(true); } }); diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index 1aa7bb27003..b4971181e57 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -201,6 +201,18 @@ mod tests { harness::verified_success_implies_recoverable_after_restart::(0..16); } + #[test_traced("WARN")] + fn test_standard_certify_persists_equivocated_block() { + harness::certify_persists_equivocated_block::(); + harness::certify_persists_equivocated_block::(); + } + + #[test_traced("WARN")] + fn test_standard_certify_at_later_view_survives_earlier_view_pruning() { + harness::certify_at_later_view_survives_earlier_view_pruning::(); + harness::certify_at_later_view_survives_earlier_view_pruning::(); + } + #[test_traced("WARN")] fn test_standard_delivery_visibility_implies_recoverable_after_restart() { harness::delivery_visibility_implies_recoverable_after_restart::(0..16); @@ -1580,48 +1592,22 @@ mod tests { } } - /// A no-op resolver used by tests that drive the marshal actor's - /// resolver_rx channel directly. Outbound fetches/cancellations are dropped. + /// Recorded `send` call on the [`RecordingBuffer`]. + type BufferSend = (Round, B, Recipients); + + /// A buffer that records each `send` invocation; other methods are no-ops. 
#[derive(Clone, Default)] - struct NoopResolver { - _keepalive: Option>>, + struct RecordingBuffer { + sends: Arc>>, } - impl NoopResolver { - fn holding(sender: mpsc::Sender>) -> Self { - Self { - _keepalive: Some(sender), - } + impl RecordingBuffer { + fn sends(&self) -> Vec { + self.sends.lock().clone() } } - impl Resolver for NoopResolver { - type Key = handler::Request; - type PublicKey = PublicKey; - - async fn fetch(&mut self, _key: Self::Key) {} - async fn fetch_all(&mut self, _keys: Vec) {} - async fn fetch_targeted( - &mut self, - _key: Self::Key, - _targets: NonEmptyVec, - ) { - } - async fn fetch_all_targeted( - &mut self, - _requests: Vec<(Self::Key, NonEmptyVec)>, - ) { - } - async fn cancel(&mut self, _key: Self::Key) {} - async fn clear(&mut self) {} - async fn retain(&mut self, _predicate: impl Fn(&Self::Key) -> bool + Send + 'static) {} - } - - /// A no-op buffer used by tests that do not need marshal's dissemination path. - #[derive(Clone, Default)] - struct NoopBuffer; - - impl crate::marshal::core::Buffer> for NoopBuffer { + impl crate::marshal::core::Buffer> for RecordingBuffer { type PublicKey = PublicKey; type CachedBlock = B; @@ -1648,7 +1634,76 @@ mod tests { async fn finalized(&self, _commitment: D) {} - async fn send(&self, _round: Round, _block: B, _recipients: Recipients) {} + async fn send(&self, round: Round, block: B, recipients: Recipients) { + self.sends.lock().push((round, block, recipients)); + } + } + + /// Recorded `fetch_targeted` call on the [`RecordingResolver`]. + type TargetedFetch = (handler::Request, NonEmptyVec); + + /// A resolver that records each `fetch_targeted` invocation; other + /// methods are no-ops. + /// + /// `_keepalive` optionally retains a resolver-message sender so the + /// actor's corresponding receiver stays alive when nothing else owns it. 
+ #[derive(Clone, Default)] + struct RecordingResolver { + targeted: Arc>>, + _keepalive: Option>>, + } + + impl RecordingResolver { + fn holding(sender: mpsc::Sender>) -> Self { + Self { + targeted: Arc::new(Mutex::new(Vec::new())), + _keepalive: Some(sender), + } + } + + fn targeted(&self) -> Vec { + self.targeted.lock().clone() + } + + fn targeted_is_empty(&self) -> bool { + self.targeted.lock().is_empty() + } + } + + impl Resolver for RecordingResolver { + type Key = handler::Request; + type PublicKey = PublicKey; + + async fn fetch(&mut self, _key: Self::Key) {} + async fn fetch_all(&mut self, _keys: Vec) {} + async fn fetch_targeted(&mut self, key: Self::Key, targets: NonEmptyVec) { + self.targeted.lock().push((key, targets)); + } + async fn fetch_all_targeted( + &mut self, + requests: Vec<(Self::Key, NonEmptyVec)>, + ) { + self.targeted.lock().extend(requests); + } + async fn cancel(&mut self, _key: Self::Key) {} + async fn clear(&mut self) {} + async fn retain(&mut self, _predicate: impl Fn(&Self::Key) -> bool + Send + 'static) {} + } + + /// Poll `cond` on a 10ms tick until it returns true, panicking on timeout. + async fn wait_until bool>( + context: &deterministic::Context, + deadline: Duration, + label: &str, + mut cond: F, + ) { + let start = context.current(); + while !cond() { + if context.current().duration_since(start).unwrap_or_default() > deadline { + panic!("{label} did not hold within {deadline:?}"); + } + context.sleep(Duration::from_millis(10)).await; + } } /// A buffer whose `send` blocks until released, and signals when entered. 
@@ -1754,32 +1809,22 @@ mod tests { } } - async fn start_standard_actor>>( - context: deterministic::Context, - partition_prefix: &str, - provider: ConstantProvider, - application: R, - ) -> (Mailbox>, commonware_runtime::Handle<()>) { - start_standard_actor_with_buffer( - context, - partition_prefix, - provider, - application, - NoopBuffer, - ) - .await - } - - async fn start_standard_actor_with_buffer( + async fn start_standard_actor( context: deterministic::Context, partition_prefix: &str, provider: ConstantProvider, application: R, buffer: Buf, - ) -> (Mailbox>, commonware_runtime::Handle<()>) + ) -> ( + Mailbox>, + Buf, + RecordingResolver, + commonware_runtime::Handle<()>, + ) where R: Reporter>, - Buf: crate::marshal::core::Buffer, PublicKey = PublicKey, CachedBlock = B>, + Buf: crate::marshal::core::Buffer, PublicKey = PublicKey, CachedBlock = B> + + Clone, { let config = Config { provider, @@ -1863,12 +1908,10 @@ mod tests { ) .await; let (resolver_tx, resolver_rx) = mpsc::channel(100); - let actor_handle = actor.start( - application, - buffer, - (resolver_rx, NoopResolver::holding(resolver_tx)), - ); - (mailbox, actor_handle) + let resolver = RecordingResolver::holding(resolver_tx); + let actor_handle = + actor.start(application, buffer.clone(), (resolver_rx, resolver.clone())); + (mailbox, buffer, resolver, actor_handle) } /// Regression: `marshal.proposed` must not ack until the block has been @@ -1886,7 +1929,7 @@ mod tests { let partition_prefix = format!("proposed-waits-buffer-{me}"); let (buffer, send_entered, release) = GatingBuffer::new(); - let (mailbox, _actor_handle) = start_standard_actor_with_buffer( + let (mailbox, _buffer, _resolver, _actor_handle) = start_standard_actor( context.with_label("validator_0"), &partition_prefix, ConstantProvider::new(schemes[0].clone()), @@ -2037,7 +2080,7 @@ mod tests { actor.start( Application::::default(), buffer, - (resolver_rx, NoopResolver::default()), + (resolver_rx, RecordingResolver::default()), ); // 
Inject a Finalized delivery with garbage payload. The @@ -2096,11 +2139,12 @@ mod tests { ); let (application, started, release) = GatedBlockReporter::new(); - let (mut mailbox, actor_handle) = start_standard_actor( + let (mut mailbox, _buffer, _resolver, actor_handle) = start_standard_actor( context.with_label("validator_0"), &partition_prefix, ConstantProvider::new(schemes[0].clone()), application, + RecordingBuffer::default(), ) .await; @@ -2130,11 +2174,12 @@ mod tests { // Yield once so the aborted actor drops its storage handles before restart. context.sleep(Duration::from_millis(1)).await; - let (mailbox, _actor_handle) = start_standard_actor( + let (mailbox, _buffer, _resolver, _actor_handle) = start_standard_actor( context.with_label("validator_0_restart"), &partition_prefix, ConstantProvider::new(schemes[0].clone()), Application::::manual_ack(), + RecordingBuffer::default(), ) .await; @@ -2291,4 +2336,266 @@ mod tests { } }); } + + /// `Forward` for an unknown commitment must early-return without + /// dispatching, even when peers are provided. + #[test_traced("WARN")] + fn test_standard_forward_unknown_block_is_noop() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + let round = Round::new(Epoch::zero(), View::new(1)); + let unknown = Sha256::hash(b"unknown-block"); + + let (mailbox, buffer, _resolver, _actor_handle) = start_standard_actor( + context.with_label("validator_0"), + &format!("forward-unknown-{me}"), + ConstantProvider::new(schemes[0].clone()), + Application::::manual_ack(), + RecordingBuffer::default(), + ) + .await; + + mailbox + .forward(round, unknown, vec![participants[1].clone()]) + .await; + context.sleep(Duration::from_millis(50)).await; + + assert!( + buffer.sends().is_empty(), + "forward for an unknown block must not dispatch" + ); + }); + } + + /// `Forward` for a block that marshal has cached must dispatch that block + /// to exactly the provided peer set via the buffer. + #[test_traced("WARN")] + fn test_standard_forward_cached_block_sends_to_peers() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = make_raw_block(Sha256::hash(b""), Height::new(1), 100); + let digest = block.digest(); + + let (mailbox, buffer, _resolver, _actor_handle) = start_standard_actor( + context.with_label("validator_0"), + &format!("forward-cached-{me}"), + ConstantProvider::new(schemes[0].clone()), + Application::::manual_ack(), + RecordingBuffer::default(), + ) + .await; + + assert!(mailbox.verified(round, block.clone()).await); + + let targets = vec![participants[1].clone(), participants[2].clone()]; + mailbox.forward(round, digest, targets.clone()).await; + + wait_until(&context, Duration::from_secs(5), "buffer.send", || { + !buffer.sends.lock().is_empty() + }) + .await; + + let sends = buffer.sends(); + assert_eq!(sends.len(), 1); + let (sent_round, sent_block, sent_recipients) = &sends[0]; + assert_eq!(*sent_round, round); + assert_eq!(sent_block.digest(), digest); + match sent_recipients { + Recipients::Some(peers) => assert_eq!(peers, &targets), + other => panic!("expected Recipients::Some, got {other:?}"), + } + }); + } + + /// `HintFinalized` at or below the floor must be a no-op: marshal must + /// not fire a targeted resolver fetch since the hint is stale. + #[test_traced("WARN")] + fn test_standard_hint_finalized_below_floor_is_noop() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + + let (mailbox, _buffer, resolver, _actor_handle) = start_standard_actor( + context.with_label("validator_0"), + &format!("hint-below-floor-{me}"), + ConstantProvider::new(schemes[0].clone()), + Application::::manual_ack(), + RecordingBuffer::default(), + ) + .await; + + // Raise the floor above the hint we are about to send. + mailbox.set_floor(Height::new(10)).await; + context.sleep(Duration::from_millis(50)).await; + + mailbox + .hint_finalized(Height::new(5), NonEmptyVec::new(participants[1].clone())) + .await; + context.sleep(Duration::from_millis(50)).await; + + assert!( + resolver.targeted_is_empty(), + "hint at or below floor must not fetch" + ); + }); + } + + /// `HintFinalized` for a height whose finalization is already durable must + /// be a no-op: marshal already has everything needed and must not + /// initiate a redundant fetch. + #[test_traced("WARN")] + fn test_standard_hint_finalized_skips_when_already_finalized() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = make_raw_block(Sha256::hash(b""), Height::new(1), 100); + let finalization = StandardHarness::make_finalization( + Proposal::new(round, View::zero(), block.digest()), + &schemes, + QUORUM, + ); + + let (mut mailbox, _buffer, resolver, _actor_handle) = start_standard_actor( + context.with_label("validator_0"), + &format!("hint-already-final-{me}"), + ConstantProvider::new(schemes[0].clone()), + Application::::manual_ack(), + RecordingBuffer::default(), + ) + .await; + + assert!(mailbox.verified(round, block.clone()).await); + StandardHarness::report_finalization(&mut mailbox, finalization).await; + + // Wait until marshal has durably stored the finalization. + while mailbox.get_finalization(Height::new(1)).await.is_none() { + context.sleep(Duration::from_millis(10)).await; + } + + mailbox + .hint_finalized(Height::new(1), NonEmptyVec::new(participants[1].clone())) + .await; + context.sleep(Duration::from_millis(50)).await; + + assert!( + resolver.targeted_is_empty(), + "hint for a locally-finalized height must not fetch" + ); + }); + } + + /// `HintFinalized` above the floor for a not-yet-finalized height must + /// trigger exactly one targeted fetch via the resolver. + #[test_traced("WARN")] + fn test_standard_hint_finalized_emits_targeted_fetch() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + + let (mailbox, _buffer, resolver, _actor_handle) = start_standard_actor( + context.with_label("validator_0"), + &format!("hint-targets-{me}"), + ConstantProvider::new(schemes[0].clone()), + Application::::manual_ack(), + RecordingBuffer::default(), + ) + .await; + + let target = participants[1].clone(); + mailbox + .hint_finalized(Height::new(7), NonEmptyVec::new(target.clone())) + .await; + + wait_until(&context, Duration::from_secs(5), "fetch_targeted", || { + !resolver.targeted.lock().is_empty() + }) + .await; + + let targeted = resolver.targeted(); + assert_eq!(targeted.len(), 1); + let (request, targets) = &targeted[0]; + assert_eq!( + request, + &handler::Request::Finalized { + height: Height::new(7) + } + ); + assert_eq!(&targets[..], &[target]); + }); + } + + /// `Prune` for a height above the floor must be rejected (warn + continue) + /// and must not advance the floor or alter the finalized archive contents. + #[test_traced("WARN")] + fn test_standard_prune_above_floor_is_rejected() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let me = participants[0].clone(); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = make_raw_block(Sha256::hash(b""), Height::new(1), 100); + let finalization = StandardHarness::make_finalization( + Proposal::new(round, View::zero(), block.digest()), + &schemes, + QUORUM, + ); + + let (mut mailbox, _buffer, _resolver, _actor_handle) = start_standard_actor( + context.with_label("validator_0"), + &format!("prune-above-floor-{me}"), + ConstantProvider::new(schemes[0].clone()), + Application::::manual_ack(), + RecordingBuffer::default(), + ) + .await; + + assert!(mailbox.verified(round, block.clone()).await); + StandardHarness::report_finalization(&mut mailbox, finalization).await; + + while mailbox.get_finalization(Height::new(1)).await.is_none() { + context.sleep(Duration::from_millis(10)).await; + } + + // Prune above the floor must be a no-op, not an error. + mailbox.prune(Height::new(100)).await; + context.sleep(Duration::from_millis(50)).await; + + // The finalized block and its finalization must still be retrievable. 
+ assert!(mailbox.get_block(Height::new(1)).await.is_some()); + assert!(mailbox.get_finalization(Height::new(1)).await.is_some()); + }); + } } diff --git a/consensus/src/marshal/standard/validation.rs b/consensus/src/marshal/standard/validation.rs index 31d96901ef8..673f8e418a5 100644 --- a/consensus/src/marshal/standard/validation.rs +++ b/consensus/src/marshal/standard/validation.rs @@ -2,7 +2,7 @@ use crate::{ marshal::{ ancestry::AncestorStream, application::validation::{ - has_contiguous_height, is_block_in_expected_epoch, is_valid_reproposal_at_verify, + has_contiguous_height, is_block_in_expected_epoch, is_valid_reproposal_at_verify, Stage, }, core::Mailbox, standard::Standard, @@ -125,6 +125,7 @@ pub(super) async fn verify_with_parent( application: &mut A, marshal: &mut Mailbox>, tx: &mut oneshot::Sender, + stage: Stage, ) -> Option where E: Rng + Spawner + Metrics + Clock, @@ -201,7 +202,7 @@ where valid = validity_request => valid, }; - if application_valid && !marshal.verified(context.round, block).await { + if application_valid && !stage.store(marshal, context.round, block).await { debug!(round = ?context.round, "marshal unable to accept block"); return None; } diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index 87f25a34584..2441d2c2e81 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -312,6 +312,109 @@ mod tests { propose_broadcast_failure_stops_before_notarize::<_, _>(secp256r1::fixture); } + /// Engine must not panic when the voter exits cleanly after the local + /// relay rejects `Plan::Propose`. The voter treats that as a fatal stop; + /// the engine must agree and shut down gracefully. 
+ #[test_traced] + fn test_engine_stops_cleanly_when_voter_exits_after_failed_propose_broadcast() { + let namespace = + b"engine_stops_cleanly_when_voter_exits_after_failed_propose_broadcast".to_vec(); + let partition = + "engine_stops_cleanly_when_voter_exits_after_failed_propose_broadcast".to_string(); + let executor = deterministic::Runner::timed(Duration::from_secs(10)); + executor.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, &namespace, 5); + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + + let me = participants[0].clone(); + let elector = RoundRobin::::default(); + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = mocks::reporter::Reporter::new( + context.with_label("reporter"), + reporter_cfg.clone(), + ); + + let app_relay = Arc::new(mocks::relay::Relay::new()); + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: app_relay, + me: me.clone(), + propose_latency: (0.0, 0.0), + verify_latency: (0.0, 0.0), + certify_latency: (0.0, 0.0), + should_certify: mocks::application::Certifier::Always, + }; + let (app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + app_actor.start(); + + let cfg = crate::simplex::config::Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application, + relay: FailingRelay::default(), + reporter, + strategy: Sequential, + partition, + mailbox_size: 128, + epoch: Epoch::new(4), + leader_timeout: Duration::from_secs(5), + certification_timeout: Duration::from_secs(5), + timeout_retry: Duration::from_mins(60), + fetch_timeout: Duration::from_secs(1), + activity_timeout: ViewDelta::new(10), + skip_timeout: ViewDelta::new(5), + 
fetch_concurrent: 4, + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + forwarding: crate::simplex::config::ForwardingPolicy::Disabled, + }; + let engine = crate::simplex::Engine::new(context.with_label("engine"), cfg); + + let (vote_sender, vote_receiver) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, cert_receiver) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + let (resolver_sender, resolver_receiver) = oracle + .control(me.clone()) + .register(2, TEST_QUOTA) + .await + .unwrap(); + + let handle = engine.start( + (vote_sender, vote_receiver), + (cert_sender, cert_receiver), + (resolver_sender, resolver_receiver), + ); + + select! { + result = handle => { + result.expect("engine should stop cleanly after voter exit"); + }, + _ = context.sleep(Duration::from_secs(2)) => { + panic!("timed out waiting for engine to stop after voter exit"); + } + } + }); + } + fn build_notarization>( schemes: &[S], proposal: &Proposal, diff --git a/consensus/src/simplex/engine.rs b/consensus/src/simplex/engine.rs index 1b609032908..22edfd73b58 100644 --- a/consensus/src/simplex/engine.rs +++ b/consensus/src/simplex/engine.rs @@ -225,20 +225,20 @@ impl< certificate_sender, ); - // Wait for the resolver or voter to finish + // If any task completes, the engine should stop let mut shutdown = self.context.stopped(); select! 
{ _ = &mut shutdown => { debug!("context shutdown, stopping engine"); }, - _ = &mut voter_task => { - panic!("voter should not finish"); + voter = &mut voter_task => { + debug!(?voter, "voter stopped, shutting down engine"); }, - _ = &mut batcher_task => { - panic!("batcher should not finish"); + batcher = &mut batcher_task => { + debug!(?batcher, "batcher stopped, shutting down engine"); }, - _ = &mut resolver_task => { - panic!("resolver should not finish"); + resolver = &mut resolver_task => { + debug!(?resolver, "resolver stopped, shutting down engine"); }, } } diff --git a/p2p/src/authenticated/discovery/network.rs b/p2p/src/authenticated/discovery/network.rs index 69a3cef9c43..d8408b794c8 100644 --- a/p2p/src/authenticated/discovery/network.rs +++ b/p2p/src/authenticated/discovery/network.rs @@ -201,26 +201,26 @@ impl< let mut shutdown = self.context.stopped(); - // Wait for first actor to exit + // If any task completes, the network should stop info!("network started"); select! { _ = &mut shutdown => { debug!("context shutdown, stopping network"); }, tracker = &mut tracker_task => { - panic!("tracker exited unexpectedly: {tracker:?}"); + debug!(?tracker, "tracker stopped, shutting down network"); }, router = &mut router_task => { - panic!("router exited unexpectedly: {router:?}"); + debug!(?router, "router stopped, shutting down network"); }, spawner = &mut spawner_task => { - panic!("spawner exited unexpectedly: {spawner:?}"); + debug!(?spawner, "spawner stopped, shutting down network"); }, listener = &mut listener_task => { - panic!("listener exited unexpectedly: {listener:?}"); + debug!(?listener, "listener stopped, shutting down network"); }, dialer = &mut dialer_task => { - panic!("dialer exited unexpectedly: {dialer:?}"); + debug!(?dialer, "dialer stopped, shutting down network"); }, } } diff --git a/p2p/src/authenticated/lookup/network.rs b/p2p/src/authenticated/lookup/network.rs index ddb54c633a4..79a78200c15 100644 --- 
a/p2p/src/authenticated/lookup/network.rs +++ b/p2p/src/authenticated/lookup/network.rs @@ -194,26 +194,26 @@ impl< let mut shutdown = self.context.stopped(); - // Wait for first actor to exit + // If any task completes, the network should stop info!("network started"); select! { _ = &mut shutdown => { debug!("context shutdown, stopping network"); }, tracker = &mut tracker_task => { - panic!("tracker exited unexpectedly: {tracker:?}"); + debug!(?tracker, "tracker stopped, shutting down network"); }, router = &mut router_task => { - panic!("router exited unexpectedly: {router:?}"); + debug!(?router, "router stopped, shutting down network"); }, spawner = &mut spawner_task => { - panic!("spawner exited unexpectedly: {spawner:?}"); + debug!(?spawner, "spawner stopped, shutting down network"); }, listener = &mut listener_task => { - panic!("listener exited unexpectedly: {listener:?}"); + debug!(?listener, "listener stopped, shutting down network"); }, dialer = &mut dialer_task => { - panic!("dialer exited unexpectedly: {dialer:?}"); + debug!(?dialer, "dialer stopped, shutting down network"); }, } } diff --git a/storage/src/archive/immutable/mod.rs b/storage/src/archive/immutable/mod.rs index 6db7410f25f..d64090f8da4 100644 --- a/storage/src/archive/immutable/mod.rs +++ b/storage/src/archive/immutable/mod.rs @@ -5,9 +5,9 @@ //! //! # Uniqueness //! -//! [Archive] assumes all stored indexes and keys are unique. If the same key is associated with -//! multiple `indices`, there is no guarantee which value will be returned. If the key is written to -//! an existing `index`, [Archive] will return an error. +//! [Archive] assumes all stored indices are unique. Writing to an occupied index is a no-op. +//! If the same key is associated with multiple indices, there is no guarantee which value will +//! be returned. //! //! # Compression //! 
diff --git a/storage/src/archive/mod.rs b/storage/src/archive/mod.rs index 8955af09c5b..dc290b1ffb3 100644 --- a/storage/src/archive/mod.rs +++ b/storage/src/archive/mod.rs @@ -1,7 +1,10 @@ //! A write-once key-value store for ordered data. //! -//! [Archive] is a key-value store designed for workloads where all data is written only once and is -//! uniquely associated with both an `index` and a `key`. +//! [Archive] is a key-value store designed for workloads where data is written only once and each +//! item is addressed by both an `index` and a `key`. Workloads with unique indices should use [Archive] +//! and workloads with overlapping indices should use [MultiArchive] (allows all items with the same index +//! to be retrieved). The same key may be stored at multiple indices in either case, and a key lookup may +//! return any of the associated values. use commonware_codec::Codec; use commonware_utils::Array; @@ -39,7 +42,7 @@ pub enum Error { RecordTooLarge, } -/// A write-once key-value store where each key is associated with a unique index. +/// A write-once key-value store addressed by both an index and a key. pub trait Archive: Send { /// The type of the key. type Key: Array; @@ -47,10 +50,12 @@ pub trait Archive: Send { /// The type of the value. type Value: Codec + Send; - /// Store an item in [Archive]. Both indices and keys are assumed to both be globally unique. + /// Store an item in [Archive]. /// - /// If the index already exists, put does nothing and returns. If the same key is stored multiple times - /// at different indices (not recommended), any value associated with the key may be returned. + /// Indices are unique: if the index already exists, put does nothing and returns. Duplicate + /// indices can be stored via [MultiArchive::put_multi]. 
Keys need not be unique — the same key + /// may be stored at multiple indices, and a subsequent [Archive::get] or [Archive::has] call + /// with an [Identifier::Key] identifier may return any of the values associated with that key. fn put( &mut self, index: u64, @@ -122,8 +127,7 @@ pub trait Archive: Send { /// /// Unlike [Archive::put], which is a no-op when the index already exists, /// [MultiArchive::put_multi] allows storing additional `(key, value)` pairs -/// at an existing index. As with [Archive::put], keys are assumed to be globally -/// unique, but duplicate keys are not rejected. +/// at an existing index. pub trait MultiArchive: Archive { /// Retrieve all values stored at the given index. /// @@ -383,6 +387,74 @@ mod tests { }); } + async fn test_duplicate_key_cross_index_impl( + mut archive: impl Archive, Value = i32>, + ) { + // Store the same key at two different indices; distinct values only so + // the test can observe which entry wins a key lookup. + let key = test_key("dupe-xindex"); + archive.put(2, key.clone(), 20).await.expect("put(2)"); + archive.put(5, key.clone(), 50).await.expect("put(5)"); + + // Both indices must resolve individually. + assert_eq!( + archive.get(Identifier::Index(2)).await.unwrap(), + Some(20), + "Index(2) must resolve to the value stored at 2" + ); + assert_eq!( + archive.get(Identifier::Index(5)).await.unwrap(), + Some(50), + "Index(5) must resolve to the value stored at 5" + ); + + // Key lookup may return either value per the contract; just assert it + // returns one of them and that `has` reports presence. 
+ let got = archive + .get(Identifier::Key(&key)) + .await + .unwrap() + .expect("key lookup must find at least one entry"); + assert!(got == 20 || got == 50, "unexpected value: {got}"); + assert!(archive.has(Identifier::Key(&key)).await.unwrap()); + } + + #[test_traced] + fn test_duplicate_key_cross_index_prunable_no_compression() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let archive = create_prunable(context, None).await; + test_duplicate_key_cross_index_impl(archive).await; + }); + } + + #[test_traced] + fn test_duplicate_key_cross_index_prunable_compression() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let archive = create_prunable(context, Some(3)).await; + test_duplicate_key_cross_index_impl(archive).await; + }); + } + + #[test_traced] + fn test_duplicate_key_cross_index_immutable_no_compression() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let archive = create_immutable(context, None).await; + test_duplicate_key_cross_index_impl(archive).await; + }); + } + + #[test_traced] + fn test_duplicate_key_cross_index_immutable_compression() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let archive = create_immutable(context, Some(3)).await; + test_duplicate_key_cross_index_impl(archive).await; + }); + } + #[test_traced] fn test_duplicate_key_immutable_compression() { let executor = deterministic::Runner::default(); diff --git a/storage/src/archive/prunable/mod.rs b/storage/src/archive/prunable/mod.rs index 1025ab66735..6e2c01465e2 100644 --- a/storage/src/archive/prunable/mod.rs +++ b/storage/src/archive/prunable/mod.rs @@ -30,9 +30,13 @@ //! //! # Uniqueness //! -//! [Archive] assumes all stored indexes and keys are unique. If the same key is associated with -//! multiple `indices`, there is no guarantee which value will be returned. If the key is written to -//! 
an existing `index`, [Archive] will return an error. +//! Indices are unique for [Archive] and writing to an occupied index is a no-op. Duplicate +//! indices can be stored via [`crate::archive::MultiArchive::put_multi`]. +//! +//! Keys may be stored at multiple indices with either put variant. A lookup by +//! [`crate::archive::Identifier::Key`] may return any of the values at that key. Entries +//! whose index has been pruned are never returned or reported as present, so a key matching +//! both a pruned and a non-pruned entry resolves to the non-pruned entry. //! //! ## Conflicts //! @@ -681,6 +685,63 @@ mod tests { assert_eq!(state1, state2); } + /// Regression: when the same key is stored at multiple indices and the + /// earlier index is pruned, a subsequent `get`/`has` by key must resolve + /// to the surviving, non-pruned entry rather than report the pruned one. + /// Callers such as consensus's marshal cache rely on this to retain a + /// reproposal of the same block at a later index even after the + /// earlier index's retention window closes. + #[test_traced] + fn test_archive_key_lookup_skips_pruned_duplicates() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = Config { + translator: FourCap, + key_partition: "test-index".into(), + key_page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: "test-value".into(), + codec_config: (), + compression: None, + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER), + items_per_section: NZU64!(1), + }; + let mut archive = Archive::init(context.clone(), cfg) + .await + .expect("Failed to initialize archive"); + + // Same key stored at two different indices. Distinct values only + // to make it observable which entry wins; a real caller would + // store the same value (e.g. the same block) at both indices. 
+ let key = test_key("dupe-key"); + archive.put(2, key.clone(), 20).await.unwrap(); + archive.put(5, key.clone(), 50).await.unwrap(); + + // Before pruning, either entry is a permitted answer per the + // trait contract. The implementation happens to return the + // earlier index, but we only assert a value is present. + assert!(archive.get(Identifier::Key(&key)).await.unwrap().is_some()); + assert!(archive.has(Identifier::Key(&key)).await.unwrap()); + + // Prune the earlier index (section 2). The later index must be + // the sole surviving answer. + archive.prune(3).await.unwrap(); + let got = archive.get(Identifier::Key(&key)).await.unwrap(); + assert_eq!( + got, + Some(50), + "key lookup must skip the pruned entry and return the surviving one" + ); + assert!(archive.has(Identifier::Key(&key)).await.unwrap()); + + // Prune past the later index too — now nothing survives. + archive.prune(6).await.unwrap(); + assert_eq!(archive.get(Identifier::Key(&key)).await.unwrap(), None); + assert!(!archive.has(Identifier::Key(&key)).await.unwrap()); + }); + } + #[test_traced] fn test_get_all_after_prune() { let executor = deterministic::Runner::default(); From c17539bd3d983d58c586d751d6fe9ad466ddde0e Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Sun, 19 Apr 2026 22:39:22 -0700 Subject: [PATCH 100/107] nit --- consensus/src/marshal/standard/inline.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 87be653eea8..7e4da2c2bf8 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -914,7 +914,7 @@ mod tests { .expect("verify result missing"); assert!(verify_result, "inline verify should pass"); - // CRITICAL: abort the marshal actor synchronously, with no + // Abort the marshal actor synchronously, with no // intervening await. 
If verify returned true but the actor had // only enqueued (not processed) the `Verified` message, this // abort kills the actor before persistence completes. From 28ee529f5753ffd0093dedf9f24ca187d8011a5a Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Sun, 19 Apr 2026 22:40:11 -0700 Subject: [PATCH 101/107] move import --- consensus/src/marshal/standard/inline.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 7e4da2c2bf8..e7df6194e41 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -559,6 +559,7 @@ mod tests { types::{Epoch, FixedEpocher, Height, Round, View}, Automaton, Block, CertifiableAutomaton, Relay, VerifyingApplication, }; + use commonware_broadcast::Broadcaster; use commonware_cryptography::{ certificate::{mocks::Fixture, ConstantProvider, Scheme}, sha256::Sha256, @@ -840,7 +841,6 @@ mod tests { } fn inline_verify_persists_block_before_resolving_at(seed: u64) { - use commonware_broadcast::Broadcaster; let runner = deterministic::Runner::new( deterministic::Config::new() .with_seed(seed) @@ -954,7 +954,6 @@ mod tests { } fn inline_certify_persists_block_before_resolving_at(seed: u64) { - use commonware_broadcast::Broadcaster; let runner = deterministic::Runner::new( deterministic::Config::new() .with_seed(seed) @@ -1050,8 +1049,6 @@ mod tests { fn test_inline_certify_does_not_bypass_failed_verify_persistence() { let runner = deterministic::Runner::timed(Duration::from_secs(30)); runner.start(|mut context| async move { - use commonware_broadcast::Broadcaster; - let Fixture { participants, schemes, From eca4cdca3e65e5201b5d1c2bad4a8af994707892 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Sun, 19 Apr 2026 23:20:53 -0700 Subject: [PATCH 102/107] [consensus] Final Certify Nits (#3629) --- consensus/src/lib.rs | 13 +++---------- consensus/src/marshal/core/actor.rs | 8 ++++++++ 
consensus/src/marshal/mocks/harness.rs | 2 +- consensus/src/marshal/standard/inline.rs | 16 +++++++++++++--- storage/src/archive/mod.rs | 2 +- 5 files changed, 26 insertions(+), 15 deletions(-) diff --git a/consensus/src/lib.rs b/consensus/src/lib.rs index 0926c0cfd72..4f656a4a309 100644 --- a/consensus/src/lib.rs +++ b/consensus/src/lib.rs @@ -141,16 +141,9 @@ stability_scope!(BETA, cfg(not(target_arch = "wasm32")) { /// Determine whether a verified payload is safe to commit. /// /// The round parameter identifies which consensus round is being certified, allowing - /// applications to associate certification with the correct verification context. - /// - /// Note: In applications where payloads incorporate the round number (recommended), - /// each round will have a unique payload digest. However, the same payload may appear - /// in multiple rounds when re-proposing notarized blocks at epoch boundaries or in - /// integrations where payloads are round-agnostic. - /// - /// This is particularly useful for applications that employ erasure coding, which - /// can override this method to delay or prevent finalization until they have - /// reconstructed and validated the full block (e.g., after receiving enough shards). + /// applications to associate certification with the correct verification context. The + /// same payload may appear in multiple rounds, so implementations must key any state + /// on `(round, payload)` rather than `payload` alone. /// /// Like [`Automaton::verify`], payloads produced by [`Automaton::propose`] are certifiable-by-construction. 
/// Also like [`Automaton::verify`], certification is single-shot for the given diff --git a/consensus/src/marshal/core/actor.rs b/consensus/src/marshal/core/actor.rs index 7d1fa31f1d8..74169bd6f21 100644 --- a/consensus/src/marshal/core/actor.rs +++ b/consensus/src/marshal/core/actor.rs @@ -543,10 +543,18 @@ where buffer.send(round, block, Recipients::Some(peers)).await; } Message::Verified { round, block, ack } => { + // If the round has already been pruned by tip advancement, + // `cache_verified` is a no-op because the round is below + // the retention floor (and no longer is required by consensus + // to make progress). self.cache_verified(round, block.digest(), block).await; ack.send_lossy(()); } Message::Certified { round, block, ack } => { + // If the round has already been pruned by tip advancement, + // `cache_block` is a no-op because the round is below + // the retention floor (and no longer is required by consensus + // to make progress). self.cache_block(round, block.digest(), block).await; ack.send_lossy(()); } diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index 5480677bc42..ac1c60aa6b9 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -1011,7 +1011,7 @@ pub fn certify_at_later_view_survives_earlier_view_pruning() { // past V=1's retention boundary but not past V=25's. With // view_retention_timeout=10 and prunable_items_per_section=10, // processing views 1..=21 leaves `oldest_allowed=10` in both prunable - // archives — V=1 is dropped, V=25 is retained. + // archives. V=1 is dropped, V=25 is retained. 
const CHAIN_LEN: u64 = 21; let mut parent = Sha256::hash(b""); let mut parent_commitment = H::genesis_parent_commitment(NUM_VALIDATORS as u16); diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index e7df6194e41..e0c934cf961 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -442,7 +442,17 @@ where ES: Epocher, { async fn certify(&mut self, round: Round, digest: Self::Digest) -> oneshot::Receiver { - // If block was already seen, return immediately. + // Verify has already run for this (round, digest) and its + // success was recorded in `available_blocks`. `verify` does not mark a + // round available until `marshal.verified(round, block)` has returned, + // and that call blocks on `put_sync` of the block into the round's + // verified cache. Because the verified and notarized caches share the + // same pruning schedule (both advance together to `min_view`), the + // block is already durable for this round and re-persisting it into + // the notarized cache would be a redundant `put_sync`. The slow path + // below persists through the notarized cache because in that case + // verify has not run locally and the block may be held only in the + // broadcast buffer, which is not durable. 
if self.available_blocks.lock().contains(&(round, digest)) { let (tx, rx) = oneshot::channel(); tx.send_lossy(true); @@ -938,7 +948,7 @@ mod tests { let post_restart = marshal2.get_block(&child_digest).await; assert!( post_restart.is_some(), - "verify resolved true ⟹ block must be durably persisted (seed={seed})" + "verify resolved true so block must be durably persisted (seed={seed})" ); }); } @@ -1040,7 +1050,7 @@ mod tests { let post_restart = marshal2.get_block(&child_digest).await; assert!( post_restart.is_some(), - "certify resolved true ⟹ block must be durably persisted (seed={seed})" + "certify resolved true so block must be durably persisted (seed={seed})" ); }); } diff --git a/storage/src/archive/mod.rs b/storage/src/archive/mod.rs index dc290b1ffb3..03e4e7caea0 100644 --- a/storage/src/archive/mod.rs +++ b/storage/src/archive/mod.rs @@ -53,7 +53,7 @@ pub trait Archive: Send { /// Store an item in [Archive]. /// /// Indices are unique: if the index already exists, put does nothing and returns. Duplicate - /// indices can be stored via [MultiArchive::put_multi]. Keys need not be unique — the same key + /// indices can be stored via [MultiArchive::put_multi]. Keys need not be unique: the same key /// may be stored at multiple indices, and a subsequent [Archive::get] or [Archive::has] call /// with an [Identifier::Key] identifier may return any of the values associated with that key. 
fn put( From d2b918e98e23220d99526f50c7ae4381901f817a Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 20 Apr 2026 00:27:16 -0700 Subject: [PATCH 103/107] add verified check on propose --- consensus/src/marshal/coding/marshaled.rs | 22 +++++ consensus/src/marshal/coding/mod.rs | 95 +++++++++++++++++++ consensus/src/marshal/core/actor.rs | 8 ++ consensus/src/marshal/core/cache.rs | 10 ++ consensus/src/marshal/core/mailbox.rs | 15 +++ consensus/src/marshal/standard/deferred.rs | 91 +++++++++++++++++- consensus/src/marshal/standard/inline.rs | 104 ++++++++++++++++++++- 7 files changed, 342 insertions(+), 3 deletions(-) diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index dcd0018d711..48a7af37da5 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -529,6 +529,28 @@ where .with_label("propose") .with_attribute("round", consensus_context.round) .spawn(move |runtime_context| async move { + // On leader recovery, marshal may already hold a verified + // block for this round (persisted before voting in consensus). Building + // a fresh block would land on the same view index in the + // prunable archive and be silently dropped, so reuse the + // stored block instead. 
+ if let Some(block) = marshal.get_verified(consensus_context.round).await { + let commitment = block.commitment(); + let round = consensus_context.round; + { + let mut lock = last_built.lock(); + *lock = Some((round, block)); + } + let success = tx.send_lossy(commitment); + debug!( + ?round, + ?commitment, + success, + "reused verified block from marshal on leader recovery" + ); + return; + } + let (parent_view, parent_commitment) = consensus_context.parent; let parent_request = fetch_parent( parent_commitment, diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index e69e8f23d8b..f9c7ffaa68c 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -1982,4 +1982,99 @@ mod tests { ); }); } + + /// Regression: if marshal already holds a verified block for a round + /// (say, persisted by a pre-crash propose whose notarize vote never + /// reached the journal), a restarted leader's `propose` must return + /// that block's commitment instead of rebuilding. Otherwise the + /// new block lands on the same view index in the prunable archive, + /// gets silently dropped (`skip_if_index_exists=true`), and the + /// leader's notarize targets a commitment no peer can serve. + #[test_traced("WARN")] + fn test_propose_reuses_verified_block_on_restart() { + let runner = deterministic::Runner::timed(Duration::from_secs(60)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let coding_config = coding_config_for_participants(NUM_VALIDATORS as u16); + + let setup = CodingHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let shards = setup.extra; + + let genesis_ctx = CodingCtx { + round: Round::zero(), + leader: default_leader(), + parent: (View::zero(), genesis_commitment()), + }; + let genesis = make_coding_block(genesis_ctx, Sha256::hash(b""), Height::zero(), 0); + let genesis_parent_commitment = genesis_coding_commitment::(&genesis); + + let round = Round::new(Epoch::zero(), View::new(1)); + let ctx = CodingCtx { + round, + leader: me.clone(), + parent: (View::zero(), genesis_parent_commitment), + }; + + // Seed block A in marshal's verified cache for `round`. + let block_a = make_coding_block(ctx.clone(), genesis.digest(), Height::new(1), 100); + let coded_a: CodedBlock<_, ReedSolomon, Sha256> = + CodedBlock::new(block_a.clone(), coding_config, &Sequential); + let commitment_a = coded_a.commitment(); + assert!(marshal.proposed(round, coded_a).await); + + // After restart, a fresh application would build a different + // block for the same round. 
+ let block_b = make_coding_block(ctx.clone(), genesis.digest(), Height::new(1), 200); + let coded_b: CodedBlock<_, ReedSolomon, Sha256> = + CodedBlock::new(block_b.clone(), coding_config, &Sequential); + let commitment_b = coded_b.commitment(); + assert_ne!( + commitment_a, commitment_b, + "test requires distinct commitments" + ); + + let mock_app: MockVerifyingApp = + MockVerifyingApp::new(genesis).with_propose_result(block_b); + let cfg = MarshaledConfig { + application: mock_app, + marshal: marshal.clone(), + shards: shards.clone(), + scheme_provider: ConstantProvider::new(schemes[0].clone()), + epocher: FixedEpocher::new(BLOCKS_PER_EPOCH), + strategy: Sequential, + }; + let mut marshaled = Marshaled::new(context.clone(), cfg); + + let commitment = marshaled + .propose(ctx) + .await + .await + .expect("propose must return a commitment"); + assert_eq!( + commitment, commitment_a, + "propose must reuse the block marshal already persisted for this round" + ); + + assert!( + marshaled.broadcast(commitment_a, Plan::Propose).await, + "relay broadcast must succeed after re-propose" + ); + }); + } } diff --git a/consensus/src/marshal/core/actor.rs b/consensus/src/marshal/core/actor.rs index 74169bd6f21..f6c6d168751 100644 --- a/consensus/src/marshal/core/actor.rs +++ b/consensus/src/marshal/core/actor.rs @@ -521,6 +521,14 @@ where }; response.send_lossy(info); } + Message::GetVerified { round, response } => { + let block = self + .cache + .get_verified(round) + .await + .map(Into::into); + response.send_lossy(block); + } Message::Proposed { round, block, ack } => { self.cache_verified(round, block.digest(), block.clone()) .await; diff --git a/consensus/src/marshal/core/cache.rs b/consensus/src/marshal/core/cache.rs index 9124d5b04f5..3d668dfcedc 100644 --- a/consensus/src/marshal/core/cache.rs +++ b/consensus/src/marshal/core/cache.rs @@ -371,6 +371,16 @@ where .expect("failed to get notarization") } + /// Get the block previously persisted in the verified archive for 
`round`. + pub(crate) async fn get_verified(&self, round: Round) -> Option { + let cache = self.caches.get(&round.epoch())?; + cache + .verified_blocks + .get(Identifier::Index(round.view().get())) + .await + .expect("failed to get verified block") + } + /// Get a finalization from the prunable archive by block digest. /// /// SAFETY: For blocks/certificates admitted by marshal verification, a block digest diff --git a/consensus/src/marshal/core/mailbox.rs b/consensus/src/marshal/core/mailbox.rs index 3dd41b1cdb8..c61107183f3 100644 --- a/consensus/src/marshal/core/mailbox.rs +++ b/consensus/src/marshal/core/mailbox.rs @@ -85,6 +85,13 @@ pub(crate) enum Message { /// A channel to send the retrieved block. response: oneshot::Sender, }, + /// A request to retrieve the verified block previously persisted for `round`. + GetVerified { + /// The round to query. + round: Round, + /// A channel to send the retrieved block, if any. + response: oneshot::Sender>, + }, /// A request to broadcast a proposed block to peers. Proposed { /// The round in which the block was proposed. @@ -296,6 +303,14 @@ impl Mailbox { .map(|block| AncestorStream::new(self.clone(), [V::into_inner(block)])) } + /// Returns the verified block previously persisted for `round`, if any. + pub async fn get_verified(&self, round: Round) -> Option { + self.sender + .request(|response| Message::GetVerified { round, response }) + .await + .flatten() + } + /// Requests that a proposed block is sent to peers, awaiting the actor's /// confirmation that the block has been durably persisted before returning. 
#[must_use = "callers must consider block durability before proceeding"] diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index 396a86a0f19..17740019523 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -312,6 +312,28 @@ where .with_label("propose") .with_attribute("round", consensus_context.round) .spawn(move |runtime_context| async move { + // On leader recovery, marshal may already hold a verified + // block for this round (persisted by a pre-crash propose + // whose notarize vote never reached the journal). Building + // a fresh block would land on the same view index in the + // prunable archive and be silently dropped, so reuse the + // stored block instead. + if let Some(block) = marshal.get_verified(consensus_context.round).await { + let digest = block.digest(); + { + let mut lock = last_built.lock(); + *lock = Some((consensus_context.round, block)); + } + let success = tx.send_lossy(digest); + debug!( + round = ?consensus_context.round, + ?digest, + success, + "reused verified block from marshal on leader recovery" + ); + return; + } + let (parent_view, parent_digest) = consensus_context.parent; let parent_request = fetch_parent( parent_digest, @@ -683,9 +705,9 @@ mod tests { }, verifying::{GatedVerifyingApp, MockVerifyingApp}, }, - simplex::scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, + simplex::{scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, Plan}, types::{Epoch, Epocher, FixedEpocher, Height, Round, View}, - Automaton, CertifiableAutomaton, + Automaton, CertifiableAutomaton, Relay, }; use commonware_broadcast::Broadcaster; use commonware_cryptography::{ @@ -1101,4 +1123,69 @@ mod tests { } }); } + + /// Regression: when marshal holds a verified block for a round from a + /// pre-crash propose, a restarted leader's `propose` must return that + /// block's digest instead of asking the application to build afresh. 
+ /// See `standard::inline::tests::test_propose_reuses_verified_block_on_restart`. + #[test_traced("WARN")] + fn test_propose_reuses_verified_block_on_restart() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + let round = Round::new(Epoch::zero(), View::new(1)); + let ctx = Ctx { + round, + leader: me.clone(), + parent: (View::zero(), genesis.digest()), + }; + let block_a = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 100); + let digest_a = block_a.digest(); + assert!(marshal.proposed(round, block_a.clone()).await); + + let block_b = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 200); + let digest_b = block_b.digest(); + assert_ne!(digest_a, digest_b, "test requires distinct digests"); + + let mock_app: MockVerifyingApp = + MockVerifyingApp::new(genesis.clone()).with_propose_result(block_b); + let mut marshaled = Deferred::new( + context.clone(), + mock_app, + marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + let digest_rx = marshaled.propose(ctx).await; + let digest = digest_rx.await.expect("propose must return a digest"); + assert_eq!( + digest, digest_a, + "propose must reuse the block marshal already persisted for this round" + ); + + assert!( + marshaled.broadcast(digest_a, Plan::Propose).await, + "relay broadcast must succeed after re-propose" + ); + }); + } } diff --git 
a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index e0c934cf961..1be98c03680 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -244,6 +244,28 @@ where .with_label("propose") .with_attribute("round", consensus_context.round) .spawn(move |runtime_context| async move { + // On leader recovery, marshal may already hold a verified + // block for this round (persisted by a pre-crash propose + // whose notarize vote never reached the journal). Building + // a fresh block would land on the same view index in the + // prunable archive and be silently dropped, so reuse the + // stored block instead. + if let Some(block) = marshal.get_verified(consensus_context.round).await { + let digest = block.digest(); + { + let mut lock = last_built.lock(); + *lock = Some((consensus_context.round, block)); + } + let success = tx.send_lossy(digest); + debug!( + round = ?consensus_context.round, + ?digest, + success, + "reused verified block from marshal on leader recovery" + ); + return; + } + let (parent_view, parent_digest) = consensus_context.parent; let parent_request = fetch_parent( parent_digest, @@ -565,7 +587,9 @@ mod tests { }, verifying::{GatedVerifyingApp, MockVerifyingApp}, }, - simplex::{scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::Context}, + simplex::{ + scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::Context, Plan, + }, types::{Epoch, FixedEpocher, Height, Round, View}, Automaton, Block, CertifiableAutomaton, Relay, VerifyingApplication, }; @@ -1166,4 +1190,82 @@ mod tests { ); }); } + + /// Regression: if marshal persisted a verified block for a round before + /// a crash (say, via a prior `Relay::broadcast(Plan::Propose)` landing on + /// the verified cache) but the simplex notarize artifact never reached + /// the journal, a restarted leader must re-use the persisted block. 
+ /// + /// Otherwise the application is asked to build afresh, returns a new + /// block whose digest does not match the one marshal already stored + /// (the prunable archive silently drops the second write at the same + /// view index), the leader broadcasts a `Notarize` for a digest no peer + /// can serve, and the view stalls until timeout. + #[test_traced("WARN")] + fn test_propose_reuses_verified_block_on_restart() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + + // Seed block A for round V=1 in marshal's verified cache as if + // the pre-crash leader had built and broadcasted it. + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + let round = Round::new(Epoch::zero(), View::new(1)); + let ctx = Ctx { + round, + leader: me.clone(), + parent: (View::zero(), genesis.digest()), + }; + let block_a = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 100); + let digest_a = block_a.digest(); + assert!(marshal.proposed(round, block_a.clone()).await); + + // After restart, the fresh application would build a different + // block for the same round (distinct timestamp -> distinct digest). 
+ let block_b = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 200); + let digest_b = block_b.digest(); + assert_ne!(digest_a, digest_b, "test requires distinct digests"); + + let mock_app: MockVerifyingApp = + MockVerifyingApp::new(genesis.clone()).with_propose_result(block_b); + let mut inline = Inline::new( + context.clone(), + mock_app, + marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + let digest_rx = inline.propose(ctx).await; + let digest = digest_rx.await.expect("propose must return a digest"); + assert_eq!( + digest, digest_a, + "propose must reuse the block marshal already persisted for this round" + ); + + // After the automaton hands the digest to the voter, the voter + // calls Relay::broadcast(Plan::Propose). That call must succeed so + // the leader actually votes instead of bailing out. + assert!( + inline.broadcast(digest_a, Plan::Propose).await, + "relay broadcast must succeed after re-propose" + ); + }); + } } From 826c50304d42dceb2796b4008bd7532b01bf2101 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 20 Apr 2026 02:12:59 -0700 Subject: [PATCH 104/107] [consensus] Decouple Propose Durability from Broadcast (#3630) --- consensus/src/lib.rs | 8 +- .../src/marshal/application/validation.rs | 6 - consensus/src/marshal/coding/marshaled.rs | 79 ++--- consensus/src/marshal/coding/mod.rs | 71 ++-- consensus/src/marshal/coding/variant.rs | 1 + consensus/src/marshal/core/actor.rs | 12 +- consensus/src/marshal/core/mailbox.rs | 37 +-- consensus/src/marshal/mocks/harness.rs | 16 +- consensus/src/marshal/standard/deferred.rs | 105 +++--- consensus/src/marshal/standard/inline.rs | 107 +++---- consensus/src/marshal/standard/mod.rs | 165 ++-------- .../src/ordered_broadcast/mocks/automaton.rs | 3 +- consensus/src/simplex/actors/batcher/actor.rs | 4 +- consensus/src/simplex/actors/batcher/mod.rs | 9 +- consensus/src/simplex/actors/voter/actor.rs | 17 +- consensus/src/simplex/actors/voter/mod.rs | 302 +----------------- 
consensus/src/simplex/mocks/application.rs | 4 +- consensus/src/simplex/mod.rs | 95 +++--- examples/bridge/src/application/ingress.rs | 3 +- examples/log/src/application/ingress.rs | 3 +- 20 files changed, 245 insertions(+), 802 deletions(-) diff --git a/consensus/src/lib.rs b/consensus/src/lib.rs index 4f656a4a309..3bb9663c7e4 100644 --- a/consensus/src/lib.rs +++ b/consensus/src/lib.rs @@ -194,16 +194,12 @@ stability_scope!(BETA, cfg(not(target_arch = "wasm32")) { /// treat every broadcast identically can set this to `()`. type Plan: Send; - /// Broadcast a payload to the given recipients. - /// - /// Returns `true` when the relay accepted the payload for the requested - /// broadcast plan. Returns `false` when the relay could not complete the - /// handoff. + /// Broadcast a payload according to the given plan. fn broadcast( &mut self, payload: Self::Digest, plan: Self::Plan, - ) -> impl Future + Send; + ) -> impl Future + Send; } /// Reporter is the interface responsible for reporting activity to some external actor. diff --git a/consensus/src/marshal/application/validation.rs b/consensus/src/marshal/application/validation.rs index e2e8c5e2b07..82b2697da6c 100644 --- a/consensus/src/marshal/application/validation.rs +++ b/consensus/src/marshal/application/validation.rs @@ -8,12 +8,6 @@ use crate::{ types::{Epoch, Epocher, Height, Round}, }; use commonware_cryptography::certificate::Scheme; -use commonware_utils::sync::Mutex; -use std::sync::Arc; - -/// Cache for the last block built during proposal, shared between the -/// proposer task and the broadcast path. -pub(crate) type LastBuilt = Arc>>; /// Which stage of verification a block has reached. 
/// diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index 48a7af37da5..4db0ddf6823 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -82,9 +82,7 @@ use crate::{ marshal::{ ancestry::AncestorStream, application::{ - validation::{ - is_inferred_reproposal_at_certify, is_valid_reproposal_at_verify, LastBuilt, Stage, - }, + validation::{is_inferred_reproposal_at_certify, is_valid_reproposal_at_verify, Stage}, verification_tasks::VerificationTasks, }, coding::{ @@ -106,6 +104,7 @@ use commonware_cryptography::{ Committable, Digestible, Hasher, }; use commonware_macros::select; +use commonware_p2p::Recipients; use commonware_parallel::Strategy; use commonware_runtime::{ telemetry::metrics::histogram::{Buckets, Timed}, @@ -116,7 +115,6 @@ use commonware_utils::{ fallible::OneshotExt, oneshot::{self, error::RecvError}, }, - sync::Mutex, NZU16, }; use futures::future::{ready, try_join, Either, Ready}; @@ -183,7 +181,6 @@ where scheme_provider: Z, epocher: ES, strategy: S, - last_built: LastBuilt>, verification_tasks: VerificationTasks, cached_genesis: Arc)>>, @@ -266,7 +263,6 @@ where scheme_provider, strategy, epocher, - last_built: Arc::new(Mutex::new(None)), verification_tasks: VerificationTasks::new(), cached_genesis: Arc::new(OnceLock::new()), @@ -491,15 +487,15 @@ where /// boundary block to avoid creating blocks that would be invalidated by the epoch transition. /// /// The proposal operation is spawned in a background task and returns a receiver that will - /// contain the proposed block's digest when ready. The built block is cached for later - /// broadcasting. + /// contain the proposed block's commitment when ready. The built block is persisted via + /// [`core::Mailbox::verified`] before the commitment is delivered, so consensus can rely + /// on the block surviving restart. 
async fn propose( &mut self, consensus_context: Context::PublicKey>, ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); - let last_built = self.last_built.clone(); let epocher = self.epocher.clone(); let strategy = self.strategy.clone(); let cached_genesis = self.cached_genesis.clone(); @@ -537,10 +533,6 @@ where if let Some(block) = marshal.get_verified(consensus_context.round).await { let commitment = block.commitment(); let round = consensus_context.round; - { - let mut lock = last_built.lock(); - *lock = Some((round, block)); - } let success = tx.send_lossy(commitment); debug!( ?round, @@ -593,11 +585,14 @@ where if parent.height() == last_in_epoch { let commitment = parent.commitment(); let round = consensus_context.round; - { - let mut lock = last_built.lock(); - *lock = Some((round, parent)); + if !marshal.verified(round, parent).await { + debug!( + ?round, + ?commitment, + "marshal rejected re-proposed boundary block" + ); + return; } - let success = tx.send_lossy(commitment); debug!( ?round, @@ -643,11 +638,10 @@ where let commitment = coded_block.commitment(); let round = consensus_context.round; - { - let mut lock = last_built.lock(); - *lock = Some((round, coded_block)); + if !marshal.verified(round, coded_block).await { + debug!(?round, ?commitment, "marshal rejected proposed block"); + return; } - let success = tx.send_lossy(commitment); debug!(?round, ?commitment, success, "proposed new block"); }); @@ -973,38 +967,17 @@ where type PublicKey = ::PublicKey; type Plan = Plan; - async fn broadcast(&mut self, commitment: Self::Digest, plan: Self::Plan) -> bool { - match plan { - Plan::Propose => { - let Some((round, block)) = self.last_built.lock().take() else { - warn!(?commitment, "missing block to broadcast"); - return false; - }; - if block.commitment() != commitment { - warn!( - round = %round, - commitment = %block.commitment(), - height = %block.height(), - "skipping requested broadcast 
of block with mismatched commitment" - ); - return false; - }; - let height = block.height(); - if !self.marshal.proposed(round, block).await { - warn!(?round, ?commitment, %height, "marshal unable to accept block"); - return false; - } - debug!(?round, ?commitment, %height, "requested broadcast of built block"); - true - } - Plan::Forward { .. } => { - // Coding variant does not support targeted forwarding; - // peers reconstruct blocks from erasure-coded shards. - // - // TODO(#3389): Support checked data forwarding for PhasedScheme. - true - } - } + async fn broadcast(&mut self, commitment: Self::Digest, plan: Self::Plan) { + // Coding variant does not support targeted forwarding; + // peers reconstruct blocks from erasure-coded shards. + // + // TODO(#3389): Support checked data forwarding for PhasedScheme. + let Plan::Propose { round } = plan else { + return; + }; + self.marshal + .forward(round, commitment, Recipients::All) + .await; } } diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index f9c7ffaa68c..50b8741df16 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -80,11 +80,9 @@ mod tests { verifying::MockVerifyingApp, }, }, - simplex::{ - scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::Proposal, Plan, - }, + simplex::{scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::Proposal}, types::{coding::Commitment, Epoch, Epocher, FixedEpocher, Height, Round, View}, - Automaton, CertifiableAutomaton, Relay, + Automaton, CertifiableAutomaton, }; use commonware_codec::FixedSize; use commonware_coding::ReedSolomon; @@ -432,7 +430,6 @@ mod tests { let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); shards - .clone() .proposed(Round::new(Epoch::new(0), View::new(1)), coded_parent) .await; @@ -446,7 +443,7 @@ mod tests { let block_a = make_coding_block(context_a.clone(), 
parent_digest, Height::new(2), 200); let coded_block_a = CodedBlock::new(block_a.clone(), coding_config, &Sequential); let commitment_a = coded_block_a.commitment(); - shards.clone().proposed(round_a, coded_block_a).await; + shards.proposed(round_a, coded_block_a).await; // Block B at view 10 (height 2, different block same height - could happen with // different proposers or re-proposals) @@ -459,7 +456,7 @@ mod tests { let block_b = make_coding_block(context_b.clone(), parent_digest, Height::new(2), 300); let coded_block_b = CodedBlock::new(block_b.clone(), coding_config, &Sequential); let commitment_b = coded_block_b.commitment(); - shards.clone().proposed(round_b, coded_block_b).await; + shards.proposed(round_b, coded_block_b).await; context.sleep(Duration::from_millis(10)).await; @@ -558,7 +555,7 @@ mod tests { let block = make_coding_block(ctx.clone(), parent, Height::new(i), i * 100); let coded_block = CodedBlock::new(block.clone(), coding_config, &Sequential); last_commitment = coded_block.commitment(); - shards.clone().proposed(round, coded_block).await; + shards.proposed(round, coded_block).await; parent = block.digest(); last_view = View::new(i); } @@ -580,10 +577,7 @@ mod tests { let coded_boundary = CodedBlock::new(boundary_block.clone(), coding_config, &Sequential); let boundary_commitment = coded_boundary.commitment(); - shards - .clone() - .proposed(boundary_round, coded_boundary) - .await; + shards.proposed(boundary_round, coded_boundary).await; context.sleep(Duration::from_millis(10)).await; @@ -645,7 +639,6 @@ mod tests { // Make the non-boundary block available shards - .clone() .proposed(non_boundary_round, coded_non_boundary) .await; @@ -773,7 +766,6 @@ mod tests { let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); shards - .clone() .proposed(Round::new(Epoch::zero(), View::new(1)), coded_parent) .await; @@ -1134,7 +1126,6 @@ mod tests { let coded_parent = 
CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); shards - .clone() .proposed(Round::new(Epoch::zero(), View::new(19)), coded_parent) .await; @@ -1148,7 +1139,6 @@ mod tests { let coded_block = CodedBlock::new(block.clone(), coding_config, &Sequential); let block_commitment = coded_block.commitment(); shards - .clone() .proposed(Round::new(Epoch::new(1), View::new(20)), coded_block) .await; @@ -1248,7 +1238,6 @@ mod tests { let coded_parent = CodedBlock::new(honest_parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); shards - .clone() .proposed(Round::new(Epoch::new(1), View::new(21)), coded_parent) .await; @@ -1270,10 +1259,7 @@ mod tests { let coded_malicious = CodedBlock::new(malicious_block.clone(), coding_config, &Sequential); let malicious_commitment = coded_malicious.commitment(); - shards - .clone() - .proposed(byzantine_round, coded_malicious) - .await; + shards.proposed(byzantine_round, coded_malicious).await; // Small delay to ensure broadcast is processed context.sleep(Duration::from_millis(10)).await; @@ -1318,10 +1304,7 @@ mod tests { let coded_malicious2 = CodedBlock::new(malicious_block2.clone(), coding_config, &Sequential); let malicious_commitment2 = coded_malicious2.commitment(); - shards - .clone() - .proposed(byzantine_round2, coded_malicious2) - .await; + shards.proposed(byzantine_round2, coded_malicious2).await; // Small delay to ensure broadcast is processed context.sleep(Duration::from_millis(10)).await; @@ -1410,7 +1393,7 @@ mod tests { let parent = make_coding_block(parent_ctx, genesis.digest(), Height::new(1), 100); let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); - shards.clone().proposed(parent_round, coded_parent).await; + shards.proposed(parent_round, coded_parent).await; // Create child at height 2. 
let child_round = Round::new(Epoch::zero(), View::new(2)); @@ -1422,7 +1405,7 @@ mod tests { let child = make_coding_block(child_ctx, parent.digest(), Height::new(2), 200); let coded_child = CodedBlock::new(child, coding_config, &Sequential); let child_commitment = coded_child.commitment(); - shards.clone().proposed(child_round, coded_child).await; + shards.proposed(child_round, coded_child).await; context.sleep(Duration::from_millis(10)).await; @@ -1531,7 +1514,7 @@ mod tests { let parent = make_coding_block(parent_context, genesis.digest(), Height::new(1), 100); let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); - shards.clone().proposed(parent_round, coded_parent).await; + shards.proposed(parent_round, coded_parent).await; // 3) Publish a valid child so optimistic verify can succeed. let round = Round::new(Epoch::zero(), View::new(2)); @@ -1544,7 +1527,7 @@ mod tests { make_coding_block(verify_context.clone(), parent.digest(), Height::new(2), 200); let coded_block = CodedBlock::new(block, coding_config, &Sequential); let commitment = coded_block.commitment(); - shards.clone().proposed(round, coded_block).await; + shards.proposed(round, coded_block).await; context.sleep(Duration::from_millis(10)).await; @@ -1639,7 +1622,7 @@ mod tests { // Validator 1 proposes coded_block_b (same inner block, different coding). // This stores it in v1's shard engine and actor cache. - assert!(v1_mailbox.proposed(round1, coded_block_b.clone()).await); + assert!(v1_mailbox.verified(round1, coded_block_b.clone()).await); context.sleep(Duration::from_millis(100)).await; // Create finalization referencing commitment_a (the "correct" commitment). 
@@ -1799,7 +1782,7 @@ mod tests { let parent = make_coding_block(parent_ctx, genesis.digest(), Height::new(1), 100); let coded_parent = CodedBlock::new(parent.clone(), coding_config, &Sequential); let parent_commitment = coded_parent.commitment(); - shards.clone().proposed(parent_round, coded_parent).await; + shards.proposed(parent_round, coded_parent).await; let child_round = Round::new(Epoch::zero(), View::new(2)); let child_ctx = CodingCtx { @@ -1811,7 +1794,7 @@ mod tests { let coded_child = CodedBlock::new(child.clone(), coding_config, &Sequential); let child_commitment = coded_child.commitment(); let child_digest = coded_child.digest(); - shards.clone().proposed(child_round, coded_child).await; + shards.proposed(child_round, coded_child).await; context.sleep(Duration::from_millis(10)).await; @@ -1870,10 +1853,10 @@ mod tests { } /// Regression: a proposer must be able to recover its own block after a - /// crash that occurs between `Marshaled::propose()` + `Relay::broadcast(Plan::Propose)` - /// and any verify-driven persistence. Without persisting on the broadcast - /// path, the block lives only in the in-memory shards cache and is lost - /// across restart. + /// crash that occurs immediately after `Marshaled::propose()` returns a + /// commitment. `propose` is responsible for persisting the block via + /// `marshal.verified`, so the block must survive restart even if + /// `Relay::broadcast` never runs or marshal aborts in between. #[test_traced("WARN")] fn test_marshaled_proposed_block_persists_across_restart() { let runner = deterministic::Runner::timed(Duration::from_secs(60)); @@ -1944,19 +1927,16 @@ mod tests { }; let mut marshaled = Marshaled::new(context.clone(), cfg); - // Drive the full leader-side propose + broadcast path. + // Drive the leader-side propose path. `propose` must persist the + // block before returning the commitment. 
let commitment = marshaled .propose(propose_context) .await .await .expect("propose should produce a commitment"); assert_eq!(commitment, expected_commitment); - assert!( - marshaled.broadcast(commitment, Plan::Propose).await, - "broadcast should persist the proposed block before returning" - ); - // Abort marshal immediately after broadcast returns; the propose + // Abort marshal immediately after propose returns; the propose // path must already have persisted the block. marshal_actor_handle.abort(); drop(marshaled); @@ -2036,7 +2016,7 @@ mod tests { let coded_a: CodedBlock<_, ReedSolomon, Sha256> = CodedBlock::new(block_a.clone(), coding_config, &Sequential); let commitment_a = coded_a.commitment(); - assert!(marshal.proposed(round, coded_a).await); + assert!(marshal.verified(round, coded_a).await); // After restart, a fresh application would build a different // block for the same round. @@ -2070,11 +2050,6 @@ mod tests { commitment, commitment_a, "propose must reuse the block marshal already persisted for this round" ); - - assert!( - marshaled.broadcast(commitment_a, Plan::Propose).await, - "relay broadcast must succeed after re-propose" - ); }); } } diff --git a/consensus/src/marshal/coding/variant.rs b/consensus/src/marshal/coding/variant.rs index cd939dffe74..49ca3df89e4 100644 --- a/consensus/src/marshal/coding/variant.rs +++ b/consensus/src/marshal/coding/variant.rs @@ -100,6 +100,7 @@ where } async fn send(&self, round: Round, block: CodedBlock, _recipients: Recipients

) { + // Targeted forwarding is not supported by the coding variant. self.proposed(round, block).await; } } diff --git a/consensus/src/marshal/core/actor.rs b/consensus/src/marshal/core/actor.rs index f6c6d168751..90cb2cd8fe4 100644 --- a/consensus/src/marshal/core/actor.rs +++ b/consensus/src/marshal/core/actor.rs @@ -529,18 +529,12 @@ where .map(Into::into); response.send_lossy(block); } - Message::Proposed { round, block, ack } => { - self.cache_verified(round, block.digest(), block.clone()) - .await; - buffer.send(round, block, Recipients::All).await; - ack.send_lossy(()); - } Message::Forward { round, commitment, - peers, + recipients, } => { - if peers.is_empty() { + if matches!(&recipients, Recipients::Some(peers) if peers.is_empty()) { continue; } let Some(block) = self.find_block_by_commitment(&buffer, commitment).await @@ -548,7 +542,7 @@ where debug!(?commitment, "block not found for forwarding"); continue; }; - buffer.send(round, block, Recipients::Some(peers)).await; + buffer.send(round, block, recipients).await; } Message::Verified { round, block, ack } => { // If the round has already been pruned by tip advancement, diff --git a/consensus/src/marshal/core/mailbox.rs b/consensus/src/marshal/core/mailbox.rs index c61107183f3..01ff946c2f3 100644 --- a/consensus/src/marshal/core/mailbox.rs +++ b/consensus/src/marshal/core/mailbox.rs @@ -9,6 +9,7 @@ use crate::{ Reporter, }; use commonware_cryptography::{certificate::Scheme, Digestible}; +use commonware_p2p::Recipients; use commonware_utils::{ channel::{fallible::AsyncFallibleExt, mpsc, oneshot}, vec::NonEmptyVec, @@ -92,23 +93,14 @@ pub(crate) enum Message { /// A channel to send the retrieved block, if any. response: oneshot::Sender>, }, - /// A request to broadcast a proposed block to peers. - Proposed { - /// The round in which the block was proposed. - round: Round, - /// The block to broadcast. - block: V::Block, - /// A channel signaled once the block is durably stored. 
- ack: oneshot::Sender<()>, - }, - /// A request to forward a block to a set of peers. + /// A request to forward a block to a set of recipients. Forward { /// The round in which the block was proposed. round: Round, /// The commitment of the block to forward. commitment: V::Commitment, - /// The peers to forward the block to. - peers: Vec, + /// The recipients to forward the block to. + recipients: Recipients, }, /// A notification that a block has been verified by the application. Verified { @@ -311,16 +303,6 @@ impl Mailbox { .flatten() } - /// Requests that a proposed block is sent to peers, awaiting the actor's - /// confirmation that the block has been durably persisted before returning. - #[must_use = "callers must consider block durability before proceeding"] - pub async fn proposed(&self, round: Round, block: V::Block) -> bool { - self.sender - .request(|ack| Message::Proposed { round, block, ack }) - .await - .is_some() - } - /// Notifies the actor that a block has been verified, awaiting the actor's /// confirmation that the block has been durably persisted before returning. #[must_use = "callers must consider block durability before proceeding"] @@ -365,13 +347,18 @@ impl Mailbox { self.sender.send_lossy(Message::Prune { height }).await; } - /// Forward a block to a set of peers. - pub async fn forward(&self, round: Round, commitment: V::Commitment, peers: Vec) { + /// Forward a block to a set of recipients. 
+ pub async fn forward( + &self, + round: Round, + commitment: V::Commitment, + recipients: Recipients, + ) { self.sender .send_lossy(Message::Forward { round, commitment, - peers, + recipients, }) .await; } diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index ac1c60aa6b9..c348370c710 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -757,7 +757,7 @@ pub fn hailstorm( }) } -/// Contract: `marshal.proposed(...)=true` means the block survives an +/// Contract: `marshal.verified(...)=true` means the block survives an /// immediate crash and repeated recoveries. pub fn proposed_success_implies_recoverable_after_restart( seeds: impl IntoIterator, @@ -835,7 +835,7 @@ pub fn proposed_success_implies_recoverable_after_restart( .await; assert!( restarted.mailbox.get_block(&digest).await.is_some(), - "marshal.proposed() returning true must imply the block is recoverable \ + "marshal.verified() returning true must imply the block is recoverable \ after restart (seed={seed}, cycle={cycle})" ); } @@ -995,6 +995,8 @@ pub fn certify_at_later_view_survives_earlier_view_pruning() { let orphan_digest = H::digest(&orphan); // Verify `repeated` at V=1, then certify at V=25 (reproposal-style gap). + // The chain below starts at V=2 to avoid overwriting V=1 in the + // verified archive (which drops subsequent writes at an existing view). let v_early = Round::new(Epoch::zero(), View::new(1)); let v_late = Round::new(Epoch::zero(), View::new(25)); let mut peers: [ValidatorHandle; 0] = []; @@ -1010,7 +1012,7 @@ pub fn certify_at_later_view_survives_earlier_view_pruning() { // Drive the finalized chain forward to advance `last_processed_round` // past V=1's retention boundary but not past V=25's. 
With // view_retention_timeout=10 and prunable_items_per_section=10, - // processing views 1..=21 leaves `oldest_allowed=10` in both prunable + // processing views 2..=22 leaves `oldest_allowed=12` in both prunable // archives. V=1 is dropped, V=25 is retained. const CHAIN_LEN: u64 = 21; let mut parent = Sha256::hash(b""); @@ -1025,11 +1027,11 @@ pub fn certify_at_later_view_survives_earlier_view_pruning() { ); let digest = H::digest(&block); let commitment = H::commitment(&block); - let round = Round::new(Epoch::zero(), View::new(i)); + let round = Round::new(Epoch::zero(), View::new(i + 1)); H::propose(&mut handle, round, &block).await; let proposal = Proposal { round, - parent: View::new(i - 1), + parent: View::new(i), payload: commitment, }; let finalization = H::make_finalization(proposal, &schemes, QUORUM); @@ -1471,7 +1473,7 @@ impl TestHarness for StandardHarness { } async fn propose(handle: &mut ValidatorHandle, round: Round, block: &B) { - assert!(handle.mailbox.proposed(round, block.clone()).await); + assert!(handle.mailbox.verified(round, block.clone()).await); } async fn verify( @@ -2295,7 +2297,7 @@ impl TestHarness for CodingHarness { round: Round, block: &CodedBlock, Sha256>, ) { - assert!(handle.mailbox.proposed(round, block.clone()).await); + assert!(handle.mailbox.verified(round, block.clone()).await); } async fn verify( diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index 17740019523..857819a2bdb 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -74,7 +74,7 @@ use crate::{ marshal::{ ancestry::AncestorStream, application::{ - validation::{is_inferred_reproposal_at_certify, LastBuilt, Stage}, + validation::{is_inferred_reproposal_at_certify, Stage}, verification_tasks::VerificationTasks, }, core::Mailbox, @@ -93,17 +93,15 @@ use crate::{ }; use commonware_cryptography::{certificate::Scheme, Digestible}; use commonware_macros::select; 
+use commonware_p2p::Recipients; use commonware_runtime::{ telemetry::metrics::histogram::{Buckets, Timed}, Clock, Metrics, Spawner, }; -use commonware_utils::{ - channel::{fallible::OneshotExt, oneshot}, - sync::Mutex, -}; +use commonware_utils::channel::{fallible::OneshotExt, oneshot}; use rand::Rng; use std::sync::Arc; -use tracing::{debug, warn}; +use tracing::debug; /// An [`Application`] adapter that handles epoch transitions and validates block ancestry. /// @@ -146,7 +144,6 @@ where application: A, marshal: Mailbox>, epocher: ES, - last_built: LastBuilt, verification_tasks: VerificationTasks<::Digest>, build_duration: Timed, @@ -182,7 +179,6 @@ where application, marshal, epocher, - last_built: Arc::new(Mutex::new(None)), verification_tasks: VerificationTasks::new(), build_duration, @@ -293,15 +289,15 @@ where /// boundary block to avoid creating blocks that would be invalidated by the epoch transition. /// /// The proposal operation is spawned in a background task and returns a receiver that will - /// contain the proposed block's digest when ready. The built block is cached for later - /// broadcasting. + /// contain the proposed block's digest when ready. The built block is persisted via + /// [`Mailbox::verified`] before the digest is delivered, so consensus can rely on the + /// block surviving restart. async fn propose( &mut self, consensus_context: Context, ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); - let last_built = self.last_built.clone(); let epocher = self.epocher.clone(); // Metrics @@ -320,10 +316,6 @@ where // stored block instead. 
if let Some(block) = marshal.get_verified(consensus_context.round).await { let digest = block.digest(); - { - let mut lock = last_built.lock(); - *lock = Some((consensus_context.round, block)); - } let success = tx.send_lossy(digest); debug!( round = ?consensus_context.round, @@ -372,11 +364,14 @@ where .expect("current epoch should exist"); if parent.height() == last_in_epoch { let digest = parent.digest(); - { - let mut lock = last_built.lock(); - *lock = Some((consensus_context.round, parent)); + if !marshal.verified(consensus_context.round, parent).await { + debug!( + round = ?consensus_context.round, + ?digest, + "marshal rejected re-proposed boundary block" + ); + return; } - let success = tx.send_lossy(digest); debug!( round = ?consensus_context.round, @@ -417,11 +412,14 @@ where build_timer.observe(); let digest = built_block.digest(); - { - let mut lock = last_built.lock(); - *lock = Some((consensus_context.round, built_block)); + if !marshal.verified(consensus_context.round, built_block).await { + debug!( + round = ?consensus_context.round, + ?digest, + "marshal rejected proposed block" + ); + return; } - let success = tx.send_lossy(digest); debug!( round = ?consensus_context.round, @@ -641,35 +639,12 @@ where type PublicKey = S::PublicKey; type Plan = Plan; - async fn broadcast(&mut self, digest: Self::Digest, plan: Plan) -> bool { - match plan { - Plan::Propose => { - let Some((round, block)) = self.last_built.lock().take() else { - warn!(?digest, "missing block to broadcast"); - return false; - }; - if block.digest() != digest { - warn!( - round = %round, - digest = %block.digest(), - height = %block.height(), - "skipping requested broadcast of block with mismatched digest" - ); - return false; - }; - let height = block.height(); - if !self.marshal.proposed(round, block).await { - warn!(?round, ?digest, %height, "marshal unable to accept block"); - return false; - } - debug!(?round, ?digest, %height, "requested broadcast of built block"); - true - } - 
Plan::Forward { round, peers } => { - self.marshal.forward(round, digest, peers).await; - true - } - } + async fn broadcast(&mut self, digest: Self::Digest, plan: Plan) { + let (round, recipients) = match plan { + Plan::Propose { round } => (round, Recipients::All), + Plan::Forward { round, recipients } => (round, recipients), + }; + self.marshal.forward(round, digest, recipients).await; } } @@ -705,9 +680,9 @@ mod tests { }, verifying::{GatedVerifyingApp, MockVerifyingApp}, }, - simplex::{scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, Plan}, + simplex::scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::{Epoch, Epocher, FixedEpocher, Height, Round, View}, - Automaton, CertifiableAutomaton, Relay, + Automaton, CertifiableAutomaton, }; use commonware_broadcast::Broadcaster; use commonware_cryptography::{ @@ -759,8 +734,7 @@ mod tests { let parent_digest = parent.digest(); assert!( marshal - .clone() - .proposed(Round::new(Epoch::new(0), View::new(1)), parent.clone()) + .verified(Round::new(Epoch::new(0), View::new(1)), parent.clone()) .await ); @@ -773,7 +747,7 @@ mod tests { }; let block_a = B::new::(context_a.clone(), parent_digest, Height::new(2), 200); let commitment_a = block_a.digest(); - assert!(marshal.clone().proposed(round_a, block_a.clone()).await); + assert!(marshal.verified(round_a, block_a.clone()).await); // Block B at view 10 (height 2, different block same height) let round_b = Round::new(Epoch::new(0), View::new(10)); @@ -784,7 +758,7 @@ mod tests { }; let block_b = B::new::(context_b.clone(), parent_digest, Height::new(2), 300); let commitment_b = block_b.digest(); - assert!(marshal.clone().proposed(round_b, block_b.clone()).await); + assert!(marshal.verified(round_b, block_b.clone()).await); context.sleep(Duration::from_millis(10)).await; @@ -894,7 +868,7 @@ mod tests { assert!( marshal .clone() - .proposed(Round::new(Epoch::zero(), View::new(19)), parent.clone()) + .verified(Round::new(Epoch::zero(), View::new(19)), 
parent.clone()) .await ); @@ -915,7 +889,7 @@ mod tests { assert!( marshal .clone() - .proposed(unsupported_round, block.clone()) + .verified(unsupported_round, block.clone()) .await ); @@ -987,7 +961,7 @@ mod tests { assert!( marshal .clone() - .proposed(Round::new(Epoch::zero(), View::new(1)), parent.clone()) + .verified(Round::new(Epoch::zero(), View::new(1)), parent.clone()) .await ); @@ -1000,7 +974,7 @@ mod tests { }; let block_a = B::new::(context_a, parent.digest(), Height::new(2), 200); let commitment_a = block_a.digest(); - assert!(marshal.clone().proposed(round_a, block_a).await); + assert!(marshal.verified(round_a, block_a).await); context.sleep(Duration::from_millis(10)).await; @@ -1160,7 +1134,7 @@ mod tests { }; let block_a = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 100); let digest_a = block_a.digest(); - assert!(marshal.proposed(round, block_a.clone()).await); + assert!(marshal.verified(round, block_a.clone()).await); let block_b = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 200); let digest_b = block_b.digest(); @@ -1181,11 +1155,6 @@ mod tests { digest, digest_a, "propose must reuse the block marshal already persisted for this round" ); - - assert!( - marshaled.broadcast(digest_a, Plan::Propose).await, - "relay broadcast must succeed after re-propose" - ); }); } } diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index 1be98c03680..ec89db53e14 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -45,7 +45,7 @@ use crate::{ marshal::{ ancestry::AncestorStream, - application::validation::{LastBuilt, Stage}, + application::validation::Stage, core::Mailbox, standard::{ validation::{ @@ -62,6 +62,7 @@ use crate::{ }; use commonware_cryptography::certificate::Scheme; use commonware_macros::select; +use commonware_p2p::Recipients; use commonware_runtime::{ telemetry::metrics::histogram::{Buckets, Timed}, Clock, Metrics, Spawner, 
@@ -73,7 +74,7 @@ use commonware_utils::{ use prometheus_client::metrics::histogram::Histogram; use rand::Rng; use std::{collections::BTreeSet, sync::Arc}; -use tracing::{debug, warn}; +use tracing::debug; /// Tracks `(round, digest)` pairs for which `verify` has already fetched the /// block, so `certify` can return immediately without re-subscribing to marshal. @@ -141,7 +142,6 @@ where application: A, marshal: Mailbox>, epocher: ES, - last_built: LastBuilt, available_blocks: AvailableBlocks, build_duration: Timed, @@ -162,8 +162,7 @@ where { /// Creates a new inline-verification wrapper. /// - /// Registers a `build_duration` histogram for proposal latency and initializes - /// the shared "last built block" cache used by [`Relay::broadcast`]. + /// Registers a `build_duration` histogram for proposal latency. pub fn new(context: E, application: A, marshal: Mailbox>, epocher: ES) -> Self { let build_histogram = Histogram::new(Buckets::LOCAL); context.register( @@ -178,7 +177,6 @@ where application, marshal, epocher, - last_built: Arc::new(Mutex::new(None)), available_blocks: Arc::new(Mutex::new(BTreeSet::new())), build_duration, } @@ -224,18 +222,15 @@ where /// Proposes a new block or re-proposes an epoch boundary block. /// /// Proposal runs in a spawned task and returns a receiver for the resulting digest. - /// The built block is cached in memory (`last_built`) for the subsequent - /// `Relay::broadcast(Plan::Propose)` call, which invokes `marshal.proposed()` - /// as the durable persistence boundary before consensus continues. Receiving - /// a digest from `propose()` alone does not mean the block is recoverable - /// after restart. + /// The built block is persisted via [`Mailbox::verified`] before the digest is + /// delivered, so a digest received from `propose()` implies the block is + /// recoverable after restart. 
async fn propose( &mut self, consensus_context: Context, ) -> oneshot::Receiver { let mut marshal = self.marshal.clone(); let mut application = self.application.clone(); - let last_built = self.last_built.clone(); let epocher = self.epocher.clone(); let build_duration = self.build_duration.clone(); @@ -252,10 +247,6 @@ where // stored block instead. if let Some(block) = marshal.get_verified(consensus_context.round).await { let digest = block.digest(); - { - let mut lock = last_built.lock(); - *lock = Some((consensus_context.round, block)); - } let success = tx.send_lossy(digest); debug!( round = ?consensus_context.round, @@ -302,11 +293,14 @@ where .expect("current epoch should exist"); if parent.height() == last_in_epoch { let digest = parent.digest(); - { - let mut lock = last_built.lock(); - *lock = Some((consensus_context.round, parent)); + if !marshal.verified(consensus_context.round, parent).await { + debug!( + round = ?consensus_context.round, + ?digest, + "marshal rejected re-proposed boundary block" + ); + return; } - let success = tx.send_lossy(digest); debug!( round = ?consensus_context.round, @@ -347,9 +341,13 @@ where build_timer.observe(); let digest = built_block.digest(); - { - let mut lock = last_built.lock(); - *lock = Some((consensus_context.round, built_block)); + if !marshal.verified(consensus_context.round, built_block).await { + debug!( + round = ?consensus_context.round, + ?digest, + "marshal rejected proposed block" + ); + return; } let success = tx.send_lossy(digest); debug!( @@ -524,33 +522,12 @@ where type PublicKey = S::PublicKey; type Plan = Plan; - async fn broadcast(&mut self, digest: Self::Digest, plan: Plan) -> bool { - match plan { - Plan::Propose => { - let Some((round, block)) = self.last_built.lock().take() else { - warn!(?digest, "missing block to broadcast"); - return false; - }; - if block.digest() != digest { - warn!( - round = %round, - digest = %block.digest(), - height = %block.height(), - "skipping requested broadcast 
of block with mismatched digest" - ); - return false; - }; - if !self.marshal.proposed(round, block).await { - warn!(?round, ?digest, "marshal unable to accept block"); - return false; - } - true - } - Plan::Forward { round, peers } => { - self.marshal.forward(round, digest, peers).await; - true - } - } + async fn broadcast(&mut self, digest: Self::Digest, plan: Plan) { + let (round, recipients) = match plan { + Plan::Propose { round } => (round, Recipients::All), + Plan::Forward { round, recipients } => (round, recipients), + }; + self.marshal.forward(round, digest, recipients).await; } } @@ -587,9 +564,7 @@ mod tests { }, verifying::{GatedVerifyingApp, MockVerifyingApp}, }, - simplex::{ - scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::Context, Plan, - }, + simplex::{scheme::bls12381_threshold::vrf as bls12381_threshold_vrf, types::Context}, types::{Epoch, FixedEpocher, Height, Round, View}, Automaton, Block, CertifiableAutomaton, Relay, VerifyingApplication, }; @@ -670,7 +645,7 @@ mod tests { }; let parent = B::new::(parent_ctx, genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - assert!(marshal.proposed(parent_round, parent).await); + assert!(marshal.verified(parent_round, parent).await); let round = Round::new(Epoch::zero(), View::new(2)); let verify_context = Ctx { @@ -681,7 +656,7 @@ mod tests { let block = B::new::(verify_context.clone(), parent_digest, Height::new(2), 200); let digest = block.digest(); - assert!(marshal.proposed(round, block).await); + assert!(marshal.verified(round, block).await); // Complete verify first so the block is already available locally. 
let verify_rx = inline.verify(verify_context, digest).await; @@ -748,7 +723,7 @@ mod tests { }; let parent = B::new::(parent_ctx, genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - assert!(marshal.proposed(parent_round, parent).await); + assert!(marshal.verified(parent_round, parent).await); let round = Round::new(Epoch::zero(), View::new(2)); let verify_context = Ctx { @@ -759,7 +734,7 @@ mod tests { let block = B::new::(verify_context.clone(), parent_digest, Height::new(2), 200); let digest = block.digest(); - assert!(marshal.proposed(round, block).await); + assert!(marshal.verified(round, block).await); // Certify should still resolve by waiting on marshal block availability directly. let certify_rx = inline.certify(round, digest).await; @@ -824,7 +799,7 @@ mod tests { 1900, ); let boundary_digest = boundary_block.digest(); - assert!(marshal.proposed(boundary_round, boundary_block).await); + assert!(marshal.verified(boundary_round, boundary_block).await); let reproposal_round = Round::new(Epoch::zero(), View::new(boundary_height.get() + 1)); let reproposal_context = Ctx { @@ -1192,9 +1167,9 @@ mod tests { } /// Regression: if marshal persisted a verified block for a round before - /// a crash (say, via a prior `Relay::broadcast(Plan::Propose)` landing on - /// the verified cache) but the simplex notarize artifact never reached - /// the journal, a restarted leader must re-use the persisted block. + /// a crash (via a prior `propose` call) but the simplex notarize artifact + /// never reached the journal, a restarted leader must re-use the persisted + /// block. 
/// /// Otherwise the application is asked to build afresh, returns a new /// block whose digest does not match the one marshal already stored @@ -1235,7 +1210,7 @@ mod tests { }; let block_a = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 100); let digest_a = block_a.digest(); - assert!(marshal.proposed(round, block_a.clone()).await); + assert!(marshal.verified(round, block_a.clone()).await); // After restart, the fresh application would build a different // block for the same round (distinct timestamp -> distinct digest). @@ -1258,14 +1233,6 @@ mod tests { digest, digest_a, "propose must reuse the block marshal already persisted for this round" ); - - // After the automaton hands the digest to the voter, the voter - // calls Relay::broadcast(Plan::Propose). That call must succeed so - // the leader actually votes instead of bailing out. - assert!( - inline.broadcast(digest_a, Plan::Propose).await, - "relay broadcast must succeed after re-propose" - ); }); } } diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index b4971181e57..c0657e35376 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -82,7 +82,7 @@ mod tests { use commonware_parallel::Sequential; use commonware_resolver::Resolver; use commonware_runtime::{ - buffer::paged::CacheRef, deterministic, Clock, Metrics, Quota, Runner, Spawner, + buffer::paged::CacheRef, deterministic, Clock, Metrics, Quota, Runner, }; use commonware_storage::{ archive::{immutable, prunable, Archive as _}, @@ -479,12 +479,12 @@ mod tests { .mailbox; assert!( peer_mailbox - .proposed(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) + .verified(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) .await ); assert!( peer_mailbox - .proposed(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) + .verified(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) .await ); StandardHarness::report_finalization(&mut 
peer_mailbox, finalization_two.clone()).await; @@ -576,17 +576,17 @@ mod tests { .mailbox; assert!( peer_mailbox - .proposed(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) + .verified(Round::new(Epoch::zero(), View::new(1)), block_one.clone()) .await ); assert!( peer_mailbox - .proposed(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) + .verified(Round::new(Epoch::zero(), View::new(2)), block_two.clone()) .await ); assert!( peer_mailbox - .proposed(Round::new(Epoch::zero(), View::new(3)), block_three.clone()) + .verified(Round::new(Epoch::zero(), View::new(3)), block_three.clone()) .await ); StandardHarness::report_finalization(&mut peer_mailbox, finalization_two.clone()).await; @@ -768,7 +768,7 @@ mod tests { for (i, block) in blocks.iter().enumerate() { assert!( peer_mailbox - .proposed( + .verified( Round::new(Epoch::zero(), View::new(block.height().get())), (*block).clone(), ) @@ -1200,7 +1200,7 @@ mod tests { assert!( marshal .clone() - .proposed(boundary_round, boundary_block.clone()) + .verified(boundary_round, boundary_block.clone()) .await ); @@ -1272,7 +1272,7 @@ mod tests { assert!( marshal .clone() - .proposed(boundary_round, boundary_block) + .verified(boundary_round, boundary_block) .await ); @@ -1311,7 +1311,7 @@ mod tests { assert!( marshal .clone() - .proposed(non_boundary_round, non_boundary_block) + .verified(non_boundary_round, non_boundary_block) .await ); @@ -1416,7 +1416,7 @@ mod tests { assert!( marshal .clone() - .proposed(malformed_round, malformed_block) + .verified(malformed_round, malformed_block) .await ); @@ -1456,7 +1456,7 @@ mod tests { let parent = B::new::(parent_context, genesis.digest(), Height::new(1), 300); let parent_digest = parent.digest(); - assert!(marshal.clone().proposed(parent_round, parent).await); + assert!(marshal.verified(parent_round, parent).await); let mismatch_round = Round::new(Epoch::zero(), View::new(3)); let mismatched_context = Ctx { @@ -1474,7 +1474,7 @@ mod tests { assert!( 
marshal .clone() - .proposed(mismatch_round, mismatched_block) + .verified(mismatch_round, mismatched_block) .await ); @@ -1549,7 +1549,7 @@ mod tests { }; let parent = B::new::(parent_context, genesis.digest(), Height::new(1), 100); let parent_digest = parent.digest(); - assert!(marshal.clone().proposed(parent_round, parent).await); + assert!(marshal.verified(parent_round, parent).await); // 2) Publish a valid child; only application-level verification should fail. let round = Round::new(Epoch::zero(), View::new(2)); @@ -1560,7 +1560,7 @@ mod tests { }; let block = B::new::(verify_context.clone(), parent_digest, Height::new(2), 200); let digest = block.digest(); - assert!(marshal.clone().proposed(round, block).await); + assert!(marshal.verified(round, block).await); context.sleep(Duration::from_millis(10)).await; @@ -1706,67 +1706,6 @@ mod tests { } } - /// A buffer whose `send` blocks until released, and signals when entered. - /// Used to verify `proposed` only resolves after `buffer.send` completes. 
- #[derive(Clone)] - struct GatingBuffer { - send_entered: Arc>>>, - release: Arc>>>, - } - - impl GatingBuffer { - fn new() -> (Self, oneshot::Receiver<()>, oneshot::Sender<()>) { - let (entered_tx, entered_rx) = oneshot::channel(); - let (release_tx, release_rx) = oneshot::channel(); - ( - Self { - send_entered: Arc::new(Mutex::new(Some(entered_tx))), - release: Arc::new(Mutex::new(Some(release_rx))), - }, - entered_rx, - release_tx, - ) - } - } - - impl crate::marshal::core::Buffer> for GatingBuffer { - type PublicKey = PublicKey; - type CachedBlock = B; - - async fn find_by_digest(&self, _digest: D) -> Option { - None - } - - async fn find_by_commitment(&self, _commitment: D) -> Option { - None - } - - async fn subscribe_by_digest(&self, _digest: D) -> oneshot::Receiver { - let (_sender, receiver) = oneshot::channel(); - receiver - } - - async fn subscribe_by_commitment( - &self, - _commitment: D, - ) -> oneshot::Receiver { - let (_sender, receiver) = oneshot::channel(); - receiver - } - - async fn finalized(&self, _commitment: D) {} - - async fn send(&self, _round: Round, _block: B, _recipients: Recipients) { - if let Some(entered) = self.send_entered.lock().take() { - entered.send_lossy(()); - } - let release = self.release.lock().take(); - if let Some(release) = release { - let _ = release.await; - } - } - } - /// A reporter that blocks inside `Update::Block` so tests can abort marshal /// exactly when application delivery starts. #[derive(Clone)] @@ -1914,70 +1853,6 @@ mod tests { (mailbox, buffer, resolver, actor_handle) } - /// Regression: `marshal.proposed` must not ack until the block has been - /// handed off to the provided buffer. - #[test_traced("WARN")] - fn test_standard_proposed_waits_for_buffer_send() { - let runner = deterministic::Runner::timed(Duration::from_secs(30)); - runner.start(|mut context| async move { - let Fixture { - participants, - schemes, - .. 
- } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); - let me = participants[0].clone(); - let partition_prefix = format!("proposed-waits-buffer-{me}"); - - let (buffer, send_entered, release) = GatingBuffer::new(); - let (mailbox, _buffer, _resolver, _actor_handle) = start_standard_actor( - context.with_label("validator_0"), - &partition_prefix, - ConstantProvider::new(schemes[0].clone()), - Application::::manual_ack(), - buffer, - ) - .await; - - let round = Round::new(Epoch::zero(), View::new(1)); - let block = make_raw_block(Sha256::hash(b""), Height::new(1), 100); - - // Drive `proposed` from a spawned task so we can observe its state - // from the main task via a completion channel. - let (done_tx, done_rx) = oneshot::channel(); - context - .with_label("proposed_caller") - .spawn(move |_| async move { - let ok = mailbox.proposed(round, block).await; - done_tx.send_lossy(ok); - }); - - // Wait for the marshal actor to enter `buffer.send`. - send_entered - .await - .expect("buffer.send should be entered after cache_verified"); - - // With the buffer held in `send`, `proposed` must remain pending. - // Poll it against a generous timer; the timer should always win. - futures::pin_mut!(done_rx); - select! { - _ = context.sleep(Duration::from_millis(500)) => {}, - _ = &mut done_rx => { - panic!("proposed returned before buffer.send released"); - }, - } - - // Releasing the gate lets `send` complete; `proposed` must then ack. - release.send_lossy(()); - let ok = select! { - result = &mut done_rx => result.expect("proposed channel closed"), - _ = context.sleep(Duration::from_secs(5)) => { - panic!("proposed did not complete after buffer release"); - }, - }; - assert!(ok, "proposed should return true after durable dispatch"); - }); - } - /// When the provider has no verifier for an epoch, in-flight deliveries /// for that epoch must be acknowledged (`true`) so the serving peer is /// not blamed, rather than rejected (`false`). 
@@ -2362,7 +2237,11 @@ mod tests { .await; mailbox - .forward(round, unknown, vec![participants[1].clone()]) + .forward( + round, + unknown, + Recipients::Some(vec![participants[1].clone()]), + ) .await; context.sleep(Duration::from_millis(50)).await; @@ -2401,7 +2280,9 @@ mod tests { assert!(mailbox.verified(round, block.clone()).await); let targets = vec![participants[1].clone(), participants[2].clone()]; - mailbox.forward(round, digest, targets.clone()).await; + mailbox + .forward(round, digest, Recipients::Some(targets.clone())) + .await; wait_until(&context, Duration::from_secs(5), "buffer.send", || { !buffer.sends.lock().is_empty() diff --git a/consensus/src/ordered_broadcast/mocks/automaton.rs b/consensus/src/ordered_broadcast/mocks/automaton.rs index 576971dbb7d..806846670cc 100644 --- a/consensus/src/ordered_broadcast/mocks/automaton.rs +++ b/consensus/src/ordered_broadcast/mocks/automaton.rs @@ -68,8 +68,7 @@ impl R for Automaton

{ type Plan = (); type PublicKey = P; - async fn broadcast(&mut self, payload: Self::Digest, _plan: ()) -> bool { + async fn broadcast(&mut self, payload: Self::Digest, _plan: ()) { trace!(?payload, "broadcast"); - true } } diff --git a/consensus/src/simplex/actors/batcher/actor.rs b/consensus/src/simplex/actors/batcher/actor.rs index d517e712fe3..46c61ee4cec 100644 --- a/consensus/src/simplex/actors/batcher/actor.rs +++ b/consensus/src/simplex/actors/batcher/actor.rs @@ -14,7 +14,7 @@ use crate::{ }; use commonware_cryptography::Digest; use commonware_macros::select_loop; -use commonware_p2p::{utils::codec::WrappedReceiver, Blocker, Receiver}; +use commonware_p2p::{utils::codec::WrappedReceiver, Blocker, Receiver, Recipients}; use commonware_parallel::Strategy; use commonware_runtime::{ spawn_cell, @@ -249,7 +249,7 @@ where proposal.payload, Plan::Forward { round: proposal.round, - peers, + recipients: Recipients::Some(peers), }, ) .await; diff --git a/consensus/src/simplex/actors/batcher/mod.rs b/consensus/src/simplex/actors/batcher/mod.rs index 5cf77ded322..6c52e1eab36 100644 --- a/consensus/src/simplex/actors/batcher/mod.rs +++ b/consensus/src/simplex/actors/batcher/mod.rs @@ -97,11 +97,14 @@ mod tests { type PublicKey = PublicKey; type Plan = Plan; - async fn broadcast(&mut self, payload: Sha256Digest, plan: Self::Plan) -> bool { - if let Plan::Forward { round, peers } = plan { + async fn broadcast(&mut self, payload: Sha256Digest, plan: Self::Plan) { + if let Plan::Forward { + round, + recipients: Recipients::Some(peers), + } = plan + { self.broadcasts.lock().push((payload, round, peers)); } - true } } diff --git a/consensus/src/simplex/actors/voter/actor.rs b/consensus/src/simplex/actors/voter/actor.rs index 55e66fff273..8a3ef3dfb4a 100644 --- a/consensus/src/simplex/actors/voter/actor.rs +++ b/consensus/src/simplex/actors/voter/actor.rs @@ -918,14 +918,15 @@ impl< } view = self.state.current_view(); - // Notify application of proposal - if 
!self.relay.broadcast(proposed, Plan::Propose).await { - warn!( - round = ?context.round, - "failed to broadcast proposed payload, stopping voter" - ); - break; - } + // Notify application of proposal. + self.relay + .broadcast( + proposed, + Plan::Propose { + round: context.round, + }, + ) + .await; }, (context, verified) = verify_wait => { // Clear verify waiter diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index 2441d2c2e81..1d724cc1d58 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -63,8 +63,8 @@ mod tests { secp256r1, Scheme, }, types::{ - Artifact, Certificate, Finalization, Finalize, Notarization, Notarize, - Nullification, Nullify, Proposal, Vote, + Certificate, Finalization, Finalize, Notarization, Notarize, Nullification, + Nullify, Proposal, Vote, }, }, types::{Participant, Round, View}, @@ -84,9 +84,8 @@ mod tests { use commonware_runtime::{ deterministic, telemetry::traces::collector::TraceStorage, Clock, Metrics, Quota, Runner, }; - use commonware_storage::journal::segmented::variable::{Config as JConfig, Journal}; use commonware_utils::{channel::mpsc, sync::Mutex, NZUsize, NZU16}; - use futures::{pin_mut, FutureExt, StreamExt}; + use futures::FutureExt; use std::{ num::{NonZeroU16, NonZeroU32}, sync::Arc, @@ -98,27 +97,6 @@ mod tests { const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10); const TEST_QUOTA: Quota = Quota::per_second(NonZeroU32::MAX); - #[derive(Clone, Default)] - struct FailingRelay { - proposes: Arc>>, - } - - impl crate::Relay for FailingRelay { - type Digest = Sha256Digest; - type PublicKey = PublicKey; - type Plan = Plan; - - async fn broadcast(&mut self, payload: Sha256Digest, plan: Self::Plan) -> bool { - match plan { - Plan::Propose => { - self.proposes.lock().push(payload); - false - } - Plan::Forward { .. 
} => true, - } - } - } - async fn start_test_network_with_peers( context: deterministic::Context, peers: I, @@ -141,280 +119,6 @@ mod tests { oracle } - fn propose_broadcast_failure_stops_before_notarize(mut fixture: F) - where - S: Scheme, - F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, - { - let namespace = b"propose_broadcast_failure_stops_before_notarize".to_vec(); - let partition = "propose_broadcast_failure_stops_before_notarize".to_string(); - let executor = deterministic::Runner::timed(Duration::from_secs(10)); - executor.start(|mut context| async move { - let Fixture { - participants, - schemes, - .. - } = fixture(&mut context, &namespace, 5); - let oracle = - start_test_network_with_peers(context.clone(), participants.clone(), true).await; - - let me = participants[0].clone(); - let elector = RoundRobin::::default(); - let reporter_cfg = mocks::reporter::Config { - participants: participants.clone().try_into().unwrap(), - scheme: schemes[0].clone(), - elector: elector.clone(), - }; - let reporter = mocks::reporter::Reporter::new( - context.with_label("reporter"), - reporter_cfg.clone(), - ); - - let app_relay = Arc::new(mocks::relay::Relay::new()); - let app_cfg = mocks::application::Config { - hasher: Sha256::default(), - relay: app_relay, - me: me.clone(), - propose_latency: (0.0, 0.0), - verify_latency: (0.0, 0.0), - certify_latency: (0.0, 0.0), - should_certify: mocks::application::Certifier::Always, - }; - let (app_actor, application) = - mocks::application::Application::new(context.with_label("app"), app_cfg); - app_actor.start(); - - let relay = FailingRelay::default(); - let propose_attempts = relay.proposes.clone(); - let voter_cfg = Config { - scheme: schemes[0].clone(), - elector, - blocker: oracle.control(me.clone()), - automaton: application.clone(), - relay, - reporter, - partition: partition.clone(), - epoch: Epoch::new(4), - mailbox_size: 128, - leader_timeout: Duration::from_secs(5), - certification_timeout: 
Duration::from_secs(5), - timeout_retry: Duration::from_mins(60), - activity_timeout: ViewDelta::new(10), - replay_buffer: NZUsize!(1024 * 1024), - write_buffer: NZUsize!(1024 * 1024), - page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), - }; - let (voter, _mailbox) = Actor::new(context.with_label("voter"), voter_cfg); - let (resolver_sender, _resolver_receiver) = mpsc::channel(8); - let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); - let (vote_sender, _) = oracle - .control(me.clone()) - .register(0, TEST_QUOTA) - .await - .unwrap(); - let (cert_sender, _) = oracle - .control(me.clone()) - .register(1, TEST_QUOTA) - .await - .unwrap(); - let handle = voter.start( - batcher::Mailbox::new(batcher_sender), - resolver::Mailbox::new(resolver_sender), - vote_sender, - cert_sender, - ); - - match batcher_receiver.recv().await.unwrap() { - batcher::Message::Update { - current, - leader, - response, - .. - } => { - assert_eq!(current, View::new(1)); - let _ = leader; - response.send(None).unwrap(); - } - _ => panic!("unexpected initial batcher message"), - } - - select! { - result = handle => { - result.expect("voter should stop cleanly after failed propose broadcast"); - }, - _ = context.sleep(Duration::from_secs(1)) => { - panic!("timed out waiting for voter to stop after failed propose broadcast"); - } - } - - assert_eq!( - propose_attempts.lock().len(), - 1, - "expected exactly one failed propose broadcast attempt" - ); - - while let Some(message) = batcher_receiver.recv().now_or_never().flatten() { - match message { - batcher::Message::Constructed(Vote::Notarize(notarize)) => { - panic!( - "unexpected notarize for view {} after failed propose broadcast", - notarize.view() - ); - } - batcher::Message::Update { response, .. 
} => { - response.send(None).unwrap(); - } - batcher::Message::Constructed(_) => {} - } - } - - let journal = Journal::<_, Artifact>::init( - context.with_label("journal_check"), - JConfig { - partition, - compression: None, - codec_config: schemes[0].certificate_codec_config(), - page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), - write_buffer: NZUsize!(1024 * 1024), - }, - ) - .await - .expect("unable to open voter journal"); - let stream = journal - .replay(0, 0, NZUsize!(1024 * 1024)) - .await - .expect("unable to replay voter journal"); - pin_mut!(stream); - if let Some(entry) = stream.next().await { - let (_, _, _, artifact) = entry.expect("unable to decode voter journal artifact"); - panic!( - "failed propose broadcast must not leave durable vote remnants, found {artifact:?}" - ); - } - }); - } - - #[test_traced] - fn test_propose_broadcast_failure_stops_before_notarize() { - propose_broadcast_failure_stops_before_notarize::<_, _>( - bls12381_threshold_vrf::fixture::, - ); - propose_broadcast_failure_stops_before_notarize::<_, _>( - bls12381_threshold_vrf::fixture::, - ); - propose_broadcast_failure_stops_before_notarize::<_, _>( - bls12381_multisig::fixture::, - ); - propose_broadcast_failure_stops_before_notarize::<_, _>( - bls12381_multisig::fixture::, - ); - propose_broadcast_failure_stops_before_notarize::<_, _>(ed25519::fixture); - propose_broadcast_failure_stops_before_notarize::<_, _>(secp256r1::fixture); - } - - /// Engine must not panic when the voter exits cleanly after the local - /// relay rejects `Plan::Propose`. The voter treats that as a fatal stop; - /// the engine must agree and shut down gracefully. 
- #[test_traced] - fn test_engine_stops_cleanly_when_voter_exits_after_failed_propose_broadcast() { - let namespace = - b"engine_stops_cleanly_when_voter_exits_after_failed_propose_broadcast".to_vec(); - let partition = - "engine_stops_cleanly_when_voter_exits_after_failed_propose_broadcast".to_string(); - let executor = deterministic::Runner::timed(Duration::from_secs(10)); - executor.start(|mut context| async move { - let Fixture { - participants, - schemes, - .. - } = bls12381_threshold_vrf::fixture::(&mut context, &namespace, 5); - let oracle = - start_test_network_with_peers(context.clone(), participants.clone(), true).await; - - let me = participants[0].clone(); - let elector = RoundRobin::::default(); - let reporter_cfg = mocks::reporter::Config { - participants: participants.clone().try_into().unwrap(), - scheme: schemes[0].clone(), - elector: elector.clone(), - }; - let reporter = mocks::reporter::Reporter::new( - context.with_label("reporter"), - reporter_cfg.clone(), - ); - - let app_relay = Arc::new(mocks::relay::Relay::new()); - let app_cfg = mocks::application::Config { - hasher: Sha256::default(), - relay: app_relay, - me: me.clone(), - propose_latency: (0.0, 0.0), - verify_latency: (0.0, 0.0), - certify_latency: (0.0, 0.0), - should_certify: mocks::application::Certifier::Always, - }; - let (app_actor, application) = - mocks::application::Application::new(context.with_label("app"), app_cfg); - app_actor.start(); - - let cfg = crate::simplex::config::Config { - scheme: schemes[0].clone(), - elector, - blocker: oracle.control(me.clone()), - automaton: application, - relay: FailingRelay::default(), - reporter, - strategy: Sequential, - partition, - mailbox_size: 128, - epoch: Epoch::new(4), - leader_timeout: Duration::from_secs(5), - certification_timeout: Duration::from_secs(5), - timeout_retry: Duration::from_mins(60), - fetch_timeout: Duration::from_secs(1), - activity_timeout: ViewDelta::new(10), - skip_timeout: ViewDelta::new(5), - 
fetch_concurrent: 4, - replay_buffer: NZUsize!(1024 * 1024), - write_buffer: NZUsize!(1024 * 1024), - page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), - forwarding: crate::simplex::config::ForwardingPolicy::Disabled, - }; - let engine = crate::simplex::Engine::new(context.with_label("engine"), cfg); - - let (vote_sender, vote_receiver) = oracle - .control(me.clone()) - .register(0, TEST_QUOTA) - .await - .unwrap(); - let (cert_sender, cert_receiver) = oracle - .control(me.clone()) - .register(1, TEST_QUOTA) - .await - .unwrap(); - let (resolver_sender, resolver_receiver) = oracle - .control(me.clone()) - .register(2, TEST_QUOTA) - .await - .unwrap(); - - let handle = engine.start( - (vote_sender, vote_receiver), - (cert_sender, cert_receiver), - (resolver_sender, resolver_receiver), - ); - - select! { - result = handle => { - result.expect("engine should stop cleanly after voter exit"); - }, - _ = context.sleep(Duration::from_secs(2)) => { - panic!("timed out waiting for engine to stop after voter exit"); - } - } - }); - } - fn build_notarization>( schemes: &[S], proposal: &Proposal, diff --git a/consensus/src/simplex/mocks/application.rs b/consensus/src/simplex/mocks/application.rs index 8602933211e..4648950ca75 100644 --- a/consensus/src/simplex/mocks/application.rs +++ b/consensus/src/simplex/mocks/application.rs @@ -116,8 +116,8 @@ impl Re for Mailbox { type PublicKey = P; type Plan = Plan

; - async fn broadcast(&mut self, payload: Self::Digest, _plan: Plan<P>
) -> bool { - self.sender.send_lossy(Message::Broadcast { payload }).await + async fn broadcast(&mut self, payload: Self::Digest, _plan: Plan<P>
) { + self.sender.send_lossy(Message::Broadcast { payload }).await; } } diff --git a/consensus/src/simplex/mod.rs b/consensus/src/simplex/mod.rs index 4a6738e04cd..84c016727ef 100644 --- a/consensus/src/simplex/mod.rs +++ b/consensus/src/simplex/mod.rs @@ -314,72 +314,71 @@ //! Before sending a message, the `Journal` sync is invoked to prevent inadvertent Byzantine behavior //! on restart (especially in the case of unclean shutdown). -use crate::types::Round; -use commonware_cryptography::PublicKey; - pub mod elector; pub mod scheme; pub mod types; cfg_if::cfg_if! { if #[cfg(not(target_arch = "wasm32"))] { + use crate::types::{Round, View, ViewDelta}; + use commonware_cryptography::PublicKey; + use commonware_p2p::Recipients; + mod actors; pub mod config; pub use config::{Config, ForwardingPolicy}; mod engine; pub use engine::Engine; mod metrics; - } -} -#[cfg(any(test, feature = "mocks"))] -pub mod mocks; - -#[cfg(not(target_arch = "wasm32"))] -use crate::types::{View, ViewDelta}; + /// The minimum view we are tracking both in-memory and on-disk. + pub(crate) const fn min_active(activity_timeout: ViewDelta, last_finalized: View) -> View { + last_finalized.saturating_sub(activity_timeout) + } -/// The minimum view we are tracking both in-memory and on-disk. -#[cfg(not(target_arch = "wasm32"))] -pub(crate) const fn min_active(activity_timeout: ViewDelta, last_finalized: View) -> View { - last_finalized.saturating_sub(activity_timeout) -} + /// Whether or not a view is interesting to us. This is a function + /// of both `min_active` and whether or not the view is too far + /// in the future (based on the view we are currently in). 
+ pub(crate) fn interesting( + activity_timeout: ViewDelta, + last_finalized: View, + current: View, + pending: View, + allow_future: bool, + ) -> bool { + // If the view is genesis, skip it, genesis doesn't have votes + if pending.is_zero() { + return false; + } + if pending < min_active(activity_timeout, last_finalized) { + return false; + } + if !allow_future && pending > current.next() { + return false; + } + true + } -/// Whether or not a view is interesting to us. This is a function -/// of both `min_active` and whether or not the view is too far -/// in the future (based on the view we are currently in). -#[cfg(not(target_arch = "wasm32"))] -pub(crate) fn interesting( - activity_timeout: ViewDelta, - last_finalized: View, - current: View, - pending: View, - allow_future: bool, -) -> bool { - // If the view is genesis, skip it, genesis doesn't have votes - if pending.is_zero() { - return false; - } - if pending < min_active(activity_timeout, last_finalized) { - return false; - } - if !allow_future && pending > current.next() { - return false; + /// Describes how a payload should be broadcast to the network. + pub enum Plan { + /// Initial broadcast of a newly proposed block to all participants. + Propose { + /// The round in which the block was proposed. + round: Round, + }, + /// Forward a block to a specific set of peers. + Forward { + /// The round in which the forwarded block was proposed. + round: Round, + /// The recipients to forward the block to. + recipients: Recipients

, + }, + } } - true } -/// Describes how a payload should be broadcast to the network. -pub enum Plan { - /// Initial broadcast of a newly proposed block to all participants. - Propose, - /// Forward a block to a specific set of peers. - Forward { - /// The round in which the forwarded block was proposed. - round: Round, - /// The peers to forward the block to. - peers: Vec

, - }, -} +#[cfg(any(test, feature = "mocks"))] +pub mod mocks; /// Convenience alias for [`N3f1::quorum`]. #[cfg(test)] diff --git a/examples/bridge/src/application/ingress.rs b/examples/bridge/src/application/ingress.rs index c1abebaf54f..6881ad3f127 100644 --- a/examples/bridge/src/application/ingress.rs +++ b/examples/bridge/src/application/ingress.rs @@ -96,12 +96,11 @@ impl Re for Mailbox { type PublicKey = PublicKey; type Plan = Plan; - async fn broadcast(&mut self, _: Self::Digest, _: Self::Plan) -> bool { + async fn broadcast(&mut self, _: Self::Digest, _: Self::Plan) { // We don't broadcast our raw messages to other peers. // // If we were building an EVM blockchain, for example, we'd // send the block to other peers here. - true } } diff --git a/examples/log/src/application/ingress.rs b/examples/log/src/application/ingress.rs index 064fdd7a095..17bb94244ab 100644 --- a/examples/log/src/application/ingress.rs +++ b/examples/log/src/application/ingress.rs @@ -85,11 +85,10 @@ impl Re for Mailbox { type PublicKey = PublicKey; type Plan = Plan; - async fn broadcast(&mut self, _: Self::Digest, _: Self::Plan) -> bool { + async fn broadcast(&mut self, _: Self::Digest, _: Self::Plan) { // We don't broadcast our raw messages to other peers. // // If we were building an EVM blockchain, for example, we'd // send the block to other peers here. 
- true } } From a5db7358104831f63f2af5e19767450a3265b222 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 20 Apr 2026 12:27:03 -0700 Subject: [PATCH 105/107] [consensus] Address Last Round of Feedback (#3634) --- consensus/src/marshal/coding/marshaled.rs | 28 +- consensus/src/marshal/coding/mod.rs | 91 ++++ consensus/src/marshal/mocks/harness.rs | 448 +++++++++++++++----- consensus/src/marshal/standard/deferred.rs | 97 ++++- consensus/src/marshal/standard/inline.rs | 67 ++- consensus/src/marshal/standard/mod.rs | 6 + consensus/src/simplex/actors/voter/actor.rs | 8 + 7 files changed, 592 insertions(+), 153 deletions(-) diff --git a/consensus/src/marshal/coding/marshaled.rs b/consensus/src/marshal/coding/marshaled.rs index 4db0ddf6823..9b54ea55d9a 100644 --- a/consensus/src/marshal/coding/marshaled.rs +++ b/consensus/src/marshal/coding/marshaled.rs @@ -525,12 +525,30 @@ where .with_label("propose") .with_attribute("round", consensus_context.round) .spawn(move |runtime_context| async move { - // On leader recovery, marshal may already hold a verified - // block for this round (persisted before voting in consensus). Building - // a fresh block would land on the same view index in the - // prunable archive and be silently dropped, so reuse the - // stored block instead. + // On leader recovery, marshal may already hold a verified block + // for this round (persisted before voting in consensus). + // + // Building a fresh block would land on the same prunable + // archive index and be silently dropped, so the stored block + // is the only proposal we can broadcast for this round. + // + // The recovered block is safe to reuse only if its embedded + // context matches the context simplex just recovered. + // Otherwise the cached block was built against a different + // parent and cannot be broadcast under the current header, so + // drop the receiver and let the voter nullify the view via + // timeout. 
if let Some(block) = marshal.get_verified(consensus_context.round).await { + let block_context = block.context(); + if block_context != consensus_context { + debug!( + round = ?consensus_context.round, + ?consensus_context, + ?block_context, + "skipping proposal: cached verified block context no longer matches" + ); + return; + } let commitment = block.commitment(); let round = consensus_context.round; let success = tx.send_lossy(commitment); diff --git a/consensus/src/marshal/coding/mod.rs b/consensus/src/marshal/coding/mod.rs index 50b8741df16..35c6d728074 100644 --- a/consensus/src/marshal/coding/mod.rs +++ b/consensus/src/marshal/coding/mod.rs @@ -177,6 +177,11 @@ mod tests { harness::verified_success_implies_recoverable_after_restart::(0..16); } + #[test_traced("WARN")] + fn test_coding_certified_success_implies_recoverable_after_restart() { + harness::certified_success_implies_recoverable_after_restart::(0..16); + } + #[test_traced("WARN")] fn test_coding_delivery_visibility_implies_recoverable_after_restart() { harness::delivery_visibility_implies_recoverable_after_restart::(0..16); @@ -2052,4 +2057,90 @@ mod tests { ); }); } + + /// Regression: if a pre-crash leader persisted a verified block for a + /// round but the simplex `Notarize` never reached the journal, replay + /// can recover a `consensus_context` whose parent differs from the one + /// the cached block was built against. The restarted leader must then + /// drop the receiver so the voter nullifies the view via + /// `MissingProposal`, rather than broadcasting the stale cached block + /// under a header that peers will reject. + #[test_traced("WARN")] + fn test_propose_skips_when_verified_block_context_changed() { + let runner = deterministic::Runner::timed(Duration::from_secs(60)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let coding_config = coding_config_for_participants(NUM_VALIDATORS as u16); + + let setup = CodingHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + let shards = setup.extra; + + let genesis_ctx = CodingCtx { + round: Round::zero(), + leader: default_leader(), + parent: (View::zero(), genesis_commitment()), + }; + let genesis = make_coding_block(genesis_ctx, Sha256::hash(b""), Height::zero(), 0); + let genesis_parent_commitment = genesis_coding_commitment::(&genesis); + + // Stash a stale block built against genesis as its parent at round V=2. + let round = Round::new(Epoch::zero(), View::new(2)); + let stale_ctx = CodingCtx { + round, + leader: me.clone(), + parent: (View::zero(), genesis_parent_commitment), + }; + let stale_block = make_coding_block(stale_ctx, genesis.digest(), Height::new(1), 100); + let stale_coded: CodedBlock<_, ReedSolomon, Sha256> = + CodedBlock::new(stale_block, coding_config, &Sequential); + assert!(marshal.verified(round, stale_coded).await); + + // Simulate a replay where parent selection now points to a + // different parent commitment than the cached block was built for. 
+ let new_parent_commitment = Commitment::from(( + Sha256::hash(b"different-parent-block"), + Sha256::hash(b"different-parent-inner"), + Sha256::hash(b"different-parent-ctx"), + coding_config, + )); + let new_ctx = CodingCtx { + round, + leader: me.clone(), + parent: (View::new(1), new_parent_commitment), + }; + + let mock_app: MockVerifyingApp = MockVerifyingApp::new(genesis); + let cfg = MarshaledConfig { + application: mock_app, + marshal: marshal.clone(), + shards: shards.clone(), + scheme_provider: ConstantProvider::new(schemes[0].clone()), + epocher: FixedEpocher::new(BLOCKS_PER_EPOCH), + strategy: Sequential, + }; + let mut marshaled = Marshaled::new(context.clone(), cfg); + + let commitment_rx = marshaled.propose(new_ctx).await; + assert!( + commitment_rx.await.is_err(), + "propose must drop the receiver when the cached block's context no longer matches" + ); + }); + } } diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index c348370c710..c213fea7538 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -349,7 +349,6 @@ struct HailstormState<'a, H: TestHarness> { parent_commitment: &'a mut H::Commitment, participants: &'a [K], schemes: &'a [S], - propagation_delay: Duration, } fn active_validator_indices( @@ -500,92 +499,137 @@ async fn assert_active_validators_match_canonical( } } -async fn advance_hailstorm_to( - target: u64, +/// A height that has been driven through propose + verify but has not yet had +/// its finalization reported to the validators. +struct PendingHailstormHeight { + height: Height, + expected_digest: D, + finalization: Finalization, + next_parent: D, + next_parent_commitment: H::Commitment, +} + +/// Drives one height through the propose and verify phases without reporting +/// finalization. The returned pending height must be committed via +/// [`finalize_hailstorm_height`] to advance the canonical chain. 
+async fn drive_hailstorm_height_up_to_verify( + height_value: u64, context: &mut deterministic::Context, state: &mut HailstormState<'_, H>, -) { - for height_value in (state.canonical.len() as u64 + 1)..=target { - let height = Height::new(height_value); - let active = active_validator_indices(state.validators); - let proposer_idx = active[context.gen_range(0..active.len())]; - let verifier_count = usize::min(QUORUM as usize, active.len()); - let verifier_indices = active - .iter() - .copied() - .filter(|idx| *idx != proposer_idx) - .choose_multiple(context, verifier_count.saturating_sub(1)); - let block = H::make_test_block( - *state.parent, - *state.parent_commitment, - height, - height_value, - state.participants.len() as u16, - ); - let round = Round::new(Epoch::zero(), View::new(height_value)); - let proposal = Proposal { - round, - parent: height - .previous() - .map(|previous| View::new(previous.get())) - .unwrap_or(View::zero()), - payload: H::commitment(&block), - }; - let expected_digest = H::digest(&block); - let finalization = H::make_finalization(proposal.clone(), state.schemes, QUORUM); +) -> PendingHailstormHeight { + let height = Height::new(height_value); + let active = active_validator_indices(state.validators); + let proposer_idx = active[context.gen_range(0..active.len())]; + let verifier_count = usize::min(QUORUM as usize, active.len()); + let verifier_indices = active + .iter() + .copied() + .filter(|idx| *idx != proposer_idx) + .choose_multiple(context, verifier_count.saturating_sub(1)); + let block = H::make_test_block( + *state.parent, + *state.parent_commitment, + height, + height_value, + state.participants.len() as u16, + ); + let round = Round::new(Epoch::zero(), View::new(height_value)); + let proposal = Proposal { + round, + parent: height + .previous() + .map(|previous| View::new(previous.get())) + .unwrap_or(View::zero()), + payload: H::commitment(&block), + }; + let expected_digest = H::digest(&block); + let finalization = 
H::make_finalization(proposal.clone(), state.schemes, QUORUM); + + { + let proposer = state.validators[proposer_idx] + .as_mut() + .expect("proposer should be active"); + H::propose(&mut proposer.handle, round, &block).await; + H::report_notarization( + &mut proposer.handle.mailbox, + H::make_notarization(proposal, state.schemes, QUORUM), + ) + .await; + } - { - let proposer = state.validators[proposer_idx] - .as_mut() - .expect("proposer should be active"); - H::propose(&mut proposer.handle, round, &block).await; - H::report_notarization( - &mut proposer.handle.mailbox, - H::make_notarization(proposal, state.schemes, QUORUM), - ) - .await; - } + for verifier_idx in verifier_indices.iter().copied() { + let verifier = state.validators[verifier_idx] + .as_mut() + .expect("verifier should be active"); + H::verify(&mut verifier.handle, round, &block, &mut []).await; + } - for verifier_idx in verifier_indices.iter().copied() { - let verifier = state.validators[verifier_idx] - .as_mut() - .expect("verifier should be active"); - H::verify(&mut verifier.handle, round, &block, &mut []).await; - } + PendingHailstormHeight { + height, + expected_digest, + finalization, + next_parent: expected_digest, + next_parent_commitment: H::commitment(&block), + } +} - context.sleep(state.propagation_delay).await; +/// Reports the finalization for a previously-driven pending height to every +/// currently-active validator, waits for them to reach the height, and updates +/// the canonical chain. 
+async fn finalize_hailstorm_height( + pending: PendingHailstormHeight, + context: &mut deterministic::Context, + state: &mut HailstormState<'_, H>, +) { + let PendingHailstormHeight { + height, + expected_digest, + finalization, + next_parent, + next_parent_commitment, + } = pending; + + for idx in active_validator_indices(state.validators) { + let validator = state.validators[idx] + .as_mut() + .expect("validator should remain active"); + H::report_finalization(&mut validator.handle.mailbox, finalization.clone()).await; + } - for idx in active_validator_indices(state.validators) { - let validator = state.validators[idx] - .as_mut() - .expect("validator should remain active"); - H::report_finalization(&mut validator.handle.mailbox, finalization.clone()).await; - } + state + .canonical + .push((height, expected_digest, finalization)); + *state.parent = next_parent; + *state.parent_commitment = next_parent_commitment; + + let (_, _, expected_finalization) = state + .canonical + .last() + .expect("canonical chain should contain the new height"); + for idx in active_validator_indices(state.validators) { + let validator = state.validators[idx] + .as_ref() + .expect("validator should be active"); + wait_for_validator_height( + context, + validator, + height, + expected_digest, + expected_finalization, + &format!("validator_{idx}"), + ) + .await; + } +} - state - .canonical - .push((height, expected_digest, finalization)); - *state.parent = expected_digest; - *state.parent_commitment = H::commitment(&block); - - let (_, _, expected_finalization) = state - .canonical - .last() - .expect("canonical chain should contain the new height"); - for idx in active_validator_indices(state.validators) { - let validator = state.validators[idx] - .as_ref() - .expect("validator should be active"); - wait_for_validator_height( - context, - validator, - height, - expected_digest, - expected_finalization, - &format!("validator_{idx}"), - ) - .await; - } +async fn advance_hailstorm_to( + 
target: u64, + context: &mut deterministic::Context, + state: &mut HailstormState<'_, H>, +) { + for height_value in (state.canonical.len() as u64 + 1)..=target { + let pending = drive_hailstorm_height_up_to_verify(height_value, context, state).await; + finalize_hailstorm_height(pending, context, state).await; } assert_active_validators_match_canonical(state.validators, state.canonical).await; @@ -614,7 +658,6 @@ pub fn hailstorm( let mut oracle = setup_network_with_participants(context.clone(), NZUsize!(3), participants.clone()) .await; - let propagation_delay = link.latency; setup_network_links(&mut oracle, &participants, link.clone()).await; let mut validators = Vec::new(); @@ -646,27 +689,62 @@ pub fn hailstorm( for shutdown_idx in 0..shutdowns { let leadup = context.gen_range(1..=max_interval); target_height += leadup; - let mut state = HailstormState { - validators: &mut validators, - canonical: &mut canonical, - parent: &mut parent, - parent_commitment: &mut parent_commitment, - participants: &participants, - schemes: &schemes, - propagation_delay, - }; - advance_hailstorm_to(target_height, &mut context, &mut state).await; - let active = active_validator_indices(&validators); - let down_limit = usize::min(max_down, active.len().saturating_sub(1)); - let down_count = down_limit.max(1); - let down_count = context.gen_range(1..=down_count); - let mut selected = active + // Pick validators to crash and compute how far the advance should + // run before aborting them. `crash_after == leadup` fires the + // crash after every new height has fully finalized; any smaller + // value lands mid-cycle, after `verified` / `certified` have + // returned for the post-crash height but before finalization is + // reported for it. 
+ let active_pre = active_validator_indices(&validators); + let down_limit = usize::min(max_down, active_pre.len().saturating_sub(1)); + let down_count = context.gen_range(1..=down_limit.max(1)); + let mut selected = active_pre .iter() .copied() .choose_multiple(&mut context, down_count); selected.sort_unstable(); - let persisted_height = target_height; + let crash_after = context.gen_range(0..=leadup); + let persisted_height = target_height - leadup + crash_after; + + { + let mut state = HailstormState { + validators: &mut validators, + canonical: &mut canonical, + parent: &mut parent, + parent_commitment: &mut parent_commitment, + participants: &participants, + schemes: &schemes, + }; + advance_hailstorm_to(persisted_height, &mut context, &mut state).await; + } + + // Crash mid-advance: drive propose + verify for the next height + // and abort the selected validators before reporting + // finalization. If `crash_after == leadup` there is no next height + // to drive (`pending` is `None`), so the crash happens after the + // last height's finalization instead. 
+ let pending = if persisted_height < target_height { + let mut state = HailstormState { + validators: &mut validators, + canonical: &mut canonical, + parent: &mut parent, + parent_commitment: &mut parent_commitment, + participants: &participants, + schemes: &schemes, + }; + Some( + drive_hailstorm_height_up_to_verify( + persisted_height + 1, + &mut context, + &mut state, + ) + .await, + ) + } else { + None + }; + for idx in selected.iter().copied() { let crashed = validators[idx] .take() @@ -674,6 +752,19 @@ pub fn hailstorm( crashed.actor_handle.abort(); let _ = crashed.actor_handle.await; } + + if let Some(pending) = pending { + let mut state = HailstormState { + validators: &mut validators, + canonical: &mut canonical, + parent: &mut parent, + parent_commitment: &mut parent_commitment, + participants: &participants, + schemes: &schemes, + }; + finalize_hailstorm_height(pending, &mut context, &mut state).await; + } + info!( seed, shutdown_idx, @@ -681,6 +772,7 @@ pub fn hailstorm( down_count, persisted_height, leadup, + crash_after, "marshal hailstorm shutdown" ); @@ -693,7 +785,6 @@ pub fn hailstorm( parent_commitment: &mut parent_commitment, participants: &participants, schemes: &schemes, - propagation_delay, }; advance_hailstorm_to(target_height, &mut context, &mut state).await; @@ -833,11 +924,29 @@ pub fn proposed_success_implies_recoverable_after_restart( provider.clone(), ) .await; + let recovered = + restarted + .mailbox + .get_verified(round) + .await + .unwrap_or_else(|| { + panic!( + "marshal.verified() returning true must imply \ + get_verified(round) recovers the block after restart \ + (seed={seed}, cycle={cycle})" + ) + }); + assert_eq!( + recovered.digest(), + digest, + "get_verified(round) must return the proposed block \ + (seed={seed}, cycle={cycle})" + ); assert!( - restarted.mailbox.get_block(&digest).await.is_some(), - "marshal.verified() returning true must imply the block is recoverable \ - after restart (seed={seed}, cycle={cycle})" - ); 
+ restarted.mailbox.get_block(&digest).await.is_some(), + "get_block(&digest) must also recover the proposed block \ + (seed={seed}, cycle={cycle})" + ); } }); checkpoint = next_checkpoint; @@ -922,11 +1031,138 @@ pub fn verified_success_implies_recoverable_after_restart( provider.clone(), ) .await; + let recovered = + restarted + .mailbox + .get_verified(round) + .await + .unwrap_or_else(|| { + panic!( + "marshal.verified() returning true must imply \ + get_verified(round) recovers the block after restart \ + (seed={seed}, cycle={cycle})" + ) + }); + assert_eq!( + recovered.digest(), + digest, + "get_verified(round) must return the verified block \ + (seed={seed}, cycle={cycle})" + ); assert!( - restarted.mailbox.get_block(&digest).await.is_some(), - "marshal.verified() returning true must imply the block is recoverable \ - after restart (seed={seed}, cycle={cycle})" - ); + restarted.mailbox.get_block(&digest).await.is_some(), + "get_block(&digest) must also recover the verified block \ + (seed={seed}, cycle={cycle})" + ); + } + }); + checkpoint = next_checkpoint; + } + } +} + +/// Contract: `marshal.certified(...)=true` means the block survives an +/// immediate crash and repeated recoveries. +/// +/// Complements [`verified_success_implies_recoverable_after_restart`] by +/// exercising the `Message::Certified -> cache_block -> put_sync` handshake. +/// A regression that acked before syncing the notarized cache would surface +/// here as a missing block after restart. +pub fn certified_success_implies_recoverable_after_restart( + seeds: impl IntoIterator, +) { + for seed in seeds { + let Fixture { + participants, + schemes, + .. 
+ } = bls12381_threshold_vrf::fixture::( + &mut test_rng_seeded(seed), + NAMESPACE, + NUM_VALIDATORS, + ); + + let me = participants[0].clone(); + let provider = ConstantProvider::new(schemes[0].clone()); + let round = Round::new(Epoch::zero(), View::new(1)); + let block = H::make_test_block( + Sha256::hash(b""), + H::genesis_parent_commitment(NUM_VALIDATORS as u16), + Height::new(1), + 100, + NUM_VALIDATORS as u16, + ); + let digest = H::digest(&block); + let recovery_cycles = restart_cycles_for_seed(seed); + + let (_, mut checkpoint) = contract_runner(seed).start_and_recover({ + let participants = participants.clone(); + let me = me.clone(); + let provider = provider.clone(); + let block = block.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let setup = H::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + provider.clone(), + ) + .await; + let mut handle = ValidatorHandle:: { + mailbox: setup.mailbox, + extra: setup.extra, + }; + assert!( + H::certify(&mut handle, round, &block).await, + "certify must ack" + ); + } + }); + + for cycle in 0..recovery_cycles { + let ((), next_checkpoint) = + deterministic::Runner::from(checkpoint).start_and_recover({ + let participants = participants.clone(); + let me = me.clone(); + let provider = provider.clone(); + move |context| async move { + let mut oracle = setup_network_with_participants( + context.clone(), + NZUsize!(1), + participants.clone(), + ) + .await; + let restarted = H::setup_validator( + context.with_label(&format!("validator_0_restart_{cycle}")), + &mut oracle, + me.clone(), + provider.clone(), + ) + .await; + let recovered = + restarted + .mailbox + .get_block(&digest) + .await + .unwrap_or_else(|| { + panic!( + "marshal.certified() returning true must imply \ + get_block(&digest) recovers the block after restart \ + (seed={seed}, cycle={cycle})" + ) + }); + 
assert_eq!( + recovered.digest(), + digest, + "get_block(&digest) must return the certified block \ + (seed={seed}, cycle={cycle})" + ); } }); checkpoint = next_checkpoint; diff --git a/consensus/src/marshal/standard/deferred.rs b/consensus/src/marshal/standard/deferred.rs index 857819a2bdb..d9b29d067aa 100644 --- a/consensus/src/marshal/standard/deferred.rs +++ b/consensus/src/marshal/standard/deferred.rs @@ -308,13 +308,30 @@ where .with_label("propose") .with_attribute("round", consensus_context.round) .spawn(move |runtime_context| async move { - // On leader recovery, marshal may already hold a verified - // block for this round (persisted by a pre-crash propose - // whose notarize vote never reached the journal). Building - // a fresh block would land on the same view index in the - // prunable archive and be silently dropped, so reuse the - // stored block instead. + // On leader recovery, marshal may already hold a verified block + // for this round (persisted by a pre-crash propose whose + // notarize vote never reached the journal). + // + // Building a fresh block would land on the same prunable archive + // index and be silently dropped, so the stored block is the only proposal + // we can broadcast for this round. + // + // The recovered block is safe to reuse only if its embedded + // context matches the context simplex just recovered. Otherwise the + // cached block was built against a different parent and cannot be + // broadcast under the current header, so drop the receiver + // and let the voter nullify the view via timeout. 
if let Some(block) = marshal.get_verified(consensus_context.round).await { + let block_context = block.context(); + if block_context != consensus_context { + debug!( + round = ?consensus_context.round, + ?consensus_context, + ?block_context, + "skipping proposal: cached verified block context no longer matches" + ); + return; + } let digest = block.digest(); let success = tx.send_lossy(digest); debug!( @@ -1157,4 +1174,72 @@ mod tests { ); }); } + + /// Regression: if a pre-crash leader persisted a verified block for a + /// round but the simplex `Notarize` never reached the journal, replay + /// can recover a `consensus_context` whose parent differs from the one + /// the cached block was built against (e.g. a late certification of an + /// older view changes the parent selected by `State::find_parent`). + /// In that case the restarted leader must not broadcast the stale + /// cached block; it must drop the receiver so the voter nullifies the + /// view via `MissingProposal`. + #[test_traced("WARN")] + fn test_propose_skips_when_verified_block_context_changed() { + let runner = deterministic::Runner::timed(Duration::from_secs(30)); + runner.start(|mut context| async move { + let Fixture { + participants, + schemes, + .. + } = bls12381_threshold_vrf::fixture::(&mut context, NAMESPACE, NUM_VALIDATORS); + let mut oracle = + setup_network_with_participants(context.clone(), NZUsize!(1), participants.clone()) + .await; + + let me = participants[0].clone(); + let setup = StandardHarness::setup_validator( + context.with_label("validator_0"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let marshal = setup.mailbox; + + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + + // Stash a stale block built against genesis as its parent at round V=2. 
+ let round = Round::new(Epoch::zero(), View::new(2)); + let stale_ctx = Ctx { + round, + leader: me.clone(), + parent: (View::zero(), genesis.digest()), + }; + let stale_block = B::new::(stale_ctx, genesis.digest(), Height::new(1), 100); + assert!(marshal.verified(round, stale_block).await); + + // Simulate a replay where parent selection now points to a + // different parent view than the cached block was built for. + let new_parent_digest = Sha256::hash(b"late-certified-parent"); + let new_ctx = Ctx { + round, + leader: me.clone(), + parent: (View::new(1), new_parent_digest), + }; + + let mock_app: MockVerifyingApp = MockVerifyingApp::new(genesis.clone()); + let mut marshaled = Deferred::new( + context.clone(), + mock_app, + marshal.clone(), + FixedEpocher::new(BLOCKS_PER_EPOCH), + ); + + let digest_rx = marshaled.propose(new_ctx).await; + assert!( + digest_rx.await.is_err(), + "propose must drop the receiver when the cached block's context no longer matches" + ); + }); + } } diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index ec89db53e14..b5c258b9500 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -239,20 +239,24 @@ where .with_label("propose") .with_attribute("round", consensus_context.round) .spawn(move |runtime_context| async move { - // On leader recovery, marshal may already hold a verified - // block for this round (persisted by a pre-crash propose - // whose notarize vote never reached the journal). Building - // a fresh block would land on the same view index in the - // prunable archive and be silently dropped, so reuse the - // stored block instead. 
- if let Some(block) = marshal.get_verified(consensus_context.round).await { - let digest = block.digest(); - let success = tx.send_lossy(digest); + // On leader recovery, marshal may already hold a verified block + // for this round (persisted by a pre-crash propose whose + // notarize vote never reached the journal). + // + // The parent context recovered by simplex may differ from the one + // the cached block was built against, so the stored block is not safe to reuse + // and building a fresh block would land on the same prunable + // archive index and be silently dropped. + // + // Skip this view and let the voter nullify it via timeout. + if marshal + .get_verified(consensus_context.round) + .await + .is_some() + { debug!( round = ?consensus_context.round, - ?digest, - success, - "reused verified block from marshal on leader recovery" + "skipping proposal: verified block already exists for round on restart" ); return; } @@ -1168,16 +1172,15 @@ mod tests { /// Regression: if marshal persisted a verified block for a round before /// a crash (via a prior `propose` call) but the simplex notarize artifact - /// never reached the journal, a restarted leader must re-use the persisted - /// block. - /// - /// Otherwise the application is asked to build afresh, returns a new - /// block whose digest does not match the one marshal already stored - /// (the prunable archive silently drops the second write at the same - /// view index), the leader broadcasts a `Notarize` for a digest no peer - /// can serve, and the view stalls until timeout. + /// never reached the journal, the restarted leader must skip proposing + /// for that round. The cached block was built against a parent context + /// that replay may have changed, so reusing it can broadcast a proposal + /// whose payload no longer matches the recovered header. Building a + /// fresh block would also be unsafe because the prunable archive silently + /// drops the second write at the same view index. 
Dropping the receiver + /// lets the voter nullify the view via `MissingProposal`. #[test_traced("WARN")] - fn test_propose_reuses_verified_block_on_restart() { + fn test_propose_skips_when_verified_block_exists_on_restart() { let runner = deterministic::Runner::timed(Duration::from_secs(30)); runner.start(|mut context| async move { let Fixture { @@ -1199,8 +1202,6 @@ mod tests { .await; let marshal = setup.mailbox; - // Seed block A for round V=1 in marshal's verified cache as if - // the pre-crash leader had built and broadcasted it. let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); let round = Round::new(Epoch::zero(), View::new(1)); let ctx = Ctx { @@ -1208,18 +1209,13 @@ mod tests { leader: me.clone(), parent: (View::zero(), genesis.digest()), }; - let block_a = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 100); - let digest_a = block_a.digest(); - assert!(marshal.verified(round, block_a.clone()).await); + let stale_block = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 100); + assert!(marshal.verified(round, stale_block).await); - // After restart, the fresh application would build a different - // block for the same round (distinct timestamp -> distinct digest). 
- let block_b = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 200); - let digest_b = block_b.digest(); - assert_ne!(digest_a, digest_b, "test requires distinct digests"); + let fresh_block = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 200); let mock_app: MockVerifyingApp = - MockVerifyingApp::new(genesis.clone()).with_propose_result(block_b); + MockVerifyingApp::new(genesis.clone()).with_propose_result(fresh_block); let mut inline = Inline::new( context.clone(), mock_app, @@ -1228,10 +1224,9 @@ mod tests { ); let digest_rx = inline.propose(ctx).await; - let digest = digest_rx.await.expect("propose must return a digest"); - assert_eq!( - digest, digest_a, - "propose must reuse the block marshal already persisted for this round" + assert!( + digest_rx.await.is_err(), + "propose must drop the receiver so the voter nullifies the round via timeout" ); }); } diff --git a/consensus/src/marshal/standard/mod.rs b/consensus/src/marshal/standard/mod.rs index c0657e35376..01935371232 100644 --- a/consensus/src/marshal/standard/mod.rs +++ b/consensus/src/marshal/standard/mod.rs @@ -207,6 +207,12 @@ mod tests { harness::certify_persists_equivocated_block::(); } + #[test_traced("WARN")] + fn test_standard_certified_success_implies_recoverable_after_restart() { + harness::certified_success_implies_recoverable_after_restart::(0..16); + harness::certified_success_implies_recoverable_after_restart::(0..16); + } + #[test_traced("WARN")] fn test_standard_certify_at_later_view_survives_earlier_view_pruning() { harness::certify_at_later_view_survives_earlier_view_pruning::(); diff --git a/consensus/src/simplex/actors/voter/actor.rs b/consensus/src/simplex/actors/voter/actor.rs index 8a3ef3dfb4a..73ed8e9387a 100644 --- a/consensus/src/simplex/actors/voter/actor.rs +++ b/consensus/src/simplex/actors/voter/actor.rs @@ -795,6 +795,14 @@ impl< .await; } } + + // We deliberately avoid re-seeding the batcher with our + // own votes (or the votes of other peers) on replay. 
We assume that + // whatever view we were in during shutdown is no longer the latest + // and we'll quickly jump ahead to a new view. + // + // If this is not the case (cluster-wide shutdown), we will recover + // when timing out. } } self.journal = Some(journal); From d34a7e8b7f37187c839943ec129e6a01c593cb2b Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 20 Apr 2026 13:59:28 -0700 Subject: [PATCH 106/107] nits --- consensus/src/marshal/mocks/harness.rs | 47 +- consensus/src/marshal/standard/inline.rs | 49 +- consensus/src/simplex/actors/voter/mod.rs | 492 +++++++++++++++++++++ consensus/src/simplex/mocks/application.rs | 19 + 4 files changed, 578 insertions(+), 29 deletions(-) diff --git a/consensus/src/marshal/mocks/harness.rs b/consensus/src/marshal/mocks/harness.rs index c213fea7538..0749d7e060c 100644 --- a/consensus/src/marshal/mocks/harness.rs +++ b/consensus/src/marshal/mocks/harness.rs @@ -1216,11 +1216,15 @@ pub fn certify_at_later_view_survives_earlier_view_pruning() { ); let repeated_digest = H::digest(&repeated); - // Negative control: a verify-only block at the same early view. Because - // it is never certified, it lives solely in `verified_blocks[V=1]` and - // must disappear once retention pruning advances past V=1. Asserting it - // is gone confirms the prune actually fires at the expected floor, so - // the `repeated` survivor assertion below is genuinely load-bearing. + // Negative control: a verify-only block at a distinct early view. + // Placing `orphan` at V=2 (instead of V=1, where `repeated` already + // occupies the verified index) guarantees the write actually lands in + // `verified_blocks[V=2]` rather than being silently dropped as a + // duplicate index. Because it is never certified, it lives solely in + // that verified entry and must disappear once retention pruning + // advances past V=2. 
Asserting it is gone (after asserting it was + // present before pruning) confirms the prune actually fires at the + // expected floor. let orphan = H::make_test_block( Sha256::hash(b"orphan"), H::genesis_parent_commitment(NUM_VALIDATORS as u16), @@ -1231,9 +1235,11 @@ pub fn certify_at_later_view_survives_earlier_view_pruning() { let orphan_digest = H::digest(&orphan); // Verify `repeated` at V=1, then certify at V=25 (reproposal-style gap). - // The chain below starts at V=2 to avoid overwriting V=1 in the - // verified archive (which drops subsequent writes at an existing view). + // The chain below starts at V=3 to avoid overwriting V=1 (`repeated`) + // or V=2 (`orphan`) in the verified archive (which drops subsequent + // writes at an existing view). let v_early = Round::new(Epoch::zero(), View::new(1)); + let v_orphan = Round::new(Epoch::zero(), View::new(2)); let v_late = Round::new(Epoch::zero(), View::new(25)); let mut peers: [ValidatorHandle; 0] = []; H::verify(&mut handle, v_early, &repeated, &mut peers).await; @@ -1242,14 +1248,20 @@ pub fn certify_at_later_view_survives_earlier_view_pruning() { "certify must ack" ); - // Verify `orphan` at V=1 only (no certify). - H::verify(&mut handle, v_early, &orphan, &mut peers).await; + // Verify `orphan` at its own distinct view V=2 (no certify). + H::verify(&mut handle, v_orphan, &orphan, &mut peers).await; + assert!( + handle.mailbox.get_block(&orphan_digest).await.is_some(), + "negative control assumes `orphan` is present before pruning; \ + if it is not, the V=2 write was dropped and the post-prune \ + assertion would pass vacuously" + ); // Drive the finalized chain forward to advance `last_processed_round` - // past V=1's retention boundary but not past V=25's. With - // view_retention_timeout=10 and prunable_items_per_section=10, - // processing views 2..=22 leaves `oldest_allowed=12` in both prunable - // archives. V=1 is dropped, V=25 is retained. + // past V=2's retention boundary but not past V=25's. 
With + // view_retention_timeout=10 and prunable_items_per_section=10, the + // prune floor snaps down to the section boundary and evicts V=1 and + // V=2 while leaving V=25 intact. const CHAIN_LEN: u64 = 21; let mut parent = Sha256::hash(b""); let mut parent_commitment = H::genesis_parent_commitment(NUM_VALIDATORS as u16); @@ -1263,7 +1275,7 @@ pub fn certify_at_later_view_survives_earlier_view_pruning() { ); let digest = H::digest(&block); let commitment = H::commitment(&block); - let round = Round::new(Epoch::zero(), View::new(i + 1)); + let round = Round::new(Epoch::zero(), View::new(i + 2)); H::propose(&mut handle, round, &block).await; let proposal = Proposal { round, @@ -1280,11 +1292,12 @@ pub fn certify_at_later_view_survives_earlier_view_pruning() { } context.sleep(Duration::from_millis(100)).await; - // Negative control: the verify-only orphan at V=1 must be gone, which - // proves retention pruning actually evicted V=1 at the expected floor. + // Negative control: the verify-only orphan at V=2 must be gone, which + // proves retention pruning actually evicted the early-view entries at + // the expected floor. 
assert!( handle.mailbox.get_block(&orphan_digest).await.is_none(), - "verify-only block at V=1 must be evicted by retention pruning" + "verify-only block at V=2 must be evicted by retention pruning" ); // The repeated block must still be retrievable: verified_blocks[V=1] diff --git a/consensus/src/marshal/standard/inline.rs b/consensus/src/marshal/standard/inline.rs index b5c258b9500..bc1e6f63a64 100644 --- a/consensus/src/marshal/standard/inline.rs +++ b/consensus/src/marshal/standard/inline.rs @@ -1193,33 +1193,58 @@ mod tests { .await; let me = participants[0].clone(); - let setup = StandardHarness::setup_validator( + let round = Round::new(Epoch::zero(), View::new(1)); + let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); + let ctx = Ctx { + round, + leader: me.clone(), + parent: (View::zero(), genesis.digest()), + }; + + // Pre-crash: seed `verified_blocks[V=1]` through the live mailbox, + // mirroring an aborted pre-crash `Inline::propose` that persisted + // its verified block before the voter could journal a notarize. + let pre_setup = StandardHarness::setup_validator( context.with_label("validator_0"), &mut oracle, me.clone(), ConstantProvider::new(schemes[0].clone()), ) .await; - let marshal = setup.mailbox; + let pre_marshal = pre_setup.mailbox; + let pre_actor = pre_setup.actor_handle; + let pre_extra = pre_setup.extra; + let pre_application = pre_setup.application; - let genesis = make_raw_block(Sha256::hash(b""), Height::zero(), 0); - let round = Round::new(Epoch::zero(), View::new(1)); - let ctx = Ctx { - round, - leader: me.clone(), - parent: (View::zero(), genesis.digest()), - }; let stale_block = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 100); - assert!(marshal.verified(round, stale_block).await); + assert!(pre_marshal.verified(round, stale_block).await); + + // Simulate a crash: abort the actor and drop every handle so the + // storage partition is fully released before reopening. 
+ pre_actor.abort(); + drop(pre_marshal); + drop(pre_extra); + drop(pre_application); + + // Post-crash: reopen the same partition. The verified block must + // be recovered from storage during archive restore so that + // `Message::GetVerified` on the new mailbox observes it. + let post_setup = StandardHarness::setup_validator( + context.with_label("validator_0_restart"), + &mut oracle, + me.clone(), + ConstantProvider::new(schemes[0].clone()), + ) + .await; + let post_marshal = post_setup.mailbox; let fresh_block = B::new::(ctx.clone(), genesis.digest(), Height::new(1), 200); - let mock_app: MockVerifyingApp = MockVerifyingApp::new(genesis.clone()).with_propose_result(fresh_block); let mut inline = Inline::new( context.clone(), mock_app, - marshal.clone(), + post_marshal.clone(), FixedEpocher::new(BLOCKS_PER_EPOCH), ); diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index 1d724cc1d58..325e5e36b39 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -4182,6 +4182,294 @@ mod tests { no_self_propose_or_verify_after_restart(secp256r1::fixture); } + /// Regression: a leader that crashes after calling `automaton.propose` but + /// before journaling its local `Notarize` must, on restart, issue at most a + /// single `automaton.propose` call for the leader-owned view and exit that + /// view via `Vote::Nullify` instead of retrying proposals through the live + /// run loop. + fn nullify_after_crash_in_propose_window(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"nullify_after_crash_in_propose_window".to_vec(); + let partition = "nullify_after_crash_in_propose_window".to_string(); + let executor = deterministic::Runner::timed(Duration::from_secs(30)); + executor.start(|mut context| async move { + // Set up the simulated network. 
+ let Fixture { + participants, + schemes, + .. + } = fixture(&mut context, &namespace, n); + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + + // RoundRobin with epoch=333, n=5: view 2 -> leader=Participant::new(0) = us. + let target_view = View::new(2); + let me = participants[0].clone(); + let elector = RoundRobin::::default(); + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + let relay = Arc::new(mocks::relay::Relay::new()); + + // Pre-crash: drop every propose response. The leader calls + // `automaton.propose`, the mock swallows the request, and nothing + // is journaled. An observer records that the pre-crash leader + // actually got as far as requesting a proposal so the test knows + // the abort happens inside the propose window rather than before + // the voter even became leader. + let pre_propose_calls: Arc>> = Arc::new(Mutex::new(Vec::new())); + let pre_propose_tracker = pre_propose_calls.clone(); + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Always, + }; + let (mut app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + // Stall (not drop) so the voter's receiver stays open indefinitely. + // Dropping the sender would fire `MissingProposal` and journal a + // `Nullify` before we can abort, which would in turn cause replay + // to skip the propose path entirely post-restart. 
+ app_actor.set_stall_proposals(true); + app_actor.set_propose_observer(Box::new(move |ctx| { + pre_propose_tracker.lock().push(ctx.view()); + })); + app_actor.start(); + + // Build and start the pre-crash voter. `leader_timeout` is long + // enough that the voter won't auto-nullify before we abort, + // guaranteeing the journal contains no `Nullify` either. + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector: elector.clone(), + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter: reporter.clone(), + partition: partition.clone(), + epoch: Epoch::new(333), + mailbox_size: 128, + leader_timeout: Duration::from_secs(600), + certification_timeout: Duration::from_secs(600), + timeout_retry: Duration::from_secs(600), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = Actor::new(context.with_label("voter"), voter_cfg); + let (resolver_sender, _) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + let handle = voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + // Wait for startup, then advance into the leader-owned view. + loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Update { response, .. 
} => { + response.send(None).unwrap(); + break; + } + batcher::Message::Constructed(_) => {} + } + } + advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; + + // Wait for the pre-crash voter to call `automaton.propose` for + // the leader-owned view. The observer fires before the mock parks + // the response sender, so seeing this entry confirms the voter + // entered the propose window and is now blocked on a response + // that will never arrive. Driving the runtime forward with a + // short `context.sleep` lets the voter and application tasks + // progress to their next await points without consuming batcher + // messages we still need for later assertions. + for _ in 0..100 { + if pre_propose_calls.lock().iter().any(|v| *v == target_view) { + break; + } + context.sleep(Duration::from_millis(10)).await; + } + assert!( + pre_propose_calls.lock().iter().any(|v| *v == target_view), + "pre-crash voter must reach the propose window for the leader-owned view" + ); + + // Crash: abort the voter. Because `propose` never returned, no + // `Notarize` (or any other artifact for the target view) reached + // the journal. + handle.abort(); + + // Post-restart: install a fresh application that also drops + // `propose` responses. This mirrors the marshal's post-restart + // behavior when `get_verified` sees a cached block for the round + // and deliberately drops the tx, forcing the voter to nullify + // the view rather than reuse the stale block. A propose observer + // on this application is the assertion anchor: it must record + // exactly one call for the target view. 
+ let post_propose_calls: Arc>> = Arc::new(Mutex::new(Vec::new())); + let post_propose_tracker = post_propose_calls.clone(); + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Always, + }; + let (mut app_actor, application) = + mocks::application::Application::new(context.with_label("app_restarted"), app_cfg); + app_actor.set_drop_proposals(true); + app_actor.set_propose_observer(Box::new(move |ctx| { + post_propose_tracker.lock().push(ctx.view()); + })); + app_actor.start(); + + // Build and start the post-restart voter on the same partition + // with a short `leader_timeout` so the nullify path fires promptly + // once the restarted voter has had a chance to issue its single + // (dropped) propose request. + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter, + partition, + epoch: Epoch::new(333), + mailbox_size: 128, + leader_timeout: Duration::from_millis(500), + certification_timeout: Duration::from_secs(600), + timeout_retry: Duration::from_secs(600), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, _mailbox) = Actor::new(context.with_label("voter_restarted"), voter_cfg); + let (resolver_sender, _) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(2, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(3, TEST_QUOTA) + .await + .unwrap(); + voter.start( + batcher::Mailbox::new(batcher_sender), + 
resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + // Wait for replay to complete and confirm we re-entered the + // leader-owned target view. Journal replay saw no notarize for + // this view, so the slot starts empty and the voter will call + // `automaton.propose` from scratch. + loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Update { + current, + leader, + response, + .. + } => { + response.send(None).unwrap(); + assert_eq!(current, target_view); + assert_eq!(leader, Participant::new(0)); + break; + } + batcher::Message::Constructed(_) => {} + } + } + + // Wait for the leader-timeout nullify. This also proves the + // run loop stayed responsive after the dropped propose request: + // the voter did not livelock trying to re-propose, it reached the + // timeout path and emitted the nullify vote. + loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == target_view => + { + break; + } + batcher::Message::Constructed(Vote::Notarize(notarize)) + if notarize.view() == target_view => + { + panic!( + "restarted voter must not emit a new Notarize for the \ + leader-owned view; its stale verified block could \ + still be cached in marshal" + ); + } + batcher::Message::Update { response, .. } => response.send(None).unwrap(), + batcher::Message::Constructed(_) => {} + } + } + + // Assert the core restart invariant: the restarted voter issued + // `automaton.propose` at most once for the target view and then + // nullified instead of retrying. 
+ let proposed = post_propose_calls.lock(); + let target_call_count = proposed.iter().filter(|v| **v == target_view).count(); + assert_eq!( + target_call_count, 1, + "restarted voter must call automaton.propose exactly once for the \ + leader-owned view before nullifying (observed: {proposed:?})" + ); + }); + } + + #[test_traced] + fn test_nullify_after_crash_in_propose_window() { + nullify_after_crash_in_propose_window(bls12381_threshold_vrf::fixture::); + nullify_after_crash_in_propose_window(bls12381_threshold_vrf::fixture::); + nullify_after_crash_in_propose_window(bls12381_multisig::fixture::); + nullify_after_crash_in_propose_window(bls12381_multisig::fixture::); + nullify_after_crash_in_propose_window(ed25519::fixture); + nullify_after_crash_in_propose_window(secp256r1::fixture); + } + /// After restart, a proposal we already voted on must not be re-verified /// when it is re-delivered to the voter (e.g. via the automaton after /// peer vote aggregation reconstructs it). @@ -4903,6 +5191,210 @@ mod tests { no_self_certify_after_restart(secp256r1::fixture); } + /// Regression: when an elected leader receives an external notarization + /// for a proposal it did *not* build locally, it must invoke + /// `automaton.certify` before finalizing the view. The + /// `is_local=true` shortcut in `actor::run` must only short-circuit when + /// the slot carries explicit local proposal evidence; an + /// externally-recovered proposal on a leader-owned view produces + /// `is_local=false`, which requires consulting the automaton. 
+ fn certify_observer_fires_for_external_leader_proposal(mut fixture: F) + where + S: Scheme, + F: FnMut(&mut deterministic::Context, &[u8], u32) -> Fixture, + { + let n = 5; + let quorum = quorum(n); + let namespace = b"certify_observer_fires_for_external_leader_proposal".to_vec(); + let partition = "certify_observer_fires_for_external_leader_proposal".to_string(); + let executor = deterministic::Runner::timed(Duration::from_secs(20)); + executor.start(|mut context| async move { + // Set up the simulated network. + let Fixture { + participants, + schemes, + .. + } = fixture(&mut context, &namespace, n); + let oracle = + start_test_network_with_peers(context.clone(), participants.clone(), true).await; + + // RoundRobin with epoch=333, n=5: view 2 -> leader=Participant::new(0) = us. + let target_view = View::new(2); + let target_epoch = Epoch::new(333); + let me = participants[0].clone(); + let elector = RoundRobin::::default(); + let reporter_cfg = mocks::reporter::Config { + participants: participants.clone().try_into().unwrap(), + scheme: schemes[0].clone(), + elector: elector.clone(), + }; + let reporter = + mocks::reporter::Reporter::new(context.with_label("reporter"), reporter_cfg); + let relay = Arc::new(mocks::relay::Relay::new()); + + // Stall the propose response so the slot is never populated with + // a locally-built proposal. The slot stays empty (proposal=None, + // status=None) while the voter's internal flag `requested_build` + // is true, exactly the state in which an externally-recovered + // proposal lands with `is_local=false` at the leader. + // + // The certify observer records every `automaton.certify` call so + // the final assertion can confirm the `is_local=false` code path + // ran instead of being short-circuited. 
+ let certify_calls: Arc>> = Arc::new(Mutex::new(Vec::new())); + let certify_tracker = certify_calls.clone(); + let app_cfg = mocks::application::Config { + hasher: Sha256::default(), + relay: relay.clone(), + me: me.clone(), + propose_latency: (1.0, 0.0), + verify_latency: (1.0, 0.0), + certify_latency: (1.0, 0.0), + should_certify: mocks::application::Certifier::Custom(Box::new(move |round, _| { + certify_tracker.lock().push(round.view()); + true + })), + }; + let (mut app_actor, application) = + mocks::application::Application::new(context.with_label("app"), app_cfg); + app_actor.set_stall_proposals(true); + app_actor.start(); + + // Build and start the voter. Use long `leader_timeout` so the + // stalled proposal does not trigger a nullify before the + // conflicting notarization reaches the voter. + let voter_cfg = Config { + scheme: schemes[0].clone(), + elector, + blocker: oracle.control(me.clone()), + automaton: application.clone(), + relay: application.clone(), + reporter, + partition, + epoch: target_epoch, + mailbox_size: 128, + leader_timeout: Duration::from_secs(600), + certification_timeout: Duration::from_secs(600), + timeout_retry: Duration::from_secs(600), + activity_timeout: ViewDelta::new(10), + replay_buffer: NZUsize!(1024 * 1024), + write_buffer: NZUsize!(1024 * 1024), + page_cache: CacheRef::from_pooler(&context, PAGE_SIZE, PAGE_CACHE_SIZE), + }; + let (voter, mut mailbox) = Actor::new(context.with_label("voter"), voter_cfg); + let (resolver_sender, _) = mpsc::channel(8); + let (batcher_sender, mut batcher_receiver) = mpsc::channel(8); + let (vote_sender, _) = oracle + .control(me.clone()) + .register(0, TEST_QUOTA) + .await + .unwrap(); + let (cert_sender, _) = oracle + .control(me.clone()) + .register(1, TEST_QUOTA) + .await + .unwrap(); + voter.start( + batcher::Mailbox::new(batcher_sender), + resolver::Mailbox::new(resolver_sender), + vote_sender, + cert_sender, + ); + + // Wait for startup, then advance into the leader-owned view. 
+ loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Update { response, .. } => { + response.send(None).unwrap(); + break; + } + batcher::Message::Constructed(_) => {} + } + } + advance_to_view( + &mut mailbox, + &mut batcher_receiver, + &schemes, + quorum, + target_view, + ) + .await; + + // Craft a proposal the voter could not have built locally + // (distinct payload) and build its notarization from all validator + // schemes. The notarization is well-formed; quorum-worth of signers + // cover the proposal so it will pass `add_notarization`. + let foreign_payload = Sha256::hash(b"foreign_leader_owned_proposal"); + let foreign_proposal = Proposal::new( + Round::new(target_epoch, target_view), + target_view.previous().unwrap_or(View::zero()), + foreign_payload, + ); + let (_, foreign_notarization) = build_notarization(&schemes, &foreign_proposal, quorum); + + // Deliver the foreign notarization. This seeds the voter's slot + // with a proposal it never built, producing `is_local=false` on + // the certification candidate. + mailbox + .resolved(Certificate::Notarization(foreign_notarization)) + .await; + + // Wait for a `Finalize` on the leader-owned view. Observing + // finalize proves the certify callback both fired and resolved + // successfully. Any `Nullify` here would mean the voter never + // reached the certification branch (for example because + // `is_local=true` incorrectly short-circuited it). + loop { + match batcher_receiver.recv().await.unwrap() { + batcher::Message::Constructed(Vote::Finalize(finalize)) + if finalize.view() == target_view => + { + assert_eq!(finalize.proposal, foreign_proposal); + break; + } + batcher::Message::Constructed(Vote::Nullify(nullify)) + if nullify.view() == target_view => + { + panic!( + "leader-owned view with an externally-recovered proposal \ + must certify via the automaton instead of nullifying \ + view {target_view}" + ); + } + batcher::Message::Update { response, .. 
} => response.send(None).unwrap(), + batcher::Message::Constructed(_) => {} + } + } + + // Assert the `is_local=false` invariant: the certify callback + // fired for the leader-owned view. Without the fix under test, + // a `leader == me`-only shortcut would skip the call and this + // assertion would fail. + let certified = certify_calls.lock(); + assert!( + certified.contains(&target_view), + "voter must invoke automaton.certify for an externally-recovered \ + leader-owned proposal (observed: {certified:?})" + ); + }); + } + + #[test_traced] + fn test_certify_observer_fires_for_external_leader_proposal() { + certify_observer_fires_for_external_leader_proposal( + bls12381_threshold_vrf::fixture::, + ); + certify_observer_fires_for_external_leader_proposal( + bls12381_threshold_vrf::fixture::, + ); + certify_observer_fires_for_external_leader_proposal(bls12381_multisig::fixture::); + certify_observer_fires_for_external_leader_proposal( + bls12381_multisig::fixture::, + ); + certify_observer_fires_for_external_leader_proposal(ed25519::fixture); + certify_observer_fires_for_external_leader_proposal(secp256r1::fixture); + } + /// Test that in-flight certification requests are cancelled when finalization occurs. /// /// 1. Use a very long certify latency to ensure certification is in-flight. diff --git a/consensus/src/simplex/mocks/application.rs b/consensus/src/simplex/mocks/application.rs index 4648950ca75..76e292d33c8 100644 --- a/consensus/src/simplex/mocks/application.rs +++ b/consensus/src/simplex/mocks/application.rs @@ -186,6 +186,7 @@ pub struct Application { fail_verification: bool, drop_proposals: bool, + stall_proposals: bool, drop_verifications: bool, should_certify: Certifier, @@ -202,6 +203,10 @@ pub struct Application { /// of a leader-owned proposal). verify_observer: Option>, + /// Senders held alive to simulate proposals that hang indefinitely + /// (used when `stall_proposals` is set). 
+ pending_proposes: Vec>, + /// Senders held alive to simulate certifications that hang indefinitely /// (used by [`Certifier::Pending`]). pending_certifications: Vec>, @@ -236,6 +241,7 @@ impl Application fail_verification: false, drop_proposals: false, + stall_proposals: false, drop_verifications: false, should_certify: cfg.should_certify, @@ -243,6 +249,7 @@ impl Application verified: HashSet::new(), propose_observer: None, verify_observer: None, + pending_proposes: Vec::new(), pending_certifications: Vec::new(), }, Mailbox::new(sender), @@ -257,6 +264,14 @@ impl Application self.drop_proposals = drop; } + /// When set, `Message::Propose` requests are held open indefinitely: the + /// response sender is parked in `pending_proposes`, keeping the oneshot + /// alive so the caller's `receiver` never resolves. This simulates a + /// propose that is still in flight at the moment the voter crashes. + pub const fn set_stall_proposals(&mut self, stall: bool) { + self.stall_proposals = stall; + } + pub const fn set_drop_verifications(&mut self, drop: bool) { self.drop_verifications = drop; } @@ -416,6 +431,10 @@ impl Application if let Some(observer) = &self.propose_observer { observer(context.clone()); } + if self.stall_proposals { + self.pending_proposes.push(response); + continue; + } if self.drop_proposals { continue; } From 8ae64d5bef95a5cb29c1123ace8b52b5e51de157 Mon Sep 17 00:00:00 2001 From: Patrick O'Grady Date: Mon, 20 Apr 2026 14:05:53 -0700 Subject: [PATCH 107/107] lint --- consensus/src/simplex/actors/voter/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index 325e5e36b39..fa5339b26de 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -4318,13 +4318,13 @@ mod tests { // progress to their next await points without consuming batcher // messages we still need for later assertions. 
for _ in 0..100 { - if pre_propose_calls.lock().iter().any(|v| *v == target_view) { + if pre_propose_calls.lock().contains(&target_view) { break; } context.sleep(Duration::from_millis(10)).await; } assert!( - pre_propose_calls.lock().iter().any(|v| *v == target_view), + pre_propose_calls.lock().contains(&target_view), "pre-crash voter must reach the propose window for the leader-owned view" );