Skip to content

Commit 5062787

Browse files
authored
Fix FP panics in marshal and storage (#4042)
1 parent 5f4ce4f commit 5062787

3 files changed

Lines changed: 60 additions & 18 deletions

File tree

consensus/fuzz/src/marshal/single_node/invariant.rs

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,20 +24,23 @@ pub fn check_all<H: TestHarness>(
2424
application_delivered: &[(Height, D)],
2525
canonical: &[H::TestBlock],
2626
) {
27-
check_ready_prefix_delivered(ready_prefix, delivery_log);
27+
check_ready_prefix_delivered(ready_prefix, application_delivered);
2828
check_segment_ordering(segment_bounds, segment_starts, delivery_log);
2929
check_redelivery_after_restart(expected_redeliveries, segment_bounds, delivery_log);
3030
check_digest_fidelity::<H>(application_delivered, canonical);
3131
}
3232

3333
/// Invariant: ready-prefix delivery.
3434
///
35-
/// Every height in `1..=ready_prefix` must appear at least once in
36-
/// `delivery_log`. The driver advances `ready_prefix` only when mirrored
37-
/// repair makes marshal's finalized archive contiguous from height 1, which is
38-
/// precisely when marshal is obliged to deliver the prefix.
39-
pub fn check_ready_prefix_delivered(ready_prefix: u64, delivery_log: &[Height]) {
40-
let delivered_set: BTreeSet<u64> = delivery_log.iter().map(|h| h.get()).collect();
35+
/// Every height in `1..=ready_prefix` must appear at least once in the
36+
/// application's delivery record. The driver advances `ready_prefix` only when
37+
/// mirrored repair makes marshal's finalized archive contiguous from height 1,
38+
/// which is precisely when marshal is obliged to deliver the prefix. This
39+
/// checks `application_delivered` (the ground truth of what reached the
40+
/// application) rather than the ack-derived `delivery_log`, which omits stale
41+
/// redeliveries skipped at restart boundaries.
42+
pub fn check_ready_prefix_delivered(ready_prefix: u64, application_delivered: &[(Height, D)]) {
43+
let delivered_set: BTreeSet<u64> = application_delivered.iter().map(|(h, _)| h.get()).collect();
4144
for h in 1..=ready_prefix {
4245
assert!(
4346
delivered_set.contains(&h),

consensus/fuzz/src/marshal/single_node/runner.rs

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,13 @@ where
347347
let mut durable_available: HashSet<u64> = HashSet::new();
348348
let mut variant_available: HashSet<u64> = HashSet::new();
349349
let mut finalized_available: HashSet<u64> = HashSet::new();
350+
// Heights for which marshal holds a live buffer waiter, created by a
351+
// Subscribe for a block it does not yet have. Only such a waiter makes a
352+
// later PublishViaVariant actor-observable: it fires the subscription,
353+
// marshal ingests the block, and a pending floor anchored on it
354+
// completes. Without a waiter a publish only populates the variant cache
355+
// and marshal never re-checks it. Cleared on Restart with the actor.
356+
let mut subscribed_heights: HashSet<u64> = HashSet::new();
350357
// Publish order backing `variant_available`'s FIFO eviction.
351358
let mut variant_order: std::collections::VecDeque<u64> = std::collections::VecDeque::new();
352359
// ready_prefix is monotone non-decreasing. It advances when the
@@ -662,6 +669,11 @@ where
662669
let block = &canonical[block_index(block_idx)];
663670
let height = H::height(block);
664671
let round = round_for_height(height);
672+
// Marshal registers a buffer waiter only when it lacks the
673+
// block; a hit answers the subscription inline without one.
674+
if !block_available(&durable_available, &variant_available, height.get()) {
675+
subscribed_heights.insert(height.get());
676+
}
665677
if by_commitment {
666678
subscriptions.push(handle.mailbox.subscribe_by_commitment(
667679
H::commitment(block),
@@ -843,6 +855,11 @@ where
843855
height: fresh_height,
844856
},
845857
));
858+
// The live (non-closed) subscriptions above leave marshal a
859+
// waiter for the fresh block when it is missing locally.
860+
if !block_available(&durable_available, &variant_available, fresh_height.get()) {
861+
subscribed_heights.insert(fresh_height.get());
862+
}
846863
handle.mailbox.hint_notarized(fresh_round, fresh_commitment);
847864

848865
// Only report consensus certificates for a locally missing
@@ -971,17 +988,28 @@ where
971988
}
972989
}
973990
}
974-
// Marshal holds a buffer subscription for a pending
975-
// floor anchor; the publish completes it and marshal
976-
// ingests the block as the durable floor anchor.
977-
repair_wake |= apply_pending_floor(
978-
&mut pending_floor,
979-
height,
980-
&mut durable_available,
981-
&mut finalized_available,
982-
&mut processed_height,
983-
&mut segment_starts,
984-
);
991+
// A publish only completes a pending floor anchor when
992+
// marshal holds a live waiter for this block (from a
993+
// prior Subscribe): the publish fires the subscription
994+
// and marshal ingests it. Without a waiter the block
995+
// only lands in the variant cache, which marshal does
996+
// not re-check, so the floor stays parked on the
997+
// resolver. This is independent of the variant.
998+
//
999+
// Marshal subscriptions are one-shot: ingest notifies and
1000+
// removes them. Consume the waiter here unconditionally so
1001+
// a later republish (after the block is FIFO-evicted) does
1002+
// not wake a floor that real marshal can no longer serve.
1003+
if subscribed_heights.remove(&h) {
1004+
repair_wake |= apply_pending_floor(
1005+
&mut pending_floor,
1006+
height,
1007+
&mut durable_available,
1008+
&mut finalized_available,
1009+
&mut processed_height,
1010+
&mut segment_starts,
1011+
);
1012+
}
9851013
}
9861014
}
9871015
MarshalEvent::AckNext => {
@@ -1070,6 +1098,9 @@ where
10701098
// visible to marshal.
10711099
variant_available.clear();
10721100
variant_order.clear();
1101+
// The new actor starts with no buffer waiters; any
1102+
// subscription registered by the prior instance died with it.
1103+
subscribed_heights.clear();
10731104
// Marshal's processed_height for the new instance
10741105
// comes from its persistent metadata, which
10751106
// setup.height reflects. Pending deliveries that

storage/fuzz/fuzz_targets/freezer_operations.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,14 @@ fn fuzz(input: FuzzInput) {
166166
.await
167167
.unwrap();
168168
restarts += 1;
169+
// Refresh the crash-recovery model from the checkpoint we just
170+
// reopened from: it is now the latest persisted baseline, paired
171+
// with the current state, so a later crash can validly restore it.
172+
// Mode 0 reopens from the table (no checkpoint), which cannot serve
173+
// as a restore point, so it leaves no synced checkpoint. A stale
174+
// pre-restart checkpoint would otherwise be recovered past, dropping
175+
// re-put keys the rolled-back model still expects.
176+
synced = checkpoint.map(|cp| (cp, expected_state.clone(), cursors.clone()));
169177
}
170178
Op::Close => {
171179
freezer.close().await.unwrap();

0 commit comments

Comments
 (0)