Skip to content

Commit 5737a4a

Browse files
asaf-swclaude
andcommitted
apollo_l1_events: bound catch-up commit-block backlog with a cap and metric
The catchupper's `commit_block_backlog` was an unbounded `Vec` populated by every commit-block arriving above the provider's height during startup catch-up, and drained only once L2 sync reached the target. A persistently slow or stalled sync while the batcher keeps committing could grow it without limit (security finding L-16). Add a configurable `max_commit_block_backlog_len` (default 1,000,000) and a `l1_message_provider_commit_block_backlog_len` gauge. On overflow, `add_commit_block_to_backlog` returns a new `CatchUpBacklogOverflow` error rather than dropping entries: the backlog must stay a gapless, strictly-sequential run, so drop-oldest would corrupt the drain-time invariant and silently skip an L1-handler commit. The error surfaces to the batcher (which already handles commit_block errors) and to logs/alerts. The gauge is updated on each push and reset to 0 when the backlog drains at catch-up completion. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 570ca41 commit 5737a4a

10 files changed

Lines changed: 194 additions & 6 deletions

File tree

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/apollo_l1_events/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,13 @@ alloy.workspace = true
4242
apollo_base_layer_tests.workspace = true
4343
apollo_infra_utils = { workspace = true, features = ["testing"] }
4444
apollo_l1_events_types = { workspace = true, features = ["testing"] }
45+
apollo_metrics = { workspace = true, features = ["testing"] }
4546
apollo_state_sync_types = { workspace = true, features = ["testing"] }
4647
apollo_time = { workspace = true, features = ["testing"] }
4748
assert_matches.workspace = true
4849
itertools.workspace = true
50+
metrics.workspace = true
51+
metrics-exporter-prometheus.workspace = true
4952
mockall.workspace = true
5053
papyrus_base_layer = { workspace = true, features = ["testing"] }
5154
pretty_assertions.workspace = true

crates/apollo_l1_events/src/catchupper.rs

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,16 @@ use std::sync::atomic::{AtomicU8, Ordering};
22
use std::sync::Arc;
33
use std::time::Duration;
44

5-
use apollo_l1_events_types::SharedL1EventsProviderClient;
5+
use apollo_l1_events_types::errors::L1EventsProviderError;
6+
use apollo_l1_events_types::{L1EventsProviderResult, SharedL1EventsProviderClient};
67
use apollo_state_sync_types::communication::SharedStateSyncClient;
78
use indexmap::IndexSet;
89
use starknet_api::block::BlockNumber;
910
use starknet_api::transaction::TransactionHash;
1011
use tracing::{debug, warn};
1112

13+
use crate::metrics::L1_MESSAGE_PROVIDER_COMMIT_BLOCK_BACKLOG_LEN;
14+
1215
// When the Provider gets a commit_block that is too high, it starts catching up.
1316
// The commit is rejected by the provider, so it must use sync to catch up to the height of the
1417
// commit, including that height. The sync task continues until reaching the target height,
@@ -28,6 +31,9 @@ pub struct Catchupper {
2831
// Keep track of sync task for health checks and logging status.
2932
pub sync_task_handle: SyncTaskHandle,
3033
pub n_sync_health_check_failures: Arc<AtomicU8>,
34+
/// Defensive cap on `commit_block_backlog` length; see L-16. Exceeding it is a hard error, not
35+
/// a drop, to preserve the gapless-sequential invariant of the backlog.
36+
pub max_commit_block_backlog_len: usize,
3137
}
3238

3339
impl Catchupper {
@@ -39,6 +45,7 @@ impl Catchupper {
3945
l1_events_provider_client: SharedL1EventsProviderClient,
4046
sync_client: SharedStateSyncClient,
4147
sync_retry_interval: Duration,
48+
max_commit_block_backlog_len: usize,
4249
) -> Self {
4350
Self {
4451
sync_retry_interval,
@@ -47,6 +54,7 @@ impl Catchupper {
4754
sync_client,
4855
sync_task_handle: SyncTaskHandle::NotStartedYet,
4956
n_sync_health_check_failures: Default::default(),
57+
max_commit_block_backlog_len,
5058
// This is overriden when starting the sync task (e.g., when provider starts
5159
// catching up).
5260
target_height: BlockNumber(0),
@@ -66,17 +74,35 @@ impl Catchupper {
6674
&mut self,
6775
committed_txs: IndexSet<TransactionHash>,
6876
height: BlockNumber,
69-
) {
77+
) -> L1EventsProviderResult<()> {
7078
assert!(
7179
self.commit_block_backlog
7280
.last()
7381
.is_none_or(|commit_block| commit_block.height.unchecked_next() == height),
7482
"Heights should be sequential."
7583
);
7684

85+
// Tripwire against unbounded growth on a stalled/lagging L2 sync (L-16). We must NOT drop
86+
// entries: the backlog is a gapless, strictly-sequential run and a hole would corrupt the
87+
// drain-time sequentiality assert and silently skip an L1-handler commit. Fail loudly
88+
// instead, so the caller and alerts learn catch-up is pathologically behind.
89+
if self.commit_block_backlog.len() >= self.max_commit_block_backlog_len {
90+
warn!(
91+
"Catch-up commit-block backlog reached its cap of {} entries at height {height}; \
92+
rejecting commit-block. L2 sync is likely stalled or lagging.",
93+
self.max_commit_block_backlog_len
94+
);
95+
return Err(L1EventsProviderError::CatchUpBacklogOverflow {
96+
height,
97+
max: self.max_commit_block_backlog_len,
98+
});
99+
}
100+
77101
debug!("Adding future commit-block to backlog at height: {height}");
78102
self.commit_block_backlog
79103
.push(CommitBlockBacklog { height, committed_txs: committed_txs.clone() });
104+
L1_MESSAGE_PROVIDER_COMMIT_BLOCK_BACKLOG_LEN.set_lossy(self.commit_block_backlog.len());
105+
Ok(())
80106
}
81107

82108
/// Spawns async task that produces and sends commit block messages to the provider, according

crates/apollo_l1_events/src/l1_events_provider.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ use starknet_api::transaction::TransactionHash;
2222
use tracing::{debug, error, info, instrument, trace, warn};
2323

2424
use crate::catchupper::Catchupper;
25-
use crate::metrics::register_provider_metrics;
25+
use crate::metrics::{register_provider_metrics, L1_MESSAGE_PROVIDER_COMMIT_BLOCK_BACKLOG_LEN};
2626
use crate::transaction_manager::TransactionManager;
2727
use crate::L1EventsProviderConfig;
2828

@@ -71,6 +71,7 @@ impl L1EventsProvider {
7171
l1_events_provider_client,
7272
state_sync_client,
7373
config.startup_sync_sleep_retry_interval_seconds,
74+
config.max_commit_block_backlog_len,
7475
);
7576
Self {
7677
config,
@@ -87,6 +88,7 @@ impl L1EventsProvider {
8788
self.catchupper.l1_events_provider_client.clone(),
8889
self.catchupper.sync_client.clone(),
8990
self.config.startup_sync_sleep_retry_interval_seconds,
91+
self.config.max_commit_block_backlog_len,
9092
);
9193
}
9294
// Functions Called by the scraper.
@@ -422,7 +424,9 @@ impl L1EventsProvider {
422424
Equal => self.apply_commit_block(committed_txs, Default::default()),
423425
// We're still syncing, backlog it, it'll get applied later.
424426
Greater => {
425-
self.catchupper.add_commit_block_to_backlog(committed_txs, new_height);
427+
// Propagate a backlog-overflow error to the caller (the batcher) rather than
428+
// growing memory unbounded; see L-16.
429+
self.catchupper.add_commit_block_to_backlog(committed_txs, new_height)?;
426430
// No need to check the backlog or catchup completion, since those are only
427431
// applicable if we just increased the provider's height, like in the `Equal` case.
428432
return Ok(());
@@ -438,6 +442,8 @@ impl L1EventsProvider {
438442
self.current_height
439443
);
440444
let backlog = std::mem::take(&mut self.catchupper.commit_block_backlog);
445+
// The backlog is fully consumed below; reset its observability gauge to 0.
446+
L1_MESSAGE_PROVIDER_COMMIT_BLOCK_BACKLOG_LEN.set_lossy(0_usize);
441447
assert!(
442448
backlog.is_empty()
443449
|| self.current_height == backlog.first().unwrap().height

crates/apollo_l1_events/src/l1_events_provider_tests.rs

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ use apollo_time::time::Clock;
2424
use assert_matches::assert_matches;
2525
use indexmap::IndexSet;
2626
use itertools::Itertools;
27+
use metrics_exporter_prometheus::PrometheusBuilder;
2728
use pretty_assertions::assert_eq;
2829
use rstest::rstest;
2930
use starknet_api::block::{BlockNumber, BlockTimestamp};
@@ -33,6 +34,7 @@ use starknet_api::tx_hash;
3334

3435
use crate::catchupper::{Catchupper, CommitBlockBacklog, SyncTaskHandle};
3536
use crate::l1_events_provider::L1EventsProvider;
37+
use crate::metrics::{register_provider_metrics, L1_MESSAGE_PROVIDER_COMMIT_BLOCK_BACKLOG_LEN};
3638
use crate::test_utils::{
3739
l1_handler,
3840
make_catchupper,
@@ -415,6 +417,118 @@ async fn commit_block_backlog() {
415417
expected_l1_events_provider.assert_eq(&l1_events_provider);
416418
}
417419

420+
/// Drives `commit_block` with strictly-increasing (`Greater`) heights while catching up, so each
421+
/// one is appended to the backlog. Returns the provider so callers can inspect/continue.
422+
fn provider_catching_up_with_backlog_cap(max_commit_block_backlog_len: usize) -> L1EventsProvider {
423+
const STARTUP_HEIGHT: BlockNumber = BlockNumber(0);
424+
let catchupper = make_catchupper!(backlog: [], max: max_commit_block_backlog_len);
425+
L1EventsProviderContentBuilder::new()
426+
.with_catchupper(catchupper)
427+
.with_height(STARTUP_HEIGHT)
428+
.with_state(ProviderState::CatchingUp)
429+
.build_into_l1_provider()
430+
}
431+
432+
#[test]
433+
fn backlog_below_cap_accepts_all_commits() {
434+
const CAP: usize = 3;
435+
let mut l1_events_provider = provider_catching_up_with_backlog_cap(CAP);
436+
437+
// Two commits, both above current height (0) -> backlogged, staying below the cap of 3.
438+
for height in [BlockNumber(1), BlockNumber(2)] {
439+
commit_block_no_rejected(&mut l1_events_provider, &[], height);
440+
}
441+
442+
assert_eq!(l1_events_provider.catchupper.commit_block_backlog.len(), 2);
443+
}
444+
445+
#[test]
446+
fn backlog_fills_exactly_to_cap_without_error() {
447+
const CAP: usize = 3;
448+
let mut l1_events_provider = provider_catching_up_with_backlog_cap(CAP);
449+
450+
// Exactly CAP commits fit (heights 1..=3, all above current height 0).
451+
for height in [BlockNumber(1), BlockNumber(2), BlockNumber(3)] {
452+
commit_block_no_rejected(&mut l1_events_provider, &[], height);
453+
}
454+
455+
let backlog = &l1_events_provider.catchupper.commit_block_backlog;
456+
assert_eq!(backlog.len(), CAP);
457+
// The backlog must remain a gapless, strictly-sequential run.
458+
assert_eq!(
459+
backlog.iter().map(|commit_block| commit_block.height).collect::<Vec<_>>(),
460+
vec![BlockNumber(1), BlockNumber(2), BlockNumber(3)]
461+
);
462+
}
463+
464+
#[test]
465+
fn backlog_over_cap_returns_overflow_error_without_dropping() {
466+
const CAP: usize = 3;
467+
let mut l1_events_provider = provider_catching_up_with_backlog_cap(CAP);
468+
469+
// Fill the backlog exactly to the cap.
470+
for height in [BlockNumber(1), BlockNumber(2), BlockNumber(3)] {
471+
commit_block_no_rejected(&mut l1_events_provider, &[], height);
472+
}
473+
474+
// The next commit overflows the cap and must be rejected with a descriptive error.
475+
let overflow_height = BlockNumber(4);
476+
let result = l1_events_provider.commit_block([].into(), [].into(), overflow_height);
477+
assert_matches!(
478+
result,
479+
Err(L1EventsProviderError::CatchUpBacklogOverflow { height, max })
480+
if height == overflow_height && max == CAP
481+
);
482+
483+
// Nothing was silently dropped: the backlog is still exactly CAP entries and gapless.
484+
let backlog = &l1_events_provider.catchupper.commit_block_backlog;
485+
assert_eq!(backlog.len(), CAP);
486+
assert_eq!(
487+
backlog.iter().map(|commit_block| commit_block.height).collect::<Vec<_>>(),
488+
vec![BlockNumber(1), BlockNumber(2), BlockNumber(3)]
489+
);
490+
}
491+
492+
#[tokio::test]
493+
async fn backlog_len_metric_tracks_push_and_drain() {
494+
let recorder = PrometheusBuilder::new().build_recorder();
495+
let _recorder_guard = metrics::set_default_local_recorder(&recorder);
496+
497+
// Start catching up with current height 0 and a target one above it, so a single `Equal`
498+
// commit (height 1) completes catch-up and drains the backlog.
499+
const TARGET_HEIGHT: BlockNumber = BlockNumber(0);
500+
let mut catchupper = make_catchupper!(backlog: []);
501+
catchupper.target_height = TARGET_HEIGHT;
502+
let mut l1_events_provider = L1EventsProviderContentBuilder::new()
503+
.with_catchupper(catchupper)
504+
.with_height(BlockNumber(0))
505+
.with_state(ProviderState::CatchingUp)
506+
.build_into_l1_provider();
507+
// Registers the gauge so it is rendered even before its first `set`.
508+
register_provider_metrics();
509+
510+
// Two `Greater` commits (heights 1, 2) are backlogged; the gauge tracks the backlog length.
511+
// They start sequentially one above the current height (0), as the drain-time invariant
512+
// requires.
513+
commit_block_no_rejected(&mut l1_events_provider, &[], BlockNumber(1));
514+
commit_block_no_rejected(&mut l1_events_provider, &[], BlockNumber(2));
515+
let metrics = recorder.handle().render();
516+
assert_eq!(
517+
L1_MESSAGE_PROVIDER_COMMIT_BLOCK_BACKLOG_LEN.parse_numeric_metric::<usize>(&metrics),
518+
Some(2)
519+
);
520+
521+
// The `Equal` commit at height 0 completes catch-up (current height becomes 1 > target 0),
522+
// drains the backlog [1, 2], and resets the gauge to 0.
523+
commit_block_no_rejected(&mut l1_events_provider, &[], BlockNumber(0));
524+
assert_eq!(l1_events_provider.state, ProviderState::Pending);
525+
let metrics = recorder.handle().render();
526+
assert_eq!(
527+
L1_MESSAGE_PROVIDER_COMMIT_BLOCK_BACKLOG_LEN.parse_numeric_metric::<usize>(&metrics),
528+
Some(0)
529+
);
530+
}
531+
418532
#[test]
419533
fn commit_block_before_add_tx_stores_tx_in_committed() {
420534
// Setup

crates/apollo_l1_events/src/metrics.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ define_metrics!(
1919
MetricCounter { L1_MESSAGE_SCRAPER_REORG_DETECTED, "l1_message_scraper_reorg_detected", "Number of times the L1 message scraper detected a reorganization in the base layer", init=0},
2020
MetricGauge { L1_MESSAGE_SCRAPER_LAST_SUCCESS_TIMESTAMP_SECONDS, "l1_message_scraper_last_success_timestamp_seconds", "Unix timestamp (seconds) of the last successful L1 message scrape" },
2121
MetricGauge { L1_MESSAGE_PROVIDER_NUM_PENDING_TXS, "l1_message_provider_num_pending_txs", "The number of pending L1 handler transactions in the transaction manager" },
22+
MetricGauge { L1_MESSAGE_PROVIDER_COMMIT_BLOCK_BACKLOG_LEN, "l1_message_provider_commit_block_backlog_len", "The number of commit-blocks buffered in the catch-up backlog while the provider syncs to the target height; abnormal sustained growth indicates a stalled or lagging L2 sync" },
2223
},
2324
);
2425

@@ -33,4 +34,5 @@ pub(crate) fn register_scraper_metrics() {
3334

3435
pub(crate) fn register_provider_metrics() {
3536
L1_MESSAGE_PROVIDER_NUM_PENDING_TXS.register();
37+
L1_MESSAGE_PROVIDER_COMMIT_BLOCK_BACKLOG_LEN.register();
3638
}

crates/apollo_l1_events/src/test_utils.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,10 @@ use crate::transaction_record::{TransactionPayload, TransactionRecord};
3636
use crate::L1EventsProviderConfig;
3737

3838
macro_rules! make_catchupper {
39-
(backlog: [$($height:literal => [$($tx:literal),* $(,)*]),* $(,)*]) => {{
39+
(backlog: [$($height:literal => [$($tx:literal),* $(,)*]),* $(,)*]) => {
40+
make_catchupper!(backlog: [$($height => [$($tx),*]),*], max: usize::MAX)
41+
};
42+
(backlog: [$($height:literal => [$($tx:literal),* $(,)*]),* $(,)*], max: $max:expr) => {{
4043
Catchupper {
4144
commit_block_backlog: vec![
4245
$(CommitBlockBacklog {
@@ -49,7 +52,8 @@ macro_rules! make_catchupper {
4952
sync_client: Arc::new(MockStateSyncClient::default()),
5053
sync_task_handle: SyncTaskHandle::default(),
5154
n_sync_health_check_failures: Default::default(),
52-
sync_retry_interval: Duration::from_millis(10)
55+
sync_retry_interval: Duration::from_millis(10),
56+
max_commit_block_backlog_len: $max
5357
}
5458
}};
5559
}

crates/apollo_l1_events_config/src/config.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@ pub struct L1EventsProviderConfig {
2121
pub l1_handler_proposal_cooldown_seconds: Duration,
2222
/// When true, the L1 provider operates in dummy mode.
2323
pub dummy_mode: bool,
24+
/// Maximum number of commit-blocks that may be buffered in the catch-up backlog while the
25+
/// provider syncs to the target height. This is a defensive tripwire against unbounded memory
26+
/// growth when L2 sync stalls or lags persistently during startup catch-up (see security
27+
/// finding L-16). Hitting this bound is treated as a hard error rather than silently dropping
28+
/// entries, because the backlog must remain a gapless, strictly-sequential run of heights.
29+
pub max_commit_block_backlog_len: usize,
2430
}
2531

2632
impl Default for L1EventsProviderConfig {
@@ -31,6 +37,10 @@ impl Default for L1EventsProviderConfig {
3137
l1_handler_consumption_timelock_seconds: Duration::from_secs(5 * 60),
3238
l1_handler_proposal_cooldown_seconds: Duration::from_secs(70),
3339
dummy_mode: false,
40+
// ~1M empty-block entries is only ~tens of MB (per-entry overhead is dominated by the
41+
// few 32-byte L1-handler tx hashes per block), which comfortably covers any legitimate
42+
// startup sync gap while still bounding worst-case memory. See L-16.
43+
max_commit_block_backlog_len: 1_000_000,
3444
}
3545
}
3646
}
@@ -72,6 +82,14 @@ impl SerializeConfig for L1EventsProviderConfig {
7282
trivial truthy responses without connecting to actual L1.",
7383
ParamPrivacyInput::Public,
7484
),
85+
ser_param(
86+
"max_commit_block_backlog_len",
87+
&self.max_commit_block_backlog_len,
88+
"Maximum number of commit-blocks buffered in the catch-up backlog during startup \
89+
sync before commit_block fails; guards against unbounded memory growth on a \
90+
stalled or lagging L2 sync.",
91+
ParamPrivacyInput::Public,
92+
),
7593
])
7694
}
7795
}

crates/apollo_l1_events_types/src/errors.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,14 @@ pub enum L1EventsProviderError {
2020
UnexpectedProviderState { expected: ProviderState, found: ProviderState },
2121
#[error("Cannot transition from {from} to {to}")]
2222
UnexpectedProviderStateTransition { from: String, to: String },
23+
// The catch-up backlog is a gapless, strictly-sequential run of commit-blocks, so entries can
24+
// never be dropped to make room. Hitting the cap means L2 sync is pathologically stalled or
25+
// lagging; surface this to the caller (and logs/alerts) rather than growing memory unbounded.
26+
#[error(
27+
"Catch-up commit-block backlog overflow at height {height}: reached cap of {max} entries. \
28+
L2 sync is likely stalled or lagging."
29+
)]
30+
CatchUpBacklogOverflow { height: BlockNumber, max: usize },
2331
}
2432

2533
impl L1EventsProviderError {

crates/apollo_node/resources/config_schema.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3269,6 +3269,11 @@
32693269
"privacy": "Public",
32703270
"value": 70
32713271
},
3272+
"l1_events_provider_config.max_commit_block_backlog_len": {
3273+
"description": "Maximum number of commit-blocks buffered in the catch-up backlog during startup sync before commit_block fails; guards against unbounded memory growth on a stalled or lagging L2 sync.",
3274+
"privacy": "Public",
3275+
"value": 1000000
3276+
},
32723277
"l1_events_provider_config.startup_sync_sleep_retry_interval_seconds": {
32733278
"description": "Interval in seconds between each retry of syncing with L2 during startup.",
32743279
"privacy": "Public",

0 commit comments

Comments
 (0)