Skip to content

Commit 5dc7e50

Browse files
asaf-sw and claude authored
papyrus_base_layer: add time-based retry of the primary L1 endpoint (#14521)
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 50ef162 commit 5dc7e50

11 files changed

Lines changed: 386 additions & 16 deletions

File tree

crates/apollo_base_layer_tests/src/anvil_base_layer.rs

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -267,6 +267,15 @@ impl BaseLayerContract for AnvilBaseLayer {
267267
async fn cycle_provider_url(&mut self) -> Result<(), Self::Error> {
268268
unimplemented!("Anvil base layer is tied to a an Anvil server, url is fixed.")
269269
}
270+
271+
async fn reset_provider_url_to_primary(&mut self) -> Result<(), Self::Error> {
272+
unimplemented!("Anvil base layer is tied to a an Anvil server, url is fixed.")
273+
}
274+
275+
// Anvil is tied to a single fixed URL, so it is always on its only (primary) endpoint.
276+
async fn is_at_primary(&self) -> Result<bool, Self::Error> {
277+
Ok(true)
278+
}
270279
}
271280

272281
/// Converts a given [L1 handler transaction](starknet_api::transaction::L1HandlerTransaction)

crates/apollo_deployments/resources/app_configs/base_layer_config.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
"base_layer_config.bpo1_start_block_number": 9456501,
33
"base_layer_config.bpo2_start_block_number": 9504747,
44
"base_layer_config.fusaka_no_bpo_start_block_number": 9408577,
5+
"base_layer_config.retry_primary_interval_seconds": 60,
56
"base_layer_config.starknet_contract_address": "",
67
"base_layer_config.timeout_millis": 1000
78
}

crates/apollo_deployments/resources/app_configs/replacer_base_layer_config.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
"base_layer_config.bpo1_start_block_number": "$$$_BASE_LAYER_CONFIG-BPO1_START_BLOCK_NUMBER_$$$",
33
"base_layer_config.bpo2_start_block_number": "$$$_BASE_LAYER_CONFIG-BPO2_START_BLOCK_NUMBER_$$$",
44
"base_layer_config.fusaka_no_bpo_start_block_number": "$$$_BASE_LAYER_CONFIG-FUSAKA_NO_BPO_START_BLOCK_NUMBER_$$$",
5+
"base_layer_config.retry_primary_interval_seconds": 60,
56
"base_layer_config.starknet_contract_address": "$$$_BASE_LAYER_CONFIG-STARKNET_CONTRACT_ADDRESS_$$$",
67
"base_layer_config.timeout_millis": 1000
78
}

crates/apollo_node/resources/config_schema.json

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,11 @@
2424
"privacy": "Private",
2525
"value": "https://mainnet.infura.io/v3/YOUR_INFURA_API_KEY"
2626
},
27+
"base_layer_config.retry_primary_interval_seconds": {
28+
"description": "The interval (seconds) after which the next base-layer access retries the primary (first) endpoint.",
29+
"privacy": "Public",
30+
"value": 60
31+
},
2732
"base_layer_config.starknet_contract_address": {
2833
"description": "Starknet contract address in ethereum.",
2934
"privacy": "Public",

crates/apollo_node/src/components.rs

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -314,7 +314,10 @@ pub async fn create_node_components(
314314
.get_l1_gas_price_shared_client()
315315
.expect("L1 gas price client should be available");
316316
let base_layer = EthereumBaseLayerContract::new(base_layer_config.clone());
317-
let cyclic_base_layer_wrapper = CyclicBaseLayerWrapper::new(base_layer);
317+
let cyclic_base_layer_wrapper = CyclicBaseLayerWrapper::new(
318+
base_layer,
319+
base_layer_config.retry_primary_interval_seconds,
320+
);
318321

319322
Some(L1GasPriceScraper::new(
320323
l1_gas_price_scraper_config.clone(),
@@ -390,7 +393,10 @@ pub async fn create_node_components(
390393
.expect("L1 Events Scraper config should be set");
391394
let l1_events_provider_client = clients.get_l1_events_provider_shared_client().unwrap();
392395
let base_layer = EthereumBaseLayerContract::new(base_layer_config.clone());
393-
let cyclic_base_layer_wrapper = CyclicBaseLayerWrapper::new(base_layer);
396+
let cyclic_base_layer_wrapper = CyclicBaseLayerWrapper::new(
397+
base_layer,
398+
base_layer_config.retry_primary_interval_seconds,
399+
);
394400

395401
Some(
396402
L1EventsScraper::new(

crates/papyrus_base_layer/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -36,3 +36,4 @@ pretty_assertions.workspace = true
3636
rstest.workspace = true
3737
starknet-types-core.workspace = true
3838
starknet_api = { workspace = true, features = ["testing"] }
39+
tokio = { workspace = true, features = ["test-util"] }

crates/papyrus_base_layer/src/base_layer_test.rs

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -134,6 +134,35 @@ async fn test_cycle_wraps_to_primary_through_full_list() {
134134
assert_eq!(base_layer.get_url().await.unwrap().expose_secret(), primary_url);
135135
}
136136

137+
#[tokio::test]
138+
async fn test_reset_provider_url_to_primary_repoints_live_provider() {
139+
let primary_url = Url::parse("http://primary-endpoint.test/").unwrap();
140+
let secondary_url = Url::parse("http://secondary-endpoint.test/").unwrap();
141+
let tertiary_url = Url::parse("http://tertiary-endpoint.test/").unwrap();
142+
let config = EthereumBaseLayerConfig {
143+
ordered_l1_endpoint_urls: vec![
144+
primary_url.clone().into(),
145+
secondary_url.clone().into(),
146+
tertiary_url.clone().into(),
147+
],
148+
..Default::default()
149+
};
150+
let mut base_layer = EthereumBaseLayerContract::new(config);
151+
152+
// Cycle twice to land on the tertiary endpoint.
153+
base_layer.cycle_provider_url().await.unwrap();
154+
base_layer.cycle_provider_url().await.unwrap();
155+
assert_eq!(base_layer.get_url().await.unwrap().expose_secret(), tertiary_url);
156+
157+
// Reset to primary.
158+
base_layer.reset_provider_url_to_primary().await.unwrap();
159+
assert_eq!(base_layer.get_url().await.unwrap().expose_secret(), primary_url);
160+
161+
// Calling reset again when already on primary is a no-op.
162+
base_layer.reset_provider_url_to_primary().await.unwrap();
163+
assert_eq!(base_layer.get_url().await.unwrap().expose_secret(), primary_url);
164+
}
165+
137166
#[test]
138167
fn create_l1_event_data_rejects_out_of_range_inputs() {
139168
let oversized = felt_max_u256() + U256::from(1_u8);

crates/papyrus_base_layer/src/cyclic_base_layer_wrapper.rs

Lines changed: 43 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
use std::ops::RangeInclusive;
2+
use std::time::Duration;
23

34
use apollo_config::secrets::Sensitive;
45
use async_trait::async_trait;
@@ -15,11 +16,26 @@ pub mod cyclic_base_layer_wrapper_test;
1516
#[derive(Debug)]
1617
pub struct CyclicBaseLayerWrapper<B: BaseLayerContract + Send + Sync> {
1718
base_layer: B,
19+
retry_primary_interval: Duration,
20+
last_primary_retry: tokio::time::Instant,
1821
}
1922

2023
impl<B: BaseLayerContract + Send + Sync> CyclicBaseLayerWrapper<B> {
21-
pub fn new(base_layer: B) -> Self {
22-
Self { base_layer }
24+
pub fn new(base_layer: B, retry_primary_interval: Duration) -> Self {
25+
Self { base_layer, retry_primary_interval, last_primary_retry: tokio::time::Instant::now() }
26+
}
27+
28+
// Retries the primary endpoint once the interval has elapsed since we left it. Does nothing
29+
// while already on the primary, so the timer is untouched until a failover moves us off it.
30+
async fn retry_primary_if_due(&mut self) -> Result<(), B::Error> {
31+
if self.base_layer.is_at_primary().await? {
32+
return Ok(());
33+
}
34+
if self.last_primary_retry.elapsed() >= self.retry_primary_interval {
35+
self.last_primary_retry = tokio::time::Instant::now();
36+
self.base_layer.reset_provider_url_to_primary().await?;
37+
}
38+
Ok(())
2339
}
2440

2541
// Check the result of a function call to the base layer. If it fails, cycle the URL and signal
@@ -38,11 +54,21 @@ impl<B: BaseLayerContract + Send + Sync> CyclicBaseLayerWrapper<B> {
3854
let Ok(current_url) = current_url_result else {
3955
return Some(Err(current_url_result.expect_err("result is checked at let-else")));
4056
};
57+
// Record whether we are about to leave the primary, before cycling away from it.
58+
let is_at_primary_result = self.base_layer.is_at_primary().await;
59+
let Ok(was_at_primary) = is_at_primary_result else {
60+
return Some(Err(is_at_primary_result.expect_err("result is checked at let-else")));
61+
};
4162
// Otherwise, cycle the URL so we can try again. Return error in case it fails to cycle.
4263
let cycle_url_result = self.base_layer.cycle_provider_url().await;
4364
let Ok(()) = cycle_url_result else {
4465
return Some(Err(cycle_url_result.expect_err("result is checked at let-else")));
4566
};
67+
// Restart the retry-primary clock only when this failover leaves the primary, so the wait
68+
// is measured from when we left it; cycling between backups must not push the retry out.
69+
if was_at_primary {
70+
self.last_primary_retry = tokio::time::Instant::now();
71+
}
4672
// Get the new URL (return error in case it fails to get it).
4773
let new_url_result = self.base_layer.get_url().await;
4874
let Ok(new_url) = new_url_result else {
@@ -69,6 +95,7 @@ impl<B: BaseLayerContract + Send + Sync> BaseLayerContract for CyclicBaseLayerWr
6995
&mut self,
7096
l1_block: L1BlockNumber,
7197
) -> Result<BlockHashAndNumber, Self::Error> {
98+
self.retry_primary_if_due().await?;
7299
let start_url = self.base_layer.get_url().await?;
73100
loop {
74101
let result = self.base_layer.get_proved_block_at(l1_block).await;
@@ -79,6 +106,7 @@ impl<B: BaseLayerContract + Send + Sync> BaseLayerContract for CyclicBaseLayerWr
79106
}
80107

81108
async fn latest_l1_block_number(&mut self) -> Result<L1BlockNumber, Self::Error> {
109+
self.retry_primary_if_due().await?;
82110
let start_url = self.base_layer.get_url().await?;
83111
loop {
84112
let result = self.base_layer.latest_l1_block_number().await;
@@ -92,6 +120,7 @@ impl<B: BaseLayerContract + Send + Sync> BaseLayerContract for CyclicBaseLayerWr
92120
&mut self,
93121
block_number: L1BlockNumber,
94122
) -> Result<Option<L1BlockReference>, Self::Error> {
123+
self.retry_primary_if_due().await?;
95124
let start_url = self.base_layer.get_url().await?;
96125
loop {
97126
let result = self.base_layer.l1_block_at(block_number).await;
@@ -106,6 +135,7 @@ impl<B: BaseLayerContract + Send + Sync> BaseLayerContract for CyclicBaseLayerWr
106135
block_range: RangeInclusive<L1BlockNumber>,
107136
event_identifiers: &'a [&'a str],
108137
) -> Result<Vec<L1Event>, Self::Error> {
138+
self.retry_primary_if_due().await?;
109139
let start_url = self.base_layer.get_url().await?;
110140
loop {
111141
let result = self.base_layer.events(block_range.clone(), event_identifiers).await;
@@ -119,6 +149,7 @@ impl<B: BaseLayerContract + Send + Sync> BaseLayerContract for CyclicBaseLayerWr
119149
&mut self,
120150
block_number: L1BlockNumber,
121151
) -> Result<Option<L1BlockHeader>, Self::Error> {
152+
self.retry_primary_if_due().await?;
122153
let start_url = self.base_layer.get_url().await?;
123154
loop {
124155
let result = self.base_layer.get_block_header(block_number).await;
@@ -128,6 +159,8 @@ impl<B: BaseLayerContract + Send + Sync> BaseLayerContract for CyclicBaseLayerWr
128159
}
129160
}
130161

162+
// Takes &self so it cannot cycle or retry endpoints; callers needing resilience use the &mut
163+
// self methods.
131164
async fn get_block_header_immutable(
132165
&self,
133166
block_number: L1BlockNumber,
@@ -146,4 +179,12 @@ impl<B: BaseLayerContract + Send + Sync> BaseLayerContract for CyclicBaseLayerWr
146179
async fn cycle_provider_url(&mut self) -> Result<(), Self::Error> {
147180
self.base_layer.cycle_provider_url().await
148181
}
182+
183+
async fn reset_provider_url_to_primary(&mut self) -> Result<(), Self::Error> {
184+
self.base_layer.reset_provider_url_to_primary().await
185+
}
186+
187+
async fn is_at_primary(&self) -> Result<bool, Self::Error> {
188+
self.base_layer.is_at_primary().await
189+
}
149190
}

0 commit comments

Comments
 (0)