Skip to content

Commit a5c38ec

Browse files
authored
feat: make validator startup more resilient (#7342)
1 parent 69a0106 commit a5c38ec

1 file changed

Lines changed: 56 additions & 18 deletions

File tree

rust/main/agents/validator/src/validator.rs

Lines changed: 56 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ use hyperlane_base::{
2222
SequencedDataContractSync,
2323
};
2424
use hyperlane_core::{
25-
Announcement, ChainResult, HyperlaneChain, HyperlaneContract, HyperlaneDomain, HyperlaneSigner,
26-
HyperlaneSignerExt, Mailbox, MerkleTreeHook, MerkleTreeInsertion, ReorgPeriod, TxOutcome,
27-
ValidatorAnnounce, H256, U256,
25+
rpc_clients::RPC_RETRY_SLEEP_DURATION, Announcement, ChainResult, HyperlaneChain,
26+
HyperlaneContract, HyperlaneDomain, HyperlaneSigner, HyperlaneSignerExt, Mailbox,
27+
MerkleTreeHook, MerkleTreeInsertion, ReorgPeriod, TxOutcome, ValidatorAnnounce, H256, U256,
2828
};
2929
use hyperlane_ethereum::{Signers, SingletonSigner, SingletonSignerHandle};
3030

@@ -37,6 +37,8 @@ use crate::{
3737
submit::{ValidatorSubmitter, ValidatorSubmitterMetrics},
3838
};
3939

40+
/// Number of attempts passed to `try_n_times_to_run_merkle_tree_hook_sync` when the
/// validator starts the merkle tree hook sync.
const CURSOR_INSTANTIATION_ATTEMPTS: usize = 10;
41+
4042
/// A validator agent
4143
#[derive(Debug, AsRef)]
4244
pub struct Validator {
@@ -291,30 +293,35 @@ impl BaseAgent for Validator {
291293
// messages or submitting checkpoints.
292294
loop {
293295
match self.merkle_tree_hook.count(&self.reorg_period).await {
296+
Err(err) => {
297+
error!(?err, "Error getting merkle tree hook count");
298+
sleep(self.interval).await;
299+
}
294300
Ok(0) => {
295301
info!("Waiting for first message in merkle tree hook");
296302
sleep(self.interval).await;
297303
}
298304
Ok(_) => {
299-
let merkle_tree_hook_sync = match self.run_merkle_tree_hook_sync().await {
300-
Ok(handle) => handle,
301-
Err(err) => {
302-
tracing::error!(?err, "Failed to run merkle tree hook sync");
303-
return;
304-
}
305-
};
306-
tasks.push(merkle_tree_hook_sync);
307-
for checkpoint_sync_task in self.run_checkpoint_submitters().await {
308-
tasks.push(checkpoint_sync_task);
309-
}
310305
break;
311306
}
312-
Err(err) => {
313-
error!(?err, "Error getting merkle tree hook count");
314-
sleep(self.interval).await;
315-
}
316307
}
317308
}
309+
310+
let merkle_tree_hook_sync = match self
311+
.try_n_times_to_run_merkle_tree_hook_sync(CURSOR_INSTANTIATION_ATTEMPTS)
312+
.await
313+
{
314+
Ok(s) => s,
315+
Err(err) => {
316+
error!(?err, "Failed to run merkle tree hook sync");
317+
return;
318+
}
319+
};
320+
tasks.push(merkle_tree_hook_sync);
321+
for checkpoint_sync_task in self.run_checkpoint_submitters().await {
322+
tasks.push(checkpoint_sync_task);
323+
}
324+
318325
tasks.push(self.runtime_metrics.spawn());
319326

320327
// Note that this only returns an error if one of the tasks panics
@@ -325,6 +332,37 @@ impl BaseAgent for Validator {
325332
}
326333

327334
impl Validator {
335+
/// Try to start the merkle tree hook contract sync, making up to `attempts` tries
/// before giving up.
///
/// Each try calls `run_merkle_tree_hook_sync`. A failed try is logged together with
/// the origin chain name and the zero-based attempt index, and is followed by a sleep
/// of `RPC_RETRY_SLEEP_DURATION` (including after the final failed try).
///
/// On the first successful try, `chain_metrics.set_critical_error` is called with
/// `false` for the origin chain and the spawned task handle is returned.
///
/// # Errors
///
/// Returns an error when no try succeeds (which includes `attempts == 0`). Before
/// returning, `chain_metrics.set_critical_error` is called with `true` for the
/// origin chain.
336+
async fn try_n_times_to_run_merkle_tree_hook_sync(
337+
&self,
338+
attempts: usize,
339+
) -> eyre::Result<JoinHandle<()>> {
340+
for i in 0..attempts {
341+
let task = match self.run_merkle_tree_hook_sync().await {
342+
Ok(s) => s,
343+
Err(err) => {
344+
error!(
345+
?err,
346+
domain = self.origin_chain.name(),
347+
attempt_count = i,
348+
"Failed to run merkle tree hook sync"
349+
);
350+
sleep(RPC_RETRY_SLEEP_DURATION).await;
351+
continue;
352+
}
353+
};
354+
self.chain_metrics
355+
.set_critical_error(self.origin_chain.name(), false);
356+
return Ok(task);
357+
}
358+
self.chain_metrics
359+
.set_critical_error(self.origin_chain.name(), true);
360+
Err(eyre::eyre!(
361+
"Failed to initialize merkle tree hook sync after {} attempts",
362+
attempts
363+
))
364+
}
365+
328366
async fn run_merkle_tree_hook_sync(&self) -> eyre::Result<JoinHandle<()>> {
329367
let index_settings = self
330368
.as_ref()

0 commit comments

Comments
 (0)