From 717e3fefcb5f858e49e5ef2fde6327c083e716f0 Mon Sep 17 00:00:00 2001 From: gzalz Date: Thu, 6 Nov 2025 09:39:23 -0700 Subject: [PATCH 1/5] wait for epoch info rpc to succeed in process loop --- tip-router-operator-cli/src/process_epoch.rs | 26 ++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tip-router-operator-cli/src/process_epoch.rs b/tip-router-operator-cli/src/process_epoch.rs index 0c67503f..dea159b8 100644 --- a/tip-router-operator-cli/src/process_epoch.rs +++ b/tip-router-operator-cli/src/process_epoch.rs @@ -112,8 +112,30 @@ pub async fn loop_stages( save_stages: bool, ) -> Result<()> { let keypair = read_keypair_file(&cli.keypair_path).expect("Failed to read keypair file"); - let mut current_epoch_info = rpc_client.get_epoch_info().await?; - let epoch_schedule = rpc_client.get_epoch_schedule().await?; + // This should attempt until it succeeds + let mut current_epoch_info = { + loop { + match rpc_client.get_epoch_info().await { + Ok(info) => break info, + Err(e) => { + error!("Error getting epoch info. Retrying in 5 seconds..."); + tokio::time::sleep(Duration::from_secs(5)).await; + } + } + } + }; + // This should attempt until it succeeds + let epoch_schedule = { + loop { + match rpc_client.get_epoch_schedule().await { + Ok(schedule) => break schedule, + Err(e) => { + error!("Error getting epoch schedule. Retrying in 5 seconds..."); + tokio::time::sleep(Duration::from_secs(5)).await; + } + } + } + }; // Track runs that are starting right at the beginning of a new epoch let operator_address = cli.operator_address.clone(); From 43763349486a12800b0f4c81d4653dd19c2f3775 Mon Sep 17 00:00:00 2001 From: gzalz Date: Thu, 6 Nov 2025 09:51:35 -0700 Subject: [PATCH 2/5] additional map_err for visibility --- tip-router-operator-cli/src/submit.rs | 36 +++++++++++++++++++++------ 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/tip-router-operator-cli/src/submit.rs b/tip-router-operator-cli/src/submit.rs index 857b6255..e77a238e 100644 --- a/tip-router-operator-cli/src/submit.rs +++ b/tip-router-operator-cli/src/submit.rs @@ -91,10 +91,27 @@ pub async fn submit_to_ncn( compute_unit_price: u64, cluster: &str, ) -> Result<(), anyhow::Error> { - let epoch_info = client.get_epoch_info().await?; - let meta_merkle_tree = MetaMerkleTree::new_from_file(meta_merkle_tree_path)?; + let epoch_info = client + .get_epoch_info() + .await + .map_err(|e| anyhow::anyhow!("Failed to fetch epoch info from RPC client: {:?}", e))?; + let meta_merkle_tree = MetaMerkleTree::new_from_file(meta_merkle_tree_path).map_err(|e| { + anyhow::anyhow!( + "Failed to load Meta Merkle Tree from file {:?}: {:?}", + meta_merkle_tree_path, + e + ) + })?; let config_pda = Config::find_program_address(tip_router_program_id, ncn_address).0; - let config = get_ncn_config(client, tip_router_program_id, ncn_address).await?; + let config = get_ncn_config(client, tip_router_program_id, ncn_address) + .await + .map_err(|e| { + anyhow::anyhow!( + "Failed to fetch Tip Router config for NCN {}: {:?}", + ncn_address, + e + ) + })?; // The meta merkle root files are tagged with the epoch they have created the snapshot for // Tip router accounts for that merkle root are created in the next epoch @@ -119,12 +136,15 @@ pub async fn submit_to_ncn( } }; - let ballot_box = BallotBox::try_from_slice_unchecked(&ballot_box_account.data)?; + let ballot_box = BallotBox::try_from_slice_unchecked(&ballot_box_account.data) + .map_err(|e| anyhow::anyhow!("Failed to deserialize ballot box: {:?}", e))?; - let is_voting_valid = ballot_box.is_voting_valid( - epoch_info.absolute_slot, - config.valid_slots_after_consensus(), - )?; + let is_voting_valid = ballot_box + .is_voting_valid( + epoch_info.absolute_slot, + config.valid_slots_after_consensus(), + ) + .map_err(|e| anyhow::anyhow!("Failed to determine if voting is valid: {:?}", e))?; // If exists, look for vote from current operator let vote = ballot_box From e0b601d35e4293553957aa19bcfd6d8d8a5d0544 Mon Sep 17 00:00:00 2001 From: gzalz Date: Mon, 10 Nov 2025 10:58:40 -0700 Subject: [PATCH 3/5] operator: improved error handling / logging --- tip-router-operator-cli/src/process_epoch.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tip-router-operator-cli/src/process_epoch.rs b/tip-router-operator-cli/src/process_epoch.rs index dea159b8..f6bd9f2a 100644 --- a/tip-router-operator-cli/src/process_epoch.rs +++ b/tip-router-operator-cli/src/process_epoch.rs @@ -112,25 +112,25 @@ pub async fn loop_stages( save_stages: bool, ) -> Result<()> { let keypair = read_keypair_file(&cli.keypair_path).expect("Failed to read keypair file"); - // This should attempt until it succeeds + let mut current_epoch_info = { loop { match rpc_client.get_epoch_info().await { Ok(info) => break info, Err(e) => { - error!("Error getting epoch info. Retrying in 5 seconds..."); + error!("Error getting epoch info from RPC. Retrying..."); tokio::time::sleep(Duration::from_secs(5)).await; } } } }; - // This should attempt until it succeeds + let epoch_schedule = { loop { match rpc_client.get_epoch_schedule().await { Ok(schedule) => break schedule, Err(e) => { - error!("Error getting epoch schedule. Retrying in 5 seconds..."); + error!("Error getting epoch schedule from RPC. Retrying..."); tokio::time::sleep(Duration::from_secs(5)).await; } } @@ -311,7 +311,7 @@ pub async fn loop_stages( meta_merkle_tree_path(epoch_to_process, &cli.get_save_path()); let operator_address = Pubkey::from_str(&cli.operator_address)?; - submit_to_ncn( + let submit_result = submit_to_ncn( &rpc_client, &keypair, &operator_address, @@ -327,7 +327,10 @@ pub async fn loop_stages( cli.vote_microlamports, &cli.cluster, ) - .await?; + .await; + if let Err(e) = submit_result { + error!("Failed to submit epoch {} to NCN: {:?}", epoch_to_process, e); + } stage = OperatorState::WaitForNextEpoch; } OperatorState::WaitForNextEpoch => { From 81454dfd1e9aa45f45f32e22ffdeac02cc394bf2 Mon Sep 17 00:00:00 2001 From: gzalz Date: Mon, 10 Nov 2025 11:07:37 -0700 Subject: [PATCH 4/5] fmt --- tip-router-operator-cli/src/process_epoch.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tip-router-operator-cli/src/process_epoch.rs b/tip-router-operator-cli/src/process_epoch.rs index f6bd9f2a..164f9b20 100644 --- a/tip-router-operator-cli/src/process_epoch.rs +++ b/tip-router-operator-cli/src/process_epoch.rs @@ -112,7 +112,7 @@ pub async fn loop_stages( save_stages: bool, ) -> Result<()> { let keypair = read_keypair_file(&cli.keypair_path).expect("Failed to read keypair file"); - + let mut current_epoch_info = { loop { match rpc_client.get_epoch_info().await { @@ -329,7 +329,10 @@ pub async fn loop_stages( ) .await; if let Err(e) = submit_result { - error!("Failed to submit epoch {} to NCN: {:?}", epoch_to_process, e); + error!( + "Failed to submit epoch {} to NCN: {:?}", + epoch_to_process, e + ); } stage = OperatorState::WaitForNextEpoch; } From 3e3f326d0fc2e52f4ccf29020a1be32b9859e78a Mon Sep 17 00:00:00 2001 From: gzalz Date: Mon, 10 Nov 2025 12:54:31 -0700 Subject: [PATCH 5/5] additional datapoint_error --- tip-router-operator-cli/src/process_epoch.rs | 23 +++++++++++++++++ tip-router-operator-cli/src/submit.rs | 26 +++++++++++++++++--- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/tip-router-operator-cli/src/process_epoch.rs b/tip-router-operator-cli/src/process_epoch.rs index 164f9b20..0675814d 100644 --- a/tip-router-operator-cli/src/process_epoch.rs +++ b/tip-router-operator-cli/src/process_epoch.rs @@ -119,6 +119,13 @@ pub async fn loop_stages( Ok(info) => break info, Err(e) => { error!("Error getting epoch info from RPC. Retrying..."); + datapoint_error!( + "tip_router_cli.get_epoch_info", + ("operator_address", cli.operator_address.clone(), String), + ("status", "error", String), + ("error", e.to_string(), String), + "cluster" => &cli.cluster, + ); tokio::time::sleep(Duration::from_secs(5)).await; } } @@ -131,6 +138,13 @@ pub async fn loop_stages( Ok(schedule) => break schedule, Err(e) => { error!("Error getting epoch schedule from RPC. Retrying..."); + datapoint_error!( + "tip_router_cli.get_epoch_schedule", + ("operator_address", cli.operator_address.clone(), String), + ("status", "error", String), + ("error", e.to_string(), String), + "cluster" => &cli.cluster, + ); tokio::time::sleep(Duration::from_secs(5)).await; } } @@ -333,6 +347,15 @@ pub async fn loop_stages( "Failed to submit epoch {} to NCN: {:?}", epoch_to_process, e ); + datapoint_error!( + "tip_router_cli.cast_vote", + ("operator_address", operator_address.to_string(), String), + ("epoch", epoch_to_process, i64), + ("status", "error", String), + ("error", e.to_string(), String), + ("state", "cast_vote", String), + "cluster" => &cli.cluster, + ); } stage = OperatorState::WaitForNextEpoch; } diff --git a/tip-router-operator-cli/src/submit.rs b/tip-router-operator-cli/src/submit.rs index e77a238e..2bfccefe 100644 --- a/tip-router-operator-cli/src/submit.rs +++ b/tip-router-operator-cli/src/submit.rs @@ -136,15 +136,35 @@ pub async fn submit_to_ncn( } }; - let ballot_box = BallotBox::try_from_slice_unchecked(&ballot_box_account.data) - .map_err(|e| anyhow::anyhow!("Failed to deserialize ballot box: {:?}", e))?; + let ballot_box = + BallotBox::try_from_slice_unchecked(&ballot_box_account.data).map_err(|e| { + datapoint_error!( + "tip_router_cli.ballot_box_deserialize_error", + ("operator_address", operator_address.to_string(), String), + ("epoch", tip_router_target_epoch, i64), + ("status", "error", String), + ("error", format!("{:?}", e), String), + "cluster" => cluster, + ); + anyhow::anyhow!("Failed to deserialize ballot box: {:?}", e) + })?; let is_voting_valid = ballot_box .is_voting_valid( epoch_info.absolute_slot, config.valid_slots_after_consensus(), ) - .map_err(|e| anyhow::anyhow!("Failed to determine if voting is valid: {:?}", e))?; + .map_err(|e| { + datapoint_error!( + "tip_router_cli.voting_validity_error", + ("operator_address", operator_address.to_string(), String), + ("epoch", tip_router_target_epoch, i64), + ("status", "error", String), + ("error", format!("{:?}", e), String), + "cluster" => cluster, + ); + anyhow::anyhow!("Failed to determine if voting is valid: {:?}", e) + })?; // If exists, look for vote from current operator let vote = ballot_box