Commit 17101d6
Add Timeout and Retry Logic for Initial Connection (#1206)
The staking miner can hang indefinitely during the initial connection attempt. When this happens, only this log appears:

```
attempting to connect to "wss://polkadot-asset-hub-rpc.polkadot.io"
```

and the process never progresses or errors out. This is problematic in production because:

1. The miner appears to be running but is actually stuck.
2. No alerts are triggered (the prometheus endpoint is not yet started).
3. The only remedy is to manually detect the stall and restart.

The `ReconnectingRpcClient` has retry logic (10 attempts with exponential backoff), but if each individual connection attempt hangs internally, the retry logic never gets a chance to kick in.

Add timeout and retry logic to `Client::new()`, following the existing patterns used in `monitor.rs` for stall detection.
1 parent 7d56771 commit 17101d6
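For orientation, the shape of the change is a bounded retry loop wrapped around a timeout-guarded connection future. The sketch below is a minimal, standalone illustration of that pattern using `tokio::time::timeout`; the constants and the `connect_once`/`connect_with_retry` helpers are illustrative stand-ins, not this crate's API — the real implementation is in `src/client.rs` in the diff below.

```rust
use std::time::Duration;

// Illustrative constants mirroring the ones added in src/client.rs.
const ATTEMPT_TIMEOUT: Duration = Duration::from_secs(30);
const MAX_ATTEMPTS: u32 = 3;
const RETRY_DELAY: Duration = Duration::from_secs(5);

/// Hypothetical stand-in for a single connection attempt.
async fn connect_once(uri: &str) -> Result<(), String> {
    // In the real code this builds the ReconnectingRpcClient and backend.
    let _ = uri;
    Ok(())
}

/// Retry a timeout-bounded connection attempt up to MAX_ATTEMPTS times.
async fn connect_with_retry(uri: &str) -> Result<(), String> {
    for attempt in 1..=MAX_ATTEMPTS {
        match tokio::time::timeout(ATTEMPT_TIMEOUT, connect_once(uri)).await {
            // Attempt finished in time and succeeded.
            Ok(Ok(client)) => return Ok(client),
            // Attempt finished in time but failed; give up on the last attempt.
            Ok(Err(e)) if attempt == MAX_ATTEMPTS => return Err(e),
            Ok(Err(e)) => eprintln!("attempt {attempt}/{MAX_ATTEMPTS} failed: {e}"),
            // Attempt hung past the timeout; give up on the last attempt.
            Err(_) if attempt == MAX_ATTEMPTS => {
                return Err(format!("timed out after {ATTEMPT_TIMEOUT:?}"))
            },
            Err(_) => eprintln!("attempt {attempt}/{MAX_ATTEMPTS} timed out"),
        }
        tokio::time::sleep(RETRY_DELAY).await;
    }
    unreachable!("the loop either returns a client or an error")
}

#[tokio::main]
async fn main() {
    if let Err(e) = connect_with_retry("wss://example-rpc.invalid").await {
        eprintln!("giving up: {e}");
    }
}
```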

4 files changed: +116 -26 lines

src/client.rs

Lines changed: 93 additions & 23 deletions
@@ -1,10 +1,24 @@
-use crate::prelude::{ChainClient, Config, LOG_TARGET};
+use crate::{
+    error::{Error, TimeoutError},
+    prelude::{ChainClient, Config, LOG_TARGET},
+    prometheus,
+};
 use std::{sync::Arc, time::Duration};
 use subxt::backend::{
     chain_head::{ChainHeadBackend, ChainHeadBackendBuilder},
     rpc::reconnecting_rpc_client::{ExponentialBackoff, RpcClient as ReconnectingRpcClient},
 };
 
+/// Timeout for each connection attempt in seconds.
+/// If a connection attempt doesn't complete within this time, we retry.
+const CONNECTION_ATTEMPT_TIMEOUT_SECS: u64 = 30;
+
+/// Maximum number of connection attempts before giving up.
+const MAX_CONNECTION_ATTEMPTS: u32 = 3;
+
+/// Delay between connection attempts in seconds.
+const CONNECTION_RETRY_DELAY_SECS: u64 = 5;
+
 /// Wraps the subxt interfaces to make it easy to use for the staking-miner.
 #[derive(Clone, Debug)]
 pub struct Client {
@@ -13,28 +27,84 @@ pub struct Client {
 }
 
 impl Client {
-    pub async fn new(uri: &str) -> Result<Self, subxt::Error> {
-        log::debug!(target: LOG_TARGET, "attempting to connect to {uri:?}");
-
-        // Create a reconnecting RPC client with exponential backoff
-        let reconnecting_rpc =
-            ReconnectingRpcClient::builder()
-                .retry_policy(
-                    ExponentialBackoff::from_millis(500)
-                        .max_delay(Duration::from_secs(30))
-                        .take(10), // Allow up to 10 retry attempts before giving up
-                )
-                .build(uri.to_string())
-                .await
-                .map_err(|e| subxt::Error::Other(format!("Failed to connect: {e:?}")))?;
-
-        let backend: ChainHeadBackend<Config> =
-            ChainHeadBackendBuilder::default().build_with_background_driver(reconnecting_rpc);
-        let chain_api = ChainClient::from_backend(Arc::new(backend)).await?;
-
-        log::info!(target: LOG_TARGET, "Connected to {uri} with ChainHead backend");
-
-        Ok(Self { chain_api })
+    pub async fn new(uri: &str) -> Result<Self, Error> {
+        for attempt in 1..=MAX_CONNECTION_ATTEMPTS {
+            log::debug!(
+                target: LOG_TARGET,
+                "attempting to connect to {uri:?} (attempt {attempt}/{MAX_CONNECTION_ATTEMPTS})"
+            );
+
+            match Self::try_connect(uri).await {
+                Ok(client) => return Ok(client),
+                Err(e) => {
+                    if attempt == MAX_CONNECTION_ATTEMPTS {
+                        log::error!(
+                            target: LOG_TARGET,
+                            "Failed to connect after {MAX_CONNECTION_ATTEMPTS} attempts: {e:?}"
+                        );
+                        return Err(e);
+                    }
+                    log::warn!(
+                        target: LOG_TARGET,
+                        "Connection attempt {attempt}/{MAX_CONNECTION_ATTEMPTS} failed: {e:?}, \
+                        retrying in {CONNECTION_RETRY_DELAY_SECS}s..."
+                    );
+                    tokio::time::sleep(Duration::from_secs(CONNECTION_RETRY_DELAY_SECS)).await;
+                },
+            }
+        }
+
+        unreachable!("Loop should have returned or errored")
+    }
+
+    async fn try_connect(uri: &str) -> Result<Self, Error> {
+        // Wrap the entire connection process with a timeout
+        let connect_future = async {
+            // Create a reconnecting RPC client with exponential backoff
+            let reconnecting_rpc =
+                ReconnectingRpcClient::builder()
+                    .retry_policy(
+                        ExponentialBackoff::from_millis(500)
+                            .max_delay(Duration::from_secs(10))
+                            .take(3), // Fewer internal retries since we have outer retry loop
+                    )
+                    .build(uri.to_string())
+                    .await
+                    .map_err(|e| Error::Other(format!("Failed to connect: {e:?}")))?;
+
+            let backend: ChainHeadBackend<Config> =
+                ChainHeadBackendBuilder::default().build_with_background_driver(reconnecting_rpc);
+            let chain_api = ChainClient::from_backend(Arc::new(backend)).await?;
+
+            Ok::<Self, Error>(Self { chain_api })
+        };
+
+        match tokio::time::timeout(
+            Duration::from_secs(CONNECTION_ATTEMPT_TIMEOUT_SECS),
+            connect_future,
+        )
+        .await
+        {
+            Ok(result) => {
+                if result.is_ok() {
+                    log::info!(target: LOG_TARGET, "Connected to {uri} with ChainHead backend");
+                }
+                result
+            },
+            Err(_) => {
+                prometheus::on_connection_timeout();
+                log::warn!(
+                    target: LOG_TARGET,
+                    "Connection attempt timed out after {CONNECTION_ATTEMPT_TIMEOUT_SECS}s"
+                );
+                Err(TimeoutError::InitialConnection {
+                    timeout_secs: CONNECTION_ATTEMPT_TIMEOUT_SECS,
+                    attempt: 0, // Will be filled by caller context
+                    max_attempts: MAX_CONNECTION_ATTEMPTS,
+                }
+                .into())
+            },
+        }
     }
 
     /// Get a reference to the chain API.

src/error.rs

Lines changed: 4 additions & 0 deletions
@@ -30,6 +30,10 @@ pub enum TimeoutError {
     ScoreCheck { timeout_secs: u64 },
     #[error("Missing pages submission timed out after {timeout_secs} seconds")]
     MissingPages { timeout_secs: u64 },
+    #[error(
+        "Initial connection timed out after {timeout_secs} seconds (attempt {attempt}/{max_attempts})"
+    )]
+    InitialConnection { timeout_secs: u64, attempt: u32, max_attempts: u32 },
 }
 
 #[derive(thiserror::Error, Debug)]

src/main.rs

Lines changed: 7 additions & 3 deletions
@@ -93,6 +93,13 @@ async fn main() -> Result<(), Error> {
     let filter = EnvFilter::from_default_env().add_directive(log.parse()?);
     tracing_subscriber::fmt().with_env_filter(filter).init();
 
+    // Start prometheus endpoint early so metrics are available during connection attempts.
+    if let Err(e) = prometheus::run(prometheus_port).await {
+        log::warn!("Failed to start prometheus endpoint: {e}");
+    }
+    // Initialize the timestamp so that if connection hangs, the stall detection alert can fire.
+    prometheus::set_last_block_processing_time();
+
     let client = Client::new(&uri).await?;
 
     let version_bytes = client
@@ -106,9 +113,6 @@ async fn main() -> Result<(), Error> {
     Decode::decode(&mut &version_bytes[..])?;
 
     let chain = opt::Chain::try_from(&runtime_version)?;
-    if let Err(e) = prometheus::run(prometheus_port).await {
-        log::warn!("Failed to start prometheus endpoint: {e}");
-    }
     log::info!(target: LOG_TARGET, "Connected to chain: {chain}");
 
     SHARED_CLIENT.set(client.clone()).expect("shared client only set once; qed");

src/prometheus.rs

Lines changed: 12 additions & 0 deletions
@@ -391,6 +391,14 @@ mod hidden {
        .unwrap()
    });
 
+    static CONNECTION_TIMEOUTS: Lazy<Counter> = Lazy::new(|| {
+        register_counter!(opts!(
+            "staking_miner_connection_timeouts_total",
+            "Total number of initial connection attempt timeouts"
+        ))
+        .unwrap()
+    });
+
     pub fn on_runtime_upgrade() {
         RUNTIME_UPGRADES.inc();
     }
@@ -526,4 +534,8 @@ mod hidden {
     pub fn on_era_pruning_timeout() {
         ERA_PRUNING_TIMEOUTS.inc();
     }
+
+    pub fn on_connection_timeout() {
+        CONNECTION_TIMEOUTS.inc();
+    }
 }
