Skip to content

Commit ad7a55d

Browse files
tomg10 and claude authored
fix: add explicit timeouts to all reqwest HTTP clients + add lots of prover logs (#4772)
Several HTTP clients across the codebase were created with `reqwest::Client::new()`, which has no timeout configured. This can cause indefinite hangs when a remote server is slow or unresponsive, with no error logged — the task silently freezes.

This was the root cause of batch 58064 being stuck for 5 hours: the ProofGenDataSubmitter's HTTP POST to the prover gateway hung indefinitely, leaving the batch as `picked_by_prover` until the `proof_generation_timeout` (18000s) reclaim caught it.

Affected clients:
- proof_data_handler HTTP client (60s) — root cause of the incident
- Avail DA bridge API client (60s)
- Contract verifier GitHub resolver (300s — downloads large binaries)
- Metadata calculator tree API client (60s)
- ZK OS tree manager API client (60s)
- Prover autoscaler HTTP client (60s)

## What ❔
<!-- What are the changes this PR brings about? -->
<!-- Example: This PR adds a PR template to the repo. -->
<!-- (For bigger PRs adding more context is appreciated) -->

## Why ❔
<!-- Why are these changes done? What goal do they contribute to? What are the principles behind them? -->
<!-- The `Why` has to be clear to non-Matter Labs entities running their own ZK Chain -->
<!-- Example: PR templates ensure PR reviewers, observers, and future iterators are in context about the evolution of repos. -->

## Is this a breaking change?
- [ ] Yes
- [ ] No

## Operational changes
<!-- Any config changes? Any new flags? Any changes to any scripts? -->
<!-- Please add anything that non-Matter Labs entities running their own ZK Chain may need to know -->

## Checklist
<!-- Check your PR fulfills the following items. -->
<!-- For draft PRs check the boxes as you complete them. -->
- [ ] PR title corresponds to the body of PR (we generate changelog entries from PRs).
- [ ] Tests for the changes have been added / updated.
- [ ] Documentation comments have been added / updated.
- [ ] Code has been formatted via `zkstack dev fmt` and `zkstack dev lint`.

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 19a6890 commit ad7a55d

File tree

29 files changed: +137 -40 lines changed

core/bin/contract-verifier/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,7 @@ async fn main() -> anyhow::Result<()> {
94  94   verifier_config.etherscan_api_url.is_some() && etherscan_api_key.is_some();
95  95   let contract_verifier = ContractVerifier::new(
96  96   verifier_config.compilation_timeout,
    97 + verifier_config.compiler_download_timeout,
97  98   pool.clone(),
98  99   etherscan_verifier_enabled,
99 100   )

core/bin/external_node/src/config/tests/config.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -79,6 +79,7 @@ api:
79 79   latest_values_cache_size_mb: 200
80 80   request_timeout_sec: 20
81 81   tree_api_url: http://tree/
   82 + tree_api_request_timeout_sec: 45
82 83   max_batch_request_size: 50
83 84   websocket_requests_per_minute_limit: 1000
84 85   mempool_cache_size: 1000

core/bin/external_node/src/config/tests/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -304,6 +304,7 @@ fn parsing_from_full_env() {
304 304
305 305   # API component config
306 306   EN_API_TREE_API_REMOTE_URL=http://tree/
    307 + EN_API_WEB3_JSON_RPC_TREE_API_REQUEST_TIMEOUT_SEC=45
307 308   # Tree component config
308 309   EN_TREE_API_PORT=2955
309 310
@@ -563,6 +564,7 @@ fn avail_da_client_from_env() {
563 564   EN_DA_AVAIL_CLIENT_TYPE="FullClient"
564 565   EN_DA_BRIDGE_API_URL="localhost:54321"
565 566   EN_DA_TIMEOUT_MS="2000"
    567 + EN_DA_API_CLIENT_TIMEOUT_SEC="90"
566 568   EN_DA_API_NODE_URL="localhost:12345"
567 569   EN_DA_APP_ID="1"
568 570   EN_DA_FINALITY_STATE="inBlock"

core/bin/external_node/src/node_builder.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -432,6 +432,7 @@ impl ExternalNodeBuilder {
432 432   fn add_tree_api_client_layer(mut self) -> anyhow::Result<Self> {
433 433   self.node.add_layer(TreeApiClientLayer::http(
434 434   self.config.local.api.web3_json_rpc.tree_api_url.clone(),
    435 + self.config.local.api.web3_json_rpc.tree_api_request_timeout,
435 436   ));
436 437   Ok(self)
437 438   }

core/bin/zksync_server/src/node_builder.rs

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -460,8 +460,10 @@ impl MainNodeBuilder {
460 460
461 461   fn add_tree_api_client_layer(mut self) -> anyhow::Result<Self> {
462 462   let rpc_config = try_load_config!(self.configs.api_config).web3_json_rpc;
463     - self.node
464     - .add_layer(TreeApiClientLayer::http(rpc_config.tree_api_url));
    463 + self.node.add_layer(TreeApiClientLayer::http(
    464 + rpc_config.tree_api_url,
    465 + rpc_config.tree_api_request_timeout,
    466 + ));
465 467   Ok(self)
466 468   }
467 469

core/deny.toml

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,13 @@ ignore = [
31 31   "RUSTSEC-2026-0047",
32 32   "RUSTSEC-2026-0048",
33 33   "RUSTSEC-2026-0049", # `rustls-webpki` CRL vulnerability, old 0.101.7/0.102.8 pinned by rustls 0.21/0.22 (transitive deps of aws-smithy, hyper-rustls)
   34 + # `rand` 0.8.x unsoundness in `ThreadRng` reseed under a custom logger calling rand from
   35 + # log macros. Patch only exists for >=0.9.3 / >=0.10.1; no fix backported to 0.8.x. We
   36 + # already updated 0.9.x to 0.9.3; rand 0.8.5 remains via many transitive deps
   37 + # (alloy-primitives, sqlx-postgres, jsonrpsee, boojum, secp256k1, foundry-compilers, ...)
   38 + # that haven't migrated to rand 0.9. The unsoundness requires a custom logger that calls
   39 + # rand from logging callbacks, which we don't do.
   40 + "RUSTSEC-2026-0097",
34 41   ]
35 42
36 43   [licenses]

core/lib/config/src/configs/api.rs

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -385,6 +385,9 @@ pub struct Web3JsonRpcConfig {
385 385   /// since the server can communicate with the tree in-process.
386 386   #[config(alias = "tree_api_remote_url")]
387 387   pub tree_api_url: Option<String>,
    388 + /// Total request timeout for the Tree API HTTP client. Only used when [`Self::tree_api_url`] is set.
    389 + #[config(default_t = Duration::from_secs(60))]
    390 + pub tree_api_request_timeout: Duration,
388 391   /// Polling period for mempool cache update - how often the mempool cache is updated from the database.
389 392   #[config(default_t = Duration::from_millis(50), with = Fallback(TimeUnit::Millis))]
390 393   pub mempool_cache_update_interval: Duration,
@@ -542,6 +545,7 @@ mod tests {
542 545   websocket_requests_per_minute_limit: NonZeroU32::new(10).unwrap(),
543 546   request_timeout: Some(Duration::from_secs(20)),
544 547   tree_api_url: Some("http://tree/".into()),
    548 + tree_api_request_timeout: Duration::from_secs(45),
545 549   mempool_cache_update_interval: Duration::from_millis(50),
546 550   mempool_cache_size: 10000,
547 551   whitelisted_tokens_for_aa: vec![
@@ -605,6 +609,7 @@ mod tests {
605 609   API_CONTRACT_VERIFICATION_PORT="3070"
606 610   API_CONTRACT_VERIFICATION_URL="http://127.0.0.1:3070"
607 611   API_WEB3_JSON_RPC_TREE_API_URL="http://tree/"
    612 + API_WEB3_JSON_RPC_TREE_API_REQUEST_TIMEOUT_SEC=45
608 613   API_WEB3_JSON_RPC_MAX_RESPONSE_BODY_SIZE_MB=15
609 614   API_WEB3_JSON_RPC_MAX_RESPONSE_BODY_SIZE_OVERRIDES_MB="eth_call=1, eth_getTransactionReceipt=None, zks_getProof=32"
610 615   API_PROMETHEUS_LISTENER_PORT="3312"
@@ -668,6 +673,7 @@ mod tests {
668 673   eth_call_gas_cap: null
669 674   request_timeout_sec: 20
670 675   tree_api_url: "http://tree/"
    676 + tree_api_request_timeout_sec: 45
671 677   send_raw_tx_sync_max_timeout_ms: 10000
672 678   send_raw_tx_sync_default_timeout_ms: 2000
673 679   prometheus:
@@ -733,6 +739,7 @@ mod tests {
733 739   eth_call_gas_cap: null
734 740   request_timeout: 20s
735 741   tree_api_url: "http://tree/"
    742 + tree_api_request_timeout: 45s
736 743   send_raw_tx_sync_max_timeout_ms: 10000
737 744   send_raw_tx_sync_default_timeout_ms: 2000
738 745   prometheus:

core/lib/config/src/configs/contract_verifier.rs

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,10 @@ pub struct ContractVerifierConfig {
13 13   /// Max time of a single compilation.
14 14   #[config(default_t = 4 * TimeUnit::Minutes, with = Fallback(TimeUnit::Seconds))]
15 15   pub compilation_timeout: Duration,
   16 + /// Total request timeout for the GitHub compiler resolver HTTP client used to download
   17 + /// compiler binaries. Defaults to 5 minutes since binaries can be large.
   18 + #[config(default_t = 5 * TimeUnit::Minutes, with = Fallback(TimeUnit::Seconds))]
   19 + pub compiler_download_timeout: Duration,
16 20   /// Port to which the Prometheus exporter server is listening.
17 21   #[config(default_t = 3_318)]
18 22   pub prometheus_port: u16,
@@ -39,6 +43,7 @@ mod tests {
39 43   fn expected_config() -> ContractVerifierConfig {
40 44   ContractVerifierConfig {
41 45   compilation_timeout: Duration::from_secs(30),
   46 + compiler_download_timeout: Duration::from_secs(600),
42 47   prometheus_port: 3314,
43 48   port: 3070,
44 49   etherscan_api_url: Some("https://api.etherscan.io/".to_owned()),
@@ -49,6 +54,7 @@ mod tests {
49 54   fn parsing_from_env() {
50 55   let env = r#"
51 56   CONTRACT_VERIFIER_COMPILATION_TIMEOUT=30
   57 + CONTRACT_VERIFIER_COMPILER_DOWNLOAD_TIMEOUT=600
52 58   CONTRACT_VERIFIER_PROMETHEUS_PORT=3314
53 59   CONTRACT_VERIFIER_PORT=3070
54 60   CONTRACT_VERIFIER_ETHERSCAN_API_URL="https://api.etherscan.io/"
@@ -66,6 +72,7 @@ mod tests {
66 72   let yaml = r#"
67 73   port: 3070
68 74   compilation_timeout: 30
   75 + compiler_download_timeout: 600
69 76   prometheus_port: 3314
70 77   etherscan_api_url: https://api.etherscan.io/
71 78   "#;
@@ -79,6 +86,7 @@ mod tests {
79 86   let yaml = r#"
80 87   port: 3070
81 88   compilation_timeout: 30s
   89 + compiler_download_timeout: 10 min
82 90   prometheus_port: 3314
83 91   etherscan_api_url: https://api.etherscan.io/
84 92   "#;

core/lib/config/src/configs/da_client/avail.rs

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,11 @@ pub struct AvailConfig {
19 19   pub bridge_api_url: String,
20 20   #[config(default_t = Duration::from_secs(30))]
21 21   pub timeout: Duration,
   22 + /// Total request timeout for the bridge API HTTP client.
   23 + /// Applies to every request made by the client; individual requests can still
   24 + /// further restrict via [`Self::timeout`].
   25 + #[config(default_t = Duration::from_secs(60))]
   26 + pub api_client_timeout: Duration,
22 27   #[config(flatten)]
23 28   pub config: AvailClientConfig,
24 29   }

core/lib/config/src/configs/da_client/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -153,6 +153,7 @@ mod tests {
153 153   client: Avail
154 154   bridge_api_url: https://bridge-api.avail.so
155 155   timeout_ms: 20000
    156 + api_client_timeout: 90s
156 157   avail_client_type: GasRelay
157 158   gas_relay_api_url: https://lens-turbo-api.availproject.org
158 159   max_retries: 4
@@ -252,6 +253,7 @@ mod tests {
252 253   client: Avail
253 254   bridge_api_url: https://turing-bridge-api.avail.so
254 255   timeout: 20s
    256 + api_client_timeout: 90s
255 257   dispatch_timeout: 5s
256 258   avail_client_type: FullClient
257 259   api_node_url: wss://turing-rpc.avail.so/ws

0 commit comments

Comments
 (0)