Skip to content

Commit 8714144

Browse files
authored
feat: Revive continue-on-error mode (#11)
This PR enables correct processing of panics for the continue-on-error mode, and also adds logic to re-instantiate prover in case of panic. Some minor fixes here and there as well.
1 parent 07975e9 commit 8714144

File tree

9 files changed

+145
-100
lines changed

9 files changed

+145
-100
lines changed

ethereum_prover/configs/ethproofs_prod.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Use this config to submit proofs to EthProofs staging
22

33
eth_prover:
4-
app_bin_path: "../../artifacts/app.bin"
4+
app_bin_path: "../artifacts/app.bin"
55
mode: "gpu_prove"
66
cache_policy: "on_failure"
77
ethproofs_submission: "prod"

ethereum_prover/configs/ethproofs_staging.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Use this config to submit proofs to EthProofs staging
22

33
eth_prover:
4-
app_bin_path: "../../artifacts/app.bin"
4+
app_bin_path: "../artifacts/app.bin"
55
mode: "gpu_prove"
66
cache_policy: "on_failure"
77
ethproofs_submission: "staging"

ethereum_prover/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ pub mod metrics;
2020
pub mod prover;
2121
pub(crate) mod tasks;
2222
pub(crate) mod types;
23+
pub(crate) mod utils;
2324

2425
#[derive(Debug, Default)]
2526
pub struct Runner {}

ethereum_prover/src/prover/cpu_witness.rs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use std::path::PathBuf;
44
use alloy::providers::DynProvider;
55
use alloy::providers::Provider;
66
use alloy::rpc::types::Transaction;
7+
use anyhow::Context as _;
78
use basic_bootloader::bootloader::BasicBootloader;
89
use basic_bootloader::bootloader::config::BasicBootloaderForwardETHLikeConfig;
910
use forward_system::run::InvalidTransaction;
@@ -68,7 +69,7 @@ impl CpuWitnessGenerator {
6869
oracle: ZkEENonDeterminismSource,
6970
) -> anyhow::Result<Vec<u32>> {
7071
let app_bin_path = self.app_bin_path.clone();
71-
tokio::task::spawn_blocking(move || {
72+
match tokio::task::spawn_blocking(move || {
7273
let copy_source = ReadWitnessSource::new(oracle);
7374
let items = copy_source.get_read_items();
7475

@@ -80,7 +81,18 @@ impl CpuWitnessGenerator {
8081
let witness = items.borrow().clone();
8182
Ok(witness)
8283
})
83-
.await?
84+
.await
85+
{
86+
Ok(Ok(witness)) => Ok(witness),
87+
Ok(Err(err)) => Err(err).context("Failed to generate witness using zksync_os_runner"),
88+
Err(err) => {
89+
let panic_msg = crate::utils::extract_panic_message(err);
90+
Err(anyhow::anyhow!(
91+
"Witness generation task panicked: {}",
92+
panic_msg
93+
))
94+
}
95+
}
8496
}
8597
}
8698

ethereum_prover/src/prover/gpu_prover.rs

Lines changed: 56 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1+
use anyhow::Context as _;
12
use oracle_provider::ZkEENonDeterminismSource;
2-
use std::path::Path;
3+
use std::path::{Path, PathBuf};
34
use std::sync::{Arc, Mutex};
45
use std::time::Instant;
56

@@ -11,6 +12,8 @@ pub struct ProofResult {
1112
}
1213

1314
pub struct Prover {
15+
app_bin_path: PathBuf,
16+
worker_threads: Option<usize>,
1417
inner: Arc<Mutex<execution_utils::unrolled_gpu::UnrolledProver>>,
1518
}
1619

@@ -22,19 +25,15 @@ impl std::fmt::Debug for Prover {
2225

2326
impl Prover {
2427
pub fn new(app_bin_path: &Path, worker_threads: Option<usize>) -> anyhow::Result<Self> {
25-
let base_path = strip_bin_suffix(app_bin_path)?;
26-
let mut configuration =
27-
execution_utils::gpu_prover::execution::prover::ExecutionProverConfiguration::default();
28-
if let Some(threads) = worker_threads {
29-
configuration.max_thread_pool_threads = Some(threads);
30-
configuration.replay_worker_threads_count = threads;
31-
}
32-
let inner = execution_utils::unrolled_gpu::UnrolledProver::new(
33-
&base_path,
34-
configuration,
35-
execution_utils::unrolled_gpu::UnrolledProverLevel::RecursionUnified,
36-
);
28+
let inner = create_unrolled_prover(app_bin_path, worker_threads).with_context(|| {
29+
format!(
30+
"failed to create unrolled prover with app binary at {:?}",
31+
app_bin_path
32+
)
33+
})?;
3734
Ok(Self {
35+
app_bin_path: app_bin_path.to_path_buf(),
36+
worker_threads,
3837
inner: Arc::new(Mutex::new(inner)),
3938
})
4039
}
@@ -48,11 +47,33 @@ impl Prover {
4847

4948
let inner = self.inner.clone();
5049

51-
let (proof, cycles) = tokio::task::spawn_blocking(move || {
50+
let future_result = tokio::task::spawn_blocking(move || {
5251
let prover = inner.lock().unwrap();
5352
prover.prove(block_number, oracle)
5453
})
55-
.await?;
54+
.await;
55+
let (proof, cycles) = match future_result {
56+
Ok(result) => result,
57+
Err(err) => {
58+
let panic_msg = crate::utils::extract_panic_message(err);
59+
60+
// If prover panics, it is not safe to use it again, since some of threads may be poisoned/dead.
61+
// We need to re-instantiate it.
62+
{
63+
let mut inner = self.inner.lock().unwrap();
64+
// If we cannot reinstantiate the prover for some reason, we cannot do much -- better to panic.
65+
*inner = create_unrolled_prover(self.app_bin_path.as_path(), self.worker_threads).expect(
66+
"failed to re-instantiate prover after panic; prover app binary path is invalid",
67+
);
68+
}
69+
70+
return Err(anyhow::anyhow!(
71+
"Prover task panicked for the block {}: {}",
72+
block_number,
73+
panic_msg
74+
));
75+
}
76+
};
5677

5778
let proving_time_secs = start.elapsed().as_secs_f64();
5879
let proof_bytes = bincode::serde::encode_to_vec(&proof, bincode::config::standard())?;
@@ -74,3 +95,23 @@ fn strip_bin_suffix(path: &Path) -> anyhow::Result<String> {
7495
Ok(path_str.to_string())
7596
}
7697
}
98+
99+
fn create_unrolled_prover(
100+
app_bin_path: &Path,
101+
worker_threads: Option<usize>,
102+
) -> anyhow::Result<execution_utils::unrolled_gpu::UnrolledProver> {
103+
let base_path = strip_bin_suffix(app_bin_path)?;
104+
let mut configuration =
105+
execution_utils::gpu_prover::execution::prover::ExecutionProverConfiguration::default();
106+
if let Some(threads) = worker_threads {
107+
configuration.max_thread_pool_threads = Some(threads);
108+
configuration.replay_worker_threads_count = threads;
109+
}
110+
111+
let unrolled_prover = execution_utils::unrolled_gpu::UnrolledProver::new(
112+
&base_path,
113+
configuration,
114+
execution_utils::unrolled_gpu::UnrolledProverLevel::RecursionUnified,
115+
);
116+
Ok(unrolled_prover)
117+
}

ethereum_prover/src/tasks/block_stream/continuous.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ impl ContinuousBlockStream {
2727
cache: CacheStorage,
2828
cache_policy: CachePolicy,
2929
) -> (Self, Receiver<EthBlockInput>) {
30-
let (sender, receiver) = channel(10);
30+
// Capacity of 1 ensures that we won't be too far behind in case proving takes more time than expected.
31+
let (sender, receiver) = channel(1);
3132

3233
let provider = alloy::providers::ProviderBuilder::new().connect_http(rpc_url);
3334
let provider = DynProvider::new(provider);

ethereum_prover/src/tasks/cpu_witness.rs

Lines changed: 24 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -68,45 +68,31 @@ impl CpuWitnessTask {
6868
})
6969
.await?;
7070
}
71-
Err(err) => match self.on_failure {
72-
OnFailure::Exit => {
73-
METRICS.witness_failure_total.inc();
74-
latency.observe();
75-
sentry::with_scope(
76-
|scope| {
77-
scope.set_level(Some(sentry::Level::Error));
78-
scope.set_tag("mode", "cpu_witness");
79-
scope.set_tag("block_number", block_number.to_string());
80-
},
81-
|| sentry_anyhow::capture_anyhow(&err),
82-
);
83-
return Err(err).with_context(|| {
84-
format!("Failed to generate witness for the block {block_number}")
85-
});
86-
}
87-
OnFailure::Continue => {
88-
METRICS.witness_failure_total.inc();
89-
latency.observe();
90-
sentry::with_scope(
91-
|scope| {
92-
scope.set_level(Some(sentry::Level::Error));
93-
scope.set_tag("mode", "cpu_witness");
94-
scope.set_tag("block_number", block_number.to_string());
95-
},
96-
|| {
97-
sentry_anyhow::capture_anyhow(&err);
98-
},
99-
);
100-
tracing::error!(
101-
"Failed to generate witness for the block {block_number}: {err}"
102-
);
103-
// continue;
104-
// TODO: `continue` mode is not supported yet, as some of the panics are not unwind-safe.
105-
return Err(err).with_context(|| {
106-
format!("Failed to generate witness for the block {block_number}")
107-
});
71+
Err(err) => {
72+
METRICS.witness_failure_total.inc();
73+
latency.observe();
74+
sentry::with_scope(
75+
|scope| {
76+
scope.set_level(Some(sentry::Level::Error));
77+
scope.set_tag("mode", "cpu_witness");
78+
scope.set_tag("block_number", block_number.to_string());
79+
},
80+
|| sentry_anyhow::capture_anyhow(&err),
81+
);
82+
match self.on_failure {
83+
OnFailure::Exit => {
84+
return Err(err).with_context(|| {
85+
format!("Failed to generate witness for the block {block_number}")
86+
});
87+
}
88+
OnFailure::Continue => {
89+
tracing::error!(
90+
"Failed to generate witness for the block {block_number}: {err}"
91+
);
92+
continue;
93+
}
10894
}
109-
},
95+
}
11096
}
11197

11298
// We don't any commands as we're not generating proofs

ethereum_prover/src/tasks/gpu_prove.rs

Lines changed: 32 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,12 @@ impl GpuProveTask {
5555

5656
match self.process_block(witness).await {
5757
Ok(proof_result) => {
58-
tracing::info!("Generated GPU proof for block {}", block_number);
58+
tracing::info!(
59+
"Generated GPU proof for block {}. Number of cycles: {}, proving time: {}s",
60+
block_number,
61+
proof_result.cycles,
62+
proof_result.proving_time_secs
63+
);
5964
METRICS.proof_success_total.inc();
6065
latency.observe();
6166
self.command_sender
@@ -66,48 +71,33 @@ impl GpuProveTask {
6671
.await
6772
.ok();
6873
}
69-
Err(err) => match self.on_failure {
70-
OnFailure::Exit => {
71-
METRICS.proof_failure_total.inc();
72-
latency.observe();
73-
sentry::with_scope(
74-
|scope| {
75-
scope.set_level(Some(sentry::Level::Error));
76-
scope.set_tag("mode", "gpu_prove");
77-
scope.set_tag("block_number", block_number.to_string());
78-
},
79-
|| {
80-
sentry_anyhow::capture_anyhow(&err);
81-
},
82-
);
83-
return Err(err).with_context(|| {
84-
format!("Failed to generate proof for the block {block_number}")
85-
});
86-
}
87-
OnFailure::Continue => {
88-
METRICS.proof_failure_total.inc();
89-
latency.observe();
90-
sentry::with_scope(
91-
|scope| {
92-
scope.set_level(Some(sentry::Level::Error));
93-
scope.set_tag("mode", "gpu_prove");
94-
scope.set_tag("block_number", block_number.to_string());
95-
},
96-
|| {
97-
sentry_anyhow::capture_anyhow(&err);
98-
},
99-
);
100-
tracing::error!(
101-
"Failed to generate proof for the block {block_number}: {err}"
102-
);
103-
// continue;
104-
// TODO: `continue` mode is not supported yet, as some of the panics are not unwind-safe.
105-
// TODO: On failure in the prover itself, we need to re-instantiate the prover.
106-
return Err(err).with_context(|| {
107-
format!("Failed to generate proof for the block {block_number}")
108-
});
74+
Err(err) => {
75+
METRICS.proof_failure_total.inc();
76+
latency.observe();
77+
sentry::with_scope(
78+
|scope| {
79+
scope.set_level(Some(sentry::Level::Error));
80+
scope.set_tag("mode", "gpu_prove");
81+
scope.set_tag("block_number", block_number.to_string());
82+
},
83+
|| {
84+
sentry_anyhow::capture_anyhow(&err);
85+
},
86+
);
87+
match self.on_failure {
88+
OnFailure::Exit => {
89+
return Err(err).with_context(|| {
90+
format!("Failed to generate proof for the block {block_number}")
91+
});
92+
}
93+
OnFailure::Continue => {
94+
tracing::error!(
95+
"Failed to generate proof for the block {block_number}: {err}"
96+
);
97+
continue;
98+
}
10999
}
110-
},
100+
}
111101
}
112102
}
113103

ethereum_prover/src/utils.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
pub(crate) fn extract_panic_message(err: tokio::task::JoinError) -> String {
2+
if err.is_panic() {
3+
let panic = err.into_panic();
4+
if let Some(s) = panic.downcast_ref::<String>() {
5+
s.clone()
6+
} else if let Some(s) = panic.downcast_ref::<&str>() {
7+
s.to_string()
8+
} else {
9+
"Unknown panic".to_string()
10+
}
11+
} else {
12+
"Task was cancelled".to_string()
13+
}
14+
}

0 commit comments

Comments
 (0)