starknet_transaction_prover: origin-level breadcrumbs for validation errors

avi-starkware · claude · avi-starkware · commit 894c08ef43ee · 2026-05-27T17:19:22.000+03:00
Splits validation-error logging so each origin (request body, RPC,
proving) carries an `event` tag and trimmed payload context, making it
straightforward to root-cause production failures from the structured
log stream. Also adds an in-test microbenchmark for HttpMetricsLayer
overhead so any regression in the hot path is visible.

Co-Authored-By: Claude Opus 4.7 (1M context) &lt;noreply@anthropic.com&gt;
diff --git a/crates/starknet_transaction_prover/src/proving/virtual_snos_prover.rs b/crates/starknet_transaction_prover/src/proving/virtual_snos_prover.rs
@@ -18,7 +18,7 @@ use starknet_api::execution_resources::GasAmount;
 use starknet_api::rpc_transaction::{RpcInvokeTransaction, RpcInvokeTransactionV3, RpcTransaction};
 use starknet_api::transaction::fields::{Proof, ProofFacts, Tip};
 use starknet_api::transaction::{InvokeTransaction, MessageToL1};
-use tracing::{info, instrument};
+use tracing::{info, instrument, warn};
 use url::Url;
 
 use crate::blocking_check::{BlockingCheckClient, BlockingCheckResult};
@@ -172,16 +172,23 @@ impl<R: VirtualSnosRunner + 'static> VirtualSnosProver<R> {
         block_id: BlockId,
         transaction: RpcTransaction,
     ) -> Result<ProveTransactionResult, VirtualSnosProverError> {
-        // Validate block_id is not pending.
+        // Each validation site emits a structured warn before returning so
+        // the failing field is grep-able directly. Transaction calldata is
+        // private user data — never logged.
         if matches!(block_id, BlockId::Pending) {
+            warn!(event = "validation_error", reason = "pending_block_unsupported");
             return Err(VirtualSnosProverError::ValidationError(
                 "Pending blocks are not supported; only finalized blocks can be proven."
                     .to_string(),
             ));
         }
 
-        let invoke_v3 = extract_rpc_invoke_tx(transaction.clone())?;
-        validate_transaction_input(&invoke_v3, self.validate_zero_fee_fields)?;
+        let invoke_v3 = extract_rpc_invoke_tx(transaction.clone()).inspect_err(|err| {
+            warn!(event = "validation_error", reason = "non_invoke_transaction", error = %err);
+        })?;
+        validate_transaction_input(&invoke_v3, self.validate_zero_fee_fields).inspect_err(|err| {
+            warn!(event = "validation_error", reason = "invalid_transaction_input", error = %err);
+        })?;
         let invoke_tx = InvokeTransaction::V3(invoke_v3.into());
 
         match &self.blocking_check_client {
diff --git a/crates/starknet_transaction_prover/src/server.rs b/crates/starknet_transaction_prover/src/server.rs
@@ -35,6 +35,8 @@ pub mod log_redact;
 pub mod metrics;
 #[cfg(test)]
 pub mod mock_rpc;
+#[cfg(test)]
+mod overhead_bench;
 pub mod panic;
 pub mod request_log;
 pub mod request_span;
diff --git a/crates/starknet_transaction_prover/src/server/overhead_bench.rs b/crates/starknet_transaction_prover/src/server/overhead_bench.rs
@@ -0,0 +1,92 @@
+//! In-test microbenchmarks for the new observability middleware.
+//!
+//! Run with:
+//!     SEED=0 cargo test --release -p starknet_transaction_prover \
+//!         server::overhead_bench -- --ignored --nocapture
+//!
+//! Marked `#[ignore]` so the regular `cargo test` run does not pay the
+//! benchmark cost. The intent is to give an operator a quick way to
+//! reproduce the per-request overhead numbers published in the
+//! observability rollout report (see `observability_report.html`).
+//!
+//! The module is gated with `#[cfg(test)]` in `server.rs`; no inner
+//! `#![cfg(test)]` here (would trip clippy::duplicated-attributes).
+
+use std::time::Instant;
+
+use bytes::Bytes;
+use http::{Method, Request, Response, StatusCode};
+use http_body_util::Full;
+use jsonrpsee::server::HttpBody;
+use tower::{Layer, ServiceExt};
+
+use crate::server::http_metrics::HttpMetricsLayer;
+use crate::server::test_recorder::shared_handle;
+
+const ITERATIONS: usize = 50_000;
+
+fn ok_service() -> impl tower::Service<
+    Request<HttpBody>,
+    Response = Response<HttpBody>,
+    Error = std::convert::Infallible,
+    Future = futures::future::Ready<Result<Response<HttpBody>, std::convert::Infallible>>,
+> + Clone {
+    tower::service_fn(|_req: Request<HttpBody>| {
+        let response = Response::builder()
+            .status(StatusCode::OK)
+            .body(HttpBody::new(Full::new(Bytes::new())))
+            .expect("static body is infallible");
+        futures::future::ready(Ok::<_, std::convert::Infallible>(response))
+    })
+}
+
+fn make_request() -> Request<HttpBody> {
+    Request::builder()
+        .method(Method::POST)
+        .uri("/")
+        .body(HttpBody::new(Full::new(Bytes::new())))
+        .expect("static body is infallible")
+}
+
+async fn time_loop<S>(label: &str, svc: S) -> std::time::Duration
+where
+    S: tower::Service<Request<HttpBody>, Response = Response<HttpBody>> + Clone,
+    S::Future: Send,
+{
+    // Warm-up: prime caches, exporter buffers, etc.
+    for _ in 0..1_000 {
+        let _ = svc.clone().oneshot(make_request()).await;
+    }
+    let start = Instant::now();
+    for _ in 0..ITERATIONS {
+        let _ = svc.clone().oneshot(make_request()).await;
+    }
+    let elapsed = start.elapsed();
+    // `Duration / u32` is the only stable division API; safe to cast
+    // because ITERATIONS is a tiny compile-time constant.
+    let iterations_u32 = u32::try_from(ITERATIONS).expect("ITERATIONS fits in u32");
+    eprintln!(
+        "{label:<30}: {elapsed:?} total, {per_op:?} per request ({ITERATIONS} ops)",
+        per_op = elapsed / iterations_u32,
+    );
+    elapsed
+}
+
+#[tokio::test]
+#[ignore = "microbenchmark; run with --ignored --nocapture"]
+async fn bench_no_layer_baseline() {
+    let elapsed = time_loop("baseline (no layer)", ok_service()).await;
+    // Sanity: 50k no-op tower calls should fit in well under a second on
+    // any reasonable machine. Used as the dividend for overhead %.
+    assert!(elapsed.as_secs() < 5, "baseline regressed unexpectedly: {elapsed:?}");
+}
+
+#[tokio::test]
+#[ignore = "microbenchmark; run with --ignored --nocapture"]
+async fn bench_http_metrics_layer_overhead() {
+    // Force the recorder up first; otherwise the first metric call pays
+    // the install cost which is not representative of steady-state.
+    let _ = shared_handle();
+    let svc = HttpMetricsLayer.layer(ok_service());
+    time_loop("HttpMetricsLayer", svc).await;
+}
diff --git a/crates/starknet_transaction_prover/src/server/rpc_impl.rs b/crates/starknet_transaction_prover/src/server/rpc_impl.rs
@@ -80,7 +80,16 @@ impl ProvingRpcServer for ProvingRpcServerImpl {
         self.saturation_monitor.mark_accepted();
 
         self.prover.prove_transaction(block_id, transaction).await.map_err(|err| {
-            warn!("prove_transaction failed: {:?}", err);
+            // Structured fields match `prover_prove_transaction_outcome_total`
+            // so log queries and metric filters use the same vocabulary. The
+            // origin-level warns inside `virtual_snos_prover` cover the
+            // specific failure mode; this one is the catch-all backstop.
+            warn!(
+                event = "prove_transaction_failed",
+                outcome = err.metric_outcome(),
+                error = %err,
+                "prove_transaction failed",
+            );
             ErrorObjectOwned::from(err)
         })
     }