From fd81285eda410660c5069d0cf5dfc780aaaeb158 Mon Sep 17 00:00:00 2001 From: Avi Cohen Date: Wed, 27 May 2026 17:00:32 +0300 Subject: [PATCH] starknet_transaction_prover: per-request structured log with request-id Adds `RequestLogLayer`, the outermost middleware that emits one `event="http_request"` log line per request (request_id, method, path, status, latency_ms) and echoes the id on the response. The id is accepted from `x-request-id` when it is a short printable-ASCII token, otherwise a UUID v4 is generated; hostile inputs are dropped to prevent header smuggling and log-field explosion. No request body bytes are inspected (transaction calldata is private user data). Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 1 + Cargo.toml | 1 + crates/starknet_transaction_prover/Cargo.toml | 1 + .../starknet_transaction_prover/src/server.rs | 6 +- .../src/server/request_log.rs | 165 ++++++++++ .../src/server/request_log_test.rs | 120 +++++++ .../src/server/tls.rs | 2 +- middleware_stack_alternatives.html | 307 ++++++++++++++++++ request_id_pr757_fix.html | 250 ++++++++++++++ request_id_scope.html | 287 ++++++++++++++++ request_id_tradeoff.html | 186 +++++++++++ 11 files changed, 1324 insertions(+), 2 deletions(-) create mode 100644 crates/starknet_transaction_prover/src/server/request_log.rs create mode 100644 crates/starknet_transaction_prover/src/server/request_log_test.rs create mode 100644 middleware_stack_alternatives.html create mode 100644 request_id_pr757_fix.html create mode 100644 request_id_scope.html create mode 100644 request_id_tradeoff.html diff --git a/Cargo.lock b/Cargo.lock index 420e42a07da..38eb4dd312d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13180,6 +13180,7 @@ dependencies = [ "tracing", "tracing-subscriber", "url", + "uuid 1.18.1", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 7e190634b7b..d5d18492ad8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -415,6 +415,7 @@ tracing-test = "0.2" trybuild = "1.0.105" unsigned-varint = "0.8.0" url = "2.5.0" +uuid = "1" validator = "0.20" void = "1.0.2" waker-fn = "1.2.0" diff --git a/crates/starknet_transaction_prover/Cargo.toml b/crates/starknet_transaction_prover/Cargo.toml index 1447e9db25d..192d67f33bc 100644 --- a/crates/starknet_transaction_prover/Cargo.toml +++ b/crates/starknet_transaction_prover/Cargo.toml @@ -57,6 +57,7 @@ tower_ohttp.workspace = true tracing.workspace = true tracing-subscriber = { workspace = true, features = ["env-filter"] } url.workspace = true +uuid = { workspace = true, features = ["v4"] } [dev-dependencies] apollo_starknet_os_program.workspace = true diff --git a/crates/starknet_transaction_prover/src/server.rs b/crates/starknet_transaction_prover/src/server.rs index f19a55bb8a9..42bdbdf9654 100644 --- a/crates/starknet_transaction_prover/src/server.rs +++ b/crates/starknet_transaction_prover/src/server.rs @@ -40,7 +40,8 @@ pub const OHTTP_JSONRPSEE_BODY_BUILDER: fn(Full) -> HttpBody = HttpBody:: /// each call site. /// /// Layer order (tower makes the last-added layer innermost): -/// - `HealthLayer` sits outermost so `GET /health` is answered before any other middleware runs. +/// - `RequestLogLayer` is outermost so the latency it measures covers every other layer. +/// - `HealthLayer` sits inside it so probes complete before CORS/OHTTP. /// - `OhttpLayer` must sit OUTSIDE `CompressionLayer` so compression applies to the inner JSON-RPC /// response (the client's inner `Accept-Encoding` travels through BHTTP into jsonrpsee) rather /// than to the OHTTP ciphertext envelope. `MapRequestBodyLayer`/`MapResponseBodyLayer` keep @@ -49,6 +50,7 @@ pub const OHTTP_JSONRPSEE_BODY_BUILDER: fn(Full) -> HttpBody = HttpBody:: macro_rules! prover_http_middleware { ($cors_layer:expr, $ohttp_layer:expr $(,)?) => { ServiceBuilder::new() + .layer(RequestLogLayer) .layer(HealthLayer) .option_layer($cors_layer) .layer(MapRequestBodyLayer::new(HttpBody::new)) @@ -65,11 +67,13 @@ pub mod health; pub mod log_redact; #[cfg(test)] pub mod mock_rpc; +pub mod request_log; pub mod rpc_api; pub mod rpc_impl; pub mod tls; pub use health::{HealthLayer, HEALTH_PATH}; +pub use request_log::{RequestLogLayer, REQUEST_ID_HEADER}; #[cfg(test)] mod rpc_spec_test; diff --git a/crates/starknet_transaction_prover/src/server/request_log.rs b/crates/starknet_transaction_prover/src/server/request_log.rs new file mode 100644 index 00000000000..428eb9715f2 --- /dev/null +++ b/crates/starknet_transaction_prover/src/server/request_log.rs @@ -0,0 +1,165 @@ +//! tower middleware that logs one structured line per HTTP request and +//! propagates a request id. +//! +//! This is the outermost, envelope-level layer: it emits one log line with +//! `event="http_request"`, `request_id`, `method`, `path`, `status`, and +//! `latency_ms` per HTTP request, and echoes `request_id` on the response so +//! callers can quote it. The id is accepted from the incoming `x-request-id` +//! header or generated as a UUID v4. +//! +//! It deliberately does NOT bind the id to a span covering the downstream +//! dispatch. For OHTTP traffic this layer runs on the *outer* envelope, whose +//! id is visible to the relay (echoed on the ciphertext response). Propagating +//! that id into the logs describing the *decapsulated* contents would create a +//! join key linking the relay's view (who) to the gateway's view (what), +//! defeating OHTTP unlinkability. Content-level correlation — a fresh, +//! envelope-unlinkable id bound below the OHTTP layer — is added by a +//! follow-up layer. +//! +//! Body bytes are never inspected — transaction calldata is private user data +//! per the privacy-pool threat model. + +use std::pin::Pin; +use std::task::{Context, Poll}; +use std::time::Instant; + +use http::{HeaderValue, Request, Response}; +use jsonrpsee::server::HttpBody; +use tower::{Layer, Service}; +use tracing::info; + +#[cfg(test)] +#[path = "request_log_test.rs"] +mod request_log_test; + +/// HTTP header carrying the request id. +pub const REQUEST_ID_HEADER: &str = "x-request-id"; + +/// Cap on accepted incoming request-id length. Anything longer is dropped +/// in favour of a freshly generated id so the value never balloons into +/// AsyncLocalStorage / tracing fields and so log aggregators don't have +/// to parse megabyte-scale ids. +const MAX_REQUEST_ID_LEN: usize = 128; + +/// Cap on the logged request path. The URI is attacker-controlled and this +/// layer is outermost, so an over-long path would bloat every log line; it is +/// truncated for logging only (the request itself is untouched). +const MAX_LOG_PATH_LEN: usize = 256; + +pub(crate) fn new_request_id() -> String { + uuid::Uuid::new_v4().to_string() +} + +/// Truncates an over-long path on a char boundary for safe logging. +fn truncated_log_path(path: &str) -> String { + if path.len() <= MAX_LOG_PATH_LEN { + return path.to_string(); + } + let mut end = MAX_LOG_PATH_LEN; + while !path.is_char_boundary(end) { + end -= 1; + } + format!("{}…(truncated)", &path[..end]) +} + +/// Accepts the incoming `x-request-id` only when it's a short printable +/// ASCII token. CR/LF would let a client smuggle headers into the +/// response; arbitrary bytes (including unicode) make the value unsafe +/// to round-trip through `HeaderValue::from_str`. Any reject falls back +/// to a freshly generated UUID v4. +pub(crate) fn extract_or_generate_request_id(request: &Request) -> String { + request + .headers() + .get(REQUEST_ID_HEADER) + .and_then(|value| value.to_str().ok()) + .filter(|value| !value.is_empty() && value.len() <= MAX_REQUEST_ID_LEN) + .filter(|value| value.bytes().all(is_safe_request_id_byte)) + .map(|value| value.to_string()) + .unwrap_or_else(new_request_id) +} + +fn is_safe_request_id_byte(byte: u8) -> bool { + // Reject whitespace/CR/LF/NUL/control bytes so the id can't smuggle headers + // into the response or break structured-log parsers. + byte.is_ascii_graphic() +} + +/// tower [`Layer`] producing [`RequestLogService`]. +#[derive(Clone, Copy, Default)] +pub struct RequestLogLayer; + +impl Layer for RequestLogLayer { + type Service = RequestLogService; + + fn layer(&self, inner: S) -> Self::Service { + RequestLogService { inner } + } +} + +#[derive(Clone)] +pub struct RequestLogService { + inner: S, +} + +impl Service> for RequestLogService +where + S: Service, Response = Response>, + S::Future: Send + 'static, + S::Error: Send + 'static, +{ + type Response = Response; + type Error = S::Error; + type Future = + Pin> + Send>>; + + fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { + self.inner.poll_ready(cx) + } + + fn call(&mut self, mut request: Request) -> Self::Future { + let request_id = extract_or_generate_request_id(&request); + if let Ok(header_value) = HeaderValue::from_str(&request_id) { + request.headers_mut().insert(REQUEST_ID_HEADER, header_value); + } + let method = request.method().clone(); + let path = truncated_log_path(request.uri().path()); + let start = Instant::now(); + + let future = self.inner.call(request); + + Box::pin(async move { + let result = future.await; + let latency_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX); + match result { + Ok(mut response) => { + let status = response.status().as_u16(); + if let Ok(header_value) = HeaderValue::from_str(&request_id) { + response.headers_mut().insert(REQUEST_ID_HEADER, header_value); + } + info!( + event = "http_request", + request_id = %request_id, + method = %method, + path = %path, + status = status, + latency_ms = latency_ms, + "HTTP request handled." + ); + Ok(response) + } + Err(err) => { + info!( + event = "http_request", + request_id = %request_id, + method = %method, + path = %path, + latency_ms = latency_ms, + outcome = "service_error", + "HTTP request failed in tower stack." + ); + Err(err) + } + } + }) + } +} diff --git a/crates/starknet_transaction_prover/src/server/request_log_test.rs b/crates/starknet_transaction_prover/src/server/request_log_test.rs new file mode 100644 index 00000000000..d2445864e52 --- /dev/null +++ b/crates/starknet_transaction_prover/src/server/request_log_test.rs @@ -0,0 +1,120 @@ +use bytes::Bytes; +use http::{Method, Request, Response, StatusCode}; +use http_body_util::{BodyExt, Full}; +use jsonrpsee::server::HttpBody; +use tower::{Layer, ServiceExt}; + +use crate::server::request_log::{RequestLogLayer, REQUEST_ID_HEADER}; + +fn echo_request_id_service() -> impl tower::Service< + Request, + Response = Response, + Error = std::convert::Infallible, + Future = futures::future::Ready, std::convert::Infallible>>, +> + Clone { + tower::service_fn(|req: Request| { + let id = req + .headers() + .get(REQUEST_ID_HEADER) + .map(|value| value.to_str().unwrap().to_string()) + .unwrap_or_default(); + let response = Response::builder() + .status(StatusCode::OK) + .body(HttpBody::new(Full::new(Bytes::from(id)))) + .expect("static body is infallible"); + futures::future::ready(Ok::<_, std::convert::Infallible>(response)) + }) +} + +fn request_with_header(value: Option<&str>) -> Request { + let mut builder = Request::builder().method(Method::POST).uri("/"); + if let Some(value) = value { + builder = builder.header(REQUEST_ID_HEADER, value); + } + builder.body(HttpBody::new(Full::new(Bytes::new()))).expect("static body is infallible") +} + +async fn read_body(response: Response) -> (Vec, http::HeaderMap) { + let (parts, body) = response.into_parts(); + let bytes = body.collect().await.expect("body collect").to_bytes().to_vec(); + (bytes, parts.headers) +} + +#[tokio::test] +async fn echoes_supplied_request_id_on_response() { + let svc = RequestLogLayer.layer(echo_request_id_service()); + + let response = svc.oneshot(request_with_header(Some("client-supplied-id"))).await.unwrap(); + + let (body, headers) = read_body(response).await; + assert_eq!(headers.get(REQUEST_ID_HEADER).unwrap(), "client-supplied-id"); + assert_eq!(String::from_utf8(body).unwrap(), "client-supplied-id"); +} + +#[tokio::test] +async fn generates_request_id_when_absent_and_echoes_it() { + let svc = RequestLogLayer.layer(echo_request_id_service()); + + let response = svc.oneshot(request_with_header(None)).await.unwrap(); + + let (body, headers) = read_body(response).await; + let header_id = headers.get(REQUEST_ID_HEADER).unwrap().to_str().unwrap(); + let body_id = String::from_utf8(body).unwrap(); + // Body is what the inner service saw — they must match (the layer + // rewrote the request header before forwarding). + assert_eq!(header_id, body_id); + // Canonical UUID v4: 8-4-4-4-12 hex with hyphens. + assert!(uuid::Uuid::parse_str(header_id).is_ok(), "expected a UUID, got {header_id:?}"); + assert_eq!(uuid::Uuid::parse_str(header_id).unwrap().get_version_num(), 4); +} + +#[tokio::test] +async fn drops_non_ascii_incoming_id_and_generates_a_fresh_one() { + let mut request = request_with_header(None); + request + .headers_mut() + .insert(REQUEST_ID_HEADER, http::HeaderValue::from_bytes(b"\xff\xfe").unwrap()); + + let svc = RequestLogLayer.layer(echo_request_id_service()); + let response = svc.oneshot(request).await.unwrap(); + + let (_body, headers) = read_body(response).await; + let header_id = headers.get(REQUEST_ID_HEADER).unwrap().to_str().unwrap(); + assert!(uuid::Uuid::parse_str(header_id).is_ok(), "should have generated a fresh UUID"); +} + +#[tokio::test] +async fn drops_request_id_containing_whitespace() { + // CRLF in header values is rejected by the http crate itself at parse + // time, so the residual concern is whitespace and other ASCII bytes + // that would confuse log parsers if echoed verbatim into structured + // fields. + for hostile in ["with space", "tab\there", "leading space "] { + let mut request = request_with_header(None); + request + .headers_mut() + .insert(REQUEST_ID_HEADER, http::HeaderValue::from_bytes(hostile.as_bytes()).unwrap()); + let svc = RequestLogLayer.layer(echo_request_id_service()); + let response = svc.oneshot(request).await.unwrap(); + let (_body, headers) = read_body(response).await; + let header_id = headers.get(REQUEST_ID_HEADER).unwrap().to_str().unwrap(); + assert!( + uuid::Uuid::parse_str(header_id).is_ok(), + "expected fresh UUID for hostile input {hostile:?}, got {header_id:?}", + ); + } +} + +#[tokio::test] +async fn drops_oversize_request_id() { + let huge = "a".repeat(2048); + let mut request = request_with_header(None); + request + .headers_mut() + .insert(REQUEST_ID_HEADER, http::HeaderValue::from_bytes(huge.as_bytes()).unwrap()); + let svc = RequestLogLayer.layer(echo_request_id_service()); + let response = svc.oneshot(request).await.unwrap(); + let (_body, headers) = read_body(response).await; + let header_id = headers.get(REQUEST_ID_HEADER).unwrap().to_str().unwrap(); + assert!(uuid::Uuid::parse_str(header_id).is_ok()); +} diff --git a/crates/starknet_transaction_prover/src/server/tls.rs b/crates/starknet_transaction_prover/src/server/tls.rs index 67950a808a5..67c737ec9ac 100644 --- a/crates/starknet_transaction_prover/src/server/tls.rs +++ b/crates/starknet_transaction_prover/src/server/tls.rs @@ -27,7 +27,7 @@ use tower_http::map_request_body::MapRequestBodyLayer; use tower_http::map_response_body::MapResponseBodyLayer; use tracing::warn; -use crate::server::{HealthLayer, OhttpJsonrpseeLayer}; +use crate::server::{HealthLayer, OhttpJsonrpseeLayer, RequestLogLayer}; /// Maximum time allowed for a TLS handshake before the connection is dropped. const TLS_HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(10); diff --git a/middleware_stack_alternatives.html b/middleware_stack_alternatives.html new file mode 100644 index 00000000000..a60a663e3ed --- /dev/null +++ b/middleware_stack_alternatives.html @@ -0,0 +1,307 @@ + + + + + +Middleware stack duplication — 3 alternatives + + + +
+ +

How to stop the middleware stack drifting

+

The same 7-layer tower chain is hand-copied in server.rs, tls.rs, and the integration test. Both PRs in the request-id stack had to edit all three copies in lockstep. Three ways to fix it.

+ + +
+ + The same 7-layer chain exists in three places today — any future edit must hit all three or silently drift + + + server.rs (HTTP path) + RequestLogLayer + HealthLayer + cors / MapReqBody / OhttpLayer + RequestSpanLayer + MapRespBody / Compression + + + tls.rs (HTTPS path) + RequestLogLayer + HealthLayer + cors / MapReqBody / OhttpLayer + RequestSpanLayer + MapRespBody / Compression + + + ohttp_integration_test.rs + RequestLogLayer + HealthLayer + cors / MapReqBody / OhttpLayer + RequestSpanLayer + MapRespBody / Compression + +
+ +
+

Why the type is unnameable (the root constraint)

+

+ ServiceBuilder::new().layer(A).layer(B).layer(C)… has a concrete type + ServiceBuilder<Stack<A, Stack<B, Stack<C, Identity>>>> that grows one level per + .layer() call. With 7 layers the type is 7 levels deep and spans ~180 chars. + Rust cannot write this as an ordinary function return type (impl ServiceBuilder<…> + is not valid — impl Trait works for trait objects, not concrete structs with embedded + generics). This forces the three alternatives below. +

+
+ + +
+ + +
+
+ Option A +

Leave duplicated + sync comment

+

No code change; add a comment cross-referencing the three copies

+
+
+
// server.rs — KEEP IN SYNC WITH tls.rs
+// and ohttp_integration_test.rs.
+ServiceBuilder::new()
+    .layer(RequestLogLayer)
+    .layer(HealthLayer)
+    ...
+
    +
  • Zero churn — nothing to break
  • +
  • Each site is readable inline
  • +
  • No Rust gymnastics
  • +
+
    +
  • Three edits required on every future layer change
  • +
  • HTTP / HTTPS can silently drift (has happened twice already in this stack)
  • +
  • Comment is ignored in review fatigue
  • +
+
+ +
+ + +
+
+ Option B — Recommended +

Local macro_rules! bundle

+

Define the chain once as a macro; call it in all three places

+
+
+
// defined once in server.rs
+macro_rules! prover_http_middleware {
+    ($cors:expr, $ohttp:expr) => {
+        ServiceBuilder::new()
+            .layer(RequestLogLayer)
+            .layer(HealthLayer)
+            .option_layer($cors)
+            .layer(MapRequestBodyLayer::new(HttpBody::new))
+            .option_layer($ohttp)
+            .layer(RequestSpanLayer)
+            .layer(MapResponseBodyLayer::new(HttpBody::new))
+            .layer(CompressionLayer::new())
+    };
+}
+
+// server.rs
+.set_http_middleware(prover_http_middleware!(cors, ohttp))
+.build(&addr)
+
+// tls.rs
+.set_http_middleware(prover_http_middleware!(cors, ohttp))
+.to_service_builder()
+
+// integration test
+prover_http_middleware!(None::<CorsLayer>, Some(ohttp))
+    .service(tower::service_fn(echo))
+
    +
  • One source of truth for layer order
  • +
  • No type-naming problem — macros expand inline
  • +
  • Layer-order comment belongs on the macro, once
  • +
  • A future layer(NewLayer) touches one line
  • +
+
    +
  • Introduces a macro (unusual in this codebase)
  • +
  • Stack trace / IDE hover shows expanded type, not a named fn
  • +
  • Macro hygiene edge cases (benign here — only concrete types, no metavariables in types)
  • +
+
+ +
+ + +
+
+ Option C +

Helper fn (service-in, service-out)

+

Flip the API: take the inner service, return the fully-wired service

+
+
+
// Takes the inner jsonrpsee service (already built)
+// and wraps the 7-layer chain around it.
+fn wrap_with_prover_middleware<S>(
+    inner: S,
+    cors:  Option<CorsLayer>,
+    ohttp: Option<OhttpJsonrpseeLayer>,
+) -> impl Service<…>
+where S: Service<…> + Clone + Send + 'static,
+      S::Future: Send + 'static,
+{
+    ServiceBuilder::new()
+        .layer(RequestLogLayer)
+        // ... 6 more layers ...
+        .service(inner)
+}
+
    +
  • Ordinary function — no macro
  • +
  • Works with impl Service return
  • +
  • Single definition of layer order
  • +
+
    +
  • jsonrpsee's ServerBuilder::set_http_middleware takes a ServiceBuilder, not a wired service — the boundary doesn't fit
  • +
  • The inner service type varies by call site; requires heavy where bounds or boxing
  • +
  • Loses the .build(&addr) vs .to_service_builder() callsite difference (both paths still need different outer wrappers)
  • +
  • More complex signature than the macro for the same result
  • +
+
+ +
+ +
+ + +

Side-by-side

+ + + + + + + + + + + + + + + + +
PropertyA — leave duplicatedB — macro ✓C — helper fn
Single source of layer ordernoyesyes
Drift between HTTP/HTTPS prevented by constructionnoyespartial
Future layer change: lines to edit3 (all copies)1 (macro body)1 (fn body)
Fits jsonrpsee set_http_middleware APIyesyes (expands inline)requires rework
Rust type complexitynonenone (macro hides it)complex where bounds
New language construct introducednonelocal macro_rules!plain fn
Layer-order comment: one place?must repeat or cross-refyes (on macro)yes (on fn)
+ + +

What the macro diff looks like

+

Define the macro at the top of server.rs (just below the module declarations), then replace both production call sites and update the integration test:

+ +
// server.rs — add before start_server
+/// Builds the shared HTTP middleware stack. Both the HTTP and HTTPS entry
+/// points use this chain; the layer order here is the single source of truth.
+/// `RequestSpanLayer` must sit BELOW `OhttpLayer` — see `request_span` for why.
+macro_rules! prover_http_middleware {
+    ($cors_layer:expr, $ohttp_layer:expr) => {
+        ::tower::ServiceBuilder::new()
+            .layer(RequestLogLayer)
+            .layer(HealthLayer)
+            .option_layer($cors_layer)
+            .layer(::tower_http::map_request_body::MapRequestBodyLayer::new(HttpBody::new))
+            .option_layer($ohttp_layer)
+            .layer(RequestSpanLayer)
+            .layer(::tower_http::map_response_body::MapResponseBodyLayer::new(HttpBody::new))
+            .layer(::tower_http::compression::CompressionLayer::new())
+    };
+}
+
+// server.rs — inside start_server, HTTP arm (replace the ServiceBuilder block)
+-               .set_http_middleware(
+-                   // `RequestLogLayer` outermost … (7-line comment)
+-                   ServiceBuilder::new()
+-                       .layer(RequestLogLayer)
+-                       … 6 more lines …
+-               )
++               .set_http_middleware(prover_http_middleware!(cors_layer, ohttp_layer))
+
+// tls.rs — replace the identical block
+-           .set_http_middleware(
+-               // See server.rs for the layer-order rationale.
+-               ServiceBuilder::new()
+-               … same 7 layers …
+-           )
++           .set_http_middleware(prover_http_middleware!(cors_layer, ohttp_layer))
+// (use crate::server::prover_http_middleware! in tls.rs)
+
+// ohttp_integration_test.rs — replace the test ServiceBuilder
+-   let mut svc = tower::ServiceBuilder::new()
+-       .option_layer(None::<CorsLayer>)
+-       .layer(MapRequestBodyLayer::new(HttpBody::new))
+-       … 5 more layers …
+-       .service(…);
++   let mut svc = prover_http_middleware!(None::<CorsLayer>, Some(ohttp_layer))
++       .service(tower::service_fn(jsonrpsee_echo_service));
+ +
+

Net result

+

The 7-layer chain lives in exactly one place (the macro body). Adding or removing a layer in a future PR is a 1-line change instead of 3 coordinated edits across files. The only thing each call site specifies is its two variable inputs (cors_layer and ohttp_layer). The integration test comment "Must be kept in sync with those files" disappears entirely — it's structurally enforced instead.

+
+ + + +
+ + diff --git a/request_id_pr757_fix.html b/request_id_pr757_fix.html new file mode 100644 index 00000000000..f433d4f671d --- /dev/null +++ b/request_id_pr757_fix.html @@ -0,0 +1,250 @@ + + + + + +PR #757 — the OHTTP request-id inconsistency & the fix + + + +
+ +

PR #757 — where the request-id splits, and the one-line fix

+

Two ids exist per OHTTP request. Today the logs and the error body disagree about which one to use — and the one the logs pick is the wrong one.

+ +
+

The two ids, named once

+

R_out — the outer id. Assigned to the encrypted envelope the relay delivers. It is echoed back to the relay on the outer response (so the relay can see it).

+

R_in — the inner id. A fresh UUID assigned to the decrypted request after the envelope is opened. It is never sent to the relay.

+

Keeping these two unrelated is the whole point: the relay knows R_out → who; the gateway knows R_in → what. No shared id ⇒ nobody can link who-to-what.

+
+ +

1. How the middleware is wired

+

The server has an outer stack (every request passes through it), and the OHTTP path adds an inner stack that only runs after an envelope is decrypted. The critical detail: the outer request_span wraps everything, including the OHTTP step.

+ +
+ + + + OUTER stack — wraps every request (span bound to R_out) + + + + sanitize_inbound_request_id + SetRequestIdLayer → assigns R_out + PropagateRequestIdLayer (echoes R_out) + request_span → span(R_out) + access_log + + + + + router · plaintext request → handler directly + router · unmatched (OHTTP envelope) → fallback ↓ + + + + span(R_out) instruments + the WHOLE downstream — + including the OHTTP fallback ↘ + + + + OHTTP fallback → inner_router + + OhttpLayer: decrypt envelope + strip_inbound_request_id + SetRequestIdLayer → assigns R_in + ‹ no request_span here ›nothing rebinds the span to R_in + inner handler (debug! logs) + + + + + +
The outer request_span (amber) opens a span carrying R_out and instruments everything below it — so when the OHTTP fallback (cyan) runs the inner request, that work is still inside the R_out span. The inner stack assigns R_in but never opens its own span.
+
+ +

2. What happens to an OHTTP request today (the bug)

+

Follow a single decrypted request through the inner stack and watch which id ends up where:

+ +
+ + + + ⟦ everything below runs inside span(R_out) — opened by the OUTER request_span ⟧ + + + + inner SetRequestIdLayer + sets RequestId extension = R_in + + + + inner handler + • debug!(…) log line + • on error → reads RequestId extension +   for the JSON error body + + + + LOG line (inherits the span) + { "request_id": "R_out", "message": … } + + + ERROR body (reads the extension) + { "error": { "request_id": "R_in" } } + + + + + span field → + extension → + + + + Same request, two different ids: logs say R_out, error body says R_in. + A client who reports an error quoting R_in cannot be found in the logs (which are under R_out). + + + + + +
The handler’s debug! log inherits the surrounding span, so it is stamped R_out. But the error body is built from the RequestId extension, which is R_in. The two never agree.
+
+ +

3. Why R_out in the logs is worse than just “inconsistent”

+

R_out is the id the gateway hands back to the relay. So tagging content logs with R_out gives the relay and the gateway a shared key:

+ +
+ + + Relay’s records + R_out → client = Alice + (relay received R_out on the outer response) + + + Gateway content logs (today) + R_out → "synced 12 notes for …" + (handler debug! inherited R_out) + + + R_out=R_out + + + Join on R_out ⇒ “Alice synced those notes.” + exactly the who↔what link OHTTP exists to prevent + + + + + + +
Because the relay-visible R_out also tags the content logs, anyone holding both log sets can join them on R_out. R_in would not be joinable — it never leaves the gateway.
+
+ +

4. The fix — give the inner stack its own span

+

Add one layer: an inner request_span, placed so it runs after SetRequestIdLayer (so R_in already exists) and around the handler (so its logs inherit the new span). In axum the last .layer() is outermost, so the new line goes first:

+ +
let inner_router = api_router
+    .clone()
+    // Innermost: bind the span to the fresh inner id so decapsulated
+    // handler logs carry R_in, not the inherited relay-visible R_out.
+    .layer(axum::middleware::from_fn(request_span::request_span))
+    .layer(SetRequestIdLayer::new(request_id_header.clone(), MakeRequestUuid))
+    .layer(axum::middleware::from_fn(strip_inbound_request_id));
+ +

Resulting execution order on a decrypted request:

+
strip_inbound_request_id        drop any client-supplied inner id
+  → SetRequestIdLayer           assign fresh R_in   (extension)
+    → request_span              open span(R_in)     ← NEW
+      → handler                 debug! now inherits R_in
+ +
+ + + ⟦ inner work now runs inside span(R_in) — opened by the NEW inner request_span ⟧ + + + inner handler + • debug!(…) log line + • error body ← RequestId extension + + + LOG line + { "request_id": "R_in", … } + + + ERROR body + { … "request_id": "R_in" } + + + + + + Logs and error body now both say R_in — consistent, and R_in never reaches the relay. + Relay still sees only R_out; content logs carry R_in ⇒ no shared key ⇒ no join. + + +
With an inner request_span bound to R_in, the handler’s logs inherit R_in instead of the outer R_out. Now the logs match the error body, and the relay-visible id never appears in content logs.
+
+ +

5. Before vs after, at a glance

+ + + + + + + + + + + + + +
For an OHTTP request…TodayWith the inner span
handler log idR_outR_in
error-body idR_inR_in
log ↔ error body match?noyes
relay-visible id in content logs?yes (R_out) — joinableno
plaintext requestunaffectedunaffected
+ +
+

One-line takeaway

+

The inner router assigns a fresh id (R_in) but never opens a span for it, so handler logs fall back to the outer span’s id (R_out). Adding an inner request_span between SetRequestIdLayer and the handler makes the logs use R_in — fixing both the id mismatch and the relay-linkability leak. Plaintext traffic is unchanged.

+
+ + + +
+ + diff --git a/request_id_scope.html b/request_id_scope.html new file mode 100644 index 00000000000..ea856534923 --- /dev/null +++ b/request_id_scope.html @@ -0,0 +1,287 @@ + + + + + +Request IDs through OHTTP — the prover's scope choice + + + +
+ +

Request IDs through OHTTP

+

Why the transaction prover tags only the “outside” of a privacy-wrapped request — and what that costs.

+ +
+

The one-sentence version

+

When a request is wrapped for privacy, the prover stamps a tracking number on the outer wrapper only. That number is great for us (operators reading logs) but the end user never sees it, so they can’t quote it back to us when something breaks.

+
+ +

1. What a request ID is (and why we want one)

+

Every time a client talks to our server, we’d like to label that single exchange with a unique tracking number — a request ID, e.g. a1b2c3d4…. It’s exactly like the tracking number on a parcel. Its whole purpose is correlation:

+
    +
  • The server writes one log line per request, stamped with the ID.
  • +
  • The server sends the same ID back to the client on the response.
  • +
  • If something goes wrong, the user says “request a1b2c3d4 failed” and we instantly find the matching log line.
  • +
+ +
+
+ + + + Client + (the user) + + + Prover + (our server) + + + request + + + response + + + id = a1b2c3d4 + + + + + + +
The simple case: one ID, stamped on the server’s log line and handed back to the user. Both sides can name the same exchange.
+
+
+ +

This is the “clean” case. The client and the server are talking directly, so the ID the server picks can travel straight back to the client. Everyone shares the same tracking number.

+ +

2. The privacy wrapper changes everything

+

Some of our traffic is deliberately designed so that nobody can link “who is asking” to “what they’re asking.” That’s the entire point of the privacy feature. To achieve it, the real request is sealed inside an encrypted wrapper before it leaves the user’s device, and it’s delivered by a middleman who is not allowed to read the contents.

+ +

Three plain-language terms, defined once:

+
+

Sealed wrapper — think of a locked, opaque box. The user puts their real request inside the box and locks it. Only our server has the key. (In the jargon this is the “outer envelope”; the real request inside is the “inner request.”)

+

The courier — a separate company that carries the locked box from the user to us. It can see the box and our address on the outside, but it cannot open it, so it never learns what’s inside or who it ultimately concerns. (The jargon calls this the “relay.”)

+

Why bother — the courier knows who sent the box but not what’s in it; our server learns what’s in it but the courier-hidden delivery means we don’t get to trivially tie it to the sender. Splitting that knowledge across two parties is the privacy guarantee.

+
+ +
+
+ + + + User + seals the box + + + Courier + can’t open it + + + Prover + has the key + + + + + + locked box + + + + + + + + + + + + unlocked here → + + real request + + + + + knows WHO, not WHAT + (sees only the sealed box) + + + + + + +
The user seals the real request in a locked box. The courier carries the box but can’t open it. Only the prover unlocks it.
+
+
+ +

3. Now: where does the prover put the tracking number?

+

Here is the design decision. When a privacy-wrapped request arrives, our server can stamp a tracking number in one of two places:

+ + + + + + + + + + + + + + + +
PlaceWhat it meansWho can read it
On the box
the outer wrapper
A number written on the outside of the locked box, and on the box’s delivery receipt that goes back to the courier.Us (in our logs) and the courier. Not the end user — they only ever see what comes out inside the box on the way back.
On the letter inside
the inner request
A number written on the actual request after we unlock the box, which can be folded into the sealed reply we send back inside the box.The end user (once they unlock our reply) and us.
+ +
+

What the prover does today

+

Only the box gets a number. The prover assigns the ID at the wrapper level, writes its one log line with it, and stamps it on the wrapper’s reply receipt. It never opens up the inner request to give that its own number. So the number exists, and we can see it — but it travels back only as far as the courier, never all the way into the user’s hands.

+
+ +
+
+ + + USER + COURIER + PROVER + + + + + + ID on the box + id = 7f3a… + + + + ✓ our log line + + + ✓ courier sees the box + + + ✗ user never sees it + + + the reply the user unlocks + carries no ID inside + + + + the ID can’t cross back into the sealed reply → user is left without it + + + + +
The prover’s ID lives on the box. It reaches our logs and the courier, but it stops at the courier — it’s never folded into the sealed reply the user opens.
+
+
+ ID is visible here + ID is absent here + where the ID is stamped +
+
+ +

4. So what’s the actual cost?

+

It depends entirely on who needs the tracking number.

+ +
+

Fine for: operators reading logs

+

If our own on-call engineer is debugging, the ID on the box is exactly what they need. There’s one clean log line per wrapped request, stamped with the ID. Nothing is missing for this use case — this is why we say the prover “handles it cleanly.”

+
+ +
+

Missing for: end users reporting an error through the privacy path

+

If a user behind the privacy wrapper hits an error and opens a support ticket, they have no tracking number to quote — the reply they unlocked never contained one. We can’t say “read us the ID from your error” because there isn’t one inside their reply. We’d have to correlate by timestamps and other weaker signals.

+
+ +

Why this isn’t a bug

+

Two reasons it’s a deliberate scope choice rather than a defect:

+
    +
  1. Nothing breaks. The way the prover is built, the code that handles the unlocked request simply doesn’t ask for a tracking number, so leaving the inner request un-numbered doesn’t cause any failure or empty field. It’s an absence, not a malfunction.
  2. +
  3. The privacy model actively discourages linking the two. If we did give the inner request a number and let it travel back to the user, we’d have to be very careful that this number can’t be used to re-link the user to the courier’s view of the box — the exact linkage the whole feature exists to prevent. A sibling service in the privacy codebase that does number the inner request goes out of its way to use a fresh, unrelated number inside and never echo it on the box, precisely to preserve that separation.
  4. +
+ +

5. The decision in one table

+ + + + + + + + +
GoalPlain (no wrapper) requestPrivacy-wrapped request
Operator finds the log line by IDworksworks
ID handed back to a direct callerworksreaches courier only
End user can quote an ID from their errorworksnot today
Per-step logs inside the unlocked request tied to an IDn/anot today
+ +
+

If end-user-visible IDs through the privacy path become a requirement

+

We’d add a second numbering step that runs after the box is unlocked, giving the inner request its own fresh ID and folding it into the sealed reply — while taking the same care the sibling service does to keep that inner number unlinkable from the box. Until that’s a stated product need, the current “number the box only” design is the simpler, privacy-safe default.

+
+ + + +
+ + diff --git a/request_id_tradeoff.html b/request_id_tradeoff.html new file mode 100644 index 00000000000..8600831b0c4 --- /dev/null +++ b/request_id_tradeoff.html @@ -0,0 +1,186 @@ + + + + + +Request-ID through OHTTP — the privacy ↔ observability tradeoff + + + +
+ +

The request-ID tradeoff through OHTTP

+

Why you can’t have both end-to-end log correlation and unlinkability — and what each choice costs.

+ +
+

The tension in one sentence

+

A request-ID is useful precisely because it links things together. But OHTTP’s entire job is to stop two specific things — the sender’s identity and the request’s contents — from being linked. So any ID that touches both sides is, by definition, working against the privacy guarantee.

+
+ +

1. The cast (recap, no jargon)

+
+

The user seals their real request in a locked, opaque box only our server can open.

+

The courier carries the box. It sees who sent it and our address, but can’t open it — so it never learns what’s inside. (In the specs: the “relay”.)

+

The server / gateway (our prover) unlocks the box and processes the real request. It sees what was asked, but the box arrived via the courier, so it doesn’t learn who the user is.

+

The guarantee: knowledge is split. The courier knows who, not what. The gateway knows what, not who. Neither alone can say “user X asked for Y.”

+
+

A request-ID is a tracking number we stamp on a request so log lines about it can be found and tied together. The question: which tracking number goes where — and does it accidentally bridge the courier’s knowledge and the gateway’s knowledge?

+ +

2. What each party can see

+

Two facts drive everything below:

+
    +
  • The ID on the box is visible to the courier. Our server sits “outside” the unlock step, so the ID it stamps is written on the box and echoed on the box’s receipt — which the courier handles. The courier can record it.
  • +
  • Our server’s logs are not visible to the courier — but they describe the contents of the unlocked request.
  • +
+

So an ID becomes dangerous only if the same value appears in both the courier’s records (where it sits next to “who”) and our content logs (where it sits next to “what”). That shared value is a join key.

+ +

3. Design A — one shared ID everywhere (what the recent change does)

+

The server generates one ID for the box and reuses it to tag every log line, including the logs that describe the unlocked contents.

+ +
+ + + + Courier’s records + id = R7 → user = Alice (IP, time) + knows WHO · sees the box’s ID on the wire + + + Gateway content logs + id = R7 → request = “prove tx 0x…” + knows WHAT · stamps the same ID on contents + + + R7 = R7 + + + Join on R7 → + “Alice asked to prove tx 0x…” — unlinkability broken + + + + + + +
One ID (R7) sits next to “who” in the courier’s records and next to “what” in our logs. Anyone holding both sets just matches the number.
+
+ +

What Design A buys

+

Perfect operator correlation. A single ID threads the envelope log line and every downstream content log. On-call sees “request R7” and pulls the whole story in one query. This is the observability win the wish-list asked for.

+ +

What Design A costs

+

A trivial join key across the privacy boundary. The same R7 is courier-visible (next to “who”) and in our content logs (next to “what”). Anyone who obtains both log sets — relay+gateway collusion, or a breach of both — links identity to contents by exact match, no guesswork. That is precisely the link OHTTP exists to prevent.

+ +

4. Design B — fresh inner ID (what #757 does)

+

The server keeps the box’s ID for the courier-facing receipt, but the moment it unlocks the box it mints a second, unrelated ID for the contents. The contents logs use only the second ID, which is never written back onto the box.

+ +
+ + + Courier’s records + id = R7 → user = Alice + sees the box ID R7 only + + + Gateway content logs + id = Q3 → request = “prove tx 0x…” + fresh ID Q3, never written on the box + + + R7 ≠ Q3 + no shared value to match on + + + Both log sets together → + still can’t say which user asked what + + + +
The courier’s ID (R7) and the contents ID (Q3) are unrelated. Holding both log sets no longer yields an exact join.
+
+ +

What Design B buys

+

The privacy guarantee holds even under dual compromise. With no shared ID, linking “who” to “what” drops back to fuzzy timing/size correlation — the hard problem OHTTP was designed to leave as the only avenue.

+ +

What Design B costs

+

Broken end-to-end correlation for sealed traffic. The envelope log line carries R7; the contents logs carry Q3. For boxed (OHTTP) requests, an operator cannot join “the request arrived” to “here’s what it did” by ID — those two halves are deliberately disconnected. If a user reports a problem quoting the ID they saw, it maps to only one half of the story.

+ +

5. Why you genuinely can’t have both

+
+

End-to-end correlation means: one identifier that lets you follow a request from the courier’s edge all the way into the contents.

+

Unlinkability means: no identifier should connect the courier’s edge (who) to the contents (what).

+

These are the same sentence with the sign flipped. A correlation ID that survives across the unlock boundary is a linkability handle. So the only real choices are where you cut the thread — and Design A simply never cuts it, while Design B cuts it exactly at the box.

+
+ +

6. Plaintext traffic is unaffected

+

All of the above applies only to boxed (OHTTP) requests. A plaintext request has no courier and no box — the user talks to us directly, so there’s no “who/what” split to protect. There, a single ID end-to-end (logged and echoed back to the caller) is exactly right and costs nothing. Both designs keep that behavior; they differ only once a box is involved.

+ +

7. Side-by-side

+ + + + + + + + + + +
PropertyA — shared ID (current)B — fresh inner ID (#757)
Plaintext: caller-quotable ID + full log correlationyesyes
OHTTP: downstream content logs carry an IDyes (R7)yes (Q3)
OHTTP: operator can join envelope ↔ contents by IDyesno (by design)
OHTTP: courier-visible ID absent from content logsno — sharedyes — severed
Resists who↔what join under dual-log compromiseno — exact matchyes — only fuzzy correlation left
+
+ protects privacy / preserves utility + weakens the guarantee + accepted cost +
+ +

8. How strong is the threat, really?

+

Design A’s leak requires an attacker to hold both the courier’s records and the gateway’s logs at once — normally run by different operators, which is the whole structural point of OHTTP. So it’s a “collusion or double-breach” threat, not a single-party one. But two things make it worth fixing:

+
    +
  • A shared ID turns that scenario from hard (statistical timing/size correlation) into trivial (exact key match). Defense-in-depth means not handing out the easy join.
  • +
  • This is a privacy product whose core promise is unlinkability, and the sibling service (#757) already chose B. Matching it keeps the guarantee consistent across services.
  • +
+ +

Bottom line for the decision

+

If unlinkability under dual-compromise is in scope (it is for a privacy product), choose B and accept that, for boxed traffic, the envelope and contents logs can’t be joined by ID. If the relay is considered trusted and operator convenience dominates, A is simpler — but that should be an explicit, written threat-model decision, not a default.

+ + + +
+ +