Skip to content

Commit 186e4cf

Browse files
avi-starkwareclaude
andcommitted
starknet_transaction_prover: Prometheus /metrics endpoint with build_info
Adds `MetricsLayer` that short-circuits `GET /metrics` with a Prometheus text-format scrape, and an `install_exporter` helper that registers the global recorder and emits `prover_build_info` with version + git_sha labels. Wired alongside `HealthLayer` so the scrape path bypasses the JSON-RPC parser. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent c31ce65 commit 186e4cf

8 files changed

Lines changed: 192 additions & 6 deletions

File tree

Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/starknet_transaction_prover/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ http.workspace = true
2828
http-body-util.workspace = true
2929
indexmap.workspace = true
3030
jsonrpsee = { workspace = true, features = ["macros", "server"] }
31+
metrics.workspace = true
32+
metrics-exporter-prometheus.workspace = true
3133
privacy-prove = { workspace = true, optional = true }
3234
rand.workspace = true
3335
reqwest = { workspace = true, features = ["json"] }

crates/starknet_transaction_prover/src/main.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,13 @@ async fn main() -> anyhow::Result<()> {
2222
};
2323
use starknet_transaction_prover::server::cors::{build_cors_layer, cors_mode};
2424
use starknet_transaction_prover::server::log_redact::redact_url_host;
25+
use starknet_transaction_prover::server::metrics::install_exporter;
2526
use starknet_transaction_prover::server::panic::install_panic_hook;
2627
use starknet_transaction_prover::server::rpc_api::ProvingRpcServer;
2728
use starknet_transaction_prover::server::rpc_impl::ProvingRpcServerImpl;
2829
use starknet_transaction_prover::server::{
2930
start_server,
31+
MetricsLayer,
3032
OhttpJsonrpseeLayer,
3133
OHTTP_JSONRPSEE_BODY_BUILDER,
3234
};
@@ -55,6 +57,13 @@ async fn main() -> anyhow::Result<()> {
5557

5658
let config = ServiceConfig::from_args(args)?;
5759

60+
// Install Prometheus exporter and emit `prover_build_info` before binding
61+
// so a scrape during slow startup still returns the build identity.
62+
let prometheus_handle =
63+
install_exporter(env!("CARGO_PKG_VERSION"), option_env!("GIT_SHA").unwrap_or("unknown"))
64+
.context("Failed to install Prometheus exporter")?;
65+
let metrics_layer = Some(MetricsLayer::new(prometheus_handle));
66+
5867
// Startup banner — version + chain id + redacted RPC host only. No URLs
5968
// with userinfo, no fee token address, no TLS paths, no tx data.
6069
info!(
@@ -101,6 +110,7 @@ async fn main() -> anyhow::Result<()> {
101110
config.max_request_body_size,
102111
cors_layer,
103112
ohttp_layer,
113+
metrics_layer,
104114
)
105115
.await?;
106116

crates/starknet_transaction_prover/src/server.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ pub mod cors;
3131
pub mod errors;
3232
pub mod health;
3333
pub mod log_redact;
34+
pub mod metrics;
3435
#[cfg(test)]
3536
pub mod mock_rpc;
3637
pub mod panic;
@@ -40,6 +41,7 @@ pub mod rpc_impl;
4041
pub mod tls;
4142

4243
pub use health::{HealthLayer, HEALTH_PATH};
44+
pub use metrics::{MetricsLayer, METRICS_PATH};
4345
pub use request_log::{RequestLogLayer, REQUEST_ID_HEADER};
4446

4547
#[cfg(test)]
@@ -50,6 +52,7 @@ mod rpc_spec_test;
5052
mod ohttp_integration_test;
5153

5254
/// Starts the JSON-RPC server in either HTTP or HTTPS mode depending on the transport.
55+
#[allow(clippy::too_many_arguments)]
5356
pub async fn start_server(
5457
addr: SocketAddr,
5558
transport: &TransportMode,
@@ -58,6 +61,7 @@ pub async fn start_server(
5861
max_request_body_size: u32,
5962
cors_layer: Option<CorsLayer>,
6063
ohttp_layer: Option<OhttpJsonrpseeLayer>,
64+
metrics_layer: Option<MetricsLayer>,
6165
) -> anyhow::Result<(SocketAddr, ServerHandle)> {
6266
match transport {
6367
TransportMode::Http => {
@@ -83,11 +87,12 @@ pub async fn start_server(
8387
// non-OHTTP requests still stream through unbuffered.
8488
.set_http_middleware(
8589
// `RequestLogLayer` is outermost so the latency it measures
86-
// covers every other layer. `HealthLayer` sits inside it so
87-
// probes complete before CORS/OHTTP.
90+
// covers every other layer. `HealthLayer` and `MetricsLayer`
91+
// sit inside it so probes/scrapes complete before CORS/OHTTP.
8892
ServiceBuilder::new()
8993
.layer(RequestLogLayer)
9094
.layer(HealthLayer)
95+
.option_layer(metrics_layer)
9196
.option_layer(cors_layer)
9297
.layer(MapRequestBodyLayer::new(HttpBody::new))
9398
.option_layer(ohttp_layer)
@@ -111,6 +116,7 @@ pub async fn start_server(
111116
max_request_body_size,
112117
cors_layer,
113118
ohttp_layer,
119+
metrics_layer,
114120
)
115121
.await
116122
}
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
//! Prometheus `/metrics` endpoint as a tower middleware layer.
2+
//!
3+
//! Short-circuits `GET /metrics` ahead of jsonrpsee so scrapes never run
4+
//! through the JSON-RPC parser. Label cardinality is bounded by the
5+
//! enumerations in [`names`] — no user-controlled values become labels.
6+
7+
use std::task::{Context, Poll};
8+
9+
use bytes::Bytes;
10+
use futures::future::{ready, Either, Ready};
11+
use http::{header, Method, Request, Response, StatusCode};
12+
use http_body_util::Full;
13+
use jsonrpsee::server::HttpBody;
14+
use metrics_exporter_prometheus::{PrometheusBuilder, PrometheusHandle};
15+
use tower::{Layer, Service};
16+
17+
#[cfg(test)]
18+
#[path = "metrics_test.rs"]
19+
mod metrics_test;
20+
21+
/// Path served by [`MetricsLayer`].
22+
pub const METRICS_PATH: &str = "/metrics";
23+
24+
/// Metric name constants. Kept here so `metrics!` invocations elsewhere link
25+
/// to a single definition instead of bare string literals.
26+
pub mod names {
27+
/// Build identity. Value is always 1; labels carry version + git_sha.
28+
pub const BUILD_INFO: &str = "prover_build_info";
29+
/// Requests rejected because the concurrency semaphore was full.
30+
pub const CONCURRENCY_REJECTED_TOTAL: &str = "prover_concurrency_rejected_total";
31+
}
32+
33+
/// Initializes the global Prometheus exporter and emits the `build_info`
34+
/// gauge. Returns the handle used by [`MetricsLayer`] to render the scrape
35+
/// response.
36+
///
37+
/// Should be called exactly once at startup. The handle is cheap to clone
38+
/// (it wraps an `Arc`).
39+
pub fn install_exporter(version: &str, git_sha: &str) -> anyhow::Result<PrometheusHandle> {
40+
let handle = PrometheusBuilder::new()
41+
.install_recorder()
42+
.map_err(|err| anyhow::anyhow!("failed to install prometheus recorder: {err}"))?;
43+
metrics::gauge!(
44+
names::BUILD_INFO,
45+
"version" => version.to_string(),
46+
"git_sha" => git_sha.to_string(),
47+
)
48+
.set(1.0);
49+
// Pre-register the counter at 0 so it shows up in scrapes before the
50+
// first rejection — dashboards relying on `rate(...) > 0` need the
51+
// series to exist.
52+
metrics::counter!(names::CONCURRENCY_REJECTED_TOTAL).increment(0);
53+
Ok(handle)
54+
}
55+
56+
/// tower [`Layer`] that intercepts `GET /metrics`.
57+
#[derive(Clone)]
58+
pub struct MetricsLayer {
59+
handle: PrometheusHandle,
60+
}
61+
62+
impl MetricsLayer {
63+
pub fn new(handle: PrometheusHandle) -> Self {
64+
Self { handle }
65+
}
66+
}
67+
68+
impl<S> Layer<S> for MetricsLayer {
69+
type Service = MetricsService<S>;
70+
71+
fn layer(&self, inner: S) -> Self::Service {
72+
MetricsService { inner, handle: self.handle.clone() }
73+
}
74+
}
75+
76+
#[derive(Clone)]
77+
pub struct MetricsService<S> {
78+
inner: S,
79+
handle: PrometheusHandle,
80+
}
81+
82+
impl<S, ReqB> Service<Request<ReqB>> for MetricsService<S>
83+
where
84+
S: Service<Request<ReqB>, Response = Response<HttpBody>>,
85+
{
86+
type Response = Response<HttpBody>;
87+
type Error = S::Error;
88+
type Future = Either<Ready<Result<Self::Response, Self::Error>>, S::Future>;
89+
90+
fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
91+
self.inner.poll_ready(cx)
92+
}
93+
94+
fn call(&mut self, request: Request<ReqB>) -> Self::Future {
95+
if request.method() == Method::GET && request.uri().path() == METRICS_PATH {
96+
let body = Bytes::from(self.handle.render());
97+
let response = Response::builder()
98+
.status(StatusCode::OK)
99+
.header(header::CONTENT_TYPE, "text/plain; version=0.0.4")
100+
.body(HttpBody::new(Full::new(body)))
101+
.expect("response build with a string body is infallible");
102+
return Either::Left(ready(Ok(response)));
103+
}
104+
Either::Right(self.inner.call(request))
105+
}
106+
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
use bytes::Bytes;
2+
use http::{Method, Request, Response, StatusCode};
3+
use http_body_util::{BodyExt, Full};
4+
use jsonrpsee::server::HttpBody;
5+
use tower::{Layer, ServiceExt};
6+
7+
use crate::server::metrics::{install_exporter, MetricsLayer, METRICS_PATH};
8+
9+
fn fallthrough_service() -> impl tower::Service<
10+
Request<HttpBody>,
11+
Response = Response<HttpBody>,
12+
Error = std::convert::Infallible,
13+
Future = futures::future::Ready<Result<Response<HttpBody>, std::convert::Infallible>>,
14+
> + Clone {
15+
tower::service_fn(|_req: Request<HttpBody>| {
16+
let response = Response::builder()
17+
.status(StatusCode::IM_A_TEAPOT)
18+
.body(HttpBody::new(Full::new(Bytes::from_static(b"fallthrough"))))
19+
.expect("static body is infallible");
20+
futures::future::ready(Ok::<_, std::convert::Infallible>(response))
21+
})
22+
}
23+
24+
fn empty_request(method: Method, path: &str) -> Request<HttpBody> {
25+
Request::builder()
26+
.method(method)
27+
.uri(path)
28+
.body(HttpBody::new(Full::new(Bytes::new())))
29+
.expect("static body is infallible")
30+
}
31+
32+
async fn read_body(response: Response<HttpBody>) -> (StatusCode, Vec<u8>) {
33+
let (parts, body) = response.into_parts();
34+
let bytes = body.collect().await.expect("body collect").to_bytes().to_vec();
35+
(parts.status, bytes)
36+
}
37+
38+
#[tokio::test]
39+
async fn get_metrics_renders_prometheus_text() {
40+
// Note: install_exporter installs the global recorder, so this test must
41+
// be the only one in the crate that calls it. Other metric tests should
42+
// share this fixture or call install_exporter via `try_install`.
43+
let handle = install_exporter("0.0.1-test", "deadbeef").expect("install");
44+
let svc = MetricsLayer::new(handle).layer(fallthrough_service());
45+
46+
let response = svc.oneshot(empty_request(Method::GET, METRICS_PATH)).await.unwrap();
47+
48+
let (status, body) = read_body(response).await;
49+
assert_eq!(status, StatusCode::OK);
50+
let body_text = String::from_utf8(body).unwrap();
51+
assert!(
52+
body_text.contains("prover_build_info"),
53+
"scrape should include build_info, got:\n{body_text}"
54+
);
55+
assert!(body_text.contains("version=\"0.0.1-test\""));
56+
assert!(body_text.contains("git_sha=\"deadbeef\""));
57+
}

crates/starknet_transaction_prover/src/server/rpc_impl.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ use tracing::warn;
1313
use crate::proving::virtual_snos_prover::{ProveTransactionResult, RpcVirtualSnosProver};
1414
use crate::server::config::ServiceConfig;
1515
use crate::server::errors::service_busy;
16+
use crate::server::metrics::names::CONCURRENCY_REJECTED_TOTAL;
1617
use crate::server::rpc_api::ProvingRpcServer;
1718

1819
/// Starknet RPC specification version.
@@ -57,6 +58,7 @@ impl ProvingRpcServer for ProvingRpcServerImpl {
5758
transaction: RpcTransaction,
5859
) -> RpcResult<ProveTransactionResult> {
5960
let _permit = self.concurrency_semaphore.try_acquire().map_err(|_| {
61+
metrics::counter!(CONCURRENCY_REJECTED_TOTAL).increment(1);
6062
warn!(
6163
max_concurrent_requests = self.max_concurrent_requests,
6264
"Rejected proving request: service is at capacity"

crates/starknet_transaction_prover/src/server/tls.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ use tower_http::map_request_body::MapRequestBodyLayer;
2727
use tower_http::map_response_body::MapResponseBodyLayer;
2828
use tracing::warn;
2929

30-
use crate::server::{HealthLayer, OhttpJsonrpseeLayer, RequestLogLayer};
30+
use crate::server::{HealthLayer, MetricsLayer, OhttpJsonrpseeLayer, RequestLogLayer};
3131

3232
/// Maximum time allowed for a TLS handshake before the connection is dropped.
3333
const TLS_HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(10);
@@ -45,6 +45,7 @@ pub async fn start_tls_server(
4545
max_request_body_size: u32,
4646
cors_layer: Option<CorsLayer>,
4747
ohttp_layer: Option<OhttpJsonrpseeLayer>,
48+
metrics_layer: Option<MetricsLayer>,
4849
) -> anyhow::Result<(SocketAddr, ServerHandle)> {
4950
let tls_acceptor = load_tls_acceptor(cert_path, key_path)?;
5051

@@ -59,12 +60,11 @@ pub async fn start_tls_server(
5960
let svc_builder = ServerBuilder::default()
6061
.set_config(server_config)
6162
.set_http_middleware(
62-
// `RequestLogLayer` is outermost so it sees status + latency for
63-
// every request (including health probes). See `server.rs` for the
64-
// layer-order rationale.
63+
// See `server.rs` for the layer-order rationale.
6564
ServiceBuilder::new()
6665
.layer(RequestLogLayer)
6766
.layer(HealthLayer)
67+
.option_layer(metrics_layer)
6868
.option_layer(cors_layer)
6969
.layer(MapRequestBodyLayer::new(HttpBody::new))
7070
.option_layer(ohttp_layer)

0 commit comments

Comments
 (0)