
Commit e7877c4

More precise metrics for measuring auction overhead (#3754)
# Description

Plotting the entire (or at least the vast majority of the) time lost just running the auction (i.e. everything besides actually computing solutions) is extremely important for guiding our optimization efforts. We already have some metrics for that, but since those are histograms we have a few issues:

1. The granularity of histograms depends on the buckets we define. The necessary granularity can vary a lot depending on the task, so reusing the same metric for multiple sources of overhead means we either have to introduce a TON of buckets or multiple histograms (one for each source of overhead).
2. AFAIK histograms can't be merged into one nice plot that visualizes all the overhead at once. Instead you basically have to look at each histogram individually and mentally piece everything together.

# Changes

This PR addresses both issues by measuring the overhead with 2 counters: one for the total time spent in each phase and one for how many measurements we took. Using gauges for this would have been a bit easier, but gauges only plot the exact value stored at the moment Prometheus scrapes the metrics. Since the runtime of the individual sources of overhead can vary quite a bit from run to run, there is a chance that gauges misrepresent the data. With the 2-counter approach we can always compute averages for all sources of overhead, which should hopefully give us better data.

As we continue to reduce this overhead it might make sense to break some of these phases down further, but I think this is a good starting point. Note that a lot of the plotted phases look insignificant in my screenshot, but only because the data comes from the playground, which basically does nothing. From my previous efforts to optimize performance I know that many of these phases take a surprising amount of time.

## How to test

I used #3752 to build the new dashboard in the playground and verify that things work as intended. As you can see, that dashboard makes it a lot easier to get a sense of ALL the auction overhead at once and of how much each phase contributes to the total.

<img width="1247" height="639" alt="Screenshot 2025-10-09 at 06 32 57" src="https://github.com/user-attachments/assets/74196838-74fc-4188-a5b9-fd8775eb5d1d" />
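To illustrate the 2-counter idea, here is a minimal, self-contained Rust sketch. It is not the code from this PR: `OverheadMetric` and its fields are hypothetical stand-ins for the labeled Prometheus counters `auction_overhead_time` / `auction_overhead_count` added below. It shows why an average per phase can always be recovered from the pair of counters:

```rust
// Minimal sketch of the 2-counter measurement idea (hypothetical stand-in types,
// not the actual Prometheus-backed implementation from this PR).
use std::time::Instant;

#[derive(Default)]
struct OverheadMetric {
    total_time_secs: f64, // total time spent in the phase (counter, only grows)
    count: u64,           // how many measurements were taken (counter, only grows)
}

impl OverheadMetric {
    fn measure(&mut self, start: Instant) {
        self.total_time_secs += start.elapsed().as_secs_f64();
        self.count += 1;
    }

    // A dashboard can derive the same value from the scraped counters,
    // e.g. as the increase of the time counter divided by the increase of the
    // count counter over some window.
    fn average_secs(&self) -> f64 {
        self.total_time_secs / self.count.max(1) as f64
    }
}

fn main() {
    let mut metric = OverheadMetric::default();
    for _ in 0..3 {
        let start = Instant::now();
        // ... work of one overhead phase, e.g. "serialize_request" ...
        metric.measure(start);
    }
    println!("average overhead: {:.6}s", metric.average_secs());
}
```

Unlike a gauge, both values only ever increase, so the derived average does not depend on when Prometheus happens to scrape.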
1 parent c1fc970 commit e7877c4

File tree

10 files changed: +95 -14 lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default.

crates/autopilot/src/infra/persistence/mod.rs

Lines changed: 2 additions & 0 deletions
@@ -72,6 +72,8 @@ impl Persistence {
         &self,
         auction: &domain::RawAuctionData,
     ) -> Result<domain::auction::Id, DatabaseError> {
+        let _timer = observe::metrics::metrics()
+            .on_auction_overhead_start("autopilot", "replace_auction_in_db");
         let auction = dto::auction::from_domain(auction.clone());
         self.postgres
             .replace_current_auction(&auction)

crates/autopilot/src/infra/solvers/dto/solve.rs

Lines changed: 2 additions & 0 deletions
@@ -36,6 +36,8 @@ impl Request {
         trusted_tokens: &HashSet<H160>,
         time_limit: Duration,
     ) -> Self {
+        let _timer =
+            observe::metrics::metrics().on_auction_overhead_start("autopilot", "serialize_request");
         let helper = RequestHelper {
             id: auction.id,
             orders: auction

crates/autopilot/src/maintenance.rs

Lines changed: 5 additions & 3 deletions
@@ -20,7 +20,7 @@ use {
         core::{AtomicU64, GenericGauge},
     },
     shared::{event_handling::AlloyEventRetriever, maintenance::Maintaining},
-    std::{future::Future, sync::Arc},
+    std::{future::Future, sync::Arc, time::Instant},
     tokio::sync::Mutex,
 };

@@ -64,7 +64,7 @@ impl Maintenance {
             return;
         }

-        let start = std::time::Instant::now();
+        let start = Instant::now();
         if let Err(err) = self.update_inner().await {
             tracing::warn!(?err, block = new_block.number, "failed to run maintenance");
             metrics().updates.with_label_values(&["error"]).inc();
@@ -82,7 +82,8 @@ impl Maintenance {
     }

     async fn update_inner(&self) -> Result<()> {
-        // All these can run independently of each other.
+        let _timer =
+            observe::metrics::metrics().on_auction_overhead_start("autopilot", "maintenance_total");
         tokio::try_join!(
             Self::timed_future(
                 "settlement_indexer",
@@ -118,6 +119,7 @@
             .maintenance_stage_time
             .with_label_values(&[label])
             .start_timer();
+        let _timer2 = observe::metrics::metrics().on_auction_overhead_start("autopilot", label);
         fut.await
     }

crates/autopilot/src/solvable_orders.rs

Lines changed: 5 additions & 2 deletions
@@ -33,10 +33,10 @@ use {
         collections::{BTreeMap, HashMap, HashSet, btree_map::Entry},
         future::Future,
         sync::Arc,
-        time::Duration,
+        time::{Duration, Instant},
     },
     strum::VariantNames,
-    tokio::{sync::Mutex, time::Instant},
+    tokio::sync::Mutex,
 };

 #[derive(prometheus_metric_storage::MetricStorage)]
@@ -166,6 +166,9 @@ impl SolvableOrdersCache {
     pub async fn update(&self, block: u64, store_events: bool) -> Result<()> {
         let start = Instant::now();

+        let _timer = observe::metrics::metrics()
+            .on_auction_overhead_start("autopilot", "update_solvabe_orders");
+
         let db_solvable_orders = self.get_solvable_orders().await?;
         tracing::trace!("fetched solvable orders from db");

crates/driver/src/domain/competition/mod.rs

Lines changed: 4 additions & 0 deletions
@@ -115,6 +115,8 @@ impl Competition {
     /// Solve an auction as part of this competition.
     pub async fn solve(&self, auction: Arc<String>) -> Result<Option<Solved>, Error> {
         let start = Instant::now();
+        let timer = ::observe::metrics::metrics()
+            .on_auction_overhead_start("driver", "pre_processing_total");

         let tasks = self
             .fetcher
@@ -179,6 +181,7 @@
             .auction_preprocessing
             .with_label_values(&["total"])
             .observe(elapsed.as_secs_f64());
+        drop(timer);
         tracing::debug!(?elapsed, "auction task execution time");

         let auction = &auction;
@@ -520,6 +523,7 @@
     {
         task::spawn_blocking(move || {
             let _timer = metrics::get().processing_stage_timer(stage);
+            let _timer2 = ::observe::metrics::metrics().on_auction_overhead_start("driver", stage);
             f()
         })
         .await

crates/driver/src/domain/competition/pre_processing.rs

Lines changed: 12 additions & 0 deletions
@@ -206,6 +206,8 @@ impl Utilities {
     async fn parse_request(&self, solve_request: Arc<String>) -> Result<Arc<Auction>> {
         let auction_dto: SolveRequest = {
             let _timer = metrics::get().processing_stage_timer("parse_dto");
+            let _timer2 =
+                observe::metrics::metrics().on_auction_overhead_start("driver", "parse_dto");
             // deserialization takes tens of milliseconds so run it on a blocking task
             tokio::task::spawn_blocking(move || {
                 serde_json::from_str(&solve_request).context("could not parse solve request")
@@ -219,6 +221,8 @@

         let auction_domain = {
             let _timer = metrics::get().processing_stage_timer("convert_to_domain");
+            let _timer2 = observe::metrics::metrics()
+                .on_auction_overhead_start("driver", "convert_to_domain");
             let app_data = self
                 .app_data_retriever
                 .as_ref()
@@ -237,6 +241,8 @@
     /// Fetches the tradable balance for every order owner.
     async fn fetch_balances(self: Arc<Self>, auction: Arc<Auction>) -> Arc<Balances> {
         let _timer = metrics::get().processing_stage_timer("fetch_balances");
+        let _timer2 =
+            observe::metrics::metrics().on_auction_overhead_start("driver", "fetch_balances");

         // Collect trader/token/source/interaction tuples for fetching available
         // balances. Note that we are pessimistic here, if a trader is selling
@@ -330,6 +336,8 @@
         };

         let _timer = metrics::get().processing_stage_timer("fetch_app_data");
+        let _timer2 =
+            observe::metrics::metrics().on_auction_overhead_start("driver", "fetch_app_data");

         let app_data = join_all(
             auction
@@ -367,6 +375,8 @@

     async fn cow_amm_orders(self: Arc<Self>, auction: Arc<Auction>) -> Arc<Vec<Order>> {
         let _timer = metrics::get().processing_stage_timer("cow_amm_orders");
+        let _timer2 =
+            observe::metrics::metrics().on_auction_overhead_start("driver", "cow_amm_orders");
         let cow_amms = self.eth.contracts().cow_amm_registry().amms().await;
         let domain_separator = self.eth.contracts().settlement_domain_separator();
         let domain_separator = model::DomainSeparator(domain_separator.0);
@@ -487,6 +497,8 @@
         auction: Arc<Auction>,
     ) -> Arc<Vec<liquidity::Liquidity>> {
         let _timer = metrics::get().processing_stage_timer("fetch_liquidity");
+        let _timer2 =
+            observe::metrics::metrics().on_auction_overhead_start("driver", "fetch_liquidity");
         let pairs = auction.liquidity_pairs();
         Arc::new(
             self.liquidity_fetcher

crates/driver/src/infra/solver/mod.rs

Lines changed: 17 additions & 8 deletions
@@ -25,7 +25,10 @@ use {
     num::BigRational,
     observe::tracing::tracing_headers,
     reqwest::header::HeaderName,
-    std::{collections::HashMap, time::Duration},
+    std::{
+        collections::HashMap,
+        time::{Duration, Instant},
+    },
     tap::TapFallible,
     thiserror::Error,
     tracing::{Instrument, instrument},
@@ -233,8 +236,9 @@ impl Solver {
         auction: &Auction,
         liquidity: &[liquidity::Liquidity],
     ) -> Result<Vec<Solution>, Error> {
+        let start = Instant::now();
+
         let flashloan_hints = self.assemble_flashloan_hints(auction);
-        // Fetch the solutions from the solver.
         let weth = self.eth.contracts().weth_address();
         let auction_dto = dto::auction::new(
             auction,
@@ -246,12 +250,6 @@
             auction.deadline(self.timeouts()).solvers(),
         );

-        if let Some(id) = auction.id() {
-            // Only auctions with IDs are real auctions (/quote requests don't have an ID,
-            // and it makes no sense to store them)
-            self.persistence.archive_auction(id, &auction_dto);
-        }
-
         let body = {
             // pre-allocate a big enough buffer to avoid re-allocating memory
             // as the request gets serialized
@@ -261,6 +259,17 @@
             String::from_utf8(buffer).expect("serde_json only writes valid utf8")
         };

+        if let Some(id) = auction.id() {
+            // Only auctions with IDs are real auctions (/quote requests don't have an ID).
+            // Only for those it makes sense to archive them and measure the execution time.
+            self.persistence.archive_auction(id, &auction_dto);
+            ::observe::metrics::metrics().measure_auction_overhead(
+                start,
+                "driver",
+                "serialize_request",
+            );
+        }
+
         let url = shared::url::join(&self.config.endpoint, "solve");
         super::observe::solver_request(&url, &body);
         let timeout = match auction.deadline(self.timeouts()).solvers().remaining() {

crates/observe/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ opentelemetry_sdk = { workspace = true }
 pin-project-lite = { workspace = true }
 prometheus = { workspace = true }
 prometheus-metric-storage = { workspace = true }
+scopeguard = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
 time = { workspace = true, features = ["macros"] }

crates/observe/src/metrics.rs

Lines changed: 46 additions & 1 deletion
@@ -1,5 +1,8 @@
 use {
-    prometheus::Encoder,
+    prometheus::{
+        Encoder,
+        core::{AtomicF64, AtomicU64, GenericCounterVec},
+    },
     std::{
         collections::HashMap,
         convert::Infallible,
@@ -9,6 +12,7 @@ use {
             OnceLock,
             atomic::{AtomicBool, Ordering},
         },
+        time::Instant,
     },
     tokio::task::{self, JoinHandle},
     warp::{Filter, Rejection, Reply},
@@ -136,3 +140,44 @@ fn handle_readiness(
         }
     })
 }
+
+/// Metrics shared by potentially all processes.
+#[derive(prometheus_metric_storage::MetricStorage)]
+pub struct Metrics {
+    /// All the time losses we incur while arbitrating the auctions
+    #[metric(labels("component", "phase"))]
+    pub auction_overhead_time: GenericCounterVec<AtomicF64>,
+
+    /// How many measurements we did for each source of overhead.
+    #[metric(labels("component", "phase"))]
+    pub auction_overhead_count: GenericCounterVec<AtomicU64>,
+}
+
+impl Metrics {
+    /// Returns a struct that measures the overhead when it gets dropped.
+    #[must_use]
+    pub fn on_auction_overhead_start<'a, 'b, 'c>(
+        &'a self,
+        component: &'b str,
+        phase: &'c str,
+    ) -> impl Drop + use<'a, 'b, 'c> {
+        let start = std::time::Instant::now();
+        scopeguard::guard(start, move |start| {
+            self.measure_auction_overhead(start, component, phase);
+        })
+    }
+
+    pub fn measure_auction_overhead(&self, start: Instant, component: &str, phase: &str) {
+        self.auction_overhead_time
+            .with_label_values(&[component, phase])
+            .inc_by(start.elapsed().as_secs_f64());
+
+        self.auction_overhead_count
+            .with_label_values(&[component, phase])
+            .inc()
+    }
+}
+
+pub fn metrics() -> &'static Metrics {
+    Metrics::instance(get_storage_registry()).unwrap()
+}
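For reference, the call sites in this PR use the new helpers roughly like this. This is a sketch, assuming the `observe` crate is a dependency of the calling crate; `do_phase_work` and the `"example_phase"` label are hypothetical placeholders:

```rust
// Sketch of how the new observe helpers are used at the call sites above.
// `do_phase_work` is a hypothetical placeholder for the work of one phase.
fn do_phase_work() {}

fn scoped_measurement() {
    // The guard records the elapsed time for ("autopilot", "example_phase")
    // when it is dropped at the end of this scope.
    let _timer = observe::metrics::metrics()
        .on_auction_overhead_start("autopilot", "example_phase");
    do_phase_work();
}

fn manual_measurement() {
    // Manual variant, as in the solver request serialization above, where the
    // measurement should only be recorded under certain conditions.
    let start = std::time::Instant::now();
    do_phase_work();
    observe::metrics::metrics().measure_auction_overhead(start, "autopilot", "example_phase");
}
```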

0 commit comments
