chore(iroh-bench): Allow configuring the number of worker threads for each endpoint in iroh-bench (#4063)

matheus23 · web-flow · commit 065b44870026 · 2026-03-30T22:41:17.000+02:00
(This is #3780 reopened, because that PR had the wrong target branch.)

## Description

When I look at qlog files of iroh-bench runs, I can see that PATH_ACK processing is severely delayed when each endpoint uses a single-threaded tokio runtime. The delay disappears when I give iroh-bench two worker threads per endpoint: the EndpointDriver and the ConnectionDriver can then run independently of each other, so the endpoint driver is no longer delayed when processing PATH_ACKs. While the processing was delayed, it affected the apparent RTT and, in turn, congestion control.

## Screenshots

Before: Note all the blue lines coming in from way back in history and converging on one place on the left (those are the PATH_ACKs that get handled far too late):

<img width="1720" height="772" alt="image" src="https://github.com/user-attachments/assets/688e19e5-63bb-4d1c-9256-43ce0911a4c0" />

After: The PATH_ACKs get handled in time; they are much more interspersed with stream frames:

<img width="1720" height="772" alt="image" src="https://github.com/user-attachments/assets/a33db88c-dde0-458d-8007-12c7f119165d" />

## Change checklist

- [x] Self-review.
diff --git a/iroh/bench/src/bin/bulk.rs b/iroh/bench/src/bin/bulk.rs
@@ -35,7 +35,7 @@ fn main() {
 
 pub fn run_iroh(opt: Opt) -> Result<()> {
     let server_span = tracing::error_span!("server");
-    let runtime = rt();
+    let runtime = rt(opt.workers_per_ep);
 
     #[cfg(feature = "local-relay")]
     let (relay_url, relay_server) = if opt.only_relay {
@@ -70,7 +70,7 @@ pub fn run_iroh(opt: Opt) -> Result<()> {
         let relay_url = relay_url.clone();
         handles.push(std::thread::spawn(move || {
             let _guard = tracing::error_span!("client", id).entered();
-            let runtime = rt();
+            let runtime = rt(opt.workers_per_ep);
             match runtime.block_on(iroh::client(server_addr, relay_url.clone(), opt)) {
                 Ok(stats) => Ok(stats),
                 Err(e) => {
@@ -112,7 +112,7 @@ pub fn run_noq(opt: Opt) -> Result<()> {
     use rustls::pki_types::{CertificateDer, PrivatePkcs8KeyDer};
 
     let server_span = tracing::error_span!("server");
-    let runtime = rt();
+    let runtime = rt(opt.workers_per_ep);
     let cert = rcgen::generate_simple_self_signed(vec!["localhost".into()]).unwrap();
     let key = PrivatePkcs8KeyDer::from(cert.signing_key.serialize_der());
     let cert = CertificateDer::from(cert.cert);
@@ -134,7 +134,7 @@ pub fn run_noq(opt: Opt) -> Result<()> {
         let cert = cert.clone();
         handles.push(std::thread::spawn(move || {
             let _guard = tracing::error_span!("client", id).entered();
-            let runtime = rt();
+            let runtime = rt(opt.workers_per_ep);
             match runtime.block_on(noq::client(server_addr, cert, opt)) {
                 Ok(stats) => Ok(stats),
                 Err(e) => {
diff --git a/iroh/bench/src/lib.rs b/iroh/bench/src/lib.rs
@@ -76,6 +76,21 @@ pub struct Opt {
     pub only_relay: bool,
     #[clap(long, default_value_t = false)]
     pub use_ipv6: bool,
+
+    /// How many tokio worker threads to use for each endpoint in the benchmark.
+    ///
+    /// Defaults to 1, a single-threaded runtime.
+    /// Set this to 0 to have each endpoint use a full multi-threaded tokio runtime with as many
+    /// workers as CPU parallelism detected.
+    ///
+    /// Setting this to 2 is a very reasonable value.
+    /// When quinn runs a single connection, it can run one task (the EndpointDriver) for all
+    /// receive-based work, and one task (the ConnectionDriver) for all send-based tasks.
+    /// If quinn is only given a single worker, then work distribution may be unfair, resulting
+    /// in acknowledgements not being processed timely enough, causing congestion control issues,
+    /// although ideally that problem is fixed in some other way in quinn.
+    #[clap(long, default_value_t = 1)]
+    pub workers_per_ep: usize,
 }
 
 pub enum EndpointSelector {
@@ -140,8 +155,18 @@ pub fn configure_tracing_subscriber() {
     .unwrap();
 }
 
-pub fn rt() -> Runtime {
-    Builder::new_current_thread().enable_all().build().unwrap()
+pub fn rt(workers: usize) -> Runtime {
+    let mut builder = match workers {
+        // 0 means "use as many threads as detected CPU parallelism" implicitly.
+        0 => Builder::new_multi_thread(),
+        1 => Builder::new_current_thread(),
+        workers => {
+            let mut b = Builder::new_multi_thread();
+            b.worker_threads(workers);
+            b
+        }
+    };
+    builder.enable_all().build().unwrap()
 }
 
 fn parse_byte_size(s: &str) -> Result<u64, ParseIntError> {