feat: use u64 arithmetic in Pacer instead of u128 (#3594)

larseggert · web-flow · commit f0469ce808c7 · 2026-05-11T06:18:28.000Z
* feat: use u64 arithmetic in `Pacer` instead of u128 `Pacer::next` and `bytes_for` used u128 for pacing rate calculations, causing software-emulated 128-bit division (`compiler_builtins::u128_by_u64_div_rem`) on every packet send. The key product in `next` is `rtt_ns * deficit`, where `deficit` is at most 2 × MTU (~3000 bytes) and a 10-second RTT gives 10^10 ns: product ~3*10^13, far below u64::MAX (1.8*10^19). The key product in `bytes_for` is `elapsed_ns * cwnd * SPEEDUP`: at 400 Gbps with a 100 ms RTT the inter-packet interval is ~24 ns and the BDP ~5 GB, giving ~2.4*10^11. Even a full-RTT elapsed (10^8 ns) gives 10^18 < u64::MAX. Beyond that, `saturating_mul` caps the value and callers clamp to `self.m`. * Fixes * Suggestions from @martinthomson
diff --git a/neqo-transport/src/pace.rs b/neqo-transport/src/pace.rs
@@ -40,7 +40,7 @@ impl Pacer {
     /// the case the congestion controller increases the congestion window.
     /// This value spaces packets over half the congestion window, which matches
     /// our current congestion controller, which double the window every RTT.
-    const SPEEDUP: usize = 2;
+    const SPEEDUP: u64 = 2;
 
     /// Create a new `Pacer`.  This takes the current time, the maximum burst size,
     /// and the packet size.
@@ -86,49 +86,54 @@ impl Pacer {
 
         // This is the inverse of the function in `spend`:
         // self.t + rtt * (self.p - self.c) / (Self::SPEEDUP * cwnd)
-        let r = rtt.as_nanos();
-        let deficit =
-            u128::try_from(packet - self.c).expect("packet is larger than current credit");
-        let d = r.saturating_mul(deficit);
-        let divisor = u128::try_from(cwnd)
-            .expect("usize fits into u128")
-            .saturating_mul(u128::try_from(Self::SPEEDUP).expect("usize fits into u128"));
-        let add = d / divisor;
-        let w = u64::try_from(add).map_or(rtt, Duration::from_nanos);
+        //
+        // `deficit` can exceed 2 × MTU when `self.c` carries accumulated debt
+        // from consecutive sub-granularity sends.  `saturating_mul` caps the
+        // product safely regardless of the actual value.
+        let Ok(deficit) = u64::try_from(packet - self.c) else {
+            qtrace!("[{self}] next {cwnd}/{rtt:?} deficit overflow");
+            return self.t;
+        };
+        let rtt_ns = u64::try_from(rtt.as_nanos()).unwrap_or(u64::MAX);
+        let divisor = (cwnd as u64).saturating_mul(Self::SPEEDUP);
+        let w_ns = rtt_ns.saturating_mul(deficit) / divisor;
 
         // If the increment is below the timer granularity, send immediately.
-        if w < GRANULARITY {
-            qtrace!("[{self}] next {cwnd}/{rtt:?} below granularity ({w:?})");
+        #[expect(
+            clippy::cast_possible_truncation,
+            reason = "GRANULARITY is 1ms, fits in u64"
+        )]
+        if w_ns < GRANULARITY.as_nanos() as u64 {
+            qtrace!("[{self}] next {cwnd}/{rtt:?} below granularity ({w_ns}ns)");
             return self.t;
         }
 
-        let nxt = self.t + w;
-        qtrace!("[{self}] next {cwnd}/{rtt:?} wait {w:?} = {nxt:?}");
+        let nxt = self.t + Duration::from_nanos(w_ns);
+        qtrace!("[{self}] next {cwnd}/{rtt:?} wait {w_ns}ns = {nxt:?}");
         nxt
     }
 
     /// Bytes sendable at `SPEEDUP * cwnd / rtt` pace over `elapsed`.
     /// Returns `None` if `rtt` is zero.
-    #[allow(
-        clippy::allow_attributes,
-        clippy::unwrap_in_result,
-        reason = "Check if this can be removed with MSRV > 1.90"
-    )]
-    fn bytes_for(cwnd: usize, rtt: Duration, elapsed: Duration) -> Option<u128> {
-        let factor = u128::try_from(cwnd)
-            .expect("usize fits into u128")
-            .saturating_mul(u128::try_from(Self::SPEEDUP).expect("usize fits into u128"));
-        elapsed
-            .as_nanos()
-            .saturating_mul(factor)
-            .checked_div(rtt.as_nanos())
+    ///
+    /// The key product is `elapsed_ns * cwnd * SPEEDUP`.  At 400 Gbps with a
+    /// 100 ms RTT the BDP is ~5 GB, so `factor` = cwnd * 2 ≈ 10^10.  The
+    /// inter-packet interval at that rate is ~24 ns, giving a product of
+    /// ~2.4*10^11, well within u64.  Even a full-RTT elapsed (10^8 ns) gives
+    /// 10^8 * 10^10 = 10^18 < `u64::MAX` (1.8*10^19).  Beyond that the
+    /// `saturating_mul` caps the value and callers clamp to `self.m`.
+    fn bytes_for(cwnd: usize, rtt: Duration, elapsed: Duration) -> Option<u64> {
+        let rtt_ns = u64::try_from(rtt.as_nanos()).unwrap_or(u64::MAX);
+        let elapsed_ns = u64::try_from(elapsed.as_nanos()).unwrap_or(u64::MAX);
+        let factor = (cwnd as u64).saturating_mul(Self::SPEEDUP);
+        elapsed_ns.saturating_mul(factor).checked_div(rtt_ns)
     }
 
     /// Compute the effective pacing rate in bytes per second.
     ///
-    /// Returns `None` if `rtt` is zero or the rate exceeds `u64::MAX`.
+    /// Returns `None` if `rtt` is zero.
     pub(crate) fn rate(cwnd: usize, rtt: Duration) -> Option<u64> {
-        u64::try_from(Self::bytes_for(cwnd, rtt, Duration::from_secs(1))?).ok()
+        Self::bytes_for(cwnd, rtt, Duration::from_secs(1))
     }
 
     /// Spend credit. This cannot fail, but instead may carry debt into the