Skip to content

Commit f0469ce

Browse files
authored
feat: use u64 arithmetic in Pacer instead of u128 (#3594)
* feat: use u64 arithmetic in `Pacer` instead of u128 `Pacer::next` and `bytes_for` used u128 for pacing rate calculations, causing software-emulated 128-bit division (`compiler_builtins::u128_by_u64_div_rem`) on every packet send. The key product in `next` is `rtt_ns * deficit`, where `deficit` is at most 2 × MTU (~3000 bytes) and a 10-second RTT gives 10^10 ns: product ~3*10^13, far below u64::MAX (1.8*10^19). The key product in `bytes_for` is `elapsed_ns * cwnd * SPEEDUP`: at 400 Gbps with a 100 ms RTT the inter-packet interval is ~24 ns and the BDP ~5 GB, giving ~2.4*10^11. Even a full-RTT elapsed (10^8 ns) gives 10^18 < u64::MAX. Beyond that, `saturating_mul` caps the value and callers clamp to `self.m`. * Fixes * Suggestions from @martinthomson
1 parent fa9fc98 commit f0469ce

1 file changed

Lines changed: 34 additions & 29 deletions

File tree

neqo-transport/src/pace.rs

Lines changed: 34 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ impl Pacer {
4040
/// the case the congestion controller increases the congestion window.
4141
/// This value spaces packets over half the congestion window, which matches
4242
/// our current congestion controller, which double the window every RTT.
43-
const SPEEDUP: usize = 2;
43+
const SPEEDUP: u64 = 2;
4444

4545
/// Create a new `Pacer`. This takes the current time, the maximum burst size,
4646
/// and the packet size.
@@ -86,49 +86,54 @@ impl Pacer {
8686

8787
// This is the inverse of the function in `spend`:
8888
// self.t + rtt * (self.p - self.c) / (Self::SPEEDUP * cwnd)
89-
let r = rtt.as_nanos();
90-
let deficit =
91-
u128::try_from(packet - self.c).expect("packet is larger than current credit");
92-
let d = r.saturating_mul(deficit);
93-
let divisor = u128::try_from(cwnd)
94-
.expect("usize fits into u128")
95-
.saturating_mul(u128::try_from(Self::SPEEDUP).expect("usize fits into u128"));
96-
let add = d / divisor;
97-
let w = u64::try_from(add).map_or(rtt, Duration::from_nanos);
89+
//
90+
// `deficit` can exceed 2 × MTU when `self.c` carries accumulated debt
91+
// from consecutive sub-granularity sends. `saturating_mul` caps the
92+
// product safely regardless of the actual value.
93+
let Ok(deficit) = u64::try_from(packet - self.c) else {
94+
qtrace!("[{self}] next {cwnd}/{rtt:?} deficit overflow");
95+
return self.t;
96+
};
97+
let rtt_ns = u64::try_from(rtt.as_nanos()).unwrap_or(u64::MAX);
98+
let divisor = (cwnd as u64).saturating_mul(Self::SPEEDUP);
99+
let w_ns = rtt_ns.saturating_mul(deficit) / divisor;
98100

99101
// If the increment is below the timer granularity, send immediately.
100-
if w < GRANULARITY {
101-
qtrace!("[{self}] next {cwnd}/{rtt:?} below granularity ({w:?})");
102+
#[expect(
103+
clippy::cast_possible_truncation,
104+
reason = "GRANULARITY is 1ms, fits in u64"
105+
)]
106+
if w_ns < GRANULARITY.as_nanos() as u64 {
107+
qtrace!("[{self}] next {cwnd}/{rtt:?} below granularity ({w_ns}ns)");
102108
return self.t;
103109
}
104110

105-
let nxt = self.t + w;
106-
qtrace!("[{self}] next {cwnd}/{rtt:?} wait {w:?} = {nxt:?}");
111+
let nxt = self.t + Duration::from_nanos(w_ns);
112+
qtrace!("[{self}] next {cwnd}/{rtt:?} wait {w_ns}ns = {nxt:?}");
107113
nxt
108114
}
109115

110116
/// Bytes sendable at `SPEEDUP * cwnd / rtt` pace over `elapsed`.
111117
/// Returns `None` if `rtt` is zero.
112-
#[allow(
113-
clippy::allow_attributes,
114-
clippy::unwrap_in_result,
115-
reason = "Check if this can be removed with MSRV > 1.90"
116-
)]
117-
fn bytes_for(cwnd: usize, rtt: Duration, elapsed: Duration) -> Option<u128> {
118-
let factor = u128::try_from(cwnd)
119-
.expect("usize fits into u128")
120-
.saturating_mul(u128::try_from(Self::SPEEDUP).expect("usize fits into u128"));
121-
elapsed
122-
.as_nanos()
123-
.saturating_mul(factor)
124-
.checked_div(rtt.as_nanos())
118+
///
119+
/// The key product is `elapsed_ns * cwnd * SPEEDUP`. At 400 Gbps with a
120+
/// 100 ms RTT the BDP is ~5 GB, so `factor` = cwnd * 2 ≈ 10^10. The
121+
/// inter-packet interval at that rate is ~24 ns, giving a product of
122+
/// ~2.4*10^11, well within u64. Even a full-RTT elapsed (10^8 ns) gives
123+
/// 10^8 * 10^10 = 10^18 < `u64::MAX` (1.8*10^19). Beyond that the
124+
/// `saturating_mul` caps the value and callers clamp to `self.m`.
125+
fn bytes_for(cwnd: usize, rtt: Duration, elapsed: Duration) -> Option<u64> {
126+
let rtt_ns = u64::try_from(rtt.as_nanos()).unwrap_or(u64::MAX);
127+
let elapsed_ns = u64::try_from(elapsed.as_nanos()).unwrap_or(u64::MAX);
128+
let factor = (cwnd as u64).saturating_mul(Self::SPEEDUP);
129+
elapsed_ns.saturating_mul(factor).checked_div(rtt_ns)
125130
}
126131

127132
/// Compute the effective pacing rate in bytes per second.
128133
///
129-
/// Returns `None` if `rtt` is zero or the rate exceeds `u64::MAX`.
134+
/// Returns `None` if `rtt` is zero.
130135
pub(crate) fn rate(cwnd: usize, rtt: Duration) -> Option<u64> {
131-
u64::try_from(Self::bytes_for(cwnd, rtt, Duration::from_secs(1))?).ok()
136+
Self::bytes_for(cwnd, rtt, Duration::from_secs(1))
132137
}
133138

134139
/// Spend credit. This cannot fail, but instead may carry debt into the

0 commit comments

Comments
 (0)