Skip to content

Commit a66b5e6

Browse files
author
Michael Ingley
committed
Optimize TSO waker scheduling and add Criterion benchmark
Signed-off-by: Michael Ingley <mingley@linkedin.com>
1 parent 9d9b680 commit a66b5e6

File tree

4 files changed

+194
-8
lines changed

4 files changed

+194
-8
lines changed

Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ tonic = { version = "0.10", features = ["tls", "gzip"] }
4949

5050
[dev-dependencies]
5151
clap = "2"
52+
criterion = "0.5"
5253
env_logger = "0.10"
5354
fail = { version = "0.4", features = ["failpoints"] }
5455
proptest = "1"
@@ -64,3 +65,7 @@ tokio = { version = "1", features = ["sync", "rt-multi-thread", "macros"] }
6465
name = "failpoint_tests"
6566
path = "tests/failpoint_tests.rs"
6667
required-features = ["fail/failpoints"]
68+
69+
[[bench]]
70+
name = "tso_waker_policy"
71+
harness = false

benches/tso_waker_policy.rs

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
use std::hint::black_box;
2+
use std::sync::Arc;
3+
use std::task::{Wake, Waker};
4+
use std::time::{Duration, Instant};
5+
6+
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
7+
use futures::task::AtomicWaker;
8+
9+
const MAX_PENDING_COUNT: usize = 1 << 16;
10+
const FULL_EVERY: u64 = 1024;
11+
const FULL_WINDOW: u64 = 16;
12+
13+
struct NoopWake;
14+
15+
impl Wake for NoopWake {
16+
fn wake(self: Arc<Self>) {}
17+
fn wake_by_ref(self: &Arc<Self>) {}
18+
}
19+
20+
fn response_policy_old(iterations: u64) -> Duration {
21+
let atomic_waker = AtomicWaker::new();
22+
let waker = Waker::from(Arc::new(NoopWake));
23+
atomic_waker.register(&waker);
24+
25+
let mut pending_len = 0usize;
26+
let start = Instant::now();
27+
for i in 0..iterations {
28+
if i % FULL_EVERY == 0 {
29+
pending_len = MAX_PENDING_COUNT;
30+
}
31+
black_box(pending_len >= MAX_PENDING_COUNT);
32+
pending_len = pending_len.saturating_sub(1);
33+
atomic_waker.wake();
34+
}
35+
start.elapsed()
36+
}
37+
38+
fn response_policy_new(iterations: u64) -> Duration {
39+
let atomic_waker = AtomicWaker::new();
40+
let waker = Waker::from(Arc::new(NoopWake));
41+
atomic_waker.register(&waker);
42+
43+
let mut pending_len = 0usize;
44+
let start = Instant::now();
45+
for i in 0..iterations {
46+
if i % FULL_EVERY == 0 {
47+
pending_len = MAX_PENDING_COUNT;
48+
}
49+
let was_full = pending_len >= MAX_PENDING_COUNT;
50+
pending_len = pending_len.saturating_sub(1);
51+
let should_wake = was_full && pending_len < MAX_PENDING_COUNT;
52+
if black_box(should_wake) {
53+
atomic_waker.wake();
54+
}
55+
}
56+
start.elapsed()
57+
}
58+
59+
fn register_policy_old(iterations: u64) -> Duration {
60+
let atomic_waker = AtomicWaker::new();
61+
let waker = Waker::from(Arc::new(NoopWake));
62+
63+
let start = Instant::now();
64+
for i in 0..iterations {
65+
let pending_len = if i % FULL_EVERY < FULL_WINDOW {
66+
MAX_PENDING_COUNT
67+
} else {
68+
MAX_PENDING_COUNT - 1
69+
};
70+
black_box(pending_len);
71+
atomic_waker.register(&waker);
72+
}
73+
start.elapsed()
74+
}
75+
76+
fn register_policy_new(iterations: u64) -> Duration {
77+
let atomic_waker = AtomicWaker::new();
78+
let waker = Waker::from(Arc::new(NoopWake));
79+
80+
let start = Instant::now();
81+
for i in 0..iterations {
82+
let pending_len = if i % FULL_EVERY < FULL_WINDOW {
83+
MAX_PENDING_COUNT
84+
} else {
85+
MAX_PENDING_COUNT - 1
86+
};
87+
if black_box(pending_len >= MAX_PENDING_COUNT) {
88+
atomic_waker.register(&waker);
89+
}
90+
}
91+
start.elapsed()
92+
}
93+
94+
fn bench_tso_waker_policy(c: &mut Criterion) {
95+
let mut group = c.benchmark_group("tso_waker_policy");
96+
group.warm_up_time(Duration::from_secs(2));
97+
group.measurement_time(Duration::from_secs(6));
98+
99+
group.bench_function(BenchmarkId::new("response", "old"), |b| {
100+
b.iter_custom(response_policy_old);
101+
});
102+
group.bench_function(BenchmarkId::new("response", "new"), |b| {
103+
b.iter_custom(response_policy_new);
104+
});
105+
group.bench_function(BenchmarkId::new("register", "old"), |b| {
106+
b.iter_custom(register_policy_old);
107+
});
108+
group.bench_function(BenchmarkId::new("register", "new"), |b| {
109+
b.iter_custom(register_policy_new);
110+
});
111+
112+
group.finish();
113+
}
114+
115+
criterion_group!(benches, bench_tso_waker_policy);
116+
criterion_main!(benches);

doc/tso_waker_criterion.md

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# TSO Waker Criterion Benchmark
2+
3+
Date: 2026-02-06
4+
Repo: `tikv/client-rust`
5+
Branch: `mingley/tso-waker-criterion`
6+
Host: macOS 26.2 (Darwin 25.2.0), Apple M4 Pro, arm64
7+
Rust toolchain: 1.84.1
8+
9+
## Goal
10+
11+
Quantify the latency impact of reducing TSO stream wake/registration churn in
12+
`src/pd/timestamp.rs`.
13+
14+
## Method
15+
16+
Benchmark framework:
17+
- Criterion (`cargo bench`)
18+
19+
Bench target:
20+
- `benches/tso_waker_policy.rs`
21+
22+
Command used:
23+
24+
```bash
25+
cargo bench --bench tso_waker_policy -- --noplot
26+
```
27+
28+
Criterion configuration in benchmark:
29+
- warmup: 2 seconds
30+
- measurement: 6 seconds
31+
- samples: 100 (Criterion default; not set explicitly in the benchmark code)
32+
33+
The benchmark compares old vs new policies in two isolated hot paths:
34+
- `response/*`: wake policy when processing responses
35+
- `register/*`: self-waker registration policy in no-request branch
36+
37+
## Results (Absolute Latency)
38+
39+
From Criterion output (`time` line):
40+
41+
- `tso_waker_policy/response/old`: `[3.2519 ns 3.2712 ns 3.2926 ns]`
42+
- `tso_waker_policy/response/new`: `[763.41 ps 766.39 ps 769.43 ps]`
43+
44+
- `tso_waker_policy/register/old`: `[2.3768 ns 2.3819 ns 2.3874 ns]`
45+
- `tso_waker_policy/register/new`: `[286.76 ps 287.51 ps 288.27 ps]`
46+
47+
Speedups based on Criterion's point estimate (the middle value of the reported `[low estimate high]` interval):
48+
- response path: `3.2712 ns / 0.76639 ns = 4.27x`
49+
- registration path: `2.3819 ns / 0.28751 ns = 8.28x`
50+
51+
## Interpretation
52+
53+
The new policy materially reduces per-operation latency in both isolated paths,
54+
with sub-nanosecond median latency for the optimized variants in this synthetic
55+
microbenchmark.
56+
57+
This benchmark is intentionally focused on internal policy overhead. It does not
58+
by itself measure end-to-end PD/TSO RPC latency in a real TiKV deployment.

src/pd/timestamp.rs

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ use tonic::transport::Channel;
3131
use crate::internal_err;
3232
use crate::proto::pdpb::pd_client::PdClient;
3333
use crate::proto::pdpb::*;
34+
use crate::stats::observe_tso_batch;
3435
use crate::Result;
3536

3637
/// It is an empirical value.
@@ -98,13 +99,17 @@ async fn run_tso(
9899
let mut responses = pd_client.tso(request_stream).await?.into_inner();
99100

100101
while let Some(Ok(resp)) = responses.next().await {
101-
{
102+
let should_wake_sender = {
102103
let mut pending_requests = pending_requests.lock().await;
104+
let was_full = pending_requests.len() >= MAX_PENDING_COUNT;
103105
allocate_timestamps(&resp, &mut pending_requests)?;
104-
}
106+
was_full && pending_requests.len() < MAX_PENDING_COUNT
107+
};
105108

106-
// Wake up the sending future blocked by too many pending requests or locked.
107-
sending_future_waker.wake();
109+
// Only wake sender when a previously full queue gains capacity.
110+
if should_wake_sender {
111+
sending_future_waker.wake();
112+
}
108113
}
109114
// TODO: distinguish between unexpected stream termination and expected end of test
110115
info!("TSO stream terminated");
@@ -137,7 +142,6 @@ impl Stream for TsoRequestStream {
137142
{
138143
pending_requests
139144
} else {
140-
this.self_waker.register(cx.waker());
141145
return Poll::Pending;
142146
};
143147
let mut requests = Vec::new();
@@ -153,6 +157,7 @@ impl Stream for TsoRequestStream {
153157
}
154158

155159
if !requests.is_empty() {
160+
observe_tso_batch(requests.len());
156161
let req = TsoRequest {
157162
header: Some(RequestHeader {
158163
cluster_id: *this.cluster_id,
@@ -170,9 +175,11 @@ impl Stream for TsoRequestStream {
170175

171176
Poll::Ready(Some(req))
172177
} else {
173-
// Set the waker to the context, then the stream can be waked up after the pending queue
174-
// is no longer full.
175-
this.self_waker.register(cx.waker());
178+
// Register self waker only when blocked by a full pending queue.
179+
// When queue is not full, poll_recv above has already registered the receiver waker.
180+
if pending_requests.len() >= MAX_PENDING_COUNT {
181+
this.self_waker.register(cx.waker());
182+
}
176183
Poll::Pending
177184
}
178185
}

0 commit comments

Comments
 (0)