Commit f4603ab

Milerius and claude committed
perf(queue): add push_shared/pop_shared &self API, bench all Rust variants
- Add push_shared/pop_shared on RawRing taking &self instead of &mut self to avoid LLVM noalias interference in two-thread benchmarks
- Raw bench now tests all 3 Rust queues: mantis-inline (push_shared), mantis-copy (push/pop &self), rtrb (push/pop Result)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 854ae89

2 files changed: 175 additions & 12 deletions
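
Background for the rationale in the commit message: materializing `&mut` references to the same ring from two threads is undefined behavior on its own, and rustc additionally tags `&mut` arguments as LLVM `noalias`, which can let the optimizer assume no other thread writes through that memory. Handing each side a shared reference and mutating only through interior mutability avoids both problems. A minimal sketch of the before/after patterns, using a hypothetical `Indices` type rather than anything from this codebase:

    use std::sync::atomic::{AtomicU64, Ordering};

    // Hypothetical stand-in for the ring's shared state.
    struct Indices {
        tail: AtomicU64,
    }

    fn wait_for_data(addr: usize, head: u64) -> u64 {
        // Before (sketch): `let ix = unsafe { &mut *(addr as *mut Indices) };`
        // in both threads is aliasing UB, and noalias may let LLVM hoist the
        // tail load out of the spin loop as if no one else could store to it.
        //
        // After: reconstruct only a shared reference; the atomic supplies the
        // mutation, and each iteration performs a fresh acquire load.
        let ix = unsafe { &*(addr as *const Indices) };
        loop {
            let tail = ix.tail.load(Ordering::Acquire);
            if tail != head {
                return tail;
            }
        }
    }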

benchmarks/rust/src/raw_bench.rs (144 additions, 12 deletions)
@@ -86,10 +86,10 @@ fn run_raw(producer_core: usize, consumer_core: usize, total_ops: u64) -> u64 {
     // SpscRing uses CacheLine-colocated fields internally.
     let mut ring = SpscRing::<Msg, 1024>::new();

-    // Cast to usize for Send across threads.
-    // SAFETY: ring lives on this stack frame and we join both threads before returning.
-    // SPSC protocol guarantees disjoint access (producer: push, consumer: pop_into).
-    let ring_addr = &mut ring as *mut SpscRing<Msg, 1024> as usize;
+    // Use &self shared references — no &mut aliasing UB.
+    // SAFETY: SPSC protocol guarantees disjoint access. Ring lives on stack
+    // and we join both threads before returning.
+    let ring_addr = &ring as *const SpscRing<Msg, 1024> as usize;

     let consumer_ready = AtomicBool::new(false);
     let ready_addr = &consumer_ready as *const AtomicBool as usize;
@@ -104,12 +104,13 @@ fn run_raw(producer_core: usize, consumer_core: usize, total_ops: u64) -> u64 {
         let latency = unsafe { &*(latency_addr as *const AtomicU64) };
         ready.store(true, Ordering::Release);

-        let rb = unsafe { &mut *(ring_addr as *mut SpscRing<Msg, 1024>) };
+        // SAFETY: SPSC consumer — only pops. &self avoids noalias interference.
+        let rb = unsafe { &*(ring_addr as *const SpscRing<Msg, 1024>) };
         let mut msg = Msg::default();
         let mut sum: u64 = 0;
         let mut count: u64 = 0;
         while count < total_ops {
-            if unsafe { rb.pop_into(&mut msg as *mut Msg) } {
+            if unsafe { rb.pop_shared(&mut msg as *mut Msg) } {
                 let now = rdtsc();
                 sum += now - msg.timestamp;
                 count += 1;
@@ -124,7 +125,8 @@ fn run_raw(producer_core: usize, consumer_core: usize, total_ops: u64) -> u64 {
         let ready = unsafe { &*(ready_addr as *const AtomicBool) };
         while !ready.load(Ordering::Acquire) {}

-        let rb = unsafe { &mut *(ring_addr as *mut SpscRing<Msg, 1024>) };
+        // SAFETY: SPSC producer — only pushes. &self avoids noalias interference.
+        let rb = unsafe { &*(ring_addr as *const SpscRing<Msg, 1024>) };
         for i in 0..total_ops {
             let msg = Msg {
                 timestamp: rdtsc(),
@@ -136,7 +138,7 @@ fn run_raw(producer_core: usize, consumer_core: usize, total_ops: u64) -> u64 {
                 quantity: ((i & 0xFF) + 1) as i64,
                 order_id: i as i64,
             };
-            while !rb.push(msg) {}
+            while !unsafe { rb.push_shared(msg) } {}
         }
     });

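One detail the new code keeps: the ring address still crosses `thread::spawn`'s `'static` bound as a plain `usize`; only the reference rebuilt on the far side is now shared instead of exclusive. The smuggling pattern in isolation, sketched with a bare atomic standing in for the ring:

    use std::sync::atomic::{AtomicU64, Ordering};
    use std::thread;

    fn main() {
        let counter = AtomicU64::new(0);
        // usize is Send + 'static, so it can move into the closure; joining
        // before `counter` drops keeps the address valid for the whole run.
        let addr = &counter as *const AtomicU64 as usize;

        let t = thread::spawn(move || {
            // SAFETY: `counter` is alive until the join below; access is
            // shared-only, through the atomic.
            let c = unsafe { &*(addr as *const AtomicU64) };
            c.fetch_add(1, Ordering::Relaxed);
        });

        t.join().unwrap();
        assert_eq!(counter.load(Ordering::Relaxed), 1);
    }
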
@@ -146,14 +148,135 @@ fn run_raw(producer_core: usize, consumer_core: usize, total_ops: u64) -> u64 {
     total_latency.load(Ordering::Acquire)
 }

-/// Run multiple iterations and print cycles/op.
-pub fn run_raw_bench(producer_core: usize, consumer_core: usize, ops: u64, iterations: usize) {
+// ─── SpscRingCopy variant ────────────────────────────────────────────────────
+
+fn run_raw_copy(producer_core: usize, consumer_core: usize, total_ops: u64) -> u64 {
+    use mantis_queue::SpscRingCopy;
+
+    let ring = SpscRingCopy::<Msg, 1024>::new();
+    let ring_addr = &ring as *const SpscRingCopy<Msg, 1024> as usize;
+
+    let consumer_ready = AtomicBool::new(false);
+    let ready_addr = &consumer_ready as *const AtomicBool as usize;
+    let total_latency = AtomicU64::new(0);
+    let latency_addr = &total_latency as *const AtomicU64 as usize;
+
+    let consumer = thread::spawn(move || {
+        pin(consumer_core);
+        let ready = unsafe { &*(ready_addr as *const AtomicBool) };
+        let latency = unsafe { &*(latency_addr as *const AtomicU64) };
+        ready.store(true, Ordering::Release);
+
+        let rb = unsafe { &*(ring_addr as *const SpscRingCopy<Msg, 1024>) };
+        let mut msg = Msg::default();
+        let mut sum: u64 = 0;
+        let mut count: u64 = 0;
+        while count < total_ops {
+            if rb.pop(&mut msg) {
+                let now = rdtsc();
+                sum += now - msg.timestamp;
+                count += 1;
+            }
+        }
+        latency.store(sum, Ordering::Release);
+    });
+
+    let producer = thread::spawn(move || {
+        pin(producer_core);
+        let ready = unsafe { &*(ready_addr as *const AtomicBool) };
+        while !ready.load(Ordering::Acquire) {}
+
+        let rb = unsafe { &*(ring_addr as *const SpscRingCopy<Msg, 1024>) };
+        for i in 0..total_ops {
+            let mut msg = Msg {
+                timestamp: rdtsc(),
+                sequence: i,
+                symbol_id: (i & 0xFFF) as u32,
+                side: (i & 1) as u16,
+                _pad: 0,
+                price: (i * 100 + 1) as i64,
+                quantity: ((i & 0xFF) + 1) as i64,
+                order_id: i as i64,
+            };
+            while !rb.push(&msg) {}
+        }
+    });
+
+    producer.join().unwrap();
+    consumer.join().unwrap();
+    total_latency.load(Ordering::Acquire)
+}
+
+// ─── rtrb variant ────────────────────────────────────────────────────────────
+
+fn run_raw_rtrb(producer_core: usize, consumer_core: usize, total_ops: u64) -> u64 {
+    let (mut tx, mut rx) = rtrb::RingBuffer::<Msg>::new(1024);
+
+    let consumer_ready = AtomicBool::new(false);
+    let ready_addr = &consumer_ready as *const AtomicBool as usize;
+    let total_latency = AtomicU64::new(0);
+    let latency_addr = &total_latency as *const AtomicU64 as usize;
+
+    let consumer = thread::spawn(move || {
+        pin(consumer_core);
+        let ready = unsafe { &*(ready_addr as *const AtomicBool) };
+        let latency = unsafe { &*(latency_addr as *const AtomicU64) };
+        ready.store(true, Ordering::Release);
+
+        let mut sum: u64 = 0;
+        let mut count: u64 = 0;
+        while count < total_ops {
+            if let Ok(msg) = rx.pop() {
+                let now = rdtsc();
+                sum += now - msg.timestamp;
+                count += 1;
+            }
+        }
+        latency.store(sum, Ordering::Release);
+    });
+
+    let producer = thread::spawn(move || {
+        pin(producer_core);
+        let ready = unsafe { &*(ready_addr as *const AtomicBool) };
+        while !ready.load(Ordering::Acquire) {}
+
+        for i in 0..total_ops {
+            let msg = Msg {
+                timestamp: rdtsc(),
+                sequence: i,
+                symbol_id: (i & 0xFFF) as u32,
+                side: (i & 1) as u16,
+                _pad: 0,
+                price: (i * 100 + 1) as i64,
+                quantity: ((i & 0xFF) + 1) as i64,
+                order_id: i as i64,
+            };
+            while tx.push(msg).is_err() {}
+        }
+    });
+
+    producer.join().unwrap();
+    consumer.join().unwrap();
+    total_latency.load(Ordering::Acquire)
+}
+
+// ─── Runner ──────────────────────────────────────────────────────────────────
+
+fn run_variant(
+    name: &str,
+    run_fn: fn(usize, usize, u64) -> u64,
+    producer_core: usize,
+    consumer_core: usize,
+    ops: u64,
+    iterations: usize,
+) {
+    eprintln!("[{name}]");
     // Warmup
-    let _ = run_raw(producer_core, consumer_core, ops);
+    let _ = run_fn(producer_core, consumer_core, ops);

     let mut best = u64::MAX;
     for i in 1..=iterations {
-        let total_cycles = run_raw(producer_core, consumer_core, ops);
+        let total_cycles = run_fn(producer_core, consumer_core, ops);
         let cycles_per_op = total_cycles as f64 / ops as f64;
         if total_cycles < best {
             best = total_cycles;
@@ -163,3 +286,12 @@ pub fn run_raw_bench(producer_core: usize, consumer_core: usize, ops: u64, iterations: usize) {
     let best_per_op = best as f64 / ops as f64;
     eprintln!(" BEST: {best_per_op:.1} cycles/op");
 }
+
+/// Run all variants or a specific one.
+pub fn run_raw_bench(producer_core: usize, consumer_core: usize, ops: u64, iterations: usize) {
+    run_variant("mantis-inline (push_shared/pop_shared)", run_raw, producer_core, consumer_core, ops, iterations);
+    eprintln!();
+    run_variant("mantis-copy (push/pop &self)", run_raw_copy, producer_core, consumer_core, ops, iterations);
+    eprintln!();
+    run_variant("rtrb (push/pop Result)", run_raw_rtrb, producer_core, consumer_core, ops, iterations);
+}

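A note on reading the output: `run_variant` reports best-case TSC cycles per op, and turning that into wall-clock time needs the machine's TSC frequency, which the harness does not measure. A trivial helper for that conversion; the 3 GHz in the comment is an assumed example, not a measured value:

    /// Cycles-per-op to nanoseconds-per-op, given the TSC frequency in GHz.
    fn cycles_to_ns(cycles_per_op: f64, tsc_ghz: f64) -> f64 {
        cycles_per_op / tsc_ghz
    }

    // Example: 30.0 cycles/op at an assumed 3.0 GHz TSC is 10 ns/op.
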
crates/queue/src/handle.rs (31 additions, 0 deletions)
@@ -280,6 +280,37 @@ where
         unsafe { self.engine.pop(out) }
     }

+    /// Push via shared reference. Returns `true` on success, `false` if full.
+    ///
+    /// # Safety
+    ///
+    /// The caller must uphold the SPSC protocol: exactly one thread calls
+    /// `push_shared` (the producer), and a different thread calls `pop_shared`
+    /// (the consumer). No two threads may call the same method concurrently.
+    #[expect(
+        unsafe_code,
+        reason = "SPSC shared-reference push for zero-overhead two-thread use"
+    )]
+    #[inline(always)]
+    pub unsafe fn push_shared(&self, value: T) -> bool {
+        self.engine.push(value)
+    }
+
+    /// Pop via shared reference into `out`. Returns `true` on success, `false` if empty.
+    ///
+    /// # Safety
+    ///
+    /// Same SPSC contract as `push_shared`. Additionally, `out` must be
+    /// a valid, writeable, properly aligned pointer to `T`.
+    #[expect(
+        unsafe_code,
+        reason = "SPSC shared-reference pop for zero-overhead two-thread use"
+    )]
+    #[inline(always)]
+    pub unsafe fn pop_shared(&self, out: *mut T) -> bool {
+        unsafe { self.engine.pop(out) }
+    }
+
     /// Number of elements currently in the ring.
     #[must_use]
     pub fn len(&self) -> usize {

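For reviewers, a minimal end-to-end sketch of the new pair's calling convention, mirroring the address-smuggling pattern from raw_bench.rs. It assumes `SpscRing` from mantis_queue exposes these handle methods (the updated benchmark calls them on that type) and uses a `u64` payload for brevity:

    use mantis_queue::SpscRing;
    use std::thread;

    fn smoke_test() {
        let ring = SpscRing::<u64, 1024>::new();
        let addr = &ring as *const SpscRing<u64, 1024> as usize;

        let consumer = thread::spawn(move || {
            // SAFETY: sole consumer; `ring` outlives the join below.
            let rb = unsafe { &*(addr as *const SpscRing<u64, 1024>) };
            let mut out = 0u64;
            while !unsafe { rb.pop_shared(&mut out as *mut u64) } {}
            out
        });

        // SAFETY: sole producer, pushing through &self on the original binding.
        while !unsafe { ring.push_shared(7) } {}
        assert_eq!(consumer.join().unwrap(), 7);
    }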