bench: nanosecond-precision timing and 10x N for stable CI signal

elijahr · elijahr · commit 439ee2c55c35 · 2026-04-29T18:34:16.000-05:00
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
@@ -79,29 +79,28 @@ jobs:
       - name: Compile bench_throughput (CI-tuned run shape)
         # Cloud runs use a tighter wall-clock budget than the local default
         # (1M messages × 33 runs). Bounded variants (sipsic, channels) use
-        # 100k × 5 runs and finish in single-digit seconds. unbounded_mupsic
+        # 1M × 5 runs and finish in single-digit seconds. unbounded_mupsic
         # is super-linear in message count and is gated separately via
-        # UnboundedMupsicMessageCount=50000 + UnboundedMupsicRuns=3 to keep
+        # UnboundedMupsicMessageCount=500000 + UnboundedMupsicRuns=3 to keep
         # the variant tractable inside the 30-min job budget. Warmup stays
         # at 2 to absorb JIT/cache effects.
         #
-        # Why 50000 (not 5000): bench_throughput.nim uses a millisecond-
-        # precision timer. At N=5000 the unbounded_mupsic variant finishes
-        # in well under 1 ms per run, so ops/ms is reported as `inf`, the
-        # BMF adapter discards the data points, and Bencher receives nothing
-        # for the variant we actually want to gate on. Local smoke at N=50000
-        # finishes in ~0.26s with stable per-topology numbers (1P/1C ~9.4k
-        # ops/ms, 2P/1C ~10.8k, 4P/1C ~8.9k). 50K × 3 runs × 3 topologies
-        # plus warmup costs roughly 4-10s on CI: well inside the 30-min job
-        # budget while giving the timer enough wall-clock to produce a
-        # stable signal.
+        # Counts are 10x the previous values (100k / 50k → 1M / 500k) and
+        # the bench timer is nanosecond-precision (elapsed time measured in
+        # ns, printed via `:.1f`, e.g. `mean: 16666.7 ops/ms`). Together these give
+        # non-zero stddev and one-decimal resolution between runs on the fast CI
+        # runner; previously a 50k unbounded_mupsic run completed in ~3 ms,
+        # so multiple samples bucketed into the same integer ms and stddev
+        # was reported as 0. The new shape costs roughly 30-60s on CI: well
+        # inside the 30-min job budget while giving the timer enough
+        # wall-clock spread to produce a stable, comparable signal.
         run: |
           nim c -d:release -d:danger --threads:on \
-            -d:MessageCount=100000 \
+            -d:MessageCount=1000000 \
             -d:DefaultRuns=5 \
             -d:WarmupRuns=2 \
             -d:UnboundedMupsicRuns=3 \
-            -d:UnboundedMupsicMessageCount=50000 \
+            -d:UnboundedMupsicMessageCount=500000 \
             benchmarks/nim/bench_throughput.nim
 
       - name: Run bench_throughput
diff --git a/benchmarks/nim/bench_throughput.nim b/benchmarks/nim/bench_throughput.nim
@@ -80,6 +80,9 @@ proc runThroughputBenchmark*[Q](
 
   # Start timing. Monotonic clock — `epochTime` (wall clock) can step
   # backward across NTP adjustments and skew throughput numbers.
+  # Nanosecond precision: a ms-precision timer buckets multiple short
+  # runs into the same integer ms, producing identical samples and
+  # stddev=0 on a fast CI runner. ops/ms is derived from elapsed ns below.
   let startTime = getMonoTime()
 
   # Launch threads
@@ -94,8 +97,8 @@ proc runThroughputBenchmark*[Q](
   for i in 0 ..< numConsumers:
     joinThread(consumerThreads[i])
 
-  let elapsedMs = float(inMilliseconds(getMonoTime() - startTime))
-  result = float(messageCount) / elapsedMs
+  let elapsedNs = float(inNanoseconds(getMonoTime() - startTime))
+  result = float(messageCount) * 1_000_000.0 / elapsedNs
 
 proc benchmarkThroughput*[Q](
     initQueue: proc(): Q,
@@ -185,6 +188,7 @@ proc runMupmucBenchmark[N, P, C: static int, T](
 
   # Start timing. Monotonic clock — `epochTime` (wall clock) can step
   # backward across NTP adjustments and skew throughput numbers.
+  # See runThroughputBenchmark for the nanosecond-precision rationale.
   let startTime = getMonoTime()
 
   # Launch threads
@@ -199,8 +203,8 @@ proc runMupmucBenchmark[N, P, C: static int, T](
   for i in 0 ..< C:
     joinThread(consumerThreads[i])
 
-  let elapsedMs = float(inMilliseconds(getMonoTime() - startTime))
-  result = float(messageCount) / elapsedMs
+  let elapsedNs = float(inNanoseconds(getMonoTime() - startTime))
+  result = float(messageCount) * 1_000_000.0 / elapsedNs
 
 # Fixed-size Mupmuc benchmark functions for common thread counts
 proc benchmarkMupmuc1P1C*(
@@ -355,8 +359,8 @@ proc runUnboundedMupsicBenchmark[S: static int, T; MaxThreads: static int](
     joinThread(producerThreads[i])
   joinThread(consumerThread)
 
-  let elapsedMs = float(inMilliseconds(getMonoTime() - startTime))
-  result = float(totalMessages) / elapsedMs
+  let elapsedNs = float(inNanoseconds(getMonoTime() - startTime))
+  result = float(totalMessages) * 1_000_000.0 / elapsedNs
 
 proc benchmarkUnboundedMupsicNP1C(
     numProducers: int,
@@ -446,8 +450,8 @@ when isMainModule:
       numConsumers = 1,
       runs = 10,
     )
-    echo fmt"  mean: {sipsicMetrics.mean:.0f} ops/ms"
-    echo fmt"  stddev: {sipsicMetrics.stddev:.0f}"
+    echo fmt"  mean: {sipsicMetrics.mean:.1f} ops/ms"
+    echo fmt"  stddev: {sipsicMetrics.stddev:.1f}"
     echo ""
 
   # Mupmuc (bounded MPMC)
@@ -464,8 +468,8 @@ when isMainModule:
           benchmarkMupmuc4P4C(runs = 10)
         else:
           ThroughputMetrics()
-      echo fmt"  mean: {metrics.mean:.0f} ops/ms"
-      echo fmt"  stddev: {metrics.stddev:.0f}"
+      echo fmt"  mean: {metrics.mean:.1f} ops/ms"
+      echo fmt"  stddev: {metrics.stddev:.1f}"
       echo ""
 
   # UnboundedMupsic (unbounded MPSC) — new harness, 33 runs.
@@ -485,9 +489,9 @@ when isMainModule:
           benchmarkUnboundedMupsic4P1C()
         else:
           ThroughputMetrics()
-      echo fmt"  mean: {metrics.mean:.0f} ops/ms"
-      echo fmt"  min: {metrics.min:.0f}  max: {metrics.max:.0f}"
-      echo fmt"  stddev: {metrics.stddev:.0f}"
+      echo fmt"  mean: {metrics.mean:.1f} ops/ms"
+      echo fmt"  min: {metrics.min:.1f}  max: {metrics.max:.1f}"
+      echo fmt"  stddev: {metrics.stddev:.1f}"
       echo ""
     echo "==================================================="
     echo ""
@@ -503,6 +507,6 @@ when isMainModule:
         numConsumers = threads,
         runs = 10,
       )
-      echo fmt"  mean: {metrics.mean:.0f} ops/ms"
-      echo fmt"  stddev: {metrics.stddev:.0f}"
+      echo fmt"  mean: {metrics.mean:.1f} ops/ms"
+      echo fmt"  stddev: {metrics.stddev:.1f}"
       echo ""
diff --git a/benchmarks/test_bmf_adapter.py b/benchmarks/test_bmf_adapter.py
@@ -193,6 +193,39 @@ def test_unknown_variant_is_skipped(self) -> None:
         actual = bmf_adapter.parse_bench_output(fixture)
         self.assertEqual(actual, expected)
 
+    def test_decimal_means_and_minmax(self) -> None:
+        """Nanosecond-precision timer output is printed as `:.1f` decimal values.
+
+        After the bench switched from `inMilliseconds` to `inNanoseconds`,
+        means and min/max gain a fractional component (e.g. 16666.7 instead
+        of 16667.). The adapter must capture decimal floats verbatim.
+        """
+        fixture = textwrap.dedent(
+            """\
+            Sipsic (bounded SPSC) 1P/1C:
+              mean: 12345.6 ops/ms
+              stddev: 12.3
+
+            UnboundedMupsic (unbounded MPSC) 2P/1C:
+              mean: 16666.7 ops/ms
+              min: 15555.4  max: 17777.8
+              stddev: 678.9
+            """
+        )
+
+        expected = {
+            "sipsic/1p1c": {"throughput": {"value": 12345.6}},
+            "unbounded_mupsic/2p1c": {
+                "throughput": {
+                    "value": 16666.7,
+                    "lower_value": 15555.4,
+                    "upper_value": 17777.8,
+                }
+            },
+        }
+        actual = bmf_adapter.parse_bench_output(fixture)
+        self.assertEqual(actual, expected)
+
     def test_partial_min_max_finite(self) -> None:
         """If only min is finite (max is inf), only lower_value is recorded."""
         fixture = textwrap.dedent(