Skip to content

Commit 439ee2c

Browse files
committed
bench: microsecond-precision timing and 10x N for stable CI signal
1 parent 1a5b6e1 commit 439ee2c

3 files changed

Lines changed: 65 additions & 29 deletions

File tree

.github/workflows/bench.yml

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -79,29 +79,28 @@ jobs:
7979
- name: Compile bench_throughput (CI-tuned run shape)
8080
# Cloud runs use a tighter wall-clock budget than the local default
8181
# (1M messages × 33 runs). Bounded variants (sipsic, channels) use
82-
# 100k × 5 runs and finish in single-digit seconds. unbounded_mupsic
82+
# 1M × 5 runs and finish in single-digit seconds. unbounded_mupsic
8383
# is super-linear in message count and is gated separately via
84-
# UnboundedMupsicMessageCount=50000 + UnboundedMupsicRuns=3 to keep
84+
# UnboundedMupsicMessageCount=500000 + UnboundedMupsicRuns=3 to keep
8585
# the variant tractable inside the 30-min job budget. Warmup stays
8686
# at 2 to absorb JIT/cache effects.
8787
#
88-
# Why 50000 (not 5000): bench_throughput.nim uses a millisecond-
89-
# precision timer. At N=5000 the unbounded_mupsic variant finishes
90-
# in well under 1 ms per run, so ops/ms is reported as `inf`, the
91-
# BMF adapter discards the data points, and Bencher receives nothing
92-
# for the variant we actually want to gate on. Local smoke at N=50000
93-
# finishes in ~0.26s with stable per-topology numbers (1P/1C ~9.4k
94-
# ops/ms, 2P/1C ~10.8k, 4P/1C ~8.9k). 50K × 3 runs × 3 topologies
95-
# plus warmup costs roughly 4-10s on CI: well inside the 30-min job
96-
# budget while giving the timer enough wall-clock to produce a
97-
# stable signal.
88+
# Counts are 10x the previous values (100k / 50k → 1M / 500k) and
89+
# the bench timer is microsecond/nanosecond-precision (computed in
90+
# ns, printed as `mean: <N>.1f ops/ms`). Together this gives non-zero
91+
# stddev and >1-decimal resolution between runs on the fast CI
92+
# runner; previously a 50k unbounded_mupsic run completed in ~3 ms,
93+
# so multiple samples bucketed into the same integer ms and stddev
94+
# was reported as 0. The new shape costs roughly 30-60s on CI: well
95+
# inside the 30-min job budget while giving the timer enough
96+
# wall-clock spread to produce a stable, comparable signal.
9897
run: |
9998
nim c -d:release -d:danger --threads:on \
100-
-d:MessageCount=100000 \
99+
-d:MessageCount=1000000 \
101100
-d:DefaultRuns=5 \
102101
-d:WarmupRuns=2 \
103102
-d:UnboundedMupsicRuns=3 \
104-
-d:UnboundedMupsicMessageCount=50000 \
103+
-d:UnboundedMupsicMessageCount=500000 \
105104
benchmarks/nim/bench_throughput.nim
106105
107106
- name: Run bench_throughput

benchmarks/nim/bench_throughput.nim

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ proc runThroughputBenchmark*[Q](
8080

8181
# Start timing. Monotonic clock — `epochTime` (wall clock) can step
8282
# backward across NTP adjustments and skew throughput numbers.
83+
# Nanosecond precision: ms-precision buckets multiple short runs into
84+
# the same integer ms, producing identical samples and stddev=0 on a
85+
# fast CI runner. ops/ms is reconstructed as a float at print time.
8386
let startTime = getMonoTime()
8487

8588
# Launch threads
@@ -94,8 +97,8 @@ proc runThroughputBenchmark*[Q](
9497
for i in 0 ..< numConsumers:
9598
joinThread(consumerThreads[i])
9699

97-
let elapsedMs = float(inMilliseconds(getMonoTime() - startTime))
98-
result = float(messageCount) / elapsedMs
100+
let elapsedNs = float(inNanoseconds(getMonoTime() - startTime))
101+
result = float(messageCount) * 1_000_000.0 / elapsedNs
99102

100103
proc benchmarkThroughput*[Q](
101104
initQueue: proc(): Q,
@@ -185,6 +188,7 @@ proc runMupmucBenchmark[N, P, C: static int, T](
185188

186189
# Start timing. Monotonic clock — `epochTime` (wall clock) can step
187190
# backward across NTP adjustments and skew throughput numbers.
191+
# See runThroughputBenchmark for the nanosecond-precision rationale.
188192
let startTime = getMonoTime()
189193

190194
# Launch threads
@@ -199,8 +203,8 @@ proc runMupmucBenchmark[N, P, C: static int, T](
199203
for i in 0 ..< C:
200204
joinThread(consumerThreads[i])
201205

202-
let elapsedMs = float(inMilliseconds(getMonoTime() - startTime))
203-
result = float(messageCount) / elapsedMs
206+
let elapsedNs = float(inNanoseconds(getMonoTime() - startTime))
207+
result = float(messageCount) * 1_000_000.0 / elapsedNs
204208

205209
# Fixed-size Mupmuc benchmark functions for common thread counts
206210
proc benchmarkMupmuc1P1C*(
@@ -355,8 +359,8 @@ proc runUnboundedMupsicBenchmark[S: static int, T; MaxThreads: static int](
355359
joinThread(producerThreads[i])
356360
joinThread(consumerThread)
357361

358-
let elapsedMs = float(inMilliseconds(getMonoTime() - startTime))
359-
result = float(totalMessages) / elapsedMs
362+
let elapsedNs = float(inNanoseconds(getMonoTime() - startTime))
363+
result = float(totalMessages) * 1_000_000.0 / elapsedNs
360364

361365
proc benchmarkUnboundedMupsicNP1C(
362366
numProducers: int,
@@ -446,8 +450,8 @@ when isMainModule:
446450
numConsumers = 1,
447451
runs = 10,
448452
)
449-
echo fmt" mean: {sipsicMetrics.mean:.0f} ops/ms"
450-
echo fmt" stddev: {sipsicMetrics.stddev:.0f}"
453+
echo fmt" mean: {sipsicMetrics.mean:.1f} ops/ms"
454+
echo fmt" stddev: {sipsicMetrics.stddev:.1f}"
451455
echo ""
452456

453457
# Mupmuc (bounded MPMC)
@@ -464,8 +468,8 @@ when isMainModule:
464468
benchmarkMupmuc4P4C(runs = 10)
465469
else:
466470
ThroughputMetrics()
467-
echo fmt" mean: {metrics.mean:.0f} ops/ms"
468-
echo fmt" stddev: {metrics.stddev:.0f}"
471+
echo fmt" mean: {metrics.mean:.1f} ops/ms"
472+
echo fmt" stddev: {metrics.stddev:.1f}"
469473
echo ""
470474

471475
# UnboundedMupsic (unbounded MPSC) — new harness, 33 runs.
@@ -485,9 +489,9 @@ when isMainModule:
485489
benchmarkUnboundedMupsic4P1C()
486490
else:
487491
ThroughputMetrics()
488-
echo fmt" mean: {metrics.mean:.0f} ops/ms"
489-
echo fmt" min: {metrics.min:.0f} max: {metrics.max:.0f}"
490-
echo fmt" stddev: {metrics.stddev:.0f}"
492+
echo fmt" mean: {metrics.mean:.1f} ops/ms"
493+
echo fmt" min: {metrics.min:.1f} max: {metrics.max:.1f}"
494+
echo fmt" stddev: {metrics.stddev:.1f}"
491495
echo ""
492496
echo "==================================================="
493497
echo ""
@@ -503,6 +507,6 @@ when isMainModule:
503507
numConsumers = threads,
504508
runs = 10,
505509
)
506-
echo fmt" mean: {metrics.mean:.0f} ops/ms"
507-
echo fmt" stddev: {metrics.stddev:.0f}"
510+
echo fmt" mean: {metrics.mean:.1f} ops/ms"
511+
echo fmt" stddev: {metrics.stddev:.1f}"
508512
echo ""

benchmarks/test_bmf_adapter.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,39 @@ def test_unknown_variant_is_skipped(self) -> None:
193193
actual = bmf_adapter.parse_bench_output(fixture)
194194
self.assertEqual(actual, expected)
195195

196+
def test_decimal_means_and_minmax(self) -> None:
197+
"""Microsecond/nanosecond-precision timer emits `:.1f` decimal values.
198+
199+
After the bench switched from `inMilliseconds` to `inNanoseconds`,
200+
means and min/max gain a fractional component (e.g. 16666.7 instead
201+
of 16667.). The adapter must capture decimal floats verbatim.
202+
"""
203+
fixture = textwrap.dedent(
204+
"""\
205+
Sipsic (bounded SPSC) 1P/1C:
206+
mean: 12345.6 ops/ms
207+
stddev: 12.3
208+
209+
UnboundedMupsic (unbounded MPSC) 2P/1C:
210+
mean: 16666.7 ops/ms
211+
min: 15555.4 max: 17777.8
212+
stddev: 678.9
213+
"""
214+
)
215+
216+
expected = {
217+
"sipsic/1p1c": {"throughput": {"value": 12345.6}},
218+
"unbounded_mupsic/2p1c": {
219+
"throughput": {
220+
"value": 16666.7,
221+
"lower_value": 15555.4,
222+
"upper_value": 17777.8,
223+
}
224+
},
225+
}
226+
actual = bmf_adapter.parse_bench_output(fixture)
227+
self.assertEqual(actual, expected)
228+
196229
def test_partial_min_max_finite(self) -> None:
197230
"""If only min is finite (max is inf), only lower_value is recorded."""
198231
fixture = textwrap.dedent(

0 commit comments

Comments
 (0)