diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..7cdd960e --- /dev/null +++ b/.gitattributes @@ -0,0 +1,11 @@ +# Vendored third-party sources are excluded from GitHub language stats +# and code-search noise per design doc §4.6. +benchmarks/vendor/** linguist-vendored=true linguist-generated=true +docs/assets/uplot-*.js linguist-vendored=true linguist-generated=true +docs/assets/uplot-*.css linguist-vendored=true linguist-generated=true +# Auto-emitted BMF snapshots (Track 5 PR 5 Task 5.5.b). Marked +# `linguist-generated=true` so they do not skew GitHub's language +# stats or appear in code-search noise. NOT marked `linguist-vendored` +# because they are produced by our own CI, not imported from a +# third-party project. +docs/assets/bench-results/*.json linguist-generated=true diff --git a/.github/workflows/bench-comparison.yml b/.github/workflows/bench-comparison.yml new file mode 100644 index 00000000..35aa03b9 --- /dev/null +++ b/.github/workflows/bench-comparison.yml @@ -0,0 +1,227 @@ +# yamllint disable rule:line-length + +name: bench-comparison + +# yamllint disable rule:truthy +on: + workflow_dispatch: + schedule: + # Nightly at 04:00 UTC. Off-peak so the runner pool is fresh; the + # cdylib build adds ~30s vs the in-tree benches. + - cron: '0 4 * * *' + push: + branches: [devel] + paths: + - 'benchmarks/rust/**' + - 'benchmarks/nim/adapters/crossbeam_*' + - '.github/workflows/bench-comparison.yml' +# yamllint enable rule:truthy + +env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + BENCHER_API_TOKEN: ${{ secrets.BENCHER_API_TOKEN }} + CARGO_TERM_COLOR: always + +jobs: + bench-crossbeam: + # Track 3 §3.13: Crossbeam-only comparison workflow. + # + # Crossbeam adds a Rust toolchain dependency (~5 min cold install), + # which would inflate the bench.yml critical path on every PR. 
Per + # design 2.6 / impl plan 3.13 it lives in this dedicated workflow, + # gated on nightly cron + workflow_dispatch + targeted path pushes + # to devel only. PR check feedback for crossbeam comes from this + # workflow's separate Bencher report. + name: bench-crossbeam (ubuntu-latest) + runs-on: ubuntu-latest + # Mirror bench.yml's bench-upload permissions: bencher run uses + # `--github-actions ${{ secrets.GITHUB_TOKEN }}` to publish check + # runs / PR annotations, which require explicit `checks: write` and + # `pull-requests: write` on repos with default read-only token + # permissions. `contents: read` and `actions: read` cover + # actions/checkout and any future artifact downloads. + permissions: + contents: read + actions: read + pull-requests: write + checks: write + timeout-minutes: 30 + + steps: + - name: Checkout project + uses: actions/checkout@v4 + + - name: Setup Nim + uses: jiro4989/setup-nim-action@v2 + with: + nim-version: 'stable' + + - name: Install build deps (Linux) + run: | + sudo apt-get update -q -y + sudo apt-get -qq install -y clang + + - name: Clone and install sibling Nim deps (nim-debra, nim-typestates) + # Mirrors bench.yml -- nim.cfg in src/ resolves these via + # ../nim-debra/src and ../nim-typestates/src. Pin to release + # tags (not `main`) for deterministic CI; bump in lockstep with + # build.yml/bench.yml and lockfreequeues.nimble's `requires`. + run: | + set -e + cd .. 
+ git clone --depth 1 --branch v0.6.0 https://github.com/elijahr/nim-debra.git + git clone --depth 1 --branch v0.7.0 https://github.com/elijahr/nim-typestates.git + (cd nim-typestates && nimble install -y) + (cd nim-debra && nimble install -y) + + - name: Vendor unittest2 + run: git clone --depth 1 https://github.com/status-im/nim-unittest2.git deps/unittest2 + + - name: Setup Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache Rust build artifacts + uses: Swatinem/rust-cache@v2 + with: + workspaces: benchmarks/rust/bench-ffi-crossbeam + + - name: Build crossbeam cdylib + # Produces target/release/libbench_ffi_crossbeam.so on Linux. + # The Nim adapters' default {.passL.} includes -L for this path + # so no further wiring is needed. + run: | + cargo build --release \ + --manifest-path benchmarks/rust/bench-ffi-crossbeam/Cargo.toml + + - name: Run cdylib integration tests + # Sanity-check the C-ABI surface before paying for the bench + # compile. Runs in <1s; fails the workflow on regression. + run: | + cargo test --release \ + --manifest-path benchmarks/rust/bench-ffi-crossbeam/Cargo.toml + + - name: Smoke crossbeam adapters + # Compile-and-run sanity check that the adapters can resolve the + # cdylib symbols at runtime before we spend ~10 min on bench + # binaries with the same defines. + run: | + set -eu + nim c -d:release -d:danger --threads:on \ + -d:adapter_crossbeam_array_queue_available \ + -d:adapter_crossbeam_seg_queue_available \ + --passL:"-Wl,-rpath,$(pwd)/benchmarks/rust/bench-ffi-crossbeam/target/release" \ + -o:.tmp/smoke_crossbeam \ + benchmarks/nim/smoke/smoke_crossbeam.nim + ./.tmp/smoke_crossbeam + + - name: Compile bench_mpmc with crossbeam_array_queue + # CI run shape mirrors bench.yml's bench_mpmc settings (1M / 5 / 2). + # rpath ensures the dylib loads from the in-tree build dir at + # runtime without needing LD_LIBRARY_PATH. 
+ run: | + set -eu + nim c -d:release -d:danger --threads:on \ + -d:BenchMpmcMessageCount=1000000 \ + -d:BenchMpmcRuns=5 \ + -d:BenchMpmcWarmup=2 \ + -d:adapter_crossbeam_array_queue_available \ + --passL:"-Wl,-rpath,$(pwd)/benchmarks/rust/bench-ffi-crossbeam/target/release" \ + benchmarks/nim/bench_mpmc.nim + + - name: Compile bench_unbounded with crossbeam_seg_queue + run: | + set -eu + nim c -d:release -d:danger --threads:on \ + -d:UnboundedSipsicMessageCount=500000 \ + -d:UnboundedSipsicRuns=3 \ + -d:UnboundedSipmucMessageCount=500000 \ + -d:UnboundedSipmucRuns=3 \ + -d:UnboundedMupsicMessageCount=500000 \ + -d:UnboundedMupsicRuns=3 \ + -d:UnboundedMupmucMessageCount=500000 \ + -d:UnboundedMupmucRuns=3 \ + -d:BenchUnboundedWarmup=2 \ + -d:adapter_crossbeam_seg_queue_available \ + --passL:"-Wl,-rpath,$(pwd)/benchmarks/rust/bench-ffi-crossbeam/target/release" \ + benchmarks/nim/bench_unbounded.nim + + - name: Run bench_mpmc (crossbeam_array_queue only) + timeout-minutes: 12 + run: | + ./.tmp/bench_mpmc crossbeam_array_queue \ + --bmf-out=bench_mpmc_crossbeam.json \ + | tee bench_mpmc_crossbeam_output.txt + + - name: Run bench_unbounded (crossbeam_seg_queue only) + timeout-minutes: 12 + run: | + ./.tmp/bench_unbounded crossbeam_seg_queue \ + --bmf-out=bench_unbounded_crossbeam.json \ + | tee bench_unbounded_crossbeam_output.txt + + - name: Merge BMF JSON + run: | + python3 benchmarks/merge_bmf.py merged.json \ + bench_mpmc_crossbeam.json \ + bench_unbounded_crossbeam.json + + - name: Show BMF JSON (debug) + run: cat merged.json + + - name: Upload BMF artifact + uses: actions/upload-artifact@v4 + with: + name: bench-comparison-crossbeam-bmf + path: merged.json + + - name: Install Bencher CLI + uses: bencherdev/bencher@main + + - name: Bencher token preflight + run: | + if [ -z "$BENCHER_API_TOKEN" ]; then + echo "::warning title=Bencher upload skipped::BENCHER_API_TOKEN is not set; bench-comparison ran but did not upload." 
+ else + echo "Bencher token present; proceeding with upload." + fi + + - name: Track scheduled benchmarks with Bencher + if: github.event_name == 'schedule' && env.BENCHER_API_TOKEN != '' + run: | + bencher run \ + --project lockfreequeues \ + --token '${{ secrets.BENCHER_API_TOKEN }}' \ + --branch devel \ + --testbed ubuntu-latest \ + --threshold-measure throughput_ops_ms \ + --threshold-test t_test \ + --threshold-max-sample-size 64 \ + --threshold-lower-boundary 0.99 \ + --thresholds-reset \ + --adapter json \ + --file merged.json \ + --github-actions '${{ secrets.GITHUB_TOKEN }}' \ + --err + + - name: Track devel-push benchmarks with Bencher + if: github.event_name == 'push' && env.BENCHER_API_TOKEN != '' + run: | + bencher run \ + --project lockfreequeues \ + --token '${{ secrets.BENCHER_API_TOKEN }}' \ + --branch devel \ + --testbed ubuntu-latest \ + --adapter json \ + --file merged.json \ + --github-actions '${{ secrets.GITHUB_TOKEN }}' + + - name: Track manual benchmarks with Bencher + if: github.event_name == 'workflow_dispatch' && env.BENCHER_API_TOKEN != '' + run: | + bencher run \ + --project lockfreequeues \ + --token '${{ secrets.BENCHER_API_TOKEN }}' \ + --branch "${GITHUB_REF##*/}" \ + --testbed ubuntu-latest \ + --adapter json \ + --file merged.json diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index f220cc43..dd55c298 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -4,22 +4,54 @@ name: bench # yamllint disable rule:truthy on: + # Only fire on PRs that actually target an integration branch. Stacked + # feature-to-feature PRs (`feat/foo` -> `feat/bar`) used to be in this + # allowlist; they have been removed because cascade-rebases would + # otherwise burn Bencher quota uploading near-identical reports for + # every intermediate stack PR. The topmost PR (base=devel) still gets + # PR-time bench treatment, gated further by the `bench` label below. 
pull_request: - branches: - - main - - devel - - 'feat/**' - - 'perf/**' - - 'fix/**' + branches: [main, devel] paths-ignore: - '*.md' - '.github/workflows/docs.yml' + # Loop-prevention layer 1 of 2 (design §5.X): the snapshot push + # writes BMF JSON under docs/assets/bench-results/. paths-ignore + # makes the bench job skip any push consisting solely of these + # snapshot files, so the snapshot commit cannot retrigger bench. + # The companion guard is the bot-actor `if:` on the matrix and + # bench-upload jobs below. The snapshot commit has NO `[skip ci]` + # marker because docs.yml has no paths-ignore and must run on + # snapshot pushes to deploy the fresh chart data to Pages. + - 'docs/assets/bench-results/**' push: branches: [main, devel] paths-ignore: - '*.md' - '.github/workflows/docs.yml' + - 'docs/assets/bench-results/**' workflow_dispatch: + inputs: + force_skip_boost: + description: 'Force boost soft-skip (Track 3 §2.6 acceptance)' + type: boolean + default: false + force_skip_loony: + description: 'Force loony soft-skip (Track 3 §2.6 acceptance)' + type: boolean + default: false + force_skip_moodycamel: + description: 'Force MoodyCamel soft-skip (Track 4 §4.7 acceptance)' + type: boolean + default: false + force_skip_threading_channels: + description: 'Force threading.Chan soft-skip (Track 4 §4.7 acceptance)' + type: boolean + default: false + force_skip_nim_channel: + description: 'Force system.Channel soft-skip (Track 4 §4.7 acceptance)' + type: boolean + default: false # yamllint enable rule:truthy env: @@ -28,25 +60,107 @@ env: # presence via `env.BENCHER_API_TOKEN`. GitHub Actions does not allow # `secrets.*` directly in `if:`, so the env-var indirection is required. BENCHER_API_TOKEN: ${{ secrets.BENCHER_API_TOKEN }} + # Track 3 §2.6 + Track 4 §4.7 soft-skip overrides. Empty string when + # the workflow_dispatch input is unset (the default), '1' when the + # operator forces a skip. Step-level `if:` clauses gate on these. 
+ FORCE_SKIP_BOOST: ${{ inputs.force_skip_boost && '1' || '' }} + FORCE_SKIP_LOONY: ${{ inputs.force_skip_loony && '1' || '' }} + FORCE_SKIP_MOODYCAMEL: ${{ inputs.force_skip_moodycamel && '1' || '' }} + FORCE_SKIP_THREADING_CHANNELS: ${{ inputs.force_skip_threading_channels && '1' || '' }} + FORCE_SKIP_NIM_CHANNEL: ${{ inputs.force_skip_nim_channel && '1' || '' }} jobs: - benchmark: - name: Throughput bench (ubuntu-latest) + bench-tests: + # Compiles and runs `tests/t_bench_*.nim` (the bench harness's own + # unit tests: t_bench_common, t_bench_latency, t_bench_adapters). + # These tests live outside `srcDir` and are intentionally NOT + # imported by tests/test.nim (see lockfreequeues.nimble) so the + # regular `nimble test` matrix (8 invocations across MM/sanitizer + # combos) does not pull in the bench harness's threading/atomic + # dependencies. Without this job the HistogramTopK / latency CLI / + # adapter assertions are never enforced in CI. + # + # No secret dependencies: fork PRs are allowed so contributors get + # the same harness validation as maintainer pushes. The bot-actor + # guard prevents loop-back from the bench-snapshot push commit + # (see docs.yml). Sibling Nim deps are pinned to release tags + # (not `main`) for deterministic CI; bump in lockstep with + # build.yml/bench.yml's `benchmark` job and lockfreequeues.nimble's + # `requires`. + name: bench-tests (ubuntu-latest) runs-on: ubuntu-latest - # PRs from forks cannot read BENCHER_API_TOKEN. Skip rather than fail - # noisily; maintainer pushes / merges still record the baseline. 
- if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository - permissions: - pull-requests: write - checks: write - timeout-minutes: 30 + if: github.actor != 'github-actions[bot]' + timeout-minutes: 10 + + steps: + - name: Checkout project + uses: actions/checkout@v4 + + - name: Setup Nim + uses: jiro4989/setup-nim-action@v2 + with: + nim-version: 'stable' + + - name: Install build deps (Linux) + run: | + sudo apt-get update -q -y + sudo apt-get -qq install -y clang + + - name: Clone and install sibling Nim deps (nim-debra, nim-typestates) + run: | + set -e + cd .. + git clone --depth 1 --branch main https://github.com/elijahr/nim-debra.git + git clone --depth 1 --branch main https://github.com/elijahr/nim-typestates.git + (cd nim-typestates && nimble install -y) + (cd nim-debra && nimble install -y) + + - name: Vendor unittest2 + run: git clone --depth 1 https://github.com/status-im/nim-unittest2.git deps/unittest2 + + - name: Run bench harness tests + run: nimble benchtests + + bench: + # Track 2 PR 2: matrix over the 5 topology-split binaries. Each + # matrix entry is its own GitHub Actions job with an independent + # `timeout-minutes: 18` budget so a hang in one binary cannot burn + # the entire workflow's clock — the surviving binaries finish, the + # bench-upload job merges what arrived, and the operator gets + # partial Bencher coverage rather than no coverage. Per-binary + # `-d:` overrides are tuned to fit each variant's CI budget; see + # design 2.5 for the override table. + name: ${{ matrix.binary }} (ubuntu-latest) + runs-on: ubuntu-latest + # Two combined gates: + # 1. PRs from forks cannot read BENCHER_API_TOKEN. Skip rather + # than fail noisily; maintainer pushes / merges still record + # the baseline. + # 2. Loop-prevention layer 2 of 2 (design §5.X): the snapshot + # push step (Task 5.5.b) commits as `github-actions[bot]`. 
+ # Even if paths-ignore is ever loosened, the bot-actor guard + # short-circuits before the job grows any cost. + if: | + github.actor != 'github-actions[bot]' && + (github.event_name != 'pull_request' || + github.event.pull_request.head.repo.full_name == github.repository) + strategy: + fail-fast: false + matrix: + binary: + - bench_spsc + - bench_mpsc + - bench_mpmc + - bench_unbounded + - bench_latency + timeout-minutes: 18 steps: - name: Restore cache uses: actions/cache@v4 with: path: ${HOME}/.cache - key: cache-bench-${{ runner.os }} + key: cache-${{ matrix.binary }}-${{ runner.os }} - name: Checkout project uses: actions/checkout@v4 @@ -64,7 +178,9 @@ jobs: - name: Clone and install sibling Nim deps (nim-debra, nim-typestates) # Mirrors build.yml: nim.cfg uses --path:"../nim-debra/src" and # --path:"../nim-typestates/src", and nimble's resolver runs before - # the compiler so we must install matching branches locally. + # the compiler so we must install matching branches locally. Tracks + # `main` until nimble.directory picks up the relevant tags; bump in + # lockstep with build.yml and lockfreequeues.nimble's `requires`. run: | set -e cd .. @@ -76,70 +192,534 @@ jobs: - name: Vendor unittest2 run: git clone --depth 1 https://github.com/status-im/nim-unittest2.git deps/unittest2 - - name: Run adapter unit tests - # Cheap pre-flight: fail fast if the parser itself is broken before - # spending minutes compiling and running the bench binary. - run: python3 benchmarks/test_bmf_adapter.py -v - - - name: Compile bench_throughput (CI-tuned run shape) - # Cloud runs use a tighter wall-clock budget than the local default - # (1M messages × 33 runs). Bounded variants (sipsic, channels) use - # 1M × 5 runs and finish in single-digit seconds. unbounded_mupsic - # is super-linear in message count and is gated separately via - # UnboundedMupsicMessageCount=500000 + UnboundedMupsicRuns=3 to keep - # the variant tractable inside the 30-min job budget. 
Warmup stays - # at 2 to absorb JIT/cache effects. - # - # Counts are 10x the previous values (100k / 50k → 1M / 500k) and - # the bench timer is microsecond/nanosecond-precision (computed in - # ns, printed as `mean: .1f ops/ms`). Together this gives non-zero - # stddev and >1-decimal resolution between runs on the fast CI - # runner; previously a 50k unbounded_mupsic run completed in ~3 ms, - # so multiple samples bucketed into the same integer ms and stddev - # was reported as 0. The new shape costs roughly 30-60s on CI: well - # inside the 30-min job budget while giving the timer enough - # wall-clock spread to produce a stable, comparable signal. + # ---------- Track 3 MVP adapter installs (soft-skip) ---------- + # + # Each adapter has three stages: install -> smoke -> set-env-on-success. + # Failure at any stage flips the binary's compile flags so the slugs + # are simply omitted from the BMF instead of failing the workflow. + # Annotate-skipped steps surface a yellow PR-check warning. + + - name: Install boost (libboost-dev for Boost.LockFree) + # Boost.LockFree is C++ header-only; libboost-dev pulls the + # entire boost include tree on the runner. Skip on bench_mpsc / + # bench_unbounded / bench_latency (no boost adapter wired into + # those binaries; install would just slow them down). + id: install-boost + if: | + env.FORCE_SKIP_BOOST != '1' && + (matrix.binary == 'bench_spsc' || matrix.binary == 'bench_mpmc') + continue-on-error: true + run: | + sudo apt-get install -qq -y libboost-dev + + - name: Smoke boost adapters + id: smoke-boost + if: | + env.FORCE_SKIP_BOOST != '1' && + steps.install-boost.outcome == 'success' && + (matrix.binary == 'bench_spsc' || matrix.binary == 'bench_mpmc') + continue-on-error: true + run: | + set -eu + # Compile and run the smoke binary with both gates so a single + # invocation covers queue + spsc_queue. nim cpp because boost + # headers are C++. 
+ nim cpp -d:release -d:danger --threads:on \ + -d:adapter_boost_lockfree_queue_available \ + -d:adapter_boost_lockfree_spsc_available \ + -o:.tmp/smoke_boost \ + benchmarks/nim/smoke/smoke_boost.nim + ./.tmp/smoke_boost + + - name: Set boost adapter defines (success only) + if: | + env.FORCE_SKIP_BOOST != '1' && + steps.install-boost.outcome == 'success' && + steps.smoke-boost.outcome == 'success' + run: | + { + echo "ADAPTER_BOOST_QUEUE=1" + echo "ADAPTER_BOOST_SPSC=1" + } >> "$GITHUB_ENV" + + - name: Annotate boost skipped + if: | + (matrix.binary == 'bench_spsc' || matrix.binary == 'bench_mpmc') && + (env.FORCE_SKIP_BOOST == '1' || + steps.install-boost.outcome != 'success' || + steps.smoke-boost.outcome != 'success') + run: | + echo "::warning title=Adapter skipped::boost.lockfree install or smoke failed for ${{ matrix.binary }}; bench will omit boost slugs." + + - name: Install loony (nimble) + # Loony is a Nim-only unbounded MPMC queue; nimble pulls it from + # the registry. Only bench_unbounded uses it. + id: install-loony + if: | + env.FORCE_SKIP_LOONY != '1' && + matrix.binary == 'bench_unbounded' + continue-on-error: true run: | + nimble install -y loony + + - name: Smoke loony adapter + id: smoke-loony + if: | + env.FORCE_SKIP_LOONY != '1' && + steps.install-loony.outcome == 'success' && + matrix.binary == 'bench_unbounded' + continue-on-error: true + run: | + # The smoke is the round-trip test in tests/t_bench_adapters.nim. + # Compile-only; runs in <1s. + set -eu nim c -d:release -d:danger --threads:on \ - -d:MessageCount=1000000 \ - -d:DefaultRuns=5 \ - -d:WarmupRuns=2 \ - -d:UnboundedMupsicRuns=3 \ - -d:UnboundedMupsicMessageCount=500000 \ - benchmarks/nim/bench_throughput.nim - - - name: Run bench_throughput - # mupmuc 4P/4C livelock fixed via CAS-retry backoff; ref issue #15. 
- # Step-level timeout: if any single variant hangs, fail fast inside - # the 30-min job budget so Bencher upload steps still run (or are - # visibly skipped) instead of burning the entire budget on one variant. - timeout-minutes: 20 - run: ./.tmp/bench_throughput sipsic mupmuc unbounded_mupsic channels | tee bench_output.txt - - - name: Convert to BMF JSON - run: python3 benchmarks/bmf_adapter.py bench_output.txt bench_results.json + -d:adapter_loony_available \ + -o:.tmp/smoke_loony \ + -r tests/t_bench_adapters.nim + + - name: Set loony adapter define (success only) + if: | + env.FORCE_SKIP_LOONY != '1' && + steps.install-loony.outcome == 'success' && + steps.smoke-loony.outcome == 'success' + run: echo "ADAPTER_LOONY=1" >> "$GITHUB_ENV" + + - name: Annotate loony skipped + if: | + matrix.binary == 'bench_unbounded' && + (env.FORCE_SKIP_LOONY == '1' || + steps.install-loony.outcome != 'success' || + steps.smoke-loony.outcome != 'success') + run: | + echo "::warning title=Adapter skipped::loony install or smoke failed; bench will omit loony slugs." + + # ---------- Track 4 PR-4 adapter installs (soft-skip) ---------- + # + # MoodyCamel: vendored single-header at + # benchmarks/vendor/concurrentqueue/concurrentqueue.h. The + # "install" step is therefore a `test -f` rather than a fetch, so + # the bench is reproducible without network egress. Only + # bench_unbounded wires the moodycamel adapter (mpmc_unbounded). 
+ - name: Install MoodyCamel (vendored header presence check) + id: install-moodycamel + if: | + env.FORCE_SKIP_MOODYCAMEL != '1' && + matrix.binary == 'bench_unbounded' + continue-on-error: true + run: | + test -f benchmarks/vendor/concurrentqueue/concurrentqueue.h + test -f benchmarks/vendor/concurrentqueue/moodycamel_wrapper.cpp + + - name: Smoke MoodyCamel adapter + id: smoke-moodycamel + if: | + env.FORCE_SKIP_MOODYCAMEL != '1' && + steps.install-moodycamel.outcome == 'success' && + matrix.binary == 'bench_unbounded' + continue-on-error: true + run: | + set -eu + # nim cpp because concurrentqueue.h is C++ and the wrapper + # is C++ source compiled in via `{.compile: ...}`. + nim cpp -d:release -d:danger --threads:on \ + -d:adapter_moodycamel_available \ + -o:.tmp/smoke_moodycamel \ + benchmarks/nim/smoke/smoke_moodycamel.nim + ./.tmp/smoke_moodycamel + + - name: Set MoodyCamel adapter define (success only) + if: | + env.FORCE_SKIP_MOODYCAMEL != '1' && + steps.install-moodycamel.outcome == 'success' && + steps.smoke-moodycamel.outcome == 'success' + run: echo "ADAPTER_MOODYCAMEL=1" >> "$GITHUB_ENV" + + - name: Annotate MoodyCamel skipped + if: | + matrix.binary == 'bench_unbounded' && + (env.FORCE_SKIP_MOODYCAMEL == '1' || + steps.install-moodycamel.outcome != 'success' || + steps.smoke-moodycamel.outcome != 'success') + run: | + echo "::warning title=Adapter skipped::moodycamel install or smoke failed; bench will omit moodycamel slugs." + + # threading.Chan (nimble package): only used by bench_mpmc. 
+ - name: Install threading (nimble) + id: install-threading-channels + if: | + env.FORCE_SKIP_THREADING_CHANNELS != '1' && + matrix.binary == 'bench_mpmc' + continue-on-error: true + run: | + nimble install -y threading + + - name: Smoke threading.Chan adapter + id: smoke-threading-channels + if: | + env.FORCE_SKIP_THREADING_CHANNELS != '1' && + steps.install-threading-channels.outcome == 'success' && + matrix.binary == 'bench_mpmc' + continue-on-error: true + run: | + set -eu + nim c -d:release -d:danger --threads:on \ + -d:adapter_threading_channels_available \ + -o:.tmp/smoke_threading_channels \ + benchmarks/nim/smoke/smoke_threading_channels.nim + ./.tmp/smoke_threading_channels + + - name: Set threading.Chan adapter define (success only) + if: | + env.FORCE_SKIP_THREADING_CHANNELS != '1' && + steps.install-threading-channels.outcome == 'success' && + steps.smoke-threading-channels.outcome == 'success' + run: echo "ADAPTER_THREADING_CHANNELS=1" >> "$GITHUB_ENV" + + - name: Annotate threading.Chan skipped + if: | + matrix.binary == 'bench_mpmc' && + (env.FORCE_SKIP_THREADING_CHANNELS == '1' || + steps.install-threading-channels.outcome != 'success' || + steps.smoke-threading-channels.outcome != 'success') + run: | + echo "::warning title=Adapter skipped::threading.Chan install or smoke failed; bench will omit threading_channels slugs." + + # system.Channel (Nim stdlib, no install): only used by bench_mpsc. + # The "install" step is a no-op success placeholder so the + # FORCE_SKIP_NIM_CHANNEL flag has the same shape as the other + # adapters; the real check happens in the smoke step. + - name: Smoke system.Channel adapter + id: smoke-nim-channel + if: | + env.FORCE_SKIP_NIM_CHANNEL != '1' && + matrix.binary == 'bench_mpsc' + continue-on-error: true + run: | + set -eu + # The smoke is the round-trip test in tests/t_bench_adapters.nim + # under the nim_channel gate; the default Nim install ships + # system.Channel so no install step is required. 
+ nim c -d:release -d:danger --threads:on \ + -d:adapter_nim_channel_available \ + -o:.tmp/smoke_nim_channel \ + -r tests/t_bench_adapters.nim + + - name: Set system.Channel adapter define (success only) + if: | + env.FORCE_SKIP_NIM_CHANNEL != '1' && + steps.smoke-nim-channel.outcome == 'success' + run: echo "ADAPTER_NIM_CHANNEL=1" >> "$GITHUB_ENV" + + - name: Annotate system.Channel skipped + if: | + matrix.binary == 'bench_mpsc' && + (env.FORCE_SKIP_NIM_CHANNEL == '1' || + steps.smoke-nim-channel.outcome != 'success') + run: | + echo "::warning title=Adapter skipped::system.Channel smoke failed; bench will omit nim_channel slugs." + + - name: Build adapter define flags + # Aggregates the per-adapter env flags into a single shell + # variable consumed by the compile step. Empty when no adapter + # is enabled; properly space-separated otherwise. + id: adapter-flags + run: | + set -eu + flags="" + mode="c" + if [ "${ADAPTER_BOOST_QUEUE:-}" = "1" ]; then + flags="$flags -d:adapter_boost_lockfree_queue_available" + mode="cpp" + fi + if [ "${ADAPTER_BOOST_SPSC:-}" = "1" ]; then + flags="$flags -d:adapter_boost_lockfree_spsc_available" + mode="cpp" + fi + if [ "${ADAPTER_LOONY:-}" = "1" ]; then + flags="$flags -d:adapter_loony_available" + fi + if [ "${ADAPTER_MOODYCAMEL:-}" = "1" ]; then + flags="$flags -d:adapter_moodycamel_available" + mode="cpp" + fi + if [ "${ADAPTER_THREADING_CHANNELS:-}" = "1" ]; then + flags="$flags -d:adapter_threading_channels_available" + fi + if [ "${ADAPTER_NIM_CHANNEL:-}" = "1" ]; then + flags="$flags -d:adapter_nim_channel_available" + fi + { + echo "flags=$flags" + echo "mode=$mode" + } >> "$GITHUB_OUTPUT" + echo "Adapter flags: $flags" + echo "Compile mode: $mode" + + - name: Compile ${{ matrix.binary }} (CI-tuned run shape) + # Per-binary -d: overrides mirror the design 2.5 CI override + # column. Bounded throughput binaries: 1M messages × 5 runs × + # 2 warmup; cheap, single-digit seconds total. 
bench_unbounded + # uses a tighter shape — sipsic 200K×3, sipmuc/mupsic 100K×2, + # mupmuc 50K×2 — because the 16-shape total (1+3+3+9) plus + # oversubscribed shapes like 1p4c on 4-vCPU runners exhausts + # the budget at higher N. The mupmuc 9-shape grid dominates + # the wall clock so it gets the deepest cut. bench_latency + # keeps PR 1's tighter shape (50K × 11 × 2) because each + # ping-pong RTT records exactly one sample, so 1M messages + # would never fit. + # + # ${{ steps.adapter-flags.outputs.flags }} is the space-separated + # set of -d:adapter_*_available defines that survived install + + # smoke. ${{ steps.adapter-flags.outputs.mode }} is 'cpp' iff at + # least one C++-only adapter (boost) is enabled, else 'c'. + env: + NIM_MODE: ${{ steps.adapter-flags.outputs.mode }} + ADAPTER_FLAGS: ${{ steps.adapter-flags.outputs.flags }} + run: | + set -eu + # shellcheck disable=SC2086 # ADAPTER_FLAGS is intentionally word-split. + case "${{ matrix.binary }}" in + bench_spsc) + nim "$NIM_MODE" -d:release -d:danger --threads:on \ + -d:BenchSpscMessageCount=1000000 \ + -d:BenchSpscRuns=5 \ + -d:BenchSpscWarmup=2 \ + $ADAPTER_FLAGS \ + benchmarks/nim/bench_spsc.nim + ;; + bench_mpsc) + # PR 4 wires nim_channel (Nim stdlib, plain `nim c`) into + # bench_mpsc. No C++-only adapters live here, so ignore + # NIM_MODE and stay on `nim c`; ADAPTER_FLAGS may carry + # -d:adapter_nim_channel_available. + nim c -d:release -d:danger --threads:on \ + -d:BenchMpscMessageCount=1000000 \ + -d:BenchMpscRuns=5 \ + -d:BenchMpscWarmup=2 \ + $ADAPTER_FLAGS \ + benchmarks/nim/bench_mpsc.nim + ;; + bench_mpmc) + nim "$NIM_MODE" -d:release -d:danger --threads:on \ + -d:BenchMpmcMessageCount=1000000 \ + -d:BenchMpmcRuns=5 \ + -d:BenchMpmcWarmup=2 \ + $ADAPTER_FLAGS \ + benchmarks/nim/bench_mpmc.nim + ;; + bench_unbounded) + # bench_unbounded supports loony (Nim, plain `nim c`) and + # PR 4's MoodyCamel (C++ vendored header, requires `nim + # cpp`). 
NIM_MODE is "cpp" iff ADAPTER_MOODYCAMEL was + # set; otherwise it stays "c". loony is mode-neutral. + nim "$NIM_MODE" -d:release -d:danger --threads:on \ + -d:UnboundedSipsicMessageCount=200000 \ + -d:UnboundedSipsicRuns=3 \ + -d:UnboundedSipmucMessageCount=100000 \ + -d:UnboundedSipmucRuns=2 \ + -d:UnboundedMupsicMessageCount=100000 \ + -d:UnboundedMupsicRuns=2 \ + -d:UnboundedMupmucMessageCount=50000 \ + -d:UnboundedMupmucRuns=2 \ + -d:BenchUnboundedWarmup=1 \ + -d:BenchSkipOversubscribed \ + $ADAPTER_FLAGS \ + benchmarks/nim/bench_unbounded.nim + ;; + bench_latency) + nim c -d:release -d:danger --threads:on \ + -d:BenchLatencyMessageCount=50000 \ + -d:BenchLatencyRuns=11 \ + -d:BenchLatencyWarmupRuns=2 \ + benchmarks/nim/bench_latency.nim + ;; + *) + echo "::error::unknown binary: ${{ matrix.binary }}" + exit 1 + ;; + esac + + - name: Run ${{ matrix.binary }} + # Step-level timeout: a runtime hang in one variant (lost + # message, missed wakeup, livelock regression) would otherwise + # spin until the job-level 18-min budget expired. Fail fast at + # 10 min so the upload step still records the partial result. + timeout-minutes: 10 + run: | + ./.tmp/${{ matrix.binary }} --bmf-out=${{ matrix.binary }}.json \ + | tee ${{ matrix.binary }}_output.txt + + - name: Upload ${{ matrix.binary }} BMF artifact + # `if: always()` so a step-level timeout in `Run ${{ matrix.binary }}` + # still uploads any partial JSON the binary managed to flush; + # `if-no-files-found: ignore` so a binary that crashed before + # writing the file does not turn the whole job red — the + # bench-upload job merges whatever artifacts arrive and the + # operator gets partial coverage rather than zero coverage. + if: always() + uses: actions/upload-artifact@v4 + with: + name: bench-${{ matrix.binary }}-bmf + path: ${{ matrix.binary }}.json + if-no-files-found: ignore + + bench-upload: + # Merges per-binary BMF artifacts into a single JSON before a single + # `bencher run` invocation. 
Bencher creates a separate Report per + # invocation; multiple uploads would NOT co-locate measures on a + # single per-slug history (design 1 / Track 1 Task 1.4). + name: Merge BMF + Bencher upload (ubuntu-latest) + runs-on: ubuntu-latest + # Three combined gates: + # 1. `always()` — a single matrix-leg failure (e.g. one binary + # timed out) must not skip the merge+upload; the operator + # still gets partial Bencher coverage from the surviving + # binaries (design's partial-coverage tolerance). + # 2. Bot-actor guard (loop-prevention layer 3 of 3): the + # snapshot push step commits as `github-actions[bot]`. Skip + # the upload when the actor is the bot so a snapshot push + # cannot trigger another bench run. + # 3. Fork-PR token guard: PRs from forks cannot read + # BENCHER_API_TOKEN; skip rather than fail noisily. + if: | + always() && + github.actor != 'github-actions[bot]' && + (github.event_name != 'pull_request' || + github.event.pull_request.head.repo.full_name == github.repository) + needs: [bench] + permissions: + # `contents: write` covers both the read needed by + # actions/checkout and the write needed by the snapshot-push step + # (Task 5.5.b). The push only fires on `push` to `refs/heads/devel`; + # PR runs and other branches keep contents:write unused. + contents: write + actions: read + pull-requests: write + checks: write + timeout-minutes: 10 + + steps: + - name: Checkout project + uses: actions/checkout@v4 + with: + # Snapshot-push needs a writable working tree on `devel` so + # `git commit` + `git push origin devel` succeeds. Detached + # HEAD (the default for push events) cannot push back. + ref: ${{ github.event_name == 'push' && github.ref_name || '' }} + # Default GITHUB_TOKEN identity (`github-actions[bot]`) is the + # token the actor-guard short-circuits on (Task 5.5.a), so + # the push closes the loop-prevention contract. 
+ token: ${{ secrets.GITHUB_TOKEN }} + + - name: Download BMF artifacts + # Pattern matches every per-binary artifact uploaded by the + # bench matrix (`bench-bench_spsc-bmf`, `bench-bench_mpsc-bmf`, + # `bench-bench_mpmc-bmf`, `bench-bench_unbounded-bmf`, + # `bench-bench_latency-bmf`). Each artifact lands in its own + # subdir under `bmf-inputs/` (no `merge-multiple`) so any future + # filename overlap between binaries cannot silently overwrite. + uses: actions/download-artifact@v4 + with: + pattern: bench-*-bmf + path: ./bmf-inputs/ + + - name: List BMF inputs (debug) + run: find bmf-inputs/ -type f -name '*.json' -print + + - name: Validate BMF inputs (drop unparseable fragments) + # `bench-upload` runs with `if: always()`, so a matrix leg that + # crashed mid-write can leave a truncated JSON fragment in its + # uploaded artifact. `merge_bmf.py` exits 1 on the first malformed + # input, which would block uploading results from the surviving + # legs. Pre-validate each fragment with `json.load` and remove + # any that fail to parse, emitting a `::warning::` per dropped + # file so the operator can see which leg produced bad output. + run: | + while IFS= read -r -d '' f; do + if ! python3 -c 'import json,sys; json.load(open(sys.argv[1],"r",encoding="utf-8"))' "$f" 2>/dev/null; then + echo "::warning::Dropping unparseable BMF fragment: $f" + rm -f "$f" + fi + done < <(find bmf-inputs/ -type f -name '*.json' -print0) + + - name: Verify BMF inputs present + # If every matrix leg failed before producing JSON (or every + # fragment was dropped as unparseable above) we have nothing to + # merge. Fail loudly here so the workflow surfaces a clear "no + # benchmarks ran" signal instead of crashing inside merge_bmf.py + # with an empty argv. + run: | + count=$(find bmf-inputs/ -type f -name '*.json' | wc -l) + if [ "$count" -eq 0 ]; then + echo "::error::No valid BMF JSON fragments found in bmf-inputs/. 
All bench matrix legs must have failed (or produced unparseable output) before writing usable JSON." >&2 + exit 1 + fi + echo "Found $count valid BMF JSON fragment(s) to merge." + + - name: Merge BMF JSON + # `merge_bmf.py` unions per-slug measure dicts across N inputs + # and exits 1 on (slug, measure) collisions. Pure stdlib (no + # third-party deps), so no Python install step is needed beyond + # the ubuntu-latest default. + # + # Inputs live one subdir deep under `bmf-inputs//` + # (no `merge-multiple`); enumerate all `*.json` recursively and + # NUL-delimit so filenames with whitespace cannot break argv + # splitting in the future. + run: | + find bmf-inputs/ -type f -name '*.json' -print0 | \ + xargs -0 python3 benchmarks/merge_bmf.py merged.json + + - name: Verify deletion-safety against pre-split fixture + # Track 2 PR 2 Task 2.7 deletion-safety guard. Asserts the + # union of the topology-split BMFs is a strict superset of the + # pre-split slug fixture captured from the legacy + # bench_throughput before the split. Exit 1 with missing slugs + # listed on stderr fails this step (and the workflow), making + # any silent slug regression visible in the PR check summary. + run: | + python3 benchmarks/scripts/superset_check.py \ + tests/fixtures/pre-split-slugs.json \ + merged.json - name: Show BMF JSON (debug) - run: cat bench_results.json + run: cat merged.json - name: Install Bencher CLI uses: bencherdev/bencher@main - - name: Bencher token preflight - # Always runs. Surfaces a yellow PR-check warning when the upload - # secret is unset so it is obvious from the check summary why the - # downstream Bencher steps were skipped, rather than the bench - # silently appearing to "do nothing" with the data. + - name: Bencher upload preflight + # Always runs. 
Surfaces a yellow PR-check warning whenever the + # downstream `bencher run` would be skipped, so the reason is + # obvious from the check summary rather than the bench silently + # appearing to "do nothing" with the data. Skip reasons: + # 1. BENCHER_API_TOKEN secret unset (any event) + # 2. PR event without the `bench` label (cost gate — Bencher + # ingest charges per metric, so PR comparisons are opt-in + # via label rather than firing on every cascade-rebase push) + # Push to main/devel and workflow_dispatch never need the label. run: | if [ -z "$BENCHER_API_TOKEN" ]; then - echo "::warning title=Bencher upload skipped::BENCHER_API_TOKEN secret is not set on this repo. The bench ran successfully and produced bench_results.json (visible in the 'Show BMF JSON (debug)' step), but no data is being uploaded to Bencher.dev. Set up the project at bencher.dev with slug 'lockfreequeues' and add BENCHER_API_TOKEN as a repo secret to enable upload." - else - echo "Bencher token present; proceeding with upload." + echo "::warning title=Bencher upload skipped::BENCHER_API_TOKEN secret is not set on this repo. The bench ran successfully and produced merged.json (visible in the 'Show BMF JSON (debug)' step), but no data is being uploaded to Bencher.dev. Set up the project at bencher.dev with slug 'lockfreequeues' and add BENCHER_API_TOKEN as a repo secret to enable upload." + exit 0 + fi + if [ "${{ github.event_name }}" = "pull_request" ] \ + && ! ${{ contains(github.event.pull_request.labels.*.name, 'bench') }}; then + echo "::warning title=Bencher upload skipped::This is a pull_request event but the PR does not carry the 'bench' label. PR-time Bencher uploads are opt-in to control metric-ingest costs. Add the 'bench' label to the PR and re-run this workflow if you want a Bencher comparison report on this revision." + exit 0 fi + echo "Bencher upload preconditions met; proceeding." # PR runs: compare against base branch and post a comment. 
+ # Gated on the `bench` PR label so cascade-rebases of stacked PRs + # do not burn Bencher quota; add the label deliberately when you + # want a comparison report. - name: Track PR benchmarks with Bencher - if: github.event_name == 'pull_request' && env.BENCHER_API_TOKEN != '' + if: | + github.event_name == 'pull_request' + && env.BENCHER_API_TOKEN != '' + && contains(github.event.pull_request.labels.*.name, 'bench') run: | bencher run \ --project lockfreequeues \ @@ -151,11 +731,33 @@ jobs: --start-point-reset \ --testbed ubuntu-latest \ --adapter json \ - --file bench_results.json \ + --file merged.json \ --github-actions '${{ secrets.GITHUB_TOKEN }}' \ --err # Push to main/devel: record the baseline for that branch. + # + # Track 6 Task 6.3: per-measure thresholds. We gate on BOTH + # - latency_p99_ns (upper boundary: regression = latency increase) + # - throughput_ops_ms (lower boundary: regression = throughput drop) + # in a single `bencher run` invocation by repeating the + # `--threshold-measure / --threshold-test / --threshold-max-sample-size + # / --threshold-{upper,lower}-boundary` block per measure (Bencher CLI + # convention; design §3 PR 6). End with `--thresholds-reset` so only + # the explicitly-listed thresholds remain active going forward. + # + # NOTE on activation: per Track 6 Task 6.4, the latency threshold + # requires ≥ 10 prior runs accumulated in Bencher to calibrate the + # t-test baseline. Until that soak completes (post-merge), the + # threshold is effectively dormant — Bencher will not emit alerts on + # measures that have insufficient sample history. The configuration + # is in place so activation is purely a function of run count, not + # workflow edits. + # + # NOTE on previous measure name: the prior config used + # `--threshold-measure throughput`; the actual measure key emitted by + # bench_{spsc,mpsc,mpmc,unbounded}.nim is `throughput_ops_ms`, so the + # earlier threshold never matched any measure. Track 6 corrects this. 
- name: Track base branch benchmarks with Bencher if: github.event_name == 'push' && env.BENCHER_API_TOKEN != '' run: | @@ -164,13 +766,17 @@ jobs: --token '${{ secrets.BENCHER_API_TOKEN }}' \ --branch "${GITHUB_REF##*/}" \ --testbed ubuntu-latest \ - --threshold-measure throughput \ + --threshold-measure latency_p99_ns \ + --threshold-test t_test \ + --threshold-max-sample-size 64 \ + --threshold-upper-boundary 0.99 \ + --threshold-measure throughput_ops_ms \ --threshold-test t_test \ --threshold-max-sample-size 64 \ --threshold-lower-boundary 0.99 \ --thresholds-reset \ --adapter json \ - --file bench_results.json \ + --file merged.json \ --github-actions '${{ secrets.GITHUB_TOKEN }}' \ --err @@ -184,4 +790,69 @@ jobs: --branch "${GITHUB_REF##*/}" \ --testbed ubuntu-latest \ --adapter json \ - --file bench_results.json + --file merged.json + + # Track 5 PR 5 Task 5.5.b: snapshot the merged BMF into the docs + # assets tree so the chart on `docs/benchmarks.md` always points + # at the most recent devel run. Loop-prevention against bench.yml + # itself relies on two layers (no `[skip ci]` marker — see below): + # 1. `paths-ignore: docs/assets/bench-results/**` on bench.yml + # — the snapshot push only changes files under that prefix, + # so bench.yml does not re-evaluate at all. + # 2. `if: github.actor != 'github-actions[bot]'` on the parent + # bench / bench-upload jobs — defense-in-depth in case the + # paths-ignore is ever loosened. + # + # We deliberately do NOT include `[skip ci]` in the commit + # message: GitHub's native skip applies to ALL workflows on the + # commit, and docs.yml has no paths-ignore. Suppressing docs.yml + # would leave the freshly snapshotted JSON un-deployed on Pages, + # so the chart on the live docs site would lag the bench run + # that produced the snapshot. The two layers above already + # prevent bench.yml self-trigger; docs.yml does not trigger any + # bench work, so allowing it to run is the desired behavior. 
+ # Only fires on `push` to `refs/heads/devel`; tag pushes, + # workflow_dispatch, PR runs, and main pushes do nothing here. + - name: Snapshot to docs assets (devel push only) + if: github.event_name == 'push' && github.ref == 'refs/heads/devel' + run: | + set -euo pipefail + git config user.name 'github-actions[bot]' + git config user.email 'github-actions[bot]@users.noreply.github.com' + mkdir -p docs/assets/bench-results + SHA=$(git rev-parse --short HEAD) + cp merged.json "docs/assets/bench-results/${SHA}.json" + cp merged.json docs/assets/bench-results/latest.json + git add docs/assets/bench-results/ + # Nothing to commit if merged.json is byte-identical to the + # currently-checked-in latest.json (rare but possible on a + # docs-only retrigger). `git diff --cached --quiet` exits 0 + # when there is no staged diff; in that case skip cleanly. + if git diff --cached --quiet; then + echo "::notice::Snapshot identical to current checked-in latest.json; skipping commit." + exit 0 + fi + git commit -m "chore(bench): refresh snapshot" + # Long bench runs leave a wide window for `devel` to advance + # before we push (a docs-only follow-up commit, an unrelated + # merge, etc.); a plain `git push` would then be rejected + # non-fast-forward and fail the entire bench-upload job. + # Retry: fetch+rebase+push up to 3 times. bench.yml's + # paths-ignore on docs/assets/bench-results/** prevents this + # commit from retriggering bench.yml, so the retry loop is + # safe regardless of how many fast-forwards happen. + for attempt in 1 2 3; do + if git push origin HEAD:devel; then + exit 0 + fi + echo "::notice::push attempt $attempt rejected; rebasing onto origin/devel and retrying." + git fetch origin devel + # Rebase preserves our snapshot commit on top of any new + # commits that landed since checkout. Conflict on the + # snapshot files themselves is impossible (no other workflow + # writes to docs/assets/bench-results/), so a clean rebase + # is the expected outcome. 
+ git rebase origin/devel + done + echo "::error::Snapshot push failed after 3 retries; aborting." >&2 + exit 1 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bcab4612..88ae0c60 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -82,7 +82,7 @@ jobs: # nim.cfg uses --path:"../nim-debra/src" and --path:"../nim-typestates/src" # for compiler-side resolution. Nimble's dep resolver runs BEFORE # the compiler though, and `nimble test` will refuse to launch if - # `requires "debra >= 0.5.0"` is unsatisfiable from nimble's + # `requires "debra >= 0.7.0"` is unsatisfiable from nimble's # package store. So fetch matching branches and `nimble install` # them locally to satisfy the resolver. Once nimble.directory has # picked up the new tags, this whole step can be replaced with a @@ -123,8 +123,9 @@ jobs: } # Skip `nimble develop -y` — nim.cfg's --noNimblePath + # explicit --path: directives mean nimble's package resolver is - # not used. Going through develop would only add a dep-resolution - # round trip that fails until debra 0.3.0 publishes. + # not used. The sibling-clone step above is what populates the + # local nimble store; going through `nimble develop` would only + # add a redundant dep-resolution round trip. echo "::group::Run test suite (includes arc, orc, refc, lock-free enforcement)" nimble_run nimble test echo "::endgroup::" diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 86fb858e..4e582127 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -137,3 +137,37 @@ jobs: if: github.event_name == 'workflow_dispatch' || github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master' || github.ref == 'refs/heads/devel' run: | mike deploy --push dev + + # Track 5 PR 5 Task 5.6 (design §5.Y): verify the chart's data + # endpoint is reachable AND parses as JSON after every dev + # deploy. 
The chart fails silently on a 404 (it renders an + # inline error rather than crashing the page) so without this + # check a broken asset path would only surface via user reports. + # `sleep 30` accommodates GitHub Pages CDN propagation; raise to + # 60 if this step ever flakes. + - name: Verify mike asset path + # Derive the Pages URL from the repository context so a fork or + # repo rename does not falsely fail this verification step. The + # default GitHub Pages URL convention is + # `https://.github.io//`. If the project ever moves + # to a custom domain the URL becomes wrong again, but at that + # point we'd update the workflow to read site_url from + # mkdocs.yml; for now the convention is stable. + if: | + github.ref == 'refs/heads/main' || + github.ref == 'refs/heads/master' || + github.ref == 'refs/heads/devel' + env: + OWNER: ${{ github.repository_owner }} + REPO: ${{ github.event.repository.name }} + run: | + set -euo pipefail + URL="https://${OWNER}.github.io/${REPO}/dev/assets/bench-results/latest.json" + sleep 30 + HTTP_CODE=$(curl -sS -o /tmp/latest.json -w "%{http_code}" "$URL") + if [ "$HTTP_CODE" != "200" ]; then + echo "::error::mike asset endpoint returned HTTP $HTTP_CODE for $URL" + exit 1 + fi + python3 -c "import json; json.load(open('/tmp/latest.json'))" + echo "::notice::mike asset endpoint OK: $URL" diff --git a/.gitignore b/.gitignore index ee6f9e13..3329113d 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,9 @@ nimble.paths # Internal planning docs docs/plans/ deps/ +# Worktree-local symlink to the main repo's deps/ folder, created by +# the worktree setup so nim.cfg's `--path:"deps/unittest2"` resolves. 
+deps # Embedded repositories (use as dependencies, not submodules) nim-typestates/ @@ -32,8 +35,10 @@ nim-unittest2/ logs/ test_typed_introspection* benchmarks/nim/bench_latency -benchmarks/nim/bench_main -benchmarks/nim/bench_throughput +benchmarks/nim/bench_spsc +benchmarks/nim/bench_mpsc +benchmarks/nim/bench_mpmc +benchmarks/nim/bench_unbounded # Compiled benchmark test binaries (extensionless executables) benchmarks/nim/tests/t_* @@ -42,3 +47,7 @@ benchmarks/nim/tests/t_* # Compiled stress-test binaries (extensionless executables) stress-tests/[a-z_]* !stress-tests/*.nim + +# Rust crate build artifacts (FFI bench bridges) +benchmarks/rust/*/target/ +benchmarks/rust/*/Cargo.lock diff --git a/CHANGELOG.md b/CHANGELOG.md index 13012dbf..2677f155 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,17 +11,406 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `CASAttempt` typestate restructured into a proper typestate union. `CASPending` now transitions to `CASSucceeded | CASFailed` (aliased as `CASResult`) via `executeCAS`, replacing the previous single-state design with `assumeSuccess` / `assumeFailure` escape hatches. The `assumeSuccess` and `assumeFailure` procs have been removed. Callers that drove `CASAttempt` outside the bundled MPMC machinery must migrate to the union return form. These helpers were only consumed by `tests/t_cas.nim`; the bundled MPMC machinery calls `compareExchangeWeak` directly and was unaffected. No public lock-free queue API is affected. +### Added + +- Latency p99 + throughput regression gating in Bencher (PR 6, Track 6). + `bench.yml`'s base-branch tracking step now configures per-measure + thresholds in a single `bencher run` invocation: `latency_p99_ns` + with `--threshold-upper-boundary 0.99` (regression = latency + increase) and `throughput_ops_ms` with `--threshold-lower-boundary + 0.99` (regression = throughput drop). 
Both use `--threshold-test + t_test --threshold-max-sample-size 64`, terminated by + `--thresholds-reset` so only the explicitly-listed thresholds + remain active. Threshold activation requires ≥ 10 prior runs + accumulated in Bencher to calibrate the t-test baseline (Task 6.4 + stability soak gate). Also corrects a prior measure-name mismatch: + the earlier `--threshold-measure throughput` never matched any + emitted measure (the actual key is `throughput_ops_ms`), so the + previous throughput threshold was a no-op. +- `latency_p999_ns` and `latency_max_ns` measures emitted by + `bench_latency.nim` (PR 6, Track 6). Each bounded variant slug + (`lockfreequeues_{sipsic,sipmuc,mupsic,mupmuc}//1p1c`) + now carries the full p50 / p95 / p99 / p999 / max latency tuple in + the merged BMF, available for the Bencher dashboard and downstream + comparison charts. `t_bench_latency.nim` extended to assert all + four extra measures appear on every bounded variant in the smoke + shape. +- `HistogramTopK` raised from 1000 to 5000 (PR 6, Task 6.2). + `runLatencyHarness` builds a fresh Histogram per run and averages + per-run percentiles (design 2.5) — each histogram only sees + `BenchLatencyMessageCount` samples, NOT `messageCount × runCount`. + At the default 100K samples per run, K=1000 was already adequate + (TopK + Reservoir already captured every sample exactly). The bump + to K=5000 is anticipatory: an operator who overrides + `BenchLatencyMessageCount` upward (e.g. ~5M for a tail-stress + configuration) needs ~5000 in the exact top-K stratum to keep p999 + (tail rank = MessageCount × 0.001) outside the rescaled-reservoir + stratum. Memory cost: 5000 × 8B = 40KB additional per histogram, + negligible vs the 99K-sample reservoir. New `t_bench_common.nim` + test stress-checks the design choice by asserting p999 within 5% + of sort fallback on a single 3.3M log-normal stream. +- Interactive uPlot throughput chart on the docs site (PR 5, Track 5). 
+ `docs/benchmarks.md` embeds a `
` container plus
+  a vendored `uPlot 1.6.27` IIFE bundle and a vanilla-JS wiring module
+  (`docs/assets/bench-charts.js` + `docs/assets/bench-charts.css`).
+  The chart fetches the merged BMF snapshot from the relative URL
+  `./assets/bench-results/latest.json` so the same page works under
+  the `/dev/`, `/latest/`, and `/v*/` mike aliases without rewrite.
+  Library-toggle legend hides/shows series; log-scale Y axis toggle
+  switches between linear and log; hover tooltips show mean ± stddev
+  when `lower_value` / `upper_value` are present in the underlying
+  measure (throughput). Soft-skipped (library, shape) cells render as
+  gaps, not zeros. Graceful fallbacks render an inline message on
+  fetch errors, missing uPlot global, or empty BMF.
+- BMF snapshot publishing pipeline (PR 5, Track 5). New step in
+  `bench.yml`'s `bench-upload` job runs only on `push` to
+  `refs/heads/devel`, copies `merged.json` to
+  `docs/assets/bench-results/<short-sha>.json` AND
+  `docs/assets/bench-results/latest.json`, and pushes the snapshot
+  back to `devel` as `github-actions[bot]`. The commit message
+  deliberately carries no `[skip ci]` marker: GitHub's native skip
+  applies to all workflows on the commit, which would suppress
+  `docs.yml` and leave the fresh snapshot un-deployed on Pages.
+  Two-layer loop-prevention per design §5.X: (1) `paths-ignore`
+  extension to `docs/assets/bench-results/**` on both `pull_request`
+  and `push` triggers (primary), and (2) bot-actor guard on the
+  `bench` and `bench-upload` jobs (secondary).
+- Devel-triggered docs deploy (PR 5, Track 5). `docs.yml` now triggers
+  on push to `devel` in addition to `main` / `master`, and the
+  "Deploy docs (dev)" step's `if:` clause includes `devel`. A new
+  post-deploy "Verify mike asset path" step (design §5.Y) curls
+  the published BMF snapshot URL, asserts HTTP 200, and asserts the
+  body parses as JSON; the chart's silent-on-404 behaviour would
+  otherwise hide a broken asset path.
+- `THIRD_PARTY_LICENSES.md` records the uPlot vendoring (1.6.27, MIT, + vendored at `docs/assets/uplot-1.6.27.iife.min.js`) with a precise + upgrade procedure including the jsdelivr URL and SHA-256 + verification path. `.gitattributes` gains + `docs/assets/uplot-*.js linguist-vendored=true linguist-generated=true` + and `docs/assets/bench-results/*.json linguist-generated=true`. +- New `benchmarks/tests/test_bench_charts_contract.py` (9 tests) + guards the BMF -> chart contract: slug grammar + `//

pc`, measure regex + `^[a-z][a-z0-9_]*$`, finite numeric values, throughput-measure + presence, and the existence of the three checked-in chart assets. + Mirrors the JS `parseSlug` logic in Python so drift is caught at + CI time rather than in production. +- `docs/benchmarks.md` registered in `mkdocs.yml`'s nav (previously + unreachable from the docs landing page) and the four-row §4.1 + fairness caveats embedded verbatim immediately below the chart so + readers see the methodology footnotes within one viewport + regardless of which library combination they toggle. +- `benchmarks/README.md` "Updating the README summary" subsection + codifies the new hand-curation procedure for the README BENCHMARKS + markers (which shapes to read, where to read them, when to commit). + +### Changed + +- `README.md` BENCHMARKS markers now hold a hand-curated four-row + summary table (Sipsic / Sipmuc / Mupsic / Mupmuc bounded at one + representative shape each) plus a link line to the live chart page + at `https://elijahr.github.io/lockfreequeues/latest/benchmarks/`, + per design §4.4. Initial cells contain placeholders; the release PR + fills them in. The chart page absorbs run-to-run noise; the README + intentionally captures only the most recent release's headline + numbers. + +### Removed + +- `benchmarks/render_readme.nim` and its test + `tests/t_render_readme.nim`. The auto-rendered README path is + replaced by hand curation (above). Pre-deletion release-tag check + (per impl plan 5.8): `v3.2.0` and `v4.0.0` each ship the renderer + in their tagged tree; deleting on devel does not mutate those + tags. No CI workflow, nimble task, or test runner referenced the + renderer. + +- Comparison expansion (PR 4, Track 4): four new third-party adapters + reach the comparison set. `moodycamel_adapter.nim` wraps + `moodycamel::ConcurrentQueue` (BSD-2-Clause / Boost dual, + `mpmc_unbounded`) via a thin `extern "C"` shim isolating Nim from + upstream's template machinery. 
`threading_channels_adapter.nim` + wraps the nimble `threading` package's `Chan[T]` (MIT, `mpmc` + bounded) using non-blocking `trySend` / `tryRecv`. + `nim_channel_adapter.nim` wraps Nim's stdlib `system.Channel[T]` + (MIT, `mpsc` bounded) with blocking-on-full producer semantics + (apples-to-oranges fairness caveat documented inline + asterisked + in the bench README). All three are gated behind + `-d:adapter__available` defines; absent gates produce + no symbol references and the production builds are unchanged. +- Vendored MoodyCamel `concurrentqueue` at upstream commit + `d655418bb644b7f85159d94c591d7d983949fb81` under + `benchmarks/vendor/concurrentqueue/`: `concurrentqueue.h` + upstream + `LICENSE.md` + a project-authored `README.md` documenting the + pinned SHA and upgrade procedure. The + `moodycamel_wrapper.cpp` shim exposes `mc_init` / `mc_push` / + `mc_pop` / `mc_destroy` for `uint64_t`. New + `benchmarks/nim/smoke/smoke_moodycamel.nim` and + `benchmarks/nim/smoke/smoke_threading_channels.nim` run a 32-item + push/pop round-trip as fast pre-flight checks in CI. +- `bench.yml` gains the `force_skip_moodycamel` / + `force_skip_threading_channels` / `force_skip_nim_channel` + `workflow_dispatch` boolean inputs and per-library install → smoke → + set-flag pipelines (design §2.6 soft-skip pattern). MoodyCamel's + install step is a `test -f` against the vendored header so the + bench is reproducible without network egress; threading uses + `nimble install threading`; system.Channel needs no install. + Failure at install or smoke flips the binary's compile flags so the + slugs are omitted from the BMF instead of failing the workflow; the + `Annotate skipped` step emits a `::warning title=Adapter + skipped::...` annotation visible on the PR check summary. The + `bench_mpsc` compile step now consumes `ADAPTER_FLAGS` so the new + `nim_channel` adapter wires in; the `bench_unbounded` compile step + honours `NIM_MODE=cpp` when MoodyCamel is enabled. 
+- `tests/t_bench_adapters.nim` extends with three new + `when defined(adapter__available):` blocks covering 1000-item + push/pop round-trip set equality for the new adapters (gated under + `nim cpp` for MoodyCamel). +- `THIRD_PARTY_LICENSES.md` lands its first vendored entry + (`concurrentqueue (MoodyCamel)`, BSD-2-Clause / Boost dual, pinned + to commit `d655418bb644b7f85159d94c591d7d983949fb81`) plus + unvendored entries for the nimble `threading` package (MIT) and + Nim `system.Channel` stdlib (MIT). Placeholder PR-4 reservation + removed. +- New `.gitattributes` rule + `benchmarks/vendor/** linguist-vendored=true linguist-generated=true` + excludes the vendored MoodyCamel header from GitHub language stats + and code-search noise. +- `benchmarks/README.md` comparison table extends to seven upstream + libraries / nine adapter variants with install commands for each. +- Bench-binary slug coverage extends per design §2.4: `bench_mpmc` + emits `threading_channels/mpmc/{1,2,4}p{1,2,4}c` (9 shapes); + `bench_mpsc` emits `nim_channel/mpsc/{1,2,4}p1c` (3 shapes); + `bench_unbounded` emits + `moodycamel/ConcurrentQueue/mpmc_unbounded/{1,2,4}p{1,2,4}c` (9 + shapes). Each carries a `throughput_ops_ms` measure with + `value=mean`, `lower_value=mean-stddev`, `upper_value=mean+stddev`. +- New `benchmarks/nim/bench_common.nim` shared harness module exporting: + `Topology` enum, `BMFEmitter` (alpha-sorted Bencher Metric Format JSON + emission), `Histogram` (min-heap top-K + Algorithm R reservoir for + stratified-percentile estimation, p99 within 1% of sort fallback on + 100k log-normal samples), generic `runThroughputHarness` and + `runLatencyHarness` (1P/1C ping-pong RTT with monotonic-ns timing and + per-run percentile aggregation), and Stats helpers (mean / stddev / + minVal / maxVal / linear-interpolation percentile). 
+- Five new lockfreequeues adapters in `benchmarks/nim/adapters/`: + `lockfreequeues_sipmuc_adapter.nim`, `lockfreequeues_mupsic_adapter.nim`, + `lockfreequeues_unbounded_sipsic_adapter.nim`, + `lockfreequeues_unbounded_sipmuc_adapter.nim`, + `lockfreequeues_unbounded_mupmuc_adapter.nim`. Each exposes + `topologiesSupported: set[Topology]` and the standard `push`/`pop` + shape consumed by the shared harness. The unbounded adapters store + the queue inline (not via `ptr`) to dodge a Nim 2.2.6 codegen bug + triggered by generic-pointer destructor calls when bench_common is + imported. +- New `benchmarks/merge_bmf.py` CLI: stateless union of per-binary BMF + JSON fragments into a single output file. Exits 1 on `(slug, measure)` + collisions naming both colliding inputs in stderr. Output slugs and + measures alpha-sorted. Pure-stdlib (no third-party deps); covered by + `benchmarks/tests/test_merge_bmf.py` (10 tests). +- `bench_throughput` `--bmf-out=` flag emits Bencher Metric Format + JSON natively. The flag is purely additive: with the flag absent, the + binary is bit-for-bit unchanged from the prior release (same stdout + text, same positional CLI: `bench_throughput sipsic mupmuc + unbounded_mupsic channels`). Emitted slugs: + `lockfreequeues_sipsic/spsc/1p1c`, + `lockfreequeues_mupmuc/mpmc/{1,2,4,8}p{1,2,4,8}c`, + `lockfreequeues_unbounded_mupsic/mpsc_unbounded/{1,2,4}p1c`, + `nim_channels/mpmc/{1,2,4}p{1,2,4}c`. Each carries a + `throughput_ops_ms` measure with `value=mean`, `lower_value=mean-stddev`, + `upper_value=mean+stddev`. +- Per-variant compile-time run-count overrides: + `-d:BenchSipsicRuns=N`, `-d:BenchSipsicWarmup=N`, + `-d:BenchMupmucRuns=N`, `-d:BenchMupmucWarmup=N`, + `-d:BenchChannelsRuns=N`, `-d:BenchChannelsWarmup=N`. Defaults match + the prior hard-coded `runs = 10`, so production runs are unchanged. +- `bench_latency` now emits Bencher Metric Format JSON natively via + `--bmf-out=`, mirroring `bench_throughput`'s CLI surface (PR 1). 
+ Positional args filter the variants run (`sipsic`, `mupmuc`, `sipmuc`, + `mupsic`); without any positional arg, all four bounded lockfreequeues + variants run at the 1p1c smoke shape. Emitted slugs: + `lockfreequeues_sipsic/spsc/1p1c`, + `lockfreequeues_sipmuc/mpmc/1p1c`, + `lockfreequeues_mupsic/mpsc/1p1c`, + `lockfreequeues_mupmuc/mpmc/1p1c`. Each carries + `latency_p50_ns` / `latency_p95_ns` / `latency_p99_ns` measures + (`latency_p999_ns` / `latency_max_ns` deferred to PR 6's threshold- + gating work). The binary is built on top of + `bench_common.runLatencyHarness` and uses per-binary intdefines: + `-d:BenchLatencyRuns=N` (default 33), `-d:BenchLatencyMessageCount=N` + (default 100_000), `-d:BenchLatencyWarmupRuns=N` (default 3). +- New `bench-latency` job in `.github/workflows/bench.yml` sibling to + `bench-throughput`. Both jobs upload per-binary BMF artifacts + (`bench-throughput-bmf` / `bench-latency-bmf`) consumed by a new + `bench-upload` job that downloads via `actions/download-artifact@v4` + pattern `bench-*-bmf`, runs `merge_bmf.py` to union the fragments, + and performs the single `bencher run` upload that co-locates latency + + throughput measures on shared per-slug histories. (Multiple + `bencher run` invocations create separate Bencher Reports and would + NOT co-locate measures — see merge rationale in design 1.) +- Four new topology-split throughput binaries replacing the legacy + `bench_throughput.nim` (PR 2): + `benchmarks/nim/bench_spsc.nim` (Sipsic 1p1c), + `benchmarks/nim/bench_mpsc.nim` (Mupsic {1,2,4}p1c), + `benchmarks/nim/bench_mpmc.nim` (Mupmuc {1,2,4}p{1,2,4}c plus 8p8c + oversubscription, Sipmuc 1p{1,2,4}c, Nim channels {1,2,4}p{1,2,4}c), + `benchmarks/nim/bench_unbounded.nim` (all four lockfreequeues + unbounded variants at their natural shapes). + Each emits BMF JSON via `--bmf-out=` with the same per-slug + `throughput_ops_ms` shape as the prior binary. 
Each owns its own + per-binary intdefines (`-d:BenchSpscRuns/MessageCount/Warmup`, + `-d:BenchMpscRuns/...`, `-d:BenchMpmcRuns/...`, plus four pairs of + `-d:UnboundedRuns/MessageCount` per design 2.5) so CI can + budget each topology independently. +- New `benchmarks/scripts/superset_check.py`: slug-set deletion-safety + guard that exits 0 when the post-split BMF covers every slug in the + pre-split fixture (`tests/fixtures/pre-split-slugs.json`) and + exits 1 with the missing slugs alpha-listed on stderr otherwise. + Run by `bench-upload` immediately after `merge_bmf.py` so any + silent slug regression introduced by future edits to the topology + binaries fails the PR check. Covered by 9 unit tests in + `benchmarks/tests/test_superset_check.py`. +- `benchmarks/tests/test_merge_bmf.py` gains `test_five_input_union` + covering the upload-job pipeline shape: 5 sibling fragments (one per + topology binary) merged via `merge_bmf.py` produce a single output + whose slug set is the disjoint union, with shared slugs carrying + measures from every input binary. +- Five third-party comparison adapters land in `benchmarks/nim/adapters/` + for the comparison MVP (PR 3, Track 3): `loony_adapter.nim` + (LoonyQueue, MIT, mpmc_unbounded), `boost_lockfree_queue_adapter.nim` + (`boost::lockfree::queue`, BSL-1.0, mpmc bounded), + `boost_lockfree_spsc_adapter.nim` + (`boost::lockfree::spsc_queue`, BSL-1.0, spsc bounded), + `crossbeam_array_queue_adapter.nim` (`crossbeam_queue::ArrayQueue`, + Apache-2.0 OR MIT, mpmc bounded), `crossbeam_seg_queue_adapter.nim` + (`crossbeam_queue::SegQueue`, Apache-2.0 OR MIT, mpmc_unbounded). + Each is gated behind a `-d:adapter__available` define; + absent gates produce no symbol references and the production builds + are unchanged. Tests in `tests/t_bench_adapters.nim` cover a + 1000-item push/pop round-trip per adapter. 
+- New Rust crate `benchmarks/rust/bench-ffi-crossbeam/`: a `cdylib` + exposing 8 `extern "C"` fns (`cb_array_init/push/pop/destroy`, + `cb_seg_init/push/pop/destroy`) consumed by the Crossbeam Nim + adapters. Pinned via `rust-toolchain.toml` to `stable`. Six + integration tests cover round-trip set equality for both queue + types, capacity edges, empty-pop, and null-pointer tolerance. +- New `benchmarks/nim/smoke/` directory with `smoke_boost.nim` and + `smoke_crossbeam.nim`: 32-item push/pop round-trip binaries used as + fast pre-flight checks in CI before the full bench compile. +- New workflow `.github/workflows/bench-comparison.yml`: dedicated + Crossbeam comparison job triggered by nightly cron (`0 4 * * *`), + `workflow_dispatch`, and targeted path pushes to `devel` (anything + under `benchmarks/rust/**` or `benchmarks/nim/adapters/crossbeam_*`). + Builds the cdylib via `dtolnay/rust-toolchain@stable` + + `Swatinem/rust-cache@v2`, runs the cdylib integration tests, + compiles `bench_mpmc` + `bench_unbounded` with the crossbeam gates, + merges via `merge_bmf.py`, and uploads to a separate Bencher Report. + Crossbeam is intentionally NOT in `bench.yml` so PR critical-path + time stays unchanged. +- `bench.yml` gains the `force_skip_boost` / `force_skip_loony` + `workflow_dispatch` boolean inputs and a per-library install -> + smoke -> set-flag pipeline (design §2.6 soft-skip). Failure at + install or smoke flips the binary's compile flags so the slugs are + omitted from the BMF instead of failing the workflow; the + `Annotate skipped` step emits a `::warning title=Adapter + skipped::...` annotation visible on the PR check summary. +- New `THIRD_PARTY_LICENSES.md` records license obligations for the + comparison MVP libraries (Loony MIT, Boost BSL-1.0, Crossbeam + Apache-2.0 OR MIT) and reserves placeholder entries for + concurrentqueue (PR 4) and uPlot (PR 5). 
+- New `src/lockfreequeues/internal/aligned_alloc.nim` exporting
+  `allocAligned[T]: ptr T` via a local `posix_memalign` shim. Used by
+  the four unbounded queue variants to allocate cache-line-aligned
+  segments (64-byte alignment instead of `c_calloc`'s 16-byte ABI
+  guarantee), eliminating the false-sharing asymmetry vs other
+  libraries flagged in design §4.2.
+
+### Fixed
+
+- Cache-line padding for unbounded queue segments. Each `Segment` field
+  participating in producer/consumer coordination now carries
+  `{.align: CacheLineBytes.}`, and the four unbounded variants
+  (`unbounded_sipsic`, `unbounded_sipmuc`, `unbounded_mupsic`,
+  `unbounded_mupmuc`) allocate via `allocAligned[Segment[S, T]]()`
+  instead of `c_calloc`. Verified by `tests/t_unbounded_padding.nim`
+  (8 assertions across 4 variants, green under c/cpp/arc/refc).
+
+### Changed
+
+- `bench_throughput.nim` now natively emits Bencher Metric Format JSON
+  via `--bmf-out=`. The CI workflow (`.github/workflows/bench.yml`)
+  was rewired to consume the native output and feed it through
+  `merge_bmf.py` before uploading to Bencher.dev — the previous Python
+  regex parser (`bmf_adapter.py`) is gone.
+- The four existing lockfreequeues adapter files renamed to the
+  canonical `<library>_adapter.nim` convention with `git mv`
+  (history preserved): `lockfreequeues_sipsic.nim`,
+  `lockfreequeues_mupmuc.nim`, `lockfreequeues_unbounded_mupsic.nim`.
+  Each gained a `topologiesSupported: set[Topology]` constant for the
+  upcoming PR 3 binary-split.
+- `benchmarks/render_readme.nim` rewritten to consume the new BMF JSON
+  shape directly (`{slug: {measure: MeasureValue}}`) instead of the
+  legacy `bench_main` aggregator output. The slug walk decomposes
+  `<impl>/<topology>/<N>p<M>c` back into the (impl, thread_config) pair
+  the table renders.
+- `benchmarks/runner.py` and `lockfreequeues.nimble` `task benchmarks`
+  redirected from `bench_main` to `bench_throughput --bmf-out=`.
+- `benchmarks/README.md` rewritten to document the new flow
+  (bench_common module, adapter convention, `--bmf-out` flag,
+  merge_bmf.py, expected slug set).
+- `benchmarks/nim/adapter.nim` now re-exports `PushResult` / `PopResult`
+  from `bench_common` instead of defining its own copies, unifying the
+  two parallel type definitions introduced by PR 0 Task 0.1. Both
+  adapter packs (legacy `lockfreequeues_sipsic` / `lockfreequeues_mupmuc`
+  / `channels` and the newer `lockfreequeues_sipmuc` / `mupsic` /
+  `unbounded_*`) now flow through the same `runLatencyHarness` and
+  `runThroughputHarness` without per-call-site type conversion. No
+  external API change: legacy callers that imported `./adapter` for
+  `PushResult` / `PopResult` continue to compile (PR 1).
+- `.github/workflows/bench.yml` now runs the five topology-split
+  binaries (`bench_spsc`, `bench_mpsc`, `bench_mpmc`, `bench_unbounded`,
+  `bench_latency`) as a GitHub Actions matrix instead of the legacy
+  pair of bench-throughput / bench-latency jobs. Each matrix entry
+  has its own `timeout-minutes: 12` budget so a hang in one binary
+  cannot burn the entire workflow's clock; the surviving binaries
+  finish, the bench-upload job merges what arrived, and the operator
+  gets partial Bencher coverage rather than no coverage. The
+  bench-upload job now also runs the `superset_check.py` deletion-
+  safety guard between `merge_bmf.py` and `bencher run` (PR 2).
+- `benchmarks/runner.py` and `lockfreequeues.nimble` `task benchmarks`
+  iterate the five topology-split binaries and merge their fragments
+  via `merge_bmf.py` (PR 2).
+- `benchmarks/README.md` rewritten to describe the 5-binary pipeline + (matrix CI job, per-binary intdefines, deletion-safety guard, the + merged BMF schema where one slug can carry both throughput and + latency measures) (PR 2). + +### Removed + +- `benchmarks/bmf_adapter.py` — Python regex parser that converted + `bench_throughput` stdout text into BMF JSON. Replaced by native BMF + emission via `--bmf-out=`. +- `benchmarks/test_bmf_adapter.py` — unit tests for the parser. + Replaced by `benchmarks/tests/test_merge_bmf.py`. +- `benchmarks/nim/bench_main.nim` — aggregator binary that wrapped + bench_throughput + bench_latency and produced a custom JSON shape. + `bench_throughput` is now the canonical entry point. +- `benchmarks/nim/bench_throughput.nim` — single multi-topology + throughput driver, replaced by the four topology-split binaries + `bench_spsc`, `bench_mpsc`, `bench_mpmc`, and `bench_unbounded`. + The pre-split slug fixture committed at + `tests/fixtures/pre-split-slugs.json` plus the `superset_check.py` + guard wired into bench.yml enforces that no slug from the legacy + binary silently disappears across the split (PR 2). +### Changed (typestates 0.7 uplift) + - Bump minimum `typestates` to 0.7.2. Pulls in the upstream `match` macro fixes for generic and cross-module contexts shipped in nim-typestates v0.7.1 / v0.7.2. - `opaqueStates = true` and `initial:` / `terminal:` DSL blocks added to 5 SET typestates: `CASAttempt`, `SPSCPopOp`, `SPSCPushOp`, `VirtualValueN`, and `VirtualValueN1`. - 8 hand-written `case .kind` dispatches across 4 facade modules (`sipsic.nim`, `mupmuc.nim`, `mupsic.nim`, `sipmuc.nim`) replaced with the generated `match` macro for compile-time exhaustiveness. -### Added +### Added (typestates 0.7 uplift) - CI: `typestates verify -W --format=github src/` step in `build.yml` to gate the typestate model against drift. 
-### Fixed +### Fixed (typestates 0.7 uplift) - 22 read-only typestate accessors across `src/lockfreequeues/typestates/` now carry `{.notATransition.}`. typestates' verifier flagged these once `typestates verify -W` was wired into CI; the procs are pure data extraction and were never transitions. diff --git a/README.md b/README.md index da54b90a..fee239a9 100644 --- a/README.md +++ b/README.md @@ -163,27 +163,25 @@ The full safety model — slot-ownership typestates, why the queue itself is loc ## Benchmarks -Throughput and latency results are checked into -[`benchmarks/results/latest.json`](benchmarks/results/latest.json) and rendered -into the table below. Re-run the suite with `nimble benchmarks`, then update -this section with `nim r benchmarks/render_readme.nim`. +The numbers below are a hand-curated summary of the four bounded +lockfreequeues variants on `ubuntu-latest` (4 vCPU, x86_64) at one +representative shape each. They are updated at release prep, NOT on +every devel push, and may lag the live data by up to one release +cycle. The "always-fresh" view lives at the chart page below. 
-_Platform: macosx arm64, 8 cores, 2025-12-03T22:24:55Z._ - -| implementation | threads | throughput (ops/ms) | p50 latency (ns) | -|----------------|---------|---------------------|------------------| -| `lockfreequeues/Sipsic` | 1P/1C | 7411.0 | 292 | -| `nim/channels` | 1P/1C | 1199.7 | — | -| `nim/channels` | 2P/2C | 815.8 | — | -| `nim/channels` | 4P/4C | 1779.5 | — | - -_Numbers regenerated by `nim r benchmarks/render_readme.nim` from `benchmarks/results/latest.json`._ - +| Queue | Topology | Shape | Throughput (ops/ms) | +|---------|----------|-------|---------------------| +| Sipsic | SPSC | 1P/1C | _to be filled at next release_ | +| Sipmuc | SPMC | 1P/2C | _to be filled at next release_ | +| Mupsic | MPSC | 2P/1C | _to be filled at next release_ | +| Mupmuc | MPMC | 2P/2C | _to be filled at next release_ | + +[Live interactive chart →](https://elijahr.github.io/lockfreequeues/latest/benchmarks/) -See [`benchmarks/`](benchmarks/) for the full suite, methodology, and -adapter implementations. +See [`benchmarks/`](benchmarks/) for the full suite, methodology, the +hand-curation procedure, and adapter implementations. ## Examples diff --git a/THIRD_PARTY_LICENSES.md b/THIRD_PARTY_LICENSES.md new file mode 100644 index 00000000..86ecbfee --- /dev/null +++ b/THIRD_PARTY_LICENSES.md @@ -0,0 +1,139 @@ +# Third-Party Licenses + +`lockfreequeues` itself is licensed under Apache-2.0 (see `LICENSE`). +The benchmark suite (under `benchmarks/`) compares `lockfreequeues` +against several upstream queue libraries; this file records the license +obligations for each one. + +This file is the canonical home for vendored / linked third-party code +notices. Per [`benchmarks/README.md`](benchmarks/README.md), each entry +records source, version, license, vendored path (if any), and upgrade +procedure (if any). 
+
+Per-vendor block schema:
+
+```markdown
+### <Library Name>
+
+- **Source:** https://github.com/<owner>/<repo>
+- **Version:** commit `<sha>` (vendored sources) or `<tag>` (tagged releases)
+- **License:** <license>
+- **Vendored at:** `<path>` (omit if not vendored)
+- **Upgrade procedure:** see `<path>/README.md` (omit if not vendored)
+```
+
+## Comparison MVP libraries (PR 3)
+
+The libraries below are linked at compile time by the bench suite when
+the relevant `-d:adapter_*_available` gate is set; their source is NOT
+vendored into this repository. The benchmark adapter code (under
+`benchmarks/nim/adapters/<library>_adapter.nim`) is original
+`lockfreequeues` source and inherits the project's Apache-2.0 license.
+
+### Loony
+
+- **Source:** https://github.com/shayanhabibi/loony
+- **Version:** `0.3.1` (resolved by `nimble install loony`; see
+  `nimble.lock` if pinned by a downstream consumer)
+- **License:** MIT
+- **Vendored at:** _(not vendored — resolved at build time via Nimble)_
+- **Upgrade procedure:** _(not applicable; nimble-managed)_
+
+### Boost.LockFree
+
+- **Source:** https://www.boost.org/libs/lockfree/
+- **Version:** Whichever version is provided by the system package
+  (`apt install libboost-dev` on Ubuntu CI; `brew install boost` on
+  macOS dev). The bench adapter is API-compatible with all Boost
+  versions that ship `boost/lockfree/queue.hpp` and
+  `boost/lockfree/spsc_queue.hpp`.
+- **License:** Boost Software License 1.0 (BSL-1.0)
+- **Vendored at:** _(not vendored — system include path)_
+- **Upgrade procedure:** _(not applicable; OS-package-managed)_
+
+### Crossbeam
+
+- **Source:** https://github.com/crossbeam-rs/crossbeam
+- **Version:** `crossbeam-queue 0.3.x` (pinned by
+  `benchmarks/rust/bench-ffi-crossbeam/Cargo.toml`; recorded in the
+  generated `Cargo.lock` at build time)
+- **License:** Apache-2.0 OR MIT (choose either)
+- **Vendored at:** _(crate sources are downloaded by Cargo at build
+  time; only our own thin C-ABI shim under
+  `benchmarks/rust/bench-ffi-crossbeam/` is committed)_
+- **Upgrade procedure:** bump the `crossbeam-queue` version in
+  `benchmarks/rust/bench-ffi-crossbeam/Cargo.toml`, run `cargo build`
+  to refresh `Cargo.lock`, run the integration tests
+  (`cargo test --release --manifest-path benchmarks/rust/bench-ffi-crossbeam/Cargo.toml`)
+  and the Nim round-trip suite (`tests/t_bench_adapters.nim` with the
+  crossbeam gates).
+
+## Docs site charting (PR 5)
+
+The docs site under `docs/` ships an interactive throughput chart
+(`docs/benchmarks.md`) that renders via the vendored uPlot bundle
+below. The chart wiring (`docs/assets/bench-charts.js` +
+`docs/assets/bench-charts.css`) is original `lockfreequeues` source
+and inherits the project's Apache-2.0 license.
+
+### uPlot
+
+- **Source:** https://github.com/leeoniya/uPlot
+- **Version:** `1.6.27`
+- **License:** MIT
+- **Vendored at:**
+  - `docs/assets/uplot-1.6.27.iife.min.js` (chart runtime)
+  - `docs/assets/uplot-1.6.27.min.css` (companion stylesheet for axes /
+    grid lines / cursor; without it uPlot DOM elements stack incorrectly)
+- **Upgrade procedure:** download both bundles from
+  `https://cdn.jsdelivr.net/npm/uplot@<version>/dist/uPlot.iife.min.js`
+  and `.../dist/uPlot.min.css`, rename each with the new version suffix,
+  update the `<script>` reference on the chart page and the
+  `extra_css` entry in `mkdocs.yml` to the new filenames.

+ +### Methodology and fairness caveats + +All numbers below are produced on GitHub-hosted `ubuntu-latest` runners (4 vCPU +implicit, x86_64). Do not infer absolute latency or throughput suitability for +production hardware from these numbers — use them only for relative comparison +between queue implementations under identical conditions. + +Specific caveats: + +- **Cache-line padding asymmetry.** Some libraries (lockfreequeues, MoodyCamel, + Boost.LockFree) pad their head/tail/sequence fields to 64 bytes; others may + not. The lockfreequeues MPMC types were padding-audited as part of PR 3 (see + audit checklist). +- **Memory ordering.** lockfreequeues uses `acquire`/`release` ordering on its + hot paths; some external libraries default to `seq_cst`, which is stricter + and may show as higher latency. +- **NUMA pinning.** None on `ubuntu-latest`. NUMA-aware comparison would + require self-hosted high-core runners (radar item). +- **Message size and capacity.** All benchmarks transfer 8-byte `uint64_t` + payloads. Bounded queues use a compile-time capacity matched across libraries. +- **Blocking vs non-blocking semantics.** Nim's `system/Channel` and + `Threading.Channels` block on full instead of returning a "queue full" + signal. Their throughput numbers reflect blocking semantics, not the + non-blocking `try_push` path that the lockfree queues use. These are marked + with an asterisk in the chart legend. +- **Producer/consumer thread placement.** No explicit pinning. The runner's + scheduler is the ground truth for thread placement. +- **CPU oversubscription.** `ubuntu-latest` has 4 vCPU. MPMC variants beyond + 4P + 4C measure scheduler oversubscription, not lock-free contention. 
diff --git a/lockfreequeues.nimble b/lockfreequeues.nimble index 230b8352..22e135c1 100644 --- a/lockfreequeues.nimble +++ b/lockfreequeues.nimble @@ -52,8 +52,46 @@ task examples, "Runs the examples": exec "nim c --threads:on -r -f examples/job_scheduler.nim" task benchmarks, "Runs the benchmark suite": - exec "nim c -d:release --threads:on benchmarks/nim/bench_main.nim" - exec "benchmarks/nim/bench_main --runs=10 -o=benchmarks/results/latest.json" + # PR 2 (bench-rollup) replaced bench_throughput.nim with five + # topology-split binaries. Each emits its own Bencher Metric Format + # JSON fragment; merge_bmf.py unions them into one final file. + # Binaries land in `.tmp/` per the project nim.cfg (`--outdir:.tmp`). + mkDir "benchmarks/results" + for binName in [ + "bench_spsc", "bench_mpsc", "bench_mpmc", + "bench_unbounded", "bench_latency", + ]: + exec "nim c -d:release --threads:on benchmarks/nim/" & binName & ".nim" + exec ".tmp/" & binName & " --bmf-out=benchmarks/results/" & binName & ".json" + # Union the per-binary fragments. Exits 1 on (slug, measure) collisions. + exec "python3 benchmarks/merge_bmf.py benchmarks/results/latest.json " & + "benchmarks/results/bench_spsc.json " & + "benchmarks/results/bench_mpsc.json " & + "benchmarks/results/bench_mpmc.json " & + "benchmarks/results/bench_unbounded.json " & + "benchmarks/results/bench_latency.json" + + +task benchtests, "Runs the bench harness test suite": + # The bench harness lives outside `srcDir`, so its dedicated tests + # (`tests/t_bench_*.nim`) are NOT imported by `tests/test.nim` to + # keep the regular `nimble test` matrix free of the bench harness's + # threading/atomic dependencies. This task runs them explicitly so + # CI can validate HistogramTopK sizing, latency CLI assertions, and + # adapter round-trip behavior. Single MM (orc default) is sufficient + # because the bench harness itself is the system under test, not the + # queue MM matrix. 
+ exec "nim c --threads:on -r -f tests/t_bench_common.nim" + exec "nim c --threads:on -r -f tests/t_bench_latency.nim" + exec "nim c --threads:on -r -f tests/t_bench_adapters.nim" + + +task benchteststress, "Runs the bench harness test suite including 3.3M-sample stress shapes": + # Like `benchtests` but enables the gated 3.3M-sample p999 stress + # shape in t_bench_common (HistogramTopK headroom validation against + # an operator-driven MessageCount override). Slow (~10-15s release) + # so it is opt-in rather than part of every CI run. + exec "nim c -d:release -d:BenchCommonStress --threads:on -r -f tests/t_bench_common.nim" task stresstests, "Runs the stress test suite (multi-threaded)": diff --git a/mkdocs.yml b/mkdocs.yml index 3f934da6..7f5f0001 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -111,3 +111,5 @@ extra: extra_css: - css/custom.css + - assets/bench-charts.css + - assets/uplot-1.6.27.min.css diff --git a/src/lockfreequeues/internal/aligned_alloc.nim b/src/lockfreequeues/internal/aligned_alloc.nim new file mode 100644 index 00000000..4a2d82e9 --- /dev/null +++ b/src/lockfreequeues/internal/aligned_alloc.nim @@ -0,0 +1,114 @@ +## Cache-line-aligned heap allocation for unbounded queue Segments. +## +## Project-wide invariant (design doc §4.2): every ``Segment[S, T]`` allocation +## must be aligned to ``CacheLineBytes`` (64 on x86_64) so that the +## ``{.align: CacheLineBytes.}`` pragma on internal Atomic fields lifts those +## fields onto distinct physical cache lines, not merely distinct intra-struct +## offsets that share a 16-byte-aligned base with adjacent allocations. +## +## ``c_calloc`` / ``c_malloc`` only guarantee ``2 * sizeof(size_t) == 16`` bytes +## of alignment under glibc and macOS libSystem. Without an aligned-allocation +## primitive, the first cache-line slot of every Segment is split across two +## physical lines and false-shares with whatever neighbours the heap happens +## to place adjacent. 
+## +## Platform mapping: +## +## * POSIX (Linux, macOS): ``posix_memalign`` from ````. Memory is +## compatible with the standard ``free`` per POSIX. +## * Windows (MSVC + MinGW runtime): ``_aligned_malloc`` from ````. +## Memory MUST be freed with ``_aligned_free`` — using ``free`` corrupts +## the heap. +## +## Both backends are wrapped behind ``allocAligned[T]() / freeAligned(p)`` so +## callers don't have to ``when defined(windows):`` at every site. +## +## Compile probe verified at impl-plan time (Task 3.2.0): the C +## ``posix_memalign`` from ```` is callable from both ``nim c`` and +## ``nim cpp`` on macOS (libSystem) and Linux glibc, returning 64-byte +## aligned memory and ``rc == 0`` on success. +## +## Note: ``std/posix.posix_memalign`` was the first candidate, but on macOS +## the Apple SDK declares the first parameter with the +## ``__unsafe_indexable`` attribute under C++, which the Nim wrapper does +## not match — ``nim cpp`` then fails with +## "cannot convert argument of incomplete type 'void *' to 'void **'". +## The local importc shim below uses the canonical C signature, which +## clang accepts in both C and C++ modes. + +when defined(windows): + proc aligned_malloc( + size: csize_t, alignment: csize_t + ): pointer {.importc: "_aligned_malloc", header: "".} + proc aligned_free( + memblock: pointer + ) {.importc: "_aligned_free", header: "".} +else: + proc posix_memalign( + memptr: ptr pointer, alignment: csize_t, size: csize_t + ): cint {.importc, header: "".} + from system/ansi_c import c_free + +import ../atomic_dsl +export CacheLineBytes + +proc allocAligned*[T](): ptr T = + ## Allocate one zero-initialized ``T`` on at least a ``CacheLineBytes`` + ## boundary, but honor ``alignof(T)`` if it is larger. + ## + ## Raises ``OutOfMemDefect`` on allocation failure (matches the existing + ## ``c_calloc`` failure path in unbounded queue ``newSegment`` procs). 
+ ## + ## The caller owns the returned pointer; release with ``freeAligned``, + ## which routes to the platform-correct deallocator (``free`` on POSIX, + ## ``_aligned_free`` on Windows). + ## + ## Both backends require ``alignment`` to be a power of two; the Windows + ## ``_aligned_malloc`` accepts any power of two, while POSIX + ## ``posix_memalign`` additionally requires a multiple of + ## ``sizeof(pointer)``. ``CacheLineBytes`` (64) satisfies both on every + ## platform we target; ``alignof(T)`` is always a power of two per the + ## C standard, and an over-aligned ``T`` (e.g. an SSE/AVX vector or a + ## manually ``{.align: 128.}``-pragma'd object) would have + ## ``alignof(T) >= sizeof(pointer)`` as well, so taking ``max`` of the + ## two keeps the constraint valid. We compute the max at compile time so + ## the runtime alignment argument is constant per instantiation. + const alignment = max(CacheLineBytes, alignof(T)) + when defined(windows): + let p = aligned_malloc(csize_t(sizeof(T)), csize_t(alignment)) + if p == nil: + raise newException(OutOfMemDefect, "_aligned_malloc failed for " & $T) + else: + var p: pointer + if posix_memalign(addr p, csize_t(alignment), csize_t(sizeof(T))) != 0: + raise newException(OutOfMemDefect, "posix_memalign failed for " & $T) + zeroMem(p, sizeof(T)) + result = cast[ptr T](p) + +proc freeAligned*(p: pointer) {.inline.} = + ## Release a pointer obtained from ``allocAligned``. Does nothing on a + ## ``nil`` argument so callers can use it idempotently in destructors. + ## Takes ``pointer`` (not ``ptr T``) so callers in untyped destructor + ## hooks (where the segment type has been erased to ``pointer``) can + ## use the same call site as typed callers. + if p == nil: return + when defined(windows): + aligned_free(p) + else: + c_free(p) + +proc freeAligned*[T](p: ptr T) {.inline.} = + ## Typed convenience overload that forwards to the ``pointer`` variant. + ## Lets callers pass ``ptr T`` without an explicit cast. 
+ freeAligned(cast[pointer](p)) + +when isMainModule: + # Smoke test: verify allocAligned returns 64-byte aligned memory. + type Probe = object + a: int + b: array[128, byte] + let p = allocAligned[Probe]() + doAssert p != nil + doAssert (cast[uint](p) mod CacheLineBytes.uint) == 0 + echo "allocAligned[Probe] -> ", cast[uint](p), " (mod 64 = ", cast[uint](p) mod 64'u, ")" + freeAligned(p) diff --git a/src/lockfreequeues/unbounded_mupmuc.nim b/src/lockfreequeues/unbounded_mupmuc.nim index ba71c2fb..c22b44ad 100644 --- a/src/lockfreequeues/unbounded_mupmuc.nim +++ b/src/lockfreequeues/unbounded_mupmuc.nim @@ -34,9 +34,9 @@ import ./atomic_dsl import ./backoff +import ./internal/aligned_alloc import std/options import std/typetraits -from system/ansi_c import c_calloc, c_free import debra @@ -64,10 +64,13 @@ else: type Segment[S: static int, T] = object ## A fixed-size segment in the linked list. data: array[S, T] - next: Atomic[ptr Segment[S, T]] - tail: Atomic[int] # CAS coordination for producers - prevConsumerIdx: Atomic[int] # CAS coordination for consumers - committed: array[S, Atomic[bool]] # Track which slots are ready to read + next {.align: CacheLineBytes.}: Atomic[ptr Segment[S, T]] + tail {.align: CacheLineBytes.}: Atomic[int] + # CAS coordination for producers + prevConsumerIdx {.align: CacheLineBytes.}: Atomic[int] + # CAS coordination for consumers + committed {.align: CacheLineBytes.}: array[S, Atomic[bool]] + # Track which slots are ready to read UnboundedMupmuc*[S: static int, T; MaxThreads: static int] = object ## Unbounded MPMC queue using linked segments. @@ -76,8 +79,10 @@ type ## - T: Data type. ## - MaxThreads: Maximum number of threads (compile-time constant). 
manager: ptr DebraManager[MaxThreads] - headSegment: Atomic[ptr Segment[S, T]] # Consumers read from here - tailSegment: Atomic[ptr Segment[S, T]] # Producers write here + headSegment {.align: CacheLineBytes.}: Atomic[ptr Segment[S, T]] + # Consumers read from here + tailSegment {.align: CacheLineBytes.}: Atomic[ptr Segment[S, T]] + # Producers write here strategy: DeallocationStrategy itemCount: Atomic[int] # Total items in queue segments: Atomic[int] # Number of segments @@ -105,10 +110,9 @@ type handle: ThreadHandle[MaxThreads] proc newSegment[S: static int, T](): ptr Segment[S, T] = - ## Allocate a new segment via libc calloc (zero-initialized, truly shared). - result = cast[ptr Segment[S, T]](c_calloc(1.csize_t, sizeof(Segment[S, T]).csize_t)) - if result == nil: - raise newException(OutOfMemDefect, "newSegment: c_calloc returned nil") + ## Allocate a new segment on a CacheLineBytes boundary so the + ## ``{.align.}`` pragmas above land on distinct physical cache lines. + result = allocAligned[Segment[S, T]]() result.next.store(nil, moRelaxed) result.tail.store(0, moRelaxed) result.prevConsumerIdx.store(-1, moRelaxed) @@ -161,24 +165,25 @@ proc newUnboundedMupmuc*[S: static int, T; MaxThreads: static int]( ## owned by this queue. Manager teardown happens inside this queue's ## `=destroy` after segment cleanup. For multi-queue setups that ## share a manager, use the `(manager, strategy)` overload instead. - let mgr = cast[ptr DebraManager[MaxThreads]](c_calloc( - 1.csize_t, sizeof(DebraManager[MaxThreads]).csize_t - )) - if mgr == nil: - raise newException(OutOfMemDefect, "newUnboundedMupmuc: c_calloc returned nil") + let mgr = allocAligned[DebraManager[MaxThreads]]() + var ok = false try: mgr[] = initDebraManager[MaxThreads]() result = newUnboundedMupmuc[S, T, MaxThreads](mgr, strategy) result.ownsManager = true - except: - # Run the manager's =destroy (drains any limbo bags + asserts the - # client refcount is zero) before freeing the heap slot. 
Safe for - # both partially- and fully-initialized state because c_calloc - # zeroed it: nil limboBagTail pointers walk no list, boundClients - # is 0 so the destructor's invariant assertion passes. - reset(mgr[]) - c_free(mgr) - raise + ok = true + finally: + # `finally` (not `except:`) so the cleanup also runs on `Defect`-class + # raises (e.g. `OutOfMemDefect` from inside `initDebraManager`). Under + # Nim 2.0, bare `except:` matches only `CatchableError`, leaving + # Defect-shaped failures to leak `mgr`. Run the manager's `=destroy` + # (drains any limbo bags + asserts the client refcount is zero) before + # freeing the heap slot. Safe for both partially- and fully-initialized + # state because `allocAligned` zeroed it: nil limboBagTail pointers walk + # no list, boundClients is 0 so the destructor's invariant passes. + if not ok: + reset(mgr[]) + freeAligned(mgr) proc segmentCount*[S: static int, T; MaxThreads: static int]( self: var UnboundedMupmuc[S, T, MaxThreads] @@ -285,7 +290,7 @@ proc push*[S: static int, T; MaxThreads: static int]( else: # Lost the segment-alloc race: another producer linked first. # Free our orphan segment and back off before retrying. - c_free(newSeg) + freeAligned(newSeg) backoffOnRetry(spins) continue else: @@ -316,14 +321,14 @@ proc push*[S: static int, T; MaxThreads: static int]( # Typed destructor for retired segments. Must be generic over `(S, T)` # because the segment's `data: array[S, T]` slots may hold managed types # (`string`, `seq`, `ref`, ...) whose internal allocations would leak if -# we just `c_free`'d the segment block. For POD `T` (`supportsCopyMem`), +# we just `freeAligned`'d the segment block. For POD `T` (`supportsCopyMem`), # the `reset` loop is compile-time-elided, so this costs nothing. 
proc segmentDestructor[S: static int, T](p: pointer) {.nimcall, raises: [].} = when not supportsCopyMem(T): let seg = cast[ptr Segment[S, T]](p) for i in 0 ..< S: reset(seg.data[i]) - c_free(p) + freeAligned(p) proc pop*[S: static int, T; MaxThreads: static int]( self: var Consumer[S, T, MaxThreads] @@ -431,6 +436,25 @@ proc pop*[S: static int, T; MaxThreads: static int]( return none(seq[T]) return some(items) +when defined(testing): + proc headSegmentForTest*[S: static int, T; MaxThreads: static int]( + self: var UnboundedMupmuc[S, T, MaxThreads] + ): pointer = + ## Test-only accessor: returns the queue's current head segment pointer + ## so the cache-line padding audit can verify base alignment. + result = cast[pointer](self.headSegment.load(moRelaxed)) + + proc segmentHeadOffsetForTest*[S: static int, T; MaxThreads: static int]( + _: typedesc[UnboundedMupmuc[S, T, MaxThreads]] + ): tuple[tail: int, prevConsumerIdx: int, committed: int] = + ## Test-only accessor: returns offsets of cache-line-padded fields within + ## the unbounded mupmuc Segment for the cache-line padding audit. + result = ( + offsetOf(Segment[S, T], tail), + offsetOf(Segment[S, T], prevConsumerIdx), + offsetOf(Segment[S, T], committed), + ) + proc `=destroy`*[S: static int, T; MaxThreads: static int]( self: var UnboundedMupmuc[S, T, MaxThreads] ) = @@ -442,11 +466,11 @@ proc `=destroy`*[S: static int, T; MaxThreads: static int]( let next = seg.next.load(moRelaxed) when not supportsCopyMem(T): # Run the destructor for any managed slots (string/seq/ref) before - # `c_free`'s away the segment block — otherwise their internal + # `freeAligned`'s away the segment block — otherwise their internal # allocations leak. for i in 0 ..< S: reset(seg.data[i]) - c_free(seg) + freeAligned(seg) seg = next # Release our refcount on the manager. 
Conceptually pairs with the @@ -460,4 +484,4 @@ proc `=destroy`*[S: static int, T; MaxThreads: static int]( # without the parsing surprise of the backtick form, which trips # `expr(nkIdent); unknown node kind` inside a generic destructor. reset(self.manager[]) - c_free(self.manager) + freeAligned(self.manager) diff --git a/src/lockfreequeues/unbounded_mupsic.nim b/src/lockfreequeues/unbounded_mupsic.nim index 5bc2e679..9c02ce97 100644 --- a/src/lockfreequeues/unbounded_mupsic.nim +++ b/src/lockfreequeues/unbounded_mupsic.nim @@ -36,9 +36,9 @@ import ./atomic_dsl import ./backoff +import ./internal/aligned_alloc import std/options import std/typetraits -from system/ansi_c import c_calloc, c_free import debra @@ -66,10 +66,17 @@ else: type Segment[S: static int, T] = object ## A fixed-size segment in the linked list. data: array[S, T] - next: Atomic[ptr Segment[S, T]] - tail: Atomic[int] # CAS coordination for producers - head: int # Consumer read position within segment (single consumer, no atomic) - committed: array[S, Atomic[bool]] # Track which slots are ready to read + next {.align: CacheLineBytes.}: Atomic[ptr Segment[S, T]] + tail {.align: CacheLineBytes.}: Atomic[int] + # CAS coordination for producers + head {.align: CacheLineBytes.}: int + # Consumer read position within segment (single consumer, no atomic). + # Aligned to its own cache line so consumer head writes do not + # invalidate producers' cached `tail` line. Without the pragma `head` + # would share a 64-byte chunk with `tail` because Nim packs object + # fields back-to-back unless explicitly aligned. + committed {.align: CacheLineBytes.}: array[S, Atomic[bool]] + # Track which slots are ready to read UnboundedMupsic*[S: static int, T; MaxThreads: static int] = object ## Unbounded MPSC queue using linked segments. @@ -78,8 +85,10 @@ type ## - T: Data type. ## - MaxThreads: Maximum number of threads (compile-time constant). 
manager: ptr DebraManager[MaxThreads] - headSegment: Atomic[ptr Segment[S, T]] # Consumer reads from here - tailSegment: Atomic[ptr Segment[S, T]] # Producers write here (atomic for CAS) + headSegment {.align: CacheLineBytes.}: Atomic[ptr Segment[S, T]] + # Consumer reads from here + tailSegment {.align: CacheLineBytes.}: Atomic[ptr Segment[S, T]] + # Producers write here (atomic for CAS) strategy: DeallocationStrategy handle: ThreadHandle[MaxThreads] # Consumer's handle (single consumer) itemCount: Atomic[int] # Total items in queue @@ -100,10 +109,9 @@ type handle: ThreadHandle[MaxThreads] # Each producer has its own handle proc newSegment[S: static int, T](): ptr Segment[S, T] = - ## Allocate a new segment via libc calloc (zero-initialized, truly shared). - result = cast[ptr Segment[S, T]](c_calloc(1.csize_t, sizeof(Segment[S, T]).csize_t)) - if result == nil: - raise newException(OutOfMemDefect, "newSegment: c_calloc returned nil") + ## Allocate a new segment on a CacheLineBytes boundary so the + ## ``{.align.}`` pragmas above land on distinct physical cache lines. + result = allocAligned[Segment[S, T]]() result.next.store(nil, moRelaxed) result.tail.store(0, moRelaxed) result.head = 0 @@ -166,24 +174,25 @@ proc newUnboundedMupsic*[S: static int, T; MaxThreads: static int]( ## handle obtained on the consumer thread. ## ## For multi-queue setups that share a manager, pass it explicitly. 
- let mgr = cast[ptr DebraManager[MaxThreads]](c_calloc( - 1.csize_t, sizeof(DebraManager[MaxThreads]).csize_t - )) - if mgr == nil: - raise newException(OutOfMemDefect, "newUnboundedMupsic: c_calloc returned nil") + let mgr = allocAligned[DebraManager[MaxThreads]]() + var ok = false try: mgr[] = initDebraManager[MaxThreads]() let consumerHandle = registerThread(mgr[]) result = newUnboundedMupsic[S, T, MaxThreads](mgr, consumerHandle, strategy) result.ownsManager = true - except: - # Run the manager's =destroy (drains any limbo bags + asserts the - # client refcount is zero) before freeing the heap slot. Safe for - # both partially- and fully-initialized state because c_calloc - # zeroed it. - reset(mgr[]) - c_free(mgr) - raise + ok = true + finally: + # `finally` (not `except:`) so the cleanup also runs on `Defect`-class + # raises (e.g. `OutOfMemDefect` from inside `initDebraManager`). Under + # Nim 2.0, bare `except:` matches only `CatchableError`, leaving + # Defect-shaped failures to leak `mgr`. Run the manager's `=destroy` + # (drains any limbo bags + asserts the client refcount is zero) before + # freeing the heap slot. Safe for both partially- and fully-initialized + # state because `allocAligned` zeroed it. + if not ok: + reset(mgr[]) + freeAligned(mgr) proc segmentCount*[S: static int, T; MaxThreads: static int]( self: var UnboundedMupsic[S, T, MaxThreads] @@ -266,7 +275,7 @@ proc push*[S: static int, T; MaxThreads: static int]( continue else: # Lost the segment-alloc race, free our orphan and back off. - c_free(newSeg) + freeAligned(newSeg) backoffOnRetry(spins) continue else: @@ -298,14 +307,14 @@ proc push*[S: static int, T; MaxThreads: static int]( # Typed destructor for retired segments. Generic over `(S, T)` so we can # `reset` any managed slots (`string`, `seq`, `ref`, ...) before -# `c_free`'s away the segment block. For POD `T` (`supportsCopyMem`), +# `freeAligned`'s away the segment block. 
For POD `T` (`supportsCopyMem`), # the loop is compile-time-elided. proc segmentDestructor[S: static int, T](p: pointer) {.nimcall, raises: [].} = when not supportsCopyMem(T): let seg = cast[ptr Segment[S, T]](p) for i in 0 ..< S: reset(seg.data[i]) - c_free(p) + freeAligned(p) proc pop*[S: static int, T; MaxThreads: static int]( self: var UnboundedMupsic[S, T, MaxThreads] @@ -392,6 +401,27 @@ proc pop*[S: static int, T; MaxThreads: static int]( return none(seq[T]) return some(items) +when defined(testing): + proc headSegmentForTest*[S: static int, T; MaxThreads: static int]( + self: var UnboundedMupsic[S, T, MaxThreads] + ): pointer = + ## Test-only accessor: returns the queue's current head segment pointer + ## so the cache-line padding audit can verify base alignment. + result = cast[pointer](self.headSegment.load(moRelaxed)) + + proc segmentHeadOffsetForTest*[S: static int, T; MaxThreads: static int]( + _: typedesc[UnboundedMupsic[S, T, MaxThreads]] + ): tuple[tail: int, head: int, committed: int] = + ## Test-only accessor: returns offsets of cache-line-padded fields within + ## the unbounded mupsic Segment for the cache-line padding audit. ``head`` + ## (consumer cursor) MUST live on its own line because every consumer write + ## would otherwise invalidate producers' cached ``tail``. + result = ( + offsetOf(Segment[S, T], tail), + offsetOf(Segment[S, T], head), + offsetOf(Segment[S, T], committed), + ) + proc `=destroy`*[S: static int, T; MaxThreads: static int]( self: var UnboundedMupsic[S, T, MaxThreads] ) = @@ -403,11 +433,11 @@ proc `=destroy`*[S: static int, T; MaxThreads: static int]( let next = seg.next.load(moRelaxed) when not supportsCopyMem(T): # Run the destructor for any managed slots (string/seq/ref) before - # `c_free`'s away the segment block — otherwise their internal + # `freeAligned`'s away the segment block — otherwise their internal # allocations leak. 
for i in 0 ..< S: reset(seg.data[i]) - c_free(seg) + freeAligned(seg) seg = next # Release our refcount on the manager. Conceptually pairs with the @@ -421,4 +451,4 @@ proc `=destroy`*[S: static int, T; MaxThreads: static int]( # without the parsing surprise of the backtick form, which trips # `expr(nkIdent); unknown node kind` inside a generic destructor. reset(self.manager[]) - c_free(self.manager) + freeAligned(self.manager) diff --git a/src/lockfreequeues/unbounded_sipmuc.nim b/src/lockfreequeues/unbounded_sipmuc.nim index b559d577..0c2b08cb 100644 --- a/src/lockfreequeues/unbounded_sipmuc.nim +++ b/src/lockfreequeues/unbounded_sipmuc.nim @@ -38,9 +38,9 @@ import ./atomic_dsl import ./backoff +import ./internal/aligned_alloc import std/options import std/typetraits -from system/ansi_c import c_calloc, c_free import debra @@ -68,9 +68,11 @@ else: type Segment[S: static int, T] = object ## A fixed-size segment in the linked list. data: array[S, T] - next: Atomic[ptr Segment[S, T]] - tail: Atomic[int] # Producer write position within segment - prevConsumerIdx: Atomic[int] # CAS coordination for consumers + next {.align: CacheLineBytes.}: Atomic[ptr Segment[S, T]] + tail {.align: CacheLineBytes.}: Atomic[int] + # Producer write position within segment + prevConsumerIdx {.align: CacheLineBytes.}: Atomic[int] + # CAS coordination for consumers UnboundedSipmuc*[S: static int, T; MaxThreads: static int] = object ## Unbounded SPMC queue using linked segments. @@ -79,8 +81,10 @@ type ## - T: Data type. ## - MaxThreads: Maximum number of threads (compile-time constant). 
manager: ptr DebraManager[MaxThreads] - headSegment: Atomic[ptr Segment[S, T]] # Consumers read from here - tailSegment: ptr Segment[S, T] # Producer writes here (single-producer) + headSegment {.align: CacheLineBytes.}: Atomic[ptr Segment[S, T]] + # Consumers read from here + tailSegment {.align: CacheLineBytes.}: ptr Segment[S, T] + # Producer writes here (single-producer) strategy: DeallocationStrategy itemCount: Atomic[int] # Total items in queue segments: Atomic[int] # Number of segments @@ -103,10 +107,9 @@ type handle: ThreadHandle[MaxThreads] # Thread handle for pin/unpin proc newSegment[S: static int, T](): ptr Segment[S, T] = - ## Allocate a new segment via libc calloc (zero-initialized, truly shared). - result = cast[ptr Segment[S, T]](c_calloc(1.csize_t, sizeof(Segment[S, T]).csize_t)) - if result == nil: - raise newException(OutOfMemDefect, "newSegment: c_calloc returned nil") + ## Allocate a new segment on a CacheLineBytes boundary so the + ## ``{.align.}`` pragmas above land on distinct physical cache lines. + result = allocAligned[Segment[S, T]]() result.next.store(nil, moRelaxed) result.tail.store(0, moRelaxed) result.prevConsumerIdx.store(-1, moRelaxed) # No consumer yet @@ -160,23 +163,24 @@ proc newUnboundedSipmuc*[S: static int, T; MaxThreads: static int]( ## owned by this queue. Manager teardown happens inside this queue's ## `=destroy` after segment cleanup. For multi-queue setups that ## share a manager, use the `(manager, strategy)` overload instead. 
- let mgr = cast[ptr DebraManager[MaxThreads]](c_calloc( - 1.csize_t, sizeof(DebraManager[MaxThreads]).csize_t - )) - if mgr == nil: - raise newException(OutOfMemDefect, "newUnboundedSipmuc: c_calloc returned nil") + let mgr = allocAligned[DebraManager[MaxThreads]]() + var ok = false try: mgr[] = initDebraManager[MaxThreads]() result = newUnboundedSipmuc[S, T, MaxThreads](mgr, strategy) result.ownsManager = true - except: - # Run the manager's =destroy (drains any limbo bags + asserts the - # client refcount is zero) before freeing the heap slot. Safe for - # both partially- and fully-initialized state because c_calloc - # zeroed it. - reset(mgr[]) - c_free(mgr) - raise + ok = true + finally: + # `finally` (not `except:`) so the cleanup also runs on `Defect`-class + # raises (e.g. `OutOfMemDefect` from inside `initDebraManager`). Under + # Nim 2.0, bare `except:` matches only `CatchableError`, leaving + # Defect-shaped failures to leak `mgr`. Run the manager's `=destroy` + # (drains any limbo bags + asserts the client refcount is zero) before + # freeing the heap slot. Safe for both partially- and fully-initialized + # state because `allocAligned` zeroed it. + if not ok: + reset(mgr[]) + freeAligned(mgr) proc segmentCount*[S: static int, T; MaxThreads: static int]( self: var UnboundedSipmuc[S, T, MaxThreads] @@ -265,14 +269,14 @@ proc getConsumer*[S: static int, T; MaxThreads: static int]( # Typed destructor for retired segments. Generic over `(S, T)` so we can # `reset` any managed slots (`string`, `seq`, `ref`, ...) before -# `c_free`'s away the segment block. For POD `T` (`supportsCopyMem`), +# `freeAligned`'s away the segment block. For POD `T` (`supportsCopyMem`), # the loop is compile-time-elided. 
proc segmentDestructor[S: static int, T](p: pointer) {.nimcall, raises: [].} = when not supportsCopyMem(T): let seg = cast[ptr Segment[S, T]](p) for i in 0 ..< S: reset(seg.data[i]) - c_free(p) + freeAligned(p) proc pop*[S: static int, T; MaxThreads: static int]( self: var Consumer[S, T, MaxThreads] @@ -371,6 +375,21 @@ proc pop*[S: static int, T; MaxThreads: static int]( return none(seq[T]) return some(items) +when defined(testing): + proc headSegmentForTest*[S: static int, T; MaxThreads: static int]( + self: var UnboundedSipmuc[S, T, MaxThreads] + ): pointer = + ## Test-only accessor: returns the queue's current head segment pointer + ## so the cache-line padding audit can verify base alignment. + result = cast[pointer](self.headSegment.load(moRelaxed)) + + proc segmentHeadOffsetForTest*[S: static int, T; MaxThreads: static int]( + _: typedesc[UnboundedSipmuc[S, T, MaxThreads]] + ): tuple[tail: int, prevConsumerIdx: int] = + ## Test-only accessor: returns offsets of cache-line-padded fields within + ## the unbounded sipmuc Segment for the cache-line padding audit. + result = (offsetOf(Segment[S, T], tail), offsetOf(Segment[S, T], prevConsumerIdx)) + proc `=destroy`*[S: static int, T; MaxThreads: static int]( self: var UnboundedSipmuc[S, T, MaxThreads] ) = @@ -381,12 +400,12 @@ proc `=destroy`*[S: static int, T; MaxThreads: static int]( while seg != nil: when not supportsCopyMem(T): # Run the destructor for any managed slots (string/seq/ref) before - # `c_free`'s away the segment block — otherwise their internal + # `freeAligned`'s away the segment block — otherwise their internal # allocations leak. for i in 0 ..< S: reset(seg.data[i]) let next = seg.next.load(moRelaxed) - c_free(seg) + freeAligned(seg) seg = next # Release our refcount on the manager. 
Conceptually pairs with the @@ -400,4 +419,4 @@ proc `=destroy`*[S: static int, T; MaxThreads: static int]( # without the parsing surprise of the backtick form, which trips # `expr(nkIdent); unknown node kind` inside a generic destructor. reset(self.manager[]) - c_free(self.manager) + freeAligned(self.manager) diff --git a/src/lockfreequeues/unbounded_sipsic.nim b/src/lockfreequeues/unbounded_sipsic.nim index be531974..5fc2f2a0 100644 --- a/src/lockfreequeues/unbounded_sipsic.nim +++ b/src/lockfreequeues/unbounded_sipsic.nim @@ -16,32 +16,34 @@ ## ``` import ./atomic_dsl +import ./internal/aligned_alloc import std/options import std/typetraits -from system/ansi_c import c_calloc, c_free type Segment*[S: static int, T] = object data*: array[S, T] - next*: Atomic[ptr Segment[S, T]] - head*: Atomic[int] - tail*: Atomic[int] + next* {.align: CacheLineBytes.}: Atomic[ptr Segment[S, T]] + head* {.align: CacheLineBytes.}: Atomic[int] + tail* {.align: CacheLineBytes.}: Atomic[int] UnboundedSipsic*[S: static int, T] = object ## Unbounded SPSC queue using linked segments. ## ## - S: Segment size (compile-time constant). ## - T: Data type. - headSegment: Atomic[ptr Segment[S, T]] # Consumer reads from here - tailSegment: Atomic[ptr Segment[S, T]] # Producer writes here + headSegment {.align: CacheLineBytes.}: Atomic[ptr Segment[S, T]] + # Consumer reads from here + tailSegment {.align: CacheLineBytes.}: Atomic[ptr Segment[S, T]] + # Producer writes here itemCount: Atomic[int] # Total items in queue segments: Atomic[int] # Number of segments proc newSegment[S: static int, T](): ptr Segment[S, T] = - ## Allocate a new segment via libc calloc (zero-initialized, truly shared). 
- result = cast[ptr Segment[S, T]](c_calloc(1.csize_t, sizeof(Segment[S, T]).csize_t)) - if result == nil: - raise newException(OutOfMemDefect, "newSegment: c_calloc returned nil") + ## Allocate a new segment on a CacheLineBytes boundary so the + ## ``{.align.}`` pragmas above land on distinct physical cache lines + ## rather than sharing the 16-byte-aligned base that ``c_calloc`` returns. + result = allocAligned[Segment[S, T]]() result.next.store(nil, moRelaxed) result.head.store(0, moRelaxed) result.tail.store(0, moRelaxed) @@ -161,7 +163,7 @@ proc pop*[S: static int, T](self: var UnboundedSipsic[S, T]): Option[T] = self.headSegment.store(nextSeg, moRelease) seg = nextSeg discard self.segments.fetchSub(1, moRelaxed) - c_free(oldSeg) + freeAligned(oldSeg) proc pop*[S: static int, T]( self: var UnboundedSipsic[S, T], count: int @@ -184,6 +186,21 @@ proc pop*[S: static int, T]( return none(seq[T]) return some(items) +when defined(testing): + proc headSegmentForTest*[S: static int, T]( + self: var UnboundedSipsic[S, T] + ): pointer = + ## Test-only accessor: returns the queue's current head segment pointer + ## so the cache-line padding audit can verify base alignment. + result = cast[pointer](self.headSegment.load(moRelaxed)) + + proc segmentHeadOffsetForTest*[S: static int, T]( + _: typedesc[UnboundedSipsic[S, T]] + ): tuple[head: int, tail: int] = + ## Test-only accessor: returns offsets of cache-line-padded fields within + ## the unbounded sipsic Segment for the cache-line padding audit. + result = (offsetOf(Segment[S, T], head), offsetOf(Segment[S, T], tail)) + proc `=destroy`*[S: static int, T](self: var UnboundedSipsic[S, T]) = ## Clean up all segments. 
var seg = self.headSegment.load(moRelaxed) @@ -191,9 +208,9 @@ proc `=destroy`*[S: static int, T](self: var UnboundedSipsic[S, T]) = let next = seg.next.load(moRelaxed) when not supportsCopyMem(T): # Run the destructor for any managed slots (string/seq/ref) before - # `c_free`'s away the segment block — otherwise their internal + # `freeAligned`'s away the segment block — otherwise their internal # allocations leak. for i in 0 ..< S: reset(seg.data[i]) - c_free(seg) + freeAligned(seg) seg = next diff --git a/tests/fixtures/pre-split-slugs.json b/tests/fixtures/pre-split-slugs.json new file mode 100644 index 00000000..6ed8b08c --- /dev/null +++ b/tests/fixtures/pre-split-slugs.json @@ -0,0 +1,79 @@ +{ + "lockfreequeues_mupmuc/mpmc/1p1c": { + "throughput_ops_ms": { + "value": 19504.396482813747, + "lower_value": 19493.36754157748, + "upper_value": 19515.425424050012 + } + }, + "lockfreequeues_mupmuc/mpmc/2p2c": { + "throughput_ops_ms": { + "value": 14885.897861969646, + "lower_value": 14631.317487664088, + "upper_value": 15140.478236275205 + } + }, + "lockfreequeues_mupmuc/mpmc/4p4c": { + "throughput_ops_ms": { + "value": 10095.92880447807, + "lower_value": 8882.45788519962, + "upper_value": 11309.39972375652 + } + }, + "lockfreequeues_mupmuc/mpmc/8p8c": { + "throughput_ops_ms": { + "value": 5251.162209555323, + "lower_value": 5219.47811287181, + "upper_value": 5282.846306238836 + } + }, + "lockfreequeues_sipsic/spsc/1p1c": { + "throughput_ops_ms": { + "value": 12253.2290574398, + "lower_value": 9837.219803030268, + "upper_value": 14669.23831184933 + } + }, + "lockfreequeues_unbounded_mupsic/mpsc_unbounded/1p1c": { + "throughput_ops_ms": { + "value": 10894.752895523907, + "lower_value": 9466.384456540516, + "upper_value": 12323.121334507297 + } + }, + "lockfreequeues_unbounded_mupsic/mpsc_unbounded/2p1c": { + "throughput_ops_ms": { + "value": 9728.381928620092, + "lower_value": 9291.02333733393, + "upper_value": 10165.740519906254 + } + }, + 
"lockfreequeues_unbounded_mupsic/mpsc_unbounded/4p1c": { + "throughput_ops_ms": { + "value": 10044.267734713983, + "lower_value": 9969.935465775887, + "upper_value": 10118.60000365208 + } + }, + "nim_channels/mpmc/1p1c": { + "throughput_ops_ms": { + "value": 8592.459855772106, + "lower_value": 7521.893938337938, + "upper_value": 9663.025773206273 + } + }, + "nim_channels/mpmc/2p2c": { + "throughput_ops_ms": { + "value": 4231.466333633576, + "lower_value": 3154.904307809079, + "upper_value": 5308.028359458073 + } + }, + "nim_channels/mpmc/4p4c": { + "throughput_ops_ms": { + "value": 3438.3306751727805, + "lower_value": 361.41998723450706, + "upper_value": 6515.241363111054 + } + } +} diff --git a/tests/t_aligned_alloc.nim b/tests/t_aligned_alloc.nim new file mode 100644 index 00000000..c0bcf4f1 --- /dev/null +++ b/tests/t_aligned_alloc.nim @@ -0,0 +1,59 @@ +## Unit tests for ``internal/aligned_alloc.nim``. +## +## Verifies that ``allocAligned[T]`` returns ``CacheLineBytes``-aligned, +## zero-initialized memory across a range of payload sizes (struct under, +## equal-to, and over a single cache line). 
+ +import lockfreequeues/atomic_dsl +import lockfreequeues/internal/aligned_alloc +import unittest2 + +type + Tiny = object + a: int + + Line = object + a: int + b: int + c: int + d: int + e: int + f: int + g: int + h: int + + Big = object + a: array[256, byte] + +suite "internal/aligned_alloc.allocAligned": + test "tiny payload (sizeof < CacheLineBytes) is 64B-aligned": + let p = allocAligned[Tiny]() + check p != nil + check (cast[uint](p) mod CacheLineBytes.uint) == 0 + check p.a == 0 # zero-initialized + freeAligned(p) + + test "line-sized payload (sizeof == CacheLineBytes) is 64B-aligned": + let p = allocAligned[Line]() + check p != nil + check (cast[uint](p) mod CacheLineBytes.uint) == 0 + check p.a == 0 + check p.h == 0 + freeAligned(p) + + test "big payload (sizeof > CacheLineBytes) is 64B-aligned": + let p = allocAligned[Big]() + check p != nil + check (cast[uint](p) mod CacheLineBytes.uint) == 0 + for i in 0 ..< 256: + check p.a[i] == 0.byte + freeAligned(p) + + test "many allocations are all 64B-aligned": + var ptrs: array[64, ptr Line] + for i in 0 ..< 64: + ptrs[i] = allocAligned[Line]() + check ptrs[i] != nil + check (cast[uint](ptrs[i]) mod CacheLineBytes.uint) == 0 + for i in 0 ..< 64: + freeAligned(ptrs[i]) diff --git a/tests/t_bench_adapters.nim b/tests/t_bench_adapters.nim new file mode 100644 index 00000000..a782d647 --- /dev/null +++ b/tests/t_bench_adapters.nim @@ -0,0 +1,139 @@ +## Compile-time-gated unit tests for the comparison-MVP bench adapters. +## +## Each adapter is wrapped in a ``when defined(adapter__available):`` +## block so the same test file: +## - vacuously passes when no FFI gates are set (CI default for the +## in-tree-only test suite). +## - exercises each adapter's push/pop round-trip when its ``-d`` gate is set. +## +## Pattern: push 1000 sequential ``uint64`` values at the smallest topology +## the adapter supports; pop until the queue is empty; assert count + set +## equality. 
+ +import std/sets +import unittest2 + +const SampleCount = 1000 + +template runRoundTrip[A]( + makeAdapterExpr: untyped, cleanupExpr: untyped +): untyped = + ## Run a SampleCount-item ``uint64`` push-then-pop round-trip on ``adapter``. + ## Asserts count match and set equality with the input range. + block: + var adapter {.inject.}: A = makeAdapterExpr + defer: + cleanupExpr + var pushed: HashSet[uint64] + var popped: HashSet[uint64] + for i in 0'u64 ..< SampleCount.uint64: + let r = adapter.push(i) + check r == prSuccess + pushed.incl(i) + var got = 0 + while got < SampleCount: + let r = adapter.pop() + if not r.success: + break + popped.incl(r.value) + inc got + check got == SampleCount + check pushed == popped + +when defined(adapter_loony_available): + import ../benchmarks/nim/adapters/loony_adapter + import ../benchmarks/nim/adapter + + suite "loony_adapter": + test "push/pop 1000 uint64 round-trip preserves set": + runRoundTrip[LoonyAdapter[uint64]]( + makeLoonyAdapter[uint64](), cleanup(adapter) + ) + +when defined(adapter_boost_lockfree_queue_available): + # Boost.LockFree is C++ -- only loadable under `nim cpp`. The adapter + # raises a hard {.error.} under `nim c`, so pulling it in only when both + # the gate AND the cpp build mode are set keeps `nim c -r` of this file + # compilable. 
+ when defined(cpp): + import ../benchmarks/nim/adapters/boost_lockfree_queue_adapter + import ../benchmarks/nim/adapter + + suite "boost_lockfree_queue_adapter": + test "push/pop 1000 uint64 round-trip preserves set": + runRoundTrip[BoostLockfreeQueueAdapter[uint64]]( + makeBoostLockfreeQueueAdapter[uint64](capacity = 4096), + cleanup(adapter) + ) + +when defined(adapter_boost_lockfree_spsc_available): + when defined(cpp): + import ../benchmarks/nim/adapters/boost_lockfree_spsc_adapter + import ../benchmarks/nim/adapter + + suite "boost_lockfree_spsc_adapter": + test "push/pop 1000 uint64 round-trip preserves set": + runRoundTrip[BoostLockfreeSpscAdapter[uint64]]( + makeBoostLockfreeSpscAdapter[uint64](capacity = 4096), + cleanup(adapter) + ) + +when defined(adapter_crossbeam_array_queue_available): + import ../benchmarks/nim/adapters/crossbeam_array_queue_adapter + import ../benchmarks/nim/adapter + + suite "crossbeam_array_queue_adapter": + test "push/pop 1000 uint64 round-trip preserves set": + runRoundTrip[CrossbeamArrayQueueAdapter[uint64]]( + makeCrossbeamArrayQueueAdapter[uint64](capacity = 4096), + cleanup(adapter) + ) + +when defined(adapter_crossbeam_seg_queue_available): + import ../benchmarks/nim/adapters/crossbeam_seg_queue_adapter + import ../benchmarks/nim/adapter + + suite "crossbeam_seg_queue_adapter": + test "push/pop 1000 uint64 round-trip preserves set": + runRoundTrip[CrossbeamSegQueueAdapter[uint64]]( + makeCrossbeamSegQueueAdapter[uint64](), + cleanup(adapter) + ) + +when defined(adapter_moodycamel_available): + # MoodyCamel ``concurrentqueue`` is a C++ template + extern "C" + # wrapper; load only when both the gate AND the cpp build mode are + # set so ``nim c -r`` of this file stays compilable when the suite + # runs under the default backend. 
+ when defined(cpp): + import ../benchmarks/nim/adapters/moodycamel_adapter + import ../benchmarks/nim/adapter + + suite "moodycamel_adapter": + test "push/pop 1000 uint64 round-trip preserves set": + runRoundTrip[MoodycamelAdapter[uint64]]( + makeMoodycamelAdapter[uint64](capacity = 4096), + cleanup(adapter) + ) + +when defined(adapter_threading_channels_available): + import ../benchmarks/nim/adapters/threading_channels_adapter + import ../benchmarks/nim/adapter + + suite "threading_channels_adapter": + test "push/pop 1000 uint64 round-trip preserves set": + runRoundTrip[ThreadingChannelsAdapter[uint64]]( + makeThreadingChannelsAdapter[uint64](capacity = 4096), + cleanup(adapter) + ) + +when defined(adapter_nim_channel_available): + import ../benchmarks/nim/adapters/nim_channel_adapter + import ../benchmarks/nim/adapter + + suite "nim_channel_adapter": + test "push/pop 1000 uint64 round-trip preserves set": + runRoundTrip[NimChannelAdapter[uint64]]( + makeNimChannelAdapter[uint64](capacity = 4096), + cleanup(adapter) + ) diff --git a/tests/t_bench_common.nim b/tests/t_bench_common.nim new file mode 100644 index 00000000..6deb0406 --- /dev/null +++ b/tests/t_bench_common.nim @@ -0,0 +1,395 @@ +## Tests for benchmarks/nim/bench_common.nim — the shared bench harness module. +## +## Task 0.1 RED test: this file must fail to compile until bench_common.nim +## exists and exports the public API surface described in the design doc +## (`/Users/eek/.local/spellbook/docs/Users-eek-Development-lockfreequeues/plans/2026-05-01-bench-rollup-design.md` +## section 2.1). The test's only job at this stage is to reference each +## promised symbol so the compiler enforces the contract. +## +## Subsequent tasks (0.2 onward) will add behavior tests against these +## symbols. 
+ +import std/[json, os, strutils] +import unittest2 +import ../benchmarks/nim/bench_common + +suite "bench_common": + test "module exports public API surface (symbol reference, compile-time only)": + # Compile-time: reference each promised type. If any symbol is renamed + # or deleted by future tasks, this test stops compiling. Bodies of + # `initBMFEmitter` etc. raise `AssertionDefect` until tasks 0.2-0.6 + # land their implementations, so we MUST NOT call them here. + when not declared(BMFEmitter): {.error: "BMFEmitter missing".} + when not declared(Histogram): {.error: "Histogram missing".} + when not declared(LatencyMetrics): {.error: "LatencyMetrics missing".} + when not declared(ThroughputMetrics): {.error: "ThroughputMetrics missing".} + when not declared(Topology): {.error: "Topology missing".} + when not declared(MeasureValue): {.error: "MeasureValue missing".} + when not declared(PushResult): {.error: "PushResult missing".} + when not declared(PopResult): {.error: "PopResult missing".} + + # Reference all six Topology members so renames break here. + let topologies = { + tSpsc, tMpsc, tMpmc, + tSpscUnbounded, tMpscUnbounded, tMpmcUnbounded, + } + check topologies.card == 6 + + # Default-init the result-type objects (no stub-body call required). + var lm: LatencyMetrics + var tm: ThroughputMetrics + check lm.samples == 0 + check tm.runs == 0 + + # PushResult / PopResult literal references. + let pr: PushResult = prSuccess + check pr == prSuccess + let pop = PopResult[uint64](success: false, value: 0'u64) + check pop.success == false + + # Reference factories at compile time only (so stub bodies don't fire). 
+ when not compiles(initBMFEmitter()): + {.error: "initBMFEmitter signature missing".} + when not compiles(initHistogram(false)): + {.error: "initHistogram signature missing".} + +# ---------- Task 0.2: BMFEmitter behavior ---------- + +proc readJsonFile(path: string): JsonNode = + parseJson(readFile(path)) + +suite "bench_common BMFEmitter": + test "empty emitter writes {}": + let path = getTempDir() / "bench_common_empty.json" + var em = initBMFEmitter() + em.emit(path) + check readJsonFile(path) == newJObject() + removeFile(path) + + test "two slugs are alpha-sorted": + let path = getTempDir() / "bench_common_two_slugs.json" + var em = initBMFEmitter() + em.addMeasure("zzz_lib/spsc/1p1c", "throughput", 100.0) + em.addMeasure("aaa_lib/spsc/1p1c", "throughput", 200.0) + em.emit(path) + # Read raw text to verify ordering, since JsonNode field order in Nim + # is preserved per insertion but pretty-printed via std/json sort_keys. + let raw = readFile(path) + let aaaIdx = raw.find("aaa_lib") + let zzzIdx = raw.find("zzz_lib") + check aaaIdx >= 0 + check zzzIdx >= 0 + check aaaIdx < zzzIdx + removeFile(path) + + test "measures within a slug are alpha-sorted": + let path = getTempDir() / "bench_common_measures_sorted.json" + var em = initBMFEmitter() + em.addMeasure("foo/spsc/1p1c", "throughput", 100.0) + em.addMeasure("foo/spsc/1p1c", "latency_p50_ns", 50.0) + em.addMeasure("foo/spsc/1p1c", "latency_p99_ns", 90.0) + em.emit(path) + let raw = readFile(path) + # latency_p50_ns < latency_p99_ns < throughput in alpha order. 
+ let i50 = raw.find("latency_p50_ns") + let i99 = raw.find("latency_p99_ns") + let it = raw.find("throughput") + check i50 >= 0 + check i99 >= 0 + check it >= 0 + check i50 < i99 + check i99 < it + removeFile(path) + + test "NaN bounds are omitted": + let path = getTempDir() / "bench_common_nan_bounds.json" + var em = initBMFEmitter() + em.addMeasure("foo/spsc/1p1c", "throughput", 100.0) # both bounds default NaN + em.emit(path) + let node = readJsonFile(path) + let inner = node["foo/spsc/1p1c"]["throughput"] + check inner.kind == JObject + check "value" in inner + check "lower_value" notin inner + check "upper_value" notin inner + check inner["value"].getFloat() == 100.0 + removeFile(path) + + test "finite bounds emit lower_value and upper_value": + let path = getTempDir() / "bench_common_finite_bounds.json" + var em = initBMFEmitter() + em.addMeasure("foo/spsc/1p1c", "throughput", 100.0, + lower = 95.0, upper = 105.0) + em.emit(path) + let inner = readJsonFile(path)["foo/spsc/1p1c"]["throughput"] + check inner["value"].getFloat() == 100.0 + check inner["lower_value"].getFloat() == 95.0 + check inner["upper_value"].getFloat() == 105.0 + removeFile(path) + +# ---------- Task 0.4: Stats helpers ---------- + +suite "bench_common Stats": + test "mean of integer-like floats": + check mean(@[1.0, 2.0, 3.0, 4.0]) == 2.5 + + test "mean of empty data is 0.0 (defined behavior)": + check mean(newSeq[float]()) == 0.0 + + test "stddev of [1,2,3,4] matches numpy's sample stddev": + # numpy default ddof=1 (sample): sqrt(sum((x-mean)^2) / (n-1)) = sqrt(5/3) + let s = stddev(@[1.0, 2.0, 3.0, 4.0]) + let expected = 1.2909944487358056 # sqrt(5/3) + check abs(s - expected) < 1e-9 + + test "stddev of singleton is 0": + check stddev(@[42.0]) == 0.0 + + test "minVal and maxVal": + check minVal(@[3.0, 1.0, 4.0, 1.5, 5.0, 9.0]) == 1.0 + check maxVal(@[3.0, 1.0, 4.0, 1.5, 5.0, 9.0]) == 9.0 + + test "percentile(0..99, 0.5) is 49.5 (linear interpolation)": + var data: seq[float] + for i in 
0 .. 99: data.add(float(i)) + # Linear interpolation: index = 0.5 * 99 = 49.5; data[49] + 0.5 * (data[50]-data[49]) = 49.5 + check abs(percentile(data, 0.5) - 49.5) < 1e-9 + + test "percentile(p=0.0) is min, percentile(p=1.0) is max": + var data: seq[float] + for i in 0 .. 99: data.add(float(i)) + check percentile(data, 0.0) == 0.0 + check percentile(data, 1.0) == 99.0 + +# ---------- Task 0.3: Histogram ---------- + +import std/[math, random] + +proc generateLogNormal(n: int, seed: int64): seq[float] = + ## Deterministic log-normal-ish samples. We use exp(N(0,1)) modeled + ## via Box-Muller from a seeded RNG so the test is reproducible. + var r = initRand(seed) + result = newSeq[float](n) + var i = 0 + while i < n: + # Box-Muller: two uniforms -> two standard normals. + let u1 = r.rand(1.0) + let u2 = r.rand(1.0) + if u1 == 0.0: continue + let mag = sqrt(-2.0 * ln(u1)) + let z0 = mag * cos(2.0 * PI * u2) + let z1 = mag * sin(2.0 * PI * u2) + result[i] = exp(z0) + if i + 1 < n: result[i + 1] = exp(z1) + i += 2 + +suite "bench_common Histogram": + test "percentile(1.0) equals top-K max": + var h = initHistogram() + for v in [1.0, 5.0, 3.0, 99.0, 7.0, 42.0]: + h.record(v) + check h.percentile(1.0) == 99.0 + + test "p99 within 1% of sort fallback on 100K log-normal samples": + let samples = generateLogNormal(100_000, 0xC0FFEE'i64) + let exact = percentile(samples, 0.99) + var h = initHistogram() + for v in samples: h.record(v) + let approx = h.percentile(0.99) + let relErr = abs(approx - exact) / exact + check relErr < 0.01 + + test "p50 reads from reservoir and is close to sort-fallback": + let samples = generateLogNormal(100_000, 0xBEEF'i64) + let exact = percentile(samples, 0.50) + var h = initHistogram() + for v in samples: h.record(v) + let approx = h.percentile(0.50) + # Reservoir is a uniform sample of 99K of 100K — within 5% on a + # well-behaved log-normal. 
+ let relErr = abs(approx - exact) / exact + check relErr < 0.05 + + test "debug mode returns exact sort answer": + var h = initHistogram(debug = true) + for v in [1.0, 5.0, 3.0, 99.0, 7.0, 42.0]: + h.record(v) + # Sorted: [1, 3, 5, 7, 42, 99]; len 6. percentile(0.99) on [..]: + # linear interp index = 0.99 * 5 = 4.95 + # data[4] + 0.95 * (data[5]-data[4]) = 42 + 0.95 * 57 = 42 + 54.15 = 96.15 + check abs(h.percentile(0.99) - 96.15) < 1e-9 + + test "topK of small sample returns sorted ascending": + var h = initHistogram() + for v in [9.0, 1.0, 5.0, 3.0, 7.0]: + h.record(v) + check h.topK() == @[1.0, 3.0, 5.0, 7.0, 9.0] + + # ---------- Task 6.2: HistogramTopK sized to scale with overrides ---------- + # + # `runLatencyHarness` builds a fresh Histogram per run and averages + # per-run percentiles (design 2.5); each histogram only sees + # `BenchLatencyMessageCount` samples, NOT messageCount * runCount. + # At the default MessageCount=100,000 a single histogram captures + # every sample exactly (TopK=5000 + Reservoir=99,000 ≥ 100,000), so + # K=1000 would already have been enough for the default config. + # The K=5000 sizing is anticipatory: an operator who bumps + # BenchLatencyMessageCount to ~5M (uncommon, but a future stress + # configuration) needs ~5000 in the exact top-K stratum to keep + # p999 (tail count = MessageCount * 0.001) outside the rescaled + # reservoir. + test "HistogramTopK is at least 5000 (anticipates MessageCount up to ~5M)": + check HistogramTopK >= 5000 + + when defined(BenchCommonStress): + test "p999 within 5% of sort fallback at 3.3M-sample stress shape": + # Stress-test the K=5000 design choice at a single-histogram + # volume that an operator could reach by overriding + # BenchLatencyMessageCount upward. At seenAll=3.3M the p999 tail + # count is 3300 and lies inside the K=5000 exact top-K stratum, + # so percentile(0.999) is read from the exact top-K heap. + # Tolerance is 5% per impl plan acceptance criterion. 
The test + # allocates a 3.3M-sample seq and runs `record()` that many + # times, so it is gated behind `-d:BenchCommonStress` to keep + # the default `nimble benchtests` invocation under ~1 second. + # Run explicitly as `nimble benchtestsstress` (or + # `nim c -d:BenchCommonStress -r tests/t_bench_common`). + let samples = generateLogNormal(3_300_000, 0xDEADBEEF'i64) + let exact = percentile(samples, 0.999) + var h = initHistogram() + for v in samples: h.record(v) + let approx = h.percentile(0.999) + let relErr = abs(approx - exact) / exact + check relErr < 0.05 + +# ---------- Task 0.5: runThroughputHarness smoke ---------- + +# Tiny inline adapter that satisfies bench_common's BenchAdapter shape +# (push -> PushResult, pop -> PopResult[uint64]) wrapping a Nim +# system Channel. Lives in this test file because Task 0.9 has not +# yet reconciled benchmarks/nim/adapter.nim (legacy) with bench_common +# (new); once that lands, this shim moves to a real adapter file. + +type SmokeAdapter = object + chan: ptr Channel[uint64] + +proc initSmokeAdapter(capacity: int): SmokeAdapter = + result.chan = create(Channel[uint64]) + result.chan[].open(capacity) + +proc push(a: var SmokeAdapter, v: uint64): PushResult = + if a.chan[].trySend(v): prSuccess else: prFull + +proc pop(a: var SmokeAdapter): PopResult[uint64] = + let r = a.chan[].tryRecv() + if r.dataAvailable: + PopResult[uint64](success: true, value: r.msg) + else: + PopResult[uint64](success: false, value: 0'u64) + +suite "bench_common runThroughputHarness": + test "smoke: 1P/1C, 1000 messages, 1 run, 0 warmup completes": + let metrics = runThroughputHarness[SmokeAdapter]( + queueInit = proc(cap: int): SmokeAdapter = initSmokeAdapter(cap), + capacity = 1024, + numProducers = 1, + numConsumers = 1, + messageCount = 1000, + runCount = 1, + warmupCount = 0, + ) + check metrics.runs == 1 + check metrics.ops_ms_mean > 0.0 + +# ---------- Task 0.6: runLatencyHarness smoke ---------- + +suite "bench_common 
runLatencyHarness":
  test "smoke: 1P/1C, 1000 messages, 1 run, 0 warmup; p50<=p99<=max":
    let metrics = runLatencyHarness[SmokeAdapter](
      queueInit = proc(cap: int): SmokeAdapter = initSmokeAdapter(cap),
      capacity = 1024,
      numProducers = 1,
      numConsumers = 1,
      messageCount = 1000,
      runCount = 1,
      warmupCount = 0,
    )
    # NOTE(review): reconstructed — the original span between "p50<" and
    # ">= 1000" was lost to markup stripping; the harness call mirrors the
    # runThroughputHarness smoke test above. Confirm the counter field name.
    check metrics.samples >= 1000
    check metrics.p50_ns > 0.0
+ check r.popped == SmokeMessageCount + check r.ok + + test "UnboundedSipmuc seg=64, MaxThreads=4, 100 sequential round-trip": + var a = makeLockfreequeuesUnboundedSipmucAdapter[64, uint64, 4](0) + let r = roundTripUint64Set(a, SmokeMessageCount) + a.cleanup() + check r.popped == SmokeMessageCount + check r.ok + + test "UnboundedMupmuc seg=64, MaxThreads=4, 100 sequential round-trip": + var a = makeLockfreequeuesUnboundedMupmucAdapter[64, uint64, 4](0) + let r = roundTripUint64Set(a, SmokeMessageCount) + a.cleanup() + check r.popped == SmokeMessageCount + check r.ok + +# ---------- Task 0.10 (legacy bench_throughput integration) ---------- +# +# PR 0 Task 0.10 originally compiled bench_throughput.nim against +# `--bmf-out=` and asserted the emitted BMF carried the expected +# `lockfreequeues_sipsic/spsc/1p1c` slug. PR 2 Task 2.10 deleted +# bench_throughput.nim in favor of five topology-split binaries, and +# tests/t_topology_split.nim now covers the equivalent BMF-emission +# contract for each new binary (bench_spsc covers the sipsic/spsc/1p1c +# slug specifically). The bench_throughput-specific suite is removed +# here intentionally; do not reintroduce. diff --git a/tests/t_bench_latency.nim b/tests/t_bench_latency.nim new file mode 100644 index 00000000..1bd697a8 --- /dev/null +++ b/tests/t_bench_latency.nim @@ -0,0 +1,242 @@ +## Tests for benchmarks/nim/bench_latency.nim — the latency bench binary. +## +## Track 1 (PR 1) covers: per-binary intdefines (Task 1.1), --bmf-out +## emission via runLatencyHarness (Task 1.2), and multi-measure-per-slug +## merge with throughput (Task 1.5). +## +## The bench binary is invoked as a subprocess in the integration tests +## (Tasks 1.2 / 1.5); compile-time intdefine assertions (Task 1.1) live +## in the binary itself behind a `BenchLatencyTestCompileTime` flag and +## are exercised from a tiny dedicated build invocation here. 
+ +import std/[json, os, osproc, strutils, tempfiles] +import unittest2 + +const + RepoRoot = currentSourcePath().parentDir.parentDir + BenchLatencySrc = RepoRoot / "benchmarks" / "nim" / "bench_latency.nim" + +proc newTestWorkspace(prefix: string): string = + ## Allocate a private workspace dir for one test. Each test that + ## compiles a bench_latency binary or writes a BMF file uses this so: + ## 1. parallel runs (or repeated runs in the same shell) cannot + ## collide on a static `/tmp/bench_latency_t11_*` filename, and + ## 2. compiled binaries don't accumulate in the system temp dir. + ## The caller is responsible for `removeDir` (typically via `defer`). + ## `prefix` is a per-test stem that matches the original static suffix + ## so test failure messages stay legible. + result = createTempDir("bench_latency_" & prefix & "_", "") + +# ---------- Task 1.1: intdefine defaults + override ---------- + +suite "bench_latency intdefines (Task 1.1)": + test "defaults: BenchLatencyRuns == 33, BenchLatencyMessageCount == 100_000": + # Compile bench_latency.nim with -d:BenchLatencyTestCompileTime=1. + # The binary, when this define is set, runs a `static` block that + # asserts the two intdefine defaults; if they are wrong, compilation + # fails with the static assert message. + let dir = newTestWorkspace("t11_defaults") + defer: removeDir(dir) + let outBin = dir / ("bench_latency" & ExeExt) + let cmd = "nim c --threads:on -d:release -d:BenchLatencyTestCompileTime=1 " & + "-o:" & outBin & " " & BenchLatencySrc + let (output, exitCode) = execCmdEx(cmd) + check exitCode == 0 + if exitCode != 0: + echo "compile output:\n", output + + test "overrides: -d:BenchLatencyRuns=2 -d:BenchLatencyMessageCount=1000 take effect": + # Compile with overrides + a different test flag that checks the + # overridden values rather than the defaults. 
+ let dir = newTestWorkspace("t11_overrides") + defer: removeDir(dir) + let outBin = dir / ("bench_latency" & ExeExt) + let cmd = "nim c --threads:on -d:release " & + "-d:BenchLatencyTestCompileTimeOverrides=1 " & + "-d:BenchLatencyRuns=2 -d:BenchLatencyMessageCount=1000 " & + "-o:" & outBin & " " & BenchLatencySrc + let (output, exitCode) = execCmdEx(cmd) + check exitCode == 0 + if exitCode != 0: + echo "compile output:\n", output + +# ---------- Task 1.2: --bmf-out integration ---------- + +proc compileBenchLatency( + extraDefs: openArray[string], dir: string +): string = + ## Compile bench_latency.nim with extra -d: defines into `dir` and + ## return the binary path. Caller owns `dir` and must remove it. + ## Compiles in release mode for realistic timing but with tiny + ## message counts so the integration test stays fast. + let outBin = dir / ("bench_latency" & ExeExt) + var cmd = "nim c --threads:on -d:release" + for d in extraDefs: + cmd.add(" -d:" & d) + cmd.add(" -o:" & outBin & " " & BenchLatencySrc) + let (output, exitCode) = execCmdEx(cmd) + if exitCode != 0: + raise newException(IOError, "bench_latency compile failed:\n" & output) + result = outBin + +suite "bench_latency --bmf-out integration (Task 1.2)": + test "sipsic variant emits latency_p50_ns / latency_p99_ns on expected slug": + # Override message count + runs to keep the integration run under ~5s. + let dir = newTestWorkspace("t12_sipsic") + defer: removeDir(dir) + let bin = compileBenchLatency(@[ + "BenchLatencyMessageCount=200", + "BenchLatencyRuns=2", + ], dir = dir) + let bmfPath = dir / "bench_latency.json" + let cmd = bin & " --bmf-out=" & bmfPath & " sipsic" + let (output, exitCode) = execCmdEx(cmd) + check exitCode == 0 + check fileExists(bmfPath) + let node = parseJson(readFile(bmfPath)) + # Expected slug per design 2.2 / table at design line 357. 
+ let slug = "lockfreequeues_sipsic/spsc/1p1c" + check node.hasKey(slug) + let s = node[slug] + check s.hasKey("latency_p50_ns") + check s.hasKey("latency_p95_ns") + check s.hasKey("latency_p99_ns") + # Track 6 Task 6.1: p999 and max measures emitted alongside p50/p95/p99. + check s.hasKey("latency_p999_ns") + check s.hasKey("latency_max_ns") + check s["latency_p50_ns"]["value"].getFloat() > 0.0 + check s["latency_p99_ns"]["value"].getFloat() >= s["latency_p50_ns"]["value"].getFloat() + check s["latency_p999_ns"]["value"].getFloat() >= s["latency_p99_ns"]["value"].getFloat() + check s["latency_max_ns"]["value"].getFloat() >= s["latency_p999_ns"]["value"].getFloat() + # Stdout text output preserved (acceptance: positional CLI behavior). + check output.contains("Sipsic") or output.contains("sipsic") + + test "all four bounded variants emit latency_p50_ns / latency_p99_ns / latency_p999_ns / latency_max_ns": + # Per impl plan Track 1 Acceptance Criteria: BMF JSON contains + # latency_p50_ns and latency_p99_ns for sipsic / sipmuc / mupsic / + # mupmuc on the 1p1c smoke shape. + let dir = newTestWorkspace("t12_all4") + defer: removeDir(dir) + let bin = compileBenchLatency(@[ + "BenchLatencyMessageCount=200", + "BenchLatencyRuns=2", + ], dir = dir) + let bmfPath = dir / "bench_latency.json" + let cmd = bin & " --bmf-out=" & bmfPath & + " sipsic mupmuc sipmuc mupsic" + let (_, exitCode) = execCmdEx(cmd) + check exitCode == 0 + let node = parseJson(readFile(bmfPath)) + let expectedSlugs = @[ + "lockfreequeues_sipsic/spsc/1p1c", + "lockfreequeues_sipmuc/mpmc/1p1c", + "lockfreequeues_mupsic/mpsc/1p1c", + "lockfreequeues_mupmuc/mpmc/1p1c", + ] + for slug in expectedSlugs: + check node.hasKey(slug) + check node[slug].hasKey("latency_p50_ns") + check node[slug].hasKey("latency_p99_ns") + # Track 6 Task 6.1: p999 + max alongside p50/p99 on every bounded variant. 
+ check node[slug].hasKey("latency_p999_ns") + check node[slug].hasKey("latency_max_ns") + + test "unknown variant exits 1": + let dir = newTestWorkspace("t12_unknown") + defer: removeDir(dir) + let bin = compileBenchLatency(@[ + "BenchLatencyMessageCount=200", + "BenchLatencyRuns=2", + ], dir = dir) + let cmd = bin & " bogus_variant" + let (_, exitCode) = execCmdEx(cmd) + check exitCode == 1 + +# ---------- Task 1.5: multi-measure-per-slug merge ---------- +# +# Validates the end-to-end shape that Track 1 ships: a single slug +# carries BOTH `throughput_ops_ms` (from bench_throughput's BMF +# fragment) and `latency_p50_ns` / `latency_p99_ns` (from bench_latency) +# AFTER `merge_bmf.py` unions the two fragments. Production CI does this +# with real bench output; the test uses two synthetic fragments so it +# stays fast and deterministic. + +const MergeBmfPath = RepoRoot / "benchmarks" / "merge_bmf.py" + +suite "bench_latency multi-measure-per-slug merge (Task 1.5)": + test "merge_bmf.py unions throughput + latency on shared slug": + let dir = createTempDir("bench_latency_t15_", "") + defer: removeDir(dir) + let throughputPath = dir / "throughput.json" + let latencyPath = dir / "latency.json" + let mergedPath = dir / "merged.json" + let slug = "lockfreequeues_sipsic/spsc/1p1c" + + # Synthetic throughput fragment. + writeFile(throughputPath, + """{ + "lockfreequeues_sipsic/spsc/1p1c": { + "throughput_ops_ms": { + "value": 1234.5, + "lower_value": 1200.0, + "upper_value": 1270.0 + } + } +}""") + # Synthetic latency fragment on the SAME slug, distinct measures. 
    # Sanity: the per-slug union semantics are NOT a free-for-all;
+ let dir = createTempDir("bench_latency_t15_collide_", "") + defer: removeDir(dir) + let aPath = dir / "a.json" + let bPath = dir / "b.json" + let mergedPath = dir / "merged.json" + + writeFile(aPath, + """{ + "lockfreequeues_sipsic/spsc/1p1c": { + "throughput_ops_ms": { "value": 100.0 } + } +}""") + writeFile(bPath, + """{ + "lockfreequeues_sipsic/spsc/1p1c": { + "throughput_ops_ms": { "value": 200.0 } + } +}""") + let cmd = "python3 " & MergeBmfPath & " " & mergedPath & + " " & aPath & " " & bPath + let (output, exitCode) = execCmdEx(cmd) + check exitCode == 1 + check output.contains("collision") diff --git a/tests/t_topology_split.nim b/tests/t_topology_split.nim new file mode 100644 index 00000000..61b7ab95 --- /dev/null +++ b/tests/t_topology_split.nim @@ -0,0 +1,258 @@ +## Tests for the bench-rollup PR 2 topology split. +## +## Five binaries replace the legacy `bench_throughput.nim`: +## +## bench_spsc — Sipsic at `1p1c`. +## bench_mpsc — Mupsic at `{1,2,4}p1c`. +## bench_mpmc — Mupmuc at `{1,2,4,8}p{1,2,4,8}c` (8p8c is the +## oversubscription regression case from issue #15), +## Sipmuc at `1p{1,2,4}c`, channels at `{1,2,4}p{1,2,4}c`. +## bench_unbounded — All 4 unbounded variants at their natural shapes. +## bench_latency — already shipped in PR 1. +## +## The deletion-safety check (Task 2.7) verifies the union of post-split +## BMFs is a strict superset of the pre-split BMF captured in +## `tests/fixtures/pre-split-slugs.json`. That fixture is committed and +## treated as immutable by this test suite. +## +## Tests in this file invoke each binary as a subprocess at tiny `-d:` +## overrides so the integration suite stays fast, then parse the emitted +## BMF JSON and assert slug presence. 
+ +import std/[json, os, osproc, sets, strutils, tables, tempfiles] +import unittest2 + +const + RepoRoot = currentSourcePath().parentDir.parentDir + PreSplitFixturePath = + RepoRoot / "tests" / "fixtures" / "pre-split-slugs.json" + BenchSpscSrc = RepoRoot / "benchmarks" / "nim" / "bench_spsc.nim" + BenchMpscSrc = RepoRoot / "benchmarks" / "nim" / "bench_mpsc.nim" + BenchMpmcSrc = RepoRoot / "benchmarks" / "nim" / "bench_mpmc.nim" + BenchUnboundedSrc = RepoRoot / "benchmarks" / "nim" / "bench_unbounded.nim" + SupersetCheckScript = + RepoRoot / "benchmarks" / "scripts" / "superset_check.py" + +# ---------- Task 2.1: pre-split fixture exists and is non-empty ---------- + +suite "topology split: pre-split fixture (Task 2.1)": + test "fixture file exists and parses to a non-empty BMF JSON object": + check fileExists(PreSplitFixturePath) + let node = parseJson(readFile(PreSplitFixturePath)) + check node.kind == JObject + # Pre-split snapshot covers sipsic + mupmuc + unbounded_mupsic + channels. + # The exact count is locked here so future regenerations of the + # fixture must consciously update this assertion (and the deletion- + # safety check below) rather than silently shrinking the baseline. + check node.len >= 11 + # Spot-check three representative slugs from the four variant groups + # so a corruption of the file is caught early. + check node.hasKey("lockfreequeues_sipsic/spsc/1p1c") + check node.hasKey("lockfreequeues_mupmuc/mpmc/4p4c") + check node.hasKey("lockfreequeues_unbounded_mupsic/mpsc_unbounded/4p1c") + check node.hasKey("nim_channels/mpmc/4p4c") + +# ---------- Helpers shared across Tasks 2.3-2.6 ---------- + +proc compileBench(src: string, defs: openArray[string], suffix: string): string = + ## Compile a topology bench binary at small `-d:` overrides so the + ## integration test stays fast. Raises IOError on compile failure; + ## returns the binary path on success. 
+ let outBin = getTempDir() / ("bench_topology_split_" & suffix) + var cmd = "nim c --threads:on -d:release" + for d in defs: + cmd.add(" -d:" & d) + cmd.add(" -o:" & outBin & " " & src) + let (output, exitCode) = execCmdEx(cmd) + if exitCode != 0: + raise newException(IOError, + "compile failed for " & src & ":\n" & output) + result = outBin + +proc parseBmf(path: string): JsonNode = + parseJson(readFile(path)) + +# ---------- Task 2.3: bench_spsc emits sipsic/spsc/1p1c ---------- + +suite "topology split: bench_spsc (Task 2.3)": + test "compiles + emits BMF containing lockfreequeues_sipsic/spsc/1p1c": + let bin = compileBench(BenchSpscSrc, [ + "BenchSpscMessageCount=1000", + "BenchSpscRuns=2", + "BenchSpscWarmup=0", + ], "spsc") + let bmf = getTempDir() / "bench_spsc_t23.json" + if fileExists(bmf): removeFile(bmf) + let cmd = bin & " --bmf-out=" & bmf + let (_, exitCode) = execCmdEx(cmd) + check exitCode == 0 + check fileExists(bmf) + let node = parseBmf(bmf) + check node.hasKey("lockfreequeues_sipsic/spsc/1p1c") + let slug = node["lockfreequeues_sipsic/spsc/1p1c"] + check slug.hasKey("throughput_ops_ms") + check slug["throughput_ops_ms"]["value"].getFloat() > 0.0 + removeFile(bmf) + +# ---------- Task 2.4: bench_mpsc emits mupsic/mpsc/{1,2,4}p1c ---------- + +suite "topology split: bench_mpsc (Task 2.4)": + test "compiles + emits BMF for mupsic 1p1c, 2p1c, 4p1c": + let bin = compileBench(BenchMpscSrc, [ + "BenchMpscMessageCount=1000", + "BenchMpscRuns=2", + "BenchMpscWarmup=0", + ], "mpsc") + let bmf = getTempDir() / "bench_mpsc_t24.json" + if fileExists(bmf): removeFile(bmf) + let cmd = bin & " --bmf-out=" & bmf + let (_, exitCode) = execCmdEx(cmd) + check exitCode == 0 + let node = parseBmf(bmf) + for shape in ["1p1c", "2p1c", "4p1c"]: + let slug = "lockfreequeues_mupsic/mpsc/" & shape + check node.hasKey(slug) + check node[slug].hasKey("throughput_ops_ms") + check node[slug]["throughput_ops_ms"]["value"].getFloat() > 0.0 + removeFile(bmf) + +# ---------- Task 
2.5: bench_mpmc emits mupmuc grid + sipmuc + channels ---------- + +suite "topology split: bench_mpmc (Task 2.5)": + test "compiles + emits BMF for mupmuc 4x4 grid + 8p8c + sipmuc 1p{1,2,4}c + channels {1,2,4}p{1,2,4}c": + let bin = compileBench(BenchMpmcSrc, [ + "BenchMpmcMessageCount=1000", + "BenchMpmcRuns=2", + "BenchMpmcWarmup=0", + ], "mpmc") + let bmf = getTempDir() / "bench_mpmc_t25.json" + if fileExists(bmf): removeFile(bmf) + let cmd = bin & " --bmf-out=" & bmf + let (_, exitCode) = execCmdEx(cmd) + check exitCode == 0 + let node = parseBmf(bmf) + # Mupmuc 4x4 grid plus the 8p8c oversubscription case (preserved + # from pre-split fixture; #15 livelock regression coverage). + for p in [1, 2, 4]: + for c in [1, 2, 4]: + let slug = "lockfreequeues_mupmuc/mpmc/" & $p & "p" & $c & "c" + check node.hasKey(slug) + check node[slug]["throughput_ops_ms"]["value"].getFloat() > 0.0 + check node.hasKey("lockfreequeues_mupmuc/mpmc/8p8c") + # Sipmuc — single producer, multi consumer, lives under mpmc per design 2.4. + for c in [1, 2, 4]: + let slug = "lockfreequeues_sipmuc/mpmc/1p" & $c & "c" + check node.hasKey(slug) + check node[slug]["throughput_ops_ms"]["value"].getFloat() > 0.0 + # Channels (Nim system Channel) — full {1,2,4}p{1,2,4}c grid. 
+ for p in [1, 2, 4]: + for c in [1, 2, 4]: + let slug = "nim_channels/mpmc/" & $p & "p" & $c & "c" + check node.hasKey(slug) + check node[slug]["throughput_ops_ms"]["value"].getFloat() > 0.0 + removeFile(bmf) + +# ---------- Task 2.6: bench_unbounded covers all 4 unbounded variants ---------- + +suite "topology split: bench_unbounded (Task 2.6)": + test "compiles + emits BMF for all 4 unbounded variants at their natural shapes": + let bin = compileBench(BenchUnboundedSrc, [ + "UnboundedSipsicMessageCount=500", + "UnboundedSipsicRuns=2", + "UnboundedSipmucMessageCount=500", + "UnboundedSipmucRuns=2", + "UnboundedMupsicMessageCount=500", + "UnboundedMupsicRuns=2", + "UnboundedMupmucMessageCount=500", + "UnboundedMupmucRuns=2", + "BenchUnboundedWarmup=0", + ], "unbounded") + let bmf = getTempDir() / "bench_unbounded_t26.json" + if fileExists(bmf): removeFile(bmf) + let cmd = bin & " --bmf-out=" & bmf + let (_, exitCode) = execCmdEx(cmd) + check exitCode == 0 + let node = parseBmf(bmf) + # Sipsic unbounded: spsc only, 1p1c. + check node.hasKey("lockfreequeues_unbounded_sipsic/spsc_unbounded/1p1c") + # Sipmuc unbounded: 1 producer × {1,2,4} consumers. + for c in [1, 2, 4]: + check node.hasKey( + "lockfreequeues_unbounded_sipmuc/mpmc_unbounded/1p" & $c & "c") + # Mupsic unbounded: {1,2,4} producers × 1 consumer. + for p in [1, 2, 4]: + check node.hasKey( + "lockfreequeues_unbounded_mupsic/mpsc_unbounded/" & $p & "p1c") + # Mupmuc unbounded: {1,2,4} P × {1,2,4} C. 
+ for p in [1, 2, 4]: + for c in [1, 2, 4]: + check node.hasKey( + "lockfreequeues_unbounded_mupmuc/mpmc_unbounded/" & + $p & "p" & $c & "c") + removeFile(bmf) + +# ---------- Task 2.7: strict-superset deletion-safety check ---------- + +suite "topology split: deletion-safety (Task 2.7)": + test "post-split union is a strict superset of pre-split fixture": + # Compile and run all four binaries at small overrides; merge the + # outputs via merge_bmf.py; invoke superset_check.py and assert + # exit 0 + no output to stderr. + let dir = createTempDir("topology_split_superset_", "") + defer: removeDir(dir) + let spscBin = compileBench(BenchSpscSrc, [ + "BenchSpscMessageCount=1000", + "BenchSpscRuns=2", + "BenchSpscWarmup=0", + ], "superset_spsc") + let mpscBin = compileBench(BenchMpscSrc, [ + "BenchMpscMessageCount=1000", + "BenchMpscRuns=2", + "BenchMpscWarmup=0", + ], "superset_mpsc") + let mpmcBin = compileBench(BenchMpmcSrc, [ + "BenchMpmcMessageCount=1000", + "BenchMpmcRuns=2", + "BenchMpmcWarmup=0", + ], "superset_mpmc") + let unboundedBin = compileBench(BenchUnboundedSrc, [ + "UnboundedSipsicMessageCount=500", + "UnboundedSipsicRuns=2", + "UnboundedSipmucMessageCount=500", + "UnboundedSipmucRuns=2", + "UnboundedMupsicMessageCount=500", + "UnboundedMupsicRuns=2", + "UnboundedMupmucMessageCount=500", + "UnboundedMupmucRuns=2", + "BenchUnboundedWarmup=0", + ], "superset_unbounded") + let spscJson = dir / "spsc.json" + let mpscJson = dir / "mpsc.json" + let mpmcJson = dir / "mpmc.json" + let unboundedJson = dir / "unbounded.json" + let mergedJson = dir / "merged.json" + for (bin, outPath) in [ + (spscBin, spscJson), + (mpscBin, mpscJson), + (mpmcBin, mpmcJson), + (unboundedBin, unboundedJson), + ]: + let (output, exitCode) = execCmdEx(bin & " --bmf-out=" & outPath) + check exitCode == 0 + if exitCode != 0: + echo "binary failed: ", bin, "\n", output + # Merge. 
+ let mergeCmd = "python3 " & RepoRoot / "benchmarks" / "merge_bmf.py" & + " " & mergedJson & " " & spscJson & " " & mpscJson & " " & + mpmcJson & " " & unboundedJson + let (mergeOutput, mergeExit) = execCmdEx(mergeCmd) + check mergeExit == 0 + if mergeExit != 0: + echo "merge failed:\n", mergeOutput + # Superset check. + let supersetCmd = "python3 " & SupersetCheckScript & + " " & PreSplitFixturePath & " " & mergedJson + let (supersetOutput, supersetExit) = execCmdEx(supersetCmd) + check supersetExit == 0 + if supersetExit != 0: + echo "superset check failed:\n", supersetOutput diff --git a/tests/t_unbounded_padding.nim b/tests/t_unbounded_padding.nim new file mode 100644 index 00000000..8607d2d5 --- /dev/null +++ b/tests/t_unbounded_padding.nim @@ -0,0 +1,79 @@ +## Cache-line padding audit for unbounded queue Segments. +## +## Verifies two conditions per design doc §4.2: +## 1. Per-Segment cache-line-padded fields have field offsets that are +## multiples of ``CacheLineBytes``. +## 2. ``cast[uint](segPtr) mod CacheLineBytes == 0`` for a freshly-allocated +## Segment via the queue's allocator (base alignment). +## +## RED state (Task 3.1): condition 2 fails because ``c_calloc`` returns +## 16-byte-aligned memory on x86_64 Linux glibc. Condition 1 also fails +## today because Segment fields lack ``{.align: CacheLineBytes.}``. +## +## GREEN state (Tasks 3.2–3.3): both conditions hold for all four +## unbounded variants. ``posix_memalign`` is used to lift the segment base +## onto a 64-byte boundary, and ``{.align: CacheLineBytes.}`` is added to +## each Segment field that participates in producer/consumer coordination. + +import lockfreequeues/atomic_dsl +import lockfreequeues/unbounded_sipsic +import lockfreequeues/unbounded_sipmuc +import lockfreequeues/unbounded_mupsic +import lockfreequeues/unbounded_mupmuc +import debra +import unittest2 + +# CacheLineBytes is exported from atomic_dsl via debra/atomics. 
+ +const Cl = CacheLineBytes + +suite "Unbounded queue Segment cache-line padding": + test "Segment field offsets are CacheLineBytes-aligned (sipsic)": + let off = segmentHeadOffsetForTest(UnboundedSipsic[64, uint64]) + check off.head mod Cl == 0 + check off.tail mod Cl == 0 + + test "Segment field offsets are CacheLineBytes-aligned (sipmuc)": + let off = segmentHeadOffsetForTest(UnboundedSipmuc[64, uint64, 4]) + check off.tail mod Cl == 0 + check off.prevConsumerIdx mod Cl == 0 + + test "Segment field offsets are CacheLineBytes-aligned (mupsic)": + let off = segmentHeadOffsetForTest(UnboundedMupsic[64, uint64, 4]) + check off.tail mod Cl == 0 + check off.head mod Cl == 0 + check off.committed mod Cl == 0 + + test "Segment field offsets are CacheLineBytes-aligned (mupmuc)": + let off = segmentHeadOffsetForTest(UnboundedMupmuc[64, uint64, 4]) + check off.tail mod Cl == 0 + check off.prevConsumerIdx mod Cl == 0 + check off.committed mod Cl == 0 + + test "freshly-allocated Segment base is CacheLineBytes-aligned (sipsic)": + var q = newUnboundedSipsic[64, uint64]() + let segPtr = headSegmentForTest(q) + check segPtr != nil + check (cast[uint](segPtr) mod Cl.uint) == 0 + + test "freshly-allocated Segment base is CacheLineBytes-aligned (sipmuc)": + var manager = initDebraManager[4]() + var q = newUnboundedSipmuc[64, uint64, 4](addr manager) + let segPtr = headSegmentForTest(q) + check segPtr != nil + check (cast[uint](segPtr) mod Cl.uint) == 0 + + test "freshly-allocated Segment base is CacheLineBytes-aligned (mupsic)": + var manager = initDebraManager[4]() + let consumerHandle = registerThread(manager) + var q = newUnboundedMupsic[64, uint64, 4](addr manager, consumerHandle) + let segPtr = headSegmentForTest(q) + check segPtr != nil + check (cast[uint](segPtr) mod Cl.uint) == 0 + + test "freshly-allocated Segment base is CacheLineBytes-aligned (mupmuc)": + var manager = initDebraManager[4]() + var q = newUnboundedMupmuc[64, uint64, 4](addr manager) + let segPtr = 
headSegmentForTest(q) + check segPtr != nil + check (cast[uint](segPtr) mod Cl.uint) == 0 diff --git a/tests/test.nim b/tests/test.nim index 83e47949..f88c95d0 100644 --- a/tests/test.nim +++ b/tests/test.nim @@ -1,3 +1,4 @@ +import ./t_aligned_alloc import ./t_atomic_dsl import ./t_backoff import ./t_mupmuc @@ -13,6 +14,7 @@ import ./t_unbounded_mupmuc import ./t_unbounded_mupmuc_threaded import ./t_unbounded_mupsic import ./t_unbounded_mupsic_threaded +import ./t_unbounded_padding import ./t_unbounded_sipmuc import ./t_unbounded_sipmuc_threaded import ./t_unbounded_sipsic @@ -22,6 +24,7 @@ import ./t_unbounded_auto_create import ./t_wraparound export + t_aligned_alloc, t_atomic_dsl, t_backoff, t_mupmuc, @@ -37,6 +40,7 @@ export t_unbounded_mupmuc_threaded, t_unbounded_mupsic, t_unbounded_mupsic_threaded, + t_unbounded_padding, t_unbounded_sipmuc, t_unbounded_sipmuc_threaded, t_unbounded_sipsic,