jhartquist · pengowray · Apr 23, 2026 · Apr 24, 2026 · Apr 24, 2026 · Apr 24, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/resonators/Cargo.toml b/crates/resonators/Cargo.toml
@@ -22,3 +22,11 @@ harness = false
 
 [dependencies]
 num-complex = "0.4.6"
+
+# WASM SIMD128 only. `wide::f32x4` is used by the per-sample bank inner
+# loop when compiling for wasm32 with `target-feature=+simd128` enabled.
+# No other target uses `wide`: x86_64 has separate AVX2/AVX-512 paths, and
+# elsewhere the scalar path is used (LLVM auto-vectorises it to SSE/NEON
+# fine; explicit SIMD there matched or slightly regressed in benchmarks).
+[target.'cfg(all(target_arch = "wasm32", target_feature = "simd128"))'.dependencies]
+wide = "0.8"
diff --git a/crates/resonators/benches/bank.rs b/crates/resonators/benches/bank.rs
@@ -1,4 +1,5 @@
 use std::hint::black_box;
+use std::time::Duration;
 
 use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
 use resonators::{ResonatorBank, ResonatorConfig, heuristic_alpha, midi_to_hz};
@@ -21,34 +22,151 @@ fn log_spaced_configs(n_bins: usize) -> Vec<ResonatorConfig> {
         .collect()
 }
 
+const BIN_COUNTS: &[usize] = &[88, 264, 440, 880];
+
 fn bench_bank(c: &mut Criterion) {
     let n = SAMPLE_RATE as usize; // 1 second of a 440 Hz sine wave
     let signal: Vec<f32> = (0..n)
         .map(|i| (2.0 * std::f32::consts::PI * 440.0 * i as f32 / SAMPLE_RATE).sin())
         .collect();
 
-    let mut group = c.benchmark_group("bank");
-    group.throughput(Throughput::Elements(n as u64));
-    group.sample_size(50);
-
-    for &n_bins in &[88, 264, 440, 880] {
-        let configs = log_spaced_configs(n_bins);
-        group.bench_with_input(
-            BenchmarkId::from_parameter(n_bins),
-            &configs,
-            |bencher, configs| {
-                let mut bank = ResonatorBank::new(configs, SAMPLE_RATE);
-                bencher.iter(|| {
-                    bank.reset();
-                    for &sample in &signal {
-                        bank.process_sample(black_box(sample));
-                    }
-                });
-            },
-        );
+    // Scalar path. On x86_64 we call `process_sample_scalar` directly to
+    // bypass the runtime-dispatch match, so the measurement reflects the
+    // scalar hot loop only; LLVM still auto-vectorises that loop to
+    // whatever target-cpu supports (SSE2 baseline, AVX2/AVX-512 if
+    // enabled). On every other target `process_sample` has no dispatch,
+    // so we call it directly.
+    {
+        let mut group = c.benchmark_group("bank/scalar");
+        group.throughput(Throughput::Elements(n as u64));
+        group.sample_size(50);
+        // 10 s covers the largest bin count (880, ~17 ms/iter × 50
+        // samples ≈ 0.9 s + warmup) with headroom, so criterion won't
+        // warn about missed sample budget at any of the benched sizes.
+        group.measurement_time(Duration::from_secs(10));
+
+        for &n_bins in BIN_COUNTS {
+            let configs = log_spaced_configs(n_bins);
+            group.bench_with_input(
+                BenchmarkId::from_parameter(n_bins),
+                &configs,
+                |bencher, configs| {
+                    let mut bank = ResonatorBank::new(configs, SAMPLE_RATE);
+                    bencher.iter(|| {
+                        bank.reset();
+                        for &sample in &signal {
+                            #[cfg(target_arch = "x86_64")]
+                            bank.process_sample_scalar(black_box(sample));
+                            #[cfg(not(target_arch = "x86_64"))]
+                            bank.process_sample(black_box(sample));
+                        }
+                    });
+                },
+            );
+        }
+        group.finish();
+    }
+
+    // Runtime-dispatched path — `ResonatorBank::new` picks the widest
+    // supported backend, then `process_sample` does a per-call match
+    // and dispatches. The interesting measurement is the delta vs the
+    // forced `bank/avx512` (or `bank/avx2` on hosts without AVX-512):
+    // that's the cost of the dispatch match. Expected near-zero: the
+    // backend is fixed at construction, so the branch is predictable.
+    #[cfg(target_arch = "x86_64")]
+    {
+        let mut group = c.benchmark_group("bank/dispatch");
+        group.throughput(Throughput::Elements(n as u64));
+        group.sample_size(50);
+        group.measurement_time(Duration::from_secs(10));
+
+        for &n_bins in BIN_COUNTS {
+            let configs = log_spaced_configs(n_bins);
+            group.bench_with_input(
+                BenchmarkId::from_parameter(n_bins),
+                &configs,
+                |bencher, configs| {
+                    let mut bank = ResonatorBank::new(configs, SAMPLE_RATE);
+                    bencher.iter(|| {
+                        bank.reset();
+                        for &sample in &signal {
+                            bank.process_sample(black_box(sample));
+                        }
+                    });
+                },
+            );
+        }
+        group.finish();
+        eprintln!("bank/dispatch backend = {:?}", resonators::Backend::detect());
     }
 
-    group.finish();
+    // Explicit AVX2 + FMA — 8 bins per iteration via __m256 + vfmadd231ps.
+    #[cfg(target_arch = "x86_64")]
+    {
+        if std::arch::is_x86_feature_detected!("avx2") && std::arch::is_x86_feature_detected!("fma") {
+            let mut group = c.benchmark_group("bank/avx2");
+            group.throughput(Throughput::Elements(n as u64));
+            group.sample_size(50);
+            group.measurement_time(Duration::from_secs(10));
+
+            for &n_bins in BIN_COUNTS {
+                let configs = log_spaced_configs(n_bins);
+                group.bench_with_input(
+                    BenchmarkId::from_parameter(n_bins),
+                    &configs,
+                    |bencher, configs| {
+                        let mut bank = ResonatorBank::new(configs, SAMPLE_RATE);
+                        bencher.iter(|| {
+                            bank.reset();
+                            // SAFETY: avx2 and fma support was verified by the runtime check above.
+                            unsafe {
+                                for &sample in &signal {
+                                    bank.process_sample_avx2(black_box(sample));
+                                }
+                            }
+                        });
+                    },
+                );
+            }
+            group.finish();
+        } else {
+            eprintln!("SKIPPED bank/avx2 — CPU lacks avx2 or fma");
+        }
+    }
+
+    // Explicit AVX-512F — 16 bins per iteration via __m512.
+    #[cfg(target_arch = "x86_64")]
+    {
+        if std::arch::is_x86_feature_detected!("avx512f") {
+            let mut group = c.benchmark_group("bank/avx512");
+            group.throughput(Throughput::Elements(n as u64));
+            group.sample_size(50);
+            group.measurement_time(Duration::from_secs(10));
+
+            for &n_bins in BIN_COUNTS {
+                let configs = log_spaced_configs(n_bins);
+                group.bench_with_input(
+                    BenchmarkId::from_parameter(n_bins),
+                    &configs,
+                    |bencher, configs| {
+                        let mut bank = ResonatorBank::new(configs, SAMPLE_RATE);
+                        bencher.iter(|| {
+                            bank.reset();
+                            // SAFETY: avx512f support was verified by the runtime check above.
+                            unsafe {
+                                for &sample in &signal {
+                                    bank.process_sample_avx512(black_box(sample));
+                                }
+                            }
+                        });
+                    },
+                );
+            }
+            group.finish();
+        } else {
+            eprintln!("SKIPPED bank/avx512 — CPU lacks avx512f");
+        }
+    }
 }
 
 criterion_group!(benches, bench_bank);