Skip to content

Commit fec942a

Browse files
jhartquist and pengowray committed
bank: restore wasm SIMD autovectorization in hot loop
On wasm32+simd128, `f32::mul_add` lowered to per-lane `fmaf` calls and defeated autovectorization of the EWMA loop, leaving the "SIMD" build roughly 10x slower than it should be. Three changes in bank.rs recover the full speedup:

- `mul_add(a, b, c)` helper: unfused (a*b + c) on wasm32+simd128 to keep the vector loop; `f32::mul_add` on native where vector FMA exists.
- `process_samples` chunks to the next stabilization boundary so the per-sample modulo check moves out of the hot path.
- `process_sample_inner` hoists the ten backing `Vec` fields to local `&mut [f32]` of known length `n`, letting LLVM drop bounds checks, hoist the length-min across slices, and trust disjointedness.

Browser bench throughput (ns/sample, 48 kHz x 1 s):

    bins   before   after   speedup
      88      730      54     13.5x
     264     2097     143     14.7x
     440     3449     238     14.5x
     880     6888     470     14.7x

Native `cargo bench --bench bank` (aarch64): within noise at every bin count.

Co-authored-by: Pengo Wray <pengowray@users.noreply.github.com>
1 parent 3eee482 commit fec942a

1 file changed

Lines changed: 69 additions & 22 deletions

File tree

crates/resonators/src/bank.rs

Lines changed: 69 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -104,26 +104,7 @@ impl ResonatorBank {
104104
/// Updates every resonator with a single input sample.
105105
#[inline]
106106
pub fn process_sample(&mut self, sample: f32) {
107-
for k in 0..self.n_resonators {
108-
let alpha = self.alphas[k];
109-
let beta = self.betas[k];
110-
let alpha_sample = alpha * sample;
111-
112-
// EWMA accumulation
113-
self.r_re[k] = (1.0 - alpha).mul_add(self.r_re[k], alpha_sample * self.z_re[k]);
114-
self.r_im[k] = (1.0 - alpha).mul_add(self.r_im[k], alpha_sample * self.z_im[k]);
115-
116-
// output smoothing
117-
self.rr_re[k] = (1.0 - beta).mul_add(self.rr_re[k], beta * self.r_re[k]);
118-
self.rr_im[k] = (1.0 - beta).mul_add(self.rr_im[k], beta * self.r_im[k]);
119-
120-
// rotate phasor
121-
let zr = self.z_re[k];
122-
let zi = self.z_im[k];
123-
self.z_re[k] = zr * self.w_re[k] - zi * self.w_im[k];
124-
self.z_im[k] = zr * self.w_im[k] + zi * self.w_re[k];
125-
}
126-
107+
self.process_sample_inner(sample);
127108
self.sample_count += 1;
128109
if self.sample_count.is_multiple_of(STABILIZE_EVERY) {
129110
self.stabilize();
@@ -133,11 +114,64 @@ impl ResonatorBank {
133114
/// Updates every resonator with a block of input samples, in order.
134115
#[inline]
135116
pub fn process_samples(&mut self, samples: &[f32]) {
136-
for &s in samples {
137-
self.process_sample(s);
117+
let mut remaining = samples;
118+
while !remaining.is_empty() {
119+
let chunk_len = remaining.len().min(self.samples_until_stabilize());
120+
let (chunk, rest) = remaining.split_at(chunk_len);
121+
122+
for &sample in chunk {
123+
self.process_sample_inner(sample);
124+
}
125+
126+
self.sample_count += chunk_len as u64;
127+
if self.sample_count.is_multiple_of(STABILIZE_EVERY) {
128+
self.stabilize();
129+
}
130+
remaining = rest;
131+
}
132+
}
133+
134+
#[inline(always)]
135+
fn process_sample_inner(&mut self, sample: f32) {
136+
// hoisted to locals so LLVM can drop bounds checks and vectorize cleanly.
137+
let n = self.n_resonators;
138+
let alphas = &self.alphas[..n];
139+
let betas = &self.betas[..n];
140+
let w_re = &self.w_re[..n];
141+
let w_im = &self.w_im[..n];
142+
let r_re = &mut self.r_re[..n];
143+
let r_im = &mut self.r_im[..n];
144+
let rr_re = &mut self.rr_re[..n];
145+
let rr_im = &mut self.rr_im[..n];
146+
let z_re = &mut self.z_re[..n];
147+
let z_im = &mut self.z_im[..n];
148+
149+
for k in 0..n {
150+
let alpha = alphas[k];
151+
let beta = betas[k];
152+
let alpha_sample = alpha * sample;
153+
154+
// EWMA accumulation
155+
r_re[k] = mul_add(1.0 - alpha, r_re[k], alpha_sample * z_re[k]);
156+
r_im[k] = mul_add(1.0 - alpha, r_im[k], alpha_sample * z_im[k]);
157+
158+
// output smoothing
159+
rr_re[k] = mul_add(1.0 - beta, rr_re[k], beta * r_re[k]);
160+
rr_im[k] = mul_add(1.0 - beta, rr_im[k], beta * r_im[k]);
161+
162+
// rotate phasor
163+
let zr = z_re[k];
164+
let zi = z_im[k];
165+
z_re[k] = zr * w_re[k] - zi * w_im[k];
166+
z_im[k] = zr * w_im[k] + zi * w_re[k];
138167
}
139168
}
140169

170+
/// Returns how many more samples can be processed before `sample_count`
/// next lands on a multiple of `STABILIZE_EVERY`.
///
/// The result is in `1..=STABILIZE_EVERY`: when `sample_count` already sits on
/// a boundary, a full period remains.
fn samples_until_stabilize(&self) -> usize {
    let into_period = self.sample_count % STABILIZE_EVERY;
    STABILIZE_EVERY as usize - into_period as usize
}
174+
141175
/// Processes `signal` in hops and returns the complex state of every
142176
/// resonator at the end of each hop.
143177
///
@@ -231,6 +265,19 @@ impl ResonatorBank {
231265
}
232266
}
233267

268+
/// Computes `a * b + c`, fusing the multiply and add only where that is cheap.
///
/// `f32::mul_add` is only a win when the target has a hardware FMA
/// instruction. Without one the fused form has to be emulated per element,
/// which is slow and kills autovectorization of the loops that call this
/// helper. The unfused form `a * b + c` is therefore used on:
///
/// - wasm32 with `simd128`, and
/// - x86 / x86_64 builds compiled without the `fma` target feature
///   (the default baseline for those targets).
///
/// Every other target keeps `f32::mul_add`. The two forms can differ in the
/// last bit because the fused one rounds once instead of twice.
#[inline(always)]
fn mul_add(a: f32, b: f32, c: f32) -> f32 {
    // Resolved at compile time; the untaken branch is removed by the optimizer.
    const UNFUSED: bool = cfg!(any(
        all(target_arch = "wasm32", target_feature = "simd128"),
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            not(target_feature = "fma")
        )
    ));

    if UNFUSED {
        a * b + c
    } else {
        a.mul_add(b, c)
    }
}
280+
234281
#[cfg(test)]
235282
mod tests {
236283
use super::*;

0 commit comments

Comments
 (0)