Skip to content

Commit 64acac5

Browse files
committed
perf(nam): batched process_block via process_buffer
NamStage inherited the default Stage::process_block (a per-sample process_sample loop), so the engine's per-block path never reached nam-rs's batched Model::process_buffer. Override process_block to apply input gain, run process_buffer over the block, then apply output gain and the dry/wet mix, using a preallocated scratch buffer for the dry signal so steady-state processing never allocates on the RT thread. On the standard WaveNet reference model this cuts the NAM chain block from ~824us to ~293us per 128-sample block (2.8x; ~64% less CPU), matching the raw process_buffer ceiling. A parity test asserts the block path matches the per-sample path within 1e-5. Vendor reference_standard.nam (MIT, from nam-rs) into tests/fixtures so the parity test and NAM benchmark groups run deterministically in CI rather than depending on a user's gitignored nam/ models. Add is_active() to NamStage and NAM benchmark groups (chain sample-vs-block + raw process_buffer ceiling).
1 parent 9ef9f4f commit 64acac5

4 files changed

Lines changed: 14080 additions & 1 deletion

File tree

rustortion-core/benches/chain.rs

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,15 @@ use rustortion_core::amp::stages::{
77
compressor::CompressorStage,
88
filter::{FilterStage, FilterType},
99
level::LevelStage,
10+
nam::NamConfig,
1011
noise_gate::NoiseGateStage,
1112
poweramp::{PowerAmpStage, PowerAmpType},
1213
preamp::PreampStage,
1314
tonestack::{ToneStackModel, ToneStackStage},
1415
};
16+
use rustortion_core::nam::{NamLoader, registry};
1517
use std::hint::black_box;
18+
use std::path::Path;
1619

1720
const SAMPLE_RATE: usize = 48000;
1821
const BUFFER_SIZE: usize = 128;
@@ -110,5 +113,108 @@ fn bench_sample_vs_block(c: &mut Criterion) {
110113
group.finish();
111114
}
112115

113-
criterion_group!(benches, bench_sample_vs_block);
116+
/// Load the vendored MIT reference WaveNet model (`tests/fixtures/`) into the global
117+
/// registry and return its name. The fixture is committed, so the NAM benches run
118+
/// deterministically in CI rather than depending on a user's gitignored `nam/` models.
119+
fn load_first_nam_model() -> Option<String> {
120+
let dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures");
121+
let loader = NamLoader::new(&dir).ok()?;
122+
registry::init_from_loader(&loader);
123+
// The fixture is a 48 kHz model and the chain runs at SAMPLE_RATE (48 kHz, 1x),
124+
// so the stage stays active rather than bypassing on a rate mismatch.
125+
registry::available_names().into_iter().next()
126+
}
127+
128+
fn bench_nam_sample_vs_block(c: &mut Criterion) {
129+
let Some(model_name) = load_first_nam_model() else {
130+
eprintln!("skipping NAM bench: no .nam model found in workspace nam/ directory");
131+
return;
132+
};
133+
134+
let config = NamConfig {
135+
model_name: Some(model_name),
136+
..NamConfig::default()
137+
};
138+
139+
// Sanity-check the model actually loaded (rate matches 48 kHz); if it bypassed we
140+
// would be benchmarking a passthrough, which is meaningless here.
141+
if !config.to_stage(SAMPLE_RATE as f32).is_active() {
142+
eprintln!("skipping NAM bench: model bypassed (sample-rate mismatch at 48 kHz)");
143+
return;
144+
}
145+
146+
let mut group = c.benchmark_group("NAM Chain Sample vs Block");
147+
// NAM runs at the model's native rate (no oversampling), so benchmark at 1x only.
148+
let buffer_size = BUFFER_SIZE;
149+
150+
group.bench_function(BenchmarkId::new("sample-by-sample", "1x"), |b| {
151+
let mut chain = build_chain(SAMPLE_RATE as f32);
152+
chain.add_stage(Box::new(config.to_stage(SAMPLE_RATE as f32)));
153+
let input: Vec<f32> = vec![0.5f32; buffer_size];
154+
155+
b.iter(|| {
156+
for &sample in &input {
157+
black_box(chain.process(black_box(sample)));
158+
}
159+
});
160+
});
161+
162+
group.bench_function(BenchmarkId::new("block", "1x"), |b| {
163+
let mut chain = build_chain(SAMPLE_RATE as f32);
164+
chain.add_stage(Box::new(config.to_stage(SAMPLE_RATE as f32)));
165+
let mut buffer: Vec<f32> = vec![0.5f32; buffer_size];
166+
167+
b.iter(|| {
168+
chain.process_block(black_box(&mut buffer));
169+
black_box(&buffer);
170+
});
171+
});
172+
173+
group.finish();
174+
}
175+
176+
/// Isolated ceiling: raw nam-rs `process_buffer` (batched) vs a `process_sample`
177+
/// loop on the same model, no chain, no gain/mix. This is the maximum speedup a
178+
/// `NamStage::process_block` override could capture by calling `process_buffer`.
179+
fn bench_nam_buffer_vs_sample(c: &mut Criterion) {
180+
let Some(model_name) = load_first_nam_model() else {
181+
eprintln!("skipping NAM ceiling bench: no .nam model found");
182+
return;
183+
};
184+
let Some(parsed) = registry::get(&model_name) else {
185+
return;
186+
};
187+
let Ok(mut model) = nam_rs::Model::from_nam(&parsed) else {
188+
eprintln!("skipping NAM ceiling bench: model failed to build");
189+
return;
190+
};
191+
192+
let mut group = c.benchmark_group("NAM Model Buffer vs Sample");
193+
194+
group.bench_function(BenchmarkId::new("process_sample-loop", "1x"), |b| {
195+
let input: Vec<f32> = vec![0.5f32; BUFFER_SIZE];
196+
b.iter(|| {
197+
for &sample in &input {
198+
black_box(model.process_sample(black_box(sample)));
199+
}
200+
});
201+
});
202+
203+
group.bench_function(BenchmarkId::new("process_buffer", "1x"), |b| {
204+
let mut buffer: Vec<f32> = vec![0.5f32; BUFFER_SIZE];
205+
b.iter(|| {
206+
model.process_buffer(black_box(&mut buffer));
207+
black_box(&buffer);
208+
});
209+
});
210+
211+
group.finish();
212+
}
213+
214+
criterion_group!(
215+
benches,
216+
bench_sample_vs_block,
217+
bench_nam_sample_vs_block,
218+
bench_nam_buffer_vs_sample
219+
);
114220
criterion_main!(benches);

rustortion-core/src/amp/stages/nam.rs

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ pub struct NamStage {
2424
native_sample_rate: f32,
2525
/// True if the model's native rate differs from the engine rate.
2626
sample_rate_mismatch: bool,
27+
/// Scratch buffer holding the dry signal during block processing, so the
28+
/// in-place `process_buffer` output can be blended back with `mix`. Grown on
29+
/// demand (first block of a given size); steady-state processing never allocates.
30+
dry: Vec<f32>,
2731
}
2832

2933
impl NamStage {
@@ -35,6 +39,7 @@ impl NamStage {
3539
mix,
3640
native_sample_rate: 0.0,
3741
sample_rate_mismatch: false,
42+
dry: Vec::new(),
3843
}
3944
}
4045

@@ -53,8 +58,15 @@ impl NamStage {
5358
mix,
5459
native_sample_rate,
5560
sample_rate_mismatch: true,
61+
dry: Vec::new(),
5662
}
5763
}
64+
65+
/// True when a model is loaded and running (not a passthrough or rate-mismatch bypass).
66+
#[must_use]
67+
pub const fn is_active(&self) -> bool {
68+
self.model.is_some()
69+
}
5870
}
5971

6072
impl Stage for NamStage {
@@ -66,6 +78,35 @@ impl Stage for NamStage {
6678
self.mix.mul_add(wet - input, input)
6779
}
6880

81+
fn process_block(&mut self, input: &mut [f32]) {
82+
// No model → dry passthrough (matches `process`'s early return).
83+
if self.model.is_none() {
84+
return;
85+
}
86+
87+
// Stash the dry signal, then scale the buffer by input gain in place so the
88+
// model's batched `process_buffer` runs over the gained signal. `resize` only
89+
// allocates the first time a given block size is seen; steady state is alloc-free.
90+
if self.dry.len() < input.len() {
91+
self.dry.resize(input.len(), 0.0);
92+
}
93+
let dry = &mut self.dry[..input.len()];
94+
for (d, x) in dry.iter_mut().zip(input.iter_mut()) {
95+
*d = *x;
96+
*x *= self.input_gain;
97+
}
98+
99+
// Borrow the model only here (after the `self.dry` borrow above is done being set up).
100+
let model = self.model.as_mut().expect("model present (checked above)");
101+
model.process_buffer(input);
102+
103+
// Apply output gain and blend wet/dry per sample — same formula as `process`.
104+
for (x, &d) in input.iter_mut().zip(self.dry[..].iter()) {
105+
let wet = *x * self.output_gain;
106+
*x = self.mix.mul_add(wet - d, d);
107+
}
108+
}
109+
69110
fn set_parameter(&mut self, name: &str, value: f32) -> Result<(), &'static str> {
70111
match name {
71112
"input_gain_db" => {
@@ -178,6 +219,7 @@ impl NamConfig {
178219
native_sample_rate,
179220
// Rates match (mismatch returned early above).
180221
sample_rate_mismatch: false,
222+
dry: Vec::new(),
181223
},
182224
Err(e) => {
183225
warn!("Failed to build NAM model '{name}': {e}; using passthrough");
@@ -236,4 +278,59 @@ mod tests {
236278
assert!(stage.set_parameter("output_gain_db", -30.0).is_err());
237279
assert!(stage.set_parameter("input_gain_db", f32::NAN).is_err());
238280
}
281+
282+
/// `process_block` (batched `process_buffer` + gain/mix wrapper) must match the
/// per-sample `process` path to within a small float tolerance (1e-5 per sample).
/// Uses the vendored MIT reference model in `tests/fixtures/`, so this runs in CI.
///
/// The test skips, with a message on stderr, if the fixture loader cannot be
/// created, no model is registered, or the stage is inactive at 48 kHz.
#[test]
fn block_matches_per_sample_with_real_model() {
    use crate::nam::{NamLoader, registry};
    use std::path::Path;

    let dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures");
    let Ok(loader) = NamLoader::new(&dir) else {
        // Previously a silent return; report it like the other skip paths.
        eprintln!("skipping NAM parity test: could not open tests/fixtures");
        return;
    };
    registry::init_from_loader(&loader);
    let Some(name) = registry::available_names().into_iter().next() else {
        eprintln!("skipping NAM parity test: no model available");
        return;
    };

    let config = NamConfig {
        model_name: Some(name),
        input_gain_db: 6.0,
        output_gain_db: -3.0,
        mix: 0.5,
        bypassed: false,
    };

    // Two stages from the same config evolve identical internal state given the
    // same input, so per-sample and block paths should agree.
    let mut per_sample = config.to_stage(48_000.0);
    let mut block = config.to_stage(48_000.0);
    if !per_sample.is_active() {
        eprintln!("skipping NAM parity test: model bypassed at 48 kHz");
        return;
    }

    // A non-trivial signal so gain/mix differences would show up.
    let input: Vec<f32> = (0..256)
        .map(|i| {
            let t = i as f32;
            0.3f32.mul_add((t * 0.05).sin(), 0.1 * (t * 0.31).cos())
        })
        .collect();

    let expected: Vec<f32> = input.iter().map(|&x| per_sample.process(x)).collect();
    let mut got = input; // moved: input is not needed after this
    block.process_block(&mut got);

    // `zip` below stops at the shorter side, so pin the lengths explicitly.
    assert_eq!(expected.len(), got.len(), "block path changed the buffer length");
    for (i, (e, g)) in expected.iter().zip(got.iter()).enumerate() {
        assert!(
            (e - g).abs() < 1e-5,
            "mismatch at {i}: per-sample={e}, block={g}"
        );
    }
}
239336
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Test fixtures
2+
3+
## `reference_standard.nam`
4+
5+
A standard-architecture WaveNet NAM model, vendored from the
6+
[`nam-rs`](https://github.com/OpenSauce/nam-rs) test fixtures
7+
(`tests/fixtures/reference_standard.nam`).
8+
9+
It is used by the NAM parity test (`block_matches_per_sample_with_real_model`) and
10+
the `chain` benchmark's NAM groups, so both run deterministically in CI without
11+
depending on a user's personal (gitignored) `nam/` models.
12+
13+
### License / attribution
14+
15+
`nam-rs` is distributed under the MIT License (Copyright (c) 2026 Leigh). The `.nam`
16+
weight/config layout is a derivative of the Neural Amp Modeler ecosystem
17+
(neural-amp-modeler / NeuralAmpModelerCore, Copyright (c) 2019-2025 Steven Atkinson,
18+
MIT). See the `nam-rs` `LICENSE` and `NOTICE` files for full terms.

0 commit comments

Comments
 (0)