diff --git a/doc/12_roadmap.md b/doc/12_roadmap.md index 0ceb9e4..e511f60 100644 --- a/doc/12_roadmap.md +++ b/doc/12_roadmap.md @@ -31,6 +31,7 @@ to add new test files to tests/CMakeLists.txt explicitly. Noted your CLAUDE.md e ## Smaller Things from the crew - djTubig reports patch swapping kinda slow on windows. Look for profile? +- Refresh patch browser ## Noise Upgrades **DONE** diff --git a/src/dsp/matrix_node.h b/src/dsp/matrix_node.h index 5793cf2..604f4b5 100644 --- a/src/dsp/matrix_node.h +++ b/src/dsp/matrix_node.h @@ -87,7 +87,7 @@ struct MatrixNodeFrom : public EnvelopeSupport, calculateModulation(); envProcess(); lfoProcess(); - float modlev alignas(16)[blockSize], mod alignas(16)[blockSize]; + float modlev alignas(16)[blockSize]; // Construct the level which is lfo * lev + env * lev or + env * dept + lev if (lfoIsEnveloped) @@ -119,6 +119,12 @@ struct MatrixNodeFrom : public EnvelopeSupport, } } + // Fused mul + apply. Earlier this was a two-pass: + // mul_block(modlev, from.output, mod); // SIMD write to mod[8] + // for j: onto.X[j] += f(overdriveFactor * mod[j]); // scalar read back + // The fused form computes `modlev[j] * from.output[j]` inline and never + // materializes `mod[]`. Parens kept so the multiply order matches the + // original (`overdriveFactor * (modlev * fromOut)`) — bit-exact output. if (modMode == 1) { // we want op * ( 1 - depth ) + op * rm * depth or @@ -149,28 +155,27 @@ struct MatrixNodeFrom : public EnvelopeSupport, else if (modMode == 2) { // linear FM. -1..1 with a 10x ocerdrivce - mech::mul_block(modlev, from.output, mod); for (int j = 0; j < blockSize; ++j) { - onto.fmAmount[j] += (overdriveFactor * mod[j]); + onto.fmAmount[j] += (overdriveFactor * (modlev[j] * from.output[j])); } } else if (modMode == 3) { // expoential fm. if mod is 0...1 the result is 2^mod - 1 - mech::mul_block(modlev, from.output, mod); for (int j = 0; j < blockSize; ++j) { - onto.fmAmount[j] += monoValues.twoToTheX.twoToThe(overdriveFactor * mod[j]) - 1.0; + onto.fmAmount[j] += + monoValues.twoToTheX.twoToThe(overdriveFactor * (modlev[j] * from.output[j])) - + 1.0; } } else { - mech::mul_block(modlev, from.output, mod); - for (int j = 0; j < blockSize; ++j) { - onto.phaseInput[j] += (int32_t)((1 << 27) * (overdriveFactor * mod[j])); + onto.phaseInput[j] += + (int32_t)((1 << 27) * (overdriveFactor * (modlev[j] * from.output[j]))); } } } @@ -307,6 +312,9 @@ struct MatrixNodeSelf : EnvelopeSupport, { onto.feedbackLevel[j] = (int32_t)((1 << 24) * modlev[j] * overdriveFactor); } + // Tell the OpSource to take the UsesFB=true template path on this block + // — feedbackLevel may be non-zero. + onto.hasActiveFeedback = true; } float fbMod{0.f}; diff --git a/src/dsp/op_source.h b/src/dsp/op_source.h index 27d6aeb..6c373b7 100644 --- a/src/dsp/op_source.h +++ b/src/dsp/op_source.h @@ -41,6 +41,12 @@ struct alignas(16) OpSource : public EnvelopeSupport, float rmLevel alignas(16)[blockSize]; float fmAmount alignas(16)[blockSize]; // in hz bool rmAssigned{false}; + // Per-block signal from MatrixNodeSelf::applyBlock: true when self-feedback is + // active for this block (so feedbackLevel[] may be non-zero). The inner-loop + // dispatcher uses this to pick a no-FB template instantiation that skips the + // feedback math AND the fbv[] shift entirely — removing the per-sample + // backward dependency unlocks a lot more compiler reordering. + bool hasActiveFeedback{false}; int opIndex{0}; // set by Voice constructor; 0 = op1 which can use AUDIO_IN float output alignas(16)[blockSize]; @@ -64,6 +70,70 @@ struct alignas(16) OpSource : public EnvelopeSupport, static constexpr float centsScale{1.0 / (12 * 100)}; + // Latch-at-attack cache. The dispatch in innerLoop() and the AUDIO_IN check + // in renderBlock() used to round+cast the patch .value every block. They + // read these typed members instead, which are populated by cacheEnums() in + // reset() and stay stable for the life of the note. Mid-note mode changes + // don't take effect until retrigger — same as the heavy state (stWindow, + // extendedLagM/N) already did. + SinTable::WaveForm waveFormCachedAtAttack{SinTable::SIN}; + bool isAudioInCachedAtAttack{false}; + Patch::SourceNode::ExtendedMode extendedModeCachedAtAttack{ + Patch::SourceNode::ExtendedMode::NONE}; + Patch::SourceNode::PhaseMapShape phaseMapShapeCachedAtAttack{ + Patch::SourceNode::PhaseMapShape::SAW}; + Patch::SourceNode::ResonantSweepWindow resonantSweepWindowCachedAtAttack{ + Patch::SourceNode::ResonantSweepWindow::SAW}; + Patch::SourceNode::NoiseMode noiseModeCachedAtAttack{ + Patch::SourceNode::NoiseMode::ADD_TO_PHASE}; + Patch::SourceNode::NoiseType noiseTypeCachedAtAttack{Patch::SourceNode::NoiseType::PINK}; + Patch::SourceNode::LFSRMode lfsrModeCachedAtAttack{Patch::SourceNode::LFSRMode::LONG_KEYTRACK}; + float resonantSweepKScaleCachedAtAttack{1.0f}; + + void cacheEnums() + { + using EM = Patch::SourceNode::ExtendedMode; + using PM = Patch::SourceNode::PhaseMapShape; + using RW = Patch::SourceNode::ResonantSweepWindow; + using RFD = Patch::SourceNode::ResonantSweepFrequencyDepth; + using NM = Patch::SourceNode::NoiseMode; + using NT = Patch::SourceNode::NoiseType; + using LM = Patch::SourceNode::LFSRMode; + + waveFormCachedAtAttack = + static_cast(static_cast(std::round(waveForm))); + isAudioInCachedAtAttack = (waveFormCachedAtAttack == SinTable::AUDIO_IN); + + extendedModeCachedAtAttack = + static_cast(static_cast(std::round(sourceNode.extendedModeMode.value))); + phaseMapShapeCachedAtAttack = + static_cast(static_cast(std::round(sourceNode.phaseMapModeShape.value))); + resonantSweepWindowCachedAtAttack = static_cast( + static_cast(std::round(sourceNode.resonantSweepWindowShape.value))); + + auto rfd = static_cast( + static_cast(std::round(sourceNode.resonantSweepFrequencyDepth.value))); + switch (rfd) + { + case RFD::TWO: + resonantSweepKScaleCachedAtAttack = 2.0f; + break; + case RFD::FOUR: + resonantSweepKScaleCachedAtAttack = 4.0f; + break; + case RFD::TEN: + resonantSweepKScaleCachedAtAttack = 10.0f; + break; + } + + noiseModeCachedAtAttack = + static_cast(static_cast(std::round(sourceNode.noiseMode.value))); + noiseTypeCachedAtAttack = + static_cast(static_cast(std::round(sourceNode.noiseType.value))); + lfsrModeCachedAtAttack = + static_cast(static_cast(std::round(sourceNode.lfsrMode.value))); + } + OpSource(const Patch::SourceNode &sn, MonoValues &mv, const VoiceValues &vv) : sourceNode(sn), monoValues(mv), voiceValues(vv), EnvelopeSupport(sn, mv, vv), LFOSupport(sn, mv), ModulationSupport(sn, this, mv, vv), ratio(sn.ratio), @@ -86,6 +156,10 @@ struct alignas(16) OpSource : public EnvelopeSupport, envResetMod(); lfoResetMod(); + // Latch all mode/shape enums up front; reset() and renderBlock() / innerLoop() + // both read the cached typed members from here on. + cacheEnums(); + st.setSampleRate(monoValues.sr.sampleRate); stWindow.setSampleRate(monoValues.sr.sampleRate); // Pick the table for the resonant-sweep window. BLACKMAN_HARRIS and TUKEY pull @@ -94,9 +168,7 @@ struct alignas(16) OpSource : public EnvelopeSupport, // closed-form arms in the inner loop don't read stWindow at all. { using RW = Patch::SourceNode::ResonantSweepWindow; - auto rw = static_cast( - static_cast(std::round(sourceNode.resonantSweepWindowShape.value))); - switch (rw) + switch (resonantSweepWindowCachedAtAttack) { case RW::BLACKMAN_HARRIS: stWindow.setWaveForm(SinTable::BLACKMAN_HARRIS_WINDOW); @@ -115,14 +187,14 @@ struct alignas(16) OpSource : public EnvelopeSupport, // that consumes them. In NONE the lag members exist but are never touched. { using EM = Patch::SourceNode::ExtendedMode; - auto em = static_cast( - static_cast(std::round(sourceNode.extendedModeMode.value))); - if (em == EM::PHASE_REMAP || em == EM::RESONANT_SWEEP || em == EM::NOISE) + if (extendedModeCachedAtAttack == EM::PHASE_REMAP || + extendedModeCachedAtAttack == EM::RESONANT_SWEEP || + extendedModeCachedAtAttack == EM::NOISE) { extendedLagM.setRateInMilliseconds(10, monoValues.sr.samplerate, blockSizeInv); extendedLagM.snapTo(sourceNode.extendedModeM.value); } - if (em == EM::NOISE) + if (extendedModeCachedAtAttack == EM::NOISE) { extendedLagN.setRateInMilliseconds(10, monoValues.sr.samplerate, blockSizeInv); extendedLagN.snapTo(sourceNode.extendedModeN.value); @@ -145,8 +217,7 @@ struct alignas(16) OpSource : public EnvelopeSupport, resetPhaseOnly(); fbVal[0] = 0.f; fbVal[1] = 0.f; - auto wf = (SinTable::WaveForm)std::round(waveForm); - st.setWaveForm(wf); + st.setWaveForm(waveFormCachedAtAttack); if (lfoIsEnveloped) { @@ -197,11 +268,11 @@ struct alignas(16) OpSource : public EnvelopeSupport, // Extended-mode targets — only consume the LFO when the mode that uses them is active. using EM = Patch::SourceNode::ExtendedMode; - auto em = - static_cast(static_cast(std::round(sourceNode.extendedModeMode.value))); - if (em == EM::PHASE_REMAP || em == EM::RESONANT_SWEEP || em == EM::NOISE) + if (extendedModeCachedAtAttack == EM::PHASE_REMAP || + extendedModeCachedAtAttack == EM::RESONANT_SWEEP || + extendedModeCachedAtAttack == EM::NOISE) used = used || (sourceNode.lfoToExtendedModeM.value != 0); - if (em == EM::NOISE) + if (extendedModeCachedAtAttack == EM::NOISE) used = used || (sourceNode.lfoToExtendedModeN.value != 0); return used; @@ -228,6 +299,7 @@ struct alignas(16) OpSource : public EnvelopeSupport, fmAmount[i] = 0.f; } rmAssigned = false; + hasActiveFeedback = false; } void clearOutputs() { memset(output, 0, sizeof(output)); } @@ -278,7 +350,7 @@ struct alignas(16) OpSource : public EnvelopeSupport, return; } - if ((int)std::round(waveForm) == SinTable::AUDIO_IN) + if (isAudioInCachedAtAttack) { if (opIndex == 0) { @@ -358,39 +430,49 @@ struct alignas(16) OpSource : public EnvelopeSupport, } void innerLoop(float *onto, float *fbv, float rf, const float dRF, uint32_t &phs) + { + // Split on per-block self-feedback so we pick the right template + // instantiation of innerLoopImpl. The flag is set by + // MatrixNodeSelf::applyBlock and stays false when self-FB is off for + // this op (the common case for modulator stacks) — the UsesFB=false + // path then skips the feedback math entirely. + if (hasActiveFeedback) + innerLoopDispatch(onto, fbv, rf, dRF, phs); + else + innerLoopDispatch(onto, fbv, rf, dRF, phs); + } + + template + void innerLoopDispatch(float *onto, float *fbv, float rf, const float dRF, uint32_t &phs) { using EM = Patch::SourceNode::ExtendedMode; using PM = Patch::SourceNode::PhaseMapShape; - auto em = - static_cast(static_cast(std::round(sourceNode.extendedModeMode.value))); - switch (em) + switch (extendedModeCachedAtAttack) { case EM::NONE: - innerLoopImpl(onto, fbv, rf, dRF, phs); + innerLoopImpl(onto, fbv, rf, dRF, phs); break; case EM::PHASE_REMAP: { - auto pm = static_cast( - static_cast(std::round(sourceNode.phaseMapModeShape.value))); - switch (pm) + switch (phaseMapShapeCachedAtAttack) { case PM::SAW: - innerLoopImpl(onto, fbv, rf, dRF, phs); + innerLoopImpl(onto, fbv, rf, dRF, phs); break; case PM::SQUARE: - innerLoopImpl(onto, fbv, rf, dRF, phs); + innerLoopImpl(onto, fbv, rf, dRF, phs); break; case PM::PULSE: - innerLoopImpl(onto, fbv, rf, dRF, phs); + innerLoopImpl(onto, fbv, rf, dRF, phs); break; case PM::DOUBLE: - innerLoopImpl(onto, fbv, rf, dRF, phs); + innerLoopImpl(onto, fbv, rf, dRF, phs); break; case PM::SIN_TO_SQUARE: - innerLoopImpl(onto, fbv, rf, dRF, phs); + innerLoopImpl(onto, fbv, rf, dRF, phs); break; case PM::DOUBLE_SAW: - innerLoopImpl(onto, fbv, rf, dRF, phs); + innerLoopImpl(onto, fbv, rf, dRF, phs); break; } break; @@ -398,54 +480,41 @@ struct alignas(16) OpSource : public EnvelopeSupport, case EM::RESONANT_SWEEP: { using RW = Patch::SourceNode::ResonantSweepWindow; - using RFD = Patch::SourceNode::ResonantSweepFrequencyDepth; - auto rw = static_cast( - static_cast(std::round(sourceNode.resonantSweepWindowShape.value))); - auto rfd = static_cast( - static_cast(std::round(sourceNode.resonantSweepFrequencyDepth.value))); - float kScale = 4.0f; - switch (rfd) - { - case RFD::TWO: - kScale = 2.0f; - break; - case RFD::FOUR: - kScale = 4.0f; - break; - case RFD::TEN: - kScale = 10.0f; - break; - } // The PhaseMapShape template arg is unused on this branch; leave it at default. - switch (rw) + switch (resonantSweepWindowCachedAtAttack) { case RW::SAW: - innerLoopImpl( - onto, fbv, rf, dRF, phs, kScale); + innerLoopImpl(onto, fbv, rf, dRF, phs, resonantSweepKScaleCachedAtAttack); break; case RW::TRIANGLE: - innerLoopImpl(onto, fbv, rf, dRF, phs, kScale); + innerLoopImpl(onto, fbv, rf, dRF, phs, + resonantSweepKScaleCachedAtAttack); break; case RW::TRAPEZOID: - innerLoopImpl(onto, fbv, rf, dRF, phs, kScale); + innerLoopImpl(onto, fbv, rf, dRF, phs, + resonantSweepKScaleCachedAtAttack); break; case RW::FULLTRAP: - innerLoopImpl(onto, fbv, rf, dRF, phs, kScale); + innerLoopImpl(onto, fbv, rf, dRF, phs, + resonantSweepKScaleCachedAtAttack); break; case RW::HANN: - innerLoopImpl( - onto, fbv, rf, dRF, phs, kScale); + innerLoopImpl(onto, fbv, rf, dRF, phs, resonantSweepKScaleCachedAtAttack); break; case RW::BLACKMAN_HARRIS: - innerLoopImpl(onto, fbv, rf, dRF, phs, kScale); + innerLoopImpl(onto, fbv, rf, dRF, phs, + resonantSweepKScaleCachedAtAttack); break; case RW::TUKEY: - innerLoopImpl( - onto, fbv, rf, dRF, phs, kScale); + innerLoopImpl(onto, fbv, rf, dRF, phs, + resonantSweepKScaleCachedAtAttack); break; } break; @@ -453,28 +522,29 @@ struct alignas(16) OpSource : public EnvelopeSupport, case EM::NOISE: { using NM = Patch::SourceNode::NoiseMode; - auto nm = - static_cast(static_cast(std::round(sourceNode.noiseMode.value))); constexpr auto PMSAW = Patch::SourceNode::PhaseMapShape::SAW; constexpr auto RWSAW = Patch::SourceNode::ResonantSweepWindow::SAW; - switch (nm) + switch (noiseModeCachedAtAttack) { case NM::ADD_TO_PHASE: - innerLoopImpl(onto, fbv, rf, dRF, phs); + innerLoopImpl(onto, fbv, rf, dRF, + phs); break; case NM::ADD_TO_SIGNAL: - innerLoopImpl(onto, fbv, rf, dRF, phs); + innerLoopImpl(onto, fbv, rf, + dRF, phs); break; case NM::MIX_WITH_SIGNAL: - innerLoopImpl(onto, fbv, rf, dRF, - phs); + innerLoopImpl(onto, fbv, rf, + dRF, phs); break; case NM::MUL_BY_SIGNAL: - innerLoopImpl(onto, fbv, rf, dRF, phs); + innerLoopImpl(onto, fbv, rf, + dRF, phs); break; case NM::MUL_BY_UNI_SIGNAL: - innerLoopImpl(onto, fbv, rf, dRF, - phs); + innerLoopImpl(onto, fbv, rf, + dRF, phs); break; } break; @@ -483,7 +553,7 @@ struct alignas(16) OpSource : public EnvelopeSupport, } template < - Patch::SourceNode::ExtendedMode ET, + bool UsesFB, Patch::SourceNode::ExtendedMode ET, Patch::SourceNode::PhaseMapShape S = Patch::SourceNode::PhaseMapShape::SAW, Patch::SourceNode::ResonantSweepWindow R = Patch::SourceNode::ResonantSweepWindow::SAW, Patch::SourceNode::NoiseMode NM = Patch::SourceNode::NoiseMode::ADD_TO_PHASE> @@ -523,10 +593,8 @@ struct alignas(16) OpSource : public EnvelopeSupport, extendedLagN.setTarget(targetN); extendedLagN.process(); nextN = extendedLagN.v; - noiseType = - static_cast(static_cast(std::round(sourceNode.noiseType.value))); - lfsrMode = - static_cast(static_cast(std::round(sourceNode.lfsrMode.value))); + noiseType = noiseTypeCachedAtAttack; + lfsrMode = lfsrModeCachedAtAttack; } for (int i = 0; i < blockSize; ++i) @@ -535,15 +603,29 @@ struct alignas(16) OpSource : public EnvelopeSupport, rf += dRF; phs += dPhase; - auto fb = 0.5 * (fbv[0] + fbv[1]); - auto sb = std::signbit(feedbackLevel[i]); - // fb = sb ? fb * fb : fb. Ugh a branch. but bool = 0/1, so - // (1-sb) * fb + sb * fb * fb - 3 mul, 2 add - // fb - sb * fb + sb * fb * fb - 3 nul 2 add - // fb * ( 1 - sb * ( 1 - fb)) - 2 mul 2 add - fb = fb * (1 - sb * (1 - fb)); - - auto ph = phs + phaseInput[i] + (int32_t)(feedbackLevel[i] * fb); + // When self-feedback is inactive for this block, skip the fb math + // entirely (constexpr-out). When active, use an int compare for the + // sign bit instead of std::signbit on int32_t — std::signbit's + // integral overload promotes to double per C++11 [c.math.fpclass], + // which is implementation-defined cost. `ph` then feeds every + // extended-mode transform downstream, so this gating must be at the + // top of the per-sample loop, not around EM::NONE only. + uint32_t ph{0}; + if constexpr (UsesFB) + { + auto fb = 0.5 * (fbv[0] + fbv[1]); + auto sb = (feedbackLevel[i] < 0); + // fb = sb ? fb * fb : fb. Ugh a branch. but bool = 0/1, so + // (1-sb) * fb + sb * fb * fb - 3 mul, 2 add + // fb - sb * fb + sb * fb * fb - 3 nul 2 add + // fb * ( 1 - sb * ( 1 - fb)) - 2 mul 2 add + fb = fb * (1 - sb * (1 - fb)); + ph = phs + phaseInput[i] + (int32_t)(feedbackLevel[i] * fb); + } + else + { + ph = phs + phaseInput[i]; + } float out; if constexpr (ET == EM::PHASE_REMAP) @@ -629,8 +711,11 @@ struct alignas(16) OpSource : public EnvelopeSupport, out = out * rmLevel[i]; onto[i] = out; - fbv[1] = fbv[0]; - fbv[0] = out; + if constexpr (UsesFB) + { + fbv[1] = fbv[0]; + fbv[0] = out; + } } } diff --git a/src/synth/voice.cpp b/src/synth/voice.cpp index 1f56caa..6f96510 100644 --- a/src/synth/voice.cpp +++ b/src/synth/voice.cpp @@ -174,7 +174,7 @@ void Voice::renderBlock() auto pos = MatrixIndex::positionForSourceTarget(j, i); matrixNode[pos].applyBlock(); } - if ((int)std::round(src[i].waveForm) != SinTable::AUDIO_IN) + if (!src[i].isAudioInCachedAtAttack) selfNode[i].applyBlock(); src[i].renderBlock(); mixerNode[i].renderBlock(); diff --git a/tests/perf/results/baseline-item-1.csv b/tests/perf/results/baseline-item-1.csv new file mode 100644 index 0000000..7ac5d3b --- /dev/null +++ b/tests/perf/results/baseline-item-1.csv @@ -0,0 +1,18 @@ +tag,level,voices,ops,block_ns_median,sample_ns_median,cpu_pct_48k_median,vops_per_s_median,stddev_pct_max,iters_first,hash_first +scn:1v_dense,plugin,1,6,5979.40,747.4300,3.590,8.028e+06,1.13,16727,0x6ababdcc1d9bec34 +scn:32v_dense,plugin,32,6,143223.00,17902.8800,85.930,1.072e+07,1.51,696,0x9e922c2e299a52e8 +scn:64v_dense,plugin,64,6,285490.80,35686.3600,171.290,1.076e+07,1.58,352,0x639f7a59d94b8b9c +scn:8v_dense,plugin,8,6,37526.40,4690.8000,22.520,1.023e+07,1.40,2582,0xcc3032b27293ce14 +scn:em_noise,plugin,16,6,72925.30,9115.6600,43.760,1.053e+07,1.67,1384,0xd499522869b3d028 +scn:em_phaseremap,plugin,16,6,81699.20,10212.4000,49.020,9.4e+06,2.20,1213,0xae49249a9f58a7c0 +scn:em_resonant,plugin,16,6,82150.50,10268.8100,49.290,9.349e+06,1.39,1220,0x397b0969c836c140 +scn:inner_noise,inner,1,1,189.40,23.6700,0.110,4.225e+07,0.71,523557,0xb755fa22f4e23dac +scn:inner_none,inner,1,1,174.50,21.8100,0.100,4.584e+07,0.99,580293,0x9d19662037c37614 +scn:inner_phaseremap,inner,1,1,220.80,27.6000,0.130,3.623e+07,0.62,458098,0x65d2efaf762f9fac +scn:inner_resonant,inner,1,1,239.30,29.9100,0.140,3.344e+07,0.87,419347,0x382b2459fb0a087c +scn:minimal,plugin,1,1,2626.20,328.2700,1.580,3.046e+06,0.66,38132,0x9d19662037c37614 +scn:no_fb_simd,plugin,16,6,67964.40,8495.5500,40.780,1.13e+07,0.75,1482,0x0709d7b08163e794 +scn:voice_32v_dense,voice,32,6,56419.10,7052.3900,33.850,2.722e+07,0.83,1789,0x9e922c2e299a52e8 +scn:voice_8v_dense,voice,8,6,14107.40,1763.4200,8.460,2.722e+07,1.65,6951,0xcc3032b27293ce14 +scn:voice_em_resonant,voice,16,6,32001.40,4000.1800,19.200,2.4e+07,0.60,3134,0x397b0969c836c140 +scn:worst,plugin,64,6,303619.70,37952.4600,182.170,1.012e+07,1.67,332,0x79c9779af5758e58 diff --git a/tests/perf/run.sh b/tests/perf/run.sh index 60b078d..a014ca8 100755 --- a/tests/perf/run.sh +++ b/tests/perf/run.sh @@ -18,6 +18,13 @@ # PERF_SAMPLE_MS=N wall-clock target per timed sample, passed to the # binary (default 30 in code). Use 5 for a fast smoke # check, 100+ for a long stable run. +# +# IMPORTANT: thermal drift on laptops makes cross-session comparisons +# unreliable for sub-2% deltas. For meaningful before/after on a single +# optimization, run both versions back-to-back within minutes of each +# other and diff those CSVs — not against an older baseline. Baselines +# committed to results/baseline-*.csv are useful for ROUGH sanity checks +# only ("did we regress 10%?"), not for "is this 1% faster?". set -euo pipefail