feat(rp-27 part 2): opt-in second ONNX input orig_sample_rate for raw_audio

zhmiao · Copilot · zhmiao · commit 5ea864117e78 · 2026-06-05T13:14:58.000-07:00
Extends PreprocessMethod::RawAudio with pass_orig_sample_rate: bool (default
false). When true, the CPU detect_audio raw-path engine passes a second
ONNX input "orig_sample_rate" [1] int64 alongside the audio tensor.
Models that opt in can implement in-graph fill_highfreq (or any other
data-driven sample-rate behavior) without engine-side mel knowledge.

Wiring:
- PreparedAudioKind::Raw gained pass_orig_sample_rate field; carried from
  manifest through prepare_audio_detection into detect_audio_loop_raw.
- session.run dispatches via the named-input form
  Vec<(Cow<str>, SessionInputValue)> when the flag is on.
- resolve_classifier_output (the model-load probe at line 292) also needed
  the second input — probes with orig_sample_rate=target_sample_rate
  (no-op for fill_highfreq).

Manifest schema: [preprocessing] pass_orig_sample_rate=true|false (default
false). Existing perch-v2 + future single-input raw_audio models keep
working unchanged.

Motivation: orca-ecotype-dclde2026-v1 (RP-onboarding-2026-06-01) uses
raw_audio + softmax with the same fill_highfreq requirement as Stage 1.
Engine-side mel fill (RP-27 Part 1) doesn't reach the in-graph mel path,
so Stage 2 needed its own data-driven fix. The exported ONNX wrapper
implements fill_highfreq via sort + dynamic-k gather; the engine just
ships orig_sample_rate so the in-graph mask boundary tracks it.

Stage 2 parity (300 windows, 10 fixtures): top1 prob delta mean 0.027,
median 0.018, max 0.181; argmax flip rate 4.0% (gate 15%). 24 kHz fixture
(no-resample, fill no-op) matches engine bit-exactly because the in-graph
fill is guarded with no_upsample = (orig_sample_rate >= target_sample_rate)
matching upstream PW's `if orig_sr < SR` guard.

Lib tests: types 123 + core 178 + cpu 74 = 375 PASS.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/sparrow-engine/sparrow-engine-cpu/src/detect_audio.rs b/sparrow-engine/sparrow-engine-cpu/src/detect_audio.rs
@@ -133,8 +133,12 @@ enum PreparedAudioKind {
         /// tensor named "label" at session-load time; for single-output
         /// models this is just `0`.
         logits_output_idx: usize,
-        /// Number of classes (= length of the softmax distribution).
+        /// Number of softmax classes the model emits.
         num_classes: usize,
+        /// Opt-in (RP-27 Part 2, 2026-06-05): when true, the engine passes a
+        /// second ONNX input `orig_sample_rate [1] int64` alongside the
+        /// audio tensor. Used by in-graph fill_highfreq.
+        pass_orig_sample_rate: bool,
     },
 }
 
@@ -216,6 +220,7 @@ fn prepare_audio_detection(
         }
         PreprocessMethod::RawAudio {
             window_samples,
+            pass_orig_sample_rate,
             ..
         } => {
             let segment_samples = *window_samples as usize;
@@ -251,14 +256,17 @@ fn prepare_audio_detection(
 
             // Resolve the logits output: prefer the tensor named "label" (Perch 2),
             // fall back to output 0 for single-head softmax classifiers.
+            // When pass_orig_sample_rate=true, probe with a dummy orig_sr=sample_rate
+            // (the no-op case for fill_highfreq) so the 2-input ONNX accepts the call.
             let (logits_output_idx, num_classes) =
-                resolve_classifier_output(handle, segment_samples)?;
+                resolve_classifier_output(handle, segment_samples, *pass_orig_sample_rate, sample_rate)?;
 
             Ok(PreparedAudioDetection {
                 audio_samples,
                 kind: PreparedAudioKind::Raw {
                     logits_output_idx,
                     num_classes,
+                    pass_orig_sample_rate: *pass_orig_sample_rate,
                 },
                 segment_samples,
                 stride_samples,
@@ -286,6 +294,8 @@ fn prepare_audio_detection(
 fn resolve_classifier_output(
     handle: &ModelHandle,
     window_samples: usize,
+    pass_orig_sample_rate: bool,
+    target_sample_rate: u32,
 ) -> Result<(usize, usize)> {
     let session = handle.pin_session()?;
     let mut guard = session
@@ -302,9 +312,23 @@ fn resolve_classifier_output(
     // Probe with one zero-filled window to learn the class count.
     let probe = ndarray::Array2::<f32>::zeros((1, window_samples));
     let input_value = TensorRef::from_array_view(&probe).map_err(crate::engine::ort_err)?;
-    let outputs = guard
-        .run(ort::inputs![input_value])
-        .map_err(crate::engine::ort_err)?;
+    // RP-27 Part 2: 2-input ONNX needs orig_sample_rate populated even at probe
+    // time. Use target_sample_rate (the no-op case for fill_highfreq).
+    let probe_sr_arr;
+    let outputs = if pass_orig_sample_rate {
+        probe_sr_arr = ndarray::Array1::from_vec(vec![target_sample_rate as i64]);
+        let orig_sr_value =
+            TensorRef::from_array_view(&probe_sr_arr).map_err(crate::engine::ort_err)?;
+        let inputs: Vec<(std::borrow::Cow<'_, str>, ort::session::SessionInputValue<'_>)> = vec![
+            (std::borrow::Cow::Borrowed("audio"), input_value.into()),
+            (std::borrow::Cow::Borrowed("orig_sample_rate"), orig_sr_value.into()),
+        ];
+        guard.run(inputs).map_err(crate::engine::ort_err)?
+    } else {
+        guard
+            .run(ort::inputs![input_value])
+            .map_err(crate::engine::ort_err)?
+    };
     if outputs.len() <= logits_idx {
         return Err(SparrowEngineError::Ort(format!(
             "classifier session probe returned {} outputs; expected at least {}",
@@ -551,11 +575,12 @@ fn detect_audio_loop_raw(
     start: Instant,
     mut on_segment: Option<&mut dyn FnMut(&AudioSegment)>,
 ) -> Result<AudioDetectResult> {
-    let (logits_output_idx, num_classes) = match &prep.kind {
+    let (logits_output_idx, num_classes, pass_orig_sample_rate) = match &prep.kind {
         PreparedAudioKind::Raw {
             logits_output_idx,
             num_classes,
-        } => (*logits_output_idx, *num_classes),
+            pass_orig_sample_rate,
+        } => (*logits_output_idx, *num_classes, *pass_orig_sample_rate),
         _ => unreachable!("guarded by detect_audio_loop dispatch"),
     };
 
@@ -608,9 +633,26 @@ fn detect_audio_loop_raw(
         let mut guard = session
             .lock()
             .map_err(|_| SparrowEngineError::Ort("audio session lock poisoned".into()))?;
-        let outputs = guard
-            .run(ort::inputs![input_value])
-            .map_err(crate::engine::ort_err)?;
+        // RP-27 Part 2: when manifest opts in, pass orig_sample_rate as a
+        // second [1] int64 input alongside the audio tensor. The exported
+        // ONNX must declare two inputs in this order: ("audio", "orig_sample_rate").
+        let orig_sr_arr;
+        let outputs = if pass_orig_sample_rate {
+            orig_sr_arr = ndarray::Array1::from_vec(vec![
+                prep.audio_samples.orig_sample_rate as i64,
+            ]);
+            let orig_sr_value =
+                TensorRef::from_array_view(&orig_sr_arr).map_err(crate::engine::ort_err)?;
+            let inputs: Vec<(std::borrow::Cow<'_, str>, ort::session::SessionInputValue<'_>)> = vec![
+                (std::borrow::Cow::Borrowed("audio"), input_value.into()),
+                (std::borrow::Cow::Borrowed("orig_sample_rate"), orig_sr_value.into()),
+            ];
+            guard.run(inputs).map_err(crate::engine::ort_err)?
+        } else {
+            guard
+                .run(ort::inputs![input_value])
+                .map_err(crate::engine::ort_err)?
+        };
         if outputs.len() <= logits_output_idx {
             return Err(SparrowEngineError::Ort(format!(
                 "audio classifier returned {} outputs; expected at least {}",
diff --git a/sparrow-engine/sparrow-engine-cpu/src/engine.rs b/sparrow-engine/sparrow-engine-cpu/src/engine.rs
@@ -1240,6 +1240,7 @@ mod tests {
         PreprocessMethod::RawAudio {
             sample_rate: 32_000,
             window_samples: 160_000,
+            pass_orig_sample_rate: false,
         }
     }
 
diff --git a/sparrow-engine/sparrow-engine-types/src/manifest.rs b/sparrow-engine/sparrow-engine-types/src/manifest.rs
@@ -53,6 +53,13 @@ pub enum PreprocessMethod {
     RawAudio {
         sample_rate: u32,
         window_samples: u32,
+        /// Opt-in: when true, engine passes a second ONNX input
+        /// `orig_sample_rate [1] int64` carrying the original (pre-resample)
+        /// sample rate. Used by in-graph fill_highfreq passes that need to
+        /// know whether the audio was upsampled and where the original
+        /// Nyquist sat. Default false preserves Perch 2 / single-input
+        /// RawAudio behavior (RP-27 Part 2, 2026-06-05).
+        pass_orig_sample_rate: bool,
     },
 }
 
@@ -406,6 +413,12 @@ struct RawPreprocessing {
     /// Number of samples per inference window (= segment_duration_s × sample_rate).
     /// Required for `raw_audio`. For Perch 2: 160000 = 5 s × 32 kHz.
     window_samples: Option<u32>,
+    /// RawAudio-only opt-in (RP-27 Part 2, 2026-06-05): when true, the
+    /// engine passes a second ONNX input `orig_sample_rate [1] int64`
+    /// alongside the audio tensor so the model can apply in-graph
+    /// fill_highfreq.
+    #[serde(default)]
+    pass_orig_sample_rate: Option<bool>,
     /// Opt-in high-frequency fill for mel_spectrogram preprocess (RP-27).
     /// Defaults to `false` (md-audiobirds-v1 behavior). When `true` and the
     /// engine resamples upward, mel bins above `orig_sr/2 - 2500 Hz` are
@@ -538,6 +551,10 @@ pub fn load_manifest(path: &Path) -> Result<ModelManifest> {
                     .preprocessing
                     .window_samples
                     .ok_or_else(|| raw_err("window_samples"))?,
+                pass_orig_sample_rate: raw
+                    .preprocessing
+                    .pass_orig_sample_rate
+                    .unwrap_or(false),
             }
         }
         "mel_spectrogram" => {
@@ -668,6 +685,7 @@ pub fn load_manifest(path: &Path) -> Result<ModelManifest> {
     if let PreprocessMethod::RawAudio {
         sample_rate,
         window_samples,
+        ..
     } = &preprocess_method
     {
         if *sample_rate == 0 {
@@ -825,6 +843,7 @@ pub fn load_manifest(path: &Path) -> Result<ModelManifest> {
         PreprocessMethod::RawAudio {
             sample_rate,
             window_samples,
+            ..
         },
         InferenceStrategy::SlidingWindow {
             segment_duration_s, ..
@@ -2033,6 +2052,7 @@ format = "one_per_line"
             PreprocessMethod::RawAudio {
                 sample_rate: 32000,
                 window_samples: 160000,
+                ..
             }
         ));
         assert_eq!(
diff --git a/sparrow-engine/sparrow-engine-types/src/model_type.rs b/sparrow-engine/sparrow-engine-types/src/model_type.rs
@@ -72,6 +72,7 @@ mod phase_a_r1_model_type_tests {
         PreprocessMethod::RawAudio {
             sample_rate: 32000,
             window_samples: 160000,
+            pass_orig_sample_rate: false,
         }
     }
 

Original file line number	Diff line number	Diff line change
`@@ -1240,6 +1240,7 @@ mod tests {`
`1240`	`1240`	`PreprocessMethod::RawAudio {`
`1241`	`1241`	`sample_rate: 32_000,`
`1242`	`1242`	`window_samples: 160_000,`
	`1243`	`+ pass_orig_sample_rate: false,`
`1243`	`1244`	`}`
`1244`	`1245`	`}`
`1245`	`1246`
Original file line number	Diff line number	Diff line change
`@@ -72,6 +72,7 @@ mod phase_a_r1_model_type_tests {`
`72`	`72`	`PreprocessMethod::RawAudio {`
`73`	`73`	`sample_rate: 32000,`
`74`	`74`	`window_samples: 160000,`
	`75`	`+ pass_orig_sample_rate: false,`
`75`	`76`	`}`
`76`	`77`	`}`
`77`	`78`