@@ -34,6 +34,10 @@ pub struct AudioPreprocessConfig {
3434 pub fmin : f32 ,
3535 pub fmax : f32 ,
3636 pub top_db : f32 ,
37+ /// Opt-in high-frequency mel-band fill for upsampled inputs (RP-27,
38+ /// 2026-06-01). Default `false` preserves md-audiobirds-v1 behavior.
39+ /// See [`mel_spectrogram`] for the algorithm details.
40+ pub fill_highfreq : bool ,
3741}
3842
3943impl AudioPreprocessConfig {
@@ -111,6 +115,7 @@ impl AudioPreprocessConfig {
111115 fmin,
112116 fmax,
113117 top_db,
118+ fill_highfreq,
114119 .. // window, mel_scale, filter_norm: validated at load time, only one implementation exists
115120 } => Some ( Self {
116121 sample_rate : * sample_rate,
@@ -120,6 +125,7 @@ impl AudioPreprocessConfig {
120125 fmin : * fmin,
121126 fmax : * fmax,
122127 top_db : * top_db,
128+ fill_highfreq : * fill_highfreq,
123129 } ) ,
124130 _ => None ,
125131 }
@@ -136,6 +142,7 @@ impl Default for AudioPreprocessConfig {
136142 fmin : 0.0 ,
137143 fmax : 24_000.0 ,
138144 top_db : 80.0 ,
145+ fill_highfreq : false ,
139146 }
140147 }
141148}
@@ -145,6 +152,10 @@ pub struct AudioSamples {
145152 pub data : Vec < f32 > ,
146153 pub sample_rate : u32 ,
147154 pub duration_s : f32 ,
155+ /// Original sample rate of the source file, before resampling to `sample_rate`.
156+ /// Equal to `sample_rate` when no resample happened. Used by
157+ /// [`mel_spectrogram`] to drive the optional `fill_highfreq` step.
158+ pub orig_sample_rate : u32 ,
148159}
149160
150161// ---------------------------------------------------------------------------
@@ -227,6 +238,7 @@ pub fn load_audio_at_sample_rate(
227238 data : resampled,
228239 sample_rate : target_sample_rate,
229240 duration_s,
241+ orig_sample_rate : sr,
230242 } )
231243}
232244
@@ -379,12 +391,26 @@ impl MelFilterbank {
379391/// Returns tensor `[1, 1, n_mels, time_steps]` (NCHW, single-channel).
380392/// For 48000 samples with n_fft=2048, hop=512: time_steps=90.
381393///
394+ /// The `orig_sample_rate` argument is the **input file's** native sample rate
395+ /// (before any engine-side resampling to `config.sample_rate`). When
396+ /// `config.fill_highfreq == true` AND `orig_sample_rate < config.sample_rate`,
397+ /// the engine applies the upstream PytorchWildlife "fill_highfreq" treatment
398+ /// after power-to-dB: mel bins whose center frequency exceeds
399+ /// `orig_sample_rate / 2 - 2500 Hz` are replaced with the 10th-percentile dB
400+ /// value of the valid (below-boundary) bins, then the whole spectrogram is
401+ /// clamped to `[-top_db, +20.0]`. This matches
402+ /// `bioacoustics_spectrograms.compute_mel_spectrograms_gpu(fill_highfreq=True,
403+ /// fill_mean_below_sr=False)` exactly (RP-27, 2026-06-01). When the flag is
404+ /// off, or when no resample happened, the fill step is a no-op and behavior
405+ /// matches the pre-RP-27 implementation.
406+ ///
382407/// Emits tracing events for `audio.preprocess.mel_gemm` and
383408/// `audio.preprocess.power_to_db`. The internal STFT call emits
384409/// `audio.preprocess.window_frame` and `audio.preprocess.fft` from inside
385410/// [`stft`].
386411pub fn mel_spectrogram (
387412 samples : & [ f32 ] ,
413+ orig_sample_rate : u32 ,
388414 config : & AudioPreprocessConfig ,
389415 filterbank : & MelFilterbank ,
390416) -> Result < Array4 < f32 > > {
@@ -454,13 +480,114 @@ pub fn mel_spectrogram(
454480 n_values = mel. len( ) ,
455481 ) ;
456482
483+ // Step 3b (RP-27): optional fill_highfreq for upsampled inputs.
484+ if config. fill_highfreq && orig_sample_rate < config. sample_rate {
485+ let t_fill = Instant :: now ( ) ;
486+ apply_fill_highfreq ( & mut mel, n_mels, n_frames, orig_sample_rate, config) ;
487+ tracing:: info!(
488+ stage = "audio.preprocess.fill_highfreq" ,
489+ duration_ns = t_fill. elapsed( ) . as_nanos( ) as u64 ,
490+ orig_sr = orig_sample_rate,
491+ target_sr = config. sample_rate,
492+ ) ;
493+ }
494+
457495 // Step 4: Tensor [1, 1, n_mels, n_frames]
458496 let tensor = Array4 :: from_shape_vec ( [ 1 , 1 , n_mels, n_frames] , mel)
459497 . map_err ( |e| SparrowEngineError :: AudioPreprocess ( e. to_string ( ) ) ) ?;
460498
461499 Ok ( tensor)
462500}
463501
502+ /// Apply the PytorchWildlife `fill_highfreq` treatment to a dB-scale mel
503+ /// spectrogram in-place (RP-27, 2026-06-01).
504+ ///
505+ /// For inputs whose native sample rate is below `config.sample_rate`, mel
506+ /// bins above `orig_sample_rate/2 - 2500 Hz` carry no useful signal — at
507+ /// training time these bins were replaced with a noise-floor estimate (the
508+ /// 10th-percentile dB value over all valid bins) so the model never learned
509+ /// to depend on them. At inference time, leaving them at the power-to-dB
510+ /// clamp floor (`max − top_db`) produces a different distribution and biases
511+ /// the model. This routine reproduces the training-time fill exactly.
512+ ///
513+ /// `mel` is laid out as `[n_mels, n_frames]` (row-major). Caller guarantees
514+ /// `orig_sample_rate < config.sample_rate` and `mel.len() == n_mels * n_frames`.
515+ fn apply_fill_highfreq (
516+ mel : & mut [ f32 ] ,
517+ n_mels : usize ,
518+ n_frames : usize ,
519+ orig_sample_rate : u32 ,
520+ config : & AudioPreprocessConfig ,
521+ ) {
522+ debug_assert ! ( orig_sample_rate < config. sample_rate) ;
523+ debug_assert_eq ! ( mel. len( ) , n_mels * n_frames) ;
524+
525+ // Mel bin center frequencies, matching `librosa.mel_frequencies(n_mels,
526+ // fmin=config.fmin, fmax=config.fmax)`. librosa uses endpoint-inclusive
527+ // linspace over n_mels positions, NOT the n_mels+2 triangular-filter
528+ // anchors used by [`mel_filterbank`]. This distinction is load-bearing
529+ // for `fill_highfreq`: torchaudio's MelSpectrogram (used for the filterbank
530+ // matmul) and librosa's mel_frequencies (used by PW Bioacoustics
531+ // `fill_highfreq` to decide which bins are "noise") have different
532+ // center-frequency conventions; we must match the latter exactly here.
533+ let mel_min = slaney_hz_to_mel ( config. fmin ) ;
534+ let mel_max = slaney_hz_to_mel ( config. fmax ) ;
535+ let mel_centers_hz: Vec < f32 > = ( 0 ..n_mels)
536+ . map ( |i| {
537+ let mel = mel_min + ( mel_max - mel_min) * i as f32 / ( n_mels - 1 ) . max ( 1 ) as f32 ;
538+ slaney_mel_to_hz ( mel)
539+ } )
540+ . collect ( ) ;
541+
542+ let nyq_orig = ( orig_sample_rate as f32 / 2.0 ) - 2500.0 ;
543+ let noise_mask: Vec < bool > = mel_centers_hz. iter ( ) . map ( |& hz| hz > nyq_orig) . collect ( ) ;
544+ let n_noise: usize = noise_mask. iter ( ) . filter ( |& & b| b) . count ( ) ;
545+ if n_noise == 0 {
546+ return ; // no bins above boundary — nothing to fill, no clamp.
547+ }
548+
549+ // 10th-percentile dB of valid (below-boundary) bins.
550+ // librosa uses k = ceil(0.10 * len(valid_vals)); torch.kthvalue is
551+ // 1-indexed and returns the value at position k of the sorted ascending
552+ // sequence. Mirror that semantics exactly.
553+ let n_valid = n_mels - n_noise;
554+ debug_assert ! ( n_valid > 0 ) ; // when n_noise = n_mels we'd've returned above
555+ let mut valid_vals: Vec < f32 > = Vec :: with_capacity ( n_valid * n_frames) ;
556+ for ( m, & is_noise) in noise_mask. iter ( ) . enumerate ( ) {
557+ if !is_noise {
558+ valid_vals. extend_from_slice ( & mel[ m * n_frames..( m + 1 ) * n_frames] ) ;
559+ }
560+ }
561+ let k = ( 0.10_f32 * valid_vals. len ( ) as f32 ) . ceil ( ) as usize ;
562+ let k = k. max ( 1 ) . min ( valid_vals. len ( ) ) ;
563+ // Partial sort: nth_element semantics. select_nth_unstable is O(n) and
564+ // gives us the kth-smallest element in valid_vals[k-1] after the call.
565+ valid_vals. select_nth_unstable_by ( k - 1 , |a, b| a. partial_cmp ( b) . unwrap ( ) ) ;
566+ let mu = valid_vals[ k - 1 ] ;
567+
568+ // Replace noise bins with mu.
569+ for ( m, & is_noise) in noise_mask. iter ( ) . enumerate ( ) {
570+ if is_noise {
571+ for v in & mut mel[ m * n_frames..( m + 1 ) * n_frames] {
572+ * v = mu;
573+ }
574+ }
575+ }
576+
577+ // Final clamp to [-top_db, +20.0]. Matches PW upstream: after the fill,
578+ // the spectrogram is clamped to the broader [-top_db, +20] range
579+ // (regardless of the per-segment amax used in step 3's top_db clamp).
580+ let lo = -config. top_db ;
581+ let hi = 20.0_f32 ;
582+ for v in mel. iter_mut ( ) {
583+ if * v < lo {
584+ * v = lo;
585+ } else if * v > hi {
586+ * v = hi;
587+ }
588+ }
589+ }
590+
464591// ---------------------------------------------------------------------------
465592// WAV decoding
466593// ---------------------------------------------------------------------------
@@ -1079,21 +1206,22 @@ mod tests {
10791206 fmin : 0.0 ,
10801207 fmax : 8_000.0 ,
10811208 top_db : 80.0 ,
1209+ fill_highfreq : false ,
10821210 } ;
10831211 let samples = vec ! [ 0.0f32 ; 128 ] ;
10841212 let wrong_mels = MelFilterbank {
10851213 data : vec ! [ 0.0 ; 3 * 33 ] ,
10861214 n_mels : 3 ,
10871215 n_freqs : 33 ,
10881216 } ;
1089- assert ! ( mel_spectrogram( & samples, & config, & wrong_mels) . is_err( ) ) ;
1217+ assert ! ( mel_spectrogram( & samples, config . sample_rate , & config, & wrong_mels) . is_err( ) ) ;
10901218
10911219 let wrong_freqs = MelFilterbank {
10921220 data : vec ! [ 0.0 ; 2 * 32 ] ,
10931221 n_mels : 2 ,
10941222 n_freqs : 32 ,
10951223 } ;
1096- assert ! ( mel_spectrogram( & samples, & config, & wrong_freqs) . is_err( ) ) ;
1224+ assert ! ( mel_spectrogram( & samples, config . sample_rate , & config, & wrong_freqs) . is_err( ) ) ;
10971225 }
10981226
10991227 #[ test]
@@ -1129,7 +1257,7 @@ mod tests {
11291257 let config = AudioPreprocessConfig :: default ( ) ;
11301258 let fb = MelFilterbank :: new ( & config) . expect ( "MelFilterbank::new" ) ;
11311259 let samples = vec ! [ 0.0f32 ; 48000 ] ;
1132- let tensor = mel_spectrogram ( & samples, & config, & fb) . unwrap ( ) ;
1260+ let tensor = mel_spectrogram ( & samples, config . sample_rate , & config, & fb) . unwrap ( ) ;
11331261 assert_eq ! ( tensor. shape( ) , & [ 1 , 1 , 224 , 90 ] ) ;
11341262 }
11351263
@@ -1138,7 +1266,7 @@ mod tests {
11381266 let config = AudioPreprocessConfig :: default ( ) ;
11391267 let fb = MelFilterbank :: new ( & config) . expect ( "MelFilterbank::new" ) ;
11401268 let samples = vec ! [ 0.0f32 ; 1024 ] ; // Too short for n_fft=2048
1141- let result = mel_spectrogram ( & samples, & config, & fb) ;
1269+ let result = mel_spectrogram ( & samples, config . sample_rate , & config, & fb) ;
11421270 assert ! ( result. is_err( ) ) ;
11431271 }
11441272
@@ -1305,6 +1433,7 @@ mod phase_a_r1_preprocess_audio {
13051433 fmin : 0.0 ,
13061434 fmax : 8000.0 ,
13071435 top_db : 80.0 ,
1436+ fill_highfreq : false ,
13081437 } ;
13091438 let fb1 = MelFilterbank :: new ( & cfg) . expect ( "MelFilterbank::new" ) ;
13101439 let fb2 = MelFilterbank :: new ( & cfg) . expect ( "MelFilterbank::new" ) ;
@@ -1337,6 +1466,7 @@ mod phase_a_r1_preprocess_audio {
13371466 fmin : 0.0 ,
13381467 fmax : 8000.0 ,
13391468 top_db : 80.0 ,
1469+ fill_highfreq : false ,
13401470 } ;
13411471 let samples: Vec < f32 > = ( 0 ..16384 ) . map ( |i| ( i as f32 / 32.0 ) . sin ( ) * 0.5 ) . collect ( ) ;
13421472 let loaded = load_audio (
@@ -1366,7 +1496,7 @@ mod phase_a_r1_preprocess_audio {
13661496 let cfg = AudioPreprocessConfig :: default ( ) ;
13671497 let fb = MelFilterbank :: new ( & cfg) . expect ( "MelFilterbank::new" ) ;
13681498 let samples = vec ! [ 0.0f32 ; 48000 ] ;
1369- let tensor = mel_spectrogram ( & samples, & cfg, & fb) . unwrap ( ) ;
1499+ let tensor = mel_spectrogram ( & samples, cfg . sample_rate , & cfg, & fb) . unwrap ( ) ;
13701500 let slice = tensor. as_slice ( ) . unwrap ( ) ;
13711501 // All entries must be finite (no NaN from log10 thanks to .max(epsilon)).
13721502 for v in slice {
@@ -1403,7 +1533,7 @@ mod phase_a_r1_preprocess_audio {
14031533 for sample in samples. iter_mut ( ) . skip ( 24000 ) . take ( 100 ) {
14041534 * sample = 0.5 ;
14051535 }
1406- let tensor = mel_spectrogram ( & samples, & cfg, & fb) . unwrap ( ) ;
1536+ let tensor = mel_spectrogram ( & samples, cfg . sample_rate , & cfg, & fb) . unwrap ( ) ;
14071537 let slice = tensor. as_slice ( ) . unwrap ( ) ;
14081538 let max = slice. iter ( ) . cloned ( ) . fold ( f32:: NEG_INFINITY , f32:: max) ;
14091539 let min = slice. iter ( ) . cloned ( ) . fold ( f32:: INFINITY , f32:: min) ;
0 commit comments