kaldi-asr · pegahgh · Dec 12, 2018 · Dec 16, 2018 · Dec 16, 2018 · Dec 21, 2018
diff --git a/src/feat/feature-mfcc-test.cc b/src/feat/feature-mfcc-test.cc
@@ -95,8 +95,8 @@ static void UnitTestSimple() {
   op.frame_opts.round_to_power_of_two = true;
   op.mel_opts.low_freq = 0.0;
   op.mel_opts.htk_mode = true;
+  op.mel_opts.modified = (Rand() % 2 == 0 ? true : false);
   op.htk_compat = true;
-
   Mfcc mfcc(op);
   // use default parameters
 
@@ -613,42 +613,29 @@ static void UnitTestHTKCompare6() {
   }
 
   std::cout << "Test passed :)\n\n";
-  
+
   unlink("tmp.test.wav.fea_kaldi.6");
 }
 
 void UnitTestVtln() {
   // Test the function VtlnWarpFreq.
-  BaseFloat low_freq = 10, high_freq = 7800,
-      vtln_low_cutoff = 20, vtln_high_cutoff = 7400;
-
+  BaseFloat low_freq = 10, high_freq = 7800;
+  MelBanksOptions mel_opts;
+  mel_opts.low_freq = low_freq, mel_opts.high_freq = high_freq;
+  FrameExtractionOptions frame_opts;
+  MelBanks melfbank(mel_opts, frame_opts, 0.9);
   for (size_t i = 0; i < 100; i++) {
     BaseFloat freq = 5000, warp_factor = 0.9 + RandUniform() * 0.2;
-    AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
-                             low_freq, high_freq, warp_factor,
-                             freq),
-                freq / warp_factor);
-
-    AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
-                             low_freq, high_freq, warp_factor,
-                             low_freq),
-                low_freq);
-    AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
-                             low_freq, high_freq, warp_factor,
-                             high_freq),
-                high_freq);
+    AssertEqual(melfbank.VtlnWarpFreq(warp_factor, freq), freq / warp_factor);
+
+    AssertEqual(melfbank.VtlnWarpFreq(warp_factor, low_freq), low_freq);
+    AssertEqual(melfbank.VtlnWarpFreq(warp_factor, high_freq), high_freq);
     BaseFloat freq2 = low_freq + (high_freq-low_freq) * RandUniform(),
         freq3 = freq2 +  (high_freq-freq2) * RandUniform();  // freq3>=freq2
-    BaseFloat w2 = MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
-                                low_freq, high_freq, warp_factor,
-                                freq2);
-    BaseFloat w3 = MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
-                                low_freq, high_freq, warp_factor,
-                                freq3);
+    BaseFloat w2 = melfbank.VtlnWarpFreq(warp_factor, freq2);
+    BaseFloat w3 = melfbank.VtlnWarpFreq(warp_factor, freq3);
     KALDI_ASSERT(w3 >= w2);  // increasing function.
-    BaseFloat w3dash = MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
-                                    low_freq, high_freq, 1.0,
-                                    freq3);
+    BaseFloat w3dash = melfbank.VtlnWarpFreq(1.0, freq3);
     AssertEqual(w3dash, freq3);
   }
 }

diff --git a/src/feat/mel-computations.cc b/src/feat/mel-computations.cc
@@ -34,86 +34,65 @@ MelBanks::MelBanks(const MelBanksOptions &opts,
                    const FrameExtractionOptions &frame_opts,
                    BaseFloat vtln_warp_factor):
     htk_mode_(opts.htk_mode) {
+  SetConfigs(opts, frame_opts, vtln_warp_factor);
+
   int32 num_bins = opts.num_bins;
   if (num_bins < 3) KALDI_ERR << "Must have at least 3 mel bins";
-  BaseFloat sample_freq = frame_opts.samp_freq;
-  int32 window_length_padded = frame_opts.PaddedWindowSize();
-  KALDI_ASSERT(window_length_padded % 2 == 0);
-  int32 num_fft_bins = window_length_padded / 2;
-  BaseFloat nyquist = 0.5 * sample_freq;
 
-  BaseFloat low_freq = opts.low_freq, high_freq;
-  if (opts.high_freq > 0.0)
-    high_freq = opts.high_freq;
-  else
-    high_freq = nyquist + opts.high_freq;
 
-  if (low_freq < 0.0 || low_freq >= nyquist
-      || high_freq <= 0.0 || high_freq > nyquist
-      || high_freq <= low_freq)
-    KALDI_ERR << "Bad values in options: low-freq " << low_freq
-              << " and high-freq " << high_freq << " vs. nyquist "
-              << nyquist;
-
-  BaseFloat fft_bin_width = sample_freq / window_length_padded;
-  // fft-bin width [think of it as Nyquist-freq / half-window-length]
+  BaseFloat mel_low_freq = MelScale(low_freq_);
+  BaseFloat mel_high_freq = MelScale(high_freq_);
 
-  BaseFloat mel_low_freq = MelScale(low_freq);
-  BaseFloat mel_high_freq = MelScale(high_freq);
 
-  debug_ = opts.debug_mel;
 
-  // divide by num_bins+1 in next line because of end-effects where the bins
-  // spread out to the sides.
-  BaseFloat mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins+1);
+  bins_.resize(num_bins);
+  center_freqs_.Resize(num_bins);
 
-  BaseFloat vtln_low = opts.vtln_low,
-      vtln_high = opts.vtln_high;
-  if (vtln_high < 0.0) {
-    vtln_high += nyquist;
+  for (int32 bin = 0; bin < num_bins; bin++) {
+    BaseFloat mel = mel_low_freq +
+        (bin + 1) * (mel_high_freq - mel_low_freq) / (num_bins + 1);
+    if (vtln_warp_factor != 1.0)
+      mel = VtlnWarpMelFreq(vtln_warp_factor, mel);
+    center_freqs_(bin) = InverseMelScale(mel);
   }
 
-  if (vtln_warp_factor != 1.0 &&
-      (vtln_low < 0.0 || vtln_low <= low_freq
-       || vtln_low >= high_freq
-       || vtln_high <= 0.0 || vtln_high >= high_freq
-       || vtln_high <= vtln_low))
-    KALDI_ERR << "Bad values in options: vtln-low " << vtln_low
-              << " and vtln-high " << vtln_high << ", versus "
-              << "low-freq " << low_freq << " and high-freq "
-              << high_freq;
+  if (!opts.modified)
+    ComputeBins(opts.htk_mode);
+  else
+    ComputeModifiedBins();
 
-  bins_.resize(num_bins);
-  center_freqs_.Resize(num_bins);
+  if (debug_) {
+    for (size_t i = 0; i < bins_.size(); i++) {
+      KALDI_LOG << "bin " << i << ", offset = " << bins_[i].first
+                << ", vec = " << bins_[i].second;
+    }
+  }
+}
 
+void MelBanks::ComputeBins(bool htk_mode) {
+  int32 num_bins = center_freqs_.Dim();
   for (int32 bin = 0; bin < num_bins; bin++) {
-    BaseFloat left_mel = mel_low_freq + bin * mel_freq_delta,
-        center_mel = mel_low_freq + (bin + 1) * mel_freq_delta,
-        right_mel = mel_low_freq + (bin + 2) * mel_freq_delta;
-
-    if (vtln_warp_factor != 1.0) {
-      left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
-                                 vtln_warp_factor, left_mel);
-      center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
-                                 vtln_warp_factor, center_mel);
-      right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
-                                  vtln_warp_factor, right_mel);
-    }
-    center_freqs_(bin) = InverseMelScale(center_mel);
+    // center_mel is the center frequency (in mel) of this bin, and left_mel and
+    // right_mel are those of the bins immediately to the left and right.
+    BaseFloat center_mel = MelScale(center_freqs_(bin)),
+        left_mel = MelScale(bin == 0 ?
+                            low_freq_ : center_freqs_(bin - 1)),
+        right_mel = MelScale(bin == num_bins - 1 ?
+                             high_freq_ : center_freqs_(bin + 1));
     // this_bin will be a vector of coefficients that is only
     // nonzero where this mel bin is active.
-    Vector<BaseFloat> this_bin(num_fft_bins);
+    Vector<BaseFloat> this_bin(num_fft_bins_);
     int32 first_index = -1, last_index = -1;
-    for (int32 i = 0; i < num_fft_bins; i++) {
-      BaseFloat freq = (fft_bin_width * i);  // Center frequency of this fft
+    for (int32 i = 0; i < num_fft_bins_; i++) {
+      BaseFloat freq = (fft_bin_width_ * i);  // Center frequency of this fft
                                              // bin.
       BaseFloat mel = MelScale(freq);
       if (mel > left_mel && mel < right_mel) {
         BaseFloat weight;
         if (mel <= center_mel)
           weight = (mel - left_mel) / (center_mel - left_mel);
         else
-         weight = (right_mel-mel) / (right_mel-center_mel);
+         weight = (right_mel - mel) / (right_mel - center_mel);
         this_bin(i) = weight;
         if (first_index == -1)
           first_index = i;
@@ -129,29 +108,73 @@ MelBanks::MelBanks(const MelBanksOptions &opts,
     bins_[bin].second.CopyFromVec(this_bin.Range(first_index, size));
 
     // Replicate a bug in HTK, for testing purposes.
-    if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0)
+    if (htk_mode && bin == 0 && low_freq_ != 0.0)
       bins_[bin].second(0) = 0.0;
-
   }
-  if (debug_) {
-    for (size_t i = 0; i < bins_.size(); i++) {
-      KALDI_LOG << "bin " << i << ", offset = " << bins_[i].first
-                << ", vec = " << bins_[i].second;
+}
+
+/*
+  Notes on the shape of the modified bins.
+
+  They are shaped like a cosine function from -pi/2 to pi/2 (unlike the standard
+  triangular bins).  We define their diameter as the distance between the
+  first and last nonzero value (pi for the canonical function).  If there are
+  a lot of bins, their diameter is defined by a formula and it's a function of
+  the center frequency f of the bin:
+     diameter = 30 + 60 f / (f + 500).
+  so it increases from 30Hz to 90Hz with a knee around 500Hz.
+  However (and this matters if the number of bins is relatively small), we never
+  let the diameter fall below the point where the crossing-point of this and
+  the next bin would be less than 0.1.  By this I mean the y-value where the
+  raised-cosines cross.  This value ensures that there won't be too big a 'dip'
+  in the middle of the two bins.
+ */
+void MelBanks::ComputeModifiedBins() {
+  int32 num_bins = center_freqs_.Dim();
+  for (int32 bin = 0; bin < num_bins; bin++) {
+    BaseFloat center_freq = center_freqs_(bin),
+        next_center = (bin == num_bins - 1 ?
+                       high_freq_ : center_freqs_(bin + 1));
+
+    // note: breakpoint_ is 500 (Hz).
+    BaseFloat diameter_floor = (next_center - center_freq) * 1.1,
+        diameter = 30.0 + 60.0 * (center_freq / (center_freq + breakpoint_));
+
+    diameter = pow(diameter * diameter + diameter_floor * diameter_floor, 0.5);
+
+    // 'freq_scale' is the scaling factor on the frequencies that will ensure
+    // that the diameter becomes equal to pi, like the canonical bin function
+    // (the cosine from -pi/2 to pi/2).
+    BaseFloat freq_scale = M_PI / diameter;
+
+    // this_bin will be a vector of coefficients that is only
+    // nonzero where this mel bin is active.
+    Vector<BaseFloat> this_bin(num_fft_bins_);
+    int32 first_index = -1, last_index = -1;
+
+    for (int32 i = 0; i < num_fft_bins_; i++) {
+      BaseFloat freq = (fft_bin_width_ * i);  // Center frequency of this fft
+                                             // bin.
+      BaseFloat normalized_freq = freq_scale * (freq - center_freq);
+      if (normalized_freq > -M_PI_2 && normalized_freq < M_PI_2) {
+        BaseFloat weight = cos(normalized_freq);
+        this_bin(i) = weight;
+        if (first_index == -1)
+          first_index = i;
+        last_index = i;
+      }
     }
+    KALDI_ASSERT(first_index != -1 && last_index >= first_index
+                 && "You may have set --num-mel-bins too large.");
+
+    bins_[bin].first = first_index;
+    int32 size = last_index + 1 - first_index;
+    bins_[bin].second.Resize(size);
+    bins_[bin].second.CopyFromVec(this_bin.Range(first_index, size));
   }
 }
 
-MelBanks::MelBanks(const MelBanks &other):
-    center_freqs_(other.center_freqs_),
-    bins_(other.bins_),
-    debug_(other.debug_),
-    htk_mode_(other.htk_mode_) { }
-
-BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff,  // upper+lower frequency cutoffs for VTLN.
-                                 BaseFloat vtln_high_cutoff,
-                                 BaseFloat low_freq,  // upper+lower frequency cutoffs in mel computation
-                                 BaseFloat high_freq,
-                                 BaseFloat vtln_warp_factor,
+BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_warp_factor,
                                  BaseFloat freq) {
   /// This computes a VTLN warping function that is not the same as HTK's one,
   /// but has similar inputs (this function has the advantage of never producing
@@ -180,45 +203,34 @@ BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff,  // upper+lower freq
   ///                       = vtln_low_cutoff * max(1, vtln_warp_factor)
 
 
-  if (freq < low_freq || freq > high_freq) return freq;  // in case this gets called
+  if (freq < low_freq_ || freq > high_freq_) return freq;  // in case this gets called
   // for out-of-range frequencies, just return the freq.
 
-  KALDI_ASSERT(vtln_low_cutoff > low_freq &&
-               "be sure to set the --vtln-low option higher than --low-freq");
-  KALDI_ASSERT(vtln_high_cutoff < high_freq &&
-               "be sure to set the --vtln-high option lower than --high-freq [or negative]");
-  BaseFloat one = 1.0;
-  BaseFloat l = vtln_low_cutoff * std::max(one, vtln_warp_factor);
-  BaseFloat h = vtln_high_cutoff * std::min(one, vtln_warp_factor);
+  BaseFloat l = vtln_low_ * std::max(BaseFloat(1.0), vtln_warp_factor);
+  BaseFloat h = vtln_high_ * std::min(BaseFloat(1.0), vtln_warp_factor);
   BaseFloat scale = 1.0 / vtln_warp_factor;
   BaseFloat Fl = scale * l;  // F(l);
   BaseFloat Fh = scale * h;  // F(h);
-  KALDI_ASSERT(l > low_freq && h < high_freq);
+  KALDI_ASSERT(l > low_freq_ && h < high_freq_);
   // slope of left part of the 3-piece linear function
-  BaseFloat scale_left = (Fl - low_freq) / (l - low_freq);
+  BaseFloat scale_left = (Fl - low_freq_) / (l - low_freq_);
   // [slope of center part is just "scale"]
 
   // slope of right part of the 3-piece linear function
-  BaseFloat scale_right = (high_freq - Fh) / (high_freq - h);
+  BaseFloat scale_right = (high_freq_ - Fh) / (high_freq_ - h);
 
   if (freq < l) {
-    return low_freq + scale_left * (freq - low_freq);
+    return low_freq_ + scale_left * (freq - low_freq_);
   } else if (freq < h) {
     return scale * freq;
   } else {  // freq >= h
-    return high_freq + scale_right * (freq - high_freq);
+    return high_freq_ + scale_right * (freq - high_freq_);
   }
 }
 
-BaseFloat MelBanks::VtlnWarpMelFreq(BaseFloat vtln_low_cutoff,  // upper+lower frequency cutoffs for VTLN.
-                                    BaseFloat vtln_high_cutoff,
-                                    BaseFloat low_freq,  // upper+lower frequency cutoffs in mel computation
-                                    BaseFloat high_freq,
-                                    BaseFloat vtln_warp_factor,
+BaseFloat MelBanks::VtlnWarpMelFreq(BaseFloat vtln_warp_factor,
                                     BaseFloat mel_freq) {
-  return MelScale(VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
-                               low_freq, high_freq,
-                               vtln_warp_factor, InverseMelScale(mel_freq)));
+  return MelScale(VtlnWarpFreq(vtln_warp_factor, InverseMelScale(mel_freq)));
 }
 
 
@@ -250,6 +262,53 @@ void MelBanks::Compute(const VectorBase<BaseFloat> &power_spectrum,
   }
 }
 
+void MelBanks::SetConfigs(const MelBanksOptions &opts,
+                          const FrameExtractionOptions &frame_opts,
+                          BaseFloat vtln_warp_factor) {
+  BaseFloat sample_freq = frame_opts.samp_freq,
+      nyquist = 0.5 * sample_freq;
+  int32 window_length_padded = frame_opts.PaddedWindowSize();
+  KALDI_ASSERT(window_length_padded % 2 == 0);
+  num_fft_bins_ = window_length_padded / 2;
+  // fft-bin width [think of it as Nyquist-freq / half-window-length]
+  fft_bin_width_ = sample_freq / window_length_padded;
+
+  debug_ = opts.debug_mel;
+
+
+  low_freq_ = opts.low_freq;
+  if (opts.high_freq > 0.0)
+    high_freq_ = opts.high_freq;
+  else
+    high_freq_ = nyquist + opts.high_freq;
+
+  if (low_freq_ < 0.0 || low_freq_ >= nyquist
+      || high_freq_ <= 0.0 || high_freq_ > nyquist
+      || high_freq_ <= low_freq_)
+    KALDI_ERR << "Bad values in options: low-freq " << low_freq_
+              << " and high-freq " << high_freq_ << " vs. nyquist "
+              << nyquist;
+
+  breakpoint_ = (opts.modified ? 500.0 : 700.0);
+  sec_breakpoint_ = (opts.modified ? 3500 : -1);
+  vtln_low_ = opts.vtln_low;
+  if (opts.vtln_high > 0.0)
+    vtln_high_ = opts.vtln_high;
+  else
+    vtln_high_ = opts.vtln_high + nyquist;
+
+  if (vtln_warp_factor != 1.0 &&
+      (vtln_low_ < 0.0 || vtln_low_ <= low_freq_
+       || vtln_low_ >= high_freq_
+       || vtln_high_ <= 0.0 || vtln_high_ >= high_freq_
+       || vtln_high_ <= vtln_low_))
+    KALDI_ERR << "Bad values in options: vtln-low " << vtln_low_
+              << " and vtln-high " << vtln_high_ << ", versus "
+              << "low-freq " << low_freq_ << " and high-freq "
+              << high_freq_;
+}
+
+
 void ComputeLifterCoeffs(BaseFloat Q, VectorBase<BaseFloat> *coeffs) {
   // Compute liftering coefficients (scaling on cepstral coeffs)
   // coeffs are numbered slightly differently from HTK: the zeroth