-
Notifications
You must be signed in to change notification settings - Fork 5.4k
added modified MFCC features based on DNN-c and fDNN-c features; it i… #2908
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
pegahgh
wants to merge
5
commits into
kaldi-asr:master
Choose a base branch
from
pegahgh:modified-mel-kaldi
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+254
−153
Open
Changes from 4 commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
232df9f
added modified MFCC features based on DNN-c and fDNN-c features; it i…
pegahgh 4eb4862
pushed to trigger the build (travis issue)
pegahgh 799969e
Merge branch 'master' of https://github.com/kaldi-asr/kaldi into modi…
pegahgh 126c89a
modified test set w.r.t new VtlnWarpMelFreq function.
pegahgh e272089
fixed typos.
pegahgh File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,86 +34,65 @@ MelBanks::MelBanks(const MelBanksOptions &opts, | |
const FrameExtractionOptions &frame_opts, | ||
BaseFloat vtln_warp_factor): | ||
htk_mode_(opts.htk_mode) { | ||
SetConfigs(opts, frame_opts, vtln_warp_factor); | ||
|
||
int32 num_bins = opts.num_bins; | ||
if (num_bins < 3) KALDI_ERR << "Must have at least 3 mel bins"; | ||
BaseFloat sample_freq = frame_opts.samp_freq; | ||
int32 window_length_padded = frame_opts.PaddedWindowSize(); | ||
KALDI_ASSERT(window_length_padded % 2 == 0); | ||
int32 num_fft_bins = window_length_padded / 2; | ||
BaseFloat nyquist = 0.5 * sample_freq; | ||
|
||
BaseFloat low_freq = opts.low_freq, high_freq; | ||
if (opts.high_freq > 0.0) | ||
high_freq = opts.high_freq; | ||
else | ||
high_freq = nyquist + opts.high_freq; | ||
|
||
if (low_freq < 0.0 || low_freq >= nyquist | ||
|| high_freq <= 0.0 || high_freq > nyquist | ||
|| high_freq <= low_freq) | ||
KALDI_ERR << "Bad values in options: low-freq " << low_freq | ||
<< " and high-freq " << high_freq << " vs. nyquist " | ||
<< nyquist; | ||
|
||
BaseFloat fft_bin_width = sample_freq / window_length_padded; | ||
// fft-bin width [think of it as Nyquist-freq / half-window-length] | ||
BaseFloat mel_low_freq = MelScale(low_freq_); | ||
BaseFloat mel_high_freq = MelScale(high_freq_); | ||
|
||
BaseFloat mel_low_freq = MelScale(low_freq); | ||
BaseFloat mel_high_freq = MelScale(high_freq); | ||
|
||
debug_ = opts.debug_mel; | ||
|
||
// divide by num_bins+1 in next line because of end-effects where the bins | ||
// spread out to the sides. | ||
BaseFloat mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins+1); | ||
bins_.resize(num_bins); | ||
center_freqs_.Resize(num_bins); | ||
|
||
BaseFloat vtln_low = opts.vtln_low, | ||
vtln_high = opts.vtln_high; | ||
if (vtln_high < 0.0) { | ||
vtln_high += nyquist; | ||
for (int32 bin = 0; bin < num_bins; bin++) { | ||
BaseFloat mel = mel_low_freq + | ||
(bin + 1) * (mel_high_freq - mel_low_freq) / (num_bins + 1); | ||
if (vtln_warp_factor != 1.0) | ||
mel = VtlnWarpMelFreq(vtln_warp_factor, mel); | ||
center_freqs_(bin) = InverseMelScale(mel); | ||
} | ||
|
||
if (vtln_warp_factor != 1.0 && | ||
(vtln_low < 0.0 || vtln_low <= low_freq | ||
|| vtln_low >= high_freq | ||
|| vtln_high <= 0.0 || vtln_high >= high_freq | ||
|| vtln_high <= vtln_low)) | ||
KALDI_ERR << "Bad values in options: vtln-low " << vtln_low | ||
<< " and vtln-high " << vtln_high << ", versus " | ||
<< "low-freq " << low_freq << " and high-freq " | ||
<< high_freq; | ||
if (!opts.modified) | ||
ComputeBins(opts.htk_mode); | ||
else | ||
ComputeModifiedBins(); | ||
|
||
bins_.resize(num_bins); | ||
center_freqs_.Resize(num_bins); | ||
if (debug_) { | ||
for (size_t i = 0; i < bins_.size(); i++) { | ||
KALDI_LOG << "bin " << i << ", offset = " << bins_[i].first | ||
<< ", vec = " << bins_[i].second; | ||
} | ||
} | ||
} | ||
|
||
void MelBanks::ComputeBins(bool htk_mode) { | ||
int32 num_bins = center_freqs_.Dim(); | ||
for (int32 bin = 0; bin < num_bins; bin++) { | ||
BaseFloat left_mel = mel_low_freq + bin * mel_freq_delta, | ||
center_mel = mel_low_freq + (bin + 1) * mel_freq_delta, | ||
right_mel = mel_low_freq + (bin + 2) * mel_freq_delta; | ||
|
||
if (vtln_warp_factor != 1.0) { | ||
left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, | ||
vtln_warp_factor, left_mel); | ||
center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, | ||
vtln_warp_factor, center_mel); | ||
right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, | ||
vtln_warp_factor, right_mel); | ||
} | ||
center_freqs_(bin) = InverseMelScale(center_mel); | ||
// center_mel is the center frequency (in mel) of this bin, and left_mel and | ||
// right_mel are those of the bins immediately to the left and right. | ||
BaseFloat center_mel = MelScale(center_freqs_(bin)), | ||
left_mel = MelScale(bin == 0 ? | ||
low_freq_ : center_freqs_(bin - 1)), | ||
right_mel = MelScale(bin == num_bins - 1 ? | ||
high_freq_ : center_freqs_(bin + 1)); | ||
// this_bin will be a vector of coefficients that is only | ||
// nonzero where this mel bin is active. | ||
Vector<BaseFloat> this_bin(num_fft_bins); | ||
Vector<BaseFloat> this_bin(num_fft_bins_); | ||
int32 first_index = -1, last_index = -1; | ||
for (int32 i = 0; i < num_fft_bins; i++) { | ||
BaseFloat freq = (fft_bin_width * i); // Center frequency of this fft | ||
for (int32 i = 0; i < num_fft_bins_; i++) { | ||
BaseFloat freq = (fft_bin_width_ * i); // Center frequency of this fft | ||
// bin. | ||
BaseFloat mel = MelScale(freq); | ||
if (mel > left_mel && mel < right_mel) { | ||
BaseFloat weight; | ||
if (mel <= center_mel) | ||
weight = (mel - left_mel) / (center_mel - left_mel); | ||
else | ||
weight = (right_mel-mel) / (right_mel-center_mel); | ||
weight = (right_mel - mel) / (right_mel - center_mel); | ||
this_bin(i) = weight; | ||
if (first_index == -1) | ||
first_index = i; | ||
|
@@ -129,29 +108,73 @@ MelBanks::MelBanks(const MelBanksOptions &opts, | |
bins_[bin].second.CopyFromVec(this_bin.Range(first_index, size)); | ||
|
||
// Replicate a bug in HTK, for testing purposes. | ||
if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0) | ||
if (htk_mode && bin == 0 && low_freq_ != 0.0) | ||
bins_[bin].second(0) = 0.0; | ||
|
||
} | ||
if (debug_) { | ||
for (size_t i = 0; i < bins_.size(); i++) { | ||
KALDI_LOG << "bin " << i << ", offset = " << bins_[i].first | ||
<< ", vec = " << bins_[i].second; | ||
} | ||
|
||
/* | ||
Notes on the shape of the modified bins. | ||
|
||
They are shaped like a cosine function from -pi/2 to pi/2 (unlike the standard | ||
triangular bins). We define their diameter as the distance between the | ||
first and last nonzero value (pi for the canonical function). If there are | ||
a lot of bins, their diameter is defined by a formula and it's a function of | ||
the center frequency f of the bin: | ||
diameter = 30 + 60 f / (f + 500). | ||
so it increases from 30Hz to 90Hz with a knee around 500Hz. | ||
However (and this matters if the number of bins is relatively small), we never | ||
let the diameter fall below the point where the crossing-point of this and | ||
the next bin would be less than 0.1. By 'crossing-point' I mean the y-value | ||
where the raised-cosines cross. This floor ensures that there won't be too deep | ||
a 'dip' in the middle of the two bins. | ||
*/ | ||
void MelBanks::ComputeModifiedBins() { | ||
int32 num_bins = center_freqs_.Dim(); | ||
for (int32 bin = 0; bin < num_bins; bin++) { | ||
BaseFloat center_freq = center_freqs_(bin), | ||
next_center = (bin == num_bins - 1 ? | ||
high_freq_ : center_freqs_(bin + 1)); | ||
|
||
// note: breakpoint_ is 500 (Hz). | ||
BaseFloat diameter_floor = (next_center - center_freq) * 1.1, | ||
diameter = 30.0 + 60.0 * (center_freq / (center_freq + breakpoint_)); | ||
|
||
diameter = pow(diameter * diameter + diameter_floor * diameter_floor, 0.5); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think sqrt would be easier than pow(.., 0.5). |
||
|
||
// 'freq_scale' is the scaling factor on the frequencies that will ensure | ||
// that the diameter becomes equal to pi, like the canonical bin function | ||
// (the cosine from -pi/2 to pi/2). | ||
BaseFloat freq_scale = M_PI / diameter; | ||
|
||
// this_bin will be a vector of coefficients that is only | ||
// nonzero where this mel bin is active. | ||
Vector<BaseFloat> this_bin(num_fft_bins_); | ||
int32 first_index = -1, last_index = -1; | ||
|
||
for (int32 i = 0; i < num_fft_bins_; i++) { | ||
BaseFloat freq = (fft_bin_width_ * i); // Center frequency of this fft | ||
// bin. | ||
BaseFloat normalized_freq = freq_scale * (freq - center_freq); | ||
if (normalized_freq > -M_PI_2 && normalized_freq < M_PI_2) { | ||
BaseFloat weight = cos(normalized_freq); | ||
this_bin(i) = weight; | ||
if (first_index == -1) | ||
first_index = i; | ||
last_index = i; | ||
} | ||
} | ||
KALDI_ASSERT(first_index != -1 && last_index >= first_index | ||
&& "You may have set --num-mel-bins too large."); | ||
|
||
bins_[bin].first = first_index; | ||
int32 size = last_index + 1 - first_index; | ||
bins_[bin].second.Resize(size); | ||
bins_[bin].second.CopyFromVec(this_bin.Range(first_index, size)); | ||
} | ||
} | ||
|
||
MelBanks::MelBanks(const MelBanks &other): | ||
center_freqs_(other.center_freqs_), | ||
bins_(other.bins_), | ||
debug_(other.debug_), | ||
htk_mode_(other.htk_mode_) { } | ||
|
||
BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN. | ||
BaseFloat vtln_high_cutoff, | ||
BaseFloat low_freq, // upper+lower frequency cutoffs in mel computation | ||
BaseFloat high_freq, | ||
BaseFloat vtln_warp_factor, | ||
BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_warp_factor, | ||
BaseFloat freq) { | ||
/// This computes a VTLN warping function that is not the same as HTK's one, | ||
/// but has similar inputs (this function has the advantage of never producing | ||
|
@@ -180,45 +203,34 @@ BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff, // upper+lower freq | |
/// = vtln_low_cutoff * max(1, vtln_warp_factor) | ||
|
||
|
||
if (freq < low_freq || freq > high_freq) return freq; // in case this gets called | ||
if (freq < low_freq_ || freq > high_freq_) return freq; // in case this gets called | ||
// for out-of-range frequencies, just return the freq. | ||
|
||
KALDI_ASSERT(vtln_low_cutoff > low_freq && | ||
"be sure to set the --vtln-low option higher than --low-freq"); | ||
KALDI_ASSERT(vtln_high_cutoff < high_freq && | ||
"be sure to set the --vtln-high option lower than --high-freq [or negative]"); | ||
BaseFloat one = 1.0; | ||
BaseFloat l = vtln_low_cutoff * std::max(one, vtln_warp_factor); | ||
BaseFloat h = vtln_high_cutoff * std::min(one, vtln_warp_factor); | ||
BaseFloat l = vtln_low_ * std::max(BaseFloat(1.0), vtln_warp_factor); | ||
BaseFloat h = vtln_high_ * std::min(BaseFloat(1.0), vtln_warp_factor); | ||
BaseFloat scale = 1.0 / vtln_warp_factor; | ||
BaseFloat Fl = scale * l; // F(l); | ||
BaseFloat Fh = scale * h; // F(h); | ||
KALDI_ASSERT(l > low_freq && h < high_freq); | ||
KALDI_ASSERT(l > low_freq_ && h < high_freq_); | ||
// slope of left part of the 3-piece linear function | ||
BaseFloat scale_left = (Fl - low_freq) / (l - low_freq); | ||
BaseFloat scale_left = (Fl - low_freq_) / (l - low_freq_); | ||
// [slope of center part is just "scale"] | ||
|
||
// slope of right part of the 3-piece linear function | ||
BaseFloat scale_right = (high_freq - Fh) / (high_freq - h); | ||
BaseFloat scale_right = (high_freq_ - Fh) / (high_freq_ - h); | ||
|
||
if (freq < l) { | ||
return low_freq + scale_left * (freq - low_freq); | ||
return low_freq_ + scale_left * (freq - low_freq_); | ||
} else if (freq < h) { | ||
return scale * freq; | ||
} else { // freq >= h | ||
return high_freq + scale_right * (freq - high_freq); | ||
return high_freq_ + scale_right * (freq - high_freq_); | ||
} | ||
} | ||
|
||
BaseFloat MelBanks::VtlnWarpMelFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN. | ||
BaseFloat vtln_high_cutoff, | ||
BaseFloat low_freq, // upper+lower frequency cutoffs in mel computation | ||
BaseFloat high_freq, | ||
BaseFloat vtln_warp_factor, | ||
BaseFloat MelBanks::VtlnWarpMelFreq(BaseFloat vtln_warp_factor, | ||
BaseFloat mel_freq) { | ||
return MelScale(VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, | ||
low_freq, high_freq, | ||
vtln_warp_factor, InverseMelScale(mel_freq))); | ||
return MelScale(VtlnWarpFreq(vtln_warp_factor, InverseMelScale(mel_freq))); | ||
} | ||
|
||
|
||
|
@@ -250,6 +262,53 @@ void MelBanks::Compute(const VectorBase<BaseFloat> &power_spectrum, | |
} | ||
} | ||
|
||
void MelBanks::SetConfigs(const MelBanksOptions &opts, | ||
const FrameExtractionOptions &frame_opts, | ||
BaseFloat vtln_warp_factor) { | ||
BaseFloat sample_freq = frame_opts.samp_freq, | ||
nyquist = 0.5 * sample_freq; | ||
int32 window_length_padded = frame_opts.PaddedWindowSize(); | ||
KALDI_ASSERT(window_length_padded % 2 == 0); | ||
num_fft_bins_ = window_length_padded / 2; | ||
// fft-bin width [think of it as Nyquist-freq / half-window-length] | ||
fft_bin_width_ = sample_freq / window_length_padded; | ||
|
||
debug_ = opts.debug_mel; | ||
|
||
|
||
low_freq_ = opts.low_freq; | ||
if (opts.high_freq > 0.0) | ||
high_freq_ = opts.high_freq; | ||
else | ||
high_freq_ = nyquist + opts.high_freq; | ||
|
||
if (low_freq_ < 0.0 || low_freq_ >= nyquist | ||
|| high_freq_ <= 0.0 || high_freq_ > nyquist | ||
|| high_freq_ <= low_freq_) | ||
KALDI_ERR << "Bad values in options: low-freq " << low_freq_ | ||
<< " and high-freq " << high_freq_ << " vs. nyquist " | ||
<< nyquist; | ||
|
||
breakpoint_ = (opts.modified ? 500.0 : 700.0); | ||
sec_breakpoint_ = (opts.modified ? 3500 : -1); | ||
vtln_low_ = opts.vtln_low; | ||
if (opts.vtln_high > 0.0) | ||
vtln_high_ = opts.vtln_high; | ||
else | ||
vtln_high_ = opts.vtln_high + nyquist; | ||
|
||
if (vtln_warp_factor != 1.0 && | ||
(vtln_low_ < 0.0 || vtln_low_ <= low_freq_ | ||
|| vtln_low_ >= high_freq_ | ||
|| vtln_high_ <= 0.0 || vtln_high_ >= high_freq_ | ||
|| vtln_high_ <= vtln_low_)) | ||
KALDI_ERR << "Bad values in options: vtln-low " << vtln_low_ | ||
<< " and vtln-high " << vtln_high_ << ", versus " | ||
<< "low-freq " << low_freq_ << " and high-freq " | ||
<< high_freq_; | ||
} | ||
|
||
|
||
void ComputeLifterCoeffs(BaseFloat Q, VectorBase<BaseFloat> *coeffs) { | ||
// Compute liftering coefficients (scaling on cepstral coeffs) | ||
// coeffs are numbered slightly differently from HTK: the zeroth | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This documentation seems a bit out of date.