1717# You should have received a copy of the GNU General Public License
1818# along with Friture. If not, see <http://www.gnu.org/licenses/>.
1919
20+ # Pitch estimator adapted from libf0 (MIT License):
21+ # https://github.com/groupmm/libf0
22+ # Based on SWIPE algorithm by Arturo Camacho:
23+ # Arturo Camacho, John G. Harris; A sawtooth waveform inspired pitch estimator for speech and music.
24+ # J. Acoust. Soc. Am. 1 September 2008; 124 (3): 1638–1652. https://doi.org/10.1121/1.2951592
25+ # Released under GPLv3 for inclusion in Friture.
26+
2027from collections .abc import Generator
2128import logging
2229import math as m
3643 DEFAULT_DURATION ,
3744 DEFAULT_FFT_SIZE ,
3845 DEFAULT_MIN_DB ,
46+ DEFAULT_C_RES ,
47+ DEFAULT_P_CONF ,
48+ DEFAULT_P_DELTA ,
3949 PitchTrackerSettingsDialog ,
4050)
4151from friture .plotting .coordinateTransform import CoordinateTransform
@@ -70,10 +80,10 @@ def __init__(self, parent: QtWidgets.QWidget):
7080 self ._pitch_tracker_data .vertical_axis .setRange ( # type: ignore
7181 self .min_freq , self .max_freq )
7282 self ._pitch_tracker_data .vertical_axis .setScale ( # type: ignore
73- fscales .Octave )
83+ fscales .OctaveC )
7484 self .vertical_transform = CoordinateTransform (
7585 self .min_freq , self .max_freq , 1 , 0 , 0 )
76- self .vertical_transform .setScale (fscales .Octave )
86+ self .vertical_transform .setScale (fscales .OctaveC )
7787
7888 self .duration = DEFAULT_DURATION
7989 self ._pitch_tracker_data .horizontal_axis .setRange ( # type: ignore
@@ -117,11 +127,13 @@ def set_min_freq(self, value: int) -> None:
117127 self .min_freq = value
118128 self ._pitch_tracker_data .vertical_axis .setRange (self .min_freq , self .max_freq ) # type: ignore
119129 self .vertical_transform .setRange (self .min_freq , self .max_freq )
130+ self .tracker ._init_swipe () # Reinitialize kernels with new min_freq
120131
121132 def set_max_freq (self , value : int ) -> None :
122133 self .max_freq = value
123134 self ._pitch_tracker_data .vertical_axis .setRange (self .min_freq , self .max_freq ) # type: ignore
124135 self .vertical_transform .setRange (self .min_freq , self .max_freq )
136+ self .tracker ._init_swipe () # Reinitialize kernels with new max_freq
125137
126138 def set_duration (self , value : int ) -> None :
127139 self .duration = value
@@ -130,6 +142,9 @@ def set_duration(self, value: int) -> None:
130142 def set_min_db (self , value : float ) -> None :
131143 self .tracker .min_db = value
132144
145+ def set_conf (self , value : float ) -> None :
146+ self .tracker .conf = value
147+
133148 # slot
134149 def settings_called (self , checked : bool ) -> None :
135150 self .settings_dialog .show ()
@@ -142,6 +157,114 @@ def saveState(self, settings: QSettings) -> None:
142157 def restoreState (self , settings : QSettings ) -> None :
143158 self .settings_dialog .restore_state (settings )
144159
160+ def fastParabolicInterp (y1 , y2 , y3 ):
161+ """Estimate sub-bin pitch offset using parabolic interpolation.
162+
163+ To increase the precision of the pitch estimate beyond the kernel
164+ resolution (10 cents), the strongest pitch correlation (y2) and its two
165+ adjacent values (y1 and y3) are fitted to a parabola. The x-value of
166+ the vertex represents the positional offset of the "true" pitch
167+ relative to the center bin, enabling sub-bin pitch accuracy with fewer
168+ kernels.
169+
170+ SWIPE's discretized pitch strength values are calculated from an inner
171+ product of the audio signal with kernels made of cosine lobes. This
172+ allows the underlying, smooth function to be modeled using a Taylor
173+ series with even powers. Higher order terms don't contribute
174+ significantly, so a simple parabolic interpolation can generate sub-bin
175+ pitch accuracy.
176+
177+ Args:
178+ y1 (float): Pitch strength at frequency bin i - 1.
179+ y2 (float): Pitch strength at frequency bin i.
180+ y3 (float): Pitch strength at frequency bin i + 1.
181+
182+ Returns:
183+ tuple:
184+ vx (float): The position offset (in index) of the interpolated
185+ peak relative to the center index.
186+ vy (float): The interpolated pitch strength value at the peak.
187+ """
188+ a = (y1 - 2 * y2 + y3 ) / 2
189+ b = (y3 - y1 ) / 2
190+ #c = y2
191+
192+ vx = - b / (2 * a + np .finfo (np .float64 ).eps ) # Avoids division by zero
193+ vy = a * vx ** 2 + b * vx + y2
194+
195+ return vx , vy
196+
197+ def calcCosineKernel (f , freqList ):
198+ """Generate a SWIPE-style kernel based on candidate frequency 'f'.
199+
200+ SWIPE kernels are normalized, continuous curves generated by sums of
201+ positive and negative cosine lobes: positive lobes at the selected
202+ harmonics and negative lobes between. This SWIPE implementation uses a
203+ mixture of harmonics from SWIPE and SWIPE', tailored for a better match
204+ to human voices. The magnitude of the kernels decays as the square root
205+ of frequency, except for the fundamental and first harmonic, which are
206+ given equal weighting.
207+
208+ Args:
209+ f (float): Fundamental frequency (in Hz).
210+ freqList (ndarray): Array of frequency bins for the kernel (in Hz).
211+
212+ Returns:
213+ ndarray: Normalized kernel corresponding to the fundamental 'f'.
214+ """
215+ # Harmonics are a mix of SWIPE and SWIPE'. Good match for human voice.
216+ harmonics = np .array ([1 , 2 , 3 , 4 , 5 , 7 , 8 , 9 , 10 , 11 , 13 , 17 , 19 , 23 ])
217+
218+ # Thin peaks, wide valleys
219+ peakWidth = 0.15
220+ valleyWidth = 1 - peakWidth
221+
222+ valleyDivisor = 2
223+ decayStartHarmonic = 2
224+
225+ # Kernels don't need harmonics beyond the Nyquist frequency.
226+ maxPossibleHarmonics = int (freqList [- 1 ] / f )
227+ maxPossibleHarmonics = min (maxPossibleHarmonics , len (harmonics ))
228+ selectedHarmonics = harmonics [:maxPossibleHarmonics ]
229+
230+ # For high frequencies, the kernel is normalized by the number of harmonics.
231+ harmonicsNormalizer = maxPossibleHarmonics / len (harmonics )
232+
233+ # Normalize frequencies around the current frequency, f. Useful for harmonic comparisons.
234+ ratio = freqList / f
235+
236+ # Initialize the kernel
237+ k = np .zeros_like (freqList )
238+
239+ for i in np .arange (1 , harmonics [- 1 ] + 1 ):
240+ a = ratio - i # Distance from the harmonic
241+
242+ # Kernels are generated from left to right around the harmonic.
243+ if i in selectedHarmonics :
244+ valleyMask = np .logical_and (- valleyWidth < a , a < - peakWidth )
245+ k [valleyMask ] = - np .cos ((a [valleyMask ] + 0.5 ) / ((valleyWidth - peakWidth ) / 2 ) * (np .pi / 2 )) / valleyDivisor
246+
247+ peakMask = np .abs (a ) < peakWidth
248+ k [peakMask ] = np .cos (a [peakMask ] / peakWidth * (np .pi / 2 ))
249+
250+ else :
251+ valleyMask = np .logical_and (- valleyWidth < a , a < peakWidth )
252+ k [valleyMask ] = - np .cos ((a [valleyMask ] + 0.5 ) / ((valleyWidth - peakWidth ) / 2 ) * (np .pi / 2 )) / valleyDivisor
253+
254+ peakMask = np .abs (a ) < peakWidth
255+ k [peakMask ] = np .cos (a [peakMask ] / peakWidth * (np .pi / 2 )) / 4
256+
257+ # Piecewise decay
258+ decay = np .where (freqList <= f * (decayStartHarmonic + peakWidth ), np .sqrt (1.0 / (f * (decayStartHarmonic + peakWidth ))), np .sqrt (1.0 / freqList )) / np .sqrt (1.0 / (f * (decayStartHarmonic + peakWidth )))
259+ k *= decay
260+
261+ # Normalize the kernel to have a positive area of 1
262+ k /= np .sum (k [k > 0 ])
263+
264+ # Goose kernels with fewer harmonics
265+ k /= harmonicsNormalizer
266+
267+ return k
145268
146269class PitchTracker :
147270 def __init__ (
@@ -150,12 +273,23 @@ def __init__(
150273 fft_size : int = DEFAULT_FFT_SIZE ,
151274 overlap : float = 0.75 ,
152275 sample_rate : int = SAMPLING_RATE ,
276+ min_freq : float = DEFAULT_MIN_FREQ ,
277+ max_freq : float = DEFAULT_MAX_FREQ ,
153278 min_db : float = DEFAULT_MIN_DB ,
279+ cres : int = DEFAULT_C_RES ,
280+ conf : float = DEFAULT_P_CONF ,
281+ p_delta : int = DEFAULT_P_DELTA
154282 ):
155283 self .fft_size = fft_size
156284 self .overlap = overlap
157285 self .sample_rate = sample_rate
286+ self .min_freq = min_freq
287+ self .max_freq = max_freq
158288 self .min_db = min_db
289+ self .cres = cres
290+ self .conf = conf
291+ self .p_delta = p_delta
292+ self .prev_f0 = None
159293
160294 self .input_buf = input_buf
161295 self .input_buf .grow_if_needed (fft_size )
@@ -167,6 +301,9 @@ def __init__(
167301 self .proc = audioproc ()
168302 self .proc .set_fftsize (self .fft_size )
169303
304+ # Only generate the log-spaced pitch candidates and kernels for the SWIPE algorithm on instantiation.
305+ self ._init_swipe ()
306+
170307 def set_input_buffer (self , new_buf : RingBuffer ) -> None :
171308 self .input_buf = new_buf
172309 self .input_buf .grow_if_needed (self .fft_size )
@@ -194,29 +331,98 @@ def new_frames(self) -> Generator[np.ndarray, None, None]:
194331 self .next_in_offset + self .fft_size , self .fft_size )
195332 self .next_in_offset += m .floor (self .fft_size * (1.0 - self .overlap ))
196333
334+ def _init_swipe (self ):
335+ """Initialize log-spaced frequency grid and SWIPE kernels.
336+
337+ Constructs the log-spaced frequency grid (up to Nyquist) and the SWIPE
338+ kernels for frequencies up to the user selected max_freq.
339+
340+ The default pitch resolution (cres) of 10 cents produces 120 pitch
341+ candidates per octave, which, after parabolic interpolation, is sufficient
342+ to generate very precise pitch estimates.
343+ """
344+ numberOfLogSpacedFreqs = int (np .log2 (self .sample_rate / (2 * self .min_freq )) * (1200 / self .cres ))
345+ self .logSpacedFreqs = np .logspace (np .log2 (self .min_freq ), np .log2 (self .sample_rate // 2 ), num = numberOfLogSpacedFreqs , base = 2 )
346+
347+ # The pitch candidates for the kernels are a subset of the log-spaced freqs only up to the user's max_freq.
348+ fMaxIndex = np .searchsorted (self .logSpacedFreqs , self .max_freq )
349+ self .pitchCandidates = self .logSpacedFreqs [:fMaxIndex ]
350+
351+ self .kernels = np .zeros ((len (self .pitchCandidates ), len (self .logSpacedFreqs )))
352+ for i , freq in enumerate (self .pitchCandidates ):
353+ #self.kernels[i] = calcKernel(freq, self.logSpacedFreqs)
354+ self .kernels [i ] = calcCosineKernel (freq , self .logSpacedFreqs )
355+
197356 def estimate_pitch (self , frame : np .ndarray ) -> Optional [float ]:
357+ """Estimate the fundamental frequency from a single audio frame.
358+
359+ Each frame's spectrum amplitude, linearly spaced, is resampled onto a
360+ log-spaced frequency grid and correlated against SWIPE-like kernels.
361+ The strongest match is refined via parabolic interpolation to achieve
362+ sub-bin pitch precision.
363+
364+ Args:
365+ frame (np.ndarray): A 2D array containing one audio frame.
366+
367+ Returns:
368+ Optional[float]: Estimated pitch in Hz, or NaN if unvoiced.
369+ """
198370 spectrum = np .abs (np .fft .rfft (frame [0 , :] * self .proc .window ))
199371
200- # Compute harmonic product spectrum; the frequency with the largest
201- # value is quite likely to be a fundamental frequency.
202- # Chose 3 harmonics for no particularly good reason; empirically this
203- # seems reasonably effective.
204- product_count = 3
205- harmonic_length = spectrum .shape [0 ] // product_count
206- hps = spectrum [:harmonic_length ]
207- for i in range (2 , product_count + 1 ):
208- hps *= spectrum [::i ][:harmonic_length ]
209-
210- pitch_idx = np .argmax (hps )
211- if pitch_idx == 0 :
212- # This should only occur if the HPS is all zero. No pitch, don't
213- # try to take the log of zero.
214- return None
215-
216- # Compute dB for the detected fundamental; if it's too low presume it's
217- # a false detection and return no result.
218- db = 10 * np .log10 (spectrum [pitch_idx ] ** 2 / self .fft_size ** 2 )
219- if db < self .min_db :
220- return None
372+ # Generate the frequency bins (linear) to that correspond with the spectrum data
373+ freqLin = np .arange (len (spectrum ), dtype = 'float64' )
374+ freqLin *= float (self .sample_rate ) / float (self .fft_size )
375+
376+ # Create spline mapping from linear frequency to amplitude
377+ # Apply this spline to the log-spaced frequencies and normalize
378+ specLog = np .interp (self .logSpacedFreqs , freqLin , spectrum )
379+ specLogRMS = np .sqrt (np .mean (specLog ** 2 ))
380+ specLogNorm = specLog / specLogRMS
381+
382+ # Get the correlation between the audio frame and the kernels
383+ pitchStrengths = np .matmul (self .kernels , specLogNorm )
384+
385+ # Get the highest strength index
386+ idxMax = np .argmax (pitchStrengths )
387+
388+ # The highest values for pitchStrength (confidence) will come from
389+ # voiced pitches that nearly identically match the kernel for that
390+ # pitch, with a peak correlation product of around 2.56.
391+
392+ # Use parabolic interp to get frequency values between resolution bins
393+ if 0 < idxMax < len (pitchStrengths ) - 1 :
394+ y1 = pitchStrengths [idxMax - 1 ]
395+ y2 = pitchStrengths [idxMax ]
396+ y3 = pitchStrengths [idxMax + 1 ]
397+ idxShift , _ = fastParabolicInterp (y1 , y2 , y3 )
398+ else :
399+ idxShift = 0
400+
401+ # Generate and refine the pitch estimate by creating a mapping from indices
402+ # to frequency and applying it to idxMax with the interpolated index offset.
403+ f0 = np .interp (idxMax + idxShift , np .arange (len (self .logSpacedFreqs )), self .logSpacedFreqs )
404+
405+ # Get dBFS of the frame.
406+ # Only signals above user's min_db threshold will be considered voiced.
407+ RMS = np .sqrt (np .mean (frame ** 2 ))
408+ dBFS = 20 * np .log10 (RMS + np .finfo (np .float64 ).eps )
409+
410+ # Exclude pitch jumps greater than p_delta semitones from the previous pitch
411+ if self .prev_f0 is not None and f0 != np .nan :
412+ semitoneDiff = 12 * np .abs (np .log2 (f0 / self .prev_f0 ))
413+ else :
414+ semitoneDiff = 0
415+
416+ # Scale the correlation strength (confidence) to be between 0 and 1
417+ pitchConf = pitchStrengths [idxMax ] / 2.56
418+
419+ # Bool conditions for unreliable pitch estimate
420+ pitchUnreliable = (dBFS < self .min_db ) or (pitchConf < self .conf ) or (semitoneDiff > self .p_delta )
421+
422+ # Return
423+ if pitchUnreliable :
424+ self .prev_f0 = None
425+ return np .nan
221426 else :
222- return self .proc .freq [pitch_idx ]
427+ self .prev_f0 = f0
428+ return f0
0 commit comments