Skip to content

Commit 1d0cd60

Browse files
committed
Rebased Pitch Tracker to friture/master
Pitch tracker now works with the latest friture changes. Kernel generation is slightly different, tuned with vocal stems (male/female) to return fewer spurious voiced pitches.
1 parent a04bf9b commit 1d0cd60

File tree

3 files changed

+290
-31
lines changed

3 files changed

+290
-31
lines changed

friture/pitch_tracker.py

Lines changed: 230 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,13 @@
1717
# You should have received a copy of the GNU General Public License
1818
# along with Friture. If not, see <http://www.gnu.org/licenses/>.
1919

20+
# Pitch estimator adapted from libf0 (MIT License):
21+
# https://github.com/groupmm/libf0
22+
# Based on SWIPE algorithm by Arturo Camacho:
23+
# Arturo Camacho, John G. Harris; A sawtooth waveform inspired pitch estimator for speech and music.
24+
# J. Acoust. Soc. Am. 1 September 2008; 124 (3): 1638–1652. https://doi.org/10.1121/1.2951592
25+
# Released under GPLv3 for inclusion in Friture.
26+
2027
from collections.abc import Generator
2128
import logging
2229
import math as m
@@ -36,6 +43,9 @@
3643
DEFAULT_DURATION,
3744
DEFAULT_FFT_SIZE,
3845
DEFAULT_MIN_DB,
46+
DEFAULT_C_RES,
47+
DEFAULT_P_CONF,
48+
DEFAULT_P_DELTA,
3949
PitchTrackerSettingsDialog,
4050
)
4151
from friture.plotting.coordinateTransform import CoordinateTransform
@@ -70,10 +80,10 @@ def __init__(self, parent: QtWidgets.QWidget):
7080
self._pitch_tracker_data.vertical_axis.setRange( # type: ignore
7181
self.min_freq, self.max_freq)
7282
self._pitch_tracker_data.vertical_axis.setScale( # type: ignore
73-
fscales.Octave)
83+
fscales.OctaveC)
7484
self.vertical_transform = CoordinateTransform(
7585
self.min_freq, self.max_freq, 1, 0, 0)
76-
self.vertical_transform.setScale(fscales.Octave)
86+
self.vertical_transform.setScale(fscales.OctaveC)
7787

7888
self.duration = DEFAULT_DURATION
7989
self._pitch_tracker_data.horizontal_axis.setRange( # type: ignore
@@ -117,11 +127,13 @@ def set_min_freq(self, value: int) -> None:
117127
self.min_freq = value
118128
self._pitch_tracker_data.vertical_axis.setRange(self.min_freq, self.max_freq) # type: ignore
119129
self.vertical_transform.setRange(self.min_freq, self.max_freq)
130+
self.tracker._init_swipe() # Reinitialize kernels with new min_freq
120131

121132
def set_max_freq(self, value: int) -> None:
122133
self.max_freq = value
123134
self._pitch_tracker_data.vertical_axis.setRange(self.min_freq, self.max_freq) # type: ignore
124135
self.vertical_transform.setRange(self.min_freq, self.max_freq)
136+
self.tracker._init_swipe() # Reinitialize kernels with new max_freq
125137

126138
def set_duration(self, value: int) -> None:
127139
self.duration = value
@@ -130,6 +142,9 @@ def set_duration(self, value: int) -> None:
130142
def set_min_db(self, value: float) -> None:
131143
self.tracker.min_db = value
132144

145+
def set_conf(self, value: float) -> None:
146+
self.tracker.conf = value
147+
133148
# slot
134149
def settings_called(self, checked: bool) -> None:
135150
self.settings_dialog.show()
@@ -142,6 +157,114 @@ def saveState(self, settings: QSettings) -> None:
142157
def restoreState(self, settings: QSettings) -> None:
143158
self.settings_dialog.restore_state(settings)
144159

160+
def fastParabolicInterp(y1, y2, y3):
161+
"""Estimate sub-bin pitch offset using parabolic interpolation.
162+
163+
To increase the precision of the pitch estimate beyond the kernel
164+
resolution (10 cents), the strongest pitch correlation (y2) and its two
165+
adjacent values (y1 and y3) are fitted to a parabola. The x-value of
166+
the vertex represents the positional offset of the "true" pitch
167+
relative to the center bin, enabling sub-bin pitch accuracy with fewer
168+
kernels.
169+
170+
SWIPE's discretized pitch strength values are calculated from an inner
171+
product of the audio signal with kernels made of cosine lobes. This
172+
allows the underlying, smooth function to be modeled using a Taylor
173+
series with even powers. Higher order terms don't contribute
174+
significantly, so a simple parabolic interpolation can generate sub-bin
175+
pitch accuracy.
176+
177+
Args:
178+
y1 (float): Pitch strength at frequency bin i - 1.
179+
y2 (float): Pitch strength at frequency bin i.
180+
y3 (float): Pitch strength at frequency bin i + 1.
181+
182+
Returns:
183+
tuple:
184+
vx (float): The position offset (in index) of the interpolated
185+
peak relative to the center index.
186+
vy (float): The interpolated pitch strength value at the peak.
187+
"""
188+
a = (y1 - 2*y2 + y3) / 2
189+
b = (y3 - y1) / 2
190+
#c = y2
191+
192+
vx = -b / (2 * a + np.finfo(np.float64).eps) # Avoids division by zero
193+
vy = a * vx**2 + b * vx + y2
194+
195+
return vx, vy
196+
197+
def calcCosineKernel(f, freqList):
198+
"""Generate a SWIPE-style kernel based on candidate frequency 'f'.
199+
200+
SWIPE kernels are normalized, continuous curves generated by sums of
201+
positive and negative cosine lobes: positive lobes at the selected
202+
harmonics and negative lobes between. This SWIPE implementation uses a
203+
mixture of harmonics from SWIPE and SWIPE', tailored for a better match
204+
to human voices. The magnitude of the kernels decays as the square root
205+
of frequency, except for the fundamental and first harmonic, which are
206+
given equal weighting.
207+
208+
Args:
209+
f (float): Fundamental frequency (in Hz).
210+
freqList (ndarray): Array of frequency bins for the kernel (in Hz).
211+
212+
Returns:
213+
ndarray: Normalized kernel corresponding to the fundamental 'f'.
214+
"""
215+
# Harmonics are a mix of SWIPE and SWIPE'. Good match for human voice.
216+
harmonics = np.array([1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 13, 17, 19, 23])
217+
218+
# Thin peaks, wide valleys
219+
peakWidth = 0.15
220+
valleyWidth = 1 - peakWidth
221+
222+
valleyDivisor = 2
223+
decayStartHarmonic = 2
224+
225+
# Kernels don't need harmonics beyond the Nyquist frequency.
226+
maxPossibleHarmonics = int(freqList[-1] / f)
227+
maxPossibleHarmonics = min(maxPossibleHarmonics, len(harmonics))
228+
selectedHarmonics = harmonics[:maxPossibleHarmonics]
229+
230+
# For high frequencies, the kernel is normalized by the number of harmonics.
231+
harmonicsNormalizer = maxPossibleHarmonics / len(harmonics)
232+
233+
# Normalize frequencies around the current frequency, f. Useful for harmonic comparisons.
234+
ratio = freqList / f
235+
236+
# Initialize the kernel
237+
k = np.zeros_like(freqList)
238+
239+
for i in np.arange(1, harmonics[-1] + 1):
240+
a = ratio - i # Distance from the harmonic
241+
242+
# Kernels are generated from left to right around the harmonic.
243+
if i in selectedHarmonics:
244+
valleyMask = np.logical_and(-valleyWidth < a, a < -peakWidth)
245+
k[valleyMask] = -np.cos((a[valleyMask] + 0.5) / ((valleyWidth - peakWidth) / 2) * (np.pi / 2)) / valleyDivisor
246+
247+
peakMask = np.abs(a) < peakWidth
248+
k[peakMask] = np.cos(a[peakMask] / peakWidth * (np.pi / 2))
249+
250+
else:
251+
valleyMask = np.logical_and(-valleyWidth < a, a < peakWidth)
252+
k[valleyMask] = -np.cos((a[valleyMask] + 0.5) / ((valleyWidth - peakWidth) / 2) * (np.pi / 2)) / valleyDivisor
253+
254+
peakMask = np.abs(a) < peakWidth
255+
k[peakMask] = np.cos(a[peakMask] / peakWidth * (np.pi / 2)) / 4
256+
257+
# Piecewise decay
258+
decay = np.where(freqList <= f*(decayStartHarmonic + peakWidth), np.sqrt(1.0 / (f*(decayStartHarmonic + peakWidth))), np.sqrt(1.0 / freqList)) / np.sqrt(1.0/(f*(decayStartHarmonic + peakWidth)))
259+
k *= decay
260+
261+
# Normalize the kernel to have a positive area of 1
262+
k /= np.sum(k[k >0])
263+
264+
# Goose kernels with fewer harmonics
265+
k /= harmonicsNormalizer
266+
267+
return k
145268

146269
class PitchTracker:
147270
def __init__(
@@ -150,12 +273,23 @@ def __init__(
150273
fft_size: int = DEFAULT_FFT_SIZE,
151274
overlap: float = 0.75,
152275
sample_rate: int = SAMPLING_RATE,
276+
min_freq: float = DEFAULT_MIN_FREQ,
277+
max_freq: float = DEFAULT_MAX_FREQ,
153278
min_db: float = DEFAULT_MIN_DB,
279+
cres: int = DEFAULT_C_RES,
280+
conf: float = DEFAULT_P_CONF,
281+
p_delta: int = DEFAULT_P_DELTA
154282
):
155283
self.fft_size = fft_size
156284
self.overlap = overlap
157285
self.sample_rate = sample_rate
286+
self.min_freq = min_freq
287+
self.max_freq = max_freq
158288
self.min_db = min_db
289+
self.cres = cres
290+
self.conf = conf
291+
self.p_delta = p_delta
292+
self.prev_f0 = None
159293

160294
self.input_buf = input_buf
161295
self.input_buf.grow_if_needed(fft_size)
@@ -167,6 +301,9 @@ def __init__(
167301
self.proc = audioproc()
168302
self.proc.set_fftsize(self.fft_size)
169303

304+
# Only generate the log-spaced pitch candidates and kernels for the SWIPE algorithm on instantiation.
305+
self._init_swipe()
306+
170307
def set_input_buffer(self, new_buf: RingBuffer) -> None:
171308
self.input_buf = new_buf
172309
self.input_buf.grow_if_needed(self.fft_size)
@@ -194,29 +331,98 @@ def new_frames(self) -> Generator[np.ndarray, None, None]:
194331
self.next_in_offset + self.fft_size, self.fft_size)
195332
self.next_in_offset += m.floor(self.fft_size * (1.0 - self.overlap))
196333

334+
def _init_swipe(self):
335+
"""Initialize log-spaced frequency grid and SWIPE kernels.
336+
337+
Constructs the log-spaced frequency grid (up to Nyquist) and the SWIPE
338+
kernels for frequencies up to the user selected max_freq.
339+
340+
The default pitch resolution (cres) of 10 cents produces 120 pitch
341+
candidates per octave, which, after parabolic interpolation, is sufficient
342+
to generate very precise pitch estimates.
343+
"""
344+
numberOfLogSpacedFreqs = int(np.log2(self.sample_rate / (2 * self.min_freq)) * (1200 / self.cres))
345+
self.logSpacedFreqs = np.logspace(np.log2(self.min_freq), np.log2(self.sample_rate // 2), num=numberOfLogSpacedFreqs, base=2)
346+
347+
# The pitch candidates for the kernels are a subset of the log-spaced freqs only up to the user's max_freq.
348+
fMaxIndex = np.searchsorted(self.logSpacedFreqs, self.max_freq)
349+
self.pitchCandidates = self.logSpacedFreqs[:fMaxIndex]
350+
351+
self.kernels = np.zeros((len(self.pitchCandidates), len(self.logSpacedFreqs)))
352+
for i, freq in enumerate(self.pitchCandidates):
353+
#self.kernels[i] = calcKernel(freq, self.logSpacedFreqs)
354+
self.kernels[i] = calcCosineKernel(freq, self.logSpacedFreqs)
355+
197356
def estimate_pitch(self, frame: np.ndarray) -> Optional[float]:
357+
"""Estimate the fundamental frequency from a single audio frame.
358+
359+
Each frame's spectrum amplitude, linearly spaced, is resampled onto a
360+
log-spaced frequency grid and correlated against SWIPE-like kernels.
361+
The strongest match is refined via parabolic interpolation to achieve
362+
sub-bin pitch precision.
363+
364+
Args:
365+
frame (np.ndarray): A 2D array containing one audio frame.
366+
367+
Returns:
368+
Optional[float]: Estimated pitch in Hz, or NaN if unvoiced.
369+
"""
198370
spectrum = np.abs(np.fft.rfft(frame[0, :] * self.proc.window))
199371

200-
# Compute harmonic product spectrum; the frequency with the largest
201-
# value is quite likely to be a fundamental frequency.
202-
# Chose 3 harmonics for no particularly good reason; empirically this
203-
# seems reasonably effective.
204-
product_count = 3
205-
harmonic_length = spectrum.shape[0] // product_count
206-
hps = spectrum[:harmonic_length]
207-
for i in range(2, product_count + 1):
208-
hps *= spectrum[::i][:harmonic_length]
209-
210-
pitch_idx = np.argmax(hps)
211-
if pitch_idx == 0:
212-
# This should only occur if the HPS is all zero. No pitch, don't
213-
# try to take the log of zero.
214-
return None
215-
216-
# Compute dB for the detected fundamental; if it's too low presume it's
217-
# a false detection and return no result.
218-
db = 10 * np.log10(spectrum[pitch_idx] ** 2 / self.fft_size ** 2)
219-
if db < self.min_db:
220-
return None
372+
# Generate the frequency bins (linear) to that correspond with the spectrum data
373+
freqLin = np.arange(len(spectrum), dtype='float64')
374+
freqLin *= float(self.sample_rate) / float(self.fft_size)
375+
376+
# Create spline mapping from linear frequency to amplitude
377+
# Apply this spline to the log-spaced frequencies and normalize
378+
specLog = np.interp(self.logSpacedFreqs, freqLin, spectrum)
379+
specLogRMS = np.sqrt(np.mean(specLog**2))
380+
specLogNorm = specLog / specLogRMS
381+
382+
# Get the correlation between the audio frame and the kernels
383+
pitchStrengths = np.matmul(self.kernels, specLogNorm)
384+
385+
# Get the highest strength index
386+
idxMax = np.argmax(pitchStrengths)
387+
388+
# The highest values for pitchStrength (confidence) will come from
389+
# voiced pitches that nearly identically match the kernel for that
390+
# pitch, with a peak correlation product of around 2.56.
391+
392+
# Use parabolic interp to get frequency values between resolution bins
393+
if 0 < idxMax < len(pitchStrengths) - 1:
394+
y1 = pitchStrengths[idxMax - 1]
395+
y2 = pitchStrengths[idxMax]
396+
y3 = pitchStrengths[idxMax + 1]
397+
idxShift, _ = fastParabolicInterp(y1, y2, y3)
398+
else:
399+
idxShift = 0
400+
401+
# Generate and refine the pitch estimate by creating a mapping from indices
402+
# to frequency and applying it to idxMax with the interpolated index offset.
403+
f0 = np.interp(idxMax + idxShift, np.arange(len(self.logSpacedFreqs)), self.logSpacedFreqs)
404+
405+
# Get dBFS of the frame.
406+
# Only signals above user's min_db threshold will be considered voiced.
407+
RMS = np.sqrt(np.mean(frame**2))
408+
dBFS = 20 * np.log10(RMS + np.finfo(np.float64).eps)
409+
410+
# Exclude pitch jumps greater than p_delta semitones from the previous pitch
411+
if self.prev_f0 is not None and f0 != np.nan:
412+
semitoneDiff = 12 * np.abs(np.log2(f0 / self.prev_f0))
413+
else:
414+
semitoneDiff = 0
415+
416+
# Scale the correlation strength (confidence) to be between 0 and 1
417+
pitchConf = pitchStrengths[idxMax] / 2.56
418+
419+
# Bool conditions for unreliable pitch estimate
420+
pitchUnreliable = (dBFS < self.min_db) or (pitchConf < self.conf) or (semitoneDiff > self.p_delta)
421+
422+
# Return
423+
if pitchUnreliable:
424+
self.prev_f0 = None
425+
return np.nan
221426
else:
222-
return self.proc.freq[pitch_idx]
427+
self.prev_f0 = f0
428+
return f0

friture/pitch_tracker_settings.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,15 @@
2323

2424
from friture.audiobackend import SAMPLING_RATE
2525

26-
# Roughly the maximum singing range:
27-
DEFAULT_MIN_FREQ = 80
28-
DEFAULT_MAX_FREQ = 1000
29-
DEFAULT_DURATION = 30
30-
DEFAULT_MIN_DB = -70.0
31-
DEFAULT_FFT_SIZE = 16384
26+
# Pitch tracker defaults:
27+
DEFAULT_FFT_SIZE = 4096
28+
DEFAULT_MIN_FREQ = 65
29+
DEFAULT_MAX_FREQ = 1047
30+
DEFAULT_DURATION = 10
31+
DEFAULT_MIN_DB = -50.0
32+
DEFAULT_C_RES = 10 # Pitch resolution in cents. Default of 10 produces 120 kernels per octave.
33+
DEFAULT_P_CONF = 0.50 # Confidence threshold for pitch estimation. Unitless.
34+
DEFAULT_P_DELTA = 2 # Maximum pitch jump between frames in semitones.
3235

3336
class PitchTrackerSettingsDialog(QtWidgets.QDialog):
3437
def __init__(self, parent: QtWidgets.QWidget, view_model: Any) -> None:
@@ -58,14 +61,24 @@ def __init__(self, parent: QtWidgets.QWidget, view_model: Any) -> None:
5861

5962
self.duration = QtWidgets.QSpinBox(self)
6063
self.duration.setMinimum(5)
61-
self.duration.setMaximum(600)
64+
self.duration.setMaximum(60)
6265
self.duration.setSingleStep(1)
6366
self.duration.setValue(DEFAULT_DURATION)
6467
self.duration.setSuffix(" s")
6568
self.duration.setObjectName("duration")
6669
self.duration.valueChanged.connect(view_model.set_duration) # type: ignore
6770
self.form_layout.addRow("Duration:", self.duration)
6871

72+
self.conf = QtWidgets.QDoubleSpinBox(self)
73+
self.conf.setMinimum(0)
74+
self.conf.setMaximum(1)
75+
self.conf.setSingleStep(0.01)
76+
self.conf.setValue(DEFAULT_P_CONF)
77+
self.conf.setSuffix("")
78+
self.conf.setObjectName("conf")
79+
self.conf.valueChanged.connect(view_model.set_conf)
80+
self.form_layout.addRow("Conf. Cut:", self.conf)
81+
6982
self.min_db = QtWidgets.QDoubleSpinBox(self)
7083
self.min_db.setMinimum(-100)
7184
self.min_db.setMaximum(0)
@@ -82,6 +95,7 @@ def save_state(self, settings: QSettings) -> None:
8295
settings.setValue("min_freq", self.min_freq.value())
8396
settings.setValue("max_freq", self.max_freq.value())
8497
settings.setValue("duration", self.duration.value())
98+
settings.setValue("conf", self.conf.value())
8599
settings.setValue("min_db", self.min_db.value())
86100

87101
def restore_state(self, settings: QSettings) -> None:
@@ -91,6 +105,8 @@ def restore_state(self, settings: QSettings) -> None:
91105
settings.value("max_freq", DEFAULT_MAX_FREQ, type=int))
92106
self.duration.setValue(
93107
settings.value("duration", DEFAULT_DURATION, type=int))
108+
self.conf.setValue(
109+
settings.value("conf", DEFAULT_P_CONF, type=float))
94110
self.min_db.setValue(
95111
settings.value("min_db", DEFAULT_MIN_DB, type=float))
96112

0 commit comments

Comments
 (0)