Skip to content

Commit 77f85c3

Browse files
committed
Correct algorithms
1 parent 53a3388 commit 77f85c3

30 files changed

Lines changed: 4528 additions & 0 deletions

.github/workflows/test.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
name: test

on:
  push:
  pull_request:

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 20
          cache: npm
      - run: npm ci
      - run: npm test
      - run: npm run quality -- --ci

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
/node_modules
2+
/.work
3+
/.playwright-mcp
4+
*.tgz
5+
/.claude

README.md

Lines changed: 249 additions & 0 deletions
Large diffs are not rendered by default.

demo.html

Lines changed: 555 additions & 0 deletions
Large diffs are not rendered by default.

formant-shift.js

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
import { fft, ifft } from 'fourier-transform'
2+
import { stftBatch, stftStream } from './stft.js'
3+
import { matchGain, wrapPhase, makePitchShift, resolvePitchParams } from './util.js'
4+
// Formant-preserving pitch shift. The spectral envelope is extracted via cepstral liftering
// (low-quefrency coefficients) from a smoothed log-magnitude of the original frame. A
// peak-locked phase vocoder then shifts pitch (reusing the phase-lock architecture so partials
// stay coherent). Finally the shifted magnitude is divided by the envelope it carried along
// (the original envelope read at bin k/ratio) and multiplied by the original envelope at bin k,
// re-imposing vowel timbre on the shifted pitch.
10+
/**
 * Estimate a smooth spectral envelope by cepstral liftering.
 *
 * The log-magnitude spectrum is inverse-transformed, only the lowest quefrency
 * coefficients (and their mirror images) are kept, and the result is transformed
 * back and exponentiated.
 *
 * @param {ArrayLike<number>} mag - Bins 0..N/2. Linear magnitude, or log-magnitude when
 *   `preLog` is true.
 * @param {number} N - Frame size; bins 0..(N >> 1) of `mag` are read.
 * @param {number} liftCutoff - Exclusive upper quefrency index kept by the lifter;
 *   clamped to (N >> 1) - 1.
 * @param {boolean} [preLog=false] - If true, `mag` is already log-magnitude (skip the log step).
 * @returns {Float64Array} Envelope of length (N >> 1) + 1 in the linear domain.
 */
function cepstralEnvelope(mag, N, liftCutoff, preLog = false) {
  let half = N >> 1
  let logMag = new Float64Array(half + 1)
  let zeroIm = new Float64Array(half + 1)
  // Floor at 1e-8 so silent bins do not yield log(0) = -Infinity.
  for (let k = 0; k <= half; k++) logMag[k] = preLog ? mag[k] : Math.log(Math.max(1e-8, mag[k]))

  // NOTE(review): assumes ifft(re, im, out) fills and returns an N-sample real sequence
  // from a half spectrum — confirm against the fourier-transform package docs.
  let cep = ifft(logMag, zeroIm, new Float64Array(N))

  // Lifter: keep c[0] plus quefrencies 1..cutoff-1 together with their mirrors at N-n;
  // every other coefficient stays zero.
  let lifted = new Float64Array(N)
  lifted[0] = cep[0]
  let cutoff = Math.min(liftCutoff, half - 1)
  for (let n = 1; n < cutoff; n++) {
    lifted[n] = cep[n]
    lifted[N - n] = cep[N - n]
  }

  // Only the real part of the forward transform is used as the smoothed log spectrum.
  let [envLogRe] = fft(lifted)
  let env = new Float64Array(half + 1)
  for (let k = 0; k <= half; k++) env[k] = Math.exp(envLogRe[k])
  return env
}
33+
/**
 * Locate local maxima of a magnitude spectrum.
 *
 * A bin k in 1..half-1 is a peak when it is strictly greater than both direct
 * neighbours and at least 0.5% of the spectrum maximum (never below 1e-8).
 * Bins 0 and `half` are never reported.
 *
 * @param {ArrayLike<number>} mag - Magnitudes for bins 0..half.
 * @param {number} half - Index of the last bin.
 * @returns {number[]} Peak bin indices in ascending order.
 */
function findPeaks(mag, half) {
  // First-order comparison; ±2 shadows closely-spaced chord partials (see phase-lock.js).
  let maxM = 0
  for (let k = 0; k <= half; k++) if (mag[k] > maxM) maxM = mag[k]
  // Relative floor rejects noise-level ripples; the absolute floor covers all-zero frames.
  let floor = Math.max(1e-8, maxM * 0.005)
  let peaks = []
  for (let k = 1; k < half; k++) {
    let v = mag[k]
    if (v < floor) continue
    if (v > mag[k - 1] && v > mag[k + 1]) peaks.push(k)
  }
  return peaks
}
47+
/**
 * Find which peak a bin belongs to (the nearest one).
 *
 * @param {ArrayLike<number>} peaks - Peak bin indices in ascending order.
 * @param {number} k - Bin to classify.
 * @returns {number} Index INTO `peaks` of the nearest peak, or -1 when `peaks` is empty.
 *   On an exact tie the lower peak wins.
 */
function assignedPeak(peaks, k) {
  if (!peaks.length) return -1
  // Binary search for the first peak >= k (or the last peak if none is).
  let lo = 0, hi = peaks.length - 1
  while (lo < hi) {
    let mid = (lo + hi) >> 1
    if (peaks[mid] < k) lo = mid + 1
    else hi = mid
  }
  // The left neighbour may be closer; `<=` resolves ties toward the lower peak.
  if (lo > 0 && Math.abs(peaks[lo - 1] - k) <= Math.abs(peaks[lo] - k)) return lo - 1
  return lo
}
59+
/**
 * Build the per-frame callback for formant-preserving pitch shifting.
 *
 * @param {number} ratio - Factor applied to each peak's estimated frequency.
 * @param {number} envelopeWidth - Lifter cutoff forwarded to cepstralEnvelope.
 * @returns {Function} `process(mag, phase, state, ctx)` returning `{ mag, phase }` as fresh
 *   Float64Arrays of length half + 1. `ctx` supplies `{ N, half, hop, freqPerBin }`.
 *   `state` is mutated: on first use it gains `prev` (previous analysis phase), `syn`
 *   (accumulated synthesis phase per source bin), `logMagAvg` (smoothed log-magnitude)
 *   and `first` (true only until the first frame has been processed).
 */
function makeProcess(ratio, envelopeWidth) {
  return function process(mag, phase, state, ctx) {
    let { N, half, hop, freqPerBin } = ctx
    if (!state.prev) {
      state.prev = new Float64Array(half + 1)
      state.syn = new Float64Array(half + 1)
      state.logMagAvg = new Float64Array(half + 1)
      state.first = true
    }
    let { prev, syn, logMagAvg } = state

    // 1. Original spectral envelope extracted from a smoothed log-magnitude.
    // Computing the envelope per-frame directly causes inter-partial bins to fluctuate at
    // the chord beat frequency (e.g. 55 Hz for a 220/275 Hz pair). That 55 Hz beat aliases
    // against the 86 Hz frame rate into ~31 Hz flutter on the correction factor — audible
    // as a soft click on raised chord material. An EMA of log(mag) with α=0.4 (τ ≈ 13 ms
    // at hop=512 / 44.1 kHz) stabilises the envelope: it converges within 5τ ≈ 65 ms
    // (before the 20%-skip activeRegion window opens) and attenuates the 55 Hz oscillation
    // by ≈2×, bringing it below the flicker perception threshold.
    // Note: α weights the PREVIOUS average; the first frame seeds the average directly.
    let alpha = 0.4
    for (let k = 0; k <= half; k++) {
      let lm = Math.log(Math.max(1e-8, mag[k]))
      logMagAvg[k] = state.first ? lm : alpha * logMagAvg[k] + (1 - alpha) * lm
    }
    let origEnv = cepstralEnvelope(logMagAvg, N, envelopeWidth, true) // pre-log mode

    // 2. Peak-locked phase vocoder shift — same logic as phase-lock.js. Peaks scatter to
    // shifted dest bins, their region of influence is carried along, and per-peak phase
    // is advanced at the shifted instantaneous frequency.
    let peaks = findPeaks(mag, half)
    let newMag = new Float64Array(half + 1)
    let newPhase = new Float64Array(half + 1)
    let peakDest = new Int32Array(peaks.length)
    let peakSynPhase = new Float64Array(peaks.length)

    for (let i = 0; i < peaks.length; i++) {
      let k = peaks[i]
      let trueFreq
      if (state.first) {
        // No previous phase yet: fall back to the bin-centre frequency.
        trueFreq = k * freqPerBin
      } else {
        // Deviation of the measured phase advance from the bin-centre advance over one hop.
        // NOTE(review): presumably freqPerBin is in radians per sample — confirm in stft.js.
        let dp = wrapPhase(phase[k] - prev[k] - k * freqPerBin * hop)
        trueFreq = k * freqPerBin + dp / hop
      }
      let shifted = trueFreq * ratio
      let destBin = Math.round(shifted / freqPerBin)
      // Peaks shifted outside 0..half are dropped; -1 marks them for the scatter loop.
      if (destBin < 0 || destBin > half) { peakDest[i] = -1; continue }
      let newSyn = wrapPhase(syn[k] + shifted * hop)
      peakDest[i] = destBin
      peakSynPhase[i] = newSyn
      syn[k] = newSyn
    }

    for (let k = 0; k <= half; k++) {
      let pi = assignedPeak(peaks, k)
      if (pi < 0) continue
      let pk = peaks[pi]
      let destBin = peakDest[pi]
      if (destBin < 0) continue
      // Each bin keeps its offset from the peak it is assigned to.
      let dest = destBin + (k - pk)
      if (dest < 0 || dest > half) continue
      // Preserve the bin's phase relation to its peak.
      let p = peakSynPhase[pi] + (phase[k] - phase[pk])
      // When several source bins land on one dest bin, the loudest (latest on ties) wins.
      if (mag[k] >= newMag[dest]) {
        newMag[dest] = mag[k]
        newPhase[dest] = p
      }
    }

    for (let k = 0; k <= half; k++) prev[k] = phase[k]
    state.first = false

    // 3. Re-impose the original vocal-tract envelope. The naive shift carried the envelope
    // along with the pitch — output bin k carries the original envelope at k/ratio. Divide
    // that out, multiply by origEnv[k]. origEnv is extracted from the log-magnitude average
    // so the correction is already temporally stable (see step 1 above).
    for (let k = 0; k <= half; k++) {
      let src = k / ratio
      let i = src | 0
      let f = src - i
      // Linear interpolation between the two neighbouring envelope bins, clamped at `half`.
      let a = origEnv[Math.min(i, half)]
      let b = origEnv[Math.min(i + 1, half)]
      let shiftedEnvK = a * (1 - f) + b * f
      let corr = origEnv[k] / Math.max(1e-8, shiftedEnvK)
      // Limit the correction gain to the range [1/8, 8].
      if (corr > 8) corr = 8
      if (corr < 0.125) corr = 0.125
      newMag[k] *= corr
    }

    return { mag: newMag, phase: newPhase }
  }
}
151+
/**
 * One-shot formant-preserving pitch shift.
 *
 * @param {*} data - Input signal, passed to stftBatch and used as the reference for matchGain.
 * @param {object} [opts] - Options. `frameSize` defaults to 2048; `envelopeWidth` defaults to
 *   max(8, round(frameSize / 64)). The remaining options are forwarded to stftBatch, and
 *   resolvePitchParams(opts) supplies `ratio`.
 * @returns {*} Whatever matchGain(out, data) returns.
 *   NOTE(review): presumably the shifted signal gain-matched to the input — confirm in util.js.
 */
function formantBatch(data, opts) {
  let { ratio } = resolvePitchParams(opts)
  let frameSize = opts?.frameSize ?? 2048
  let envelopeWidth = opts?.envelopeWidth ?? Math.max(8, Math.round(frameSize / 64))
  let out = stftBatch(data, makeProcess(ratio, envelopeWidth), { ...opts, ratio, frameSize })
  return matchGain(out, data)
}
159+
/**
 * Streaming formant-preserving pitch shift.
 *
 * @param {object} [opts] - Same options and defaults as the batch variant: `frameSize`
 *   defaults to 2048, `envelopeWidth` to max(8, round(frameSize / 64)).
 * @returns {(chunk?: *) => *} Call with a chunk to write it to the underlying stftStream;
 *   call with `undefined` (or no argument) to flush it.
 */
function formantStream(opts) {
  let { ratio } = resolvePitchParams(opts)
  let frameSize = opts?.frameSize ?? 2048
  let envelopeWidth = opts?.envelopeWidth ?? Math.max(8, Math.round(frameSize / 64))
  let s = stftStream(makeProcess(ratio, envelopeWidth), { ...opts, ratio, frameSize })
  return (chunk) => chunk === undefined ? s.flush() : s.write(chunk)
}
167+
// Default export: the batch and stream implementations combined by makePitchShift (util.js).
export default makePitchShift(formantBatch, formantStream)

granular.js

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import { bufferedStream, makePitchShift, resampleTo, resolvePitchParams } from './util.js'
2+
import { wsolaStretch } from './stretch.js'
3+
4+
// Granular pitch shift: small (1024-sample) Hann-windowed grains OLA-stretched with a
5+
// minimal similarity-lock window so tonal material stays audible and on-pitch. Distinct
6+
// from wsola in that the grains are half the size — the classic "granular synthesis"
7+
// grainy character on busy material, without the catastrophic dropout that pure OLA
8+
// (delta=0) suffers on chord/voice input.
9+
/**
 * One-shot granular pitch shift: stretch with wsolaStretch, then resample back to the
 * input length.
 *
 * @param {*} data - Input signal; its `length` is the target length of the result.
 * @param {object} [opts] - Options. `frameSize` defaults to 1024, `delta` defaults to
 *   max(16, frameSize >> 3), and `hopSize` is forwarded as given (possibly undefined).
 *   resolvePitchParams(opts) supplies `ratio`.
 * @returns {*} Result of resampleTo(stretched, data.length).
 */
function granularBatch(data, opts) {
  let frameSize = opts?.frameSize ?? 1024
  let { ratio } = resolvePitchParams(opts)
  let stretched = wsolaStretch(data, ratio, {
    frameSize,
    hopSize: opts?.hopSize,
    delta: opts?.delta ?? Math.max(16, frameSize >> 3),
  })
  return resampleTo(stretched, data.length)
}
20+
// Streaming variant: delegates to bufferedStream (util.js), handing it the batch function.
let granularStream = (opts) => bufferedStream(granularBatch, opts)
22+
// Default export: the batch and stream implementations combined by makePitchShift (util.js).
export default makePitchShift(granularBatch, granularStream)

0 commit comments

Comments
 (0)