Skip to content

Commit 777492f

Browse files
committed
fix: complete TTS audio rewrite - real warmup, remove destructive cancel, retry logic
1 parent 4348b2a commit 777492f

1 file changed

Lines changed: 81 additions & 25 deletions

File tree

interview/proctored-room.html

Lines changed: 81 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -2919,24 +2919,34 @@ <h1><span class="brand-simp">Simpatico</span><span class="brand-hr">HR</span></h
29192919
// ══════════════════════════════════════════════════════════════
29202920
// START SESSION
29212921
// ══════════════════════════════════════════════════════════════
2922-
async function startSession() {
2923-
// ── CRITICAL: Warm up TTS engine on user gesture ──
2924-
// Chrome's autoplay policy requires the FIRST speechSynthesis.speak()
2925-
// to happen during a user gesture callback. Since we do async work
2926-
// (API fetch) before speaking, the gesture context expires and Chrome
2927-
// silently drops all subsequent speak() calls. This empty utterance
2928-
// "unlocks" the TTS engine while we still have gesture context.
2922+
// Track whether TTS has been unlocked by a user gesture
2923+
let _ttsUnlocked = false;
2924+
2925+
// Unlock TTS with a real utterance (empty strings are silently ignored by Chrome)
2926+
function unlockTTS() {
29292927
try {
2930-
const warmup = new SpeechSynthesisUtterance('');
2931-
warmup.volume = 0;
2932-
warmup.rate = 10; // fastest possible
2928+
state.voice.synth.cancel(); // clear any stale queue
2929+
const warmup = new SpeechSynthesisUtterance('.');
2930+
warmup.volume = 0.01; // near-silent but non-zero (volume 0 may not init audio pipeline)
2931+
warmup.rate = 10; // fastest possible
2932+
warmup.pitch = 0.01;
29332933
if (state.voice.preferredVoice) warmup.voice = state.voice.preferredVoice;
29342934
warmup.lang = state.interviewLanguage || 'en-IN';
2935+
warmup.onend = () => { _ttsUnlocked = true; console.log('[TTS] ✓ Engine unlocked via warmup'); };
2936+
warmup.onerror = () => { console.warn('[TTS] Warmup utterance error — will retry'); };
29352937
state.voice.synth.speak(warmup);
2936-
console.log('[TTS] Engine warmed up on user gesture');
2938+
console.log('[TTS] Warmup utterance dispatched (text=".", vol=0.01)');
29372939
} catch (e) {
29382940
console.warn('[TTS] Warmup failed:', e);
29392941
}
2942+
}
2943+
2944+
async function startSession() {
2945+
// ── CRITICAL: Warm up TTS engine on user gesture ──
2946+
// Chrome's autoplay policy requires the FIRST speechSynthesis.speak()
2947+
// to happen during a user gesture callback. Must use NON-EMPTY text
2948+
// (empty strings are silently ignored) with non-zero volume.
2949+
unlockTTS();
29402950

29412951
// Request screen share
29422952
if (!state.media.screen) {
@@ -2963,6 +2973,12 @@ <h1><span class="brand-simp">Simpatico</span><span class="brand-hr">HR</span></h
29632973
}
29642974
}
29652975

2976+
// Retry the TTS unlock after getDisplayMedia if the warmup has not completed yet (the system dialog breaks gesture context)
2977+
if (!_ttsUnlocked) {
2978+
console.log('[TTS] Re-unlocking after screen share dialog...');
2979+
unlockTTS();
2980+
}
2981+
29662982
document.getElementById('setupOverlay').style.display = 'none';
29672983
document.getElementById('interviewApp').style.display = 'flex';
29682984
document.getElementById('selfVid').srcObject = state.media.stream;
@@ -2987,8 +3003,13 @@ <h1><span class="brand-simp">Simpatico</span><span class="brand-hr">HR</span></h
29873003
setOrbState('thinking');
29883004
const introQuestion = await engine.generateNextQuestion(null, null);
29893005

2990-
// Small delay to let warmup utterance finish naturally
2991-
await sleep(200);
3006+
// Wait for TTS warmup to actually complete (onend sets _ttsUnlocked)
3007+
// The '.' utterance at rate=10 finishes in ~50ms, but we wait up to 2s as safety
3008+
const ttsWaitStart = Date.now();
3009+
while (!_ttsUnlocked && (Date.now() - ttsWaitStart) < 2000) {
3010+
await sleep(50);
3011+
}
3012+
console.log('[TTS] Warmup wait done, unlocked=', _ttsUnlocked, 'waited=', Date.now() - ttsWaitStart, 'ms');
29923013

29933014
await aiSpeak(introQuestion);
29943015
}
@@ -3034,11 +3055,11 @@ <h1><span class="brand-simp">Simpatico</span><span class="brand-hr">HR</span></h
30343055
const listenVizEl = document.getElementById('listenViz');
30353056
if (listenVizEl) listenVizEl.classList.remove('active');
30363057

3037-
// Only cancel if something is actually playing (avoids Chrome bug
3038-
// where cancel() right before speak() kills the engine)
3039-
if (state.voice.synth.speaking || state.voice.synth.pending) {
3040-
state.voice.synth.cancel();
3041-
}
3058+
// ── DO NOT call synth.cancel() here ──
3059+
// Chrome bug: cancel() right before speak() kills the internal audio
3060+
// pipeline, causing all subsequent speak() calls to produce no sound.
3061+
// The queue is already clean because unlockTTS() cancels stale utterances
3062+
// before dispatching the warmup, and each chunk completes naturally via onend/onerror.
30423063

30433064
// If no voices are available, try reloading them
30443065
if (!state.voice.preferredVoice) {
@@ -3148,19 +3169,54 @@ <h1><span class="brand-simp">Simpatico</span><span class="brand-hr">HR</span></h
31483169
};
31493170

31503171
try {
3172+
u.onstart = () => {
3173+
console.log('[TTS] ▶ Chunk', idx, 'started playing, text:', chunkText.substring(0, 40) + '...');
3174+
};
31513175
state.voice.synth.speak(u);
31523176
startTTSKeepAlive();
31533177

3154-
// Extra safety: if synth is not speaking after 500ms, it failed silently
3178+
// Extra safety: if synth is not speaking after 800ms, try re-speaking
31553179
setTimeout(() => {
31563180
if (!resolved && !state.voice.synth.speaking && !state.voice.synth.pending) {
3157-
console.warn('[TTS] Speech did not start, skipping chunk', idx);
3158-
resolved = true;
3159-
clearTimeout(watchdog);
3160-
idx++;
3161-
speakNext();
3181+
console.warn('[TTS] Speech did not start after 800ms, chunk', idx,
3182+
'synth.paused=', state.voice.synth.paused,
3183+
'voices=', state.voice.synth.getVoices().length,
3184+
'voice=', state.voice.preferredVoice?.name || 'NONE');
3185+
// Try one more time — sometimes Chrome needs a retry
3186+
try {
3187+
const retry = new SpeechSynthesisUtterance(chunkText);
3188+
retry.lang = u.lang;
3189+
if (state.voice.preferredVoice) retry.voice = state.voice.preferredVoice;
3190+
retry.rate = u.rate;
3191+
retry.pitch = u.pitch;
3192+
retry.volume = u.volume;
3193+
retry.onend = () => {
3194+
if (!resolved && state.voice.isSpeaking) {
3195+
resolved = true;
3196+
clearTimeout(watchdog);
3197+
idx++;
3198+
speakNext();
3199+
}
3200+
};
3201+
retry.onerror = () => {
3202+
if (!resolved && state.voice.isSpeaking) {
3203+
resolved = true;
3204+
clearTimeout(watchdog);
3205+
idx++;
3206+
speakNext();
3207+
}
3208+
};
3209+
state.voice.synth.speak(retry);
3210+
console.log('[TTS] Retry speak for chunk', idx);
3211+
} catch (retryErr) {
3212+
console.warn('[TTS] Retry also failed:', retryErr);
3213+
resolved = true;
3214+
clearTimeout(watchdog);
3215+
idx++;
3216+
speakNext();
3217+
}
31623218
}
3163-
}, 500);
3219+
}, 800);
31643220
} catch (ttsErr) {
31653221
console.error('[TTS] speak() threw:', ttsErr);
31663222
resolved = true;

0 commit comments

Comments
 (0)