Skip to content

Commit 99b6dd3

Browse files
committed
feat: cloud TTS via Cloudflare Workers AI (Deepgram Aura-1)
Completely new approach to interview audio:
- PRIMARY: Cloud TTS via the /ai/tts endpoint using @cf/deepgram/aura-1
- Returns WAV audio, played via an HTML Audio element
- No gesture context required (bypasses Chrome autoplay issues)
- Natural-sounding voice from Deepgram
- FALLBACK: Browser speechSynthesis (if cloud fails)
- Simplified speakNext with watchdog timers

Backend: added handleTTS handler + POST /ai/tts route.
Frontend: aiSpeakCloud() tries cloud first; aiSpeak() orchestrates.
1 parent c89596d commit 99b6dd3

2 files changed

Lines changed: 139 additions & 84 deletions

File tree

backend/simpatico-ats.js

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1231,6 +1231,7 @@ route("POST", "/ai/chat/stream", handleAIChatStream);
12311231
route("POST", "/ai/employee-insight", handleEmployeeInsight);
12321232
route("POST", "/ai/sentiment", handleSentimentAnalysis);
12331233
route("POST", "/ai/interview-question", handleInterviewQuestion);
1234+
route("POST", "/ai/tts", handleTTS);
12341235
route("POST", "/ai/ats-generator", handleATSGenerator);
12351236
route("POST", "/ai/hr-automation-generator", handleHRAutomationGenerator);
12361237
route("POST", "/ai/generate-assessment", handleGenerateAssessment);
@@ -4099,6 +4100,38 @@ async function handleInterviewQuestion(request, env, ctx) {
40994100
});
41004101
}
41014102

4103+
// ── TTS via Cloudflare Workers AI (Deepgram Aura) ──────────────────────────
4104+
/**
 * POST /ai/tts — turn the request's `text` field into speech using the
 * Workers AI binding (`env.AI`) and the `@cf/deepgram/aura-1` model.
 *
 * @param {Request} request - Incoming request; its JSON body must carry a non-empty string `text`.
 * @param {object} env - Worker bindings; `env.AI` must be present.
 * @param {object} ctx - Request context (unused by this handler).
 * @returns {Promise<Response>} 200 response whose body is whatever `env.AI.run` returned.
 * @throws {ValidationError} When `text` is missing, not a string, or blank.
 * @throws {ServiceUnavailableError} When the AI binding is not configured.
 * @throws {AppError} Code "TTS_ERROR" (500) when generation fails; the original error is kept on `.cause`.
 */
async function handleTTS(request, env, ctx) {
  // Destructuring null/undefined would throw a bare TypeError; fall back to an
  // empty object so a missing or non-object body reaches the validation error below.
  const payload = (await safeJson(request)) || {};
  const { text } = payload;
  if (!text || typeof text !== 'string' || text.trim().length === 0) {
    throw new ValidationError("text is required");
  }

  if (!env.AI) throw new ServiceUnavailableError("AI");

  // Limit text to prevent abuse
  const cleanText = text.trim().slice(0, 5000);
  console.log(`[TTS] Generating speech: ${cleanText.length} chars`);

  try {
    const audioResponse = await env.AI.run("@cf/deepgram/aura-1", {
      text: cleanText,
    });

    // audioResponse is an ArrayBuffer or ReadableStream
    // NOTE(review): the model is invoked with `text` only (no encoding/container
    // options) — confirm its default output really is WAV, otherwise this
    // Content-Type mislabels the audio.
    const headers = {
      ...CORS_HEADERS,
      "Content-Type": "audio/wav",
      "Cache-Control": "public, max-age=3600",
    };

    return new Response(audioResponse, { status: 200, headers });
  } catch (err) {
    console.error(`[TTS] AI.run error: ${err.message}`);
    const wrapped = new AppError(`TTS generation failed: ${err.message}`, 500, "TTS_ERROR");
    // Keep the underlying failure reachable for logging/debugging.
    wrapped.cause = err;
    throw wrapped;
  }
}
4134+
41024135
async function handleExpenseOCR(request, env, ctx) {
41034136
requireAuth(ctx);
41044137
const { receipt_text } = await safeJson(request);

interview/proctored-room.html

Lines changed: 106 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -3035,8 +3035,72 @@ <h1><span class="brand-simp">Simpatico</span><span class="brand-hr">HR</span></h
30353035
if (_ttsKeepAliveInt) { clearInterval(_ttsKeepAliveInt); _ttsKeepAliveInt = null; }
30363036
}
30373037

3038+
// ── Cloud TTS (primary) with speechSynthesis fallback ──
3039+
// Uses Cloudflare Workers AI @cf/deepgram/aura-1 to generate audio
3040+
// Falls back to browser speechSynthesis if cloud TTS fails
3041+
let _cloudTTSAvailable = null; // null = untested, true/false = tested
3042+
3043+
/**
 * Request speech for `text` from the `${API}/ai/tts` endpoint and play the
 * returned audio through an HTML Audio element.
 *
 * @param {string} text - Text to speak; `**` markers are stripped and whitespace is collapsed first.
 * @returns {Promise<boolean>} Resolves `true` when playback ends, or after the 60s safety timeout.
 * @throws {Error} On a non-OK HTTP status, an audio payload under 100 bytes,
 *   an Audio element error, or a rejected `audio.play()`.
 */
async function aiSpeakCloud(text) {
  const cleanText = text.replace(/\*\*/g, '').replace(/\n/g, ' ').replace(/\s+/g, ' ').trim();
  console.log('[CloudTTS] Requesting speech:', cleanText.substring(0, 60) + '...');

  try {
    const res = await fetch(`${API}/ai/tts`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text: cleanText })
    });

    if (!res.ok) {
      const errText = await res.text().catch(() => 'unknown');
      throw new Error(`HTTP ${res.status}: ${errText.substring(0, 200)}`);
    }

    const audioBlob = await res.blob();
    if (audioBlob.size < 100) {
      throw new Error('Audio response too small');
    }

    const audioUrl = URL.createObjectURL(audioBlob);
    const audio = new Audio(audioUrl);
    audio.volume = 1.0;

    // `await` keeps playback failures inside this try block so they are
    // logged by the catch below like every other failure.
    return await new Promise((resolve, reject) => {
      let settled = false;
      let timeout = null;

      // Single exit point: whichever of ended / error / timeout / play()
      // rejection happens first clears the timer and releases the object URL
      // exactly once; later events are ignored.
      const finish = (settle, value) => {
        if (settled) return;
        settled = true;
        clearTimeout(timeout);
        URL.revokeObjectURL(audioUrl);
        settle(value);
      };

      audio.onended = () => {
        console.log('[CloudTTS] Playback complete');
        finish(resolve, true);
      };
      audio.onerror = (e) => {
        console.warn('[CloudTTS] Playback error:', e);
        finish(reject, new Error('Audio playback failed'));
      };

      // Timeout safety: if audio doesn't end in 60s, resolve anyway.
      // NOTE(review): playback is not stopped here, so audio longer than 60s
      // keeps playing after the promise resolves — confirm that is intended.
      timeout = setTimeout(() => {
        finish(resolve, true);
      }, 60000);

      audio.play().catch((err) => {
        finish(reject, err);
      });
    });
  } catch (err) {
    console.warn('[CloudTTS] Failed:', err.message);
    throw err;
  }
}
3101+
30383102
function aiSpeak(text) {
3039-
return new Promise((resolve) => {
3103+
return new Promise(async (resolve) => {
30403104
state.voice.isSpeaking = true;
30413105
state.voice.activeResolve = resolve;
30423106

@@ -3056,36 +3120,6 @@ <h1><span class="brand-simp">Simpatico</span><span class="brand-hr">HR</span></h
30563120
const listenVizEl = document.getElementById('listenViz');
30573121
if (listenVizEl) listenVizEl.classList.remove('active');
30583122

3059-
// ── DO NOT call synth.cancel() here ──
3060-
// Chrome bug: cancel() right before speak() kills the internal audio
3061-
// pipeline, causing all subsequent speak() calls to produce no sound.
3062-
// The queue is already clean because we cancel in startSession after
3063-
// warmup, and each chunk completes naturally via onend/onerror.
3064-
3065-
// If no voices are available, try reloading them
3066-
if (!state.voice.preferredVoice) {
3067-
loadVoices();
3068-
// If still no voice after reload, wait briefly for async voice load
3069-
if (!state.voice.preferredVoice && state.voice.synth.getVoices().length === 0) {
3070-
console.warn('[TTS] No voices available — waiting for async load...');
3071-
}
3072-
}
3073-
3074-
// Chunk long text
3075-
const sentences = cleanText.match(/[^.!?]+[.!?]+/g) || [cleanText];
3076-
const chunks = [];
3077-
let current = '';
3078-
for (const s of sentences) {
3079-
if ((current + s).length > 160) {
3080-
if (current) chunks.push(current.trim());
3081-
current = s;
3082-
} else { current += s; }
3083-
}
3084-
if (current.trim()) chunks.push(current.trim());
3085-
3086-
let idx = 0;
3087-
state.voice.keepAlive = [];
3088-
30893123
function finishSpeaking() {
30903124
stopTTSKeepAlive();
30913125
state.voice.isSpeaking = false;
@@ -3097,7 +3131,6 @@ <h1><span class="brand-simp">Simpatico</span><span class="brand-hr">HR</span></h
30973131
}
30983132

30993133
if (engine.phase === 'closing') {
3100-
// Closing phase: stop all recognition, then endSession
31013134
state.voice.isListening = false;
31023135
clearTimeout(state.voice.silenceTimer);
31033136
try { if (state.voice.recognition) state.voice.recognition.stop(); } catch (e) { }
@@ -3113,6 +3146,43 @@ <h1><span class="brand-simp">Simpatico</span><span class="brand-hr">HR</span></h
31133146
}
31143147
}
31153148

3149+
// ── PRIMARY: Try Cloud TTS ──
3150+
if (_cloudTTSAvailable !== false) {
3151+
try {
3152+
console.log('[TTS] Trying Cloud TTS...');
3153+
await aiSpeakCloud(cleanText);
3154+
_cloudTTSAvailable = true;
3155+
console.log('[TTS] Cloud TTS succeeded');
3156+
finishSpeaking();
3157+
return;
3158+
} catch (cloudErr) {
3159+
console.warn('[TTS] Cloud TTS failed, falling back to browser:', cloudErr.message);
3160+
_cloudTTSAvailable = false;
3161+
}
3162+
}
3163+
3164+
// ── FALLBACK: Browser speechSynthesis ──
3165+
console.log('[TTS] Using browser speechSynthesis fallback');
3166+
3167+
if (!state.voice.preferredVoice) {
3168+
loadVoices();
3169+
}
3170+
3171+
// Chunk long text for speechSynthesis (has length limits)
3172+
const sentences = cleanText.match(/[^.!?]+[.!?]+/g) || [cleanText];
3173+
const chunks = [];
3174+
let current = '';
3175+
for (const s of sentences) {
3176+
if ((current + s).length > 160) {
3177+
if (current) chunks.push(current.trim());
3178+
current = s;
3179+
} else { current += s; }
3180+
}
3181+
if (current.trim()) chunks.push(current.trim());
3182+
3183+
let idx = 0;
3184+
state.voice.keepAlive = [];
3185+
31163186
function speakNext() {
31173187
if (!state.voice.isSpeaking) return;
31183188

@@ -3125,7 +3195,6 @@ <h1><span class="brand-simp">Simpatico</span><span class="brand-hr">HR</span></h
31253195
const u = new SpeechSynthesisUtterance(chunkText);
31263196
state.voice.keepAlive.push(u);
31273197

3128-
// Set the language on the utterance so the browser picks the correct voice
31293198
u.lang = state.interviewLanguage || 'en-IN';
31303199
if (state.voice.preferredVoice) u.voice = state.voice.preferredVoice;
31313200
u.rate = 0.92;
@@ -3134,17 +3203,13 @@ <h1><span class="brand-simp">Simpatico</span><span class="brand-hr">HR</span></h
31343203

31353204
if (subtitle) subtitle.textContent = chunkText;
31363205

3137-
// Chrome bug: onend sometimes never fires.
3138-
// Use a watchdog timer based on estimated speech duration.
3139-
// Average speaking rate ~150 words/min at rate 0.92 ≈ ~138 wpm
3140-
// = ~2.3 words/sec. Add generous buffer.
31413206
const wordCount = chunkText.split(/\s+/).length;
31423207
const estimatedMs = Math.max(2000, (wordCount / 2.3) * 1000 + 1500);
31433208
let resolved = false;
31443209

31453210
const watchdog = setTimeout(() => {
31463211
if (!resolved && state.voice.isSpeaking) {
3147-
console.warn('[TTS] Watchdog: onend did not fire, advancing chunk', idx);
3212+
console.warn('[TTS-Browser] Watchdog: advancing chunk', idx);
31483213
resolved = true;
31493214
idx++;
31503215
speakNext();
@@ -3160,7 +3225,7 @@ <h1><span class="brand-simp">Simpatico</span><span class="brand-hr">HR</span></h
31603225
}
31613226
};
31623227
u.onerror = (e) => {
3163-
console.warn('[TTS] Utterance error:', e.error || e);
3228+
console.warn('[TTS-Browser] Utterance error:', e.error || e);
31643229
if (!resolved && state.voice.isSpeaking) {
31653230
resolved = true;
31663231
clearTimeout(watchdog);
@@ -3171,55 +3236,12 @@ <h1><span class="brand-simp">Simpatico</span><span class="brand-hr">HR</span></h
31713236

31723237
try {
31733238
u.onstart = () => {
3174-
console.log('[TTS] ▶ Chunk', idx, 'started playing, text:', chunkText.substring(0, 40) + '...');
3239+
console.log('[TTS-Browser] Chunk', idx, 'started');
31753240
};
31763241
state.voice.synth.speak(u);
31773242
startTTSKeepAlive();
3178-
3179-
// Extra safety: if synth is not speaking after 800ms, try re-speaking
3180-
setTimeout(() => {
3181-
if (!resolved && !state.voice.synth.speaking && !state.voice.synth.pending) {
3182-
console.warn('[TTS] Speech did not start after 800ms, chunk', idx,
3183-
'synth.paused=', state.voice.synth.paused,
3184-
'voices=', state.voice.synth.getVoices().length,
3185-
'voice=', state.voice.preferredVoice?.name || 'NONE');
3186-
// Try one more time — sometimes Chrome needs a retry
3187-
try {
3188-
const retry = new SpeechSynthesisUtterance(chunkText);
3189-
retry.lang = u.lang;
3190-
if (state.voice.preferredVoice) retry.voice = state.voice.preferredVoice;
3191-
retry.rate = u.rate;
3192-
retry.pitch = u.pitch;
3193-
retry.volume = u.volume;
3194-
retry.onend = () => {
3195-
if (!resolved && state.voice.isSpeaking) {
3196-
resolved = true;
3197-
clearTimeout(watchdog);
3198-
idx++;
3199-
speakNext();
3200-
}
3201-
};
3202-
retry.onerror = () => {
3203-
if (!resolved && state.voice.isSpeaking) {
3204-
resolved = true;
3205-
clearTimeout(watchdog);
3206-
idx++;
3207-
speakNext();
3208-
}
3209-
};
3210-
state.voice.synth.speak(retry);
3211-
console.log('[TTS] Retry speak for chunk', idx);
3212-
} catch (retryErr) {
3213-
console.warn('[TTS] Retry also failed:', retryErr);
3214-
resolved = true;
3215-
clearTimeout(watchdog);
3216-
idx++;
3217-
speakNext();
3218-
}
3219-
}
3220-
}, 800);
32213243
} catch (ttsErr) {
3222-
console.error('[TTS] speak() threw:', ttsErr);
3244+
console.error('[TTS-Browser] speak() threw:', ttsErr);
32233245
resolved = true;
32243246
clearTimeout(watchdog);
32253247
idx++;

0 commit comments

Comments
 (0)