Skip to content

Commit c2f9679

Browse files
committed
upgrade: TTS multi-model (Aura 2 + MeloTTS) + language-aware routing + GLM-4.7-Flash for multilingual
- TTS: Upgraded from Deepgram Aura 1 to Aura 2 EN (English)
- TTS: Added Aura 2 ES for Spanish interviews
- TTS: Added MeloTTS for French, Japanese, Korean, Chinese
- TTS: Auto-fallback to Aura 2 EN if language model fails
- Frontend: Sends interview language to /ai/tts for model selection
- LLM: Added CF_FAST_MODEL (GLM-4.7-Flash) for faster multilingual
- Malayalam/Hindi: Still uses browser speechSynthesis (no cloud model)
- Non-English gate: Skips cloud TTS for Indian languages -> browser fallback
1 parent f8acea3 commit c2f9679

3 files changed

Lines changed: 67 additions & 17 deletions

File tree

.kilo/kilo.jsonc

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
{
2+
"$schema": "https://app.kilo.ai/config.json"
3+
}

backend/simpatico-ats.js

Lines changed: 63 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -736,6 +736,7 @@ async function callExternalLLM(cfg, messages, maxTokens, stream = false) {
736736
* Returns { response: string, usage?: object }
737737
*/
738738
const CF_DEFAULT_MODEL = "@cf/meta/llama-3.3-70b-instruct-fp8-fast";
739+
const CF_FAST_MODEL = "@cf/zhipu/glm-4.7-flash"; // 100+ languages, 131K ctx, much faster for multilingual
739740
const CF_LIGHT_MODEL = "@cf/meta/llama-3.1-8b-instruct";
740741

741742
async function runLLM(env, tenantId, messages, maxTokens = 1024) {
@@ -4100,34 +4101,80 @@ async function handleInterviewQuestion(request, env, ctx) {
41004101
});
41014102
}
41024103

4103-
// ── TTS via Cloudflare Workers AI (Deepgram Aura) ──────────────────────────
4104+
// ── TTS via Cloudflare Workers AI ──────────────────────────────────────────
4105+
// Selects the best TTS model based on language:
4106+
// English → Deepgram Aura 2 (upgraded, context-aware)
4107+
// Spanish → Deepgram Aura 2 ES
4108+
// French, Japanese, Korean, Chinese → MeloTTS (multi-lingual)
4109+
// Other → Deepgram Aura 2 EN fallback (text should be English)
4110+
const TTS_MODEL_MAP = {
4111+
en: { model: "@cf/deepgram/aura-2-en", type: "deepgram" },
4112+
es: { model: "@cf/deepgram/aura-2-es", type: "deepgram" },
4113+
fr: { model: "@cf/myshell-ai/melotts", type: "melotts" },
4114+
ja: { model: "@cf/myshell-ai/melotts", type: "melotts" },
4115+
ko: { model: "@cf/myshell-ai/melotts", type: "melotts" },
4116+
zh: { model: "@cf/myshell-ai/melotts", type: "melotts" },
4117+
};
4118+
const TTS_DEFAULT = { model: "@cf/deepgram/aura-2-en", type: "deepgram" };
4119+
41044120
async function handleTTS(request, env, ctx) {
4105-
const { text } = await safeJson(request);
4121+
const body = await safeJson(request);
4122+
const { text, lang } = body;
41064123
if (!text || typeof text !== 'string' || text.trim().length === 0) {
41074124
throw new ValidationError("text is required");
41084125
}
41094126

41104127
if (!env.AI) throw new ServiceUnavailableError("AI");
41114128

4112-
// Limit text to prevent abuse
41134129
const cleanText = text.trim().slice(0, 5000);
4114-
console.log(`[TTS] Generating speech: ${cleanText.length} chars`);
4130+
const langBase = (lang || 'en').split('-')[0].toLowerCase();
4131+
const ttsConfig = TTS_MODEL_MAP[langBase] || TTS_DEFAULT;
41154132

4116-
try {
4117-
const audioResponse = await env.AI.run("@cf/deepgram/aura-1", {
4118-
text: cleanText,
4119-
});
4133+
console.log(`[TTS] Generating speech: ${cleanText.length} chars, lang=${langBase}, model=${ttsConfig.model}`);
41204134

4121-
// audioResponse is an ArrayBuffer or ReadableStream
4122-
const headers = {
4123-
...CORS_HEADERS,
4124-
"Content-Type": "audio/mpeg",
4125-
"Cache-Control": "public, max-age=3600",
4126-
};
4135+
try {
4136+
if (ttsConfig.type === "melotts") {
4137+
// MeloTTS returns { audio: base64_string }
4138+
const result = await env.AI.run(ttsConfig.model, {
4139+
prompt: cleanText,
4140+
lang: langBase,
4141+
});
41274142

4128-
return new Response(audioResponse, { status: 200, headers });
4143+
if (result && result.audio) {
4144+
const binaryString = atob(result.audio);
4145+
const bytes = new Uint8Array(binaryString.length);
4146+
for (let i = 0; i < binaryString.length; i++) {
4147+
bytes[i] = binaryString.charCodeAt(i);
4148+
}
4149+
return new Response(bytes.buffer, {
4150+
status: 200,
4151+
headers: { ...CORS_HEADERS, "Content-Type": "audio/mpeg", "Cache-Control": "public, max-age=3600" },
4152+
});
4153+
}
4154+
throw new Error("MeloTTS returned no audio");
4155+
} else {
4156+
// Deepgram Aura 2 — returns audio stream directly
4157+
const audioResponse = await env.AI.run(ttsConfig.model, { text: cleanText });
4158+
return new Response(audioResponse, {
4159+
status: 200,
4160+
headers: { ...CORS_HEADERS, "Content-Type": "audio/mpeg", "Cache-Control": "public, max-age=3600" },
4161+
});
4162+
}
41294163
} catch (err) {
4130-
console.error(`[TTS] AI.run error: ${err.message}`);
4164+
console.error(`[TTS] AI.run error (${ttsConfig.model}): ${err.message}`);
4165+
// Fallback: try Aura 2 EN if a language-specific model failed
4166+
if (ttsConfig.model !== TTS_DEFAULT.model) {
4167+
console.log(`[TTS] Retrying with fallback model: ${TTS_DEFAULT.model}`);
4168+
try {
4169+
const fallback = await env.AI.run(TTS_DEFAULT.model, { text: cleanText });
4170+
return new Response(fallback, {
4171+
status: 200,
4172+
headers: { ...CORS_HEADERS, "Content-Type": "audio/mpeg", "Cache-Control": "public, max-age=3600" },
4173+
});
4174+
} catch (e2) {
4175+
console.error(`[TTS] Fallback also failed: ${e2.message}`);
4176+
}
4177+
}
41314178
throw new AppError(`TTS generation failed: ${err.message}`, 500, "TTS_ERROR");
41324179
}
41334180
}

interview/proctored-room.html

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3117,7 +3117,7 @@ <h1><span class="brand-simp">Simpatico</span><span class="brand-hr">HR</span></h
31173117
const res = await fetch(`${API}/ai/tts`, {
31183118
method: 'POST',
31193119
headers: { 'Content-Type': 'application/json' },
3120-
body: JSON.stringify({ text: cleanText })
3120+
body: JSON.stringify({ text: cleanText, lang: state.interviewLanguage || 'en-US' })
31213121
});
31223122

31233123
if (!res.ok) {

0 commit comments

Comments
 (0)