From 73081e87df9eeee5fe8d9dfccdde57a4d760c36c Mon Sep 17 00:00:00 2001 From: ishanvohra2 Date: Mon, 15 Jun 2026 13:02:27 +0530 Subject: [PATCH] feat[api]: separate TTS language validation per engine Validate chatterbox and supertonic languages with distinct enums instead of a shared list. Expose all 18 chatterbox multilingual languages and restrict supertonic to its actual subset (en/es/fr/pt/ko). Co-authored-by: Cursor --- packages/sdk/schemas/text-to-speech.ts | 41 ++++++++++++++--- packages/sdk/test/unit/tts-schemas.test.ts | 51 ++++++++++++++++++++++ 2 files changed, 87 insertions(+), 5 deletions(-) diff --git a/packages/sdk/schemas/text-to-speech.ts b/packages/sdk/schemas/text-to-speech.ts index 0159433b82..49d54c01a0 100644 --- a/packages/sdk/schemas/text-to-speech.ts +++ b/packages/sdk/schemas/text-to-speech.ts @@ -1,26 +1,55 @@ import { z } from "zod"; import { modelSrcInputSchema } from "./model-src-utils"; -// TTS supported languages based on available models -export const TTS_LANGUAGES = [ +// Chatterbox multilingual supported languages (18). The engines support +// different language sets, so the language enum is validated per engine. +export const TTS_CHATTERBOX_LANGUAGES = [ "en", // English "es", // Spanish + "fr", // French "de", // German "it", // Italian + "pt", // Portuguese + "nl", // Dutch + "pl", // Polish + "tr", // Turkish + "sv", // Swedish + "da", // Danish + "fi", // Finnish + "no", // Norwegian + "el", // Greek + "ms", // Malay + "sw", // Swahili + "ar", // Arabic + "ko", // Korean ] as const; -const ttsLanguageSchema = z.enum(TTS_LANGUAGES); +// Supertonic supported languages (subset of the Chatterbox set). +export const TTS_SUPERTONIC_LANGUAGES = [ + "en", // English + "es", // Spanish + "fr", // French + "pt", // Portuguese + "ko", // Korean +] as const; + +// Union of all TTS-supported languages across engines. Kept for backwards +// compatibility; prefer the engine-specific lists when validating a config. +export const TTS_LANGUAGES = [...TTS_CHATTERBOX_LANGUAGES] as const; + +const ttsChatterboxLanguageSchema = z.enum(TTS_CHATTERBOX_LANGUAGES); +const ttsSupertonicLanguageSchema = z.enum(TTS_SUPERTONIC_LANGUAGES); export const ttsChatterboxRuntimeConfigSchema = z.object({ ttsEngine: z.literal("chatterbox"), - language: ttsLanguageSchema, + language: ttsChatterboxLanguageSchema, voice: z.string().optional(), useGPU: z.boolean().optional(), }); export const ttsSupertonicRuntimeConfigSchema = z.object({ ttsEngine: z.literal("supertonic"), - language: ttsLanguageSchema, + language: ttsSupertonicLanguageSchema, voice: z.string().optional(), ttsSpeed: z.number().optional(), ttsNumInferenceSteps: z.number().optional(), @@ -141,6 +170,8 @@ export const textToSpeechStreamResponseSchema = z.object({ }); export type TtsLanguage = (typeof TTS_LANGUAGES)[number]; +export type TtsChatterboxLanguage = (typeof TTS_CHATTERBOX_LANGUAGES)[number]; +export type TtsSupertonicLanguage = (typeof TTS_SUPERTONIC_LANGUAGES)[number]; export type TtsChatterboxLoadConfig = z.infer; export type TtsSupertonicLoadConfig = z.infer; export type TtsLoadConfig = z.infer; diff --git a/packages/sdk/test/unit/tts-schemas.test.ts b/packages/sdk/test/unit/tts-schemas.test.ts index 91fce99946..9908f22b84 100644 --- a/packages/sdk/test/unit/tts-schemas.test.ts +++ b/packages/sdk/test/unit/tts-schemas.test.ts @@ -4,7 +4,10 @@ import { ttsResponseSchema, textToSpeechStreamResponseSchema, ttsConfigSchema, + ttsChatterboxRuntimeConfigSchema, ttsSupertonicRuntimeConfigSchema, + TTS_CHATTERBOX_LANGUAGES, + TTS_SUPERTONIC_LANGUAGES, LEGACY_TTS_ONNX_MODEL_CONFIG_FIELDS, } from "@/schemas/text-to-speech"; @@ -26,6 +29,54 @@ test("ttsConfigSchema: accepts GGML supertonic load config", (t) => { t.is(r.success, true); }); +test("TTS_CHATTERBOX_LANGUAGES: exposes all 18 supported languages", (t) => { + t.is(TTS_CHATTERBOX_LANGUAGES.length, 18); + const expected = [ + "en", "es", "fr", "de", "it", "pt", "nl", "pl", "tr", + "sv", "da", "fi", "no", "el", "ms", "sw", "ar", "ko", + ]; + t.alike([...TTS_CHATTERBOX_LANGUAGES], expected); +}); + +test("ttsChatterboxRuntimeConfigSchema: accepts all 18 chatterbox languages", (t) => { + for (const language of TTS_CHATTERBOX_LANGUAGES) { + const r = ttsChatterboxRuntimeConfigSchema.safeParse({ + ttsEngine: "chatterbox", + language, + }); + t.is(r.success, true, `chatterbox should accept ${language}`); + } +}); + +test("ttsSupertonicRuntimeConfigSchema: only accepts its language subset", (t) => { + t.alike([...TTS_SUPERTONIC_LANGUAGES], ["en", "es", "fr", "pt", "ko"]); + for (const language of TTS_SUPERTONIC_LANGUAGES) { + const r = ttsSupertonicRuntimeConfigSchema.safeParse({ + ttsEngine: "supertonic", + language, + }); + t.is(r.success, true, `supertonic should accept ${language}`); + } +}); + +test("ttsSupertonicRuntimeConfigSchema: rejects chatterbox-only languages", (t) => { + // 'de' is supported by chatterbox but not supertonic. + const r = ttsSupertonicRuntimeConfigSchema.safeParse({ + ttsEngine: "supertonic", + language: "de", + }); + t.is(r.success, false, "supertonic must reject 'de'"); +}); + +test("ttsConfigSchema: accepts a chatterbox-only language for chatterbox", (t) => { + const r = ttsConfigSchema.safeParse({ + ttsEngine: "chatterbox", + language: "tr", + s3genModelSrc: "s3:///example/s3gen.gguf", + }); + t.is(r.success, true, "chatterbox load config accepts 'tr'"); +}); + test("ttsSupertonicRuntimeConfigSchema: strips removed ttsSupertonicMultilingual", (t) => { const r = ttsSupertonicRuntimeConfigSchema.safeParse({ ttsEngine: "supertonic",