diff --git a/BACKENDS.md b/BACKENDS.md index 517161e..70f0c3b 100644 --- a/BACKENDS.md +++ b/BACKENDS.md @@ -84,7 +84,6 @@ Uses Deepgram's WebSocket streaming API for real-time transcription. DEEPGRAM_API_KEY=your-key-here DEEPGRAM_MODEL=nova-2 DEEPGRAM_ENCODING=linear16 # Audio encoding: linear16 (PCM) or opus (default: linear16) -DEEPGRAM_LANGUAGE=multi # Multilingual code-switching (default) DEEPGRAM_INCLUDE_LANGUAGE=true # Append language to transcript (e.g., "Hello [en]") DEEPGRAM_PUNCTUATE=true DEEPGRAM_DIARIZE=false @@ -109,8 +108,8 @@ PROVIDERS_PRIORITY=deepgram,openai,gemini - Supports KeepAlive, Finalize, and CloseStream control messages - Authentication via Sec-WebSocket-Protocol header - Multilingual streaming support: - - Defaults to `language=multi` for automatic multilingual code-switching (31+ languages with Nova-3) - - Can specify single language (e.g., `en`, `es`, `fr`, `de`, `pt`, etc.) + - Defaults to `language=multi` (auto-detect) when no `lang` URL parameter is provided + - Specify language via URL: `?lang=en`, `?lang=es`, `?lang=fr`, etc. 
(ISO-639-1 codes) - Automatically adds `endpointing=100` for multilingual mode (recommended for code-switching) - Optional: Include detected language in transcript (e.g., `"Hello [en]"`) via `DEEPGRAM_INCLUDE_LANGUAGE=true` - **Note**: `detect_language` parameter is NOT supported for streaming (only for pre-recorded audio) diff --git a/README.md b/README.md index 2044eb1..4eee120 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,6 @@ Set environment variables or use a `.env` file: |----------|---------|-------------| | `OPENAI_MODEL` | `gpt-4o-mini-transcribe` | OpenAI model | | `DEEPGRAM_MODEL` | `nova-2` | Deepgram model | -| `DEEPGRAM_LANGUAGE` | `multi` | Language code or `multi` for auto | | `DEEPGRAM_ENCODING` | `linear16` | `linear16`, `opus`, or `ogg-opus` | | `GEMINI_MODEL` | `gemini-2.0-flash-exp` | Gemini model | @@ -133,7 +132,7 @@ ws://host:port/transcribe?sessionId=xxx&sendBack=true | `sendBackInterim` | `false` | Return interim transcriptions | | `provider` | (auto) | Override provider selection | | `encoding` | `opus` | Audio encoding: `opus` or `ogg-opus` | -| `lang` | (auto) | Language hint | +| `lang` | (auto) | ISO-639-1 language code (e.g., `en`, `de`, `fr`). Auto-detect if omitted. 
| ### Client Messages diff --git a/src/backends/DeepgramBackend.ts b/src/backends/DeepgramBackend.ts index db5908d..26e2f21 100644 --- a/src/backends/DeepgramBackend.ts +++ b/src/backends/DeepgramBackend.ts @@ -70,15 +70,13 @@ export class DeepgramBackend implements TranscriptionBackend { } // Language configuration - // Use per-connection language if specified, otherwise use global config - const language = backendConfig.language || config.deepgram.language; - if (language) { - params.set('language', language); - // For multilingual streaming, add recommended endpointing - // See: https://developers.deepgram.com/docs/multilingual-code-switching - if (language === 'multi') { - params.set('endpointing', '100'); - } + // Use per-connection language if specified, otherwise auto-detect (multi) + const language = backendConfig.language || 'multi'; + params.set('language', language); + // For multilingual streaming, add recommended endpointing + // See: https://developers.deepgram.com/docs/multilingual-code-switching + if (language === 'multi') { + params.set('endpointing', '100'); } // Note: detect_language is NOT supported for streaming diff --git a/src/backends/GeminiBackend.ts b/src/backends/GeminiBackend.ts index 6ae967a..33acc32 100644 --- a/src/backends/GeminiBackend.ts +++ b/src/backends/GeminiBackend.ts @@ -10,6 +10,7 @@ import logger from '../logger'; import type { TranscriptionBackend, BackendConfig } from './TranscriptionBackend'; import type { TranscriptionMessage } from '../transcriberproxy'; import { writeMetric } from '../metrics'; +import { toLanguageName } from '../languageMap'; // Gemini WebSocket API endpoint (v1beta - more stable than v1alpha) const GEMINI_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent'; @@ -163,7 +164,8 @@ export class GeminiBackend implements TranscriptionBackend { let systemInstruction = this.backendConfig.prompt || 'Transcribe the following audio. 
Output only the transcribed text.'; if (this.backendConfig.language) { - systemInstruction += ` The audio is in ${this.backendConfig.language}.`; + const languageName = toLanguageName(this.backendConfig.language); + systemInstruction += ` The audio is in ${languageName}.`; } const setupMessage = { diff --git a/src/config.ts b/src/config.ts index cab1b5b..c1608d0 100644 --- a/src/config.ts +++ b/src/config.ts @@ -52,7 +52,6 @@ export const config = { deepgram: { apiKey: process.env.DEEPGRAM_API_KEY || '', model: process.env.DEEPGRAM_MODEL || 'nova-2', - language: process.env.DEEPGRAM_LANGUAGE || 'multi', encoding: (process.env.DEEPGRAM_ENCODING || 'linear16') as 'opus' | 'linear16', punctuate: process.env.DEEPGRAM_PUNCTUATE === 'true', diarize: process.env.DEEPGRAM_DIARIZE === 'true', diff --git a/src/languageMap.ts b/src/languageMap.ts new file mode 100644 index 0000000..080749e --- /dev/null +++ b/src/languageMap.ts @@ -0,0 +1,68 @@ +/** + * Language code mapping for transcription backends + * + * Accepts ISO-639-1 short codes (e.g., "en", "de", "fr") and converts + * them to the format required by each backend. 
/**
 * ISO-639-1 code -> English language name, used to describe the audio
 * language in Gemini prompts.
 *
 * Kept in a `Map` instead of a plain object literal: the lookup key comes
 * from caller-supplied input, and a plain object would also resolve
 * inherited keys such as "constructor" or "__proto__" to non-string values.
 */
const ISO_TO_NAME: ReadonlyMap<string, string> = new Map<string, string>([
  ['af', 'Afrikaans'],
  ['ar', 'Arabic'],
  ['be', 'Belarusian'],
  ['bg', 'Bulgarian'],
  ['bn', 'Bengali'],
  ['bs', 'Bosnian'],
  ['ca', 'Catalan'],
  ['cs', 'Czech'],
  ['da', 'Danish'],
  ['de', 'German'],
  ['el', 'Greek'],
  ['en', 'English'],
  ['es', 'Spanish'],
  ['et', 'Estonian'],
  ['fa', 'Persian'],
  ['fi', 'Finnish'],
  ['fr', 'French'],
  ['he', 'Hebrew'],
  ['hi', 'Hindi'],
  ['hr', 'Croatian'],
  ['hu', 'Hungarian'],
  ['id', 'Indonesian'],
  ['it', 'Italian'],
  ['ja', 'Japanese'],
  ['kn', 'Kannada'],
  ['ko', 'Korean'],
  ['lt', 'Lithuanian'],
  ['lv', 'Latvian'],
  ['mk', 'Macedonian'],
  ['mr', 'Marathi'],
  ['ms', 'Malay'],
  ['nl', 'Dutch'],
  ['no', 'Norwegian'],
  ['pl', 'Polish'],
  ['pt', 'Portuguese'],
  ['ro', 'Romanian'],
  ['ru', 'Russian'],
  ['sk', 'Slovak'],
  ['sl', 'Slovenian'],
  ['sr', 'Serbian'],
  ['sv', 'Swedish'],
  ['ta', 'Tamil'],
  ['te', 'Telugu'],
  ['th', 'Thai'],
  ['tl', 'Tagalog'],
  ['tr', 'Turkish'],
  ['uk', 'Ukrainian'],
  ['ur', 'Urdu'],
  ['vi', 'Vietnamese'],
  ['zh', 'Chinese'],
]);

/**
 * Convert an ISO-639-1 code to its full language name (for Gemini).
 *
 * The lookup is case-insensitive. Anything that is not a key of the table
 * (unknown codes, region-tagged values such as "en-US", the empty string,
 * or names of built-in object members) is returned exactly as given.
 *
 * @param code - Language code to translate, e.g. "en" or "DE".
 * @returns The mapped language name, or `code` unchanged when unmapped.
 */
export function toLanguageName(code: string): string {
  return ISO_TO_NAME.get(code.toLowerCase()) ?? code;
}