Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions BACKENDS.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@ Uses Deepgram's WebSocket streaming API for real-time transcription.
DEEPGRAM_API_KEY=your-key-here
DEEPGRAM_MODEL=nova-2
DEEPGRAM_ENCODING=linear16 # Audio encoding: linear16 (PCM) or opus (default: linear16)
DEEPGRAM_LANGUAGE=multi # Multilingual code-switching (default)
DEEPGRAM_INCLUDE_LANGUAGE=true # Append language to transcript (e.g., "Hello [en]")
DEEPGRAM_PUNCTUATE=true
DEEPGRAM_DIARIZE=false
Expand All @@ -109,8 +108,8 @@ PROVIDERS_PRIORITY=deepgram,openai,gemini
- Supports KeepAlive, Finalize, and CloseStream control messages
- Authentication via Sec-WebSocket-Protocol header
- Multilingual streaming support:
- Defaults to `language=multi` for automatic multilingual code-switching (31+ languages with Nova-3)
- Can specify single language (e.g., `en`, `es`, `fr`, `de`, `pt`, etc.)
- Defaults to `language=multi` (auto-detect) when no `lang` URL parameter is provided
- Specify language via URL: `?lang=en`, `?lang=es`, `?lang=fr`, etc. (ISO-639-1 codes)
- Automatically adds `endpointing=100` for multilingual mode (recommended for code-switching)
- Optional: Include detected language in transcript (e.g., `"Hello [en]"`) via `DEEPGRAM_INCLUDE_LANGUAGE=true`
- **Note**: `detect_language` parameter is NOT supported for streaming (only for pre-recorded audio)
Expand Down
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ Set environment variables or use a `.env` file:
|----------|---------|-------------|
| `OPENAI_MODEL` | `gpt-4o-mini-transcribe` | OpenAI model |
| `DEEPGRAM_MODEL` | `nova-2` | Deepgram model |
| `DEEPGRAM_LANGUAGE` | `multi` | Language code or `multi` for auto |
| `DEEPGRAM_ENCODING` | `linear16` | `linear16`, `opus`, or `ogg-opus` |
| `GEMINI_MODEL` | `gemini-2.0-flash-exp` | Gemini model |

Expand Down Expand Up @@ -133,7 +132,7 @@ ws://host:port/transcribe?sessionId=xxx&sendBack=true
| `sendBackInterim` | `false` | Return interim transcriptions |
| `provider` | (auto) | Override provider selection |
| `encoding` | `opus` | Audio encoding: `opus` or `ogg-opus` |
| `lang` | (auto) | Language hint |
| `lang` | (auto) | ISO-639-1 language code (e.g., `en`, `de`, `fr`). Auto-detect if omitted. |

### Client Messages

Expand Down
16 changes: 7 additions & 9 deletions src/backends/DeepgramBackend.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,13 @@ export class DeepgramBackend implements TranscriptionBackend {
}

// Language configuration
// Use per-connection language if specified, otherwise use global config
const language = backendConfig.language || config.deepgram.language;
if (language) {
params.set('language', language);
// For multilingual streaming, add recommended endpointing
// See: https://developers.deepgram.com/docs/multilingual-code-switching
if (language === 'multi') {
params.set('endpointing', '100');
}
// Use per-connection language if specified, otherwise auto-detect (multi)
const language = backendConfig.language || 'multi';
params.set('language', language);
// For multilingual streaming, add recommended endpointing
// See: https://developers.deepgram.com/docs/multilingual-code-switching
if (language === 'multi') {
params.set('endpointing', '100');
}

// Note: detect_language is NOT supported for streaming
Expand Down
4 changes: 3 additions & 1 deletion src/backends/GeminiBackend.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import logger from '../logger';
import type { TranscriptionBackend, BackendConfig } from './TranscriptionBackend';
import type { TranscriptionMessage } from '../transcriberproxy';
import { writeMetric } from '../metrics';
import { toLanguageName } from '../languageMap';

// Gemini WebSocket API endpoint (v1beta - more stable than v1alpha)
const GEMINI_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent';
Expand Down Expand Up @@ -163,7 +164,8 @@ export class GeminiBackend implements TranscriptionBackend {
let systemInstruction = this.backendConfig.prompt || 'Transcribe the following audio. Output only the transcribed text.';

if (this.backendConfig.language) {
systemInstruction += ` The audio is in ${this.backendConfig.language}.`;
const languageName = toLanguageName(this.backendConfig.language);
systemInstruction += ` The audio is in ${languageName}.`;
}

const setupMessage = {
Expand Down
1 change: 0 additions & 1 deletion src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ export const config = {
deepgram: {
apiKey: process.env.DEEPGRAM_API_KEY || '',
model: process.env.DEEPGRAM_MODEL || 'nova-2',
language: process.env.DEEPGRAM_LANGUAGE || 'multi',
encoding: (process.env.DEEPGRAM_ENCODING || 'linear16') as 'opus' | 'linear16',
punctuate: process.env.DEEPGRAM_PUNCTUATE === 'true',
diarize: process.env.DEEPGRAM_DIARIZE === 'true',
Expand Down
68 changes: 68 additions & 0 deletions src/languageMap.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/**
* Language code mapping for transcription backends
*
* Accepts ISO-639-1 short codes (e.g., "en", "de", "fr") and converts
* them to the format required by each backend.
*/

// ISO-639-1 to full language name mapping (for Gemini prompts).
// Keys are lowercase two-letter codes; values are English language names.
const ISO_TO_NAME: Record<string, string> = {
  af: 'Afrikaans',
  ar: 'Arabic',
  be: 'Belarusian',
  bg: 'Bulgarian',
  bn: 'Bengali',
  bs: 'Bosnian',
  ca: 'Catalan',
  cs: 'Czech',
  da: 'Danish',
  de: 'German',
  el: 'Greek',
  en: 'English',
  es: 'Spanish',
  et: 'Estonian',
  fa: 'Persian',
  fi: 'Finnish',
  fr: 'French',
  he: 'Hebrew',
  hi: 'Hindi',
  hr: 'Croatian',
  hu: 'Hungarian',
  id: 'Indonesian',
  it: 'Italian',
  ja: 'Japanese',
  kn: 'Kannada',
  ko: 'Korean',
  lt: 'Lithuanian',
  lv: 'Latvian',
  mk: 'Macedonian',
  mr: 'Marathi',
  ms: 'Malay',
  nl: 'Dutch',
  no: 'Norwegian',
  pl: 'Polish',
  pt: 'Portuguese',
  ro: 'Romanian',
  ru: 'Russian',
  sk: 'Slovak',
  sl: 'Slovenian',
  sr: 'Serbian',
  sv: 'Swedish',
  ta: 'Tamil',
  te: 'Telugu',
  th: 'Thai',
  tl: 'Tagalog',
  tr: 'Turkish',
  uk: 'Ukrainian',
  ur: 'Urdu',
  vi: 'Vietnamese',
  zh: 'Chinese',
};

/**
 * Convert ISO-639-1 code to full language name (for Gemini).
 *
 * The lookup is case-insensitive. Only the table's own entries are consulted,
 * so inputs that happen to match inherited object members (e.g. "constructor",
 * "toString", "__proto__") are treated as unknown codes rather than resolving
 * to a non-string value.
 *
 * @param code - Language code to translate (e.g. "en", "DE").
 * @returns The English language name, or `code` unchanged if it is not in the mapping.
 */
export function toLanguageName(code: string): string {
  const key = code.toLowerCase();
  // Guard with an own-property check: a bare index on a plain object would
  // also find Object.prototype members and return a function/object.
  const name = Object.prototype.hasOwnProperty.call(ISO_TO_NAME, key)
    ? ISO_TO_NAME[key]
    : undefined;
  return name ?? code;
}
Loading