feat: add ogg-opus encoding to deepgram (#53)

rpurdel · web-flow · commit b664f0e407b0 · 2026-01-28T12:57:17.000-06:00
diff --git a/BACKENDS.md b/BACKENDS.md
@@ -101,9 +101,10 @@ PROVIDERS_PRIORITY=deepgram,openai,gemini
 
 **Technical Details:**
 - Uses WebSocket API: `wss://api.deepgram.com/v1/listen`
-- Audio encoding options:
+- Audio encoding options (default set via the `DEEPGRAM_ENCODING` env var; `opus` and `ogg-opus` can also be selected per connection via the `encoding` URL parameter):
   - **linear16** (default): Sends decoded PCM audio at 24kHz, 16-bit, mono. Uses more CPU for Opus decoding but universally compatible.
   - **opus**: Sends raw Opus frames at 48kHz. More efficient (skips decoding step), lower CPU usage, native Opus support.
+  - **ogg-opus**: Sends containerized Ogg-Opus audio (e.g., from Voximplant). Deepgram auto-detects encoding from the container header - no `encoding` or `sample_rate` params are sent to Deepgram.
 - Returns both interim and final transcriptions
 - Supports KeepAlive, Finalize, and CloseStream control messages
 - Authentication via Sec-WebSocket-Protocol header
@@ -121,6 +122,15 @@ PROVIDERS_PRIORITY=deepgram,openai,gemini
 All backends implement the `TranscriptionBackend` interface:
 
 ```typescript
+type AudioEncoding = 'opus' | 'ogg-opus';
+
+interface BackendConfig {
+  language: string | null;
+  prompt?: string;
+  model?: string;
+  encoding?: AudioEncoding;  // Audio format from client
+}
+
 interface TranscriptionBackend {
   // Lifecycle
   connect(config: BackendConfig): Promise<void>;
@@ -130,6 +140,7 @@ interface TranscriptionBackend {
   // Audio
   sendAudio(audioBase64: string): Promise<void>;
   forceCommit(): void;
+  wantsRawOpus?(encoding?: AudioEncoding): boolean;  // Opt-in to raw audio
 
   // Configuration
   updatePrompt(prompt: string): void;
@@ -138,16 +149,25 @@ interface TranscriptionBackend {
   onInterimTranscription?: (message: TranscriptionMessage) => void;
   onCompleteTranscription?: (message: TranscriptionMessage) => void;
   onError?: (errorType: string, errorMessage: string) => void;
+  onClosed?: () => void;
 }
 ```
 
 ### Audio Format
-All backends receive **24 kHz, 16-bit, mono PCM audio** encoded as base64 strings.
+By default, backends receive **24 kHz, 16-bit, mono PCM audio** encoded as base64 strings.
 
 The opus-transcriber-proxy handles:
 1. Receiving Opus-encoded packets from clients
-2. Decoding to PCM
-3. Sending PCM to the transcription backend
+2. Decoding to PCM (skipped when the backend opts in to raw audio via `wantsRawOpus`)
+3. Sending audio to the transcription backend
+
+**Raw Audio Mode:** Backends can implement `wantsRawOpus(encoding?: AudioEncoding): boolean` to receive raw audio instead of decoded PCM. This is useful for backends like Deepgram that natively support Opus or Ogg-Opus formats.
+
+**URL Parameter:** Clients can specify the audio encoding format via the `encoding` URL parameter:
+- `encoding=opus` (default): Raw Opus frames at 48kHz
+- `encoding=ogg-opus`: Containerized Ogg-Opus audio (e.g., from Voximplant)
+
+Example: `wss://host/transcribe?transcribe=true&sendBack=true&encoding=ogg-opus`
 
 ### Transcription Messages
 Backends must produce `TranscriptionMessage` objects:
diff --git a/env.example b/env.example
@@ -20,6 +20,13 @@
 #   ws://localhost:8080/transcribe?transcribe=true&sendBack=true&provider=deepgram
 # If the requested provider is not available or invalid, the connection will be rejected
 
+# Runtime encoding selection:
+# You can override the default audio encoding per request using the ?encoding= URL parameter
+# Options: opus (default), ogg-opus
+# Examples:
+#   ws://localhost:8080/transcribe?transcribe=true&sendBack=true&encoding=ogg-opus
+#   ws://localhost:8080/transcribe?transcribe=true&sendBack=true&provider=deepgram&encoding=ogg-opus
+
 # ============================================
 # OpenAI Configuration
 # ============================================
@@ -66,11 +73,13 @@ OPENAI_TRANSCRIPTION_PROMPT="You will hear audio coming from one participant in
 # Options: nova-2, nova, enhanced, base
 # DEEPGRAM_MODEL=nova-2
 
-# Audio encoding to use for Deepgram
+# Audio encoding to use for Deepgram when a connection does not specify one
 # Default: linear16 (decoded PCM audio at 24kHz)
 # Options:
 #   - linear16: Send decoded PCM audio at 24kHz (default, more CPU but compatible)
 #   - opus: Send raw Opus frames at 48kHz (less CPU, native Opus support)
+#   - ogg-opus: Send containerized Ogg-Opus audio (e.g., from Voximplant)
+# Can be overridden per-connection with the ?encoding= URL parameter
 # DEEPGRAM_ENCODING=linear16
 
 # Language configuration for transcription
diff --git a/src/OutgoingConnection.ts b/src/OutgoingConnection.ts
@@ -102,7 +102,7 @@ export class OutgoingConnection {
 			this.backend = createBackend(this.localTag, this.participant, this.options.provider);
 
 			// Check if backend wants raw Opus frames or decoded PCM
-			const wantsRawOpus = this.backend.wantsRawOpus?.() ?? false;
+			const wantsRawOpus = this.backend.wantsRawOpus?.(this.options.encoding) ?? false;
 
 			if (wantsRawOpus) {
 				logger.info(`Backend wants raw Opus frames for tag: ${this.localTag}, skipping Opus decoder initialization`);
@@ -138,6 +138,7 @@ export class OutgoingConnection {
 			// Get backend configuration
 			const backendConfig = getBackendConfig(this.options.provider);
 			backendConfig.language = this.options.language;
+			backendConfig.encoding = this.options.encoding;
 
 			// Connect the backend
 			await this.backend.connect(backendConfig);
@@ -214,7 +215,7 @@ export class OutgoingConnection {
 		}
 
 		// Check if backend wants raw Opus or decoded PCM
-		const wantsRawOpus = this.backend?.wantsRawOpus?.() ?? false;
+		const wantsRawOpus = this.backend?.wantsRawOpus?.(this.options.encoding) ?? false;
 		const backendStatus = this.backend?.getStatus();
 
 		if (wantsRawOpus && this.decoderStatus === 'ready' && backendStatus === 'connected') {
@@ -358,7 +359,7 @@ export class OutgoingConnection {
 		this.pendingOpusFrames = []; // Clear the queue
 
 		// Check if backend wants raw Opus or decoded PCM
-		const wantsRawOpus = this.backend?.wantsRawOpus?.() ?? false;
+		const wantsRawOpus = this.backend?.wantsRawOpus?.(this.options.encoding) ?? false;
 
 		for (const binaryData of queuedPayloads) {
 			if (wantsRawOpus) {
diff --git a/src/backends/DeepgramBackend.ts b/src/backends/DeepgramBackend.ts
@@ -46,16 +46,24 @@ export class DeepgramBackend implements TranscriptionBackend {
 		return new Promise((resolve, reject) => {
 			try {
 				// Build query parameters based on encoding type
-				const encoding = config.deepgram.encoding;
-				const sampleRate = encoding === 'opus' ? '48000' : '24000';
+				// Use per-connection encoding if specified, otherwise use global config
+				const encoding = backendConfig.encoding || config.deepgram.encoding;
+				const isContainerized = encoding === 'ogg-opus';
 
 				const params = new URLSearchParams({
-					encoding: encoding,
-					sample_rate: sampleRate,
 					channels: '1',
 					interim_results: 'true',
 				});
 
+				// For containerized audio (ogg-opus), omit encoding and sample_rate
+				// Deepgram will auto-detect from the container header
+				// See: https://developers.deepgram.com/docs/determining-your-audio-format-for-live-streaming-audio
+				if (!isContainerized) {
+					params.set('encoding', encoding);
+					const sampleRate = encoding === 'opus' ? '48000' : '24000';
+					params.set('sample_rate', sampleRate);
+				}
+
 				// Add model if specified
 				if (backendConfig.model) {
 					params.set('model', backendConfig.model);
@@ -207,9 +215,11 @@ export class DeepgramBackend implements TranscriptionBackend {
 		return this.status;
 	}
 
-	wantsRawOpus(): boolean {
-		// Return true if encoding is opus, otherwise false (use decoded PCM)
-		return config.deepgram.encoding === 'opus';
+	wantsRawOpus(encoding?: 'opus' | 'ogg-opus'): boolean {
+		// Return true for raw opus or containerized ogg-opus (both skip decoding)
+		// Use provided encoding or fall back to global config
+		const effectiveEncoding = encoding || config.deepgram.encoding;
+		return effectiveEncoding === 'opus' || effectiveEncoding === 'ogg-opus';
 	}
 
 	private startKeepAlive(): void {
diff --git a/src/backends/TranscriptionBackend.ts b/src/backends/TranscriptionBackend.ts
@@ -10,6 +10,7 @@
  */
 
 import type { TranscriptionMessage } from '../transcriberproxy';
+import type { AudioEncoding } from '../utils';
 
 export interface BackendConfig {
 	/** Language hint for transcription (null = auto-detect) */
@@ -18,6 +19,8 @@ export interface BackendConfig {
 	prompt?: string;
 	/** Model to use for transcription */
 	model?: string;
+	/** Audio encoding format ('opus' for raw frames, 'ogg-opus' for containerized) */
+	encoding?: AudioEncoding;
 }
 
 export interface TranscriptionBackend {
@@ -38,12 +41,13 @@ export interface TranscriptionBackend {
 	sendAudio(audioBase64: string): Promise<void>;
 
 	/**
-	 * Optional: Indicates if this backend wants raw Opus frames instead of decoded PCM
-	 * If true, sendAudio will receive base64-encoded Opus frames
+	 * Optional: Indicates if this backend wants raw audio instead of decoded PCM
+	 * If true, sendAudio will receive base64-encoded raw audio (Opus frames or Ogg-Opus container)
 	 * If false/undefined (default), sendAudio receives base64-encoded PCM
-	 * @returns true if backend prefers raw Opus frames
+	 * @param encoding - The audio encoding format being used (opus or ogg-opus)
+	 * @returns true if backend prefers raw audio in this encoding
 	 */
-	wantsRawOpus?(): boolean;
+	wantsRawOpus?(encoding?: AudioEncoding): boolean;
 
 	/**
 	 * Force the backend to commit/finalize pending audio and generate transcription
diff --git a/src/server.ts b/src/server.ts
@@ -68,10 +68,10 @@ server.on('upgrade', (request, socket, head) => {
 let wsConnectionId = 0;
 
 function handleWebSocketConnection(ws: WebSocket, parameters: any) {
-	const { sessionId, sendBack, sendBackInterim, language, provider: requestedProvider } = parameters;
+	const { sessionId, sendBack, sendBackInterim, language, provider: requestedProvider, encoding } = parameters;
 	const connectionId = ++wsConnectionId;
 
-	logger.info(`[WS-${connectionId}] New WebSocket connection, sessionId=${sessionId}, provider=${requestedProvider || 'default'}`);
+	logger.info(`[WS-${connectionId}] New WebSocket connection, sessionId=${sessionId}, provider=${requestedProvider || 'default'}, encoding=${encoding}`);
 
 	// Determine which provider to use
 	let provider: Provider | undefined;
@@ -103,7 +103,7 @@ function handleWebSocketConnection(ws: WebSocket, parameters: any) {
 	// Create transcription session
 	// Within this session, multiple participants (tags) can send audio
 	// Each tag gets its own backend connection, and transcripts are shared between tags
-	const session = new TranscriberProxy(ws, { language, sessionId, provider });
+	const session = new TranscriberProxy(ws, { language, sessionId, provider, encoding });
 
 	// Handle WebSocket close
 	ws.addEventListener('close', (event) => {
diff --git a/src/transcriberproxy.ts b/src/transcriberproxy.ts
@@ -2,6 +2,7 @@ import { OutgoingConnection } from './OutgoingConnection';
 import { EventEmitter } from 'node:events';
 import { WebSocket } from 'ws';
 import { config, type Provider } from './config';
+import type { AudioEncoding } from './utils';
 import * as fs from 'fs';
 import logger from './logger';
 
@@ -20,6 +21,7 @@ export interface TranscriberProxyOptions {
 	language: string | null;
 	sessionId?: string;
 	provider?: Provider;
+	encoding?: AudioEncoding;
 }
 
 export class TranscriberProxy extends EventEmitter {
diff --git a/src/utils.ts b/src/utils.ts
@@ -1,3 +1,5 @@
+export type AudioEncoding = 'opus' | 'ogg-opus';
+
 export interface ISessionParameters {
 	url: URL;
 	sessionId: string | null;
@@ -9,6 +11,7 @@ export interface ISessionParameters {
 	sendBackInterim: boolean;
 	language: string | null;
 	provider: string | null;
+	encoding: AudioEncoding;
 }
 
 export function extractSessionParameters(url: string): ISessionParameters {
@@ -22,6 +25,9 @@ export function extractSessionParameters(url: string): ISessionParameters {
 	const sendBackInterim = parsedUrl.searchParams.get('sendBackInterim');
 	const lang = parsedUrl.searchParams.get('lang');
 	const provider = parsedUrl.searchParams.get('provider');
+	const encodingParam = parsedUrl.searchParams.get('encoding');
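+	// Default to 'opus' (raw Opus frames) for backwards compatibility; any value other than 'ogg-opus' (or a missing parameter) also resolves to 'opus'. NOTE(review): this is never undefined, so a `|| config.deepgram.encoding` fallback downstream may be unreachable - confirm.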
+	const encoding: AudioEncoding = encodingParam === 'ogg-opus' ? 'ogg-opus' : 'opus';
 
 	return {
 		url: parsedUrl,
@@ -34,6 +40,7 @@ export function extractSessionParameters(url: string): ISessionParameters {
 		sendBackInterim: sendBackInterim === 'true',
 		language: lang,
 		provider,
+		encoding,
 	};
 }