Skip to content

Commit b664f0e

Browse files
authored
feat: add ogg-opus encoding to deepgram (#53)
1 parent 5d75546 commit b664f0e

File tree

8 files changed

+75
-22
lines changed

8 files changed

+75
-22
lines changed

BACKENDS.md

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,10 @@ PROVIDERS_PRIORITY=deepgram,openai,gemini
101101

102102
**Technical Details:**
103103
- Uses WebSocket API: `wss://api.deepgram.com/v1/listen`
104-
- Audio encoding options:
104+
- Audio encoding options (set via the `DEEPGRAM_ENCODING` env var; `opus` and `ogg-opus` can also be selected per connection via the `encoding` URL parameter):
105105
- **linear16** (default): Sends decoded PCM audio at 24kHz, 16-bit, mono. Uses more CPU for Opus decoding but universally compatible.
106106
- **opus**: Sends raw Opus frames at 48kHz. More efficient (skips decoding step), lower CPU usage, native Opus support.
107+
- **ogg-opus**: Sends containerized Ogg-Opus audio (e.g., from Voximplant). Deepgram auto-detects the encoding from the container header, so no `encoding` or `sample_rate` params are sent to Deepgram.
107108
- Returns both interim and final transcriptions
108109
- Supports KeepAlive, Finalize, and CloseStream control messages
109110
- Authentication via Sec-WebSocket-Protocol header
@@ -121,6 +122,15 @@ PROVIDERS_PRIORITY=deepgram,openai,gemini
121122
All backends implement the `TranscriptionBackend` interface:
122123

123124
```typescript
125+
type AudioEncoding = 'opus' | 'ogg-opus';
126+
127+
interface BackendConfig {
128+
language: string | null;
129+
prompt?: string;
130+
model?: string;
131+
encoding?: AudioEncoding; // Audio format from client
132+
}
133+
124134
interface TranscriptionBackend {
125135
// Lifecycle
126136
connect(config: BackendConfig): Promise<void>;
@@ -130,6 +140,7 @@ interface TranscriptionBackend {
130140
// Audio
131141
sendAudio(audioBase64: string): Promise<void>;
132142
forceCommit(): void;
143+
wantsRawOpus?(encoding?: AudioEncoding): boolean; // Opt-in to raw audio
133144

134145
// Configuration
135146
updatePrompt(prompt: string): void;
@@ -138,16 +149,25 @@ interface TranscriptionBackend {
138149
onInterimTranscription?: (message: TranscriptionMessage) => void;
139150
onCompleteTranscription?: (message: TranscriptionMessage) => void;
140151
onError?: (errorType: string, errorMessage: string) => void;
152+
onClosed?: () => void;
141153
}
142154
```
143155

144156
### Audio Format
145-
All backends receive **24 kHz, 16-bit, mono PCM audio** encoded as base64 strings.
157+
By default, backends receive **24 kHz, 16-bit, mono PCM audio** encoded as base64 strings.
146158

147159
The opus-transcriber-proxy handles:
148160
1. Receiving Opus-encoded packets from clients
149-
2. Decoding to PCM
150-
3. Sending PCM to the transcription backend
161+
2. Decoding to PCM (unless the backend requests raw audio via `wantsRawOpus`)
162+
3. Sending audio to the transcription backend
163+
164+
**Raw Audio Mode:** Backends can implement `wantsRawOpus(encoding?: AudioEncoding): boolean` to receive raw audio instead of decoded PCM. This is useful for backends like Deepgram that natively support Opus or Ogg-Opus formats.
165+
166+
**URL Parameter:** Clients can specify the audio encoding format via the `encoding` URL parameter:
167+
- `encoding=opus` (default): Raw Opus frames at 48kHz
168+
- `encoding=ogg-opus`: Containerized Ogg-Opus audio (e.g., from Voximplant)
169+
170+
Example: `wss://host/transcribe?transcribe=true&sendBack=true&encoding=ogg-opus`
151171

152172
### Transcription Messages
153173
Backends must produce `TranscriptionMessage` objects:

env.example

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,13 @@
2020
# ws://localhost:8080/transcribe?transcribe=true&sendBack=true&provider=deepgram
2121
# If the requested provider is not available or invalid, the connection will be rejected
2222

23+
# Runtime encoding selection:
24+
# You can override the default audio encoding per request using the ?encoding= URL parameter
25+
# Options: opus (default), ogg-opus
26+
# Examples:
27+
# ws://localhost:8080/transcribe?transcribe=true&sendBack=true&encoding=ogg-opus
28+
# ws://localhost:8080/transcribe?transcribe=true&sendBack=true&provider=deepgram&encoding=ogg-opus
29+
2330
# ============================================
2431
# OpenAI Configuration
2532
# ============================================
@@ -66,11 +73,13 @@ OPENAI_TRANSCRIPTION_PROMPT="You will hear audio coming from one participant in
6673
# Options: nova-2, nova, enhanced, base
6774
# DEEPGRAM_MODEL=nova-2
6875

69-
# Audio encoding to use for Deepgram
76+
# Default audio encoding to use for Deepgram
7077
# Default: linear16 (decoded PCM audio at 24kHz)
7178
# Options:
7279
# - linear16: Send decoded PCM audio at 24kHz (default, more CPU but compatible)
7380
# - opus: Send raw Opus frames at 48kHz (less CPU, native Opus support)
81+
# - ogg-opus: Send containerized Ogg-Opus audio (e.g., from Voximplant)
82+
# Can be overridden per-connection with the ?encoding= URL parameter (opus or ogg-opus only)
7483
# DEEPGRAM_ENCODING=linear16
7584

7685
# Language configuration for transcription

src/OutgoingConnection.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ export class OutgoingConnection {
102102
this.backend = createBackend(this.localTag, this.participant, this.options.provider);
103103

104104
// Check if backend wants raw Opus frames or decoded PCM
105-
const wantsRawOpus = this.backend.wantsRawOpus?.() ?? false;
105+
const wantsRawOpus = this.backend.wantsRawOpus?.(this.options.encoding) ?? false;
106106

107107
if (wantsRawOpus) {
108108
logger.info(`Backend wants raw Opus frames for tag: ${this.localTag}, skipping Opus decoder initialization`);
@@ -138,6 +138,7 @@ export class OutgoingConnection {
138138
// Get backend configuration
139139
const backendConfig = getBackendConfig(this.options.provider);
140140
backendConfig.language = this.options.language;
141+
backendConfig.encoding = this.options.encoding;
141142

142143
// Connect the backend
143144
await this.backend.connect(backendConfig);
@@ -214,7 +215,7 @@ export class OutgoingConnection {
214215
}
215216

216217
// Check if backend wants raw Opus or decoded PCM
217-
const wantsRawOpus = this.backend?.wantsRawOpus?.() ?? false;
218+
const wantsRawOpus = this.backend?.wantsRawOpus?.(this.options.encoding) ?? false;
218219
const backendStatus = this.backend?.getStatus();
219220

220221
if (wantsRawOpus && this.decoderStatus === 'ready' && backendStatus === 'connected') {
@@ -358,7 +359,7 @@ export class OutgoingConnection {
358359
this.pendingOpusFrames = []; // Clear the queue
359360

360361
// Check if backend wants raw Opus or decoded PCM
361-
const wantsRawOpus = this.backend?.wantsRawOpus?.() ?? false;
362+
const wantsRawOpus = this.backend?.wantsRawOpus?.(this.options.encoding) ?? false;
362363

363364
for (const binaryData of queuedPayloads) {
364365
if (wantsRawOpus) {

src/backends/DeepgramBackend.ts

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,16 +46,24 @@ export class DeepgramBackend implements TranscriptionBackend {
4646
return new Promise((resolve, reject) => {
4747
try {
4848
// Build query parameters based on encoding type
49-
const encoding = config.deepgram.encoding;
50-
const sampleRate = encoding === 'opus' ? '48000' : '24000';
49+
// Use per-connection encoding if specified, otherwise use global config
50+
const encoding = backendConfig.encoding || config.deepgram.encoding;
51+
const isContainerized = encoding === 'ogg-opus';
5152

5253
const params = new URLSearchParams({
53-
encoding: encoding,
54-
sample_rate: sampleRate,
5554
channels: '1',
5655
interim_results: 'true',
5756
});
5857

58+
// For containerized audio (ogg-opus), omit encoding and sample_rate
59+
// Deepgram will auto-detect from the container header
60+
// See: https://developers.deepgram.com/docs/determining-your-audio-format-for-live-streaming-audio
61+
if (!isContainerized) {
62+
params.set('encoding', encoding);
63+
const sampleRate = encoding === 'opus' ? '48000' : '24000';
64+
params.set('sample_rate', sampleRate);
65+
}
66+
5967
// Add model if specified
6068
if (backendConfig.model) {
6169
params.set('model', backendConfig.model);
@@ -207,9 +215,11 @@ export class DeepgramBackend implements TranscriptionBackend {
207215
return this.status;
208216
}
209217

210-
wantsRawOpus(): boolean {
211-
// Return true if encoding is opus, otherwise false (use decoded PCM)
212-
return config.deepgram.encoding === 'opus';
218+
wantsRawOpus(encoding?: 'opus' | 'ogg-opus'): boolean {
219+
// Return true for raw opus or containerized ogg-opus (both skip decoding)
220+
// Use provided encoding or fall back to global config
221+
const effectiveEncoding = encoding || config.deepgram.encoding;
222+
return effectiveEncoding === 'opus' || effectiveEncoding === 'ogg-opus';
213223
}
214224

215225
private startKeepAlive(): void {

src/backends/TranscriptionBackend.ts

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
*/
1111

1212
import type { TranscriptionMessage } from '../transcriberproxy';
13+
import type { AudioEncoding } from '../utils';
1314

1415
export interface BackendConfig {
1516
/** Language hint for transcription (null = auto-detect) */
@@ -18,6 +19,8 @@ export interface BackendConfig {
1819
prompt?: string;
1920
/** Model to use for transcription */
2021
model?: string;
22+
/** Audio encoding format ('opus' for raw frames, 'ogg-opus' for containerized) */
23+
encoding?: AudioEncoding;
2124
}
2225

2326
export interface TranscriptionBackend {
@@ -38,12 +41,13 @@ export interface TranscriptionBackend {
3841
sendAudio(audioBase64: string): Promise<void>;
3942

4043
/**
41-
* Optional: Indicates if this backend wants raw Opus frames instead of decoded PCM
42-
* If true, sendAudio will receive base64-encoded Opus frames
44+
* Optional: Indicates if this backend wants raw audio instead of decoded PCM
45+
* If true, sendAudio will receive base64-encoded raw audio (Opus frames or Ogg-Opus container)
4346
* If false/undefined (default), sendAudio receives base64-encoded PCM
44-
* @returns true if backend prefers raw Opus frames
47+
* @param encoding - The audio encoding format being used (opus or ogg-opus)
48+
* @returns true if backend prefers raw audio in this encoding
4549
*/
46-
wantsRawOpus?(): boolean;
50+
wantsRawOpus?(encoding?: AudioEncoding): boolean;
4751

4852
/**
4953
* Force the backend to commit/finalize pending audio and generate transcription

src/server.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,10 @@ server.on('upgrade', (request, socket, head) => {
6868
let wsConnectionId = 0;
6969

7070
function handleWebSocketConnection(ws: WebSocket, parameters: any) {
71-
const { sessionId, sendBack, sendBackInterim, language, provider: requestedProvider } = parameters;
71+
const { sessionId, sendBack, sendBackInterim, language, provider: requestedProvider, encoding } = parameters;
7272
const connectionId = ++wsConnectionId;
7373

74-
logger.info(`[WS-${connectionId}] New WebSocket connection, sessionId=${sessionId}, provider=${requestedProvider || 'default'}`);
74+
logger.info(`[WS-${connectionId}] New WebSocket connection, sessionId=${sessionId}, provider=${requestedProvider || 'default'}, encoding=${encoding}`);
7575

7676
// Determine which provider to use
7777
let provider: Provider | undefined;
@@ -103,7 +103,7 @@ function handleWebSocketConnection(ws: WebSocket, parameters: any) {
103103
// Create transcription session
104104
// Within this session, multiple participants (tags) can send audio
105105
// Each tag gets its own backend connection, and transcripts are shared between tags
106-
const session = new TranscriberProxy(ws, { language, sessionId, provider });
106+
const session = new TranscriberProxy(ws, { language, sessionId, provider, encoding });
107107

108108
// Handle WebSocket close
109109
ws.addEventListener('close', (event) => {

src/transcriberproxy.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { OutgoingConnection } from './OutgoingConnection';
22
import { EventEmitter } from 'node:events';
33
import { WebSocket } from 'ws';
44
import { config, type Provider } from './config';
5+
import type { AudioEncoding } from './utils';
56
import * as fs from 'fs';
67
import logger from './logger';
78

@@ -20,6 +21,7 @@ export interface TranscriberProxyOptions {
2021
language: string | null;
2122
sessionId?: string;
2223
provider?: Provider;
24+
encoding?: AudioEncoding;
2325
}
2426

2527
export class TranscriberProxy extends EventEmitter {

src/utils.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
export type AudioEncoding = 'opus' | 'ogg-opus';
2+
13
export interface ISessionParameters {
24
url: URL;
35
sessionId: string | null;
@@ -9,6 +11,7 @@ export interface ISessionParameters {
911
sendBackInterim: boolean;
1012
language: string | null;
1113
provider: string | null;
14+
encoding: AudioEncoding;
1215
}
1316

1417
export function extractSessionParameters(url: string): ISessionParameters {
@@ -22,6 +25,9 @@ export function extractSessionParameters(url: string): ISessionParameters {
2225
const sendBackInterim = parsedUrl.searchParams.get('sendBackInterim');
2326
const lang = parsedUrl.searchParams.get('lang');
2427
const provider = parsedUrl.searchParams.get('provider');
28+
const encodingParam = parsedUrl.searchParams.get('encoding');
29+
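// Default to 'opus' (raw opus frames) for backwards compatibility.
// NOTE(review): this never yields undefined, so if this value is forwarded unchanged to the backend,
// a `backendConfig.encoding || config.deepgram.encoding` fallback can never reach DEEPGRAM_ENCODING
// (e.g. linear16) — confirm whether an absent parameter should stay undefined instead.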
30+
const encoding: AudioEncoding = encodingParam === 'ogg-opus' ? 'ogg-opus' : 'opus';
2531

2632
return {
2733
url: parsedUrl,
@@ -34,6 +40,7 @@ export function extractSessionParameters(url: string): ISessionParameters {
3440
sendBackInterim: sendBackInterim === 'true',
3541
language: lang,
3642
provider,
43+
encoding,
3744
};
3845
}
3946

0 commit comments

Comments
 (0)