// Source: microsoft/Foundry-Local — samples/js/live-audio-transcription/app.js @ 83e8dc44b496cde3a778d214e0bd8c2928ef69e9

// Live Audio Transcription Example — Foundry Local JS SDK
//
// Demonstrates real-time microphone-to-text using the JS SDK.
// Requires: npm install foundry-local-sdk naudiodon2
//
// Usage: node app.js

import { FoundryLocalManager, CoreError } from 'foundry-local-sdk';

// Startup banner: three box-drawing rows followed by one empty line.
for (const row of [
    '╔══════════════════════════════════════════════════════════╗',
    '║   Foundry Local — Live Audio Transcription (JS SDK)      ║',
    '╚══════════════════════════════════════════════════════════╝',
    ''
]) {
    console.log(row);
}

// Create the SDK manager that the rest of the script uses to reach the
// model catalog.
console.log('Initializing Foundry Local SDK...');
const managerOptions = { appName: 'foundry', logLevel: 'info' };
const manager = FoundryLocalManager.create(managerOptions);
console.log('✓ SDK initialized');

// Resolve the streaming speech model by its catalog alias, then make sure it
// is downloaded and loaded before any session is created.
// `model` is never reassigned in this script, so it is bound with `const`.
const modelAlias = 'nemotron-speech-streaming-en-0.6b';
const model = await manager.catalog.getModel(modelAlias);
if (!model) {
    // A falsy lookup result is treated as "alias not in catalog": report and exit non-zero.
    console.error(`ERROR: Model "${modelAlias}" not found in catalog.`);
    process.exit(1);
}

console.log(`Found model: ${model.id}`);
console.log('Downloading model (if needed)...');
// The callback redraws one console line in place ('\r' returns the cursor to
// column 0). `progress` is formatted with toFixed(2), so it is expected to be
// a number; it is printed as a percentage.
await model.download((progress) => {
    process.stdout.write(`\rDownloading... ${progress.toFixed(2)}%`);
});
// The leading '\n' terminates the in-place progress line before the status message.
console.log('\n✓ Model downloaded');

console.log('Loading model...');
await model.load();
console.log('✓ Model loaded');

// Graceful-shutdown coordinator. Its signal is handed to session.start() here,
// and to getTranscriptionStream() / append() further down, so that Ctrl+C can
// cancel in-flight async work (e.g., a backpressured append()) rather than
// waiting for it to finish.
const shutdown = new AbortController();

// Build a live transcription session from the model's audio client
// (same pattern as the C# sample).
const audioClient = model.createAudioClient();
const session = audioClient.createLiveTranscriptionSession();

// Describe the PCM input the session will receive: 16 kHz, mono, 16-bit,
// English. Per the original sample note, 16000 is already the default sample
// rate; it is spelled out for clarity. Properties are assigned in the same
// order as before: sampleRate, channels, bitsPerSample, language.
Object.assign(session.settings, {
    sampleRate: 16000,
    channels: 1,
    bitsPerSample: 16,
    language: 'en'
});

console.log('Starting streaming session...');
await session.start(shutdown.signal);
console.log('✓ Session started');

// Background reader: consumes the session's transcription stream and echoes
// each non-empty result to stdout. The returned promise is awaited during
// shutdown. Every error is handled here by logging (or ignoring, for aborts),
// after which the reader finishes; it does not re-open the stream.
const readPromise = (async function readTranscripts() {
    try {
        for await (const result of session.getTranscriptionStream(shutdown.signal)) {
            const text = result.content?.[0]?.text;
            if (text) {
                // `is_final` only marks transcript state; it never stops the app.
                // Final results get their own labelled line, partial ones are
                // written inline as they arrive.
                const output = result.is_final ? `\n  [FINAL] ${text}\n` : text;
                process.stdout.write(output);
            }
        }
    } catch (err) {
        // AbortError is the expected outcome of Ctrl+C; stay silent.
        if (err.name === 'AbortError') return;

        // CoreError carries native-core failure metadata (code + isTransient),
        // which lets the log distinguish a transient blip (warning) from a
        // hard failure (error). Any other error only has its message to show.
        const isCoreError = err instanceof CoreError;
        if (isCoreError && err.isTransient) {
            console.warn(`\n⚠ Transient ASR error (${err.code}): ${err.message}. Continuing...`);
        } else if (isCoreError) {
            console.error(`\n✗ Stream error [${err.code}]: ${err.message}`);
        } else {
            console.error('\n✗ Stream error:', err.message);
        }
    }
})();

// --- Microphone capture ---
// This example uses naudiodon2 for cross-platform audio capture.
// Install with: npm install naudiodon2
//
// If you prefer a different audio library, just push PCM bytes
// (16-bit signed LE, mono, 16kHz) via session.append().

// Microphone handle; stays undefined when capture could not be set up.
// The SIGINT handler checks it before calling quit().
let audioInput;
try {
    const { default: portAudio } = await import('naudiodon2');

    // Open the input device with the same format the session was configured for.
    audioInput = portAudio.AudioIO({
        inOptions: {
            channelCount: session.settings.channels,
            sampleFormat: session.settings.bitsPerSample === 16
                ? portAudio.SampleFormat16Bit
                : portAudio.SampleFormat32Bit,
            sampleRate: session.settings.sampleRate,
            // Larger chunk size lowers callback frequency and reduces overflow risk.
            framesPerBuffer: 3200,
            // Allow deeper native queue during occasional event-loop stalls.
            maxQueue: 64
        }
    });

    // Upper bound on chunks waiting for session.append(); keeps memory bounded.
    const maxPendingChunks = 100;
    const appendQueue = [];
    let pumping = false;
    let warnedQueueDrop = false;

    // Drains appendQueue one chunk at a time. The `pumping` flag guarantees a
    // single drain loop is active, so chunks reach the session in arrival order.
    const pumpAudio = async () => {
        if (pumping) return;
        pumping = true;
        try {
            while (appendQueue.length > 0) {
                const pcm = appendQueue.shift();
                // Pass the shutdown signal so a backpressured append() resolves
                // promptly on Ctrl+C instead of blocking the pump.
                await session.append(pcm, shutdown.signal);
            }
        } catch (err) {
            // Aborted via Ctrl+C — exit quietly.
            if (err.name === 'AbortError') return;
            console.error('append error:', err.message);
        } finally {
            pumping = false;
            // Handle race where new data arrived after loop exit.
            if (appendQueue.length > 0 && !shutdown.signal.aborted) {
                void pumpAudio();
            }
        }
    };

    audioInput.on('data', (buffer) => {
        // After shutdown was requested, a chunk could only be handed to append()
        // together with an already-aborted signal, so skip the copy and enqueue.
        if (shutdown.signal.aborted) return;

        // Single copy: slice the underlying ArrayBuffer to get an independent Uint8Array.
        const copy = new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength).slice();

        // Keep a bounded queue to avoid unbounded memory growth.
        if (appendQueue.length >= maxPendingChunks) {
            appendQueue.shift();
            // Warn only once so a sustained overflow does not flood the console.
            if (!warnedQueueDrop) {
                warnedQueueDrop = true;
                console.warn('Audio append queue overflow; dropping oldest chunk to keep stream alive.');
            }
        }

        appendQueue.push(copy);
        // Fire-and-forget: pumpAudio handles its own errors.
        void pumpAudio();
    });

    console.log();
    console.log('════════════════════════════════════════════════════════════');
    console.log('  LIVE TRANSCRIPTION ACTIVE');
    console.log('  Speak into your microphone.');
    console.log('  Press Ctrl+C to stop.');
    console.log('════════════════════════════════════════════════════════════');
    console.log();

    audioInput.start();
} catch (err) {
    console.warn('⚠ Could not initialize microphone (naudiodon2 may not be installed).');
    // This catch covers every failure in the block above (module import, device
    // open, start), so print the actual cause instead of only guessing at it.
    console.warn(`  Reason: ${err?.message ?? err}`);
    console.warn('  Install with: npm install naudiodon2');
    console.warn('  Falling back to synthetic audio test...');
    console.warn();

    // Fallback: push 2 seconds of synthetic PCM (440Hz sine wave at half
    // amplitude), encoded as 16-bit signed little-endian samples.
    const sampleRate = session.settings.sampleRate;
    const duration = 2;
    const bytesPerSample = 2;
    const totalSamples = Math.floor(sampleRate * duration);
    const pcmBytes = new Uint8Array(totalSamples * bytesPerSample);
    const pcmView = new DataView(pcmBytes.buffer);
    for (let i = 0; i < totalSamples; i++) {
        const t = i / sampleRate;
        const sample = Math.round(32767 * 0.5 * Math.sin(2 * Math.PI * 440 * t));
        pcmView.setInt16(i * bytesPerSample, sample, true);
    }

    // Push in ~100ms chunks. The chunk size is a whole number of samples (so no
    // 16-bit sample is split across chunks) and at least one sample (so the
    // loop always advances).
    const chunkSize = Math.max(1, Math.floor(sampleRate / 10)) * bytesPerSample;

    let exitCode = 0;
    try {
        for (let offset = 0; offset < pcmBytes.length; offset += chunkSize) {
            const len = Math.min(chunkSize, pcmBytes.length - offset);
            // Same signal as the microphone pump, for consistent cancellation.
            await session.append(pcmBytes.slice(offset, offset + len), shutdown.signal);
        }

        console.log('✓ Synthetic audio pushed');
        console.log('Waiting briefly for final transcription results...');
        await new Promise((resolve) => setTimeout(resolve, 3000));
        await session.stop();
        await readPromise;
    } catch (fallbackErr) {
        exitCode = 1;
        console.error('✗ Synthetic audio test failed:', fallbackErr?.message ?? fallbackErr);
    } finally {
        // Unload the model whether or not the synthetic run succeeded.
        await model.unload();
    }

    if (exitCode === 0) {
        console.log('✓ Done');
    }
    process.exit(exitCode);
}

// Handle graceful shutdown.
//
// The AbortController fires the shared `shutdown` signal so any in-flight
// session.append() / getTranscriptionStream() resolves promptly with an
// AbortError instead of waiting for stop() to finish draining the queue.
//
// Registered with `once` so cleanup runs a single time: repeated Ctrl+C no
// longer re-enters this handler (which would call stop()/unload() twice).
// Because the listener is removed after the first signal, a second Ctrl+C
// falls back to Node's default SIGINT behaviour and terminates immediately,
// which doubles as a force-quit if cleanup hangs.
process.once('SIGINT', async () => {
    console.log('\n\nStopping...');
    shutdown.abort();

    // Exit non-zero if any cleanup step fails, but keep going so that later
    // steps (notably model.unload()) still run and the process always exits.
    let exitCode = 0;
    try {
        // audioInput is undefined when microphone setup never completed.
        if (audioInput) {
            audioInput.quit();
        }
        await session.stop();
        await readPromise;
    } catch (err) {
        exitCode = 1;
        console.error('✗ Error during shutdown:', err?.message ?? err);
    }

    try {
        await model.unload();
    } catch (err) {
        exitCode = 1;
        console.error('✗ Failed to unload model:', err?.message ?? err);
    }

    if (exitCode === 0) {
        console.log('✓ Done');
    }
    process.exit(exitCode);
});