-
Notifications
You must be signed in to change notification settings - Fork 290
Expand file tree
/
Copy pathapp.js
More file actions
190 lines (165 loc) · 6.64 KB
/
app.js
File metadata and controls
190 lines (165 loc) · 6.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
// Live Audio Transcription Example — Foundry Local JS SDK
//
// Demonstrates real-time microphone-to-text using the JS SDK.
// Requires: npm install foundry-local-sdk naudiodon2
//
// Usage: node app.js
import { FoundryLocalManager } from 'foundry-local-sdk';

console.log('╔══════════════════════════════════════════════════════════╗');
console.log('║ Foundry Local — Live Audio Transcription (JS SDK) ║');
console.log('╚══════════════════════════════════════════════════════════╝');
console.log();

// Initialize the Foundry Local SDK
console.log('Initializing Foundry Local SDK...');
const manager = FoundryLocalManager.create({
  appName: 'foundry_local_live_audio',
  logLevel: 'info'
});
console.log('✓ SDK initialized');

// Resolve the model alias against the catalog; exit early with a clear
// message if the alias is unknown rather than failing later on a null model.
// (was `let model` — never reassigned, so `const` is correct)
const modelAlias = 'nemotron';
const model = await manager.catalog.getModel(modelAlias);
if (!model) {
  console.error(`ERROR: Model "${modelAlias}" not found in catalog.`);
  process.exit(1);
}
console.log(`Found model: ${model.id}`);

// Download reports percentage progress via callback; presumably a no-op
// when the model is already cached locally — TODO confirm against SDK docs.
console.log('Downloading model (if needed)...');
await model.download((progress) => {
  process.stdout.write(`\rDownloading... ${progress.toFixed(2)}%`);
});
console.log('\n✓ Model downloaded');

console.log('Loading model...');
await model.load();
console.log('✓ Model loaded');
// Build the live transcription session from the model's audio client
// (mirrors the C# sample's setup flow).
const audioClient = model.createAudioClient();
const session = audioClient.createLiveTranscriptionSession();

// All values match the SDK defaults for 16 kHz mono 16-bit PCM; they are
// assigned explicitly so the expected wire format is visible in the sample.
Object.assign(session.settings, {
  sampleRate: 16000,
  channels: 1,
  bitsPerSample: 16,
  language: 'en',
});

console.log('Starting streaming session...');
await session.start();
console.log('✓ Session started');
// Drain transcription results concurrently with audio capture. Partial
// hypotheses are streamed inline; finalized segments get their own line.
const readPromise = (async () => {
  try {
    for await (const chunk of session.getTranscriptionStream()) {
      const text = chunk.content?.[0]?.text;
      if (!text) {
        continue;
      }
      // `is_final` only marks a finalized transcript segment — it must not
      // terminate the loop; the stream keeps producing further segments.
      process.stdout.write(chunk.is_final ? `\n [FINAL] ${text}\n` : text);
    }
  } catch (err) {
    // An abort is the expected outcome of shutdown; report anything else.
    if (err.name !== 'AbortError') {
      console.error('Stream error:', err.message);
    }
  }
})();
// --- Microphone capture ---
// This example uses naudiodon2 for cross-platform audio capture.
// Install with: npm install naudiodon2
//
// If you prefer a different audio library, just push PCM bytes
// (16-bit signed LE, mono, 16kHz) via session.append().
//
// Declared with `let` (not const) so the SIGINT handler below can check
// whether the microphone was ever initialized (fallback path leaves it
// undefined).
let audioInput;
try {
  // Dynamic import keeps naudiodon2 an optional dependency: if it is not
  // installed, this throws and we fall through to the synthetic-audio path.
  const { default: portAudio } = await import('naudiodon2');
  audioInput = portAudio.AudioIO({
    inOptions: {
      // Capture format is driven by the session settings so the mic always
      // matches what the transcription session expects.
      channelCount: session.settings.channels,
      sampleFormat: session.settings.bitsPerSample === 16
        ? portAudio.SampleFormat16Bit
        : portAudio.SampleFormat32Bit,
      sampleRate: session.settings.sampleRate,
      // Larger chunk size lowers callback frequency and reduces overflow risk.
      // 3200 frames at 16 kHz is 200 ms of audio per callback.
      framesPerBuffer: 3200,
      // Allow deeper native queue during occasional event-loop stalls.
      maxQueue: 64
    }
  });

  // Captured PCM chunks are queued here and drained by pumpAudio so that
  // session.append() calls are strictly serialized (never overlapping).
  const appendQueue = [];
  let pumping = false;        // true while a pumpAudio drain loop is running
  let warnedQueueDrop = false; // emit the overflow warning only once

  // Drain the queue one chunk at a time. The `pumping` flag guarantees a
  // single concurrent drainer even though the 'data' callback fires often.
  const pumpAudio = async () => {
    if (pumping) return;
    pumping = true;
    try {
      while (appendQueue.length > 0) {
        const pcm = appendQueue.shift();
        await session.append(pcm);
      }
    } catch (err) {
      console.error('append error:', err.message);
    } finally {
      pumping = false;
      // Handle race where new data arrived after loop exit: a 'data'
      // callback may have pushed between the length check and this point,
      // and its pumpAudio() call would have bailed on the pumping flag.
      if (appendQueue.length > 0) {
        void pumpAudio();
      }
    }
  };

  audioInput.on('data', (buffer) => {
    // Copy the chunk before queueing — presumably the native layer may
    // reuse `buffer` after the callback returns (TODO confirm against
    // naudiodon2 docs); the copy makes the queued bytes safe either way.
    const pcm = new Uint8Array(buffer);
    const copy = new Uint8Array(pcm.length);
    copy.set(pcm);
    // Keep a bounded queue to avoid unbounded memory growth.
    // Dropping the OLDEST chunk keeps the stream as close to live as
    // possible at the cost of a small audio gap.
    if (appendQueue.length >= 100) {
      appendQueue.shift();
      if (!warnedQueueDrop) {
        warnedQueueDrop = true;
        console.warn('Audio append queue overflow; dropping oldest chunk to keep stream alive.');
      }
    }
    appendQueue.push(copy);
    // Fire-and-forget: pumpAudio self-serializes and reports its own errors.
    void pumpAudio();
  });

  console.log();
  console.log('════════════════════════════════════════════════════════════');
  console.log(' LIVE TRANSCRIPTION ACTIVE');
  console.log(' Speak into your microphone.');
  console.log(' Press Ctrl+C to stop.');
  console.log('════════════════════════════════════════════════════════════');
  console.log();
  audioInput.start();
} catch (err) {
  console.warn('⚠ Could not initialize microphone (naudiodon2 may not be installed).');
  console.warn(' Install with: npm install naudiodon2');
  console.warn(' Falling back to synthetic audio test...');
  console.warn();

  // Fallback: push 2 seconds of synthetic PCM (440Hz sine wave)
  const sampleRate = session.settings.sampleRate;
  const duration = 2;
  const totalSamples = sampleRate * duration;
  // 2 bytes per sample: 16-bit signed little-endian mono PCM.
  const pcmBytes = new Uint8Array(totalSamples * 2);
  for (let i = 0; i < totalSamples; i++) {
    const t = i / sampleRate;
    // Half-amplitude (0.5 * full scale) 440 Hz tone.
    const sample = Math.round(32767 * 0.5 * Math.sin(2 * Math.PI * 440 * t));
    // Pack little-endian: low byte first, then high byte.
    pcmBytes[i * 2] = sample & 0xFF;
    pcmBytes[i * 2 + 1] = (sample >> 8) & 0xFF;
  }

  // Push in 100ms chunks
  // (sampleRate / 10 samples per 100 ms, 2 bytes each).
  const chunkSize = (sampleRate / 10) * 2;
  for (let offset = 0; offset < pcmBytes.length; offset += chunkSize) {
    const len = Math.min(chunkSize, pcmBytes.length - offset);
    await session.append(pcmBytes.slice(offset, offset + len));
  }
  console.log('✓ Synthetic audio pushed');
}
// Graceful shutdown on Ctrl+C: tear down in dependency order, then exit.
process.on('SIGINT', async () => {
  console.log('\n\nStopping...');
  // The mic may never have been initialized (synthetic-audio fallback path).
  audioInput?.quit();
  // Stop the session first so the transcription stream ends, then wait for
  // the background reader to finish printing its final results.
  await session.stop();
  await readPromise;
  await model.unload();
  console.log('✓ Done');
  process.exit(0);
});