Skip to content

Commit e45ea83

Browse files
elicep01claude
andcommitted
fix(windows): native speech recognizer via System.Speech + fix binary paths
Root causes of broken Windows dictation: 1. hotkey-hold-monitor.exe path wrong: windows.ts compiles to dist/main/platform/ so __dirname/../native = dist/main/native (not found). Fixed to go up two levels so the path resolves to dist/native/. 2. Web Speech API ('webspeech' backend) fails with 'network' error in Electron because Electron builds do not include Google's API keys that Chrome embeds. Switched to the 'native' backend on all platforms. 3. Windows has no native speech binary. Added speech-recognizer.cs — a C# console app using System.Speech.Recognition (built into .NET Framework 4.x, available on every Windows 10/11 machine, works fully offline with no API key). Protocol: emits {"ready":true}, {"transcript":"...","isFinal":true|false}, {"error":"..."} JSON lines to stdout, same as the macOS Swift binary. 4. whisper-start-native in main.ts used 'speech-recognizer' (no .exe) on all platforms; Windows needs 'speech-recognizer.exe'. Added platform check. On-demand compile fallback now uses csc.exe + the WPF-subfolder System.Speech.dll path on Windows. build-native.js changes: - Add C# compilation step using C:\Windows\Microsoft.NET\Framework64\ v4.0.30319\csc.exe with System.Speech.dll from the WPF subfolder. - Both hotkey-hold-monitor.exe and speech-recognizer.exe are now built by `npm run build:native`. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 658e592 commit e45ea83

5 files changed

Lines changed: 195 additions & 13 deletions

File tree

scripts/build-native.js

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,58 @@ if (process.platform === 'win32') {
199199
}
200200
}
201201

202+
// ── C# binaries (compiled with csc.exe from .NET Framework — always available on Windows 10/11) ──
203+
/**
 * Locate the .NET Framework 4.x C# compiler (csc.exe).
 *
 * The 64-bit framework directory is probed before the 32-bit one, and the
 * first csc.exe that exists on disk wins.
 *
 * @param {string} [windowsDir] Windows installation directory. Defaults to
 *   %SystemRoot%, then %windir%, then the literal C:\Windows.
 * @returns {string|null} Path to csc.exe, or null when neither framework
 *   directory contains one.
 */
function findCsc(windowsDir = process.env.SystemRoot || process.env.windir || 'C:\\Windows') {
  // Windows is not necessarily installed on C:, so the root comes from the
  // environment rather than a hard-coded drive letter.
  const frameworkDirs = ['Framework64', 'Framework'];
  for (const frameworkDir of frameworkDirs) {
    const candidate = path.join(windowsDir, 'Microsoft.NET', frameworkDir, 'v4.0.30319', 'csc.exe');
    if (fs.existsSync(candidate)) return candidate;
  }
  return null;
}
214+
215+
// System.Speech.dll lives in the WPF subfolder of the .NET Framework directory.
216+
/**
 * Locate System.Speech.dll, which sits in the WPF subfolder of the .NET
 * Framework directory that contains csc.exe.
 *
 * @param {string|null} cscPath Path to csc.exe, e.g.
 *   C:\Windows\Microsoft.NET\Framework64\v4.0.30319\csc.exe.
 * @returns {string|null} Path to System.Speech.dll, or null when cscPath is
 *   not a string or the DLL does not exist.
 */
function findSystemSpeechDll(cscPath) {
  // path.dirname() throws a TypeError on non-string input (for example the
  // null that findCsc() returns), so treat that as "not found".
  if (typeof cscPath !== 'string') return null;
  const wpfPath = path.join(path.dirname(cscPath), 'WPF', 'System.Speech.dll');
  return fs.existsSync(wpfPath) ? wpfPath : null;
}
222+
223+
// C# sources to compile with csc.exe: `src` is relative to the repository
// root, `out` is the file name written into outDir.
const cscBinaries = [
  { out: 'speech-recognizer.exe', src: 'src/native/speech-recognizer.cs' },
];

// Compile every C# binary. A missing compiler, a missing System.Speech.dll
// or a failed compile only produces warnings; the build carries on.
const csc = findCsc();
if (csc) {
  const speechDll = findSystemSpeechDll(csc);
  if (speechDll) {
    cscBinaries.forEach(({ out, src }) => {
      const outPath = path.join(outDir, out);
      // csc.exe is given an absolute source path.
      const srcPath = path.join(__dirname, '..', src);
      const cmd = `"${csc}" /nologo /target:exe /optimize+ /r:"${speechDll}" /out:"${outPath}" "${srcPath}"`;
      console.log(`[build-native] Compiling ${out} with csc.exe...`);
      try {
        execSync(cmd, { stdio: 'inherit' });
      } catch (err) {
        console.warn(`[build-native] WARNING: Failed to compile ${out}:`, err.message);
        console.warn('[build-native] Dictation will fall back to cloud transcription if an API key is configured.');
      }
    });
  } else {
    console.warn('[build-native] WARNING: System.Speech.dll not found — speech-recognizer.exe will not be built.');
  }
} else {
  console.warn('[build-native] WARNING: csc.exe not found — speech-recognizer.exe will not be built.');
  console.warn('[build-native] Dictation will fall back to cloud transcription if an API key is configured.');
}
253+
202254
console.log('[build-native] Done (Windows).');
203255
process.exit(0);
204256
}

src/main/main.ts

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7088,23 +7088,44 @@ return appURL's |path|() as text`,
70887088
}
70897089

70907090
const lang = language || loadSettings().ai.speechLanguage || 'en-US';
7091-
const binaryPath = getNativeBinaryPath('speech-recognizer');
7091+
// On Windows the binary has an .exe extension; on macOS it has none.
7092+
const binaryName = process.platform === 'win32' ? 'speech-recognizer.exe' : 'speech-recognizer';
7093+
const binaryPath = getNativeBinaryPath(binaryName);
70927094
const fs = require('fs');
70937095

7094-
// Compile on demand (same pattern as color-picker / snippet-expander)
7096+
// Compile on demand if binary is missing.
70957097
if (!fs.existsSync(binaryPath)) {
70967098
try {
70977099
const { execFileSync } = require('child_process');
7098-
const sourcePath = path.join(app.getAppPath(), 'src', 'native', 'speech-recognizer.swift');
7099-
execFileSync('swiftc', [
7100-
'-O', '-o', binaryPath, sourcePath,
7101-
'-framework', 'Speech',
7102-
'-framework', 'AVFoundation',
7103-
]);
7100+
if (process.platform === 'win32') {
7101+
// Compile the C# speech recognizer with .NET Framework csc.exe.
7102+
const cscCandidates = [
7103+
'C:\\Windows\\Microsoft.NET\\Framework64\\v4.0.30319\\csc.exe',
7104+
'C:\\Windows\\Microsoft.NET\\Framework\\v4.0.30319\\csc.exe',
7105+
];
7106+
const csc = cscCandidates.find((p) => fs.existsSync(p));
7107+
if (!csc) throw new Error('csc.exe not found — run `npm run build:native` after installing .NET Framework.');
7108+
const sourcePath = path.join(app.getAppPath(), 'src', 'native', 'speech-recognizer.cs');
7109+
// System.Speech.dll lives in the WPF subfolder next to csc.exe.
7110+
const netDir = path.dirname(csc);
7111+
const speechDll = path.join(netDir, 'WPF', 'System.Speech.dll');
7112+
execFileSync(csc, ['/nologo', '/target:exe', '/optimize+', `/r:${speechDll}`, `/out:${binaryPath}`, sourcePath]);
7113+
} else {
7114+
const sourcePath = path.join(app.getAppPath(), 'src', 'native', 'speech-recognizer.swift');
7115+
execFileSync('swiftc', [
7116+
'-O', '-o', binaryPath, sourcePath,
7117+
'-framework', 'Speech',
7118+
'-framework', 'AVFoundation',
7119+
]);
7120+
}
71047121
console.log('[Whisper][native] Compiled speech-recognizer binary');
71057122
} catch (error) {
71067123
console.error('[Whisper][native] Compile failed:', error);
7107-
throw new Error('Failed to compile native speech recognizer. Ensure Xcode Command Line Tools are installed.');
7124+
throw new Error(
7125+
process.platform === 'win32'
7126+
? 'Failed to compile Windows speech recognizer. Run `npm run build:native` to build it.'
7127+
: 'Failed to compile native speech recognizer. Ensure Xcode Command Line Tools are installed.'
7128+
);
71087129
}
71097130
}
71107131

src/main/platform/windows.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@ import { app } from 'electron';
2121
// ── Helpers ───────────────────────────────────────────────────────────────────
2222

2323
function getNativeBinaryPath(name: string): string {
24-
const base = path.join(__dirname, '..', 'native', name);
24+
// windows.ts compiles to dist/main/platform/windows.js, so __dirname is
25+
// dist/main/platform — go up two levels to reach dist/, then into native/.
26+
const base = path.join(__dirname, '..', '..', 'native', name);
2527
if (app.isPackaged) {
2628
return base.replace('app.asar', 'app.asar.unpacked');
2729
}

src/native/speech-recognizer.cs

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
/**
2+
* src/native/speech-recognizer.cs
3+
*
4+
* Windows speech recognizer using System.Speech (built into .NET Framework 4.x,
5+
* available on every Windows 10/11 machine — no API key, works offline).
6+
*
7+
* Usage: speech-recognizer.exe [lang]
8+
* lang BCP-47 language tag, e.g. en-US (default).
9+
*
10+
* Stdout protocol (one JSON object per line, same as the macOS Swift binary):
11+
* {"ready":true} – recognizer is listening
12+
* {"transcript":"text","isFinal":true|false} – speech result
13+
* {"error":"message"} – fatal error then exit
14+
*
15+
* The process runs until killed by the parent (node sends SIGTERM / TerminateProcess).
16+
*/
17+
18+
using System;
19+
using System.Globalization;
20+
using System.Speech.Recognition;
21+
using System.Threading;
22+
23+
/// <summary>
/// Console entry point for the Windows speech recognizer. Writes one JSON
/// object per line to stdout: {"ready":true}, {"transcript":"...","isFinal":...}
/// or {"error":"..."}.
/// </summary>
class SpeechRecognizerProgram
{
    /// <summary>
    /// Escapes <paramref name="s"/> for use inside a JSON string literal.
    /// Quotes, backslashes and all control characters are escaped so the
    /// output is always valid JSON. Non-ASCII characters are written as
    /// \uXXXX so the emitted bytes are pure ASCII and do not depend on the
    /// console's output code page.
    /// </summary>
    /// <param name="s">Text to escape; null is treated as empty.</param>
    /// <returns>The escaped text, without surrounding quotes.</returns>
    static string Escape(string s)
    {
        if (string.IsNullOrEmpty(s)) return string.Empty;

        var sb = new System.Text.StringBuilder(s.Length + 8);
        foreach (char c in s)
        {
            switch (c)
            {
                case '\\': sb.Append("\\\\"); break;
                case '"':  sb.Append("\\\""); break;
                case '\n': sb.Append("\\n"); break;
                case '\r': sb.Append("\\r"); break;
                case '\t': sb.Append("\\t"); break;
                case '\b': sb.Append("\\b"); break;
                case '\f': sb.Append("\\f"); break;
                default:
                    if (c < 0x20 || c > 0x7E)
                    {
                        // Each UTF-16 code unit is escaped on its own, so a
                        // surrogate pair becomes two \uXXXX escapes, which
                        // JSON parsers recombine.
                        sb.Append("\\u");
                        sb.Append(((int)c).ToString("x4", CultureInfo.InvariantCulture));
                    }
                    else
                    {
                        sb.Append(c);
                    }
                    break;
            }
        }
        return sb.ToString();
    }

    /// <summary>Writes one protocol line to stdout and flushes immediately.</summary>
    static void Emit(string json)
    {
        Console.WriteLine(json);
        Console.Out.Flush();
    }

    /// <summary>Emits a protocol error line carrying <paramref name="message"/>.</summary>
    static void EmitError(string message)
    {
        Emit("{\"error\":\"" + Escape(message) + "\"}");
    }

    /// <summary>
    /// Creates a recognition engine for the requested culture, falling back
    /// to en-US and then to the system default recognizer.
    /// </summary>
    /// <returns>The engine, or null when no candidate could be created.</returns>
    static SpeechRecognitionEngine CreateEngine(string lang)
    {
        string[] candidates = { lang, "en-US", string.Empty };
        foreach (string id in candidates)
        {
            try
            {
                return id.Length == 0
                    ? new SpeechRecognitionEngine()
                    : new SpeechRecognitionEngine(new CultureInfo(id));
            }
            catch { /* culture unknown or no recognizer installed for it: try next */ }
        }
        return null;
    }

    /// <summary>
    /// Starts continuous dictation on the default audio device and blocks
    /// until the process is killed by its parent.
    /// </summary>
    /// <param name="args">Optional language tag as the first argument; defaults to en-US.</param>
    static void Main(string[] args)
    {
        string lang = args.Length > 0 ? args[0].Trim() : "en-US";

        SpeechRecognitionEngine engine = CreateEngine(lang);
        if (engine == null)
        {
            Emit("{\"error\":\"No speech recognition engine is available on this system.\"}");
            return;
        }

        try
        {
            engine.SetInputToDefaultAudioDevice();
            engine.LoadGrammar(new DictationGrammar());

            // Interim / hypothesis: sent while the user is still speaking.
            engine.SpeechHypothesized += (sender, e) =>
            {
                Emit("{\"transcript\":\"" + Escape(e.Result.Text) + "\",\"isFinal\":false}");
            };

            // Final result: sent when the engine has committed to a result.
            engine.SpeechRecognized += (sender, e) =>
            {
                Emit("{\"transcript\":\"" + Escape(e.Result.Text) + "\",\"isFinal\":true}");
            };

            // Low-confidence results are silently ignored.
            engine.SpeechRecognitionRejected += (sender, e) => { };

            engine.RecognizeAsync(RecognizeMode.Multiple);
        }
        catch (Exception ex)
        {
            // Any set-up failure (audio device, grammar load, starting
            // recognition) is reported through the protocol instead of an
            // unhandled-exception crash.
            EmitError(ex.Message);
            engine.Dispose();
            return;
        }

        // Signal the parent that we are ready.
        Emit("{\"ready\":true}");

        // Block the main thread forever; the parent kills this process when
        // it wants to stop recognition, so nothing after this line would run
        // and the engine is not disposed explicitly.
        Thread.Sleep(Timeout.Infinite);
    }
}

src/renderer/src/SuperCmdWhisper.tsx

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -385,9 +385,11 @@ const SuperCmdWhisper: React.FC<SuperCmdWhisperProps> = ({
385385
(wantsOpenAI && !!settings.ai.openaiApiKey) ||
386386
(wantsElevenLabs && !!settings.ai.elevenlabsApiKey);
387387
const isWindowsPlatform = window.electron?.platform === 'win32';
388-
// On Windows there is no macOS SFSpeechRecognizer binary — use Chromium's
389-
// built-in Web Speech API instead (same engine as Chrome, no API key needed).
390-
const backend: WhisperBackend = canUseCloud ? 'whisper' : (isWindowsPlatform ? 'webspeech' : 'native');
388+
// On Windows use the native System.Speech recognizer binary (compiled from
389+
// src/native/speech-recognizer.cs) instead of the macOS SFSpeechRecognizer.
390+
// The Web Speech API ('webspeech') requires Google API keys that Electron
391+
// does not include, so it fails with network errors on non-Chrome builds.
392+
const backend: WhisperBackend = canUseCloud ? 'whisper' : 'native';
391393
backendRef.current = backend;
392394
return { backend, language };
393395
} catch {

0 commit comments

Comments
 (0)