fix(windows): native speech recognizer via System.Speech + fix binary paths

elicep01 · claude · elicep01 · commit e45ea83fec3a · 2026-02-18T14:47:19.000-06:00
Root causes of broken Windows dictation:
1. hotkey-hold-monitor.exe path wrong: windows.ts compiles to
   dist/main/platform/ so __dirname/../native = dist/main/native (not found).
   Fixed to go up two levels so the path resolves to dist/native/.

2. Web Speech API ('webspeech' backend) fails with 'network' error in
   Electron because Electron builds do not include Google's API keys that
   Chrome embeds. Switched to the 'native' backend on all platforms.

3. Windows has no native speech binary. Added speech-recognizer.cs — a
   C# console app using System.Speech.Recognition (built into .NET
   Framework 4.x, available on every Windows 10/11 machine, works fully
   offline with no API key). Protocol: emits {"ready":true},
   {"transcript":"...","isFinal":true|false}, {"error":"..."} JSON lines
   to stdout, same as the macOS Swift binary.

4. whisper-start-native in main.ts used 'speech-recognizer' (no .exe) on
   all platforms; Windows needs 'speech-recognizer.exe'. Added platform
   check. On-demand compile fallback now uses csc.exe + the WPF-subfolder
   System.Speech.dll path on Windows.

build-native.js changes:
- Add C# compilation step using C:\Windows\Microsoft.NET\Framework64\
  v4.0.30319\csc.exe with System.Speech.dll from the WPF subfolder.
- Both hotkey-hold-monitor.exe and speech-recognizer.exe are now built
  by `npm run build:native`.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/scripts/build-native.js b/scripts/build-native.js
@@ -199,6 +199,58 @@ if (process.platform === 'win32') {
     }
   }
 
+  // ── C# binaries (compiled with csc.exe from .NET Framework — always available on Windows 10/11) ──
+  function findCsc() {
+    // Try .NET Framework 4.x csc.exe (guaranteed on Windows 10/11)
+    const cscPaths = [
+      'C:\\Windows\\Microsoft.NET\\Framework64\\v4.0.30319\\csc.exe',
+      'C:\\Windows\\Microsoft.NET\\Framework\\v4.0.30319\\csc.exe',
+    ];
+    for (const p of cscPaths) {
+      if (fs.existsSync(p)) return p;
+    }
+    return null;
+  }
+
+  // System.Speech.dll lives in the WPF subfolder of the .NET Framework directory.
+  function findSystemSpeechDll(cscPath) {
+    const netDir = path.dirname(cscPath); // e.g. C:\Windows\Microsoft.NET\Framework64\v4.0.30319
+    const wpfPath = path.join(netDir, 'WPF', 'System.Speech.dll');
+    if (fs.existsSync(wpfPath)) return wpfPath;
+    return null;
+  }
+
+  const cscBinaries = [
+    {
+      out: 'speech-recognizer.exe',
+      src: 'src/native/speech-recognizer.cs',
+    },
+  ];
+
+  const csc = findCsc();
+  if (!csc) {
+    console.warn('[build-native] WARNING: csc.exe not found — speech-recognizer.exe will not be built.');
+    console.warn('[build-native] Dictation will fall back to cloud transcription if an API key is configured.');
+  } else {
+    const speechDll = findSystemSpeechDll(csc);
+    if (!speechDll) {
+      console.warn('[build-native] WARNING: System.Speech.dll not found — speech-recognizer.exe will not be built.');
+    } else {
+      for (const { out, src } of cscBinaries) {
+        const outPath = path.join(outDir, out);
+        const srcPath = path.join(__dirname, '..', src); // absolute path required by csc.exe
+        const cmd = `"${csc}" /nologo /target:exe /optimize+ /r:"${speechDll}" /out:"${outPath}" "${srcPath}"`;
+        console.log(`[build-native] Compiling ${out} with csc.exe...`);
+        try {
+          execSync(cmd, { stdio: 'inherit' });
+        } catch (err) {
+          console.warn(`[build-native] WARNING: Failed to compile ${out}:`, err.message);
+          console.warn('[build-native] Dictation will fall back to cloud transcription if an API key is configured.');
+        }
+      }
+    }
+  }
+
   console.log('[build-native] Done (Windows).');
   process.exit(0);
 }
diff --git a/src/main/main.ts b/src/main/main.ts
@@ -7088,23 +7088,44 @@ return appURL's |path|() as text`,
     }
 
     const lang = language || loadSettings().ai.speechLanguage || 'en-US';
-    const binaryPath = getNativeBinaryPath('speech-recognizer');
+    // On Windows the binary has an .exe extension; on macOS it has none.
+    const binaryName = process.platform === 'win32' ? 'speech-recognizer.exe' : 'speech-recognizer';
+    const binaryPath = getNativeBinaryPath(binaryName);
     const fs = require('fs');
 
-    // Compile on demand (same pattern as color-picker / snippet-expander)
+    // Compile on demand if binary is missing.
     if (!fs.existsSync(binaryPath)) {
       try {
         const { execFileSync } = require('child_process');
-        const sourcePath = path.join(app.getAppPath(), 'src', 'native', 'speech-recognizer.swift');
-        execFileSync('swiftc', [
-          '-O', '-o', binaryPath, sourcePath,
-          '-framework', 'Speech',
-          '-framework', 'AVFoundation',
-        ]);
+        if (process.platform === 'win32') {
+          // Compile the C# speech recognizer with .NET Framework csc.exe.
+          const cscCandidates = [
+            'C:\\Windows\\Microsoft.NET\\Framework64\\v4.0.30319\\csc.exe',
+            'C:\\Windows\\Microsoft.NET\\Framework\\v4.0.30319\\csc.exe',
+          ];
+          const csc = cscCandidates.find((p) => fs.existsSync(p));
+          if (!csc) throw new Error('csc.exe not found — run `npm run build:native` after installing .NET Framework.');
+          const sourcePath = path.join(app.getAppPath(), 'src', 'native', 'speech-recognizer.cs');
+          // System.Speech.dll lives in the WPF subfolder next to csc.exe.
+          const netDir = path.dirname(csc);
+          const speechDll = path.join(netDir, 'WPF', 'System.Speech.dll');
+          execFileSync(csc, ['/nologo', '/target:exe', '/optimize+', `/r:${speechDll}`, `/out:${binaryPath}`, sourcePath]);
+        } else {
+          const sourcePath = path.join(app.getAppPath(), 'src', 'native', 'speech-recognizer.swift');
+          execFileSync('swiftc', [
+            '-O', '-o', binaryPath, sourcePath,
+            '-framework', 'Speech',
+            '-framework', 'AVFoundation',
+          ]);
+        }
         console.log('[Whisper][native] Compiled speech-recognizer binary');
       } catch (error) {
         console.error('[Whisper][native] Compile failed:', error);
-        throw new Error('Failed to compile native speech recognizer. Ensure Xcode Command Line Tools are installed.');
+        throw new Error(
+          process.platform === 'win32'
+            ? 'Failed to compile Windows speech recognizer. Run `npm run build:native` to build it.'
+            : 'Failed to compile native speech recognizer. Ensure Xcode Command Line Tools are installed.'
+        );
       }
     }
 
diff --git a/src/main/platform/windows.ts b/src/main/platform/windows.ts
@@ -21,7 +21,9 @@ import { app } from 'electron';
 // ── Helpers ───────────────────────────────────────────────────────────────────
 
 function getNativeBinaryPath(name: string): string {
-  const base = path.join(__dirname, '..', 'native', name);
+  // windows.ts compiles to dist/main/platform/windows.js, so __dirname is
+  // dist/main/platform — go up two levels to reach dist/, then into native/.
+  const base = path.join(__dirname, '..', '..', 'native', name);
   if (app.isPackaged) {
     return base.replace('app.asar', 'app.asar.unpacked');
   }
diff --git a/src/native/speech-recognizer.cs b/src/native/speech-recognizer.cs
@@ -0,0 +1,105 @@
+/**
+ * src/native/speech-recognizer.cs
+ *
+ * Windows speech recognizer using System.Speech (built into .NET Framework 4.x,
+ * available on every Windows 10/11 machine — no API key, works offline).
+ *
+ * Usage:  speech-recognizer.exe [lang]
+ *   lang  BCP-47 language tag, e.g. en-US (default).
+ *
+ * Stdout protocol (one JSON object per line, same as the macOS Swift binary):
+ *   {"ready":true}                                – recognizer is listening
+ *   {"transcript":"text","isFinal":true|false}    – speech result
+ *   {"error":"message"}                           – fatal error then exit
+ *
+ * The process runs until the parent kills it (Windows has no SIGTERM delivery; the kill is an abrupt TerminateProcess).
+ */
+
+using System;
+using System.Globalization;
+using System.Speech.Recognition;
+using System.Threading;
+
+// Console entry point: runs continuous dictation on the default microphone and
+// reports results as one JSON object per stdout line.
+class SpeechRecognizerProgram
+{
+    // Escapes a string for embedding between double quotes in a JSON document.
+    // Handles backslash, quote and every control character below U+0020
+    // (JSON forbids those unescaped). A null input yields an empty string.
+    static string Escape(string s)
+    {
+        if (string.IsNullOrEmpty(s)) return string.Empty;
+
+        var sb = new System.Text.StringBuilder(s.Length + 8);
+        foreach (char c in s)
+        {
+            switch (c)
+            {
+                case '\\': sb.Append("\\\\"); break;
+                case '"':  sb.Append("\\\""); break;
+                case '\n': sb.Append("\\n"); break;
+                case '\r': sb.Append("\\r"); break;
+                case '\t': sb.Append("\\t"); break;
+                case '\b': sb.Append("\\b"); break;
+                case '\f': sb.Append("\\f"); break;
+                default:
+                    if (c < ' ')
+                    {
+                        // Remaining control characters use the \u00XX form.
+                        sb.Append("\\u").Append(((int)c).ToString("x4", CultureInfo.InvariantCulture));
+                    }
+                    else
+                    {
+                        sb.Append(c);
+                    }
+                    break;
+            }
+        }
+        return sb.ToString();
+    }
+
+    // Writes one protocol line to stdout and flushes so the parent sees it immediately.
+    static void Emit(string json)
+    {
+        Console.WriteLine(json);
+        Console.Out.Flush();
+    }
+
+    // Emits the protocol's {"error":"..."} line for the given message.
+    static void EmitError(string message)
+    {
+        Emit("{\"error\":\"" + Escape(message) + "\"}");
+    }
+
+    // args[0] (optional): BCP-47 language tag; defaults to en-US when missing or blank.
+    static void Main(string[] args)
+    {
+        // Write stdout as UTF-8 without a BOM. The default writer uses the console
+        // code page, which garbles non-ASCII transcripts for a UTF-8 reader.
+        // NOTE(review): assumes the Node parent decodes stdout as UTF-8 — confirm.
+        try
+        {
+            var utf8Out = new System.IO.StreamWriter(
+                Console.OpenStandardOutput(), new System.Text.UTF8Encoding(false));
+            utf8Out.AutoFlush = true;
+            Console.SetOut(utf8Out);
+        }
+        catch (Exception)
+        {
+            // Best effort: keep the default console writer if stdout cannot be rewrapped.
+        }
+
+        string lang = args.Length > 0 ? args[0].Trim() : string.Empty;
+        if (lang.Length == 0) lang = "en-US";
+
+        // Try the requested culture, fall back to en-US, then system default.
+        SpeechRecognitionEngine engine = null;
+        string[] candidates = { lang, "en-US", string.Empty };
+        foreach (string id in candidates)
+        {
+            try
+            {
+                engine = id.Length == 0
+                    ? new SpeechRecognitionEngine()
+                    : new SpeechRecognitionEngine(new CultureInfo(id));
+                break;
+            }
+            catch { /* unknown culture or no recognizer installed for it — try next */ }
+        }
+
+        if (engine == null)
+        {
+            Emit("{\"error\":\"No speech recognition engine is available on this system.\"}");
+            return;
+        }
+
+        try
+        {
+            engine.SetInputToDefaultAudioDevice();
+            engine.LoadGrammar(new DictationGrammar());
+
+            // Interim / hypothesis — sent while the user is still speaking.
+            engine.SpeechHypothesized += (sender, e) =>
+            {
+                Emit("{\"transcript\":\"" + Escape(e.Result.Text) + "\",\"isFinal\":false}");
+            };
+
+            // Final result — sent when the engine has committed to a result.
+            engine.SpeechRecognized += (sender, e) =>
+            {
+                Emit("{\"transcript\":\"" + Escape(e.Result.Text) + "\",\"isFinal\":true}");
+            };
+
+            // Low-confidence results are silently ignored.
+            engine.SpeechRecognitionRejected += (sender, e) => { };
+
+            engine.RecognizeAsync(RecognizeMode.Multiple);
+        }
+        catch (Exception ex)
+        {
+            // Any setup failure (no microphone, grammar load, engine start) is
+            // reported through the protocol instead of an unhandled-exception crash.
+            EmitError(ex.Message);
+            engine.Dispose();
+            return;
+        }
+
+        // Signal the parent that we are ready.
+        Emit("{\"ready\":true}");
+
+        // Block the main thread forever; recognition events are raised by the
+        // engine while this thread sleeps. The parent kills this process when it
+        // wants to stop recognition, so no code after this call would ever run.
+        Thread.Sleep(Timeout.Infinite);
+    }
+}
diff --git a/src/renderer/src/SuperCmdWhisper.tsx b/src/renderer/src/SuperCmdWhisper.tsx
@@ -385,9 +385,11 @@ const SuperCmdWhisper: React.FC<SuperCmdWhisperProps> = ({
         (wantsOpenAI && !!settings.ai.openaiApiKey) ||
         (wantsElevenLabs && !!settings.ai.elevenlabsApiKey);
       const isWindowsPlatform = window.electron?.platform === 'win32';
-      // On Windows there is no macOS SFSpeechRecognizer binary — use Chromium's
-      // built-in Web Speech API instead (same engine as Chrome, no API key needed).
-      const backend: WhisperBackend = canUseCloud ? 'whisper' : (isWindowsPlatform ? 'webspeech' : 'native');
+      // On Windows use the native System.Speech recognizer binary (compiled from
+      // src/native/speech-recognizer.cs) instead of the macOS SFSpeechRecognizer.
+      // The Web Speech API ('webspeech') requires Google API keys that Electron
+      // does not include, so it fails with network errors on non-Chrome builds.
+      const backend: WhisperBackend = canUseCloud ? 'whisper' : 'native';
       backendRef.current = backend;
       return { backend, language };
     } catch {