Skip to content

Commit 1c42f7a

Browse files
committed
perf(windows): use persistent whisper-server to eliminate cold-start
Replace the spawn-per-transcription approach (whisper-cli.exe) with a persistent whisper-server.exe process on non-macOS platforms. The server loads the model once at app launch and accepts audio via HTTP POST on localhost, eliminating the ~500ms model-loading overhead on every transcription.

Additionally switches from beam-search decoding (--best-of 5 --beam-size 5) to greedy decoding, saving ~600ms per transcription. The LLM correction layer compensates for any quality difference.

- Add WhisperServer class with lifecycle management and HTTP-based readiness polling
- Server auto-restarts when the user changes whisper models
- Falls back to CLI mode if the server fails to start
- macOS keeps the existing CLI approach (Metal GPU makes cold-start negligible)

Benchmark (JFK 11s audio, small model, 12 threads):
Before: ~4,500ms (CLI + beam search)
After: ~3,300ms (server + greedy)
1 parent dc6ce0a commit 1c42f7a

File tree

4 files changed

+467
-6
lines changed

4 files changed

+467
-6
lines changed

src/main/app.ts

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import { ConfigManager } from "./config/manager";
66
import { createSecretStore } from "./config/secrets";
77
import { ModelManager } from "./models/manager";
88
import { AudioRecorder } from "./audio/recorder";
9-
import { transcribe } from "./audio/whisper";
9+
import { transcribe, initWhisperServer, shutdownWhisperServer } from "./audio/whisper";
1010
import { createLlmProvider } from "./llm/factory";
1111
import { Pipeline } from "./pipeline";
1212
import { ShortcutManager } from "./shortcuts/manager";
@@ -177,6 +177,16 @@ function reloadConfig(): void {
177177
if (!busy) {
178178
setupPipeline();
179179
}
180+
181+
// Restart the whisper-server when the model changes (server auto-skips
182+
// if the model is the same; on macOS this is a no-op).
183+
const newModelPath = config.whisper.model
184+
? modelManager.getModelPath(config.whisper.model)
185+
: "";
186+
initWhisperServer(newModelPath).catch((err) => {
187+
slog.error("Failed to restart whisper-server after config change", err);
188+
});
189+
180190
shortcutManager?.registerShortcutKeys();
181191
shortcutManager?.updateHud();
182192
updateTrayConfig(config);
@@ -272,6 +282,17 @@ app.whenReady().then(async () => {
272282
});
273283

274284
setupPipeline();
285+
286+
// Start persistent whisper-server on non-macOS platforms to eliminate
287+
// the cold-start model-loading overhead on each transcription.
288+
{
289+
const whisperModel = configManager.load().whisper.model;
290+
const initialModelPath = whisperModel ? modelManager.getModelPath(whisperModel) : "";
291+
initWhisperServer(initialModelPath).catch((err) => {
292+
slog.error("Failed to start whisper-server at launch", err);
293+
});
294+
}
295+
275296
historyManager.cleanup();
276297

277298
// Clean up orphaned audio files
@@ -434,6 +455,7 @@ app.on("activate", () => {
434455

435456
app.on("before-quit", () => {
436457
shortcutManager?.stop();
458+
shutdownWhisperServer().catch(() => {});
437459
destroyTray();
438460
for (const win of BrowserWindow.getAllWindows()) {
439461
if (!win.isDestroyed()) win.destroy();

src/main/audio/whisper-server.ts

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
import { spawn, type ChildProcess } from "child_process";
2+
import * as fs from "fs";
3+
import * as http from "http";
4+
import * as net from "net";
5+
import * as path from "path";
6+
import * as os from "os";
7+
import { app } from "electron";
8+
import log from "electron-log/main";
9+
10+
const slog = log.scope("WhisperServer");
11+
12+
const appRoot = app.getAppPath().replace("app.asar", "app.asar.unpacked");
13+
const WHISPER_CPP_DIR = path.join(appRoot, "node_modules/whisper-node/lib/whisper.cpp");
14+
const SERVER_BIN = path.join(WHISPER_CPP_DIR, "whisper-server.exe");
15+
16+
const STARTUP_TIMEOUT_MS = 15_000;
17+
const INFERENCE_TIMEOUT_MS = 30_000;
18+
19+
// On Windows/Linux (CPU-only), using ~75% of cores gives the best throughput
20+
// without starving the OS. Minimum 4 to avoid slowdowns on low-core machines.
21+
const WHISPER_THREADS = Math.max(4, Math.floor(os.cpus().length * 0.75));
22+
23+
function findFreePort(): Promise<number> {
24+
return new Promise((resolve, reject) => {
25+
const server = net.createServer();
26+
server.listen(0, "127.0.0.1", () => {
27+
const addr = server.address() as net.AddressInfo;
28+
server.close(() => resolve(addr.port));
29+
});
30+
server.on("error", reject);
31+
});
32+
}
33+
34+
export class WhisperServer {
35+
private proc: ChildProcess | null = null;
36+
private port = 0;
37+
private modelPath = "";
38+
private ready = false;
39+
private starting = false;
40+
41+
async start(modelPath: string): Promise<void> {
42+
if (this.proc && this.modelPath === modelPath) return;
43+
if (this.proc) await this.stop();
44+
45+
this.modelPath = modelPath;
46+
47+
if (!fs.existsSync(SERVER_BIN)) {
48+
throw new Error(`whisper-server binary not found: ${SERVER_BIN}`);
49+
}
50+
if (!fs.existsSync(modelPath)) {
51+
throw new Error(`Whisper model not found: ${modelPath}`);
52+
}
53+
54+
this.starting = true;
55+
this.ready = false;
56+
this.port = await findFreePort();
57+
58+
slog.info("Starting whisper-server", { port: this.port, model: path.basename(modelPath), threads: WHISPER_THREADS });
59+
60+
this.proc = spawn(SERVER_BIN, [
61+
"--host", "127.0.0.1",
62+
"--port", String(this.port),
63+
"-m", modelPath,
64+
"-t", String(WHISPER_THREADS),
65+
"--no-gpu",
66+
], {
67+
cwd: WHISPER_CPP_DIR,
68+
stdio: "ignore",
69+
windowsHide: true,
70+
});
71+
72+
this.proc.on("exit", (code, signal) => {
73+
// Guard: during app shutdown the console transport may already be torn
74+
// down, causing EPIPE if we try to log here.
75+
try { slog.warn("whisper-server exited", { code, signal }); } catch { /* ignore */ }
76+
this.proc = null;
77+
this.ready = false;
78+
this.starting = false;
79+
});
80+
81+
this.proc.on("error", (err) => {
82+
try { slog.error("whisper-server spawn error", err); } catch { /* ignore */ }
83+
this.proc = null;
84+
this.ready = false;
85+
this.starting = false;
86+
});
87+
88+
await this.waitForReady();
89+
}
90+
91+
async stop(): Promise<void> {
92+
if (!this.proc) return;
93+
94+
try { slog.info("Stopping whisper-server"); } catch { /* ignore */ }
95+
const proc = this.proc;
96+
this.proc = null;
97+
this.ready = false;
98+
this.starting = false;
99+
100+
proc.kill("SIGTERM");
101+
102+
// Give it a moment to exit gracefully, then force kill
103+
await new Promise<void>((resolve) => {
104+
const timeout = setTimeout(() => {
105+
try { proc.kill("SIGKILL"); } catch { /* already dead */ }
106+
resolve();
107+
}, 2000);
108+
proc.on("exit", () => {
109+
clearTimeout(timeout);
110+
resolve();
111+
});
112+
});
113+
}
114+
115+
isReady(): boolean {
116+
return this.ready && this.proc !== null;
117+
}
118+
119+
async transcribe(wavPath: string, language: string, prompt: string): Promise<string> {
120+
if (!this.ready || !this.proc) {
121+
throw new Error("whisper-server is not running");
122+
}
123+
124+
const fileBuffer = fs.readFileSync(wavPath);
125+
const boundary = `----VoxBoundary${Date.now()}`;
126+
127+
const fields: Record<string, string> = {
128+
response_format: "json",
129+
};
130+
if (language && language !== "auto") {
131+
fields.language = language;
132+
}
133+
if (prompt) {
134+
fields.prompt = prompt;
135+
}
136+
137+
// Build multipart body
138+
const parts: Buffer[] = [];
139+
140+
for (const [key, value] of Object.entries(fields)) {
141+
parts.push(Buffer.from(
142+
`--${boundary}\r\nContent-Disposition: form-data; name="${key}"\r\n\r\n${value}\r\n`
143+
));
144+
}
145+
146+
parts.push(Buffer.from(
147+
`--${boundary}\r\nContent-Disposition: form-data; name="file"; filename="audio.wav"\r\nContent-Type: audio/wav\r\n\r\n`
148+
));
149+
parts.push(fileBuffer);
150+
parts.push(Buffer.from(`\r\n--${boundary}--\r\n`));
151+
152+
const body = Buffer.concat(parts);
153+
154+
const responseBody = await new Promise<string>((resolve, reject) => {
155+
const req = http.request({
156+
hostname: "127.0.0.1",
157+
port: this.port,
158+
path: "/inference",
159+
method: "POST",
160+
headers: {
161+
"Content-Type": `multipart/form-data; boundary=${boundary}`,
162+
"Content-Length": body.length,
163+
},
164+
timeout: INFERENCE_TIMEOUT_MS,
165+
}, (res) => {
166+
const chunks: Buffer[] = [];
167+
res.on("data", (chunk: Buffer) => chunks.push(chunk));
168+
res.on("end", () => {
169+
const text = Buffer.concat(chunks).toString("utf-8");
170+
if (res.statusCode !== 200) {
171+
reject(new Error(`whisper-server returned ${res.statusCode}: ${text}`));
172+
return;
173+
}
174+
resolve(text);
175+
});
176+
});
177+
178+
req.on("error", reject);
179+
req.on("timeout", () => {
180+
req.destroy(new Error("whisper-server inference timed out"));
181+
});
182+
183+
req.write(body);
184+
req.end();
185+
});
186+
187+
const parsed = JSON.parse(responseBody) as { text?: string };
188+
return (parsed.text ?? "").trim();
189+
}
190+
191+
private async waitForReady(): Promise<void> {
192+
const deadline = Date.now() + STARTUP_TIMEOUT_MS;
193+
const POLL_INTERVAL_MS = 300;
194+
195+
while (Date.now() < deadline) {
196+
if (!this.proc) {
197+
throw new Error("whisper-server exited during startup");
198+
}
199+
200+
const alive = await this.httpProbe();
201+
if (alive) {
202+
this.ready = true;
203+
this.starting = false;
204+
slog.info("whisper-server is ready", { port: this.port });
205+
return;
206+
}
207+
208+
await new Promise<void>((r) => setTimeout(r, POLL_INTERVAL_MS));
209+
}
210+
211+
this.starting = false;
212+
throw new Error(`whisper-server failed to start within ${STARTUP_TIMEOUT_MS}ms`);
213+
}
214+
215+
/** Send a lightweight HTTP request to check if the server is accepting connections. */
216+
private httpProbe(): Promise<boolean> {
217+
return new Promise<boolean>((resolve) => {
218+
const req = http.get({
219+
hostname: "127.0.0.1",
220+
port: this.port,
221+
path: "/",
222+
timeout: 500,
223+
}, (res) => {
224+
res.resume();
225+
resolve(true);
226+
});
227+
req.on("error", () => resolve(false));
228+
req.on("timeout", () => { req.destroy(); resolve(false); });
229+
});
230+
}
231+
}

src/main/audio/whisper.ts

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ import * as fs from "fs";
55
import * as path from "path";
66
import * as os from "os";
77
import { buildWhisperPrompt, buildWhisperArgs } from "../../shared/constants";
8+
import { WhisperServer } from "./whisper-server";
9+
import log from "electron-log/main";
10+
11+
const slog = log.scope("Whisper");
812

913
export interface TranscriptionResult {
1014
text: string;
@@ -19,6 +23,42 @@ const WHISPER_BIN = path.join(
1923
process.platform === "win32" ? "whisper-cli.exe" : "main"
2024
);
2125

26+
// On non-macOS platforms, use a persistent whisper-server process to avoid
27+
// the ~500ms cold-start (model loading) on every transcription.
28+
const USE_SERVER = process.platform !== "darwin";
29+
let server: WhisperServer | null = null;
30+
31+
/**
32+
* Start the persistent whisper-server process (non-macOS only).
33+
* Call this once at app launch. No-ops on macOS.
34+
*/
35+
export async function initWhisperServer(modelPath: string): Promise<void> {
36+
if (!USE_SERVER) return;
37+
if (!modelPath || !fs.existsSync(modelPath)) {
38+
slog.warn("Skipping whisper-server start: no model", modelPath);
39+
return;
40+
}
41+
42+
try {
43+
if (!server) server = new WhisperServer();
44+
await server.start(modelPath);
45+
} catch (err) {
46+
slog.error("Failed to start whisper-server, falling back to CLI", err);
47+
server = null;
48+
}
49+
}
50+
51+
/**
52+
* Stop the persistent whisper-server process.
53+
* Call this on app quit.
54+
*/
55+
export async function shutdownWhisperServer(): Promise<void> {
56+
if (server) {
57+
await server.stop();
58+
server = null;
59+
}
60+
}
61+
2262
export async function transcribe(
2363
audioBuffer: Float32Array,
2464
sampleRate: number,
@@ -46,8 +86,15 @@ export async function transcribe(
4686
&& speechLanguages.length > 0
4787
? speechLanguages[0]
4888
: whisperArgs.language;
49-
const stdout = await runWhisper(modelPath, tempPath, prompt, language, temperature);
50-
const text = parseWhisperOutput(stdout);
89+
90+
let text: string;
91+
92+
if (USE_SERVER && server?.isReady()) {
93+
text = await server.transcribe(tempPath, language, prompt);
94+
} else {
95+
const stdout = await runWhisperCli(modelPath, tempPath, prompt, language, temperature);
96+
text = parseWhisperOutput(stdout);
97+
}
5198

5299
return { text };
53100
} finally {
@@ -64,14 +111,12 @@ const WHISPER_THREADS = process.platform === "darwin"
64111
? 4
65112
: Math.max(4, Math.floor(os.cpus().length * 0.75));
66113

67-
function runWhisper(modelPath: string, filePath: string, prompt: string, language = "auto", temperature?: number): Promise<string> {
114+
function runWhisperCli(modelPath: string, filePath: string, prompt: string, language = "auto", temperature?: number): Promise<string> {
68115
const args = [
69116
"-t", String(WHISPER_THREADS),
70117
"-l", language,
71118
"-m", modelPath,
72119
"-f", filePath,
73-
"--best-of", "5",
74-
"--beam-size", "5",
75120
"--entropy-thold", "2.0",
76121
"--prompt", prompt,
77122
];

0 commit comments

Comments
 (0)