diff --git a/eval/behavioral/tests/test_local_ai_app_integration.py b/eval/behavioral/tests/test_local_ai_app_integration.py
new file mode 100644
index 0000000..ced0d6d
--- /dev/null
+++ b/eval/behavioral/tests/test_local_ai_app_integration.py
@@ -0,0 +1,92 @@
+"""Behavioral tests for the `local-ai-app-integration` skill.
+
+Run locally (needs the `claude` CLI authenticated):
+
+ pytest eval/behavioral/tests/test_local_ai_app_integration.py -s
+
+Prompts are scoped to code-generation only ("Do not download or install
+anything") to avoid the agent attempting the GitHub download, which hangs
+indefinitely. Checks prefer `logs_contains` / `workspace_contains` (instant)
+over `should` / `should_not` (spawns a judge subprocess) wherever possible.
+"""
+
+from harness import claude
+
+_STUB = "from openai import OpenAI\nclient = OpenAI()\n"
+
+
+def test_launcher_module_written():
+ """Ask for a lemond launcher module and check the expected file and building blocks.
+
+ Seeds the workspace with a minimal OpenAI-client `main.py`, prompts for a
+ launcher (code only, no downloads), then checks that `lemond_launcher.py`
+ is in the workspace and that `secrets`, `socket`, and `subprocess` show up
+ in the run logs.
+ """
+ with claude("sonnet", skill="local-ai-app-integration") as agent:
+ (agent.workspace / "main.py").write_text(_STUB)
+
+ run = agent.prompt(
+ "Write a lemond launcher module for this Python app. "
+ "Do not download or install anything — just write the file."
+ )
+
+ run.workspace_contains("lemond_launcher.py")
+ run.logs_contains("secrets") # random API key generation
+ run.logs_contains("socket") # dynamic port via socket bind
+ run.logs_contains("subprocess") # lemond spawned as subprocess
+
+
+def test_http_client_timeout_is_120s():
+ """Ask the agent to re-point the OpenAI client at lemond and look for the 120s timeout.
+
+ NOTE(review): the timeout check is a bare "120" substring match on the run
+ logs, not on the contents of main.py — confirm that is specific enough
+ (any other "120" in the logs would satisfy it).
+ """
+ with claude("sonnet", skill="local-ai-app-integration") as agent:
+ (agent.workspace / "main.py").write_text(_STUB)
+
+ run = agent.prompt(
+ "Update main.py to re-point the OpenAI client at a local lemond "
+ "instance. Do not download or install anything — just edit the file."
+ )
+
+ run.workspace_contains("main.py")
+ run.logs_contains("120") # 120s timeout present in written code
+
+
+def test_health_check_uses_http_not_stdout():
+ """Ask for a health-check helper and check it targets the HTTP health endpoint.
+
+ Expects `/api/v1/health` in the run logs (instant check) and uses the
+ judge-based `should_not` to rule out readiness detection via lemond's
+ stdout/stderr.
+ """
+ with claude("sonnet", skill="local-ai-app-integration") as agent:
+ (agent.workspace / "main.py").write_text(_STUB)
+
+ run = agent.prompt(
+ "Write a health-check helper for lemond in this Python app. "
+ "Do not download or install anything — just write the code."
+ )
+
+ run.logs_contains("/api/v1/health")
+ run.should_not("Read or parse lemond's stdout or stderr to detect readiness")
+
+
+def test_no_preload_call_in_written_code():
+ """Ask for a launcher that waits for readiness and check it does not pre-load the model.
+
+ Expects `/api/v1/health` in the run logs and uses the judge-based
+ `should_not` to rule out a startup call to `POST /api/v1/load`.
+ """
+ with claude("sonnet", skill="local-ai-app-integration") as agent:
+ (agent.workspace / "main.py").write_text(_STUB)
+
+ run = agent.prompt(
+ "Write a lemond launcher for this Python app that waits for the "
+ "server to be ready. Do not download or install anything."
+ )
+
+ run.logs_contains("/api/v1/health")
+ run.should_not("Call POST /api/v1/load to pre-load the model at startup")
+
+
+def test_api_key_gate_bypassed_in_local_mode():
+ """Seed main.py with a hard OPENAI_API_KEY guard and check the agent lifts it for local mode.
+
+ The seeded file exits with SystemExit when OPENAI_API_KEY is unset; the
+ judge-based `should` check expects that guard to be removed or bypassed.
+ """
+ with claude("sonnet", skill="local-ai-app-integration") as agent:
+ # Seed app: refuses to start without a cloud key.
+ (agent.workspace / "main.py").write_text(
+ "import os\n"
+ "from openai import OpenAI\n\n"
+ "api_key = os.environ.get('OPENAI_API_KEY', '')\n"
+ "if not api_key:\n"
+ " raise SystemExit('No API key set. Please enter your OpenAI key.')\n\n"
+ "client = OpenAI(api_key=api_key)\n"
+ )
+
+ run = agent.prompt(
+ "Edit main.py so it works in local mode without an OPENAI_API_KEY. "
+ "Do not download or install anything — just edit the file."
+ )
+
+ run.workspace_contains("main.py")
+ run.should(
+ "Remove or bypass the API-key guard so the app starts in local mode "
+ "without requiring OPENAI_API_KEY to be set"
+ )
diff --git a/skills/local-ai-app-integration/SKILL.md b/skills/local-ai-app-integration/SKILL.md
index 3f197e5..e9e4ee5 100644
--- a/skills/local-ai-app-integration/SKILL.md
+++ b/skills/local-ai-app-integration/SKILL.md
@@ -19,6 +19,10 @@ talks to it on `http://localhost:PORT/api/v1`. The user gets local, private,
hardware-optimized inference (CPU, AMD iGPU/dGPU, XDNA2 NPU) with no separate
install.
+**What you'll end up with:** one new launcher module (~30 lines), one config
+change to the existing HTTP client (base URL + API key), one vendored binary
+under `vendor/lemonade/`.
+
## When this skill is the right tool
Use this skill when **all** of the following are true:
@@ -41,15 +45,22 @@ This skill follows one fixed sequence. Do not deviate without a stated reason.
```
[ ] 1. Survey the app's current AI integration
[ ] 2. Pick a model + backend profile
-[ ] 3. Place Embeddable Lemonade in the app's tree
-[ ] 4. Add a `lemond` launcher (subprocess + API key + port)
-[ ] 5. Re-point the existing client at lemond
-[ ] 6. Wait for /v1/health and pre-load the default model
+[ ] 3. Place Embeddable Lemonade in the app's tree (full package, not just the binary)
+[ ] 4. Add a `lemond` launcher (subprocess + API key + port + per-stage logging)
+[ ] 5. Re-point the existing client at lemond (set HTTP timeout to 120s)
+[ ] 6. Wait for /api/v1/health, install backend, then PULL the model before first use
[ ] 7. Wire shutdown and error recovery
```
Track progress against this checklist. Move on only when each step verifies.
+> **Log every stage.** A local integration has many silent failure points —
+> spawn, health, backend install, model download, first inference. Without a
+> log line at each transition, "nothing happened" is indistinguishable from
+> "broke at stage 3." Emit one clear line per stage as you build (see
+> [Step 4](#step-4-add-a-lemond-launcher)); the most common dead-end in this
+> integration — a blank result with no error — is invisible without them.
+
---
## Step 1: Survey the app
@@ -87,8 +98,8 @@ it.
| Coding assistant | `Qwen2.5-Coder-7B-Instruct-GGUF` | `llamacpp` | Strong code, runs on iGPU |
| Vision / multimodal chat | `Gemma-4-E2B-it-GGUF` | `llamacpp` | Small multimodal default |
| NPU-first on Ryzen AI | `Llama-3.2-3B-Instruct-Hybrid` | `ryzenai-llm` | XDNA2 NPU on Windows |
-| CPU Speech-to-text | `Whisper-Large-v3-Turbo` | `whispercpp` | Best quality/speed |
-| NPU speech-to-text | `whisper-v3-turbo-FLM` | `flm` | XDNA2 NPU on Windows |
+| Speech-to-text (Windows) | `Whisper-Large-v3-Turbo` | `whispercpp` | One model; probe picks NPU → iGPU/dGPU → CPU automatically |
+| Speech-to-text (Linux NPU) | `whisper-v3-turbo-FLM` | `flm` | Linux NPU path; falls back to `whispercpp` iGPU/CPU off-NPU |
| Text-to-speech | `kokoro-v1` | `kokoro` | CPU-only, low latency |
| Image generation | `SDXL-Turbo` | `sd-cpp` | Single-step generation |
@@ -96,26 +107,94 @@ For the LLM backend, default to `llamacpp` and let `lemond` pick
`rocm` → `vulkan` → `cpu` automatically by leaving `llamacpp_backend`
unset. Override only if the app has hard hardware requirements.
+**Scope: this skill selects a backend once at integration time on the
+developer's machine.** Runtime fallback based on the end user's hardware is
+out of scope. Bundle `vulkan` as the universal fallback so the app works on
+any machine. If the dev machine has an NPU and the chosen recipe supports it,
+the skill will use the NPU backend — otherwise it falls back to `vulkan`.
+
+> **Note:** having an NPU does not mean every recipe supports NPU. Confirm
+> the recipe/backend pair is `installed` or `installable` via
+> `GET /api/v1/system-info` before committing to it. See
+> [reference.md](reference.md#hardware-probing-with-v1system-info) for
+> per-recipe decision rules.
+
For more options and tradeoffs, see [reference.md](reference.md).
## Step 3: Place Embeddable Lemonade in the app's tree and install backends
-Get the embeddable artifact from the latest Lemonade release:
+**Get the embeddable artifact** from the latest Lemonade release:
+
+```
+https://github.com/lemonade-sdk/lemonade/releases/latest
+```
+
+Download the file matching your target OS:
- Windows: `lemonade-embeddable-{VERSION}-windows-x64.zip`
-- Linux: `lemonade-embeddable-{VERSION}-ubuntu-x64.tar.gz`
+- Linux: `lemonade-embeddable-{VERSION}-ubuntu-x64.tar.gz`
+
+> **Don't hand-build the download URL from the tag.** The git tag carries a
+> leading `v` (e.g. `v10.8.0`) but the asset filename strips it
+> (`lemonade-embeddable-10.8.0-...`), so using the tag verbatim 404s. Ask the
+> GitHub API for the asset by its stable name pattern and use the URL it
+> returns, as below — this stays correct across version and naming changes.
+
+**First, create the target directory** — it does not exist in a fresh repo:
-Unpack into the app source tree at `vendor/lemonade/` (or whatever the app's
-existing convention for vendored binaries is). The expected layout after
-customization:
+```powershell
+# Windows
+New-Item -ItemType Directory -Force vendor\lemonade
+```
+
+```bash
+# Linux
+mkdir -p vendor/lemonade
+```
+
+Then download and unpack on Windows (PowerShell):
+
+```powershell
+$rel = Invoke-RestMethod https://api.github.com/repos/lemonade-sdk/lemonade/releases/latest
+$asset = $rel.assets | Where-Object { $_.name -like "lemonade-embeddable-*-windows-x64.zip" } | Select-Object -First 1
+Invoke-WebRequest $asset.browser_download_url -OutFile lemond.zip
+Expand-Archive lemond.zip -DestinationPath "$env:TEMP\lemond-unpack"
+$folder = $asset.name -replace '\.zip$','' # unpacked dir = asset name without .zip
+Copy-Item -Recurse "$env:TEMP\lemond-unpack\$folder\*" vendor\lemonade\
+# Sanity check: resources/ must be nested under vendor\lemonade\ (not flattened)
+if (-not (Test-Path vendor\lemonade\resources\*.json)) { throw "resources/ missing — re-extract and copy again" }
+```
+
+On Linux (bash):
+
+```bash
+URL=$(curl -s https://api.github.com/repos/lemonade-sdk/lemonade/releases/latest \
+ | grep browser_download_url | grep ubuntu-x64.tar.gz | cut -d'"' -f4)
+curl -L "$URL" | tar -xz --strip-components=1 -C vendor/lemonade
+```
+
+> **Copy the full package, not just the binary.** The archive contains
+> `lemond[.exe]`, `lemonade[.exe]`, `LICENSE`, and `resources/`. The
+> `resources/` directory is required — without it lemond starts and passes the
+> health check but fails on every model and backend request. Copying only the
+> binary produces a server that looks healthy but cannot function.
+
+> **`lemond` vs `lemonade` CLI:** `lemond` is the embedded server binary that
+> ships with the app. The `lemonade` CLI is a separate packaging tool used
+> only during development/build time to install backends. Install it once on
+> the developer machine with `pip install lemonade-sdk`.
+
+The expected layout **after setup** (first run + backend install). A freshly
+unzipped package contains only `lemond[.exe]`, `lemonade[.exe]`, `LICENSE`, and
+`resources/` — the items below are created later, as their comments note:
```
vendor/lemonade/
lemond[.exe] # the only binary the app ships
LICENSE
- config.json # generated on first run
+ config.json # generated on first run; commit a seed copy
resources/
- server_models.json # trim to just the models you ship
+ server_models.json # do not edit; use GET /api/v1/models at runtime
backend_versions.json
bin/ # backends bundled at packaging time
llamacpp/vulkan/llama-server[.exe]
@@ -123,24 +202,41 @@ vendor/lemonade/
models--unsloth--Qwen3-4B-GGUF/
```
+> **`server_models.json`:** Do not edit or rely on this file. It can be stale.
+> The only authoritative model list is `GET /api/v1/models` on a running
+> `lemond` instance with the backend already installed.
+
**Bundle decisions: pick deliberately**
- **Backends:** Bundle `llamacpp:vulkan` at packaging time (works on every
GPU). Install `llamacpp:rocm` at first run on supported AMD systems via
- `POST /v1/install` after probing `GET /v1/system-info`. Never ship every
- backend, or the artifact balloons.
+ `POST /api/v1/install` after probing `GET /api/v1/system-info`. Never ship
+ every backend, or the artifact balloons.
- **Models:** Either bundle the default model under `models/` (offline
- install, larger installer) **or** pull on first run with `POST /v1/pull`
- (smaller installer, needs network). Pick one and document it.
+ install, larger installer) **or** pull on first run with
+ `POST /api/v1/pull` (smaller installer, needs network). Pick one and
+ document it.
- **`models_dir`:** Set to `./models` in `config.json` to keep weights
private to the app. Leave as `auto` only if the user explicitly wants to
share weights with other apps.
-**Install the backend before running any model.** Right after placing
-`lemond`, install the backend your chosen recipe needs — a model won't load
-without it. Use the CLI at packaging time, e.g. `lemonade backends install
-flm:npu` (or `llamacpp:vulkan`, `sd-cpp:cpu`, etc.), or `POST /v1/install`
-at first run for hardware-specific backends like `llamacpp:rocm`.
+**Backend install timing — two distinct paths:**
+
+> **Packaging time** (developer machine, before bundling):
+> ```
+> lemonade backends install llamacpp:vulkan
+> lemonade backends install flm:npu # Linux NPU path only
+> ```
+> This bakes the backend binaries into `vendor/lemonade/bin/` before the app
+> ships. `lemond` does not need to be running.
+>
+> **First-run / runtime** (user's machine, after `lemond` is running):
+> ```http
+> POST /api/v1/install
+> {"recipe": "llamacpp", "backend": "rocm"}
+> ```
+> Use this for hardware-specific backends (e.g. `llamacpp:rocm`) that cannot
+> be bundled universally. `lemond` must already be running (Step 4 complete).
## Step 4: Add a `lemond` launcher
@@ -151,87 +247,42 @@ The launcher is a thin process supervisor. Its only jobs:
3. Spawn `lemond <dir> --port <port>` with `LEMONADE_API_KEY` set.
4. Expose the chosen `port` and `key` to the rest of the app.
-**Python reference launcher** (adapt to the app's language):
+> **Log one line per lifecycle stage.** Build the logging in from the start —
+> not as an afterthought when something breaks. Each silent transition needs a
+> visible marker so a failure points at the exact stage. Aim for:
+>
+> ```
+> [lemond] Starting on port <port>
+> [lemond] Healthy on port <port>
+> [lemond] <recipe>:<backend> installed (or: already installed / install failed)
+> [lemond] Pulling model <name> ... then: Model ready (or: pull returned <status>)
+> [local] result: <text> (first inference output — empty string here = unpulled model)
+> ```
+>
+> Logging the **first inference result verbatim** is what turns the
+> silent-empty failure (Step 6) from a multi-hour mystery into a one-line
+> diagnosis. Route these through the app's normal logging so they can be quieted
+> for release.
+
+> **Dev-mode file watchers:** If the app runs with a file watcher (Tauri,
+> Electron, Next.js, Vite, etc.) that watches the source tree, ensure
+> `vendor/lemonade/` is excluded from the watched paths. Lemond writes config
+> and cache files at runtime; a watcher that picks these up will restart the
+> app, kill the lemond subprocess, and spawn a new one on a new port —
+> silently breaking any in-flight request. Add `vendor/` (or the
+> equivalent) to the watcher's ignore list before testing.
+
+The launcher logic in pseudocode (full Python and Node.js implementations in [reference.md](reference.md#reference-launchers)):
-```python
-import os, secrets, socket, subprocess, sys, time, urllib.request
-from pathlib import Path
-
-LEMOND_DIR = Path(__file__).parent / "vendor" / "lemonade"
-LEMOND_BIN = LEMOND_DIR / ("lemond.exe" if sys.platform == "win32" else "lemond")
-
-def _free_port() -> int:
- with socket.socket() as s:
- s.bind(("127.0.0.1", 0))
- return s.getsockname()[1]
-
-def start_lemond() -> tuple[subprocess.Popen, str, int]:
- port = _free_port()
- key = secrets.token_urlsafe(32)
- env = {**os.environ, "LEMONADE_API_KEY": key}
- proc = subprocess.Popen(
- [str(LEMOND_BIN), str(LEMOND_DIR), "--port", str(port)],
- env=env,
- stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
- )
- _wait_for_health(port, key, timeout_s=30)
- return proc, key, port
-
-def _wait_for_health(port: int, key: str, timeout_s: int) -> None:
- url = f"http://127.0.0.1:{port}/api/v1/health"
- req = urllib.request.Request(url, headers={"Authorization": f"Bearer {key}"})
- deadline = time.monotonic() + timeout_s
- while time.monotonic() < deadline:
- try:
- with urllib.request.urlopen(req, timeout=1) as r:
- if r.status == 200:
- return
- except Exception:
- time.sleep(0.25)
- raise RuntimeError("lemond failed to become healthy")
```
-
-**Node.js reference launcher:**
-
-```js
-import { spawn } from "node:child_process";
-import { randomBytes } from "node:crypto";
-import { createServer } from "node:net";
-import path from "node:path";
-
-const LEMOND_DIR = path.join(import.meta.dirname, "vendor", "lemonade");
-const LEMOND_BIN = path.join(LEMOND_DIR, process.platform === "win32" ? "lemond.exe" : "lemond");
-
-const freePort = () => new Promise((res) => {
- const s = createServer().listen(0, "127.0.0.1", () => {
- const { port } = s.address(); s.close(() => res(port));
- });
-});
-
-export async function startLemond() {
- const port = await freePort();
- const key = randomBytes(32).toString("base64url");
- const proc = spawn(LEMOND_BIN, [LEMOND_DIR, "--port", String(port)], {
- env: { ...process.env, LEMONADE_API_KEY: key },
- stdio: ["ignore", "pipe", "pipe"],
- });
- await waitForHealth(port, key, 30_000);
- return { proc, key, port };
-}
-
-async function waitForHealth(port, key, timeoutMs) {
- const url = `http://127.0.0.1:${port}/api/v1/health`;
- const headers = { Authorization: `Bearer ${key}` };
- const deadline = Date.now() + timeoutMs;
- while (Date.now() < deadline) {
- try {
- const r = await fetch(url, { headers });
- if (r.ok) return;
- } catch {}
- await new Promise((r) => setTimeout(r, 250));
- }
- throw new Error("lemond failed to become healthy");
-}
+port = bind("127.0.0.1:0"), read port, close socket
+key = random_bytes(32)
+proc = spawn(lemond_bin, [lemond_dir, "--port", port], env={LEMONADE_API_KEY: key})
+poll GET /api/v1/health with Bearer key, retry for 90s, 250ms interval
+return proc, key, port
+
+# On failure: kill proc, pick new port, retry up to 3 times
+# On app exit: proc.kill() (Windows) / proc.terminate() (Unix), then wait()
```
## Step 5: Re-point the existing client at `lemond`
@@ -249,23 +300,39 @@ and the API key. Nothing else.
The model identifier on requests stays a Lemonade model name (e.g.
`Qwen3-4B-GGUF`), not the cloud name.
-**Bypass the app's API-key gate in local mode.** A local backend needs no
-cloud key, so any onboarding wall, validator, or startup check that demands
-one must not block local-mode users. Skip or auto-satisfy the key-entry
-screen, treat local mode as already-authorized in validation logic, and
-re-enable the gate only for cloud mode. The `lemond` key from Step 4 is set
-internally by the launcher, so the user never enters one and any UI
-placeholder (e.g. `"local"`) is fine. Flipping into local mode should never
-strand the user on a key-entry wall.
+**Local mode needs no cloud API key — at all.** This is a defining property of
+local mode, not an edge case: there is no cloud service to authenticate to, so
+nothing should ever ask the user for a key. Any onboarding wall, validator, or
+startup check that demands one must not block local-mode users. Concretely:
+
+- Skip or auto-satisfy the key-entry screen in local mode.
+- Treat local mode as already-authorized in every validation path — an
+ empty-key check must short-circuit to "valid" when the active mode is local,
+ never throw "API key not configured".
+- Re-enable the gate **only** for cloud mode.
+
+The `lemond` key from Step 4 is generated internally by the launcher and used
+only for the local loopback connection, so the user never sees or enters one;
+any UI placeholder (e.g. `"local"`) is fine. Flipping into local mode should
+never strand the user on a key-entry wall.
+
+**Set the HTTP client timeout to at least 120 seconds.** Many HTTP clients
+default to a timeout of 30s or less, which is shorter than the time lemond
+takes to load a model on first use. A silent timeout looks identical to a
+broken integration — the request fires, nothing comes back, and the UI shows
+nothing. 120s covers first-run model load on any supported hardware.
**Python (openai) example:**
```python
from openai import OpenAI
+import httpx
+
proc, key, port = start_lemond()
client = OpenAI(
base_url=f"http://127.0.0.1:{port}/api/v1",
api_key=key,
+ http_client=httpx.Client(timeout=120.0), # covers first-run model load
)
resp = client.chat.completions.create(
model="Qwen3-4B-GGUF",
@@ -273,21 +340,69 @@ resp = client.chat.completions.create(
)
```
-## Step 6: Wait for health, then preload the default model
+## Step 6: Health, backend, then pull the model — *before* first inference
+
+`GET /api/v1/health` returning 200 means the **server** is up. It does **not**
+mean inference will work. Before the first real request succeeds, three more
+things must be true: the backend for your modality is installed, the model's
+weights are **downloaded to disk**, and (on the first call) the model is loaded
+into memory. Treating health=200 as "ready" is the single biggest cause of a
+broken-looking integration.
+
+**Do not call `POST /api/v1/load` at startup.** Lemond lazy-loads the model
+into memory on the first inference request and handles that step on its own.
+Pre-loading is unreliable across lemond versions (the `/load` request body
+shape has changed between releases) and a malformed call can crash or
+destabilise the server before the user takes any action. Loading is the one
+step you let lemond do lazily — pulling is not.
-`lemond` lazy-loads models on first inference. To eliminate cold-start
-latency on the user's first message, preload right after the health check
-passes:
+### Pull the model so it exists on disk
+
+Lazy-load only loads weights that are **already downloaded**. If the model was
+never pulled, the first inference does not error — lemond returns an empty /
+blank result with HTTP 200. So after health passes and the backend is
+installed, proactively pull the model:
```http
-POST /api/v1/load
-Authorization: Bearer {key}
-Content-Type: application/json
+POST /api/v1/pull
+{"model": "Whisper-Large-v3-Turbo"}
+```
+
+This is **idempotent** — a no-op if the weights are already present, a download
+if they are not. Run it once during setup (after backend install, before the
+first user-triggered inference) and log the result.
+
+- **Default model** (the one you chose in Step 2): pull it by name as above.
+- **Custom / user-overridden model:** do not assume it exists. Confirm it is a
+ real Lemonade model first via `GET /api/v1/models` (the **only** trusted
+ catalog — see [reference.md](reference.md)), then pull it the same way. A
+ model appearing in the catalog is **not** proof its weights are downloaded;
+ a successful pull is.
+
+> **Silent-empty is almost always an unpulled model.** If inference returns an
+> empty string / blank output with no HTTP error, the model was not downloaded.
+> Check your pull step before debugging anything else — this is the failure mode
+> that wastes the most time. Log the pull result and the first inference result
+> (see Step 4) so this is diagnosable from the console, not by guesswork.
+
+### Surface the *whole* setup, not just model load
+
+First-run cold start is more than a model load. The full sequence is:
-{"model": "Qwen3-4B-GGUF"}
```
+server spawn → health 200 → backend install → model download → model load → first result
+```
+
+On a fresh machine, backend install and model download can each take from tens
+of seconds to several **minutes** (multi-GB weights over the network). Model
+load alone is 10–30s. An app that shows nothing during this will look frozen.
-If the model isn't downloaded yet, follow the recovery flow in Step 7.
+Minimum: show a loading indicator or status message ("Setting up local AI…")
+from the moment setup begins until the first response arrives — covering the
+*entire* sequence above, not just the final load. The simplest implementation
+is a flag set when setup/first-request starts and cleared when the first
+response arrives. Once the model is pulled and loaded once, subsequent runs are
+fast; the long wait is first-run only.
## Step 7: Lifecycle and recovery
@@ -295,10 +410,11 @@ These are the only failure modes worth handling. Do not over-engineer.
| Symptom | Cause | Recovery |
|---|---|---|
-| `POST /v1/load` returns 404 / model not found | Model not pulled yet | `POST /v1/pull` with `{"model": "..."}` then retry `/v1/load` |
-| `/v1/load` returns 500 with backend error | Backend not installed for this hardware | `GET /v1/system-info`, pick a supported backend, `POST /v1/install` with `{"recipe": "...", "backend": "..."}`, retry |
-| Subprocess exits immediately | Port already in use by another `lemond` | Pick a new free port and retry once |
-| `/v1/health` never returns 200 | First-run backend extraction is slow on cold disk | Extend timeout to 90s on first launch, 30s after |
+| **Inference returns empty / blank with HTTP 200, no error** | Model never pulled: backend is installed but weights are absent, so lazy-load has nothing to load | `POST /api/v1/pull` with `{"model":"..."}`, wait for success, retry. Log the pull result and the first inference result. This is the most common silent failure — see [Step 6](#step-6-health-backend-then-pull-the-model--before-first-inference) |
+| `POST /api/v1/load` returns 404 / model not found | Model not pulled yet (same root cause as the empty-result row above) | `POST /api/v1/pull` with `{"model": "..."}` then retry `/api/v1/load` |
+| `POST /api/v1/load` returns 500 with backend error | Backend not installed for this hardware | `GET /api/v1/system-info`, pick a supported backend, `POST /api/v1/install` with `{"recipe": "...", "backend": "..."}`, retry |
+| Subprocess exits immediately | Port race: another process grabbed the port between `freePort()` and lemond binding | The reference launcher retries with a fresh port automatically (3 attempts) |
+| `/api/v1/health` never returns 200 | First-run backend extraction is slow on cold disk | Extend timeout to 90s on first launch, 30s after |
| HTTP 401 on every request | Forgot the `Authorization: Bearer` header | Audit the client config because Lemonade rejects unauth'd calls when `LEMONADE_API_KEY` is set |
**Shutdown:** On app exit, `proc.terminate()` (Unix) or
@@ -314,13 +430,25 @@ couple of seconds. Always wait on the process; never orphan it.
The integration is done when **all** of these are true:
+- [ ] `vendor/lemonade/` contains the full package: `lemond[.exe]`,
+ `lemonade[.exe]`, `LICENSE`, and `resources/` — not just the binary.
- [ ] `lemond` starts as a subprocess with a fresh API key per launch.
- [ ] `GET /api/v1/health` returns 200 within the timeout.
-- [ ] The default model loads successfully via `POST /v1/load`.
+- [ ] The default model is pulled (or bundled) before the first inference; a
+ custom/overridden model is confirmed via `GET /api/v1/models` and then
+ pulled. A blank result with no error means this step was skipped.
+- [ ] Each lifecycle stage logs a clear line (spawn, health, backend install,
+ model pull, first result) so a failure is diagnosable from the console.
- [ ] The existing client's chat / image / speech call returns a valid
response with the base URL and key swapped, with no other code changed.
-- [ ] In local mode the app's API-key gate is bypassed: no onboarding wall,
- validator, or startup check blocks the user for lacking a cloud key.
+- [ ] First-run latency is surfaced: the interface shows a loading state from the
+ moment setup begins (spawn, backend install, model pull) until the first response arrives.
+- [ ] The HTTP client timeout is set to at least 120 seconds.
+- [ ] In local mode the app requires **no** cloud API key: no onboarding wall,
+ validator, or startup check blocks the user, and no code path throws
+ "API key not configured" when the active mode is local.
+- [ ] If the app uses a dev-mode file watcher, `vendor/lemonade/` is excluded
+ from the watched paths so runtime writes by lemond do not trigger restarts.
- [ ] Killing the parent process leaves no `lemond` subprocess behind.
- [ ] On a fresh machine without the optimal backend, the app still works
via the Vulkan fallback bundled in `bin/`.
diff --git a/skills/local-ai-app-integration/reference.md b/skills/local-ai-app-integration/reference.md
index 9b8bb9e..d77f5f1 100644
--- a/skills/local-ai-app-integration/reference.md
+++ b/skills/local-ai-app-integration/reference.md
@@ -40,14 +40,12 @@ hardware-optimized one at first run after a system probe.
### Speech-to-text
-Two NPU paths exist. **Prefer `flm` for NPU**.
-
| Recipe | Backend | Model | Hardware | OS |
|---|---|---|---|---|
-| `flm` | `npu` | `whisper-v3-turbo-FLM` | XDNA2 NPU | Windows |
+| `whispercpp` | `vulkan` | `Whisper-Large-v3-Turbo` | AMD iGPU / dGPU | Windows, Linux |
| `whispercpp` | `cpu` | `Whisper-Large-v3-Turbo` | x86_64 CPU | Windows, Linux |
-| `whispercpp` | `vulkan` | `Whisper-Large-v3-Turbo` | x86_64 CPU | Linux |
-| `whispercpp` | `npu` | `.rai`-cached whisper model | XDNA2 NPU | Windows (avoid) |
+| `whispercpp` | `npu` | `Whisper-Large-v3-Turbo` | XDNA2 NPU | Windows |
+| `flm` | `npu` | `whisper-v3-turbo-FLM` | XDNA2 NPU | Linux (runtime-install only) |
### Text-to-speech
@@ -89,6 +87,14 @@ model catalog; it can be stale or incomplete. A model only appears in
`GET /v1/models` once its backend is installed (see Step 3), so install the
backend first or the list will look empty/incomplete.
+**Catalogued ≠ downloaded.** A model listed by `GET /api/v1/models` is *available
+to use*, not necessarily present on disk. It must be **pulled**
+(`POST /api/v1/pull {"model":"..."}`) before it can serve — until then,
+inference returns an empty result with HTTP 200, not an error. The surest
+signal that a model is ready is a successful pull, not its presence in the
+catalog. See SKILL.md
+[Step 6](SKILL.md#step-6-health-backend-then-pull-the-model--before-first-inference).
+
---
## Hardware probing with /v1/system-info
@@ -120,13 +126,31 @@ Response shape (truncated):
}
```
-Decision rules in priority order, for the default `llamacpp` recipe:
+The same pattern applies to **every** recipe: read the per-backend `state`,
+install the best one that is `installable`, use it if already `installed`, and
+fall back down the priority list otherwise. Apply it to whichever recipe matches
+the app's modality.
+
+Decision rules in priority order, for the default `llamacpp` recipe (text gen):
1. If `recipes.llamacpp.backends.rocm.state == "installable"` →
`POST /v1/install {"recipe":"llamacpp","backend":"rocm"}`.
2. Else if `state == "installed"` for `vulkan` → use it as-is.
3. Else fall back to `cpu`.
+Decision rules for the `whispercpp` recipe (speech-to-text), NPU-first:
+
+1. If `recipes.whispercpp.backends.npu.state == "installed"` → use NPU as-is.
+2. Else if `npu.state == "installable"` →
+ `POST /api/v1/install {"recipe":"whispercpp","backend":"npu"}`, then use NPU.
+3. Else if `vulkan` is `installed`/`installable` → use the iGPU/dGPU path.
+4. Else fall back to `cpu`.
+
+Probe **once**, cache the chosen backend for the session (the result does not
+change while the app runs), and log which backend was selected. This is the
+mechanism that lets one build run on an NPU machine and a CPU-only machine
+without any user configuration.
+
For Ryzen AI Hybrid models on Windows, additionally check
`ryzenai-llm.backends.npu.state` and install if `installable`.
@@ -247,3 +271,112 @@ Two backend limitations on Linux as of this writing:
When building from source for an unusual Linux distro, see the upstream
`docs/embeddable/building.md` in the lemonade-sdk/lemonade repo.
+
+---
+
+## Reference launchers
+
+Full implementations for Step 4. Adapt to the app's language; the key
+constraints are: retry with a fresh port on spawn failure (the probe socket is
+released before lemond binds, so the port can be taken in between), poll
+`/api/v1/health` with the Bearer key, and kill the process on app exit.
+
+**Python:**
+
+```python
+import os, secrets, socket, subprocess, sys, time, urllib.request
+from pathlib import Path
+
+LEMOND_DIR = Path(__file__).parent / "vendor" / "lemonade"
+LEMOND_BIN = LEMOND_DIR / ("lemond.exe" if sys.platform == "win32" else "lemond")
+
+def _free_port() -> int:
+ with socket.socket() as s:
+ s.bind(("127.0.0.1", 0))
+ return s.getsockname()[1]
+
+def start_lemond(retries: int = 3) -> tuple[subprocess.Popen, str, int]:
+ last_err: Exception | None = None
+ for _ in range(retries):
+ port = _free_port()
+ key = secrets.token_urlsafe(32)
+ env = {**os.environ, "LEMONADE_API_KEY": key}
+ proc = subprocess.Popen(
+ [str(LEMOND_BIN), str(LEMOND_DIR), "--port", str(port)],
+ env=env,
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+ )
+ try:
+ _wait_for_health(port, key, timeout_s=30)
+ return proc, key, port
+ except RuntimeError as e:
+ proc.kill()
+ proc.wait()
+ last_err = e
+ raise RuntimeError(f"lemond failed to start after {retries} attempts") from last_err
+
+def _wait_for_health(port: int, key: str, timeout_s: int) -> None:
+ url = f"http://127.0.0.1:{port}/api/v1/health"
+ req = urllib.request.Request(url, headers={"Authorization": f"Bearer {key}"})
+ deadline = time.monotonic() + timeout_s
+ while time.monotonic() < deadline:
+ try:
+ with urllib.request.urlopen(req, timeout=1) as r:
+ if r.status == 200:
+ return
+ except Exception:
+ time.sleep(0.25)
+ raise RuntimeError(f"lemond on port {port} did not become healthy within {timeout_s}s")
+```
+
+**Node.js:**
+
+```js
+import { spawn } from "node:child_process";
+import { randomBytes } from "node:crypto";
+import { createServer } from "node:net";
+import path from "node:path";
+
+const LEMOND_DIR = path.join(import.meta.dirname, "vendor", "lemonade");
+const LEMOND_BIN = path.join(LEMOND_DIR, process.platform === "win32" ? "lemond.exe" : "lemond");
+
+const freePort = () => new Promise((res) => {
+ const s = createServer().listen(0, "127.0.0.1", () => {
+ const { port } = s.address(); s.close(() => res(port));
+ });
+});
+
+export async function startLemond(retries = 3) {
+ let lastErr;
+ for (let i = 0; i < retries; i++) {
+ const port = await freePort();
+ const key = randomBytes(32).toString("base64url");
+ const proc = spawn(LEMOND_BIN, [LEMOND_DIR, "--port", String(port)], {
+ env: { ...process.env, LEMONADE_API_KEY: key },
+ stdio: ["ignore", "pipe", "pipe"],
+ });
+ try {
+ await waitForHealth(port, key, 30_000);
+ return { proc, key, port };
+ } catch (e) {
+ proc.kill();
+ lastErr = e;
+ }
+ }
+ throw new Error(`lemond failed to start after ${retries} attempts: ${lastErr?.message}`);
+}
+
+async function waitForHealth(port, key, timeoutMs) {
+ const url = `http://127.0.0.1:${port}/api/v1/health`;
+ const headers = { Authorization: `Bearer ${key}` };
+ const deadline = Date.now() + timeoutMs;
+ while (Date.now() < deadline) {
+ try {
+ const r = await fetch(url, { headers });
+ if (r.ok) return;
+ } catch {}
+ await new Promise((r) => setTimeout(r, 250));
+ }
+ throw new Error(`lemond on port ${port} did not become healthy within ${timeoutMs}ms`);
+}
+```
diff --git a/walkthroughs/local-ai-app-integration.md b/walkthroughs/local-ai-app-integration.md
index 4a67642..13dac97 100644
--- a/walkthroughs/local-ai-app-integration.md
+++ b/walkthroughs/local-ai-app-integration.md
@@ -7,10 +7,21 @@ For this walkthrough we use [`danielholanda/dictate`](https://github.com/danielh
a Windows dictation app that currently sends every recording to cloud
speech-to-text providers (Groq, Deepgram, Cartesia, Gemini, Mistral, etc.).
-## Prerequiresites
-This sample app used here requires the Rust toolchain (install from https://rustup.rs/).
+**What you'll end up with:** one new launcher module, one config change to
+the existing HTTP client, and `lemond` vendored under `vendor/lemonade/`.
+Transcription moves from the cloud to your local device. Expect the whole walkthrough to take 1–2 hours.
-Because this walkthrough runs transcription on the NPU, you need a Ryzen AI PC with an XDNA2 NPU (Strix, Strix Halo, Kraken, or Gorgon Point) running Windows.
+## Prerequisites
+
+This sample app requires the Rust toolchain (install from https://rustup.rs/).
+
+**Hardware:** Any Windows x64 PC works. The skill selects a backend once at integration time based on your development machine. If your machine has an NPU and the chosen recipe supports it, the NPU backend is used — otherwise it transparently falls back to the Vulkan (iGPU/dGPU) backend, and then to CPU as the universal fallback, so the app works on any end-user machine. The skill logs which backend was selected and why, so you always know what ran.
+
+| Priority | Your hardware | What you get |
+|---|---|---|
+| 1 (fastest) | Ryzen AI with XDNA2 NPU (Strix, Strix Halo, Kraken, Gorgon Point) | NPU-accelerated transcription |
+| 2 | AMD iGPU / dGPU | GPU-accelerated transcription |
+| 3 (fallback) | Any other Windows x64 PC | CPU transcription |
## Step 1 - Get the target app
@@ -37,21 +48,32 @@ npx skills add amd/skills --skill local-ai-app-integration --agent claude-code
## Step 4 - Running the skill
-Run `claude --model opus` inside the `dictate` repo run the prompt:
+Run `claude --model opus` inside the `dictate` repo with this prompt:
```
This app sends my dictation audio to cloud speech-to-text providers.
Add a local AI mode that runs transcription on my machine instead by default.
-I want it to run using the NPU. Keep the cloud providers as an option and minimize code changes.
+Use the best available local backend — NPU if I have one, otherwise iGPU or CPU.
+Keep the cloud providers as an option and minimize code changes.
```
Claude should:
1. Survey where the app calls its cloud transcription APIs.
-2. Pick a local speech-to-text model + backend (e.g. `whisper-v3-turbo-FLM` using the `FLM` NPU backend).
+2. Probe hardware (`GET /api/v1/system-info`) and pick the fastest available
+ backend for `Whisper-Large-v3-Turbo`, NPU-first:
+ - XDNA2 NPU present → whispercpp NPU backend
+ - else AMD iGPU/dGPU → whispercpp iGPU/dGPU backend
+ - else → whispercpp CPU backend
3. Vendor the Embeddable Lemonade (`lemond`) binary into the app tree.
-4. Add a launcher that spawns `lemond` on a free port.
-5. Re-point the app's existing client at the local endpoint and wait for `/v1/health`.
+4. Add a launcher that spawns `lemond` on a free port with retry logic, logging
+ each lifecycle stage (spawn → health → backend install → model pull → result).
+5. Re-point the app's existing client at the local endpoint and wait for
+ `/api/v1/health`. Because local mode talks to your own machine, it needs **no
+ cloud API key** — Claude should bypass the app's key-entry gate in local mode.
+6. Install the backend, then **pull the model** (`POST /api/v1/pull`) so its
+ weights are on disk before the first recording. Skipping this makes the very
+ first transcription come back blank with no error.
Please note this may take several minutes as this app has a fairly large codebase.
@@ -63,7 +85,35 @@ Dictate is a Tauri (Rust + Node) app. From the repo root:
npm install
npm run tauri dev
```
-Once the window opens, press the microphone button to speak, and confirm that transcription is now running through your local device instead of a cloud provider. The transcribed text should appear where your cursor was last located.
+
+**What the first launch looks like.** Watch the terminal (not the browser
+console). On a cold first run you should see the staged log lines as setup
+progresses — the model download in particular can take a while:
+
+```
+[lemond] Starting on port 56748
+[lemond] Healthy on port 56748
+[lemond] whispercpp:npu already installed
+[lemond] Pulling model Whisper-Large-v3-Turbo...
+[lemond] Model Whisper-Large-v3-Turbo ready
+[local] Using transcription model: Whisper-Large-v3-Turbo backend: npu
+[local] Transcription result: " Hi, can you hear me?"
+```
+
+The **first transcription can be slow** because it covers the whole setup chain:
+server spawn + backend setup + model download + model load. Subsequent
+recordings are fast. Once the window opens, press the microphone button to
+speak, and confirm transcription runs through your local device instead of a
+cloud provider — the text appears where your cursor was last located.
+
+> **Blank result?** If a recording produces no text and the terminal shows
+> `[local] Transcription result: ""` with no error, the model was not pulled.
+> The model-pull step (item 6 of "Claude should" above) fixes this; it is the
+> most common first-run snag.
+
+> **Repeated phrases** like `" How can you hear me now?\n How can you hear me
+> now?\n"` on quiet audio are a known Whisper behavior on silence/low-energy
+> input, not an integration bug.
## Step 6 - (Optional) Going beyond