diff --git a/eval/behavioral/tests/test_local_ai_app_integration.py b/eval/behavioral/tests/test_local_ai_app_integration.py
new file mode 100644
index 0000000..ced0d6d
--- /dev/null
+++ b/eval/behavioral/tests/test_local_ai_app_integration.py
@@ -0,0 +1,92 @@
+"""Behavioral tests for the `local-ai-app-integration` skill.
+
+Run locally (needs the `claude` CLI authenticated):
+
+    pytest eval/behavioral/tests/test_local_ai_app_integration.py -s
+
+Prompts are scoped to code-generation only ("Do not download or install
+anything") to avoid the agent attempting the GitHub download, which hangs
+indefinitely. Checks prefer `logs_contains` / `workspace_contains` (instant)
+over `should` / `should_not` (spawns a judge subprocess) wherever possible.
+"""
+
+from harness import claude
+
+_STUB = "from openai import OpenAI\nclient = OpenAI()\n"
+
+
+def test_launcher_module_written():
+    with claude("sonnet", skill="local-ai-app-integration") as agent:
+        (agent.workspace / "main.py").write_text(_STUB)
+
+        run = agent.prompt(
+            "Write a lemond launcher module for this Python app. "
+            "Do not download or install anything — just write the file."
+        )
+
+        run.workspace_contains("lemond_launcher.py")
+        run.logs_contains("secrets")      # random API key generation
+        run.logs_contains("socket")       # dynamic port via socket bind
+        run.logs_contains("subprocess")   # lemond spawned as subprocess
+
+
+def test_http_client_timeout_is_120s():
+    with claude("sonnet", skill="local-ai-app-integration") as agent:
+        (agent.workspace / "main.py").write_text(_STUB)
+
+        run = agent.prompt(
+            "Update main.py to re-point the OpenAI client at a local lemond "
+            "instance. Do not download or install anything — just edit the file."
+        )
+
+        run.workspace_contains("main.py")
+        run.logs_contains("120")          # 120s timeout present in written code
+
+
+def test_health_check_uses_http_not_stdout():
+    with claude("sonnet", skill="local-ai-app-integration") as agent:
+        (agent.workspace / "main.py").write_text(_STUB)
+
+        run = agent.prompt(
+            "Write a health-check helper for lemond in this Python app. "
+            "Do not download or install anything — just write the code."
+        )
+
+        run.logs_contains("/api/v1/health")
+        run.should_not("Read or parse lemond's stdout or stderr to detect readiness")
+
+
+def test_no_preload_call_in_written_code():
+    with claude("sonnet", skill="local-ai-app-integration") as agent:
+        (agent.workspace / "main.py").write_text(_STUB)
+
+        run = agent.prompt(
+            "Write a lemond launcher for this Python app that waits for the "
+            "server to be ready. Do not download or install anything."
+        )
+
+        run.logs_contains("/api/v1/health")
+        run.should_not("Call POST /api/v1/load to pre-load the model at startup")
+
+
+def test_api_key_gate_bypassed_in_local_mode():
+    with claude("sonnet", skill="local-ai-app-integration") as agent:
+        (agent.workspace / "main.py").write_text(
+            "import os\n"
+            "from openai import OpenAI\n\n"
+            "api_key = os.environ.get('OPENAI_API_KEY', '')\n"
+            "if not api_key:\n"
+            "    raise SystemExit('No API key set. Please enter your OpenAI key.')\n\n"
+            "client = OpenAI(api_key=api_key)\n"
+        )
+
+        run = agent.prompt(
+            "Edit main.py so it works in local mode without an OPENAI_API_KEY. "
+            "Do not download or install anything — just edit the file."
+        )
+
+        run.workspace_contains("main.py")
+        run.should(
+            "Remove or bypass the API-key guard so the app starts in local mode "
+            "without requiring OPENAI_API_KEY to be set"
+        )
diff --git a/skills/local-ai-app-integration/SKILL.md b/skills/local-ai-app-integration/SKILL.md
index 3f197e5..e9e4ee5 100644
--- a/skills/local-ai-app-integration/SKILL.md
+++ b/skills/local-ai-app-integration/SKILL.md
@@ -19,6 +19,10 @@ talks to it on `http://localhost:PORT/api/v1`. The user gets local, private,
 hardware-optimized inference (CPU, AMD iGPU/dGPU, XDNA2 NPU) with no separate
 install.
 
+**What you'll end up with:** one new launcher module (~30 lines), one config
+change to the existing HTTP client (base URL + API key), one vendored binary
+under `vendor/lemonade/`.
+
 ## When this skill is the right tool
 
 Use this skill when **all** of the following are true:
@@ -41,15 +45,22 @@ This skill follows one fixed sequence. Do not deviate without a stated reason.
 ```
 [ ] 1. Survey the app's current AI integration
 [ ] 2. Pick a model + backend profile
-[ ] 3. Place Embeddable Lemonade in the app's tree
-[ ] 4. Add a `lemond` launcher (subprocess + API key + port)
-[ ] 5. Re-point the existing client at lemond
-[ ] 6. Wait for /v1/health and pre-load the default model
+[ ] 3. Place Embeddable Lemonade in the app's tree (full package, not just the binary)
+[ ] 4. Add a `lemond` launcher (subprocess + API key + port + per-stage logging)
+[ ] 5. Re-point the existing client at lemond (set HTTP timeout to 120s)
+[ ] 6. Wait for /api/v1/health, install backend, then PULL the model before first use
 [ ] 7. Wire shutdown and error recovery
 ```
 
 Track progress against this checklist. Move on only when each step verifies.
 
+> **Log every stage.** A local integration has many silent failure points —
+> spawn, health, backend install, model download, first inference. Without a
+> log line at each transition, "nothing happened" is indistinguishable from
+> "broke at stage 3." Emit one clear line per stage as you build (see
+> [Step 4](#step-4-add-a-lemond-launcher)); the most common dead-end in this
+> integration — a blank result with no error — is invisible without them.
+
 ---
 
 ## Step 1: Survey the app
@@ -87,8 +98,8 @@ it.
 | Coding assistant | `Qwen2.5-Coder-7B-Instruct-GGUF` | `llamacpp` | Strong code, runs on iGPU |
 | Vision / multimodal chat | `Gemma-4-E2B-it-GGUF` | `llamacpp` | Small multimodal default |
 | NPU-first on Ryzen AI | `Llama-3.2-3B-Instruct-Hybrid` | `ryzenai-llm` | XDNA2 NPU on Windows |
-| CPU Speech-to-text | `Whisper-Large-v3-Turbo` | `whispercpp` | Best quality/speed |
-| NPU speech-to-text | `whisper-v3-turbo-FLM` | `flm` | XDNA2 NPU on Windows |
+| Speech-to-text (Windows) | `Whisper-Large-v3-Turbo` | `whispercpp` | One model; probe picks NPU → iGPU/dGPU → CPU automatically |
+| Speech-to-text (Linux NPU) | `whisper-v3-turbo-FLM` | `flm` | Linux NPU path; falls back to `whispercpp` iGPU/CPU off-NPU |
 | Text-to-speech | `kokoro-v1` | `kokoro` | CPU-only, low latency |
 | Image generation | `SDXL-Turbo` | `sd-cpp` | Single-step generation |
 
@@ -96,26 +107,94 @@ For the LLM backend, default to `llamacpp` and let `lemond` pick
 `rocm` → `vulkan` → `cpu` automatically by leaving `llamacpp_backend`
 unset. Override only if the app has hard hardware requirements.
 
+**Scope: this skill selects a backend once at integration time on the
+developer's machine.** Runtime fallback based on the end user's hardware is
+out of scope. Bundle `vulkan` as the universal fallback so the app works on
+any machine. If the dev machine has an NPU and the chosen recipe supports it,
+the skill will use the NPU backend — otherwise it falls back to `vulkan`.
+
+> **Note:** having an NPU does not mean every recipe supports NPU. Confirm
+> the recipe/backend pair is `installed` or `installable` via
+> `GET /api/v1/system-info` before committing to it. See
+> [reference.md](reference.md#hardware-probing-with-v1system-info) for
+> per-recipe decision rules.
+
 For more options and tradeoffs, see [reference.md](reference.md).
 
 ## Step 3: Place Embeddable Lemonade in the app's tree and install backends
 
-Get the embeddable artifact from the latest Lemonade release:
+**Get the embeddable artifact** from the latest Lemonade release:
+
+```
+https://github.com/lemonade-sdk/lemonade/releases/latest
+```
+
+Download the file matching your target OS:
 
 - Windows: `lemonade-embeddable-{VERSION}-windows-x64.zip`
-- Linux: `lemonade-embeddable-{VERSION}-ubuntu-x64.tar.gz`
+- Linux:   `lemonade-embeddable-{VERSION}-ubuntu-x64.tar.gz`
+
+> **Don't hand-build the download URL from the tag.** The git tag carries a
+> leading `v` (e.g. `v10.8.0`) but the asset filename strips it
+> (`lemonade-embeddable-10.8.0-...`), so using the tag verbatim 404s. Ask the
+> GitHub API for the asset by its stable name pattern and use the URL it
+> returns, as below — this stays correct across version and naming changes.
+
+**First, create the target directory** — it does not exist in a fresh repo:
 
-Unpack into the app source tree at `vendor/lemonade/` (or whatever the app's
-existing convention for vendored binaries is). The expected layout after
-customization:
+```powershell
+# Windows
+New-Item -ItemType Directory -Force vendor\lemonade
+```
+
+```bash
+# Linux
+mkdir -p vendor/lemonade
+```
+
+Then download and unpack on Windows (PowerShell):
+
+```powershell
+$rel = Invoke-RestMethod https://api.github.com/repos/lemonade-sdk/lemonade/releases/latest
+$asset = $rel.assets | Where-Object { $_.name -like "lemonade-embeddable-*-windows-x64.zip" } | Select-Object -First 1
+Invoke-WebRequest $asset.browser_download_url -OutFile lemond.zip
+Expand-Archive lemond.zip -DestinationPath "$env:TEMP\lemond-unpack"
+$folder = $asset.name -replace '\.zip$',''   # unpacked dir = asset name without .zip
+Copy-Item -Recurse "$env:TEMP\lemond-unpack\$folder\*" vendor\lemonade\
+# Sanity check: resources/ must be nested under vendor\lemonade\ (not flattened)
+if (-not (Test-Path vendor\lemonade\resources\*.json)) { throw "resources/ missing — re-extract and copy again" }
+```
+
+On Linux (bash):
+
+```bash
+URL=$(curl -s https://api.github.com/repos/lemonade-sdk/lemonade/releases/latest \
+  | grep browser_download_url | grep 'lemonade-embeddable-.*-ubuntu-x64\.tar\.gz' | head -n 1 | cut -d'"' -f4)  # first embeddable Ubuntu asset only
+curl -L "$URL" | tar -xz --strip-components=1 -C vendor/lemonade
+```
+
+> **Copy the full package, not just the binary.** The archive contains
+> `lemond[.exe]`, `lemonade[.exe]`, `LICENSE`, and `resources/`. The
+> `resources/` directory is required — without it lemond starts and passes the
+> health check but fails on every model and backend request. Copying only the
+> binary produces a server that looks healthy but cannot function.
+
+> **`lemond` vs `lemonade` CLI:** `lemond` is the embedded server binary that
+> ships with the app. The `lemonade` CLI is a separate packaging tool used
+> only during development/build time to install backends. Install it once on
+> the developer machine with `pip install lemonade-sdk`.
+
+Below is the expected layout **after setup** (first run + backend install). A freshly
+unzipped package contains only `lemond[.exe]`, `lemonade[.exe]`, `LICENSE`, and
+`resources/` — the items below are created later, as their comments note:
 
 ```
 vendor/lemonade/
   lemond[.exe]                     # the only binary the app ships
   LICENSE
-  config.json                      # generated on first run
+  config.json                      # generated on first run; commit a seed copy
   resources/
-    server_models.json             # trim to just the models you ship
+    server_models.json             # do not edit; use GET /api/v1/models at runtime
     backend_versions.json
   bin/                             # backends bundled at packaging time
     llamacpp/vulkan/llama-server[.exe]
@@ -123,24 +202,41 @@ vendor/lemonade/
     models--unsloth--Qwen3-4B-GGUF/
 ```
 
+> **`server_models.json`:** Do not edit or rely on this file. It can be stale.
+> The only authoritative model list is `GET /api/v1/models` on a running
+> `lemond` instance with the backend already installed.
+
 **Bundle decisions: pick deliberately**
 
 - **Backends:** Bundle `llamacpp:vulkan` at packaging time (works on every
   GPU). Install `llamacpp:rocm` at first run on supported AMD systems via
-  `POST /v1/install` after probing `GET /v1/system-info`. Never ship every
-  backend, or the artifact balloons.
+  `POST /api/v1/install` after probing `GET /api/v1/system-info`. Never ship
+  every backend, or the artifact balloons.
 - **Models:** Either bundle the default model under `models/` (offline
-  install, larger installer) **or** pull on first run with `POST /v1/pull`
-  (smaller installer, needs network). Pick one and document it.
+  install, larger installer) **or** pull on first run with
+  `POST /api/v1/pull` (smaller installer, needs network). Pick one and
+  document it.
 - **`models_dir`:** Set to `./models` in `config.json` to keep weights
   private to the app. Leave as `auto` only if the user explicitly wants to
   share weights with other apps.
 
-**Install the backend before running any model.** Right after placing
-`lemond`, install the backend your chosen recipe needs — a model won't load
-without it. Use the CLI at packaging time, e.g. `lemonade backends install
-flm:npu` (or `llamacpp:vulkan`, `sd-cpp:cpu`, etc.), or `POST /v1/install`
-at first run for hardware-specific backends like `llamacpp:rocm`.
+**Backend install timing — two distinct paths:**
+
+> **Packaging time** (developer machine, before bundling):
+> ```
+> lemonade backends install llamacpp:vulkan
+> lemonade backends install flm:npu    # NPU path only; check reference.md for OS support
+> ```
+> This bakes the backend binaries into `vendor/lemonade/bin/` before the app
+> ships. `lemond` does not need to be running.
+>
+> **First-run / runtime** (user's machine, after `lemond` is running):
+> ```http
+> POST /api/v1/install
+> {"recipe": "llamacpp", "backend": "rocm"}
+> ```
+> Use this for hardware-specific backends (e.g. `llamacpp:rocm`) that cannot
+> be bundled universally. `lemond` must already be running (Step 4 complete).
 
 ## Step 4: Add a `lemond` launcher
 
@@ -151,87 +247,42 @@ The launcher is a thin process supervisor. Its only jobs:
 3. Spawn `lemond <dir> --port <port>` with `LEMONADE_API_KEY` set.
 4. Expose the chosen `port` and `key` to the rest of the app.
 
-**Python reference launcher** (adapt to the app's language):
+> **Log one line per lifecycle stage.** Build the logging in from the start —
+> not as an afterthought when something breaks. Each silent transition needs a
+> visible marker so a failure points at the exact stage. Aim for:
+>
+> ```
+> [lemond] Starting on port <port>
+> [lemond] Healthy on port <port>
+> [lemond] <recipe>:<backend> installed        (or: already installed / install failed)
+> [lemond] Pulling model <name>...             then: Model <name> ready  (or: pull returned <status>)
+> [local]  <modality> result: <value>          (first inference output — empty string here = unpulled model)
+> ```
+>
+> Logging the **first inference result verbatim** is what turns the
+> silent-empty failure (Step 6) from a multi-hour mystery into a one-line
+> diagnosis. Route these through the app's normal logging so they can be quieted
+> for release.
+
+> **Dev-mode file watchers:** If the app runs with a file watcher (Tauri,
+> Electron, Next.js, Vite, etc.) that watches the source tree, ensure
+> `vendor/lemonade/` is excluded from the watched paths. Lemond writes config
+> and cache files at runtime; a watcher that picks these up will restart the
+> app, kill the lemond subprocess, and spawn a new one on a new port —
+> silently breaking any in-flight request. Add `vendor/` (or the
+> equivalent) to the watcher's ignore list before testing.
+
+The launcher logic in pseudocode (full Python and Node.js implementations in [reference.md](reference.md#reference-launchers)):
 
-```python
-import os, secrets, socket, subprocess, sys, time, urllib.request
-from pathlib import Path
-
-LEMOND_DIR = Path(__file__).parent / "vendor" / "lemonade"
-LEMOND_BIN = LEMOND_DIR / ("lemond.exe" if sys.platform == "win32" else "lemond")
-
-def _free_port() -> int:
-    with socket.socket() as s:
-        s.bind(("127.0.0.1", 0))
-        return s.getsockname()[1]
-
-def start_lemond() -> tuple[subprocess.Popen, str, int]:
-    port = _free_port()
-    key = secrets.token_urlsafe(32)
-    env = {**os.environ, "LEMONADE_API_KEY": key}
-    proc = subprocess.Popen(
-        [str(LEMOND_BIN), str(LEMOND_DIR), "--port", str(port)],
-        env=env,
-        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
-    )
-    _wait_for_health(port, key, timeout_s=30)
-    return proc, key, port
-
-def _wait_for_health(port: int, key: str, timeout_s: int) -> None:
-    url = f"http://127.0.0.1:{port}/api/v1/health"
-    req = urllib.request.Request(url, headers={"Authorization": f"Bearer {key}"})
-    deadline = time.monotonic() + timeout_s
-    while time.monotonic() < deadline:
-        try:
-            with urllib.request.urlopen(req, timeout=1) as r:
-                if r.status == 200:
-                    return
-        except Exception:
-            time.sleep(0.25)
-    raise RuntimeError("lemond failed to become healthy")
 ```
-
-**Node.js reference launcher:**
-
-```js
-import { spawn } from "node:child_process";
-import { randomBytes } from "node:crypto";
-import { createServer } from "node:net";
-import path from "node:path";
-
-const LEMOND_DIR = path.join(import.meta.dirname, "vendor", "lemonade");
-const LEMOND_BIN = path.join(LEMOND_DIR, process.platform === "win32" ? "lemond.exe" : "lemond");
-
-const freePort = () => new Promise((res) => {
-  const s = createServer().listen(0, "127.0.0.1", () => {
-    const { port } = s.address(); s.close(() => res(port));
-  });
-});
-
-export async function startLemond() {
-  const port = await freePort();
-  const key = randomBytes(32).toString("base64url");
-  const proc = spawn(LEMOND_BIN, [LEMOND_DIR, "--port", String(port)], {
-    env: { ...process.env, LEMONADE_API_KEY: key },
-    stdio: ["ignore", "pipe", "pipe"],
-  });
-  await waitForHealth(port, key, 30_000);
-  return { proc, key, port };
-}
-
-async function waitForHealth(port, key, timeoutMs) {
-  const url = `http://127.0.0.1:${port}/api/v1/health`;
-  const headers = { Authorization: `Bearer ${key}` };
-  const deadline = Date.now() + timeoutMs;
-  while (Date.now() < deadline) {
-    try {
-      const r = await fetch(url, { headers });
-      if (r.ok) return;
-    } catch {}
-    await new Promise((r) => setTimeout(r, 250));
-  }
-  throw new Error("lemond failed to become healthy");
-}
+port  = bind("127.0.0.1:0"), read port, close socket
+key   = random_bytes(32)
+proc  = spawn(lemond_bin, [lemond_dir, "--port", port], env={LEMONADE_API_KEY: key})
+poll  GET /api/v1/health with Bearer key, retry for 90s, 250ms interval
+return proc, key, port
+
+# On failure: kill proc, pick new port, retry up to 3 times
+# On app exit: proc.kill() (Windows) / proc.terminate() (Unix), then wait()
 ```
 
 ## Step 5: Re-point the existing client at `lemond`
@@ -249,23 +300,39 @@ and the API key. Nothing else.
 The model identifier on requests stays a Lemonade model name (e.g.
 `Qwen3-4B-GGUF`), not the cloud name.
 
-**Bypass the app's API-key gate in local mode.** A local backend needs no
-cloud key, so any onboarding wall, validator, or startup check that demands
-one must not block local-mode users. Skip or auto-satisfy the key-entry
-screen, treat local mode as already-authorized in validation logic, and
-re-enable the gate only for cloud mode. The `lemond` key from Step 4 is set
-internally by the launcher, so the user never enters one and any UI
-placeholder (e.g. `"local"`) is fine. Flipping into local mode should never
-strand the user on a key-entry wall.
+**Local mode needs no cloud API key — at all.** This is a defining property of
+local mode, not an edge case: there is no cloud service to authenticate to, so
+nothing should ever ask the user for a key. Any onboarding wall, validator, or
+startup check that demands one must not block local-mode users. Concretely:
+
+- Skip or auto-satisfy the key-entry screen in local mode.
+- Treat local mode as already-authorized in every validation path — an
+  empty-key check must short-circuit to "valid" when the active mode is local,
+  never throw "API key not configured".
+- Re-enable the gate **only** for cloud mode.
+
+The `lemond` key from Step 4 is generated internally by the launcher and used
+only for the local loopback connection, so the user never sees or enters one;
+any UI placeholder (e.g. `"local"`) is fine. Flipping into local mode should
+never strand the user on a key-entry wall.
+
+**Set the HTTP client timeout to at least 120 seconds.** Many HTTP clients
+default to a timeout of 30s or less, which is shorter than the time lemond
+takes to load a model on first use. A silent timeout looks identical to a
+broken integration — the request fires, nothing comes back, and the UI shows
+nothing. 120s covers first-run model load on any supported hardware.
 
 **Python (openai) example:**
 
 ```python
 from openai import OpenAI
+import httpx
+
 proc, key, port = start_lemond()
 client = OpenAI(
     base_url=f"http://127.0.0.1:{port}/api/v1",
     api_key=key,
+    http_client=httpx.Client(timeout=120.0),  # covers first-run model load
 )
 resp = client.chat.completions.create(
     model="Qwen3-4B-GGUF",
@@ -273,21 +340,69 @@ resp = client.chat.completions.create(
 )
 ```
 
-## Step 6: Wait for health, then preload the default model
+## Step 6: Health, backend, then pull the model — *before* first inference
+
+`GET /api/v1/health` returning 200 means the **server** is up. It does **not**
+mean inference will work. Before the first real request succeeds, three more
+things must be true: the backend for your modality is installed, the model's
+weights are **downloaded to disk**, and (on the first call) the model is loaded
+into memory. Treating health=200 as "ready" is the single biggest cause of a
+broken-looking integration.
+
+**Do not call `POST /api/v1/load` at startup.** Lemond lazy-loads the model
+into memory on the first inference request and handles that step on its own.
+Pre-loading is unreliable across lemond versions (the `/load` request body
+shape has changed between releases) and a malformed call can crash or
+destabilise the server before the user takes any action. Loading is the one
+step you let lemond do lazily — pulling is not.
 
-`lemond` lazy-loads models on first inference. To eliminate cold-start
-latency on the user's first message, preload right after the health check
-passes:
+### Pull the model so it exists on disk
+
+Lazy-load only loads weights that are **already downloaded**. If the model was
+never pulled, the first inference does not error — lemond returns an empty /
+blank result with HTTP 200. So after health passes and the backend is
+installed, proactively pull the model:
 
 ```http
-POST /api/v1/load
-Authorization: Bearer {key}
-Content-Type: application/json
+POST /api/v1/pull
+{"model": "Whisper-Large-v3-Turbo"}
+```
+
+This is **idempotent** — a no-op if the weights are already present, a download
+if they are not. Run it once during setup (after backend install, before the
+first user-triggered inference) and log the result.
+
+- **Default model** (the one you chose in Step 2): pull it by name as above.
+- **Custom / user-overridden model:** do not assume it exists. Confirm it is a
+  real Lemonade model first via `GET /api/v1/models` (the **only** trusted
+  catalog — see [reference.md](reference.md)), then pull it the same way. A
+  model appearing in the catalog is **not** proof its weights are downloaded;
+  a successful pull is.
+
+> **Silent-empty is almost always an unpulled model.** If inference returns an
+> empty string / blank output with no HTTP error, the model was not downloaded.
+> Check your pull step before debugging anything else — this is the failure mode
+> that wastes the most time. Log the pull result and the first inference result
+> (see Step 4) so this is diagnosable from the console, not by guesswork.
+
+### Surface the *whole* setup, not just model load
+
+First-run cold start is more than a model load. The full sequence is:
 
-{"model": "Qwen3-4B-GGUF"}
 ```
+server spawn  →  health 200  →  backend install  →  model download  →  model load  →  first result
+```
+
+On a fresh machine, backend install and model download can each take from tens
+of seconds to several **minutes** (multi-GB weights over the network). Model
+load alone is 10–30s. An app that shows nothing during this will look frozen.
 
-If the model isn't downloaded yet, follow the recovery flow in Step 7.
+Minimum: show a loading indicator or status message ("Setting up local AI…")
+from the moment setup begins until the first response arrives — covering the
+*entire* sequence above, not just the final load. The simplest implementation
+is a flag set when setup/first-request starts and cleared when the first
+response arrives. Once the model is pulled and loaded once, subsequent runs are
+fast; the long wait is first-run only.
 
 ## Step 7: Lifecycle and recovery
 
@@ -295,10 +410,11 @@ These are the only failure modes worth handling. Do not over-engineer.
 
 | Symptom | Cause | Recovery |
 |---|---|---|
-| `POST /v1/load` returns 404 / model not found | Model not pulled yet | `POST /v1/pull` with `{"model": "..."}` then retry `/v1/load` |
-| `/v1/load` returns 500 with backend error | Backend not installed for this hardware | `GET /v1/system-info`, pick a supported backend, `POST /v1/install` with `{"recipe": "...", "backend": "..."}`, retry |
-| Subprocess exits immediately | Port already in use by another `lemond` | Pick a new free port and retry once |
-| `/v1/health` never returns 200 | First-run backend extraction is slow on cold disk | Extend timeout to 90s on first launch, 30s after |
+| **Inference returns empty / blank with HTTP 200, no error** | Model never pulled: backend is installed but weights are absent, so lazy-load has nothing to load | `POST /api/v1/pull` with `{"model":"..."}`, wait for success, retry. Log the pull result and the first inference result. This is the most common silent failure — see [Step 6](#step-6-health-backend-then-pull-the-model--before-first-inference) |
+| `POST /api/v1/load` returns 404 / model not found | Model not pulled yet (same root cause as the empty-result row above) | `POST /api/v1/pull` with `{"model": "..."}` then retry `/api/v1/load` |
+| `POST /api/v1/load` returns 500 with backend error | Backend not installed for this hardware | `GET /api/v1/system-info`, pick a supported backend, `POST /api/v1/install` with `{"recipe": "...", "backend": "..."}`, retry |
+| Subprocess exits immediately | Port race: another process grabbed the port between `freePort()` and lemond binding | The reference launcher retries with a fresh port automatically (3 attempts) |
+| `/api/v1/health` never returns 200 | First-run backend extraction is slow on cold disk | Extend timeout to 90s on first launch, 30s after |
 | HTTP 401 on every request | Forgot the `Authorization: Bearer` header | Audit the client config because Lemonade rejects unauth'd calls when `LEMONADE_API_KEY` is set |
 
 **Shutdown:** On app exit, `proc.terminate()` (Unix) or
@@ -314,13 +430,25 @@ couple of seconds. Always wait on the process; never orphan it.
 
 The integration is done when **all** of these are true:
 
+- [ ] `vendor/lemonade/` contains the full package: `lemond[.exe]`,
+      `lemonade[.exe]`, `LICENSE`, and `resources/` — not just the binary.
 - [ ] `lemond` starts as a subprocess with a fresh API key per launch.
 - [ ] `GET /api/v1/health` returns 200 within the timeout.
-- [ ] The default model loads successfully via `POST /v1/load`.
+- [ ] The default model is pulled (or bundled) before the first inference; a
+      custom/overridden model is confirmed via `GET /api/v1/models` and then
+      pulled. A blank result with no error means this step was skipped.
+- [ ] Each lifecycle stage logs a clear line (spawn, health, backend install,
+      model pull, first result) so a failure is diagnosable from the console.
 - [ ] The existing client's chat / image / speech call returns a valid
       response with the base URL and key swapped, with no other code changed.
-- [ ] In local mode the app's API-key gate is bypassed: no onboarding wall,
-      validator, or startup check blocks the user for lacking a cloud key.
+- [ ] First-run latency is surfaced: the interface shows a loading state from the
+      moment setup begins (spawn, install, pull) until the first response arrives.
+- [ ] The HTTP client timeout is set to at least 120 seconds.
+- [ ] In local mode the app requires **no** cloud API key: no onboarding wall,
+      validator, or startup check blocks the user, and no code path throws
+      "API key not configured" when the active mode is local.
+- [ ] If the app uses a dev-mode file watcher, `vendor/lemonade/` is excluded
+      from the watched paths so runtime writes by lemond do not trigger restarts.
 - [ ] Killing the parent process leaves no `lemond` subprocess behind.
 - [ ] On a fresh machine without the optimal backend, the app still works
       via the Vulkan fallback bundled in `bin/`.
diff --git a/skills/local-ai-app-integration/reference.md b/skills/local-ai-app-integration/reference.md
index 9b8bb9e..d77f5f1 100644
--- a/skills/local-ai-app-integration/reference.md
+++ b/skills/local-ai-app-integration/reference.md
@@ -40,14 +40,12 @@ hardware-optimized one at first run after a system probe.
 
 ### Speech-to-text
 
-Two NPU paths exist. **Prefer `flm` for NPU**.
-
 | Recipe | Backend | Model | Hardware | OS |
 |---|---|---|---|---|
-| `flm` | `npu` | `whisper-v3-turbo-FLM` | XDNA2 NPU | Windows |
+| `whispercpp` | `vulkan` | `Whisper-Large-v3-Turbo` | AMD iGPU / dGPU | Windows, Linux |
 | `whispercpp` | `cpu` | `Whisper-Large-v3-Turbo` | x86_64 CPU | Windows, Linux |
-| `whispercpp` | `vulkan` | `Whisper-Large-v3-Turbo` | x86_64 CPU | Linux |
-| `whispercpp` | `npu` | `.rai`-cached whisper model | XDNA2 NPU | Windows (avoid) |
+| `whispercpp` | `npu` | `Whisper-Large-v3-Turbo` | XDNA2 NPU | Windows |
+| `flm` | `npu` | `whisper-v3-turbo-FLM` | XDNA2 NPU | Linux (runtime-install only) |
 
 ### Text-to-speech
 
@@ -89,6 +87,14 @@ model catalog; it can be stale or incomplete. A model only appears in
 `GET /v1/models` once its backend is installed (see Step 3), so install the
 backend first or the list will look empty/incomplete.
 
+**Catalogued ≠ downloaded.** A model listed by `GET /api/v1/models` is *available
+to use*, not necessarily present on disk. It must be **pulled**
+(`POST /api/v1/pull {"model":"..."}`) before it can serve — until then,
+inference returns an empty result with HTTP 200, not an error. The surest
+signal that a model is ready is a successful pull, not its presence in the
+catalog. See SKILL.md
+[Step 6](SKILL.md#step-6-health-backend-then-pull-the-model--before-first-inference).
+
 ---
 
 ## Hardware probing with /v1/system-info
@@ -120,13 +126,31 @@ Response shape (truncated):
 }
 ```
 
-Decision rules in priority order, for the default `llamacpp` recipe:
+The same pattern applies to **every** recipe: read the per-backend `state`,
+install the best one that is `installable`, use it if already `installed`, and
+fall back down the priority list otherwise. Apply it to whichever recipe matches
+the app's modality.
+
+Decision rules in priority order, for the default `llamacpp` recipe (text gen):
 
 1. If `recipes.llamacpp.backends.rocm.state == "installable"` →
    `POST /v1/install {"recipe":"llamacpp","backend":"rocm"}`.
 2. Else if `state == "installed"` for `vulkan` → use it as-is.
 3. Else fall back to `cpu`.
 
+Decision rules for the `whispercpp` recipe (speech-to-text), NPU-first:
+
+1. If `recipes.whispercpp.backends.npu.state == "installed"` → use NPU as-is.
+2. Else if `npu.state == "installable"` →
+   `POST /api/v1/install {"recipe":"whispercpp","backend":"npu"}`, then use NPU.
+3. Else if `vulkan` is `installed`/`installable` → use the iGPU/dGPU path.
+4. Else fall back to `cpu`.
+
+Probe **once**, cache the chosen backend for the session (the result does not
+change while the app runs), and log which backend was selected. This is the
+mechanism that lets one build run on an NPU machine and a CPU-only machine
+without any user configuration.
+
 For Ryzen AI Hybrid models on Windows, additionally check
 `ryzenai-llm.backends.npu.state` and install if `installable`.
 
@@ -247,3 +271,112 @@ Two backend limitations on Linux as of this writing:
 
 When building from source for an unusual Linux distro, see the upstream
 `docs/embeddable/building.md` in the lemonade-sdk/lemonade repo.
+
+---
+
+## Reference launchers
+
+Full implementations for Step 4. Adapt to the app's language; the key
+constraints are: retry with a fresh port on spawn failure (the probe socket
+closes before lemond binds, so another process can claim the port), poll
+`/api/v1/health` with the Bearer key, and kill the process on app exit.
+
+**Python:**
+
+```python
+import os, secrets, socket, subprocess, sys, time, urllib.request
+from pathlib import Path
+
+LEMOND_DIR = Path(__file__).parent / "vendor" / "lemonade"  # vendored lemond tree, resolved relative to this module
+LEMOND_BIN = LEMOND_DIR / ("lemond.exe" if sys.platform == "win32" else "lemond")  # executable carries .exe only on Windows
+
+def _free_port() -> int:
+    with socket.socket() as s:
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
+
+def start_lemond(retries: int = 3) -> tuple[subprocess.Popen, str, int]:
+    last_err: Exception | None = None
+    for _ in range(retries):
+        port = _free_port()
+        key = secrets.token_urlsafe(32)
+        env = {**os.environ, "LEMONADE_API_KEY": key}
+        proc = subprocess.Popen(
+            [str(LEMOND_BIN), str(LEMOND_DIR), "--port", str(port)],
+            env=env,
+            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+        )
+        try:
+            _wait_for_health(port, key, timeout_s=30)
+            return proc, key, port
+        except RuntimeError as e:
+            proc.kill()
+            proc.wait()
+            last_err = e
+    raise RuntimeError(f"lemond failed to start after {retries} attempts") from last_err
+
+def _wait_for_health(port: int, key: str, timeout_s: int) -> None:
+    url = f"http://127.0.0.1:{port}/api/v1/health"
+    req = urllib.request.Request(url, headers={"Authorization": f"Bearer {key}"})
+    deadline = time.monotonic() + timeout_s
+    while time.monotonic() < deadline:
+        try:
+            with urllib.request.urlopen(req, timeout=1) as r:
+                if r.status == 200:
+                    return
+        except Exception:
+            time.sleep(0.25)
+    raise RuntimeError(f"lemond on port {port} did not become healthy within {timeout_s}s")
+```
+
+**Node.js:**
+
+```js
+import { spawn } from "node:child_process";
+import { randomBytes } from "node:crypto";
+import { createServer } from "node:net";
+import path from "node:path";
+
+const LEMOND_DIR = path.join(import.meta.dirname, "vendor", "lemonade"); // vendored lemond tree, resolved relative to this module
+const LEMOND_BIN = path.join(LEMOND_DIR, process.platform === "win32" ? "lemond.exe" : "lemond"); // executable carries .exe only on Windows
+
+const freePort = () => new Promise((res) => {
+  const s = createServer().listen(0, "127.0.0.1", () => {
+    const { port } = s.address(); s.close(() => res(port));
+  });
+});
+
+export async function startLemond(retries = 3) {
+  let lastErr;
+  for (let i = 0; i < retries; i++) {
+    const port = await freePort();
+    const key = randomBytes(32).toString("base64url");
+    const proc = spawn(LEMOND_BIN, [LEMOND_DIR, "--port", String(port)], {
+      env: { ...process.env, LEMONADE_API_KEY: key },
+      stdio: ["ignore", "pipe", "pipe"],
+    });
+    try {
+      await waitForHealth(port, key, 30_000);
+      return { proc, key, port };
+    } catch (e) {
+      proc.kill();
+      lastErr = e;
+    }
+  }
+  throw new Error(`lemond failed to start after ${retries} attempts: ${lastErr?.message}`);
+}
+
+async function waitForHealth(port, key, timeoutMs) {
+  const url = `http://127.0.0.1:${port}/api/v1/health`;
+  const headers = { Authorization: `Bearer ${key}` };
+  const deadline = Date.now() + timeoutMs;
+  while (Date.now() < deadline) {
+    try {
+      const r = await fetch(url, { headers });
+      if (r.ok) return;
+    } catch {}
+    await new Promise((r) => setTimeout(r, 250));
+  }
+  throw new Error(`lemond on port ${port} did not become healthy within ${timeoutMs}ms`);
+}
+```
diff --git a/walkthroughs/local-ai-app-integration.md b/walkthroughs/local-ai-app-integration.md
index 4a67642..13dac97 100644
--- a/walkthroughs/local-ai-app-integration.md
+++ b/walkthroughs/local-ai-app-integration.md
@@ -7,10 +7,21 @@ For this walkthrough we use [`danielholanda/dictate`](https://github.com/danielh
 a Windows dictation app that currently sends every recording to cloud
 speech-to-text providers (Groq, Deepgram, Cartesia, Gemini, Mistral, etc.).
 
-## Prerequiresites
-This sample app used here requires the Rust toolchain (install from https://rustup.rs/).
+**What you'll end up with:** one new launcher module, one config change to
+the existing HTTP client, and `lemond` vendored under `vendor/lemonade/`.
+Transcription moves from cloud to your local device. Expect 1–2 hours.
 
-Because this walkthrough runs transcription on the NPU, you need a Ryzen AI PC with an XDNA2 NPU (Strix, Strix Halo, Kraken, or Gorgon Point) running Windows.
+## Prerequisites
+
+This sample app requires the Rust toolchain (install from https://rustup.rs/).
+
+**Hardware:** Any Windows x64 PC works. The skill selects a backend once at integration time based on your development machine. If your machine has an NPU and the chosen recipe supports it, the NPU backend is used — otherwise it transparently falls back to the GPU (Vulkan) backend and, failing that, to CPU, so the app works on any end-user machine. The skill logs which backend was selected and why, so you always know what ran.
+
+| Priority | Your hardware | What you get |
+|---|---|---|
+| 1 (fastest) | Ryzen AI with XDNA2 NPU (Strix, Strix Halo, Kraken, Gorgon Point) | NPU-accelerated transcription |
+| 2 | AMD iGPU / dGPU | GPU-accelerated transcription |
+| 3 (fallback) | Any other Windows x64 PC | CPU transcription |
 
 ## Step 1 - Get the target app
 
@@ -37,21 +48,32 @@ npx skills add amd/skills --skill local-ai-app-integration --agent claude-code
 
 ## Step 4 - Running the skill
 
-Run `claude --model opus` inside the `dictate` repo run the prompt:
+Run `claude --model opus` inside the `dictate` repo with this prompt:
 
 ```
 This app sends my dictation audio to cloud speech-to-text providers.
 Add a local AI mode that runs transcription on my machine instead by default.
-I want it to run using the NPU. Keep the cloud providers as an option and minimize code changes.
+Use the best available local backend — NPU if I have one, otherwise iGPU or CPU.
+Keep the cloud providers as an option and minimize code changes.
 ```
 
 Claude should:
 
 1. Survey where the app calls its cloud transcription APIs.
-2. Pick a local speech-to-text model + backend (e.g. `whisper-v3-turbo-FLM` using the `FLM` NPU backend).
+2. Probe hardware (`GET /api/v1/system-info`) and pick the fastest available
+   backend for `Whisper-Large-v3-Turbo`, NPU-first:
+   - XDNA2 NPU present → whispercpp NPU backend
+   - else AMD iGPU/dGPU → whispercpp iGPU/dGPU backend
+   - else → whispercpp CPU backend
 3. Vendor the Embeddable Lemonade (`lemond`) binary into the app tree.
-4. Add a launcher that spawns `lemond` on a free port.
-5. Re-point the app's existing client at the local endpoint and wait for `/v1/health`.
+4. Add a launcher that spawns `lemond` on a free port with retry logic, logging
+   each lifecycle stage (spawn → health → backend install → model pull → result).
+5. Re-point the app's existing client at the local endpoint and wait for
+   `/api/v1/health`. Because local mode talks to your own machine, it needs **no
+   cloud API key** — Claude should bypass the app's key-entry gate in local mode.
+6. Install the backend, then **pull the model** (`POST /api/v1/pull`) so its
+   weights are on disk before the first recording. Skipping this makes the very
+   first transcription come back blank with no error.
 
 Please note this may take several minutes as this app has a fairly large codebase.
 
@@ -63,7 +85,35 @@ Dictate is a Tauri (Rust + Node) app. From the repo root:
 npm install
 npm run tauri dev
 ```
-Once the window opens, press the microphone button to speak, and confirm that transcription is now running through your local device instead of a cloud provider. The transcribed text should appear where your cursor was last located.
+
+**What the first launch looks like.** Watch the terminal (not the browser
+console). On a cold first run you should see the staged log lines as setup
+progresses — the model download in particular can take a while:
+
+```
+[lemond] Starting on port 56748
+[lemond] Healthy on port 56748
+[lemond] whispercpp:npu already installed
+[lemond] Pulling model Whisper-Large-v3-Turbo...
+[lemond] Model Whisper-Large-v3-Turbo ready
+[local] Using transcription model: Whisper-Large-v3-Turbo backend: npu
+[local] Transcription result: " Hi, can you hear me?"
+```
+
+The **first transcription can be slow** because it covers the whole setup chain:
+server spawn + backend setup + model download + model load. Subsequent
+recordings are fast. Once the window opens, press the microphone button to
+speak, and confirm transcription runs through your local device instead of a
+cloud provider — the text appears where your cursor was last located.
+
+> **Blank result?** If a recording produces no text and the terminal shows
+> `[local] Transcription result: ""` with no error, the model was not pulled.
+> The model-pull step (item 6 of "Claude should" above) fixes this; it is the
+> most common first-run snag.
+
+> **Repeated phrases** like `" How can you hear me now?\n How can you hear me
+> now?\n"` on quiet audio are a known Whisper behavior on silence/low-energy
+> input, not an integration bug.
 
 ## Step 6 - (Optional) Going beyond