diff --git a/eval/behavioral/tests/test_local_ai_app_integration.py b/eval/behavioral/tests/test_local_ai_app_integration.py new file mode 100644 index 0000000..ced0d6d --- /dev/null +++ b/eval/behavioral/tests/test_local_ai_app_integration.py @@ -0,0 +1,92 @@ +"""Behavioral tests for the `local-ai-app-integration` skill. + +Run locally (needs the `claude` CLI authenticated): + + pytest eval/behavioral/tests/test_local_ai_app_integration.py -s + +Prompts are scoped to code-generation only ("Do not download or install +anything") to avoid the agent attempting the GitHub download, which hangs +indefinitely. Checks prefer `logs_contains` / `workspace_contains` (instant) +over `should` / `should_not` (spawns a judge subprocess) wherever possible. +""" + +from harness import claude + +_STUB = "from openai import OpenAI\nclient = OpenAI()\n" + + +def test_launcher_module_written(): + with claude("sonnet", skill="local-ai-app-integration") as agent: + (agent.workspace / "main.py").write_text(_STUB) + + run = agent.prompt( + "Write a lemond launcher module for this Python app. " + "Do not download or install anything — just write the file." + ) + + run.workspace_contains("lemond_launcher.py") + run.logs_contains("secrets") # random API key generation + run.logs_contains("socket") # dynamic port via socket bind + run.logs_contains("subprocess") # lemond spawned as subprocess + + +def test_http_client_timeout_is_120s(): + with claude("sonnet", skill="local-ai-app-integration") as agent: + (agent.workspace / "main.py").write_text(_STUB) + + run = agent.prompt( + "Update main.py to re-point the OpenAI client at a local lemond " + "instance. Do not download or install anything — just edit the file." 
+ ) + + run.workspace_contains("main.py") + run.logs_contains("120") # 120s timeout present in written code + + +def test_health_check_uses_http_not_stdout(): + with claude("sonnet", skill="local-ai-app-integration") as agent: + (agent.workspace / "main.py").write_text(_STUB) + + run = agent.prompt( + "Write a health-check helper for lemond in this Python app. " + "Do not download or install anything — just write the code." + ) + + run.logs_contains("/api/v1/health") + run.should_not("Read or parse lemond's stdout or stderr to detect readiness") + + +def test_no_preload_call_in_written_code(): + with claude("sonnet", skill="local-ai-app-integration") as agent: + (agent.workspace / "main.py").write_text(_STUB) + + run = agent.prompt( + "Write a lemond launcher for this Python app that waits for the " + "server to be ready. Do not download or install anything." + ) + + run.logs_contains("/api/v1/health") + run.should_not("Call POST /api/v1/load to pre-load the model at startup") + + +def test_api_key_gate_bypassed_in_local_mode(): + with claude("sonnet", skill="local-ai-app-integration") as agent: + (agent.workspace / "main.py").write_text( + "import os\n" + "from openai import OpenAI\n\n" + "api_key = os.environ.get('OPENAI_API_KEY', '')\n" + "if not api_key:\n" + " raise SystemExit('No API key set. Please enter your OpenAI key.')\n\n" + "client = OpenAI(api_key=api_key)\n" + ) + + run = agent.prompt( + "Edit main.py so it works in local mode without an OPENAI_API_KEY. " + "Do not download or install anything — just edit the file." 
+ ) + + run.workspace_contains("main.py") + run.should( + "Remove or bypass the API-key guard so the app starts in local mode " + "without requiring OPENAI_API_KEY to be set" + ) diff --git a/skills/local-ai-app-integration/SKILL.md b/skills/local-ai-app-integration/SKILL.md index 3f197e5..e9e4ee5 100644 --- a/skills/local-ai-app-integration/SKILL.md +++ b/skills/local-ai-app-integration/SKILL.md @@ -19,6 +19,10 @@ talks to it on `http://localhost:PORT/api/v1`. The user gets local, private, hardware-optimized inference (CPU, AMD iGPU/dGPU, XDNA2 NPU) with no separate install. +**What you'll end up with:** one new launcher module (~30 lines), one config +change to the existing HTTP client (base URL + API key), one vendored binary +under `vendor/lemonade/`. + ## When this skill is the right tool Use this skill when **all** of the following are true: @@ -41,15 +45,22 @@ This skill follows one fixed sequence. Do not deviate without a stated reason. ``` [ ] 1. Survey the app's current AI integration [ ] 2. Pick a model + backend profile -[ ] 3. Place Embeddable Lemonade in the app's tree -[ ] 4. Add a `lemond` launcher (subprocess + API key + port) -[ ] 5. Re-point the existing client at lemond -[ ] 6. Wait for /v1/health and pre-load the default model +[ ] 3. Place Embeddable Lemonade in the app's tree (full package, not just the binary) +[ ] 4. Add a `lemond` launcher (subprocess + API key + port + per-stage logging) +[ ] 5. Re-point the existing client at lemond (set HTTP timeout to 120s) +[ ] 6. Wait for /api/v1/health, install backend, then PULL the model before first use [ ] 7. Wire shutdown and error recovery ``` Track progress against this checklist. Move on only when each step verifies. +> **Log every stage.** A local integration has many silent failure points — +> spawn, health, backend install, model download, first inference. Without a +> log line at each transition, "nothing happened" is indistinguishable from +> "broke at stage 3." 
Emit one clear line per stage as you build (see +> [Step 4](#step-4-add-a-lemond-launcher)); the most common dead-end in this +> integration — a blank result with no error — is invisible without them. + --- ## Step 1: Survey the app @@ -87,8 +98,8 @@ it. | Coding assistant | `Qwen2.5-Coder-7B-Instruct-GGUF` | `llamacpp` | Strong code, runs on iGPU | | Vision / multimodal chat | `Gemma-4-E2B-it-GGUF` | `llamacpp` | Small multimodal default | | NPU-first on Ryzen AI | `Llama-3.2-3B-Instruct-Hybrid` | `ryzenai-llm` | XDNA2 NPU on Windows | -| CPU Speech-to-text | `Whisper-Large-v3-Turbo` | `whispercpp` | Best quality/speed | -| NPU speech-to-text | `whisper-v3-turbo-FLM` | `flm` | XDNA2 NPU on Windows | +| Speech-to-text (Windows) | `Whisper-Large-v3-Turbo` | `whispercpp` | One model; probe picks NPU → iGPU/dGPU → CPU automatically | +| Speech-to-text (Linux NPU) | `whisper-v3-turbo-FLM` | `flm` | Linux NPU path; falls back to `whispercpp` iGPU/CPU off-NPU | | Text-to-speech | `kokoro-v1` | `kokoro` | CPU-only, low latency | | Image generation | `SDXL-Turbo` | `sd-cpp` | Single-step generation | @@ -96,26 +107,94 @@ For the LLM backend, default to `llamacpp` and let `lemond` pick `rocm` → `vulkan` → `cpu` automatically by leaving `llamacpp_backend` unset. Override only if the app has hard hardware requirements. +**Scope: this skill selects a backend once at integration time on the +developer's machine.** Runtime fallback based on the end user's hardware is +out of scope. Bundle `vulkan` as the universal fallback so the app works on +any machine. If the dev machine has an NPU and the chosen recipe supports it, +the skill will use the NPU backend — otherwise it falls back to `vulkan`. + +> **Note:** having an NPU does not mean every recipe supports NPU. Confirm +> the recipe/backend pair is `installed` or `installable` via +> `GET /api/v1/system-info` before committing to it. 
See +> [reference.md](reference.md#hardware-probing-with-v1system-info) for +> per-recipe decision rules. + For more options and tradeoffs, see [reference.md](reference.md). ## Step 3: Place Embeddable Lemonade in the app's tree and install backends -Get the embeddable artifact from the latest Lemonade release: +**Get the embeddable artifact** from the latest Lemonade release: + +``` +https://github.com/lemonade-sdk/lemonade/releases/latest +``` + +Download the file matching your target OS: - Windows: `lemonade-embeddable-{VERSION}-windows-x64.zip` -- Linux: `lemonade-embeddable-{VERSION}-ubuntu-x64.tar.gz` +- Linux: `lemonade-embeddable-{VERSION}-ubuntu-x64.tar.gz` + +> **Don't hand-build the download URL from the tag.** The git tag carries a +> leading `v` (e.g. `v10.8.0`) but the asset filename strips it +> (`lemonade-embeddable-10.8.0-...`), so using the tag verbatim 404s. Ask the +> GitHub API for the asset by its stable name pattern and use the URL it +> returns, as below — this stays correct across version and naming changes. + +**First, create the target directory** — it does not exist in a fresh repo: -Unpack into the app source tree at `vendor/lemonade/` (or whatever the app's -existing convention for vendored binaries is). 
The expected layout after -customization: +```powershell +# Windows +New-Item -ItemType Directory -Force vendor\lemonade +``` + +```bash +# Linux +mkdir -p vendor/lemonade +``` + +Then download and unpack on Windows (PowerShell): + +```powershell +$rel = Invoke-RestMethod https://api.github.com/repos/lemonade-sdk/lemonade/releases/latest +$asset = $rel.assets | Where-Object { $_.name -like "lemonade-embeddable-*-windows-x64.zip" } | Select-Object -First 1 +Invoke-WebRequest $asset.browser_download_url -OutFile lemond.zip +Expand-Archive lemond.zip -DestinationPath "$env:TEMP\lemond-unpack" +$folder = $asset.name -replace '\.zip$','' # unpacked dir = asset name without .zip +Copy-Item -Recurse "$env:TEMP\lemond-unpack\$folder\*" vendor\lemonade\ +# Sanity check: resources/ must be nested under vendor\lemonade\ (not flattened) +if (-not (Test-Path vendor\lemonade\resources\*.json)) { throw "resources/ missing — re-extract and copy again" } +``` + +On Linux (bash): + +```bash +URL=$(curl -s https://api.github.com/repos/lemonade-sdk/lemonade/releases/latest \ + | grep browser_download_url | grep 'lemonade-embeddable-.*-ubuntu-x64\.tar\.gz' | head -n 1 | cut -d'"' -f4) +curl -L "$URL" | tar -xz --strip-components=1 -C vendor/lemonade +``` + +> **Copy the full package, not just the binary.** The archive contains +> `lemond[.exe]`, `lemonade[.exe]`, `LICENSE`, and `resources/`. The +> `resources/` directory is required — without it lemond starts and passes the +> health check but fails on every model and backend request. Copying only the +> binary produces a server that looks healthy but cannot function. + +> **`lemond` vs `lemonade` CLI:** `lemond` is the embedded server binary that +> ships with the app. The `lemonade` CLI is a separate packaging tool used +> only during development/build time to install backends. Install it once on +> the developer machine with `pip install lemonade-sdk`. + +The expected layout **after setup** (first run + backend install). 
A freshly +unzipped package contains only `lemond[.exe]`, `lemonade[.exe]`, `LICENSE`, and +`resources/` — the items below are created later, as their comments note: ``` vendor/lemonade/ lemond[.exe] # the only binary the app ships LICENSE - config.json # generated on first run + config.json # generated on first run; commit a seed copy resources/ - server_models.json # trim to just the models you ship + server_models.json # do not edit; use GET /api/v1/models at runtime backend_versions.json bin/ # backends bundled at packaging time llamacpp/vulkan/llama-server[.exe] @@ -123,24 +202,41 @@ vendor/lemonade/ models--unsloth--Qwen3-4B-GGUF/ ``` +> **`server_models.json`:** Do not edit or rely on this file. It can be stale. +> The only authoritative model list is `GET /api/v1/models` on a running +> `lemond` instance with the backend already installed. + **Bundle decisions: pick deliberately** - **Backends:** Bundle `llamacpp:vulkan` at packaging time (works on every GPU). Install `llamacpp:rocm` at first run on supported AMD systems via - `POST /v1/install` after probing `GET /v1/system-info`. Never ship every - backend, or the artifact balloons. + `POST /api/v1/install` after probing `GET /api/v1/system-info`. Never ship + every backend, or the artifact balloons. - **Models:** Either bundle the default model under `models/` (offline - install, larger installer) **or** pull on first run with `POST /v1/pull` - (smaller installer, needs network). Pick one and document it. + install, larger installer) **or** pull on first run with + `POST /api/v1/pull` (smaller installer, needs network). Pick one and + document it. - **`models_dir`:** Set to `./models` in `config.json` to keep weights private to the app. Leave as `auto` only if the user explicitly wants to share weights with other apps. -**Install the backend before running any model.** Right after placing -`lemond`, install the backend your chosen recipe needs — a model won't load -without it. 
Use the CLI at packaging time, e.g. `lemonade backends install -flm:npu` (or `llamacpp:vulkan`, `sd-cpp:cpu`, etc.), or `POST /v1/install` -at first run for hardware-specific backends like `llamacpp:rocm`. +**Backend install timing — two distinct paths:** + +> **Packaging time** (developer machine, before bundling): +> ``` +> lemonade backends install llamacpp:vulkan +> lemonade backends install flm:npu # Windows NPU path only +> ``` +> This bakes the backend binaries into `vendor/lemonade/bin/` before the app +> ships. `lemond` does not need to be running. +> +> **First-run / runtime** (user's machine, after `lemond` is running): +> ```http +> POST /api/v1/install +> {"recipe": "llamacpp", "backend": "rocm"} +> ``` +> Use this for hardware-specific backends (e.g. `llamacpp:rocm`) that cannot +> be bundled universally. `lemond` must already be running (Step 4 complete). ## Step 4: Add a `lemond` launcher @@ -151,87 +247,42 @@ The launcher is a thin process supervisor. Its only jobs: 3. Spawn `lemond --port ` with `LEMONADE_API_KEY` set. 4. Expose the chosen `port` and `key` to the rest of the app. -**Python reference launcher** (adapt to the app's language): +> **Log one line per lifecycle stage.** Build the logging in from the start — +> not as an afterthought when something breaks. Each silent transition needs a +> visible marker so a failure points at the exact stage. Aim for: +> +> ``` +> [lemond] Starting on port +> [lemond] Healthy on port +> [lemond] : installed (or: already installed / install failed) +> [lemond] Pulling model ... then: Model ready (or: pull returned ) +> [local] result: (first inference output — empty string here = unpulled model) +> ``` +> +> Logging the **first inference result verbatim** is what turns the +> silent-empty failure (Step 6) from a multi-hour mystery into a one-line +> diagnosis. Route these through the app's normal logging so they can be quieted +> for release. 
+ +> **Dev-mode file watchers:** If the app runs with a file watcher (Tauri, +> Electron, Next.js, Vite, etc.) that watches the source tree, ensure +> `vendor/lemonade/` is excluded from the watched paths. Lemond writes config +> and cache files at runtime; a watcher that picks these up will restart the +> app, kill the lemond subprocess, and spawn a new one on a new port — +> silently breaking any in-flight transcription. Add `vendor/` (or the +> equivalent) to the watcher's ignore list before testing. + +The launcher logic in pseudocode (full Python and Node.js implementations in [reference.md](reference.md#reference-launchers)): -```python -import os, secrets, socket, subprocess, sys, time, urllib.request -from pathlib import Path - -LEMOND_DIR = Path(__file__).parent / "vendor" / "lemonade" -LEMOND_BIN = LEMOND_DIR / ("lemond.exe" if sys.platform == "win32" else "lemond") - -def _free_port() -> int: - with socket.socket() as s: - s.bind(("127.0.0.1", 0)) - return s.getsockname()[1] - -def start_lemond() -> tuple[subprocess.Popen, str, int]: - port = _free_port() - key = secrets.token_urlsafe(32) - env = {**os.environ, "LEMONADE_API_KEY": key} - proc = subprocess.Popen( - [str(LEMOND_BIN), str(LEMOND_DIR), "--port", str(port)], - env=env, - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - ) - _wait_for_health(port, key, timeout_s=30) - return proc, key, port - -def _wait_for_health(port: int, key: str, timeout_s: int) -> None: - url = f"http://127.0.0.1:{port}/api/v1/health" - req = urllib.request.Request(url, headers={"Authorization": f"Bearer {key}"}) - deadline = time.monotonic() + timeout_s - while time.monotonic() < deadline: - try: - with urllib.request.urlopen(req, timeout=1) as r: - if r.status == 200: - return - except Exception: - time.sleep(0.25) - raise RuntimeError("lemond failed to become healthy") ``` - -**Node.js reference launcher:** - -```js -import { spawn } from "node:child_process"; -import { randomBytes } from "node:crypto"; -import { 
createServer } from "node:net"; -import path from "node:path"; - -const LEMOND_DIR = path.join(import.meta.dirname, "vendor", "lemonade"); -const LEMOND_BIN = path.join(LEMOND_DIR, process.platform === "win32" ? "lemond.exe" : "lemond"); - -const freePort = () => new Promise((res) => { - const s = createServer().listen(0, "127.0.0.1", () => { - const { port } = s.address(); s.close(() => res(port)); - }); -}); - -export async function startLemond() { - const port = await freePort(); - const key = randomBytes(32).toString("base64url"); - const proc = spawn(LEMOND_BIN, [LEMOND_DIR, "--port", String(port)], { - env: { ...process.env, LEMONADE_API_KEY: key }, - stdio: ["ignore", "pipe", "pipe"], - }); - await waitForHealth(port, key, 30_000); - return { proc, key, port }; -} - -async function waitForHealth(port, key, timeoutMs) { - const url = `http://127.0.0.1:${port}/api/v1/health`; - const headers = { Authorization: `Bearer ${key}` }; - const deadline = Date.now() + timeoutMs; - while (Date.now() < deadline) { - try { - const r = await fetch(url, { headers }); - if (r.ok) return; - } catch {} - await new Promise((r) => setTimeout(r, 250)); - } - throw new Error("lemond failed to become healthy"); -} +port = bind("127.0.0.1:0"), read port, close socket +key = random_bytes(32) +proc = spawn(lemond_bin, [lemond_dir, "--port", port], env={LEMONADE_API_KEY: key}) +poll GET /api/v1/health with Bearer key, retry for 90s, 250ms interval +return proc, key, port + +# On failure: kill proc, pick new port, retry up to 3 times +# On app exit: proc.kill() (Windows) / proc.terminate() (Unix), then wait() ``` ## Step 5: Re-point the existing client at `lemond` @@ -249,23 +300,39 @@ and the API key. Nothing else. The model identifier on requests stays a Lemonade model name (e.g. `Qwen3-4B-GGUF`), not the cloud name. 
-**Bypass the app's API-key gate in local mode.** A local backend needs no -cloud key, so any onboarding wall, validator, or startup check that demands -one must not block local-mode users. Skip or auto-satisfy the key-entry -screen, treat local mode as already-authorized in validation logic, and -re-enable the gate only for cloud mode. The `lemond` key from Step 4 is set -internally by the launcher, so the user never enters one and any UI -placeholder (e.g. `"local"`) is fine. Flipping into local mode should never -strand the user on a key-entry wall. +**Local mode needs no cloud API key — at all.** This is a defining property of +local mode, not an edge case: there is no cloud service to authenticate to, so +nothing should ever ask the user for a key. Any onboarding wall, validator, or +startup check that demands one must not block local-mode users. Concretely: + +- Skip or auto-satisfy the key-entry screen in local mode. +- Treat local mode as already-authorized in every validation path — an + empty-key check must short-circuit to "valid" when the active mode is local, + never throw "API key not configured". +- Re-enable the gate **only** for cloud mode. + +The `lemond` key from Step 4 is generated internally by the launcher and used +only for the local loopback connection, so the user never sees or enters one; +any UI placeholder (e.g. `"local"`) is fine. Flipping into local mode should +never strand the user on a key-entry wall. + +**Set the HTTP client timeout to at least 120 seconds.** Many HTTP clients +default to a timeout (often 30s or less) that is shorter than the time lemond +takes to load a model on first use. A silent timeout looks identical to a broken integration +— the request fires, nothing comes back, and the UI shows nothing. 120s +covers first-run model load on any supported hardware. 
**Python (openai) example:** ```python from openai import OpenAI +import httpx + proc, key, port = start_lemond() client = OpenAI( base_url=f"http://127.0.0.1:{port}/api/v1", api_key=key, + http_client=httpx.Client(timeout=120.0), # covers first-run model load ) resp = client.chat.completions.create( model="Qwen3-4B-GGUF", @@ -273,21 +340,69 @@ resp = client.chat.completions.create( ) ``` -## Step 6: Wait for health, then preload the default model +## Step 6: Health, backend, then pull the model — *before* first inference + +`GET /api/v1/health` returning 200 means the **server** is up. It does **not** +mean inference will work. Before the first real request succeeds, three more +things must be true: the backend for your modality is installed, the model's +weights are **downloaded to disk**, and (on the first call) the model is loaded +into memory. Treating health=200 as "ready" is the single biggest cause of a +broken-looking integration. + +**Do not call `POST /api/v1/load` at startup.** Lemond lazy-loads the model +into memory on the first inference request and handles that step on its own. +Pre-loading is unreliable across lemond versions (the `/load` request body +shape has changed between releases) and a malformed call can crash or +destabilise the server before the user takes any action. Loading is the one +step you let lemond do lazily — pulling is not. -`lemond` lazy-loads models on first inference. To eliminate cold-start -latency on the user's first message, preload right after the health check -passes: +### Pull the model so it exists on disk + +Lazy-load only loads weights that are **already downloaded**. If the model was +never pulled, the first inference does not error — lemond returns an empty / +blank result with HTTP 200. 
So after health passes and the backend is +installed, proactively pull the model: ```http -POST /api/v1/load -Authorization: Bearer {key} -Content-Type: application/json +POST /api/v1/pull +{"model": "Whisper-Large-v3-Turbo"} +``` + +This is **idempotent** — a no-op if the weights are already present, a download +if they are not. Run it once during setup (after backend install, before the +first user-triggered inference) and log the result. + +- **Default model** (the one you chose in Step 2): pull it by name as above. +- **Custom / user-overridden model:** do not assume it exists. Confirm it is a + real Lemonade model first via `GET /api/v1/models` (the **only** trusted + catalog — see [reference.md](reference.md)), then pull it the same way. A + model appearing in the catalog is **not** proof its weights are downloaded; + a successful pull is. + +> **Silent-empty is almost always an unpulled model.** If inference returns an +> empty string / blank output with no HTTP error, the model was not downloaded. +> Check your pull step before debugging anything else — this is the failure mode +> that wastes the most time. Log the pull result and the first inference result +> (see Step 4) so this is diagnosable from the console, not by guesswork. + +### Surface the *whole* setup, not just model load + +First-run cold start is more than a model load. The full sequence is: -{"model": "Qwen3-4B-GGUF"} ``` +server spawn → health 200 → backend install → model download → model load → first result +``` + +On a fresh machine, backend install and model download can each take from tens +of seconds to several **minutes** (multi-GB weights over the network). Model +load alone is 10–30s. An app that shows nothing during this will look frozen. -If the model isn't downloaded yet, follow the recovery flow in Step 7. 
+Minimum: show a loading indicator or status message ("Setting up local AI…") +from the moment setup begins until the first response arrives — covering the +*entire* sequence above, not just the final load. The simplest implementation +is a flag set when setup/first-request starts and cleared when the first +response arrives. Once the model is pulled and loaded once, subsequent runs are +fast; the long wait is first-run only. ## Step 7: Lifecycle and recovery @@ -295,10 +410,11 @@ These are the only failure modes worth handling. Do not over-engineer. | Symptom | Cause | Recovery | |---|---|---| -| `POST /v1/load` returns 404 / model not found | Model not pulled yet | `POST /v1/pull` with `{"model": "..."}` then retry `/v1/load` | -| `/v1/load` returns 500 with backend error | Backend not installed for this hardware | `GET /v1/system-info`, pick a supported backend, `POST /v1/install` with `{"recipe": "...", "backend": "..."}`, retry | -| Subprocess exits immediately | Port already in use by another `lemond` | Pick a new free port and retry once | -| `/v1/health` never returns 200 | First-run backend extraction is slow on cold disk | Extend timeout to 90s on first launch, 30s after | +| **Inference returns empty / blank with HTTP 200, no error** | Model never pulled: backend is installed but weights are absent, so lazy-load has nothing to load | `POST /api/v1/pull` with `{"model":"..."}`, wait for success, retry. Log the pull result and the first inference result. 
This is the most common silent failure — see [Step 6](#step-6-health-backend-then-pull-the-model--before-first-inference) | +| `POST /api/v1/load` returns 404 / model not found | Model not pulled yet (same root cause as the empty-result row above) | `POST /api/v1/pull` with `{"model": "..."}` then retry `/api/v1/load` | +| `POST /api/v1/load` returns 500 with backend error | Backend not installed for this hardware | `GET /api/v1/system-info`, pick a supported backend, `POST /api/v1/install` with `{"recipe": "...", "backend": "..."}`, retry | +| Subprocess exits immediately | Port race: another process grabbed the port between `freePort()` and lemond binding | The reference launcher retries with a fresh port automatically (3 attempts) | +| `/api/v1/health` never returns 200 | First-run backend extraction is slow on cold disk | Extend timeout to 90s on first launch, 30s after | | HTTP 401 on every request | Forgot the `Authorization: Bearer` header | Audit the client config because Lemonade rejects unauth'd calls when `LEMONADE_API_KEY` is set | **Shutdown:** On app exit, `proc.terminate()` (Unix) or @@ -314,13 +430,25 @@ couple of seconds. Always wait on the process; never orphan it. The integration is done when **all** of these are true: +- [ ] `vendor/lemonade/` contains the full package: `lemond[.exe]`, + `lemonade[.exe]`, `LICENSE`, and `resources/` — not just the binary. - [ ] `lemond` starts as a subprocess with a fresh API key per launch. - [ ] `GET /api/v1/health` returns 200 within the timeout. -- [ ] The default model loads successfully via `POST /v1/load`. +- [ ] The default model is pulled (or bundled) before the first inference; a + custom/overridden model is confirmed via `GET /api/v1/models` and then + pulled. A blank result with no error means this step was skipped. +- [ ] Each lifecycle stage logs a clear line (spawn, health, backend install, + model pull, first result) so a failure is diagnosable from the console. 
- [ ] The existing client's chat / image / speech call returns a valid response with the base URL and key swapped, with no other code changed. -- [ ] In local mode the app's API-key gate is bypassed: no onboarding wall, - validator, or startup check blocks the user for lacking a cloud key. +- [ ] First-run latency is surfaced: the interface shows a loading state from the + moment setup begins (server spawn) until the first response arrives. +- [ ] The HTTP client timeout is set to at least 120 seconds. +- [ ] In local mode the app requires **no** cloud API key: no onboarding wall, + validator, or startup check blocks the user, and no code path throws + "API key not configured" when the active mode is local. +- [ ] If the app uses a dev-mode file watcher, `vendor/lemonade/` is excluded + from the watched paths so runtime writes by lemond do not trigger restarts. - [ ] Killing the parent process leaves no `lemond` subprocess behind. - [ ] On a fresh machine without the optimal backend, the app still works via the Vulkan fallback bundled in `bin/`. diff --git a/skills/local-ai-app-integration/reference.md b/skills/local-ai-app-integration/reference.md index 9b8bb9e..d77f5f1 100644 --- a/skills/local-ai-app-integration/reference.md +++ b/skills/local-ai-app-integration/reference.md @@ -40,14 +40,12 @@ hardware-optimized one at first run after a system probe. ### Speech-to-text -Two NPU paths exist. **Prefer `flm` for NPU**. 
- | Recipe | Backend | Model | Hardware | OS | |---|---|---|---|---| -| `flm` | `npu` | `whisper-v3-turbo-FLM` | XDNA2 NPU | Windows | +| `whispercpp` | `vulkan` | `Whisper-Large-v3-Turbo` | AMD iGPU / dGPU | Windows, Linux | | `whispercpp` | `cpu` | `Whisper-Large-v3-Turbo` | x86_64 CPU | Windows, Linux | -| `whispercpp` | `vulkan` | `Whisper-Large-v3-Turbo` | x86_64 CPU | Linux | -| `whispercpp` | `npu` | `.rai`-cached whisper model | XDNA2 NPU | Windows (avoid) | +| `whispercpp` | `npu` | `Whisper-Large-v3-Turbo` | XDNA2 NPU | Windows | +| `flm` | `npu` | `whisper-v3-turbo-FLM` | XDNA2 NPU | Linux (runtime-install only) | ### Text-to-speech @@ -89,6 +87,14 @@ model catalog; it can be stale or incomplete. A model only appears in `GET /v1/models` once its backend is installed (see Step 3), so install the backend first or the list will look empty/incomplete. +**Catalogued ≠ downloaded.** A model listed by `GET /v1/models` is *available +to use*, not necessarily present on disk. It must be **pulled** +(`POST /api/v1/pull {"model":"..."}`) before it can serve — until then, +inference returns an empty result with HTTP 200, not an error. The surest +signal that a model is ready is a successful pull, not its presence in the +catalog. See SKILL.md +[Step 6](SKILL.md#step-6-health-backend-then-pull-the-model--before-first-inference). + --- ## Hardware probing with /v1/system-info @@ -120,13 +126,31 @@ Response shape (truncated): } ``` -Decision rules in priority order, for the default `llamacpp` recipe: +The same pattern applies to **every** recipe: read the per-backend `state`, +install the best one that is `installable`, use it if already `installed`, and +fall back down the priority list otherwise. Apply it to whichever recipe matches +the app's modality. + +Decision rules in priority order, for the default `llamacpp` recipe (text gen): 1. If `recipes.llamacpp.backends.rocm.state == "installable"` → `POST /v1/install {"recipe":"llamacpp","backend":"rocm"}`. 2. 
Else if `state == "installed"` for `vulkan` → use it as-is. 3. Else fall back to `cpu`. +Decision rules for the `whispercpp` recipe (speech-to-text), NPU-first: + +1. If `recipes.whispercpp.backends.npu.state == "installed"` → use NPU as-is. +2. Else if `npu.state == "installable"` → + `POST /v1/install {"recipe":"whispercpp","backend":"npu"}`, then use NPU. +3. Else if `vulkan` is `installed`/`installable` → use the iGPU/dGPU path. +4. Else fall back to `cpu`. + +Probe **once**, cache the chosen backend for the session (the result does not +change while the app runs), and log which backend was selected. This is the +mechanism that lets one build run on an NPU machine and a CPU-only machine +without any user configuration. + For Ryzen AI Hybrid models on Windows, additionally check `ryzenai-llm.backends.npu.state` and install if `installable`. @@ -247,3 +271,112 @@ Two backend limitations on Linux as of this writing: When building from source for an unusual Linux distro, see the upstream `docs/embeddable/building.md` in the lemonade-sdk/lemonade repo. + +--- + +## Reference launchers + +Full implementations for Step 4. Adapt to the app's language; the key +constraints are: retry with a fresh port on spawn failure (the socket is +released before lemond binds), poll `/api/v1/health` with the Bearer key, +and kill the process on app exit. 
+ +**Python:** + +```python +import os, secrets, socket, subprocess, sys, time, urllib.request +from pathlib import Path + +LEMOND_DIR = Path(__file__).parent / "vendor" / "lemonade" +LEMOND_BIN = LEMOND_DIR / ("lemond.exe" if sys.platform == "win32" else "lemond") + +def _free_port() -> int: + with socket.socket() as s: + s.bind(("127.0.0.1", 0)) + return s.getsockname()[1] + +def start_lemond(retries: int = 3) -> tuple[subprocess.Popen, str, int]: + last_err: Exception | None = None + for _ in range(retries): + port = _free_port() + key = secrets.token_urlsafe(32) + env = {**os.environ, "LEMONADE_API_KEY": key} + proc = subprocess.Popen( + [str(LEMOND_BIN), str(LEMOND_DIR), "--port", str(port)], + env=env, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + ) + try: + _wait_for_health(port, key, timeout_s=30) + return proc, key, port + except RuntimeError as e: + proc.kill() + proc.wait() + last_err = e + raise RuntimeError(f"lemond failed to start after {retries} attempts") from last_err + +def _wait_for_health(port: int, key: str, timeout_s: int) -> None: + url = f"http://127.0.0.1:{port}/api/v1/health" + req = urllib.request.Request(url, headers={"Authorization": f"Bearer {key}"}) + deadline = time.monotonic() + timeout_s + while time.monotonic() < deadline: + try: + with urllib.request.urlopen(req, timeout=1) as r: + if r.status == 200: + return + except Exception: + time.sleep(0.25) + raise RuntimeError(f"lemond on port {port} did not become healthy within {timeout_s}s") +``` + +**Node.js:** + +```js +import { spawn } from "node:child_process"; +import { randomBytes } from "node:crypto"; +import { createServer } from "node:net"; +import path from "node:path"; + +const LEMOND_DIR = path.join(import.meta.dirname, "vendor", "lemonade"); +const LEMOND_BIN = path.join(LEMOND_DIR, process.platform === "win32" ? 
/**
 * Resolve with a loopback TCP port that was free when probed.
 *
 * The probe server is closed before resolving, so another process may claim
 * the port before lemond binds it; startLemond() retries with a fresh port
 * for that reason. Rejects if the probe server cannot listen.
 */
const freePort = () =>
  new Promise((resolve, reject) => {
    const probe = createServer();
    probe.once("error", reject);
    probe.listen(0, "127.0.0.1", () => {
      const { port } = probe.address();
      probe.close(() => resolve(port));
    });
  });

/**
 * Spawn lemond on a free loopback port and wait until it is healthy.
 *
 * Each attempt uses a fresh port and a fresh random API key, passed to lemond
 * through the LEMONADE_API_KEY environment variable.
 *
 * @param {number} [retries=3] Maximum number of spawn attempts.
 * @returns {Promise<{proc: import("node:child_process").ChildProcess, key: string, port: number}>}
 * @throws {Error} When no attempt produced a healthy server; the last failure
 *   is attached as `cause`.
 */
export async function startLemond(retries = 3) {
  let lastErr;
  for (let i = 0; i < retries; i++) {
    const port = await freePort();
    const key = randomBytes(32).toString("base64url");
    const proc = spawn(LEMOND_BIN, [LEMOND_DIR, "--port", String(port)], {
      env: { ...process.env, LEMONADE_API_KEY: key },
      // Readiness is detected over HTTP, never from output. Pipes that nobody
      // reads would block lemond once the OS buffer fills, so discard output
      // (or pass file descriptors of a log file here instead).
      stdio: "ignore",
    });

    // Without an 'error' listener a failed spawn (e.g. missing binary) is an
    // uncaught exception that takes the whole app down. Record the failure so
    // the health wait can stop early instead.
    let failure;
    proc.once("error", (err) => {
      failure ??= err;
    });
    proc.once("exit", (code, signal) => {
      failure ??= new Error(
        `lemond exited before becoming healthy (code ${code}, signal ${signal})`,
      );
    });

    try {
      await waitForHealth(port, key, 30_000, () => failure);
      // Kill lemond on app exit even if the caller forgets to.
      process.once("exit", () => proc.kill());
      return { proc, key, port };
    } catch (e) {
      proc.kill();
      lastErr = e;
    }
  }
  throw new Error(
    `lemond failed to start after ${retries} attempts: ${lastErr?.message}`,
    { cause: lastErr },
  );
}

/**
 * Poll /api/v1/health with the Bearer key until it answers with a 2xx status.
 *
 * @param {number} port Loopback port lemond was started on.
 * @param {string} key API key sent as `Authorization: Bearer <key>`.
 * @param {number} timeoutMs Overall deadline in milliseconds.
 * @param {() => (Error|undefined)} [failed] Returns an error once the child
 *   process is known to be dead, so the wait fails immediately instead of
 *   running out the deadline.
 * @throws {Error} When the process died or the deadline passed.
 */
async function waitForHealth(port, key, timeoutMs, failed = () => undefined) {
  const url = `http://127.0.0.1:${port}/api/v1/health`;
  const headers = { Authorization: `Bearer ${key}` };
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const reason = failed();
    if (reason) throw reason;
    try {
      // Bound each probe so one hung connection cannot overrun the deadline.
      const r = await fetch(url, { headers, signal: AbortSignal.timeout(1000) });
      if (r.ok) return;
    } catch {
      // Not listening yet or the probe timed out; retry after a short pause.
    }
    await new Promise((r) => setTimeout(r, 250));
  }
  throw new Error(`lemond on port ${port} did not become healthy within ${timeoutMs}ms`);
}
+Transcription moves from cloud to your local device. Expect 1–2 hours. -Because this walkthrough runs transcription on the NPU, you need a Ryzen AI PC with an XDNA2 NPU (Strix, Strix Halo, Kraken, or Gorgon Point) running Windows. +## Prerequisites + +This sample app requires the Rust toolchain (install from https://rustup.rs/). + +**Hardware:** Any Windows x64 PC works. The integrated app probes the hardware once per session and picks the fastest available backend. If the machine has an NPU and the chosen recipe supports it, the NPU backend is used — otherwise it transparently falls back to the GPU (Vulkan) backend, and then to CPU as the universal fallback, so the same build works on any end-user machine. The skill logs which backend was selected and why, so you always know what ran. + +| Priority | Your hardware | What you get | +|---|---|---| +| 1 (fastest) | Ryzen AI with XDNA2 NPU (Strix, Strix Halo, Kraken, Gorgon Point) | NPU-accelerated transcription | +| 2 | AMD iGPU / dGPU | GPU-accelerated transcription | +| 3 (fallback) | Any other Windows x64 PC | CPU transcription | ## Step 1 - Get the target app @@ -37,21 +48,32 @@ npx skills add amd/skills --skill local-ai-app-integration --agent claude-code ## Step 4 - Running the skill -Run `claude --model opus` inside the `dictate` repo run the prompt: +Run `claude --model opus` inside the `dictate` repo with this prompt: ``` This app sends my dictation audio to cloud speech-to-text providers. Add a local AI mode that runs transcription on my machine instead by default. -I want it to run using the NPU. Keep the cloud providers as an option and minimize code changes. +Use the best available local backend — NPU if I have one, otherwise iGPU or CPU. +Keep the cloud providers as an option and minimize code changes. ``` Claude should: 1. Survey where the app calls its cloud transcription APIs. -2. Pick a local speech-to-text model + backend (e.g. `whisper-v3-turbo-FLM` using the `FLM` NPU backend). +2. 
Probe hardware (`GET /api/v1/system-info`) and pick the fastest available + backend for `Whisper-Large-v3-Turbo`, NPU-first: + - XDNA2 NPU present → whispercpp NPU backend + - else AMD iGPU/dGPU → whispercpp iGPU/dGPU backend + - else → whispercpp CPU backend 3. Vendor the Embeddable Lemonade (`lemond`) binary into the app tree. -4. Add a launcher that spawns `lemond` on a free port. -5. Re-point the app's existing client at the local endpoint and wait for `/v1/health`. +4. Add a launcher that spawns `lemond` on a free port with retry logic, logging + each lifecycle stage (spawn → health → backend install → model pull → result). +5. Re-point the app's existing client at the local endpoint and wait for + `/api/v1/health`. Because local mode talks to your own machine, it needs **no + cloud API key** — Claude should bypass the app's key-entry gate in local mode. +6. Install the backend, then **pull the model** (`POST /api/v1/pull`) so its + weights are on disk before the first recording. Skipping this makes the very + first transcription come back blank with no error. Please note this may take several minutes as this app has a fairly large codebase. @@ -63,7 +85,35 @@ Dictate is a Tauri (Rust + Node) app. From the repo root: npm install npm run tauri dev ``` -Once the window opens, press the microphone button to speak, and confirm that transcription is now running through your local device instead of a cloud provider. The transcribed text should appear where your cursor was last located. + +**What the first launch looks like.** Watch the terminal (not the browser +console). On a cold first run you should see the staged log lines as setup +progresses — the model download in particular can take a while: + +``` +[lemond] Starting on port 56748 +[lemond] Healthy on port 56748 +[lemond] whispercpp:npu already installed +[lemond] Pulling model Whisper-Large-v3-Turbo... 
+[lemond] Model Whisper-Large-v3-Turbo ready +[local] Using transcription model: Whisper-Large-v3-Turbo backend: npu +[local] Transcription result: " Hi, can you hear me?" +``` + +The **first transcription can be slow** because it covers the whole setup chain: +server spawn + backend setup + model download + model load. Subsequent +recordings are fast. Once the window opens, press the microphone button to +speak, and confirm transcription runs through your local device instead of a +cloud provider — the text appears where your cursor was last located. + +> **Blank result?** If a recording produces no text and the terminal shows +> `[local] Transcription result: ""` with no error, the model was not pulled. +> The model-pull step (item 6 of "Claude should" above) fixes this; it is the +> most common first-run snag. + +> **Repeated phrases** like `" How can you hear me now?\n How can you hear me +> now?\n"` on quiet audio are a known Whisper behavior on silence/low-energy +> input, not an integration bug. ## Step 6 - (Optional) Going beyond