diff --git a/.github/skillspector-allow.yml b/.github/skillspector-allow.yml
index 96123ab..6131795 100644
--- a/.github/skillspector-allow.yml
+++ b/.github/skillspector-allow.yml
@@ -101,6 +101,18 @@ suppressions:
       argparse defaults / explicit --image-model/--tts-model/--stt-model flags,
       not from LLM or model output. Nothing here consumes unvalidated model
       output, so there is no injection sink to sanitize.
+  - skill: local-ai-use
+    rule: TM2
+    file: SKILL.md
+    match: Chaining Abuse
+    reason: >-
+      False positive. Line 103 is the documented Ubuntu/Debian install
+      one-liner `sudo add-apt-repository -y ppa:lemonade-team/stable &&
+      sudo apt-get update && sudo apt-get install -y lemonade-server
+      lemonade-desktop`. The `&&` chaining is the standard apt install
+      sequence (add PPA, refresh index, install package), not tool/command
+      chaining of untrusted or model-derived steps. No LLM output feeds the
+      chain and each command is a fixed, reviewable install step.
   - skill: local-ai-use
     rule: P2
     file: templates/local-ai-rule.md
diff --git a/eval/behavioral/tests/test_local_ai_use.py b/eval/behavioral/tests/test_local_ai_use.py
index 308af11..21ec051 100644
--- a/eval/behavioral/tests/test_local_ai_use.py
+++ b/eval/behavioral/tests/test_local_ai_use.py
@@ -29,13 +29,10 @@ def test_generate_image_of_a_cat():
             run.workspace_contains("out.png")
 
             # Positive behavioral expectations
+            run.should("Install Lemonade Server if it is not already installed")
             run.should("Download the SD-Turbo model if the model is not already downloaded")
             run.should("Add a 'Local AI Use' block to AGENTS.md")
 
             # Negative behavioral expectations
-            run.should_not("Use the GenerateImage tool")
-            run.should_not("Use a cloud image API")
+            run.should_not("Pull unrelated modalities for this image generation task")
             run.should_not("Reach for a cloud image path instead of local Lemonade")
-
-            # Skipped behavioral expectations
-            #run.should_not("Pull unrelated modalities for an image-only task")
diff --git a/skills/local-ai-use/SKILL.md b/skills/local-ai-use/SKILL.md
index f5c5d54..d2896cb 100644
--- a/skills/local-ai-use/SKILL.md
+++ b/skills/local-ai-use/SKILL.md
@@ -21,18 +21,28 @@ needs image generation, text-to-speech, or speech-to-text uses the local
 agent's own LLM keeps handling text; only the expensive multimodal calls move
 on-device.
 
-The skill does two things:
-
-1. **Verifies that local Lemonade is reachable and has the right models.**
-2. **Drops a `Local AI Use` block into the workspace `AGENTS.md`** so the agent
+The skill does three things:
+
+1. **Makes sure local Lemonade is installed and running.** If the `lemonade`
+   CLI is missing, the setup script installs the **full version** of Lemonade
+   (server + desktop app) on the user's behalf; if the server is installed but
+   not running, it launches it.
+2. **Verifies that local Lemonade is reachable.**
+3. **Drops a `Local AI Use` block into the workspace `AGENTS.md`** so the agent
    reads the routing rule on every later turn, in Cursor, Claude Code, Codex,
    Gemini CLI, and any other agent that respects `AGENTS.md`.
 
+Models are **not** downloaded during setup. Each default model is pulled
+lazily, on first use, by the routing rule (e.g. the first image request pulls
+the image model). This keeps setup fast and avoids gigabytes of downloads the
+user may never need.
+
 ## When to use this skill
 
 Use this skill when **all** of the following are true:
 
-- The user has, or is willing to install, the system-wide Lemonade Server.
+- The user wants local Lemonade. If it is not yet installed, the setup script
+  installs the **full version** (server + desktop app) for them automatically.
 - The user accepts the default Lemonade endpoint `http://localhost:13305`.
 - The user wants the change to be **persistent** across future turns and
   agent restarts (the rule is written to disk).
@@ -44,14 +54,15 @@ instead.
 ## Prerequisites
 
 - **OS:** Windows 11 x64, Ubuntu/Debian x64, or macOS (beta).
-- **Lemonade Server CLI on `PATH`:** verify with `lemonade --version`. If
-  missing, install from <https://lemonade-server.ai/install_options.html>
-  before continuing. Do not silently install on the user's machine; that is a
-  system-wide change and must be the user's call.
+- **Lemonade Server:** the setup script installs it if missing. It downloads
+  and silently installs the **full version** (Windows `lemonade.msi`, the
+  Ubuntu/Debian `ppa:lemonade-team/stable` PPA plus `lemonade-desktop`, or the
+  macOS `.pkg`), then launches the server. On Linux/macOS this needs `sudo`.
+  Pass `--no-install` if the user wants to install and start it themselves instead.
 - **Disk:** ~8 GB free for the three default models (SD-Turbo + Whisper-Tiny
-  + kokoro-v1).
-- **Network:** required for the first `lemonade pull` of each model. After
-  that, every modality runs offline.
+  + kokoro-v1), plus ~0.1 GB for the installer itself.
+- **Network:** required for the install download and the first `lemonade pull`
+  of each model. After that, every modality runs offline.
 
 ## The opinionated path
 
@@ -59,52 +70,67 @@ Run this checklist top to bottom. Track progress against it; do not move on
 until each step verifies.
 
 ```
-[ ] 1. Confirm Lemonade Server is installed and reachable
-[ ] 2. Pull the three default modality models
-[ ] 3. Install the routing rule into the workspace AGENTS.md
-[ ] 4. Smoke-test image, TTS, and STT against the local endpoint
+[ ] 1. Ensure Lemonade Server is installed and running (auto-install if missing)
+[ ] 2. Install the routing rule into the workspace AGENTS.md
 ```
 
-The single command that does steps 1, 2, and 3 in one shot is:
+The single command that does both steps in one shot is:
 
 ```bash
 python scripts/setup_local_ai.py
 ```
 
-The script is idempotent: re-running it on a
-fully configured workspace is a no-op apart from a healthcheck. Read the
-sections below for what to do when each step fails.
+It auto-installs the full version of Lemonade if the `lemonade` CLI is
+missing, launches the server if it is not running, then writes the rule. The
+script is idempotent: re-running it on a fully configured workspace is a no-op
+apart from a healthcheck. Read the sections below for what to do when each
+step fails.
 
 ---
 
-## Step 1: confirm Lemonade Server is reachable
+## Step 1: ensure Lemonade Server is installed and running
 
-Run:
+`scripts/setup_local_ai.py` handles this end to end, but here is what it does
+so you can do it by hand or debug it:
 
-```bash
-lemonade status --json
-```
+**1a. Is the CLI installed?** Check whether `lemonade` is on `PATH`
+(`lemonade --version`). If it is not, install the **full version** on the
+user's behalf:
 
-Two acceptable outcomes:
+| OS | Install the full version |
+|---|---|
+| Windows | Download `lemonade.msi` from the [latest release](https://github.com/lemonade-sdk/lemonade/releases/latest/download/lemonade.msi) and run `msiexec /i lemonade.msi /qn` (silent, per-user, no elevation). |
+| Ubuntu/Debian | `sudo add-apt-repository -y ppa:lemonade-team/stable && sudo apt-get update && sudo apt-get install -y lemonade-server lemonade-desktop` |
+| macOS (beta) | Download the `Lemonade-<ver>-Darwin.pkg` from the latest release and run `sudo installer -pkg Lemonade-<ver>-Darwin.pkg -target /`. |
+
+The full installer bundles the server **and** the desktop app; the
+server-only minimal MSI and the legacy `lemonade-server` CLI are deprecated
+upstream. After a Windows install the CLI lands in
+`%LOCALAPPDATA%\lemonade_server` and is added to the *user* PATH (new shells
+only); the setup script probes that directory so it works in the same run.
+
+**1b. Is the server running?** Check `lemonade status --json`.
 
 | `lemonade status` says | Action |
 |---|---|
 | `Server is running on port 13305` | Continue to Step 2. |
-| `Server is not running` | Start it. On Windows, launch the **Lemonade** Start Menu shortcut. On Linux, run `sudo systemctl start lemonade-server`. Re-check `lemonade status`. |
+| `Server is not running` | Launch it with `lemonade serve` (the script does this in the background and polls `/api/v1/health` until it answers). |
 
-If `lemonade` is not on `PATH` at all, the server is not installed. Stop and
-point the user at <https://lemonade-server.ai/install_options.html>. Do not
-attempt a silent install.
+Only if the automatic install genuinely fails (no `apt-get`, no `sudo`,
+download blocked) should you stop and point the user at
+<https://lemonade-server.ai/install_options.html>.
 
 The rest of this skill assumes the endpoint is `http://localhost:13305/api/v1`
 and no API key is required (the system-wide server defaults to no auth on
 loopback). If the user has set `LEMONADE_API_KEY`, the routing rule template
 in `templates/local-ai-rule.md` shows where to add the `Authorization` header.
 
-## Step 2: pull the three default modality models
+### Default modality models (pulled on first use, not during setup)
 
-Pull these three. They are the **Lite Collection** defaults from Lemonade
-OmniRouter, sized to keep token-and-cost savings real on commodity hardware:
+Setup does **not** download these. The installed rule pulls each one the first
+time that modality is requested. They are the **Lite Collection** defaults from
+Lemonade OmniRouter, sized to keep token-and-cost savings real on commodity
+hardware:
 
 | Modality | Model | Size | Why this default |
 |---|---|---|---|
@@ -112,34 +138,20 @@ OmniRouter, sized to keep token-and-cost savings real on commodity hardware:
 | Text-to-speech | `kokoro-v1` | ~0.3 GB | Only TTS model Lemonade currently supports; CPU-only, low latency |
 | Speech-to-text | `Whisper-Tiny` | ~0.1 GB | Smallest Whisper; fast on CPU. Upgrade to `Whisper-Large-v3-Turbo` if accuracy matters more than latency. |
 
-```bash
-lemonade pull SD-Turbo
-lemonade pull kokoro-v1
-lemonade pull Whisper-Tiny
-```
-
-To choose a different model while installing the rule, pass it to the setup
-script. For example, to make future image requests use SDXL:
+To write a different model ID into the rule, pass it to the setup script. For
+example, to make future image requests use `SDXL-Turbo`:
 
 ```bash
 python scripts/setup_local_ai.py --image-model SDXL-Turbo
 ```
 
-The script will pull the selected model and write that model ID into the
-installed `AGENTS.md` rule. The same pattern works for `--tts-model` and
-`--stt-model`.
-
-Each `pull` is idempotent. To verify what is already downloaded:
-
-```bash
-lemonade list --downloaded
-```
-
-For coverage of larger / higher-quality alternatives (`SDXL-Turbo`,
-`Flux-2-Klein-4B`, `Whisper-Large-v3-Turbo`), see the
+That model ID is written into the installed `AGENTS.md` rule and pulled on its
+first use. The same pattern works for `--tts-model` and `--stt-model`. For
+larger / higher-quality alternatives (`SDXL-Turbo`, `Flux-2-Klein-4B`,
+`Whisper-Large-v3-Turbo`), see the
 [model picker in reference.md](reference.md#model-picker).
 
-## Step 3: install the routing rule into AGENTS.md
+## Step 2: install the routing rule into AGENTS.md
 
 The rule is a Markdown block stored in [`templates/local-ai-rule.md`](templates/local-ai-rule.md).
 Append it to the workspace's `AGENTS.md` (create the file if missing). Both
@@ -169,44 +181,6 @@ block to:
 
 The rule's content is identical; only the file location changes.
 
-## Step 4: smoke-test the three modalities
-
-Verify each modality against the live server before declaring success. These
-mirror the inline patterns in the installed rule, so a green pass here means
-the rule will work. If you installed with a model override such as
-`--image-model SDXL-Turbo`, use that model ID in the smoke test and confirm
-the installed `AGENTS.md` rule contains it.
-
-**Image generation** (writes `out.png`):
-
-```bash
-curl -sX POST http://localhost:13305/api/v1/images/generations \
-  -H "Content-Type: application/json" \
-  -d '{"model":"SD-Turbo","prompt":"a single red apple on a white table","size":"512x512","steps":4,"response_format":"b64_json"}' \
-  | python -c "import sys,json,base64; open('out.png','wb').write(base64.b64decode(json.load(sys.stdin)['data'][0]['b64_json']))"
-```
-
-**Text-to-speech** (writes `out.mp3`):
-
-```bash
-curl -sX POST http://localhost:13305/api/v1/audio/speech \
-  -H "Content-Type: application/json" \
-  -d '{"model":"kokoro-v1","input":"Local AI is now active.","response_format":"mp3"}' \
-  -o out.mp3
-```
-
-**Speech-to-text** (round-trips `out.mp3` → text via a wav re-encode):
-
-```bash
-ffmpeg -y -i out.mp3 -ar 16000 -ac 1 out.wav
-curl -sX POST http://localhost:13305/api/v1/audio/transcriptions \
-  -F "file=@out.wav" -F "model=Whisper-Tiny"
-```
-
-If any of the three returns a non-2xx status, fix it now. The rule we just
-installed sends future requests to these same endpoints, so a broken endpoint
-becomes a broken user experience.
-
 ---
 
 ## What changes after this skill runs
@@ -236,8 +210,8 @@ machine.
 
 | Symptom | Cause | Recovery |
 |---|---|---|
-| `lemonade: command not found` | Server CLI not installed | Install from <https://lemonade-server.ai/install_options.html>; restart shell. |
-| `Server is not running` | Service stopped after install | Windows: launch the **Lemonade** Start Menu shortcut. Linux: `sudo systemctl start lemonade-server`. |
+| `lemonade: command not found` | CLI not installed | Re-run `python scripts/setup_local_ai.py` (auto-installs the full version). If it was just installed on Windows, the current shell's PATH is stale: open a new shell, or re-run the setup script, which also probes `%LOCALAPPDATA%\lemonade_server` directly. |
+| `Server is not running` | Service stopped after install | Run `lemonade serve` (the setup script launches it for you). |
 | `POST /v1/images/generations` returns 404 model not found | Image model not downloaded | `lemonade pull SD-Turbo` and retry. |
 | Image generation is slow on CPU (~4–5 min) | sd-cpp on CPU backend | Install the GPU backend on supported AMD hardware: `lemonade backends install sd-cpp:rocm`. |
 | `POST /v1/audio/transcriptions` returns 400 unsupported format | Input is not 16 kHz mono WAV | Re-encode with `ffmpeg -i in.* -ar 16000 -ac 1 out.wav`. |
@@ -249,14 +223,11 @@ machine.
 Mark this skill complete only when **all** of the following are true:
 
 - [ ] `lemonade status --json` reports the server running on port 13305.
-- [ ] `lemonade list --downloaded` shows `SD-Turbo`, `kokoro-v1`, and
-      `Whisper-Tiny`.
 - [ ] The workspace `AGENTS.md` contains the
       `amd-skills:local-ai-use` block.
-- [ ] All three smoke tests in Step 4 succeed.
 - [ ] On a follow-up turn, asking the agent to "generate an image of X"
       causes it to POST to `http://localhost:13305/api/v1/images/generations`
-      rather than calling a cloud tool.
+      (pulling the model on first use) rather than calling a cloud tool.
 
 If any box is unchecked, the user is still paying cloud cost for at least
 one modality.
diff --git a/skills/local-ai-use/scripts/setup_local_ai.py b/skills/local-ai-use/scripts/setup_local_ai.py
index 2ea62c9..a845fd0 100644
--- a/skills/local-ai-use/scripts/setup_local_ai.py
+++ b/skills/local-ai-use/scripts/setup_local_ai.py
@@ -5,19 +5,25 @@
 # ///
 """One-shot setup for the `local-ai-use` skill.
 
-Performs the three setup steps from SKILL.md:
-
-  1. Confirms the system-wide Lemonade Server is installed and reachable on
-     http://localhost:13305 (override with --host / --port or LEMONADE_HOST /
-     LEMONADE_PORT).
-  2. Pulls the three default modality models if they are missing
-     (image: SD-Turbo, TTS: kokoro-v1, STT: Whisper-Tiny).
-  3. Writes the routing rule from `templates/local-ai-rule.md` into
+Performs the setup steps from SKILL.md:
+
+  1. Ensures the full Lemonade Server (server + desktop app) is installed and
+     running on http://localhost:13305 (override with --host / --port or
+     LEMONADE_HOST / LEMONADE_PORT). If the `lemonade` CLI is missing, the
+     full version is installed on the user's behalf; if the server is not
+     running, it is launched.
+  2. Writes the routing rule from `templates/local-ai-rule.md` into
      <workspace>/AGENTS.md, between stable BEGIN/END markers so re-runs
      replace the block in place rather than appending.
 
+Setup never downloads models: the default image/TTS/STT models are pulled
+on first use, by the installed AGENTS.md rule (see its failure
+handling). This keeps setup fast and, once Lemonade is installed, offline.
+
 The script is idempotent: a second run on a fully configured workspace only
 re-runs the healthcheck. It exits non-zero on any unrecoverable failure.
+Pass --no-install to refuse the automatic install/launch (it then just reports
+the missing CLI or stopped server and exits non-zero, the old behaviour).
 
 Constants are documented inline; nothing is magical.
 """
@@ -27,9 +33,12 @@
 import argparse
 import json
 import os
+import platform
 import re
 import shutil
 import subprocess
+import tempfile
+import time
 import urllib.error
 import urllib.request
 from pathlib import Path
@@ -59,6 +68,39 @@
 
 INSTALL_URL = "https://lemonade-server.ai/install_options.html"
 
+# The *full* Windows installer: Lemonade Server plus the desktop app (the
+# minimal, server-only MSI and the legacy `lemonade-server` CLI are deprecated
+# upstream). `releases/latest/download/<asset>` always resolves to the newest
+# published asset of that exact name, so we never have to pin a version.
+WINDOWS_MSI_URL = (
+    "https://github.com/lemonade-sdk/lemonade/releases/latest/download/lemonade.msi"
+)
+# Default per-user install location used by lemonade.msi. The CLI is added to
+# the *user* PATH in the registry, which the current process will not see, so
+# we also probe this tree directly after installing.
+WINDOWS_INSTALL_DIR = Path(
+    os.environ.get("LOCALAPPDATA", str(Path.home() / "AppData" / "Local"))
+) / "lemonade_server"
+
+# GitHub release metadata, used to resolve the versioned macOS .pkg asset
+# (its filename embeds the version, so there is no stable latest/download URL).
+GITHUB_LATEST_RELEASE_API = (
+    "https://api.github.com/repos/lemonade-sdk/lemonade/releases/latest"
+)
+
+# Ubuntu/Debian "full" install: the stable PPA (server) plus the desktop
+# frontend package. Run as one `&&`-chained shell command (each step runs only
+# if the previous one succeeded) so one sudo prompt covers the whole thing.
+LINUX_APT_INSTALL = (
+    "sudo add-apt-repository -y ppa:lemonade-team/stable && "
+    "sudo apt-get update && "
+    "sudo apt-get install -y lemonade-server lemonade-desktop"
+)
+
+# CLI names to look for / drive, newest first. `lemonade-server` is the
+# deprecated alias kept for older installs.
+CLI_NAMES = ("lemonade", "lemonade-server")
+
 
 def _default_workspace() -> Path:
     """Workspace root for AGENTS.md.
@@ -86,77 +128,160 @@ def _http_get(url: str, timeout_s: float) -> tuple[int, bytes]:
         return r.status, r.read()
 
 
-def check_cli_installed() -> bool:
-    """Return True if the `lemonade` CLI is on PATH."""
-    return shutil.which("lemonade") is not None
+def find_cli() -> str | None:
+    """Locate a usable Lemonade CLI executable, or return None.
 
+    PATH is searched first, in CLI_NAMES order. The Windows MSI only edits the
+    *user* PATH in the registry, which this running process did not inherit,
+    so on Windows the default per-user install tree is scanned as a fallback.
+    """
+    on_path = next(filter(None, map(shutil.which, CLI_NAMES)), None)
+    if on_path is not None:
+        return on_path
+    if platform.system() != "Windows" or not WINDOWS_INSTALL_DIR.exists():
+        return None
+    for cli_name in CLI_NAMES:
+        hit = next(WINDOWS_INSTALL_DIR.rglob(f"{cli_name}.exe"), None)
+        if hit is not None:
+            return str(hit)
+    return None
+
+
+def install_lemonade() -> None:
+    """Run the full-version Lemonade installer that matches this OS.
+
+    Raises RuntimeError when the OS has no automatic installer; a RuntimeError
+    raised by the chosen installer is left to propagate to the caller.
+    """
+    os_name = platform.system()
+    installer = {
+        "Windows": _install_windows,
+        "Linux": _install_linux,
+        "Darwin": _install_macos,
+    }.get(os_name)
+    if installer is None:
+        raise RuntimeError(
+            f"No automatic installer for this OS ({os_name}). "
+            f"Install manually: {INSTALL_URL}"
+        )
+    installer()
 
-def check_server_reachable(host: str, port: int) -> bool:
-    """Return True if /api/v1/health responds 200 within 3 seconds."""
-    url = f"http://{host}:{port}/api/v1/health"
+
+def _download(url: str, dest: Path) -> None:
+    _print(f"downloading {url}")
     try:
-        status, _ = _http_get(url, timeout_s=3.0)
-        return status == 200
-    except (urllib.error.URLError, OSError):
-        return False
+        urllib.request.urlretrieve(url, dest)  # noqa: S310
+    except (urllib.error.URLError, OSError) as exc:
+        raise RuntimeError(f"download failed ({url}): {exc}") from exc
+
+
+def _run(cmd: list[str] | str, *, shell: bool = False) -> None:
+    """Echo and execute an install command; raise RuntimeError on non-zero exit."""
+    shown = " ".join(cmd) if not isinstance(cmd, str) else cmd
+    _print(f"running: {shown}")
+    exit_code = subprocess.run(cmd, shell=shell).returncode  # noqa: S602,S603
+    if exit_code:
+        raise RuntimeError(f"command failed (exit {exit_code}): {shown}")
+
+
+def _install_windows() -> None:
+    """Fetch lemonade.msi (server + desktop app) and install it with no UI."""
+    installer_path = Path(tempfile.gettempdir(), "lemonade.msi")
+    _download(WINDOWS_MSI_URL, installer_path)
+    # `/i` = install, `/qn` = no UI. The MSI installs per-user (no elevation)
+    # and registers the CLI plus the Start Menu shortcut.
+    _run(["msiexec", "/i", os.fspath(installer_path), "/qn"])
+    _print("Lemonade full version installed.")
+
+
+def _install_linux() -> None:
+    """Install the PPA server + desktop frontend via apt; RuntimeError on failure."""
+    if shutil.which("apt-get") is None:
+        raise RuntimeError(
+            "Automatic install only supports apt-based distros (Ubuntu/Debian). "
+            f"Install manually: {INSTALL_URL}"
+        )
+    need_sudo = os.geteuid() != 0  # type: ignore[attr-defined]
+    if need_sudo and shutil.which("sudo") is None:
+        raise RuntimeError(f"Need root (or sudo) to install system packages. Install manually: {INSTALL_URL}")
+    # LINUX_APT_INSTALL hard-codes `sudo`; root does not need it and minimal
+    # containers often lack it, so strip the prefix when already running as root.
+    _run(LINUX_APT_INSTALL if need_sudo else LINUX_APT_INSTALL.replace("sudo ", ""), shell=True)
+    _print("Lemonade full version installed.")
 
 
-def list_downloaded_models(host: str, port: int) -> set[str]:
-    """Return the set of locally downloaded model IDs.
+def _install_macos() -> None:
+    """Resolve, download and install the newest macOS .pkg system-wide."""
+    local_pkg = Path(tempfile.gettempdir(), "Lemonade.pkg")
+    _download(_resolve_macos_pkg_url(), local_pkg)
+    # `-target /` makes this a system-wide install, so it runs under sudo.
+    _run(["sudo", "installer", "-pkg", os.fspath(local_pkg), "-target", "/"])
+    _print("Lemonade full version installed.")
 
-    Uses `lemonade list --downloaded` (CLI) and falls back to
-    GET /api/v1/models when the CLI lacks the flag. Returning an empty set is
-    treated as "could not determine" by the caller, which still attempts the
-    pulls; `lemonade pull` is itself idempotent.
-    """
-    try:
-        out = subprocess.run(
-            ["lemonade", "list", "--downloaded", "--json"],
-            check=True, capture_output=True, text=True, timeout=10,
-        ).stdout
-        data = json.loads(out)
-        return {m.get("id", "") for m in data if isinstance(m, dict)}
-    except (subprocess.SubprocessError, json.JSONDecodeError, FileNotFoundError):
-        pass
 
+def _resolve_macos_pkg_url() -> str:
+    """Resolve the versioned macOS .pkg download URL from the latest release."""
+    req = urllib.request.Request(
+        GITHUB_LATEST_RELEASE_API, headers={"Accept": "application/vnd.github+json"}
+    )
     try:
-        status, body = _http_get(
-            f"http://{host}:{port}/api/v1/models",
-            timeout_s=5,
+        with urllib.request.urlopen(req, timeout=15.0) as r:  # noqa: S310
+            data = json.loads(r.read())
+    except (urllib.error.URLError, OSError, ValueError) as exc:
+        raise RuntimeError(f"could not query latest release: {exc}") from exc
+    for asset in data.get("assets", []):
+        name = asset.get("name", "")
+        if name.endswith("-Darwin.pkg"):
+            return asset["browser_download_url"]
+    raise RuntimeError(
+        "No macOS .pkg asset found in the latest release. "
+        f"Install manually: {INSTALL_URL}"
+    )
+
+
+def launch_server(cli: str, host: str, port: int) -> None:
+    """Start the Lemonade server in the background (it stays up after we exit)."""
+    cmd = [cli, "serve"]
+    # Only pass overrides; the server already defaults to localhost:13305.
+    if port != DEFAULT_PORT:
+        cmd += ["--port", str(port)]
+    if host not in {DEFAULT_HOST, "localhost", "::1"}:
+        cmd += ["--host", host]
+    _print(f"launching: {' '.join(cmd)}")
+    kwargs: dict = {
+        "stdout": subprocess.DEVNULL,
+        "stderr": subprocess.DEVNULL,
+    }
+    if platform.system() == "Windows":
+        # Detach so the persistent server survives this process exiting.
+        kwargs["creationflags"] = (
+            subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
         )
-        if status == 200:
-            data = json.loads(body)
-            return {
-                m.get("id", "") for m in data.get("data", [])
-                if isinstance(m, dict) and m.get("downloaded")
-            }
-    except (urllib.error.URLError, OSError, json.JSONDecodeError):
-        pass
+    else:
+        kwargs["start_new_session"] = True
+    try:
+        subprocess.Popen(cmd, **kwargs)  # noqa: S603
+    except OSError as exc:
+        raise RuntimeError(f"could not launch `{' '.join(cmd)}`: {exc}") from exc
+
 
-    return set()
+def wait_for_server(host: str, port: int, timeout_s: float = 90.0) -> bool:
+    """Poll /api/v1/health (2 s between probes); True on 200, False after timeout_s."""
+    deadline = time.monotonic() + timeout_s
+    while time.monotonic() < deadline:
+        if check_server_reachable(host, port):
+            return True
+        time.sleep(2.0)
+    return False
 
 
-def pull_model(model: str) -> bool:
-    """Run `lemonade pull <model>`. Returns True on success."""
-    _print(f"pulling {model}...")
+def check_server_reachable(host: str, port: int) -> bool:
+    """Return True if /api/v1/health responds 200 within 3 seconds."""
+    url = f"http://{host}:{port}/api/v1/health"
     try:
-        subprocess.run(
-            ["lemonade", "pull", model],
-            check=True,
-            # Stream output so the user sees the download progress instead of
-            # staring at a frozen prompt; SD-Turbo is several GB.
-            stdout=None, stderr=None,
-            # SD-Turbo is the largest pull at ~5 GB. 30 minutes is generous
-            # for a slow connection; below that we'd false-positive on real
-            # downloads.
-            timeout=30 * 60,
-        )
-        return True
-    except subprocess.CalledProcessError as exc:
-        _print(f"pull failed for {model} (exit {exc.returncode})")
-        return False
-    except subprocess.TimeoutExpired:
-        _print(f"pull timed out for {model} after 30 minutes")
+        status, _ = _http_get(url, timeout_s=3.0)
+        return status == 200
+    except (urllib.error.URLError, OSError):
         return False
 
 
@@ -276,63 +401,72 @@ def main(argv: list[str] | None = None) -> int:
         default=int(os.environ.get("LEMONADE_PORT", str(DEFAULT_PORT))),
         help="Lemonade Server port (default: 13305 / $LEMONADE_PORT).",
     )
-    parser.add_argument(
-        "--skip-pull",
-        action="store_true",
-        help="Do not pull missing models; just verify and write AGENTS.md.",
-    )
     parser.add_argument(
         "--image-model",
         default=DEFAULT_IMAGE_MODEL,
-        help=f"Image generation model to pull and write into AGENTS.md (default: {DEFAULT_IMAGE_MODEL}).",
+        help=f"Image generation model written into AGENTS.md, pulled on first use (default: {DEFAULT_IMAGE_MODEL}).",
     )
     parser.add_argument(
         "--tts-model",
         default=DEFAULT_TTS_MODEL,
-        help=f"Text-to-speech model to pull and write into AGENTS.md (default: {DEFAULT_TTS_MODEL}).",
+        help=f"Text-to-speech model written into AGENTS.md, pulled on first use (default: {DEFAULT_TTS_MODEL}).",
     )
     parser.add_argument(
         "--stt-model",
         default=DEFAULT_STT_MODEL,
-        help=f"Speech-to-text model to pull and write into AGENTS.md (default: {DEFAULT_STT_MODEL}).",
+        help=f"Speech-to-text model written into AGENTS.md, pulled on first use (default: {DEFAULT_STT_MODEL}).",
+    )
+    parser.add_argument(
+        "--no-install",
+        action="store_true",
+        help="Do not auto-install/launch Lemonade; just report and exit non-zero if missing.",
     )
     args = parser.parse_args(argv)
 
-    if not check_cli_installed():
-        _print("FAIL: `lemonade` is not on PATH.")
-        _print(f"Install Lemonade Server first: {INSTALL_URL}")
-        return 2
+    cli = find_cli()
+    if cli is None:
+        if args.no_install:
+            _print("FAIL: `lemonade` is not on PATH (--no-install set).")
+            _print(f"Install the full version manually: {INSTALL_URL}")
+            return 2
+        _print("`lemonade` CLI not found; installing the full version of Lemonade.")
+        try:
+            install_lemonade()
+        except RuntimeError as exc:
+            _print(f"FAIL: automatic install did not complete: {exc}")
+            return 2
+        cli = find_cli()
+        if cli is None:
+            _print("FAIL: install finished but the `lemonade` CLI is still not found.")
+            _print(
+                "Open a new shell so PATH refreshes and re-run, or install "
+                f"manually: {INSTALL_URL}"
+            )
+            return 2
+    _print(f"using Lemonade CLI: {cli}")
 
     if not check_server_reachable(args.host, args.port):
-        _print(
-            f"FAIL: Lemonade Server is not responding at "
-            f"http://{args.host}:{args.port}/api/v1/health."
-        )
-        _print(
-            "Start it: on Windows launch the Lemonade Start Menu shortcut; "
-            "on Linux run `sudo systemctl start lemonade-server`."
-        )
-        return 3
+        if args.no_install:
+            _print(
+                f"FAIL: Lemonade Server is not responding at "
+                f"http://{args.host}:{args.port}/api/v1/health (--no-install set)."
+            )
+            return 3
+        _print("Lemonade Server is not running; launching it.")
+        try:
+            launch_server(cli, args.host, args.port)
+        except RuntimeError as exc:
+            _print(f"FAIL: could not launch the server: {exc}")
+            return 3
+        if not wait_for_server(args.host, args.port):
+            _print(
+                f"FAIL: launched the server but it never became reachable at "
+                f"http://{args.host}:{args.port}/api/v1/health."
+            )
+            return 3
 
     _print(f"server reachable at http://{args.host}:{args.port}")
 
-    if not args.skip_pull:
-        downloaded = list_downloaded_models(args.host, args.port)
-        selected_models = dict.fromkeys(
-            (args.image_model, args.tts_model, args.stt_model)
-        )
-        for model in selected_models:
-            if model in downloaded:
-                _print(f"already downloaded: {model}")
-                continue
-            if not pull_model(model):
-                # Surface the failure but keep going so the user at least gets
-                # the rule installed for the modalities that did succeed.
-                _print(
-                    f"continuing without {model}; the rule will reference it "
-                    "but calls will 404 until you pull it."
-                )
-
     upsert_agents_md(
         args.workspace.resolve(),
         host=args.host,