Allow local AI model overrides

eddierichter-amd · eddierichter-amd · commit 38d630c801d0 · 2026-05-06T14:42:22.000-06:00
diff --git a/skills/local-ai-use/SKILL.md b/skills/local-ai-use/SKILL.md
@@ -118,6 +118,17 @@ lemonade pull kokoro-v1
 lemonade pull Whisper-Tiny
 ```
 
+To choose a different model while installing the rule, pass it to the setup
+script. For example, to make future image requests use SDXL-Turbo:
+
+```bash
+python scripts/setup_local_ai.py --image-model SDXL-Turbo
+```
+
+The script will pull the selected model and write that model ID into the
+installed `AGENTS.md` rule. The same pattern works for `--tts-model` and
+`--stt-model`.
+
 Each `pull` is idempotent. To verify what is already downloaded:
 
 ```bash
@@ -135,9 +146,10 @@ Append it to the workspace's `AGENTS.md` (create the file if missing). Both
 Cursor and Claude Code load `AGENTS.md` automatically on every turn, so the
 agent will see the rule on its next message without any further setup.
 
-`scripts/setup_local_ai.py` does this for you, surrounded by stable markers
-so re-running the script replaces the block in place rather than appending
-a second copy. The markers look like:
+`scripts/setup_local_ai.py` does this for you. It bakes the selected endpoint
+and model IDs into the rule and wraps the block in stable markers, so
+re-running the script replaces it in place rather than appending a second
+copy. The markers look like:
 
 ```
 <!-- BEGIN amd-skills:local-ai-use -->
@@ -161,7 +173,9 @@ The rule's content is identical; only the file location changes.
 
 Verify each modality against the live server before declaring success. These
 mirror the inline patterns in the installed rule, so a green pass here means
-the rule will work.
+the rule will work. If you installed with a model override such as
+`--image-model SDXL-Turbo`, use that model ID in the smoke test and confirm
+the installed `AGENTS.md` rule contains it.
 
 **Image generation** (writes `out.png`):
 
diff --git a/skills/local-ai-use/reference.md b/skills/local-ai-use/reference.md
@@ -30,8 +30,13 @@ asks for higher quality or has explicit hardware to spare.
 | `SD-1.5` | ~4 GB | When the user asks for "Stable Diffusion 1.5" by name. | Needs more steps (~20). |
 | `Flux-2-Klein-4B` | ~4 GB | Image **editing** (`/v1/images/edits`). | Editing-capable, slower than SD-Turbo for plain generation. |
 
-To upgrade: `lemonade pull <model>`, then change `"model"` in the rule
-block in `AGENTS.md` to the new model id.
+To upgrade: re-run setup with the target model, for example:
+
+```bash
+python scripts/setup_local_ai.py --image-model SDXL-Turbo
+```
+
+The script pulls the model and rewrites the `AGENTS.md` rule in place.
 
 ### Text-to-speech (`recipe: kokoro`)
 
diff --git a/skills/local-ai-use/scripts/setup_local_ai.py b/skills/local-ai-use/scripts/setup_local_ai.py
@@ -27,9 +27,9 @@
 import argparse
 import json
 import os
+import re
 import shutil
 import subprocess
-import sys
 import urllib.error
 import urllib.request
 from pathlib import Path
@@ -40,11 +40,13 @@
 DEFAULT_HOST = "127.0.0.1"
 DEFAULT_PORT = 13305
 
-# The Lite Collection from Lemonade OmniRouter. Picked because each fits in
-# under ~5 GB and runs on commodity CPU hardware, so the savings vs. cloud
-# calls are real on a typical developer laptop. See SKILL.md for upgrade
+# The Lite Collection from Lemonade OmniRouter. Picked because each default
+# fits in under ~5 GB and runs on commodity CPU hardware, so the savings vs.
+# cloud calls are real on a typical developer laptop. See SKILL.md for upgrade
 # paths.
-DEFAULT_MODELS = ("SD-Turbo", "kokoro-v1", "Whisper-Tiny")
+DEFAULT_IMAGE_MODEL = "SD-Turbo"
+DEFAULT_TTS_MODEL = "kokoro-v1"
+DEFAULT_STT_MODEL = "Whisper-Tiny"
 
 # Stable markers around the rule block in AGENTS.md. The script rewrites the
 # region between these markers in place; do not change the marker strings or
@@ -84,7 +86,7 @@ def check_server_reachable(host: str, port: int) -> bool:
         return False
 
 
-def list_downloaded_models() -> set[str]:
+def list_downloaded_models(host: str, port: int) -> set[str]:
     """Return the set of locally downloaded model IDs.
 
     Uses `lemonade list --downloaded` (CLI) and falls back to
@@ -103,7 +105,10 @@ def list_downloaded_models() -> set[str]:
         pass
 
     try:
-        status, body = _http_get("http://127.0.0.1:13305/api/v1/models", timeout_s=5)
+        status, body = _http_get(
+            f"http://{host}:{port}/api/v1/models",
+            timeout_s=5,
+        )
         if status == 200:
             data = json.loads(body)
             return {
@@ -140,8 +145,15 @@ def pull_model(model: str) -> bool:
         return False
 
 
-def render_rule_block() -> str:
-    """Read the rule template; pass through unchanged.
+def render_rule_block(
+    *,
+    host: str,
+    port: int,
+    image_model: str,
+    tts_model: str,
+    stt_model: str,
+) -> str:
+    """Read the rule template and fill in endpoint/model choices.
 
     The template already includes BEGIN/END markers and matches the constants
     at the top of this file. We re-validate that here so a future template
@@ -158,13 +170,44 @@ def render_rule_block() -> str:
             "Rule template is missing the BEGIN/END markers; refuse to write "
             "AGENTS.md because re-runs would append duplicate blocks."
         )
+    endpoint_host = "localhost" if host in {"127.0.0.1", "::1"} else host
+    base_root = f"http://{endpoint_host}:{port}"
+    replacements = {
+        "{{LEMONADE_BASE_ROOT}}": base_root,
+        "{{LEMONADE_BASE_URL}}": f"{base_root}/api/v1",
+        "{{IMAGE_MODEL}}": image_model,
+        "{{TTS_MODEL}}": tts_model,
+        "{{STT_MODEL}}": stt_model,
+    }
+    for placeholder, value in replacements.items():
+        text = text.replace(placeholder, value)
+    unresolved = sorted(set(re.findall(r"\{\{[A-Z_]+\}\}", text)))
+    if unresolved:
+        raise ValueError(
+            "Rule template still has unresolved placeholders: "
+            + ", ".join(unresolved)
+        )
     return text.strip() + "\n"
 
 
-def upsert_agents_md(workspace: Path) -> Path:
+def upsert_agents_md(
+    workspace: Path,
+    *,
+    host: str,
+    port: int,
+    image_model: str,
+    tts_model: str,
+    stt_model: str,
+) -> Path:
     """Write or replace the rule block inside <workspace>/AGENTS.md."""
     target = workspace / "AGENTS.md"
-    block = render_rule_block()
+    block = render_rule_block(
+        host=host,
+        port=port,
+        image_model=image_model,
+        tts_model=tts_model,
+        stt_model=stt_model,
+    )
 
     if not target.exists():
         target.write_text(
@@ -223,6 +266,21 @@ def main(argv: list[str] | None = None) -> int:
         action="store_true",
         help="Do not pull missing models; just verify and write AGENTS.md.",
     )
+    parser.add_argument(
+        "--image-model",
+        default=DEFAULT_IMAGE_MODEL,
+        help=f"Image generation model to pull and write into AGENTS.md (default: {DEFAULT_IMAGE_MODEL}).",
+    )
+    parser.add_argument(
+        "--tts-model",
+        default=DEFAULT_TTS_MODEL,
+        help=f"Text-to-speech model to pull and write into AGENTS.md (default: {DEFAULT_TTS_MODEL}).",
+    )
+    parser.add_argument(
+        "--stt-model",
+        default=DEFAULT_STT_MODEL,
+        help=f"Speech-to-text model to pull and write into AGENTS.md (default: {DEFAULT_STT_MODEL}).",
+    )
     args = parser.parse_args(argv)
 
     if not check_cli_installed():
@@ -244,8 +302,11 @@ def main(argv: list[str] | None = None) -> int:
     _print(f"server reachable at http://{args.host}:{args.port}")
 
     if not args.skip_pull:
-        downloaded = list_downloaded_models()
-        for model in DEFAULT_MODELS:
+        downloaded = list_downloaded_models(args.host, args.port)
+        selected_models = dict.fromkeys(
+            (args.image_model, args.tts_model, args.stt_model)
+        )
+        for model in selected_models:
             if model in downloaded:
                 _print(f"already downloaded: {model}")
                 continue
@@ -257,7 +318,14 @@ def main(argv: list[str] | None = None) -> int:
                     "but calls will 404 until you pull it."
                 )
 
-    upsert_agents_md(args.workspace.resolve())
+    upsert_agents_md(
+        args.workspace.resolve(),
+        host=args.host,
+        port=args.port,
+        image_model=args.image_model,
+        tts_model=args.tts_model,
+        stt_model=args.stt_model,
+    )
     _print("done. Future image, TTS, and STT requests now route to local Lemonade.")
     return 0
 
diff --git a/skills/local-ai-use/templates/local-ai-rule.md b/skills/local-ai-use/templates/local-ai-rule.md
@@ -2,7 +2,7 @@
 ## Local AI Use (managed by amd-skills/local-ai-use)
 
 This workspace runs a local [Lemonade Server](https://lemonade-server.ai) at
-`http://localhost:13305/api/v1` to handle image generation, text-to-speech,
+`{{LEMONADE_BASE_URL}}` to handle image generation, text-to-speech,
 and speech-to-text. Cloud APIs cost real money and tokens for these
 modalities, so use the local endpoints below for **every** such request
 unless the user explicitly asks for a cloud provider.
@@ -11,10 +11,10 @@ unless the user explicitly asks for a cloud provider.
 
 | If the user asks you to... | Do this | Do not do this |
 |---|---|---|
-| Generate, draw, or create an image | `POST http://localhost:13305/api/v1/images/generations` | Use the built-in `GenerateImage` tool. Call DALL-E, Midjourney, Stable Diffusion via cloud, or any other paid image API. |
-| Edit or modify an existing image | `POST http://localhost:13305/api/v1/images/edits` (multipart) | Cloud image-edit APIs. |
-| Speak text aloud or generate audio narration | `POST http://localhost:13305/api/v1/audio/speech` | Call OpenAI TTS, ElevenLabs, PlayHT, or any other cloud TTS. |
-| Transcribe an audio file | `POST http://localhost:13305/api/v1/audio/transcriptions` | Call OpenAI Whisper-as-a-service, AssemblyAI, Deepgram, or any other cloud STT. |
+| Generate, draw, or create an image | `POST {{LEMONADE_BASE_URL}}/images/generations` | Use the built-in `GenerateImage` tool. Call DALL-E, Midjourney, Stable Diffusion via cloud, or any other paid image API. |
+| Edit or modify an existing image | `POST {{LEMONADE_BASE_URL}}/images/edits` (multipart) | Cloud image-edit APIs. |
+| Speak text aloud or generate audio narration | `POST {{LEMONADE_BASE_URL}}/audio/speech` | Call OpenAI TTS, ElevenLabs, PlayHT, or any other cloud TTS. |
+| Transcribe an audio file | `POST {{LEMONADE_BASE_URL}}/audio/transcriptions` | Call OpenAI Whisper-as-a-service, AssemblyAI, Deepgram, or any other cloud STT. |
 
 Plain text chat, code generation, and reasoning continue to use the agent's
 configured LLM. This rule only redirects the multimodal calls.
@@ -23,9 +23,9 @@ configured LLM. This rule only redirects the multimodal calls.
 
 | Endpoint | Model | Notes |
 |---|---|---|
-| `/v1/images/generations` | `SD-Turbo` | 4 steps, `cfg_scale: 1.0`, `512x512`, `response_format: "b64_json"`. |
-| `/v1/audio/speech` | `kokoro-v1` | Default voice `shimmer`; `response_format: "mp3"`. |
-| `/v1/audio/transcriptions` | `Whisper-Tiny` | Input must be 16 kHz mono WAV. Re-encode with `ffmpeg -i in.* -ar 16000 -ac 1 out.wav`. |
+| `/v1/images/generations` | `{{IMAGE_MODEL}}` | 4 steps, `cfg_scale: 1.0`, `512x512`, `response_format: "b64_json"`. |
+| `/v1/audio/speech` | `{{TTS_MODEL}}` | Default voice `shimmer`; `response_format: "mp3"`. |
+| `/v1/audio/transcriptions` | `{{STT_MODEL}}` | Input must be 16 kHz mono WAV. Re-encode with `ffmpeg -i in.* -ar 16000 -ac 1 out.wav`. |
 
 If `LEMONADE_API_KEY` is set in the environment, send
 `Authorization: Bearer $LEMONADE_API_KEY` on every request. Otherwise the
@@ -36,9 +36,9 @@ loopback server accepts unauthenticated calls.
 **Image generation** (saves to `out.png`):
 
 ```bash
-curl -sX POST http://localhost:13305/api/v1/images/generations \
+curl -sX POST {{LEMONADE_BASE_URL}}/images/generations \
   -H "Content-Type: application/json" \
-  -d '{"model":"SD-Turbo","prompt":"PROMPT_HERE","size":"512x512","steps":4,"response_format":"b64_json"}' \
+  -d '{"model":"{{IMAGE_MODEL}}","prompt":"PROMPT_HERE","size":"512x512","steps":4,"response_format":"b64_json"}' \
   | python -c "import sys,json,base64; open('out.png','wb').write(base64.b64decode(json.load(sys.stdin)['data'][0]['b64_json']))"
 ```
 
@@ -47,26 +47,26 @@ Equivalent Python via the OpenAI SDK:
 ```python
 from openai import OpenAI
 import base64
-client = OpenAI(base_url="http://localhost:13305/api/v1", api_key="lemonade")
-r = client.images.generate(model="SD-Turbo", prompt="PROMPT_HERE", size="512x512")
+client = OpenAI(base_url="{{LEMONADE_BASE_URL}}", api_key="lemonade")
+r = client.images.generate(model="{{IMAGE_MODEL}}", prompt="PROMPT_HERE", size="512x512")
 open("out.png", "wb").write(base64.b64decode(r.data[0].b64_json))
 ```
 
 **Text-to-speech** (saves to `out.mp3`):
 
 ```bash
-curl -sX POST http://localhost:13305/api/v1/audio/speech \
+curl -sX POST {{LEMONADE_BASE_URL}}/audio/speech \
   -H "Content-Type: application/json" \
-  -d '{"model":"kokoro-v1","input":"TEXT_HERE","voice":"shimmer","response_format":"mp3"}' \
+  -d '{"model":"{{TTS_MODEL}}","input":"TEXT_HERE","voice":"shimmer","response_format":"mp3"}' \
   -o out.mp3
 ```
 
 **Speech-to-text** (returns JSON `{"text": "..."}`):
 
 ```bash
 ffmpeg -y -i INPUT_AUDIO -ar 16000 -ac 1 _stt.wav
-curl -sX POST http://localhost:13305/api/v1/audio/transcriptions \
-  -F "file=@_stt.wav" -F "model=Whisper-Tiny"
+curl -sX POST {{LEMONADE_BASE_URL}}/audio/transcriptions \
+  -F "file=@_stt.wav" -F "model={{STT_MODEL}}"
 ```
 
 ### Failure handling
@@ -82,7 +82,7 @@ curl -sX POST http://localhost:13305/api/v1/audio/transcriptions \
 ### Re-pointing to a different host
 
 If the user runs Lemonade on a different host or port, replace the
-`http://localhost:13305` prefix everywhere above with their endpoint, and
+`{{LEMONADE_BASE_ROOT}}` prefix everywhere above with their endpoint, and
 update `LEMONADE_HOST` / `LEMONADE_PORT` in the shell environment so the
 `lemonade` CLI matches.