diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index f00ba1e..fd141aa 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -24,6 +24,11 @@
       "source": "./skills/magpie-kernel-evaluator",
       "description": "Performs GPU kernel correctness and performance evaluation and LLM inference benchmarking with Magpie. Analyzes single or multiple kernels (HIP/CUDA/PyTorch), compares kernel implementations, runs vLLM/SGLang benchmarks with profiling and TraceLens, and runs gap analysis on torch traces."
     },
+    {
+      "name": "serving-llms-on-epyc",
+      "source": "./skills/serving-llms-on-epyc",
+      "description": "Serve LLMs on AMD EPYC CPUs with vLLM + zentorch, in a container (Docker/Podman) or conda. Handles CPU detection, runtime/env validation, vLLM model-support and RAM-fit checks, hardware-sized threads/KV, launch, and health verification. Single instance; reports and stops on failure."
+    },
     {
       "name": "serving-llms-on-instinct",
       "source": "./skills/serving-llms-on-instinct",
diff --git a/.cursor-plugin/marketplace.json b/.cursor-plugin/marketplace.json
index f00ba1e..fd141aa 100644
--- a/.cursor-plugin/marketplace.json
+++ b/.cursor-plugin/marketplace.json
@@ -24,6 +24,11 @@
       "source": "./skills/magpie-kernel-evaluator",
       "description": "Performs GPU kernel correctness and performance evaluation and LLM inference benchmarking with Magpie. Analyzes single or multiple kernels (HIP/CUDA/PyTorch), compares kernel implementations, runs vLLM/SGLang benchmarks with profiling and TraceLens, and runs gap analysis on torch traces."
     },
+    {
+      "name": "serving-llms-on-epyc",
+      "source": "./skills/serving-llms-on-epyc",
+      "description": "Serve LLMs on AMD EPYC CPUs with vLLM + zentorch, in a container (Docker/Podman) or conda. Handles CPU detection, runtime/env validation, vLLM model-support and RAM-fit checks, hardware-sized threads/KV, launch, and health verification. Single instance; reports and stops on failure."
+    },
     {
       "name": "serving-llms-on-instinct",
       "source": "./skills/serving-llms-on-instinct",
diff --git a/eval/behavioral/tests/test_serving_llms_on_epyc.py b/eval/behavioral/tests/test_serving_llms_on_epyc.py
new file mode 100644
index 0000000..55cc1ee
--- /dev/null
+++ b/eval/behavioral/tests/test_serving_llms_on_epyc.py
@@ -0,0 +1,42 @@
+"""Behavioral tests for the `serving-llms-on-epyc` skill.
+
+Run locally (needs the `claude` CLI authenticated; the agent does not actually
+launch a server in the judge's sandbox, so this grades the *plan/behavior*, not
+a live endpoint):
+
+    pytest eval/behavioral/tests/test_serving_llms_on_epyc.py -s
+
+`logs_contains` is deterministic; `should` / `should_not` are graded by an LLM
+judge over the captured evidence (tool calls + outputs), so the agent's prose
+cannot fake a pass.
+"""
+
+from harness import claude
+
+
+def test_serve_model_on_epyc():
+    with claude("sonnet", skill="serving-llms-on-epyc") as agent:
+        run = agent.prompt(
+            "Serve Qwen/Qwen3-0.6B on this AMD EPYC box with vLLM and zentorch. "
+            "Use the default settings."
+        )
+
+        # Programmatic expectation: the skill was actually loaded.
+        run.logs_contains("serving-llms-on-epyc")
+
+        # Positive behavioral expectations (the state machine).
+        run.should("Detect the CPU and confirm it is an AMD EPYC host before serving (e.g. runs detect.py)")
+        run.should("Validate the container runtime (docker or podman) or the conda path before launching (e.g. runs validate.py)")
+        run.should("Use validate.py's result to choose how to serve (the runtime/path it reports) and act on any environment advisories it raises -- e.g. the tcmalloc/OpenMP LD_PRELOAD perf-library note or the in-image vllm+zentorch check; on the container path with the image not yet pulled there may be none, which is fine")
+        run.should("Check that vLLM supports the model before serving (e.g. runs check_model.py), rather than refusing it just for being multimodal")
+        run.should("Check that the model fits in host RAM (e.g. runs estimate_memory.py)")
+        run.should("Size CPU threads / KV-cache from the hardware rather than using a fixed guess (e.g. runs cpu_tune.py)")
+        run.should("Pin the instance to a single socket with its memory (socket-local KV plus cpuset-mems or numactl membind) and, on a dual-socket host, pick a socket by load -- surfacing cpu_tune's warning if both sockets are busy")
+        run.should("Present a sized plan and ask the user to confirm before launching the server")
+        run.should("Plan to launch with 'vllm serve' and poll until /health is healthy")
+
+        # Negative behavioral expectations (the explicit Don'ts).
+        run.should_not("Pass '--device cpu' to vllm serve")
+        run.should_not("Launch the server before the user has confirmed the plan")
+        run.should_not("Enter a debugging loop or retry after a launch failure")
+        run.should_not("Attempt GPU, ROCm, or Instinct serving")
diff --git a/skills/serving-llms-on-epyc/SKILL.md b/skills/serving-llms-on-epyc/SKILL.md
new file mode 100644
index 0000000..14b97e3
--- /dev/null
+++ b/skills/serving-llms-on-epyc/SKILL.md
@@ -0,0 +1,253 @@
+---
+name: serving-llms-on-epyc
+description: >-
+  Serves a language model on an AMD EPYC CPU host using vLLM with the zentorch
+  backend, in a container (Docker or Podman) or a conda env. Use whenever the
+  user wants to run, serve, deploy, start, host, or launch an LLM on AMD EPYC,
+  Zen CPU, "vLLM on CPU", "zentorch serving", or "serve a model without a GPU".
+  Use for "serve Qwen on EPYC", "start a CPU vLLM endpoint", "run an OpenAI
+  server on my EPYC box", or similar. Handles the full single-instance flow:
+  detect the CPU (incl. EPYC generation), validate the runtime/env, check vLLM
+  supports the model (via vLLM's registry, not a modality blocklist), check it
+  fits host RAM, size CPU threads/KV/NUMA from the hardware, confirm the plan with
+  the user, launch, and poll until the endpoint is responsive. Single instance,
+  single socket (pinned to one socket + its memory; vLLM scales poorly across
+  sockets). Does NOT debug failures and does NOT retry -- it reports and stops. Do
+  not use for GPU/Instinct (use serving-llms-on-instinct) or multi-node.
+allowed-tools: Bash, Read
+---
+
+# Serving LLMs on AMD EPYC (vLLM + zentorch, CPU)
+
+Bring up a single vLLM OpenAI endpoint on an AMD EPYC host with the zentorch CPU
+backend, sized to the hardware. Container-first (Docker or Podman); conda/host
+is the fallback.
+
+**This is single-socket serving:** one instance pinned to one socket and its memory
+(vLLM scales poorly across sockets, so we do not span them). On a dual-socket host it
+runs on a single socket; the multi-socket answer is **multiple instances (one per
+socket)**, which is out of scope for this single-instance recipe.
+
+Hard rule for this skill: **on any failure, report the cause + logs and STOP.
+Do not retry, do not debug.** (Debugging is a separate workflow.)
+
+**The agent does the serve flow itself** -- pull, configure, launch, poll --
+using the runtime `validate.py` reports. Never hand the user per-serve commands.
+Like serving-llms-on-instinct, an accessible container runtime is a one-time
+**prerequisite**: if `validate.py` finds none, report its one-time fix (make
+docker accessible / install podman / provide a conda env) and stop. Do not
+attempt `sudo` or privilege escalation.
+
+## Data file
+
+Read `data/epyc.json` directly. It holds the container image, mandatory CPU run
+flags, supported precision, the model-support policy, the default model, and the
+verified throughput-flag gotcha. Do not hardcode the image tag from memory -- read it.
+
+## Step 1: Detect the CPU
+
+```bash
+python3 scripts/detect.py            # add --host user@box for a remote host
+```
+
+Returns `cpu_model`, `is_amd_epyc`, `epyc_generation`
+(Naples/Rome/Milan/Genoa/Bergamo/Siena/Turin), `zen_arch`, `avx512`,
+`logical_cores`, `physical_cores`, `sockets`, `numa_nodes`, `memory_gb`. If
+`is_amd_epyc` is `false`, stop: this skill targets AMD EPYC. (Other x86 may work
+but is unsupported here.) Carry `epyc_generation` / `avx512` through the later
+phases -- e.g. AVX-512 + bf16 land on Zen4+ (Genoa/Bergamo/Siena/Turin), and Turin packs up
+to 128 Zen 5 (or 192 Zen 5c) cores/socket, which the thread-binding in Step 5 sizes from.
+
+## Step 2: Validate the runtime and environment
+
+```bash
+python3 scripts/validate.py --image <image from data/epyc.json>
+```
+
+Returns `ready`, `runtime` (`docker`, `podman`, or null), `runtime_detail`,
+`conda_path_available`, `ram_gb`, and `errors/warnings/advisories`. Pick the path:
+- `runtime` is `docker` or `podman` -> container path (Step 6); use that string verbatim as the command prefix.
+- `runtime` null but `conda_path_available: true` -> conda/host path.
+- `runtime` null and no conda -> `ready` is false. Report the one-time
+  onboarding `fix` (make docker accessible / install podman / conda env) and stop.
+
+Do not proceed if `ready` is `false`.
+
+## Step 3: Resolve and validate the model
+
+If the user named no model, use `default_model` from `data/epyc.json`
+(`Qwen/Qwen3-0.6B` -- ungated, tiny, fast first success). Otherwise use theirs.
+
+Check that vLLM actually supports the model (do **not** blanket-block multimodal):
+
+```bash
+python3 scripts/check_model.py --model-id <model> --vllm-version <vllm_version from data/epyc.json>
+```
+
+- Exit 0 = vLLM serves it as a generation endpoint (`kind` `text` or `multimodal`),
+  or support is undeterminable (gated/offline) -- proceed; launch confirms.
+- Exit 1 = positively unsupported: the architecture is not in vLLM's registry, or
+  it is a `pooling`/embedding/reranker (not a chat/completion endpoint). Report the
+  printed `message` and stop.
+- A `multimodal` model is allowed; a vLLM-supported multimodal arch may still hit a
+  GPU-only kernel on CPU, which surfaces at load (the no-retry rule then applies).
+
+**Precision/dtype**: native CPU dtypes are `bf16` (default), `fp16`, `fp32`. Use
+`bfloat16` unless the user asks otherwise.
+
+For gated models (Llama, Gemma) `HF_TOKEN` must be set and the license accepted on
+HuggingFace; if not, stop and say so.
+
+## Step 4: Check it fits host RAM
+
+RAM is the ceiling on CPU (weights + KV cache both live in RAM). Run on ONE line:
+
+```bash
+python3 scripts/estimate_memory.py --model-id <model> --ram-gb <memory_gb from detect> --max-model-len <4096 or user value> --num-prompts <1 or desired concurrency>
+```
+
+Exit 0 = fits, exit 1 = does not fit. If `fit.fits` is false: **do not launch.**
+Tell the user `required_gb` vs `ram_gb` and the printed `fit.action` -- reduce
+`--max-model-len` to `fit.suggested_max_model_len` and re-run this estimate (this is not
+a launch retry), or use a smaller model. `--max-model-len` and `--num-prompts` are the two knobs that move KV.
+Extra flag: `--weight-gb N` overrides weights if a model has no HF metadata
+(rare). KV cache is bf16-only on zentorch CPU (no fp8 KV).
+
+## Step 5: Size the CPU runtime from the hardware
+
+```bash
+eval "$(python3 scripts/cpu_tune.py)"      # or --format json to inspect
+```
+
+A single instance runs on **one socket, with its memory** (vLLM scales poorly across
+sockets). `cpu_tune.py` exports `VLLM_CPU_OMP_THREADS_BIND` (the chosen socket's
+physical cores) and `VLLM_CPU_KVCACHE_SPACE` (sized from that **socket's local RAM**,
+not whole-system, so the KV pool stays on-socket). It does **not** set
+`OMP_NUM_THREADS` (vLLM derives it) or `VLLM_CPU_NUM_OF_RESERVED_CPU` (vLLM's own default).
+
+Socket choice on a dual-socket host (load-aware): it samples per-socket CPU busy%
+(~0.5s) and prefers a free socket -- both free → socket 0; one free → that socket;
+**both busy (≥ `--busy-threshold`, default 15%) → it `warning`s and proceeds on the
+least-busy socket**. `--socket N` forces a choice. Single-socket hosts use socket 0.
+
+For the chosen socket it also emits the memory-bound pin: `container_cpuset`
+(`--cpuset-cpus=<cores> --cpuset-mems=<nodes>`) for the container path, and
+`conda_launch_prefix` (`numactl --cpunodebind/--membind`, falling back to `taskset`
+CPU-only, or empty-with-note if neither tool exists) for conda. **Surface `warning`
+to the user** if set. On NPS2/NPS4 a socket spans multiple NUMA nodes; memory is
+bound across them and `nps_note` flags that finer binding could add performance.
+
+## Step 6: Confirm the plan, then launch (container-first)
+
+Before launching, present this summary and **wait for the user to confirm** -- do
+not launch unprompted. This is the human gate before anything runs:
+
+| Field | Value |
+|---|---|
+| Model / kind | `<model>` -- `text` or `multimodal` (from `check_model.py`) |
+| Path | container (`<runtime>`, image from `data/epyc.json`) or conda/host |
+| Precision | `bfloat16` (or the user's choice) |
+| Fit | required `<required_gb>` GB vs `<ram_gb>` GB RAM |
+| CPU sizing | socket `<chosen_socket>` (`<socket_choice_reason>`), bind `<VLLM_CPU_OMP_THREADS_BIND>`, KV `<VLLM_CPU_KVCACHE_SPACE>` GB (socket-local), mem bound to nodes `<numa_nodes_on_socket>` |
+| Hardware | EPYC `<epyc_generation>` (`<zen_arch>`), `<physical_cores>` cores, AVX-512 `<avx512>` |
+| Port | `<port>` |
+
+If `cpu_tune.py` returned a `warning` (e.g. all sockets busy), include it here so the user sees it before confirming.
+
+Proceed only on a clear "go". If the user declines or wants changes (model,
+`--max-model-len`, port), stop and adjust -- do not launch.
+
+Build the launch from `data/epyc.json`. The CLI is `vllm serve <model>`.
+**Do not pass `--device cpu`** on vLLM >= 0.20 -- the zentorch plugin
+auto-selects the CPU platform and `vllm serve` rejects the flag. Only add it if
+`vllm serve --help` lists it (older vLLM).
+
+**Container path** (`runtime` from validate.py). The agent runs these itself,
+including the pull. `RT` is the resolved runtime verbatim:
+```bash
+RT="<runtime from validate.py: docker | podman>"
+$RT pull <image from data/epyc.json>          # agent pulls; do not ask the user to
+$RT run -d --name vllm-epyc \
+  <run_flags from data/epyc.json>            # --ipc=host --shm-size=16g --network=host
+  <hf_cache_mount> \
+  <container_cpuset from cpu_tune>             # --cpuset-cpus=<cores> --cpuset-mems=<nodes>
+  --env VLLM_CPU_OMP_THREADS_BIND="$VLLM_CPU_OMP_THREADS_BIND" \
+  --env VLLM_CPU_KVCACHE_SPACE=$VLLM_CPU_KVCACHE_SPACE \
+  --env HF_TOKEN=${HF_TOKEN} \
+  <image from data/epyc.json> \
+  vllm serve <model> --dtype bfloat16 --port <port> --max-model-len <len>
+```
+
+**Conda/host path** (no container runtime, `conda_path_available` true). `eval`-ing
+cpu_tune already exported the env vars; prefix the launch with `conda_launch_prefix`
+from cpu_tune so memory is bound to the chosen socket (empty → unpinned, with a note):
+```bash
+<conda_launch_prefix from cpu_tune> vllm serve <model> --dtype bfloat16 --port <port> --max-model-len <len> &
+# e.g. numactl --cpunodebind=0 --membind=0 vllm serve ...
+```
+
+Optional throughput flags are **opt-in and must move together** (see Gotchas):
+`TORCHINDUCTOR_FREEZING=1` + `VLLM_USE_AOT_COMPILE=0` (+ `ZENTORCH_WEIGHT_PREPACK=1`).
+The base launch sets none of them.
+
+## Step 7: Poll until up and responsive
+
+A 503 while loading is normal. Poll until the server answers, then prove the
+chat endpoint works. CPU first-token compile can take a minute or two.
+
+```bash
+# container alive (or process alive for conda) + /health
+for i in $(seq 1 120); do
+  # container path:
+  $RT inspect -f '{{.State.Running}}' vllm-epyc 2>/dev/null | grep -q true || { echo "FAILED: container exited"; $RT logs --tail 50 vllm-epyc; break; }
+  curl -sf http://localhost:<port>/health >/dev/null 2>&1 && { echo "HEALTHY"; break; }
+  sleep 3
+done
+```
+
+Then validate the OpenAI endpoint is actually accessible:
+```bash
+curl -sf http://localhost:<port>/v1/chat/completions -H 'Content-Type: application/json' \
+  -d '{"model":"<model>","messages":[{"role":"user","content":"hi"}],"max_tokens":8}'
+```
+
+Resource sanity check (container path): `$RT stats --no-stream vllm-epyc`.
+
+**If the server never becomes healthy or the endpoint does not respond: print
+the container/process logs, state the failure, and STOP. Do not retry. Do not
+start a debugging loop.**
+
+## Step 8: On success, hand over the endpoint
+
+Print a connection table (model, runtime, port, OMP threads, KV GB, max-model-len,
+NUMA pinning) and a ready-to-run example:
+```bash
+curl -s http://localhost:<port>/v1/chat/completions -H 'Content-Type: application/json' \
+  -d '{"model":"<model>","messages":[{"role":"user","content":"Hello"}]}'
+```
+To stop: `$RT rm -f vllm-epyc` (container) or `kill <pid>` (conda).
+
+## Offline (single-instance batch)
+
+For a one-shot offline run instead of a server, replace Steps 6-8 with a single
+`vllm bench throughput` (or an offline `LLM.generate`) using the same sized env,
+wait for completion, and report the metrics. Same no-retry / no-debug rule.
+
+## Gotchas
+
+See [reference.md](reference.md) for the full list. The load-bearing ones:
+
+- **`--device cpu` was removed** from `vllm serve` in vLLM >= 0.20. The zentorch
+  plugin auto-selects CPU. Passing it makes `vllm serve` error with
+  "unrecognized arguments: --device cpu".
+- **`TORCHINDUCTOR_FREEZING=1` alone crashes engine-core init** on vLLM 0.22-0.23 /
+  zentorch 2.11 (`AssertionError: expected OutputCode, got function`). It only
+  works with `VLLM_USE_AOT_COMPILE=0` set alongside it. Never set one without
+  the other.
+- **`--shm-size`**: vLLM needs a large `/dev/shm`; the container default (64MB)
+  is too small. Use `--shm-size=16g` (in `data/epyc.json`).
+- **NUMA / socket**: one instance is pinned to **one socket plus its memory** --
+  CPU bind + `--cpuset-mems` (container) / `numactl --membind` (conda), with KV sized
+  from that socket's local RAM. On a dual-socket host `cpu_tune.py` picks a free socket
+  by load and `warning`s if both are busy. NPS2/NPS4 (multi-node socket) gets an
+  `nps_note` that finer per-node binding could add more performance.
diff --git a/skills/serving-llms-on-epyc/data/epyc.json b/skills/serving-llms-on-epyc/data/epyc.json
new file mode 100644
index 0000000..deb67f4
--- /dev/null
+++ b/skills/serving-llms-on-epyc/data/epyc.json
@@ -0,0 +1,53 @@
+{
+  "vllm_version": "0.22.0",
+  "container": {
+    "image": "amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23",
+    "runtimes": ["docker", "podman"],
+    "comment": "Public vLLM + zentorch CPU image on Docker Hub (amdih/zendnn_zentorch) -- no internal-registry access needed. Tags are vllm_v<ver>_zentorch_v<ver>_<os>_<build>; prefer the newest ubuntu22.04 stable. Both docker and podman are supported; the skill prefers docker and falls back to podman.",
+    "run_flags": [
+      "--ipc=host",
+      "--shm-size=16g",
+      "--network=host"
+    ],
+    "hf_cache_mount": "-v ~/.cache/huggingface:/root/.cache/huggingface",
+    "flag_notes": {
+      "--ipc=host": "vLLM workers use host IPC/shared memory.",
+      "--shm-size=16g": "vLLM needs a large /dev/shm; default 64MB is not enough.",
+      "--network=host": "Expose the served port directly. Alternative: -p <port>:<port>.",
+      "numa": "A single instance is pinned to ONE socket plus its memory. cpu_tune.py picks a free socket by CPU load on dual-socket hosts (warns if both busy; --socket N forces), sizes KV from that socket's local RAM, and emits --cpuset-cpus + --cpuset-mems (container) or numactl --cpunodebind/--membind (conda). True multi-socket scaling = multiple instances (one per socket), out of scope here."
+    }
+  },
+  "launch": {
+    "cli": "vllm serve",
+    "device_flag_note": "Do NOT pass --device cpu on vLLM >= 0.20; the zentorch plugin auto-selects the CPU platform and `vllm serve` rejects --device. Only pass it if `vllm serve --help` advertises it (older vLLM)."
+  },
+  "precision": {
+    "native": ["bf16", "fp16", "fp32"],
+    "default": "bfloat16",
+    "notes": "bf16 is the throughput default on EPYC (Zen). fp32 is slower and for debugging only. WOQ (per-channel/per-group int) is supported by zentorch but out of scope for the base recipe."
+  },
+  "model_support": {
+    "check_script": "scripts/check_model.py",
+    "policy": "Do NOT blanket-block multimodal. check_model.py reads the model's HF architectures and checks them against vLLM's model registry for the pinned vllm_version. Text and multimodal generation endpoints are allowed; pooling/embedding/reranker and non-LLM architectures are rejected (not chat/completion endpoints).",
+    "cpu_note": "A vLLM-supported multimodal arch may still hit a GPU-only kernel on CPU; that surfaces at load, where the no-retry rule applies."
+  },
+  "default_model": "Qwen/Qwen3-0.6B",
+  "default_model_notes": "Ungated (Apache-2.0), tiny, fast first success on CPU. For a real workload pick a larger Qwen3 / Llama once the flow is verified.",
+  "smoke_model": "Qwen/Qwen3-0.6B",
+  "smoke_model_notes": "Current small Qwen, chat-capable (ships a chat template, so /v1/chat/completions works -- unlike base models such as opt-125m).",
+  "env_defaults": {
+    "VLLM_CPU_OMP_THREADS_BIND": "set by cpu_tune.py (physical cores of the chosen socket)",
+    "VLLM_CPU_KVCACHE_SPACE": "set by cpu_tune.py (GB)",
+    "do_not_set": "OMP_NUM_THREADS -- vLLM sets it from the bind list (len of cpu_list); and VLLM_CPU_NUM_OF_RESERVED_CPU -- vLLM has its own default when unset, forcing 0 overrides it."
+  },
+  "throughput_flags_optional": {
+    "TORCHINDUCTOR_FREEZING": "1",
+    "VLLM_USE_AOT_COMPILE": "0",
+    "ZENTORCH_WEIGHT_PREPACK": "1",
+    "gotcha": "VERIFIED on vLLM 0.22.0 / zentorch 2.11.0.1: TORCHINDUCTOR_FREEZING=1 ALONE crashes engine-core init with 'AssertionError: expected OutputCode, got function'. It only works when VLLM_USE_AOT_COMPILE=0 is set alongside it. Never set FREEZING=1 without AOT_COMPILE=0. The base recipe leaves all three unset."
+  },
+  "ram": {
+    "os_headroom_gb": 16,
+    "comment": "Reserve ~16 GB for OS + framework beyond model weights + KV cache when checking fit."
+  }
+}
diff --git a/skills/serving-llms-on-epyc/reference.md b/skills/serving-llms-on-epyc/reference.md
new file mode 100644
index 0000000..4a12ee1
--- /dev/null
+++ b/skills/serving-llms-on-epyc/reference.md
@@ -0,0 +1,128 @@
+# serving-llms-on-epyc -- Reference
+
+## Table of Contents
+1. [Runtime selection](#runtime-selection)
+2. [Container run flags (CPU)](#container-run-flags-cpu)
+3. [Precision and modality](#precision-and-modality)
+4. [CPU sizing](#cpu-sizing)
+5. [Known quirks](#known-quirks)
+
+---
+
+## Runtime selection
+
+`scripts/validate.py` resolves a runtime the **agent can drive
+non-interactively** and reports it as `runtime` (the exact command prefix the
+agent uses for `pull`/`run`/`stats`/`logs`). Preference order maximizes
+agent-drivability with no human in the loop:
+
+1. **docker** (direct) -- if `docker ps` exits 0 (user in the `docker` group /
+   daemon reachable). No sudo. Best.
+2. **podman** (rootless) -- no daemon, no sudo. Note: rootless podman needs a
+   storage backend that supports its overlay; some networked/`/proj`
+   filesystems reject the overlay `pivot_root` (the run fails even though
+   `podman info` succeeds). On those hosts use docker or the conda path.
+3. **sudo docker** -- only if `sudo -n docker ps` works (passwordless sudo). The
+   agent can still drive it unattended; `runtime` comes back as `"sudo docker"`.
+4. **conda/host** -- requires `import vllm, zentorch` in the active env.
+
+If docker is installed but **none** of the above is agent-drivable (no docker
+group, no passwordless sudo), `validate.py` returns `runtime: null`,
+`runtime_agent_drivable: false`, and a **one-time** setup `fix`:
+`sudo usermod -aG docker $USER && newgrp docker` (or a NOPASSWD sudoers entry).
+This is one-time onboarding, not a per-serve command. After it, every serve is
+fully agent-driven. The skill must not degrade into asking the user to paste
+docker commands for each serve.
+
+## Container run flags (CPU)
+
+From `data/epyc.json`. Unlike the Instinct (GPU) skill there are **no**
+`/dev/kfd`, `/dev/dri`, `--group-add`, or ROCm flags -- this is pure CPU.
+
+| Flag | Why |
+|---|---|
+| `--ipc=host` | vLLM workers use host IPC / shared memory |
+| `--shm-size=16g` | vLLM needs a large `/dev/shm`; the 64MB default is too small |
+| `--network=host` | expose the served port directly (or use `-p <port>:<port>`) |
+| `--cpuset-cpus` / `--cpuset-mems` | pin the container to the chosen socket's physical cores and its NUMA node(s); from `cpu_tune.py` |
+| `-v ~/.cache/huggingface:/root/.cache/huggingface` | reuse the host model cache |
+
+Image: `amdih/zendnn_zentorch:<tag>` -- the public vLLM + zentorch CPU image on
+Docker Hub (no internal-registry access needed). The exact tag lives in
+`data/epyc.json`; read it, never hardcode it.
+
+## Precision and modality
+
+| Dtype | EPYC (Zen) | Notes |
+|---|---|---|
+| BF16 | Native (default) | throughput default |
+| FP16 | Native | |
+| FP32 | Native | slower; debugging only |
+| WOQ int8/int4 | Supported by zentorch | per-channel / per-group; out of scope for the base recipe |
+
+Modality: not gated by a static blocklist. `scripts/check_model.py` checks the
+model's architecture against vLLM's model registry (pinned to `vllm_version`):
+text **and** multimodal generation endpoints are allowed; pooling/embedding/
+reranker and non-LLM architectures are rejected (not chat/completion endpoints).
+A vLLM-supported multimodal arch may still hit a GPU-only kernel on CPU -- that
+surfaces at load, where the no-retry rule applies.
+
+## CPU sizing
+
+Policy: a single instance is pinned to **one socket plus its memory** (vLLM scales
+poorly across sockets). `scripts/cpu_tune.py` derives:
+- **Socket choice** (dual-socket): samples per-socket CPU busy% (~0.5s) and prefers a
+  free socket -- both free → socket 0; one free → that one; both at/above
+  `--busy-threshold` (default 15%) → `warning` and proceed on the least-busy. `--socket N`
+  forces it. Single-socket → socket 0.
+- `VLLM_CPU_OMP_THREADS_BIND` = the chosen socket's physical cores (SMT dropped). vLLM
+  sets `OMP_NUM_THREADS` from this, so we don't.
+- `VLLM_CPU_KVCACHE_SPACE` (GB) = `min(socket_ram*kv_frac, socket_ram-16)` -- sized from
+  the **chosen socket's local RAM** so the KV pool stays on-socket (≤32GB → `*0.5`).
+- Memory-bound pin: `container_cpuset` = `--cpuset-cpus=<cores> --cpuset-mems=<nodes>`;
+  `conda_launch_prefix` = `numactl --cpunodebind=<nodes> --membind=<nodes>` (falls back to
+  `taskset` CPU-only, or empty-with-note if neither tool exists).
+
+Not set: `OMP_NUM_THREADS` (vLLM derives it from the bind) and
+`VLLM_CPU_NUM_OF_RESERVED_CPU` (vLLM has its own default when unset).
+
+When the chosen socket spans multiple NUMA nodes (NPS2/NPS4), `cpu_tune.py` emits an
+`nps_note`: memory is bound across the socket's nodes, and finer per-node binding
+(one instance per node) could add more performance. That tuning is out of
+scope for the base recipe.
+
+## Known quirks
+
+**`--device cpu` removed (vLLM >= 0.20)**
+`vllm serve` no longer accepts `--device cpu`; the zentorch plugin auto-selects
+the CPU platform. Passing it -> `vllm: error: unrecognized arguments: --device cpu`.
+Only pass it if `vllm serve --help` advertises it (older vLLM).
+
+**`TORCHINDUCTOR_FREEZING=1` + `VLLM_USE_AOT_COMPILE` (VERIFIED)**
+On vLLM 0.23.0 / zentorch 2.11.0.2 (EPYC 9454, facebook/opt-125m, 2026-06-23):
+`TORCHINDUCTOR_FREEZING=1` alone crashes engine-core init with
+`AssertionError: expected OutputCode, got function` (inductor codecache). Adding
+`VLLM_USE_AOT_COMPILE=0` fixes it (healthy in ~99s). The only changed variable
+between the failing and passing runs was `VLLM_USE_AOT_COMPILE`. Never set
+`FREEZING=1` without `VLLM_USE_AOT_COMPILE=0`. The base recipe leaves all three throughput flags (including `ZENTORCH_WEIGHT_PREPACK`) unset.
+
+**`/dev/shm` too small**
+Without `--shm-size=16g` (or `--ipc=host`), vLLM workers fail to allocate shared
+memory at startup.
+
+**RAM is the ceiling, not VRAM**
+CPU serving keeps weights + KV cache in system RAM. `estimate_memory.py` checks
+`weights + KV(max_model_len x num_prompts) + reserve <= RAM` (reserve default
+16 GB, `--reserve-gb`). It exits 1 when it does not fit and prints
+`suggested_max_model_len` + an `action` to reduce and retry. Weights come from
+HF file sizes (`.safetensors` or legacy `.bin`); `--weight-gb` overrides when a
+model has no metadata. KV cache is bf16-only on zentorch CPU (no fp8 KV), so the estimate always uses 2 bytes/element.
+
+**NUMA cross-node traffic**
+On a 2-socket EPYC, an unpinned instance spreads threads + memory across both sockets
+and pays cross-socket latency. `cpu_tune.py` keeps one instance on **one socket plus
+its memory**: CPU bind (`VLLM_CPU_OMP_THREADS_BIND` + `--cpuset-cpus`), memory bind
+(`--cpuset-mems` / `numactl --membind`), and KV sized from that socket's local RAM so
+the KV pool never lands on the other socket. The socket is chosen by load (free socket
+preferred; warns if both busy). True multi-socket throughput = **multiple instances**
+(one per socket) -- out of scope for this single-instance recipe.
diff --git a/skills/serving-llms-on-epyc/scripts/check_model.py b/skills/serving-llms-on-epyc/scripts/check_model.py
new file mode 100644
index 0000000..534bfea
--- /dev/null
+++ b/skills/serving-llms-on-epyc/scripts/check_model.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+"""
+Does vLLM support this model's architecture? -- so the skill checks real vLLM
+support instead of blanket-blocking multimodal.
+
+Reads the model's `architectures` from its HF config.json, then checks them
+against vLLM's model registry for the pinned vLLM version. The registry comes
+from the version-pinned registry.py on GitHub (no vLLM install needed); if that
+is unreachable it falls back to an importable local `vllm`. Generation endpoints
+(text + multimodal) are supported; pooling/embedding/reranker and non-LLM
+architectures are not chat/completion endpoints and are rejected.
+
+    check_model.py --model-id Qwen/Qwen3-0.6B
+    check_model.py --model-id <id> --vllm-version 0.22.0
+
+Exit 0 if vLLM serves it as a generation endpoint (or support is undeterminable
+-- launch confirms), 1 if it is positively unsupported. JSON to stdout.
+Env: HF_TOKEN for gated models.
+"""
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+import urllib.request
+import urllib.error
+
+HF = "https://huggingface.co"
+GH_RAW = "https://raw.githubusercontent.com/vllm-project/vllm"
+REG_PATH = "vllm/model_executor/models/registry.py"
+
+# registry.py dict name -> kind we care about
+_SECTIONS = {
+    "_TEXT_GENERATION_MODELS": "text",
+    "_TRANSFORMERS_BACKEND_MODELS": "text",
+    "_MULTIMODAL_MODELS": "multimodal",
+    "_EMBEDDING_MODELS": "pooling",
+    "_POOLING_MODELS": "pooling",
+    "_CROSS_ENCODER_MODELS": "pooling",
+}
+
+
+def _get(url, token=None):
+    """Fetch `url` and return its body decoded as UTF-8.
+
+    A non-empty `token` is sent as a Bearer Authorization header. Never raises:
+    the result is (text, None) on success or (None, error_message) on failure.
+    """
+    request = urllib.request.Request(url, headers={"User-Agent": "check-model/1"})
+    if token:
+        request.add_header("Authorization", f"Bearer {token}")
+    # Friendlier wording for the HTTP statuses a model lookup commonly returns.
+    known_http_errors = {
+        401: "not found or gated (set HF_TOKEN)",
+        403: "access denied -- accept the model license on HuggingFace",
+        404: "not found",
+    }
+    try:
+        with urllib.request.urlopen(request, timeout=30) as response:
+            body = response.read()
+        return body.decode("utf-8"), None
+    except urllib.error.HTTPError as exc:
+        return None, known_http_errors.get(exc.code, f"HTTP {exc.code}")
+    except Exception as exc:
+        # Anything else (DNS, timeout, bad encoding, ...) is reported as text.
+        return None, str(exc)
+
+
+def model_architectures(model, rev, token):
+    """Read the `architectures` list from the model's config.json on HuggingFace.
+
+    Returns (architectures, None) -- an empty list when the key is missing or
+    empty -- or (None, error_message) if the download or the JSON parse fails.
+    """
+    config_url = f"{HF}/{model}/resolve/{rev}/config.json"
+    body, fetch_error = _get(config_url, token)
+    if body is None:
+        return None, fetch_error
+    try:
+        config = json.loads(body)
+    except ValueError:
+        return None, "config.json is not valid JSON"
+    declared = config.get("architectures")
+    return (declared if declared else []), None
+
+
+def registry_from_github(version):
+    """Scrape the architecture names out of vLLM's registry.py at tag v<version>.
+
+    The file is scanned line by line: a `_..._MODELS = {` header selects the kind
+    (via _SECTIONS; dicts not listed there are ignored), a line starting with `}`
+    closes the dict, and every `"Name":` key in between is recorded.
+
+    Returns ({arch: kind}, "github:v<version>") on success, or (None, error) when
+    the download fails or no architecture could be parsed.
+    """
+    text, fetch_error = _get(f"{GH_RAW}/v{version}/{REG_PATH}")
+    if text is None:
+        return None, fetch_error
+
+    section_re = re.compile(r"^(_[A-Z0-9_]+_MODELS)\s*(?::[^=]+)?=\s*\{")
+    entry_re = re.compile(r'^"([A-Za-z0-9_]+)"\s*:')
+    archs = {}
+    kind = None  # kind of the dict being scanned; None means "skip these lines"
+    for raw in text.splitlines():
+        stripped = raw.strip()
+        header = section_re.match(stripped)
+        if header is not None:
+            kind = _SECTIONS.get(header.group(1))
+        elif stripped.startswith("}"):
+            kind = None
+        elif kind:
+            entry = entry_re.match(stripped)
+            if entry is not None:
+                archs[entry.group(1)] = kind
+
+    if not archs:
+        return None, "registry.py had no parseable archs"
+    return archs, f"github:v{version}"
+
+
+def registry_from_local():
+    """Coarse fallback: ask an importable local `vllm` for its registry.
+
+    A short snippet is run in a child interpreter so that a missing or broken
+    vllm cannot take this script down. Only text vs multimodal is distinguished.
+
+    Returns ({arch: kind}, "vllm-import"), or (None, None) when no interpreter
+    can import vllm, the child cannot be started or times out, or its output is
+    not the expected JSON. Never raises.
+    """
+    snippet = (
+        "import json;"
+        "from vllm import ModelRegistry as R;"
+        "a=list(R.get_supported_archs());"
+        "mm=set(x for x in a if R.is_multimodal_model([x]));"
+        "print(json.dumps({'archs':a,'mm':list(mm)}))"
+    )
+    # `python` on PATH first (the activated env), then the interpreter running
+    # this script -- many hosts ship only `python3`, where `python` is missing.
+    for interpreter in ("python", sys.executable):
+        if not interpreter:
+            continue
+        try:
+            r = subprocess.run([interpreter, "-c", snippet], stdout=subprocess.PIPE,
+                               stderr=subprocess.PIPE, text=True, timeout=60)
+        except (OSError, subprocess.SubprocessError):
+            # Interpreter not found / not executable, or the import hung.
+            continue
+        lines = r.stdout.strip().splitlines()
+        if r.returncode != 0 or not lines:
+            continue
+        try:
+            # Only the last line is ours; anything before it is import-time
+            # output from vllm that happened to go to stdout.
+            d = json.loads(lines[-1])
+        except ValueError:
+            continue
+        if not isinstance(d, dict):
+            continue
+        mm = set(d.get("mm", []))
+        return {a: ("multimodal" if a in mm else "text") for a in d.get("archs", [])}, "vllm-import"
+    return None, None
+
+
+def main():
+    """CLI entry point: print a JSON support verdict for --model-id and exit.
+
+    Exit status is 0 when the model is supported as a generation endpoint or when
+    support cannot be determined, and 1 when the registry positively says the
+    model is unknown to vLLM or is a pooling-type model.
+    """
+    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--model-id", required=True)
+    p.add_argument("--revision", default="main")
+    p.add_argument("--vllm-version", default="0.22.0", help="pin the registry to this vLLM version (from data/epyc.json)")
+    a = p.parse_args()
+    token = os.environ.get("HF_TOKEN", "")
+
+    # Step 1: what does the model itself declare?
+    archs, aerr = model_architectures(a.model_id, a.revision, token)
+    if not archs:
+        # Cannot read the config (gated/offline) -- do not positively block; the
+        # gating check and launch will catch real problems.
+        print(json.dumps({"model_id": a.model_id, "supported": None, "kind": "undetermined",
+                           "message": f"Could not read architectures ({aerr or 'none declared'}); support unverified. "
+                                      "If gated, set HF_TOKEN. This does not bypass the gating/launch checks."}, indent=2))
+        sys.exit(0)
+
+    # Step 2: load the registry -- pinned GitHub copy first, local vllm second.
+    reg, source = registry_from_github(a.vllm_version)
+    if reg is None:
+        reg, source = registry_from_local()
+    if reg is None:
+        # Neither source worked: undetermined, so do not block (exit 0).
+        print(json.dumps({"model_id": a.model_id, "architectures": archs, "supported": None,
+                           "kind": "undetermined",
+                           "message": "Could not load vLLM's model registry (no network and no importable vllm); "
+                                      "support unverified. vLLM confirms support at load (no-retry rule applies)."}, indent=2))
+        sys.exit(0)
+
+    # Step 3: classify. `known` keeps only the architectures the registry lists.
+    kinds = [reg.get(arch) for arch in archs]
+    known = [k for k in kinds if k]
+    out = {"model_id": a.model_id, "architectures": archs, "registry_source": source}
+
+    # None of the declared architectures is registered -> positively unsupported.
+    if not known:
+        out.update(supported=False, kind="unsupported",
+                   message=f"vLLM has no registry entry for {archs}; it cannot serve this model on any backend. Stop.")
+        print(json.dumps(out, indent=2))
+        sys.exit(1)
+
+    # One generation-capable architecture is enough; multimodal wins the label.
+    if any(k in ("text", "multimodal") for k in known):
+        kind = "multimodal" if "multimodal" in known else "text"
+        msg = f"vLLM supports {archs} as a {kind} generation endpoint."
+        if kind == "multimodal":
+            msg += " A multimodal arch may still hit a GPU-only kernel on CPU; that surfaces at load (no-retry rule applies)."
+        out.update(supported=True, kind=kind, message=msg)
+        print(json.dumps(out, indent=2))
+        sys.exit(0)
+
+    # Everything that is registered is a pooling-type model -> reject.
+    out.update(supported=False, kind="pooling",
+               message=f"{archs} is a pooling/embedding/reranker model in vLLM, not a chat/completion endpoint. Stop.")
+    print(json.dumps(out, indent=2))
+    sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/serving-llms-on-epyc/scripts/cpu_tune.py b/skills/serving-llms-on-epyc/scripts/cpu_tune.py
new file mode 100644
index 0000000..bf84acc
--- /dev/null
+++ b/skills/serving-llms-on-epyc/scripts/cpu_tune.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+"""
+Derive vLLM-on-CPU runtime knobs from the host, for a single instance pinned to
+ONE socket (with its memory). Read-only.
+
+Socket choice (dual-socket hosts): vLLM scales poorly across sockets, so we run on
+one. We sample per-socket CPU load (~0.5s via /proc/stat) and prefer a free socket:
+  - both sockets below --busy-threshold  -> socket 0 (deterministic; both free)
+  - exactly one below the threshold       -> that socket
+  - both at/above the threshold           -> WARN and proceed on the least-busy one
+  - --socket N                            -> force a socket; load is still sampled, but only for the report
+A single-socket host just uses socket 0. (NPS2/NPS4 -> a socket spans multiple
+NUMA nodes; we bind memory to all of the chosen socket's nodes.)
+
+Emits two env vars:
+  - VLLM_CPU_OMP_THREADS_BIND : physical cores of the chosen socket (SMT siblings
+    dropped). vLLM sets OMP_NUM_THREADS itself (= len(cores)), so we don't.
+  - VLLM_CPU_KVCACHE_SPACE    : KV-cache RAM (GB), sized from the chosen socket's
+    LOCAL RAM (not whole-system) so the pool stays on-socket.
+
+And a memory-bound pin for the chosen socket:
+  - container : --cpuset-cpus=<phys cores> --cpuset-mems=<socket nodes>
+  - conda     : numactl --cpunodebind=<nodes> --membind=<nodes>  (preferred)
+                falls back to  taskset -c <phys cores>  (CPU-only, no mem bind)
+                if neither exists, reported -- launch proceeds unpinned.
+
+Not set: OMP_NUM_THREADS (vLLM derives it) and VLLM_CPU_NUM_OF_RESERVED_CPU
+(vLLM has its own default when unset).
+
+Usage:
+    python3 scripts/cpu_tune.py                       # export lines for `eval`
+    python3 scripts/cpu_tune.py --format json         # machine-readable
+    python3 scripts/cpu_tune.py --socket 1            # force socket 1
+    python3 scripts/cpu_tune.py --busy-threshold 70   # "free" means < 70% busy
+"""
+
+import argparse
+import json
+import re
+import shutil
+import subprocess
+import sys
+import time
+
+OS_HEADROOM_GB = 16
+
+
+def _sh(cmd):
+    """Run `cmd` through the shell (15 s limit) and return whatever it wrote to stdout.
+
+    Best-effort: if the command cannot be run or times out, "" is returned.
+    The exit status and stderr are not reported.
+    """
+    try:
+        proc = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=15)
+    except Exception:
+        return ""
+    return proc.stdout
+
+
+def _lscpu_int(out, label, default):
+    """Return the integer value of the `label:` field in `lscpu` output.
+
+    Leading spaces/tabs before the label are accepted, because recent util-linux
+    releases indent sub-fields (e.g. "    Socket(s):  2") under their parent.
+    `default` is returned when the field is absent or does not start with digits.
+    """
+    m = re.search(rf"^[ \t]*{re.escape(label)}:\s*(\d+)", out, re.MULTILINE)
+    return int(m.group(1)) if m else default
+
+
+def _ranges(items):
+    """Render ints as a compact cpuset-style list, e.g. [0, 1, 2, 5] -> '0-2,5'.
+
+    The input is sorted first; an empty input yields ''.
+    """
+    spans = []  # [first, last] pairs of consecutive runs, in ascending order
+    for value in sorted(items):
+        if spans and value == spans[-1][1] + 1:
+            spans[-1][1] = value  # extends the current run
+        else:
+            spans.append([value, value])
+    return ",".join(f"{lo}-{hi}" if lo != hi else f"{lo}" for lo, hi in spans)
+
+
+def topology():
+    """Per-socket layout parsed from `lscpu -p=CPU,CORE,SOCKET,NODE`.
+
+    Returns (socks, cpu_socket):
+      socks      {sid: {"phys", "all", "nodes", "_cores"}} where phys = the first
+                 CPU seen for each core (SMT siblings dropped), all = every
+                 logical CPU, nodes = set of NUMA node ids on that socket and
+                 _cores = the core ids already seen (bookkeeping).
+      cpu_socket {cpu: sid}
+    Both are empty when lscpu produced nothing usable.
+
+    Incomplete rows never raise: a row without a numeric CPU id is skipped, a
+    blank/non-numeric SOCKET is treated as socket 0, and a blank/non-numeric
+    NODE falls back to the socket id.
+    """
+    socks, cpu_socket = {}, {}
+    for line in _sh("lscpu -p=CPU,CORE,SOCKET,NODE").splitlines():
+        if line.startswith("#") or not line.strip():
+            continue
+        parts = [field.strip() for field in line.split(",")]
+        if len(parts) < 3 or not parts[0].isdigit():
+            continue
+        cpu, core = int(parts[0]), parts[1]
+        sid = int(parts[2]) if parts[2].isdigit() else 0
+        node = int(parts[3]) if len(parts) > 3 and parts[3].isdigit() else sid
+        s = socks.setdefault(sid, {"phys": [], "all": [], "nodes": set(), "_cores": set()})
+        s["all"].append(cpu)
+        s["nodes"].add(node)
+        cpu_socket[cpu] = sid
+        # The first logical CPU of each core stands for the physical core.
+        if core not in s["_cores"]:
+            s["_cores"].add(core)
+            s["phys"].append(cpu)
+    return socks, cpu_socket
+
+
+def node_ram_gb(node, sysfs_root="/sys/devices/system/node"):
+    """Total RAM of NUMA node `node` in whole GiB (rounded down); 0 if unknown.
+
+    Reads `<sysfs_root>/node<node>/meminfo`. Its lines look like
+    "Node 0 MemTotal:  263856132 kB", so the number has to be taken after the
+    "MemTotal:" label -- the first digits on the line are the node id.
+    `sysfs_root` exists for hosts/tests where sysfs is mounted elsewhere.
+    """
+    try:
+        with open(f"{sysfs_root}/node{node}/meminfo") as f:
+            text = f.read()
+    except OSError:
+        return 0
+    m = re.search(r"MemTotal:\s*(\d+)", text)
+    return (int(m.group(1)) // (1024 * 1024)) if m else 0
+
+
+def socket_busy_pct(cpus, interval=0.5):
+    """Mean CPU-busy% across `cpus` over `interval` seconds, from /proc/stat.
+
+    Two snapshots of the per-CPU counters are taken `interval` seconds apart.
+    busy% = 100 * (1 - idle_delta / total_delta), where idle = idle + iowait.
+    CPUs missing from either snapshot are ignored; 0.0 is returned when no time
+    elapsed for the selected CPUs (including an empty `cpus`).
+    """
+    def snap():
+        """Return {cpu: (idle_ticks, total_ticks)} for every `cpuN` line."""
+        d = {}
+        with open("/proc/stat") as f:  # closed deterministically
+            for ln in f:
+                if ln.startswith("cpu") and len(ln) > 3 and ln[3].isdigit():
+                    p = ln.split()
+                    vals = list(map(int, p[1:]))
+                    idle = vals[3] + (vals[4] if len(vals) > 4 else 0)
+                    # Only the first 8 fields (user..steal) are summed: the
+                    # trailing guest/guest_nice are already part of user/nice.
+                    d[int(p[0][3:])] = (idle, sum(vals[:8]))
+        return d
+
+    a = snap()
+    time.sleep(interval)
+    b = snap()
+    seen = [c for c in cpus if c in a and c in b]
+    di = sum(b[c][0] - a[c][0] for c in seen)
+    dt = sum(b[c][1] - a[c][1] for c in seen)
+    return round(100 * (1 - di / dt), 1) if dt else 0.0
+
+
+def main():
+    """CLI entry point: pick a socket, size the knobs, print them.
+
+    Steps: read the topology, sample per-socket load, choose a socket (forced,
+    single, free, or least busy), derive the core list and KV-cache size from
+    that socket, work out the container/conda pinning, then print either shell
+    `export` lines (default) or a JSON document.
+
+    A `--socket` value that does not exist on this host is not fatal: automatic
+    selection is used instead and the fact is reported in the warning.
+    """
+    p = argparse.ArgumentParser(description=__doc__,
+                                formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--kv-frac", type=float, default=0.4, help="fraction of the chosen socket's RAM for KV cache")
+    p.add_argument("--socket", type=int, default=None, help="force a socket id (skips the load check)")
+    p.add_argument("--busy-threshold", type=float, default=15.0,
+                   help="a socket is 'free' if its CPU-busy%% is below this (default 15)")
+    p.add_argument("--format", choices=["env", "json"], default="env")
+    args = p.parse_args()
+
+    socks, _ = topology()
+    if not socks:  # lscpu -p unavailable; degrade to a no-pin single instance
+        print('export VLLM_CPU_KVCACHE_SPACE=4' if args.format == "env"
+              else json.dumps({"error": "no topology from lscpu -p"}))
+        return
+
+    sids = sorted(socks)
+    busy = {s: socket_busy_pct(socks[s]["all"]) for s in sids}
+
+    warnings = []
+    if args.socket is not None and args.socket not in socks:
+        # Previously ignored without a word; keep the fallback but say so.
+        warnings.append(f"--socket {args.socket} does not exist on this host (sockets: {sids}); "
+                        f"falling back to automatic selection.")
+
+    if args.socket is not None and args.socket in socks:
+        chosen, reason = args.socket, "forced via --socket"
+    elif len(sids) == 1:
+        chosen, reason = sids[0], "single socket"
+    else:
+        free = [s for s in sids if busy[s] < args.busy_threshold]
+        if len(free) >= 2:
+            chosen, reason = sids[0], f"both sockets free (<{args.busy_threshold}% busy) -> socket 0"
+        elif len(free) == 1:
+            chosen, reason = free[0], f"only free socket (<{args.busy_threshold}% busy)"
+        else:
+            chosen = min(sids, key=lambda s: busy[s])
+            reason = f"all sockets busy (>={args.busy_threshold}%) -> least-busy"
+            warnings.append(f"all {len(sids)} sockets are busy (>= {args.busy_threshold}%): "
+                            f"{ {s: busy[s] for s in sids} }. Proceeding on the least-busy socket "
+                            f"{chosen}; performance may suffer. Pass --socket N to override.")
+    warn = " ".join(warnings)
+
+    sock = socks[chosen]
+    bind = _ranges(sock["phys"])
+    nodes = sorted(sock["nodes"])
+    nodes_str = _ranges(nodes)
+
+    # KV cache is sized from the chosen socket's own RAM so it stays on-socket.
+    sock_ram = sum(node_ram_gb(n) for n in nodes)
+    if sock_ram <= 0:  # sysfs unavailable: fall back to total/sockets
+        m = re.search(r"MemTotal:\s*(\d+)", _sh("grep MemTotal /proc/meminfo"))
+        total = int(m.group(1)) // (1024 * 1024) if m else 0
+        sock_ram = total // max(1, len(sids))
+    if sock_ram <= 2 * OS_HEADROOM_GB:
+        # Small socket: the fixed headroom would eat everything, so take half.
+        kv = max(1, int(sock_ram * 0.5))
+    else:
+        # Requested fraction, but never closer than the headroom to the limit.
+        kv = max(1, min(int(sock_ram * args.kv_frac), sock_ram - OS_HEADROOM_GB))
+
+    container_cpuset = f"--cpuset-cpus={bind} --cpuset-mems={nodes_str}"
+    if shutil.which("numactl"):
+        conda_prefix = f"numactl --cpunodebind={nodes_str} --membind={nodes_str}"
+        conda_pin = "numactl (cpu + memory bound to the socket's nodes)"
+    elif shutil.which("taskset"):
+        conda_prefix = f"taskset -c {bind}"
+        conda_pin = "taskset (CPU-only; memory NOT node-bound -- numactl not found)"
+    else:
+        conda_prefix = ""
+        conda_pin = "none (no numactl/taskset; launching unpinned -- install numactl for memory binding)"
+
+    nps_note = ""
+    if len(nodes) > 1:
+        nps_note = (f"socket {chosen} spans {len(nodes)} NUMA nodes (NPS{len(nodes)}); memory is "
+                    f"bound across nodes {nodes_str}. Finer per-node binding could add performance.")
+
+    result = {
+        "chosen_socket": chosen,
+        "socket_choice_reason": reason,
+        "sockets": len(sids),
+        "socket_busy_pct": busy,
+        "busy_threshold": args.busy_threshold,
+        "vllm_cpu_omp_threads_bind": bind,
+        "vllm_cpu_kvcache_space_gb": kv,
+        "socket_ram_gb": sock_ram,
+        "numa_nodes_on_socket": nodes,
+        "container_cpuset": container_cpuset,
+        "conda_launch_prefix": conda_prefix,
+        "conda_pin_tool": conda_pin,
+        "warning": warn,
+        "nps_note": nps_note,
+    }
+
+    if args.format == "json":
+        print(json.dumps(result, indent=2))
+        return
+
+    # env format: two real exports, everything else as shell comments.
+    print(f'export VLLM_CPU_OMP_THREADS_BIND="{bind}"')
+    print(f"export VLLM_CPU_KVCACHE_SPACE={kv}")
+    print(f"# socket {chosen} ({reason}); per-socket busy%: {busy}")
+    print(f"#   container: {container_cpuset}")
+    print(f"#   conda:     {conda_prefix or '(unpinned)'} vllm serve ...   [{conda_pin}]")
+    if warn:
+        print(f"# WARNING: {warn}")
+    if nps_note:
+        print(f"# NOTE: {nps_note}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/serving-llms-on-epyc/scripts/detect.py b/skills/serving-llms-on-epyc/scripts/detect.py
new file mode 100644
index 0000000..c0c3340
--- /dev/null
+++ b/skills/serving-llms-on-epyc/scripts/detect.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+Detect AMD EPYC CPU hardware for vLLM + zentorch serving.
+
+Usage:
+    python3 scripts/detect.py
+    python3 scripts/detect.py --host user@hostname
+
+Output: JSON with cpu_model, is_amd_epyc, logical_cores, physical_cores,
+sockets, threads_per_core, numa_nodes, memory_gb, epyc_generation
+(Naples/Rome/Milan/Genoa/Bergamo/Siena/Turin), zen_arch, and avx512. Exits 0 on
+success, 1 if no CPU info could be read.
+
+Env vars (used when --host is not given):
+    ZEN_SSH_HOST, ZEN_SSH_USER, ZEN_SSH_PORT
+"""
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+
+
+def _is_local(host):
+    """True when `host` is empty/None or one of the local aliases."""
+    if not host:
+        return True
+    return host in ("local", "localhost", "127.0.0.1")
+
+
+def _run(cmd, host, user, port, timeout=20):
+    """Run shell command `cmd` locally or on `host` over ssh.
+
+    Local targets (see _is_local) run through the local shell; anything else
+    runs via non-interactive ssh (BatchMode, so it never prompts for a password).
+
+    Returns (returncode, stdout, stderr). Never raises for a timeout or a
+    command/ssh binary that cannot be started: those come back as
+    (1, "", reason) so the caller can report them like any other failure.
+    """
+    if _is_local(host):
+        argv, use_shell = cmd, True
+    else:
+        ssh_target = f"{user}@{host}" if user else host
+        argv = ["ssh", "-o", "StrictHostKeyChecking=accept-new",
+                "-o", "ConnectTimeout=15", "-o", "BatchMode=yes",
+                "-o", "LogLevel=ERROR", "-p", str(port), ssh_target, cmd]
+        use_shell = False
+    try:
+        r = subprocess.run(argv, shell=use_shell, stdout=subprocess.PIPE,
+                           stderr=subprocess.PIPE, text=True, timeout=timeout)
+    except subprocess.TimeoutExpired:
+        return 1, "", f"timed out after {timeout}s"
+    except OSError as e:  # e.g. ssh is not installed
+        return 1, "", str(e)
+    return r.returncode, r.stdout, r.stderr
+
+
+def _lscpu_field(lscpu_out, label):
+    """Return the text after `label:` in `lscpu` output, stripped; "" if absent.
+
+    Leading spaces/tabs before the label are accepted, because recent util-linux
+    releases indent sub-fields (e.g. "  Model name: ...") under their parent.
+    Only blanks on the same line are skipped after the colon, so a field with an
+    empty value yields "" instead of swallowing the following line.
+    """
+    m = re.search(rf"^[ \t]*{re.escape(label)}:[ \t]*(.+)$", lscpu_out, re.MULTILINE)
+    return m.group(1).strip() if m else ""
+
+
+def _epyc_generation(model):
+    """Translate an EPYC model string into (generation, zen_arch).
+
+    The four-digit part number carries the generation in its first and last
+    digit: 7xx1 Naples/Zen1, 7xx2 Rome/Zen2, 7xx3 Milan/Zen3, 8xx4 Siena/Zen4c,
+    97x4 Bergamo/Zen4c, other 9xx4 Genoa/Zen4, 9xx5 Turin/Zen5. Matching is
+    case-insensitive. Anything else -- including names without "EPYC <4 digits>"
+    -- gives ("unknown", "unknown").
+    """
+    unknown = ("unknown", "unknown")
+    found = re.search(r"EPYC\s+(\d{4})", model.upper())
+    if found is None:
+        return unknown
+    digits = found.group(1)
+    series, suffix = digits[0], digits[3]
+
+    if series == "7":
+        seven_series = {
+            "1": ("Naples", "Zen1"),
+            "2": ("Rome", "Zen2"),
+            "3": ("Milan", "Zen3"),
+        }
+        return seven_series.get(suffix, unknown)
+    if series == "8":
+        return ("Siena", "Zen4c") if suffix == "4" else unknown
+    if series == "9":
+        if suffix == "5":
+            return ("Turin", "Zen5")
+        if suffix == "4":
+            # 97x4 parts are the dense-core Bergamo line; the rest is Genoa.
+            return ("Bergamo", "Zen4c") if digits[1] == "7" else ("Genoa", "Zen4")
+    return unknown
+
+
+def main():
+    """CLI entry point: probe the (local or ssh) target and print a JSON report.
+
+    Exits 1 with an {"error": ...} document when `lscpu` fails or prints
+    nothing; otherwise prints the hardware summary.
+    """
+    p = argparse.ArgumentParser(description=__doc__,
+                                formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--host", default="", help="[user@]host (default: local or ZEN_SSH_HOST)")
+    p.add_argument("--user", default="")
+    p.add_argument("--port", type=int, default=0)
+    args = p.parse_args()
+
+    # Target resolution: a user@ prefix in --host wins, then --user, then the
+    # ZEN_SSH_* environment variables; the port defaults to 22.
+    host, user = args.host, args.user
+    if "@" in host:
+        user, host = host.split("@", 1)
+    host = host or os.environ.get("ZEN_SSH_HOST", "")
+    user = user or os.environ.get("ZEN_SSH_USER", "")
+    port = args.port or int(os.environ.get("ZEN_SSH_PORT", "22"))
+
+    rc, lscpu_out, err = _run("lscpu", host, user, port)
+    if rc != 0 or not lscpu_out:
+        print(json.dumps({"error": "lscpu failed",
+                          "detail": err.strip() or f"exit {rc}"}))
+        sys.exit(1)
+
+    model = _lscpu_field(lscpu_out, "Model name") or "unknown"
+    vendor = _lscpu_field(lscpu_out, "Vendor ID")
+
+    def _int(label, default=0):
+        # Integer lscpu field; `default` when it is missing or not a number.
+        v = _lscpu_field(lscpu_out, label)
+        try:
+            return int(v)
+        except ValueError:
+            return default
+
+    sockets = _int("Socket(s)", 1)
+    cores_per_socket = _int("Core(s) per socket", 0)
+    threads_per_core = _int("Thread(s) per core", 1) or 1
+    numa_nodes = _int("NUMA node(s)", 1)
+
+    # Logical CPUs: prefer `nproc --all`; otherwise derive from the lscpu fields.
+    rc, nproc_out, _ = _run("nproc --all", host, user, port)
+    try:
+        logical = int(nproc_out.strip())
+    except (ValueError, AttributeError):
+        logical = sockets * cores_per_socket * threads_per_core
+
+    # Physical cores: from lscpu when it reported cores per socket, else by
+    # dividing the logical count by the SMT width.
+    physical = sockets * cores_per_socket if cores_per_socket else logical // threads_per_core
+
+    # MemTotal is reported in kB; the result is whole GiB, rounded down.
+    rc, mem_out, _ = _run("grep MemTotal /proc/meminfo", host, user, port)
+    mem_kb = 0
+    m = re.search(r"(\d+)", mem_out or "")
+    if m:
+        mem_kb = int(m.group(1))
+    memory_gb = mem_kb // (1024 * 1024)
+
+    is_epyc = vendor == "AuthenticAMD" and "EPYC" in model.upper()
+    generation, zen_arch = _epyc_generation(model)
+    avx512 = "avx512f" in _lscpu_field(lscpu_out, "Flags").split()
+
+    print(json.dumps({
+        "cpu_model": model,
+        "vendor": vendor,
+        "is_amd_epyc": is_epyc,
+        "epyc_generation": generation,
+        "zen_arch": zen_arch,
+        "avx512": avx512,
+        "logical_cores": logical,
+        "physical_cores": physical,
+        "sockets": sockets,
+        "threads_per_core": threads_per_core,
+        "numa_nodes": numa_nodes,
+        "memory_gb": memory_gb,
+        "target": "local" if _is_local(host) else host,
+    }, indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/serving-llms-on-epyc/scripts/estimate_memory.py b/skills/serving-llms-on-epyc/scripts/estimate_memory.py
new file mode 100644
index 0000000..75c50ad
--- /dev/null
+++ b/skills/serving-llms-on-epyc/scripts/estimate_memory.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+"""
+Does a HuggingFace model fit in host RAM for CPU serving at a given context?
+
+No download -- reads HF metadata over HTTP. Answers one question:
+    weights + KV(max_model_len x num_prompts) + reserve  <=  RAM ?
+If not, prints the largest max_model_len that would fit, so you reduce it and
+retry. Exit 0 = fits, 1 = does not fit (or error).
+
+    estimate_memory.py --model-id Qwen/Qwen3-8B --ram-gb 755 --max-model-len 4096 --num-prompts 8
+
+Three sub-problems, one function each: weight_gb(), kv_bytes_per_token(), fit().
+Env: HF_TOKEN for gated models. --weight-gb overrides weights if metadata is missing.
+"""
+
+import argparse
+import json
+import os
+import sys
+import urllib.request
+import urllib.error
+
+HF = "https://huggingface.co"
+KV_BYTES_PER_ELEM = 2  # zentorch CPU KV cache is bf16-only (2 bytes); no fp8 KV support
+
+
+def _get(url, token):
+    """Fetch `url` and parse the response body as JSON.
+
+    A non-empty `token` is sent as a Bearer Authorization header. Never raises:
+    the result is (data, None) on success or (None, error_message) on failure.
+    """
+    request = urllib.request.Request(url, headers={"User-Agent": "estimate-memory/2"})
+    if token:
+        request.add_header("Authorization", f"Bearer {token}")
+    # Friendlier wording for the HTTP statuses a model lookup commonly returns.
+    known_http_errors = {
+        401: "not found, or gated (set HF_TOKEN if it is gated)",
+        403: "access denied -- accept the model license on HuggingFace",
+        404: "model not found",
+    }
+    try:
+        with urllib.request.urlopen(request, timeout=30) as response:
+            data = json.load(response)
+        return data, None
+    except urllib.error.HTTPError as exc:
+        return None, known_http_errors.get(exc.code, f"HTTP {exc.code}")
+    except Exception as exc:
+        # Network failures and invalid JSON both end up here, as text.
+        return None, str(exc)
+
+
+def weight_gb(model, rev, token):
+    """(1) Weight RAM = total size of the checkpoint's weight files, in GiB.
+
+    Sizes come from the HF file tree, so nothing is downloaded and quantized
+    checkpoints are measured as stored. Many repos ship the same weights twice
+    (`.safetensors` plus legacy `.bin`); only one copy is loaded, so the
+    `.safetensors` total is used when there is one and the `.bin` total
+    (files with "model" in their path) only as a fallback.
+
+    Returns (gb, None), or (None, error) when the tree cannot be fetched or
+    holds no weight files.
+    """
+    tree, err = _get(f"{HF}/api/models/{model}/tree/{rev}", token)
+    if not isinstance(tree, list):
+        return None, err or "no file tree"
+    safetensors_bytes = bin_bytes = 0
+    for f in tree:
+        if f.get("type") != "file":
+            continue
+        path = f.get("path", "")
+        size = f.get("size", 0) or 0
+        if path.endswith(".safetensors"):
+            safetensors_bytes += size
+        elif path.endswith(".bin") and "model" in path.lower():
+            bin_bytes += size
+    total = safetensors_bytes or bin_bytes
+    if total == 0:
+        return None, "no weight files (.safetensors/.bin) found -- pass --weight-gb"
+    return round(total / 2**30, 2), None
+
+
+def get_config(model, rev, token):
+    """Fetch config.json; for multimodal models return the language-model part.
+
+    When the top level has no `num_hidden_layers`, the first of `text_config`,
+    `language_config`, `llm_config` that is a dict with a truthy
+    `num_hidden_layers` is returned as a copy, inheriting the top-level
+    `max_position_embeddings` if it has none of its own. Otherwise the config is
+    returned unchanged (None when it could not be fetched).
+    """
+    cfg, _ = _get(f"{HF}/{model}/resolve/{rev}/config.json", token)
+    if not cfg or "num_hidden_layers" in cfg:
+        return cfg
+    for key in ("text_config", "language_config", "llm_config"):
+        nested = cfg.get(key)
+        if isinstance(nested, dict) and nested.get("num_hidden_layers"):
+            unwrapped = dict(nested)
+            unwrapped.setdefault("max_position_embeddings", cfg.get("max_position_embeddings"))
+            return unwrapped
+    return cfg
+
+
+def kv_bytes_per_token(cfg):
+    """(2) KV-cache bytes per token = 2(K,V) x layers x kv_heads x head_dim x 2 (bf16).
+
+    zentorch CPU caches KV in bf16 only. MLA models (DeepSeek) cache a compressed
+    latent: when `kv_lora_rank` is set, the per-layer width is
+    kv_lora_rank + qk_rope_head_dim instead of kv_heads x head_dim.
+
+    Returns 0 when `cfg` is empty or has no layer count. Fields that are present
+    but null in config.json are treated like missing ones, so a partly filled
+    config never raises.
+    """
+    if not cfg or not cfg.get("num_hidden_layers"):
+        return 0
+    nbytes = KV_BYTES_PER_ELEM
+    layers = cfg["num_hidden_layers"]
+    latent = cfg.get("kv_lora_rank")
+    if latent is not None:  # MLA: latent KV
+        return 2 * layers * (latent + (cfg.get("qk_rope_head_dim") or 0)) * nbytes
+    heads = cfg.get("num_attention_heads") or 0
+    # Without a separate KV head count every attention head has its own K/V.
+    kv_heads = cfg.get("num_key_value_heads") or heads
+    head_dim = cfg.get("head_dim") or ((cfg.get("hidden_size") or 0) // max(1, heads))
+    return 2 * layers * kv_heads * head_dim * nbytes
+
+
+def fit(weight, kv_per_tok, ctx, prompts, ram, reserve):
+    """(3) Decide whether weights + KV cache + reserve fit in `ram` (all in GB).
+
+    KV cache = kv_per_tok bytes x ctx tokens x prompts sequences. When it does
+    not fit, the result also carries `suggested_max_model_len` -- the largest
+    context, rounded down to a multiple of 256, that the leftover RAM can hold --
+    and an `action` telling the user what to do next.
+    """
+    kv_gb = kv_per_tok * ctx * prompts / 2**30
+    required = round(weight + kv_gb + reserve, 2)
+    verdict = {
+        "max_model_len": ctx,
+        "num_prompts": prompts,
+        "weight_gb": weight,
+        "kv_cache_gb": round(kv_gb, 2),
+        "reserve_gb": reserve,
+        "required_gb": required,
+        "ram_gb": ram,
+        "fits": required <= ram,
+    }
+    if verdict["fits"]:
+        return verdict
+
+    # RAM left for the KV cache once weights and the reserve are paid for.
+    budget = (ram - weight - reserve) * 2**30
+    if kv_per_tok and budget > 0:
+        best = int(budget / (kv_per_tok * prompts)) // 256 * 256
+    else:
+        best = 0
+    verdict["suggested_max_model_len"] = max(0, best)
+    if best >= 256:
+        verdict["action"] = f"reduce --max-model-len to {best} or less and retry"
+    else:
+        verdict["action"] = "weights alone exceed RAM -- use a smaller model"
+    return verdict
+
+
+def main():
+    """CLI entry point: print the memory estimate as JSON and exit.
+
+    Exit status is 1 when the weight size cannot be determined or when --ram-gb
+    was given and the model does not fit; otherwise 0. If the KV-cache size per
+    token cannot be derived from config.json, the estimate is still printed but
+    carries a `warning`, because the verdict then ignores the KV cache.
+    """
+    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--model-id", required=True)
+    p.add_argument("--revision", default="main")
+    p.add_argument("--ram-gb", type=float, default=0, help="host RAM (enables the fit verdict)")
+    p.add_argument("--max-model-len", type=int, default=4096)
+    p.add_argument("--num-prompts", type=int, default=1, help="concurrent sequences")
+    p.add_argument("--reserve-gb", type=float, default=16, help="RAM held back for OS + vLLM runtime")
+    p.add_argument("--weight-gb", type=float, default=0, help="override weight RAM if metadata is unavailable")
+    a = p.parse_args()
+    token = os.environ.get("HF_TOKEN", "")
+
+    # An explicit --weight-gb skips the HF file-tree lookup entirely.
+    w = a.weight_gb if a.weight_gb > 0 else None
+    if w is None:
+        w, err = weight_gb(a.model_id, a.revision, token)
+        if w is None:
+            print(json.dumps({"error": err, "model_id": a.model_id}))
+            sys.exit(1)
+
+    cfg = get_config(a.model_id, a.revision, token)
+    kv_per_tok = kv_bytes_per_token(cfg)
+    max_seq = cfg.get("max_position_embeddings") if cfg else None
+    # Never budget for more context than the model itself supports.
+    ctx = min(a.max_model_len, max_seq) if max_seq else a.max_model_len
+
+    out = {"model_id": a.model_id, "weight_gb": w, "kv_dtype": "bf16",
+           "kv_bytes_per_token": kv_per_tok, "model_max_seq_len": max_seq}
+    if not kv_per_tok:
+        out["warning"] = ("could not derive the KV-cache size from config.json (missing or unreadable); "
+                          "the KV cache is counted as 0, so any fit verdict covers weights + reserve only")
+    if a.ram_gb > 0:
+        out["fit"] = fit(w, kv_per_tok, ctx, a.num_prompts, a.ram_gb, a.reserve_gb)
+
+    print(json.dumps(out, indent=2))
+    # Without --ram-gb there is no verdict, which counts as success.
+    sys.exit(0 if out.get("fit", {"fits": True})["fits"] else 1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/serving-llms-on-epyc/scripts/validate.py b/skills/serving-llms-on-epyc/scripts/validate.py
new file mode 100644
index 0000000..95fd37a
--- /dev/null
+++ b/skills/serving-llms-on-epyc/scripts/validate.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+"""
+Validate the environment before serving vLLM + zentorch on an EPYC CPU host.
+
+Checks a container runtime (docker or podman), whether the vLLM+zentorch image
+is present (and, if already pulled, that `import vllm, zentorch` works inside it),
+a conda/host fallback (`import vllm, zentorch`), the host perf libraries
+(tcmalloc / OpenMP via LD_PRELOAD), HF_TOKEN, and RAM. Each issue is error
+(blocks launch) / warning (degrades) / advisory (info).
+
+Usage:
+    python3 scripts/validate.py
+    python3 scripts/validate.py --image amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23
+
+Exits 0 if no error-severity issues remain, 1 otherwise. JSON to stdout.
+"""
+
+import argparse
+import json
+import os
+import shutil
+import subprocess
+import sys
+
+
+def _sh(cmd, timeout=20):
+    """Run `cmd` through the shell; return (returncode, stdout, stderr) with output stripped."""
+    try:
+        done = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
+    except subprocess.TimeoutExpired:
+        return 1, "", f"timed out after {timeout}s"
+    return done.returncode, done.stdout.strip(), done.stderr.strip()
+
+
+def _detect_runtime():
+    """Pick an accessible container runtime: docker (daemon reachable) > podman
+    (rootless). Returns (runtime, detail) or (None, why).
+
+    Like serving-llms-on-instinct, an accessible runtime is a PREREQUISITE. We
+    check and report a one-time fix; we never escalate privileges (no sudo).
+    `why` joins every runtime's failure so a docker error is not masked by podman's.
+    """
+    probes = (("docker", "docker ps -q", "docker reachable"),
+              ("podman", "podman info --format '{{.Host.Arch}}'", "podman available (rootless)"))
+    reasons = []
+    for name, probe, ok_detail in probes:
+        if not shutil.which(name):
+            reasons.append(f"{name} not installed")
+            continue
+        rc, _, err = _sh(probe)
+        if rc == 0:
+            return name, ok_detail
+        reasons.append(f"{name}: {(err or 'probe failed').splitlines()[0][:120]}")
+    return None, "; ".join(reasons)
+
+
+def main():
+    """Run every pre-launch check, print the JSON report, and exit 0 (ready) or 1 (errors)."""
+    import shlex  # local import: only this function passes user input to a shell
+
+    p = argparse.ArgumentParser(description=__doc__,
+                                formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--image", default="", help="container image to check for (advisory)")
+    args = p.parse_args()
+
+    issues = []
+    probe = 'python -c "import vllm,zentorch;print(vllm.__version__,zentorch.__version__)"'
+
+    # 1. Container runtime (prerequisite): docker > podman, else conda fallback.
+    #    Importing vllm is slow: probe the host once, with the in-image time budget.
+    runtime, detail = _detect_runtime()
+    host_rc, host_ver, _ = _sh(probe, timeout=90)
+    conda_ok = host_rc == 0
+
+    if runtime is None:
+        if conda_ok:
+            issues.append({"check": "container_runtime", "severity": "warning",
+                           "message": f"No accessible container runtime ({detail}); using the conda/host path.",
+                           "fix": "For the container path, make docker accessible (docker group or start the daemon) or install rootless podman."})
+        else:
+            issues.append({"check": "container_runtime", "severity": "error",
+                           "message": f"No accessible container runtime ({detail}) and no host vllm+zentorch.",
+                           "fix": "One-time onboarding: add your user to the docker group "
+                                  "(sudo usermod -aG docker $USER, then re-login) or start the daemon; "
+                                  "OR install rootless podman; OR activate a conda env with vllm+zentorch."})
+
+    # 2. Image present + (only if already pulled) zentorch inside it. `image inspect`
+    #    reads local storage only, so validating never triggers a multi-GB pull, and
+    #    it resolves the exact reference (tag, digest, registry host:port).
+    if runtime and args.image:
+        image = shlex.quote(args.image)  # user-supplied text headed for a shell command
+        if _sh(f"{runtime} image inspect --format '{{{{.Id}}}}' {image}")[0] != 0:
+            issues.append({"check": "image", "severity": "advisory",
+                           "message": f"Image {args.image} not pulled yet; first launch will download it (in-image zentorch check deferred to launch).",
+                           "fix": f"{runtime} pull {args.image}"})
+        else:
+            rc, ver, err = _sh(f"{runtime} run --rm {image} {probe}", timeout=90)
+            if rc == 0 and ver:
+                issues.append({"check": "image_stack", "severity": "advisory",
+                               "message": f"Image has vllm+zentorch ({ver})."})
+            else:
+                issues.append({"check": "image_stack", "severity": "warning",
+                               "message": f"Image {args.image} is present but `import vllm, zentorch` failed inside it: {(err or 'unknown')[:120]}",
+                               "fix": "Use an image tag that bundles the zentorch plugin (see data/epyc.json)."})
+
+    # 3. Host vllm+zentorch (for the conda path)
+    if conda_ok:
+        issues.append({"check": "host_stack", "severity": "advisory",
+                       "message": f"Host vllm+zentorch importable ({host_ver}); conda path available."})
+    elif runtime:
+        issues.append({"check": "host_stack", "severity": "advisory",
+                       "message": "Host `import vllm, zentorch` not available; use the container path."})
+
+    # 4. HF_TOKEN
+    if not os.environ.get("HF_TOKEN"):
+        issues.append({"check": "hf_token", "severity": "advisory",
+                       "message": "HF_TOKEN not set. Required for gated models (Llama, Gemma); not needed for Qwen3.",
+                       "fix": "export HF_TOKEN=hf_..."})
+
+    # 5. RAM (whole GB from MemTotal; 0 means "could not be read" and skips the warning)
+    _, out, _ = _sh("grep MemTotal /proc/meminfo | awk '{print int($2/1024/1024)}'")
+    ram_gb = int(out) if out.isdecimal() else 0
+    if 0 < ram_gb < 32:
+        issues.append({"check": "ram", "severity": "warning",
+                       "message": f"Only {ram_gb} GB RAM. CPU serving keeps weights + KV cache in RAM; large models may not fit.",
+                       "fix": "Use a small model or a host with more RAM."})
+
+    # 6. Perf libraries for the host/conda path (advisory). vLLM CPU wants
+    #    libtcmalloc + libiomp (OpenMP) preloaded and warns otherwise. The
+    #    container image sets these itself, so only check the host when the
+    #    conda/host path is viable.
+    if conda_ok:
+        ld = os.environ.get("LD_PRELOAD", "")
+        missing = [lib for lib in ("libtcmalloc", "libiomp") if lib not in ld]
+        if missing:
+            issues.append({"check": "perf_libs", "severity": "advisory",
+                           "message": f"LD_PRELOAD is missing {', '.join(missing)}; vLLM CPU warns about this and throughput suffers without them (host/conda path).",
+                           "fix": "export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$CONDA_PREFIX/lib/libiomp5.so:$LD_PRELOAD"})
+
+    errors = [i for i in issues if i["severity"] == "error"]
+    result = {
+        "ready": len(errors) == 0,
+        "runtime": runtime,
+        "runtime_detail": detail,
+        "conda_path_available": conda_ok,
+        "ram_gb": ram_gb,
+        "errors": errors,
+        "warnings": [i for i in issues if i["severity"] == "warning"],
+        "advisories": [i for i in issues if i["severity"] == "advisory"],
+    }
+    print(json.dumps(result, indent=2))
+    sys.exit(0 if result["ready"] else 1)
+
+
+if __name__ == "__main__":  # run the validator only when executed as a script, not on import
+    main()
diff --git a/skills/serving-llms-on-epyc/skill-card.md b/skills/serving-llms-on-epyc/skill-card.md
new file mode 100644
index 0000000..120283f
--- /dev/null
+++ b/skills/serving-llms-on-epyc/skill-card.md
@@ -0,0 +1,13 @@
+# Skill Card
+
+## Description
+
+Serve a single LLM on an AMD EPYC CPU host with vLLM + zentorch (Docker, Podman, or conda), handling CPU detection, runtime/env validation, model + RAM-fit checks, hardware-sized threads/KV/NUMA, launch, and health verification. Reports and stops on failure; does not debug.
+
+## Owner
+
+AMD
+
+## License
+
+MIT