Skip to content

Commit f62dd74

Browse files
committed
add serving-llms-on-epyc skill (vLLM + zentorch CPU serving)
Signed-off-by: Lalithnarayan C <Lalithnarayan.C@amd.com> Change-Id: I1dc2362e0983326658b6618015a161ecd44f40e6
1 parent c8d6fe1 commit f62dd74

12 files changed

Lines changed: 1231 additions & 0 deletions

File tree

.claude-plugin/marketplace.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@
2424
"source": "./skills/magpie-kernel-evaluator",
2525
"description": "Performs GPU kernel correctness and performance evaluation and LLM inference benchmarking with Magpie. Analyzes single or multiple kernels (HIP/CUDA/PyTorch), compares kernel implementations, runs vLLM/SGLang benchmarks with profiling and TraceLens, and runs gap analysis on torch traces."
2626
},
27+
{
28+
"name": "serving-llms-on-epyc",
29+
"source": "./skills/serving-llms-on-epyc",
30+
"description": "Serve LLMs on AMD EPYC CPUs with vLLM + zentorch, in a container (Docker/Podman) or conda. Handles CPU detection, runtime/env validation, vLLM model-support and RAM-fit checks, hardware-sized threads/KV, launch, and health verification. Single instance; reports and stops on failure."
31+
},
2732
{
2833
"name": "serving-llms-on-instinct",
2934
"source": "./skills/serving-llms-on-instinct",

.cursor-plugin/marketplace.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@
2424
"source": "./skills/magpie-kernel-evaluator",
2525
"description": "Performs GPU kernel correctness and performance evaluation and LLM inference benchmarking with Magpie. Analyzes single or multiple kernels (HIP/CUDA/PyTorch), compares kernel implementations, runs vLLM/SGLang benchmarks with profiling and TraceLens, and runs gap analysis on torch traces."
2626
},
27+
{
28+
"name": "serving-llms-on-epyc",
29+
"source": "./skills/serving-llms-on-epyc",
30+
"description": "Serve LLMs on AMD EPYC CPUs with vLLM + zentorch, in a container (Docker/Podman) or conda. Handles CPU detection, runtime/env validation, vLLM model-support and RAM-fit checks, hardware-sized threads/KV, launch, and health verification. Single instance; reports and stops on failure."
31+
},
2732
{
2833
"name": "serving-llms-on-instinct",
2934
"source": "./skills/serving-llms-on-instinct",
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"""Behavioral tests for the `serving-llms-on-epyc` skill.
2+
3+
Run locally (needs the `claude` CLI authenticated; the agent does not actually
4+
launch a server in the judge's sandbox, so this grades the *plan/behavior*, not
5+
a live endpoint):
6+
7+
pytest eval/behavioral/tests/test_serving_llms_on_epyc.py -s
8+
9+
`logs_contains` is deterministic; `should` / `should_not` are graded by an LLM
10+
judge over the captured evidence (tool calls + outputs), so the agent's prose
11+
cannot fake a pass.
12+
"""
13+
14+
from harness import claude
15+
16+
17+
def test_serve_model_on_epyc():
18+
with claude("sonnet", skill="serving-llms-on-epyc") as agent:
19+
run = agent.prompt(
20+
"Serve Qwen/Qwen3-0.6B on this AMD EPYC box with vLLM and zentorch. "
21+
"Use the default settings."
22+
)
23+
24+
# Programmatic expectation: the skill was actually loaded.
25+
run.logs_contains("serving-llms-on-epyc")
26+
27+
# Positive behavioral expectations (the state machine).
28+
run.should("Detect the CPU and confirm it is an AMD EPYC host before serving (e.g. runs detect.py)")
29+
run.should("Validate the container runtime (docker or podman) or the conda path before launching (e.g. runs validate.py)")
30+
run.should("Take validate.py's environment advisories into account -- the tcmalloc / OpenMP (LD_PRELOAD) perf-library recommendation and, when the image is already pulled, the in-image vllm+zentorch check -- surfacing any that apply")
31+
run.should("Check that vLLM supports the model before serving (e.g. runs check_model.py), rather than refusing it just for being multimodal")
32+
run.should("Check that the model fits in host RAM (e.g. runs estimate_memory.py)")
33+
run.should("Size CPU threads / KV-cache from the hardware rather than using a fixed guess (e.g. runs cpu_tune.py)")
34+
run.should("Present a sized plan and ask the user to confirm before launching the server")
35+
run.should("Plan to launch with 'vllm serve' and poll until /health is healthy")
36+
37+
# Negative behavioral expectations (the explicit Don'ts).
38+
run.should_not("Pass '--device cpu' to vllm serve")
39+
run.should_not("Launch the server before the user has confirmed the plan")
40+
run.should_not("Enter a debugging loop or retry after a launch failure")
41+
run.should_not("Attempt GPU, ROCm, or Instinct serving")
Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
---
2+
name: serving-llms-on-epyc
3+
description: >-
4+
Serves a language model on an AMD EPYC CPU host using vLLM with the zentorch
5+
backend, in a container (Docker or Podman) or a conda env. Use whenever the
6+
user wants to run, serve, deploy, start, host, or launch an LLM on AMD EPYC,
7+
Zen CPU, "vLLM on CPU", "zentorch serving", or "serve a model without a GPU".
8+
Use for "serve Qwen on EPYC", "start a CPU vLLM endpoint", "run an OpenAI
9+
server on my EPYC box", or similar. Handles the full single-instance flow:
10+
detect the CPU (incl. EPYC generation), validate the runtime/env, check vLLM
11+
supports the model (via vLLM's registry, not a modality blocklist), check it
12+
fits host RAM, size CPU threads/KV/NUMA from the hardware, confirm the plan with
13+
the user, launch, and poll until the endpoint is responsive. Single instance
14+
only. Does NOT debug failures
15+
and does NOT retry -- it reports and stops. Do not use for GPU/Instinct (use
16+
serving-llms-on-instinct) or multi-node.
17+
allowed-tools: Bash, Read
18+
---
19+
20+
# Serving LLMs on AMD EPYC (vLLM + zentorch, CPU)
21+
22+
Bring up a single vLLM OpenAI endpoint on an AMD EPYC host with the zentorch CPU
23+
backend, sized to the hardware. Container-first (Docker or Podman); conda/host
24+
is the fallback.
25+
26+
Hard rule for this skill: **on any failure, report the cause + logs and STOP.
27+
Do not retry, do not debug.** (Debugging is a separate workflow.)
28+
29+
**The agent does the serve flow itself** -- pull, configure, launch, poll --
30+
using the runtime `validate.py` reports. Never hand the user per-serve commands.
31+
Like serving-llms-on-instinct, an accessible container runtime is a one-time
32+
**prerequisite**: if `validate.py` finds none, report its one-time fix (make
33+
docker accessible / install podman / provide a conda env) and stop. Do not
34+
attempt `sudo` or privilege escalation.
35+
36+
## Data file
37+
38+
Read `data/epyc.json` directly. It holds the container image, mandatory CPU run
39+
flags, supported precision, the model-support policy, the default model, and the
40+
verified throughput-flag gotcha. Do not hardcode the image tag from memory -- read it.
41+
42+
## Step 1: Detect the CPU
43+
44+
```bash
45+
python3 scripts/detect.py # add --host user@box for a remote host
46+
```
47+
48+
Returns `cpu_model`, `is_amd_epyc`, `epyc_generation`
49+
(Naples/Rome/Milan/Genoa/Bergamo/Siena/Turin), `zen_arch`, `avx512`,
50+
`logical_cores`, `physical_cores`, `sockets`, `numa_nodes`, `memory_gb`. If
51+
`is_amd_epyc` is `false`, stop: this skill targets AMD EPYC. (Other x86 may work
52+
but is unsupported here.) Carry `epyc_generation` / `avx512` through the later
53+
phases -- e.g. AVX-512 + bf16 land on Zen4+ (Genoa/Bergamo/Siena/Turin), and Turin packs up to
54+
128 cores/socket, which the thread-binding in Step 5 sizes from.
55+
56+
## Step 2: Validate the runtime and environment
57+
58+
```bash
59+
python3 scripts/validate.py --image <image from data/epyc.json>
60+
```
61+
62+
Returns `ready`, `runtime` (`docker`, `podman`, or null), `runtime_detail`,
63+
`conda_path_available`, `ram_gb`, and `errors/warnings/advisories`. Pick the path:
64+
- `runtime` is `docker` or `podman` -> container path (Step 6); use the reported runtime name verbatim.
65+
- `runtime` null but `conda_path_available: true` -> conda/host path.
66+
- `runtime` null and no conda -> `ready` is false. Report the one-time
67+
onboarding `fix` (make docker accessible / install podman / conda env) and stop.
68+
69+
Do not proceed if `ready` is `false`.
70+
71+
## Step 3: Resolve and validate the model
72+
73+
If the user named no model, use `default_model` from `data/epyc.json`
74+
(`Qwen/Qwen3-0.6B` -- ungated, tiny, fast first success). Otherwise use theirs.
75+
76+
Check that vLLM actually supports the model (do **not** blanket-block multimodal):
77+
78+
```bash
79+
python3 scripts/check_model.py --model-id <model> --vllm-version <vllm_version from data/epyc.json>
80+
```
81+
82+
- Exit 0 = vLLM serves it as a generation endpoint (`kind` `text` or `multimodal`),
83+
or support is undeterminable (gated/offline) -- proceed; launch confirms.
84+
- Exit 1 = positively unsupported: the architecture is not in vLLM's registry, or
85+
it is a `pooling`/embedding/reranker (not a chat/completion endpoint). Report the
86+
printed `message` and stop.
87+
- A `multimodal` model is allowed; a vLLM-supported multimodal arch may still hit a
88+
GPU-only kernel on CPU, which surfaces at load (the no-retry rule then applies).
89+
90+
**Precision/dtype**: native CPU dtypes are `bf16` (default), `fp16`, `fp32`. Use
91+
`bfloat16` unless the user asks otherwise.
92+
93+
For gated models (Llama, Gemma) `HF_TOKEN` must be set and the license accepted on
94+
HuggingFace; if not, stop and say so.
95+
96+
## Step 4: Check it fits host RAM
97+
98+
RAM is the ceiling on CPU (weights + KV cache both live in RAM). Run on ONE line:
99+
100+
```bash
101+
python3 scripts/estimate_memory.py --model-id <model> --ram-gb <memory_gb from detect> --max-model-len <4096 or user value> --num-prompts <1 or desired concurrency>
102+
```
103+
104+
Exit 0 = fits, exit 1 = does not fit. If `fit.fits` is false: **do not launch.**
105+
Tell the user `required_gb` vs `ram_gb` and the printed `fit.action` -- reduce
106+
`--max-model-len` to `fit.suggested_max_model_len` and re-run this fit check (not a launch retry), or use a smaller
107+
model. `--max-model-len` and `--num-prompts` are the two knobs that move KV.
108+
Extra flag: `--weight-gb N` overrides weights if a model has no HF metadata
109+
(rare). KV cache is bf16-only on zentorch CPU (no fp8 KV).
110+
111+
## Step 5: Size the CPU runtime from the hardware
112+
113+
```bash
114+
eval "$(python3 scripts/cpu_tune.py)" # or --format json to inspect
115+
```
116+
117+
Exports `VLLM_CPU_OMP_THREADS_BIND` (physical cores of **socket 0**) and
118+
`VLLM_CPU_KVCACHE_SPACE` (GB). It does **not** set `OMP_NUM_THREADS` (vLLM derives
119+
it from the bind list) or `VLLM_CPU_NUM_OF_RESERVED_CPU` (vLLM has its own default
120+
when unset). Default policy, the same for NPS1/NPS2/NPS4: a single instance uses
121+
**socket 0's whole CPU with no memory binding**. On a multi-socket host the JSON
122+
gives `container_cpuset` (`--cpuset-cpus` only -- no `--cpuset-mems`) for the
123+
container path; the conda path needs nothing extra (the bind env var binds the
124+
threads). If socket 0 spans multiple NUMA nodes (NPS2/NPS4), `perf_note` flags that
125+
optimal per-node binding could give more performance -- surface it, but proceed.
126+
127+
## Step 6: Confirm the plan, then launch (container-first)
128+
129+
Before launching, present this summary and **wait for the user to confirm** -- do
130+
not launch unprompted. This is the human gate before anything runs:
131+
132+
| Field | Value |
133+
|---|---|
134+
| Model / kind | `<model>` -- `text` or `multimodal` (from `check_model.py`) |
135+
| Path | container (`<runtime>`, image from `data/epyc.json`) or conda/host |
136+
| Precision | `bfloat16` (or the user's choice) |
137+
| Fit | required `<required_gb>` GB vs `<ram_gb>` GB RAM |
138+
| CPU sizing | thread bind `<VLLM_CPU_OMP_THREADS_BIND>` (socket 0), KV `<VLLM_CPU_KVCACHE_SPACE>` GB, no memory binding |
139+
| Hardware | EPYC `<epyc_generation>` (`<zen_arch>`), `<physical_cores>` cores, AVX-512 `<avx512>` |
140+
| Port | `<port>` |
141+
142+
Proceed only on a clear "go". If the user declines or wants changes (model,
143+
`--max-model-len`, port), stop and adjust -- do not launch.
144+
145+
Build the launch from `data/epyc.json`. The CLI is `vllm serve <model>`.
146+
**Do not pass `--device cpu`** on vLLM >= 0.20 -- the zentorch plugin
147+
auto-selects the CPU platform and `vllm serve` rejects the flag. Only add it if
148+
`vllm serve --help` lists it (older vLLM).
149+
150+
**Container path** (`runtime` from validate.py). The agent runs these itself,
151+
including the pull. `RT` is the resolved runtime verbatim:
152+
```bash
153+
RT="<runtime from validate.py: docker | podman>"
154+
$RT pull <image from data/epyc.json> # agent pulls; do not ask the user to
155+
$RT run -d --name vllm-epyc \
156+
<run_flags from data/epyc.json: --ipc=host --shm-size=16g --network=host> \
157+
<hf_cache_mount> \
158+
<container_cpuset from cpu_tune, on multi-socket: --cpuset-cpus=... (no --cpuset-mems)> \
159+
--env VLLM_CPU_OMP_THREADS_BIND="$VLLM_CPU_OMP_THREADS_BIND" \
160+
--env VLLM_CPU_KVCACHE_SPACE=$VLLM_CPU_KVCACHE_SPACE \
161+
--env HF_TOKEN=${HF_TOKEN} \
162+
<image from data/epyc.json> \
163+
vllm serve <model> --dtype bfloat16 --port <port> --max-model-len <len>
164+
```
165+
166+
**Conda/host path** (no container runtime, `conda_path_available` true). `eval`-ing
167+
cpu_tune already exported the env vars; just launch -- `VLLM_CPU_OMP_THREADS_BIND`
168+
binds the threads to socket 0, and there is no memory binding by default:
169+
```bash
170+
vllm serve <model> --dtype bfloat16 --port <port> --max-model-len <len> &
171+
```
172+
173+
Optional throughput flags are **opt-in and must move together** (see Gotchas):
174+
`TORCHINDUCTOR_FREEZING=1` + `VLLM_USE_AOT_COMPILE=0` (+ `ZENTORCH_WEIGHT_PREPACK=1`).
175+
The base launch sets none of them.
176+
177+
## Step 7: Poll until up and responsive
178+
179+
A 503 while loading is normal. Poll until the server answers, then prove the
180+
chat endpoint works. CPU first-token compile can take a minute or two.
181+
182+
```bash
183+
# container alive (or process alive for conda) + /health
184+
for i in $(seq 1 120); do
185+
# container path:
186+
$RT inspect -f '{{.State.Running}}' vllm-epyc 2>/dev/null | grep -q true || { echo "FAILED: container exited"; $RT logs --tail 50 vllm-epyc; break; }
187+
curl -sf http://localhost:<port>/health >/dev/null 2>&1 && { echo "HEALTHY"; break; }
188+
sleep 3
189+
done
190+
```
191+
192+
Then validate the OpenAI endpoint is actually accessible:
193+
```bash
194+
curl -sf http://localhost:<port>/v1/chat/completions -H 'Content-Type: application/json' \
195+
-d '{"model":"<model>","messages":[{"role":"user","content":"hi"}],"max_tokens":8}'
196+
```
197+
198+
Resource sanity check (container path): `$RT stats --no-stream vllm-epyc`.
199+
200+
**If the server never becomes healthy or the endpoint does not respond: print
201+
the container/process logs, state the failure, and STOP. Do not retry. Do not
202+
start a debugging loop.**
203+
204+
## Step 8: On success, hand over the endpoint
205+
206+
Print a connection table (model, runtime, port, OMP threads, KV GB, max-model-len,
207+
NUMA pinning) and a ready-to-run example:
208+
```bash
209+
curl -s http://localhost:<port>/v1/chat/completions -H 'Content-Type: application/json' \
210+
-d '{"model":"<model>","messages":[{"role":"user","content":"Hello"}]}'
211+
```
212+
To stop: `$RT rm -f vllm-epyc` (container) or `kill <pid>` (conda).
213+
214+
## Offline (single-instance batch)
215+
216+
For a one-shot offline run instead of a server, replace Steps 6-8 with a single
217+
`vllm bench throughput` (or an offline `LLM.generate`) using the same sized env,
218+
wait for completion, and report the metrics. Same no-retry / no-debug rule.
219+
220+
## Gotchas
221+
222+
See [reference.md](reference.md) for the full list. The load-bearing ones:
223+
224+
- **`--device cpu` was removed** from `vllm serve` in vLLM >= 0.20. The zentorch
225+
plugin auto-selects CPU. Passing it makes `vllm serve` error with
226+
"unrecognized arguments: --device cpu".
227+
- **`TORCHINDUCTOR_FREEZING=1` alone crashes engine-core init** on vLLM 0.22 /
228+
zentorch 2.11 (`AssertionError: expected OutputCode, got function`). It only
229+
works with `VLLM_USE_AOT_COMPILE=0` set alongside it. Never set one without
230+
the other.
231+
- **`--shm-size`**: vLLM needs a large `/dev/shm`; the container default (64MB)
232+
is too small. Use `--shm-size=16g` (in `data/epyc.json`).
233+
- **NUMA**: the default is simple -- one instance on **socket 0's CPUs, no memory
234+
binding** (`--cpuset-cpus` from `cpu_tune.py` for the container; the bind env var
235+
for conda). If socket 0 spans multiple NUMA nodes (NPS2/NPS4), `cpu_tune.py` notes
236+
that optimal per-node binding could add performance; the base recipe doesn't do it.
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
{
2+
"vllm_version": "0.22.0",
3+
"container": {
4+
"image": "amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23",
5+
"runtimes": ["docker", "podman"],
6+
"comment": "Public vLLM + zentorch CPU image on Docker Hub (amdih/zendnn_zentorch) -- no internal-registry access needed. Tags are vllm_v<ver>_zentorch_v<ver>_<os>_<build>; prefer the newest ubuntu22.04 stable. Both docker and podman are supported; the skill prefers docker and falls back to podman.",
7+
"run_flags": [
8+
"--ipc=host",
9+
"--shm-size=16g",
10+
"--network=host"
11+
],
12+
"hf_cache_mount": "-v ~/.cache/huggingface:/root/.cache/huggingface",
13+
"flag_notes": {
14+
"--ipc=host": "vLLM workers use host IPC/shared memory.",
15+
"--shm-size=16g": "vLLM needs a large /dev/shm; default 64MB is not enough.",
16+
"--network=host": "Expose the served port directly. Alternative: -p <port>:<port>.",
17+
"numa": "Default: a single instance uses socket 0's CPUs with NO memory binding (cpu_tune.py emits --cpuset-cpus for the container; conda relies on VLLM_CPU_OMP_THREADS_BIND). On NPS2/NPS4 (multiple NUMA nodes per socket), optimal per-node binding could add performance -- cpu_tune.py notes this; the base recipe does not do it."
18+
}
19+
},
20+
"launch": {
21+
"cli": "vllm serve",
22+
"device_flag_note": "Do NOT pass --device cpu on vLLM >= 0.20; the zentorch plugin auto-selects the CPU platform and `vllm serve` rejects --device. Only pass it if `vllm serve --help` advertises it (older vLLM)."
23+
},
24+
"precision": {
25+
"native": ["bf16", "fp16", "fp32"],
26+
"default": "bfloat16",
27+
"notes": "bf16 is the throughput default on EPYC (Zen). fp32 is slower and for debugging only. WOQ (per-channel/per-group int) is supported by zentorch but out of scope for the base recipe."
28+
},
29+
"model_support": {
30+
"check_script": "scripts/check_model.py",
31+
"policy": "Do NOT blanket-block multimodal. check_model.py reads the model's HF architectures and checks them against vLLM's model registry for the pinned vllm_version. Text and multimodal generation endpoints are allowed; pooling/embedding/reranker and non-LLM architectures are rejected (not chat/completion endpoints).",
32+
"cpu_note": "A vLLM-supported multimodal arch may still hit a GPU-only kernel on CPU; that surfaces at load, where the no-retry rule applies."
33+
},
34+
"default_model": "Qwen/Qwen3-0.6B",
35+
"default_model_notes": "Ungated (Apache-2.0), tiny, fast first success on CPU. For a real workload pick a larger Qwen3 / Llama once the flow is verified.",
36+
"smoke_model": "Qwen/Qwen3-0.6B",
37+
"smoke_model_notes": "Current small Qwen, chat-capable (ships a chat template, so /v1/chat/completions works -- unlike base models such as opt-125m).",
38+
"env_defaults": {
39+
"VLLM_CPU_OMP_THREADS_BIND": "set by cpu_tune.py (physical cores of socket 0)",
40+
"VLLM_CPU_KVCACHE_SPACE": "set by cpu_tune.py (GB)",
41+
"do_not_set": "OMP_NUM_THREADS -- vLLM sets it from the bind list (len of cpu_list); and VLLM_CPU_NUM_OF_RESERVED_CPU -- vLLM has its own default when unset, forcing 0 overrides it."
42+
},
43+
"throughput_flags_optional": {
44+
"TORCHINDUCTOR_FREEZING": "1",
45+
"VLLM_USE_AOT_COMPILE": "0",
46+
"ZENTORCH_WEIGHT_PREPACK": "1",
47+
"gotcha": "VERIFIED on vLLM 0.22.0 / zentorch 2.11.0.1: TORCHINDUCTOR_FREEZING=1 ALONE crashes engine-core init with 'AssertionError: expected OutputCode, got function'. It only works when VLLM_USE_AOT_COMPILE=0 is set alongside it. Never set FREEZING=1 without AOT_COMPILE=0. The base recipe leaves all three unset."
48+
},
49+
"ram": {
50+
"os_headroom_gb": 16,
51+
"comment": "Reserve ~16 GB for OS + framework beyond model weights + KV cache when checking fit."
52+
}
53+
}

0 commit comments

Comments
 (0)