diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index f00ba1e..fd141aa 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -24,6 +24,11 @@ "source": "./skills/magpie-kernel-evaluator", "description": "Performs GPU kernel correctness and performance evaluation and LLM inference benchmarking with Magpie. Analyzes single or multiple kernels (HIP/CUDA/PyTorch), compares kernel implementations, runs vLLM/SGLang benchmarks with profiling and TraceLens, and runs gap analysis on torch traces." }, + { + "name": "serving-llms-on-epyc", + "source": "./skills/serving-llms-on-epyc", + "description": "Serve LLMs on AMD EPYC CPUs with vLLM + zentorch, in a container (Docker/Podman) or conda. Handles CPU detection, runtime/env validation, vLLM model-support and RAM-fit checks, hardware-sized threads/KV, launch, and health verification. Single instance; reports and stops on failure." + }, { "name": "serving-llms-on-instinct", "source": "./skills/serving-llms-on-instinct", diff --git a/.cursor-plugin/marketplace.json b/.cursor-plugin/marketplace.json index f00ba1e..fd141aa 100644 --- a/.cursor-plugin/marketplace.json +++ b/.cursor-plugin/marketplace.json @@ -24,6 +24,11 @@ "source": "./skills/magpie-kernel-evaluator", "description": "Performs GPU kernel correctness and performance evaluation and LLM inference benchmarking with Magpie. Analyzes single or multiple kernels (HIP/CUDA/PyTorch), compares kernel implementations, runs vLLM/SGLang benchmarks with profiling and TraceLens, and runs gap analysis on torch traces." }, + { + "name": "serving-llms-on-epyc", + "source": "./skills/serving-llms-on-epyc", + "description": "Serve LLMs on AMD EPYC CPUs with vLLM + zentorch, in a container (Docker/Podman) or conda. Handles CPU detection, runtime/env validation, vLLM model-support and RAM-fit checks, hardware-sized threads/KV, launch, and health verification. Single instance; reports and stops on failure." 
+ }, { "name": "serving-llms-on-instinct", "source": "./skills/serving-llms-on-instinct", diff --git a/eval/behavioral/tests/test_serving_llms_on_epyc.py b/eval/behavioral/tests/test_serving_llms_on_epyc.py new file mode 100644 index 0000000..55cc1ee --- /dev/null +++ b/eval/behavioral/tests/test_serving_llms_on_epyc.py @@ -0,0 +1,42 @@ +"""Behavioral tests for the `serving-llms-on-epyc` skill. + +Run locally (needs the `claude` CLI authenticated; the agent does not actually +launch a server in the judge's sandbox, so this grades the *plan/behavior*, not +a live endpoint): + + pytest eval/behavioral/tests/test_serving_llms_on_epyc.py -s + +`logs_contains` is deterministic; `should` / `should_not` are graded by an LLM +judge over the captured evidence (tool calls + outputs), so the agent's prose +cannot fake a pass. +""" + +from harness import claude + + +def test_serve_model_on_epyc(): + with claude("sonnet", skill="serving-llms-on-epyc") as agent: + run = agent.prompt( + "Serve Qwen/Qwen3-0.6B on this AMD EPYC box with vLLM and zentorch. " + "Use the default settings." + ) + + # Deterministic check (no judge involved): the skill name must appear in the run logs, i.e. the skill was actually loaded. + run.logs_contains("serving-llms-on-epyc") + + # Positive behavioral expectations (the skill's step-by-step state machine); each is graded by the LLM judge over captured tool calls + outputs. + run.should("Detect the CPU and confirm it is an AMD EPYC host before serving (e.g. runs detect.py)") + run.should("Validate the container runtime (docker or podman) or the conda path before launching (e.g. runs validate.py)") + run.should("Use validate.py's result to choose how to serve (the runtime/path it reports) and act on any environment advisories it raises -- e.g. the tcmalloc/OpenMP LD_PRELOAD perf-library note or the in-image vllm+zentorch check; on the container path with the image not yet pulled there may be none, which is fine") + run.should("Check that vLLM supports the model before serving (e.g. 
runs check_model.py), rather than refusing it just for being multimodal") + run.should("Check that the model fits in host RAM (e.g. runs estimate_memory.py)") + run.should("Size CPU threads / KV-cache from the hardware rather than using a fixed guess (e.g. runs cpu_tune.py)") + run.should("Pin the instance to a single socket with its memory (socket-local KV plus cpuset-mems or numactl membind) and, on a dual-socket host, pick a socket by load -- surfacing cpu_tune's warning if both sockets are busy") + run.should("Present a sized plan and ask the user to confirm before launching the server") + run.should("Plan to launch with 'vllm serve' and poll until /health is healthy") + + # Negative behavioral expectations (the skill's explicit Don'ts); judge-graded like the positives -- the run fails if the evidence shows any of these. + run.should_not("Pass '--device cpu' to vllm serve") + run.should_not("Launch the server before the user has confirmed the plan") + run.should_not("Enter a debugging loop or retry after a launch failure") + run.should_not("Attempt GPU, ROCm, or Instinct serving") diff --git a/skills/serving-llms-on-epyc/SKILL.md b/skills/serving-llms-on-epyc/SKILL.md new file mode 100644 index 0000000..14b97e3 --- /dev/null +++ b/skills/serving-llms-on-epyc/SKILL.md @@ -0,0 +1,253 @@ +--- +name: serving-llms-on-epyc +description: >- + Serves a language model on an AMD EPYC CPU host using vLLM with the zentorch + backend, in a container (Docker or Podman) or a conda env. Use whenever the + user wants to run, serve, deploy, start, host, or launch an LLM on AMD EPYC, + Zen CPU, "vLLM on CPU", "zentorch serving", or "serve a model without a GPU". + Use for "serve Qwen on EPYC", "start a CPU vLLM endpoint", "run an OpenAI + server on my EPYC box", or similar. Handles the full single-instance flow: + detect the CPU (incl. 
EPYC generation), validate the runtime/env, check vLLM + supports the model (via vLLM's registry, not a modality blocklist), check it + fits host RAM, size CPU threads/KV/NUMA from the hardware, confirm the plan with + the user, launch, and poll until the endpoint is responsive. Single instance, + single socket (pinned to one socket + its memory; vLLM scales poorly across + sockets). Does NOT debug failures and does NOT retry -- it reports and stops. Do + not use for GPU/Instinct (use serving-llms-on-instinct) or multi-node. +allowed-tools: Bash, Read +--- + +# Serving LLMs on AMD EPYC (vLLM + zentorch, CPU) + +Bring up a single vLLM OpenAI endpoint on an AMD EPYC host with the zentorch CPU +backend, sized to the hardware. Container-first (Docker or Podman); conda/host +is the fallback. + +**This is single-socket serving:** one instance pinned to one socket and its memory +(vLLM scales poorly across sockets, so we do not span them). On a dual-socket host it +runs on a single socket; the multi-socket answer is **multiple instances (one per +socket)**, which is out of scope for this single-instance recipe. + +Hard rule for this skill: **on any failure, report the cause + logs and STOP. +Do not retry, do not debug.** (Debugging is a separate workflow.) + +**The agent does the serve flow itself** -- pull, configure, launch, poll -- +using the runtime `validate.py` reports. Never hand the user per-serve commands. +Like serving-llms-on-instinct, an accessible container runtime is a one-time +**prerequisite**: if `validate.py` finds none, report its one-time fix (make +docker accessible / install podman / provide a conda env) and stop. Do not +attempt `sudo` or privilege escalation. + +## Data file + +Read `data/epyc.json` directly. It holds the container image, mandatory CPU run +flags, supported precision, the model-support policy, the default model, and the +verified throughput-flag gotcha. Do not hardcode the image tag from memory -- read it. 
+ +## Step 1: Detect the CPU + +```bash +python3 scripts/detect.py # add --host user@box for a remote host +``` + +Returns `cpu_model`, `is_amd_epyc`, `epyc_generation` +(Naples/Rome/Milan/Genoa/Bergamo/Siena/Turin), `zen_arch`, `avx512`, +`logical_cores`, `physical_cores`, `sockets`, `numa_nodes`, `memory_gb`. If +`is_amd_epyc` is `false`, stop: this skill targets AMD EPYC. (Other x86 may work +but is unsupported here.) Carry `epyc_generation` / `avx512` through the later +phases -- e.g. AVX-512 + bf16 land on Zen4+ (Genoa/Turin), and Turin packs up to +128 cores/socket, which the thread-binding in Step 5 sizes from. + +## Step 2: Validate the runtime and environment + +```bash +python3 scripts/validate.py --image <container.image from data/epyc.json> +``` + +Returns `ready`, `runtime` (`docker`, `podman`, or null), `runtime_detail`, +`conda_path_available`, `ram_gb`, and `errors/warnings/advisories`. Pick the path: +- `runtime` is `docker` or `podman` -> container path (Step 6), used verbatim. +- `runtime` null but `conda_path_available: true` -> conda/host path. +- `runtime` null and no conda -> `ready` is false. Report the one-time + onboarding `fix` (make docker accessible / install podman / conda env) and stop. + +Do not proceed if `ready` is `false`. + +## Step 3: Resolve and validate the model + +If the user named no model, use `default_model` from `data/epyc.json` +(`Qwen/Qwen3-0.6B` -- ungated, tiny, fast first success). Otherwise use theirs. + +Check that vLLM actually supports the model (do **not** blanket-block multimodal): + +```bash +python3 scripts/check_model.py --model-id <model> --vllm-version <vllm_version from data/epyc.json> +``` + +- Exit 0 = vLLM serves it as a generation endpoint (`kind` `text` or `multimodal`), + or support is undeterminable (gated/offline) -- proceed; launch confirms. +- Exit 1 = positively unsupported: the architecture is not in vLLM's registry, or + it is a `pooling`/embedding/reranker (not a chat/completion endpoint). Report the + printed `message` and stop. 
+- A `multimodal` model is allowed; a vLLM-supported multimodal arch may still hit a + GPU-only kernel on CPU, which surfaces at load (the no-retry rule then applies). + +**Precision/dtype**: native CPU dtypes are `bf16` (default), `fp16`, `fp32`. Use +`bfloat16` unless the user asks otherwise. + +For gated models (Llama, Gemma) `HF_TOKEN` must be set and the license accepted on +HuggingFace; if not, stop and say so. + +## Step 4: Check it fits host RAM + +RAM is the ceiling on CPU (weights + KV cache both live in RAM). Run on ONE line: + +```bash +python3 scripts/estimate_memory.py --model-id <model> --ram-gb <ram_gb from validate.py> --max-model-len <4096 or user value> --num-prompts <1 or desired concurrency> +``` + +Exit 0 = fits, exit 1 = does not fit. If `fit.fits` is false: **do not launch.** +Tell the user `required_gb` vs `ram_gb` and the printed `fit.action` -- reduce +`--max-model-len` to `fit.suggested_max_model_len` and re-run this fit check (re-sizing before launch is not a launch retry), or use a smaller +model. `--max-model-len` and `--num-prompts` are the two knobs that move KV. +Extra flag: `--weight-gb N` overrides weights if a model has no HF metadata +(rare). KV cache is bf16-only on zentorch CPU (no fp8 KV). + +## Step 5: Size the CPU runtime from the hardware + +```bash +eval "$(python3 scripts/cpu_tune.py)" # or --format json to inspect +``` + +A single instance runs on **one socket, with its memory** (vLLM scales poorly across +sockets). `cpu_tune.py` exports `VLLM_CPU_OMP_THREADS_BIND` (the chosen socket's +physical cores) and `VLLM_CPU_KVCACHE_SPACE` (sized from that **socket's local RAM**, +not whole-system, so the KV pool stays on-socket). It does **not** set +`OMP_NUM_THREADS` (vLLM derives it) or `VLLM_CPU_NUM_OF_RESERVED_CPU` (vLLM's own default). + +Socket choice on a dual-socket host (load-aware): it samples per-socket CPU busy% +(~0.5s) and prefers a free socket -- both free → socket 0; one free → that socket; +**both busy (≥ `--busy-threshold`, default 15%) → it `warning`s and proceeds on the +least-busy socket**. 
`--socket N` forces a choice. Single-socket hosts use socket 0. + +For the chosen socket it also emits the memory-bound pin: `container_cpuset` +(`--cpuset-cpus= --cpuset-mems=`) for the container path, and +`conda_launch_prefix` (`numactl --cpunodebind/--membind`, falling back to `taskset` +CPU-only, or empty-with-note if neither tool exists) for conda. **Surface `warning` +to the user** if set. On NPS2/NPS4 a socket spans multiple NUMA nodes; memory is +bound across them and `nps_note` flags that finer binding could add performance. + +## Step 6: Confirm the plan, then launch (container-first) + +Before launching, present this summary and **wait for the user to confirm** -- do +not launch unprompted. This is the human gate before anything runs: + +| Field | Value | +|---|---| +| Model / kind | `` -- `text` or `multimodal` (from `check_model.py`) | +| Path | container (``, image from `data/epyc.json`) or conda/host | +| Precision | `bfloat16` (or the user's choice) | +| Fit | required `` GB vs `` GB RAM | +| CPU sizing | socket `` (``), bind ``, KV `` GB (socket-local), mem bound to nodes `` | +| Hardware | EPYC `` (``), `` cores, AVX-512 `` | +| Port | `` | + +If `cpu_tune.py` returned a `warning` (e.g. all sockets busy), include it here so the user sees it before confirming. + +Proceed only on a clear "go". If the user declines or wants changes (model, +`--max-model-len`, port), stop and adjust -- do not launch. + +Build the launch from `data/epyc.json`. The CLI is `vllm serve `. +**Do not pass `--device cpu`** on vLLM >= 0.20 -- the zentorch plugin +auto-selects the CPU platform and `vllm serve` rejects the flag. Only add it if +`vllm serve --help` lists it (older vLLM). + +**Container path** (`runtime` from validate.py). The agent runs these itself, +including the pull. 
`RT` is the resolved runtime verbatim: +```bash +RT="<runtime from validate.py>" +$RT pull <image> # agent pulls; do not ask the user to +# <run_flags> = --ipc=host --shm-size=16g --network=host (data/epyc.json); <container_cpuset> = --cpuset-cpus=<cpus> --cpuset-mems=<nodes> (cpu_tune.py) +$RT run -d --name vllm-epyc \ + <run_flags> <hf_cache_mount> \ + <container_cpuset> \ + --env VLLM_CPU_OMP_THREADS_BIND="$VLLM_CPU_OMP_THREADS_BIND" \ + --env VLLM_CPU_KVCACHE_SPACE=$VLLM_CPU_KVCACHE_SPACE \ + --env HF_TOKEN=${HF_TOKEN} \ + <image> \ + vllm serve <model> --dtype bfloat16 --port <port> --max-model-len <max_model_len> +``` + +**Conda/host path** (no container runtime, `conda_path_available` true). `eval`-ing +cpu_tune already exported the env vars; prefix the launch with `conda_launch_prefix` +from cpu_tune so memory is bound to the chosen socket (empty → unpinned, with a note): +```bash +<conda_launch_prefix> vllm serve <model> --dtype bfloat16 --port <port> --max-model-len <max_model_len> & +# e.g. numactl --cpunodebind=0 --membind=0 vllm serve ... +``` + +Optional throughput flags are **opt-in and must move together** (see Gotchas): +`TORCHINDUCTOR_FREEZING=1` + `VLLM_USE_AOT_COMPILE=0` (+ `ZENTORCH_WEIGHT_PREPACK=1`). +The base launch sets none of them. + +## Step 7: Poll until up and responsive + +A 503 while loading is normal. Poll until the server answers, then prove the +chat endpoint works. CPU first-token compile can take a minute or two. + +```bash +# container alive (or process alive for conda) + /health +for i in $(seq 1 120); do + # container path: + $RT inspect -f '{{.State.Running}}' vllm-epyc 2>/dev/null | grep -q true || { echo "FAILED: container exited"; $RT logs --tail 50 vllm-epyc; break; } + curl -sf http://localhost:<port>/health >/dev/null 2>&1 && { echo "HEALTHY"; break; } + sleep 3 +done +``` + +Then validate the OpenAI endpoint is actually accessible: +```bash +curl -sf http://localhost:<port>/v1/chat/completions -H 'Content-Type: application/json' \ + -d '{"model":"<model>","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' +``` + +Resource sanity check (container path): `$RT stats --no-stream vllm-epyc`. 
+ +**If the server never becomes healthy or the endpoint does not respond: print +the container/process logs, state the failure, and STOP. Do not retry. Do not +start a debugging loop.** + +## Step 8: On success, hand over the endpoint + +Print a connection table (model, runtime, port, OMP threads, KV GB, max-model-len, +NUMA pinning) and a ready-to-run example: +```bash +curl -s http://localhost:/v1/chat/completions -H 'Content-Type: application/json' \ + -d '{"model":"","messages":[{"role":"user","content":"Hello"}]}' +``` +To stop: `$RT rm -f vllm-epyc` (container) or `kill ` (conda). + +## Offline (single-instance batch) + +For a one-shot offline run instead of a server, replace Step 6-8 with a single +`vllm bench throughput` (or an offline `LLM.generate`) using the same sized env, +wait for completion, and report the metrics. Same no-retry / no-debug rule. + +## Gotchas + +See [reference.md](reference.md) for the full list. The load-bearing ones: + +- **`--device cpu` was removed** from `vllm serve` in vLLM >= 0.20. The zentorch + plugin auto-selects CPU. Passing it makes `vllm serve` error with + "unrecognized arguments: --device cpu". +- **`TORCHINDUCTOR_FREEZING=1` alone crashes engine-core init** on vLLM 0.23 / + zentorch 2.11 (`AssertionError: expected OutputCode, got function`). It only + works with `VLLM_USE_AOT_COMPILE=0` set alongside it. Never set one without + the other. +- **`--shm-size`**: vLLM needs a large `/dev/shm`; the container default (64MB) + is too small. Use `--shm-size=16g` (in `data/epyc.json`). +- **NUMA / socket**: one instance is pinned to **one socket plus its memory** -- + CPU bind + `--cpuset-mems` (container) / `numactl --membind` (conda), with KV sized + from that socket's local RAM. On a dual-socket host `cpu_tune.py` picks a free socket + by load and `warning`s if both are busy. NPS2/NPS4 (multi-node socket) gets an + `nps_note` that finer per-node binding could add more. 
diff --git a/skills/serving-llms-on-epyc/data/epyc.json b/skills/serving-llms-on-epyc/data/epyc.json new file mode 100644 index 0000000..deb67f4 --- /dev/null +++ b/skills/serving-llms-on-epyc/data/epyc.json @@ -0,0 +1,53 @@ +{ + "vllm_version": "0.22.0", + "container": { + "image": "amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23", + "runtimes": ["docker", "podman"], + "comment": "Public vLLM + zentorch CPU image on Docker Hub (amdih/zendnn_zentorch) -- no internal-registry access needed. Tags are vllm_v_zentorch_v__; prefer the newest ubuntu22.04 stable. Both docker and podman are supported; the skill prefers docker and falls back to podman.", + "run_flags": [ + "--ipc=host", + "--shm-size=16g", + "--network=host" + ], + "hf_cache_mount": "-v ~/.cache/huggingface:/root/.cache/huggingface", + "flag_notes": { + "--ipc=host": "vLLM workers use host IPC/shared memory.", + "--shm-size=16g": "vLLM needs a large /dev/shm; default 64MB is not enough.", + "--network=host": "Expose the served port directly. Alternative: -p :.", + "numa": "A single instance is pinned to ONE socket plus its memory. cpu_tune.py picks a free socket by CPU load on dual-socket hosts (warns if both busy; --socket N forces), sizes KV from that socket's local RAM, and emits --cpuset-cpus + --cpuset-mems (container) or numactl --cpunodebind/--membind (conda). True multi-socket scaling = multiple instances (one per socket), out of scope here." + } + }, + "launch": { + "cli": "vllm serve", + "device_flag_note": "Do NOT pass --device cpu on vLLM >= 0.20; the zentorch plugin auto-selects the CPU platform and `vllm serve` rejects --device. Only pass it if `vllm serve --help` advertises it (older vLLM)." + }, + "precision": { + "native": ["bf16", "fp16", "fp32"], + "default": "bfloat16", + "notes": "bf16 is the throughput default on EPYC (Zen). fp32 is slower and for debugging only. WOQ (per-channel/per-group int) is supported by zentorch but out of scope for the base recipe." 
+ }, + "model_support": { + "check_script": "scripts/check_model.py", + "policy": "Do NOT blanket-block multimodal. check_model.py reads the model's HF architectures and checks them against vLLM's model registry for the pinned vllm_version. Text and multimodal generation endpoints are allowed; pooling/embedding/reranker and non-LLM architectures are rejected (not chat/completion endpoints).", + "cpu_note": "A vLLM-supported multimodal arch may still hit a GPU-only kernel on CPU; that surfaces at load, where the no-retry rule applies." + }, + "default_model": "Qwen/Qwen3-0.6B", + "default_model_notes": "Ungated (Apache-2.0), tiny, fast first success on CPU. For a real workload pick a larger Qwen3 / Llama once the flow is verified.", + "smoke_model": "Qwen/Qwen3-0.6B", + "smoke_model_notes": "Current small Qwen, chat-capable (ships a chat template, so /v1/chat/completions works -- unlike base models such as opt-125m).", + "env_defaults": { + "VLLM_CPU_OMP_THREADS_BIND": "set by cpu_tune.py (physical cores of the chosen socket)", + "VLLM_CPU_KVCACHE_SPACE": "set by cpu_tune.py (GB)", + "do_not_set": "OMP_NUM_THREADS -- vLLM sets it from the bind list (len of cpu_list); and VLLM_CPU_NUM_OF_RESERVED_CPU -- vLLM has its own default when unset, forcing 0 overrides it." + }, + "throughput_flags_optional": { + "TORCHINDUCTOR_FREEZING": "1", + "VLLM_USE_AOT_COMPILE": "0", + "ZENTORCH_WEIGHT_PREPACK": "1", + "gotcha": "VERIFIED on vLLM 0.22.0 / zentorch 2.11.0.1: TORCHINDUCTOR_FREEZING=1 ALONE crashes engine-core init with 'AssertionError: expected OutputCode, got function'. It only works when VLLM_USE_AOT_COMPILE=0 is set alongside it. Never set FREEZING=1 without AOT_COMPILE=0. The base recipe leaves all three unset." + }, + "ram": { + "os_headroom_gb": 16, + "comment": "Reserve ~16 GB for OS + framework beyond model weights + KV cache when checking fit." 
+ } +} diff --git a/skills/serving-llms-on-epyc/reference.md b/skills/serving-llms-on-epyc/reference.md new file mode 100644 index 0000000..4a12ee1 --- /dev/null +++ b/skills/serving-llms-on-epyc/reference.md @@ -0,0 +1,128 @@ +# serving-llms-on-epyc -- Reference + +## Table of Contents +1. [Runtime selection](#runtime-selection) +2. [Container run flags (CPU)](#container-run-flags-cpu) +3. [Precision and modality](#precision-and-modality) +4. [CPU sizing](#cpu-sizing) +5. [Known quirks](#known-quirks) + +--- + +## Runtime selection + +`scripts/validate.py` resolves a runtime the **agent can drive +non-interactively** and reports it as `runtime` (the exact command prefix the +agent uses for `pull`/`run`/`stats`/`logs`). Preference order maximizes +agent-drivability with no human in the loop: + +1. **docker** (direct) -- if `docker ps` exits 0 (user in the `docker` group / + daemon reachable). No sudo. Best. +2. **podman** (rootless) -- no daemon, no sudo. Note: rootless podman needs a + storage backend that supports its overlay; some networked/`/proj` + filesystems reject the overlay `pivot_root` (the run fails even though + `podman info` succeeds). On those hosts use docker or the conda path. +3. **sudo docker** -- only if `sudo -n docker ps` works (passwordless sudo). The + agent can still drive it unattended; `runtime` comes back as `"sudo docker"`. +4. **conda/host** -- requires `import vllm, zentorch` in the active env. + +If docker is installed but **none** of the above is agent-drivable (no docker +group, no passwordless sudo), `validate.py` returns `runtime: null`, +`runtime_agent_drivable: false`, and a **one-time** setup `fix`: +`sudo usermod -aG docker $USER && newgrp docker` (or a NOPASSWD sudoers entry). +This is one-time onboarding, not a per-serve command. After it, every serve is +fully agent-driven. The skill must not degrade into asking the user to paste +docker commands for each serve. + +## Container run flags (CPU) + +From `data/epyc.json`. 
Unlike the Instinct (GPU) skill there are **no** +`/dev/kfd`, `/dev/dri`, `--group-add`, or ROCm flags -- this is pure CPU. + +| Flag | Why | +|---|---| +| `--ipc=host` | vLLM workers use host IPC / shared memory | +| `--shm-size=16g` | vLLM needs a large `/dev/shm`; the 64MB default is too small | +| `--network=host` | expose the served port directly (or use `-p :`) | +| `--cpuset-cpus` / `--cpuset-mems` | pin the container to the chosen socket's physical cores and its NUMA node(s); from `cpu_tune.py` | +| `-v ~/.cache/huggingface:/root/.cache/huggingface` | reuse the host model cache | + +Image: `amdih/zendnn_zentorch:` -- the public vLLM + zentorch CPU image on +Docker Hub (no internal-registry access needed). The exact tag lives in +`data/epyc.json`; read it, never hardcode it. + +## Precision and modality + +| Dtype | EPYC (Zen) | Notes | +|---|---|---| +| BF16 | Native (default) | throughput default | +| FP16 | Native | | +| FP32 | Native | slower; debugging only | +| WOQ int8/int4 | Supported by zentorch | per-channel / per-group; out of scope for the base recipe | + +Modality: not gated by a static blocklist. `scripts/check_model.py` checks the +model's architecture against vLLM's model registry (pinned to `vllm_version`): +text **and** multimodal generation endpoints are allowed; pooling/embedding/ +reranker and non-LLM architectures are rejected (not chat/completion endpoints). +A vLLM-supported multimodal arch may still hit a GPU-only kernel on CPU -- that +surfaces at load, where the no-retry rule applies. + +## CPU sizing + +Policy: a single instance is pinned to **one socket plus its memory** (vLLM scales +poorly across sockets). `scripts/cpu_tune.py` derives: +- **Socket choice** (dual-socket): samples per-socket CPU busy% (~0.5s) and prefers a + free socket -- both free → socket 0; one free → that one; both at/above + `--busy-threshold` (default 15%) → `warning` and proceed on the least-busy. `--socket N` + forces it. Single-socket → socket 0. 
+- `VLLM_CPU_OMP_THREADS_BIND` = the chosen socket's physical cores (SMT dropped). vLLM + sets `OMP_NUM_THREADS` from this, so we don't. +- `VLLM_CPU_KVCACHE_SPACE` (GB) = `min(socket_ram*kv_frac, socket_ram-16)` -- sized from + the **chosen socket's local RAM** so the KV pool stays on-socket (≤32GB → `*0.5`). +- Memory-bound pin: `container_cpuset` = `--cpuset-cpus= --cpuset-mems=`; + `conda_launch_prefix` = `numactl --cpunodebind= --membind=` (falls back to + `taskset` CPU-only, or empty-with-note if neither tool exists). + +Not set: `OMP_NUM_THREADS` (vLLM derives it from the bind) and +`VLLM_CPU_NUM_OF_RESERVED_CPU` (vLLM has its own default when unset). + +When the chosen socket spans multiple NUMA nodes (NPS2/NPS4), `cpu_tune.py` emits an +`nps_note`: memory is bound across the socket's nodes, and finer per-node binding +(one instance per node) could add more. That tuning is out of +scope for the base recipe. + +## Known quirks + +**`--device cpu` removed (vLLM >= 0.20)** +`vllm serve` no longer accepts `--device cpu`; the zentorch plugin auto-selects +the CPU platform. Passing it -> `vllm: error: unrecognized arguments: --device cpu`. +Only pass it if `vllm serve --help` advertises it (older vLLM). + +**`TORCHINDUCTOR_FREEZING=1` + `VLLM_USE_AOT_COMPILE` (VERIFIED)** +On vLLM 0.23.0 / zentorch 2.11.0.2 (EPYC 9454, facebook/opt-125m, 2026-06-23): +`TORCHINDUCTOR_FREEZING=1` alone crashes engine-core init with +`AssertionError: expected OutputCode, got function` (inductor codecache). Adding +`VLLM_USE_AOT_COMPILE=0` fixes it (healthy in ~99s). The only changed variable +between the failing and passing runs was `VLLM_USE_AOT_COMPILE`. Never set +`FREEZING=1` without `VLLM_USE_AOT_COMPILE=0`. The base recipe leaves both unset. + +**`/dev/shm` too small** +Without `--shm-size=16g` (or `--ipc=host`), vLLM workers fail to allocate shared +memory at startup. + +**RAM is the ceiling, not VRAM** +CPU serving keeps weights + KV cache in system RAM. 
`estimate_memory.py` checks +`weights + KV(max_model_len x num_prompts) + reserve <= RAM` (reserve default +16 GB, `--reserve-gb`). It exits 1 when it does not fit and prints +`suggested_max_model_len` + an `action` to reduce and retry. Weights come from +HF file sizes (`.safetensors` or legacy `.bin`); `--weight-gb` overrides when a +model has no metadata. KV cache is bf16-only on zentorch CPU (no fp8 KV), so the estimate always uses 2 bytes/element. + +**NUMA cross-node traffic** +On a 2-socket EPYC, an unpinned instance spreads threads + memory across both sockets +and pays cross-socket latency. `cpu_tune.py` keeps one instance on **one socket plus +its memory**: CPU bind (`VLLM_CPU_OMP_THREADS_BIND` + `--cpuset-cpus`), memory bind +(`--cpuset-mems` / `numactl --membind`), and KV sized from that socket's local RAM so +the KV pool never lands on the other socket. The socket is chosen by load (free socket +preferred; warns if both busy). True multi-socket throughput = **multiple instances** +(one per socket) -- out of scope for this single-instance recipe. diff --git a/skills/serving-llms-on-epyc/scripts/check_model.py b/skills/serving-llms-on-epyc/scripts/check_model.py new file mode 100644 index 0000000..534bfea --- /dev/null +++ b/skills/serving-llms-on-epyc/scripts/check_model.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +""" +Does vLLM support this model's architecture? -- so the skill checks real vLLM +support instead of blanket-blocking multimodal. + +Reads the model's `architectures` from its HF config.json, then checks them +against vLLM's model registry for the pinned vLLM version. The registry comes +from the version-pinned registry.py on GitHub (no vLLM install needed); if that +is unreachable it falls back to an importable local `vllm`. Generation endpoints +(text + multimodal) are supported; pooling/embedding/reranker and non-LLM +architectures are not chat/completion endpoints and are rejected. 
+ + check_model.py --model-id Qwen/Qwen3-0.6B + check_model.py --model-id <org/model> --vllm-version 0.22.0 + +Exit 0 if vLLM serves it as a generation endpoint (or support is undeterminable +-- launch confirms), 1 if it is positively unsupported. JSON to stdout. +Env: HF_TOKEN for gated models. +""" + +import argparse +import json +import os +import re +import subprocess +import sys +import urllib.request +import urllib.error + +HF = "https://huggingface.co" +GH_RAW = "https://raw.githubusercontent.com/vllm-project/vllm" +REG_PATH = "vllm/model_executor/models/registry.py" + +# registry.py dict name -> kind we care about +_SECTIONS = { + "_TEXT_GENERATION_MODELS": "text", + "_TRANSFORMERS_BACKEND_MODELS": "text", + "_MULTIMODAL_MODELS": "multimodal", + "_EMBEDDING_MODELS": "pooling", + "_POOLING_MODELS": "pooling", + "_CROSS_ENCODER_MODELS": "pooling", +} + + +def _get(url, token=None): + """GET text from a URL (Bearer auth when `token` is set). Returns (text, None) on success, (None, error_message) on any failure.""" + headers = {"User-Agent": "check-model/1"} + if token: + headers["Authorization"] = f"Bearer {token}" + try: + with urllib.request.urlopen(urllib.request.Request(url, headers=headers), timeout=30) as r: + return r.read().decode("utf-8"), None + except urllib.error.HTTPError as e: + return None, {401: "not found or gated (set HF_TOKEN)", + 403: "access denied -- accept the model license on HuggingFace", + 404: "not found"}.get(e.code, f"HTTP {e.code}") + except Exception as e: + return None, str(e) + + +def model_architectures(model, rev, token): + """Architectures declared in the model's HF config.json. Returns (list, error).""" + text, err = _get(f"{HF}/{model}/resolve/{rev}/config.json", token) + if text is None: + return None, err + try: + cfg = json.loads(text) + except ValueError: + return None, "config.json is not valid JSON" + return cfg.get("architectures") or [], None + + +def registry_from_github(version): + """Parse vLLM's registry.py at tag v<version>. 
Returns ({arch: kind}, source) or (None, err).""" + src, err = _get(f"{GH_RAW}/v{version}/{REG_PATH}") + if src is None: + return None, err + reg, cur = {}, None + for line in src.splitlines(): + s = line.strip() + sec = re.match(r"^(_[A-Z0-9_]+_MODELS)\s*(?::[^=]+)?=\s*\{", s) + if sec: + cur = _SECTIONS.get(sec.group(1)) + continue + if s.startswith("}"): + cur = None + continue + if cur: + key = re.match(r'^"([A-Za-z0-9_]+)"\s*:', s) + if key: + reg[key.group(1)] = cur + return (reg or None), (f"github:v{version}" if reg else "registry.py had no parseable archs") + + +def registry_from_local(): + """Coarse fallback: ask a local `vllm`, importable by the interpreter running this script, for its archs (text vs multimodal). Returns ({arch: kind}, source), or (None, None) if vllm is not importable, the probe cannot be spawned or times out, or its output is not JSON.""" + snippet = ( + "import json;" + "from vllm import ModelRegistry as R;" + "a=list(R.get_supported_archs());" + "mm=set(x for x in a if R.is_multimodal_model([x]));" + "print(json.dumps({'archs':a,'mm':list(mm)}))" + ) + try: # sys.executable rather than a bare "python", which is missing from PATH on python3-only hosts + r = subprocess.run([sys.executable, "-c", snippet], stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, timeout=60) + if r.returncode != 0 or not r.stdout.strip(): + return None, None + d = json.loads(r.stdout) + except (OSError, subprocess.SubprocessError, ValueError): # spawn failure / timeout / bad JSON -> caller reports "undetermined" + return None, None + mm = set(d.get("mm", [])) + return {a: ("multimodal" if a in mm else "text") for a in d.get("archs", [])}, "vllm-import" + + +def main(): + p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + p.add_argument("--model-id", required=True) + p.add_argument("--revision", default="main") + p.add_argument("--vllm-version", default="0.22.0", help="pin the registry to this vLLM version (from data/epyc.json)") + a = p.parse_args() + token = os.environ.get("HF_TOKEN", "") + + archs, aerr = model_architectures(a.model_id, a.revision, token) + if not archs: + # Cannot read the config (gated/offline) -- do not positively block; the + # gating check and launch will catch real problems. 
+ print(json.dumps({"model_id": a.model_id, "supported": None, "kind": "undetermined", + "message": f"Could not read architectures ({aerr or 'none declared'}); support unverified. " + "If gated, set HF_TOKEN. This does not bypass the gating/launch checks."}, indent=2)) + sys.exit(0) + + reg, source = registry_from_github(a.vllm_version) + if reg is None: + reg, source = registry_from_local() + if reg is None: + print(json.dumps({"model_id": a.model_id, "architectures": archs, "supported": None, + "kind": "undetermined", + "message": "Could not load vLLM's model registry (no network and no importable vllm); " + "support unverified. vLLM confirms support at load (no-retry rule applies)."}, indent=2)) + sys.exit(0) + + kinds = [reg.get(arch) for arch in archs] + known = [k for k in kinds if k] + out = {"model_id": a.model_id, "architectures": archs, "registry_source": source} + + if not known: + out.update(supported=False, kind="unsupported", + message=f"vLLM has no registry entry for {archs}; it cannot serve this model on any backend. Stop.") + print(json.dumps(out, indent=2)) + sys.exit(1) + + if any(k in ("text", "multimodal") for k in known): + kind = "multimodal" if "multimodal" in known else "text" + msg = f"vLLM supports {archs} as a {kind} generation endpoint." + if kind == "multimodal": + msg += " A multimodal arch may still hit a GPU-only kernel on CPU; that surfaces at load (no-retry rule applies)." + out.update(supported=True, kind=kind, message=msg) + print(json.dumps(out, indent=2)) + sys.exit(0) + + out.update(supported=False, kind="pooling", + message=f"{archs} is a pooling/embedding/reranker model in vLLM, not a chat/completion endpoint. 
Stop.") + print(json.dumps(out, indent=2)) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/skills/serving-llms-on-epyc/scripts/cpu_tune.py b/skills/serving-llms-on-epyc/scripts/cpu_tune.py new file mode 100644 index 0000000..bf84acc --- /dev/null +++ b/skills/serving-llms-on-epyc/scripts/cpu_tune.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +""" +Derive vLLM-on-CPU runtime knobs from the host, for a single instance pinned to +ONE socket (with its memory). Read-only. + +Socket choice (dual-socket hosts): vLLM scales poorly across sockets, so we run on +one. We sample per-socket CPU load (~0.5s via /proc/stat) and prefer a free socket: + - both sockets below --busy-threshold -> socket 0 (deterministic; both free) + - exactly one below the threshold -> that socket + - both at/above the threshold -> WARN and proceed on the least-busy one + - --socket N -> force a socket, skip the load check +A single-socket host just uses socket 0. (NPS2/NPS4 -> a socket spans multiple +NUMA nodes; we bind memory to all of the chosen socket's nodes.) + +Emits two env vars: + - VLLM_CPU_OMP_THREADS_BIND : physical cores of the chosen socket (SMT siblings + dropped). vLLM sets OMP_NUM_THREADS itself (= len(cores)), so we don't. + - VLLM_CPU_KVCACHE_SPACE : KV-cache RAM (GB), sized from the chosen socket's + LOCAL RAM (not whole-system) so the pool stays on-socket. + +And a memory-bound pin for the chosen socket: + - container : --cpuset-cpus= --cpuset-mems= + - conda : numactl --cpunodebind= --membind= (preferred) + falls back to taskset -c (CPU-only, no mem bind) + if neither exists, reported -- launch proceeds unpinned. + +Not set: OMP_NUM_THREADS (vLLM derives it) and VLLM_CPU_NUM_OF_RESERVED_CPU +(vLLM has its own default when unset).
+ +Usage: + python3 scripts/cpu_tune.py # export lines for `eval` + python3 scripts/cpu_tune.py --format json # machine-readable + python3 scripts/cpu_tune.py --socket 1 # force socket 1 + python3 scripts/cpu_tune.py --busy-threshold 70 # "free" means < 70% busy +""" + +import argparse +import json +import re +import shutil +import subprocess +import sys +import time + +OS_HEADROOM_GB = 16 + + +def _sh(cmd): + try: + r = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, timeout=15) + return r.stdout + except Exception: + return "" + + +def _lscpu_int(out, label, default): + m = re.search(rf"^{re.escape(label)}:\s*(\d+)", out, re.MULTILINE) + return int(m.group(1)) if m else default + + +def _ranges(items): + """Compress a sorted int list to a range string: [0,1,2,5] -> '0-2,5'.""" + items = sorted(items) + if not items: + return "" + out, start, prev = [], items[0], items[0] + for c in items[1:]: + if c == prev + 1: + prev = c + continue + out.append(f"{start}-{prev}" if start != prev else f"{start}") + start = prev = c + out.append(f"{start}-{prev}" if start != prev else f"{start}") + return ",".join(out) + + +def topology(): + """Per-socket layout from `lscpu -p`. Returns {sid: {phys, all, nodes}} where + phys = one CPU per core (SMT dropped), all = every logical CPU, nodes = set of + NUMA node ids on that socket. 
def node_ram_gb(node, sysfs_root="/sys/devices/system/node"):
    """Total RAM of one NUMA node, in whole GiB.

    Args:
        node: NUMA node id; ``<sysfs_root>/node<node>/meminfo`` is read.
        sysfs_root: Directory holding the per-node sysfs entries. Only needs
            overriding in tests.

    Returns:
        The node's MemTotal converted from kB to GiB (integer division), or 0
        when the file is missing, unreadable or has no MemTotal line.
    """
    try:
        with open(f"{sysfs_root}/node{node}/meminfo") as fh:
            text = fh.read()
    except OSError:
        return 0
    # Per-node meminfo lines look like "Node 0 MemTotal:  131825968 kB", so the
    # first number on the line is the node id. Anchor on the label to capture
    # the size rather than the id.
    m = re.search(r"MemTotal:\s*(\d+)", text)
    return (int(m.group(1)) // (1024 * 1024)) if m else 0
p.parse_args() + + socks, _ = topology() + if not socks: # lscpu -p unavailable; degrade to a no-pin single instance + print('export VLLM_CPU_KVCACHE_SPACE=4' if args.format == "env" + else json.dumps({"error": "no topology from lscpu -p"})) + return + + sids = sorted(socks) + busy = {s: socket_busy_pct(socks[s]["all"]) for s in sids} + + warn = "" + if args.socket is not None and args.socket in socks: + chosen, reason = args.socket, "forced via --socket" + elif len(sids) == 1: + chosen, reason = sids[0], "single socket" + else: + free = [s for s in sids if busy[s] < args.busy_threshold] + if len(free) >= 2: + chosen, reason = sids[0], f"both sockets free (<{args.busy_threshold}% busy) -> socket 0" + elif len(free) == 1: + chosen, reason = free[0], f"only free socket (<{args.busy_threshold}% busy)" + else: + chosen = min(sids, key=lambda s: busy[s]) + reason = f"all sockets busy (>={args.busy_threshold}%) -> least-busy" + warn = (f"all {len(sids)} sockets are busy (>= {args.busy_threshold}%): " + f"{ {s: busy[s] for s in sids} }. Proceeding on the least-busy socket " + f"{chosen}; performance may suffer. 
Pass --socket N to override.") + + sock = socks[chosen] + bind = _ranges(sock["phys"]) + nodes = sorted(sock["nodes"]) + nodes_str = _ranges(nodes) + + sock_ram = sum(node_ram_gb(n) for n in nodes) + if sock_ram <= 0: # sysfs unavailable: fall back to total/sockets + m = re.search(r"MemTotal:\s*(\d+)", _sh("grep MemTotal /proc/meminfo")) + total = int(m.group(1)) // (1024 * 1024) if m else 0 + sock_ram = total // max(1, len(sids)) + if sock_ram <= 2 * OS_HEADROOM_GB: + kv = max(1, int(sock_ram * 0.5)) + else: + kv = max(1, min(int(sock_ram * args.kv_frac), sock_ram - OS_HEADROOM_GB)) + + container_cpuset = f"--cpuset-cpus={bind} --cpuset-mems={nodes_str}" + if shutil.which("numactl"): + conda_prefix = f"numactl --cpunodebind={nodes_str} --membind={nodes_str}" + conda_pin = "numactl (cpu + memory bound to the socket's nodes)" + elif shutil.which("taskset"): + conda_prefix = f"taskset -c {bind}" + conda_pin = "taskset (CPU-only; memory NOT node-bound -- numactl not found)" + else: + conda_prefix = "" + conda_pin = "none (no numactl/taskset; launching unpinned -- install numactl for memory binding)" + + nps_note = "" + if len(nodes) > 1: + nps_note = (f"socket {chosen} spans {len(nodes)} NUMA nodes (NPS{len(nodes)}); memory is " + f"bound across nodes {nodes_str}. 
Finer per-node binding could add performance.") + + result = { + "chosen_socket": chosen, + "socket_choice_reason": reason, + "sockets": len(sids), + "socket_busy_pct": busy, + "busy_threshold": args.busy_threshold, + "vllm_cpu_omp_threads_bind": bind, + "vllm_cpu_kvcache_space_gb": kv, + "socket_ram_gb": sock_ram, + "numa_nodes_on_socket": nodes, + "container_cpuset": container_cpuset, + "conda_launch_prefix": conda_prefix, + "conda_pin_tool": conda_pin, + "warning": warn, + "nps_note": nps_note, + } + + if args.format == "json": + print(json.dumps(result, indent=2)) + return + + print(f'export VLLM_CPU_OMP_THREADS_BIND="{bind}"') + print(f"export VLLM_CPU_KVCACHE_SPACE={kv}") + print(f"# socket {chosen} ({reason}); per-socket busy%: {busy}") + print(f"# container: {container_cpuset}") + print(f"# conda: {conda_prefix or '(unpinned)'} vllm serve ... [{conda_pin}]") + if warn: + print(f"# WARNING: {warn}") + if nps_note: + print(f"# NOTE: {nps_note}") + + +if __name__ == "__main__": + main() diff --git a/skills/serving-llms-on-epyc/scripts/detect.py b/skills/serving-llms-on-epyc/scripts/detect.py new file mode 100644 index 0000000..c0c3340 --- /dev/null +++ b/skills/serving-llms-on-epyc/scripts/detect.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +""" +Detect AMD EPYC CPU hardware for vLLM + zentorch serving. + +Usage: + python3 scripts/detect.py + python3 scripts/detect.py --host user@hostname + +Output: JSON with cpu_model, is_amd_epyc, logical_cores, physical_cores, +sockets, threads_per_core, numa_nodes, memory_gb, epyc_generation +(Naples/Rome/Milan/Genoa/Bergamo/Siena/Turin), zen_arch, and avx512. Exits 0 on +success, 1 if no CPU info could be read. 
+ +Env vars (used when --host is not given): + ZEN_SSH_HOST, ZEN_SSH_USER, ZEN_SSH_PORT +""" + +import argparse +import json +import os +import re +import subprocess +import sys + + +def _is_local(host): + return not host or host in ("local", "localhost", "127.0.0.1") + + +def _run(cmd, host, user, port, timeout=20): + if _is_local(host): + r = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, timeout=timeout) + else: + ssh_target = f"{user}@{host}" if user else host + ssh = ["ssh", "-o", "StrictHostKeyChecking=accept-new", + "-o", "ConnectTimeout=15", "-o", "BatchMode=yes", + "-o", "LogLevel=ERROR", "-p", str(port), ssh_target, cmd] + r = subprocess.run(ssh, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + text=True, timeout=timeout) + return r.returncode, r.stdout, r.stderr + + +def _lscpu_field(lscpu_out, label): + m = re.search(rf"^{re.escape(label)}:\s*(.+)$", lscpu_out, re.MULTILINE) + return m.group(1).strip() if m else "" + + +def _epyc_generation(model): + """Map an AMD EPYC model name to (generation, zen_arch). + + EPYC numbering encodes the generation: 7xx1=Naples (Zen1), 7xx2=Rome (Zen2), + 7xx3=Milan (Zen3), 8xx4=Siena (Zen4c), 97x4=Bergamo (Zen4c), 9xx4=Genoa (Zen4), + 9xx5=Turin (Zen5). The agent should carry this through every phase (e.g. 
AVX-512 + + bf16 land on Zen4+, Turin has up to 128 cores per socket -> thread binding).""" + m = re.search(r"EPYC\s+(\d{4})", model.upper()) + if not m: + return "unknown", "unknown" + num = m.group(1) + first, last = num[0], num[3] + if first == "7": + return {"1": ("Naples", "Zen1"), "2": ("Rome", "Zen2"), + "3": ("Milan", "Zen3")}.get(last, ("unknown", "unknown")) + if first == "8" and last == "4": + return "Siena", "Zen4c" + if first == "9": + if num.startswith("97") and last == "4": + return "Bergamo", "Zen4c" + if last == "4": + return "Genoa", "Zen4" + if last == "5": + return "Turin", "Zen5" + return "unknown", "unknown" + + +def main(): + p = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + p.add_argument("--host", default="", help="[user@]host (default: local or ZEN_SSH_HOST)") + p.add_argument("--user", default="") + p.add_argument("--port", type=int, default=0) + args = p.parse_args() + + host, user = args.host, args.user + if "@" in host: + user, host = host.split("@", 1) + host = host or os.environ.get("ZEN_SSH_HOST", "") + user = user or os.environ.get("ZEN_SSH_USER", "") + port = args.port or int(os.environ.get("ZEN_SSH_PORT", "22")) + + rc, lscpu_out, err = _run("lscpu", host, user, port) + if rc != 0 or not lscpu_out: + print(json.dumps({"error": "lscpu failed", + "detail": err.strip() or f"exit {rc}"})) + sys.exit(1) + + model = _lscpu_field(lscpu_out, "Model name") or "unknown" + vendor = _lscpu_field(lscpu_out, "Vendor ID") + + def _int(label, default=0): + v = _lscpu_field(lscpu_out, label) + try: + return int(v) + except ValueError: + return default + + sockets = _int("Socket(s)", 1) + cores_per_socket = _int("Core(s) per socket", 0) + threads_per_core = _int("Thread(s) per core", 1) or 1 + numa_nodes = _int("NUMA node(s)", 1) + + rc, nproc_out, _ = _run("nproc --all", host, user, port) + try: + logical = int(nproc_out.strip()) + except (ValueError, AttributeError): + logical = sockets * 
cores_per_socket * threads_per_core + + physical = sockets * cores_per_socket if cores_per_socket else logical // threads_per_core + + rc, mem_out, _ = _run("grep MemTotal /proc/meminfo", host, user, port) + mem_kb = 0 + m = re.search(r"(\d+)", mem_out or "") + if m: + mem_kb = int(m.group(1)) + memory_gb = mem_kb // (1024 * 1024) + + is_epyc = vendor == "AuthenticAMD" and "EPYC" in model.upper() + generation, zen_arch = _epyc_generation(model) + avx512 = "avx512f" in _lscpu_field(lscpu_out, "Flags").split() + + print(json.dumps({ + "cpu_model": model, + "vendor": vendor, + "is_amd_epyc": is_epyc, + "epyc_generation": generation, + "zen_arch": zen_arch, + "avx512": avx512, + "logical_cores": logical, + "physical_cores": physical, + "sockets": sockets, + "threads_per_core": threads_per_core, + "numa_nodes": numa_nodes, + "memory_gb": memory_gb, + "target": "local" if _is_local(host) else host, + }, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/skills/serving-llms-on-epyc/scripts/estimate_memory.py b/skills/serving-llms-on-epyc/scripts/estimate_memory.py new file mode 100644 index 0000000..75c50ad --- /dev/null +++ b/skills/serving-llms-on-epyc/scripts/estimate_memory.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Does a HuggingFace model fit in host RAM for CPU serving at a given context? + +No download -- reads HF metadata over HTTP. Answers one question: + weights + KV(max_model_len x num_prompts) + reserve <= RAM ? +If not, prints the largest max_model_len that would fit, so you reduce it and +retry. Exit 0 = fits, 1 = does not fit (or error). + + estimate_memory.py --model-id Qwen/Qwen3-8B --ram-gb 755 --max-model-len 4096 --num-prompts 8 + +Three sub-problems, one function each: weight_gb(), kv_bytes_per_token(), fit(). +Env: HF_TOKEN for gated models. --weight-gb overrides weights if metadata is missing. 
def weight_gb(model, rev, token):
    """(1) Weight RAM, estimated from the checkpoint's file sizes on the Hub.

    The repo's top-level file tree is listed and weight-file sizes are summed.
    ``.safetensors`` files are preferred; legacy ``.bin`` files (with "model"
    in the path) are counted only when no safetensors are present, because a
    repo may publish the same weights in both formats and adding them together
    would double the estimate.

    Args:
        model: HF repo id.
        rev: Revision (branch, tag or commit) to list.
        token: HF access token for gated repos; falsy for anonymous.

    Returns:
        ``(gib, None)`` with the size rounded to two decimals, or
        ``(None, error_message)`` when the tree could not be listed or holds
        no weight files.
    """
    tree, err = _get(f"{HF}/api/models/{model}/tree/{rev}", token)
    if not isinstance(tree, list):
        return None, err or "no file tree"

    safetensors_bytes = 0
    legacy_bin_bytes = 0
    for entry in tree:
        if not isinstance(entry, dict) or entry.get("type") != "file":
            continue
        path = entry.get("path", "")
        size = entry.get("size") or 0
        if path.endswith(".safetensors"):
            safetensors_bytes += size
        elif path.endswith(".bin") and "model" in path.lower():
            legacy_bin_bytes += size

    # Count one format only: safetensors when available, else legacy .bin.
    total = safetensors_bytes or legacy_bin_bytes
    if total == 0:
        return None, "no weight files (.safetensors/.bin) found -- pass --weight-gb"
    return round(total / 2**30, 2), None
def fit(weight, kv_per_tok, ctx, prompts, ram, reserve):
    """(3) Verdict, plus the largest max_model_len that would fit if it doesn't.

    Args:
        weight: Weight RAM in GiB.
        kv_per_tok: KV-cache bytes per token (0 if unknown).
        ctx: Requested max_model_len in tokens.
        prompts: Number of concurrent sequences.
        ram: Available RAM in GiB.
        reserve: RAM held back for the OS and runtime, in GiB.

    Returns:
        A dict with the inputs, ``kv_cache_gb``, ``required_gb`` and ``fits``.
        When it does not fit, it also carries ``suggested_max_model_len``
        (rounded down to a multiple of 256, 0 if none) and an ``action`` hint.
    """
    kv_gb = kv_per_tok * ctx * prompts / 2**30
    required = round(weight + kv_gb + reserve, 2)
    out = {"max_model_len": ctx, "num_prompts": prompts, "weight_gb": weight,
           "kv_cache_gb": round(kv_gb, 2), "reserve_gb": reserve,
           "required_gb": required, "ram_gb": ram, "fits": required <= ram}
    if out["fits"]:
        return out

    # Bytes left for KV cache once weights and the reserve are accounted for.
    budget = (ram - weight - reserve) * 2**30
    # KV bytes consumed per unit of context across all concurrent sequences.
    # Guarding on > 0 avoids dividing by zero when prompts or kv_per_tok is 0.
    per_ctx_token = kv_per_tok * prompts
    best = int(budget / per_ctx_token) // 256 * 256 if per_ctx_token > 0 and budget > 0 else 0
    out["suggested_max_model_len"] = max(0, best)
    if best >= 256:
        out["action"] = f"reduce --max-model-len to {best} or less and retry"
    elif budget <= 0:
        out["action"] = "weights alone exceed RAM -- use a smaller model"
    else:
        # Weights + reserve fit, but the remainder cannot hold even the
        # smallest suggested context, so shrinking max_model_len is not enough.
        out["action"] = ("fewer than 256 tokens of KV cache fit after weights + reserve -- "
                         "lower --num-prompts or use a smaller model")
    return out
unavailable") + a = p.parse_args() + token = os.environ.get("HF_TOKEN", "") + + w = a.weight_gb if a.weight_gb > 0 else None + if w is None: + w, err = weight_gb(a.model_id, a.revision, token) + if w is None: + print(json.dumps({"error": err, "model_id": a.model_id})) + sys.exit(1) + + cfg = get_config(a.model_id, a.revision, token) + kv_per_tok = kv_bytes_per_token(cfg) + max_seq = cfg.get("max_position_embeddings") if cfg else None + ctx = min(a.max_model_len, max_seq) if max_seq else a.max_model_len + + out = {"model_id": a.model_id, "weight_gb": w, "kv_dtype": "bf16", + "kv_bytes_per_token": kv_per_tok, "model_max_seq_len": max_seq} + if a.ram_gb > 0: + out["fit"] = fit(w, kv_per_tok, ctx, a.num_prompts, a.ram_gb, a.reserve_gb) + + print(json.dumps(out, indent=2)) + sys.exit(0 if out.get("fit", {"fits": True})["fits"] else 1) + + +if __name__ == "__main__": + main() diff --git a/skills/serving-llms-on-epyc/scripts/validate.py b/skills/serving-llms-on-epyc/scripts/validate.py new file mode 100644 index 0000000..95fd37a --- /dev/null +++ b/skills/serving-llms-on-epyc/scripts/validate.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +""" +Validate the environment before serving vLLM + zentorch on an EPYC CPU host. + +Checks a container runtime (docker or podman), whether the vLLM+zentorch image +is present (and, if already pulled, that `import vllm, zentorch` works inside it), +a conda/host fallback (`import vllm, zentorch`), the host perf libraries +(tcmalloc / OpenMP via LD_PRELOAD), HF_TOKEN, and RAM. Each issue is error +(blocks launch) / warning (degrades) / advisory (info). + +Usage: + python3 scripts/validate.py + python3 scripts/validate.py --image amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23 + +Exits 0 if no error-severity issues remain, 1 otherwise. JSON to stdout. 
def _detect_runtime():
    """Find a usable container runtime, trying docker first and then podman.

    docker counts as usable when `docker ps -q` exits 0; podman when
    `podman info` exits 0. Nothing here attempts to raise privileges: the
    runtimes are only probed and the outcome reported.

    Returns:
        ``(runtime, detail)`` for the first runtime that responds, otherwise
        ``(None, reason)`` where ``reason`` is the first line (capped at 120
        characters) of the most recent failure.
    """
    reason = "docker not installed"
    if shutil.which("docker"):
        code, _, stderr = _sh("docker ps -q")
        if code == 0:
            return "docker", "docker reachable"
        reason = (stderr or "docker ps failed").splitlines()[0][:120]

    if shutil.which("podman"):
        code, _, stderr = _sh("podman info --format '{{.Host.Arch}}'")
        if code == 0:
            return "podman", "podman available (rootless)"
        # Keep the docker-side reason unless podman gave its own message.
        if stderr:
            reason = stderr.splitlines()[0][:120]

    return None, reason
+ runtime, detail = _detect_runtime() + conda_ok = _sh('python -c "import vllm, zentorch"')[0] == 0 + + if runtime is None: + if conda_ok: + issues.append({"check": "container_runtime", "severity": "warning", + "message": f"No accessible container runtime ({detail}); using the conda/host path.", + "fix": "For the container path, make docker accessible or install rootless podman (see fix below)."}) + else: + issues.append({"check": "container_runtime", "severity": "error", + "message": f"No accessible container runtime ({detail}) and no host vllm+zentorch.", + "fix": "One-time onboarding: add your user to the docker group " + "(sudo usermod -aG docker $USER, then re-login) or start the daemon; " + "OR install rootless podman; OR activate a conda env with vllm+zentorch."}) + + # 2. Image present + (only if already pulled) zentorch inside it. The in-image + # import check runs ONLY when the image is local, so it never triggers a + # multi-GB pull just to validate. + if runtime and args.image: + repo = args.image.rsplit(":", 1)[0] # strip the tag, keep any host:port/repo + rc, out, _ = _sh(f"{runtime} images {repo} --format '{{{{.Repository}}}}:{{{{.Tag}}}}'") + if args.image not in (out or ""): + issues.append({"check": "image", "severity": "advisory", + "message": f"Image {args.image} not pulled yet; first launch will download it (in-image zentorch check deferred to launch).", + "fix": f"{runtime} pull {args.image}"}) + else: + rc, ver, err = _sh(f'{runtime} run --rm {args.image} ' + f'python -c "import vllm,zentorch;print(vllm.__version__,zentorch.__version__)"', timeout=90) + if rc == 0 and ver: + issues.append({"check": "image_stack", "severity": "advisory", + "message": f"Image has vllm+zentorch ({ver})."}) + else: + issues.append({"check": "image_stack", "severity": "warning", + "message": f"Image {args.image} is present but `import vllm, zentorch` failed inside it: {(err or 'unknown')[:120]}", + "fix": "Use an image tag that bundles the zentorch plugin (see 
data/epyc.json)."}) + + # 3. Host vllm+zentorch (for the conda path) + if conda_ok: + _, ver, _ = _sh('python -c "import vllm,zentorch;print(vllm.__version__,zentorch.__version__)"') + issues.append({"check": "host_stack", "severity": "advisory", + "message": f"Host vllm+zentorch importable ({ver}); conda path available."}) + elif runtime: + issues.append({"check": "host_stack", "severity": "advisory", + "message": "Host `import vllm, zentorch` not available; use the container path."}) + + # 4. HF_TOKEN + if not os.environ.get("HF_TOKEN"): + issues.append({"check": "hf_token", "severity": "advisory", + "message": "HF_TOKEN not set. Required for gated models (Llama, Gemma); not needed for Qwen3.", + "fix": "export HF_TOKEN=hf_..."}) + + # 5. RAM + rc, out, _ = _sh("grep MemTotal /proc/meminfo | awk '{print int($2/1024/1024)}'") + try: + ram_gb = int(out) + except ValueError: + ram_gb = 0 + if 0 < ram_gb < 32: + issues.append({"check": "ram", "severity": "warning", + "message": f"Only {ram_gb} GB RAM. CPU serving keeps weights + KV cache in RAM; large models may not fit.", + "fix": "Use a small model or a host with more RAM."}) + + # 6. Perf libraries for the host/conda path (advisory). vLLM CPU wants + # libtcmalloc + libiomp (OpenMP) preloaded and warns otherwise. The + # container image sets these itself, so only check the host when the + # conda/host path is viable. 
+ if conda_ok: + ld = os.environ.get("LD_PRELOAD", "") + missing = [lib for lib in ("libtcmalloc", "libiomp") if lib not in ld] + if missing: + issues.append({"check": "perf_libs", "severity": "advisory", + "message": f"LD_PRELOAD is missing {', '.join(missing)}; vLLM CPU warns about this and throughput suffers without them (host/conda path).", + "fix": "export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$CONDA_PREFIX/lib/libiomp5.so:$LD_PRELOAD"}) + + errors = [i for i in issues if i["severity"] == "error"] + result = { + "ready": len(errors) == 0, + "runtime": runtime, + "runtime_detail": detail, + "conda_path_available": conda_ok, + "ram_gb": ram_gb, + "errors": errors, + "warnings": [i for i in issues if i["severity"] == "warning"], + "advisories": [i for i in issues if i["severity"] == "advisory"], + } + print(json.dumps(result, indent=2)) + sys.exit(0 if len(errors) == 0 else 1) + + +if __name__ == "__main__": + main() diff --git a/skills/serving-llms-on-epyc/skill-card.md b/skills/serving-llms-on-epyc/skill-card.md new file mode 100644 index 0000000..120283f --- /dev/null +++ b/skills/serving-llms-on-epyc/skill-card.md @@ -0,0 +1,13 @@ +# Skill Card + +## Description + +Serve a single LLM on an AMD EPYC CPU host with vLLM + zentorch (Docker, Podman, or conda), handling CPU detection, runtime/env validation, model + RAM-fit checks, hardware-sized threads/KV/NUMA, launch, and health verification. Reports and stops on failure; does not debug. + +## Owner + +AMD + +## License + +MIT