diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index f00ba1e..ad028d5 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -33,6 +33,11 @@ "name": "tracelens-analysis-orchestrator", "source": "./skills/tracelens-analysis-orchestrator", "description": "Orchestrates modular PyTorch profiler trace analysis with TraceLens: generates perf reports, prepares category data, runs system-level and compute-kernel subagents in parallel, validates outputs, and writes a prioritized stakeholder report (analysis.md)." + }, + { + "name": "vllm-multiinstance", + "source": "./skills/vllm-multiinstance", + "description": "Multi-instance vLLM CPU benchmark on AMD EPYC: runs N vLLM instances behind NGINX, drives load with guidellm, and reports peak memory and end-to-end throughput/latency across models, concurrency rates, and instance counts." } ] } diff --git a/.cursor-plugin/marketplace.json b/.cursor-plugin/marketplace.json index f00ba1e..ad028d5 100644 --- a/.cursor-plugin/marketplace.json +++ b/.cursor-plugin/marketplace.json @@ -33,6 +33,11 @@ "name": "tracelens-analysis-orchestrator", "source": "./skills/tracelens-analysis-orchestrator", "description": "Orchestrates modular PyTorch profiler trace analysis with TraceLens: generates perf reports, prepares category data, runs system-level and compute-kernel subagents in parallel, validates outputs, and writes a prioritized stakeholder report (analysis.md)." + }, + { + "name": "vllm-multiinstance", + "source": "./skills/vllm-multiinstance", + "description": "Multi-instance vLLM CPU benchmark on AMD EPYC: runs N vLLM instances behind NGINX, drives load with guidellm, and reports peak memory and end-to-end throughput/latency across models, concurrency rates, and instance counts." 
} ] } diff --git a/eval/behavioral/tests/test_vllm_multiinstance.py b/eval/behavioral/tests/test_vllm_multiinstance.py new file mode 100644 index 0000000..23cd841 --- /dev/null +++ b/eval/behavioral/tests/test_vllm_multiinstance.py @@ -0,0 +1,121 @@ +"""Behavioral tests for the `vllm-multiinstance` skill. + +Run locally (needs the `claude` CLI authenticated and on a network that can +reach the API): + + pytest eval/behavioral/tests/test_vllm_multiinstance.py -s + +A real sweep needs podman + ansible + a model + many physical cores and runs +for ~10+ minutes, so these tests do NOT launch the stack. Every prompt is +scoped to planning / explanation ("Do not run anything") and asserts the +agent's *decisions and guardrails* drawn from the skill: sweep sizing, the +"read scores from guidellm.log, not benchmarks.json" rule, and the host +preflight / rootless fail-fast behavior the harness implements. + +Each check on `run` prints a `[PASS]`/`[FAIL]` line and raises on failure. +`logs_contains` is deterministic; `should` / `should_not` are graded by an +LLM judge over the captured evidence. +""" + +from harness import claude + +_NO_RUN = "Do not run any containers, podman, ansible, or scripts -- just answer." + + +def test_skill_activates_and_sizes_the_sweep(): + with claude("sonnet", skill="vllm-multiinstance") as agent: + run = agent.prompt( + "I want to benchmark a vLLM CPU image with the vllm-multiinstance " + "skill on a single-socket AMD EPYC with 128 physical cores. How " + f"many vLLM instances should I run and which cores does each get? 
{_NO_RUN}" + ) + + run.logs_contains("vllm-multiinstance") + + run.should("Recommend running 3 vLLM instances for a 128-physical-core host") + run.should( + "Pin the three instances to cores 32-63, 64-95, and 96-127 " + "(CORES_PER_INSTANCE=32, all on one socket)" + ) + run.should_not( + "Spread the instances across both sockets or use a " + "CORES_PER_INSTANCE other than 32" + ) + + +def test_reads_throughput_from_guidellm_log_not_json(): + with claude("sonnet", skill="vllm-multiinstance") as agent: + run = agent.prompt( + "Using the vllm-multiinstance skill: after a run completes, where do " + "I read the server throughput, and which number should I NOT trust? " + f"{_NO_RUN}" + ) + + run.logs_contains("vllm-multiinstance") + + run.should( + "Read server throughput from guidellm.log (the 'Server Throughput " + "Statistics' table), which is the server-aggregate number" + ) + run.should_not( + "Recommend reporting requests_per_second or output_tokens_per_second " + "from benchmarks.json as the server throughput" + ) + + +def test_host_preflight_fails_fast_on_blockers(): + with claude("sonnet", skill="vllm-multiinstance") as agent: + run = agent.prompt( + "Using the vllm-multiinstance skill: I'm on a rootless podman 3.4.4 / " + "CNI 0.9.1 host. What host-level problems will the harness catch " + "before a long run, and how does it avoid the 20-minute health-wait " + f"hang? 
{_NO_RUN}" + ) + + run.logs_contains("vllm-multiinstance") + + run.should( + "Mention the host preflight (check-host.sh) catches an unresolvable " + "image short-name, missing rootless cgroup cpuset delegation, and a " + "CNI cniVersion mismatch" + ) + run.should( + "Explain that the harness exits early / fails fast with actionable " + "guidance instead of hanging the full health-check timeout" + ) + + +def test_image_short_name_remediation(): + with claude("sonnet", skill="vllm-multiinstance") as agent: + run = agent.prompt( + "Using the vllm-multiinstance skill: the default image " + "amdih/zendnn_zentorch:... won't resolve on my host (no " + f"unqualified-search registries). What should I do? {_NO_RUN}" + ) + + run.logs_contains("vllm-multiinstance") + + run.should( + "Recommend using a fully-qualified image name, e.g. prefixing it " + "with docker.io/ (or pre-pulling that fully-qualified image)" + ) + + +def test_rootless_runs_without_passwordless_sudo(): + with claude("sonnet", skill="vllm-multiinstance") as agent: + run = agent.prompt( + "Using the vllm-multiinstance skill: my host has no passwordless " + "sudo. Can I still run the guidellm benchmark, and how does the " + f"harness handle it? {_NO_RUN}" + ) + + run.logs_contains("vllm-multiinstance") + + run.should( + "Explain the harness can run ansible (incl. 
guidellm) rootless via " + "ansible_become=false -- auto-detected when passwordless sudo is " + "missing, or forced with --no-become / ANSIBLE_NO_BECOME=1" + ) + run.should_not( + "Claim the benchmark simply cannot run without passwordless sudo" + ) diff --git a/skills/vllm-multiinstance/README.md b/skills/vllm-multiinstance/README.md new file mode 100644 index 0000000..23a251f --- /dev/null +++ b/skills/vllm-multiinstance/README.md @@ -0,0 +1,192 @@ +# vllm-multiinstance + +Benchmark a vLLM CPU image on an AMD EPYC box: run **N vLLM instances behind +NGINX**, drive load with **guidellm**, and report **peak memory** + **end-to-end +throughput/latency**. The benchmark harness is vendored here — you only supply a +container image and a model. + +This README sets expectations and gives copy-paste commands. For the *why* behind +each step see [`SKILL.md`](SKILL.md); for a full replay log see +[`reference.md`](reference.md). + +--- + +## What you configure + +Four things; everything else has sane defaults: + +| Knob | Meaning | Default | +|------|---------|---------| +| `VLLM_IMAGE` | container image to benchmark | the Docker Hub image below | +| `MODEL` | `"repo-or-path \| tag"` | — (required) | +| `GUIDELLM_RATES` | concurrency rate list | `[32,64]` | +| `NUM_INSTANCES` | vLLM instances behind NGINX | `3` | + +Default image: +``` +amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23 +``` + +The `tag` half of `MODEL` may contain any characters — the harness sanitizes it to +`[A-Za-z0-9-]` for the run name (e.g. `qwen3-0.6b` → `qwen3-0-6b`). You don't have +to pre-mangle dots or slashes. + +--- + +## Expectations (read before you start) + +- **Time:** each run takes ~8-15 min (model load + `rate × 300s` + teardown). A + 2×2 matrix is ~40-60 min. Run sweeps in the background and wait on a sentinel — + don't poll. +- **Cores:** instances pin physical cores starting at 32 (`3×32` → cores + 32-63 / 64-95 / 96-127). 
Only **one stack per machine** at a time — a second + stack would fight for the same cores. Check `podman ps | grep vllm` first. +- **RAM/disk:** you need room for `NUM_INSTANCES` model copies. If root (`/`) is + tight, set `BENCH_ROOT` to a roomy filesystem (temp + results land there). +- **Scores:** always read throughput from `guidellm.log`, **not** + `benchmarks.json` (the JSON numbers are per-request medians and understate + server throughput). +- **The guidellm load generator runs rootful by default.** A rootless `podman ps` won't list + it; it self-exits when the endpoint is torn down. With `--no-become` / `ANSIBLE_NO_BECOME=1` (auto-selected when passwordless sudo is missing) ansible, including guidellm, runs rootless instead. See *Aborting a run* below. + +--- + +## Prerequisites + +- `podman` + `podman-compose` + - On a **podman 3.x** host (e.g. 3.4.4), pin `podman-compose==1.0.6`. + Newer podman-compose (1.6.0) emits podman-4.x `--network net:ip=` syntax + that podman 3.x silently ignores, so containers lose their static IPs. +- `ansible-playbook` and collections `containers.podman`, `ansible.posix`, + `community.general` +- a Python env with `hf` / `huggingface-cli` (for the one-time model pre-warm) + +--- + +## Quick start + +```bash +cd skills/vllm-multiinstance + +# 1. Size the sweep to your hardware. +python3 scripts/detect.py +# NUM_INSTANCES = floor((physical_cores - 16) / 32) # 128 cores -> 3 + +# 2. One-time: clone + patch the ansible/guidellm automation into harness/. +bash scripts/setup-harness.sh # idempotent + +# 3. One-time: pre-warm the model into a shared HF cache (offline runs need it). +HF_HOME=$HOME/.cache/hf-shared/huggingface hf download Qwen/Qwen3-0.6B + +# 4. (Optional) Dry-run — validates preflight + ansible wiring, starts nothing. +VLLM_IMAGE=amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23 \ + HF_TOKEN=offline HF_CACHE_DIR="$HOME/.cache/hf-shared" \ + harness/run_sweep.sh --dry-run -m "Qwen/Qwen3-0.6B | qwen3-0.6b" +``` + +--- + +## Run a benchmark + +`scripts/run_combo.sh` is env-driven — one run is `LABEL` + `VLLM_IMAGE` + `MODEL`. 
+ +```bash +cd skills/vllm-multiinstance +mkdir -p results # must exist before any nohup/redirect + +IMAGE=amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23 +MODEL="Qwen/Qwen3-0.6B | qwen3-0.6b" + +# Single run (zentorch): +LABEL=run1 VLLM_IMAGE="$IMAGE" MODEL="$MODEL" \ + bash scripts/run_combo.sh > results/run_run1.out 2>&1 +``` + +### A/B: zentorch vs native (same image) + +`NATIVE=1` bypasses zentorch to compare against vanilla CPU vLLM — no separate +build needed. + +```bash +for row in "zentorch:" "native:NATIVE=1"; do + label="${row%%:*}"; extra="${row#*:}" + env $extra LABEL="$label" VLLM_IMAGE="$IMAGE" MODEL="$MODEL" \ + bash scripts/run_combo.sh > "results/run_${label}.out" 2>&1 +done +``` + +### Sweep instance counts (e.g. 3-instance vs single-instance) + +```bash +for n in 3 1; do + LABEL="i${n}" NUM_INSTANCES=$n VLLM_IMAGE="$IMAGE" MODEL="$MODEL" \ + bash scripts/run_combo.sh > "results/run_i${n}.out" 2>&1 +done +``` + +### Sweep concurrency rates (keep outputs separate with `RUN_TAG`) + +```bash +for rate in 32 64 96; do + LABEL=run1 VLLM_IMAGE="$IMAGE" MODEL="$MODEL" \ + GUIDELLM_RATES="[$rate]" RUN_TAG="_c$rate" \ + bash scripts/run_combo.sh > "results/run_run1_c$rate.out" 2>&1 +done +``` + +### Background a long sweep and wait on a sentinel + +```bash +nohup bash -c ' + for n in 3 1; do + LABEL="i${n}" NUM_INSTANCES=$n VLLM_IMAGE="'"$IMAGE"'" MODEL="'"$MODEL"'" \ + bash scripts/run_combo.sh > "results/run_i${n}.out" 2>&1 + done + echo ALL_DONE +' > results/sweep.out 2>&1 & +while ! 
grep -q ALL_DONE results/sweep.out; do sleep 60; done # don't tight-poll +``` + +--- + +## Collect scores + +```bash +R=harness/vllm-cpu-perf-eval/results/llm/Qwen__Qwen3-0.6B + +ls -1dt "$R"/chat-* # newest-first; disambiguate by timestamp + +# Server-aggregate throughput + median latency (authoritative): +python3 scripts/parse_guidellm_log.py "$R/chat-<run-name>-<timestamp>/external-endpoint/guidellm.log" +# conc req/s in_tok/s out_tok/s tot_tok/s lat_s TTFT_ms ITL_ms TPOT_ms + +# Peak aggregate memory for a run (