diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index f00ba1e..ad028d5 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -33,6 +33,11 @@
       "name": "tracelens-analysis-orchestrator",
       "source": "./skills/tracelens-analysis-orchestrator",
       "description": "Orchestrates modular PyTorch profiler trace analysis with TraceLens: generates perf reports, prepares category data, runs system-level and compute-kernel subagents in parallel, validates outputs, and writes a prioritized stakeholder report (analysis.md)."
+    },
+    {
+      "name": "vllm-multiinstance",
+      "source": "./skills/vllm-multiinstance",
+      "description": "Multi-instance vLLM CPU benchmark on AMD EPYC: runs N vLLM instances behind NGINX, drives load with guidellm, and reports peak memory and end-to-end throughput/latency across models, concurrency rates, and instance counts."
     }
   ]
 }
diff --git a/.cursor-plugin/marketplace.json b/.cursor-plugin/marketplace.json
index f00ba1e..ad028d5 100644
--- a/.cursor-plugin/marketplace.json
+++ b/.cursor-plugin/marketplace.json
@@ -33,6 +33,11 @@
       "name": "tracelens-analysis-orchestrator",
       "source": "./skills/tracelens-analysis-orchestrator",
       "description": "Orchestrates modular PyTorch profiler trace analysis with TraceLens: generates perf reports, prepares category data, runs system-level and compute-kernel subagents in parallel, validates outputs, and writes a prioritized stakeholder report (analysis.md)."
+    },
+    {
+      "name": "vllm-multiinstance",
+      "source": "./skills/vllm-multiinstance",
+      "description": "Multi-instance vLLM CPU benchmark on AMD EPYC: runs N vLLM instances behind NGINX, drives load with guidellm, and reports peak memory and end-to-end throughput/latency across models, concurrency rates, and instance counts."
     }
   ]
 }
diff --git a/eval/behavioral/tests/test_vllm_multiinstance.py b/eval/behavioral/tests/test_vllm_multiinstance.py
new file mode 100644
index 0000000..23cd841
--- /dev/null
+++ b/eval/behavioral/tests/test_vllm_multiinstance.py
@@ -0,0 +1,121 @@
+"""Behavioral tests for the `vllm-multiinstance` skill.
+
+Run locally (needs the `claude` CLI authenticated and on a network that can
+reach the API):
+
+    pytest eval/behavioral/tests/test_vllm_multiinstance.py -s
+
+A real sweep needs podman + ansible + a model + many physical cores and runs
+for ~10+ minutes, so these tests do NOT launch the stack. Every prompt is
+scoped to planning / explanation ("Do not run anything") and asserts the
+agent's *decisions and guardrails* drawn from the skill: sweep sizing, the
+"read scores from guidellm.log, not benchmarks.json" rule, and the host
+preflight / rootless fail-fast behavior the harness implements.
+
+Each check on `run` prints a `[PASS]`/`[FAIL]` line and raises on failure.
+`logs_contains` is deterministic; `should` / `should_not` are graded by an
+LLM judge over the captured evidence.
+"""
+
+from harness import claude
+
+_NO_RUN = "Do not run any containers, podman, ansible, or scripts -- just answer."
+
+
+def test_skill_activates_and_sizes_the_sweep():
+    with claude("sonnet", skill="vllm-multiinstance") as agent:
+        run = agent.prompt(
+            "I want to benchmark a vLLM CPU image with the vllm-multiinstance "
+            "skill on a single-socket AMD EPYC with 128 physical cores. How "
+            f"many vLLM instances should I run and which cores does each get? {_NO_RUN}"
+        )
+
+        run.logs_contains("vllm-multiinstance")
+
+        run.should("Recommend running 3 vLLM instances for a 128-physical-core host")
+        run.should(
+            "Pin the three instances to cores 32-63, 64-95, and 96-127 "
+            "(CORES_PER_INSTANCE=32, all on one socket)"
+        )
+        run.should_not(
+            "Spread the instances across both sockets or use a "
+            "CORES_PER_INSTANCE other than 32"
+        )
+
+
+def test_reads_throughput_from_guidellm_log_not_json():
+    with claude("sonnet", skill="vllm-multiinstance") as agent:
+        run = agent.prompt(
+            "Using the vllm-multiinstance skill: after a run completes, where do "
+            "I read the server throughput, and which number should I NOT trust? "
+            f"{_NO_RUN}"
+        )
+
+        run.logs_contains("vllm-multiinstance")
+
+        run.should(
+            "Read server throughput from guidellm.log (the 'Server Throughput "
+            "Statistics' table), which is the server-aggregate number"
+        )
+        run.should_not(
+            "Recommend reporting requests_per_second or output_tokens_per_second "
+            "from benchmarks.json as the server throughput"
+        )
+
+
+def test_host_preflight_fails_fast_on_blockers():
+    with claude("sonnet", skill="vllm-multiinstance") as agent:
+        run = agent.prompt(
+            "Using the vllm-multiinstance skill: I'm on a rootless podman 3.4.4 / "
+            "CNI 0.9.1 host. What host-level problems will the harness catch "
+            "before a long run, and how does it avoid the 20-minute health-wait "
+            f"hang? {_NO_RUN}"
+        )
+
+        run.logs_contains("vllm-multiinstance")
+
+        run.should(
+            "Mention the host preflight (check-host.sh) catches an unresolvable "
+            "image short-name, missing rootless cgroup cpuset delegation, and a "
+            "CNI cniVersion mismatch"
+        )
+        run.should(
+            "Explain that the harness exits early / fails fast with actionable "
+            "guidance instead of hanging the full health-check timeout"
+        )
+
+
+def test_image_short_name_remediation():
+    with claude("sonnet", skill="vllm-multiinstance") as agent:
+        run = agent.prompt(
+            "Using the vllm-multiinstance skill: the default image "
+            "amdih/zendnn_zentorch:... won't resolve on my host (no "
+            f"unqualified-search registries). What should I do? {_NO_RUN}"
+        )
+
+        run.logs_contains("vllm-multiinstance")
+
+        run.should(
+            "Recommend using a fully-qualified image name, e.g. prefixing it "
+            "with docker.io/ (or pre-pulling that fully-qualified image)"
+        )
+
+
+def test_rootless_runs_without_passwordless_sudo():
+    with claude("sonnet", skill="vllm-multiinstance") as agent:
+        run = agent.prompt(
+            "Using the vllm-multiinstance skill: my host has no passwordless "
+            "sudo. Can I still run the guidellm benchmark, and how does the "
+            f"harness handle it? {_NO_RUN}"
+        )
+
+        run.logs_contains("vllm-multiinstance")
+
+        run.should(
+            "Explain the harness can run ansible (incl. guidellm) rootless via "
+            "ansible_become=false -- auto-detected when passwordless sudo is "
+            "missing, or forced with --no-become / ANSIBLE_NO_BECOME=1"
+        )
+        run.should_not(
+            "Claim the benchmark simply cannot run without passwordless sudo"
+        )
diff --git a/skills/vllm-multiinstance/README.md b/skills/vllm-multiinstance/README.md
new file mode 100644
index 0000000..23a251f
--- /dev/null
+++ b/skills/vllm-multiinstance/README.md
@@ -0,0 +1,192 @@
+# vllm-multiinstance
+
+Benchmark a vLLM CPU image on an AMD EPYC box: run **N vLLM instances behind
+NGINX**, drive load with **guidellm**, and report **peak memory** + **end-to-end
+throughput/latency**. The benchmark harness is vendored here — you only supply a
+container image and a model.
+
+This README sets expectations and gives copy-paste commands. For the *why* behind
+each step see [`SKILL.md`](SKILL.md); for a full replay log see
+[`reference.md`](reference.md).
+
+---
+
+## What you configure
+
+Four things; everything else has sane defaults:
+
+| Knob | Meaning | Default |
+|------|---------|---------|
+| `VLLM_IMAGE` | container image to benchmark | the Docker Hub image below |
+| `MODEL` | `"repo-or-path \| tag"` | — (required) |
+| `GUIDELLM_RATES` | concurrency rate list | `[32,64]` |
+| `NUM_INSTANCES` | vLLM instances behind NGINX | `3` |
+
+Default image:
+```
+amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23
+```
+
+The `tag` half of `MODEL` may contain any characters — the harness sanitizes it to
+`[A-Za-z0-9-]` for the run name (e.g. `qwen3-0.6b` → `qwen3-0-6b`). You don't have
+to pre-mangle dots or slashes.
+
+---
+
+## Expectations (read before you start)
+
+- **Time:** each run takes ~8-15 min (model load + 300s per entry in
+  `GUIDELLM_RATES` + teardown). A 2×2 matrix is ~40-60 min. Run sweeps in the
+  background and wait on a sentinel — don't tight-poll.
+- **Cores:** instances pin physical cores starting at 32 (`3×32` → cores
+  32-63 / 64-95 / 96-127). Only **one stack per machine** at a time — a second
+  stack would fight for the same cores. Check `podman ps | grep vllm` first.
+- **RAM/disk:** you need room for `NUM_INSTANCES` model copies. If root (`/`) is
+  tight, set `BENCH_ROOT` to a roomy filesystem (temp + results land there).
+- **Scores:** always read throughput from `guidellm.log`, **not**
+  `benchmarks.json` (the JSON numbers are per-request medians and understate
+  server throughput).
+- **The guidellm load generator runs rootful by default** (rootless when the host lacks passwordless sudo or `--no-become` is set).
+  A rootless `podman ps` won't list a rootful one; it self-exits when the endpoint is torn down. See *Aborting a run* below.
+
+---
+
+## Prerequisites
+
+- `podman` + `podman-compose`
+  - On a **podman 3.x** host (e.g. 3.4.4), pin `podman-compose==1.0.6`.
+    Newer podman-compose (1.6.0) emits podman-4.x `--network net:ip=` syntax
+    that podman 3.x silently ignores, so containers lose their static IPs.
+- `ansible-playbook` and collections `containers.podman`, `ansible.posix`,
+  `community.general`
+- a Python env with `hf` / `huggingface-cli` (for the one-time model pre-warm)
+
+---
+
+## Quick start
+
+```bash
+cd skills/vllm-multiinstance
+
+# 1. Size the sweep to your hardware.
+python3 scripts/detect.py
+#   NUM_INSTANCES = floor((physical_cores - 16) / 32)   # 128 cores -> 3
+
+# 2. One-time: clone + patch the ansible/guidellm automation into harness/.
+bash scripts/setup-harness.sh           # idempotent
+
+# 3. One-time: pre-warm the model into a shared HF cache (offline runs need it).
+HF_HOME=$HOME/.cache/hf-shared/huggingface hf download Qwen/Qwen3-0.6B
+
+# 4. (Optional) Dry-run — validates preflight + ansible wiring, starts nothing.
+VLLM_IMAGE=amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23 \
+  HF_TOKEN=offline HF_CACHE_DIR="$HOME/.cache/hf-shared" \
+  harness/run_sweep.sh --dry-run -m "Qwen/Qwen3-0.6B | qwen3-0.6b"
+```
+
+---
+
+## Run a benchmark
+
+`scripts/run_combo.sh` is env-driven — one run is `LABEL` + `VLLM_IMAGE` + `MODEL`.
+
+```bash
+cd skills/vllm-multiinstance
+mkdir -p results            # must exist before any nohup/redirect
+
+IMAGE=amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23
+MODEL="Qwen/Qwen3-0.6B | qwen3-0.6b"
+
+# Single run (zentorch):
+LABEL=run1 VLLM_IMAGE="$IMAGE" MODEL="$MODEL" \
+  bash scripts/run_combo.sh > results/run_run1.out 2>&1
+```
+
+### A/B: zentorch vs native (same image)
+
+`NATIVE=1` bypasses zentorch to compare against vanilla CPU vLLM — no separate
+build needed.
+
+```bash
+for row in "zentorch:" "native:NATIVE=1"; do
+  label="${row%%:*}"; extra="${row#*:}"
+  env $extra LABEL="$label" VLLM_IMAGE="$IMAGE" MODEL="$MODEL" \
+    bash scripts/run_combo.sh > "results/run_${label}.out" 2>&1
+done
+```
+
+### Sweep instance counts (e.g. 3-instance vs single-instance)
+
+```bash
+for n in 3 1; do
+  LABEL="i${n}" NUM_INSTANCES=$n VLLM_IMAGE="$IMAGE" MODEL="$MODEL" \
+    bash scripts/run_combo.sh > "results/run_i${n}.out" 2>&1
+done
+```
+
+### Sweep concurrency rates (keep outputs separate with `RUN_TAG`)
+
+```bash
+for rate in 32 64 96; do
+  LABEL=run1 VLLM_IMAGE="$IMAGE" MODEL="$MODEL" \
+    GUIDELLM_RATES="[$rate]" RUN_TAG="_c$rate" \
+    bash scripts/run_combo.sh > "results/run_run1_c$rate.out" 2>&1
+done
+```
+
+### Background a long sweep and wait on a sentinel
+
+```bash
+nohup bash -c '
+  for n in 3 1; do
+    LABEL="i${n}" NUM_INSTANCES=$n VLLM_IMAGE="'"$IMAGE"'" MODEL="'"$MODEL"'" \
+      bash scripts/run_combo.sh > "results/run_i${n}.out" 2>&1
+  done
+  echo ALL_DONE
+' > results/sweep.out 2>&1 &
+while ! grep -q ALL_DONE results/sweep.out; do sleep 60; done   # don't tight-poll
+```
+
+---
+
+## Collect scores
+
+```bash
+R=harness/vllm-cpu-perf-eval/results/llm/Qwen__Qwen3-0.6B
+
+ls -1dt "$R"/chat-*                         # newest-first; disambiguate by timestamp
+
+# Server-aggregate throughput + median latency (authoritative):
+python3 scripts/parse_guidellm_log.py "$R/chat-<ts>-<test_name>/external-endpoint/guidellm.log"
+# conc  req/s  in_tok/s  out_tok/s  tot_tok/s  lat_s  TTFT_ms  ITL_ms  TPOT_ms
+
+# Peak aggregate memory for a run (<label> = LABEL, + RUN_TAG if set):
+grep "^PEAK" results/mem_<label>.csv
+
+# Sanity: every run must be Failed : 0
+grep -E "Failed +:" results/run_*.out
+```
+
+---
+
+## Aborting a run / cleaning up
+
+The driver chain is `run_combo.sh → run_sweep.sh → ansible-playbook` plus a
+background `mem_poll.sh`.
+
+```bash
+pkill -9 -f run_sweep.sh; pkill -9 -f run_combo.sh; pkill -9 -f mem_poll.sh
+pkill -9 -f ansible-playbook
+bash harness/stop.sh --clean        # stops the vLLM stack AND removes the network
+```
+
+If a **rootful** guidellm container is stuck (it normally self-exits):
+
+```bash
+sudo podman ps -a | grep guidellm
+sudo podman rm -f <guidellm-container>
+```
+
+> The harness names its stack `bench-vllm-*` when launched via `run_combo.sh`.
+> Before killing anything, confirm you're not stopping **someone else's** stack
+> (e.g. plain `vllm-instance-*`) sharing the host.
diff --git a/skills/vllm-multiinstance/SKILL.md b/skills/vllm-multiinstance/SKILL.md
new file mode 100644
index 0000000..b4abc67
--- /dev/null
+++ b/skills/vllm-multiinstance/SKILL.md
@@ -0,0 +1,222 @@
+---
+name: vllm-multiinstance
+description: Multi-instance vLLM benchmark on AMD EPYC CPU — runs N vLLM instances behind NGINX, drives load with guidellm, and reports peak memory (podman stats) + end-to-end throughput/latency. Use to benchmark a vLLM CPU image across models, concurrency rates, and instance counts. The harness is vendored here; nothing external is required beyond podman + ansible.
+user-invocable: true
+---
+
+## Overview
+A multi-instance vLLM + NGINX load-balancer benchmark for AMD EPYC CPU inference.
+It runs N vLLM instances (each pinned to a range of physical cores) behind NGINX,
+drives load with guidellm via ansible, and tears the stack down per run. You point
+it at a container image and a model; it reports **peak memory** and **end-to-end
+performance**.
+
+You configure four things: **image**, **model**, **concurrency rate**, and
+**instance count**. Everything else has sensible defaults.
+
+**Self-contained.** The benchmark harness is vendored under `harness/` — there is
+no dependency on any other repo. The only external piece is the guidellm/ansible
+automation (`redhat-et/vllm-cpu-perf-eval`), which `scripts/setup-harness.sh`
+clones + patches into `harness/` on first use.
+
+## Layout
+```
+skills/vllm-multiinstance/
+  SKILL.md            this file
+  reference.md        copy-pasteable command-replay log from a real run
+  harness/            vendored benchmark stack (no external repo needed)
+    run_sweep.sh        sweep driver: stop → start → ansible guidellm → stop
+    start.sh            generate compose, pre-warm HF cache, start N+nginx, wait
+    generate-config.sh  emit docker-compose + nginx.conf
+    stop.sh             podman-compose down (--clean removes volumes)
+    vllm-cpu-perf-eval.patch   patch for the external ansible automation
+    vllm-cpu-perf-eval/        cloned by setup-harness.sh (gitignored, ~62M)
+  scripts/
+    detect.py           print local CPU info as JSON (size the sweep from it)
+    setup-harness.sh    one-time: clone + patch the ansible automation
+    run_combo.sh        env-driven single-run driver (image + model + rate)
+    mem_poll.sh         peak aggregate memory via podman stats
+    parse_guidellm_log.py   scores from guidellm.log (authoritative)
+    extract_perf.py     scores from benchmarks.json (fallback/cross-check)
+```
+
+## Harness flow
+`run_sweep.sh` → `start.sh` (generate-config → HF pre-warm → `podman-compose up`
+→ health wait) → ansible `llm-benchmark-concurrent-load.yml` (guidellm) →
+`stop.sh --clean`. The stack is `NUM_INSTANCES` × `CORES_PER_INSTANCE` physical
+cores pinned from `VLLM_START_CORE` (default 32), e.g. 3×32 → cores 32-63 / 64-95
+/ 96-127. NGINX routes to instances by static IP. Results land in
+`harness/vllm-cpu-perf-eval/results/llm/<model>/chat-<ts>-<test_name>/external-endpoint/`
+(`benchmarks.json`, `benchmarks.csv`, `guidellm.log`).
+
+Default image:
+`amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23`
+(Docker Hub). Any vLLM CPU image works — just set `VLLM_IMAGE`.
+
+## Step 1: Check your hardware
+```bash
+cd skills/vllm-multiinstance
+python3 scripts/detect.py
+```
+This prints `physical_cores`, `sockets`, `numa_nodes`, `memory_gb`, etc. Size the
+sweep from `physical_cores`, keeping all instances on **one socket**:
+
+- `CORES_PER_INSTANCE` is fixed at **32** (the sweet spot).
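+- `NUM_INSTANCES = floor((physical_cores − 16) / 32)` — the 16 leaves headroom for
+  NGINX and the OS. On a 128-core part that's 3 instances; on 64 cores, 1. Pinning starts at `VLLM_START_CORE` (default 32), so also confirm `VLLM_START_CORE + 32 × NUM_INSTANCES ≤ physical_cores` — e.g. a 48-core part yields 1 by the formula but has no cores 32-63, so lower `VLLM_START_CORE` (e.g. to 16) there.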
+
+You also need enough RAM for `NUM_INSTANCES` model copies. `df -h /` — if root is
+tight, set `BENCH_ROOT` to a roomy filesystem (temp + results land there).
+
+## Step 2: One-time setup
+```bash
+# 1. Clone + patch the external ansible/guidellm automation into harness/.
+bash scripts/setup-harness.sh          # idempotent
+
+# 2. Pre-warm the model into a shared HF cache (offline runs need it on disk).
+#    Use whatever Python env has huggingface-cli / hf (active venv/conda).
+HF_HOME=$HOME/.cache/hf-shared/huggingface hf download <model>
+```
+Needs `podman`, `ansible-playbook`, and ansible collections `containers.podman`,
+`ansible.posix`, `community.general` (setup-harness.sh reports which are missing).
+
+## Step 3: Run a benchmark
+`scripts/run_combo.sh` is env-driven. A run is defined by `LABEL`, `VLLM_IMAGE`,
+and `MODEL`; the script handles the memory poller, stack naming, temp redirection,
+HF cache, and teardown.
+
+Required: `LABEL` (output name), `VLLM_IMAGE`, `MODEL` (`"repo-or-path | tag"`).
+The `tag` may contain any characters — the harness sanitizes it to `[A-Za-z0-9-]`
+for the ansible `test_name` (e.g. `qwen3-0.6b` → `qwen3-0-6b`), so you don't have
+to pre-mangle dots/slashes.
+Optional: `NUM_INSTANCES` (3), `CORES_PER_INSTANCE` (32), `GUIDELLM_RATES`
+(`[32,64]`), `RUN_TAG` (output suffix so rate sweeps don't clobber), `MODELS_DIR`,
+`BENCH_ROOT` (where `results/` lands; default `$PWD`). `NATIVE=1` bypasses zentorch
+to A/B the same image with vanilla CPU vLLM.
+
+```bash
+cd skills/vllm-multiinstance
+mkdir -p results          # must exist before any nohup redirect
+
+IMAGE=amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23
+MODEL="Qwen/Qwen3-0.6B | qwen3-0.6b"
+
+# One run:
+LABEL=run1 VLLM_IMAGE="$IMAGE" MODEL="$MODEL" \
+  bash scripts/run_combo.sh > results/run_run1.out 2>&1
+```
+
+### Sweeping a matrix
+Define the matrix as a data table and loop — no script edits. Sweep **images**
+and/or **concurrency rates**:
+```bash
+IMAGE=amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23
+MODEL="Qwen/Qwen3-0.6B | qwen3-0.6b"
+# Each row: "label | image | extra-env"
+MATRIX=(
+  "zentorch | $IMAGE | "
+  "native   | $IMAGE | NATIVE=1"
+)
+for row in "${MATRIX[@]}"; do
+  IFS='|' read -r label image extra <<<"$row"
+  label="${label// /}"; image="${image// /}"
+  env $extra LABEL="$label" VLLM_IMAGE="$image" MODEL="$MODEL" \
+    bash scripts/run_combo.sh > "results/run_${label}.out" 2>&1
+done
+
+# Sweep concurrency rates for one combo — RUN_TAG keeps outputs separate:
+for rate in 32 64 96; do
+  LABEL=run1 VLLM_IMAGE="$IMAGE" MODEL="$MODEL" \
+    GUIDELLM_RATES="[$rate]" RUN_TAG="_c$rate" \
+    bash scripts/run_combo.sh > "results/run_run1_c$rate.out" 2>&1
+done
+```
+Each run takes ~8-15 min (load + 300s per entry in `GUIDELLM_RATES` + teardown).
+Run the loop in the background and wait on a sentinel — don't tight-poll.
+
+## Step 4: Pre-flight (optional)
+- `podman ps | grep vllm` — kill/await any stack pinning your cores.
+- `harness/run_sweep.sh --dry-run ...` — validates preflight, ansible path, env
+  without starting containers.
+
+### Host prerequisites & the host preflight
+The harness assumes a **podman 4.x + netavark** host. On older/leaner hosts
+(rootless podman 3.4.4, CNI 0.9.1) a first-timer hits several host-level
+blockers that used to surface only as a cryptic deep failure or a 20-minute
+health-wait hang. `harness/check-host.sh` now runs automatically inside both
+`run_sweep.sh` (preflight) and `start.sh`, and **exits early with the fix**
+instead of hanging:
+
+| Check | Blocker it catches | What you'll see / the fix |
+|-------|--------------------|---------------------------|
+| **image short-name** | default `amdih/...` won't resolve without unqualified-search registries | `[BLOCK]` → set `VLLM_IMAGE=docker.io/amdih/...` (or `podman pull` it). Bypass: `ALLOW_SHORT_NAME=1`. |
+| **rootless cpuset** | cgroup v2 cpuset not delegated → `cpuset.cpus: no such file or directory` | `[BLOCK]` with the one-time `Delegate=cpu cpuset io memory pids` systemd fix (needs root). Or run `--no-limits`. Bypass: `ALLOW_NO_CPUSET=1`. |
+| **CNI cniVersion** | podman writes `cniVersion 1.0.0`, host plugins only support ≤0.4.0 → containers drop their static IPs → every LB request 504s | `[WARN]`; `start.sh` auto-downgrades the conflist to `0.4.0`. If it still fails, a **static-IP guard** in `start.sh` aborts in seconds (not 20 min) telling you to upgrade `containernetworking-plugins`. |
+
+> **podman 3.x field paths (handled automatically).** Three places assumed
+> podman-4.x inspect/info paths that render empty on podman 3.4.4, so the
+> intended logic silently no-op'd. All are now version-agnostic:
+> - **CNI backend detection** (`check-host.sh` + `start.sh downgrade_cni_version`)
+>   keyed on `podman info .Host.NetworkBackend`, which doesn't exist on podman
+>   3.x → the auto-downgrade never fired. Now bails only on an *explicit*
+>   `netavark` backend; for `cni`/empty/unknown it proceeds (the conflist-file
+>   check keeps it a safe no-op elsewhere).
+> - **Health wait** read `.State.Health.Status` (podman 4.x); on podman 3.x the
+>   field is `.State.Healthcheck.Status`, so healthy instances looked perpetually
+>   unready → 20-min timeout. A `container_health()` helper now tries both paths.
+
+> **podman-compose version on podman 3.x.** podman-compose **1.6.0** emits the
+> podman-4.x `--network net:ip=` syntax, which podman 3.4.4 *silently ignores* —
+> containers fall back to the default net and lose their static IPs. On a podman
+> 3.x host, pin **podman-compose 1.0.6** (it uses `--net <name> --ip=`, which
+> assigns the static IPs correctly): `pip install 'podman-compose==1.0.6'`.
+
+`start.sh` also **fast-fails** the health wait the moment any required container
+exits/dies (prints its last log lines) rather than polling for the full
+timeout. Skip all host checks with `SKIP_HOST_CHECK=1`.
+
+### Aborting a run / cleaning up
+The driver chain is `run_combo.sh → run_sweep.sh → ansible-playbook` plus a
+background `mem_poll.sh`. To stop a run cleanly, kill the drivers then tear the
+stack down (`stop.sh` now also removes the compose network, so the next start is
+clean):
+```bash
+pkill -9 -f run_sweep.sh; pkill -9 -f run_combo.sh; pkill -9 -f mem_poll.sh
+pkill -9 -f ansible-playbook
+bash harness/stop.sh --clean        # stops vLLM stack + removes network
+```
+**By default the guidellm load generator runs ROOTFUL** (under
+`/var/lib/containers`, owned by root) via ansible `become`, which needs
+**passwordless sudo**. A rootless `podman ps` won't even list it, and your
+user-level `podman rm` / `kill -9` can't touch it. It normally self-exits the
+moment the vLLM endpoint is torn down; if one is stuck, remove it with sudo:
+```bash
+sudo podman ps -a | grep guidellm
+sudo podman rm -f <guidellm-container>
+```
+
+**Hosts without passwordless sudo:** guidellm doesn't actually need root (its
+container is `network:host` + `user 0:0`), so the whole playbook can run
+rootless as the invoking user. `run_sweep.sh` **auto-detects** this — if
+`sudo -n true` fails it injects `-e ansible_become=false` and prints an `[INFO]`
+line. Force it either way with `--no-become` / `--become` (or
+`ANSIBLE_NO_BECOME=1`). When run rootless, the guidellm container *is* listed by
+your normal `podman ps` and you can `podman rm -f` it without sudo. This also
+means a sudo-less host **fails fast in preflight** instead of ~10 min into the
+ansible health-check retries.
+
+## Step 5: Collect scores
+**Always read scores from `guidellm.log`, not `benchmarks.json`.** The JSON's
+`requests_per_second`/`output_tokens_per_second` are *per-request medians* and
+understate server throughput; the log's "Server Throughput Statistics" table is
+the correct server-aggregate number.
+
+```bash
+python3 scripts/parse_guidellm_log.py <...>/external-endpoint/guidellm.log
+# conc  req/s  in_tok/s  out_tok/s  tot_tok/s  lat_s  TTFT_ms  ITL_ms  TPOT_ms
+grep "^PEAK" results/mem_<label>.csv      # peak aggregate memory (<label> = LABEL[+RUN_TAG])
+```
+Disambiguate same-named result dirs by timestamp order (`ls -1dt .../chat-*`),
+cross-check each run's image in `results/run_<label>.out` (`VLLM_IMAGE=`), and
+verify each run's `Failed : 0`. `extract_perf.py` parses `benchmarks.json` as a
+fallback/cross-check only.
diff --git a/skills/vllm-multiinstance/harness/check-host.sh b/skills/vllm-multiinstance/harness/check-host.sh
new file mode 100755
index 0000000..f4008ec
--- /dev/null
+++ b/skills/vllm-multiinstance/harness/check-host.sh
@@ -0,0 +1,177 @@
+#!/bin/bash
+# ============================================================================
+# Host / environment preflight for the multi-instance vLLM stack.
+#
+# Catches the host-level blockers that otherwise surface only as a cryptic deep
+# failure or a 20-minute health-wait hang (none of these are caught by
+# detect.py or the run_sweep preflight):
+#
+#   3. image short-name that won't resolve on a host without unqualified-search
+#      registries (e.g. "amdih/..." instead of "docker.io/amdih/...").
+#   4. rootless cgroup v2 cpuset NOT delegated to the user slice -> core pinning
+#      dies with "cpuset.cpus: no such file or directory".
+#   5. CNI backend (podman 3.x) whose plugins are older than the cniVersion
+#      podman writes -> containers silently drop their static IPs -> every
+#      load-balanced request 504s and the health wait never passes.
+#
+# Exit 0 if it's safe to proceed, 1 on a hard blocker. CNI is a WARN only
+# (start.sh auto-downgrades the conflist and the static-IP guard catches any
+# residual failure fast).
+#
+# Reads (all optional):
+#   VLLM_IMAGE   image to validate; skipped if unset.
+#   LIMITS_ON    1 (default) if cpuset pinning is in use; 0 disables the cpuset
+#                check (matches start.sh --no-limits).
+#
+# Escape hatches (set =1 to downgrade a hard failure to a warning):
+#   SKIP_HOST_CHECK    skip this script entirely (callers honor it).
+#   ALLOW_SHORT_NAME   allow an unresolved short-name image.
+#   ALLOW_NO_CPUSET    proceed without cpuset delegation.
+# ============================================================================
+set -uo pipefail
+# Escape hatch: SKIP_HOST_CHECK=1 bypasses every check below and reports success.
+if [[ "${SKIP_HOST_CHECK:-0}" == "1" ]]; then
+    echo "  [skip] host check (SKIP_HOST_CHECK=1)"
+    exit 0
+fi
+# podman is required by the image and CNI checks below; without it, block.
+if ! command -v podman >/dev/null 2>&1; then
+    echo "  [BLOCK] podman not found in PATH." >&2
+    exit 1
+fi
+
+# ----------------------------------------------------------------------------
+# 3. Image resolvability (short-name).
+# ----------------------------------------------------------------------------
+check_image_resolvable() {
+    # Decide whether $VLLM_IMAGE can be resolved: returns 0 to proceed, 1 to block.
+    local image="${VLLM_IMAGE:-}"
+    [[ -n "$image" ]] || { echo "  [skip] image check (VLLM_IMAGE unset)"; return 0; }
+    if podman image exists "$image" 2>/dev/null; then
+        echo "  [OK]   image present locally: $image"
+        return 0
+    fi
+    # A name counts as fully qualified when its first path component looks like
+    # a registry host: it contains a '.' or a ':port', or is exactly "localhost".
+    local registry="${image%%/*}"
+    if [[ "$image" == */* ]] && [[ "$registry" == *.* || "$registry" == *:* || "$registry" == "localhost" ]]; then
+        echo "  [WARN] image $image not present locally; podman will try to pull it."
+        return 0
+    fi
+    if [[ "${ALLOW_SHORT_NAME:-0}" == "1" ]]; then
+        echo "  [WARN] short-name image '$image' not present (ALLOW_SHORT_NAME=1; continuing)."
+        return 0
+    fi
+    {
+        echo "  [BLOCK] image short-name '$image' is not present locally and is not fully qualified."
+        echo "          Hosts without unqualified-search registries can't resolve bare names,"
+        echo "          so podman-compose would fail deep into the run."
+        echo "    Fix:    use a fully-qualified name, e.g.  VLLM_IMAGE=docker.io/$image"
+        echo "    Or:     pre-pull it once,            e.g.  podman pull docker.io/$image"
+        echo "    Bypass: ALLOW_SHORT_NAME=1"
+    } >&2
+    return 1
+}
+
+# ----------------------------------------------------------------------------
+# 4. Rootless cgroup v2 cpuset delegation.
+# ----------------------------------------------------------------------------
+check_cpuset_delegation() {
+    # Verify that the cgroup v2 'cpuset' controller is available to this rootless
+    # user. Returns 0 to proceed (OK, skipped, or unverifiable), 1 on a hard block.
+    if [[ "${LIMITS_ON:-1}" != "1" ]]; then
+        echo "  [skip] cpuset delegation check (limits disabled)"
+        return 0
+    fi
+    if [[ "$(id -u)" -eq 0 ]]; then
+        echo "  [OK]   running rootful; cpuset always available"
+        return 0
+    fi
+    if [[ ! -f /sys/fs/cgroup/cgroup.controllers ]]; then
+        echo "  [skip] not cgroup v2 unified; cpuset delegation N/A"
+        return 0
+    fi
+    # Inspect the user@<uid>.service cgroup plus this process's own "0::<path>" cgroup.
+    local uid; uid="$(id -u)"
+    local candidates=(
+        "/sys/fs/cgroup/user.slice/user-${uid}.slice/user@${uid}.service/cgroup.controllers"
+    )
+    local cg
+    cg="$(awk -F: '/^0::/{print $3}' /proc/self/cgroup 2>/dev/null)"
+    [[ -n "$cg" ]] && candidates+=("/sys/fs/cgroup${cg}/cgroup.controllers")
+
+    local found_file=false f
+    for f in "${candidates[@]}"; do
+        [[ -r "$f" ]] || continue
+        found_file=true
+        if grep -qw cpuset "$f"; then
+            echo "  [OK]   rootless cpuset delegated ($f)"
+            return 0
+        fi
+    done
+    if [[ "$found_file" != true ]]; then
+        echo "  [WARN] could not read cgroup controllers; cannot verify cpuset delegation."
+        echo "         If pinning fails ('cpuset.cpus: no such file or directory'), delegate cpuset via systemd or use --no-limits."
+        return 0
+    fi
+    if [[ "${ALLOW_NO_CPUSET:-0}" == "1" ]]; then
+        echo "  [WARN] cpuset NOT delegated to rootless user slice (ALLOW_NO_CPUSET=1; continuing)."
+        return 0
+    fi
+
+    echo "  [BLOCK] rootless cgroup v2 cpuset is NOT delegated to your user slice." >&2
+    echo "          Core pinning will fail with 'cpuset.cpus: no such file or directory'." >&2
+    echo "    Fix (needs root, one-time -- no session restart required):" >&2
+    echo "      sudo mkdir -p /etc/systemd/system/user@.service.d" >&2
+    echo "      printf '[Service]\\nDelegate=cpu cpuset io memory pids\\n' | \\" >&2
+    echo "        sudo tee /etc/systemd/system/user@.service.d/delegate.conf" >&2
+    echo "      sudo systemctl daemon-reload && systemctl --user daemon-reload" >&2
+    echo "      sudo systemctl restart user@${uid}.service   # or re-login" >&2
+    echo "    Alternative (no pinning): re-run with --no-limits." >&2
+    echo "    Bypass: ALLOW_NO_CPUSET=1" >&2
+    return 1
+}
+
+# ----------------------------------------------------------------------------
+# 5. CNI cniVersion vs installed plugins (WARN only; start.sh auto-downgrades).
+# ----------------------------------------------------------------------------
+check_cni_version() {
+    local net_backend
+    net_backend="$(podman info --format '{{.Host.NetworkBackend}}' 2>/dev/null || echo "")"
+    # podman >= 4 reports .Host.NetworkBackend; podman 3.x has no such field,
+    # so net_backend ends up empty there. An empty value must therefore NOT be
+    # read as "no CNI risk" -- podman 3.x is exactly where the cniVersion
+    # problem lives. Return early only for an explicit "netavark" backend.
+    # Anything else ("cni", empty, unknown) continues to the plugin probe,
+    # which does nothing when no CNI bridge plugin is found in the known
+    # directories.
+    if [[ "$net_backend" == "netavark" ]]; then
+        echo "  [OK]   network backend: netavark (no CNI cniVersion risk)"
+        return 0
+    fi
+
+    echo "  [WARN] network backend is CNI / podman 3.x style (backend='${net_backend:-unknown}')."
+    local bridge_bin="" dir
+    for dir in /opt/cni/bin /usr/lib/cni /usr/libexec/cni; do
+        if [[ -x "$dir/bridge" ]]; then bridge_bin="$dir/bridge"; break; fi
+    done
+    # Without a bridge plugin there is nothing to probe.
+    [[ -n "$bridge_bin" ]] || return 0
+    # Ask the plugin which cniVersions it supports (CNI_COMMAND=VERSION).
+    local versions
+    versions="$(printf '{"cniVersion":"1.0.0","name":"x","type":"bridge"}' \
+        | CNI_COMMAND=VERSION "$bridge_bin" 2>/dev/null || true)"
+    if [[ -n "$versions" ]] && ! grep -q '"1.0.0"' <<<"$versions"; then
+        echo "         $bridge_bin does not advertise cniVersion 1.0.0 support."
+        echo "         start.sh will downgrade the network conflist to 0.4.0 automatically."
+        echo "         If static IPs still fail, upgrade 'containernetworking-plugins'."
+    fi
+    return 0
+}
+
+rc=0                                  # overall exit status; becomes 1 on any hard blocker
+check_image_resolvable   || rc=1      # hard blocker when it returns non-zero
+check_cpuset_delegation  || rc=1      # hard blocker when it returns non-zero
+check_cni_version        || true      # advisory only: never affects the exit status
+exit "$rc"
diff --git a/skills/vllm-multiinstance/harness/generate-config.sh b/skills/vllm-multiinstance/harness/generate-config.sh
new file mode 100755
index 0000000..b4b8e57
--- /dev/null
+++ b/skills/vllm-multiinstance/harness/generate-config.sh
@@ -0,0 +1,463 @@
+#!/bin/bash
+set -euo pipefail
+
+# Configurable parameters (override via env vars or flags)
+NUM_INSTANCES="${NUM_INSTANCES:-5}"
+CORES_PER_INSTANCE="${CORES_PER_INSTANCE:-32}"
+NGINX_CORES="${NGINX_CORES:-1-15}"
+VLLM_START_CORE="${VLLM_START_CORE:-32}"
+MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}"
+HF_TOKEN="${HF_TOKEN:-}"
+MEM_LIMIT="${MEM_LIMIT:-100g}"
+NGINX_MEM_LIMIT="${NGINX_MEM_LIMIT:-5g}"
+SHM_SIZE="${SHM_SIZE:-16g}"
+# Extra args appended verbatim to the vLLM command in each instance
+# (e.g. EXTRA_VLLM_ARGS="--trust-remote-code --max-model-len 8192").
+EXTRA_VLLM_ARGS="${EXTRA_VLLM_ARGS:-}"
+VLLM_IMAGE="${VLLM_IMAGE:-amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23}"
+VLLM_KV_CACHE_SPACE="${VLLM_KV_CACHE_SPACE:-63}"
+NGINX_PORT="${NGINX_PORT:-8080}"
+# Dedicated /24 subnet for the compose bridge network. Each vLLM instance gets a
+# static IP on this subnet so NGINX can reach instances by IP instead of by
+# hostname. This is required because rootless podman's DNS (aardvark-dns) is not
+# reachable on the bridge gateway in this environment -- container name/alias
+# resolution fails with "connection refused", which made NGINX crash-loop with
+# "host not found in upstream". Static IPs sidestep DNS entirely. Use a subnet
+# outside netavark's default 10.89.0.0/16 auto-allocation pool to avoid clashes.
+VLLM_SUBNET="${VLLM_SUBNET:-10.201.0.0/24}"
+# Derive the prefix by dropping the mask and last octet (e.g. 10.201.0.0/24 ->
+# 10.201.0). Instance i gets <prefix>.(IP_BASE+i); NGINX takes <prefix>.IP_BASE.
+_subnet_base="${VLLM_SUBNET%/*}"
+IP_PREFIX="${_subnet_base%.*}"
+IP_BASE="${IP_BASE:-10}"
+# Run each vLLM instance with HF_HUB_OFFLINE so it loads the model straight from
+# the shared cache instead of contacting huggingface.co. Defaults to 1 (offline)
+# because start.sh pre-warms the cache on the host before any container starts,
+# so the model is always present on disk -- and the compose bridge network has no
+# reliable DNS to huggingface.co, which otherwise makes instances crash with
+# "Temporary failure in name resolution". Set to 0 (or pass --hf-online) only
+# when instances must download/refresh the model themselves.
+HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
+MODELS_DIR="${MODELS_DIR:-}"
+# Shared HuggingFace cache dir bind-mounted into every vLLM instance at
+# /opt/app-root/src/.cache. Lets all instances reuse a single set of model
+# downloads instead of each one filling its own named volume. Persists across
+# `podman compose down` / `podman system prune` because it's a host path.
+HF_CACHE_DIR="${HF_CACHE_DIR:-$HOME/.cache/hf-shared}"
+
+# Backend selection: "zentorch" (default) or "native". Override via the
+# BACKEND env var or the --native/--zentorch flags. Native mode skips the
+# zentorch-specific env vars emitted into the compose file.
+BACKEND="${BACKEND:-zentorch}"
+
+usage() {  # Print the help text (with current defaults) on stdout, then exit 0.
+    cat <<EOF
+Usage: $0 [OPTIONS]
+
+Generate docker-compose.yml and nginx.conf for multi-instance vLLM.
+
+Options:
+  -n, --num-instances N       Number of vLLM instances (default: $NUM_INSTANCES)
+  -c, --cores-per-instance N  Cores per instance (default: $CORES_PER_INSTANCE)
+  --nginx-cores RANGE         CPU range for nginx (default: $NGINX_CORES)
+  --start-core N              First core for vLLM instances (default: $VLLM_START_CORE)
+  -m, --model NAME            Model name or path (default: $MODEL_NAME)
+  --image IMAGE               vLLM container image (default: $VLLM_IMAGE)
+  --mem-limit SIZE            Memory limit per instance (default: $MEM_LIMIT)
+  --kv-cache-space N          KV cache space in GB (default: $VLLM_KV_CACHE_SPACE)
+  --nginx-port PORT           Host port for nginx (default: $NGINX_PORT)
+  --hf-offline                Run instances with HF_HUB_OFFLINE=1 (default; load
+                              from the pre-warmed shared cache, no network)
+  --hf-online                 Run instances with HF_HUB_OFFLINE=0 (let each
+                              instance contact huggingface.co)
+  --models-dir DIR            Host dir bind-mounted into each container at
+                              the same absolute path (for local model loading)
+  --hf-cache-dir DIR          Host dir bind-mounted as the shared HuggingFace
+                              cache in every instance (default: $HF_CACHE_DIR).
+                              All instances reuse one cache so each model is
+                              downloaded exactly once per sweep.
+  --native                    Use native backend: skip zentorch-specific env
+                              vars (same as BACKEND=native)
+  --zentorch                  Use zentorch backend (default; BACKEND=zentorch)
+  --no-limits                 Skip cpuset/mem_limit/shm_size (for rootless/LSF environments)
+  --no-mem-limit              Drop mem_limit only; keep cpuset/shm_size/cap_add/security_opt
+  -o, --output-dir DIR        Output directory (default: generated/)
+  --dry-run                   Print config summary without writing files
+  -h, --help                  Show this help
+
+Environment variables:
+  NUM_INSTANCES, CORES_PER_INSTANCE, NGINX_CORES, VLLM_START_CORE, MODEL_NAME, HF_TOKEN,
+  MEM_LIMIT, NGINX_MEM_LIMIT, SHM_SIZE, VLLM_IMAGE, VLLM_KV_CACHE_SPACE, NGINX_PORT,
+  EXTRA_VLLM_ARGS, VLLM_SUBNET, IP_BASE, MODELS_DIR, HF_CACHE_DIR, VLLM_NAME_PREFIX,
+  VLLM_NGINX_NAME, HF_HUB_OFFLINE (0|1, default 1), BACKEND (zentorch|native, default zentorch)
+
+Example:
+  # 3 instances on a 96-core CPU, starting at core 16
+  $0 -n 3 -c 24 --start-core 16 --nginx-cores 0-15
+
+  # Use a local model path
+  $0 -m /models/Llama-3.1-8B-Instruct
+EOF
+    exit 0
+}
+
+OUTPUT_DIR="generated"  # relative values are anchored at the script dir below
+DRY_RUN=false
+NO_LIMITS=false
+NO_MEM_LIMIT=false
+
+while [[ $# -gt 0 ]]; do  # flags override the env-derived defaults set above
+    case "$1" in
+        -n|--num-instances) NUM_INSTANCES="$2"; shift 2 ;;
+        -c|--cores-per-instance) CORES_PER_INSTANCE="$2"; shift 2 ;;
+        --nginx-cores) NGINX_CORES="$2"; shift 2 ;;
+        --start-core) VLLM_START_CORE="$2"; shift 2 ;;
+        -m|--model) MODEL_NAME="$2"; shift 2 ;;
+        --image) VLLM_IMAGE="$2"; shift 2 ;;
+        --mem-limit) MEM_LIMIT="$2"; shift 2 ;;
+        --kv-cache-space) VLLM_KV_CACHE_SPACE="$2"; shift 2 ;;
+        --nginx-port) NGINX_PORT="$2"; shift 2 ;;
+        --hf-offline) HF_HUB_OFFLINE=1; shift ;;
+        --hf-online) HF_HUB_OFFLINE=0; shift ;;
+        --models-dir) MODELS_DIR="$2"; shift 2 ;;
+        --hf-cache-dir) HF_CACHE_DIR="$2"; shift 2 ;;
+        -o|--output-dir) OUTPUT_DIR="$2"; shift 2 ;;
+        --native) BACKEND=native; shift ;;
+        --zentorch) BACKEND=zentorch; shift ;;
+        --no-limits) NO_LIMITS=true; shift ;;
+        --no-mem-limit) NO_MEM_LIMIT=true; shift ;;
+        --dry-run) DRY_RUN=true; shift ;;
+        -h|--help) usage ;;
+        *) echo "Unknown option: $1" >&2; (usage) >&2; exit 1 ;;  # subshell: usage() exits 0 itself
+    esac
+done
+
+case "$BACKEND" in
+    zentorch) NATIVE_MODE=false ;;
+    native)   NATIVE_MODE=true ;;
+    *) echo "ERROR: invalid BACKEND='$BACKEND' (expected 'zentorch' or 'native')." >&2; exit 1 ;;
+esac
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+[[ "$OUTPUT_DIR" == /* ]] || OUTPUT_DIR="$SCRIPT_DIR/$OUTPUT_DIR"  # keep an absolute -o path as given
+
+# Resolve HF_CACHE_DIR to an absolute path and ensure it exists so podman
+# doesn't auto-create it as root-owned on first mount.
+mkdir -p "$HF_CACHE_DIR"
+HF_CACHE_DIR="$(cd "$HF_CACHE_DIR" && pwd)"
+
+# Append each whitespace-separated EXTRA_VLLM_ARGS token to the compose command
+# block as a single-quoted YAML string (no globbing; numbers stay strings).
+emit_extra_vllm_args() {
+    [[ -z "$EXTRA_VLLM_ARGS" ]] && return 0
+    local arg q="'" args=()
+    read -r -d '' -a args <<< "$EXTRA_VLLM_ARGS" || true  # split on IFS only; read hits EOF -> rc 1
+    for arg in ${args[@]+"${args[@]}"}; do  # guard: empty array under set -u on older bash
+        printf "      - '%s'\n" "${arg//$q/$q$q}" >> "$COMPOSE"  # YAML escapes ' as ''
+    done
+}
+
+last_core=$(( VLLM_START_CORE + NUM_INSTANCES * CORES_PER_INSTANCE - 1 ))  # highest core index given to a vLLM instance
+
+echo "=== Multi-Instance vLLM Configuration ==="
+echo "  Instances:       $NUM_INSTANCES"
+echo "  Cores/instance:  $CORES_PER_INSTANCE"
+echo "  NGINX cores:     $NGINX_CORES"
+echo "  vLLM cores:      $VLLM_START_CORE-$last_core"
+echo "  Model:           $MODEL_NAME"
+echo "  Backend:         $BACKEND"
+echo "  Image:           $VLLM_IMAGE"
+echo "  Memory limit:    $MEM_LIMIT per instance"
+echo "  KV cache space:  ${VLLM_KV_CACHE_SPACE}GB"
+echo "  NGINX port:      $NGINX_PORT"
+echo "  Network subnet:  $VLLM_SUBNET (nginx=${IP_PREFIX}.${IP_BASE}, vllm-i=${IP_PREFIX}.$((IP_BASE+1))..)"
+echo "  HF hub offline:  $HF_HUB_OFFLINE"
+echo "  Models dir:      ${MODELS_DIR:-<unset>}"
+echo "  HF cache dir:    $HF_CACHE_DIR (shared across all instances)"
+echo "  Output dir:      $OUTPUT_DIR"
+echo "=========================================="
+
+if $DRY_RUN; then  # dry run: list the per-instance core ranges and stop before generating files
+    echo ""
+    echo "Core allocation:"
+    for i in $(seq 1 "$NUM_INSTANCES"); do
+        start=$(( VLLM_START_CORE + (i - 1) * CORES_PER_INSTANCE ))  # first core of instance i (1-based)
+        end=$(( start + CORES_PER_INSTANCE - 1 ))  # last core of instance i, inclusive
+        echo "  vllm-$i: cores $start-$end"
+    done
+    echo "  nginx:  cores $NGINX_CORES"
+    exit 0
+fi
+
+mkdir -p "$OUTPUT_DIR"
+
+# --- Generate .env (holds HF_TOKEN: recreated owner-only via umask 077) ---
+( umask 077; rm -f "$OUTPUT_DIR/.env"; cat > "$OUTPUT_DIR/.env" ) <<EOF
+MODEL_NAME=$MODEL_NAME
+HF_TOKEN=$HF_TOKEN
+MEM_LIMIT=$MEM_LIMIT
+HF_CACHE_DIR=$HF_CACHE_DIR
+EOF
+echo "  Written: $OUTPUT_DIR/.env"
+
+# --- Generate nginx.conf ---
+cat > "$OUTPUT_DIR/nginx.conf" <<'NGINX_HEADER'
+user root;
+pid /tmp/nginx.pid;
+
+events {
+    worker_connections 4096;
+}
+
+http {
+    upstream vllm_backend {
+NGINX_HEADER
+
+for i in $(seq 1 "$NUM_INSTANCES"); do
+    echo "        server ${IP_PREFIX}.$(( IP_BASE + i )):8000 max_fails=10 fail_timeout=10s;" >> "$OUTPUT_DIR/nginx.conf"
+done
+
+cat >> "$OUTPUT_DIR/nginx.conf" <<'NGINX_BODY'
+    }
+
+    log_format upstream_log '$remote_addr - [$time_local] "$request" $status '
+                            'upstream=$upstream_addr response_time=$upstream_response_time';
+
+    access_log /var/log/nginx/access.log upstream_log;
+    error_log /var/log/nginx/error.log warn;
+
+    server {
+        listen 80;
+        server_name _;
+
+        proxy_connect_timeout 2s;
+        proxy_send_timeout 300s;
+        proxy_read_timeout 300s;
+        send_timeout 300s;
+
+        proxy_buffer_size 128k;
+        proxy_buffers 8 256k;
+        proxy_busy_buffers_size 512k;
+
+        location /health {
+            proxy_pass http://vllm_backend/health;
+            proxy_http_version 1.1;
+            proxy_set_header Connection "";
+            proxy_next_upstream error timeout http_502 http_503 http_504;
+            proxy_next_upstream_tries 0;
+        }
+
+        location / {
+            proxy_pass http://vllm_backend;
+            proxy_http_version 1.1;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+            proxy_set_header Connection "";
+            proxy_buffering off;
+            proxy_cache off;
+            proxy_next_upstream error timeout http_502 http_503 http_504;
+            proxy_next_upstream_tries 0;
+            client_max_body_size 50M;
+        }
+
+        location /v1/completions {
+            proxy_pass http://vllm_backend/v1/completions;
+            proxy_http_version 1.1;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+            proxy_set_header Connection "";
+            proxy_buffering off;
+            proxy_cache off;
+            proxy_next_upstream error timeout http_502 http_503 http_504;
+            proxy_next_upstream_tries 0;
+            client_max_body_size 50M;
+        }
+
+        location /v1/chat/completions {
+            proxy_pass http://vllm_backend/v1/chat/completions;
+            proxy_http_version 1.1;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+            proxy_set_header Connection "";
+            proxy_buffering off;
+            proxy_cache off;
+            proxy_next_upstream error timeout http_502 http_503 http_504;
+            proxy_next_upstream_tries 0;
+            client_max_body_size 50M;
+        }
+
+        location /v1/models {
+            proxy_pass http://vllm_backend/v1/models;
+            proxy_http_version 1.1;
+            proxy_set_header Connection "";
+        }
+
+        location /nginx_status {
+            stub_status on;
+            access_log off;
+        }
+    }
+}
+NGINX_BODY
+echo "  Written: $OUTPUT_DIR/nginx.conf"
+
+# --- Generate docker-compose.yml ---
+COMPOSE="$OUTPUT_DIR/docker-compose.yml"
+
+cat > "$COMPOSE" <<EOF
+version: '3.8'
+
+services:
+EOF
+
+DEPENDS_LIST=""
+for i in $(seq 1 "$NUM_INSTANCES"); do
+    start=$(( VLLM_START_CORE + (i - 1) * CORES_PER_INSTANCE ))
+    end=$(( start + CORES_PER_INSTANCE - 1 ))
+    DEPENDS_LIST="${DEPENDS_LIST}      - vllm-$i
+"
+
+    cat >> "$COMPOSE" <<EOF
+  vllm-$i:
+    image: $VLLM_IMAGE
+    container_name: ${VLLM_NAME_PREFIX:-vllm-instance}-$i
+EOF
+
+    if ! $NO_LIMITS; then
+        cat >> "$COMPOSE" <<EOF
+    cpuset: "$start-$end"
+EOF
+        if ! $NO_MEM_LIMIT; then
+            cat >> "$COMPOSE" <<EOF
+    mem_limit: \${MEM_LIMIT:-$MEM_LIMIT}
+EOF
+        fi
+        cat >> "$COMPOSE" <<EOF
+    shm_size: $SHM_SIZE
+    cap_add:
+      - SYS_NICE
+    security_opt:
+      - seccomp=unconfined
+EOF
+    fi
+
+    cat >> "$COMPOSE" <<EOF
+    environment:
+      - HF_HUB_OFFLINE=$HF_HUB_OFFLINE
+      - HF_HOME=/opt/app-root/src/.cache/huggingface
+      - HF_TOKEN=\${HF_TOKEN}
+      - VLLM_CPU_KVCACHE_SPACE=$VLLM_KV_CACHE_SPACE
+EOF
+
+    if $NATIVE_MODE; then
+        cat >> "$COMPOSE" <<EOF
+      - TORCHINDUCTOR_FREEZING=0
+EOF
+    else
+        cat >> "$COMPOSE" <<EOF
+      - TORCHINDUCTOR_FREEZING=1
+      - VLLM_USE_AOT_COMPILE=0
+      - TORCHINDUCTOR_AUTOGRAD_CACHE=0
+      - ZENDNNL_MATMUL_ALGO=1
+EOF
+    fi
+
+    cat >> "$COMPOSE" <<EOF
+    volumes:
+      - $HF_CACHE_DIR:/opt/app-root/src/.cache:z
+EOF
+
+    if [[ -n "$MODELS_DIR" ]]; then
+        cat >> "$COMPOSE" <<EOF
+      - $MODELS_DIR:$MODELS_DIR:ro,z
+EOF
+    fi
+
+    cat >> "$COMPOSE" <<EOF
+    entrypoint:
+      - bash
+      - -c
+      - 'export VLLM_CPU_OMP_THREADS_BIND=$start-$end && echo "=== VLLM_CPU_OMP_THREADS_BIND=\$\$VLLM_CPU_OMP_THREADS_BIND ===" && echo "=== pip list (\$\$(hostname)) ===" && pip list && echo "=== launching: vllm serve \$\$@ ===" && exec vllm serve "\$\$@"'
+      - --
+    command:
+      - --model
+      - \${MODEL_NAME:-$MODEL_NAME}
+      - --port
+      - "8000"
+      - --host
+      - "0.0.0.0"
+      - --no-enable-prefix-caching
+      - --max-model-len
+      - "4096"
+EOF
+
+    emit_extra_vllm_args
+
+    cat >> "$COMPOSE" <<EOF
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 30s
+      retries: 10
+      start_period: 600s
+    networks:
+      vllm-network:
+        ipv4_address: ${IP_PREFIX}.$(( IP_BASE + i ))
+
+EOF
+done
+
+cat >> "$COMPOSE" <<EOF
+  nginx:
+    image: docker.io/library/nginx:alpine
+    container_name: ${VLLM_NGINX_NAME:-vllm-nginx-lb}
+EOF
+
+    if ! $NO_LIMITS; then
+        cat >> "$COMPOSE" <<EOF
+    cpuset: "$NGINX_CORES"
+    mem_limit: $NGINX_MEM_LIMIT
+EOF
+    fi
+
+    cat >> "$COMPOSE" <<EOF
+    ports:
+      - "$NGINX_PORT:80"
+    volumes:
+      - ./nginx.conf:/etc/nginx/nginx.conf:ro,Z
+    depends_on:
+${DEPENDS_LIST}    networks:
+      vllm-network:
+        ipv4_address: ${IP_PREFIX}.${IP_BASE}
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 3
+      start_period: 360s
+
+networks:
+  vllm-network:
+    driver: bridge
+    ipam:
+      config:
+        - subnet: $VLLM_SUBNET
+EOF
+
+echo "  Written: $OUTPUT_DIR/docker-compose.yml"
+echo ""
+echo "Configuration generated successfully in $OUTPUT_DIR/"
+echo "Core allocation:"
+for i in $(seq 1 "$NUM_INSTANCES"); do
+    start=$(( VLLM_START_CORE + (i - 1) * CORES_PER_INSTANCE ))
+    end=$(( start + CORES_PER_INSTANCE - 1 ))
+    echo "  vllm-$i: cores $start-$end"
+done
+echo "  nginx:  cores $NGINX_CORES"
diff --git a/skills/vllm-multiinstance/harness/run_sweep.sh b/skills/vllm-multiinstance/harness/run_sweep.sh
new file mode 100755
index 0000000..c206f20
--- /dev/null
+++ b/skills/vllm-multiinstance/harness/run_sweep.sh
@@ -0,0 +1,688 @@
+#!/bin/bash
+# ============================================================================
+# Multi-model, multi-variant (native + zentorch) benchmark sweep
+#
+# For each (variant, model):
+#   1. Stop any running stack
+#   2. Start the stack with the chosen image (zentorch or native) and model
+#   3. Run guidellm benchmark via ansible at the GUIDELLM_RATES rates (default [32,64])
+#   4. Stop the stack
+#
+# Results are tagged: test_name=<PREFIX>-<variant>-<tag>
+# ============================================================================
+
+set -euo pipefail
+
+# ----------------------------------------------------------------------------
+# 0. ARG PARSING
+# ----------------------------------------------------------------------------
+DRY_RUN=false
+CLI_MODELS=()
+CLI_MODELS_DIR=""
+CLI_NUM_INSTANCES=""
+CLI_QUANT=false
+CLI_TORCHAO=false
+CLI_NO_MEM_LIMIT=false
+CLI_VARIANT=""
+CLI_BECOME=""   # "" auto | "off" rootless | "on" force become
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --dry-run|-n)
+            DRY_RUN=true; shift ;;
+        --native)
+            CLI_VARIANT="native"; shift ;;
+        --zentorch)
+            CLI_VARIANT="zentorch"; shift ;;
+        -m|--model)
+            CLI_MODELS+=("$2"); shift 2 ;;
+        --models-dir)
+            CLI_MODELS_DIR="$2"; shift 2 ;;
+        -N|--num-instances)
+            CLI_NUM_INSTANCES="$2"; shift 2 ;;
+        --quant)
+            CLI_QUANT=true; shift ;;
+        --torchao)
+            CLI_TORCHAO=true; shift ;;
+        --no-mem-limit)
+            CLI_NO_MEM_LIMIT=true; shift ;;
+        --no-become)
+            CLI_BECOME="off"; shift ;;
+        --become)
+            CLI_BECOME="on"; shift ;;
+        -h|--help)
+            cat <<EOF
+Usage: $0 [options]
+
+Sweeps every (variant, model) pair and runs the guidellm benchmark.
+
+Options:
+  -m, --model ENTRY      Model to sweep, as "path-or-repo | short-tag" (the
+                         tag is optional). Repeat to add several. Overrides
+                         the built-in MODELS list when given at least once.
+  --models-dir DIR       Host dir holding local model folders. Bind-mounted
+                         into each container at the same path; model ENTRY
+                         left-sides are resolved relative to it.
+  -N, --num-instances N  Number of vLLM instances (also seeds the default
+                         test_name prefix EPYC<N>).
+  --native               Sweep only the native (non-zentorch) variant.
+                         Overrides the VARIANTS array.
+  --zentorch             Sweep only the zentorch variant (default).
+                         Overrides the VARIANTS array.
+  --quant                Quantized-model run (default: llm-compressor format).
+                         Uses the base image as-is, NO torchao install. Models
+                         may be HF repo ids (pulled from the hub, needs
+                         HF_TOKEN) or local dirs via --models-dir (or MODELS_DIR
+                         env).
+  --torchao              Only meaningful with --quant. Builds the derived
+                         torchao-enabled image (passes --torchao to start.sh).
+                         Use for models with quant_method=torchao. Without this,
+                         --quant does NOT change the image.
+  --no-mem-limit         Drop the per-instance mem_limit while keeping cpuset
+                         pinning (forwarded to start.sh). Use when the memory
+                         cgroup OOM-kills containers despite ample free host RAM.
+  --no-become            Run ansible (incl. the guidellm load generator)
+                         rootless as the invoking user (-e ansible_become=false).
+                         guidellm doesn't need root (network:host, user 0:0).
+                         Use on hosts without passwordless sudo. Default is
+                         auto: become is used only if 'sudo -n true' succeeds.
+                         Env equivalent: ANSIBLE_NO_BECOME=1.
+  --become               Force ansible become on (override auto-detection).
+  --dry-run, -n          Validate paths/config and print every command that
+                         would run, without starting containers, calling
+                         ansible, or touching sudo. Safe with HF_TOKEN unset.
+  -h, --help             Show this help.
+EOF
+            exit 0 ;;
+        *)
+            echo "Unknown option: $1 (try --help)" >&2
+            exit 1 ;;
+    esac
+done
+
+# --quant marks a quantized run (llm-compressor by default, no image change).
+# --torchao (only with --quant) opts into the derived torchao image build.
+QUANT="$CLI_QUANT"
+TORCHAO="$CLI_TORCHAO"
+if [[ "$TORCHAO" == "true" && "$QUANT" != "true" ]]; then
+    echo "ERROR: --torchao requires --quant." >&2
+    exit 1
+fi
+
+# ----------------------------------------------------------------------------
+# 1. MODELS TO SWEEP  --  fill in your 20 models here
+# ----------------------------------------------------------------------------
+# Format per entry:  "hf-path-or-relative-dir | short-tag"
+#   - Left side  = passed to vLLM as --model (HF repo id, or path under MODELS_DIR)
+#   - Right side = used in test_name (must be [A-Za-z0-9-] and short enough that
+#                  "${PREFIX}-${variant}-${tag}" stays <= 30 chars).
+# The ansible playbook enforces: test_name matches ^[A-Za-z0-9-]{1,30}$.
+MODELS=(
+    "Llama-3.1-8B-Instruct                | llama31-8b"
+    "gpt-oss-20b-BF16                     | gptoss-20b"
+)
+
+# CLI -m/--model entries override the built-in list above.
+if (( ${#CLI_MODELS[@]} > 0 )); then
+    MODELS=("${CLI_MODELS[@]}")
+fi
+
+# ----------------------------------------------------------------------------
+# 2. VARIANTS  --  comment out one if you only want native or only zentorch
+# ----------------------------------------------------------------------------
+VARIANTS=(
+    "zentorch"
+#    "native"
+)
+
+# CLI --native/--zentorch overrides the VARIANTS array above.
+if [[ -n "$CLI_VARIANT" ]]; then
+    VARIANTS=("$CLI_VARIANT")
+fi
+
+# ----------------------------------------------------------------------------
+# 3. BENCHMARK CONFIG
+# ----------------------------------------------------------------------------
+GUIDELLM_RATES="${GUIDELLM_RATES:-[32,64]}"
+# GUIDELLM_RATES="${GUIDELLM_RATES:-[64,96,128,256,364,512,1024]}"
+GUIDELLM_MAX_SECONDS="${GUIDELLM_MAX_SECONDS:-300}"
+# Max concurrent in-flight requests. Must be >= the largest GUIDELLM_RATES
+# value or the concurrent profile clamps high rates to this cap. Forwarded to
+# ansible as guidellm_max_concurrency (the playbook defaults to 128).
+GUIDELLM_MAX_CONCURRENCY="${GUIDELLM_MAX_CONCURRENCY:-1024}"
+BASE_WORKLOAD="${BASE_WORKLOAD:-chat}"
+
+# --no-mem-limit: CLI flag wins over the NO_MEM_LIMIT env var. Forwarded to
+# start.sh.
+if [[ "$CLI_NO_MEM_LIMIT" == "true" ]]; then
+    NO_MEM_LIMIT=true
+fi
+export NO_MEM_LIMIT="${NO_MEM_LIMIT:-false}"
+# Resolve instance count early: CLI -N wins, then env, then default 1.
+# It seeds both the stack layout and the default test_name prefix below.
+NUM_INSTANCES="${CLI_NUM_INSTANCES:-${NUM_INSTANCES:-1}}"
+
+# test_name must be 1-30 chars, [A-Za-z0-9-] only.
+# Format used below: "${PREFIX}-${variant}-${tag}"  (variant = "zentorch"|"native")
+# Budget: "EPYC7" (5) + "-zentorch-" (10) + tag = 15 + len(tag), so tag <= 15.
+TEST_NAME_PREFIX="${TEST_NAME_PREFIX:-EPYC${NUM_INSTANCES}}"
+
+# If MODELS_DIR is set, each model name in the MODELS array is expected to be
+# a relative path under MODELS_DIR (e.g. "meta-llama/Llama-3.1-8B-Instruct"
+# resolves to "$MODELS_DIR/meta-llama/Llama-3.1-8B-Instruct"). The directory
+# is bind-mounted at the same path inside the container, so the absolute path
+# is passed straight to "vllm serve".
+# Leave MODELS_DIR unset to pull models from the HuggingFace hub as before.
+# CLI --models-dir wins over the MODELS_DIR env var.
+MODELS_DIR="${CLI_MODELS_DIR:-${MODELS_DIR:-}}"
+
+# ----------------------------------------------------------------------------
+# 4. STACK / CPU LAYOUT  (passed to start.sh --regenerate)
+# ----------------------------------------------------------------------------
+export NUM_INSTANCES
+export CORES_PER_INSTANCE="${CORES_PER_INSTANCE:-32}"
+export VLLM_START_CORE="${VLLM_START_CORE:-32}"
+export NGINX_CORES="${NGINX_CORES:-1-15}"
+export NGINX_PORT="${NGINX_PORT:-8080}"
+export MEM_LIMIT="${MEM_LIMIT:-200g}"
+export VLLM_KV_CACHE_SPACE="${VLLM_KV_CACHE_SPACE:-90}"
+
+# Zentorch image (base). The native variant is built from this image by
+# uninstalling zentorch -- start.sh --native handles this automatically.
+export VLLM_IMAGE="${VLLM_IMAGE:-amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23}"
+
+HEALTH_TIMEOUT="${HEALTH_TIMEOUT:-1200}"
+
+# ----------------------------------------------------------------------------
+# 5. ANSIBLE / ENDPOINT CONFIG
+# ----------------------------------------------------------------------------
+# GuideLLM load-generator CPU / NUMA pinning (optional passthrough to ansible).
+# The load generator is otherwise strictly memory-bound to NUMA node 0
+# (cpuset_mems default '0'); with many vLLM instances node 0 can run out of
+# memory and the client's worker processes get OOM-killed at startup. Set
+# GUIDELLM_NUMA_NODE to a range spanning all nodes (e.g. "0-7") or to a node
+# with free RAM, and optionally GUIDELLM_CPUS to matching cores.
+GUIDELLM_CPUS="${GUIDELLM_CPUS:-}"
+GUIDELLM_NUMA_NODE="${GUIDELLM_NUMA_NODE:-}"
+
+export VLLM_ENDPOINT_MODE="${VLLM_ENDPOINT_MODE:-external}"
+export VLLM_ENDPOINT_URL="${VLLM_ENDPOINT_URL:-http://localhost:${NGINX_PORT}}"
+export LOADGEN_HOSTNAME="${LOADGEN_HOSTNAME:-localhost}"
+export DUT_HOSTNAME="${DUT_HOSTNAME:-localhost}"
+export ANSIBLE_SSH_USER="${ANSIBLE_SSH_USER:-$(whoami)}"
+export ANSIBLE_SSH_KEY="${ANSIBLE_SSH_KEY:-$HOME/.ssh/id_rsa}"
+
+# --- Rootless / ansible become handling -------------------------------------
+# The playbook runs several tasks (incl. the guidellm load generator) under
+# ansible `become: true`, which needs passwordless sudo. On hosts without it
+# the run hard-fails -- and worse, only after ~10 min of health-check retries
+# ("sudo: a password is required"). guidellm doesn't actually need root (its
+# container is network:host + user 0:0), so we can run the whole playbook as
+# the invoking user by passing `-e ansible_become=false`.
+#
+# Decision (CLI --no-become/--become > env ANSIBLE_NO_BECOME > auto-detect):
+#   off / 1     -> rootless (ansible_become=false)
+#   on  / 0     -> force become (default ansible behavior)
+#   auto        -> become only if `sudo -n true` works.
+NO_BECOME=false
+case "$CLI_BECOME" in
+    off) NO_BECOME=true ;;
+    on)  NO_BECOME=false ;;
+    *)
+        if [[ -n "${ANSIBLE_NO_BECOME:-}" ]]; then
+            [[ "$ANSIBLE_NO_BECOME" == "1" ]] && NO_BECOME=true || NO_BECOME=false
+        elif ! $DRY_RUN && ! sudo -n true 2>/dev/null; then
+            NO_BECOME=true
+            echo "[INFO] no passwordless sudo detected -> running ansible rootless" \
+                 "(ansible_become=false). Override with --become." >&2
+        fi
+        ;;
+esac
+BECOME_ARGS=()
+if $NO_BECOME; then
+    BECOME_ARGS=(-e "ansible_become=false")
+fi
+
+# HF_TOKEN is required for gated models. Don't use ${VAR:?} -- it hard-exits
+# the shell (which can take down the tmux pane that launched the script).
+# Instead, prompt interactively so the session stays alive.
+# In --dry-run we skip the check entirely.
+if ! $DRY_RUN; then
+    if [[ -z "${HF_TOKEN:-}" ]]; then
+        echo ""
+        echo "HF_TOKEN is not set. Gated models (e.g. Llama) will fail without it."
+        if [[ -t 0 ]]; then
+            read -r -s -p "Paste your HuggingFace token (or press Enter to abort): " HF_TOKEN
+            echo ""
+        fi
+        if [[ -z "${HF_TOKEN:-}" ]]; then
+            echo "No token provided. Aborting (tmux session preserved)." >&2
+            exit 1
+        fi
+    fi
+fi
+export HF_TOKEN="${HF_TOKEN:-}"
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+ANSIBLE_DIR="${ANSIBLE_DIR:-$SCRIPT_DIR/vllm-cpu-perf-eval/automation/test-execution/ansible}"
+LOG_DIR="${LOG_DIR:-$SCRIPT_DIR/sweep-logs/$(date +%Y%m%d-%H%M%S)}"
+mkdir -p "$LOG_DIR"
+
+if [[ -n "$MODELS_DIR" ]]; then
+    if [[ ! -d "$MODELS_DIR" ]]; then
+        echo "ERROR: MODELS_DIR='$MODELS_DIR' does not exist or is not a directory." >&2
+        exit 1
+    fi
+    MODELS_DIR="$(cd "$MODELS_DIR" && pwd)"
+fi
+
+# Map a MODELS entry's left-hand side to what is passed to vLLM:
+#   MODELS_DIR set   -> "$MODELS_DIR/<entry>" (absolute local path)
+#   MODELS_DIR empty -> "<entry>" exactly as given (HF repo id)
+resolve_model_path() {
+    local entry="$1" target
+    target="$entry"
+    if [[ -n "$MODELS_DIR" ]]; then
+        target="$MODELS_DIR/$entry"
+    fi
+    echo "$target"
+}
+
+# ----------------------------------------------------------------------------
+# Helpers
+# ----------------------------------------------------------------------------
+sanitize_model() {
+    # Flatten every '/' to '__' for log file names only (NOT test_name), e.g.
+    # meta-llama/Llama-3.1-8B-Instruct  ->  meta-llama__Llama-3.1-8B-Instruct
+    echo "${1//\//__}"
+}
+
+# Trim leading/trailing whitespace from $1 and print the result on stdout.
+trim() {
+    local s="$1"
+    s="${s#"${s%%[![:space:]]*}"}"  # inner expansion yields the leading whitespace run; strip it
+    s="${s%"${s##*[![:space:]]}"}"  # inner expansion yields the trailing whitespace run; strip it
+    echo "$s"
+}
+
+# Split a MODELS entry "hf/path | tag" -> sets global MODEL_PATH and MODEL_TAG.
+# The tag is optional: with no '|' separator, or with a blank right-hand side
+# (e.g. "hf/path |"), the path doubles as the tag instead of an empty tag.
+parse_model_entry() {
+    local entry="$1"
+    MODEL_PATH="$(trim "${entry%%|*}")"
+    MODEL_TAG=""
+    if [[ "$entry" == *"|"* ]]; then
+        MODEL_TAG="$(trim "${entry##*|}")"
+    fi
+    if [[ -z "$MODEL_TAG" ]]; then
+        # Missing or blank tag -- fall back to using the path as the tag too.
+        MODEL_TAG="$MODEL_PATH"
+    fi
+}
+
+# Reduce a model tag to the [A-Za-z0-9-] charset that the ansible test_name
+# validator accepts: every other character becomes '-', runs of '-' collapse to
+# one, and one leading/trailing '-' is dropped (e.g. "qwen3-0.6b" -> "qwen3-0-6b").
+sanitize_tag() {
+    local cleaned="${1//[^A-Za-z0-9-]/-}"
+    until [[ "$cleaned" != *--* ]]; do cleaned="${cleaned//--/-}"; done
+    cleaned="${cleaned#-}"
+    cleaned="${cleaned%-}"
+    printf '%s' "$cleaned"
+}
+
+# Check a test_name against the ansible playbook's rules: 1-30 characters,
+# all drawn from [A-Za-z0-9-]. The length is checked first.
+# Returns 0 when valid; otherwise prints the reason on stderr and returns 1.
+validate_test_name() {
+    local candidate="$1" reason=""
+    local n=${#candidate}
+    if (( n == 0 || n > 30 )); then
+        reason="is $n chars (must be 1-30)"
+    elif ! [[ "$candidate" =~ ^[A-Za-z0-9-]+$ ]]; then
+        reason="contains non-[A-Za-z0-9-] characters"
+    fi
+    [[ -z "$reason" ]] && return 0
+    echo "  test_name '$candidate' $reason" >&2
+    return 1
+}
+
+banner() {
+    # Print a blank line, then "$*" (indented two spaces) framed by '=' rules.
+    local rule
+    rule="============================================================"
+    printf '\n%s\n  %s\n%s\n' "$rule" "$*" "$rule"
+}
+# Run one benchmark: $1 = variant, $2 = MODELS entry. Restarts the stack, runs the ansible sweep, tears down; returns 0 on success (or dry-run), 1 on failure.
+run_one() {
+    local variant="$1"
+    local entry="$2"
+
+    parse_model_entry "$entry"
+    local model="$MODEL_PATH"
+    local tag="$MODEL_TAG"
+
+    # test_name must use only [A-Za-z0-9-] and be <= 30 chars; the tag is sanitized here and the full name is validated in pre-flight.
+    local safe_tag
+    safe_tag=$(sanitize_tag "$tag")
+    if [[ "$safe_tag" != "$tag" ]]; then
+        echo "  Note: tag '$tag' sanitized to '$safe_tag' for test_name." >&2
+    fi
+    local test_name="${TEST_NAME_PREFIX}-${variant}-${safe_tag}"
+
+    # The per-run log is named from the model path via sanitize_model, so it is not subject to the test_name rules.
+    local log_tag
+    log_tag=$(sanitize_model "$model")
+    local run_log="$LOG_DIR/${variant}_${log_tag}.log"
+
+    # Resolve the actual path/repo to load (local absolute path or HF repo id).
+    local model_path
+    model_path=$(resolve_model_path "$model")
+
+    banner "[$variant] $model"
+    echo "  test_name  = $test_name (${#test_name} chars)"
+    echo "  model_path = $model_path"
+    echo "  rates      = $GUIDELLM_RATES"
+    echo "  max_conc   = $GUIDELLM_MAX_CONCURRENCY"
+    echo "  log        = $run_log"
+
+    # Build start.sh args (same for dry-run and real run).
+    # --torchao is prepended whenever TORCHAO is "true". NOTE(review): presumably set only when both --quant and --torchao were passed -- confirm in the option parser.
+    local start_args=(--regenerate --timeout "$HEALTH_TIMEOUT" -m "$model_path")
+    if [[ "$TORCHAO" == "true" ]]; then
+        start_args=(--torchao "${start_args[@]}")
+    fi
+    if [[ -n "$MODELS_DIR" ]]; then
+        start_args+=(--models-dir "$MODELS_DIR")
+    fi
+    if [[ "$variant" == "native" ]]; then
+        start_args=(--native "${start_args[@]}")
+    fi
+    if [[ "${NO_MEM_LIMIT:-false}" == "true" ]]; then
+        start_args+=(--no-mem-limit)
+    fi
+
+    if $DRY_RUN; then
+        echo ""
+        echo "  [dry-run] would call: $SCRIPT_DIR/stop.sh --clean"
+        echo "  [dry-run] would call: $SCRIPT_DIR/start.sh ${start_args[*]}"
+        echo "  [dry-run] would cd:   $ANSIBLE_DIR"
+        echo "  [dry-run] would run ansible-playbook:"
+        cat <<EOF
+    ansible-playbook -i inventory/hosts.yml \\
+        llm-benchmark-concurrent-load.yml \\
+        --connection=local \\
+        -e "ansible_python_interpreter=/usr/bin/python3" \\
+        -e "test_model=$model_path" \\
+        -e "base_workload=$BASE_WORKLOAD" \\
+        -e "skip_phase_2=true" \\
+        -e "skip_phase_3=true" \\
+        -e "guidellm_rate=$GUIDELLM_RATES" \\
+        -e "guidellm_max_seconds=$GUIDELLM_MAX_SECONDS" \\
+        -e "guidellm_max_concurrency=$GUIDELLM_MAX_CONCURRENCY" \\
+        -e "test_name=$test_name" \\
+        ${GUIDELLM_CPUS:+-e \"guidellm_cpus=$GUIDELLM_CPUS\" }${GUIDELLM_NUMA_NODE:+-e \"guidellm_numa_node=$GUIDELLM_NUMA_NODE\" }$($NO_BECOME && printf -- '-e "ansible_become=false" ')\\
+        -e '{"health_check":{"timeout":600,"interval":5}}'
+EOF
+        echo "  [dry-run] would call: $SCRIPT_DIR/stop.sh --clean"
+        echo "  [dry-run] would run:  sync && echo 3 | sudo -n tee /proc/sys/vm/drop_caches"
+        echo "  [dry-run] Done: $test_name"
+        return 0
+    fi
+
+    # --- 1. Stop any previous stack -----------------------------------------
+    echo ""
+    echo "--- Stopping any previous stack ---"
+    "$SCRIPT_DIR/stop.sh" --clean >> "$run_log" 2>&1 || true
+
+    # --- 2. Start fresh stack with the right image + model ------------------
+    echo "--- Starting stack ($variant, $model) ---"
+    if ! "$SCRIPT_DIR/start.sh" "${start_args[@]}" >> "$run_log" 2>&1; then
+        echo "  ERROR: start.sh failed for $variant / $model_path -- skipping."
+        echo "  See $run_log for details."
+        echo "  Containers left running for inspection. To inspect:"
+        echo "    podman ps -a --filter 'name=vllm-'"
+        echo "    podman logs vllm-instance-1"
+        echo "  When done, manually clean with: $SCRIPT_DIR/stop.sh --clean"
+        return 1
+    fi
+
+    # --- 3. Run ansible benchmark sweep -------------------------------------
+    echo "--- Running ansible benchmark ---"
+    pushd "$ANSIBLE_DIR" > /dev/null
+
+    # Optional GuideLLM load-generator pinning overrides (added only when the variables are non-empty).
+    local guidellm_extra_args=()
+    if [[ -n "$GUIDELLM_CPUS" ]]; then
+        guidellm_extra_args+=(-e "guidellm_cpus=$GUIDELLM_CPUS")
+    fi
+    if [[ -n "$GUIDELLM_NUMA_NODE" ]]; then
+        guidellm_extra_args+=(-e "guidellm_numa_node=$GUIDELLM_NUMA_NODE")
+    fi
+
+    local run_rc=0
+    if ! ansible-playbook -i inventory/hosts.yml \
+            llm-benchmark-concurrent-load.yml \
+            --connection=local \
+            -e "ansible_python_interpreter=/usr/bin/python3" \
+            -e "test_model=$model_path" \
+            -e "base_workload=$BASE_WORKLOAD" \
+            -e "skip_phase_2=true" \
+            -e "skip_phase_3=true" \
+            -e "guidellm_rate=$GUIDELLM_RATES" \
+            -e "guidellm_max_seconds=$GUIDELLM_MAX_SECONDS" \
+            -e "guidellm_max_concurrency=$GUIDELLM_MAX_CONCURRENCY" \
+            -e "test_name=$test_name" \
+            "${guidellm_extra_args[@]}" \
+            "${BECOME_ARGS[@]}" \
+            -e '{"health_check":{"timeout":600,"interval":5}}' \
+            >> "$run_log" 2>&1; then
+        run_rc=1
+        echo "  WARNING: ansible benchmark failed for $variant / $model."
+        echo "  See $run_log for details."
+        if ! $NO_BECOME && grep -qiE "sudo: a password is required|Missing sudo password" "$run_log"; then
+            echo "  CAUSE: ansible 'become' needs passwordless sudo, which this host lacks." >&2
+            echo "         Re-run with --no-become (or ANSIBLE_NO_BECOME=1) to run rootless." >&2
+        fi
+    fi
+
+    popd > /dev/null
+
+    # --- 4. Tear down (runs even when the benchmark failed) -----------------
+    echo "--- Stopping stack ---"
+    "$SCRIPT_DIR/stop.sh" --clean >> "$run_log" 2>&1 || true
+
+    # --- 5. Drop page cache / dentries / inodes -----------------------------
+    echo "--- Dropping page cache ---"
+    sync
+    if echo 3 | sudo -n tee /proc/sys/vm/drop_caches >> "$run_log" 2>&1; then
+        echo "  Page cache dropped."
+    else
+        echo "  WARNING: failed to drop page cache (need passwordless sudo for 'tee /proc/sys/vm/drop_caches')."
+    fi
+
+    if [[ "$run_rc" -ne 0 ]]; then
+        echo "  Done (WITH FAILURES): $test_name"
+    else
+        echo "  Done: $test_name"
+    fi
+    return "$run_rc"
+}
+
+# ----------------------------------------------------------------------------
+# Pre-flight validation
+# ----------------------------------------------------------------------------
+banner "Pre-flight checks"
+
+preflight_ok=true
+check_path() {
+    local label="$1" path="$2"   # label is printed as-is; path is tested for existence
+    if [[ ! -e "$path" ]]; then
+        echo "  [MISS] $label : $path"
+        preflight_ok=false       # record the failure in the global preflight flag
+        return 0
+    fi
+    echo "  [OK]   $label : $path"
+}
+
+check_path "SCRIPT_DIR    " "$SCRIPT_DIR"
+check_path "start.sh      " "$SCRIPT_DIR/start.sh"
+check_path "stop.sh       " "$SCRIPT_DIR/stop.sh"
+check_path "LOG_DIR       " "$LOG_DIR"
+check_path "ANSIBLE_DIR   " "$ANSIBLE_DIR"
+check_path "playbook      " "$ANSIBLE_DIR/llm-benchmark-concurrent-load.yml"
+check_path "inventory     " "$ANSIBLE_DIR/inventory/hosts.yml"
+
+if (( ${#MODELS[@]} == 0 )); then
+    echo "  [MISS] MODELS array is empty"
+    preflight_ok=false
+else
+    echo "  [OK]   MODELS array: ${#MODELS[@]} entries"
+fi
+if (( ${#VARIANTS[@]} == 0 )); then
+    echo "  [MISS] VARIANTS array is empty"
+    preflight_ok=false
+else
+    echo "  [OK]   VARIANTS array: ${VARIANTS[*]}"
+fi
+
+if command -v ansible-playbook >/dev/null 2>&1; then
+    echo "  [OK]   ansible-playbook found at: $(command -v ansible-playbook)"
+else
+    echo "  [MISS] ansible-playbook not found in PATH"
+    preflight_ok=false
+fi
+if command -v podman >/dev/null 2>&1; then
+    echo "  [OK]   podman found at: $(command -v podman)"
+else
+    echo "  [WARN] podman not found in PATH (start.sh will fail)"
+fi
+
+# Host-level blockers (image short-name / rootless cpuset / CNI version) that
+# detect.py and the path checks above don't cover. Hard-fails the preflight on
+# a real run; informational only under --dry-run (the image may not be pulled
+# yet). SKIP_HOST_CHECK=1 skips the check-host.sh call (same switch as start.sh).
+echo ""
+echo "  --- host environment (image / cpuset / CNI) ---"
+if [[ "${SKIP_HOST_CHECK:-0}" == "1" ]] || VLLM_IMAGE="$VLLM_IMAGE" bash "$SCRIPT_DIR/check-host.sh"; then
+    :
+elif $DRY_RUN; then
+    echo "  [INFO] host check reported a blocker (dry-run: not failing preflight)."
+else
+    preflight_ok=false
+fi
+
+if $DRY_RUN; then
+    echo "  [INFO] HF_TOKEN check skipped (dry-run)"
+elif [[ -n "${HF_TOKEN:-}" ]]; then
+    echo "  [OK]   HF_TOKEN is set (${#HF_TOKEN} chars)"
+else
+    echo "  [WARN] HF_TOKEN is not set"
+fi
+
+# ansible become mode. Fail fast here if become is required but passwordless
+# sudo is unavailable -- otherwise the playbook only fails ~10 min into the
+# health-check retries with "sudo: a password is required".
+if $NO_BECOME; then
+    echo "  [OK]   ansible become : OFF (running rootless as $(whoami))"
+elif $DRY_RUN; then
+    echo "  [INFO] ansible become : ON (sudo check skipped in dry-run)"
+elif sudo -n true 2>/dev/null; then
+    echo "  [OK]   ansible become : ON (passwordless sudo available)"
+else
+    echo "  [BAD]  ansible become : ON but no passwordless sudo on this host."
+    echo "         The guidellm load generator would fail ~10 min into the run."
+    echo "         Re-run with --no-become (or ANSIBLE_NO_BECOME=1) to run rootless."
+    preflight_ok=false
+fi
+
+if [[ -n "$MODELS_DIR" ]]; then
+    echo "  [OK]   MODELS_DIR    : $MODELS_DIR (will be bind-mounted into containers)"
+    for entry in "${MODELS[@]}"; do
+        parse_model_entry "$entry"
+        if [[ -d "$MODELS_DIR/$MODEL_PATH" ]]; then
+            echo "  [OK]   model dir    : $MODELS_DIR/$MODEL_PATH"
+        else
+            echo "  [MISS] model dir    : $MODELS_DIR/$MODEL_PATH"
+            preflight_ok=false
+        fi
+    done
+else
+    echo "  [INFO] MODELS_DIR not set -- models will be downloaded from HuggingFace hub"
+fi
+
+if [[ "$QUANT" == "true" ]]; then
+    if [[ "$TORCHAO" == "true" ]]; then
+        echo "  [OK]   --quant ON     : torchao image (start.sh --torchao)"
+    else
+        echo "  [OK]   --quant ON     : llm-compressor (base image, no torchao)"
+    fi
+else
+    echo "  [INFO] --quant OFF    : base image, no torchao"
+fi
+
+# Validate every (variant, model) -> test_name fits ansible's rules.
+echo ""
+echo "  --- test_name validation ---"
+for variant in "${VARIANTS[@]}"; do
+    for entry in "${MODELS[@]}"; do
+        parse_model_entry "$entry"
+        candidate="${TEST_NAME_PREFIX}-${variant}-$(sanitize_tag "$MODEL_TAG")"
+        if validate_test_name "$candidate" 2>/dev/null; then
+            printf "  [OK]   %-30s  (%d chars)\n" "$candidate" "${#candidate}"
+        else
+            printf "  [BAD]  %-30s  (%d chars, [A-Za-z0-9-] only, 1-30 chars)\n" \
+                "$candidate" "${#candidate}"
+            preflight_ok=false
+        fi
+    done
+done
+
+if ! $preflight_ok; then   # any failed check above (tagged [MISS], [BAD] or [BLOCK]) clears this flag
+    echo ""
+    echo "Pre-flight failed. Fix the [MISS] / [BAD] / [BLOCK] items above before running." >&2
+    exit 1
+fi
+
+# ----------------------------------------------------------------------------
+# Main loop
+# ----------------------------------------------------------------------------
+mode_label="online (guidellm)"
+if $DRY_RUN; then
+    banner "DRY RUN -- ${#MODELS[@]} models x ${#VARIANTS[@]} variants -- $mode_label (no commands will execute)"
+else
+    banner "Benchmark sweep -- ${#MODELS[@]} models x ${#VARIANTS[@]} variants -- $mode_label"
+fi
+echo "  Variants : ${VARIANTS[*]}"
+echo "  Models   :"
+for entry in "${MODELS[@]}"; do
+    parse_model_entry "$entry"
+    printf "    - %-40s -> %s\n" "$MODEL_PATH" "$MODEL_TAG"
+done
+echo "  Rates    : $GUIDELLM_RATES"
+echo "  Log dir  : $LOG_DIR"
+
+total=$(( ${#MODELS[@]} * ${#VARIANTS[@]} ))
+i=0
+failed=()
+
+for variant in "${VARIANTS[@]}"; do
+    for entry in "${MODELS[@]}"; do
+        parse_model_entry "$entry"
+        i=$((i + 1))
+        echo ""
+        echo ">>> [$i/$total] variant=$variant model=$MODEL_PATH"
+        if ! run_one "$variant" "$entry"; then
+            failed+=("$variant / $MODEL_PATH")
+        fi
+    done
+done
+
+banner "Sweep complete"
+echo "  Total runs : $total"
+echo "  Failed     : ${#failed[@]}"
+for f in "${failed[@]}"; do echo "    - $f"; done
+echo "  Logs       : $LOG_DIR"
+echo ""
+echo "Results from each run are in:"
+echo "  $ANSIBLE_DIR/../../../results/llm/<sanitized-model>/<workload>-<timestamp>-<test_name>/external-endpoint/"
diff --git a/skills/vllm-multiinstance/harness/start.sh b/skills/vllm-multiinstance/harness/start.sh
new file mode 100755
index 0000000..c4e3438
--- /dev/null
+++ b/skills/vllm-multiinstance/harness/start.sh
@@ -0,0 +1,443 @@
+#!/bin/bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+GENERATED_DIR="$SCRIPT_DIR/generated"
+COMPOSE_FILE="$GENERATED_DIR/docker-compose.yml"
+HEALTH_TIMEOUT="${HEALTH_TIMEOUT:-600}"
+HEALTH_INTERVAL=10
+
+# Container health lives at a different inspect path across podman major
+# versions: podman 4.x exposes .State.Health.Status, podman 3.x exposes
+# .State.Healthcheck.Status. Reading the 4.x path on podman 3.x yields an
+# empty string (the field is absent), which made the wait loop below treat a
+# perfectly healthy instance as never-ready and hang the full timeout. Try the
+# 4.x path first, fall back to the 3.x path, so both versions report correctly.
+container_health() {
+    local c="$1" h
+    h="$(podman inspect --format '{{.State.Health.Status}}' "$c" 2>/dev/null || true)"
+    if [[ -z "$h" || "$h" == "<no value>" ]]; then
+        h="$(podman inspect --format '{{.State.Healthcheck.Status}}' "$c" 2>/dev/null || true)"
+    fi
+    [[ -z "$h" || "$h" == "<no value>" ]] && h="missing"
+    printf '%s' "$h"
+}
+
+usage() {
+    cat <<EOF
+Usage: $0 [OPTIONS]
+
+Start the multi-instance vLLM + NGINX stack.
+
+Options:
+  --regenerate    Force regeneration of config files (passes all extra args to generate-config.sh)
+  --native        Use the native backend: build an image without zentorch from
+                  the base VLLM_IMAGE and use it (same as BACKEND=native)
+  --zentorch      Use the zentorch backend (default; same as BACKEND=zentorch)
+  --torchao       Build a derived image with torchao>=0.10.0 pre-installed
+                  (required for models with quant_method=torchao)
+  --no-limits     Skip cpuset/mem_limit/shm_size (for rootless/LSF environments)
+  --no-mem-limit  Drop mem_limit only; keep cpuset/shm_size/cap_add/security_opt
+  --no-wait       Start containers but don't wait for health checks
+  --timeout N     Health check timeout in seconds (default: $HEALTH_TIMEOUT)
+  -h, --help      Show this help
+
+If generated/ directory doesn't exist, generate-config.sh runs automatically with defaults.
+Pass configuration flags after --regenerate to customize (they forward to generate-config.sh).
+
+Backend defaults to zentorch. Set BACKEND=native (or pass --native) to use the
+native (non-zentorch) backend.
+
+Examples:
+  $0                                    # start with existing or default config
+  $0 --regenerate -n 3 -c 24           # regenerate for 3 instances, 24 cores each
+  $0 --regenerate --model /path/model  # regenerate with a local model
+  $0 --native --regenerate             # build native image and regenerate config to use it
+EOF
+    exit 0
+}
+
+REGENERATE=false
+NO_WAIT=false
+# Backend selection: "zentorch" (default) or "native". Override via the
+# BACKEND env var or the --native/--zentorch flags below.
+BACKEND="${BACKEND:-zentorch}"
+USE_TORCHAO=false
+GENERATE_ARGS=()   # everything not recognised below is forwarded to generate-config.sh
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --regenerate) REGENERATE=true; shift ;;
+        --native) BACKEND=native; shift ;;
+        --zentorch) BACKEND=zentorch; shift ;;
+        --torchao|--with-torchao) USE_TORCHAO=true; shift ;;
+        --no-limits) GENERATE_ARGS+=("--no-limits"); shift ;;
+        --no-mem-limit) GENERATE_ARGS+=("--no-mem-limit"); shift ;;
+        --no-wait) NO_WAIT=true; shift ;;
+        --timeout) HEALTH_TIMEOUT="${2:?--timeout requires a value in seconds}"; shift 2 ;;   # clear error instead of a bare 'unbound variable' under set -u
+        -h|--help) usage ;;
+        *) GENERATE_ARGS+=("$1"); shift ;;
+    esac
+done
+
+case "$BACKEND" in
+    zentorch|native) ;;
+    *) echo "ERROR: invalid BACKEND='$BACKEND' (expected 'zentorch' or 'native')." >&2; exit 1 ;;
+esac
+
+# --- Host / environment preflight ------------------------------------------
+# Fail fast (with remediation) on the host-level blockers that otherwise only
+# surface as a cryptic deep failure or a 20-min health-wait hang: unresolvable
+# image short-names, missing rootless cpuset delegation, and CNI version skew.
+# Runs against the BASE image (native/torchao builds pull from it first).
+if [[ "${SKIP_HOST_CHECK:-0}" != "1" ]]; then
+    LIMITS_ON=1
+    for a in "${GENERATE_ARGS[@]:-}"; do
+        [[ "$a" == "--no-limits" ]] && LIMITS_ON=0
+    done
+    echo "--- Host preflight (image / cpuset / CNI) ---"
+    if ! LIMITS_ON="$LIMITS_ON" bash "$SCRIPT_DIR/check-host.sh"; then
+        echo "Host preflight failed. Fix the [BLOCK] items above, or set" >&2
+        echo "SKIP_HOST_CHECK=1 to bypass at your own risk." >&2
+        exit 1
+    fi
+    echo ""
+fi
+
+if [[ "$BACKEND" == "native" ]]; then
+    BASE_IMAGE="${VLLM_IMAGE:-amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23}"
+    NATIVE_IMAGE="${BASE_IMAGE}_native"
+    TEMP_CONTAINER="vllm-native-build-$$"
+
+    if podman image exists "$NATIVE_IMAGE" 2>/dev/null; then
+        echo "--- Native image already exists: $NATIVE_IMAGE ---"
+    else
+        echo "--- Building native image (removing zentorch) ---"
+        echo "  Base:   $BASE_IMAGE"
+        echo "  Target: $NATIVE_IMAGE"
+        trap 'podman rm -f "$TEMP_CONTAINER" >/dev/null 2>&1 || true' EXIT   # don't leak the build container if a step below fails under set -e
+        podman run --entrypoint bash --name "$TEMP_CONTAINER" "$BASE_IMAGE" \
+            -c "pip uninstall -y zentorch zentorch-weekly 2>/dev/null; echo 'zentorch packages removed'"
+        # Snapshot the modified container as the native image, restoring the vllm entrypoint.
+        podman commit --change 'ENTRYPOINT ["vllm", "serve"]' "$TEMP_CONTAINER" "$NATIVE_IMAGE"
+        podman rm "$TEMP_CONTAINER"
+        trap - EXIT
+        echo "  Native image built successfully."
+    fi
+
+    export VLLM_IMAGE="$NATIVE_IMAGE"
+    GENERATE_ARGS+=("--image" "$NATIVE_IMAGE" "--native")
+    REGENERATE=true
+    echo ""
+fi
+
+if $USE_TORCHAO; then
+    BASE_IMAGE="${VLLM_IMAGE:-amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23}"
+    TORCHAO_SPEC="${TORCHAO_VERSION:+torchao==${TORCHAO_VERSION}}"
+    TORCHAO_SPEC="${TORCHAO_SPEC:-torchao>=0.10.0}"
+    TORCHAO_TAG="${TORCHAO_VERSION:-latest}"
+    TORCHAO_IMAGE="${BASE_IMAGE}_torchao-${TORCHAO_TAG}"
+    TEMP_CONTAINER="vllm-torchao-build-$$"
+
+    if podman image exists "$TORCHAO_IMAGE" 2>/dev/null; then
+        echo "--- torchao image already exists: $TORCHAO_IMAGE ---"
+    else
+        echo "--- Building torchao image (installing ${TORCHAO_SPEC}) ---"
+        echo "  Base:   $BASE_IMAGE"
+        echo "  Target: $TORCHAO_IMAGE"
+        trap 'podman rm -f "$TEMP_CONTAINER" >/dev/null 2>&1 || true' EXIT   # a failed pip install must not leave the build container behind
+        podman run --entrypoint bash --name "$TEMP_CONTAINER" "$BASE_IMAGE" \
+            -c "pip install --no-cache-dir '${TORCHAO_SPEC}' && python -c 'import torchao; print(\"torchao\", torchao.__version__)'"
+        # Snapshot the container with torchao installed, restoring the vllm entrypoint.
+        podman commit --change 'ENTRYPOINT ["vllm", "serve"]' "$TEMP_CONTAINER" "$TORCHAO_IMAGE"
+        podman rm "$TEMP_CONTAINER"
+        trap - EXIT
+        echo "  torchao image built successfully."
+    fi
+
+    export VLLM_IMAGE="$TORCHAO_IMAGE"
+    GENERATE_ARGS+=("--image" "$TORCHAO_IMAGE")
+    REGENERATE=true
+    echo ""
+fi
+
+if $REGENERATE || [[ ! -f "$COMPOSE_FILE" ]]; then
+    echo "--- Generating configuration ---"
+    bash "$SCRIPT_DIR/generate-config.sh" ${GENERATE_ARGS[@]+"${GENERATE_ARGS[@]}"}   # empty-array-safe under set -u on bash < 4.4
+    echo ""
+fi
+
+if [[ ! -f "$COMPOSE_FILE" ]]; then
+    echo "ERROR: $COMPOSE_FILE not found. Run generate-config.sh first." >&2
+    exit 1
+fi
+
+# --- HuggingFace pre-warm ---------------------------------------------------
+# Pull the model once on the host before any vLLM container starts, so the
+# shared HF cache (bind-mounted into every instance) is fully populated and
+# all N workers load from disk in parallel without racing on hf_hub lockfiles.
+# Hard-fails on download error: skipping the pre-warm and letting N containers
+# race on the same download is exactly the bug this step exists to prevent.
+ENV_FILE="$GENERATED_DIR/.env"
+if [[ -f "$ENV_FILE" ]]; then
+    set -a   # export every variable the env file assigns
+    # shellcheck disable=SC1090
+    source "$ENV_FILE"
+    set +a
+fi
+
+if [[ -z "${MODEL_NAME:-}" || -z "${HF_CACHE_DIR:-}" ]]; then
+    echo "ERROR: MODEL_NAME or HF_CACHE_DIR not set after sourcing $ENV_FILE." >&2
+    exit 1
+fi
+
+if [[ "$MODEL_NAME" == /* ]]; then
+    echo "--- Pre-warm skipped: model is a local path ($MODEL_NAME) ---"
+else
+    echo "--- Pre-warming HF cache for $MODEL_NAME -> $HF_CACHE_DIR ---"
+    # huggingface_hub 1.0+ replaced `huggingface-cli` with `hf`. Prefer `hf`
+    # when present; fall back to `huggingface-cli` for older installs.
+    if command -v hf >/dev/null 2>&1; then
+        HF_CLI=(hf download)
+    elif command -v huggingface-cli >/dev/null 2>&1; then
+        HF_CLI=(huggingface-cli download)
+    else
+        echo "ERROR: neither 'hf' nor 'huggingface-cli' found on host. Install with:" >&2
+        echo "  pip install huggingface_hub" >&2
+        exit 1
+    fi
+    if ! HF_HOME="$HF_CACHE_DIR/huggingface" \
+         HF_TOKEN="${HF_TOKEN:-}" \
+         "${HF_CLI[@]}" "$MODEL_NAME"; then
+        echo "ERROR: ${HF_CLI[*]} failed for '$MODEL_NAME'." >&2
+        echo "  Check HF_TOKEN, model id, and network access." >&2
+        exit 1
+    fi
+    echo "  Pre-warm complete."
+fi
+# ---------------------------------------------------------------------------
+
+num_instances=$(grep -c "container_name: ${VLLM_NAME_PREFIX:-vllm-instance}-" "$COMPOSE_FILE")
+echo "--- Starting $num_instances vLLM instances + NGINX ---"
+
+# Pre-create the compose network to avoid a known podman-compose race condition
+# (multiple services in the same compose file all call "podman network exists"
+# concurrently, all see "no", all try to create, only the first succeeds and
+# the rest fail with exit 125 "already exists"). Pre-creating it once here,
+# single-threaded, before `podman-compose up` means compose always finds it
+# already present and never races.
+#
+# podman < 4.1 has no `network create --ignore` flag, so we must not rely on it.
+# Instead we guard creation with an explicit `network exists` check, which works
+# on every podman version (the stale-subnet block above already ensures any
+# leftover network has the right subnet, or has been removed).
+COMPOSE_PROJECT="$(basename "$GENERATED_DIR")"
+COMPOSE_NETWORK="${COMPOSE_PROJECT}_vllm-network"
+# Pull the subnet straight out of the generated compose so the pre-created
+# network matches the static IPs assigned to each service. Without a matching
+# subnet, podman-compose's ipv4_address assignments would fall outside the
+# network and fail. Static IPs are required because rootless aardvark-dns is
+# unreachable here, so NGINX must reach instances by IP, not hostname.
+COMPOSE_SUBNET="$(grep -E '^[[:space:]]+- subnet:' "$COMPOSE_FILE" | head -1 | awk '{print $NF}')"
+SUBNET_ARG=()
+if [[ -n "$COMPOSE_SUBNET" ]]; then
+    SUBNET_ARG=(--subnet "$COMPOSE_SUBNET")
+fi
+# A network left behind by a crashed/killed run may carry a DIFFERENT subnet
+# than the one the current compose draws its static IPs from. Reusing such a
+# stale network would make every container fail with "requested static ip ...
+# not in any subnet", so a network whose subnet does not match is removed and
+# recreated further down.
+#
+# Where the subnet is reported depends on podman's network backend:
+#   - netavark (podman >= 4): top-level `.Subnets` (.Subnet field)
+#   - CNI (podman 3.x):       `.plugins[].ipam.ranges[][].subnet`
+# network_subnet prints the first non-empty answer, netavark template first.
+network_subnet() {
+    local net="$1" tmpl found=""
+    for tmpl in \
+        '{{range .Subnets}}{{.Subnet}}{{end}}' \
+        '{{range .plugins}}{{range .ipam.ranges}}{{range .}}{{.subnet}}{{end}}{{end}}{{end}}'; do
+        found="$(podman network inspect "$net" --format "$tmpl" 2>/dev/null)"
+        [[ -n "$found" ]] && break
+    done
+    printf '%s' "$found"
+}
+
+if podman network exists "$COMPOSE_NETWORK" 2>/dev/null; then
+    existing_subnet="$(network_subnet "$COMPOSE_NETWORK")"
+    if [[ -n "$COMPOSE_SUBNET" && "$existing_subnet" != "$COMPOSE_SUBNET" ]]; then
+        echo "  Stale network $COMPOSE_NETWORK has subnet '${existing_subnet:-none}'," \
+             "expected '$COMPOSE_SUBNET' -- removing it."
+        podman network rm -f "$COMPOSE_NETWORK" >/dev/null 2>&1 || true
+    fi
+fi
+
+# Create only if absent. `network exists` + create is portable across all podman
+# versions (podman < 4.1 lacks `network create --ignore`). The pre-create is
+# single-threaded here, so there is no race with the check. SUBNET_ARG may be
+# empty; the ${arr[@]+...} form keeps that safe under `set -u` on bash < 4.4.
+if ! podman network exists "$COMPOSE_NETWORK" 2>/dev/null; then
+    podman network create \
+        --label "io.podman.compose.project=${COMPOSE_PROJECT}" \
+        --label "com.docker.compose.project=${COMPOSE_PROJECT}" \
+        --driver bridge ${SUBNET_ARG[@]+"${SUBNET_ARG[@]}"} \
+        "$COMPOSE_NETWORK" >/dev/null
+fi
+
+# On the CNI backend (podman 3.x) the conflist podman just wrote may declare a
+# cniVersion newer than the host's containernetworking-plugins support. The
+# bridge/portmap plugins then reject it and every container silently falls back
+# to the default podman net (10.88.x), losing the static IPs nginx.conf routes
+# to -> every LB request 504s and the health wait below would hang the full
+# timeout. Downgrade the conflist to a broadly-supported cniVersion so the
+# static IPs hold. (0.4.0 is accepted by both pre-1.0 and 1.x plugins.)
+downgrade_cni_version() {
+    local net="$1" backend dir f
+    backend="$(podman info --format '{{.Host.NetworkBackend}}' 2>/dev/null || echo "")"
+    # .Host.NetworkBackend only exists on podman >= 4 (empty on podman 3.x), so
+    # do NOT guard on `== "cni"`: that would skip the downgrade on exactly the
+    # podman-3.x hosts that need it. Bail only on an explicit netavark backend;
+    # for cni / empty / unknown we proceed, and the `-f "$f"` conflist check
+    # below makes this a safe no-op when there's no CNI conflist to rewrite.
+    [[ "$backend" == "netavark" ]] && return 0
+    if [[ "$(id -u)" -eq 0 ]]; then
+        dir="/etc/cni/net.d"
+    else
+        dir="${XDG_CONFIG_HOME:-$HOME/.config}/cni/net.d"
+    fi
+    f="$dir/$net.conflist"
+    [[ -f "$f" ]] || return 0
+    if grep -q '"cniVersion"[[:space:]]*:[[:space:]]*"1\.' "$f"; then
+        echo "  CNI backend: downgrading $net cniVersion -> 0.4.0 (host plugins are pre-1.0)."
+        sed -i 's/"cniVersion"[[:space:]]*:[[:space:]]*"1\.[0-9.]*"/"cniVersion": "0.4.0"/' "$f" || true
+    fi
+}
+downgrade_cni_version "$COMPOSE_NETWORK"
+
+cd "$GENERATED_DIR"
+podman-compose up -d
+cd "$SCRIPT_DIR"
+
+echo ""
+echo "--- Container status ---"
+podman ps --filter "name=vllm-" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
+
+if $NO_WAIT; then
+    echo ""
+    echo "Started without waiting for health checks (--no-wait)."
+    echo "Run ./test-setup.sh to verify the setup once instances are ready."
+    exit 0
+fi
+
+echo ""
+echo "--- Waiting for all instances to become healthy (timeout: ${HEALTH_TIMEOUT}s) ---"
+echo "    vLLM model loading can take several minutes..."
+
+# Required containers gate the wait loop. nginx-lb is shown for visibility
+# only -- its self-healthcheck (wget through the upstream) can be flaky
+# while vLLM workers are still warming up, even when the proxy is working.
+required_containers=()
+for i in $(seq 1 "$num_instances"); do
+    required_containers+=("${VLLM_NAME_PREFIX:-vllm-instance}-$i")
+done
+optional_containers=("${VLLM_NGINX_NAME:-vllm-nginx-lb}")
+
+# Guard against the CNI fallback: if a vLLM container didn't get its assigned
+# static IP from our subnet, it landed on the default podman net and NGINX
+# (which routes by static IP) will 504 every request -> the health wait below
+# would hang the whole timeout. Detect it immediately and fail with guidance.
+if [[ -n "$COMPOSE_SUBNET" ]]; then
+    subnet_net="${COMPOSE_SUBNET%/*}"      # 10.201.0.0/24 -> 10.201.0.0
+    expected_prefix="${subnet_net%.*}."     # -> 10.201.0.
+    bad_ip=false
+    for c in "${required_containers[@]}"; do
+        cip="$(podman inspect \
+            --format '{{range .NetworkSettings.Networks}}{{.IPAddress}} {{end}}' \
+            "$c" 2>/dev/null || true)"
+        if [[ " $cip" != *" $expected_prefix"* ]]; then   # leading space anchors the prefix at the start of an address (110.201.0.5 must not match 10.201.0.)
+            echo "ERROR: $c is not on the expected subnet $COMPOSE_SUBNET (IP: '${cip:-none}')." >&2
+            bad_ip=true
+        fi
+    done
+    if $bad_ip; then
+        echo "" >&2
+        echo "Containers fell back off their static IPs. NGINX routes by static IP, so" >&2
+        echo "every load-balanced request would 504 and the health wait would hang." >&2
+        echo "Most common cause on podman 3.x: a CNI cniVersion the host plugins reject" >&2
+        echo "(start.sh tries to auto-downgrade it; if you still see this, the plugins" >&2
+        echo "are too old). Remedies:" >&2
+        echo "  - upgrade 'containernetworking-plugins' on the host, or" >&2
+        echo "  - inspect: podman network inspect $COMPOSE_NETWORK" >&2
+        echo "Aborting now instead of waiting ${HEALTH_TIMEOUT}s." >&2
+        exit 1
+    fi
+fi
+
+start_time=$(date +%s)
+
+while true; do
+    elapsed=$(( $(date +%s) - start_time ))
+    if [[ $elapsed -ge $HEALTH_TIMEOUT ]]; then
+        echo ""
+        echo "ERROR: Timed out after ${HEALTH_TIMEOUT}s waiting for vLLM instances to become healthy."
+        echo "Check logs with: podman logs <container-name>"
+        for c in "${required_containers[@]}" "${optional_containers[@]}"; do
+            status=$(container_health "$c")
+            echo "  $c: $status"
+        done
+        exit 1
+    fi
+
+    # Fast-fail: if a required container has exited/died (e.g. cpuset error,
+    # OOM, bad model/image), abort now instead of waiting out the full timeout.
+    for c in "${required_containers[@]}"; do
+        state=$(podman inspect --format '{{.State.Status}}' "$c" 2>/dev/null || echo "missing")
+        if [[ "$state" == "exited" || "$state" == "dead" ]]; then
+            ec=$(podman inspect --format '{{.State.ExitCode}}' "$c" 2>/dev/null || echo "?")
+            echo ""
+            echo "ERROR: $c is '$state' (exit code $ec) -- it will never become healthy." >&2
+            echo "Last 25 log lines:" >&2
+            podman logs --tail 25 "$c" 2>&1 | sed 's/^/    /' >&2 || true
+            echo "Aborting now instead of waiting ${HEALTH_TIMEOUT}s." >&2
+            exit 1
+        fi
+    done
+
+    all_healthy=true
+    status_line=""
+    for c in "${required_containers[@]}"; do
+        health=$(container_health "$c")
+        short_name="${c#vllm-}"
+        if [[ "$health" == "healthy" ]]; then
+            status_line="$status_line [$short_name:OK]"
+        else
+            status_line="$status_line [$short_name:$health]"
+            all_healthy=false
+        fi
+    done
+    for c in "${optional_containers[@]}"; do
+        health=$(container_health "$c")
+        short_name="${c#vllm-}"
+        # Suffix with '*' so it's visually obvious this one is informational.
+        status_line="$status_line [$short_name:${health}*]"
+    done
+
+    printf "\r  [%3ds]%s" "$elapsed" "$status_line"
+
+    if $all_healthy; then
+        echo ""
+        echo ""
+        echo "All vLLM instances are healthy. (nginx-lb status is informational only.)"
+        break
+    fi
+
+    sleep "$HEALTH_INTERVAL"
+done
+
+echo ""
+echo "--- Final status ---"
+podman ps --filter "name=vllm-" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
+echo ""
+echo "NGINX endpoint: http://localhost:${NGINX_PORT:-8080}"
+echo "Run ./test-setup.sh to validate the full setup."
diff --git a/skills/vllm-multiinstance/harness/stop.sh b/skills/vllm-multiinstance/harness/stop.sh
new file mode 100755
index 0000000..16cb766
--- /dev/null
+++ b/skills/vllm-multiinstance/harness/stop.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+GENERATED_DIR="$SCRIPT_DIR/generated"
+COMPOSE_FILE="$GENERATED_DIR/docker-compose.yml"
+
+usage() {  # Print the CLI help to stdout, then exit 0.
+    cat <<EOF
+Usage: $0 [OPTIONS]
+
+Stop the multi-instance vLLM + NGINX stack.
+
+Options:
+  --clean         Remove volumes (model caches) after stopping
+  --purge         Remove volumes AND delete generated config files
+  --log-dir DIR   Directory to dump container logs into before teardown.
+                  Defaults to \$LOG_DIR/container-logs if set, else
+                  sweep-logs/<timestamp>/container-logs/.
+                  Pass --no-logs to skip the dump entirely.
+  --no-logs       Skip dumping container logs before teardown.
+  -h, --help      Show this help
+EOF
+    exit 0
+}
+
+CLEAN=false       # --clean / --purge: also remove volumes on teardown
+PURGE=false       # --purge: additionally delete the generated config directory
+CLI_LOG_DIR=""    # --log-dir DIR: explicit destination for the container-log dump
+DUMP_LOGS=true    # --no-logs clears this to skip the dump
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --clean) CLEAN=true; shift ;;
+        --purge) PURGE=true; CLEAN=true; shift ;;
+        --log-dir) CLI_LOG_DIR="${2:?--log-dir requires a DIR argument}"; shift 2 ;;  # clear error instead of 'unbound variable'
+        --no-logs) DUMP_LOGS=false; shift ;;
+        -h|--help) usage ;;
+        *) echo "Unknown option: $1" >&2; (usage) >&2; exit 1 ;;  # usage exits 0, so run it in a subshell and fail here
+    esac
+done
+
+# ----------------------------------------------------------------------------
+# Dump podman logs for every stack container BEFORE we stop/remove them.
+# Matches vllm-* and nginx* containers (the names used by generate-config.sh).
+# Includes stopped containers so we still capture exit logs.
+# ----------------------------------------------------------------------------
+dump_container_logs() {
+    # Honour --no-logs: nothing to do.
+    $DUMP_LOGS || return 0
+
+    # Names of every matching container (running or not), de-duplicated.
+    local names
+    names=$(podman ps -a \
+        --filter "name=vllm-" \
+        --filter "name=nginx" \
+        --format "{{.Names}}" | sort -u)
+
+    if [[ -z "$names" ]]; then
+        echo "  No vllm-* or nginx* containers found -- skipping log dump."
+        return 0
+    fi
+
+    # Destination precedence: --log-dir, then $LOG_DIR, then a timestamped default.
+    local dest
+    if [[ -n "$CLI_LOG_DIR" ]]; then
+        dest="$CLI_LOG_DIR"
+    elif [[ -n "${LOG_DIR:-}" ]]; then
+        dest="$LOG_DIR/container-logs"
+    else
+        dest="$SCRIPT_DIR/sweep-logs/$(date +%Y%m%d-%H%M%S)/container-logs"
+    fi
+    mkdir -p "$dest"
+
+    echo "--- Dumping container logs to $dest ---"
+    local name target bytes
+    for name in $names; do
+        target="$dest/${name}.log"
+        if ! podman logs "$name" > "$target" 2>&1; then
+            echo "  WARNING: failed to dump logs for $name (see $target)"
+            continue
+        fi
+        bytes=$(wc -c < "$target" | tr -d ' ')
+        echo "  saved $target ($bytes bytes)"
+    done
+}
+
+dump_container_logs  # capture logs first: the teardown below removes the containers
+
+if [[ ! -f "$COMPOSE_FILE" ]]; then  # no generated compose file: fall back to stopping containers by name
+    echo "No compose file found at $COMPOSE_FILE."
+    echo "Attempting to stop containers by name..."
+    for c in $(podman ps -a --filter "name=vllm-" --format "{{.Names}}"); do
+        echo "  Stopping $c..."
+        podman stop "$c" --timeout 10 2>/dev/null || true  # best-effort: errors are ignored
+        podman rm "$c" 2>/dev/null || true
+    done
+    echo "Done."
+    exit 0  # NOTE: --clean/--purge handling and the network cleanup below are not reached on this path
+fi
+
+echo "--- Stopping multi-instance vLLM stack ---"
+
+cd "$GENERATED_DIR"  # podman-compose is invoked from the directory that holds docker-compose.yml
+
+if $CLEAN; then
+    echo "  Stopping containers and removing volumes..."
+    podman-compose down -v
+else
+    echo "  Stopping containers (volumes preserved)..."
+    podman-compose down
+fi
+
+cd "$SCRIPT_DIR"
+
+# Explicitly remove the compose network. `podman-compose down` leaves it behind
+# when a run was killed mid-flight, and a stale network with a mismatched subnet
+# breaks the next start.sh ("requested static ip ... not in any subnet").
+COMPOSE_NETWORK="$(basename "$GENERATED_DIR")_vllm-network"  # i.e. "<generated dir name>_vllm-network"
+if podman network exists "$COMPOSE_NETWORK" 2>/dev/null; then
+    echo "  Removing network $COMPOSE_NETWORK..."
+    podman network rm -f "$COMPOSE_NETWORK" >/dev/null 2>&1 || true  # best-effort: a failed removal is ignored
+fi
+
+echo ""
+echo "--- Remaining vLLM containers ---"
+remaining=$(podman ps -a --filter "name=vllm-" --format "{{.Names}}" | wc -l)  # count of leftover vllm-* containers, any state
+if [[ "$remaining" -eq 0 ]]; then
+    echo "  None (clean shutdown)."
+else
+    podman ps -a --filter "name=vllm-" --format "table {{.Names}}\t{{.Status}}"
+fi
+
+if $PURGE; then  # --purge: delete the generated config directory after the stack is down
+    echo ""
+    echo "--- Removing generated config ---"
+    rm -rf "$GENERATED_DIR"
+    echo "  Deleted $GENERATED_DIR"
+fi
+
+echo ""
+echo "Stack stopped."
diff --git a/skills/vllm-multiinstance/harness/vllm-cpu-perf-eval.patch b/skills/vllm-multiinstance/harness/vllm-cpu-perf-eval.patch
new file mode 100644
index 0000000..39b6ed3
--- /dev/null
+++ b/skills/vllm-multiinstance/harness/vllm-cpu-perf-eval.patch
@@ -0,0 +1,339 @@
+diff --git a/automation/test-execution/ansible/roles/benchmark_guidellm/tasks/main.yml b/automation/test-execution/ansible/roles/benchmark_guidellm/tasks/main.yml
+index 96214b5..ef107be 100644
+--- a/automation/test-execution/ansible/roles/benchmark_guidellm/tasks/main.yml
++++ b/automation/test-execution/ansible/roles/benchmark_guidellm/tasks/main.yml
+@@ -84,6 +84,11 @@
+   ansible.builtin.set_fact:
+     processor_model: "{{ guidellm_processor | default(resolved_model) }}"
+ 
++- name: Detect local processor path (so we can bind-mount it into the guidellm container)
++  ansible.builtin.set_fact:
++    processor_is_local_path: "{{ processor_model is string and processor_model.startswith('/') }}"
++    processor_local_path: "{{ processor_model if (processor_model is string and processor_model.startswith('/')) else '' }}"
++
+ - name: Set results path
+   ansible.builtin.set_fact:
+     results_path: "{{ bench_config.results_dir }}/{{ resolved_model | replace('/', '__') }}/{{ workload_type }}-{{ test_run_id }}/{{ core_cfg.name }}"
+@@ -142,7 +147,7 @@
+       GUIDELLM_COOLDOWN: "{{ guidellm_cfg.cooldown }}"
+       GUIDELLM_OUTPUTS: "{{ guidellm_cfg.outputs }}"
+       HF_TOKEN: "{{ hf_token | default('') }}"
+-  no_log: true
++  no_log: false
+ 
+ - name: Build GUIDELLM_DATA for fixed workloads
+   ansible.builtin.set_fact:
+@@ -157,13 +162,13 @@
+ - name: Add GUIDELLM_DATA to environment
+   ansible.builtin.set_fact:
+     guidellm_env: "{{ guidellm_env | combine({'GUIDELLM_DATA': guidellm_data_string}) }}"
+-  no_log: true
++  no_log: false
+ 
+ - name: Add rate parameter for non-synchronous profiles
+   ansible.builtin.set_fact:
+     guidellm_env: "{{ guidellm_env | combine({'GUIDELLM_RATE': guidellm_cfg.rate | default([]) | join(',')}) }}"
+   when: guidellm_cfg.profile != 'synchronous'
+-  no_log: true
++  no_log: false
+ 
+ - name: Add API key to backend kwargs if configured
+   ansible.builtin.set_fact:
+@@ -171,7 +176,7 @@
+   when:
+     - vllm_api_key is defined
+     - vllm_api_key | length > 0
+-  no_log: true
++  no_log: false
+ 
+ - name: Build GuideLLM container name with actual NUMA configuration
+   ansible.builtin.set_fact:
+@@ -182,21 +187,26 @@
+   containers.podman.podman_container:
+     name: "{{ guidellm_container_name }}"
+     image: "{{ guidellm_cfg.container_image }}"
++    user: "0:0"
+     state: started
+     rm: false  # Don't auto-remove so we can check exit code
+     detach: true  # Run in background so we can stream logs
+     network: host
+     cpuset_cpus: "{{ omit if ansible_facts['system'] == 'Darwin' else guidellm_cfg.cpuset_cpus }}"
+     cpuset_mems: "{{ omit if ansible_facts['system'] == 'Darwin' else guidellm_cfg.cpuset_mems }}"
+-    volumes:
+-      - "{{ results_path }}:/results:z"
++    volumes: >-
++      {{
++        [results_path ~ ':/results:z']
++        + ([processor_local_path ~ ':' ~ processor_local_path ~ ':ro,z']
++           if processor_is_local_path else [])
++      }}
+     env: "{{ guidellm_env }}"
+     log_driver: journald
+     log_opt:
+       tag: "{{ guidellm_container_name }}"
+   register: guidellm_container
+   when: use_guidellm_container | bool
+-  no_log: true  # Prevent HF_TOKEN from appearing in logs
++  no_log: false  # Prevent HF_TOKEN from appearing in logs
+ 
+ - name: Set monitoring command based on connection type
+   ansible.builtin.set_fact:
+@@ -379,7 +389,7 @@
+   when:
+     - use_guidellm_container | bool
+     - container_exit_code.stdout is defined
+-  no_log: true
++  no_log: false
+ 
+ - name: Check benchmark exit code after diagnostics (containerized)
+   ansible.builtin.fail:
+@@ -524,7 +534,7 @@
+   register: guidellm_host_result
+   changed_when: true
+   failed_when: false
+-  no_log: true
++  no_log: false
+   when: not (use_guidellm_container | bool)
+ 
+ # ============================================================================
+diff --git a/automation/test-execution/ansible/roles/common/tasks/setup-vllm-api-key.yml b/automation/test-execution/ansible/roles/common/tasks/setup-vllm-api-key.yml
+index 8d40880..7bc1ce1 100644
+--- a/automation/test-execution/ansible/roles/common/tasks/setup-vllm-api-key.yml
++++ b/automation/test-execution/ansible/roles/common/tasks/setup-vllm-api-key.yml
+@@ -12,7 +12,7 @@
+       when:
+         - vllm_endpoint.external.api_key.source | default('value') == 'env'
+         - vllm_endpoint.external.api_key.env_var is defined
+-      no_log: true
++      no_log: false
+ 
+     - name: Get API key from file
+       ansible.builtin.slurp:
+@@ -21,7 +21,7 @@
+       when:
+         - vllm_endpoint.external.api_key.source | default('value') == 'file'
+         - vllm_endpoint.external.api_key.file_path is defined
+-      no_log: true
++      no_log: false
+ 
+     - name: Set API key from file content
+       ansible.builtin.set_fact:
+@@ -29,7 +29,7 @@
+       when:
+         - vllm_endpoint.external.api_key.source | default('value') == 'file'
+         - vllm_api_key_file_content is defined
+-      no_log: true
++      no_log: false
+ 
+     - name: Get API key from Ansible vault
+       ansible.builtin.set_fact:
+@@ -37,7 +37,7 @@
+       when:
+         - vllm_endpoint.external.api_key.source | default('value') == 'vault'
+         - vllm_endpoint.external.api_key.vault_var is defined
+-      no_log: true
++      no_log: false
+ 
+     - name: Prompt for API key
+       ansible.builtin.pause:
+@@ -46,7 +46,7 @@
+       register: vllm_api_key_prompt
+       when:
+         - vllm_endpoint.external.api_key.source | default('value') == 'prompt'
+-      no_log: true
++      no_log: false
+ 
+     - name: Set API key from prompt
+       ansible.builtin.set_fact:
+@@ -54,7 +54,7 @@
+       when:
+         - vllm_endpoint.external.api_key.source | default('value') == 'prompt'
+         - vllm_api_key_prompt is defined
+-      no_log: true
++      no_log: false
+ 
+     - name: Use direct value if no source specified
+       ansible.builtin.set_fact:
+@@ -62,7 +62,7 @@
+       when:
+         - vllm_endpoint.external.api_key.source | default('value') == 'value'
+         - vllm_endpoint.external.api_key.value is defined
+-      no_log: true
++      no_log: false
+ 
+     - name: Validate API key configuration
+       ansible.builtin.assert:
+@@ -76,7 +76,7 @@
+     - name: Set vLLM API key fact
+       ansible.builtin.set_fact:
+         vllm_api_key: "{{ vllm_api_key_resolved }}"
+-      no_log: true
++      no_log: false
+ 
+     - name: Display API key status
+       ansible.builtin.debug:
+diff --git a/automation/test-execution/ansible/roles/hf_token/tasks/main.yml b/automation/test-execution/ansible/roles/hf_token/tasks/main.yml
+index f128943..4e9c0fd 100644
+--- a/automation/test-execution/ansible/roles/hf_token/tasks/main.yml
++++ b/automation/test-execution/ansible/roles/hf_token/tasks/main.yml
+@@ -9,7 +9,7 @@
+   when:
+     - huggingface.token_source == 'env'
+     - hf_token is not defined
+-  no_log: true
++  no_log: false
+ 
+ - name: Get HuggingFace token from file
+   ansible.builtin.slurp:
+@@ -19,7 +19,7 @@
+     - huggingface.token_source == 'file'
+     - hf_token is not defined
+     - huggingface.token_file is defined
+-  no_log: true
++  no_log: false
+ 
+ - name: Set HuggingFace token from file content
+   ansible.builtin.set_fact:
+@@ -27,7 +27,7 @@
+   when:
+     - huggingface.token_source == 'file'
+     - hf_token_file_content is defined
+-  no_log: true
++  no_log: false
+ 
+ - name: Get HuggingFace token from Ansible vault
+   ansible.builtin.set_fact:
+@@ -36,7 +36,7 @@
+     - huggingface.token_source == 'vault'
+     - hf_token is not defined
+     - huggingface.token_vault is defined
+-  no_log: true
++  no_log: false
+ 
+ - name: Prompt for HuggingFace token
+   ansible.builtin.pause:
+@@ -46,7 +46,7 @@
+   when:
+     - huggingface.token_source == 'prompt'
+     - hf_token is not defined
+-  no_log: true
++  no_log: false
+ 
+ - name: Set HuggingFace token from prompt
+   ansible.builtin.set_fact:
+@@ -54,7 +54,7 @@
+   when:
+     - huggingface.token_source == 'prompt'
+     - hf_token_prompt is defined
+-  no_log: true
++  no_log: false
+ 
+ - name: Validate HuggingFace token is set
+   ansible.builtin.assert:
+@@ -68,7 +68,7 @@
+       3. Vault: Set huggingface.token_vault in inventory
+       4. Prompt: Set huggingface.token_source=prompt in inventory
+     success_msg: "✓ HuggingFace token configured"
+-  no_log: true
++  no_log: false
+ 
+ - name: Display token source (masked)
+   ansible.builtin.debug:
+diff --git a/automation/test-execution/ansible/roles/vllm_server/tasks/start-embedding.yml b/automation/test-execution/ansible/roles/vllm_server/tasks/start-embedding.yml
+index 64f7296..83894e7 100644
+--- a/automation/test-execution/ansible/roles/vllm_server/tasks/start-embedding.yml
++++ b/automation/test-execution/ansible/roles/vllm_server/tasks/start-embedding.yml
+@@ -32,7 +32,7 @@
+     vllm_env_vars:
+       VLLM_CPU_KVCACHE_SPACE: "{{ workload_cfg.kv_cache_space | extract_size_value }}"
+       HF_TOKEN: "{{ hf_token }}"
+-  no_log: true  # Prevent HF_TOKEN from appearing in logs
++  no_log: false  # Prevent HF_TOKEN from appearing in logs
+ 
+ - name: Add OMP_NUM_THREADS if specified (ONLY for TP > 1)
+   ansible.builtin.set_fact:
+@@ -42,7 +42,7 @@
+     - core_cfg.omp_num_threads is defined
+     - core_cfg.omp_num_threads is not none
+     - core_cfg.omp_num_threads | int > 0
+-  no_log: true  # Prevent HF_TOKEN from appearing in logs
++  no_log: false  # Prevent HF_TOKEN from appearing in logs
+ 
+ - name: Add VLLM_CPU_OMP_THREADS_BIND if specified (ONLY for TP > 1)
+   ansible.builtin.set_fact:
+@@ -51,7 +51,7 @@
+     - (core_cfg.tensor_parallel | default(1) | int) > 1
+     - core_cfg.omp_threads_bind is defined
+     - core_cfg.omp_threads_bind is not none
+-  no_log: true  # Prevent HF_TOKEN from appearing in logs
++  no_log: false  # Prevent HF_TOKEN from appearing in logs
+ 
+ - name: Pull vLLM container image
+   containers.podman.podman_image:
+@@ -99,7 +99,7 @@
+     - container_cfg.engine == 'podman'
+     - core_cfg.cpuset_cpus is defined
+   register: vllm_container_pinned
+-  no_log: true  # Prevent HF_TOKEN from appearing in logs
++  no_log: false  # Prevent HF_TOKEN from appearing in logs
+ 
+ - name: Start vLLM embedding server (no CPU pinning)
+   containers.podman.podman_container:
+@@ -129,7 +129,7 @@
+     - container_cfg.engine == 'podman'
+     - core_cfg.cpuset_cpus is not defined
+   register: vllm_container_no_pin
+-  no_log: true  # Prevent HF_TOKEN from appearing in logs
++  no_log: false  # Prevent HF_TOKEN from appearing in logs
+ 
+ - name: Set container result
+   ansible.builtin.set_fact:
+diff --git a/automation/test-execution/ansible/roles/vllm_server/tasks/start-llm.yml b/automation/test-execution/ansible/roles/vllm_server/tasks/start-llm.yml
+index 2fe33a5..5a8ecb4 100644
+--- a/automation/test-execution/ansible/roles/vllm_server/tasks/start-llm.yml
++++ b/automation/test-execution/ansible/roles/vllm_server/tasks/start-llm.yml
+@@ -157,7 +157,7 @@
+     vllm_env_vars:
+       VLLM_CPU_KVCACHE_SPACE: "{{ effective_kv_cache_space | extract_size_value }}"
+       HF_TOKEN: "{{ hf_token }}"
+-  no_log: true  # Prevent HF_TOKEN from appearing in logs
++  no_log: false  # Prevent HF_TOKEN from appearing in logs
+ 
+ - name: Add OMP_NUM_THREADS if specified (ONLY for TP > 1)
+   ansible.builtin.set_fact:
+@@ -167,7 +167,7 @@
+     - core_cfg.omp_num_threads is defined
+     - core_cfg.omp_num_threads is not none
+     - core_cfg.omp_num_threads | int > 0
+-  no_log: true  # Prevent HF_TOKEN from appearing in logs
++  no_log: false  # Prevent HF_TOKEN from appearing in logs
+ 
+ - name: Add VLLM_CPU_OMP_THREADS_BIND if specified (ONLY for TP > 1)
+   ansible.builtin.set_fact:
+@@ -176,7 +176,7 @@
+     - (core_cfg.tensor_parallel | default(1) | int) > 1
+     - core_cfg.omp_threads_bind is defined
+     - core_cfg.omp_threads_bind is not none
+-  no_log: true  # Prevent HF_TOKEN from appearing in logs
++  no_log: false  # Prevent HF_TOKEN from appearing in logs
+ 
+ - name: Detect AMD ZenDNN container for OMP binding
+   ansible.builtin.set_fact:
+@@ -188,7 +188,7 @@
+   when:
+     - is_zendnn_container_early
+     - core_cfg.cpuset_cpus is defined
+-  no_log: true  # Prevent HF_TOKEN from appearing in logs
++  no_log: false  # Prevent HF_TOKEN from appearing in logs
+ 
+ - name: Build base vLLM arguments from workload
+   ansible.builtin.set_fact:
+@@ -318,7 +318,7 @@
+       tag: "vllm-{{ workload_type }}-{{ core_cfg.cores }}c-tp{{ core_cfg.tensor_parallel }}"
+   when: container_cfg.engine == 'podman'
+   register: vllm_container
+-  no_log: true  # Prevent HF_TOKEN from appearing in logs
++  no_log: false  # Prevent HF_TOKEN from appearing in logs
+ 
+ - name: Display vLLM container info
+   ansible.builtin.debug:
diff --git a/skills/vllm-multiinstance/reference.md b/skills/vllm-multiinstance/reference.md
new file mode 100644
index 0000000..25c5462
--- /dev/null
+++ b/skills/vllm-multiinstance/reference.md
@@ -0,0 +1,144 @@
+# vllm-multiinstance — command reference (replay log)
+
+Concrete, copy-pasteable commands for an end-to-end run: benchmarking a vLLM CPU
+image across instance counts and concurrency rates, measuring memory footprint +
+end-to-end performance.
+
+This is a history, not a tutorial — see `SKILL.md` for the why. The benchmark
+harness is **vendored in this skill** (`harness/`); the only thing you supply is a
+container image and a model. Commands below assume you `cd` into the skill dir
+first:
+```bash
+cd <repo>/skills/vllm-multiinstance      # all paths below are relative to here
+```
+
+Default image used throughout:
+```bash
+IMAGE=amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23
+```
+To benchmark a custom image, build it however you like and point `VLLM_IMAGE` at
+it — the harness benchmarks whatever image you give it. For a native (non-zentorch)
+A/B on the same image, pass `NATIVE=1`; no separate build is needed.
+
+---
+
+## 1. Check the hardware
+
+```bash
+python3 scripts/detect.py
+# -> physical_cores, sockets, numa_nodes, memory_gb, ...
+```
+Size the sweep: `CORES_PER_INSTANCE=32` fixed,
+`NUM_INSTANCES = floor((physical_cores - 16) / 32)`, single socket. (128 cores → 3
+instances.) Also sanity-check `df -h /` — set `BENCH_ROOT` to a roomy fs if root is
+tight.
+
+---
+
+## 2. One-time harness setup
+
+```bash
+# Clone + patch the external ansible/guidellm automation into harness/.
+bash scripts/setup-harness.sh         # idempotent
+
+# Pre-warm the model into a shared HF cache (offline runs need it on disk).
+# Use whatever Python env has huggingface-cli / hf (e.g. conda activate base).
+HF_HOME=$HOME/.cache/hf-shared/huggingface hf download Qwen/Qwen3-0.6B
+```
+
+The patch applied by setup-harness.sh adds the rootless guidellm `user: "0:0"` fix
+and the `/tmp → BENCH_TMPDIR` redirect to the ansible automation.
+
+---
+
+## 3. Pre-flight
+
+```bash
+nproc; lscpu | grep -E "Socket|Core|NUMA|Model name"   # enough physical cores? 1 socket?
+podman ps --format '{{.Names}} {{.Status}}' | grep -i vllm   # any stack pinning your cores?
+df -h /                                                  # root full? BENCH_ROOT on a roomy fs sidesteps it
+
+# Stop a stale stack if present (by name):
+for c in bench-vllm-instance-1 bench-vllm-instance-2 bench-vllm-instance-3 bench-vllm-nginx-lb; do
+  podman rm -f "$c" 2>/dev/null; done
+
+# Dry-run validates preflight + ansible path + env wiring (no containers):
+VLLM_IMAGE="$IMAGE" NUM_INSTANCES=3 CORES_PER_INSTANCE=32 HF_TOKEN=offline \
+  HF_CACHE_DIR="$HOME/.cache/hf-shared" \
+  harness/run_sweep.sh --dry-run -m "Qwen/Qwen3-0.6B | qwen3-0.6b"
+```
+
+---
+
+## 4. Run the sweep
+
+`scripts/run_combo.sh` is env-driven: set `LABEL`, `VLLM_IMAGE`, `MODEL` per run.
+Define the matrix as a data table and loop — no script edits.
+
+```bash
+mkdir -p results          # MUST exist before any nohup redirect or the job dies
+
+IMAGE=amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23
+MODEL="Qwen/Qwen3-0.6B | qwen3-0.6b"
+# Each row: "label | image | extra-env"  (extra-env e.g. NATIVE=1)
+MATRIX=(
+  "zentorch | $IMAGE | "
+  "native   | $IMAGE | NATIVE=1"
+)
+
+cat > run_sweep_all.sh <<EOF
+#!/bin/bash
+set -uo pipefail
+cd "$PWD"
+MODEL="$MODEL"
+MATRIX=( $(printf '"%s" ' "${MATRIX[@]}") )
+for row in "\${MATRIX[@]}"; do
+    IFS='|' read -r label image extra <<<"\$row"
+    label="\${label// /}"; image="\${image// /}"
+    echo "############ STARTING \$label ############"
+    env \$extra LABEL="\$label" VLLM_IMAGE="\$image" MODEL="\$MODEL" \\
+        bash scripts/run_combo.sh > "results/run_\${label}.out" 2>&1
+    echo "############ FINISHED \$label rc=\$? ############"
+    grep "^PEAK" "results/mem_\${label}.csv" 2>/dev/null || echo "no peak for \$label"
+done
+echo "ALL_DONE"
+EOF
+nohup bash run_sweep_all.sh > results/run_sweep_all.out 2>&1 &
+while ! grep -q ALL_DONE results/run_sweep_all.out; do sleep 60; done   # wait on sentinel, don't poll
+
+# One combo at a different concurrency — RUN_TAG keeps outputs separate:
+LABEL=zentorch VLLM_IMAGE="$IMAGE" MODEL="$MODEL" \
+  GUIDELLM_RATES="[96]" RUN_TAG="_c96" \
+  bash scripts/run_combo.sh > results/run_zentorch_c96.out 2>&1
+```
+
+---
+
+## 5. Collect scores — from guidellm.log (authoritative)
+
+```bash
+R=harness/vllm-cpu-perf-eval/results/llm/Qwen__Qwen3-0.6B
+
+# Result dirs newest-first; runs may share a test_name → disambiguate by timestamp.
+ls -1dt "$R"/chat-*
+
+# Perf per run (server-aggregate throughput + median latency):
+python3 scripts/parse_guidellm_log.py "$R/chat-<ts>-<test_name>/external-endpoint/guidellm.log"
+# conc  req/s  in_tok/s  out_tok/s  tot_tok/s  lat_s  TTFT_ms  ITL_ms  TPOT_ms
+
+# Peak memory per run (<label> = the LABEL you passed, + RUN_TAG if any):
+grep "^PEAK" results/mem_<label>.csv
+# PEAK label=<label> instances=3 agg_mem_bytes=... agg_mem_human=...
+
+# Sanity: every run must be Failed : 0, no ENOSPC/fatal in guidellm.log
+grep -E "Failed +:" results/run_*.out
+grep -rliE "no space|fatal|traceback" "$R"/*/external-endpoint/guidellm.log || echo "clean"
+
+# Cross-check each run's image (which image actually ran):
+grep -hE "VLLM_IMAGE=" results/run_<label>.out | head
+```
+
+Do NOT report throughput from `benchmarks.json` — its
+`requests_per_second`/`output_tokens_per_second` are per-request medians and
+understate server throughput. `extract_perf.py` parses the JSON as a
+fallback/cross-check only.
diff --git a/skills/vllm-multiinstance/scripts/Dockerfile.buildB b/skills/vllm-multiinstance/scripts/Dockerfile.buildB
new file mode 100644
index 0000000..72ca0f1
--- /dev/null
+++ b/skills/vllm-multiinstance/scripts/Dockerfile.buildB
@@ -0,0 +1,29 @@
+# Build B: v0.23.0 zentorch image + zentorch wheel rebuilt from ZenDNN PR #532.
+# Same vLLM (0.23.0) / torch (2.11.0) as Build A; only the ZenDNN/zentorch
+# native lib changes, so combos 3-5 differ from combos 1-2 by exactly PR #532.
+ARG BASE_IMAGE=mkmhub.amd.com/sw-aig-zendnn-vllm-zentorch-dev/vllm_v0.23.0_zentorch_v2.11.0.2:ubuntu22.04_v2.11.0.2
+FROM ${BASE_IMAGE}
+
+# Build toolchain for the plugin (image is runtime-only; needs compilers + cmake). No trailing `|| true`: a failed install must fail the build.
+USER root
+RUN apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+        gcc-13 g++-13 make cmake git libnuma-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+# Plugin build copies ZenDNN from its sibling parent dir when
+# ZENTORCH_USE_LOCAL_ZENDNN=1 (see cmake/modules/zendnnl.cmake). Place both as
+# siblings under /src so ${PLUGIN_PARENT_DIR}/ZenDNN resolves.
+ENV ZENTORCH_USE_LOCAL_ZENDNN=1
+COPY ZenDNN /src/ZenDNN
+COPY ZenDNN_PyTorch_Plugin /src/ZenDNN_PyTorch_Plugin
+
+# Rebuild the wheel from PR #532 source and force-reinstall over the image's
+# prebuilt zentorch (same py312 venv already on PATH).
+RUN cd /src/ZenDNN_PyTorch_Plugin && \
+    rm -rf build dist third_party/ZenDNN *.egg-info && \
+    python setup.py bdist_wheel && \
+    pip install --force-reinstall --no-deps dist/*.whl && \
+    python -c "import zentorch, torch, vllm; print('zentorch', zentorch.__version__, '| torch', torch.__version__, '| vllm', vllm.__version__)"
+
+WORKDIR /workspace
+CMD ["/bin/bash"]
diff --git a/skills/vllm-multiinstance/scripts/detect.py b/skills/vllm-multiinstance/scripts/detect.py
new file mode 100644
index 0000000..3fccbaa
--- /dev/null
+++ b/skills/vllm-multiinstance/scripts/detect.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+"""
+Detect the local CPU hardware for the multi-instance vLLM benchmark.
+
+Usage:
+    python3 scripts/detect.py
+
+Output: JSON with cpu_model, is_amd_epyc, epyc_generation
+(Naples/Rome/Milan/Genoa/Bergamo/Siena/Turin), zen_arch, avx512, physical_cores,
+logical_cores, sockets, threads_per_core, numa_nodes, memory_gb. Exits 0 on
+success, 1 if no CPU info could be read.
+
+Use physical_cores to size the sweep: with CORES_PER_INSTANCE=32 fixed, run
+NUM_INSTANCES = floor((physical_cores - 16) / 32) on a single socket.
+"""
+
+import json
+import re
+import subprocess
+import sys
+
+
+def _run(cmd, timeout=20):
+    r = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
+                       stderr=subprocess.PIPE, text=True, timeout=timeout)
+    return r.returncode, r.stdout, r.stderr
+
+
+def _lscpu_field(lscpu_out, label):
+    m = re.search(rf"^{re.escape(label)}:\s*(.+)$", lscpu_out, re.MULTILINE)
+    return m.group(1).strip() if m else ""
+
+
+def _epyc_generation(model):
+    """Map an AMD EPYC model name to (generation, zen_arch).
+
+    EPYC numbering encodes the generation: 7xx1=Naples (Zen1), 7xx2=Rome (Zen2),
+    7xx3=Milan (Zen3), 8xx4=Siena (Zen4c), 97x4=Bergamo (Zen4c), 9xx4=Genoa (Zen4),
+    9xx5=Turin (Zen5)."""
+    m = re.search(r"EPYC\s+(\d{4})", model.upper())
+    if not m:
+        return "unknown", "unknown"
+    num = m.group(1)
+    first, last = num[0], num[3]
+    if first == "7":
+        return {"1": ("Naples", "Zen1"), "2": ("Rome", "Zen2"),
+                "3": ("Milan", "Zen3")}.get(last, ("unknown", "unknown"))
+    if first == "8" and last == "4":
+        return "Siena", "Zen4c"
+    if first == "9":
+        if num.startswith("97") and last == "4":
+            return "Bergamo", "Zen4c"
+        if last == "4":
+            return "Genoa", "Zen4"
+        if last == "5":
+            return "Turin", "Zen5"
+    return "unknown", "unknown"
+
+
+def main():
+    rc, lscpu_out, err = _run("lscpu")
+    if rc != 0 or not lscpu_out:
+        print(json.dumps({"error": "lscpu failed",
+                          "detail": err.strip() or f"exit {rc}"}))
+        sys.exit(1)
+
+    model = _lscpu_field(lscpu_out, "Model name") or "unknown"
+    vendor = _lscpu_field(lscpu_out, "Vendor ID")
+
+    def _int(label, default=0):
+        v = _lscpu_field(lscpu_out, label)
+        try:
+            return int(v)
+        except ValueError:
+            return default
+
+    sockets = _int("Socket(s)", 1)
+    cores_per_socket = _int("Core(s) per socket", 0)
+    threads_per_core = _int("Thread(s) per core", 1) or 1
+    numa_nodes = _int("NUMA node(s)", 1)
+
+    rc, nproc_out, _ = _run("nproc --all")
+    try:
+        logical = int(nproc_out.strip())
+    except (ValueError, AttributeError):
+        logical = sockets * cores_per_socket * threads_per_core
+
+    physical = sockets * cores_per_socket if cores_per_socket else logical // threads_per_core
+
+    rc, mem_out, _ = _run("grep MemTotal /proc/meminfo")
+    mem_kb = 0
+    m = re.search(r"(\d+)", mem_out or "")
+    if m:
+        mem_kb = int(m.group(1))
+    memory_gb = mem_kb // (1024 * 1024)
+
+    is_epyc = vendor == "AuthenticAMD" and "EPYC" in model.upper()
+    generation, zen_arch = _epyc_generation(model)
+    avx512 = "avx512f" in _lscpu_field(lscpu_out, "Flags").split()
+
+    print(json.dumps({
+        "cpu_model": model,
+        "vendor": vendor,
+        "is_amd_epyc": is_epyc,
+        "epyc_generation": generation,
+        "zen_arch": zen_arch,
+        "avx512": avx512,
+        "logical_cores": logical,
+        "physical_cores": physical,
+        "sockets": sockets,
+        "threads_per_core": threads_per_core,
+        "numa_nodes": numa_nodes,
+        "memory_gb": memory_gb,
+    }, indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/vllm-multiinstance/scripts/extract_perf.py b/skills/vllm-multiinstance/scripts/extract_perf.py
new file mode 100644
index 0000000..b014c94
--- /dev/null
+++ b/skills/vllm-multiinstance/scripts/extract_perf.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+"""Extract headline guidellm metrics from a benchmarks.json.
+usage: extract_perf.py <benchmarks.json>
+Prints one line per benchmark (rate): rate, completed, req/s, out_tok/s, TTFT_ms(median), ITL_ms(median).
+"""
+import json, sys
+
+def stat(metric, field="median"):
+    """metric is a dict like {'successful': {'median':..}, ...} or {'median':..}."""
+    if metric is None:
+        return None
+    if isinstance(metric, dict):
+        # guidellm nests under 'successful' / 'total' sometimes
+        for key in ("successful", "total", "all"):
+            if key in metric and isinstance(metric[key], dict):
+                if field in metric[key]:
+                    return metric[key][field]
+        if field in metric:
+            return metric[field]
+    return None
+
+def g(d, *path):
+    cur = d
+    for p in path:
+        if isinstance(cur, dict) and p in cur:
+            cur = cur[p]
+        else:
+            return None
+    return cur
+
+f = sys.argv[1]
+d = json.load(open(f))
+bs = d["benchmarks"]
+print(f"{'rate':>6} {'completed':>10} {'req/s':>8} {'out_tok/s':>10} {'TTFT_ms':>9} {'ITL_ms':>8} {'TPOT_ms':>8}")
+for b in bs:
+    cfg = b.get("config", {}) or {}
+    # requested rate lives in config.strategy or args
+    strat = cfg.get("strategy") or {}
+    rate = strat.get("streams") or strat.get("max_concurrency") or strat.get("type_") if isinstance(strat, dict) else strat
+    m = b.get("metrics", {}) or {}
+    # request throughput
+    reqps = stat(m.get("requests_per_second"))
+    outtps = stat(m.get("output_tokens_per_second"))
+    ttft = stat(m.get("time_to_first_token_ms"))
+    itl = stat(m.get("inter_token_latency_ms"))
+    tpot = stat(m.get("time_per_output_token_ms"))
+    # request counts
+    rs = b.get("requests", {}) or {}
+    completed = None
+    for key in ("successful", "completed", "total"):
+        v = rs.get(key)
+        if isinstance(v, list):
+            completed = len(v); break
+        if isinstance(v, (int, float)):
+            completed = v; break
+    def fmt(x, p=2):
+        return f"{x:.{p}f}" if isinstance(x, (int, float)) else "n/a"
+    print(f"{str(rate):>6} {str(completed):>10} {fmt(reqps):>8} {fmt(outtps):>10} {fmt(ttft,1):>9} {fmt(itl,2):>8} {fmt(tpot,2):>8}")
diff --git a/skills/vllm-multiinstance/scripts/mem_poll.sh b/skills/vllm-multiinstance/scripts/mem_poll.sh
new file mode 100755
index 0000000..b531749
--- /dev/null
+++ b/skills/vllm-multiinstance/scripts/mem_poll.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# Poll `podman stats` for the (bench-)vllm-instance-* containers and track the peak
+# aggregate memory usage. Writes a CSV trace to <out_csv> and appends a final PEAK line.
+#   usage: mem_poll.sh <label> <out_csv> [interval_sec]
+# Runs until the flag file <out_csv>.run (created below) is deleted by the caller.
+set -uo pipefail
+LABEL="${1:?label}"        # echoed in the PEAK line
+OUT="${2:?out csv}"        # CSV path; truncated by the header write below
+INTERVAL="${3:-2}"         # argument passed to sleep between polls; defaults to 2
+FLAG="${OUT}.run"          # polling continues while this file exists
+touch "$FLAG"
+echo "ts_unix,n_instances,agg_mem_bytes,agg_mem_human,per_container" > "$OUT"
+
+# Convert podman's human MEM (e.g. "12.3GB", "512MB", "1.2kB") to bytes. podman prints
+# SI units (kB/MB/GB = powers of 1000); IEC spellings (KiB/MiB/GiB) are powers of 1024.
+# Unknown or missing units yield 0 so one odd sample never breaks the aggregate sum.
+to_bytes() {
+    local v="$1" num unit base=1000 exp
+    num="${v//[^0-9.]/}"; unit="${v//[0-9.]/}"
+    [[ "$unit" == ?iB ]] && { base=1024; unit="${unit/i/}"; }   # KiB -> KB, binary base
+    case "$unit" in
+        B) exp=0;; kB|KB) exp=1;; MB) exp=2;; GB) exp=3;; TB) exp=4;;
+        *) echo 0; return 0;;
+    esac
+    awk -v n="$num" -v b="$base" -v e="$exp" 'BEGIN{printf "%.0f", n*b^e}'
+}
+
+peak=0
+peak_human="0B"
+peak_n=0
+# Track the caller's VLLM_NAME_PREFIX (run_combo.sh exports it) as well as the two stock prefixes.
+name_re="^(${VLLM_NAME_PREFIX:-bench-vllm-instance}|bench-vllm-instance|vllm-instance)-"
+while [[ -f "$FLAG" ]]; do    # poll until the caller deletes the flag file
+    mapfile -t lines < <(podman stats --no-stream --format '{{.Name}} {{.MemUsage}}' 2>/dev/null | grep -E "$name_re" || true)
+    agg=0; n=0; per=""
+    for ln in "${lines[@]}"; do
+        name="${ln%% *}"
+        rest="${ln#* }"
+        used="${rest%% /*}"        # take left of " / " (usage, not limit)
+        used="${used// /}"
+        b=$(to_bytes "$used")
+        agg=$(( agg + b ))
+        n=$(( n + 1 ))
+        per="${per}${name}=${used};"
+    done
+    if (( n > 0 )); then
+        human=$(awk -v b="$agg" 'BEGIN{ split("B KB MB GB TB",u," "); i=1; while(b>=1024 && i<5){b/=1024;i++} printf "%.2f%s", b, u[i] }')
+        # ts_unix must be a real epoch: bash's printf %(...)T needs no external `date`; uptime is only a fallback.
+        printf -v ts '%(%s)T' -1 2>/dev/null || ts=$(cut -d' ' -f1 /proc/uptime 2>/dev/null || echo 0)
+        echo "${ts},${n},${agg},${human},${per}" >> "$OUT"
+        if (( agg > peak )); then peak=$agg; peak_human=$human; peak_n=$n; fi
+    fi
+    sleep "$INTERVAL"
+done
+# Final summary: appended to the CSV (with raw bytes) and echoed to stdout.
+human=$(awk -v b="$peak" 'BEGIN{ split("B KB MB GB TB",u," "); i=1; while(b>=1024 && i<5){b/=1024;i++} printf "%.2f%s", b, u[i] }')
+echo "PEAK label=${LABEL} instances=${peak_n} agg_mem_bytes=${peak} agg_mem_human=${human}" >> "$OUT"
+echo "PEAK label=${LABEL} instances=${peak_n} agg_mem_human=${human}"
diff --git a/skills/vllm-multiinstance/scripts/parse_guidellm_log.py b/skills/vllm-multiinstance/scripts/parse_guidellm_log.py
new file mode 100644
index 0000000..7ce47ec
--- /dev/null
+++ b/skills/vllm-multiinstance/scripts/parse_guidellm_log.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+"""Parse scores from a guidellm.log file (the ASCII summary tables).
+usage: parse_guidellm_log.py <guidellm.log>
+Pulls per-strategy rows from:
+  - Server Throughput Statistics (concurrency, req/s, in tok/s, out tok/s, total tok/s)
+  - Request Latency Statistics  (latency sec, TTFT ms, ITL ms, TPOT ms; medians)
+Strategies appear in benchmark order (e.g. concurrency 32 then 64 for the default rates).
+"""
+import re, sys
+
+def clean(line):
+    """Turn an ASCII table row such as "| a | b | c |" into its trimmed cells ['a', 'b', 'c']."""
+    return list(map(str.strip, line.strip().strip('|').split('|')))
+
+def find_section(lines, title):
+    """Return the index of the first line that contains *title*, or None when none does."""
+    # Lazy scan: the generator is consumed only up to the first matching line.
+    hits = (idx for idx, text in enumerate(lines) if title in text)
+    return next(hits, None)
+
+def data_rows(lines, start):
+    """Return the cleaned strategy rows found from `lines[start]` onward: '|' rows whose
+    first cell starts with a known strategy name; stop at a 'ℹ'/'✔' line once rows exist."""
+    collected = []
+    strategy_row = re.compile(r'\|\s*(concurrent|synchronous|throughput|constant|poisson)')
+    for raw in lines[start:]:
+        text = raw.strip()
+        if strategy_row.match(text):
+            collected.append(clean(raw))
+            continue
+        if collected and text.startswith(('ℹ', '✔')):
+            break
+    return collected
+
+f = sys.argv[1]
+with open(f, encoding='utf-8', errors='replace') as fh:  # close the log once it is read
+    lines = fh.read().splitlines()
+
+# --- Server Throughput Statistics ---
+# Columns: Strategy | Conc Mdn | Conc Mean | Req/s Mean | In tok/s | Out tok/s | Total tok/s
+ti = find_section(lines, "Server Throughput Statistics")
+thr = data_rows(lines, ti) if ti is not None else []
+
+# --- Request Latency Statistics ---
+# Columns: Strategy | Lat Mdn | Lat p95 | TTFT Mdn | TTFT p95 | ITL Mdn | ITL p95 | TPOT Mdn | TPOT p95
+li = find_section(lines, "Request Latency Statistics")
+lat = data_rows(lines, li) if li is not None else []
+if not thr and not lat:  # otherwise the output is a bare header with no explanation
+    print(f"WARN: no guidellm summary tables found in {f}", file=sys.stderr)
+
+def cell(row, k):
+    """Return column k of a parsed table row, or '?' when the row is absent or too short."""
+    return row[k] if len(row) > k else '?'
+
+print(f"{'conc':>5} {'req/s':>7} {'in_tok/s':>9} {'out_tok/s':>10} {'tot_tok/s':>10} {'lat_s':>7} {'TTFT_ms':>9} {'ITL_ms':>7} {'TPOT_ms':>8}")
+n = max(len(thr), len(lat))
+for i in range(n):
+    t_row = thr[i] if i < len(thr) else []
+    l_row = lat[i] if i < len(lat) else []
+    # throughput row: [strategy, conc_mdn, conc_mean, reqps_mean, in_s, out_s, tot_s]
+    conc, reqps, in_s, out_s, tot_s = (cell(t_row, k) for k in (1, 3, 4, 5, 6))
+    # latency row: [strategy, lat_mdn, lat_p95, ttft_mdn, ttft_p95, itl_mdn, itl_p95, tpot_mdn, tpot_p95]
+    lat_s, ttft, itl, tpot = (cell(l_row, k) for k in (1, 3, 5, 7))
+    print(f"{conc:>5} {reqps:>7} {in_s:>9} {out_s:>10} {tot_s:>10} {lat_s:>7} {ttft:>9} {itl:>7} {tpot:>8}")
diff --git a/skills/vllm-multiinstance/scripts/run_combo.sh b/skills/vllm-multiinstance/scripts/run_combo.sh
new file mode 100755
index 0000000..03a824a
--- /dev/null
+++ b/skills/vllm-multiinstance/scripts/run_combo.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+# Drive ONE benchmark run end-to-end, fully configured by environment variables
+# (nothing model/image/PR-specific is hardcoded):
+#   - start the memory poller
+#   - run run_sweep.sh (starts the stack, runs guidellm, tears down)
+#   - stop the poller, print the peak
+#
+# Usage:
+#   LABEL=<name> VLLM_IMAGE=<image> MODEL="<repo-or-path> | <tag>" \
+#       [knobs...] bash run_combo.sh
+#   (LABEL may also be passed as the first positional arg.)
+#
+# Required:
+#   LABEL              Short name for this run; used in output filenames.
+#   VLLM_IMAGE         Container image to benchmark.
+#   MODEL              vLLM model spec "left | tag" (left = repo id or local path,
+#                      tag = short [A-Za-z0-9-] name used in test_name).
+#
+# Common knobs (all optional; when unset, the default shown applies or the option is omitted):
+#   NATIVE=1                       Add --native (bypass zentorch).
+#   NUM_INSTANCES (default 3)      vLLM instances.
+#   CORES_PER_INSTANCE (default 32)
+#   GUIDELLM_RATES (default [32,64])  Concurrency rate list.
+#   RUN_TAG                        Suffix for output files (e.g. _c96) so
+#                                  different rate sweeps don't clobber each other.
+#   HF_CACHE_DIR (default ~/.cache/hf-shared)  Shared HF cache (pre-warmed).
+#   MODELS_DIR                     Host dir of local models (bind-mounted).
+#   BENCH_ROOT                     Where results/ + tmp/ go (default: $PWD).
+#   HARNESS                        Override the vendored harness dir if needed.
+#   EXTRA_SWEEP_ARGS               Extra args appended verbatim to run_sweep.sh.
+#
+# Example (one run):
+#   LABEL=run1 VLLM_IMAGE=amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23 \
+#     MODEL="Qwen/Qwen3-0.6B | qwen3-0.6b" \
+#     bash run_combo.sh
+set -uo pipefail   # nounset + pipefail only; errexit (-e) is not enabled
+
+# LABEL: the env value wins, else $1. The `:?` checks below abort with their message when a value is unset/empty.
+LABEL="${LABEL:-${1:-}}"
+: "${LABEL:?set LABEL (a short name for this run, used in output filenames)}"
+: "${VLLM_IMAGE:?set VLLM_IMAGE (the container image to benchmark)}"
+: "${MODEL:?set MODEL (vLLM model spec: \"repo-or-path | tag\")}"
+export VLLM_IMAGE
+
+# Resolve paths relative to this script's location, not the caller's cwd. The benchmark
+# harness is expected alongside this skill (../harness) unless HARNESS overrides it.
+#   scripts/ -> vllm-multiinstance/{harness,scripts}
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SKILL_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+# Vendored harness (run_sweep.sh / start.sh / generate-config.sh / stop.sh).
+HARNESS="${HARNESS:-$SKILL_DIR/harness}"
+# Where results/ and tmp/ land. Defaults to the current directory so the caller
+# controls output location; override via BENCH_ROOT.
+BENCH_ROOT="${BENCH_ROOT:-$PWD}"
+RESULTS_DIR="$BENCH_ROOT/results"
+mkdir -p "$RESULTS_DIR"
+
+# start.sh pre-warms the HF cache via `huggingface-cli`, often not on the default login PATH.
+# Use the active env, else conda, else a venv; the model is normally cached, so pre-warm is a fast no-op.
+ensure_hf_cli() {
+    local v
+    command -v huggingface-cli >/dev/null 2>&1 && return 0
+    if command -v conda >/dev/null 2>&1; then
+        # shellcheck disable=SC1091
+        source "$(conda info --base)/etc/profile.d/conda.sh" 2>/dev/null \
+            && conda activate "${CONDA_ENV:-base}" 2>/dev/null
+        command -v huggingface-cli >/dev/null 2>&1 && return 0
+    fi
+    for v in "${VENV_PATH:-}" "$BENCH_ROOT/.venv" "$SKILL_DIR/.venv" "$HOME/.venv"; do
+        if [ -n "$v" ] && [ -f "$v/bin/activate" ]; then
+            # shellcheck disable=SC1091
+            source "$v/bin/activate"
+            command -v huggingface-cli >/dev/null 2>&1 && return 0
+        fi
+    done
+    echo "WARN: huggingface-cli not found via PATH/conda/venv; HF pre-warm may fail" >&2
+    return 1
+}
+set +u; ensure_hf_cli; set -u   # activation scripts may read unset vars; nounset must not kill the run
+
+# Build the sweep args. Optional flags are added only when the caller asked for them
+# (NO_MEM_LIMIT defaults to 1, so --no-mem-limit is on unless it is set to another value).
+SWEEP_ARGS=(-m "$MODEL")
+[ "${NO_MEM_LIMIT:-1}" = "1" ] && SWEEP_ARGS=(--no-mem-limit "${SWEEP_ARGS[@]}")
+[ "${NATIVE:-0}" = "1" ]       && SWEEP_ARGS=(--native "${SWEEP_ARGS[@]}")
+[ -n "${MODELS_DIR:-}" ]       && SWEEP_ARGS+=(--models-dir "$MODELS_DIR")
+# Split EXTRA_SWEEP_ARGS on whitespace WITHOUT pathname expansion (a value like [32,64] is a glob pattern).
+[ -n "${EXTRA_SWEEP_ARGS:-}" ] && { read -r -d '' -a EXTRA_ARR <<< "$EXTRA_SWEEP_ARGS"; SWEEP_ARGS+=(${EXTRA_ARR[@]+"${EXTRA_ARR[@]}"}); }
+
+# Distinct container names keep this stack from colliding with another vLLM stack on the host.
+: "${VLLM_NAME_PREFIX:=bench-vllm-instance}" "${VLLM_NGINX_NAME:=bench-vllm-nginx-lb}"
+export VLLM_NAME_PREFIX VLLM_NGINX_NAME
+
+# Keep all temp off a possibly-full root fs: TMPDIR is BENCH_TMPDIR when given, else
+# $BENCH_ROOT/tmp. The patched ansible playbook reads BENCH_TMPDIR for its metrics
+# script + vllm-logs; the two ANSIBLE_*_TEMP variables point at a subdirectory of it.
+export TMPDIR="${BENCH_TMPDIR:-$BENCH_ROOT/tmp}"
+export BENCH_TMPDIR="$TMPDIR"
+export ANSIBLE_LOCAL_TEMP="$TMPDIR/ansible" ANSIBLE_REMOTE_TEMP="$TMPDIR/ansible"
+mkdir -p "$TMPDIR" "$ANSIBLE_LOCAL_TEMP"
+
+# Stack layout + cache: a value already set by the caller wins over the default.
+: "${NUM_INSTANCES:=3}"
+: "${CORES_PER_INSTANCE:=32}"
+: "${HF_CACHE_DIR:=$HOME/.cache/hf-shared}"
+: "${HF_HUB_OFFLINE:=1}"
+: "${HF_TOKEN:=offline}"   # avoid run_sweep.sh's interactive prompt
+: "${GUIDELLM_RATES:=[32,64]}"
+export NUM_INSTANCES CORES_PER_INSTANCE HF_CACHE_DIR HF_HUB_OFFLINE HF_TOKEN GUIDELLM_RATES
+RUN_TAG="${RUN_TAG:-}"
+
+# Per-run output files; RUN_TAG keeps different sweeps of one LABEL apart.
+MEM_CSV="$RESULTS_DIR/mem_${LABEL}${RUN_TAG}.csv"
+SWEEP_LOG="$RESULTS_DIR/sweep_${LABEL}${RUN_TAG}.log"
+
+printf '%s\n' "==================================================================" \
+    " LABEL=$LABEL" \
+    "   VLLM_IMAGE=$VLLM_IMAGE" \
+    "   MODEL=$MODEL" \
+    "   NATIVE=${NATIVE:-0}  NUM_INSTANCES=$NUM_INSTANCES  CORES=$CORES_PER_INSTANCE" \
+    "   rates=$GUIDELLM_RATES   sweep args: ${SWEEP_ARGS[*]}" \
+    "   mem csv:   $MEM_CSV" \
+    "   sweep log: $SWEEP_LOG" \
+    "=================================================================="
+
+# Start memory poller in background. It runs while ${MEM_CSV}.run exists, so the EXIT
+# trap removes that flag even when this script is interrupted (no orphaned poller).
+bash "$SCRIPT_DIR/mem_poll.sh" "$LABEL" "$MEM_CSV" 2 &
+POLL_PID=$!
+trap 'rm -f "${MEM_CSV}.run"' EXIT
+# Run the sweep; PIPESTATUS keeps run_sweep.sh's status rather than tee's.
+( cd "$HARNESS" && ./run_sweep.sh "${SWEEP_ARGS[@]}" ) 2>&1 | tee "$SWEEP_LOG"
+SWEEP_RC=${PIPESTATUS[0]}
+# Stop poller and wait for it to append its final PEAK line.
+rm -f "${MEM_CSV}.run"
+wait "$POLL_PID" 2>/dev/null
+echo ""
+echo "--- $LABEL done (sweep rc=$SWEEP_RC) ---"
+grep "^PEAK" "$MEM_CSV" || echo "WARN: no PEAK recorded (stack may not have started)"
+exit "$SWEEP_RC"   # report the sweep's outcome to the caller instead of always exiting 0
diff --git a/skills/vllm-multiinstance/scripts/setup-harness.sh b/skills/vllm-multiinstance/scripts/setup-harness.sh
new file mode 100755
index 0000000..8a447f9
--- /dev/null
+++ b/skills/vllm-multiinstance/scripts/setup-harness.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# One-time setup for the vendored multi-instance benchmark harness.
+# Clones the external guidellm/ansible automation (redhat-et/vllm-cpu-perf-eval)
+# into the harness dir and applies the bundled patch (rootless guidellm user fix,
+# /tmp -> BENCH_TMPDIR redirect, local-model bind mount). Re-runs skip the clone
+# when the checkout exists and skip the patch when it is already applied.
+# Apart from that one clone (needs git), everything lives under this skill —
+# no dependency on ZenDNN_tools. Run once before the first sweep.
+set -uo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SKILL_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+HARNESS="${HARNESS:-$SKILL_DIR/harness}"      # override via env; defaults to <skill>/harness
+EVAL_DIR="$HARNESS/vllm-cpu-perf-eval"        # clone target for the upstream automation
+PATCH="$HARNESS/vllm-cpu-perf-eval.patch"     # bundled patch applied to the clone
+
+[ -f "$HARNESS/run_sweep.sh" ] || { echo "ERROR: harness not found at $HARNESS" >&2; exit 1; }
+[ -f "$PATCH" ] || { echo "ERROR: patch not found at $PATCH" >&2; exit 1; }
+
+if [ -d "$EVAL_DIR/.git" ]; then
+    echo "--- vllm-cpu-perf-eval already present ($EVAL_DIR) ---"
+else
+    echo "--- Cloning vllm-cpu-perf-eval (external ansible/guidellm automation) ---"
+    git clone https://github.com/redhat-et/vllm-cpu-perf-eval.git "$EVAL_DIR" || { echo "ERROR: git clone failed; fix network/proxy access and re-run." >&2; exit 1; }  # errexit is off, so check explicitly
+fi
+
+echo "--- Applying patch (use --3way; upstream drifts) ---"
+# A clean reverse-apply check means the patch is already in the tree.
+if ! git -C "$EVAL_DIR" apply --reverse --check "$PATCH" 2>/dev/null; then
+    git -C "$EVAL_DIR" stash -q 2>/dev/null || true   # best effort: set local modifications aside first
+    if ! git -C "$EVAL_DIR" apply --3way "$PATCH"; then
+        echo "ERROR: patch failed to apply. Resolve manually in $EVAL_DIR." >&2
+        exit 1
+    fi
+    echo "  Patch applied."
+else
+    echo "  Patch already applied; skipping."
+fi
+
+echo "--- Checking ansible collections ---"
+need="containers.podman ansible.posix community.general"
+have="$(ansible-galaxy collection list 2>/dev/null)"
+for c in $need; do
+    # Here-string, not `echo | grep -q`: under pipefail an early grep exit can SIGPIPE echo and read as MISS. Dots are escaped to match literally.
+    if grep -qiE "^${c//./\\.} " <<< "$have"; then echo "  OK   $c"; else echo "  MISS $c (install: ansible-galaxy collection install $c)"; fi
+done
+echo ""
+echo "Harness ready at: $HARNESS"
+echo "Ansible playbook : $EVAL_DIR/automation/test-execution/ansible/llm-benchmark-concurrent-load.yml"
diff --git a/skills/vllm-multiinstance/skill-card.md b/skills/vllm-multiinstance/skill-card.md
new file mode 100644
index 0000000..3521e4a
--- /dev/null
+++ b/skills/vllm-multiinstance/skill-card.md
@@ -0,0 +1,13 @@
+# Skill Card
+
+## Description
+
+Multi-instance vLLM benchmark on AMD EPYC CPU: runs N vLLM instances (each pinned to a range of physical cores) behind an NGINX load balancer, drives load with guidellm via ansible, and reports peak aggregate memory (podman stats) plus end-to-end throughput/latency across models, concurrency rates, and instance counts. The benchmark harness is vendored with the skill; beyond podman and ansible, the only external requirement is git, which the one-time setup script uses to clone the upstream vllm-cpu-perf-eval automation.
+
+## Owner
+
+AMD
+
+## License
+
+MIT