diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 73497cf..f4aa2d7 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -33,6 +33,11 @@ "name": "rocm-doctor", "source": "./skills/rocm-doctor", "description": "Diagnose why ROCm, PyTorch, or llama.cpp isn't working on an AMD GPU. Matches the symptom against a fixed list of twelve known misconfigurations and proposes the next step." + }, + { + "name": "serving-llms-on-instinct", + "source": "./skills/serving-llms-on-instinct", + "description": "Serve LLMs on AMD Instinct GPUs (MI300X/MI325X/MI350X/MI355X) with vLLM on ROCm. Handles GPU detection, environment validation, vLLM configuration, launch, and health verification." } ] } diff --git a/.cursor-plugin/marketplace.json b/.cursor-plugin/marketplace.json index 73497cf..f4aa2d7 100644 --- a/.cursor-plugin/marketplace.json +++ b/.cursor-plugin/marketplace.json @@ -33,6 +33,11 @@ "name": "rocm-doctor", "source": "./skills/rocm-doctor", "description": "Diagnose why ROCm, PyTorch, or llama.cpp isn't working on an AMD GPU. Matches the symptom against a fixed list of twelve known misconfigurations and proposes the next step." + }, + { + "name": "serving-llms-on-instinct", + "source": "./skills/serving-llms-on-instinct", + "description": "Serve LLMs on AMD Instinct GPUs (MI300X/MI325X/MI350X/MI355X) with vLLM on ROCm. Handles GPU detection, environment validation, vLLM configuration, launch, and health verification." } ] } diff --git a/.github/skillspector-allow.yml b/.github/skillspector-allow.yml index 6131795..af73e55 100644 --- a/.github/skillspector-allow.yml +++ b/.github/skillspector-allow.yml @@ -123,3 +123,113 @@ suppressions: to locate and replace the rule block in AGENTS.md in place on re-runs. It carries no instructions; the surrounding rule text is plain, reviewable content by design (it is the installable routing rule itself). 
+ - skill: serving-llms-on-instinct + rule: SC2 + file: data/recipes_cache.json + match: External Script Fetching + reason: >- + False positive. The flag is on a `"guide"` markdown string (a recipe doc + embedded in this JSON cache, not runnable code). Its shell snippets are + illustrative: `uv pip install ... --extra-index-url https://wheels.vllm.ai/nightly` + installs vLLM from an HTTPS package index (the recommended-safe pattern), + and `curl http://localhost:8000/... | python3 -m json.tool` pipes a + localhost API response into a JSON pretty-printer. There is no + download-and-execute of a remote script (no `curl ... | bash`/`sh`). + - skill: serving-llms-on-instinct + rule: P6 + file: data/recipes_cache.json + match: Direct Prompt Extraction + reason: >- + False positive. The flag is on a `"guide"` markdown string (the + Ministral-3-Instruct recipe doc, not runnable code). The matched Python + example downloads the model's own publicly published `SYSTEM_PROMPT.txt` + via `hf_hub_download` and passes it as the `system` role of a chat request + (Mistral's documented setup) — it constructs a prompt, it does not reveal + or extract any hidden system prompt. The only output printed is the + model's answer (`response.choices[0].message.content`). The trigger is + merely the literal token `SYSTEM_PROMPT` in benign example code. + - skill: serving-llms-on-instinct + rule: TM2 + file: reference.md + match: Chaining Abuse + reason: >- + False positive. Line 92 is a Troubleshooting one-liner that disables + kernel NUMA balancing for GPU workloads: + `echo 0 | sudo tee /proc/sys/kernel/numa_balancing`. The `|` is just the + idiomatic way to write a root-owned /proc file (echo piped into `sudo + tee`), not multi-step tool/command chaining of untrusted or model-derived + steps. It is a single fixed, reviewable, human-run sysctl write — no LLM + output feeds the pipe and there is no chain depth to bound. 
+ - skill: serving-llms-on-instinct + rule: TM1 + file: scripts/detect.py + match: Tool Parameter Abuse + reason: >- + False positive. Line 32 uses `subprocess.run(cmd, shell=True, ...)`, but + `shell=True` is intentional and safe here: every `cmd` passed to `_run` + is a fixed in-script literal (`amd-smi static --asic --vram --json`, + `amd-smi version --json`, and their `sudo` retries) that relies on no + shell metacharacters from user input. The only user-controlled values + (`--host`/`--user`/`--port`) never enter the shell string — they flow + solely into the SSH branch as list-form argv (`ssh ... ssh_target cmd`, + no shell), and `port` is int-coerced by argparse. No untrusted or model + output reaches the shell, so there is no parameter abuse to reject. + - skill: serving-llms-on-instinct + rule: TM1 + file: scripts/validate.py + match: Tool Parameter Abuse + reason: >- + False positive. Same `_run` helper as detect.py: line 33 uses + `subprocess.run(cmd, shell=True, ...)` where every `cmd` is a hardcoded + diagnostic literal (`test -e /dev/kfd ...`, `ls /dev/dri/renderD* ...`, + `cat /proc/sys/kernel/numa_balancing ...`, `printenv HF_TOKEN ...`, etc.) + that deliberately uses shell pipes/redirects/globs. The dynamic inputs + (`--host`/`--user`/`--port`) only reach the SSH branch as list-form argv, + never the shell string, and `port` is int-coerced. No untrusted/model + output is interpolated into the command. + - skill: serving-llms-on-instinct + rule: TM2 + file: scripts/validate.py + match: Chaining Abuse + reason: >- + False positive. The flagged lines are the NUMA-balancing fix + `echo 0 | sudo tee /proc/sys/kernel/numa_balancing`. Line 122 only runs + it under the explicit opt-in `--auto-fix` flag (user-approved), while + lines 130 and 137 are human-readable `"fix"` advisory strings that are + never executed. 
The `|` is the idiomatic root-owned /proc write (echo + into `sudo tee`), a single fixed sysctl command — not multi-step tool + chaining of untrusted or model-derived steps. + - skill: serving-llms-on-instinct + rule: E2 + file: scripts/estimate_vram.py + match: Env Variable Harvesting + reason: >- + False positive. Line 175 reads `HF_TOKEN` via `os.environ.get`, which is + strictly required: it is passed only to `_fetch`, which sets it as the + `Authorization: Bearer` header on requests to `https://huggingface.co` + (the token's intended recipient) so the tool can read safetensors/config + metadata for gated or private models. The token is never logged, printed, + or transmitted anywhere else — the emitted JSON contains only model and + VRAM fields. + - skill: serving-llms-on-instinct + rule: E2 + file: scripts/validate.py + match: Env Variable Harvesting + reason: >- + False positive. Line 151 runs `printenv HF_TOKEN | head -c 4` purely as a + presence check; the captured 4-char value is never emitted — only + `out.strip()` truthiness is tested to decide whether to advise the user + that HF_TOKEN is unset (needed for gated models). No credential is logged + or transmitted. + - skill: serving-llms-on-instinct + rule: P5 + file: data/recipes_cache.json + match: Harmful Content Injection + reason: >- + False positive. Line 3524 is the `"guide"` for Qwen3Guard-Gen, a + text-only safety/guardrail classifier model. The matched string + ("Tell me how to make a bomb.") is the demo *input* used to show the + moderation model correctly classifying the request as unsafe — the + documented output is `# Safety: Unsafe` / `# Categories: Violent`. No + harmful instructions are present; it is content-moderation documentation, + the opposite of harmful-content injection. diff --git a/README.md b/README.md index 1e47bd8..1ab541c 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ Bring existing workloads onto AMD. 
| --- | --- | --- | | `cuda-to-hip` | Port CUDA kernels with `hipify` and flag anything that needs manual review. | _planned_ | | `vllm-rocm` | Stand up vLLM on AMD with the right environment variables and model configurations. | _planned_ | -| `serving-llms-on-instinct` | Deploy LLM inference on AMD Instinct GPUs end-to-end: detect hardware (or onboard via AMD Developer Cloud), validate model fit, apply the right vLLM recipe, and launch a benchmarked endpoint. SGLang and engine/backend selection in later phases. | _planned_ | +| [`serving-llms-on-instinct`](skills/serving-llms-on-instinct/SKILL.md) | Deploy LLM inference on AMD Instinct GPUs end-to-end: detect hardware (or onboard via AMD Developer Cloud), validate model fit, apply the right vLLM recipe, and launch a benchmarked endpoint. SGLang and engine/backend selection in later phases. | in-repo | ### Performance & delivery diff --git a/skills/serving-llms-on-instinct/SKILL.md b/skills/serving-llms-on-instinct/SKILL.md new file mode 100644 index 0000000..ba45b09 --- /dev/null +++ b/skills/serving-llms-on-instinct/SKILL.md @@ -0,0 +1,359 @@ +--- +name: serving-llms-on-instinct +description: >- + Serves AI models on AMD Instinct GPU hardware using vLLM. Use this skill + whenever the user wants to run, serve, deploy, start, host, or launch a + language model on an AMD GPU, AMD Instinct, MI300X, MI325X, MI350X, or MI355X. + Also use when the user mentions vLLM on ROCm, vLLM on AMD, serving on HBM, + or asks how to get a model running on AMD data center hardware. Use when the + user asks "run Qwen3", "serve DeepSeek", "start a vLLM endpoint", "get a + model running on my AMD machine", or any similar phrasing. Handles the full + flow: GPU detection, environment validation, vLLM configuration, launch, and + health verification. Do not use for NVIDIA GPUs, consumer AMD GPUs (RX + series, Radeon), Ryzen AI, NPU, MI250X, or MI100. 
+allowed-tools: Bash, Read +--- + +# Serving LLMs on AMD Instinct + +Get a vLLM endpoint running on AMD Instinct GPU hardware. + +## Prerequisites + +- ROCm driver and `amd-smi` installed on the GPU host +- Docker running and accessible (check with `docker ps`) +- `/dev/kfd` and `/dev/dri` present on the GPU host +- HuggingFace token in `HF_TOKEN` env var (required for gated models; not + required for ungated models such as Qwen3). For gated models (Llama 3.2, Gemma, etc.), + the HF token must belong to an account that has accepted the model's license + at `huggingface.co/<org>/<model>`. A valid token without license acceptance will + fail with an opaque "Engine core initialization failed" error. +- For remote GPU: SSH key access configured (`ssh <user>@<host>` must work + without a password prompt). If only password access is available, set up + keys first: `ssh-copy-id <user>@<host>` + +## Data files + +Read these files directly to get model and GPU configuration: + +- **`data/recipes_cache.json`** -- model configs synced from + [vllm-project/recipes](https://github.com/vllm-project/recipes). Each entry + under `models.<hf_id>.recipe` contains the full recipe with `model.base_args`, + `model.base_env`, `features.tool_calling.args`, `features.reasoning.args`, + `hardware_overrides.amd.extra_args`, `hardware_overrides.amd.extra_env`. + The top-level `docker_image` field has the latest resolved vLLM ROCm image. + +- **`data/gpu_overrides.json`** -- GPU-specific configuration. Contains + `docker_flags` (mandatory for all AMD Instinct), `gpu_configs` keyed by + gfx_version with `env_defaults` and `workarounds`, and `legacy_models` for + models not yet in vLLM recipes. + +- **`data/blacklist.json`** -- models in vLLM recipes that cannot be served + as LLM endpoints. Includes diffusion/image/audio generation models, embedding + models, rerankers, ASR models needing audio pipelines, and models requiring + unreleased vLLM nightly builds. Check this before attempting to serve a model. 
+ If the user requests a blacklisted model, explain why it won't work and + suggest an alternative. + +If the user doesn't specify a model, default to **Qwen/Qwen3.5-9B**: dense +multimodal with MTP, Apache 2.0 license (no HF token needed), fits on a single +GPU, strong reasoning and tool-calling. + +## Step 1: Detect the GPU + +```bash +python3 scripts/detect.py +# Remote: +python3 scripts/detect.py --host user@hostname +``` + +Returns JSON with `gfx_version`, `vram_gb`, `gpu_count`, `rocm_version`. + +| gfx_version | Hardware | VRAM | +|---|---|---| +| gfx950 | MI350X / MI355X | 288 GB HBM3E | +| gfx942 | MI300X (192 GB) / MI325X (256 GB) / MI300A (128 GB) | varies | + +If `gfx_version` is `unknown`: `amd-smi` ran but found no GPU. Check +`lsmod | grep amdgpu`. + +## Step 2: Validate the environment + +```bash +python3 scripts/validate.py --auto-fix +# Remote: +python3 scripts/validate.py --auto-fix --host user@hostname +``` + +Returns JSON with `ready` (bool), `errors`, `warnings`, `fixes_applied`. +Do not proceed if `ready` is `false`. + +## Step 3: Refresh recipes (if stale) + +Check `fetched_at` in `data/recipes_cache.json`. If older than 24 hours or +the file is missing, refresh: + +```bash +python3 scripts/sync_recipes.py +``` + +This shallow-clones vllm-project/recipes from GitHub and fetches the latest +Docker tag from Docker Hub. Takes ~10 seconds. If it fails, the existing +cache still works. + +## Step 4: Construct the Docker command + +Read `data/recipes_cache.json` and `data/gpu_overrides.json` directly. +Build the Docker command by combining: + +1. **Docker flags** from `gpu_overrides.json > docker_flags` (mandatory for all AMD GPUs) +2. **HF cache mount**: `-v ~/.cache/huggingface:/root/.cache/huggingface` + (if a shared model cache directory exists on the host, check whether + `models--*` directories are at the cache root or inside a `hub/` + subdirectory -- mount accordingly to `/root/.cache/huggingface` or + `/root/.cache/huggingface/hub`) +3. 
**Port**: `-p <port>:<port>` (default 8000) +4. **Environment variables**: merge `gpu_configs.<gfx_version>.env_defaults` + with the recipe's `model.base_env` and `hardware_overrides.amd.extra_env`. + Always add `--env HF_TOKEN=${HF_TOKEN}`. +5. **Docker image**: use `docker_image` from `recipes_cache.json` top level + (unless the model needs a pinned image, e.g. GLM-4.5 needs `v0.15.1`). + If the user specifies a Docker image version, check it against the recipe's + `model.min_vllm_version`. Warn if the image is older -- the model may crash + on startup with an opaque "Engine core initialization failed" error. +6. **Model ID**: `--model <model_id>` +7. **vLLM args**: combine the recipe's `model.base_args` + + `hardware_overrides.amd.extra_args` + `features.tool_calling.args` + + `features.reasoning.args`. Add `--enable-auto-tool-choice` if not present. + For multi-GPU, add `--tensor-parallel-size N` (see VRAM estimation below). + For MoE models on multi-GPU, also add `--distributed-executor-backend mp`. +8. **Port arg**: `--port <port>` + +If the exact model ID is not in `recipes_cache.json`, check for a base model +match by stripping date/version suffixes (e.g., `Kimi-K2-Instruct` matches +`Kimi-K2-Instruct-0905`). Use the base model's recipe if found. + +If no recipe match, check `legacy_models` in `gpu_overrides.json`. If not +there either, use a generic config with +`--enable-auto-tool-choice --trust-remote-code --tool-call-parser hermes`. + +**Precision variant selection:** Recipes may offer variants (default, fp8, +nvfp4). Check `gpu_configs.<gfx_version>.precision.native` in +`gpu_overrides.json` before selecting a variant. On gfx942 (MI300X), only +`bf16`, `fp16`, `fp8_fnuz`, and `int8` are hardware-native. MXFP4 and MXFP6 +compute is emulated (dequant to BF16 during matmul), but weights stay +compressed in VRAM so quantized models still fit in less memory. NVFP4 is NVIDIA-specific and will not load on ROCm, so never select an `nvfp4` variant. +On gfx950 (MI350X), MXFP4 is hardware-native. 
+ +**VRAM estimation and fit check:** Before constructing the Docker command, +estimate whether the model fits the available hardware: +```bash +python3 scripts/estimate_vram.py --model-id <model_id> --vram-gb <vram_gb> --tp <tp> +``` +This queries the HuggingFace Hub API (no model download) and returns JSON with: +- `weight_memory_gb` -- total weight size +- `kv_cache_bytes_per_token` -- KV cache cost per token at BF16 +- `fit.weights_fit` -- whether weights fit at the given TP +- `fit.recommended_max_model_len` -- max context the GPU can serve +- `fit.context_limited` -- true if KV cache limits context below the + model's native max +- `fit.min_tp_required` -- minimum TP needed (only if weights don't fit) + +**Understanding the overhead:** The script reserves ~4 GB for vLLM's runtime +overhead (activation profiling, HIP graph capture, internal buffers). During +startup, vLLM runs a profiling forward pass to measure peak activations, then +captures HIP graphs for optimized decode. This startup peak is higher than +steady-state. The `remaining_for_kv_gb` field reflects what's left after +weights and this overhead. + +Use `remaining_for_kv_gb` to decide: + +1. **`remaining_for_kv_gb >= 6`**: safe to run. If `context_limited: true`, + add `--max-model-len <fit.recommended_max_model_len>` to the vLLM args. + Mention the FP8 KV cache option (`--kv-cache-dtype fp8`) if the user + needs longer context (`fit.max_seq_len_fp8_kv` shows the gain). +2. **`remaining_for_kv_gb` between 2 and 6**: tight but worth trying. Launch + normally. If vLLM OOMs during HIP graph capture (check container logs for + "out of memory" after "capturing CUDA/HIP graphs"), retry with + `--enforce-eager` added to the vLLM args. This skips graph capture and + frees 1-2 GB. The only cost is slightly higher decode latency. +3. **`remaining_for_kv_gb < 2`**: too tight. Will likely OOM during the + activation profiling step. Do not attempt. +4. **`weights_fit: false` with multiple GPUs**: re-run with + `--tp <fit.min_tp_required>` and check again. +5. 
**`weights_fit: false`, not enough GPUs**: look for quantized + alternatives in this order: + a. **Recipe variants**: the recipe may have `fp8` or `mxfp4` variants + with a different `model_id` that points to a quantized checkpoint. + b. **Same provider**: many providers release quantized versions alongside + the base model (e.g. `Qwen/Qwen3.5-122B-FP8` from Qwen). Search + HuggingFace for `<provider>/<model>` with FP8/GPTQ/AWQ suffixes. + c. **AMD quantized**: AMD's Quark team publishes quantized models under + the `amd/` org on HuggingFace (e.g. `amd/Kimi-K2-Instruct-w-mxfp4-a-fp8`). + Search for `amd/<model>` variants. + Run `estimate_vram.py` on the quantized model ID to verify it fits, + then use that model ID instead. +6. **Still doesn't fit**: tell the user the model requires more VRAM than + available and suggest either a smaller model or multi-GPU hardware. + Do not attempt to launch. + +Docker command template: +``` +docker run -d --name vllm-<model-name> \ + <docker_flags> \ + -v <hf_cache_mount> \ + -p <port>:<port> \ + --env <KEY>=<VALUE> (for each env var) \ + --env HF_TOKEN=${HF_TOKEN} \ + <docker_image> \ + --model <model_id> \ + <vllm_args> \ + --port <port> +``` + +## Step 5: Confirm with the user + +Before launching, present a summary and ask the user to confirm: +- **Model**: full HuggingFace ID (e.g. `Qwen/Qwen3.5-122B-Instruct`) +- **Precision**: variant being used (e.g. BF16, FP8) and why +- **Weight memory**: from estimate_vram.py +- **GPU**: detected hardware and VRAM +- **TP**: tensor parallelism degree (1, 2, 4, 8) +- **Context**: max achievable context length (and whether it's limited) +- **Port**: which port the endpoint will be on + +If a quantized alternative was selected (Step 4 fit check), explain that +the original model doesn't fit and which alternative is being used. + +Wait for the user's confirmation before proceeding. + +## Step 6: Launch and verify + +Before launching, check for port conflicts: +```bash +ss -tlnp 2>/dev/null | grep ':<port> ' +``` +If a Docker container is on that port, stop it with `docker rm -f <container>`. + +Run the Docker command. 
Then poll health using this loop: + +```bash +while docker inspect --format='{{.State.Running}}' <container> 2>/dev/null | grep -q true; do + curl -sf http://localhost:<port>/health && echo "READY" && exit 0 + sleep 60 +done +echo "FAILED -- container exited" +``` + +A 503 during loading is normal. Choose the polling strategy based on +model size (`weight_memory_gb` from `estimate_vram.py`): + +- **Small models (< 100 GB weights)**: run the poll as a blocking command + with the Bash tool's `timeout` set to 600000 (10 minutes). Most cached + models are ready within 2-5 minutes. +- **Large models (>= 100 GB weights)**: run the poll with the Bash tool's + `run_in_background` set to `true`. Then use `TaskOutput` with + `block: true` and `timeout: 600000` to wait up to 10 minutes per check. + If the task is still running after that, call `TaskOutput` again with + the same parameters. This uses only 1 turn per 10-minute wait instead + of burning a turn every check. The background loop runs until the + container is healthy or dies. + +After health returns 200, send a warmup request (triggers HIP kernel compilation, +~40-45 seconds on gfx942): +```bash +curl -s http://localhost:<port>/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model":"<model_id>","messages":[{"role":"user","content":"say hi"}],"max_tokens":5}' +``` + +After the warmup succeeds, present a connection table so the user can call +the endpoint immediately: + +| Field | Value | +|-------|-------| +| Model | `<model_id>` | +| Served model name | `<model_id>` | +| Base URL | `http://<host>:<port>/v1` | +| API key | none (local) | +| Port | `<port>` | +| Tensor parallel | `<tp>` | +| Max context | `<max_model_len>` | +| GPU | `<gpu_family>` | + +Then give a ready-to-run example using those exact values: + +```bash +curl -s http://<host>:<port>/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model":"<model_id>","messages":[{"role":"user","content":"Hello"}]}' +``` + +## Remote vs. local + +All scripts accept `--host user@hostname`. When given, they SSH to the target. 
+Set `ROCM_SSH_HOST` and `ROCM_SSH_USER` env vars to avoid passing `--host` +every time. + +For remote Docker commands, run them over SSH: +```bash +ssh user@host 'docker run -d ...' +``` +Use `localhost` for health/warmup curl URLs (curl runs on the remote host). + +## Gotchas + +**`CUDA_VISIBLE_DEVICES` set to empty string** -- ROCm maps this variable to +`HIP_VISIBLE_DEVICES`. Setting it to an empty string hides all GPUs. +`CUDA_VISIBLE_DEVICES=0,1` works fine for restricting GPUs (same as +`HIP_VISIBLE_DEVICES=0,1`). If the host has it set to empty, unset it: +`unset CUDA_VISIBLE_DEVICES`. Do not pass `--env CUDA_VISIBLE_DEVICES=` (empty) +into Docker -- that also hides all GPUs inside the container. + +**FP4BMM crash on gfx942 (MI300X)** -- If the container exits immediately +with a segfault or illegal instruction: `VLLM_ROCM_USE_AITER_FP4BMM` must be +`0` on gfx942. This is set correctly in `gpu_overrides.json` for gfx942. +See vLLM issue #34641. + +**`HIP error: no kernel image`** -- The Docker image has no compiled kernel +for your GPU's gfx version. Use `vllm/vllm-openai-rocm:latest`; it includes +gfx942 and gfx950 kernels. + +**MLA models need `--block-size 1`** -- DeepSeek-R1/V3, Kimi-K2.5. +Without it the MLA attention backend silently falls back to a slower path. +This is in the recipe args for these models. + +**MoE models on multi-GPU need `--distributed-executor-backend mp`** -- +Qwen3-235B, GLM-4.5, MiniMax-M2. The default distributed executor does not +work reliably with MoE on ROCm. + +**OOM during HIP graph capture** -- If the container logs show "out of memory" +after "capturing CUDA graphs" or "capturing HIP graphs", the model fits in +VRAM but there isn't enough headroom for graph capture. Retry with +`--enforce-eager` added to the vLLM args. This disables graph capture and +frees 1-2 GB. Trade-off: slightly higher decode latency, but the model runs. 
+ +**"Engine core initialization failed"** -- This opaque error means the engine +core subprocess died. Check early container logs: `docker logs <container> 2>&1 | +head -50`. Common causes: gated model access denied (license not accepted on +HF), unsupported architecture on this vLLM version, OOM during weight loading, +missing `--trust-remote-code` for custom architectures, or vLLM version too old +for the model (check `min_vllm_version` in the recipe). + +**`/dev/kfd` permission denied** -- User is not in the `video` or `render` +group. Fix: `sudo usermod -aG video,render $USER` (requires re-login). + +**SSH key not configured** -- The scripts use `BatchMode=yes` SSH. If SSH +fails with `Permission denied (publickey)`, configure key-based access first. + +**Restricting GPUs on shared hosts** -- Use `--env HIP_VISIBLE_DEVICES=0,1` +or `--env CUDA_VISIBLE_DEVICES=0,1` to target specific GPUs by index. +`HIP_VISIBLE_DEVICES` is the canonical AMD variable; `CUDA_VISIBLE_DEVICES` +also works (ROCm maps it). Never set either to an empty string. + +--- + +## Reference + +Precision compatibility, VRAM estimation, Docker flags, and known quirks: +[reference.md](reference.md) diff --git a/skills/serving-llms-on-instinct/data/blacklist.json b/skills/serving-llms-on-instinct/data/blacklist.json new file mode 100644 index 0000000..2b660f9 --- /dev/null +++ b/skills/serving-llms-on-instinct/data/blacklist.json @@ -0,0 +1,59 @@ +{ + "_comment": "Models in vLLM recipes that cannot be served as LLM endpoints on AMD Instinct. The agent should refuse these and explain why.", + + "not_an_llm": { + "_comment": "Non-LLM models: diffusion, image gen, audio gen, embeddings, rerankers. 
These are not chat/completion endpoints.", + "models": [ + "stabilityai/stable-diffusion-3.5-medium", + "stabilityai/stable-audio-open-1.0", + "Wan-AI/Wan2.2-T2V-A14B-Diffusers", + "Wan-AI/Wan2.2-T2V-A14B", + "Qwen/Qwen-Image", + "zai-org/GLM-Image", + "zai-org/Glyph", + "meituan-longcat/LongCat-Image-Edit", + "jinaai/jina-embeddings-v5-text-small", + "jinaai/jina-reranker-m0", + "black-forest-labs/FLUX.1-dev", + "black-forest-labs/FLUX.2-dev", + "black-forest-labs/FLUX.2-klein-9B", + "Tongyi-MAI/Z-Image-Turbo", + "openai/whisper-large-v3-turbo", + "neuphonic/neutts-air" + ] + }, + + "needs_audio_pipeline": { + "_comment": "Models that require audio input/output streaming. vLLM can load them but they need a specialized audio pipeline, not a standard chat endpoint.", + "models": [ + "Qwen/Qwen3-ASR-1.7B", + "zai-org/GLM-ASR-Nano-2512", + "mistralai/Voxtral-Mini-4B-Realtime-2602", + "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16" + ] + }, + + "needs_vllm_nightly": { + "_comment": "Models that require unreleased vLLM nightly. Will break on any stable Docker image. Re-check when vLLM cuts a new release.", + "models": [ + "internlm/Intern-S2-Preview", + "mistralai/Mistral-Medium-3.5-128B", + "poolside/Laguna-XS.2", + "stepfun-ai/Step-3.7-Flash" + ] + }, + + "translation_only": { + "_comment": "Translation-only models with no chat or tool-calling support.", + "models": [ + "Google/translategemma-27b-it" + ] + }, + + "incompatible_weights": { + "_comment": "Models whose weight naming or architecture is incompatible with current vLLM stable (v0.22.0). 
Crashes during weight loading.", + "models": [ + "thu-pacman/PCMind-2.1-Kaiyuan-2B" + ] + } +} diff --git a/skills/serving-llms-on-instinct/data/gpu_overrides.json b/skills/serving-llms-on-instinct/data/gpu_overrides.json new file mode 100644 index 0000000..57709c9 --- /dev/null +++ b/skills/serving-llms-on-instinct/data/gpu_overrides.json @@ -0,0 +1,147 @@ +{ + "docker_flags": [ + "--group-add=video", + "--group-add=render", + "--cap-add=SYS_PTRACE", + "--security-opt seccomp=unconfined", + "--device /dev/kfd", + "--device /dev/dri", + "--ipc=host" + ], + "gpu_configs": { + "gfx950": { + "gpu_family": "AMD Instinct MI350X / MI355X", + "vram_gb": 288, + "env_defaults": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_USE_AITER_FP4BMM": "1" + }, + "precision": { + "native": ["bf16", "fp16", "fp8_ocp", "int8", "mxfp4", "mxfp6"], + "emulated": ["fp8_fnuz"], + "unsupported": ["nvfp4"], + "notes": "MXFP4/MXFP6 are hardware-native on gfx950. FP8 uses OCP (E4M3FN) standard, not FNUZ. NVFP4 is NVIDIA-specific and will not load on ROCm." + }, + "workarounds": [] + }, + "gfx942": { + "gpu_family": "AMD Instinct MI300X / MI325X / MI300A", + "vram_gb_note": "Varies: MI300X=192, MI325X=256, MI300A=128. Use detect.py vram_gb for actual value.", + "vram_gb": 192, + "env_defaults": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_USE_AITER_FP4BMM": "0" + }, + "precision": { + "native": ["bf16", "fp16", "fp8_fnuz", "int8"], + "emulated": ["mxfp4", "mxfp6"], + "unsupported": ["nvfp4"], + "notes": "FP8 uses FNUZ (E4M3FNUZ) dialect, not OCP. vLLM auto-converts OCP checkpoints. MXFP4/MXFP6 compute is emulated (dequant to BF16 during matmul), but weights stay compressed in VRAM. NVFP4 is NVIDIA-specific and will not load on ROCm." 
+ }, + "workarounds": [ + {"id": "vllm-34641", "description": "FP4BMM=0 mandatory on gfx942 (MI300X crash bug)"} + ] + } + }, + "legacy_models": { + "Qwen/Qwen3-0.6B": { + "vram_fp16_gb": 2, "min_tp": 1, + "tool_call_parser": "hermes", + "env_vars": {}, "vllm_args": [] + }, + "Qwen/Qwen3-1.7B": { + "vram_fp16_gb": 4, "min_tp": 1, + "tool_call_parser": "hermes", + "env_vars": {}, "vllm_args": [] + }, + "Qwen/Qwen3-4B": { + "vram_fp16_gb": 9, "min_tp": 1, + "tool_call_parser": "hermes", + "env_vars": {}, "vllm_args": [] + }, + "Qwen/Qwen3-8B": { + "vram_fp16_gb": 18, "min_tp": 1, + "tool_call_parser": "hermes", + "env_vars": {}, "vllm_args": [] + }, + "Qwen/Qwen3-14B": { + "vram_fp16_gb": 30, "min_tp": 1, + "tool_call_parser": "hermes", + "env_vars": {}, "vllm_args": [] + }, + "Qwen/Qwen3-32B": { + "vram_fp16_gb": 66, "min_tp": 1, + "tool_call_parser": "hermes", + "env_vars": {}, "vllm_args": [] + }, + "Qwen/Qwen3-72B": { + "vram_fp16_gb": 148, "min_tp": 1, + "tool_call_parser": "hermes", + "env_vars": {}, "vllm_args": [] + }, + "Qwen/Qwen3-235B-A22B": { + "vram_fp16_gb": 564, "min_tp": 4, + "tool_call_parser": "hermes", "reasoning_parser": "qwen3", + "env_vars": { + "VLLM_USE_V1": "1", + "VLLM_ROCM_USE_AITER_MHA": "0", + "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1", + "VLLM_USE_TRITON_FLASH_ATTN": "0", + "SAFETENSORS_FAST_GPU": "1" + }, + "vllm_args": [ + "--distributed-executor-backend mp", + "--max-num-batched-tokens 32768", + "--max-model-len 32768", + "--no-enable-prefix-caching", + "--gpu-memory-utilization 0.8", + "--swap-space 32" + ] + }, + "Qwen/Qwen3-VL-7B-Instruct": { + "vram_fp16_gb": 18, "min_tp": 1, + "tool_call_parser": "hermes", + "env_vars": { + "MIOPEN_USER_DB_PATH": "$(pwd)/miopen", + "MIOPEN_FIND_MODE": "FAST", + "SAFETENSORS_FAST_GPU": "1" + }, + "vllm_args": ["--mm-encoder-tp-mode data"] + }, + "Qwen/Qwen3-VL-32B-Instruct": { + "vram_fp16_gb": 70, "min_tp": 1, + "tool_call_parser": "hermes", + "env_vars": { + "MIOPEN_USER_DB_PATH": 
"$(pwd)/miopen", + "MIOPEN_FIND_MODE": "FAST", + "SAFETENSORS_FAST_GPU": "1" + }, + "vllm_args": ["--mm-encoder-tp-mode data"] + }, + "Qwen/Qwen2.5-VL-7B-Instruct": { + "vram_fp16_gb": 18, "min_tp": 1, + "tool_call_parser": "hermes", + "env_vars": {}, "vllm_args": ["--mm-encoder-tp-mode data"] + }, + "google/gemma-4-2B-it": { + "vram_fp16_gb": 5, "min_tp": 1, + "tool_call_parser": "hermes", + "env_vars": {}, "vllm_args": [] + }, + "google/gemma-4-4B-it": { + "vram_fp16_gb": 9, "min_tp": 1, + "tool_call_parser": "hermes", + "env_vars": {}, "vllm_args": [] + }, + "google/gemma-4-27B-it": { + "vram_fp16_gb": 56, "min_tp": 1, + "tool_call_parser": "hermes", + "env_vars": {}, "vllm_args": [] + }, + "google/gemma-4-31B-it": { + "vram_fp16_gb": 64, "min_tp": 1, + "tool_call_parser": "hermes", + "env_vars": {}, "vllm_args": [] + } + } +} diff --git a/skills/serving-llms-on-instinct/data/recipes_cache.json b/skills/serving-llms-on-instinct/data/recipes_cache.json new file mode 100644 index 0000000..a2c9e3f --- /dev/null +++ b/skills/serving-llms-on-instinct/data/recipes_cache.json @@ -0,0 +1,11064 @@ +{ + "fetched_at": "2026-06-01T21:48:49.090727+00:00", + "docker_image": "vllm/vllm-openai-rocm:v0.22.0", + "docker_tag": "v0.22.0", + "docker_tag_date": "2026-05-29T11:13:08.484149Z", + "model_count": 98, + "models": { + "ByteDance-Seed/Seed-OSS-36B-Instruct": { + "hf_id": "ByteDance-Seed/Seed-OSS-36B-Instruct", + "meta": { + "title": "Seed-OSS-36B", + "provider": "Seed (ByteDance)", + "description": "ByteDance Seed-OSS 36B dense model with unique 'thinking budget' control and 512K context support", + "tasks": [ + "text" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "36B" + }, + "recipe": { + "meta": { + "title": "Seed-OSS-36B", + "slug": "seed-oss-36b", + "provider": "Seed (ByteDance)", + "description": "ByteDance Seed-OSS 36B dense model with unique 'thinking budget' control and 512K context support", + "date_updated": 
"2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "related_recipes": [] + }, + "model": { + "model_id": "ByteDance-Seed/Seed-OSS-36B-Instruct", + "min_vllm_version": "0.11.0", + "architecture": "dense", + "parameter_count": "36B", + "active_parameters": "36B", + "context_length": 524288, + "base_args": [], + "base_env": {} + }, + "dependencies": [ + { + "note": "Pinned transformers commit required for Seed-OSS tokenizer compatibility", + "command": "uv pip install git+https://github.com/huggingface/transformers.git@56d68c6706ee052b445e1e476056ed92ac5eb383" + } + ], + "features": { + "tool_calling": { + "description": "Seed-OSS tool-call parser with automatic tool choice", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "seed_oss" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 86, + "description": "Native BF16 weights on 8x GPU (TP=8)", + "extra_args": [ + "--tensor-parallel-size", + "8" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": { + "amd": { + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nSeed-OSS-36B is a dense language model from ByteDance Seed with a unique **thinking\nbudget** feature for controlled reasoning, and up to 512K context. 
Users can choose\ntensor-parallel (low latency) or data-parallel (high throughput).\n\n## Prerequisites\n\n- Hardware: 8x GPU recommended (TP=8); also runs on consumer hardware like RTX 3090\n- vLLM >= 0.11.0 (support may require main branch)\n- Latest transformers for compatibility\n\n### Install vLLM (NVIDIA)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\nuv pip install git+https://github.com/huggingface/transformers.git@56d68c6706ee052b445e1e476056ed92ac5eb383\n```\n\n### Install vLLM (AMD ROCm)\n\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/\n```\n\n## Launch commands\n\nNVIDIA:\n\n```bash\nvllm serve ByteDance-Seed/Seed-OSS-36B-Instruct \\\n --host localhost --port 8000 \\\n --tensor-parallel-size 8 \\\n --enable-auto-tool-choice \\\n --tool-call-parser seed_oss\n```\n\nAMD:\n\n```bash\nexport VLLM_ROCM_USE_AITER=1\nvllm serve ByteDance-Seed/Seed-OSS-36B-Instruct \\\n --tensor-parallel-size 8 \\\n --enable-auto-tool-choice --tool-call-parser seed_oss \\\n --trust-remote-code\n```\n\nTuning:\n- `--max-model-len=65536` works well; max is 512K.\n- `--max-num-batched-tokens=32768` for prompt-heavy; reduce to 8K\u201316K for latency.\n- `--gpu-memory-utilization=0.95` to maximize KV cache.\n\n## Thinking Budget\n\nControl the model's chain-of-thought length via `chat_template_kwargs`. Recommended\nvalues are multiples of 512 (512, 1K, 2K, 4K, 8K, 16K). 
Use `0` for direct answers.\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nmodel = client.models.list().data[0].id\n\nresponse = client.chat.completions.create(\n model=model,\n messages=[\n {\"role\": \"system\", \"content\": \"You are a helpful assistant\"},\n {\"role\": \"user\", \"content\": \"Janet's ducks lay 16 eggs per day...\"},\n ],\n extra_body={\"chat_template_kwargs\": {\"thinking_budget\": 512}},\n)\nprint(response.choices[0].message.content)\n```\n\ncurl:\n\n```bash\ncurl http://localhost:8000/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"ByteDance-Seed/Seed-OSS-36B-Instruct\",\n \"messages\": [{\"role\": \"user\", \"content\": \"Explain quantum computing\"}],\n \"chat_template_kwargs\": {\"thinking_budget\": 512}\n }'\n```\n\nThe model emits `<seed:think>` blocks with `<seed:cot_budget_reflect>` markers that\nreport token usage against the budget.\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --backend vllm --model ByteDance-Seed/Seed-OSS-36B-Instruct \\\n --endpoint /v1/completions --host localhost --port 8000 \\\n --dataset-name random --random-input 800 --random-output 100 \\\n --request-rate 2 --num-prompt 100\n```\n\n## References\n\n- [Seed-OSS-36B-Instruct on Hugging Face](https://huggingface.co/ByteDance-Seed/Seed-OSS-36B-Instruct)\n" + } + }, + "Google/gemma-4-26B-A4B-it": { + "hf_id": "Google/gemma-4-26B-A4B-it", + "meta": { + "title": "Gemma 4 26B-A4B IT", + "provider": "Google", + "description": "Google's Gemma 4 MoE multimodal model (26B total / 4B active) with 128 fine-grained experts, top-8 routing, thinking mode, and tool-use protocol.", + "tasks": [ + "multimodal", + "text" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified", + "trillium": "verified", + "ironwood": "verified", + "xeon6": "verified" + } + }, + "model_info": { + "architecture": "moe", + 
"parameter_count": "26B" + }, + "recipe": { + "meta": { + "title": "Gemma 4 26B-A4B IT", + "slug": "gemma-4-26b-a4b-it", + "provider": "Google", + "description": "Google's Gemma 4 MoE multimodal model (26B total / 4B active) with 128 fine-grained experts, top-8 routing, thinking mode, and tool-use protocol.", + "date_updated": "2026-05-11", + "difficulty": "intermediate", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "MoE multimodal model \u2014 26B total / 4B active, 128 experts with top-8 routing", + "related_recipes": [ + "google/gemma-4-E2B-it", + "google/gemma-4-E4B-it", + "google/gemma-4-31B-it" + ], + "platforms": [ + { + "id": "modal", + "blurb": "Serverless deploy via the Gemma 4 Modal script bundled with this recipe.", + "script": "https://github.com/vllm-project/recipes/blob/main/Google/gemma4-modal.py", + "install": "curl -O https://raw.githubusercontent.com/vllm-project/recipes/main/Google/gemma4-modal.py\npip install modal\nmodal setup\nmodal deploy gemma4-modal.py\nmodal run gemma4-modal.py\n" + } + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified", + "trillium": "verified", + "ironwood": "verified", + "xeon6": "verified" + } + }, + "model": { + "model_id": "google/gemma-4-26B-A4B-it", + "min_vllm_version": "0.19.1", + "architecture": "moe", + "parameter_count": "26B", + "active_parameters": "4B", + "context_length": 131072, + "base_args": [], + "base_env": {} + }, + "dependencies": [], + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Gemma 4 parser and chat template", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "gemma4", + "--chat-template", + "examples/tool_chat_template_gemma4.jinja" + ] + }, + "reasoning": { + "description": "Enable structured thinking/reasoning output", + "args": [ + "--reasoning-parser", + "gemma4" + ] + }, + "text_only": { + "description": "Skip loading the 
vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "spec_decoding": { + "description": "MTP speculative decoding for accelerated inference", + "args": [ + "--speculative-config", + "{\"model\":\"google/gemma-4-26B-A4B-it-assistant\",\"num_speculative_tokens\":4}" + ] + } + }, + "opt_in_features": [ + "text_only", + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 64, + "description": "Full BF16 \u2014 single 80 GB NVIDIA GPU or 1x MI300X/MI325X/MI350X/MI355X or 2x Xeon6/Xeon5 NUMA nodes" + }, + "fp8": { + "model_id": "RedHatAI/gemma-4-26B-A4B-it-FP8-dynamic", + "precision": "fp8", + "vram_minimum_gb": 32, + "description": "FP8 (E4M3) weights with dynamic per-token activation quantization \u2014 Hopper or Blackwell", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + }, + "nvfp4": { + "model_id": "RedHatAI/gemma-4-26B-A4B-it-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 16, + "description": "NVFP4 (4-bit) quantized weights \u2014 requires Blackwell (B200/B300)", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": { + "cpu": { + "extra_env": { + "VLLM_CPU_KVCACHE_SPACE": "40", + "VLLM_CPU_ATTN_SPLIT_KV": "0" + } + } + }, + "strategy_overrides": { + "single_node_tp": { + "tp": 1 + } + }, + "guide": "## Overview\n\n[Gemma 4 26B-A4B](https://huggingface.co/google/gemma-4-26B-A4B-it) is the Mixture-of-Experts member of Google's Gemma 4 family \u2014 26B total parameters with only 4B active per token via 128 fine-grained experts and top-8 routing. 
It supports text + images natively, structured thinking, function calling, and dynamic vision resolution.\n\n### Key Features\n- **MoE**: 128 fine-grained experts with top-8 routing and custom GELU-activated FFN.\n- **Multimodal**: Text + images natively (video via custom frame-extraction pipeline). Audio is only supported on the smaller E2B/E4B variants.\n- **Dual Attention**: Alternating sliding-window (local) and global attention with different head dimensions.\n- **Thinking Mode**: Structured reasoning via `<|channel>thought\\n...` delimiters.\n- **Function Calling**: Custom tool-call protocol with dedicated special tokens.\n- **Dynamic Vision Resolution**: Per-request configurable vision token budget (70, 140, 280, 560, 1120 tokens).\n\nTPU support is provided through [vLLM TPU](https://github.com/vllm-project/tpu-inference) with recipes for [Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Gemma4) and [Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/blob/main/inference/ironwood/vLLM/Gemma4/).\n\n## Prerequisites\n\n### pip (NVIDIA CUDA)\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --pre \\\n --extra-index-url https://wheels.vllm.ai/nightly/cu129 \\\n --extra-index-url https://download.pytorch.org/whl/cu129 \\\n --index-strategy unsafe-best-match\n```\n\n### pip (AMD ROCm: MI300X, MI325X, MI350X, MI355X)\nRequires Python 3.12, ROCm 7.2.1, glibc >= 2.35 (Ubuntu 22.04+).\n```bash\nuv venv --python 3.12\nsource .venv/bin/activate\nuv pip install vllm --pre \\\n --extra-index-url https://wheels.vllm.ai/rocm/nightly/rocm721 --upgrade\n```\n\n### pip (Intel Xeon 6 CPUs)\nFor Intel and AMD x86 CPUs, follow the [CPU pre-built wheels](https://docs.vllm.ai/en/latest/getting_started/installation/cpu/#pre-built-wheels) installation instructions.\n\n### Docker\n```bash\ndocker pull vllm/vllm-openai:gemma4-0505-cu129 # NVIDIA Hopper (H100/H200, CUDA 12.9)\ndocker pull 
vllm/vllm-openai:gemma4-0505-cu130 # NVIDIA Blackwell (B200/B300, CUDA 13.0)\ndocker pull vllm/vllm-openai-rocm:latest # AMD\ndocker pull vllm/vllm-openai-cpu:latest-x86_64 # For Intel Xeon 6\n```\nTPU images are published separately by [vllm-project/tpu-inference](https://github.com/vllm-project/tpu-inference); see the Trillium / Ironwood tpu-recipes below for the pinned tag.\n\n## Deployment Configurations\n\n### 26B MoE on 1x A100/H100 (BF16)\n```bash\nvllm serve google/gemma-4-26B-A4B-it \\\n --max-model-len 32768 \\\n --gpu-memory-utilization 0.90\n```\n\n### Full-Featured Server Launch\nEnables text, image, thinking, and tool calling:\n```bash\nvllm serve google/gemma-4-26B-A4B-it \\\n --max-model-len 16384 \\\n --gpu-memory-utilization 0.90 \\\n --enable-auto-tool-choice \\\n --reasoning-parser gemma4 \\\n --tool-call-parser gemma4 \\\n --chat-template examples/tool_chat_template_gemma4.jinja \\\n --limit-mm-per-prompt.image 4 \\\n --async-scheduling \\\n --host 0.0.0.0 \\\n --port 8000\n```\n\n### Docker (NVIDIA)\n```bash\ndocker run -itd --name gemma4-moe \\\n --ipc=host --network host --shm-size 16G --gpus all \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai:gemma4-0505-cu129 \\\n --model google/gemma-4-26B-A4B-it \\\n --max-model-len 32768 \\\n --gpu-memory-utilization 0.90 \\\n --host 0.0.0.0 --port 8000\n```\nSwap `vllm/vllm-openai:gemma4-0505-cu129` for `vllm/vllm-openai:gemma4-0505-cu130` on Blackwell (B200/B300).\n\n### Docker (AMD MI300X/MI325X/MI350X/MI355X)\n```bash\ndocker run -itd --name gemma4-rocm \\\n --ipc=host --network=host --privileged \\\n --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri \\\n --group-add=video --cap-add=SYS_PTRACE \\\n --security-opt=seccomp=unconfined --shm-size 16G \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai-rocm:latest \\\n --model google/gemma-4-26B-A4B-it \\\n --host 0.0.0.0 --port 8000\n```\n\n### Docker (Cloud TPU \u2014 Trillium / 
Ironwood)\nTPU uses the separate `vllm/vllm-tpu` image (no pip wheel). Pull the tag specified by the upstream [Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Gemma4) or [Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/blob/main/inference/ironwood/vLLM/Gemma4/) recipe, then run:\n```bash\ndocker run -itd --name gemma4-tpu \\\n --privileged --network host --shm-size 16G \\\n -v /dev/shm:/dev/shm -e HF_TOKEN=$HF_TOKEN \\\n vllm/vllm-tpu:latest \\\n --model google/gemma-4-26B-A4B-it \\\n --tensor-parallel-size 8 \\\n --max-model-len 16384 \\\n --disable_chunked_mm_input \\\n --host 0.0.0.0 --port 8000\n```\nTrillium requires a 4-chip slice minimum; Ironwood runs on a single chip.\n\n### Intel Xeon 6 Deployment via Docker\n\nLaunch the x86 CPU vLLM Docker container for `google/gemma-4-26B-A4B-it`:\n\n```bash\ndocker run -itd --name gemma4-cpu \\\n --network host \\\n --shm-size 16g \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n -e VLLM_CPU_KVCACHE_SPACE=40 \\\n -e VLLM_CPU_ATTN_SPLIT_KV=0 \\\n vllm/vllm-openai-cpu:latest-x86_64 \\\n --model google/gemma-4-26B-A4B-it \\\n --host 0.0.0.0 \\\n --port 8000\n```\n\nFor additional Intel Xeon 6 deployment details, see the Intel Software Catalog entries for [Gemma 4 26B-A4B IT](https://aiswcatalog.intel.com/models/google-gemma-4-26b-a4b-it).\n\n## Client Usage\n\n### Text Generation\n```python\nfrom openai import OpenAI\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\nresponse = client.chat.completions.create(\n model=\"google/gemma-4-26B-A4B-it\",\n messages=[{\"role\": \"user\", \"content\": \"Write a poem about the ocean.\"}],\n max_tokens=512, temperature=0.7,\n)\nprint(response.choices[0].message.content)\n```\n\n### Image Understanding\n```python\nresponse = client.chat.completions.create(\n model=\"google/gemma-4-26B-A4B-it\",\n messages=[{\"role\": \"user\", \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": 
\"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg\"}},\n {\"type\": \"text\", \"text\": \"Describe this image in detail.\"},\n ]}],\n max_tokens=1024,\n)\n```\n\n### Thinking Mode\n```bash\nvllm serve google/gemma-4-26B-A4B-it \\\n --max-model-len 16384 \\\n --reasoning-parser gemma4 \\\n --tool-call-parser gemma4 \\\n --enable-auto-tool-choice \\\n --chat-template examples/tool_chat_template_gemma4.jinja\n```\nEnable per-request via `extra_body={\"chat_template_kwargs\": {\"enable_thinking\": True}}`.\n\n### Dynamic Vision Resolution\nSupported values: 70, 140, 280 (default), 560, 1120 tokens/image.\n```bash\nvllm serve google/gemma-4-26B-A4B-it \\\n --mm-processor-kwargs '{\"max_soft_tokens\": 560}'\n```\n\n## Configuration Tips\n\n- Set `--max-model-len` to match your workload.\n- `--gpu-memory-utilization 0.90-0.95` maximizes KV cache.\n- Text-only workloads: `--limit-mm-per-prompt '{\"image\": 0, \"audio\": 0}'`.\n- `--async-scheduling` improves throughput.\n- FP8 KV cache (`--kv-cache-dtype fp8`) saves ~50% KV memory.\n- For MoE, TEP (tensor-expert parallelism) and DEP (data-expert parallelism) strategies scale better than pure TP at large node counts.\n\n## Quantized Variants\n\nTwo pre-quantized checkpoints are available:\n\n- [`RedHatAI/gemma-4-26B-A4B-it-FP8-dynamic`](https://huggingface.co/RedHatAI/gemma-4-26B-A4B-it-FP8-dynamic) \u2014 FP8 (E4M3) weights with dynamic per-token activation quantization; runs on Hopper and Blackwell.\n- [`RedHatAI/gemma-4-26B-A4B-it-NVFP4`](https://huggingface.co/RedHatAI/gemma-4-26B-A4B-it-NVFP4) \u2014 NVFP4 (4-bit) weights; requires Blackwell (B200/B300).\n\nPick them from the **Variant** dropdown above, or pass the repo id directly to `vllm serve`.\n\n## Throughput vs Latency\n\n| Goal | TP | `--max-num-seqs` | Notes |\n|------|----|------------------|-------|\n| Max throughput | 1-2 | 256-512 | Best tok/s per GPU |\n| Min latency | 4-8 | 8-16 | Best TTFT/TPOT |\n| Balanced | 2 | 
128 | Mixed workloads |\n\n## Speculative Decoding (MTP)\n\nEnable the **Spec Decoding** feature toggle (above) or add `--speculative-config` manually to use MTP drafting with the [assistant model](https://huggingface.co/google/gemma-4-26B-A4B-it-assistant). Recommended `num_speculative_tokens`: 4 for this model. See the [Gemma 4 usage guide](../../Google/Gemma4) for details and benchmarks.\n\n> **Note:** MTP speculative decoding for Gemma 4 is only available on the vLLM nightly build \u2014 it has not yet landed in a stable release. Install via the nightly wheel (`uv pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly/cu129 \u2026`) or use the `vllm/vllm-openai:gemma4-0505-cu129` / `vllm/vllm-openai:gemma4-0505-cu130` images above; the standard `:latest` stable tag does not include this feature.\n\n## References\n\n- [Model card](https://huggingface.co/google/gemma-4-26B-A4B-it)\n- [FP8 variant](https://huggingface.co/RedHatAI/gemma-4-26B-A4B-it-FP8-dynamic)\n- [NVFP4 variant](https://huggingface.co/RedHatAI/gemma-4-26B-A4B-it-NVFP4)\n- [Gemma docs](https://ai.google.dev/gemma/docs)\n- [vLLM Gemma 4 tool-call template](https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_gemma4.jinja)\n- [TPU recipes: Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Gemma4)\n- [TPU recipes: Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/blob/main/inference/ironwood/vLLM/Gemma4/)\n- [Xeon 6 recipe: Gemma 4 26B-A4B IT](https://aiswcatalog.intel.com/models/google-gemma-4-26b-a4b-it)\n" + } + }, + "Google/gemma-4-31B-it": { + "hf_id": "Google/gemma-4-31B-it", + "meta": { + "title": "Gemma 4 31B IT", + "provider": "Google", + "description": "Google's unified multimodal Gemma 4 dense model (31B) with native text, image, and audio, plus thinking mode and tool-use protocol.", + "tasks": [ + "multimodal", + "text" + ], + "hardware": { + "h100": "verified", + "mi300x": "verified", + "mi325x": 
"verified", + "mi355x": "verified", + "trillium": "verified", + "ironwood": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "31B" + }, + "recipe": { + "meta": { + "title": "Gemma 4 31B IT", + "slug": "gemma-4-31b-it", + "provider": "Google", + "description": "Google's unified multimodal Gemma 4 dense model (31B) with native text, image, and audio, plus thinking mode and tool-use protocol.", + "date_updated": "2026-05-11", + "difficulty": "intermediate", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Unified multimodal model with structured thinking, function calling, dynamic vision resolution", + "related_recipes": [], + "hardware": { + "h100": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified", + "trillium": "verified", + "ironwood": "verified" + } + }, + "model": { + "model_id": "google/gemma-4-31B-it", + "min_vllm_version": "0.19.1", + "architecture": "dense", + "parameter_count": "31B", + "active_parameters": "31B", + "context_length": 262144, + "base_args": [], + "base_env": {} + }, + "dependencies": [ + { + "note": "Audio extras \u2014 only needed when serving the audio modality", + "command": "uv pip install \"vllm[audio]\"", + "optional": true + } + ], + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Gemma 4 parser and chat template", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "gemma4", + "--chat-template", + "examples/tool_chat_template_gemma4.jinja" + ] + }, + "reasoning": { + "description": "Enable structured thinking/reasoning output", + "args": [ + "--reasoning-parser", + "gemma4" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. 
Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "spec_decoding": { + "description": "MTP speculative decoding for accelerated inference", + "args": [ + "--speculative-config", + "{\"model\":\"google/gemma-4-31B-it-assistant\",\"num_speculative_tokens\":4}" + ] + } + }, + "opt_in_features": [ + "text_only", + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 75, + "description": "Full BF16 \u2014 single 80 GB NVIDIA GPU or 1x MI300X/MI325X/MI350X/MI355X" + }, + "fp8": { + "model_id": "RedHatAI/gemma-4-31B-it-FP8-dynamic", + "precision": "fp8", + "vram_minimum_gb": 38, + "description": "FP8 (E4M3) linear weights with dynamic per-token activation quantization (vision tower stays BF16) \u2014 Hopper or Blackwell", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + }, + "nvfp4": { + "model_id": "nvidia/gemma-4-31B-it-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 19, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": { + "single_node_tp": { + "tp": 1 + } + }, + "guide": "## Overview\n\n[Gemma 4](https://ai.google.dev/gemma/docs) is Google's most capable open model family, featuring a unified multimodal architecture that natively processes text, images, and audio. Gemma 4 models support structured thinking/reasoning, function calling with a custom tool-use protocol, and dynamic vision resolution \u2014 all available through vLLM's OpenAI-compatible API.\n\n### Key Features\n- **Multimodal**: Text + images natively (video via custom frame-extraction pipeline). 
The smaller E2B and E4B models also support audio.\n- **MoE variant**: 128 fine-grained experts with top-8 routing and custom GELU-activated FFN (Gemma 4 26B-A4B).\n- **Dual Attention**: Alternating sliding-window (local) and global attention with different head dimensions.\n- **Thinking Mode**: Structured reasoning via `<|channel>thought\\n...` delimiters.\n- **Function Calling**: Custom tool-call protocol with dedicated special tokens.\n- **Dynamic Vision Resolution**: Per-request configurable vision token budget (70, 140, 280, 560, 1120 tokens).\n\n### Supported Variants\n\nDense:\n- `google/gemma-4-E2B-it` (effective 2B)\n- `google/gemma-4-E4B-it` (effective 4B)\n- `google/gemma-4-31B-it` (31B)\n\nMoE:\n- `google/gemma-4-26B-A4B-it` (26B total / 4B active)\n\nTPU support is provided through [vLLM TPU](https://github.com/vllm-project/tpu-inference) with recipes for [Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Gemma4) and [Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/blob/main/inference/ironwood/vLLM/Gemma4/).\n\n## Prerequisites\n\n### pip (NVIDIA CUDA)\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --pre \\\n --extra-index-url https://wheels.vllm.ai/nightly/cu129 \\\n --extra-index-url https://download.pytorch.org/whl/cu129 \\\n --index-strategy unsafe-best-match\n```\n\n### pip (AMD ROCm: MI300X, MI325X, MI350X, MI355X)\nRequires Python 3.12, ROCm 7.2.1, glibc >= 2.35 (Ubuntu 22.04+).\n```bash\nuv venv --python 3.12\nsource .venv/bin/activate\nuv pip install vllm --pre \\\n --extra-index-url https://wheels.vllm.ai/rocm/nightly/rocm721 --upgrade\n```\n\n### Docker\n```bash\ndocker pull vllm/vllm-openai:gemma4-0505-cu129 # NVIDIA Hopper (H100/H200, CUDA 12.9)\ndocker pull vllm/vllm-openai:gemma4-0505-cu130 # NVIDIA Blackwell (B200/B300, CUDA 13.0)\ndocker pull vllm/vllm-openai-rocm:latest # AMD\n```\nTPU images are published separately by 
[vllm-project/tpu-inference](https://github.com/vllm-project/tpu-inference); see the Trillium / Ironwood tpu-recipes below for the pinned tag.\n\n## Deployment Configurations\n\n### Quick Start (Single GPU)\n```bash\nvllm serve google/gemma-4-E4B-it \\\n --max-model-len 32768 # up to 131072\n```\n\n### 31B Dense on 2xA100/H100 (TP=2, BF16)\n```bash\nvllm serve google/gemma-4-31B-it \\\n --tensor-parallel-size 2 \\\n --max-model-len 32768 \\\n --gpu-memory-utilization 0.90\n```\n\n### 26B MoE on 1xA100/H100 (BF16)\n```bash\nvllm serve google/gemma-4-26B-A4B-it \\\n --max-model-len 32768 \\\n --gpu-memory-utilization 0.90\n```\n\n### Full-Featured Server Launch\nEnables text, image, audio, thinking, and tool calling:\n```bash\nvllm serve google/gemma-4-31B-it \\\n --tensor-parallel-size 2 \\\n --max-model-len 16384 \\\n --gpu-memory-utilization 0.90 \\\n --enable-auto-tool-choice \\\n --reasoning-parser gemma4 \\\n --tool-call-parser gemma4 \\\n --chat-template examples/tool_chat_template_gemma4.jinja \\\n --limit-mm-per-prompt '{\"image\": 4, \"audio\": 1}' \\\n --async-scheduling \\\n --host 0.0.0.0 \\\n --port 8000\n```\n\n### Docker (NVIDIA)\n```bash\ndocker run -itd --name gemma4 \\\n --ipc=host --network host --shm-size 16G --gpus all \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai:gemma4-0505-cu129 \\\n --model google/gemma-4-31B-it \\\n --tensor-parallel-size 2 \\\n --max-model-len 32768 \\\n --gpu-memory-utilization 0.90 \\\n --host 0.0.0.0 --port 8000\n```\nSwap `vllm/vllm-openai:gemma4-0505-cu129` for `vllm/vllm-openai:gemma4-0505-cu130` on Blackwell (B200/B300).\n\n### Docker (AMD MI300X/MI325X/MI350X/MI355X)\n```bash\ndocker run -itd --name gemma4-rocm \\\n --ipc=host --network=host --privileged \\\n --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri \\\n --group-add=video --cap-add=SYS_PTRACE \\\n --security-opt=seccomp=unconfined --shm-size 16G \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n 
vllm/vllm-openai-rocm:latest \\\n --model google/gemma-4-31B-it \\\n --host 0.0.0.0 --port 8000\n```\n\n### Docker (Cloud TPU \u2014 Trillium / Ironwood)\nTPU uses the separate `vllm/vllm-tpu` image (no pip wheel). Pull the tag specified by the upstream [Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Gemma4) or [Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/blob/main/inference/ironwood/vLLM/Gemma4/) recipe, then run:\n```bash\ndocker run -itd --name gemma4-tpu \\\n --privileged --network host --shm-size 16G \\\n -v /dev/shm:/dev/shm -e HF_TOKEN=$HF_TOKEN \\\n vllm/vllm-tpu:latest \\\n --model google/gemma-4-31B-it \\\n --tensor-parallel-size 8 \\\n --max-model-len 16384 \\\n --disable_chunked_mm_input \\\n --host 0.0.0.0 --port 8000\n```\n\n## Client Usage\n\n### Text Generation\n```python\nfrom openai import OpenAI\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\nresponse = client.chat.completions.create(\n model=\"google/gemma-4-31B-it\",\n messages=[{\"role\": \"user\", \"content\": \"Write a poem about the ocean.\"}],\n max_tokens=512, temperature=0.7,\n)\nprint(response.choices[0].message.content)\n```\n\n### Image Understanding\n```python\nresponse = client.chat.completions.create(\n model=\"google/gemma-4-31B-it\",\n messages=[{\"role\": \"user\", \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg\"}},\n {\"type\": \"text\", \"text\": \"Describe this image in detail.\"},\n ]}],\n max_tokens=1024,\n)\n```\n\n### Dynamic Vision Resolution\nSupported values: 70, 140, 280 (default), 560, 1120 tokens/image.\n```bash\nvllm serve google/gemma-4-31B-it \\\n --mm-processor-kwargs '{\"max_soft_tokens\": 560}'\n```\n\n### Audio (E2B / E4B)\nRequires `uv pip install \"vllm[audio]\"`.\n```bash\nvllm serve google/gemma-4-E2B-it \\\n --max-model-len 8192 \\\n --limit-mm-per-prompt 
'{\"image\": 4, \"audio\": 1}'\n```\n\n### Thinking Mode\n```bash\nvllm serve google/gemma-4-31B-it \\\n --max-model-len 16384 \\\n --enable-auto-tool-choice \\\n --reasoning-parser gemma4 \\\n --tool-call-parser gemma4 \\\n --chat-template examples/tool_chat_template_gemma4.jinja\n```\nEnable thinking per-request via `extra_body={\"chat_template_kwargs\": {\"enable_thinking\": True}}`, or default-on with `--default-chat-template-kwargs '{\"enable_thinking\": true}'`.\n\n### Structured Outputs\nvLLM guided decoding constrains output to a JSON schema. Include semantic instructions in the system prompt \u2014 the model does not see schema descriptions.\n\n## Configuration Tips\n\n- Set `--max-model-len` to match your workload.\n- `--gpu-memory-utilization 0.90\u20130.95` maximizes KV cache.\n- Image-only workloads: pass `--limit-mm-per-prompt.audio 0`.\n- Text-only workloads: pass `--limit-mm-per-prompt '{\"image\": 0, \"audio\": 0}'` to skip MM profiling.\n- `--async-scheduling` improves throughput.\n- FP8 KV cache (`--kv-cache-dtype fp8`) saves ~50% KV memory.\n\n## Quantized Variants\n\nTwo pre-quantized checkpoints are available:\n\n- [`RedHatAI/gemma-4-31B-it-FP8-dynamic`](https://huggingface.co/RedHatAI/gemma-4-31B-it-FP8-dynamic) \u2014 FP8 (E4M3) linear weights with dynamic per-token activation quantization (vision tower stays BF16); runs on Hopper and Blackwell.\n- [`nvidia/gemma-4-31B-it-NVFP4`](https://huggingface.co/nvidia/gemma-4-31B-it-NVFP4) \u2014 NVFP4 (4-bit) weights; requires Blackwell (B200/B300).\n\nPick them from the **Variant** dropdown above, or pass the repo id directly to `vllm serve`.\n\n## Throughput vs Latency\n\n| Goal | TP | `--max-num-seqs` | Notes |\n|------|----|------------------|-------|\n| Max throughput | 1-2 | 256-512 | Best tok/s per GPU |\n| Min latency | 4-8 | 8-16 | Best TTFT/TPOT |\n| Balanced | 2 | 128 | Mixed workloads |\n\n## Speculative Decoding (MTP)\n\nEnable the **Spec Decoding** feature toggle (above) or add 
`--speculative-config` manually to use MTP drafting with the [assistant model](https://huggingface.co/google/gemma-4-31B-it-assistant). Recommended `num_speculative_tokens`: 4\u20138 for this model. See the [Gemma 4 usage guide](../../Google/Gemma4) for details and benchmarks.\n\n> **Note:** MTP speculative decoding for Gemma 4 is only available on the vLLM nightly build \u2014 it has not yet landed in a stable release. Install via the nightly wheel (`uv pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly/cu129 \u2026`) or use the `vllm/vllm-openai:gemma4-0505-cu129` / `vllm/vllm-openai:gemma4-0505-cu130` images above; the standard `:latest` stable tag does not include this feature.\n\n## References\n\n- [Model card](https://huggingface.co/google/gemma-4-31B-it)\n- [FP8 variant](https://huggingface.co/RedHatAI/gemma-4-31B-it-FP8-dynamic)\n- [NVFP4 variant](https://huggingface.co/nvidia/gemma-4-31B-it-NVFP4)\n- [Gemma docs](https://ai.google.dev/gemma/docs)\n- [vLLM Gemma 4 tool-call template](https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_gemma4.jinja)\n- [TPU recipes: Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Gemma4)\n- [TPU recipes: Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/blob/main/inference/ironwood/vLLM/Gemma4/)\n" + } + }, + "Google/gemma-4-E2B-it": { + "hf_id": "Google/gemma-4-E2B-it", + "meta": { + "title": "Gemma 4 E2B IT", + "provider": "Google", + "description": "Google's compact Gemma 4 multimodal model (effective 2B) with native text, image, and audio, plus thinking mode and tool-use protocol.", + "tasks": [ + "multimodal", + "text" + ], + "hardware": { + "h100": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified", + "trillium": "verified", + "ironwood": "verified", + "xeon6": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "5B" + }, + "recipe": { + "meta": { + 
"title": "Gemma 4 E2B IT", + "slug": "gemma-4-e2b-it", + "provider": "Google", + "description": "Google's compact Gemma 4 multimodal model (effective 2B) with native text, image, and audio, plus thinking mode and tool-use protocol.", + "date_updated": "2026-05-11", + "difficulty": "beginner", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Compact unified multimodal model with audio, thinking, and function calling \u2014 runs on a single 24 GB+ GPU", + "related_recipes": [ + "google/gemma-4-E4B-it", + "google/gemma-4-31B-it", + "google/gemma-4-26B-A4B-it" + ], + "hardware": { + "h100": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified", + "trillium": "verified", + "ironwood": "verified", + "xeon6": "verified" + } + }, + "model": { + "model_id": "google/gemma-4-E2B-it", + "min_vllm_version": "0.19.1", + "architecture": "dense", + "parameter_count": "5B", + "active_parameters": "5B", + "context_length": 131072, + "base_args": [], + "base_env": {} + }, + "dependencies": [ + { + "note": "Audio extras \u2014 needed to serve the audio modality", + "command": "uv pip install \"vllm[audio]\"", + "optional": true + } + ], + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Gemma 4 parser and chat template", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "gemma4", + "--chat-template", + "examples/tool_chat_template_gemma4.jinja" + ] + }, + "reasoning": { + "description": "Enable structured thinking/reasoning output", + "args": [ + "--reasoning-parser", + "gemma4" + ] + }, + "text_only": { + "description": "Skip loading the vision and audio encoders for text-only workloads \u2014 frees VRAM for KV cache. 
Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "spec_decoding": { + "description": "MTP speculative decoding with centroids masking for accelerated inference", + "args": [ + "--speculative-config", + "{\"model\":\"google/gemma-4-E2B-it-assistant\",\"num_speculative_tokens\":2}" + ] + } + }, + "opt_in_features": [ + "text_only", + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 13, + "description": "Full BF16 \u2014 single 24 GB+ NVIDIA GPU or 1x MI300X/MI325X/MI350X/MI355X or 1x Xeon6/Xeon5 NUMA node" + }, + "fp8": { + "model_id": "RedHatAI/gemma-4-E2B-it-FP8-dynamic", + "precision": "fp8", + "vram_minimum_gb": 6, + "description": "FP8 (E4M3) linear weights with dynamic per-token activation quantization (vision/audio encoders stay BF16) \u2014 Hopper or Blackwell", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": { + "cpu": { + "extra_env": { + "VLLM_CPU_KVCACHE_SPACE": "40", + "VLLM_CPU_ATTN_SPLIT_KV": "0" + } + } + }, + "strategy_overrides": { + "single_node_tp": { + "tp": 1 + } + }, + "guide": "## Overview\n\n[Gemma 4 E2B](https://huggingface.co/google/gemma-4-E2B-it) is the smallest member of Google's Gemma 4 family \u2014 an effective-2B unified multimodal model that natively processes text, images, and audio, with structured thinking/reasoning, function calling, and dynamic vision resolution. 
It runs comfortably on a single 24 GB+ GPU.\n\n### Key Features\n- **Multimodal**: Text + images + audio natively (video via custom frame-extraction pipeline).\n- **Dual Attention**: Alternating sliding-window (local) and global attention with different head dimensions.\n- **Thinking Mode**: Structured reasoning via `<|channel>thought\\n...` delimiters.\n- **Function Calling**: Custom tool-call protocol with dedicated special tokens.\n- **Dynamic Vision Resolution**: Per-request configurable vision token budget (70, 140, 280, 560, 1120 tokens).\n\nTPU support is provided through [vLLM TPU](https://github.com/vllm-project/tpu-inference) with recipes for [Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Gemma4) and [Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/blob/main/inference/ironwood/vLLM/Gemma4/).\n\n## Prerequisites\n\n### pip (NVIDIA CUDA)\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --pre \\\n --extra-index-url https://wheels.vllm.ai/nightly/cu129 \\\n --extra-index-url https://download.pytorch.org/whl/cu129 \\\n --index-strategy unsafe-best-match\n```\n\n### pip (AMD ROCm: MI300X, MI325X, MI350X, MI355X)\nRequires Python 3.12, ROCm 7.2.1, glibc >= 2.35 (Ubuntu 22.04+).\n```bash\nuv venv --python 3.12\nsource .venv/bin/activate\nuv pip install vllm --pre \\\n --extra-index-url https://wheels.vllm.ai/rocm/nightly/rocm721 --upgrade\n```\n### pip (Intel Xeon 6 CPUs)\nFor Intel and AMD x86 CPUs, follow the [CPU pre-built wheels](https://docs.vllm.ai/en/latest/getting_started/installation/cpu/#pre-built-wheels) installation instructions.\n\n\n\n### Docker\n```bash\ndocker pull vllm/vllm-openai:gemma4-0505-cu129 # NVIDIA Hopper (H100/H200, CUDA 12.9)\ndocker pull vllm/vllm-openai:gemma4-0505-cu130 # NVIDIA Blackwell (B200/B300, CUDA 13.0)\ndocker pull vllm/vllm-openai-rocm:latest # AMD\ndocker pull vllm/vllm-openai-cpu:latest-x86_64 # For Intel Xeon 6\n```\n\n## Deployment 
Configurations\n\n### Quick Start (Single GPU)\n```bash\nvllm serve google/gemma-4-E2B-it \\\n --max-model-len 32768\n```\n\n### With Audio Support\n```bash\nvllm serve google/gemma-4-E2B-it \\\n --max-model-len 8192 \\\n --limit-mm-per-prompt '{\"image\": 4, \"audio\": 1}'\n```\n\n### Full-Featured Server Launch\nEnables text, image, audio, thinking, and tool calling:\n```bash\nvllm serve google/gemma-4-E2B-it \\\n --max-model-len 16384 \\\n --gpu-memory-utilization 0.90 \\\n --enable-auto-tool-choice \\\n --reasoning-parser gemma4 \\\n --tool-call-parser gemma4 \\\n --chat-template examples/tool_chat_template_gemma4.jinja \\\n --limit-mm-per-prompt '{\"image\": 4, \"audio\": 1}' \\\n --async-scheduling \\\n --host 0.0.0.0 \\\n --port 8000\n```\n\n### Docker (NVIDIA)\n```bash\ndocker run -itd --name gemma4-e2b \\\n --ipc=host --network host --shm-size 16G --gpus all \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai:gemma4-0505-cu129 \\\n --model google/gemma-4-E2B-it \\\n --max-model-len 32768 \\\n --host 0.0.0.0 --port 8000\n```\nSwap `vllm/vllm-openai:gemma4-0505-cu129` for `vllm/vllm-openai:gemma4-0505-cu130` on Blackwell (B200/B300).\n\n### Docker (AMD MI300X/MI325X/MI350X/MI355X)\n```bash\ndocker run -itd --name gemma4-rocm \\\n --ipc=host --network=host --privileged \\\n --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri \\\n --group-add=video --cap-add=SYS_PTRACE \\\n --security-opt=seccomp=unconfined --shm-size 16G \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai-rocm:latest \\\n --model google/gemma-4-E2B-it \\\n --host 0.0.0.0 --port 8000\n```\n\n### Docker (Cloud TPU \u2014 Trillium / Ironwood)\nTPU uses the separate `vllm/vllm-tpu` image (no pip wheel). 
Pull the tag specified by the upstream [Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Gemma4) or [Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/blob/main/inference/ironwood/vLLM/Gemma4/) recipe, then run:\n```bash\ndocker run -itd --name gemma4-tpu \\\n --privileged --network host --shm-size 16G \\\n -v /dev/shm:/dev/shm -e HF_TOKEN=$HF_TOKEN \\\n vllm/vllm-tpu:latest \\\n --model google/gemma-4-E2B-it \\\n --max-model-len 16384 \\\n --disable_chunked_mm_input \\\n --host 0.0.0.0 --port 8000\n```\n\n### Intel Xeon 6 Deployment via Docker\n\nLaunch the x86 CPU vLLM Docker container for `google/gemma-4-E2B-it`:\n\n```bash\ndocker run -itd --name gemma4-cpu \\\n --network host \\\n --shm-size 16g \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n -e VLLM_CPU_KVCACHE_SPACE=40 \\\n -e VLLM_CPU_ATTN_SPLIT_KV=0 \\\n vllm/vllm-openai-cpu:latest-x86_64 \\\n --model google/gemma-4-E2B-it \\\n --host 0.0.0.0 \\\n --port 8000\n```\n\n\n## Client Usage\n\n### Audio Transcription\n```python\nfrom openai import OpenAI\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\nresponse = client.chat.completions.create(\n model=\"google/gemma-4-E2B-it\",\n messages=[{\"role\": \"user\", \"content\": [\n {\"type\": \"audio_url\", \"audio_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/2/22/Beatbox_by_Wikipedia_user_Wikipedia_Brown.ogg\"}},\n {\"type\": \"text\", \"text\": \"Provide a verbatim, word-for-word transcription of the audio.\"},\n ]}],\n max_tokens=512,\n)\nprint(response.choices[0].message.content)\n```\n\n### Image Understanding\n```python\nresponse = client.chat.completions.create(\n model=\"google/gemma-4-E2B-it\",\n messages=[{\"role\": \"user\", \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg\"}},\n {\"type\": \"text\", \"text\": \"Describe this image in 
detail.\"},\n ]}],\n max_tokens=1024,\n)\n```\n\n### Thinking Mode\nLaunch with reasoning parser, then enable per-request:\n```bash\nvllm serve google/gemma-4-E2B-it \\\n --max-model-len 16384 \\\n --reasoning-parser gemma4 \\\n --tool-call-parser gemma4 \\\n --enable-auto-tool-choice \\\n --chat-template examples/tool_chat_template_gemma4.jinja\n```\nEnable per-request via `extra_body={\"chat_template_kwargs\": {\"enable_thinking\": True}}`.\n\n## Configuration Tips\n\n- Set `--max-model-len` to match your workload (max 131072).\n- Image-only workloads: `--limit-mm-per-prompt.audio 0`.\n- Text-only workloads: `--limit-mm-per-prompt '{\"image\": 0, \"audio\": 0}'` to skip MM profiling.\n- `--async-scheduling` improves throughput.\n- FP8 KV cache (`--kv-cache-dtype fp8`) saves ~50% KV memory.\n\n## Quantized Variant\n\n[`RedHatAI/gemma-4-E2B-it-FP8-dynamic`](https://huggingface.co/RedHatAI/gemma-4-E2B-it-FP8-dynamic) is a pre-quantized FP8 (E4M3) checkpoint \u2014 linear weights with dynamic per-token activation quantization, vision/audio encoders kept in BF16. Runs on Hopper and Blackwell. Pick the **fp8** variant above, or pass the repo id directly to `vllm serve`.\n\n## Speculative Decoding (MTP)\n\nEnable the **Spec Decoding** feature toggle (above) or add `--speculative-config` manually to use MTP drafting with the [assistant model](https://huggingface.co/google/gemma-4-E2B-it-assistant). Recommended `num_speculative_tokens`: 2 for this model. The E2B assistant uses centroids masking for efficient sparse logit computation. See the [Gemma 4 usage guide](../../Google/Gemma4) for details and benchmarks.\n\n> **Note:** MTP speculative decoding for Gemma 4 is only available on the vLLM nightly build \u2014 it has not yet landed in a stable release. 
Install via the nightly wheel (`uv pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly/cu129 \u2026`) or use the `vllm/vllm-openai:gemma4-0505-cu129` / `vllm/vllm-openai:gemma4-0505-cu130` images above; the standard `:latest` stable tag does not include this feature.\n\n## References\n\n- [Model card](https://huggingface.co/google/gemma-4-E2B-it)\n- [FP8 variant](https://huggingface.co/RedHatAI/gemma-4-E2B-it-FP8-dynamic)\n- [Gemma docs](https://ai.google.dev/gemma/docs)\n- [vLLM Gemma 4 tool-call template](https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_gemma4.jinja)\n- [TPU recipes: Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Gemma4)\n- [TPU recipes: Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/blob/main/inference/ironwood/vLLM/Gemma4/)\n" + } + }, + "Google/gemma-4-E4B-it": { + "hf_id": "Google/gemma-4-E4B-it", + "meta": { + "title": "Gemma 4 E4B IT", + "provider": "Google", + "description": "Google's compact Gemma 4 multimodal model (effective 4B) with native text, image, and audio, plus thinking mode and tool-use protocol.", + "tasks": [ + "multimodal", + "text" + ], + "hardware": { + "h100": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified", + "trillium": "verified", + "ironwood": "verified", + "xeon6": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "8B" + }, + "recipe": { + "meta": { + "title": "Gemma 4 E4B IT", + "slug": "gemma-4-e4b-it", + "provider": "Google", + "description": "Google's compact Gemma 4 multimodal model (effective 4B) with native text, image, and audio, plus thinking mode and tool-use protocol.", + "date_updated": "2026-05-11", + "difficulty": "beginner", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Effective-4B unified multimodal model with audio, thinking, and function calling \u2014 runs on a single 24 GB+ GPU", + "related_recipes": 
[ + "google/gemma-4-E2B-it", + "google/gemma-4-31B-it", + "google/gemma-4-26B-A4B-it" + ], + "hardware": { + "h100": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified", + "trillium": "verified", + "ironwood": "verified", + "xeon6": "verified" + } + }, + "model": { + "model_id": "google/gemma-4-E4B-it", + "min_vllm_version": "0.19.1", + "architecture": "dense", + "parameter_count": "8B", + "active_parameters": "8B", + "context_length": 131072, + "base_args": [], + "base_env": {} + }, + "dependencies": [ + { + "note": "Audio extras \u2014 needed to serve the audio modality", + "command": "uv pip install \"vllm[audio]\"", + "optional": true + } + ], + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Gemma 4 parser and chat template", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "gemma4", + "--chat-template", + "examples/tool_chat_template_gemma4.jinja" + ] + }, + "reasoning": { + "description": "Enable structured thinking/reasoning output", + "args": [ + "--reasoning-parser", + "gemma4" + ] + }, + "text_only": { + "description": "Skip loading the vision and audio encoders for text-only workloads \u2014 frees VRAM for KV cache. 
Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "spec_decoding": { + "description": "MTP speculative decoding with centroids masking for accelerated inference", + "args": [ + "--speculative-config", + "{\"model\":\"google/gemma-4-E4B-it-assistant\",\"num_speculative_tokens\":4}" + ] + } + }, + "opt_in_features": [ + "text_only", + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 20, + "description": "Full BF16 \u2014 single 24 GB+ NVIDIA GPU or 1x MI300X/MI325X/MI350X/MI355X or 2x Xeon6/Xeon5 NUMA nodes" + }, + "fp8": { + "model_id": "RedHatAI/gemma-4-E4B-it-FP8-dynamic", + "precision": "fp8", + "vram_minimum_gb": 10, + "description": "FP8 (E4M3) linear weights with dynamic per-token activation quantization (vision/audio encoders stay BF16) \u2014 Hopper or Blackwell", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": { + "cpu": { + "extra_env": { + "VLLM_CPU_KVCACHE_SPACE": "40", + "VLLM_CPU_ATTN_SPLIT_KV": "0" + } + } + }, + "strategy_overrides": { + "single_node_tp": { + "tp": 1 + } + }, + "guide": "## Overview\n\n[Gemma 4 E4B](https://huggingface.co/google/gemma-4-E4B-it) is Google's effective-4B unified multimodal model \u2014 text + images + audio in a single model, with structured thinking/reasoning, function calling, and dynamic vision resolution. 
It fits on a single 24 GB+ GPU.\n\n### Key Features\n- **Multimodal**: Text + images + audio natively (video via custom frame-extraction pipeline).\n- **Dual Attention**: Alternating sliding-window (local) and global attention with different head dimensions.\n- **Thinking Mode**: Structured reasoning via `<|channel>thought\\n...` delimiters.\n- **Function Calling**: Custom tool-call protocol with dedicated special tokens.\n- **Dynamic Vision Resolution**: Per-request configurable vision token budget (70, 140, 280, 560, 1120 tokens).\n\nTPU support is provided through [vLLM TPU](https://github.com/vllm-project/tpu-inference) with recipes for [Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Gemma4) and [Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/blob/main/inference/ironwood/vLLM/Gemma4/).\n\n## Prerequisites\n\n### pip (NVIDIA CUDA)\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --pre \\\n --extra-index-url https://wheels.vllm.ai/nightly/cu129 \\\n --extra-index-url https://download.pytorch.org/whl/cu129 \\\n --index-strategy unsafe-best-match\n```\n\n### pip (AMD ROCm: MI300X, MI325X, MI350X, MI355X)\nRequires Python 3.12, ROCm 7.2.1, glibc >= 2.35 (Ubuntu 22.04+).\n```bash\nuv venv --python 3.12\nsource .venv/bin/activate\nuv pip install vllm --pre \\\n --extra-index-url https://wheels.vllm.ai/rocm/nightly/rocm721 --upgrade\n```\n\n### pip (Intel Xeon 6 CPUs)\nFor Intel and AMD x86 CPUs, follow the [CPU pre-built wheels](https://docs.vllm.ai/en/latest/getting_started/installation/cpu/#pre-built-wheels) installation instructions.\n\n### Docker\n```bash\ndocker pull vllm/vllm-openai:gemma4-0505-cu129 # NVIDIA Hopper (H100/H200, CUDA 12.9)\ndocker pull vllm/vllm-openai:gemma4-0505-cu130 # NVIDIA Blackwell (B200/B300, CUDA 13.0)\ndocker pull vllm/vllm-openai-rocm:latest # AMD\ndocker pull vllm/vllm-openai-cpu:latest-x86_64 # For Intel Xeon 6\n```\n\n## Deployment Configurations\n\n### 
Quick Start (Single GPU)\n```bash\nvllm serve google/gemma-4-E4B-it \\\n --max-model-len 32768\n```\n\n### With Audio Support\n```bash\nvllm serve google/gemma-4-E4B-it \\\n --max-model-len 8192 \\\n --limit-mm-per-prompt '{\"image\": 4, \"audio\": 1}'\n```\n\n### Full-Featured Server Launch\nEnables text, image, audio, thinking, and tool calling:\n```bash\nvllm serve google/gemma-4-E4B-it \\\n --max-model-len 16384 \\\n --gpu-memory-utilization 0.90 \\\n --enable-auto-tool-choice \\\n --reasoning-parser gemma4 \\\n --tool-call-parser gemma4 \\\n --chat-template examples/tool_chat_template_gemma4.jinja \\\n --limit-mm-per-prompt '{\"image\": 4, \"audio\": 1}' \\\n --async-scheduling \\\n --host 0.0.0.0 \\\n --port 8000\n```\n\n### Docker (NVIDIA)\n```bash\ndocker run -itd --name gemma4-e4b \\\n --ipc=host --network host --shm-size 16G --gpus all \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai:gemma4-0505-cu129 \\\n --model google/gemma-4-E4B-it \\\n --max-model-len 32768 \\\n --host 0.0.0.0 --port 8000\n```\nSwap `vllm/vllm-openai:gemma4-0505-cu129` for `vllm/vllm-openai:gemma4-0505-cu130` on Blackwell (B200/B300).\n\n### Docker (AMD MI300X/MI325X/MI350X/MI355X)\n```bash\ndocker run -itd --name gemma4-rocm \\\n --ipc=host --network=host --privileged \\\n --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri \\\n --group-add=video --cap-add=SYS_PTRACE \\\n --security-opt=seccomp=unconfined --shm-size 16G \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai-rocm:latest \\\n --model google/gemma-4-E4B-it \\\n --host 0.0.0.0 --port 8000\n```\n\n### Docker (Cloud TPU \u2014 Trillium / Ironwood)\nTPU uses the separate `vllm/vllm-tpu` image (no pip wheel). 
Pull the tag specified by the upstream [Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Gemma4) or [Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/blob/main/inference/ironwood/vLLM/Gemma4/) recipe, then run:\n```bash\ndocker run -itd --name gemma4-tpu \\\n --privileged --network host --shm-size 16G \\\n -v /dev/shm:/dev/shm -e HF_TOKEN=$HF_TOKEN \\\n vllm/vllm-tpu:latest \\\n --model google/gemma-4-E4B-it \\\n --max-model-len 16384 \\\n --disable_chunked_mm_input \\\n --host 0.0.0.0 --port 8000\n```\n\n### Intel Xeon 6 Deployment via Docker\n\nLaunch the x86 CPU vLLM Docker container for `google/gemma-4-E4B-it`:\n\n```bash\ndocker run -itd --name gemma4-cpu \\\n --network host \\\n --shm-size 16g \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n -e VLLM_CPU_KVCACHE_SPACE=40 \\\n -e VLLM_CPU_ATTN_SPLIT_KV=0 \\\n vllm/vllm-openai-cpu:latest-x86_64 \\\n --model google/gemma-4-E4B-it \\\n --host 0.0.0.0 \\\n --port 8000\n```\n\nFor additional Intel Xeon 6 deployment details, see the Intel Software Catalog entries for [Gemma 4 E4B IT](https://aiswcatalog.intel.com/models/google-gemma-4-e4b-it).\n\n## Client Usage\n\n### Audio Transcription\n```python\nfrom openai import OpenAI\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\nresponse = client.chat.completions.create(\n model=\"google/gemma-4-E4B-it\",\n messages=[{\"role\": \"user\", \"content\": [\n {\"type\": \"audio_url\", \"audio_url\": {\"url\": \"https://example.com/audio.wav\"}},\n {\"type\": \"text\", \"text\": \"Transcribe this audio.\"},\n ]}],\n max_tokens=512,\n)\nprint(response.choices[0].message.content)\n```\n\n### Image Understanding\n```python\nresponse = client.chat.completions.create(\n model=\"google/gemma-4-E4B-it\",\n messages=[{\"role\": \"user\", \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg\"}},\n 
{\"type\": \"text\", \"text\": \"Describe this image in detail.\"},\n ]}],\n max_tokens=1024,\n)\n```\n\n### Thinking Mode\n```bash\nvllm serve google/gemma-4-E4B-it \\\n --max-model-len 16384 \\\n --reasoning-parser gemma4 \\\n --tool-call-parser gemma4 \\\n --enable-auto-tool-choice \\\n --chat-template examples/tool_chat_template_gemma4.jinja\n```\nEnable per-request via `extra_body={\"chat_template_kwargs\": {\"enable_thinking\": True}}`.\n\n## Configuration Tips\n\n- Set `--max-model-len` to match your workload (max 131072).\n- Image-only workloads: `--limit-mm-per-prompt.audio 0`.\n- Text-only workloads: `--limit-mm-per-prompt '{\"image\": 0, \"audio\": 0}'` to skip MM profiling.\n- `--async-scheduling` improves throughput.\n- FP8 KV cache (`--kv-cache-dtype fp8`) saves ~50% KV memory.\n\n## Quantized Variant\n\n[`RedHatAI/gemma-4-E4B-it-FP8-dynamic`](https://huggingface.co/RedHatAI/gemma-4-E4B-it-FP8-dynamic) is a pre-quantized FP8 (E4M3) checkpoint \u2014 linear weights with dynamic per-token activation quantization, vision/audio encoders kept in BF16. Runs on Hopper and Blackwell. Pick the **fp8** variant above, or pass the repo id directly to `vllm serve`.\n\n## Speculative Decoding (MTP)\n\nEnable the **Spec Decoding** feature toggle (above) or add `--speculative-config` manually to use MTP drafting with the [assistant model](https://huggingface.co/google/gemma-4-E4B-it-assistant). Recommended `num_speculative_tokens`: 4 for this model. The E4B assistant uses centroids masking for efficient sparse logit computation. See the [Gemma 4 usage guide](../../Google/Gemma4) for details and benchmarks.\n\n> **Note:** MTP speculative decoding for Gemma 4 is only available on the vLLM nightly build \u2014 it has not yet landed in a stable release. 
Install via the nightly wheel (`uv pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly/cu129 \u2026`) or use the `vllm/vllm-openai:gemma4-0505-cu129` / `vllm/vllm-openai:gemma4-0505-cu130` images above; the standard `:latest` stable tag does not include this feature.\n\n## References\n\n- [Model card](https://huggingface.co/google/gemma-4-E4B-it)\n- [FP8 variant](https://huggingface.co/RedHatAI/gemma-4-E4B-it-FP8-dynamic)\n- [Gemma docs](https://ai.google.dev/gemma/docs)\n- [vLLM Gemma 4 tool-call template](https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_gemma4.jinja)\n- [TPU recipes: Trillium](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Gemma4)\n- [TPU recipes: Ironwood](https://github.com/AI-Hypercomputer/tpu-recipes/blob/main/inference/ironwood/vLLM/Gemma4/)\n- [Xeon 6 recipe: Gemma 4 E4B IT](https://aiswcatalog.intel.com/models/google-gemma-4-e4b-it)\n" + } + }, + "Google/translategemma-27b-it": { + "hf_id": "Google/translategemma-27b-it", + "meta": { + "title": "TranslateGemma 27B IT", + "provider": "Google", + "description": "Lightweight open translation model from Google (based on Gemma 3) supporting 55 languages. Served via the vLLM-optimized Infomaniak-AI checkpoint.", + "tasks": [ + "text" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "27B" + }, + "recipe": { + "meta": { + "title": "TranslateGemma 27B IT", + "slug": "translategemma-27b-it", + "provider": "Google", + "description": "Lightweight open translation model from Google (based on Gemma 3) supporting 55 languages. 
Served via the vLLM-optimized Infomaniak-AI checkpoint.", + "date_updated": "2026-04-17", + "difficulty": "beginner", + "tasks": [ + "text" + ], + "performance_headline": "Deployable on laptops/desktops and cloud GPUs; vLLM-optimized checkpoint removes custom JSON inputs", + "related_recipes": [] + }, + "model": { + "model_id": "google/translategemma-27b-it", + "min_vllm_version": "0.14.1", + "architecture": "dense", + "parameter_count": "27B", + "active_parameters": "27B", + "context_length": 131072, + "base_args": [], + "base_env": {} + }, + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 65, + "description": "Original Google checkpoint in BF16 (has vLLM compatibility issues \u2014 prefer vllm-optimized variant)" + }, + "vllm_optimized": { + "model_id": "Infomaniak-AI/vllm-translategemma-27b-it", + "precision": "bf16", + "vram_minimum_gb": 65, + "description": "Infomaniak-AI vLLM-optimized checkpoint \u2014 recommended. Fixes RoPE config, EOS token, and replaces custom JSON inputs with string delimiters." + }, + "small_4b": { + "model_id": "google/translategemma-4b-it", + "precision": "bf16", + "vram_minimum_gb": 10, + "description": "4B variant for lower-resource deployments (prefer Infomaniak-AI/vllm-translategemma-4b-it)" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[TranslateGemma](https://huggingface.co/collections/google/translategemma) is a family of lightweight, state-of-the-art open translation models from Google, based on the Gemma 3 family. 
TranslateGemma models handle translation across 55 languages and are small enough to deploy on laptops, desktops, and modest cloud GPU environments.\n\n### Original Models\n- [google/translategemma-27b-it](https://huggingface.co/google/translategemma-27b-it)\n- [google/translategemma-4b-it](https://huggingface.co/google/translategemma-4b-it)\n\n### Optimized vLLM Models\n- [Infomaniak-AI/vllm-translategemma-27b-it](https://huggingface.co/Infomaniak-AI/vllm-translategemma-27b-it)\n- [Infomaniak-AI/vllm-translategemma-4b-it](https://huggingface.co/Infomaniak-AI/vllm-translategemma-4b-it)\n\n### Why use the vLLM-optimized models?\n\nThe original Google models have compatibility issues with standard inference engines like vLLM. The optimized versions from Infomaniak-AI ([detailed changes](https://huggingface.co/Infomaniak-AI/vllm-translategemma-27b-it#changes-from-original-model)):\n\n- **vLLM Compatibility**: Originals require custom JSON parameters (`source_lang_code`/`target_lang_code`). The optimized version uses string delimiters.\n- **RoPE Simplification**: Originals use a complex RoPE configuration for sliding attention. 
Optimized uses a standard linear RoPE format (`factor: 8.0`).\n- **EOS Token Fix**: Corrects the EOS token from `<eos>` to `<end_of_turn>`.\n\n## Prerequisites\n\n### Docker\n```bash\ndocker pull vllm/vllm-openai:latest\n```\n\n## Deployment Configurations\n\nVerified for both 4B and 27B:\n```bash\ndocker run -itd --name google-translategemma-27b-it \\\n --ipc=host \\\n --network host \\\n --shm-size 16G \\\n --gpus all \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai:latest \\\n Infomaniak-AI/vllm-translategemma-27b-it \\\n --served-model-name translategemma-27b-it \\\n --gpu-memory-utilization 0.8 \\\n --host 0.0.0.0 \\\n --port 8000\n```\n\n## Client Usage\n\nTips:\n- **Prompt Delimiters**: Encode language metadata directly in the content string:\n `<<<source>>>{src_lang}<<<target>>>{tgt_lang}<<<text>>>{text}`\n- **Language Codes**: ISO 639-1 Alpha-2 (e.g. `en`, `zh`) and regional variants (e.g. `en_US`, `zh_CN`).\n- **Context Limit**: ~2K tokens.\n\n### cURL\n```bash\ncurl -X POST http://localhost:8000/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"translategemma-27b-it\",\n \"messages\": [{\n \"role\": \"user\",\n \"content\": \"<<<source>>>en<<<target>>>zh<<<text>>>We distribute two models for language identification, which can recognize 176 languages.\"\n }]\n }'\n```\n\n## References\n\n- [TranslateGemma collection](https://huggingface.co/collections/google/translategemma)\n- [Infomaniak-AI optimized 27B](https://huggingface.co/Infomaniak-AI/vllm-translategemma-27b-it)\n- [Infomaniak-AI optimized 4B](https://huggingface.co/Infomaniak-AI/vllm-translategemma-4b-it)\n" + } + }, + "MiniMaxAI/MiniMax-M2.1": { + "hf_id": "MiniMaxAI/MiniMax-M2.1", + "meta": { + "title": "MiniMax-M2.1", + "provider": "MiniMax", + "description": "MiniMax M2.1 MoE language model (230B total / 10B active) for coding, agent toolchains, and long-context reasoning \u2014 native FP8 checkpoint", + "tasks": [ + "text" + ], + "hardware": { + "h100": "verified", + "mi300x": 
"verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "230B" + }, + "recipe": { + "meta": { + "title": "MiniMax-M2.1", + "slug": "minimax-m2.1", + "provider": "MiniMax", + "description": "MiniMax M2.1 MoE language model (230B total / 10B active) for coding, agent toolchains, and long-context reasoning \u2014 native FP8 checkpoint", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "Updated M2 series MoE with strong SWE-Bench and Terminal-Bench performance, 196K context", + "related_recipes": [], + "hardware": { + "h100": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "MiniMaxAI/MiniMax-M2.1", + "min_vllm_version": "0.11.0", + "architecture": "moe", + "parameter_count": "230B", + "active_parameters": "10B", + "context_length": 196608, + "base_args": [ + "--trust-remote-code", + "--compilation-config", + "{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "MiniMax M2 tool call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "minimax_m2", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "MiniMax M2 reasoning parser for chain-of-thought extraction", + "args": [ + "--reasoning-parser", + "minimax_m2" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 276, + "description": "Native FP8 checkpoint \u2014 4x H200/H20/H100 or 4x A100/A800 for weights, plus KV cache headroom" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep", + "pd_cluster" + ], + "hardware_overrides": { + "amd": { + "extra_args": [ + "--attention-backend", + 
"ROCM_AITER_FA" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT": "1" + } + } + }, + "strategy_overrides": { + "single_node_tp": { + "tp": 4 + } + }, + "guide": "## Overview\n\n[MiniMax-M2.1](https://huggingface.co/MiniMaxAI/MiniMax-M2.1) is part of the MiniMax\nM2 series of advanced MoE language models. It retains the M2 architecture (10B active,\n230B total) with improvements over the original M2 release. Supports 196K context per sequence.\n\n## Prerequisites\n\n- **OS:** Linux\n- **Python:** 3.10 - 3.13\n- **NVIDIA:** compute capability >= 7.0; ~220 GB for weights + 240 GB per 1M context tokens\n- **AMD:** MI300X / MI325X / MI350X / MI355X with ROCm 7.0+\n\n### Install vLLM (NVIDIA)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n### Docker (dedicated M2-series image)\n\n```bash\ndocker run --gpus all \\\n -p 8000:8000 \\\n --ipc=host \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai:minimax27 MiniMaxAI/MiniMax-M2.1 \\\n --tensor-parallel-size 4 \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --enable-auto-tool-choice \\\n --compilation-config '{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}' \\\n --trust-remote-code\n```\n\n## Launching the Server\n\n### NVIDIA \u2014 TP4\n\n```bash\nvllm serve MiniMaxAI/MiniMax-M2.1 \\\n --tensor-parallel-size 4 \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --compilation-config '{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}' \\\n --enable-auto-tool-choice \\\n --trust-remote-code\n```\n\nPure TP8 is not supported. 
For >4 GPUs use DP+EP or TP+EP.\n\n### TP4+EP (recommended for H100)\n\n```bash\nvllm serve MiniMaxAI/MiniMax-M2.1 \\\n --tensor-parallel-size 4 \\\n --enable-expert-parallel \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --compilation-config '{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}' \\\n --enable-auto-tool-choice\n```\n\n### AMD ROCm\n\n```bash\nVLLM_ROCM_USE_AITER=1 vllm serve MiniMaxAI/MiniMax-M2.1 \\\n --tensor-parallel-size 4 \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --enable-auto-tool-choice \\\n --trust-remote-code\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --backend vllm \\\n --model MiniMaxAI/MiniMax-M2.1 \\\n --endpoint /v1/completions \\\n --dataset-name random \\\n --random-input 2048 \\\n --random-output 1024 \\\n --max-concurrency 10 \\\n --num-prompt 100\n```\n\n## Troubleshooting\n\n- See [MiniMax-M2](./MiniMax-M2.yaml) for shared troubleshooting notes\n (`fuse_minimax_qk_norm`, nightly vs stable, DeepGEMM, AITER).\n\n## References\n\n- [Model card](https://huggingface.co/MiniMaxAI/MiniMax-M2.1)\n- [MiniMax](https://www.minimax.io/)\n" + } + }, + "MiniMaxAI/MiniMax-M2.5": { + "hf_id": "MiniMaxAI/MiniMax-M2.5", + "meta": { + "title": "MiniMax-M2.5", + "provider": "MiniMax", + "description": "MiniMax M2.5 MoE language model (230B total / 10B active) for coding, agent toolchains, and long-context reasoning \u2014 native FP8 checkpoint", + "tasks": [ + "text" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "230B" + }, + "recipe": { + "meta": { + "title": "MiniMax-M2.5", + "slug": "minimax-m2.5", + "provider": "MiniMax", + "description": "MiniMax M2.5 MoE language model (230B total / 10B active) for coding, agent toolchains, and long-context reasoning \u2014 native FP8 checkpoint", + "date_updated": 
"2026-05-18", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "Refreshed M2 series MoE with strong SWE-Bench and Terminal-Bench performance, 196K context", + "related_recipes": [], + "hardware": { + "h100": "verified", + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "MiniMaxAI/MiniMax-M2.5", + "min_vllm_version": "0.20.2", + "docker_image": "vllm/vllm-openai:v0.20.2", + "architecture": "moe", + "parameter_count": "230B", + "active_parameters": "10B", + "context_length": 196608, + "base_args": [ + "--trust-remote-code", + "--compilation-config", + "{\"mode\":3,\"cudagraph_mode\":\"PIECEWISE\",\"pass_config\":{\"fuse_minimax_qk_norm\":true}}" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "MiniMax M2 tool call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "minimax_m2", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "MiniMax M2 reasoning parser for chain-of-thought extraction", + "args": [ + "--reasoning-parser", + "minimax_m2" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 276, + "description": "Native FP8 checkpoint \u2014 4x H200/H20/H100 or 4x A100/A800 for weights, plus KV cache headroom" + }, + "nvfp4": { + "model_id": "nvidia/MiniMax-M2.5-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 138, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep", + "pd_cluster" + ], + "hardware_overrides": { + "hopper": { + "extra_args": [ + "--max-num-seqs", + "512", + "--max-num-batched-tokens", + "32768", 
+ "--kv-cache-dtype", + "fp8", + "--moe-backend", + "triton", + "--attention-backend", + "FLASHINFER", + "--enable-flashinfer-autotune" + ], + "extra_env": { + "PYTHONNOUSERSITE": "1", + "SAFETENSORS_FAST_GPU": "1", + "VLLM_USE_DEEP_GEMM": "0", + "VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER": "0", + "VLLM_FLOAT32_MATMUL_PRECISION": "high" + } + }, + "amd": { + "extra_args": [ + "--attention-backend", + "ROCM_AITER_FA" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT": "1" + } + } + }, + "strategy_overrides": { + "single_node_tp": { + "tp": 4 + } + }, + "guide": "## Overview\n\n[MiniMax-M2.5](https://huggingface.co/MiniMaxAI/MiniMax-M2.5) is part of the MiniMax\nM2 series of advanced MoE language models. It retains the M2 architecture (10B active,\n230B total) and a 196K context per sequence.\n\n## Prerequisites\n\n- **OS:** Linux\n- **Python:** 3.10 - 3.13\n- **NVIDIA:** compute capability >= 7.0; ~220 GB for weights + 240 GB per 1M context tokens\n- **AMD:** MI300X / MI325X / MI350X / MI355X with ROCm 7.0+\n\n### Install vLLM (NVIDIA)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n### Docker (dedicated M2-series image)\n\n```bash\ndocker run --gpus all \\\n -p 8000:8000 \\\n --ipc=host \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n -e PYTHONNOUSERSITE=1 \\\n -e SAFETENSORS_FAST_GPU=1 \\\n -e VLLM_USE_DEEP_GEMM=0 \\\n -e VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER=0 \\\n -e VLLM_FLOAT32_MATMUL_PRECISION=high \\\n vllm/vllm-openai:v0.20.2 MiniMaxAI/MiniMax-M2.5 \\\n --tensor-parallel-size 4 \\\n --max-num-seqs 512 \\\n --max-num-batched-tokens 32768 \\\n --kv-cache-dtype fp8 \\\n --moe-backend triton \\\n --attention-backend FLASHINFER \\\n --enable-flashinfer-autotune \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --enable-auto-tool-choice \\\n --compilation-config 
'{\"mode\":3,\"cudagraph_mode\":\"PIECEWISE\",\"pass_config\":{\"fuse_minimax_qk_norm\":true}}' \\\n --trust-remote-code\n```\n\n## Launching the Server\n\n### NVIDIA H200 \u2014 TP4\n\n```bash\nPYTHONNOUSERSITE=1 \\\nSAFETENSORS_FAST_GPU=1 \\\nVLLM_USE_DEEP_GEMM=0 \\\nVLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER=0 \\\nVLLM_FLOAT32_MATMUL_PRECISION=high \\\nvllm serve MiniMaxAI/MiniMax-M2.5 \\\n --tensor-parallel-size 4 \\\n --max-num-seqs 512 \\\n --max-num-batched-tokens 32768 \\\n --kv-cache-dtype fp8 \\\n --moe-backend triton \\\n --attention-backend FLASHINFER \\\n --enable-flashinfer-autotune \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --compilation-config '{\"mode\":3,\"cudagraph_mode\":\"PIECEWISE\",\"pass_config\":{\"fuse_minimax_qk_norm\":true}}' \\\n --enable-auto-tool-choice \\\n --trust-remote-code\n```\n\nPure TP8 is not supported. For >4 GPUs use DP+EP or TP+EP.\n\n### TP4+EP (recommended for H100)\n\n```bash\nvllm serve MiniMaxAI/MiniMax-M2.5 \\\n --tensor-parallel-size 4 \\\n --enable-expert-parallel \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --compilation-config '{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}' \\\n --enable-auto-tool-choice\n```\n\n### AMD ROCm\n\n```bash\nVLLM_ROCM_USE_AITER=1 vllm serve MiniMaxAI/MiniMax-M2.5 \\\n --tensor-parallel-size 4 \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --enable-auto-tool-choice \\\n --trust-remote-code\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --backend vllm \\\n --model MiniMaxAI/MiniMax-M2.5 \\\n --endpoint /v1/completions \\\n --dataset-name random \\\n --random-input 2048 \\\n --random-output 1024 \\\n --max-concurrency 10 \\\n --num-prompt 100\n```\n\n## Troubleshooting\n\n- See [MiniMax-M2](./MiniMax-M2.yaml) for shared troubleshooting notes\n (`fuse_minimax_qk_norm`, nightly vs stable, DeepGEMM, AITER).\n\n## References\n\n- [Model 
card](https://huggingface.co/MiniMaxAI/MiniMax-M2.5)\n- [MiniMax](https://www.minimax.io/)\n" + } + }, + "MiniMaxAI/MiniMax-M2.7": { + "hf_id": "MiniMaxAI/MiniMax-M2.7", + "meta": { + "title": "MiniMax-M2.7", + "provider": "MiniMax", + "description": "MiniMax M2.7 MoE language model (230B total / 10B active) \u2014 latest M2 release for coding, agent toolchains, and long-context reasoning with native FP8", + "tasks": [ + "text" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified", + "dgx_station_gb300": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "230B" + }, + "recipe": { + "meta": { + "title": "MiniMax-M2.7", + "slug": "minimax-m2.7", + "provider": "MiniMax", + "description": "MiniMax M2.7 MoE language model (230B total / 10B active) \u2014 latest M2 release for coding, agent toolchains, and long-context reasoning with native FP8", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "Latest M2 series release; verified accuracy on AIME25, GPQA-D, GSM8K; 196K context", + "related_recipes": [], + "hardware": { + "h100": "verified", + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified", + "dgx_station_gb300": "verified" + } + }, + "model": { + "model_id": "MiniMaxAI/MiniMax-M2.7", + "min_vllm_version": "0.20.0", + "architecture": "moe", + "parameter_count": "230B", + "active_parameters": "10B", + "context_length": 196608, + "base_args": [ + "--trust-remote-code", + "--compilation-config", + "{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "MiniMax M2 tool call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "minimax_m2", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "MiniMax M2 reasoning parser for 
chain-of-thought extraction", + "args": [ + "--reasoning-parser", + "minimax_m2" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 276, + "description": "Native FP8 checkpoint \u2014 4x H200/H20/H100 or 4x A100/A800 for weights, plus KV cache headroom" + }, + "nvfp4": { + "model_id": "nvidia/MiniMax-M2.7-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 138, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep", + "pd_cluster" + ], + "hardware_overrides": { + "amd": { + "extra_args": [ + "--attention-backend", + "ROCM_AITER_FA" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT": "1" + } + } + }, + "strategy_overrides": { + "single_node_tp": { + "tp": 4 + } + }, + "guide": "## Overview\n\n[MiniMax-M2.7](https://huggingface.co/MiniMaxAI/MiniMax-M2.7) is the latest release\nin the MiniMax M2 series. Like earlier M2 variants, it ships with 10B active\nparameters out of 230B total, and supports a 196K context per sequence. 
MiniMax has\nverified M2.7 accuracy on AIME25, GPQA-D, and GSM8K at vLLM commit\n`0f3ce4c74b1875791d6604e006b6e905fde9f698`.\n\n## Prerequisites\n\n- **OS:** Linux\n- **Python:** 3.10 - 3.13\n- **NVIDIA:** compute capability >= 7.0; ~220 GB for weights + 240 GB per 1M context tokens\n- **AMD:** MI300X / MI325X / MI350X / MI355X with ROCm 7.0+\n\n### Install vLLM (NVIDIA)\n\n```bash\nuv pip install -U vllm --extra-index-url https://wheels.vllm.ai/nightly\n```\n\n### Install vLLM (AMD ROCm)\n\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/\n```\n\n### Docker (dedicated M2-series image)\n\n```bash\ndocker run --gpus all \\\n -p 8000:8000 \\\n --ipc=host \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai:minimax27 MiniMaxAI/MiniMax-M2.7 \\\n --tensor-parallel-size 4 \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --enable-auto-tool-choice \\\n --compilation-config '{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}' \\\n --trust-remote-code\n```\n\n## Launching the Server\n\n### NVIDIA \u2014 TP4 (4x H200/H20/H100 or 4x A100/A800)\n\n```bash\nvllm serve MiniMaxAI/MiniMax-M2.7 \\\n --tensor-parallel-size 4 \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --compilation-config '{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}' \\\n --enable-auto-tool-choice \\\n --trust-remote-code\n```\n\nPure TP8 is not supported. 
For >4 GPUs use DP+EP or TP+EP:\n\n### DP8+EP\n\n```bash\nvllm serve MiniMaxAI/MiniMax-M2.7 \\\n --data-parallel-size 8 \\\n --enable-expert-parallel \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --compilation-config '{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}' \\\n --enable-auto-tool-choice\n```\n\n### TP4+EP (recommended for H100)\n\n```bash\nvllm serve MiniMaxAI/MiniMax-M2.7 \\\n --tensor-parallel-size 4 \\\n --enable-expert-parallel \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --compilation-config '{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}' \\\n --enable-auto-tool-choice\n```\n\n### TP8+EP\n\n```bash\nvllm serve MiniMaxAI/MiniMax-M2.7 \\\n --tensor-parallel-size 8 \\\n --enable-expert-parallel \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --compilation-config '{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}' \\\n --enable-auto-tool-choice\n```\n\n### AMD ROCm \u2014 TP2 or TP4\n\n```bash\nVLLM_ROCM_USE_AITER=1 vllm serve MiniMaxAI/MiniMax-M2.7 \\\n --tensor-parallel-size 4 \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --enable-auto-tool-choice \\\n --trust-remote-code\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --backend vllm \\\n --model MiniMaxAI/MiniMax-M2.7 \\\n --endpoint /v1/completions \\\n --dataset-name random \\\n --random-input 2048 \\\n --random-output 1024 \\\n --max-concurrency 10 \\\n --num-prompt 100\n```\n\n## Troubleshooting\n\n- **`fuse_minimax_qk_norm` not recognized:** This fusion was introduced in\n [vLLM PR #37045](https://github.com/vllm-project/vllm/pull/37045); ensure your\n vLLM build includes it.\n- **Corrupted output on stable release:** Upgrade to a nightly after commit\n `cf3eacfe58fa9e745c2854782ada884a9f992cf7`.\n- **Verified accuracy:** Use the pinned commit\n `0f3ce4c74b1875791d6604e006b6e905fde9f698` if reproducing MiniMax's reported results.\n- 
**DeepGEMM:** vLLM uses DeepGEMM by default; install via\n [install_deepgemm.sh](https://github.com/vllm-project/vllm/blob/main/tools/install_deepgemm.sh)\n if missing.\n- **AITER first launch:** Initial AMD launch JIT-compiles optimized kernels; subsequent launches reuse cached kernels.\n\n## References\n\n- [Model card](https://huggingface.co/MiniMaxAI/MiniMax-M2.7)\n- [MiniMax](https://www.minimax.io/)\n- [vLLM PR #37045 (fuse_minimax_qk_norm)](https://github.com/vllm-project/vllm/pull/37045)\n" + } + }, + "MiniMaxAI/MiniMax-M2": { + "hf_id": "MiniMaxAI/MiniMax-M2", + "meta": { + "title": "MiniMax-M2", + "provider": "MiniMax", + "description": "MiniMax M2 MoE language model (230B total / 10B active) for coding, agent toolchains, and long-context reasoning \u2014 native FP8 checkpoint, with an NVFP4 variant for Blackwell", + "tasks": [ + "text" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "230B" + }, + "recipe": { + "meta": { + "title": "MiniMax-M2", + "slug": "minimax-m2", + "provider": "MiniMax", + "description": "MiniMax M2 MoE language model (230B total / 10B active) for coding, agent toolchains, and long-context reasoning \u2014 native FP8 checkpoint, with an NVFP4 variant for Blackwell", + "date_updated": "2026-05-11", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "Open-source MoE with strong SWE-Bench and Terminal-Bench performance, 196K context", + "related_recipes": [], + "hardware": { + "h100": "verified", + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "MiniMaxAI/MiniMax-M2", + "min_vllm_version": "0.11.0", + "architecture": "moe", + "parameter_count": "230B", + "active_parameters": "10B", + "context_length": 196608, + "base_args": [ + "--trust-remote-code", + 
"--compilation-config", + "{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}" + ], + "base_env": {} + }, + "dependencies": [ + { + "note": "Optional: DeepGEMM FP8 MoE kernels for throughput (skip on B200 \u2014 known FlashInfer FP8 MoE error)", + "command": "export VLLM_USE_DEEP_GEMM=1", + "optional": true + } + ], + "features": { + "tool_calling": { + "description": "MiniMax M2 tool call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "minimax_m2", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "MiniMax M2 reasoning parser for chain-of-thought extraction", + "args": [ + "--reasoning-parser", + "minimax_m2" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 276, + "description": "Native FP8 checkpoint \u2014 4x H200/H20/H100 or 4x A100/A800 for weights, plus KV cache headroom" + }, + "nvfp4": { + "model_id": "RedHatAI/MiniMax-M2-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 138, + "description": "NVFP4 (4-bit) quantized weights \u2014 requires Blackwell (B200/B300); roughly half the VRAM of the FP8 checkpoint", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep", + "pd_cluster" + ], + "hardware_overrides": { + "amd": { + "extra_args": [ + "--attention-backend", + "ROCM_AITER_FA" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT": "1" + } + } + }, + "strategy_overrides": { + "single_node_tp": { + "tp": 4 + } + }, + "guide": "## Overview\n\n[MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2) is an advanced MoE language\nmodel from [MiniMax](https://www.minimax.io/). 
Highlights:\n\n- Superior intelligence \u2014 #1 among open-source models globally on math, science, coding, tool use\n- Advanced coding \u2014 multi-file edits, run-fix loops, test-validated repairs (SWE-Bench, Terminal-Bench)\n- Agent performance \u2014 plans and executes complex toolchains across shell, browser, and code\n- Efficient design \u2014 10B active / 230B total for low latency and high throughput\n- 196K context length per sequence\n\n## Prerequisites\n\n- **OS:** Linux\n- **Python:** 3.10 - 3.13\n- **NVIDIA:** compute capability >= 7.0; ~220 GB for weights + 240 GB per 1M context tokens\n- **AMD:** MI300X / MI325X / MI350X / MI355X with ROCm 7.0+\n\n### Install vLLM (NVIDIA, stable)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n### Install vLLM (NVIDIA, nightly)\n\nIf you hit corrupted output, upgrade to a nightly after commit\n`cf3eacfe58fa9e745c2854782ada884a9f992cf7`:\n\n```bash\nuv pip install -U vllm --extra-index-url https://wheels.vllm.ai/nightly\n```\n\n### Install vLLM (AMD ROCm)\n\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/\n```\n\n### Docker (NVIDIA, dedicated M2 image)\n\n```bash\ndocker run --gpus all \\\n -p 8000:8000 \\\n --ipc=host \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai:minimax27 MiniMaxAI/MiniMax-M2 \\\n --tensor-parallel-size 4 \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --enable-auto-tool-choice \\\n --compilation-config '{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}' \\\n --trust-remote-code\n```\n\n## Launching the Server\n\n### NVIDIA \u2014 TP4 (4x H200/H20/H100 or 4x A100/A800)\n\n```bash\nvllm serve MiniMaxAI/MiniMax-M2 \\\n --tensor-parallel-size 4 \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --compilation-config '{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}' \\\n --enable-auto-tool-choice \\\n 
--trust-remote-code\n```\n\nPure TP8 is not supported. For >4 GPUs use DP+EP or TP+EP:\n\n### TP4+EP (recommended for H100)\n\n```bash\nvllm serve MiniMaxAI/MiniMax-M2 \\\n --tensor-parallel-size 4 \\\n --enable-expert-parallel \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --compilation-config '{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}' \\\n --enable-auto-tool-choice\n```\n\n### DP8+EP\n\n```bash\nvllm serve MiniMaxAI/MiniMax-M2 \\\n --data-parallel-size 8 \\\n --enable-expert-parallel \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --compilation-config '{\"mode\":3,\"pass_config\":{\"fuse_minimax_qk_norm\":true}}' \\\n --enable-auto-tool-choice\n```\n\n### AMD ROCm \u2014 TP2 or TP4\n\n```bash\nVLLM_ROCM_USE_AITER=1 vllm serve MiniMaxAI/MiniMax-M2 \\\n --tensor-parallel-size 4 \\\n --tool-call-parser minimax_m2 \\\n --reasoning-parser minimax_m2 \\\n --enable-auto-tool-choice \\\n --trust-remote-code\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --backend vllm \\\n --model MiniMaxAI/MiniMax-M2 \\\n --endpoint /v1/completions \\\n --dataset-name random \\\n --random-input 2048 \\\n --random-output 1024 \\\n --max-concurrency 10 \\\n --num-prompt 100\n```\n\n## Troubleshooting\n\n- **`fuse_minimax_qk_norm` not recognized:** This fusion was introduced in\n [vLLM PR #37045](https://github.com/vllm-project/vllm/pull/37045); ensure your\n vLLM build includes it.\n- **Corrupted output on stable release:** Upgrade to a nightly after commit\n `cf3eacfe58fa9e745c2854782ada884a9f992cf7`.\n- **DeepGEMM:** vLLM uses DeepGEMM by default; install via\n [install_deepgemm.sh](https://github.com/vllm-project/vllm/blob/main/tools/install_deepgemm.sh)\n when missing.\n- **AITER first launch:** AMD initial launch JIT-compiles CK-based FP8 MoE, RMSNorm,\n and activation kernels. 
Subsequent launches use cached kernels.\n\n## Quantized Variant (NVFP4)\n\n[`RedHatAI/MiniMax-M2-NVFP4`](https://huggingface.co/RedHatAI/MiniMax-M2-NVFP4) is an NVFP4 (4-bit)\ncheckpoint \u2014 roughly half the VRAM of the native FP8 checkpoint. Select the **nvfp4** variant\nabove (it sets `VLLM_USE_FLASHINFER_MOE_FP4=1` and `--kv-cache-dtype fp8`), or pass the repo id\ndirectly to `vllm serve`. NVFP4 requires a Blackwell GPU (compute capability 10.0+).\n\n## References\n\n- [Model card](https://huggingface.co/MiniMaxAI/MiniMax-M2)\n- [NVFP4 variant](https://huggingface.co/RedHatAI/MiniMax-M2-NVFP4)\n- [MiniMax](https://www.minimax.io/)\n- [vLLM PR #37045 (fuse_minimax_qk_norm)](https://github.com/vllm-project/vllm/pull/37045)\n" + } + }, + "OpenGVLab/InternVL3_5-8B": { + "hf_id": "OpenGVLab/InternVL3_5-8B", + "meta": { + "title": "InternVL3.5", + "provider": "InternVL (OpenGVLab)", + "description": "InternVL 3.5 vision-language models from Shanghai AI Lab with thinking-mode prompting", + "tasks": [ + "multimodal" + ], + "hardware": { + "h100": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "8B" + }, + "recipe": { + "meta": { + "title": "InternVL3.5", + "slug": "internvl3.5", + "provider": "InternVL (OpenGVLab)", + "description": "InternVL 3.5 vision-language models from Shanghai AI Lab with thinking-mode prompting", + "date_updated": "2026-04-17", + "difficulty": "beginner", + "tasks": [ + "multimodal" + ], + "related_recipes": [ + "internlm/Intern-S1" + ], + "hardware": { + "h100": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "OpenGVLab/InternVL3_5-8B", + "min_vllm_version": "0.10.0", + "architecture": "dense", + "parameter_count": "8B", + "active_parameters": "8B", + "context_length": 40960, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + 
"text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only", + "encoder_parallel" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 19, + "description": "BF16 weights for the 8B variant" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[InternVL3.5](https://github.com/OpenGVLab/InternVL) is a vision-language model developed\nby Shanghai AI Laboratory. It supports single-image and multi-image prompts, plus an\noptional \"thinking mode\" via a custom system prompt.\n\n## Prerequisites\n\n- Hardware: 1x GPU with >=20 GB VRAM (A100, L40S, H100, etc.)\n- vLLM >= 0.10.0\n\n### Install vLLM (CUDA)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n### Install vLLM (AMD ROCm MI300X/MI325X/MI355X)\n\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700\n```\n\n## Launch command\n\n```bash\nvllm serve OpenGVLab/InternVL3_5-8B --trust-remote-code\n```\n\nOn AMD:\n\n```bash\nexport VLLM_ROCM_USE_AITER=1\nvllm serve OpenGVLab/InternVL3_5-8B --trust-remote-code\n```\n\n## Client Usage\n\nSingle image:\n\n```python\nfrom openai import OpenAI\nclient = OpenAI(api_key=\"\", base_url=\"http://0.0.0.0:8000/v1\")\nmodel_name = client.models.list().data[0].id\n\nresponse = client.chat.completions.create(\n model=model_name,\n messages=[{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": \"Describe the image.\"},\n 
{\"type\": \"image_url\", \"image_url\": {\"url\": \"https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg\"}},\n ],\n }],\n temperature=0.0,\n)\nprint(response.choices[0].message.content)\n```\n\n## Thinking Mode\n\nSet a thinking system prompt and use `temperature=0.6` to mitigate repetition:\n\n```python\nTHINKING_SYSTEM_PROMPT = \"\"\"\nYou are an AI assistant that rigorously follows this response protocol:\n\n1. First, conduct a detailed analysis of the question. Consider different angles, potential\nsolutions, and reason through the problem step-by-step. Enclose this entire thinking process\nwithin <think> and </think> tags.\n\n2. After the thinking section, provide a clear, concise, and direct answer to the user's\nquestion. Separate the answer from the think section with a newline.\n\"\"\".strip()\n```\n\n## References\n\n- [InternVL3.5-8B](https://huggingface.co/OpenGVLab/InternVL3_5-8B)\n- [InternVL GitHub](https://github.com/OpenGVLab/InternVL)\n" + } + }, + "PaddlePaddle/PaddleOCR-VL-1.5": { + "hf_id": "PaddlePaddle/PaddleOCR-VL-1.5", + "meta": { + "title": "PaddleOCR-VL-1.5", + "provider": "PaddlePaddle", + "description": "PaddleOCR-VL-1.5 (0.9B) \u2014 next-gen compact VLM for document parsing; adds text spotting, seal recognition, and Tibetan/Bengali", + "tasks": [ + "multimodal" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "0.9B" + }, + "recipe": { + "meta": { + "title": "PaddleOCR-VL-1.5", + "slug": "paddleocr-vl-1.5", + "provider": "PaddlePaddle", + "description": "PaddleOCR-VL-1.5 (0.9B) \u2014 next-gen compact VLM for document parsing; adds text spotting, seal recognition, and Tibetan/Bengali", + "date_updated": "2026-05-11", + "difficulty": "beginner", + "tasks": [ + "multimodal" + ], + "related_recipes": [ + "PaddlePaddle/PaddleOCR-VL" + ] + }, + "model": { + "model_id": "PaddlePaddle/PaddleOCR-VL-1.5", + "min_vllm_version": "0.11.1", + "architecture": "dense", + "parameter_count": 
"0.9B", + "active_parameters": "0.9B", + "context_length": 131072, + "base_args": [ + "--trust-remote-code", + "--max-num-batched-tokens", + "16384", + "--no-enable-prefix-caching", + "--mm-processor-cache-gb", + "0" + ], + "base_env": {} + }, + "dependencies": [ + { + "note": "PaddlePaddle runtime (install in a separate venv from vllm to avoid conflicts)", + "command": "uv pip install paddlepaddle-gpu==3.2.1 --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/" + }, + { + "note": "PaddleOCR document-parsing helpers (1.5 ships under the same paddleocr[doc-parser] extra)", + "command": "uv pip install -U \"paddleocr[doc-parser]\"" + }, + { + "note": "Safetensors loader used by the doc-parser path", + "command": "uv pip install safetensors" + } + ], + "features": { + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. 
Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only", + "encoder_parallel" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 2, + "description": "BF16 weights \u2014 small footprint, runs on most GPUs" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": { + "amd": { + "extra_args": [], + "extra_env": { + "SAFETENSORS_FAST_GPU": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\n[PaddleOCR-VL-1.5](https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5) is the next\ngeneration of PaddleOCR-VL \u2014 same 0.9B architecture (NaViT-style dynamic-resolution\nvision encoder + ERNIE-4.5-0.3B LM), with substantial accuracy gains and new tasks:\n\n- **94.5% on OmniDocBench v1.5** (SOTA)\n- **Text spotting** \u2014 line-level localization + recognition in one pass\n- **Seal recognition** \u2014 new task with SOTA results\n- **Irregular-shape localization** \u2014 polygonal detection under skew/warping\n- **Multilingual** \u2014 adds Tibetan and Bengali\n- Cross-page table merging and paragraph-heading recognition\n\nArchitecture is identical to PaddleOCR-VL, so the vLLM launch command and feature\nflags are the same.\n\n## Prerequisites\n\n- Hardware: 1x GPU (small VRAM footprint)\n- vLLM >= 0.11.1 (nightly if not released yet)\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --pre \\\n --extra-index-url https://wheels.vllm.ai/nightly \\\n --extra-index-url https://download.pytorch.org/whl/cu129 \\\n --index-strategy unsafe-best-match\n```\n\n## Launch command\n\n```bash\nvllm serve PaddlePaddle/PaddleOCR-VL-1.5 \\\n --trust-remote-code \\\n --max-num-batched-tokens 16384 \\\n --no-enable-prefix-caching \\\n --mm-processor-cache-gb 0\n```\n\nTip: OCR workloads don't benefit much from prefix caching or image reuse, so disable\nthose to avoid 
hashing/caching overhead.\n\n## Client Usage\n\nTask-specific prompts (note the two new tasks `spotting` and `seal`):\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\", timeout=3600)\n\nTASKS = {\n \"ocr\": \"OCR:\",\n \"table\": \"Table Recognition:\",\n \"formula\": \"Formula Recognition:\",\n \"chart\": \"Chart Recognition:\",\n \"spotting\": \"Spotting:\",\n \"seal\": \"Seal Recognition:\",\n}\n\nresponse = client.chat.completions.create(\n model=\"PaddlePaddle/PaddleOCR-VL-1.5\",\n messages=[{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://.../receipt.png\"}},\n {\"type\": \"text\", \"text\": TASKS[\"spotting\"]},\n ],\n }],\n temperature=0.0,\n)\nprint(response.choices[0].message.content)\n```\n\n## Offline Inference with PP-DocLayoutV2\n\nUse separate venvs for `vllm` and `paddlepaddle` to avoid conflicts. If you see\n\"The model PaddleOCR-VL-1.5-0.9B does not exist.\", add\n`--served-model-name PaddleOCR-VL-1.5-0.9B`.\n\n```bash\nuv pip install paddlepaddle-gpu==3.2.1 --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/\nuv pip install -U \"paddleocr[doc-parser]\"\nuv pip install safetensors\n```\n\n```python\nfrom paddleocr import PaddleOCRVL\n\npipeline = PaddleOCRVL(\n vl_rec_model_name=\"PaddleOCR-VL-1.5-0.9B\",\n vl_rec_backend=\"vllm-server\",\n vl_rec_server_url=\"http://localhost:8000/v1\",\n layout_detection_model_name=\"PP-DocLayoutV2\",\n layout_detection_model_dir=\"/path/to/your/PP-DocLayoutV2/\",\n)\n\noutput = pipeline.predict(\"https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png\")\nfor i, res in enumerate(output):\n res.save_to_json(save_path=f\"output_{i}.json\")\n res.save_to_markdown(save_path=f\"output_{i}.md\")\n```\n\n## References\n\n- [PaddleOCR-VL-1.5 on Hugging Face](https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5)\n- [PaddleOCR 
GitHub](https://github.com/PaddlePaddle/PaddleOCR)\n- [vLLM support tracking issue](https://github.com/vllm-project/vllm/issues/33554)\n" + } + }, + "PaddlePaddle/PaddleOCR-VL": { + "hf_id": "PaddlePaddle/PaddleOCR-VL", + "meta": { + "title": "PaddleOCR-VL", + "provider": "PaddlePaddle", + "description": "PaddleOCR-VL (0.9B) \u2014 compact vision-language model for document parsing, OCR, tables, formulas, charts", + "tasks": [ + "multimodal" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "0.9B" + }, + "recipe": { + "meta": { + "title": "PaddleOCR-VL", + "slug": "paddleocr-vl", + "provider": "PaddlePaddle", + "description": "PaddleOCR-VL (0.9B) \u2014 compact vision-language model for document parsing, OCR, tables, formulas, charts", + "date_updated": "2026-04-17", + "difficulty": "beginner", + "tasks": [ + "multimodal" + ], + "related_recipes": [] + }, + "model": { + "model_id": "PaddlePaddle/PaddleOCR-VL", + "min_vllm_version": "0.11.1", + "architecture": "dense", + "parameter_count": "0.9B", + "active_parameters": "0.9B", + "context_length": 131072, + "base_args": [ + "--trust-remote-code", + "--max-num-batched-tokens", + "16384", + "--no-enable-prefix-caching", + "--mm-processor-cache-gb", + "0" + ], + "base_env": {} + }, + "dependencies": [ + { + "note": "PaddlePaddle runtime (install in a separate venv from vllm to avoid conflicts)", + "command": "uv pip install paddlepaddle-gpu==3.2.1 --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/" + }, + { + "note": "PaddleOCR document-parsing helpers", + "command": "uv pip install -U \"paddleocr[doc-parser]\"" + }, + { + "note": "Safetensors loader used by the doc-parser path", + "command": "uv pip install safetensors" + } + ], + "features": { + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. 
Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only", + "encoder_parallel" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 2, + "description": "BF16 weights \u2014 small footprint, runs on most GPUs" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": { + "amd": { + "extra_args": [], + "extra_env": { + "SAFETENSORS_FAST_GPU": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\n[PaddleOCR-VL](https://huggingface.co/PaddlePaddle/PaddleOCR-VL) is a SOTA resource-efficient\nmodel for document parsing. Its core (PaddleOCR-VL-0.9B) combines a NaViT-style dynamic\nresolution visual encoder with an ERNIE-4.5-0.3B language model, optimized for OCR,\ntables, formulas, and chart recognition.\n\n## Prerequisites\n\n- Hardware: 1x GPU (small VRAM footprint)\n- vLLM >= 0.11.1 (nightly if not released yet)\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --pre \\\n --extra-index-url https://wheels.vllm.ai/nightly \\\n --extra-index-url https://download.pytorch.org/whl/cu129 \\\n --index-strategy unsafe-best-match\n```\n\n## Launch command\n\n```bash\nvllm serve PaddlePaddle/PaddleOCR-VL \\\n --trust-remote-code \\\n --max-num-batched-tokens 16384 \\\n --no-enable-prefix-caching \\\n --mm-processor-cache-gb 0\n```\n\nTip: OCR workloads don't benefit much from prefix caching or image reuse, so disable\nthose to avoid hashing/caching overhead.\n\n## Client Usage\n\nTask-specific prompts:\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\", timeout=3600)\n\nTASKS = {\n 
\"ocr\": \"OCR:\",\n \"table\": \"Table Recognition:\",\n \"formula\": \"Formula Recognition:\",\n \"chart\": \"Chart Recognition:\",\n}\n\nresponse = client.chat.completions.create(\n model=\"PaddlePaddle/PaddleOCR-VL\",\n messages=[{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://.../receipt.png\"}},\n {\"type\": \"text\", \"text\": TASKS[\"ocr\"]},\n ],\n }],\n temperature=0.0,\n)\nprint(response.choices[0].message.content)\n```\n\n## Offline Inference with PP-DocLayoutV2\n\nUse separate venvs for `vllm` and `paddlepaddle` to avoid conflicts. If you see\n\"The model PaddleOCR-VL-0.9B does not exist.\", add `--served-model-name PaddleOCR-VL-0.9B`.\n\n```bash\nuv pip install paddlepaddle-gpu==3.2.1 --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/\nuv pip install -U \"paddleocr[doc-parser]\"\nuv pip install safetensors\n```\n\n```python\nfrom paddleocr import PaddleOCRVL\n\npipeline = PaddleOCRVL(\n vl_rec_backend=\"vllm-server\",\n vl_rec_server_url=\"http://localhost:8000/v1\",\n layout_detection_model_name=\"PP-DocLayoutV2\",\n layout_detection_model_dir=\"/path/to/your/PP-DocLayoutV2/\",\n)\n\noutput = pipeline.predict(\"https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png\")\nfor i, res in enumerate(output):\n res.save_to_json(save_path=f\"output_{i}.json\")\n res.save_to_markdown(save_path=f\"output_{i}.md\")\n```\n\n## References\n\n- [PaddleOCR-VL on Hugging Face](https://huggingface.co/PaddlePaddle/PaddleOCR-VL)\n- [PaddleOCR GitHub](https://github.com/PaddlePaddle/PaddleOCR)\n" + } + }, + "Qwen/Qwen-Image": { + "hf_id": "Qwen/Qwen-Image", + "meta": { + "title": "Qwen-Image", + "provider": "Qwen", + "description": "Text-to-image diffusion model (20B parameters) from the Qwen-Image family, served via vLLM-Omni.", + "tasks": [ + "omni" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "20B" + }, + 
"recipe": { + "meta": { + "title": "Qwen-Image", + "slug": "qwen-image", + "provider": "Qwen", + "description": "Text-to-image diffusion model (20B parameters) from the Qwen-Image family, served via vLLM-Omni.", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "omni" + ], + "performance_headline": "Shared DiT core across T2I, image editing, and layered-image variants; accelerated via Cache-DiT, TeaCache, and sequence parallelism", + "related_recipes": [] + }, + "model": { + "model_id": "Qwen/Qwen-Image", + "min_vllm_version": "0.18.0", + "architecture": "dense", + "parameter_count": "20B", + "active_parameters": "20B", + "context_length": 0, + "base_args": [], + "base_env": {} + }, + "omni": { + "tasks": [ + "t2i" + ] + }, + "dependencies": [ + { + "note": "vLLM-Omni must be installed from source and pins vllm==0.18.0 for diffusion support", + "command": "git clone https://github.com/vllm-project/vllm-omni.git && cd vllm-omni && uv pip install -e . vllm==0.18.0" + } + ], + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 48, + "description": "Full precision BF16 \u2014 use CPU offload / layerwise offload for lower VRAM" + }, + "fp8": { + "precision": "fp8", + "vram_minimum_gb": 24, + "description": "FP8 quantization with `img_mlp` kept at full precision for quality.", + "extra_args": [ + "--quantization", + "fp8", + "--ignored-layers", + "img_mlp" + ] + }, + "int8": { + "precision": "int8", + "vram_minimum_gb": 24, + "description": "INT8 quantization (`--quantization int8`).", + "extra_args": [ + "--quantization", + "int8" + ] + } + }, + "compatible_strategies": [ + "single_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nQwen-Image is a diffusion-based text-to-image model. 
This recipe documents the Qwen-Image family served via **vLLM-Omni**:\n\n| Model | HuggingFace | Description |\n|-------|-------------|-------------|\n| **Qwen-Image** | [Qwen/Qwen-Image](https://huggingface.co/Qwen/Qwen-Image) | Text-to-image (20B, Aug 2025) |\n| **Qwen-Image-2512** | [Qwen/Qwen-Image-2512](https://huggingface.co/Qwen/Qwen-Image-2512) | Updated T2I with enhanced realism (Dec 2025) |\n| **Qwen-Image-Edit** | [Qwen/Qwen-Image-Edit](https://huggingface.co/Qwen/Qwen-Image-Edit) | Single-image editing (Aug 2025) |\n| **Qwen-Image-Edit-2509** | [Qwen/Qwen-Image-Edit-2509](https://huggingface.co/Qwen/Qwen-Image-Edit-2509) | Multi-image editing (Sep 2025) |\n| **Qwen-Image-Edit-2511** | [Qwen/Qwen-Image-Edit-2511](https://huggingface.co/Qwen/Qwen-Image-Edit-2511) | Enhanced consistency + built-in LoRA (Nov 2025) |\n| **Qwen-Image-Layered** | [Qwen/Qwen-Image-Layered](https://huggingface.co/Qwen/Qwen-Image-Layered) | Decomposes input into RGBA layers (Dec 2025) |\n\nAll models share the same DiT transformer core \u2014 acceleration methods are applicable across the entire series.\n\n## Prerequisites\n\n```bash\ngit clone https://github.com/vllm-project/vllm-omni.git\ncd vllm-omni\nuv venv\nsource .venv/bin/activate\nuv pip install -e . 
vllm==0.18.0\n```\n\n## Usage\n\n### Text-to-Image\n```bash\npython3 ./examples/offline_inference/text_to_image/text_to_image.py \\\n --model Qwen/Qwen-Image \\\n --prompt \"a cup of coffee on the table\" \\\n --output output_qwen_image.png \\\n --num-inference-steps 50 \\\n --cfg-scale 4.0\n```\n\n### Image Editing (Qwen-Image-Edit)\n```bash\npython3 ./examples/offline_inference/image_to_image/image_edit.py \\\n --model Qwen/Qwen-Image-Edit \\\n --image qwen_bear.png \\\n --prompt \"Let this mascot dance under the moon, surrounded by floating stars\" \\\n --output output_image_edit.png \\\n --num-inference-steps 50 \\\n --cfg-scale 4.0\n```\n\n### Layered RGBA Decomposition\n```bash\npython3 ./examples/offline_inference/image_to_image/image_edit.py \\\n --model Qwen/Qwen-Image-Layered \\\n --image input.png \\\n --prompt \"\" \\\n --output layered \\\n --num-inference-steps 50 \\\n --cfg-scale 4.0 \\\n --layers 4 \\\n --color-format \"RGBA\"\n```\n\n## Acceleration\n\nPick one cache backend AND any supported parallel strategy.\n\n### Cache-DiT\n```bash\npython3 ./examples/offline_inference/text_to_image/text_to_image.py \\\n --model Qwen/Qwen-Image --prompt \"...\" --cache-backend cache_dit\n```\n\n### TeaCache\n```bash\npython3 ./examples/offline_inference/text_to_image/text_to_image.py \\\n --model Qwen/Qwen-Image --prompt \"...\" --cache-backend tea_cache\n```\n\n### Ulysses / Ring Sequence Parallelism\n```bash\npython3 ./examples/offline_inference/text_to_image/text_to_image.py \\\n --model Qwen/Qwen-Image --prompt \"...\" --ulysses-degree 4\n```\n```bash\npython3 ./examples/offline_inference/text_to_image/text_to_image.py \\\n --model Qwen/Qwen-Image --prompt \"...\" --ring-degree 4\n```\n\n### CFG Parallelism (2 GPUs, non-distilled models with `cfg-scale > 1`)\n```bash\npython3 ./examples/offline_inference/image_to_image/image_edit.py \\\n --model Qwen/Qwen-Image-Edit --image qwen_bear.png --prompt \"...\" \\\n --cfg-parallel-size 2 --num-inference-steps 50 
--cfg-scale 4.0\n```\n\n### Tensor Parallelism\n```bash\npython3 ./examples/offline_inference/text_to_image/text_to_image.py \\\n --model Qwen/Qwen-Image --prompt \"...\" --tensor-parallel-size 2\n```\n\n### CPU / Layerwise Offload (low VRAM)\n```bash\npython3 ./examples/offline_inference/text_to_image/text_to_image.py \\\n --model Qwen/Qwen-Image --prompt \"...\" --enable-cpu-offload\n```\n```bash\npython3 ./examples/offline_inference/image_to_image/image_edit.py \\\n --model Qwen/Qwen-Image-Edit --image qwen_bear.png --prompt \"...\" \\\n --enable-layerwise-offload\n```\n\n### VAE Patch Parallelism\n```bash\npython3 ./examples/offline_inference/text_to_image/text_to_image.py \\\n --model Qwen/Qwen-Image --prompt \"...\" \\\n --height 1536 --width 1536 \\\n --ulysses-degree 2 --vae-patch-parallel-size 2\n```\nMust be combined with another parallel method.\n\n### Quantization (Qwen-Image / Qwen-Image-2512 only)\n```bash\npython3 ./examples/offline_inference/text_to_image/text_to_image.py \\\n --model Qwen/Qwen-Image --prompt \"...\" --quantization fp8 \\\n --ignored-layers \"img_mlp\"\n```\n```bash\npython3 ./examples/offline_inference/text_to_image/text_to_image.py \\\n --model Qwen/Qwen-Image --prompt \"...\" --quantization int8\n```\n\nQwen-Image-Edit variants do **not** support quantization.\n\n## Configuration Tips\n\n- **Cache + SP** is the recommended combo for long-sequence generation.\n- **Sequence parallelism** (Ulysses / Ring) beats TP for high-res / long-sequence.\n- **Tensor parallelism** is most useful when model weights alone don't fit on one GPU.\n- **CFG parallelism** targets non-distilled diffusion with full CFG (not for guidance-distilled models).\n- To reduce peak VRAM, use CPU/layerwise offload and/or VAE patch parallelism.\n- TeaCache and Cache-DiT cannot be used together.\n- `--enforce-eager` disables torch.compile if needed.\n\nSee the [Feature Support 
Table](https://github.com/vllm-project/vllm-omni/blob/main/docs/user_guide/diffusion_features.md#supported-models) and [Feature Compatibility Guide](https://github.com/vllm-project/vllm-omni/blob/main/docs/user_guide/feature_compatibility.md) for combinations.\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen-Image)\n- [vLLM-Omni](https://github.com/vllm-project/vllm-omni)\n" + } + }, + "Qwen/Qwen2.5-VL-72B-Instruct": { + "hf_id": "Qwen/Qwen2.5-VL-72B-Instruct", + "meta": { + "title": "Qwen2.5-VL-72B-Instruct", + "provider": "Qwen", + "description": "Qwen2.5-VL dense vision-language model (72B) for high-quality image and video understanding.", + "tasks": [ + "multimodal", + "text" + ], + "hardware": { + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "72B" + }, + "recipe": { + "meta": { + "title": "Qwen2.5-VL-72B-Instruct", + "slug": "qwen2.5-vl-72b-instruct", + "provider": "Qwen", + "description": "Qwen2.5-VL dense vision-language model (72B) for high-quality image and video understanding.", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Verified on 4x A100 and 4x MI300X/MI325X/MI355X with BF16", + "related_recipes": [], + "hardware": { + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "Qwen/Qwen2.5-VL-72B-Instruct", + "min_vllm_version": "0.7.0", + "architecture": "dense", + "parameter_count": "72B", + "active_parameters": "72B", + "context_length": 128000, + "base_args": [], + "base_env": {} + }, + "features": { + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. 
Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 173, + "description": "Full precision BF16 \u2014 4x A100 80GB or 4x MI300X/MI325X/MI355X" + }, + "awq": { + "model_id": "Qwen/Qwen2.5-VL-72B-Instruct-AWQ", + "precision": "int4", + "vram_minimum_gb": 43, + "description": "AWQ 4-bit quantized weights", + "extra_args": [ + "--quantization", + "awq" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp" + ], + "hardware_overrides": { + "hopper": { + "extra_args": [ + "--mm-encoder-tp-mode", + "data" + ], + "extra_env": {} + }, + "blackwell": { + "extra_args": [ + "--mm-encoder-tp-mode", + "data" + ], + "extra_env": {} + }, + "amd": { + "extra_args": [ + "--mm-encoder-tp-mode", + "data", + "--limit-mm-per-prompt", + "{\"image\":2,\"video\":0}" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nThis guide describes how to run the Qwen2.5-VL series with vLLM on NVIDIA and AMD GPUs. Since BF16 is the commonly used precision for Qwen2.5-VL training, using BF16 in inference ensures the best accuracy.\n\n## Prerequisites\n\n### NVIDIA\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n### AMD ROCm (MI300X, MI325X, MI355X)\n> Note: The vLLM wheel for ROCm requires Python 3.12, ROCm 7.0, and glibc >= 2.35.
Use the Docker flow if your environment is incompatible.\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm\n```\n\n### TPU Deployment\n- [Qwen2.5-VL on Trillium (v6e)](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Qwen2.5-VL)\n\n## Deployment Configurations\n\n### 4xA100 (BF16, TP=4)\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\nvllm serve Qwen/Qwen2.5-VL-72B-Instruct \\\n --host 0.0.0.0 \\\n --port 8000 \\\n --tensor-parallel-size 4 \\\n --mm-encoder-tp-mode data \\\n --limit-mm-per-prompt '{\"image\":2,\"video\":0}'\n```\n\n### 4xMI300X/MI325X/MI355X (BF16, TP=4)\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\nexport VLLM_ROCM_USE_AITER=1\nvllm serve Qwen/Qwen2.5-VL-72B-Instruct \\\n --host 0.0.0.0 \\\n --port 8000 \\\n --tensor-parallel-size 4 \\\n --mm-encoder-tp-mode data \\\n --limit-mm-per-prompt '{\"image\":2,\"video\":0}'\n```\n\n### Qwen2.5-VL-7B-Instruct (DP=4)\nFor the medium-size 7B model, data parallelism works better.\n\n```bash\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\nvllm serve Qwen/Qwen2.5-VL-7B-Instruct \\\n --host 0.0.0.0 \\\n --port 8000 \\\n --data-parallel-size 4 \\\n --limit-mm-per-prompt '{\"image\":2,\"video\":0}'\n```\n\n## Configuration Tips\n\n- `--max-model-len 65536` is usually good for most scenarios (native context is 128K).\n- For A100-80GB devices, TP must be >= 2 to avoid OOM.\n- `--limit-mm-per-prompt` caps incoming multimodal requests.\n- `--mm-encoder-tp-mode data` deploys the small ViT encoder in DP fashion (ViT \u2248 675M vs 72B LM).\n- vLLM uses 90% of GPU memory by default; set `--gpu-memory-utilization=0.95` to maximize KV cache.\n\n## Benchmarking\n\nLaunch the server with `--no-enable-prefix-caching` to get consistent measurements.\n\n### VisionArena-Chat\n```bash\nvllm bench serve \\\n --host 0.0.0.0 \\\n --port 8000 \\\n --backend openai-chat \\\n --endpoint /v1/chat/completions \\\n --model Qwen/Qwen2.5-VL-72B-Instruct \\\n
--dataset-name hf \\\n --dataset-path lmarena-ai/VisionArena-Chat \\\n --num-prompts 128\n```\n\n### Random Synthetic\n```bash\nvllm bench serve \\\n --host 0.0.0.0 \\\n --port 8000 \\\n --model Qwen/Qwen2.5-VL-72B-Instruct \\\n --dataset-name random \\\n --random-input-len 8000 \\\n --random-output-len 1000 \\\n --num-prompts 128\n```\n\nWorkload mixes:\n- Prompt-heavy: 8000 in / 1000 out\n- Decode-heavy: 1000 in / 8000 out\n- Balanced: 1000 in / 1000 out\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct)\n- [vLLM multimodal inputs guide](https://docs.vllm.ai/en/latest/features/multimodal_inputs.html)\n" + } + }, + "Qwen/Qwen3-235B-A22B-Instruct-2507": { + "hf_id": "Qwen/Qwen3-235B-A22B-Instruct-2507", + "meta": { + "title": "Qwen3-235B-A22B-Instruct", + "provider": "Qwen", + "description": "Flagship Qwen3 MoE instruct model with 235B total and 22B active parameters, tuned for high-quality text generation.", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "235B" + }, + "recipe": { + "meta": { + "title": "Qwen3-235B-A22B-Instruct", + "slug": "qwen3-235b-a22b-instruct-2507", + "provider": "Qwen", + "description": "Flagship Qwen3 MoE instruct model with 235B total and 22B active parameters, tuned for high-quality text generation.", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "Verified on 4x/8x H200, MI300X/MI325X/MI355X nodes (BF16 and FP8)", + "related_recipes": [], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "Qwen/Qwen3-235B-A22B-Instruct-2507", + "min_vllm_version": "0.10.0", + "architecture": "moe", + "parameter_count": "235B", + "active_parameters": "22B", + "context_length": 262144, + 
"base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Hermes-compatible parser", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "hermes" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 564, + "description": "Full precision BF16 \u2014 requires 4x H200 or 8x MI300X/MI325X/MI355X" + }, + "fp8": { + "model_id": "Qwen/Qwen3-235B-A22B-FP8", + "precision": "fp8", + "vram_minimum_gb": 240, + "tp": 4, + "description": "Qwen official FP8 checkpoint for improved efficiency on SM90+" + }, + "nvfp4": { + "model_id": "nvidia/Qwen3-235B-A22B-Instruct-2507-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 141, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Qwen3](https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507) is the flagship instruct MoE model in the Qwen3 series, with 235B total parameters and 22B active parameters. This guide covers deploying the model efficiently with vLLM on NVIDIA and AMD GPUs.\n\n## Prerequisites\n\n### NVIDIA CUDA (pip)\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend=auto\n```\n\n### AMD ROCm (pip)\n> Note: The vLLM wheel for ROCm requires Python 3.12, ROCm 7.0, and glibc >= 2.35. 
Use the Docker flow if your environment is incompatible.\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700\n```\n\n## Deployment Configurations\n\n### BF16 on MI300X/MI325X/MI355X (4 GPUs)\n\n```bash\nHIP_VISIBLE_DEVICES=\"4,5,6,7\" \\\nVLLM_ROCM_USE_AITER=1 \\\nVLLM_ROCM_USE_AITER_MHA=0 \\\nSAFETENSORS_FAST_GPU=1 \\\nvllm serve Qwen/Qwen3-235B-A22B \\\n --trust-remote-code \\\n -tp 4 \\\n --disable-log-requests \\\n --swap-space 32 \\\n --distributed-executor-backend mp \\\n --max-num-batched-tokens 32768 \\\n --max-model-len 32768 \\\n --no-enable-prefix-caching \\\n --gpu-memory-utilization 0.8\n```\n\n### FP8 on MI300X/MI325X/MI355X (4 GPUs)\n\n```bash\nHIP_VISIBLE_DEVICES=\"4,5,6,7\" \\\nVLLM_ROCM_USE_AITER=1 \\\nVLLM_ROCM_USE_AITER_MHA=0 \\\nSAFETENSORS_FAST_GPU=1 \\\nvllm serve Qwen/Qwen3-235B-A22B-FP8 \\\n --trust-remote-code \\\n -tp 4 \\\n --disable-log-requests \\\n --swap-space 16 \\\n --distributed-executor-backend mp \\\n --max-num-batched-tokens 32768 \\\n --max-model-len 32768 \\\n --no-enable-prefix-caching \\\n --gpu-memory-utilization 0.8\n```\n\n### TPU Deployment\n\n- [Qwen3-32B on Trillium (v6e)](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Qwen3)\n- [Qwen3-4B on Trillium (v6e)](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Qwen3)\n\n## Client Usage\n\n```bash\nvllm bench serve \\\n --model \"Qwen/Qwen3-235B-A22B-FP8\" \\\n --dataset-name random \\\n --random-input-len 8192 \\\n --random-output-len 1024 \\\n --request-rate 10000 \\\n --num-prompts 16 \\\n --ignore-eos \\\n --trust-remote-code\n```\n\n## Troubleshooting\n\n- Use `--max-num-batched-tokens` and `--max-model-len` to fit memory constraints on smaller nodes.\n- If OOM at startup, lower `--gpu-memory-utilization` (e.g. 
0.8) and disable prefix caching.\n- AMD builds require `VLLM_ROCM_USE_AITER=1` for best performance.\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507)\n- [FP8 checkpoint](https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8)\n- [vLLM documentation](https://docs.vllm.ai/)\n" + } + }, + "Qwen/Qwen3-ASR-1.7B": { + "hf_id": "Qwen/Qwen3-ASR-1.7B", + "meta": { + "title": "Qwen3-ASR-1.7B", + "provider": "Qwen", + "description": "Speech-to-text model supporting 11 languages, multiple accents, and singing voice with customizable text-context prompting.", + "tasks": [ + "multimodal" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "2.3B" + }, + "recipe": { + "meta": { + "title": "Qwen3-ASR-1.7B", + "slug": "qwen3-asr-1.7b", + "provider": "Qwen", + "description": "Speech-to-text model supporting 11 languages, multiple accents, and singing voice with customizable text-context prompting.", + "date_updated": "2026-04-17", + "difficulty": "beginner", + "tasks": [ + "multimodal" + ], + "performance_headline": "Accurate multilingual ASR, including singing voice; single-GPU serving", + "related_recipes": [] + }, + "model": { + "model_id": "Qwen/Qwen3-ASR-1.7B", + "min_vllm_version": "0.12.0", + "architecture": "dense", + "parameter_count": "2.3B", + "active_parameters": "2.3B", + "context_length": 65536, + "base_args": [], + "base_env": {} + }, + "dependencies": [ + { + "note": "Audio extras \u2014 required for ASR input pre-processing (librosa, soundfile)", + "command": "uv pip install -U \"vllm[audio]\"" + } + ], + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 4, + "description": "Full precision BF16 \u2014 fits on a single mid-range GPU" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": { + "amd": { + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + 
"SAFETENSORS_FAST_GPU": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nQwen3-ASR is a speech-to-text model that achieves accurate and robust recognition across 11 languages and multiple accents. It supports prompting the model with text context in any format to produce customized ASR results and performs well on singing-voice recognition. This guide demonstrates how to deploy Qwen3-ASR efficiently with vLLM.\n\n## Prerequisites\n\nInstall vLLM with audio dependencies:\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --pre \\\n --extra-index-url https://wheels.vllm.ai/nightly/cu129 \\\n --extra-index-url https://download.pytorch.org/whl/cu129 \\\n --index-strategy unsafe-best-match\nuv pip install \"vllm[audio]\"\n```\n\n## Launching with vLLM\n\n```bash\nvllm serve Qwen/Qwen3-ASR-1.7B\n```\n\n## Client Usage\n\n### Chat Completions (OpenAI SDK)\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\n\nresponse = client.chat.completions.create(\n model=\"Qwen/Qwen3-ASR-1.7B\",\n messages=[{\n \"role\": \"user\",\n \"content\": [{\n \"type\": \"audio_url\",\n \"audio_url\": {\"url\": \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav\"},\n }],\n }],\n)\nprint(response.choices[0].message.content)\n```\n\n### Transcription API\n```python\nimport httpx\nfrom openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\naudio_url = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav\"\naudio_file = httpx.get(audio_url).content\n\ntranscription = client.audio.transcriptions.create(\n model=\"Qwen/Qwen3-ASR-1.7B\",\n file=audio_file,\n)\nprint(transcription.text)\n```\n\n### cURL\n```bash\ncurl http://localhost:8000/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"messages\": [\n {\"role\": \"user\", \"content\": [\n {\"type\": \"audio_url\", \"audio_url\": 
{\"url\": \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav\"}}\n ]}\n ]\n }'\n```\n\n### Offline Inference\n```python\nfrom vllm import LLM, SamplingParams\nfrom vllm.assets.audio import AudioAsset\n\nllm = LLM(model=\"Qwen/Qwen3-ASR-1.7B\")\naudio_asset = AudioAsset(\"winning_call\")\n\nconversation = [{\n \"role\": \"user\",\n \"content\": [{\"type\": \"audio_url\", \"audio_url\": {\"url\": audio_asset.url}}],\n}]\n\nsampling_params = SamplingParams(temperature=0.01, max_tokens=256)\noutputs = llm.chat(conversation, sampling_params=sampling_params)\nprint(outputs[0].outputs[0].text)\n```\n\n## Troubleshooting\n\n- Make sure `vllm[audio]` extras are installed or audio requests will fail.\n- Use the nightly wheel until Qwen3-ASR support lands in the stable release.\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen3-ASR-1.7B)\n- [vLLM audio support](https://docs.vllm.ai/en/latest/features/multimodal_inputs.html)\n" + } + }, + "Qwen/Qwen3-Coder-480B-A35B-Instruct": { + "hf_id": "Qwen/Qwen3-Coder-480B-A35B-Instruct", + "meta": { + "title": "Qwen3-Coder-480B-A35B-Instruct", + "provider": "Qwen", + "description": "Large coder MoE with 480B total / 35B active parameters, strong tool-use and code generation capabilities.", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "480B" + }, + "recipe": { + "meta": { + "title": "Qwen3-Coder-480B-A35B-Instruct", + "slug": "qwen3-coder-480b-a35b-instruct", + "provider": "Qwen", + "description": "Large coder MoE with 480B total / 35B active parameters, strong tool-use and code generation capabilities.", + "date_updated": "2026-04-17", + "difficulty": "advanced", + "tasks": [ + "text" + ], + "performance_headline": "HumanEval 0.939, MBPP 0.918 (FP8). 
Recommended FP8 on 8x H200/H20 via DP=8", + "related_recipes": [], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "Qwen/Qwen3-Coder-480B-A35B-Instruct", + "min_vllm_version": "0.10.0", + "architecture": "moe", + "parameter_count": "480B", + "active_parameters": "35B", + "context_length": 262144, + "base_args": [], + "base_env": {} + }, + "dependencies": [ + { + "note": "Optional: opt into DeepGEMM FP8 MoE kernels for extra throughput", + "command": "export VLLM_USE_DEEP_GEMM=1", + "optional": true + } + ], + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Qwen3 Coder parser", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 1152, + "description": "Full BF16 \u2014 8x H200/H20 (141GB \u00d7 8) recommended" + }, + "fp8": { + "model_id": "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8", + "precision": "fp8", + "vram_minimum_gb": 576, + "tp": 4, + "description": "Qwen official FP8 checkpoint \u2014 required for DP=8 serving" + }, + "nvfp4": { + "model_id": "nvidia/Qwen3-Coder-480B-A35B-Instruct-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 288, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": { + "hopper": { + "extra_args": [], + "extra_env": { + "VLLM_USE_DEEP_GEMM": "1" + } + }, + "amd": { + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## 
Overview\n\n[Qwen3-Coder](https://github.com/QwenLM/Qwen3-Coder) is an advanced large language model created by the Qwen team. `Qwen3-Coder-480B-A35B-Instruct` is the flagship coder MoE with 480B total / 35B active parameters. vLLM supports it including tool calling; the guide below covers BF16 and FP8 serving on NVIDIA and AMD GPUs.\n\n## Prerequisites\n\n### CUDA\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n### ROCm (MI300X, MI325X, MI355X)\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/\n```\nThe ROCm wheel requires Python 3.12, ROCm 7.0, and glibc >= 2.35.\n\n## Deployment Configurations\n\n### 8xH200 / 8xH20 BF16\n```bash\nvllm serve Qwen/Qwen3-Coder-480B-A35B-Instruct \\\n --max-model-len 32000 \\\n --enable-expert-parallel \\\n --tensor-parallel-size 8 \\\n --enable-auto-tool-choice \\\n --tool-call-parser qwen3_coder\n```\n\n### 8xH200 / 8xH20 FP8 (DP=8, recommended)\n```bash\nVLLM_USE_DEEP_GEMM=1 vllm serve Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 \\\n --max-model-len 131072 \\\n --enable-expert-parallel \\\n --data-parallel-size 8 \\\n --enable-auto-tool-choice \\\n --tool-call-parser qwen3_coder\n```\n\n### 8xMI300X/MI325X/MI355X BF16\n```bash\nVLLM_ROCM_USE_AITER=1 vllm serve Qwen/Qwen3-Coder-480B-A35B-Instruct \\\n --max-model-len 32000 \\\n --enable-expert-parallel \\\n --tensor-parallel-size 8 \\\n --enable-auto-tool-choice \\\n --tool-call-parser qwen3_coder\n```\n\n### 8xMI300X/MI325X/MI355X FP8\n```bash\nVLLM_ROCM_USE_AITER=1 vllm serve Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 \\\n --trust-remote-code \\\n --max-model-len 131072 \\\n --enable-expert-parallel \\\n --data-parallel-size 8 \\\n --enable-auto-tool-choice \\\n --tool-call-parser qwen3_coder\n```\n\n## Client Usage\n\n```bash\nvllm bench serve \\\n --backend vllm \\\n --model Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 \\\n --endpoint /v1/completions \\\n --dataset-name random 
\\\n --random-input 2048 \\\n --random-output 1024 \\\n --max-concurrency 10 \\\n --num-prompt 100\n```\n\n## Troubleshooting\n\n- **Context-length OOM**: A single H20 node cannot serve the native 262144 context. Reduce `--max-model-len` or raise `--gpu-memory-utilization`.\n- **TP=8 failure on FP8**: Expect `ValueError: The output_size of gate's and up's weight = 320 is not divisible by weight quantization block_n = 128.` on FP8 with TP=8. Switch to `--data-parallel-size 8` instead.\n- **DeepGEMM**: set `VLLM_USE_DEEP_GEMM=1` for faster FP8 matmul. Follow the [setup instructions](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/deepgemm/README.md#setup) to install it.\n- **Tool calling**: add `--tool-call-parser qwen3_coder` as shown above.\n\n## Evaluation\n\n| Dataset | Test Type | Pass@1 Score |\n|------------|--------------------|--------------|\n| HumanEval | Base tests | 0.939 |\n| HumanEval+ | Base + extra tests | 0.902 |\n| MBPP | Base tests | 0.918 |\n| MBPP+ | Base + extra tests | 0.794 |\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct)\n- [FP8 checkpoint](https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8)\n- [Qwen3-Coder GitHub](https://github.com/QwenLM/Qwen3-Coder)\n- [EvalPlus](https://github.com/evalplus/evalplus)\n" + } + }, + "Qwen/Qwen3-Next-80B-A3B-Instruct": { + "hf_id": "Qwen/Qwen3-Next-80B-A3B-Instruct", + "meta": { + "title": "Qwen3-Next-80B-A3B-Instruct", + "provider": "Qwen", + "description": "Advanced Qwen3-Next MoE model (80B total / 3B active) with hybrid attention, highly sparse experts, and multi-token prediction.", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "80B" + }, + "recipe": { + "meta": { + "title": "Qwen3-Next-80B-A3B-Instruct", + "slug": "qwen3-next-80b-a3b-instruct", + "provider": "Qwen", 
+ "description": "Advanced Qwen3-Next MoE model (80B total / 3B active) with hybrid attention, highly sparse experts, and multi-token prediction.", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "Highly sparse MoE with MTP-accelerated decoding, runs on 4x H200/H20/A100/A800", + "related_recipes": [], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "Qwen/Qwen3-Next-80B-A3B-Instruct", + "min_vllm_version": "0.10.0", + "architecture": "moe", + "parameter_count": "80B", + "active_parameters": "3B", + "context_length": 262144, + "base_args": [], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Hermes parser", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "hermes" + ] + }, + "spec_decoding": { + "description": "Multi-token prediction speculative decoding for lower latency", + "args": [ + "--speculative-config", + "{\"method\":\"qwen3_next_mtp\",\"num_speculative_tokens\":2}", + "--no-enable-chunked-prefill" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 192, + "description": "Full precision BF16 \u2014 fits on 4x H200/H20/A100/A800" + }, + "fp8": { + "model_id": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", + "precision": "fp8", + "vram_minimum_gb": 96, + "description": "Qwen official FP8 checkpoint \u2014 recommended on SM90/SM100" + }, + "nvfp4": { + "model_id": "nvidia/Qwen3-Next-80B-A3B-Instruct-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 48, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + 
"multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep" + ], + "hardware_overrides": { + "blackwell": { + "extra_args": [], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP8": "1", + "VLLM_FLASHINFER_MOE_BACKEND": "latency", + "VLLM_USE_DEEP_GEMM": "0", + "VLLM_USE_TRTLLM_ATTENTION": "0", + "VLLM_ATTENTION_BACKEND": "FLASH_ATTN" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Qwen3-Next](https://qwen.ai/blog?id=4074cca80393150c248e508aa62983f9cb7d27cd&from=research.latest-advancements-list) is an advanced LLM from the Qwen team featuring:\n- A hybrid attention mechanism\n- A highly sparse Mixture-of-Experts (MoE) structure\n- Training-stability-friendly optimizations\n- A multi-token prediction mechanism for faster inference\n\n## Prerequisites\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n## Deployment Configurations\n\nLaunch on 4x H200/H20 or 4x A100/A800 GPUs.\n\n### Basic Multi-GPU (BF16)\n```bash\nvllm serve Qwen/Qwen3-Next-80B-A3B-Instruct \\\n --tensor-parallel-size 4 \\\n --served-model-name qwen3-next \\\n --enable-prefix-caching\n```\n\nIf you hit `torch.AcceleratorError: CUDA error: an illegal memory access was encountered`, add `--compilation_config.cudagraph_mode=PIECEWISE`.\n\n### FP8 (SM90/SM100)\n```bash\nvllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \\\n --tensor-parallel-size 4 \\\n --enable-prefix-caching\n```\n\nOn SM100, accelerate with the FP8 FlashInfer TRTLLM MoE kernel:\n```bash\nVLLM_USE_FLASHINFER_MOE_FP8=1 \\\nVLLM_FLASHINFER_MOE_BACKEND=latency \\\nVLLM_USE_DEEP_GEMM=0 \\\nVLLM_USE_TRTLLM_ATTENTION=0 \\\nVLLM_ATTENTION_BACKEND=FLASH_ATTN \\\nvllm serve Qwen/Qwen3-Next-80B-A3B-Instruct-FP8 \\\n --tensor-parallel-size 4\n```\n\n### MTP (Multi-Token Prediction)\n```bash\nvllm serve Qwen/Qwen3-Next-80B-A3B-Instruct \\\n --tokenizer-mode auto --gpu-memory-utilization 0.8 \\\n --speculative-config '{\"method\": \"qwen3_next_mtp\", 
\"num_speculative_tokens\": 2}' \\\n --tensor-parallel-size 4 --no-enable-chunked-prefill\n```\n\n### Tool / Function Calling\n```bash\nvllm serve ... --tool-call-parser hermes --enable-auto-tool-choice\n```\n\n### AMD (MI300X/MI325X/MI355X)\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700\n```\n```bash\nSAFETENSORS_FAST_GPU=1 \\\nVLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \\\nvllm serve Qwen/Qwen3-Next-80B-A3B-Instruct \\\n --tensor-parallel-size 4 \\\n --max-model-len 32768 \\\n --no-enable-prefix-caching \\\n --trust-remote-code\n```\n\n## Client Usage\n\nBenchmark:\n```bash\nvllm bench serve \\\n --backend vllm \\\n --model Qwen/Qwen3-Next-80B-A3B-Instruct \\\n --served-model-name qwen3-next \\\n --endpoint /v1/completions \\\n --dataset-name random \\\n --random-input 2048 \\\n --random-output 1024 \\\n --max-concurrency 10 \\\n --num-prompt 100\n```\n\n## Troubleshooting\n\n- **Sub-optimal MoE performance warning**: Tune the MoE Triton kernel with [benchmark_moe](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py), then set `VLLM_TUNED_CONFIG_FOLDER` to the directory containing the generated config.\n- **IMA error in DP mode**: add `--compilation_config.cudagraph_mode=PIECEWISE`.\n- For more parallel topologies, see the [Data Parallel Deployment docs](https://docs.vllm.ai/en/latest/serving/data_parallel_deployment.html).\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct)\n- [FP8 checkpoint](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct-FP8)\n- [Qwen3-Next blog](https://qwen.ai/blog?id=4074cca80393150c248e508aa62983f9cb7d27cd)\n" + } + }, + "Qwen/Qwen3-VL-235B-A22B-Instruct": { + "hf_id": "Qwen/Qwen3-VL-235B-A22B-Instruct", + "meta": { + "title": "Qwen3-VL-235B-A22B-Instruct", + "provider": "Qwen", + "description": "Qwen3-VL flagship MoE vision-language model with 235B total / 22B active parameters, supporting images, video, and long context.", + 
"tasks": [ + "multimodal", + "text" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "b200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "235B" + }, + "recipe": { + "meta": { + "title": "Qwen3-VL-235B-A22B-Instruct", + "slug": "qwen3-vl-235b-a22b-instruct", + "provider": "Qwen", + "description": "Qwen3-VL flagship MoE vision-language model with 235B total / 22B active parameters, supporting images, video, and long context.", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Strong on images, video, and text \u2014 #1 open model on text on lmarena.ai at release", + "related_recipes": [], + "hardware": { + "h100": "verified", + "h200": "verified", + "b200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "Qwen/Qwen3-VL-235B-A22B-Instruct", + "min_vllm_version": "0.11.0", + "architecture": "moe", + "parameter_count": "235B", + "active_parameters": "22B", + "context_length": 262144, + "base_args": [], + "base_env": {} + }, + "dependencies": [ + { + "note": "Recommended for offline multimodal inference (image/video pre-processing helpers)", + "command": "uv pip install qwen-vl-utils==0.0.14", + "optional": true + } + ], + "features": { + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. 
Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 564, + "description": "Full BF16 \u2014 ideal on H200/B200 with 8 GPUs" + }, + "fp8": { + "model_id": "Qwen/Qwen3-VL-235B-A22B-Instruct-FP8", + "precision": "fp8", + "vram_minimum_gb": 282, + "tp": 4, + "description": "Qwen official FP8 checkpoint for optimal H100 memory efficiency" + }, + "nvfp4": { + "model_id": "nvidia/Qwen3-VL-235B-A22B-Instruct-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 141, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": { + "hopper": { + "extra_args": [ + "--mm-encoder-tp-mode", + "data", + "--async-scheduling" + ], + "extra_env": {} + }, + "blackwell": { + "extra_args": [ + "--mm-encoder-tp-mode", + "data", + "--async-scheduling" + ], + "extra_env": {} + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Qwen3-VL](https://github.com/QwenLM/Qwen3-VL) is the most powerful vision-language model in the Qwen series, delivering upgrades to text understanding & generation, visual perception & reasoning, extended context, spatial/video dynamics, and agent interaction. 
The flagship `Qwen3-VL-235B-A22B-Instruct` is a MoE model that requires at least 8 GPUs with \u226580 GB memory each (A100/H100/H200 class).\n\n## Prerequisites\n\n```bash\nuv venv\nsource .venv/bin/activate\n\n# Install vLLM >= 0.11.0\nuv pip install -U vllm\n\n# Install Qwen-VL utility library (recommended for offline inference)\nuv pip install qwen-vl-utils==0.0.14\n```\n\n## Deployment Configurations\n\n### H100 (Image + Video, FP8)\n```bash\nvllm serve Qwen/Qwen3-VL-235B-A22B-Instruct-FP8 \\\n --tensor-parallel-size 8 \\\n --mm-encoder-tp-mode data \\\n --enable-expert-parallel \\\n --async-scheduling\n```\n\n### H100 (Image-Only, FP8, TP4)\n```bash\nvllm serve Qwen/Qwen3-VL-235B-A22B-Instruct-FP8 \\\n --tensor-parallel-size 4 \\\n --limit-mm-per-prompt.video 0 \\\n --async-scheduling \\\n --gpu-memory-utilization 0.95 \\\n --max-num-seqs 128\n```\n\n### A100 & H100 (Image-Only, BF16)\n```bash\nvllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \\\n --tensor-parallel-size 8 \\\n --limit-mm-per-prompt.video 0 \\\n --async-scheduling\n```\n\n### A100 & H100 (Image + Video, BF16)\n```bash\nvllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \\\n --tensor-parallel-size 8 \\\n --max-model-len 128000 \\\n --async-scheduling\n```\n\n### H200 & B200\n```bash\nvllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \\\n --tensor-parallel-size 8 \\\n --mm-encoder-tp-mode data \\\n --async-scheduling\n```\n\n### MI300X/MI325X/MI355X (BF16)\n```bash\nMIOPEN_USER_DB_PATH=\"$(pwd)/miopen\" \\\nMIOPEN_FIND_MODE=FAST \\\nVLLM_ROCM_USE_AITER=1 \\\nSAFETENSORS_FAST_GPU=1 \\\nvllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \\\n --tensor-parallel 4 \\\n --mm-encoder-tp-mode data\n```\n\n## Configuration Tips\n\n- Use `--limit-mm-per-prompt.video 0` if your server only serves image inputs to save memory.\n- `OMP_NUM_THREADS=1` reduces CPU contention during preprocessing.\n- The model's context length is 262K. Reduce `--max-model-len` (e.g. 
128000) if you don't need the full range.\n- `--async-scheduling` overlaps scheduling with decoding for better throughput.\n- `--mm-encoder-tp-mode data` deploys the vision encoder in data-parallel fashion for better performance.\n- If your inputs are mostly unique, pass `--mm-processor-cache-gb 0` to skip caching overhead.\n- Extend context with YaRN:\n `--rope-scaling '{\"rope_type\":\"yarn\",\"factor\":3.0,\"original_max_position_embeddings\":262144,\"mrope_section\":[24,20,20],\"mrope_interleaved\":true}' --max-model-len 1000000`\n\nText-only mode: pass `--limit-mm-per-prompt.video 0 --limit-mm-per-prompt.image 0` to free memory for KV cache when serving text-only traffic.\n\n## Client Usage\n\n```python\nimport time\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\", timeout=3600)\n\nmessages = [{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://ofasys-multimodal-wlcb-3-toshanghai.oss-accelerate.aliyuncs.com/wpf272043/keepme/image/receipt.png\"}},\n {\"type\": \"text\", \"text\": \"Read all the text in the image.\"},\n ],\n}]\n\nstart = time.time()\nresponse = client.chat.completions.create(\n model=\"Qwen/Qwen3-VL-235B-A22B-Instruct\",\n messages=messages,\n max_tokens=2048,\n)\nprint(f\"Response costs: {time.time() - start:.2f}s\")\nprint(response.choices[0].message.content)\n```\n\n## Troubleshooting\n\n- OOM on A100 / H100 BF16: reduce `--max-model-len`, drop to image-only, or switch to the FP8 checkpoint.\n- If enabling `--mm-encoder-tp-mode data` raises memory pressure, lower `--gpu-memory-utilization`.\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct)\n- [FP8 checkpoint](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct-FP8)\n- [Qwen3-VL GitHub](https://github.com/QwenLM/Qwen3-VL)\n- [vLLM multimodal inputs guide](https://docs.vllm.ai/en/latest/features/multimodal_inputs.html)\n" + } + }, + 
"Qwen/Qwen3.5-0.8B": { + "hf_id": "Qwen/Qwen3.5-0.8B", + "meta": { + "title": "Qwen3.5-0.8B", + "provider": "Qwen", + "description": "Qwen3.5 tiny dense multimodal model (0.8B) \u2014 ultra-low-VRAM / edge serving with 262K context", + "tasks": [ + "multimodal", + "text" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "0.8B" + }, + "recipe": { + "meta": { + "title": "Qwen3.5-0.8B", + "slug": "qwen3.5-0.8b", + "provider": "Qwen", + "description": "Qwen3.5 tiny dense multimodal model (0.8B) \u2014 ultra-low-VRAM / edge serving with 262K context", + "date_updated": "2026-04-22", + "difficulty": "beginner", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Tiny Qwen3.5 dense for edge / draft-model use", + "related_recipes": [ + "Qwen/Qwen3.5-2B" + ] + }, + "model": { + "model_id": "Qwen/Qwen3.5-0.8B", + "min_vllm_version": "0.17.0", + "architecture": "dense", + "parameter_count": "0.8B", + "active_parameters": "0.8B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Qwen3 Coder parser", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "Enable chain-of-thought reasoning with Qwen3 parser", + "args": [ + "--reasoning-parser", + "qwen3" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads.", + "args": [ + "--language-model-only" + ] + } + }, + "opt_in_features": [ + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 2, + "description": "Full precision BF16 \u2014 runs on any modern GPU" + } + }, + "compatible_strategies": [ + "single_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Qwen3.5-0.8B](https://huggingface.co/Qwen/Qwen3.5-0.8B) is the smallest\nmember of 
the Qwen3.5 family \u2014 same hybrid gated delta networks architecture\nand 262K context, at a size suited to edge devices or as a draft model for\nspeculative decoding with larger Qwen3.5 checkpoints.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.17.0\n- **Hardware:** any modern GPU (>=4 GB VRAM)\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend=auto\n```\n\n## Launching the Server\n\n```bash\nvllm serve Qwen/Qwen3.5-0.8B \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3\n```\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"Qwen/Qwen3.5-0.8B\",\n messages=[{\"role\": \"user\", \"content\": \"Hi!\"}],\n max_tokens=64,\n)\nprint(resp.choices[0].message.content)\n```\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen3.5-0.8B)\n- [Base checkpoint](https://huggingface.co/Qwen/Qwen3.5-0.8B-Base)\n" + } + }, + "Qwen/Qwen3.5-122B-A10B": { + "hf_id": "Qwen/Qwen3.5-122B-A10B", + "meta": { + "title": "Qwen3.5-122B-A10B", + "provider": "Qwen", + "description": "Mid-size Qwen3.5 multimodal MoE (122B total / 10B active) with gated delta networks, 256 experts, and 262K context", + "tasks": [ + "multimodal", + "text" + ], + "hardware": {} + }, + "model_info": { + "architecture": "moe", + "parameter_count": "122B" + }, + "recipe": { + "meta": { + "title": "Qwen3.5-122B-A10B", + "slug": "qwen3.5-122b-a10b", + "provider": "Qwen", + "description": "Mid-size Qwen3.5 multimodal MoE (122B total / 10B active) with gated delta networks, 256 experts, and 262K context", + "date_updated": "2026-04-22", + "difficulty": "intermediate", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Qwen3.5 mid-tier MoE \u2014 fits on 4x H200 BF16 or 2x H200 FP8", + "related_recipes": [ + "Qwen/Qwen3.5-397B-A17B", + "Qwen/Qwen3.5-35B-A3B" + ] + }, + "model": { + 
"model_id": "Qwen/Qwen3.5-122B-A10B", + "min_vllm_version": "0.17.0", + "architecture": "moe", + "parameter_count": "122B", + "active_parameters": "10B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Qwen3 Coder parser", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "Enable chain-of-thought reasoning with Qwen3 parser", + "args": [ + "--reasoning-parser", + "qwen3" + ] + }, + "spec_decoding": { + "description": "Multi-token prediction speculative decoding for lower latency", + "args": [ + "--speculative-config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":1}" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. 
Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "spec_decoding", + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 293, + "description": "Full precision BF16 \u2014 requires 4x H200 or equivalent" + }, + "fp8": { + "model_id": "Qwen/Qwen3.5-122B-A10B-FP8", + "precision": "fp8", + "vram_minimum_gb": 147, + "description": "Qwen official FP8 checkpoint \u2014 fits on 2x H200" + }, + "gptq_int4": { + "model_id": "Qwen/Qwen3.5-122B-A10B-GPTQ-Int4", + "precision": "int4", + "vram_minimum_gb": 74, + "description": "GPTQ Int4 checkpoint \u2014 single-GPU serving on 80GB hardware" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": {}, + "strategy_overrides": { + "pd_cluster": { + "prefill": { + "env": { + "VLLM_SSM_CONV_STATE_LAYOUT": "DS" + } + }, + "decode": { + "env": { + "VLLM_SSM_CONV_STATE_LAYOUT": "DS" + } + } + } + }, + "guide": "## Overview\n\n[Qwen3.5-122B-A10B](https://huggingface.co/Qwen/Qwen3.5-122B-A10B) is a mid-tier\nmember of the Qwen3.5 family, sharing the gated delta networks MoE architecture\nwith 122B total parameters and 10B activated per token (256 experts). 
It is\nmultimodal (vision + text) and natively supports 262K context.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.17.0\n- **Hardware (BF16):** 4x H200 or 8x H100\n- **Hardware (FP8):** 2x H200 or 4x H100\n- **Hardware (Int4):** single 80 GB GPU\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend=auto\n```\n\n## Launching the Server\n\n### BF16 on 4xH200 (TP4)\n\n```bash\nvllm serve Qwen/Qwen3.5-122B-A10B \\\n --tensor-parallel-size 4 \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3\n```\n\n### FP8 on 2xH200 (TP2)\n\n```bash\nvllm serve Qwen/Qwen3.5-122B-A10B-FP8 \\\n --tensor-parallel-size 2 \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3\n```\n\n### Throughput-focused (text-only, EP)\n\n```bash\nvllm serve Qwen/Qwen3.5-122B-A10B-FP8 \\\n -dp 4 --enable-expert-parallel \\\n --language-model-only \\\n --reasoning-parser qwen3 \\\n --enable-prefix-caching\n```\n\n### MTP speculative decoding\n\n```bash\nvllm serve Qwen/Qwen3.5-122B-A10B-FP8 \\\n --tensor-parallel-size 2 \\\n --speculative-config '{\"method\": \"mtp\", \"num_speculative_tokens\": 1}' \\\n --reasoning-parser qwen3\n```\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"Qwen/Qwen3.5-122B-A10B\",\n messages=[{\"role\": \"user\", \"content\": \"Summarize the gated delta networks paper.\"}],\n max_tokens=512,\n)\nprint(resp.choices[0].message.content)\n```\n\n## Troubleshooting\n\n- **CUDA graph / Mamba cache size error:** reduce `--max-cudagraph-capture-size`\n (default 512). 
See [vLLM PR #34571](https://github.com/vllm-project/vllm/pull/34571).\n- **Disable reasoning:** add `--default-chat-template-kwargs '{\"enable_thinking\": false}'`.\n- **Prefix Caching (Mamba):** currently experimental in \"align\" mode.\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen3.5-122B-A10B)\n- [FP8 checkpoint](https://huggingface.co/Qwen/Qwen3.5-122B-A10B-FP8)\n- [GPTQ-Int4 checkpoint](https://huggingface.co/Qwen/Qwen3.5-122B-A10B-GPTQ-Int4)\n- [Qwen3.5-397B-A17B recipe](../Qwen3.5-397B-A17B)\n" + } + }, + "Qwen/Qwen3.5-27B": { + "hf_id": "Qwen/Qwen3.5-27B", + "meta": { + "title": "Qwen3.5-27B", + "provider": "Qwen", + "description": "Qwen3.5 dense multimodal model (27B) with gated delta networks hybrid attention, MTP, and 262K context", + "tasks": [ + "multimodal", + "text" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "27B" + }, + "recipe": { + "meta": { + "title": "Qwen3.5-27B", + "slug": "qwen3.5-27b", + "provider": "Qwen", + "description": "Qwen3.5 dense multimodal model (27B) with gated delta networks hybrid attention, MTP, and 262K context", + "date_updated": "2026-04-22", + "difficulty": "beginner", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Qwen3.5 flagship dense \u2014 single-GPU FP8 or 2x GPU BF16", + "related_recipes": [ + "Qwen/Qwen3.5-397B-A17B", + "Qwen/Qwen3.5-35B-A3B", + "Qwen/Qwen3.5-9B" + ] + }, + "model": { + "model_id": "Qwen/Qwen3.5-27B", + "min_vllm_version": "0.17.0", + "architecture": "dense", + "parameter_count": "27B", + "active_parameters": "27B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Qwen3 Coder parser", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "Enable chain-of-thought reasoning with Qwen3 parser", + "args": 
[ + "--reasoning-parser", + "qwen3" + ] + }, + "spec_decoding": { + "description": "Multi-token prediction speculative decoding for lower latency", + "args": [ + "--speculative-config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":1}" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "spec_decoding", + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 65, + "description": "Full precision BF16 \u2014 fits on 1x H200 or 2x H100" + }, + "fp8": { + "model_id": "Qwen/Qwen3.5-27B-FP8", + "precision": "fp8", + "vram_minimum_gb": 33, + "description": "Qwen official FP8 checkpoint \u2014 single 40 GB GPU" + }, + "gptq_int4": { + "model_id": "Qwen/Qwen3.5-27B-GPTQ-Int4", + "precision": "int4", + "vram_minimum_gb": 17, + "description": "GPTQ Int4 checkpoint \u2014 fits on a single 24 GB GPU" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Qwen3.5-27B](https://huggingface.co/Qwen/Qwen3.5-27B) is the flagship dense\nmodel of the Qwen3.5 family. It uses the same gated delta networks hybrid\nattention as its MoE siblings, supports vision+text input, and natively serves\n262K context. 
MTP (multi-token prediction) is supported out of the box for\nlow-latency decoding.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.17.0\n- **Hardware (BF16):** 1x H200 or 2x H100\n- **Hardware (FP8):** single 40 GB GPU (H100/H200/L40S)\n- **Hardware (Int4):** single 24 GB GPU\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend=auto\n```\n\n## Launching the Server\n\n### Single-GPU FP8\n\n```bash\nvllm serve Qwen/Qwen3.5-27B-FP8 \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3\n```\n\n### BF16 on 2xH100 (TP2)\n\n```bash\nvllm serve Qwen/Qwen3.5-27B \\\n --tensor-parallel-size 2 \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3\n```\n\n### MTP speculative decoding\n\n```bash\nvllm serve Qwen/Qwen3.5-27B-FP8 \\\n --speculative-config '{\"method\": \"mtp\", \"num_speculative_tokens\": 1}' \\\n --reasoning-parser qwen3\n```\n\n### Text-only (skip vision encoder)\n\n```bash\nvllm serve Qwen/Qwen3.5-27B-FP8 \\\n --language-model-only \\\n --reasoning-parser qwen3 \\\n --enable-prefix-caching\n```\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"Qwen/Qwen3.5-27B\",\n messages=[{\"role\": \"user\", \"content\": \"Write a haiku about gated delta networks.\"}],\n max_tokens=256,\n)\nprint(resp.choices[0].message.content)\n```\n\n## Troubleshooting\n\n- **CUDA graph / Mamba cache size error:** reduce `--max-cudagraph-capture-size`\n (default 512). 
See [vLLM PR #34571](https://github.com/vllm-project/vllm/pull/34571).\n- **Disable reasoning:** add `--default-chat-template-kwargs '{\"enable_thinking\": false}'`.\n- **Prefix Caching (Mamba):** currently experimental in \"align\" mode.\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen3.5-27B)\n- [FP8 checkpoint](https://huggingface.co/Qwen/Qwen3.5-27B-FP8)\n- [GPTQ-Int4 checkpoint](https://huggingface.co/Qwen/Qwen3.5-27B-GPTQ-Int4)\n- [Qwen3.5-397B-A17B recipe](../Qwen3.5-397B-A17B)\n" + } + }, + "Qwen/Qwen3.5-2B": { + "hf_id": "Qwen/Qwen3.5-2B", + "meta": { + "title": "Qwen3.5-2B", + "provider": "Qwen", + "description": "Qwen3.5 mini dense multimodal model (2B) \u2014 edge / low-VRAM serving with 262K context", + "tasks": [ + "multimodal", + "text" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "2B" + }, + "recipe": { + "meta": { + "title": "Qwen3.5-2B", + "slug": "qwen3.5-2b", + "provider": "Qwen", + "description": "Qwen3.5 mini dense multimodal model (2B) \u2014 edge / low-VRAM serving with 262K context", + "date_updated": "2026-04-22", + "difficulty": "beginner", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Edge-scale Qwen3.5 dense \u2014 fits on 8 GB GPUs", + "related_recipes": [ + "Qwen/Qwen3.5-4B", + "Qwen/Qwen3.5-0.8B" + ] + }, + "model": { + "model_id": "Qwen/Qwen3.5-2B", + "min_vllm_version": "0.17.0", + "architecture": "dense", + "parameter_count": "2B", + "active_parameters": "2B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Qwen3 Coder parser", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "Enable chain-of-thought reasoning with Qwen3 parser", + "args": [ + "--reasoning-parser", + "qwen3" + ] + }, + "spec_decoding": { + "description": 
"Multi-token prediction speculative decoding for lower latency", + "args": [ + "--speculative-config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":1}" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads.", + "args": [ + "--language-model-only" + ] + } + }, + "opt_in_features": [ + "spec_decoding", + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 5, + "description": "Full precision BF16 \u2014 fits on an 8 GB GPU" + } + }, + "compatible_strategies": [ + "single_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Qwen3.5-2B](https://huggingface.co/Qwen/Qwen3.5-2B) is a miniature dense\nQwen3.5 model \u2014 the full gated delta networks architecture, vision encoder,\nand 262K context, in a form small enough for 8 GB consumer GPUs or edge\ninference.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.17.0\n- **Hardware:** single 8 GB GPU\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend=auto\n```\n\n## Launching the Server\n\n```bash\nvllm serve Qwen/Qwen3.5-2B \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3\n```\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"Qwen/Qwen3.5-2B\",\n messages=[{\"role\": \"user\", \"content\": \"Hi!\"}],\n max_tokens=64,\n)\nprint(resp.choices[0].message.content)\n```\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen3.5-2B)\n- [Base checkpoint](https://huggingface.co/Qwen/Qwen3.5-2B-Base)\n" + } + }, + "Qwen/Qwen3.5-35B-A3B": { + "hf_id": "Qwen/Qwen3.5-35B-A3B", + "meta": { + "title": "Qwen3.5-35B-A3B", + "provider": "Qwen", + "description": "Compact Qwen3.5 multimodal MoE (35B total / 3B active) with gated delta networks, 256 experts, and 262K context", + "tasks": [ + 
"multimodal", + "text" + ], + "hardware": {} + }, + "model_info": { + "architecture": "moe", + "parameter_count": "35B" + }, + "recipe": { + "meta": { + "title": "Qwen3.5-35B-A3B", + "slug": "qwen3.5-35b-a3b", + "provider": "Qwen", + "description": "Compact Qwen3.5 multimodal MoE (35B total / 3B active) with gated delta networks, 256 experts, and 262K context", + "date_updated": "2026-04-22", + "difficulty": "beginner", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Compact Qwen3.5 MoE \u2014 single-GPU FP8 or 2x GPU BF16 serving", + "related_recipes": [ + "Qwen/Qwen3.5-397B-A17B", + "Qwen/Qwen3.5-122B-A10B" + ] + }, + "model": { + "model_id": "Qwen/Qwen3.5-35B-A3B", + "min_vllm_version": "0.17.0", + "architecture": "moe", + "parameter_count": "35B", + "active_parameters": "3B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Qwen3 Coder parser", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "Enable chain-of-thought reasoning with Qwen3 parser", + "args": [ + "--reasoning-parser", + "qwen3" + ] + }, + "spec_decoding": { + "description": "Multi-token prediction speculative decoding for lower latency", + "args": [ + "--speculative-config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":1}" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. 
Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "spec_decoding", + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 84, + "description": "Full precision BF16 \u2014 fits on 1x H200 or 2x H100" + }, + "fp8": { + "model_id": "Qwen/Qwen3.5-35B-A3B-FP8", + "precision": "fp8", + "vram_minimum_gb": 42, + "description": "Qwen official FP8 checkpoint \u2014 single-GPU serving" + }, + "gptq_int4": { + "model_id": "Qwen/Qwen3.5-35B-A3B-GPTQ-Int4", + "precision": "int4", + "vram_minimum_gb": 21, + "description": "GPTQ Int4 checkpoint \u2014 fits on a single 24GB GPU" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_dep", + "multi_node_tep" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Qwen3.5-35B-A3B](https://huggingface.co/Qwen/Qwen3.5-35B-A3B) is the smallest\nMoE in the Qwen3.5 family, sharing the gated delta networks architecture with\n35B total parameters and 3B activated per token (256 experts). 
With FP8 weights\nit fits on a single 80 GB GPU and supports the full 262K context.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.17.0\n- **Hardware (BF16):** 1x H200 or 2x H100\n- **Hardware (FP8):** single H100/H200\n- **Hardware (Int4):** single 24 GB GPU\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend=auto\n```\n\n## Launching the Server\n\n### Single-GPU FP8\n\n```bash\nvllm serve Qwen/Qwen3.5-35B-A3B-FP8 \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3\n```\n\n### BF16 on 2xH100 (TP2)\n\n```bash\nvllm serve Qwen/Qwen3.5-35B-A3B \\\n --tensor-parallel-size 2 \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3\n```\n\n### MTP speculative decoding\n\n```bash\nvllm serve Qwen/Qwen3.5-35B-A3B-FP8 \\\n --speculative-config '{\"method\": \"mtp\", \"num_speculative_tokens\": 1}' \\\n --reasoning-parser qwen3\n```\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"Qwen/Qwen3.5-35B-A3B\",\n messages=[{\"role\": \"user\", \"content\": \"Explain gated delta networks in one paragraph.\"}],\n max_tokens=512,\n)\nprint(resp.choices[0].message.content)\n```\n\n## Troubleshooting\n\n- **CUDA graph / Mamba cache size error:** reduce `--max-cudagraph-capture-size`\n (default 512). 
See [vLLM PR #34571](https://github.com/vllm-project/vllm/pull/34571).\n- **Disable reasoning:** add `--default-chat-template-kwargs '{\"enable_thinking\": false}'`.\n- **Prefix Caching (Mamba):** currently experimental in \"align\" mode.\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen3.5-35B-A3B)\n- [FP8 checkpoint](https://huggingface.co/Qwen/Qwen3.5-35B-A3B-FP8)\n- [GPTQ-Int4 checkpoint](https://huggingface.co/Qwen/Qwen3.5-35B-A3B-GPTQ-Int4)\n- [Qwen3.5-397B-A17B recipe](../Qwen3.5-397B-A17B)\n" + } + }, + "Qwen/Qwen3.5-397B-A17B": { + "hf_id": "Qwen/Qwen3.5-397B-A17B", + "meta": { + "title": "Qwen3.5-397B", + "provider": "Qwen", + "description": "Multimodal MoE model with gated delta networks architecture, 397B total / 17B active parameters, up to 262K context", + "tasks": [ + "multimodal", + "text" + ], + "hardware": { + "h200": "verified", + "gb200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "397B" + }, + "recipe": { + "meta": { + "title": "Qwen3.5-397B", + "slug": "qwen3.5-397b", + "provider": "Qwen", + "description": "Multimodal MoE model with gated delta networks architecture, 397B total / 17B active parameters, up to 262K context", + "date_updated": "2026-04-16", + "difficulty": "intermediate", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Verified on 8x H200, 8x MI300X/MI355X, and GB200 nodes", + "related_recipes": [ + "Qwen/Qwen3.6-35B-A3B" + ], + "hardware": { + "h200": "verified", + "gb200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "Qwen/Qwen3.5-397B-A17B", + "min_vllm_version": "0.17.0", + "architecture": "moe", + "parameter_count": "397B", + "active_parameters": "17B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code" + ], + "base_env": { + "VLLM_DEEP_GEMM_WARMUP": "skip", + "VLLM_USE_DEEP_GEMM": 
"0", + "VLLM_FLASHINFER_MOE_BACKEND": "latency" + } + }, + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Qwen3 Coder parser", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "Enable chain-of-thought reasoning with Qwen3 parser", + "args": [ + "--reasoning-parser", + "qwen3" + ] + }, + "spec_decoding": { + "description": "Multi-token prediction speculative decoding for lower latency", + "args": [ + "--speculative_config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":3}" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "spec_decoding", + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 953, + "description": "Full precision BF16 \u2014 requires 8x H200 or equivalent" + }, + "nvfp4": { + "model_id": "nvidia/Qwen3.5-397B-A17B-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 238, + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + }, + "gptq_int4": { + "model_id": "Qwen/Qwen3.5-397B-A17B-GPTQ-Int4", + "precision": "int4", + "vram_minimum_gb": 239, + "description": "GPTQ Int4 checkpoint \u2014 halves VRAM vs FP8" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": {}, + "strategy_overrides": { + "pd_cluster": { + "prefill": { + "env": { + 
"VLLM_SSM_CONV_STATE_LAYOUT": "DS" + } + }, + "decode": { + "env": { + "VLLM_SSM_CONV_STATE_LAYOUT": "DS" + } + } + } + }, + "guide": "## Overview\n\n[Qwen3.5](https://huggingface.co/Qwen/Qwen3.5-397B-A17B) is a multimodal mixture-of-experts model featuring a gated delta networks architecture with 397B total parameters and 17B active parameters. This guide covers how to efficiently deploy and serve the model across different hardware configurations and workload profiles using vLLM.\n\n## Prerequisites\n\n### Pip Install\n\n#### NVIDIA\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend=auto\n```\n\n#### AMD\n> Note: The vLLM wheel for ROCm requires Python 3.12, ROCm 7.0, and glibc >= 2.35. If your environment does not meet these requirements, please use the Docker-based setup. Supported GPUs: MI300X, MI325X, MI355X.\n```bash\nuv venv --python 3.12\nsource .venv/bin/activate\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm\n```\n\n### Docker\n\n#### NVIDIA\n```bash\ndocker run --gpus all \\\n -p 8000:8000 \\\n --ipc=host \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai Qwen/Qwen3.5-397B-A17B \\\n --tensor-parallel-size 8 \\\n --reasoning-parser qwen3 \\\n --enable-prefix-caching\n```\nFor Blackwell GPUs, use `vllm/vllm-openai:cu130-nightly`.\n\n#### AMD\n```bash\ndocker run --device=/dev/kfd --device=/dev/dri \\\n --security-opt seccomp=unconfined \\\n --group-add video \\\n --ipc=host \\\n -p 8000:8000 \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai-rocm:latest \\\n Qwen/Qwen3.5-397B-A17B-FP8 \\\n --tensor-parallel-size 8 \\\n --reasoning-parser qwen3 \\\n --enable-prefix-caching\n```\n\n## Deployment Configurations\n\nThe configurations below have been verified on 8x H200 GPUs and 8x MI300X/MI355X GPUs. 
We recommend using the official FP8 checkpoint [Qwen/Qwen3.5-397B-A17B-FP8](https://huggingface.co/Qwen/Qwen3.5-397B-A17B-FP8) for optimal serving efficiency.\n\n### Throughput-Focused (Text-Only)\n\nFor maximum text throughput under high concurrency, use `--language-model-only` to skip loading the vision encoder and free up memory for KV cache, and enable Expert Parallelism.\n\n```bash\nvllm serve Qwen/Qwen3.5-397B-A17B-FP8 \\\n -dp 8 \\\n --enable-expert-parallel \\\n --language-model-only \\\n --reasoning-parser qwen3 \\\n --enable-prefix-caching\n```\n\n### Throughput-Focused (Multimodal)\n\nFor multimodal workloads, use `--mm-encoder-tp-mode data` for data-parallel vision encoding and `--mm-processor-cache-type shm` for shared-memory caching of preprocessed multimodal inputs.\n\n```bash\nvllm serve Qwen/Qwen3.5-397B-A17B-FP8 \\\n -dp 8 \\\n --enable-expert-parallel \\\n --mm-encoder-tp-mode data \\\n --mm-processor-cache-type shm \\\n --reasoning-parser qwen3 \\\n --enable-prefix-caching\n```\n\nTo enable tool calling, add `--enable-auto-tool-choice --tool-call-parser qwen3_coder` to the serve command.\n\n### Latency-Focused\n\nFor latency-sensitive workloads at low concurrency, enable MTP-1 speculative decoding and disable prefix caching. 
MTP-1 reduces time-per-output-token (TPOT) with a high acceptance rate, at the cost of lower throughput under load.\n\n> Note: MTP-1 speculative decoding for AMD GPUs is under development.\n\n```bash\nvllm serve Qwen/Qwen3.5-397B-A17B-FP8 \\\n --tensor-parallel-size 8 \\\n --speculative-config '{\"method\": \"mtp\", \"num_speculative_tokens\": 1}' \\\n --reasoning-parser qwen3\n```\n\n### GB200 Deployment\n\nWe recommend using the NVFP4 checkpoint [nvidia/Qwen3.5-397B-A17B-NVFP4](https://huggingface.co/nvidia/Qwen3.5-397B-A17B-NVFP4) for optimal serving efficiency on GB200 nodes.\n\n```bash\nvllm serve nvidia/Qwen3.5-397B-A17B-NVFP4 \\\n -dp 4 \\\n --enable-expert-parallel \\\n --language-model-only \\\n --reasoning-parser qwen3 \\\n --enable-prefix-caching\n```\n\n### MI355X Deployment\n\n```bash\nvllm serve Qwen/Qwen3.5-397B-A17B-FP8 \\\n -tp 2 \\\n --enable-expert-parallel \\\n --language-model-only \\\n --reasoning-parser qwen3 \\\n --enable-prefix-caching\n```\n\n## Processing Ultra-Long Texts\n\nQwen3.5-397B-A17B natively supports `262,144` tokens. For longer inputs, apply\nYaRN RoPE scaling via `--hf-overrides` and raise `--max-model-len`. 
Pick\n`factor` to match your real workload \u2014 `2.0` covers ~524K, `4.0` covers\n~1M \u2014 since YaRN at higher factors degrades short-context quality.\n\n```bash\nVLLM_ALLOW_LONG_MAX_MODEL_LEN=1 vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \\\n --tensor-parallel-size 8 \\\n --max-model-len 1010000 \\\n --reasoning-parser qwen3 \\\n --hf-overrides '{\"text_config\": {\"rope_parameters\": {\"mrope_interleaved\": true, \"mrope_section\": [11, 11, 10], \"rope_type\": \"yarn\", \"rope_theta\": 10000000, \"partial_rotary_factor\": 0.25, \"factor\": 4.0, \"original_max_position_embeddings\": 262144}}}'\n```\n\nSee the [model card](https://huggingface.co/Qwen/Qwen3.5-397B-A17B#processing-ultra-long-texts) for the full parameter reference.\n\n## Client Usage\n\n```python\nimport time\nfrom openai import OpenAI\n\nclient = OpenAI(\n api_key=\"EMPTY\",\n base_url=\"http://localhost:8000/v1\",\n timeout=3600\n)\n\nmessages = [\n {\n \"role\": \"user\",\n \"content\": [\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": \"https://ofasys-multimodal-wlcb-3-toshanghai.oss-accelerate.aliyuncs.com/wpf272043/keepme/image/receipt.png\"\n }\n },\n {\n \"type\": \"text\",\n \"text\": \"Read all the text in the image.\"\n }\n ]\n }\n]\n\nstart = time.time()\nresponse = client.chat.completions.create(\n model=\"Qwen/Qwen3.5-397B-A17B\",\n messages=messages,\n max_tokens=2048\n)\nprint(f\"Response costs: {time.time() - start:.2f}s\")\nprint(f\"Generated text: {response.choices[0].message.content}\")\n```\n\n## Troubleshooting\n\n**CUDA graph / Mamba cache size error**\n\nYou may encounter:\n```\nassert num_cache_lines >= batch\n```\nThis occurs because the CUDA graph capture size is larger than the Mamba cache size. Reduce `--max-cudagraph-capture-size` (default is 512). 
See https://github.com/vllm-project/vllm/pull/34571 for details.\n\n**Configuration tips**\n\n- **Disable Reasoning**: Add `--reasoning-parser qwen3 --default-chat-template-kwargs '{\"enable_thinking\": false}'` to disable reasoning mode via command-line parameters.\n- **Prefix Caching**: Prefix caching for Mamba cache \"align\" mode is currently experimental.\n- **Multi-token Prediction**: MTP-1 reduces per-token latency but degrades throughput under high concurrency because speculative tokens consume KV cache capacity. Adjust `num_speculative_tokens` (1-5) based on your use case.\n- **Encoder Data Parallelism**: `--mm-encoder-tp-mode data` deploys the vision encoder in a data-parallel fashion. This consumes additional memory and may require adjustment of `--gpu-memory-utilization`.\n\n## References\n\n- Model card: https://huggingface.co/Qwen/Qwen3.5-397B-A17B\n- FP8 checkpoint: https://huggingface.co/Qwen/Qwen3.5-397B-A17B-FP8\n- NVFP4 checkpoint: https://huggingface.co/nvidia/Qwen3.5-397B-A17B-NVFP4\n- vLLM documentation: https://docs.vllm.ai/\n" + } + }, + "Qwen/Qwen3.5-4B": { + "hf_id": "Qwen/Qwen3.5-4B", + "meta": { + "title": "Qwen3.5-4B", + "provider": "Qwen", + "description": "Qwen3.5 compact dense multimodal model (4B) \u2014 fits on 16 GB consumer GPUs with full 262K context", + "tasks": [ + "multimodal", + "text" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "4B" + }, + "recipe": { + "meta": { + "title": "Qwen3.5-4B", + "slug": "qwen3.5-4b", + "provider": "Qwen", + "description": "Qwen3.5 compact dense multimodal model (4B) \u2014 fits on 16 GB consumer GPUs with full 262K context", + "date_updated": "2026-04-22", + "difficulty": "beginner", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Consumer-GPU-friendly Qwen3.5 dense with MTP support", + "related_recipes": [ + "Qwen/Qwen3.5-9B", + "Qwen/Qwen3.5-2B" + ] + }, + "model": { + "model_id": "Qwen/Qwen3.5-4B", + "min_vllm_version": 
"0.17.0", + "architecture": "dense", + "parameter_count": "4B", + "active_parameters": "4B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Qwen3 Coder parser", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "Enable chain-of-thought reasoning with Qwen3 parser", + "args": [ + "--reasoning-parser", + "qwen3" + ] + }, + "spec_decoding": { + "description": "Multi-token prediction speculative decoding for lower latency", + "args": [ + "--speculative-config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":1}" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads.", + "args": [ + "--language-model-only" + ] + } + }, + "opt_in_features": [ + "spec_decoding", + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 10, + "description": "Full precision BF16 \u2014 fits on a single 16 GB GPU" + } + }, + "compatible_strategies": [ + "single_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Qwen3.5-4B](https://huggingface.co/Qwen/Qwen3.5-4B) is the compact dense\nentry in the Qwen3.5 family \u2014 same gated delta networks architecture, vision\nencoder, 262K context, and MTP decoding as the larger siblings, sized for\n16 GB consumer GPUs.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.17.0\n- **Hardware:** single 16 GB GPU (RTX 4080 / L4 / A10 / T4-24GB)\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend=auto\n```\n\n## Launching the Server\n\n```bash\nvllm serve Qwen/Qwen3.5-4B \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3\n```\n\n### MTP speculative decoding\n\n```bash\nvllm serve Qwen/Qwen3.5-4B \\\n --speculative-config '{\"method\": \"mtp\", 
\"num_speculative_tokens\": 1}' \\\n --reasoning-parser qwen3\n```\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"Qwen/Qwen3.5-4B\",\n messages=[{\"role\": \"user\", \"content\": \"Hello!\"}],\n max_tokens=128,\n)\nprint(resp.choices[0].message.content)\n```\n\n## Troubleshooting\n\n- **CUDA graph / Mamba cache size error:** reduce `--max-cudagraph-capture-size`.\n- **Disable reasoning:** add `--default-chat-template-kwargs '{\"enable_thinking\": false}'`.\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen3.5-4B)\n- [Base checkpoint](https://huggingface.co/Qwen/Qwen3.5-4B-Base)\n" + } + }, + "Qwen/Qwen3.5-9B": { + "hf_id": "Qwen/Qwen3.5-9B", + "meta": { + "title": "Qwen3.5-9B", + "provider": "Qwen", + "description": "Qwen3.5 dense multimodal model (9B) with gated delta networks hybrid attention, MTP, and 262K context", + "tasks": [ + "multimodal", + "text" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "9B" + }, + "recipe": { + "meta": { + "title": "Qwen3.5-9B", + "slug": "qwen3.5-9b", + "provider": "Qwen", + "description": "Qwen3.5 dense multimodal model (9B) with gated delta networks hybrid attention, MTP, and 262K context", + "date_updated": "2026-04-22", + "difficulty": "beginner", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Single-GPU Qwen3.5 dense with MTP-accelerated decoding", + "related_recipes": [ + "Qwen/Qwen3.5-27B", + "Qwen/Qwen3.5-4B" + ] + }, + "model": { + "model_id": "Qwen/Qwen3.5-9B", + "min_vllm_version": "0.17.0", + "architecture": "dense", + "parameter_count": "9B", + "active_parameters": "9B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Qwen3 Coder parser", + "args": [ + 
"--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "Enable chain-of-thought reasoning with Qwen3 parser", + "args": [ + "--reasoning-parser", + "qwen3" + ] + }, + "spec_decoding": { + "description": "Multi-token prediction speculative decoding for lower latency", + "args": [ + "--speculative-config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":1}" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache.", + "args": [ + "--language-model-only" + ] + } + }, + "opt_in_features": [ + "spec_decoding", + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 22, + "description": "Full precision BF16 \u2014 single 24 GB GPU" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Qwen3.5-9B](https://huggingface.co/Qwen/Qwen3.5-9B) is a dense multimodal\nmodel from the Qwen3.5 family \u2014 same gated delta networks hybrid attention,\nvision encoder, 262K context, and MTP support as its larger siblings, but\nsized to fit comfortably on a single 24 GB GPU.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.17.0\n- **Hardware:** single 24 GB GPU (RTX 4090 / L4 / A10G / H100)\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend=auto\n```\n\n## Launching the Server\n\n### Single-GPU BF16\n\n```bash\nvllm serve Qwen/Qwen3.5-9B \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3\n```\n\n### MTP speculative decoding\n\n```bash\nvllm serve Qwen/Qwen3.5-9B \\\n --speculative-config '{\"method\": \"mtp\", \"num_speculative_tokens\": 1}' \\\n --reasoning-parser qwen3\n```\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = 
client.chat.completions.create(\n model=\"Qwen/Qwen3.5-9B\",\n messages=[{\"role\": \"user\", \"content\": \"Hello!\"}],\n max_tokens=128,\n)\nprint(resp.choices[0].message.content)\n```\n\n## Troubleshooting\n\n- **CUDA graph / Mamba cache size error:** reduce `--max-cudagraph-capture-size`\n (default 512). See [vLLM PR #34571](https://github.com/vllm-project/vllm/pull/34571).\n- **Disable reasoning:** add `--default-chat-template-kwargs '{\"enable_thinking\": false}'`.\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen3.5-9B)\n- [Base checkpoint](https://huggingface.co/Qwen/Qwen3.5-9B-Base)\n" + } + }, + "Qwen/Qwen3.6-27B": { + "hf_id": "Qwen/Qwen3.6-27B", + "meta": { + "title": "Qwen3.6-27B", + "provider": "Qwen", + "description": "Qwen3.6 dense multimodal model (27B) with gated delta networks hybrid attention, MTP, and 262K context", + "tasks": [ + "multimodal", + "text" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "27B" + }, + "recipe": { + "meta": { + "title": "Qwen3.6-27B", + "slug": "Qwen3.6-27b", + "provider": "Qwen", + "description": "Qwen3.6 dense multimodal model (27B) with gated delta networks hybrid attention, MTP, and 262K context", + "date_updated": "2026-04-22", + "difficulty": "beginner", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Qwen3.6 flagship dense \u2014 single-GPU FP8 or 2x GPU BF16", + "related_recipes": [ + "Qwen/Qwen3.6-35B-A3B" + ] + }, + "model": { + "model_id": "Qwen/Qwen3.6-27B", + "min_vllm_version": "0.17.0", + "architecture": "dense", + "parameter_count": "27B", + "active_parameters": "27B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Qwen3 Coder parser", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "Enable chain-of-thought 
reasoning with Qwen3 parser", + "args": [ + "--reasoning-parser", + "qwen3" + ] + }, + "spec_decoding": { + "description": "Multi-token prediction speculative decoding for lower latency", + "args": [ + "--speculative-config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":1}" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "spec_decoding", + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 65, + "description": "Full precision BF16 \u2014 fits on 1x H200 or 2x H100" + }, + "fp8": { + "model_id": "Qwen/Qwen3.6-27B-FP8", + "precision": "fp8", + "vram_minimum_gb": 33, + "description": "Qwen official FP8 checkpoint \u2014 single 40 GB GPU" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Qwen3.6-27B](https://huggingface.co/Qwen/Qwen3.6-27B) is the flagship dense\nmodel of the Qwen3.6 family. It uses the same gated delta networks hybrid\nattention as its MoE siblings, supports vision+text input, and natively serves\n262K context. 
MTP (multi-token prediction) is supported out of the box for\nlow-latency decoding.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.17.0\n- **Hardware (BF16):** 1x H200 or 2x H100\n- **Hardware (FP8):** single 40 GB GPU (H100/H200/L40S)\n- **Hardware (Int4):** single 24 GB GPU\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend=auto\n```\n\n## Launching the Server\n\n### Single-GPU FP8\n\n```bash\nvllm serve Qwen/Qwen3.6-27B-FP8 \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3\n```\n\n### BF16 on 2xH100 (TP2)\n\n```bash\nvllm serve Qwen/Qwen3.6-27B \\\n --tensor-parallel-size 2 \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3\n```\n\n### MTP speculative decoding\n\n```bash\nvllm serve Qwen/Qwen3.6-27B-FP8 \\\n --speculative-config '{\"method\": \"mtp\", \"num_speculative_tokens\": 1}' \\\n --reasoning-parser qwen3\n```\n\n### Text-only (skip vision encoder)\n\n```bash\nvllm serve Qwen/Qwen3.6-27B-FP8 \\\n --language-model-only \\\n --reasoning-parser qwen3 \\\n --enable-prefix-caching\n```\n\n## Processing Ultra-Long Texts\n\nQwen3.6-27B natively supports `262,144` tokens. For longer inputs, apply\nYaRN RoPE scaling via `--hf-overrides` and raise `--max-model-len`. 
Pick\n`factor` to match your real workload \u2014 `2.0` covers ~524K, `4.0` covers\n~1M \u2014 since YaRN at higher factors degrades short-context quality.\n\n```bash\nVLLM_ALLOW_LONG_MAX_MODEL_LEN=1 vllm serve Qwen/Qwen3.6-27B-FP8 \\\n --tensor-parallel-size 2 \\\n --max-model-len 1010000 \\\n --reasoning-parser qwen3 \\\n --hf-overrides '{\"text_config\": {\"rope_parameters\": {\"mrope_interleaved\": true, \"mrope_section\": [11, 11, 10], \"rope_type\": \"yarn\", \"rope_theta\": 10000000, \"partial_rotary_factor\": 0.25, \"factor\": 4.0, \"original_max_position_embeddings\": 262144}}}'\n```\n\nSee the [model card](https://huggingface.co/Qwen/Qwen3.6-27B#processing-ultra-long-texts) for the full parameter reference.\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"Qwen/Qwen3.6-27B\",\n messages=[{\"role\": \"user\", \"content\": \"Write a haiku about gated delta networks.\"}],\n max_tokens=256,\n)\nprint(resp.choices[0].message.content)\n```\n\n## Troubleshooting\n\n- **CUDA graph / Mamba cache size error:** reduce `--max-cudagraph-capture-size`\n (default 512). 
See [vLLM PR #34571](https://github.com/vllm-project/vllm/pull/34571).\n- **Disable reasoning:** add `--default-chat-template-kwargs '{\"enable_thinking\": false}'`.\n- **Prefix Caching (Mamba):** currently experimental in \"align\" mode.\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen3.6-27B)\n- [FP8 checkpoint](https://huggingface.co/Qwen/Qwen3.6-27B-FP8)\n- [GPTQ-Int4 checkpoint](https://huggingface.co/Qwen/Qwen3.6-27B-GPTQ-Int4)\n- [Qwen3.6-397B-A17B recipe](../Qwen3.6-397B-A17B)\n" + } + }, + "Qwen/Qwen3.6-35B-A3B": { + "hf_id": "Qwen/Qwen3.6-35B-A3B", + "meta": { + "title": "Qwen3.6-35B-A3B", + "provider": "Qwen", + "description": "Smaller Qwen3.6 multimodal MoE model (35B total / 3B active) with 256 experts (8 routed + 1 shared), gated delta networks architecture, and 262K context", + "tasks": [ + "multimodal", + "text" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "35B" + }, + "recipe": { + "meta": { + "title": "Qwen3.6-35B-A3B", + "slug": "qwen3.6-35b-a3b", + "provider": "Qwen", + "description": "Smaller Qwen3.6 multimodal MoE model (35B total / 3B active) with 256 experts (8 routed + 1 shared), gated delta networks architecture, and 262K context", + "date_updated": "2026-04-18", + "difficulty": "beginner", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Compact Qwen3.6 MoE with 3B active parameters \u2014 single-GPU FP8 or 2-4 GPU BF16 serving", + "related_recipes": [ + "Qwen/Qwen3.5-397B-A17B" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "Qwen/Qwen3.6-35B-A3B", + "min_vllm_version": "0.17.0", + "architecture": "moe", + "parameter_count": "35B", + "active_parameters": "3B", + "context_length": 262144, + "base_args": [ + 
"--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Qwen3 Coder parser", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "Enable chain-of-thought reasoning with Qwen3 parser", + "args": [ + "--reasoning-parser", + "qwen3" + ] + }, + "spec_decoding": { + "description": "Multi-token prediction speculative decoding for lower latency", + "args": [ + "--speculative-config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":2}" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "spec_decoding", + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 84, + "description": "Full precision BF16 \u2014 fits on 1x H200 or 2x H100" + }, + "fp8": { + "model_id": "Qwen/Qwen3.6-35B-A3B-FP8", + "precision": "fp8", + "vram_minimum_gb": 42, + "description": "Qwen official FP8 checkpoint \u2014 single-GPU serving" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep" + ], + "hardware_overrides": { + "amd": { + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Qwen3.6-35B-A3B](https://huggingface.co/Qwen/Qwen3.6-35B-A3B) is the smaller sibling\nof Qwen3.5, sharing the same gated-delta-networks MoE architecture but with 35B total\nparameters and 3B activated 
(256 experts, 8 routed + 1 shared). With FP8 weights it\nfits comfortably on a single 80 GB GPU and supports the full 262K context.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.17.0\n- **Hardware (BF16):** 1x H200 or 2x H100\n- **Hardware (FP8):** single H100/H200 or 1x MI300X/MI325X/MI355X\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend=auto\n```\n\n## Launching the Server\n\n### Single-GPU FP8\n\n```bash\nvllm serve Qwen/Qwen3.6-35B-A3B-FP8 \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3\n```\n\n### BF16 on 2xH200 (TP2)\n\n```bash\nvllm serve Qwen/Qwen3.6-35B-A3B \\\n --tensor-parallel-size 2 \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3\n```\n\n### MTP speculative decoding\n\n```bash\nvllm serve Qwen/Qwen3.6-35B-A3B \\\n --tensor-parallel-size 2 \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3 \\\n --speculative-config '{\"method\": \"mtp\", \"num_speculative_tokens\": 2}'\n```\n\n### AMD (MI300X / MI325X / MI355X)\n\n```bash\nVLLM_ROCM_USE_AITER=1 vllm serve Qwen/Qwen3.6-35B-A3B-FP8 \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3 \\\n --trust-remote-code\n```\n\n## Processing Ultra-Long Texts\n\nQwen3.6-35B-A3B natively supports `262,144` tokens. For longer inputs, apply\nYaRN RoPE scaling via `--hf-overrides` and raise `--max-model-len`. 
Pick\n`factor` to match your real workload \u2014 `2.0` covers ~524K, `4.0` covers\n~1M \u2014 since YaRN at higher factors degrades short-context quality.\n\n```bash\nVLLM_ALLOW_LONG_MAX_MODEL_LEN=1 vllm serve Qwen/Qwen3.6-35B-A3B-FP8 \\\n --tensor-parallel-size 2 \\\n --max-model-len 1010000 \\\n --reasoning-parser qwen3 \\\n --hf-overrides '{\"text_config\": {\"rope_parameters\": {\"mrope_interleaved\": true, \"mrope_section\": [11, 11, 10], \"rope_type\": \"yarn\", \"rope_theta\": 10000000, \"partial_rotary_factor\": 0.25, \"factor\": 4.0, \"original_max_position_embeddings\": 262144}}}'\n```\n\nSee the [model card](https://huggingface.co/Qwen/Qwen3.6-35B-A3B#processing-ultra-long-texts) for the full parameter reference.\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"Qwen/Qwen3.6-35B-A3B\",\n messages=[{\"role\": \"user\", \"content\": \"Explain gated delta networks in one paragraph.\"}],\n max_tokens=512,\n)\nprint(resp.choices[0].message.content)\n```\n\n## Troubleshooting\n\n- **CUDA graph / Mamba cache size error:** reduce `--max-cudagraph-capture-size`\n (default 512). 
See [vLLM PR #34571](https://github.com/vllm-project/vllm/pull/34571).\n- **Reasoning disable:** add `--default-chat-template-kwargs '{\"enable_thinking\": false}'`.\n- **Prefix Caching (Mamba):** currently experimental in \"align\" mode.\n\n## References\n\n- [Qwen3.6-35B-A3B on Hugging Face](https://huggingface.co/Qwen/Qwen3.6-35B-A3B)\n- [FP8 checkpoint](https://huggingface.co/Qwen/Qwen3.6-35B-A3B-FP8)\n- [Qwen3.5 recipe (sibling 397B-A17B flagship)](../Qwen3.5-397B-A17B)\n" + } + }, + "Qwen/Qwen3Guard-Gen-8B": { + "hf_id": "Qwen/Qwen3Guard-Gen-8B", + "meta": { + "title": "Qwen3Guard-Gen-8B", + "provider": "Qwen", + "description": "Lightweight text-only guardrail/safety classifier model in the Qwen3Guard family.", + "tasks": [ + "text" + ], + "hardware": { + "mi300x": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "8B" + }, + "recipe": { + "meta": { + "title": "Qwen3Guard-Gen-8B", + "slug": "qwen3guard-gen-8b", + "provider": "Qwen", + "description": "Lightweight text-only guardrail/safety classifier model in the Qwen3Guard family.", + "date_updated": "2026-04-30", + "difficulty": "beginner", + "tasks": [ + "text" + ], + "performance_headline": "Runs on a single GPU; serves safety classifications over OpenAI-compatible API", + "related_recipes": [], + "hardware": { + "mi300x": "verified" + } + }, + "model": { + "model_id": "Qwen/Qwen3Guard-Gen-8B", + "min_vllm_version": "0.10.0", + "architecture": "dense", + "parameter_count": "8B", + "active_parameters": "8B", + "context_length": 32768, + "base_args": [], + "base_env": {} + }, + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 19, + "description": "Full precision BF16 \u2014 single GPU with >=20 GB VRAM" + }, + "small_4b": { + "model_id": "Qwen/Qwen3Guard-Gen-4B", + "precision": "bf16", + "vram_minimum_gb": 10, + "description": "4B variant for more constrained deployments" + }, + "tiny_0_6b": { + "model_id": 
"Qwen/Qwen3Guard-Gen-0.6B", + "precision": "bf16", + "vram_minimum_gb": 4, + "description": "0.6B variant for edge / ultra-low-cost serving" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n**Qwen3Guard-Gen** is a lightweight text-only guardrail model. This guide describes how to run the 8B variant \u2014 as well as the 4B and 0.6B variants \u2014 on GPU using vLLM.\n\n## Prerequisites\n\n### CUDA\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n### ROCm\n> Note: The vLLM wheel for ROCm requires Python 3.12, ROCm 7.0, and glibc >= 2.35.\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/\n```\n\n## Deployment Configurations\n\n### Single GPU (CUDA)\n```bash\nvllm serve Qwen/Qwen3Guard-Gen-8B \\\n --host 0.0.0.0 \\\n --max-model-len 32768\n```\n\n### Single GPU (ROCm)\n```bash\nexport VLLM_ROCM_USE_AITER=1\nvllm serve Qwen/Qwen3Guard-Gen-8B \\\n --host 0.0.0.0 \\\n --max-model-len 32768\n```\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\", timeout=3600)\n\nmessages = [{\"role\": \"user\", \"content\": \"Tell me how to make a bomb.\"}]\n\nresponse = client.chat.completions.create(\n model=\"Qwen/Qwen3Guard-Gen-8B\",\n messages=messages,\n temperature=0.0,\n)\nprint(\"Generated text:\", response.choices[0].message.content)\n# Safety: Unsafe\n# Categories: Violent\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --model Qwen/Qwen3Guard-Gen-8B \\\n --dataset-name random \\\n --random-input-len 2000 \\\n --random-output-len 512 \\\n --num-prompts 100\n```\n\n## Available Variants\n\nThe Qwen3Guard-Gen series includes multiple model sizes, all compatible with the same vLLM serving commands:\n- **Qwen/Qwen3Guard-Gen-8B**\n- **Qwen/Qwen3Guard-Gen-4B**\n- 
**Qwen/Qwen3Guard-Gen-0.6B**\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen3Guard-Gen-8B)\n- [Qwen3Guard-Gen-4B](https://huggingface.co/Qwen/Qwen3Guard-Gen-4B)\n- [Qwen3Guard-Gen-0.6B](https://huggingface.co/Qwen/Qwen3Guard-Gen-0.6B)\n" + } + }, + "Wan-AI/Wan2.2-T2V-A14B-Diffusers": { + "hf_id": "Wan-AI/Wan2.2-T2V-A14B-Diffusers", + "meta": { + "title": "Wan2.2", + "provider": "Wan (Alibaba)", + "description": "Wan2.2 video generation models \u2014 T2V/I2V MoE (14B active) and unified TI2V (5B dense), served via vLLM-Omni", + "tasks": [ + "omni" + ], + "hardware": {} + }, + "model_info": { + "architecture": "moe", + "parameter_count": "28B" + }, + "recipe": { + "meta": { + "title": "Wan2.2", + "slug": "wan2.2", + "provider": "Wan (Alibaba)", + "description": "Wan2.2 video generation models \u2014 T2V/I2V MoE (14B active) and unified TI2V (5B dense), served via vLLM-Omni", + "date_updated": "2026-04-27", + "difficulty": "intermediate", + "tasks": [ + "omni" + ], + "related_recipes": [] + }, + "model": { + "model_id": "Wan-AI/Wan2.2-T2V-A14B-Diffusers", + "min_vllm_version": "0.12.0", + "architecture": "moe", + "parameter_count": "28B", + "active_parameters": "14B", + "context_length": 0, + "base_args": [], + "base_env": {} + }, + "omni": { + "tasks": [ + { + "id": "t2v", + "vram_minimum_gb": 152, + "description": "T2V MoE \u2014 14B active parameters" + }, + { + "id": "i2v", + "model_id": "Wan-AI/Wan2.2-I2V-A14B-Diffusers", + "vram_minimum_gb": 40, + "description": "I2V MoE \u2014 14B active parameters" + }, + { + "id": "ti2v", + "model_id": "Wan-AI/Wan2.2-TI2V-5B-Diffusers", + "vram_minimum_gb": 20, + "description": "Unified Text+Image-to-Video \u2014 dense 5B" + } + ] + }, + "dependencies": [ + { + "note": "Pin vllm==0.12.0 for Wan2.2", + "command": "uv pip install vllm==0.12.0" + }, + { + "note": "vllm-omni pinned to a specific commit that includes Wan2.2 text-to-video support", + "command": "uv pip install 
git+https://github.com/vllm-project/vllm-omni.git@ef01223c42be10ee260b9f6e5ec31894cd09d86e" + } + ], + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 152, + "description": "BF16 \u2014 variants moved to omni.tasks (T2V / I2V / TI2V each pick a different checkpoint)" + } + }, + "compatible_strategies": [], + "hardware_overrides": { + "amd": { + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "SAFETENSORS_FAST_GPU": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nWan2.2 is a video generation family served via **vLLM-Omni** with optional **Cache-DiT**\nacceleration:\n\n- `Wan-AI/Wan2.2-T2V-A14B-Diffusers` \u2014 Text-to-Video (MoE, 14B active)\n- `Wan-AI/Wan2.2-I2V-A14B-Diffusers` \u2014 Image-to-Video (MoE, 14B active)\n- `Wan-AI/Wan2.2-TI2V-5B-Diffusers` \u2014 Unified Text+Image-to-Video (dense 5B)\n\n## Prerequisites\n\n- vLLM-Omni on top of vLLM 0.12.0\n- diffusers (bundled in vLLM-Omni CLI scripts)\n\n## Installation\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install vllm==0.12.0\nuv pip install git+https://github.com/vllm-project/vllm-omni.git@ef01223c42be10ee260b9f6e5ec31894cd09d86e\n```\n\n### AMD ROCm (MI300X/MI325X/MI355X)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install vllm==0.12.0 --extra-index-url https://wheels.vllm.ai/rocm/\nuv pip install git+https://github.com/vllm-project/vllm-omni.git@ef01223c42be10ee260b9f6e5ec31894cd09d86e\n```\n\n> \u26a0\ufe0f The vLLM ROCm wheel requires Python 3.12, ROCm 7.0, and glibc >= 2.35.\n\n## Text-to-Video (T2V)\n\n```python\nfrom vllm_omni.entrypoints.omni import Omni\n\nomni = Omni(model=\"Wan-AI/Wan2.2-T2V-A14B-Diffusers\")\nframes = omni.generate(\n \"Two anthropomorphic cats in comfy boxing gear fight on a spotlighted stage.\",\n height=720, width=1280,\n num_frames=81,\n num_inference_steps=40,\n guidance_scale=4.0,\n)\n```\n\nCLI:\n\n```bash\npython 
examples/offline_inference/text_to_video/text_to_video.py \\\n --model Wan-AI/Wan2.2-T2V-A14B-Diffusers \\\n --prompt \"A serene lakeside sunrise with mist over the water.\" \\\n --height 720 --width 1280 \\\n --num_frames 81 --num_inference_steps 40 \\\n --guidance_scale 4.0 --fps 24 \\\n --output t2v_output.mp4\n```\n\n### Running T2V on MI300X/MI325X/MI355X GPUs\n\n```bash\nVLLM_ROCM_USE_AITER=1 \\\nSAFETENSORS_FAST_GPU=1 \\\npython examples/offline_inference/text_to_video/text_to_video.py \\\n --model Wan-AI/Wan2.2-T2V-A14B-Diffusers \\\n --prompt \"A serene lakeside sunrise with mist over the water.\" \\\n --height 720 --width 1280 \\\n --num_frames 81 --num_inference_steps 40 \\\n --guidance_scale 4.0 --fps 24 \\\n --output t2v_output.mp4\n```\n\n## Image-to-Video (I2V)\n\n```python\nimport PIL.Image\nfrom vllm_omni.entrypoints.omni import Omni\n\nomni = Omni(model=\"Wan-AI/Wan2.2-I2V-A14B-Diffusers\")\nimage = PIL.Image.open(\"input.jpg\").convert(\"RGB\")\n\nframes = omni.generate(\n \"A cat playing with yarn\",\n pil_image=image,\n height=480, width=832,\n num_frames=81,\n num_inference_steps=50,\n guidance_scale=5.0,\n)\n```\n\n### Running I2V on MI300X/MI325X/MI355X GPUs\n\n```bash\nVLLM_ROCM_USE_AITER=1 \\\nSAFETENSORS_FAST_GPU=1 \\\npython examples/offline_inference/image_to_video/image_to_video.py \\\n --model Wan-AI/Wan2.2-I2V-A14B-Diffusers \\\n --image input.jpg --prompt \"A cat playing with yarn\" \\\n --num_frames 81 --num_inference_steps 50 \\\n --guidance_scale 5.0 --fps 16 --output i2v_output.mp4\n```\n\nTI2V CLI:\n\n```bash\npython examples/offline_inference/image_to_video/image_to_video.py \\\n --model Wan-AI/Wan2.2-TI2V-5B-Diffusers \\\n --image input.jpg --prompt \"A cat playing with yarn\" \\\n --num_frames 81 --num_inference_steps 50 \\\n --guidance_scale 5.0 --fps 16 --output ti2v_output.mp4\n```\n\n### Running TI2V on MI300X/MI325X/MI355X GPUs\n\n```bash\nVLLM_ROCM_USE_AITER=1 \\\nSAFETENSORS_FAST_GPU=1 \\\npython 
examples/offline_inference/image_to_video/image_to_video.py \\\n --model Wan-AI/Wan2.2-TI2V-5B-Diffusers \\\n --image input.jpg --prompt \"A cat playing with yarn\" \\\n --num_frames 81 --num_inference_steps 50 \\\n --guidance_scale 5.0 --fps 16 --output ti2v_output.mp4\n```\n\n## Cache-DiT Acceleration\n\n```python\nomni = Omni(\n model=\"Wan-AI/Wan2.2-T2V-A14B-Diffusers\",\n cache_backend=\"cache_dit\",\n cache_config={\n \"Fn_compute_blocks\": 8,\n \"Bn_compute_blocks\": 0,\n \"max_warmup_steps\": 4,\n \"residual_diff_threshold\": 0.12,\n },\n)\n```\n\n## Key Parameters\n\n| Parameter | Default | Description |\n|-----------|---------|-------------|\n| `height` | 720 (T2V) / auto (I2V) | Video height (multiples of 16) |\n| `width` | 1280 (T2V) / auto (I2V) | Video width (multiples of 16) |\n| `num_frames` | 81 | Frames to generate |\n| `num_inference_steps` | 40\u201350 | Denoising steps |\n| `guidance_scale` | 4.0\u20135.0 | Classifier-free guidance scale |\n| `boundary_ratio` | 0.875 | MoE boundary split ratio |\n| `flow_shift` | 5.0 (720p) / 12.0 (480p) | Scheduler flow shift |\n\n## References\n\n- [Wan2.2-T2V-A14B](https://huggingface.co/Wan-AI/Wan2.2-T2V-A14B-Diffusers)\n- [Wan2.2-I2V-A14B](https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers)\n- [Wan2.2-TI2V-5B](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B-Diffusers)\n- [Cache-DiT Acceleration](https://github.com/vipshop/cache-dit)\n" + } + }, + "XiaomiMiMo/MiMo-V2-Flash": { + "hf_id": "XiaomiMiMo/MiMo-V2-Flash", + "meta": { + "title": "MiMo-V2-Flash", + "provider": "MiMo (Xiaomi)", + "description": "Xiaomi's MoE reasoning model (309B total / 15B active) with hybrid attention and MTP for fast agentic workflows", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "309B" + }, + "recipe": { + "meta": { + "title": "MiMo-V2-Flash", + "slug": 
"mimo-v2-flash", + "provider": "MiMo (Xiaomi)", + "description": "Xiaomi's MoE reasoning model (309B total / 15B active) with hybrid attention and MTP for fast agentic workflows", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "related_recipes": [], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "XiaomiMiMo/MiMo-V2-Flash", + "min_vllm_version": "0.11.0", + "architecture": "moe", + "parameter_count": "309B", + "active_parameters": "15B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code", + "--generation-config", + "vllm" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Qwen3 XML tool-call parser", + "args": [ + "--tool-call-parser", + "qwen3_xml" + ] + }, + "reasoning": { + "description": "Qwen3 reasoning parser", + "args": [ + "--reasoning-parser", + "qwen3" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 371, + "description": "Native FP8 weights; 4x H200 recommended with TP4", + "extra_args": [ + "--tensor-parallel-size", + "4", + "--gpu-memory-utilization", + "0.9" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep", + "pd_cluster" + ], + "hardware_overrides": { + "amd": { + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "0" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\n[MiMo-V2-Flash](https://huggingface.co/XiaomiMiMo/MiMo-V2-Flash) is a MoE language model\nwith 309B total parameters and 15B active. 
Designed for high-speed reasoning and agentic\nworkflows, it features hybrid attention and Multi-Token Prediction (MTP) to reduce\ninference cost.\n\n## Prerequisites\n\n- Hardware: 4x H200 (TP4) or equivalent aggregate VRAM (~320 GB with FP8)\n- vLLM >= 0.11.0\n\n### Install vLLM (NVIDIA)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install vllm --torch-backend auto\n```\n\n### Install vLLM (AMD ROCm MI300X/MI325X/MI355X)\n\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/\n```\n\n## Launch commands\n\nBasic TP4:\n\n```bash\nvllm serve XiaomiMiMo/MiMo-V2-Flash \\\n --host 0.0.0.0 --port 9001 --seed 1024 \\\n --served-model-name mimo_v2_flash \\\n --tensor-parallel-size 4 \\\n --trust-remote-code \\\n --gpu-memory-utilization 0.9 \\\n --generation-config vllm\n```\n\nWith tool calling + reasoning:\n\n```bash\nvllm serve XiaomiMiMo/MiMo-V2-Flash \\\n --tensor-parallel-size 4 --trust-remote-code --gpu-memory-utilization 0.9 \\\n --tool-call-parser qwen3_xml \\\n --reasoning-parser qwen3 \\\n --generation-config vllm \\\n --served-model-name mimo_v2_flash\n```\n\nDP + TP + EP:\n\n```bash\nvllm serve XiaomiMiMo/MiMo-V2-Flash \\\n --data-parallel-size 2 \\\n --tensor-parallel-size 4 \\\n --enable-expert-parallel \\\n --trust-remote-code \\\n --gpu-memory-utilization 0.9 \\\n --generation-config vllm \\\n --served-model-name mimo_v2_flash\n```\n\nAMD:\n\n```bash\nexport VLLM_ROCM_USE_AITER=0\nvllm serve XiaomiMiMo/MiMo-V2-Flash --tensor-parallel-size 4 \\\n --trust-remote-code --gpu-memory-utilization 0.9 --generation-config vllm\n```\n\nTunable flags:\n- `--max-model-len=65536` works well; max is 128K.\n- `--max-num-batched-tokens=32768` for prompt-heavy; 16K/8K for lower latency.\n- `--gpu-memory-utilization=0.95` to maximize KV cache.\n\n## Client Usage\n\n```bash\ncurl -X POST http://localhost:9001/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"mimo_v2_flash\",\n \"messages\": [{\"role\": 
\"user\", \"content\": \"Hello MiMo!\"}],\n \"chat_template_kwargs\": {\"enable_thinking\": true}\n }'\n```\n\nSet `\"enable_thinking\": false` (or omit the kwargs) to disable thinking mode.\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --model XiaomiMiMo/MiMo-V2-Flash \\\n --dataset-name random --random-input-len 8000 --random-output-len 1000 \\\n --request-rate 3 --num-prompts 1800 --ignore-eos\n```\n\n## Accuracy (GSM8K)\n\nReported 5-shot `exact_match`: flexible 0.9128, strict 0.9075.\n\n## References\n\n- [MiMo-V2-Flash on Hugging Face](https://huggingface.co/XiaomiMiMo/MiMo-V2-Flash)\n" + } + }, + "XiaomiMiMo/MiMo-V2.5-Pro": { + "hf_id": "XiaomiMiMo/MiMo-V2.5-Pro", + "meta": { + "title": "MiMo-V2.5-Pro", + "provider": "MiMo (Xiaomi)", + "description": "Xiaomi's flagship MoE reasoning model (1.02T total / 42B active) with hybrid attention, native FP8 weights, and Multi-Token Prediction", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "1T" + }, + "recipe": { + "meta": { + "title": "MiMo-V2.5-Pro", + "slug": "mimo-v2-5-pro", + "provider": "MiMo (Xiaomi)", + "description": "Xiaomi's flagship MoE reasoning model (1.02T total / 42B active) with hybrid attention, native FP8 weights, and Multi-Token Prediction", + "date_updated": "2026-04-27", + "difficulty": "advanced", + "tasks": [ + "text" + ], + "related_recipes": [ + "XiaomiMiMo/MiMo-V2.5" + ], + "hardware": { + "h200": "verified" + } + }, + "model": { + "model_id": "XiaomiMiMo/MiMo-V2.5-Pro", + "min_vllm_version": "0.21.0", + "architecture": "moe", + "parameter_count": "1T", + "active_parameters": "42B", + "context_length": 1048576, + "base_args": [ + "--trust-remote-code", + "--max-model-len auto", + "--generation-config", + "vllm" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "MiMo tool-call parser", + "args": [ + "--tool-call-parser", + "mimo", + "--enable-auto-tool-choice" + ] + 
}, + "reasoning": { + "description": "MiMo reasoning parser", + "args": [ + "--reasoning-parser", + "mimo" + ] + }, + "spec_decoding": { + "description": "Multi-Token Prediction", + "args": [ + "--speculative-config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":1}" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 1224, + "description": "Native FP8 weights (block-wise e4m3 128x128); 8x H200 with TP8", + "extra_args": [ + "--tensor-parallel-size", + "8", + "--gpu-memory-utilization", + "0.95", + "--max-model-len", + "auto" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "pd_cluster" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nMiMo-V2.5-Pro is Xiaomi's flagship MoE reasoning model with 1.02T total parameters\nand 42B active per token. It uses 384 routed experts (top-8) with hybrid\nattention (full-attention + sliding-window 128 at 6:1 ratio) over 70 layers\n(1 dense + 69 MoE) and ships with native FP8 (block-wise e4m3) weights.\nA 3-layer Multi-Token Prediction (MTP) head enables speculative decoding for\n~3x output speed.\n\n## Prerequisites\n\n- Hardware: 8x H200 (TP8)\n\n### Pull the vLLM docker image\n\nStable vLLM does not yet support MiMo V2.5. 
Use the pre-built image:\n\n```bash\ndocker pull vllm/vllm-openai:mimov25-cu129\n```\n\n## Launch commands\n\nSingle-node TP8 (H200):\n\n```bash\nvllm serve XiaomiMiMo/MiMo-V2.5-Pro \\\n --tensor-parallel-size 8 \\\n --trust-remote-code \\\n --gpu-memory-utilization 0.95 \\\n --max-model-len auto \\\n --generation-config vllm\n```\n\nWith tool calling + reasoning:\n\n```bash\nvllm serve XiaomiMiMo/MiMo-V2.5-Pro \\\n --tensor-parallel-size 8 \\\n --trust-remote-code \\\n --gpu-memory-utilization 0.95 \\\n --max-model-len auto \\\n --reasoning-parser mimo \\\n --tool-call-parser mimo \\\n --enable-auto-tool-choice \\\n --generation-config vllm\n```\n\n## Client Usage\n\n```bash\ncurl -X POST http://localhost:8000/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"XiaomiMiMo/MiMo-V2.5-Pro\",\n \"messages\": [{\"role\": \"user\", \"content\": \"Hello MiMo!\"}],\n \"chat_template_kwargs\": {\"enable_thinking\": true}\n }'\n```\n\nSet `\"enable_thinking\": false` (or omit the kwargs) to disable thinking mode.\n\n\n## References\n\n- [MiMo-V2.5-Pro on Hugging Face](https://huggingface.co/XiaomiMiMo/MiMo-V2.5-Pro)\n" + } + }, + "XiaomiMiMo/MiMo-V2.5": { + "hf_id": "XiaomiMiMo/MiMo-V2.5", + "meta": { + "title": "MiMo-V2.5", + "provider": "MiMo (Xiaomi)", + "description": "MiMo-V2.5 is a native omnimodal model with strong agentic capabilities, supporting text, image, video, and audio understanding within a unified architecture", + "tasks": [ + "multimodal", + "text" + ], + "hardware": { + "h200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "311B" + }, + "recipe": { + "meta": { + "title": "MiMo-V2.5", + "slug": "mimo-v2-5", + "provider": "MiMo (Xiaomi)", + "description": "MiMo-V2.5 is a native omnimodal model with strong agentic capabilities, supporting text, image, video, and audio understanding within a unified architecture", + "date_updated": "2026-04-27", + "difficulty": "advanced", + "tasks": [ 
+ "multimodal", + "text" + ], + "related_recipes": [ + "XiaomiMiMo/MiMo-V2-Flash" + ], + "hardware": { + "h200": "verified" + } + }, + "model": { + "model_id": "XiaomiMiMo/MiMo-V2.5", + "min_vllm_version": "0.21.0", + "architecture": "moe", + "parameter_count": "311B", + "active_parameters": "15B", + "context_length": 1048576, + "base_args": [ + "--trust-remote-code", + "--generation-config", + "vllm" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "MiMo tool-call parser", + "args": [ + "--tool-call-parser", + "mimo", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "MiMo reasoning parser", + "args": [ + "--reasoning-parser", + "mimo" + ] + }, + "spec_decoding": { + "description": "Multi-Token Prediction", + "args": [ + "--speculative-config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":1}" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 374, + "tp": 4, + "description": "Native FP8 weights (block-wise e4m3 128x128); 4x H200 with TP4", + "extra_args": [ + "--gpu-memory-utilization", + "0.95", + "--max-model-len", + "auto" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "multi_node_tp", + "multi_node_tep", + "multi_node_dep", + "pd_cluster" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nMiMo-V2.5 is Xiaomi's native omnimodal MoE model with 310B total parameters\nand 15B active per token, supporting text, image, video, and audio understanding\nwithin a unified architecture. 
Built on the MiMo-V2-Flash backbone with dedicated\nvision (729M) and audio (261M) encoders, it uses 256 routed experts (top-8) with\nhybrid attention (SWA-128 + full-attention at 5:1 ratio) over 48 layers\n(1 dense + 47 MoE) and ships with native FP8 (block-wise e4m3) weights.\nA 3-layer Multi-Token Prediction (MTP) head is included for speculative decoding.\n\n## Prerequisites\n\n- Hardware: 4x H200 (TP4)\n\n### Pull the vLLM docker image\n\nStable vLLM does not yet support MiMo V2.5. Use the pre-built image:\n\n```bash\ndocker pull vllm/vllm-openai:mimov25-cu129\n```\n\n## Launch commands\n\nSingle-node TP4 (H200):\n\n```bash\nvllm serve XiaomiMiMo/MiMo-V2.5 \\\n --tensor-parallel-size 4 \\\n --trust-remote-code \\\n --gpu-memory-utilization 0.95 \\\n --max-model-len auto \\\n --generation-config vllm\n```\n\nWith tool calling + reasoning:\n\n```bash\nvllm serve XiaomiMiMo/MiMo-V2.5 \\\n --tensor-parallel-size 4 \\\n --trust-remote-code \\\n --gpu-memory-utilization 0.95 \\\n --max-model-len auto \\\n --reasoning-parser mimo \\\n --tool-call-parser mimo \\\n --enable-auto-tool-choice \\\n --generation-config vllm\n```\n\nTunable flags:\n- `--max-model-len` \u2014 full context is 1,048,576; use `auto` to size to KV budget.\n- `--max-num-batched-tokens=32768` for prompt-heavy workloads; lower for latency.\n- `--gpu-memory-utilization=0.95` to maximize KV cache.\n\n## Client Usage\n\n```bash\ncurl -X POST http://localhost:8000/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"XiaomiMiMo/MiMo-V2.5\",\n \"messages\": [{\"role\": \"user\", \"content\": \"Hello MiMo!\"}],\n \"chat_template_kwargs\": {\"enable_thinking\": true}\n }'\n```\n\nSet `\"enable_thinking\": false` (or omit the kwargs) to disable thinking mode.\n\n## Benchmarking\n\nLaunch the server with `--no-enable-prefix-caching` to get consistent measurements.\n\n### VisionArena-Chat\n```\nvllm bench serve \\\n --model XiaomiMiMo/MiMo-V2.5 \\\n --backend openai-chat 
\\\n --endpoint /v1/chat/completions \\\n --dataset-name hf \\\n --dataset-path lmarena-ai/VisionArena-Chat \\\n --num-prompts 128\n```\n\n### Random Synthetic\n```bash\nvllm bench serve \\\n --model XiaomiMiMo/MiMo-V2.5 \\\n --dataset-name random --random-input-len 8000 --random-output-len 1000 \\\n --request-rate 3 --num-prompts 1800 --ignore-eos\n```\n\n## References\n\n- [MiMo-V2.5 on Hugging Face](https://huggingface.co/XiaomiMiMo/MiMo-V2.5)\n" + } + }, + "arcee-ai/Trinity-Large-Thinking": { + "hf_id": "arcee-ai/Trinity-Large-Thinking", + "meta": { + "title": "Trinity-Large-Thinking", + "provider": "Arcee AI", + "description": "Arcee AI's reasoning-focused sparse MoE (AfmoeForCausalLM) with structured traces and agentic tool use", + "tasks": [ + "text" + ], + "hardware": {} + }, + "model_info": { + "architecture": "moe", + "parameter_count": "398B" + }, + "recipe": { + "meta": { + "title": "Trinity-Large-Thinking", + "slug": "trinity-large-thinking", + "provider": "Arcee AI", + "description": "Arcee AI's reasoning-focused sparse MoE (AfmoeForCausalLM) with structured traces and agentic tool use", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "related_recipes": [] + }, + "model": { + "model_id": "arcee-ai/Trinity-Large-Thinking", + "min_vllm_version": "0.11.1", + "architecture": "moe", + "parameter_count": "398B", + "active_parameters": "13B", + "context_length": 262144, + "base_args": [ + "--dtype", + "bfloat16" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Qwen3 Coder tool-call parser with automatic tool choice", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "DeepSeek-R1 reasoning parser extracts <think>...</think> 
into message.reasoning", + "args": [ + "--reasoning-parser", + "deepseek_r1" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 955, + "description": "Full precision BF16 on multi-GPU (sparse MoE \u2014 multi-GPU recommended)" + }, + "nvfp4": { + "model_id": "arcee-ai/Trinity-Large-Thinking-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 239, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Trinity-Large-Thinking](https://huggingface.co/arcee-ai/Trinity-Large-Thinking) is\nArcee AI's reasoning-focused Trinity Large checkpoint \u2014 a sparse MoE model designed for\nlong-horizon planning, tool use, and multi-step agent workflows. 
It uses the\n`AfmoeForCausalLM` architecture and emits explicit reasoning traces inside `<think>...</think>`\nblocks.\n\nFor multi-turn chat and agentic loops, reasoning tokens should be preserved across\nturns as part of the working state.\n\n## Prerequisites\n\n- vLLM >= 0.11.1\n- Hardware: multi-GPU recommended for production deployments\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm openai --torch-backend auto\n```\n\n## Launch command\n\n```bash\nvllm serve arcee-ai/Trinity-Large-Thinking \\\n --dtype bfloat16 \\\n --reasoning-parser deepseek_r1 \\\n --enable-auto-tool-choice \\\n --tool-call-parser qwen3_coder\n```\n\nWhy these flags:\n- `--reasoning-parser deepseek_r1` extracts `<think>...</think>` into `message.reasoning`.\n- `--enable-auto-tool-choice` lets the model decide when to call tools.\n- `--tool-call-parser qwen3_coder` converts tool calls into OpenAI-style `tool_calls`.\n- `--dtype bfloat16` matches the recommended serving dtype.\n\nAdd parallelism flags (`--tensor-parallel-size`, `--data-parallel-size`, or\n`--enable-expert-parallel`) for your hardware. 
Lower `--max-model-len` if you don't\nneed the full long-context config.\n\n## Validation Request\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nmodel = client.models.list().data[0].id\n\ntools = [{\n \"type\": \"function\",\n \"function\": {\n \"name\": \"get_weather\",\n \"description\": \"Get the current weather for a location.\",\n \"parameters\": {\n \"type\": \"object\",\n \"properties\": {\"location\": {\"type\": \"string\"}},\n \"required\": [\"location\"],\n },\n },\n}]\n\nresponse = client.chat.completions.create(\n model=model,\n messages=[{\"role\": \"user\", \"content\": \"What is the weather in Paris right now?\"}],\n tools=tools, tool_choice=\"auto\",\n)\n\nmsg = response.choices[0].message\nreasoning = getattr(msg, \"reasoning\", None) or getattr(msg, \"reasoning_content\", None)\nprint(\"reasoning:\", reasoning)\nprint(\"content:\", msg.content)\nprint(\"tool_calls:\", msg.tool_calls)\n```\n\n## Preserving Reasoning Across Turns\n\nPass reasoning back as `reasoning` on assistant messages:\n\n```python\nassistant_msg = {\"role\": \"assistant\", \"content\": msg.content or \"\"}\nif reasoning:\n assistant_msg[\"reasoning\"] = reasoning\nif msg.tool_calls:\n assistant_msg[\"tool_calls\"] = [\n {\"id\": tc.id, \"type\": \"function\",\n \"function\": {\"name\": tc.function.name, \"arguments\": tc.function.arguments}}\n for tc in msg.tool_calls\n ]\nmessages.append(assistant_msg)\n```\n\nRules:\n- Pass reasoning back as `reasoning` (even if your client exposes it as `reasoning_content`).\n- Keep `content` as an empty string (not `null`) on tool-only turns.\n- Append the assistant message before tool-result messages.\n- Use `/v1/chat/completions` for structured reasoning output.\n\n## Troubleshooting\n\n- **No reasoning:** start server with `--reasoning-parser deepseek_r1`; use `/v1/chat/completions`.\n- **Tool calls as plain text:** enable `--enable-auto-tool-choice` and 
`--tool-call-parser qwen3_coder`.\n- **Loses coherence after tool turns:** preserve `reasoning` on each assistant turn; don't set content to `null`.\n- **OOM:** lower `--max-model-len`; scale parallelism; use a local checkpoint path.\n\n## References\n\n- [Trinity-Large-Thinking on Hugging Face](https://huggingface.co/arcee-ai/Trinity-Large-Thinking)\n" + } + }, + "baidu/ERNIE-4.5-21B-A3B-PT": { + "hf_id": "baidu/ERNIE-4.5-21B-A3B-PT", + "meta": { + "title": "ERNIE-4.5", + "provider": "Ernie (Baidu)", + "description": "Baidu ERNIE 4.5 MoE text models (21B-A3B, 300B-A47B) with BF16 and FP8 support plus ERNIE-MTP speculative decoding", + "tasks": [ + "text" + ], + "hardware": {} + }, + "model_info": { + "architecture": "moe", + "parameter_count": "21B" + }, + "recipe": { + "meta": { + "title": "ERNIE-4.5", + "slug": "ernie-4.5", + "provider": "Ernie (Baidu)", + "description": "Baidu ERNIE 4.5 MoE text models (21B-A3B, 300B-A47B) with BF16 and FP8 support plus ERNIE-MTP speculative decoding", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "related_recipes": [ + "baidu/ERNIE-4.5-VL-28B-A3B-PT" + ] + }, + "model": { + "model_id": "baidu/ERNIE-4.5-21B-A3B-PT", + "min_vllm_version": "0.10.1", + "architecture": "moe", + "parameter_count": "21B", + "active_parameters": "3B", + "context_length": 131072, + "base_args": [], + "base_env": {} + }, + "features": { + "spec_decoding": { + "description": "ERNIE-MTP (multi-token prediction) speculative decoding", + "args": [ + "--speculative-config", + "{\"method\":\"ernie_mtp\",\"model\":\"baidu/ERNIE-4.5-21B-A3B-PT\",\"num_speculative_tokens\":1}" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 106, + "description": "BF16 weights; fits on 1x80GB GPU (21B variant)" + }, + "300b": { + "model_id": "baidu/ERNIE-4.5-300B-A47B-PT", + "precision": "bf16", + "vram_minimum_gb": 640, + "description": "300B total 
/ 47B active; 8x80GB with FP8 online, 16x80GB for BF16", + "extra_args": [ + "--tensor-parallel-size", + "8", + "--gpu-memory-utilization", + "0.95", + "--quantization", + "fp8" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": { + "amd": { + "extra_args": [ + "--gpu-memory-utilization", + "0.9" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "SAFETENSORS_FAST_GPU": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nERNIE 4.5 is Baidu's MoE language model family. This recipe covers the text-only\nvariants:\n\n- `baidu/ERNIE-4.5-21B-A3B-PT` \u2014 21B total / 3B active (fits on 1x80GB)\n- `baidu/ERNIE-4.5-300B-A47B-PT` \u2014 300B total / 47B active (8x80GB FP8 or 16x80GB BF16)\n\nBoth support ERNIE-MTP speculative decoding via `--speculative-config`.\n\n## Prerequisites\n\n- transformers >= 4.54.0\n- vLLM >= 0.10.1\n- Hardware depends on variant (see above)\n\n### Install vLLM\n\n```bash\nuv venv --python 3.12 --seed\nsource .venv/bin/activate\nuv pip install vllm --torch-backend=auto\n```\n\n## Launch commands\n\n21B on 1x80GB GPU:\n\n```bash\nvllm serve baidu/ERNIE-4.5-21B-A3B-PT\n```\n\n300B on 8x80GB with vLLM FP8 online quantization:\n\n```bash\nvllm serve baidu/ERNIE-4.5-300B-A47B-PT \\\n --tensor-parallel-size 8 \\\n --gpu-memory-utilization 0.95 \\\n --quantization fp8\n```\n\n300B on 16x80GB native BF16 (multi-node via Ray):\n\n```bash\nvllm serve baidu/ERNIE-4.5-300B-A47B-PT --tensor-parallel-size 16\n```\n\nERNIE-MTP speculative decoding (21B example):\n\n```bash\nvllm serve baidu/ERNIE-4.5-21B-A3B-PT \\\n --speculative-config '{\"method\":\"ernie_mtp\",\"model\":\"baidu/ERNIE-4.5-21B-A3B-PT\",\"num_speculative_tokens\":1}'\n```\n\n## Client Usage\n\nStandard OpenAI-compatible API; model ID is the HF repo.\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --model baidu/ERNIE-4.5-21B-A3B-PT \\\n --dataset-name random \\\n 
--random-input-len 8000 --random-output-len 1000 \\\n --request-rate 10 --num-prompts 16 --ignore-eos\n```\n\nTest configurations: prompt-heavy (8k/1k), decode-heavy (1k/8k), balanced (1k/1k).\nVary `--num-prompts` across 1, 16, 32, 64, 128, 256, 512.\n\n## References\n\n- [ERNIE-4.5-21B-A3B-PT](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-PT)\n- [ERNIE-4.5-300B-A47B-PT](https://huggingface.co/baidu/ERNIE-4.5-300B-A47B-PT)\n- [vLLM multi-node deployment](https://docs.vllm.ai/en/latest/serving/parallelism_scaling.html)\n" + } + }, + "baidu/ERNIE-4.5-VL-28B-A3B-PT": { + "hf_id": "baidu/ERNIE-4.5-VL-28B-A3B-PT", + "meta": { + "title": "ERNIE-4.5-VL", + "provider": "Ernie (Baidu)", + "description": "Baidu ERNIE 4.5 VL MoE vision-language models (28B-A3B, 424B-A47B) with heterogeneous text/vision experts", + "tasks": [ + "multimodal" + ], + "hardware": { + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "28B" + }, + "recipe": { + "meta": { + "title": "ERNIE-4.5-VL", + "slug": "ernie-4.5-vl", + "provider": "Ernie (Baidu)", + "description": "Baidu ERNIE 4.5 VL MoE vision-language models (28B-A3B, 424B-A47B) with heterogeneous text/vision experts", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "multimodal" + ], + "related_recipes": [ + "baidu/ERNIE-4.5-21B-A3B-PT" + ], + "hardware": { + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "baidu/ERNIE-4.5-VL-28B-A3B-PT", + "min_vllm_version": "0.11.0", + "architecture": "moe", + "parameter_count": "28B", + "active_parameters": "3B", + "context_length": 131072, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. 
Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 67, + "description": "BF16 weights; fits on 1x80GB GPU (28B VL variant)" + }, + "424b": { + "model_id": "baidu/ERNIE-4.5-VL-424B-A47B-PT", + "precision": "bf16", + "vram_minimum_gb": 1120, + "description": "424B total / 47B active; 8x140GB BF16 or 16x80GB BF16", + "extra_args": [ + "--trust-remote-code", + "--tensor-parallel-size", + "8" + ] + }, + "424b_fp8": { + "model_id": "baidu/ERNIE-4.5-VL-424B-A47B-PT", + "precision": "fp8", + "vram_minimum_gb": 640, + "description": "424B with FP8 online quantization + CPU offload for 8x80GB testing", + "extra_args": [ + "--trust-remote-code", + "--tensor-parallel-size", + "8", + "--quantization", + "fp8", + "--cpu-offload-gb", + "50" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nERNIE 4.5 VL is Baidu's multimodal MoE model with heterogeneous experts (separate text\nand vision experts). 
Because of the heterogeneous architecture, **torch.compile and CUDA\ngraphs are not supported**.\n\n- `baidu/ERNIE-4.5-VL-28B-A3B-PT` \u2014 28B total / 3B active (1x80GB)\n- `baidu/ERNIE-4.5-VL-424B-A47B-PT` \u2014 424B total / 47B active (8x140GB BF16, 8x80GB FP8+offload, or 16x80GB BF16)\n\n## Prerequisites\n\n- vLLM: support added to main branch recently; install latest\n- Hardware depends on variant\n\n### Install vLLM (CUDA)\n\n```bash\nuv venv --python 3.12 --seed\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n### Install vLLM (AMD ROCm MI300X/MI325X/MI355X)\n\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/\n```\n\n## Launch commands\n\n28B on 1x80GB:\n\n```bash\nvllm serve baidu/ERNIE-4.5-VL-28B-A3B-PT --trust-remote-code\n```\n\n424B BF16 on 8x140GB:\n\n```bash\nvllm serve baidu/ERNIE-4.5-VL-424B-A47B-PT \\\n --trust-remote-code \\\n --tensor-parallel-size 8\n```\n\n424B with FP8 + CPU offload on 8x80GB (testing only):\n\n```bash\nvllm serve baidu/ERNIE-4.5-VL-424B-A47B-PT \\\n --trust-remote-code \\\n --tensor-parallel-size 8 \\\n --quantization fp8 \\\n --cpu-offload-gb 50\n```\n\n28B on AMD MI300X+:\n\n```bash\nVLLM_ROCM_USE_AITER=1 SAFETENSORS_FAST_GPU=1 \\\n vllm serve baidu/ERNIE-4.5-VL-28B-A3B-PT \\\n --tensor-parallel-size 4 \\\n --gpu-memory-utilization 0.9 \\\n --disable-log-requests \\\n --trust-remote-code\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --model baidu/ERNIE-4.5-VL-28B-A3B-PT \\\n --dataset-name random \\\n --random-input-len 8000 --random-output-len 1000 \\\n --request-rate 10 --num-prompts 16 --ignore-eos --trust-remote-code\n```\n\n## References\n\n- [ERNIE-4.5-VL-28B-A3B-PT](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT)\n- [ERNIE-4.5-VL-424B-A47B-PT](https://huggingface.co/baidu/ERNIE-4.5-VL-424B-A47B-PT)\n" + } + }, + "deepseek-ai/DeepSeek-OCR-2": { + "hf_id": "deepseek-ai/DeepSeek-OCR-2", + "meta": { + "title": "DeepSeek-OCR-2", + "provider": 
"DeepSeek", + "description": "Next-generation DeepSeek OCR model with improved document-to-markdown grounding and optical context compression.", + "tasks": [ + "multimodal" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "3B" + }, + "recipe": { + "meta": { + "title": "DeepSeek-OCR-2", + "slug": "deepseek-ocr-2", + "provider": "DeepSeek", + "description": "Next-generation DeepSeek OCR model with improved document-to-markdown grounding and optical context compression.", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "multimodal" + ], + "performance_headline": "Improved grounding and markdown conversion over DeepSeek-OCR", + "related_recipes": [ + "deepseek-ai/DeepSeek-OCR" + ] + }, + "model": { + "model_id": "deepseek-ai/DeepSeek-OCR-2", + "min_vllm_version": "0.12.0", + "architecture": "dense", + "parameter_count": "3B", + "active_parameters": "3B", + "context_length": 8192, + "base_args": [ + "--trust-remote-code", + "--logits_processors", + "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor", + "--no-enable-prefix-caching", + "--mm-processor-cache-gb", + "0" + ], + "base_env": {} + }, + "features": { + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. 
Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only", + "encoder_parallel" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 7, + "description": "Full precision BF16 (~3.4B params)" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nDeepSeek-OCR-2 is a frontier OCR model exploring optical context compression for LLMs.\nIt iterates on DeepSeek-OCR with better grounding and markdown conversion, and supports\nprompts like `<image>\\n<|grounding|>Convert the document to markdown.` for richer\ndocument parsing.\n\n## Prerequisites\n\n- **Hardware**: Single GPU with >=8 GB VRAM is typically sufficient for BF16 inference.\n- **vLLM**: Current stable release (tested with `uv pip install -U vllm --torch-backend auto`).\n- **Python**: 3.10+\n\nInstall vLLM:\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n## Client Usage\n\n### Offline OCR (Python)\n\n```python\nfrom vllm import LLM, SamplingParams\nfrom vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor\nfrom PIL import Image\n\nllm = LLM(\n model=\"deepseek-ai/DeepSeek-OCR-2\",\n enable_prefix_caching=False,\n mm_processor_cache_gb=0,\n logits_processors=[NGramPerReqLogitsProcessor],\n)\n\nimage_1 = Image.open(\"path/to/your/image_1.png\").convert(\"RGB\")\nimage_2 = Image.open(\"path/to/your/image_2.png\").convert(\"RGB\")\n# prompt = \"<image>\\nFree OCR. \"\nprompt = \"<image>\\n<|grounding|>Convert the document to markdown. 
\"\n\nmodel_input = [\n {\"prompt\": prompt, \"multi_modal_data\": {\"image\": image_1}},\n {\"prompt\": prompt, \"multi_modal_data\": {\"image\": image_2}},\n]\n\nsampling_param = SamplingParams(\n temperature=0.0,\n max_tokens=8192,\n extra_args=dict(\n ngram_size=30,\n window_size=90,\n whitelist_token_ids={128821, 128822}, # , \n ),\n skip_special_tokens=False,\n)\n\nfor output in llm.generate(model_input, sampling_param):\n print(output.outputs[0].text)\n```\n\n### Online OCR serving\n\n```bash\nvllm serve deepseek-ai/DeepSeek-OCR-2 \\\n --logits_processors vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor \\\n --no-enable-prefix-caching \\\n --mm-processor-cache-gb 0\n```\n\n```python\nimport time\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\", timeout=3600)\n\nmessages = [\n {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://ofasys-multimodal-wlcb-3-toshanghai.oss-accelerate.aliyuncs.com/wpf272043/keepme/image/receipt.png\"}},\n {\"type\": \"text\", \"text\": \"Free OCR.\"},\n ],\n }\n]\n\nstart = time.time()\nresponse = client.chat.completions.create(\n model=\"deepseek-ai/DeepSeek-OCR-2\",\n messages=messages,\n max_tokens=2048,\n temperature=0.0,\n extra_body={\n \"skip_special_tokens\": False,\n \"vllm_xargs\": {\n \"ngram_size\": 30,\n \"window_size\": 90,\n \"whitelist_token_ids\": [128821, 128822],\n },\n },\n)\nprint(f\"Response costs: {time.time() - start:.2f}s\")\nprint(f\"Generated text: {response.choices[0].message.content}\")\n```\n\n## Troubleshooting / Configuration Tips\n\n- **Use the custom logits processor** along with the model for optimal OCR and markdown\n generation performance.\n- Unlike multi-turn chat, OCR tasks do not typically benefit from prefix caching or image\n reuse, so disable these features to avoid unnecessary hashing and caching overhead.\n- DeepSeek-OCR-2 works better with plain prompts than 
instruction formats. See the\n [official main prompts](https://huggingface.co/deepseek-ai/DeepSeek-OCR-2#main-prompts).\n- Depending on your hardware, adjust `max_num_batched_tokens` for better throughput.\n\n## References\n\n- [DeepSeek-OCR-2 on Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-OCR-2)\n- [vLLM multimodal inputs guide](https://docs.vllm.ai/en/latest/features/multimodal_inputs.html#offline-inference)\n" + } + }, + "deepseek-ai/DeepSeek-OCR": { + "hf_id": "deepseek-ai/DeepSeek-OCR", + "meta": { + "title": "DeepSeek-OCR", + "provider": "DeepSeek", + "description": "Frontier OCR model exploring optical context compression for LLMs, optimized for document parsing and markdown generation.", + "tasks": [ + "multimodal" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "3B" + }, + "recipe": { + "meta": { + "title": "DeepSeek-OCR", + "slug": "deepseek-ocr", + "provider": "DeepSeek", + "description": "Frontier OCR model exploring optical context compression for LLMs, optimized for document parsing and markdown generation.", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "multimodal" + ], + "performance_headline": "Optical context compression for efficient OCR and document understanding", + "related_recipes": [ + "deepseek-ai/DeepSeek-OCR-2" + ] + }, + "model": { + "model_id": "deepseek-ai/DeepSeek-OCR", + "min_vllm_version": "0.12.0", + "architecture": "dense", + "parameter_count": "3B", + "active_parameters": "3B", + "context_length": 8192, + "base_args": [ + "--trust-remote-code", + "--logits_processors", + "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor", + "--no-enable-prefix-caching", + "--mm-processor-cache-gb", + "0" + ], + "base_env": {} + }, + "features": { + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. 
Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only", + "encoder_parallel" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 7, + "description": "Full precision BF16 (~3.3B params)" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": { + "amd": { + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "SAFETENSORS_FAST_GPU": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nDeepSeek-OCR is a frontier OCR model exploring optical context compression for LLMs.\nIt is optimized for document parsing, free-form OCR, and markdown generation from images,\nand ships with a custom n-gram logits processor for optimal quality.\n\n## Prerequisites\n\n- **Hardware**: Single GPU with >=8 GB VRAM is typically sufficient for BF16 inference.\n- **vLLM**: Current stable release (tested with `uv pip install -U vllm --torch-backend auto`).\n- **Python**: 3.10+\n\nInstall vLLM:\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n## Client Usage\n\n### Offline OCR (Python)\n\n```python\nfrom vllm import LLM, SamplingParams\nfrom vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor\nfrom PIL import Image\n\nllm = LLM(\n model=\"deepseek-ai/DeepSeek-OCR\",\n enable_prefix_caching=False,\n mm_processor_cache_gb=0,\n logits_processors=[NGramPerReqLogitsProcessor],\n)\n\nimage_1 = Image.open(\"path/to/your/image_1.png\").convert(\"RGB\")\nimage_2 = Image.open(\"path/to/your/image_2.png\").convert(\"RGB\")\nprompt = \"\\nFree OCR.\"\n\nmodel_input = [\n {\"prompt\": prompt, \"multi_modal_data\": 
{\"image\": image_1}},\n {\"prompt\": prompt, \"multi_modal_data\": {\"image\": image_2}},\n]\n\nsampling_param = SamplingParams(\n temperature=0.0,\n max_tokens=8192,\n extra_args=dict(\n ngram_size=30,\n window_size=90,\n whitelist_token_ids={128821, 128822}, # , \n ),\n skip_special_tokens=False,\n)\n\nfor output in llm.generate(model_input, sampling_param):\n print(output.outputs[0].text)\n```\n\n### Online OCR serving\n\n```bash\nvllm serve deepseek-ai/DeepSeek-OCR \\\n --logits_processors vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor \\\n --no-enable-prefix-caching \\\n --mm-processor-cache-gb 0\n```\n\n```python\nimport time\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\", timeout=3600)\n\nmessages = [\n {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://ofasys-multimodal-wlcb-3-toshanghai.oss-accelerate.aliyuncs.com/wpf272043/keepme/image/receipt.png\"}},\n {\"type\": \"text\", \"text\": \"Free OCR.\"},\n ],\n }\n]\n\nstart = time.time()\nresponse = client.chat.completions.create(\n model=\"deepseek-ai/DeepSeek-OCR\",\n messages=messages,\n max_tokens=2048,\n temperature=0.0,\n extra_body={\n \"skip_special_tokens\": False,\n \"vllm_xargs\": {\n \"ngram_size\": 30,\n \"window_size\": 90,\n \"whitelist_token_ids\": [128821, 128822],\n },\n },\n)\nprint(f\"Response costs: {time.time() - start:.2f}s\")\nprint(f\"Generated text: {response.choices[0].message.content}\")\n```\n\n## Troubleshooting / Configuration Tips\n\n- **Use the custom logits processor** along with the model for optimal OCR and markdown\n generation performance.\n- Unlike multi-turn chat, OCR tasks do not typically benefit from prefix caching or image\n reuse, so disable these features to avoid unnecessary hashing and caching overhead.\n- DeepSeek-OCR works better with plain prompts than instruction formats. 
See the\n [official example prompts](https://github.com/deepseek-ai/DeepSeek-OCR/blob/2ac6d64a00656693b79c4f759a5e62c1b78bbeb1/DeepSeek-OCR-master/DeepSeek-OCR-vllm/config.py#L27-L37).\n- Depending on your hardware, adjust `max_num_batched_tokens` for better throughput.\n\n## References\n\n- [DeepSeek-OCR on Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-OCR)\n- [vLLM multimodal inputs guide](https://docs.vllm.ai/en/latest/features/multimodal_inputs.html#offline-inference)\n" + } + }, + "deepseek-ai/DeepSeek-R1": { + "hf_id": "deepseek-ai/DeepSeek-R1", + "meta": { + "title": "DeepSeek-R1", + "provider": "DeepSeek", + "description": "DeepSeek-R1 is a 671B-parameter MoE reasoning model built on the DeepSeek-V3 architecture, trained with large-scale reinforcement learning for strong chain-of-thought capabilities.", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified", + "b200": "verified", + "gb200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "671B" + }, + "recipe": { + "meta": { + "title": "DeepSeek-R1", + "slug": "deepseek-r1", + "provider": "DeepSeek", + "description": "DeepSeek-R1 is a 671B-parameter MoE reasoning model built on the DeepSeek-V3 architecture, trained with large-scale reinforcement learning for strong chain-of-thought capabilities.", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "Open-weights RL-trained reasoning model with native FP8 / FP4 variants", + "related_recipes": [ + "deepseek-ai/DeepSeek-V3", + "deepseek-ai/DeepSeek-V3.1" + ], + "hardware": { + "h200": "verified", + "b200": "verified", + "gb200": "verified" + } + }, + "model": { + "model_id": "deepseek-ai/DeepSeek-R1", + "min_vllm_version": "0.12.0", + "architecture": "moe", + "parameter_count": "671B", + "active_parameters": "37B", + "context_length": 163840, + "supports_dcp": true, + "base_args": [ + "--trust-remote-code", + "--enable-expert-parallel" + ], 
+ "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable DeepSeek-R1 tool calling with the deepseek_v3 tool-call parser.", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "deepseek_v3", + "--chat-template", + "examples/tool_chat_template_deepseekr1.jinja" + ] + }, + "reasoning": { + "description": "Enable reasoning/thinking mode with the DeepSeek R1 reasoning parser.", + "args": [ + "--reasoning-parser", + "deepseek_r1" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 805, + "description": "Native FP8 weights on 8xH200 (recommended)" + }, + "r1_0528": { + "model_id": "deepseek-ai/DeepSeek-R1-0528", + "precision": "fp8", + "vram_minimum_gb": 805, + "description": "May 2025 DeepSeek-R1 refresh (DeepSeek-R1-0528)" + }, + "nvfp4": { + "model_id": "nvidia/DeepSeek-R1-0528-NVFP4-v2", + "precision": "nvfp4", + "vram_minimum_gb": 403, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": { + "hopper": { + "extra_args": [], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP8": "1" + } + }, + "blackwell": { + "extra_args": [], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + }, + "amd": { + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_USE_AITER_MOE": "1", + "SAFETENSORS_FAST_GPU": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nDeepSeek-R1 is a 671B-parameter Mixture-of-Experts reasoning model (37B activated per\ntoken) that shares its architecture with DeepSeek-V3, so the same launch recipes apply\nto both. 
DeepSeek publishes a refreshed checkpoint as `DeepSeek-R1-0528`, and NVIDIA\npublishes an FP4 quantized variant (`nvidia/DeepSeek-R1-FP4`) that runs on Blackwell\nGPUs with fewer devices.\n\n## Prerequisites\n\n- **Hardware (FP8)**: 8x H200 GPUs (verified)\n- **Hardware (FP4)**: 4x B200 GPUs\n- **vLLM**: Install with `uv pip install -U vllm --torch-backend auto`\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n## Client Usage\n\n### 8xH200 (FP8)\n\nTensor Parallel + Expert Parallel (TP8+EP):\n```bash\nvllm serve deepseek-ai/DeepSeek-R1-0528 \\\n --trust-remote-code \\\n --tensor-parallel-size 8 \\\n --enable-expert-parallel\n```\n\nData Parallel + Expert Parallel (DP8+EP):\n```bash\nvllm serve deepseek-ai/DeepSeek-R1-0528 \\\n --trust-remote-code \\\n --data-parallel-size 8 \\\n --enable-expert-parallel\n```\n\n### 4xB200 (FP4)\n\nEnable FlashInfer MoE kernels before launching:\n```bash\n# For FP4 (recommended on Blackwell)\nexport VLLM_USE_FLASHINFER_MOE_FP4=1\n# For FP8 on Blackwell\nexport VLLM_USE_FLASHINFER_MOE_FP8=1\n```\n\nTensor Parallel + Expert Parallel (TP4+EP):\n```bash\nCUDA_VISIBLE_DEVICES=0,1,2,3 vllm serve nvidia/DeepSeek-R1-FP4 \\\n --trust-remote-code \\\n --tensor-parallel-size 4 \\\n --enable-expert-parallel\n```\n\nData Parallel + Expert Parallel (DP4+EP):\n```bash\nCUDA_VISIBLE_DEVICES=0,1,2,3 vllm serve nvidia/DeepSeek-R1-FP4 \\\n --trust-remote-code \\\n --data-parallel-size 4 \\\n --enable-expert-parallel\n```\n\n## Benchmarking\n\nFor benchmarking, disable prefix caching by adding `--no-enable-prefix-caching`\nto the server command.\n\n```bash\n# FP8 benchmark\nvllm bench serve \\\n --model deepseek-ai/DeepSeek-R1-0528 \\\n --dataset-name random \\\n --random-input-len 8000 \\\n --random-output-len 1000 \\\n --request-rate 10000 \\\n --num-prompts 16 \\\n --ignore-eos\n```\n\n```bash\n# FP4 benchmark\nvllm bench serve \\\n --model nvidia/DeepSeek-R1-FP4 \\\n --dataset-name random \\\n 
--random-input-len 8000 \\\n --random-output-len 1000 \\\n --request-rate 10000 \\\n --num-prompts 16 \\\n --ignore-eos\n```\n\nTest different workloads by adjusting input/output lengths:\n- Prompt-heavy: 8000 input / 1000 output\n- Decode-heavy: 1000 input / 8000 output\n- Balanced: 1000 input / 1000 output\n\n## Troubleshooting\n\n- **Disaggregated Serving with Wide EP (Experimental GB200)**: See\n [vLLM issue #33583](https://github.com/vllm-project/vllm/issues/33583),\n [the vLLM blog post](https://blog.vllm.ai/2026/02/03/dsr1-gb200-part1.html), and\n [this reference fork](https://github.com/minosfuture/vllm/tree/pd_gb200_0114/runs/DS-R1/fp4)\n for GB200 disaggregated serving recipes.\n\n## References\n\n- [DeepSeek-R1 on Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-R1)\n- [DeepSeek-R1-0528 on Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528)\n- [NVIDIA DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4)\n- [vLLM Expert Parallelism docs](https://docs.vllm.ai/en/latest/serving/expert_parallel_deployment.html)\n" + } + }, + "deepseek-ai/DeepSeek-V3.1": { + "hf_id": "deepseek-ai/DeepSeek-V3.1", + "meta": { + "title": "DeepSeek-V3.1", + "provider": "DeepSeek", + "description": "DeepSeek-V3.1 is a hybrid MoE model that supports dynamic switching between thinking and non-thinking modes, with tool calling and function execution.", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "671B" + }, + "recipe": { + "meta": { + "title": "DeepSeek-V3.1", + "slug": "deepseek-v3.1", + "provider": "DeepSeek", + "description": "DeepSeek-V3.1 is a hybrid MoE model that supports dynamic switching between thinking and non-thinking modes, with tool calling and function execution.", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "Hybrid thinking / non-thinking MoE with native FP8 and tool 
calling", + "related_recipes": [ + "deepseek-ai/DeepSeek-V3", + "deepseek-ai/DeepSeek-V3.2-Exp" + ], + "hardware": { + "h200": "verified" + } + }, + "model": { + "model_id": "deepseek-ai/DeepSeek-V3.1", + "min_vllm_version": "0.12.0", + "architecture": "moe", + "parameter_count": "671B", + "active_parameters": "37B", + "context_length": 163840, + "supports_dcp": true, + "base_args": [ + "--trust-remote-code", + "--enable-expert-parallel" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable DeepSeek-V3.1 tool calling with the deepseek_v31 tool-call parser.", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "deepseek_v31", + "--chat-template", + "examples/tool_chat_template_deepseekv31.jinja" + ] + }, + "reasoning": { + "description": "Dynamic thinking mode via chat_template_kwargs={'thinking': true|false}. No separate parser flag is required; the chat template emits ... content.", + "args": [ + "--reasoning-parser", + "deepseek_v3" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 805, + "description": "Native FP8 weights on 8xH200 (or H20) with 141GB per GPU" + }, + "nvfp4": { + "model_id": "nvidia/DeepSeek-V3.1-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 403, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": { + "amd": { + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_USE_AITER_MOE": "1", + "SAFETENSORS_FAST_GPU": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nDeepSeek-V3.1 is a hybrid MoE model that supports both thinking and 
non-thinking modes.\nYou can dynamically switch between the two modes from the client by passing\n`extra_body={\"chat_template_kwargs\": {\"thinking\": True|False}}`.\n\n## Prerequisites\n\n- **Hardware**: 8x H200 (or H20) GPUs (141 GB per GPU)\n- **vLLM**: Current stable release\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n## Launching DeepSeek-V3.1\n\n### Serving on 8xH200 (or H20) GPUs\n\n```bash\nvllm serve deepseek-ai/DeepSeek-V3.1 \\\n --enable-expert-parallel \\\n --tensor-parallel-size 8 \\\n --served-model-name ds31\n```\n\n### Function calling\n\nvLLM supports user-defined tool calling for DeepSeek-V3.1. Add these flags when launching\nthe server. The example chat template ships in the official container and can also be\ndownloaded from the vLLM repo:\n[`tool_chat_template_deepseekv31.jinja`](https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_deepseekv31.jinja).\n\n```bash\nvllm serve ... \\\n --enable-auto-tool-choice \\\n --tool-call-parser deepseek_v31 \\\n --chat-template examples/tool_chat_template_deepseekv31.jinja\n```\n\n## Client Usage\n\n### OpenAI Python SDK\n\nControl thinking mode via `extra_body={\"chat_template_kwargs\": {\"thinking\": False}}`\n(or `True` to enable thinking).\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nmodel = client.models.list().data[0].id\n\nmessages = [\n {\"role\": \"system\", \"content\": \"You are a helpful assistant\"},\n {\"role\": \"user\", \"content\": \"Who are you?\"},\n {\"role\": \"assistant\", \"content\": \"<think>Hmm</think>I am DeepSeek\"},\n {\"role\": \"user\", \"content\": \"9.11 and 9.8, which is greater?\"},\n]\nresponse = client.chat.completions.create(\n model=model,\n messages=messages,\n extra_body={\"chat_template_kwargs\": {\"thinking\": False}},\n)\nprint(response.choices[0].message.content)\n```\n\nWhen `thinking=True`, output includes a `<think>...</think>` segment 
delimiting chain-of-thought;\nwhen `thinking=False`, the model produces a direct answer without the thinking segment.\n\n### curl\n\n```bash\ncurl http://localhost:8000/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"ds31\",\n \"messages\": [\n {\"role\": \"user\", \"content\": \"9.11 and 9.8, which is greater?\"}\n ],\n \"chat_template_kwargs\": {\"thinking\": true}\n }'\n```\n\n## References\n\n- [DeepSeek-V3.1 on Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V3.1)\n- [vLLM tool chat template for DeepSeek-V3.1](https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_deepseekv31.jinja)\n" + } + }, + "deepseek-ai/DeepSeek-V3.2-Exp": { + "hf_id": "deepseek-ai/DeepSeek-V3.2-Exp", + "meta": { + "title": "DeepSeek-V3.2-Exp", + "provider": "DeepSeek", + "description": "Experimental DeepSeek-V3.2 preview with sparse attention (MQA-like logits) and FP8 KV cache; architecture matches DeepSeek-V3.1 except for the sparse attention mechanism.", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "671B" + }, + "recipe": { + "meta": { + "title": "DeepSeek-V3.2-Exp", + "slug": "deepseek-v3.2-exp", + "provider": "DeepSeek", + "description": "Experimental DeepSeek-V3.2 preview with sparse attention (MQA-like logits) and FP8 KV cache; architecture matches DeepSeek-V3.1 except for the sparse attention mechanism.", + "date_updated": "2026-04-17", + "difficulty": "advanced", + "tasks": [ + "text" + ], + "performance_headline": "Sparse attention MoE with FP8 KV cache and strong GSM8K score (~0.96)", + "related_recipes": [ + "deepseek-ai/DeepSeek-V3.1", + "deepseek-ai/DeepSeek-V3.2" + ], + "hardware": { + "h200": "verified" + } + }, + "model": { + "model_id": "deepseek-ai/DeepSeek-V3.2-Exp", + "min_vllm_version": "0.12.0", + "architecture": "moe", + "parameter_count": "671B", + "active_parameters": "37B", + "context_length": 163840, 
+ "supports_dcp": true, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "dependencies": [ + { + "note": "DeepGEMM pinned build for MQA logits (FP8 MoE kernels)", + "command": "uv pip install git+https://github.com/deepseek-ai/DeepGEMM.git@v2.1.1.post3 --no-build-isolation" + } + ], + "features": { + "tool_calling": { + "description": "Enable tool calling with DeepSeek V3.2 chat template support.", + "args": [ + "--tokenizer-mode", + "deepseek_v32", + "--tool-call-parser", + "deepseek_v32", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "Dynamic thinking mode via chat_template_kwargs, same as DeepSeek-V3.1.", + "args": [ + "--reasoning-parser", + "deepseek_v3" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 805, + "description": "Native FP8 weights on 8xH200 (or H20, or 8xB200) with FP8 KV cache default" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep" + ], + "hardware_overrides": { + "amd": { + "extra_args": [ + "--block-size", + "1", + "--kv-cache-dtype", + "bfloat16", + "--no-enable-prefix-caching", + "--max-num-batched-tokens", + "32768" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_USE_AITER_MOE": "1", + "SAFETENSORS_FAST_GPU": "1", + "VLLM_RPC_TIMEOUT": "18000000" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nDeepSeek-V3.2-Exp is a sparse-attention MoE preview. Its main architecture is similar to\nDeepSeek-V3.1, with a sparse attention mechanism. 
Only Hopper and Blackwell data center\nGPUs are supported for now.\n\n## Prerequisites\n\n- **Hardware**: 8x H200 (or H20, or 8xB200) GPUs\n- **vLLM**: Current stable release\n- **DeepGEMM**: Required for MQA logits computation (and optionally for MoE)\n\n```bash\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\nuv pip install git+https://github.com/deepseek-ai/DeepGEMM.git@v2.1.1.post3 --no-build-isolation\n```\n\nNote: DeepGEMM is used both for MoE and for MQA logits computation. It is required for\nMQA logits. To disable the MoE path only, set `VLLM_USE_DEEP_GEMM=0`. Some users report\nbetter performance with `VLLM_USE_DEEP_GEMM=0` (e.g. on H20), and this also skips the\nlong warmup.\n\n## Launching DeepSeek-V3.2-Exp\n\n### Serving on 8xH200 (or H20) GPUs\n\nUsing the recommended EP/DP mode:\n```bash\nvllm serve deepseek-ai/DeepSeek-V3.2-Exp -dp 8 --enable-expert-parallel\n```\n\nUsing tensor parallel:\n```bash\nvllm serve deepseek-ai/DeepSeek-V3.2-Exp -tp 8\n```\n\n### Serving on 8xB200 GPUs\n\nSame as the above.\n\n## Accuracy Benchmarking\n\n```bash\nlm-eval --model local-completions --tasks gsm8k \\\n --model_args model=deepseek-ai/DeepSeek-V3.2-Exp,base_url=http://127.0.0.1:8000/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False\n```\n\nReported GSM8K score: `0.9591` (5-shot) and `0.9538` (20-shot).\n\n## Performance Tips\n\n1. The kernels are mainly optimized for TP=1, so it is recommended to run this model\n under EP/DP mode (e.g. DP=8, EP=8, TP=1). If you hit any errors or hangs, try tensor\n parallel instead. Simple TP works and is more robust, but the performance is not optimal.\n2. The default config uses a custom `fp8` KV cache. You can also use `bfloat16` KV cache\n by specifying `kv_cache_dtype=bfloat16`. FP8 allows more tokens to be cached but incurs\n quantization/dequantization overhead. 
Use `bfloat16` for short requests and `fp8` for\n long requests.\n\n## Troubleshooting\n\n- **`CUDA error (flashmla-src/csrc/smxx/mla_combine.cu:201): invalid configuration argument`**:\n This may be caused by too large a batch size. Try `--max-num-seqs 256` or smaller\n (default is 1024).\n- For thinking-mode toggling, refer to the DeepSeek-V3.1 recipe (`deepseek-ai/DeepSeek-V3.1`).\n\n## References\n\n- [DeepSeek-V3.2-Exp on Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp)\n- [End-to-end tutorial (Jupyter Notebook)](https://github.com/vllm-project/recipes/blob/main/DeepSeek/DeepSeek_v3_2_vLLM_getting_started_guide.ipynb)\n" + } + }, + "deepseek-ai/DeepSeek-V3.2": { + "hf_id": "deepseek-ai/DeepSeek-V3.2", + "meta": { + "title": "DeepSeek-V3.2", + "provider": "DeepSeek", + "description": "DeepSeek V3.2 MoE model with MLA attention, sparse attention, and scalable RL for strong reasoning and agent capabilities.", + "tasks": [ + "text" + ], + "hardware": { + "h100": "verified", + "h200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "671B" + }, + "recipe": { + "meta": { + "title": "DeepSeek-V3.2", + "slug": "deepseek-v3.2", + "provider": "DeepSeek", + "description": "DeepSeek V3.2 MoE model with MLA attention, sparse attention, and scalable RL for strong reasoning and agent capabilities.", + "date_updated": "2026-04-01", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "GPT-5-level reasoning with efficient MoE inference", + "related_recipes": [], + "hardware": { + "h100": "verified", + "h200": "verified" + } + }, + "model": { + "model_id": "deepseek-ai/DeepSeek-V3.2", + "min_vllm_version": "0.18.0", + "architecture": "moe", + "parameter_count": "671B", + "active_parameters": "37B", + "context_length": 163840, + "supports_dcp": true, + "base_args": [ + "--trust-remote-code", + "--kernel-config.enable_flashinfer_autotune=False" + ], + "base_env": {} + }, + "dependencies": [ + { 
+ "note": "DeepGEMM pinned build for MQA logits (FP8 MoE kernels)", + "command": "uv pip install git+https://github.com/deepseek-ai/DeepGEMM.git@v2.1.1.post3 --no-build-isolation" + }, + { + "note": "Set VLLM_USE_DEEP_GEMM=0 to skip DeepGEMM for the MoE path (recommended on H20)", + "command": "export VLLM_USE_DEEP_GEMM=0", + "optional": true + } + ], + "features": { + "tool_calling": { + "description": "Enable tool calling with DeepSeek V3.2 chat template support.", + "args": [ + "--tokenizer-mode", + "deepseek_v32", + "--tool-call-parser", + "deepseek_v32", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "Enable reasoning/thinking mode with the DeepSeek V3 reasoning parser.", + "args": [ + "--reasoning-parser", + "deepseek_v3" + ] + }, + "spec_decoding": { + "description": "Multi-Token Prediction speculative decoding with 3 speculative tokens.", + "args": [ + "--speculative_config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":3}" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 805, + "description": "Native FP8 checkpoint (F8_E4M3)" + }, + "nvfp4": { + "model_id": "nvidia/DeepSeek-V3.2-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 403, + "description": "NVIDIA FP4 quantized variant with FP8 KV cache for reduced VRAM usage.", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": { + "blackwell": { + "extra_args": [ + "--attention-backend", + "FLASHINFER_MLA" + ], + "extra_env": {} + }, + "amd": { + "extra_args": [], + "extra_env": {} + } + }, + "strategy_overrides": { + "pd_cluster": { + "prefill": { + "extra_args": [], + "extra_env": {} + }, + "decode": { + 
"extra_args": [], + "extra_env": {} + } + }, + "single_node_dep": { + "extra_args": [], + "extra_env": {} + } + }, + "guide": "## Overview\n\nDeepSeek-V3.2 is a Mixture-of-Experts model that balances computational efficiency with\nstrong reasoning and agent capabilities through three technical innovations: DeepSeek Sparse\nAttention (DSA) for efficient long-context processing, a scalable reinforcement learning\nframework achieving GPT-5-level performance, and a large-scale agentic task synthesis pipeline\nfor robust tool-use generalization.\n\n## Prerequisites\n\n- **Hardware**: Minimum 8x H100/H200 80GB GPUs (BF16) or 3x H200 (NVFP4 variant).\n- **vLLM**: Version 0.18.0 or later (nightly recommended).\n- **Python**: 3.10+\n- **CUDA**: 12.x or later (CUDA 13.x may require extra env vars; see Troubleshooting).\n- **Disk**: ~1.3 TB for BF16 weights; ~350 GB for NVFP4 variant.\n- **DeepGEMM** (recommended):\n ```bash\n uv pip install git+https://github.com/deepseek-ai/DeepGEMM.git@v2.1.1.post3 --no-build-isolation\n ```\n Note: Set `VLLM_USE_DEEP_GEMM=0` to disable MoE DeepGEMM if you experience issues\n (e.g., on H20 GPUs) or want to skip the long warmup.\n\n## Client Usage\n\nLaunch the server:\n```bash\nvllm serve deepseek-ai/DeepSeek-V3.2 \\\n --tensor-parallel-size 8 \\\n --trust-remote-code \\\n --kernel-config.enable_flashinfer_autotune=False \\\n --tokenizer-mode deepseek_v32 \\\n --tool-call-parser deepseek_v32 \\\n --enable-auto-tool-choice \\\n --reasoning-parser deepseek_v3\n```\n\nUse the OpenAI Python SDK to interact with the server:\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(\n api_key=\"your-api-key\",\n base_url=\"http://localhost:8000/v1\",\n)\n\n# Standard chat\nresponse = client.chat.completions.create(\n model=\"deepseek-ai/DeepSeek-V3.2\",\n messages=[{\"role\": \"user\", \"content\": \"Hello!\"}],\n)\n\n# Thinking / reasoning mode\nresponse = client.chat.completions.create(\n model=\"deepseek-ai/DeepSeek-V3.2\",\n 
messages=[{\"role\": \"user\", \"content\": \"Solve this step by step...\"}],\n extra_body={\"chat_template_kwargs\": {\"thinking\": True}},\n)\n```\n\n## Troubleshooting\n\n**`ptxas fatal: Value 'sm_110a' is not defined for option 'gpu-name'`**\nThis can occur on CUDA 13.x. Fix by exporting:\n```bash\nexport TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas\nexport PATH=/usr/local/cuda/bin:$PATH\nexport LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}\n```\n\n**TP=8 performance on Hopper/Blackwell**\nAvoid `-tp 8` with FlashMLA-Sparse. Due to kernel restrictions, TP=8 yields only 16 heads\nper rank but is padded to 64, causing overhead. Prefer TP=2 (Hopper) or TP=1 (Blackwell)\nwith DP/EP mode: `vllm serve deepseek-ai/DeepSeek-V3.2 -dp 8 --enable-expert-parallel`.\n\n**DeepGEMM warmup too slow**\nSet `VLLM_USE_DEEP_GEMM=0` to disable MoE DeepGEMM and skip the long warmup.\n\n## References\n\n- [DeepSeek-V3.2 on Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V3.2)\n- [DeepSeek API Documentation](https://api-docs.deepseek.com/)\n- [vLLM Data Parallel Deployment Guide](https://docs.vllm.ai/en/latest/serving/data_parallel_deployment.html)\n" + } + }, + "deepseek-ai/DeepSeek-V3": { + "hf_id": "deepseek-ai/DeepSeek-V3", + "meta": { + "title": "DeepSeek-V3", + "provider": "DeepSeek", + "description": "DeepSeek-V3 is a 671B-parameter Mixture-of-Experts model with native FP8 weights and strong reasoning, coding, and math capabilities.", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified", + "b200": "verified", + "gb200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "671B" + }, + "recipe": { + "meta": { + "title": "DeepSeek-V3", + "slug": "deepseek-v3", + "provider": "DeepSeek", + "description": "DeepSeek-V3 is a 671B-parameter Mixture-of-Experts model with native FP8 weights and strong reasoning, coding, and math capabilities.", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + 
"text" + ], + "performance_headline": "Frontier open-weights MoE with native FP8 and FP4 variants", + "related_recipes": [ + "deepseek-ai/DeepSeek-R1", + "deepseek-ai/DeepSeek-V3.1" + ], + "hardware": { + "h200": "verified", + "b200": "verified", + "gb200": "verified" + } + }, + "model": { + "model_id": "deepseek-ai/DeepSeek-V3", + "min_vllm_version": "0.12.0", + "architecture": "moe", + "parameter_count": "671B", + "active_parameters": "37B", + "context_length": 163840, + "supports_dcp": true, + "base_args": [ + "--trust-remote-code", + "--enable-expert-parallel" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable DeepSeek-V3 tool calling with the deepseek_v3 tool-call parser.", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "deepseek_v3", + "--chat-template", + "examples/tool_chat_template_deepseekv3.jinja" + ] + }, + "reasoning": { + "description": "Enable reasoning/thinking mode with the DeepSeek R1 reasoning parser.", + "args": [ + "--reasoning-parser", + "deepseek_v3" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 805, + "description": "Native FP8 weights on 8xH200 (recommended)" + }, + "fp4": { + "model_id": "nvidia/DeepSeek-V3-FP4", + "precision": "fp4", + "vram_minimum_gb": 403, + "description": "NVIDIA FP4 quantized weights for Blackwell (e.g. 
4xB200)", + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": { + "hopper": { + "extra_args": [], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP8": "1" + } + }, + "blackwell": { + "extra_args": [], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + }, + "amd": { + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_USE_AITER_MOE": "1", + "SAFETENSORS_FAST_GPU": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nDeepSeek-V3 is a 671B-parameter Mixture-of-Experts model (37B activated per token)\nshipped with native FP8 weights. It shares its architecture with DeepSeek-R1, so the\nsame launch recipes apply to both models. For Blackwell GPUs, NVIDIA publishes an FP4\nquantized variant (`nvidia/DeepSeek-V3-FP4` / `nvidia/DeepSeek-R1-FP4`) that runs on\nfewer GPUs.\n\n## Prerequisites\n\n- **Hardware (FP8)**: 8x H200 GPUs (verified)\n- **Hardware (FP4)**: 4x B200 GPUs\n- **vLLM**: Install with `uv pip install -U vllm --torch-backend auto`\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n## Client Usage\n\n### 8xH200 (FP8)\n\nTensor Parallel + Expert Parallel (TP8+EP):\n```bash\nvllm serve deepseek-ai/DeepSeek-V3 \\\n --trust-remote-code \\\n --tensor-parallel-size 8 \\\n --enable-expert-parallel\n```\n\nData Parallel + Expert Parallel (DP8+EP):\n```bash\nvllm serve deepseek-ai/DeepSeek-V3 \\\n --trust-remote-code \\\n --data-parallel-size 8 \\\n --enable-expert-parallel\n```\n\n### 4xB200 (FP4)\n\nEnable FlashInfer MoE kernels before launching:\n```bash\n# For FP4 (recommended on Blackwell)\nexport VLLM_USE_FLASHINFER_MOE_FP4=1\n# For FP8 on Blackwell\nexport VLLM_USE_FLASHINFER_MOE_FP8=1\n```\n\nTensor Parallel + Expert Parallel 
(TP4+EP):\n```bash\nCUDA_VISIBLE_DEVICES=0,1,2,3 vllm serve nvidia/DeepSeek-V3-FP4 \\\n --trust-remote-code \\\n --tensor-parallel-size 4 \\\n --enable-expert-parallel\n```\n\nData Parallel + Expert Parallel (DP4+EP):\n```bash\nCUDA_VISIBLE_DEVICES=0,1,2,3 vllm serve nvidia/DeepSeek-V3-FP4 \\\n --trust-remote-code \\\n --data-parallel-size 4 \\\n --enable-expert-parallel\n```\n\n## Benchmarking\n\nFor benchmarking, disable prefix caching by adding `--no-enable-prefix-caching`\nto the server command.\n\n```bash\n# Prompt-heavy benchmark (8k/1k)\nvllm bench serve \\\n --model deepseek-ai/DeepSeek-V3 \\\n --dataset-name random \\\n --random-input-len 8000 \\\n --random-output-len 1000 \\\n --request-rate 10000 \\\n --num-prompts 16 \\\n --ignore-eos\n```\n\nTest different workloads by adjusting input/output lengths:\n- Prompt-heavy: 8000 input / 1000 output\n- Decode-heavy: 1000 input / 8000 output\n- Balanced: 1000 input / 1000 output\n\n## Troubleshooting\n\n- **Disaggregated Serving with Wide EP (Experimental GB200)**: See\n [vLLM issue #33583](https://github.com/vllm-project/vllm/issues/33583) and the\n [vLLM blog post](https://blog.vllm.ai/2026/02/03/dsr1-gb200-part1.html) for\n GB200 disaggregated serving recipes.\n\n## References\n\n- [DeepSeek-V3 on Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V3)\n- [NVIDIA DeepSeek-V3-FP4](https://huggingface.co/nvidia/DeepSeek-V3-FP4)\n- [vLLM Expert Parallelism docs](https://docs.vllm.ai/en/latest/serving/expert_parallel_deployment.html)\n" + } + }, + "deepseek-ai/DeepSeek-V4-Flash": { + "hf_id": "deepseek-ai/DeepSeek-V4-Flash", + "meta": { + "title": "DeepSeek-V4-Flash", + "provider": "DeepSeek", + "description": "DeepSeek V4 MoE model with hybrid CSA+HCA attention, manifold-constrained hyper-connections, and three-tier reasoning (Non-think / Think High / Think Max).", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified", + "b200": "verified", + "gb200": "verified", + "b300": "verified", + "gb300": 
"verified", + "dgx_station_gb300": "verified", + "mi300x": "unsupported", + "mi325x": "unsupported", + "mi355x": "unsupported" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "284B" + }, + "recipe": { + "meta": { + "title": "DeepSeek-V4-Flash", + "slug": "deepseek-v4-flash", + "provider": "DeepSeek", + "description": "DeepSeek V4 MoE model with hybrid CSA+HCA attention, manifold-constrained hyper-connections, and three-tier reasoning (Non-think / Think High / Think Max).", + "date_updated": "2026-05-28", + "difficulty": "hard", + "tasks": [ + "text" + ], + "performance_headline": "Compact 284B/13B V4 sibling \u2014 single-node 1M-context serving with FP4+FP8 weights and MTP", + "related_recipes": [], + "hardware": { + "h200": "verified", + "b200": "verified", + "gb200": "verified", + "b300": "verified", + "gb300": "verified", + "dgx_station_gb300": "verified", + "mi300x": "unsupported", + "mi325x": "unsupported", + "mi355x": "unsupported" + } + }, + "model": { + "model_id": "deepseek-ai/DeepSeek-V4-Flash", + "min_vllm_version": "0.20.0", + "architecture": "moe", + "parameter_count": "284B", + "active_parameters": "13B", + "context_length": 1048576, + "flashinfer_autotune": true, + "base_args": [ + "--trust-remote-code", + "--kv-cache-dtype", + "fp8", + "--block-size", + "256" + ] + }, + "dependencies": [ + { + "note": "DeepGEMM FP8 kernels \u2014 install via vLLM tools/install_deepgemm.sh", + "command": "bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)" + } + ], + "features": { + "tool_calling": { + "description": "Enable tool calling with DeepSeek V4 chat template support.", + "args": [ + "--tokenizer-mode", + "deepseek_v4", + "--tool-call-parser", + "deepseek_v4", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "Enable reasoning/thinking mode with the DeepSeek V4 reasoning parser.", + "args": [ + "--reasoning-parser", + "deepseek_v4" + ] + }, + "spec_decoding": 
{ + "description": "Multi-Token Prediction speculative decoding with 2 speculative tokens (1 on Hopper).", + "args": [ + "--speculative_config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":2}" + ], + "hardware_overrides": { + "hopper": { + "args": [ + "--speculative_config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":1}" + ] + } + } + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 170, + "description": "Native FP4+FP8 mixed checkpoint (MoE experts FP4, remaining params FP8)" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_dep", + "pd_cluster" + ], + "default_strategy": "single_node_tep", + "hardware_overrides": { + "blackwell": { + "extra_args": [ + "--attention_config.use_fp4_indexer_cache=True", + "--moe-backend", + "deep_gemm_mega_moe" + ] + } + }, + "strategy_overrides": { + "single_node_tp": { + "tp": 8, + "extra_args": [ + "--no-enable-flashinfer-autotune" + ], + "hardware_overrides": { + "blackwell": { + "extra_args": [ + "--attention_config.use_fp4_indexer_cache=True" + ] + } + } + }, + "single_node_dep": { + "extra_args": [ + "--data-parallel-size", + "4", + "--compilation-config", + "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\", \"custom_ops\":[\"all\"]}" + ] + }, + "multi_node_dep": { + "extra_args": [ + "--compilation-config", + "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\", \"custom_ops\":[\"all\"]}" + ] + }, + "pd_cluster": { + "env": { + "VLLM_USE_NCCL_SYMM_MEM": "1", + "NCCL_CUMEM_ENABLE": "1", + "NCCL_MNNVL_ENABLE": "1", + "NCCL_NVLS_ENABLE": "1" + }, + "prefill": { + "nodes": 1, + "parallelism": "dep", + "vllm_args": [ + "--enforce-eager", + "--max-num-seqs", + "8", + "--max-num-batched-tokens", + "65536", + "--no-disable-hybrid-kv-cache-manager", + "--enable-sleep-mode" + ], + "env": {} + }, + "decode": { + "nodes": 1, + "parallelism": "dep", + "vllm_args": [ + "--max-num-seqs", + "1536", + 
"--max-num-batched-tokens", + "1536", + "--max-cudagraph-capture-size", + "1536", + "--compilation-config", + "{\"cudagraph_mode\":\"FULL_DECODE_ONLY\"}", + "--no-disable-hybrid-kv-cache-manager", + "--enable-sleep-mode" + ], + "env": {} + } + } + }, + "guide": "## Overview\n\nDeepSeek-V4-Flash is a 284B-total / 13B-active MoE model in the V4 preview family.\nIt pairs a **hybrid attention stack** \u2014 Compressed Sparse Attention (CSA) + Heavily\nCompressed Attention (HCA) \u2014 with **Manifold-Constrained Hyper-Connections (mHC)** to\nreach 27% of V3.2's per-token inference FLOPs and 10% of V3.2's KV cache at 1M\ncontext. Pre-trained on 32T+ tokens; post-training is a two-stage pipeline (domain-\nspecific expert cultivation + unified consolidation via on-policy distillation).\n\nCheckpoint is **FP4+FP8 mixed**: MoE expert weights are stored in FP4 while the\nremaining (attention / norm / router) params stay in FP8.\n\n## Reasoning modes\n\nThe chat template exposes three reasoning-effort modes:\n\n- **Non-think** \u2014 fast, intuitive responses.\n- **Think High** \u2014 explicit chain-of-thought for logical analysis and planning.\n- **Think Max** \u2014 maximum reasoning effort; requires `--max-model-len >= 393216`\n (384K tokens) to avoid truncation.\n\nRecommended sampling: `temperature = 1.0`, `top_p = 1.0`.\n\n### OpenAI Client Example\n\nFor DeepSeek-V4, keep reasoning controls in `chat_template_kwargs`, as it exposes a\ncustom **Think Max** mode via `\"reasoning_effort\": \"max\"`.\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\nmodel = \"deepseek-ai/DeepSeek-V4-Flash\"\nmessages = [{\"role\": \"user\", \"content\": \"What is 17*19? 
Return only the final integer.\"}]\n\n# Non-think\nresp = client.chat.completions.create(\n model=model,\n messages=messages,\n)\n\n# Think High\nresp = client.chat.completions.create(\n model=model,\n messages=messages,\n extra_body={\n \"chat_template_kwargs\": {\n \"thinking\": True,\n \"reasoning_effort\": \"high\",\n },\n },\n)\n\n# Think Max\nresp = client.chat.completions.create(\n model=model,\n messages=messages,\n extra_body={\n \"chat_template_kwargs\": {\n \"thinking\": True,\n \"reasoning_effort\": \"max\",\n },\n },\n)\n```\n\n## Recommended deployment\n\nNon-disaggregated serving on multi-GPU supported hardware: single-node DP + EP with\n`--data-parallel-size 4`. Fills a GB200 NVL4 tray exactly; uses 4 of 8 GPUs per\nreplica on H200/B200/B300 (leaving headroom for throughput-vs-latency tuning).\nOn DGX Station, use the single-GPU launch below. For disaggregated prefill/decode\non GB200, use the PD Cluster tab.\n\n### DGX Station Single-GPU\n\n```bash\nvllm serve deepseek-ai/DeepSeek-V4-Flash \\\n --tensor-parallel-size 1 --pipeline-parallel-size 1 \\\n --kv-cache-dtype fp8 --trust-remote-code --block-size 256 \\\n --gpu-memory-utilization 0.92 \\\n --compilation-config '{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\",\"custom_ops\":[\"all\"]}' \\\n --attention_config.use_fp4_indexer_cache True \\\n --tokenizer-mode deepseek_v4 --tool-call-parser deepseek_v4 \\\n --enable-auto-tool-choice --reasoning-parser deepseek_v4 \\\n --max-cudagraph-capture-size 128 \\\n --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'\n```\n\n### H200 Single-Node PD (Mooncake)\n\nSingle-host disaggregated serving: 4 prefill GPUs + 4 decode GPUs on one 8-GPU H200 node,\nusing MooncakeConnector over RDMA for KV cache transfer.\n\n**Prefill** (GPUs 0\u20133, port 8000):\n\n```bash\ndocker run --gpus all \\\n --privileged --ipc=host -p 8000:8000 \\\n --network host \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n -v /mnt/shared:/mnt/shared \\\n -e 
TILELANG_CLEANUP_TEMP_FILES=1 \\\n -e VLLM_DISABLE_COMPILE_CACHE=1 \\\n -e VLLM_ENGINE_READY_TIMEOUT_S=3600 \\\n -e VLLM_RPC_TIMEOUT=600000 \\\n -e VLLM_LOG_STATS_INTERVAL=1 \\\n -e VLLM_MOONCAKE_BOOTSTRAP_PORT=8998 \\\n -e CUDA_VISIBLE_DEVICES=0,1,2,3 \\\n vllm/vllm-openai:latest \\\n deepseek-ai/DeepSeek-V4-Flash \\\n --trust-remote-code \\\n --kv-cache-dtype fp8 \\\n --block-size 256 \\\n --port 8000 \\\n --data-parallel-size 4 \\\n --enable-expert-parallel \\\n --tokenizer-mode deepseek_v4 \\\n --reasoning-parser deepseek_v4 \\\n --max-model-len auto \\\n --max-num-batched-tokens 16384 \\\n --max-num-seqs 8 \\\n --enforce-eager \\\n --no-disable-hybrid-kv-cache-manager \\\n --disable-uvicorn-access-log \\\n --kv-transfer-config '{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_both\",\"kv_load_failure_policy\":\"fail\",\"kv_buffer_device\":\"cuda\",\"kv_connector_extra_config\":{\"enforce_handshake_compat\":false,\"mooncake_protocol\":\"rdma\"}}'\n```\n\n**Decode** (GPUs 4\u20137, port 8001):\n\n```bash\ndocker run --gpus all \\\n --privileged --ipc=host -p 8001:8001 \\\n --network host \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n -v /mnt/shared:/mnt/shared \\\n -e TILELANG_CLEANUP_TEMP_FILES=1 \\\n -e VLLM_DISABLE_COMPILE_CACHE=1 \\\n -e VLLM_ENGINE_READY_TIMEOUT_S=3600 \\\n -e VLLM_RPC_TIMEOUT=600000 \\\n -e VLLM_LOG_STATS_INTERVAL=1 \\\n -e VLLM_MOONCAKE_BOOTSTRAP_PORT=9889 \\\n -e CUDA_VISIBLE_DEVICES=4,5,6,7 \\\n vllm/vllm-openai:latest \\\n deepseek-ai/DeepSeek-V4-Flash \\\n --trust-remote-code \\\n --kv-cache-dtype fp8 \\\n --block-size 256 \\\n --port 8001 \\\n --data-parallel-size 4 \\\n --enable-expert-parallel \\\n --tokenizer-mode deepseek_v4 \\\n --reasoning-parser deepseek_v4 \\\n --max-model-len auto \\\n --max-num-seqs 512 \\\n --max-num-batched-tokens 512 \\\n --compilation-config '{\"mode\":0,\"cudagraph_mode\":\"FULL_DECODE_ONLY\",\"max_cudagraph_capture_size\":512,\"compile_ranges_endpoints\":[512]}' \\\n 
--no-disable-hybrid-kv-cache-manager \\\n --disable-uvicorn-access-log \\\n --kv-transfer-config '{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_both\",\"kv_load_failure_policy\":\"fail\",\"kv_buffer_device\":\"cuda\",\"kv_connector_extra_config\":{\"enforce_handshake_compat\":false,\"mooncake_protocol\":\"rdma\"}}'\n```\n\n**[Router](https://github.com/vllm-project/router)**:\n\n```bash\npip install vllm-router\n\nvllm-router --policy round_robin \\\n --vllm-pd-disaggregation \\\n --prefill http://localhost:8000 \\\n --decode http://localhost:8001 \\\n --host 127.0.0.1 \\\n --port 30000 \\\n --intra-node-data-parallel-size 4 \\\n --kv-connector mooncake\n```\n\n### H200 Single-Node PD (Nixl)\n\nSingle-host disaggregated serving: 4 prefill GPUs + 4 decode GPUs on one 8-GPU H200 node,\nusing NixlConnector for KV cache transfer.\n\n**Prefill** (GPUs 0\u20133, port 8000):\n\n```bash\ndocker run --gpus all \\\n --privileged --ipc=host -p 8000:8000 \\\n --network host \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n -v /mnt/shared:/mnt/shared \\\n -e TILELANG_CLEANUP_TEMP_FILES=1 \\\n -e VLLM_DISABLE_COMPILE_CACHE=1 \\\n -e VLLM_ENGINE_READY_TIMEOUT_S=3600 \\\n -e VLLM_RPC_TIMEOUT=600000 \\\n -e VLLM_LOG_STATS_INTERVAL=1 \\\n -e VLLM_NIXL_SIDE_CHANNEL_PORT=5557 \\\n -e CUDA_VISIBLE_DEVICES=0,1,2,3 \\\n vllm/vllm-openai:latest \\\n deepseek-ai/DeepSeek-V4-Flash \\\n --trust-remote-code \\\n --kv-cache-dtype fp8 \\\n --block-size 256 \\\n --port 8000 \\\n --data-parallel-size 4 \\\n --enable-expert-parallel \\\n --tokenizer-mode deepseek_v4 \\\n --reasoning-parser deepseek_v4 \\\n --max-model-len auto \\\n --max-num-batched-tokens 16384 \\\n --max-num-seqs 8 \\\n --enforce-eager \\\n --no-disable-hybrid-kv-cache-manager \\\n --disable-uvicorn-access-log \\\n --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'\n```\n\n**Decode** (GPUs 4\u20137, port 8001):\n\n```bash\ndocker run --gpus all \\\n --privileged --ipc=host -p 
8001:8001 \\\n --network host \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n -v /mnt/shared:/mnt/shared \\\n -e TILELANG_CLEANUP_TEMP_FILES=1 \\\n -e VLLM_DISABLE_COMPILE_CACHE=1 \\\n -e VLLM_ENGINE_READY_TIMEOUT_S=3600 \\\n -e VLLM_RPC_TIMEOUT=600000 \\\n -e VLLM_LOG_STATS_INTERVAL=1 \\\n -e VLLM_NIXL_SIDE_CHANNEL_PORT=5558 \\\n -e CUDA_VISIBLE_DEVICES=4,5,6,7 \\\n vllm/vllm-openai:latest \\\n deepseek-ai/DeepSeek-V4-Flash \\\n --trust-remote-code \\\n --kv-cache-dtype fp8 \\\n --block-size 256 \\\n --port 8001 \\\n --data-parallel-size 4 \\\n --enable-expert-parallel \\\n --tokenizer-mode deepseek_v4 \\\n --reasoning-parser deepseek_v4 \\\n --max-model-len auto \\\n --max-num-seqs 512 \\\n --max-num-batched-tokens 512 \\\n --compilation-config '{\"mode\":0,\"cudagraph_mode\":\"FULL_DECODE_ONLY\",\"max_cudagraph_capture_size\":512,\"compile_ranges_endpoints\":[512]}' \\\n --no-disable-hybrid-kv-cache-manager \\\n --disable-uvicorn-access-log \\\n --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'\n```\n\n**[Router](https://github.com/vllm-project/router)**:\n\n```bash\npip install vllm-router\n\nvllm-router --policy round_robin \\\n --vllm-pd-disaggregation \\\n --prefill http://localhost:8000 \\\n --decode http://localhost:8001 \\\n --host 127.0.0.1 \\\n --port 30000 \\\n --intra-node-data-parallel-size 4 \\\n --kv-connector nixl\n```\n" + } + }, + "deepseek-ai/DeepSeek-V4-Pro": { + "hf_id": "deepseek-ai/DeepSeek-V4-Pro", + "meta": { + "title": "DeepSeek-V4-Pro", + "provider": "DeepSeek", + "description": "DeepSeek V4 flagship MoE (1.6T total / 49B active) with hybrid CSA+HCA attention, manifold-constrained hyper-connections, Muon-trained on 32T+ tokens, and three-tier reasoning.", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified", + "b200": "verified", + "gb200": "verified", + "b300": "verified", + "gb300": "verified", + "mi300x": "unsupported", + "mi325x": "unsupported", + "mi355x": "unsupported" + } + 
}, + "model_info": { + "architecture": "moe", + "parameter_count": "1600B" + }, + "recipe": { + "meta": { + "title": "DeepSeek-V4-Pro", + "slug": "deepseek-v4-pro", + "provider": "DeepSeek", + "description": "DeepSeek V4 flagship MoE (1.6T total / 49B active) with hybrid CSA+HCA attention, manifold-constrained hyper-connections, Muon-trained on 32T+ tokens, and three-tier reasoning.", + "date_updated": "2026-04-24", + "difficulty": "hard", + "tasks": [ + "text" + ], + "performance_headline": "Frontier 1.6T/49B reasoning MoE with native FP4+FP8 weights, MTP speculative decoding, and 1M-token context", + "related_recipes": [], + "hardware": { + "h200": "verified", + "b200": "verified", + "gb200": "verified", + "b300": "verified", + "gb300": "verified", + "mi300x": "unsupported", + "mi325x": "unsupported", + "mi355x": "unsupported" + } + }, + "model": { + "model_id": "deepseek-ai/DeepSeek-V4-Pro", + "min_vllm_version": "0.20.0", + "architecture": "moe", + "parameter_count": "1600B", + "active_parameters": "49B", + "context_length": 1048576, + "flashinfer_autotune": true, + "base_args": [ + "--trust-remote-code", + "--kv-cache-dtype", + "fp8", + "--block-size", + "256" + ] + }, + "dependencies": [ + { + "note": "DeepGEMM FP8 kernels \u2014 install via vLLM tools/install_deepgemm.sh", + "command": "bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)" + } + ], + "features": { + "tool_calling": { + "description": "Enable tool calling with DeepSeek V4 chat template support.", + "args": [ + "--tokenizer-mode", + "deepseek_v4", + "--tool-call-parser", + "deepseek_v4", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "Enable reasoning/thinking mode with the DeepSeek V4 reasoning parser.", + "args": [ + "--reasoning-parser", + "deepseek_v4" + ] + }, + "spec_decoding": { + "description": "Multi-Token Prediction speculative decoding with 2 speculative tokens.", + "args": [ + "--speculative_config", + 
"{\"method\":\"mtp\",\"num_speculative_tokens\":2}" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 960, + "description": "Native FP4+FP8 mixed checkpoint (MoE experts FP4, remaining params FP8)" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tep", + "multi_node_dep", + "pd_cluster" + ], + "default_strategy": "single_node_tep", + "hardware_overrides": { + "hopper": { + "extra_args": [ + "--max-model-len", + "800000", + "--gpu-memory-utilization", + "0.95", + "--max-num-seqs", + "512", + "--max-num-batched-tokens", + "512", + "--no-enable-flashinfer-autotune", + "--compilation-config", + "{\"mode\": 0, \"cudagraph_mode\": \"FULL_DECODE_ONLY\"}" + ], + "extra_env": { + "VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS": "0" + } + }, + "blackwell": { + "extra_args": [ + "--attention_config.use_fp4_indexer_cache=True", + "--moe-backend", + "deep_gemm_mega_moe" + ] + } + }, + "strategy_overrides": { + "single_node_tp": { + "extra_args": [ + "--no-enable-flashinfer-autotune" + ], + "hardware_overrides": { + "blackwell": { + "extra_args": [ + "--attention_config.use_fp4_indexer_cache=True" + ] + } + } + }, + "single_node_tep": { + "extra_args": [ + "--compilation-config", + "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\", \"custom_ops\":[\"all\"]}" + ] + }, + "multi_node_tep": { + "extra_args": [ + "--compilation-config", + "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\", \"custom_ops\":[\"all\"]}" + ] + }, + "single_node_dep": { + "extra_args": [ + "--compilation-config", + "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\", \"custom_ops\":[\"all\"]}" + ] + }, + "multi_node_dep": { + "extra_args": [ + "--compilation-config", + "{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\", \"custom_ops\":[\"all\"]}" + ] + }, + "pd_cluster": { + "env": { + "VLLM_USE_NCCL_SYMM_MEM": "1", + "NCCL_CUMEM_ENABLE": "1", + "NCCL_MNNVL_ENABLE": "1", + "NCCL_NVLS_ENABLE": 
"1" + }, + "prefill": { + "nodes": { + "default": 2, + "gb300": 1 + }, + "parallelism": "dep", + "vllm_args": [ + "--enforce-eager", + "--max-num-seqs", + "2", + "--max-num-batched-tokens", + "16384", + "--no-disable-hybrid-kv-cache-manager", + "--enable-sleep-mode" + ], + "env": {} + }, + "decode": { + "nodes": { + "default": 2, + "gb300": 1 + }, + "parallelism": "dep", + "vllm_args": [ + "--max-num-seqs", + "1024", + "--max-num-batched-tokens", + "1024", + "--max-cudagraph-capture-size", + "1024", + "--compilation-config", + "{\"cudagraph_mode\":\"FULL_DECODE_ONLY\"}", + "--no-disable-hybrid-kv-cache-manager", + "--enable-sleep-mode" + ], + "env": {} + } + } + }, + "guide": "## Overview\n\nDeepSeek-V4-Pro is the flagship of the V4 preview family: a 1.6T-total / 49B-active\nMixture-of-Experts model. It pairs a **hybrid attention stack** \u2014 Compressed Sparse\nAttention (CSA) + Heavily Compressed Attention (HCA) \u2014 with **Manifold-Constrained\nHyper-Connections (mHC)** to reach 27% of V3.2's per-token inference FLOPs and 10% of\nV3.2's KV cache at 1M context. 
Pre-trained on 32T+ tokens with the **Muon optimizer**\nfor faster convergence; post-training is a two-stage pipeline (domain-specific expert\ncultivation + unified consolidation via on-policy distillation).\n\nCheckpoint is **FP4+FP8 mixed**: MoE expert weights are stored in FP4 while the\nremaining (attention / norm / router) params stay in FP8.\n\n## Reasoning modes\n\nThe chat template exposes three reasoning-effort modes:\n\n- **Non-think** \u2014 fast, intuitive responses.\n- **Think High** \u2014 explicit chain-of-thought for complex problem-solving and planning.\n- **Think Max** \u2014 maximum reasoning effort; requires `--max-model-len >= 393216`\n (384K tokens) to avoid truncation.\n\nRecommended sampling: `temperature = 1.0`, `top_p = 1.0`.\n\n### OpenAI Client Example\n\nFor DeepSeek-V4, keep reasoning controls in `chat_template_kwargs`, as it exposes a\ncustom **Think Max** mode via `\"reasoning_effort\": \"max\"`.\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\nmodel = \"deepseek-ai/DeepSeek-V4-Pro\"\nmessages = [{\"role\": \"user\", \"content\": \"What is 17*19? Return only the final integer.\"}]\n\n# Non-think\nresp = client.chat.completions.create(\n model=model,\n messages=messages,\n)\n\n# Think High\nresp = client.chat.completions.create(\n model=model,\n messages=messages,\n extra_body={\n \"chat_template_kwargs\": {\n \"thinking\": True,\n \"reasoning_effort\": \"high\",\n },\n },\n)\n\n# Think Max\nresp = client.chat.completions.create(\n model=model,\n messages=messages,\n extra_body={\n \"chat_template_kwargs\": {\n \"thinking\": True,\n \"reasoning_effort\": \"max\",\n },\n },\n)\n```\n\n## Recommended deployments\n\n- **B300 (8\u00d7 GPU)**: single-node DP + EP with `--data-parallel-size 8`.\n- **H200 (8\u00d7 GPU)**: DP + EP with `--data-parallel-size 8`. 
Context is capped at\n 800K tokens (`--max-model-len 800000`) to leave KV headroom with dense params\n replicated across ranks \u2014 applies to both single-node and multi-node H200.\n- **GB200 NVL4 (4\u00d7 GPU per tray)**: the ~960 GB mixed-precision checkpoint does not\n fit on one tray; run multi-node DP + EP across **2 trays** (8 GPUs total) with\n `--data-parallel-size 8`. Pick the \"Multi-Node\" tab and set nodes to 2.\n" + } + }, + "inclusionAI/Ling-2.6-1T": { + "hf_id": "inclusionAI/Ling-2.6-1T", + "meta": { + "title": "Ling-2.6-1T", + "provider": "inclusionAI", + "description": "Ling-2.6-1T (BailingMoeV2_5) FP8 instruct model with 1T total / 50B active params, hybrid linear + MLA attention, 262K context", + "tasks": [ + "text" + ], + "hardware": { + "b300": "verified", + "mi300x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "1T" + }, + "recipe": { + "meta": { + "title": "Ling-2.6-1T", + "slug": "ling-2.6-1t", + "provider": "inclusionAI", + "description": "Ling-2.6-1T (BailingMoeV2_5) FP8 instruct model with 1T total / 50B active params, hybrid linear + MLA attention, 262K context", + "date_updated": "2026-05-13", + "difficulty": "advanced", + "tasks": [ + "text" + ], + "related_recipes": [ + "inclusionAI/Ling-2.6-flash", + "inclusionAI/Ring-1T-FP8" + ], + "hardware": { + "b300": "verified", + "mi300x": "verified" + } + }, + "model": { + "model_id": "inclusionAI/Ling-2.6-1T", + "min_vllm_version": "0.20.2", + "architecture": "moe", + "parameter_count": "1T", + "active_parameters": "50B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code", + "--tensor-parallel-size", + "8" + ], + "base_env": {} + }, + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 1200, + "description": "FP8 weights with TP=8 on B300 or 8x MI300X/MI355X-class nodes" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp", + 
"multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": { + "amd": { + "extra_env": { + "VLLM_ROCM_USE_AITER": "1" + } + } + }, + "strategy_overrides": { + "single_node_tp": { + "tp": 8 + } + }, + "guide": "## Overview\n\n[Ling-2.6-1T](https://huggingface.co/inclusionAI/Ling-2.6-1T) is inclusionAI's\nBailingMoeV2_5 FP8 flagship model with 1T total / 50B active parameters, hybrid\nlinear + MLA attention, and a 262K context window.\n\n## Deployment Configurations\n\n### Docker (AMD MI300X / MI325X / MI355X, TP=8)\n\nTP=8 has been verified on an MI300X-class node at the model-derived 262K\ncontext. MI325X and MI355X have larger per-GPU HBM.\n\n```bash\ndocker run --rm -it \\\n --cap-add=SYS_PTRACE \\\n --ipc=host \\\n --privileged=true \\\n --shm-size=128GB \\\n --network=host \\\n --device=/dev/kfd \\\n --device=/dev/dri \\\n --group-add video \\\n -e VLLM_ROCM_USE_AITER=1 \\\n vllm/vllm-openai-rocm:v0.20.2 \\\n inclusionAI/Ling-2.6-1T \\\n --tensor-parallel-size 8 \\\n --trust-remote-code\n```\n\n## Client Usage\n\n### Text Generation\n```python\nfrom openai import OpenAI\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\nresponse = client.chat.completions.create(\n model=\"inclusionAI/Ling-2.6-1T\",\n messages=[{\"role\": \"user\", \"content\": \"Write a poem about the ocean.\"}],\n max_tokens=512, temperature=0.7,\n)\nprint(response.choices[0].message.content)\n```\n\n## References\n\n- [Ling-2.6-1T on Hugging Face](https://huggingface.co/inclusionAI/Ling-2.6-1T)\n- [vLLM ROCm GPU install docs](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/?device=rocm)\n- [ROCm vLLM optimization guide](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference-optimization/vllm-optimization.html)\n" + } + }, + "inclusionAI/Ling-2.6-flash": { + "hf_id": "inclusionAI/Ling-2.6-flash", + "meta": { + "title": "Ling-2.6-flash", + "provider": "inclusionAI", + "description": "Ling-2.6-flash (BailingMoeV2_5) instruct model with 
104B total / 7.4B active params, hybrid linear + MLA attention, 128K context, optimized for agent workloads", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified", + "mi300x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "104B" + }, + "recipe": { + "meta": { + "title": "Ling-2.6-flash", + "slug": "ling-2.6-flash", + "provider": "inclusionAI", + "description": "Ling-2.6-flash (BailingMoeV2_5) instruct model with 104B total / 7.4B active params, hybrid linear + MLA attention, 128K context, optimized for agent workloads", + "date_updated": "2026-05-13", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "related_recipes": [ + "inclusionAI/Ring-1T-FP8" + ], + "hardware": { + "h200": "verified", + "mi300x": "verified" + } + }, + "model": { + "model_id": "inclusionAI/Ling-2.6-flash", + "min_vllm_version": "0.20.2", + "architecture": "moe", + "parameter_count": "104B", + "active_parameters": "7.4B", + "context_length": 131072, + "base_args": [ + "--trust-remote-code", + "--tensor-parallel-size", + "4" + ], + "base_env": {} + }, + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 250, + "description": "BF16 weights with base TP=4; the guide shows a tested 2-GPU AMD command" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": { + "amd": { + "extra_env": { + "VLLM_ROCM_USE_AITER": "1" + } + } + }, + "strategy_overrides": { + "single_node_tp": { + "tp": 4 + } + }, + "guide": "## Overview\n\n[Ling-2.6-flash](https://huggingface.co/inclusionAI/Ling-2.6-flash) is a\nBailingMoeV2_5 MoE instruct model with 104B total / 7.4B active parameters,\nhybrid linear + MLA attention, and a 131K context window.\n\n## Deployment Configurations\n\n### Docker (AMD MI300X / MI325X / MI355X, TP=2)\n\nMI300X / MI325X / MI355X GPUs have larger per-GPU HBM, so 
TP=2 fits the full\n131K context.\n\n```bash\ndocker run --rm -it \\\n --cap-add=SYS_PTRACE \\\n --ipc=host \\\n --privileged=true \\\n --shm-size=128GB \\\n --network=host \\\n --device=/dev/kfd \\\n --device=/dev/dri \\\n --group-add video \\\n -e VLLM_ROCM_USE_AITER=1 \\\n vllm/vllm-openai-rocm:v0.20.2 \\\n inclusionAI/Ling-2.6-flash \\\n --tensor-parallel-size 2 \\\n --trust-remote-code\n```\n\n## Client Usage\n\n### Text Generation\n```python\nfrom openai import OpenAI\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\nresponse = client.chat.completions.create(\n model=\"inclusionAI/Ling-2.6-flash\",\n messages=[{\"role\": \"user\", \"content\": \"Write a poem about the ocean.\"}],\n max_tokens=512, temperature=0.7,\n)\nprint(response.choices[0].message.content)\n```\n\n## References\n\n- [Ling-2.6-flash on Hugging Face](https://huggingface.co/inclusionAI/Ling-2.6-flash)\n- [vLLM ROCm GPU install docs](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/?device=rocm)\n- [ROCm vLLM optimization guide](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference-optimization/vllm-optimization.html)\n" + } + }, + "inclusionAI/Ring-1T-FP8": { + "hf_id": "inclusionAI/Ring-1T-FP8", + "meta": { + "title": "Ring-1T-FP8", + "provider": "inclusionAI", + "description": "Ring-1T (BailingMoeV2) FP8 model (~1T total params) for 8xH200 or 8xMI300X deployment", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "1T" + }, + "recipe": { + "meta": { + "title": "Ring-1T-FP8", + "slug": "ring-1t-fp8", + "provider": "inclusionAI", + "description": "Ring-1T (BailingMoeV2) FP8 model (~1T total params) for 8xH200 or 8xMI300X deployment", + "date_updated": "2026-04-17", + "difficulty": "advanced", + "tasks": [ + "text" + ], + "related_recipes": [], + "hardware": { + "h200": "verified", + 
"mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "inclusionAI/Ring-1T-FP8", + "min_vllm_version": "0.11.0", + "architecture": "moe", + "parameter_count": "1T", + "active_parameters": "50B", + "context_length": 65536, + "base_args": [ + "--trust-remote-code", + "--tensor-parallel-size", + "8", + "--max_num_seqs", + "32", + "--kv-cache-dtype", + "fp8", + "--served-model-name", + "Ring-1T-FP8" + ], + "base_env": {} + }, + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 1200, + "description": "FP8 weights on 8x H200 (80 GB) with FP8 KV cache", + "extra_args": [ + "--gpu-memory-utilization", + "0.97", + "--compilation-config", + "{\"use_inductor\": false}" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep", + "pd_cluster" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nRing-1T-FP8 is inclusionAI's BailingMoeV2 FP8 model (~1T total parameters). 
This recipe\ncovers pure tensor-parallel deployment across 8 GPUs on NVIDIA H200 or AMD MI300X+.\n\n## Prerequisites\n\n- Hardware: 8x H200 or 8x MI300X/MI325X/MI355X\n- vLLM >= 0.11.0\n\n### Install vLLM (CUDA)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n### Install vLLM (AMD ROCm)\n\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700\n```\n\n## Launch commands\n\n8x H200 (FP8 KV cache):\n\n```bash\nvllm serve inclusionAI/Ring-1T-FP8 \\\n --trust-remote-code \\\n --tensor-parallel-size 8 \\\n --gpu-memory-utilization 0.97 \\\n --max_num_seqs 32 \\\n --kv-cache-dtype fp8 \\\n --compilation-config '{\"use_inductor\": false}' \\\n --served-model-name Ring-1T-FP8\n```\n\n8x MI300X/MI325X/MI355X:\n\n```bash\nexport VLLM_ROCM_USE_AITER=1\nvllm serve inclusionAI/Ring-1T-FP8 \\\n --trust-remote-code \\\n --tensor-parallel-size 8 \\\n --gpu-memory-utilization 0.9 \\\n --max_num_seqs 32 \\\n --kv-cache-dtype fp8 \\\n --served-model-name Ring-1T-FP8\n```\n\nTuning flags:\n- `--max-model-len=65536` works well for most scenarios.\n- `--max-num-batched-tokens=32768` for prompt-heavy; 16384/8192 for lower latency.\n- Reduce `--gpu-memory-utilization` below 0.97 if you hit OOM.\n\n## Client Usage\n\n```bash\ncurl http://localhost:8000/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"Ring-1T-FP8\",\n \"messages\": [{\"role\": \"user\", \"content\": \"9.11 and 9.8, which is greater?\"}]\n }'\n```\n\n## References\n\n- [Ring-1T-FP8 on Hugging Face](https://huggingface.co/inclusionAI/Ring-1T-FP8)\n" + } + }, + "inclusionAI/Ring-2.6-1T": { + "hf_id": "inclusionAI/Ring-2.6-1T", + "meta": { + "title": "Ring-2.6-1T", + "provider": "inclusionAI", + "description": "Ring-2.6-1T (BailingMoeV2_5) FP8 thinking model with 1T total / 50B active params, hybrid linear + MLA attention, 128K context", + "tasks": [ + "text" + ], + "hardware": {} + }, + "model_info": { 
+ "architecture": "moe", + "parameter_count": "1T" + }, + "recipe": { + "meta": { + "title": "Ring-2.6-1T", + "slug": "ring-2.6-1t", + "provider": "inclusionAI", + "description": "Ring-2.6-1T (BailingMoeV2_5) FP8 thinking model with 1T total / 50B active params, hybrid linear + MLA attention, 128K context", + "date_updated": "2026-05-15", + "difficulty": "advanced", + "tasks": [ + "text" + ], + "related_recipes": [ + "inclusionAI/Ling-2.6-1T", + "inclusionAI/Ring-1T-FP8" + ] + }, + "model": { + "model_id": "inclusionAI/Ring-2.6-1T", + "min_vllm_version": "0.20.2", + "architecture": "moe", + "parameter_count": "1T", + "active_parameters": "50B", + "context_length": 131072, + "base_args": [ + "--trust-remote-code", + "--tensor-parallel-size", + "8" + ], + "base_env": {} + }, + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 1200, + "description": "FP8 weights with TP=8 on B300 or 8x MI300X/MI355X-class nodes" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep", + "pd_cluster" + ], + "hardware_overrides": { + "amd": { + "extra_env": { + "VLLM_ROCM_USE_AITER": "1" + } + } + }, + "strategy_overrides": { + "single_node_tp": { + "tp": 8 + } + }, + "guide": "## Overview\n\n[Ring-2.6-1T](https://huggingface.co/inclusionAI/Ring-2.6-1T) is inclusionAI's\nBailingMoeV2_5 FP8 trillion-scale **thinking** model (1T total / 50B active\nparameters) with hybrid linear + MLA attention and a 128K context window.\nIt is the reasoning-focused counterpart to [Ling-2.6-1T](https://huggingface.co/inclusionAI/Ling-2.6-1T)\nin the Ring 2.6 series and emits explicit `<think>...</think>` traces; the\nchat template accepts a `reasoning_effort` field (default `high`).\n\nvLLM 0.20.1 shipped the BailingMoeV2.5 MLA RoPE fix (vllm-project/vllm#41185)\nthat is load-bearing for this architecture \u2014 pin v0.20.2 or newer.\n\n## Deployment Configurations\n\n### 
Docker (AMD MI300X / MI325X / MI355X, TP=8)\n\nTP=8 fits the model-derived 128K context on an MI300X-class node; MI325X\nand MI355X have larger per-GPU HBM and more headroom.\n\n```bash\ndocker run --rm -it \\\n --cap-add=SYS_PTRACE \\\n --ipc=host \\\n --privileged=true \\\n --shm-size=128GB \\\n --network=host \\\n --device=/dev/kfd \\\n --device=/dev/dri \\\n --group-add video \\\n -e VLLM_ROCM_USE_AITER=1 \\\n vllm/vllm-openai-rocm:v0.20.2 \\\n inclusionAI/Ring-2.6-1T \\\n --tensor-parallel-size 8 \\\n --trust-remote-code\n```\n\n### Pip (NVIDIA B300, TP=8)\n\n```bash\nuv venv && source .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n\nvllm serve inclusionAI/Ring-2.6-1T \\\n --trust-remote-code \\\n --tensor-parallel-size 8\n```\n\n## Client Usage\n\n### Text Generation\n\n```python\nfrom openai import OpenAI\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\nresponse = client.chat.completions.create(\n model=\"inclusionAI/Ring-2.6-1T\",\n messages=[{\"role\": \"user\", \"content\": \"Prove there are infinitely many primes.\"}],\n max_tokens=4096,\n temperature=0.6,\n extra_body={\"chat_template_kwargs\": {\"reasoning_effort\": \"high\"}},\n)\nprint(response.choices[0].message.content)\n```\n\nSet `reasoning_effort` to `low` / `medium` / `high` to control the depth of\nthinking. 
The model wraps its reasoning trace in `<think>...</think>` before\nemitting the final answer.\n\n## References\n\n- [Ring-2.6-1T on Hugging Face](https://huggingface.co/inclusionAI/Ring-2.6-1T)\n- [vLLM PR #41185 \u2014 BailingMoeV2.5 MLA RoPE fix](https://github.com/vllm-project/vllm/pull/41185)\n- [vLLM ROCm GPU install docs](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/?device=rocm)\n" + } + }, + "internlm/Intern-S1": { + "hf_id": "internlm/Intern-S1", + "meta": { + "title": "Intern-S1", + "provider": "InternLM", + "description": "Intern-S1 vision-language model from Shanghai AI Lab with BF16/FP8 variants and thinking/non-thinking modes", + "tasks": [ + "multimodal" + ], + "hardware": { + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "241B" + }, + "recipe": { + "meta": { + "title": "Intern-S1", + "slug": "intern-s1", + "provider": "InternLM", + "description": "Intern-S1 vision-language model from Shanghai AI Lab with BF16/FP8 variants and thinking/non-thinking modes", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "multimodal" + ], + "related_recipes": [ + "OpenGVLab/InternVL3_5-8B" + ], + "hardware": { + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "internlm/Intern-S1", + "min_vllm_version": "0.10.0", + "architecture": "moe", + "parameter_count": "241B", + "active_parameters": "28B", + "context_length": 65536, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "InternLM tool-call parser with automatic tool choice", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "internlm" + ] + }, + "reasoning": { + "description": "DeepSeek-R1 reasoning parser extracts <think>...</think>", + "args": [ + "--reasoning-parser", + "deepseek_r1" + ] + }, + "text_only": { + "description": "Skip loading the vision 
encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 578, + "tp": 8, + "description": "BF16 on 8x H800 (80GB each)" + }, + "fp8": { + "model_id": "internlm/Intern-S1-FP8", + "precision": "fp8", + "vram_minimum_gb": 289, + "tp": 4, + "description": "FP8 on 4x H800 (80GB each)" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": { + "amd": { + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_USE_AITER_MOE": "0" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Intern-S1](https://github.com/InternLM/Intern-S1) is a vision-language model developed\nby Shanghai AI Laboratory. 
It supports thinking and non-thinking modes via chat-template\nkwargs and ships in BF16 and FP8 variants.\n\n## Prerequisites\n\n- Hardware: 8xH800 (80GB) for BF16, 4xH800 for FP8, or 4-8x MI300X/MI325X/MI355X\n- vLLM >= 0.10.0\n\n### Install vLLM (CUDA)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n### Install vLLM (AMD ROCm)\n\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700\n```\n\n## Launch commands\n\nBF16 on 8xH800:\n\n```bash\nvllm serve internlm/Intern-S1 \\\n --trust-remote-code \\\n --tensor-parallel-size 8 \\\n --enable-auto-tool-choice \\\n --reasoning-parser deepseek_r1 \\\n --tool-call-parser internlm\n```\n\nFP8 on 4xH800:\n\n```bash\nvllm serve internlm/Intern-S1-FP8 \\\n --trust-remote-code \\\n --tensor-parallel-size 4 \\\n --enable-auto-tool-choice \\\n --reasoning-parser deepseek_r1 \\\n --tool-call-parser internlm\n```\n\nFP8 on 8xMI300X/MI325X:\n\n```bash\nexport VLLM_ROCM_USE_AITER=1\nexport VLLM_ROCM_USE_AITER_MOE=0\nvllm serve internlm/Intern-S1-FP8 \\\n --trust-remote-code --tensor-parallel-size 8 \\\n --enable-auto-tool-choice --reasoning-parser deepseek_r1 --tool-call-parser internlm\n```\n\nFP8 on 8xMI355X: set only `VLLM_ROCM_USE_AITER=1` (no need to disable AITER MoE).\n\n## Switching Between Thinking and Non-Thinking Modes\n\n```python\nfrom openai import OpenAI\nclient = OpenAI(api_key=\"YOUR_API_KEY\", base_url=\"http://0.0.0.0:8000/v1\")\nmodel_name = client.models.list().data[0].id\n\nresponse = client.chat.completions.create(\n model=model_name,\n messages=[{\"role\": \"user\", \"content\": \"9.11 and 9.8, which is greater?\"}],\n temperature=0.8, top_p=0.8,\n extra_body={\"chat_template_kwargs\": {\"enable_thinking\": False}},\n)\nprint(response)\n```\n\n## Troubleshooting\n\n- `ValueError: No available memory for the cache blocks.` \u2014 add `--gpu-memory-utilization 0.95`.\n\n## References\n\n- [Intern-S1 on Hugging 
Face](https://huggingface.co/internlm/Intern-S1)\n- [Intern-S1-FP8](https://huggingface.co/internlm/Intern-S1-FP8)\n- [Intern-S1 GitHub](https://github.com/InternLM/Intern-S1)\n" + } + }, + "internlm/Intern-S2-Preview": { + "hf_id": "internlm/Intern-S2-Preview", + "meta": { + "title": "Intern-S2-Preview", + "provider": "InternLM", + "description": "Scientific multimodal MoE (36B total / 3B active) continued pre-trained from Qwen3.5 \u2014 hybrid linear/full attention, 262K context, MTP-accelerated reasoning. BF16 and FP8 checkpoints.", + "tasks": [ + "multimodal", + "text" + ], + "hardware": { + "h200": "verified", + "gb200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "36B" + }, + "recipe": { + "meta": { + "title": "Intern-S2-Preview", + "slug": "intern-s2-preview", + "provider": "InternLM", + "description": "Scientific multimodal MoE (36B total / 3B active) continued pre-trained from Qwen3.5 \u2014 hybrid linear/full attention, 262K context, MTP-accelerated reasoning. BF16 and FP8 checkpoints.", + "date_updated": "2026-05-15", + "difficulty": "intermediate", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "35B-A3B scientific multimodal foundation model \u2014 single-node BF16 with MTP", + "related_recipes": [ + "internlm/Intern-S1", + "Qwen/Qwen3.5-35B-A3B" + ], + "hardware": { + "h200": "verified", + "gb200": "verified" + } + }, + "model": { + "model_id": "internlm/Intern-S2-Preview", + "min_vllm_version": "nightly", + "nightly_required": true, + "architecture": "moe", + "parameter_count": "36B", + "active_parameters": "3B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with the Qwen3 Coder parser (per official deployment guide)", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "Extract <think>...</think> 
chain-of-thought via the Qwen3 reasoning parser", + "args": [ + "--reasoning-parser", + "qwen3" + ] + }, + "spec_decoding": { + "description": "Shared-weight MTP speculative decoding \u2014 4 draft tokens (recommended in the deployment guide)", + "args": [ + "--speculative-config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":4}" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "spec_decoding", + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 87, + "description": "BF16 on 1x H200 or 2x H100/H800 (deployment guide uses TP=2)" + }, + "fp8": { + "model_id": "internlm/Intern-S2-Preview-FP8", + "precision": "fp8", + "vram_minimum_gb": 44, + "description": "Official DeepSeek-style block FP8 (128x128, ue8m0 scales) \u2014 fits on a single H100/H200" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_dep", + "multi_node_tep" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Intern-S2-Preview](https://huggingface.co/internlm/Intern-S2-Preview) is a\nscientific multimodal foundation model from Shanghai AI Laboratory, continued\npre-trained from Qwen3.5. 
It packs 36B total / 3B active parameters across\n256 experts with hybrid linear/full attention, supports a 262K context, and\nships with built-in shared-weight MTP for fast reasoning.\n\nBeyond chat and reasoning, it adds vision and time-series modalities and\nimproves agent capabilities for scientific workflows.\n\n## Prerequisites\n\n- **vLLM version:** nightly build with `InternS2PreviewForConditionalGeneration`\n support ([PR #42705](https://github.com/vllm-project/vllm/pull/42705)). The\n architecture is not yet in any stable release.\n- **Hardware (BF16):** 1x H200 (141 GB) or 2x H100/H800 (80 GB)\n- **Hardware (FP8):** single H100/H200\n- **Trust remote code:** required (custom modeling files ship in the repo)\n\n### Install vLLM (nightly)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly\n```\n\n## Launch commands\n\n### Recommended \u2014 with MTP speculative decoding\n\n```bash\nvllm serve internlm/Intern-S2-Preview \\\n --trust-remote-code \\\n --tensor-parallel-size 2 \\\n --reasoning-parser qwen3 \\\n --enable-auto-tool-choice \\\n --tool-call-parser qwen3_coder \\\n --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":4}'\n```\n\n### Basic serving (no MTP)\n\n```bash\nvllm serve internlm/Intern-S2-Preview \\\n --trust-remote-code \\\n --tensor-parallel-size 2 \\\n --reasoning-parser qwen3 \\\n --enable-auto-tool-choice \\\n --tool-call-parser qwen3_coder\n```\n\n### Long-context (YaRN, up to 512K)\n\nThe base config sets `max_position_embeddings = 262144`. 
For longer contexts,\noverride the RoPE config to enable YaRN:\n\n```bash\nvllm serve internlm/Intern-S2-Preview \\\n --trust-remote-code \\\n --tensor-parallel-size 2 \\\n --max-model-len 512000 \\\n --reasoning-parser qwen3 \\\n --enable-auto-tool-choice \\\n --tool-call-parser qwen3_coder \\\n --hf-overrides '{\"text_config\": {\"rope_parameters\": {\"mrope_interleaved\": true, \"mrope_section\": [11, 11, 10], \"rope_type\": \"yarn\", \"rope_theta\": 10000000, \"partial_rotary_factor\": 0.25, \"factor\": 4.0, \"original_max_position_embeddings\": 262144}}}'\n```\n\n## Client Usage\n\nRecommended sampling parameters from the model card:\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"internlm/Intern-S2-Preview\",\n messages=[{\"role\": \"user\", \"content\": \"Design a synthesis route for paracetamol.\"}],\n temperature=0.8,\n top_p=0.95,\n max_tokens=32768,\n extra_body={\n \"top_k\": 50,\n \"min_p\": 0.0,\n \"spaces_between_special_tokens\": False,\n },\n)\nprint(resp.choices[0].message.content)\n```\n\n### Toggle thinking mode\n\nThinking is enabled by default. 
Disable it per request:\n\n```python\nresp = client.chat.completions.create(\n model=\"internlm/Intern-S2-Preview\",\n messages=[{\"role\": \"user\", \"content\": \"What is AGI?\"}],\n temperature=0.8,\n top_p=0.95,\n extra_body={\"chat_template_kwargs\": {\"enable_thinking\": False}},\n)\n```\n\n> The model card notes: do **not** disable thinking mode for agentic tasks.\n\n## Troubleshooting\n\n- **Unknown architecture `InternS2PreviewForConditionalGeneration`:** the\n handler landed via vLLM PR #42705 \u2014 use a nightly wheel or build from main\n once the PR merges.\n- **OOM at full 262K context:** drop `--max-model-len` to 65536 or 131072,\n or lower `--gpu-memory-utilization` headroom.\n\n## References\n\n- [Model card](https://huggingface.co/internlm/Intern-S2-Preview)\n- [Deployment guide](https://huggingface.co/internlm/Intern-S2-Preview/blob/main/deployment_guide.md)\n- [Intern-S1 GitHub](https://github.com/InternLM/Intern-S1)\n- [vLLM support PR #42705](https://github.com/vllm-project/vllm/pull/42705)\n" + } + }, + "jinaai/jina-embeddings-v5-text-small": { + "hf_id": "jinaai/jina-embeddings-v5-text-small", + "meta": { + "title": "Jina Embeddings v5 Text Small", + "provider": "Jina AI", + "description": "Jina AI's fifth-gen multilingual text embedding model (677M, Qwen3-0.6B-Base) with task-specific LoRA adapters for retrieval, text-matching, classification, and clustering.", + "tasks": [ + "embedding" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "0.7B" + }, + "recipe": { + "meta": { + "title": "Jina Embeddings v5 Text Small", + "slug": "jina-embeddings-v5-text-small", + "provider": "Jina AI", + "description": "Jina AI's fifth-gen multilingual text embedding model (677M, Qwen3-0.6B-Base) with task-specific LoRA adapters for retrieval, text-matching, classification, and clustering.", + "date_updated": "2026-05-09", + "difficulty": "beginner", + "tasks": [ + "embedding" + ], + "performance_headline": "71.7 
MTEB English v2 / 67.7 MMTEB at <1B params, 119+ languages, 32K context", + "related_recipes": [ + "jinaai/jina-reranker-m0" + ] + }, + "model": { + "model_id": "jinaai/jina-embeddings-v5-text-small", + "min_vllm_version": "0.20.0", + "architecture": "dense", + "parameter_count": "0.7B", + "active_parameters": "0.7B", + "context_length": 32768, + "base_args": [ + "--trust-remote-code", + "--runner", + "pooling" + ], + "base_env": {} + }, + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "model_id": "jinaai/jina-embeddings-v5-text-small-retrieval", + "label": "retrieval", + "precision": "bf16", + "vram_minimum_gb": 2, + "description": "Retrieval task: query/document encoding for RAG and search. Adapter pre-merged into the base weights." + }, + "text_matching": { + "model_id": "jinaai/jina-embeddings-v5-text-small-text-matching", + "label": "text-matching", + "precision": "bf16", + "vram_minimum_gb": 2, + "description": "Text-matching task: semantic similarity, dedup, paraphrase detection. Adapter pre-merged into the base weights." + }, + "classification": { + "model_id": "jinaai/jina-embeddings-v5-text-small-classification", + "label": "classification", + "precision": "bf16", + "vram_minimum_gb": 2, + "description": "Classification task: linear probing, intent detection. Adapter pre-merged into the base weights." + }, + "clustering": { + "model_id": "jinaai/jina-embeddings-v5-text-small-clustering", + "label": "clustering", + "precision": "bf16", + "vram_minimum_gb": 2, + "description": "Clustering task: k-means, topic discovery. Adapter pre-merged into the base weights." 
+ } + }, + "compatible_strategies": [ + "single_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": { + "single_node_tp": { + "tp": 1 + } + }, + "guide": "## Overview\n\n[`jina-embeddings-v5-text-small`](https://huggingface.co/jinaai/jina-embeddings-v5-text-small)\nis the fifth generation of Jina AI's multilingual text embedding family, released\nFebruary 18, 2026. It scores **71.7 on MTEB English v2** and **67.7 on MMTEB** with\nonly 677M parameters \u2014 the highest among multilingual embedding models under 1B \u2014\nand supports **119+ languages with up to 32K-token context**. Built on\n`Qwen3-0.6B-Base` and trained by distilling `Qwen3-Embedding-4B` plus task-specific\ncontrastive losses, it produces 1024-dim embeddings that stay robust under\ntruncation (Matryoshka dims: 32\u20131024) and binary quantization.\n\nvLLM support landed in [PR #39575](https://github.com/vllm-project/vllm/pull/39575)\nvia the `JinaEmbeddingsV5Model` architecture and the `--runner pooling` API.\n\n### Task-specific adapters\n\nv5 ships four LoRA adapters \u2014 one per supported task. For each task, Jina AI\npublishes a sibling repo with that adapter **pre-merged into the base weights**;\nthese are the simplest path for vLLM and what this recipe serves. Pick a task\nabove; the recipe swaps the model id accordingly:\n\n| Task | Pre-merged repo |\n|------|----------------|\n| Retrieval | `jinaai/jina-embeddings-v5-text-small-retrieval` |\n| Text-matching | `jinaai/jina-embeddings-v5-text-small-text-matching` |\n| Classification | `jinaai/jina-embeddings-v5-text-small-classification` |\n| Clustering | `jinaai/jina-embeddings-v5-text-small-clustering` |\n\n## Prerequisites\n\n- **Hardware**: any single GPU with \u2265 2 GB VRAM (T4 / L4 / A10 / A100 / H100 /\n H200 / B200 / MI300X all fine \u2014 bf16 weights are ~1.4 GB).\n- **vLLM**: requires a build that includes PR #39575. 
Use the nightly wheel\n until the next stable release ships:\n\n ```bash\n uv pip install -U vllm --pre \\\n --extra-index-url https://wheels.vllm.ai/nightly \\\n --index-strategy unsafe-best-match\n ```\n\n## Launch command\n\nUse the launch command above (it points at the chosen task's pre-merged repo).\nThe full form looks like:\n\n```bash\nvllm serve jinaai/jina-embeddings-v5-text-small-retrieval \\\n --trust-remote-code \\\n --runner pooling \\\n --host 0.0.0.0 --port 8000\n```\n\nSwap the model id for `-text-matching`, `-classification`, or `-clustering` to\nserve the corresponding task \u2014 or just toggle the **Variant** pill above.\n\n### Alternative: base repo + `--hf-overrides`\n\nIf you'd rather download a single checkpoint and switch tasks at startup, serve\nthe base `jinaai/jina-embeddings-v5-text-small` repo and pass the task via\n`--hf-overrides`:\n\n```bash\nvllm serve jinaai/jina-embeddings-v5-text-small \\\n --trust-remote-code \\\n --runner pooling \\\n --hf-overrides '{\"jina_task\": \"retrieval\"}'\n```\n\nAllowed values: `retrieval`, `text-matching`, `classification`, `clustering`.\nThis loads the base weights and merges the requested adapter at startup.\n\n## Client usage\n\n### Embeddings (`/v1/embeddings`)\n\n```bash\ncurl -s http://localhost:8000/v1/embeddings \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"jinaai/jina-embeddings-v5-text-small-retrieval\",\n \"input\": [\"Query: What is climate change?\"]\n }' | python3 -m json.tool\n```\n\n### Python (OpenAI SDK)\n\n```python\nfrom openai import OpenAI\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\n\nresp = client.embeddings.create(\n model=\"jinaai/jina-embeddings-v5-text-small-retrieval\",\n input=[\n \"Climate change has led to rising sea levels.\",\n \"Overview of climate change impacts on coastal cities.\",\n ],\n)\nfor d in resp.data:\n print(d.index, len(d.embedding))\n```\n\n### Multilingual text-matching example\n\nSwitch the 
**Variant** pill to **text-matching** (or serve the\n`-text-matching` repo) and embed semantically equivalent strings across\nlanguages:\n\n```python\ntexts = [\n \"\u063a\u0631\u0648\u0628 \u062c\u0645\u064a\u0644 \u0639\u0644\u0649 \u0627\u0644\u0634\u0627\u0637\u0626\",\n \"\u6d77\u6ee9\u4e0a\u7f8e\u4e3d\u7684\u65e5\u843d\",\n \"A beautiful sunset over the beach\",\n \"Un beau coucher de soleil sur la plage\",\n \"\u6d5c\u8fba\u306b\u6c88\u3080\u7f8e\u3057\u3044\u5915\u65e5\",\n]\nresp = client.embeddings.create(\n model=\"jinaai/jina-embeddings-v5-text-small-text-matching\",\n input=texts,\n)\n```\n\n## Configuration tips\n\n- **Pick the task that matches your workload** \u2014 retrieval prompts (`query`\n vs. `document`) are baked into the retrieval adapter, so the wrong task\n measurably degrades recall.\n- **Matryoshka truncation**: embeddings stay useful when truncated to 32 / 64\n / 128 / 256 / 512 / 768 dims \u2014 keep the prefix and renormalize.\n- **Throughput**: with TP=1 on a single small GPU, the bottleneck is usually\n tokenization \u2014 batch your inputs (`input: [...]` with up to a few hundred\n short docs per request).\n- **bf16 vs fp16**: README recommends bf16 on modern GPUs; PR #39575's test\n used fp16. 
Either dtype works; bf16 is more numerically stable on Hopper /\n Blackwell / MI300X+.\n\n## References\n\n- [Model card](https://huggingface.co/jinaai/jina-embeddings-v5-text-small)\n- [Release blog](https://jina.ai/news/jina-embeddings-v5-text-distilling-4b-quality-into-sub-1b-multilingual-embeddings)\n- [Technical report (arXiv:2602.15547)](https://arxiv.org/abs/2602.15547)\n- [vLLM PR #39575 \u2014 Add Jina Embeddings v5 model support](https://github.com/vllm-project/vllm/pull/39575)\n- [Pre-merged retrieval repo](https://huggingface.co/jinaai/jina-embeddings-v5-text-small-retrieval)\n- [Pre-merged text-matching repo](https://huggingface.co/jinaai/jina-embeddings-v5-text-small-text-matching)\n- [Pre-merged classification repo](https://huggingface.co/jinaai/jina-embeddings-v5-text-small-classification)\n- [Pre-merged clustering repo](https://huggingface.co/jinaai/jina-embeddings-v5-text-small-clustering)\n" + } + }, + "jinaai/jina-reranker-m0": { + "hf_id": "jinaai/jina-reranker-m0", + "meta": { + "title": "Jina Reranker m0", + "provider": "Jina AI", + "description": "Multilingual, multimodal reranker for text and visual documents across 29+ languages via Qwen2-VL backbone", + "tasks": [ + "embedding" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "2.4B" + }, + "recipe": { + "meta": { + "title": "Jina Reranker m0", + "slug": "jina-reranker-m0", + "provider": "Jina AI", + "description": "Multilingual, multimodal reranker for text and visual documents across 29+ languages via Qwen2-VL backbone", + "date_updated": "2026-04-17", + "difficulty": "beginner", + "tasks": [ + "embedding" + ], + "related_recipes": [] + }, + "model": { + "model_id": "jinaai/jina-reranker-m0", + "min_vllm_version": "0.8.0", + "architecture": "dense", + "parameter_count": "2.4B", + "active_parameters": "2.4B", + "context_length": 32768, + "base_args": [ + "--gpu-memory-utilization", + "0.75", + "--max-num-seqs", + "32" + ], + "base_env": {} + }, 
+ "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 6, + "description": "BF16 weights; 2x T4 or 2x L4 GPUs" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": { + "amd": { + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\n[jinaai/jina-reranker-m0](https://huggingface.co/jinaai/jina-reranker-m0) is a\nmultilingual, multimodal reranker that ranks visual documents across 29+ languages.\nIt accepts text and visual content, including pages with mixed text, figures, tables,\nand various layouts.\n\nDeployment target: 2x NVIDIA T4 or 2x NVIDIA L4.\n\n## Prerequisites\n\n- Hardware: 2x T4 or 2x L4 (or any 2x GPU with ~16 GB each)\n- vLLM >= 0.8.0\n\n### Install vLLM (CUDA)\n\n```bash\nuv pip install vllm\n```\n\n### Install vLLM (AMD ROCm)\n\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700\n```\n\n## Launch command\n\n```bash\nvllm serve jinaai/jina-reranker-m0 \\\n --host 0.0.0.0 \\\n --port 8000 \\\n --tensor_parallel_size 2 \\\n --gpu-memory-utilization 0.75 \\\n --max_num_seqs 32\n```\n\nOn AMD:\n\n```bash\nexport VLLM_ROCM_USE_AITER=1\nvllm serve jinaai/jina-reranker-m0 \\\n --tensor_parallel_size 2 --gpu-memory-utilization 0.75 --max_num_seqs 32\n```\n\n## Rerank API\n\n```bash\ncurl -X POST http://localhost:8000/v1/rerank \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"jinaai/jina-reranker-m0\",\n \"query\": \"What are the health benefits of green tea?\",\n \"documents\": [\n \"Green tea contains antioxidants called catechins...\",\n \"El precio del caf\u00e9 ha aumentado un 20% este a\u00f1o...\",\n \"Studies show that drinking green tea regularly...\"\n ],\n \"top_n\": 3,\n \"return_documents\": true\n }'\n```\n\n## Score API\n\nText-to-text:\n\n```bash\ncurl -X POST http://localhost:8000/v1/score \\\n 
-H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"jinaai/jina-reranker-m0\",\n \"text_1\": [\"What is the capital of Brazil?\"],\n \"text_2\": [\"The capital of Brazil is Brasilia.\"]\n }'\n```\n\nMultimodal (text vs. images):\n\n```json\n{\n \"model\": \"jinaai/jina-reranker-m0\",\n \"text_1\": \"A cat\",\n \"text_2\": {\n \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"cat_img.jpg\"}},\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"dog_img.jpg\"}}\n ]\n }\n}\n```\n\n## Offline Deployment\n\n```python\nfrom vllm import LLM\n\nllm = LLM(\n model=\"jinaai/jina-reranker-m0\",\n tensor_parallel_size=2,\n gpu_memory_utilization=0.75,\n max_model_len=1024,\n max_num_seqs=32,\n kv_cache_dtype=\"fp8\",\n dtype=\"bfloat16\",\n)\n\nres = llm.score(\"fast recipes for weeknight dinners\", [\n \"A 65-minute pasta with garlic and olive oil.\",\n \"Slow braised short ribs that cook for 5 hours.\",\n \"Stir-fry veggies with pre-cooked rice.\",\n])\nfor item in res:\n print(item.outputs.score)\n```\n\n## References\n\n- [jina-reranker-m0 on Hugging Face](https://huggingface.co/jinaai/jina-reranker-m0)\n- [vLLM Score API documentation](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#score-api)\n" + } + }, + "meituan-longcat/LongCat-Image-Edit": { + "hf_id": "meituan-longcat/LongCat-Image-Edit", + "meta": { + "title": "LongCat-Image-Edit", + "provider": "LongCat (Meituan)", + "description": "Bilingual (Chinese-English) image editing model from Meituan LongCat, served via vLLM-Omni", + "tasks": [ + "omni" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "6B" + }, + "recipe": { + "meta": { + "title": "LongCat-Image-Edit", + "slug": "longcat-image-edit", + "provider": "LongCat (Meituan)", + "description": "Bilingual (Chinese-English) image editing model from Meituan LongCat, served via vLLM-Omni", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + 
"omni" + ], + "related_recipes": [] + }, + "model": { + "model_id": "meituan-longcat/LongCat-Image-Edit", + "min_vllm_version": "0.12.0", + "architecture": "dense", + "parameter_count": "6B", + "active_parameters": "6B", + "context_length": 0, + "base_args": [], + "base_env": {} + }, + "omni": { + "tasks": [ + "i2i" + ] + }, + "dependencies": [ + { + "note": "vLLM-Omni must be installed from source with vllm==0.12.0 for LongCat-Image-Edit", + "command": "git clone https://github.com/vllm-project/vllm-omni.git && cd vllm-omni && uv pip install -e . vllm==0.12.0" + }, + { + "note": "xformers CUDA 12.8 build required for the diffusion attention kernels", + "command": "uv pip install -U xformers --index-url https://download.pytorch.org/whl/cu128" + }, + { + "note": "diffusers from source (needed by the image-edit pipeline)", + "command": "git clone https://github.com/huggingface/diffusers.git && cd diffusers && uv pip install -e ." + } + ], + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 36, + "description": "BF16 weights; served via vLLM-Omni" + } + }, + "compatible_strategies": [], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nLongCat-Image-Edit is the image-editing variant of LongCat-Image by Meituan. It supports\nbilingual (Chinese-English) editing instructions and is served via **vLLM-Omni** (not\nstandard vLLM). It achieves SOTA performance among open-source image editing models.\n\n## Prerequisites\n\n- Hardware: 1x GPU with >=40 GB VRAM\n- vLLM-Omni (runs on top of vLLM 0.12.0)\n- diffusers (latest from source)\n- xformers (latest)\n\n## Installation\n\n```bash\n# Clone and install vllm-omni\ngit clone https://github.com/vllm-project/vllm-omni.git\ncd vllm-omni\nuv venv\nsource .venv/bin/activate\nuv pip install -e . 
vllm==0.12.0\n\n# Update xformers to the latest version\nuv pip install -U xformers --index-url https://download.pytorch.org/whl/cu128\n\n# Update diffusers to the latest version\ngit clone https://github.com/huggingface/diffusers.git\ncd diffusers\nuv pip install -e .\n```\n\n## Usage\n\n```bash\ncd vllm-omni\npython3 ./examples/offline_inference/image_to_image/image_edit.py \\\n --image qwen_bear.png \\\n --prompt \"Add a white art board written with colorful text 'vLLM-Omni' on grassland. Add a paintbrush in the bear's hands. Position the bear standing in front of the art board as if painting.\" \\\n --output output_image_edit.png \\\n --num_inference_steps 50 \\\n --guidance_scale 4.5 \\\n --seed 42 \\\n --model meituan-longcat/LongCat-Image-Edit \\\n --cache_backend cache_dit \\\n --cache_dit_max_continuous_cached_steps 2\n```\n\n## References\n\n- [LongCat-Image-Edit on Hugging Face](https://huggingface.co/meituan-longcat/LongCat-Image-Edit)\n- [vLLM-Omni](https://github.com/vllm-project/vllm-omni)\n" + } + }, + "meta-llama/Llama-3.1-8B-Instruct": { + "hf_id": "meta-llama/Llama-3.1-8B-Instruct", + "meta": { + "title": "Llama-3.1-8B-Instruct", + "provider": "Meta", + "description": "Meta's Llama 3.1 8B dense instruction-tuned language model with 128K context", + "tasks": [ + "text" + ], + "hardware": { + "h100": "verified", + "h200": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "8B" + }, + "recipe": { + "meta": { + "title": "Llama-3.1-8B-Instruct", + "slug": "llama-3.1-8b-instruct", + "provider": "Meta", + "description": "Meta's Llama 3.1 8B dense instruction-tuned language model with 128K context", + "date_updated": "2026-04-17", + "difficulty": "beginner", + "tasks": [ + "text" + ], + "related_recipes": [ + "meta-llama/Llama-3.3-70B-Instruct" + ], + "hardware": { + "h100": "verified", + "h200": "verified" + } + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "min_vllm_version": "0.6.0", + 
"architecture": "dense", + "parameter_count": "8B", + "active_parameters": "8B", + "context_length": 131072, + "base_args": [], + "base_env": {} + }, + "features": { + "spec_decoding": { + "description": "EAGLE3 speculative decoding (requires vLLM >= 0.9.0)", + "args": [ + "--speculative-config", + "{\"model\":\"RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3\",\"method\":\"eagle3\",\"num_speculative_tokens\":3}" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 20, + "description": "Full precision BF16" + }, + "nvfp4": { + "model_id": "nvidia/Llama-3.1-8B-Instruct-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 5, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + }, + "nvidia_fp8": { + "model_id": "nvidia/Llama-3.1-8B-Instruct-FP8", + "precision": "fp8", + "vram_minimum_gb": 10, + "description": "FP8 quantized weights for Hopper/Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": { + "amd": { + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nLlama 3.1 Instruct is Meta's instruction-tuned language model family. The 8B dense\nvariant is lightweight and ideal for single-GPU deployment, with 128K context support.\nA 70B variant is also available (see related recipes).\n\n## Prerequisites\n\n- Hardware: 1x GPU with >=16 GB VRAM (e.g. 
A100, L40S, H100, H200)\n- vLLM >= 0.6.0\n- CUDA Driver compatible with your vLLM version\n- Docker with NVIDIA Container Toolkit (recommended)\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install vllm --torch-backend auto\n```\n\n### TPU Deployment\n\n- [Llama3.x-70B on Trillium (v6e)](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Llama3.x)\n- [Llama3.1-8B on Trillium (v6e)](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM/Llama3.x)\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"unused\")\nresponse = client.chat.completions.create(\n model=\"meta-llama/Llama-3.1-8B-Instruct\",\n messages=[{\"role\": \"user\", \"content\": \"Hello, how are you?\"}],\n)\nprint(response.choices[0].message.content)\n```\n\n## Speculative Decoding (EAGLE3)\n\nAn EAGLE3 draft head is available at\n[`RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3`](https://huggingface.co/RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3).\nEnable the **Spec Decoding** toggle above, or add the `--speculative-config` manually:\n\n```bash\nvllm serve meta-llama/Llama-3.1-8B-Instruct \\\n --speculative-config '{\"model\":\"RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3\",\"method\":\"eagle3\",\"num_speculative_tokens\":3}'\n```\n\nEAGLE3 verification requires vLLM >= 0.9.0. 
`num_speculative_tokens` of 3 is a reasonable\nstarting point for chat workloads.\n\n## Troubleshooting\n\n**OOM on small GPUs:**\nLower `--max-model-len` or `--gpu-memory-utilization`.\n\n**EAGLE3 draft head not loading:**\nUpgrade vLLM to >= 0.9.0 \u2014 earlier releases don't support the EAGLE3 verification path.\n\n## References\n\n- [Model card](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)\n- [EAGLE3 draft head](https://huggingface.co/RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3)\n- [Llama 3.1 70B model card](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct)\n" + } + }, + "meta-llama/Llama-3.3-70B-Instruct": { + "hf_id": "meta-llama/Llama-3.3-70B-Instruct", + "meta": { + "title": "Llama-3.3-70B", + "provider": "Meta", + "description": "Llama 3.3 70B dense model with NVIDIA FP8/FP4 quantized variants for Hopper and Blackwell GPUs", + "tasks": [ + "text" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "b200": "verified", + "gb200": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "70B" + }, + "recipe": { + "meta": { + "title": "Llama-3.3-70B", + "slug": "llama3.3-70b", + "provider": "Meta", + "description": "Llama 3.3 70B dense model with NVIDIA FP8/FP4 quantized variants for Hopper and Blackwell GPUs", + "date_updated": "2026-04-16", + "difficulty": "beginner", + "tasks": [ + "text" + ], + "performance_headline": "", + "related_recipes": [ + "meta-llama/Llama-4-Scout-17B-16E-Instruct" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "b200": "verified", + "gb200": "verified" + } + }, + "model": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "min_vllm_version": "0.12.0", + "architecture": "dense", + "parameter_count": "70B", + "active_parameters": "70B", + "context_length": 131072, + "base_args": [], + "base_env": {} + }, + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 170, + "description": 
"Full precision BF16" + }, + "fp8": { + "model_id": "nvidia/Llama-3.3-70B-Instruct-FP8", + "precision": "fp8", + "vram_minimum_gb": 84, + "description": "NVIDIA FP8 quantization for Hopper and Blackwell", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": {} + }, + "nvfp4": { + "model_id": "nvidia/Llama-3.3-70B-Instruct-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 42, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp" + ], + "hardware_overrides": { + "hopper": { + "extra_args": [ + "--async-scheduling", + "--no-enable-prefix-caching", + "--max-num-batched-tokens", + "8192" + ], + "extra_env": {} + }, + "blackwell": { + "extra_args": [ + "--async-scheduling", + "--no-enable-prefix-caching", + "--max-num-batched-tokens", + "8192", + "--compilation-config", + "{\"pass_config\":{\"fuse_allreduce_rms\":true,\"fuse_attn_quant\":true,\"eliminate_noops\":true}}" + ], + "extra_env": {} + }, + "amd": { + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nLlama 3.3 70B Instruct is Meta's 70-billion parameter dense language model.\nNVIDIA provides FP8 and FP4 quantized variants optimized for Hopper (H100/H200)\nand Blackwell (B200/GB200) GPUs. 
FP4 is Blackwell-only and provides the best\nVRAM efficiency.\n\n## Prerequisites\n\n- Hardware: 1x H100/H200 (FP8), 1x B200 (FP4), or 2x GPUs for BF16\n- vLLM >= 0.12.0\n- CUDA Driver >= 575\n- Docker with NVIDIA Container Toolkit (recommended)\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"unused\")\nresponse = client.chat.completions.create(\n model=\"nvidia/Llama-3.3-70B-Instruct-FP8\",\n messages=[{\"role\": \"user\", \"content\": \"Hello, how are you?\"}],\n)\nprint(response.choices[0].message.content)\n```\n\n## Troubleshooting\n\n**FP4 variant not loading:**\nFP4 is only supported on Blackwell (compute capability 10.0). Use FP8 on Hopper.\n\n**OOM with BF16 on single GPU:**\nUse the FP8 variant (~70 GB) or FP4 variant (~40 GB) to fit on a single GPU.\n\n## References\n\n- [Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct)\n- [NVIDIA FP8 variant](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP8)\n- [NVIDIA FP4 variant](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP4)\n" + } + }, + "meta-llama/Llama-4-Scout-17B-16E-Instruct": { + "hf_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "meta": { + "title": "Llama-4-Scout", + "provider": "Meta", + "description": "Llama 4 Scout 17B-16E MoE model with NVIDIA FP8/FP4 variants, fits on a single GPU with quantization", + "tasks": [ + "text" + ], + "hardware": { + "h100": "verified", + "b200": "verified", + "gb200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "109B" + }, + "recipe": { + "meta": { + "title": "Llama-4-Scout", + "slug": "llama-4-scout", + "provider": "Meta", + "description": "Llama 4 Scout 17B-16E MoE model with NVIDIA FP8/FP4 variants, fits on a single GPU with quantization", + "date_updated": "2026-04-16", + "difficulty": "beginner", + "tasks": [ + "text" + ], + "performance_headline": "", + "related_recipes": [ + 
"meta-llama/Llama-3.3-70B-Instruct" + ], + "hardware": { + "h100": "verified", + "b200": "verified", + "gb200": "verified" + } + }, + "model": { + "model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "min_vllm_version": "0.12.0", + "architecture": "moe", + "parameter_count": "109B", + "active_parameters": "17B", + "context_length": 10485760, + "base_args": [], + "base_env": {} + }, + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 262, + "description": "Full precision BF16" + }, + "fp8": { + "model_id": "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", + "precision": "fp8", + "vram_minimum_gb": 131, + "description": "NVIDIA FP8 quantization for Hopper and Blackwell, fits on 1x H100", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": {} + }, + "nvfp4": { + "model_id": "nvidia/Llama-4-Scout-17B-16E-Instruct-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 65, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": { + "hopper": { + "extra_args": [ + "--async-scheduling", + "--no-enable-prefix-caching", + "--max-num-batched-tokens", + "8192" + ], + "extra_env": {} + }, + "blackwell": { + "extra_args": [ + "--async-scheduling", + "--no-enable-prefix-caching", + "--max-num-batched-tokens", + "8192", + "--compilation-config", + "{\"pass_config\":{\"fuse_allreduce_rms\":true,\"eliminate_noops\":true}}" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP8": "1" + } + }, + "amd": { + "extra_args": [ + "--no-enable-prefix-caching", + "--max-num-batched-tokens", + "16384", + "--max-num-seqs", + "64", + "--max-model-len", + "32000" + ], + "extra_env": { + 
"VLLM_ROCM_USE_AITER": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nLlama 4 Scout is Meta's MoE model with 17B active parameters across 16 experts\n(109B total). NVIDIA provides FP8 and FP4 quantized variants. With FP4 quantization,\nthe model fits on a single B200 GPU \u2014 making it one of the most accessible MoE models.\n\n## Prerequisites\n\n- Hardware: 1x B200 (FP4), 1x H100 (FP8), or 4x GPUs (BF16)\n- vLLM >= 0.12.0\n- CUDA Driver >= 575\n- Docker with NVIDIA Container Toolkit (recommended)\n- License: Must agree to Meta's Llama 4 Scout Community License\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"unused\")\nresponse = client.chat.completions.create(\n model=\"nvidia/Llama-4-Scout-17B-16E-Instruct-FP8\",\n messages=[{\"role\": \"user\", \"content\": \"Explain MoE models briefly.\"}],\n)\nprint(response.choices[0].message.content)\n```\n\n## Troubleshooting\n\n**FP4 only works on Blackwell:**\nFP4 quantization requires compute capability 10.0 (B200/GB200). Use FP8 on Hopper.\n\n**TP=1 recommended for best throughput:**\nFor maximum throughput per GPU, keep TP=1. 
Increase TP to 2/4/8 for lower latency.\n\n## References\n\n- [Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct)\n- [NVIDIA FP8 variant](https://huggingface.co/nvidia/Llama-4-Scout-17B-16E-Instruct-FP8)\n- [NVIDIA FP4 variant](https://huggingface.co/nvidia/Llama-4-Scout-17B-16E-Instruct-FP4)\n" + } + }, + "microsoft/Phi-4-mini-instruct": { + "hf_id": "microsoft/Phi-4-mini-instruct", + "meta": { + "title": "Phi-4", + "provider": "Microsoft", + "description": "Microsoft's Phi-4 family of lightweight dense models (mini-instruct, reasoning, multimodal) with 128K context", + "tasks": [ + "text", + "multimodal" + ], + "hardware": { + "h100": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "4B" + }, + "recipe": { + "meta": { + "title": "Phi-4", + "slug": "phi-4", + "provider": "Microsoft", + "description": "Microsoft's Phi-4 family of lightweight dense models (mini-instruct, reasoning, multimodal) with 128K context", + "date_updated": "2026-04-17", + "difficulty": "beginner", + "tasks": [ + "text", + "multimodal" + ], + "related_recipes": [], + "hardware": { + "h100": "verified" + } + }, + "model": { + "model_id": "microsoft/Phi-4-mini-instruct", + "min_vllm_version": "0.7.0", + "architecture": "dense", + "parameter_count": "4B", + "active_parameters": "4B", + "context_length": 131072, + "base_args": [], + "base_env": {} + }, + "features": { + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. 
Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only", + "encoder_parallel" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 10, + "description": "Phi-4-mini-instruct, conversational instruction-tuned" + }, + "mini_reasoning": { + "model_id": "microsoft/Phi-4-mini-reasoning", + "precision": "bf16", + "vram_minimum_gb": 10, + "description": "Optimized for reasoning tasks" + }, + "reasoning": { + "model_id": "microsoft/Phi-4-reasoning", + "precision": "bf16", + "vram_minimum_gb": 30, + "description": "Advanced reasoning capabilities (14B)" + }, + "multimodal": { + "model_id": "microsoft/Phi-4-multimodal-instruct", + "precision": "bf16", + "vram_minimum_gb": 16, + "description": "Multimodal instruction-following (text + image)", + "extra_args": [ + "--trust-remote-code" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nThe Phi-4 family includes several lightweight, open models from Microsoft. These models\ncan process text and, in some variants, multimodal inputs like images, to generate\ntext outputs. 
They come with a 128K token context length.\n\n## Prerequisites\n\n- Hardware: 1x GPU with >=16 GB VRAM (A100, L40S, H100, etc.)\n- vLLM >= 0.7.0\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n## Launch commands\n\nPhi-4-mini-instruct on a single GPU:\n\n```bash\nvllm serve microsoft/Phi-4-mini-instruct \\\n --host 0.0.0.0 \\\n --max-model-len 4000\n```\n\nPhi-4-multimodal-instruct (requires --trust-remote-code for LoRA modules):\n\n```bash\nvllm serve microsoft/Phi-4-multimodal-instruct \\\n --host 0.0.0.0 \\\n --max-model-len 4000 \\\n --trust-remote-code\n```\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\", timeout=3600)\n\nresponse = client.chat.completions.create(\n model=\"microsoft/Phi-4-mini-instruct\",\n messages=[{\"role\": \"user\", \"content\": \"Write a short story\"}],\n temperature=0.0,\n)\nprint(response.choices[0].message.content)\n```\n\nMultimodal (requires Phi-4-multimodal-instruct):\n\n```python\nresponse = client.chat.completions.create(\n model=\"microsoft/Phi-4-multimodal-instruct\",\n messages=[{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": \"Describe this image.\"},\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://example.com/image.jpg\"}},\n ],\n }],\n)\n```\n\n## Available Phi-4 Variants\n\n- `microsoft/Phi-4-mini-instruct` \u2014 conversational instruction-tuned\n- `microsoft/Phi-4-mini-reasoning` \u2014 optimized for reasoning\n- `microsoft/Phi-4-reasoning` \u2014 advanced reasoning\n- `microsoft/Phi-4-multimodal-instruct` \u2014 multimodal (text + image)\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --model microsoft/Phi-4-mini-instruct \\\n --dataset-name random \\\n --random-input-len 2000 \\\n --random-output-len 512 \\\n --num-prompts 100\n```\n\n## Troubleshooting\n\n- Multimodal variant fails to load: add 
`--trust-remote-code`.\n\n## References\n\n- [Phi-4-mini-instruct](https://huggingface.co/microsoft/Phi-4-mini-instruct)\n- [Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct)\n- [Phi-4-reasoning](https://huggingface.co/microsoft/Phi-4-reasoning)\n" + } + }, + "mistralai/Ministral-3-14B-Instruct-2512": { + "hf_id": "mistralai/Ministral-3-14B-Instruct-2512", + "meta": { + "title": "Ministral-3-Instruct", + "provider": "Mistral AI", + "description": "Ministral 3 Instruct family (3B/8B/14B) with FP8 weights, vision support, and 256K context", + "tasks": [ + "multimodal" + ], + "hardware": { + "h200": "verified", + "mi300x": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "14B" + }, + "recipe": { + "meta": { + "title": "Ministral-3-Instruct", + "slug": "ministral-3-instruct", + "provider": "Mistral AI", + "description": "Ministral 3 Instruct family (3B/8B/14B) with FP8 weights, vision support, and 256K context", + "date_updated": "2026-05-25", + "difficulty": "beginner", + "tasks": [ + "multimodal" + ], + "related_recipes": [ + "mistralai/Ministral-3-8B-Reasoning-2512", + "mistralai/Mistral-Large-3-675B-Instruct-2512" + ], + "hardware": { + "h200": "verified", + "mi300x": "verified" + } + }, + "model": { + "model_id": "mistralai/Ministral-3-14B-Instruct-2512", + "min_vllm_version": "0.11.0", + "architecture": "dense", + "parameter_count": "14B", + "active_parameters": "14B", + "context_length": 262144, + "base_args": [ + "--tokenizer_mode", + "mistral", + "--config_format", + "mistral", + "--load_format", + "mistral" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Mistral tool-call parser with automatic tool choice", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "mistral" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. 
Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only", + "encoder_parallel" + ], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 17, + "description": "Native FP8 weights (14B), fits on 1x H200" + }, + "8b": { + "model_id": "mistralai/Ministral-3-8B-Instruct-2512", + "precision": "fp8", + "vram_minimum_gb": 12, + "description": "Smaller 8B variant with independent embedding/output layers" + }, + "3b": { + "model_id": "mistralai/Ministral-3-3B-Instruct-2512", + "precision": "fp8", + "vram_minimum_gb": 6, + "description": "Smallest 3B variant with tied embeddings" + }, + "fp8": { + "model_id": "mistralai/Ministral-3-14B-Instruct-2512-FP8", + "precision": "fp8", + "vram_minimum_gb": 17, + "description": "FP8 quantized weights for Hopper/Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": { + "amd": { + "extra_args": [ + "--config_format", + "mistral", + "--load_format", + "mistral" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nMinistral-3 Instruct comes with FP8 weights in 3 different sizes:\n\n- 3B: tied embeddings (shares embedding and output layers)\n- 8B and 14B: independent embedding and output layers\n\nEach variant has vision support and a 256K context length. 
Smaller models offer faster\ninference at the cost of lower quality; pick the best trade-off for your use case.\n\n## Prerequisites\n\n- Hardware: 1x H200 (sufficient for all three sizes thanks to FP8 weights);\n 1x MI300X (verified) / MI325X / MI355X\n- vLLM >= 0.11.0\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n## Launch command\n\n```bash\nvllm serve mistralai/Ministral-3-14B-Instruct-2512 \\\n --tokenizer_mode mistral --config_format mistral --load_format mistral \\\n --enable-auto-tool-choice --tool-call-parser mistral\n```\n\nFor 8B: `mistralai/Ministral-3-8B-Instruct-2512`\nFor 3B: `mistralai/Ministral-3-3B-Instruct-2512`\n\n- `enable-auto-tool-choice`: required for tool usage\n- `tool-call-parser mistral`: required for tool usage\n- `--max-model-len` defaults to `262144`; reduce to save memory\n- `--max-num-batched-tokens` balances throughput and latency\n\n### AMD (MI300X / MI325X / MI355X)\n\nVerified on an 8-GPU MI300X node with **TP=1** per variant.\n\n**3B**\n\n```bash\ndocker run --device=/dev/kfd --device=/dev/dri \\\n --security-opt seccomp=unconfined --group-add video \\\n --privileged --ipc=host -p 8000:8000 \\\n -e VLLM_ROCM_USE_AITER=1 \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai-rocm:latest mistralai/Ministral-3-3B-Instruct-2512 \\\n --tokenizer_mode mistral \\\n --tensor-parallel-size 1 \\\n --config_format mistral \\\n --load_format mistral \\\n --enable-auto-tool-choice \\\n --tool-call-parser mistral\n```\n\n**8B**\n\n```bash\ndocker run --device=/dev/kfd --device=/dev/dri \\\n --security-opt seccomp=unconfined --group-add video \\\n --privileged --ipc=host -p 8000:8000 \\\n -e VLLM_ROCM_USE_AITER=1 \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai-rocm:latest mistralai/Ministral-3-8B-Instruct-2512 \\\n --tokenizer_mode mistral \\\n --tensor-parallel-size 1 \\\n --config_format mistral \\\n --load_format 
mistral \\\n --max-model-len auto \\\n --max-num-batched-tokens 8192 \\\n --enable-auto-tool-choice \\\n --tool-call-parser mistral\n```\n\n**14B**\n\n```bash\ndocker run --device=/dev/kfd --device=/dev/dri \\\n --security-opt seccomp=unconfined --group-add video \\\n --privileged --ipc=host -p 8000:8000 \\\n -e VLLM_ROCM_USE_AITER=1 \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai-rocm:latest mistralai/Ministral-3-14B-Instruct-2512 \\\n --tokenizer_mode mistral \\\n --tensor-parallel-size 1 \\\n --config_format mistral \\\n --load_format mistral \\\n --max-num-seqs 256 \\\n --max-model-len auto \\\n --gpu-memory-utilization 0.95 \\\n --max-num-batched-tokens 8192 \\\n --enable-auto-tool-choice \\\n --tool-call-parser mistral\n```\n\n## Client Usage\n\nVision reasoning example:\n\n```python\nfrom datetime import datetime, timedelta\nfrom openai import OpenAI\nfrom huggingface_hub import hf_hub_download\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nmodel = client.models.list().data[0].id\n\ndef load_system_prompt(repo_id, filename):\n path = hf_hub_download(repo_id=repo_id, filename=filename)\n with open(path) as f:\n prompt = f.read()\n today = datetime.today().strftime(\"%Y-%m-%d\")\n yesterday = (datetime.today() - timedelta(days=1)).strftime(\"%Y-%m-%d\")\n return prompt.format(name=repo_id.split(\"/\")[-1], today=today, yesterday=yesterday)\n\nSYSTEM_PROMPT = load_system_prompt(model, \"SYSTEM_PROMPT.txt\")\nimage_url = \"https://static.wikia.nocookie.net/essentialsdocs/images/7/70/Battle.png/revision/latest?cb=20220523172438\"\n\nresponse = client.chat.completions.create(\n model=model,\n messages=[\n {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n {\"role\": \"user\", \"content\": [\n {\"type\": \"text\", \"text\": \"What action should I take here?\"},\n {\"type\": \"image_url\", \"image_url\": {\"url\": image_url}},\n ]},\n ],\n temperature=0.15, 
max_tokens=262144,\n)\nprint(response.choices[0].message.content)\n```\n\nFunction calling and text-only examples follow a similar OpenAI-compatible pattern.\n\n## Benchmarking (MI300X verification)\n\nServing benchmarks used `vllm bench serve` with random 1024-token input/output,\n`--max-concurrency 32`, and `--num-prompts 100` against each variant above.\nAccuracy used `lm-eval` GSM8K (5-shot, `flexible-extract` / `strict-match` filters).\n\n\n### Throughput\n\n| Variant | Output tok/s | Mean TTFT (ms) | Mean TPOT (ms) |\n|---------|-------------:|---------------:|---------------:|\n| 3B | 3842 | 288 | 6.42 |\n| 8B | 2468 | 1117 | 9.48 |\n| 14B | 1941 | 1229 | 12.22 |\n\n### GSM8K (5-shot `exact_match`)\n\n| Variant | flexible-extract | strict-match |\n|---------|-----------------:|-------------:|\n| 3B | 0.7786 \u00b1 0.0114 | 0.7445 \u00b1 0.0120 |\n| 8B | 0.8560 \u00b1 0.0097 | 0.8491 \u00b1 0.0099 |\n| 14B | 0.8795 \u00b1 0.0090 | 0.8764 \u00b1 0.0091 |\n\n**14B** full `vllm bench serve` output (TP=1, MI300X):\n\n```shell\n============ Serving Benchmark Result ============\nSuccessful requests: 100\nFailed requests: 0\nMaximum request concurrency: 32\nBenchmark duration (s): 52.76\nTotal input tokens: 102400\nTotal generated tokens: 102400\nRequest throughput (req/s): 1.90\nOutput token throughput (tok/s): 1940.79\nPeak output token throughput (tok/s): 3126.00\nPeak concurrent requests: 64.00\nTotal token throughput (tok/s): 3881.58\n---------------Time to First Token----------------\nMean TTFT (ms): 1228.72\nMedian TTFT (ms): 952.25\nP99 TTFT (ms): 2925.25\n-----Time per Output Token (excl. 1st token)------\nMean TPOT (ms): 12.22\nMedian TPOT (ms): 12.25\nP99 TPOT (ms): 12.83\n---------------Inter-token Latency----------------\nMean ITL (ms): 12.22\nMedian ITL (ms): 11.78\nP99 ITL (ms): 13.66\n==================================================\n```\n\n## Troubleshooting\n\n- OOM: lower `--max-model-len` (e.g. 
32768) or use the 3B/8B variant.\n\n## References\n\n- [Ministral-3-14B-Instruct](https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512)\n- [Ministral-3-8B-Instruct](https://huggingface.co/mistralai/Ministral-3-8B-Instruct-2512)\n- [Ministral-3-3B-Instruct](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512)\n" + } + }, + "mistralai/Ministral-3-8B-Reasoning-2512": { + "hf_id": "mistralai/Ministral-3-8B-Reasoning-2512", + "meta": { + "title": "Ministral-3-Reasoning", + "provider": "Mistral AI", + "description": "Ministral 3 Reasoning family (3B/8B/14B) with BF16 weights, vision support, and 256K context", + "tasks": [ + "multimodal" + ], + "hardware": { + "h200": "verified", + "gb200": "verified", + "mi300x": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "8B" + }, + "recipe": { + "meta": { + "title": "Ministral-3-Reasoning", + "slug": "ministral-3-reasoning", + "provider": "Mistral AI", + "description": "Ministral 3 Reasoning family (3B/8B/14B) with BF16 weights, vision support, and 256K context", + "date_updated": "2026-05-06", + "difficulty": "intermediate", + "tasks": [ + "multimodal" + ], + "related_recipes": [ + "mistralai/Ministral-3-14B-Instruct-2512", + "mistralai/Mistral-Large-3-675B-Instruct-2512" + ], + "hardware": { + "h200": "verified", + "gb200": "verified", + "mi300x": "verified" + } + }, + "model": { + "model_id": "mistralai/Ministral-3-8B-Reasoning-2512", + "min_vllm_version": "0.11.0", + "architecture": "dense", + "parameter_count": "8B", + "active_parameters": "8B", + "context_length": 262144, + "base_args": [ + "--tokenizer_mode", + "mistral", + "--config_format", + "mistral", + "--load_format", + "mistral" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Mistral tool-call parser with automatic tool choice", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "mistral" + ] + }, + "reasoning": { + "description": "Mistral reasoning parser 
extracts ... into message.reasoning", + "args": [ + "--reasoning-parser", + "mistral" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only", + "encoder_parallel" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 22, + "description": "Native BF16 weights (8B)" + }, + "3b": { + "model_id": "mistralai/Ministral-3-3B-Reasoning-2512", + "precision": "bf16", + "vram_minimum_gb": 8, + "description": "Smallest 3B variant with tied embeddings" + }, + "14b": { + "model_id": "mistralai/Ministral-3-14B-Reasoning-2512", + "precision": "bf16", + "vram_minimum_gb": 32, + "description": "Largest 14B variant; 2xH200 recommended for full context", + "extra_args": [ + "--tensor-parallel-size", + "2" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": { + "amd": { + "extra_args": [ + "--no-enable-prefix-caching" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "SAFETENSORS_FAST_GPU": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nMinistral-3 Reasoning comes with BF16 weights in 3 sizes:\n\n- 3B (tied embeddings)\n- 8B, 14B (independent embeddings/outputs)\n\nEach variant has vision support and 256K max context. 
On GB200, we observe significant\nspeed-ups with NVFP4 Marlin fallback for older GPUs.\n\n## Prerequisites\n\n- Hardware: 1x H200 (3B/8B), 2x H200 recommended for 14B with full context\n- vLLM >= 0.11.0\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n## Launch command\n\n3B or 8B on 1x H200:\n\n```bash\nvllm serve mistralai/Ministral-3-8B-Reasoning-2512 \\\n --tokenizer_mode mistral --config_format mistral --load_format mistral \\\n --enable-auto-tool-choice --tool-call-parser mistral \\\n --reasoning-parser mistral\n```\n\n14B on 2x H200:\n\n```bash\nvllm serve mistralai/Ministral-3-14B-Reasoning-2512 \\\n --tensor-parallel-size 2 \\\n --tokenizer_mode mistral --config_format mistral --load_format mistral \\\n --enable-auto-tool-choice --tool-call-parser mistral \\\n --reasoning-parser mistral\n```\n\nKey flags:\n- `enable-auto-tool-choice`: required for tool usage\n- `tool-call-parser mistral`: required for tool usage\n- `reasoning-parser mistral`: required to extract reasoning content\n\n## Client Usage\n\nStreaming reasoning + answer:\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nstream = client.chat.completions.create(\n model=\"mistralai/Ministral-3-8B-Reasoning-2512\",\n messages=[{\"role\": \"user\", \"content\": \"Solve: use 2,5,6,3 to make 24.\"}],\n stream=True, temperature=0.7, top_p=0.95, max_tokens=262144,\n)\nfor chunk in stream:\n delta = chunk.choices[0].delta\n rc = getattr(delta, \"reasoning_content\", None)\n if rc:\n print(rc, end=\"\", flush=True)\n if delta.content:\n print(delta.content, end=\"\", flush=True)\n```\n\n## Troubleshooting\n\n- OOM on 14B: use `--tensor-parallel-size 2` or lower `--max-model-len`.\n\n## References\n\n- [Ministral-3-8B-Reasoning](https://huggingface.co/mistralai/Ministral-3-8B-Reasoning-2512)\n- 
[Ministral-3-3B-Reasoning](https://huggingface.co/mistralai/Ministral-3-3B-Reasoning-2512)\n- [Ministral-3-14B-Reasoning](https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512)\n" + } + }, + "mistralai/Mistral-Large-3-675B-Instruct-2512": { + "hf_id": "mistralai/Mistral-Large-3-675B-Instruct-2512", + "meta": { + "title": "Mistral-Large-3-675B-Instruct", + "provider": "Mistral AI", + "description": "Mistral Large 3 (675B) with FP8 and NVFP4 weights for 8xH200 / 4xB200 deployments", + "tasks": [ + "multimodal" + ], + "hardware": { + "h100": "verified", + "b200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "675B" + }, + "recipe": { + "meta": { + "title": "Mistral-Large-3-675B-Instruct", + "slug": "mistral-large-3-675b-instruct", + "provider": "Mistral AI", + "description": "Mistral Large 3 (675B) with FP8 and NVFP4 weights for 8xH200 / 4xB200 deployments", + "date_updated": "2026-04-17", + "difficulty": "advanced", + "tasks": [ + "multimodal" + ], + "related_recipes": [ + "mistralai/Ministral-3-14B-Instruct-2512" + ], + "hardware": { + "h100": "verified", + "b200": "verified" + } + }, + "model": { + "model_id": "mistralai/Mistral-Large-3-675B-Instruct-2512", + "min_vllm_version": "0.11.0", + "architecture": "moe", + "parameter_count": "675B", + "active_parameters": "22B", + "context_length": 294912, + "base_args": [ + "--tokenizer_mode", + "mistral", + "--config_format", + "mistral", + "--load_format", + "mistral" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Mistral tool-call parser with automatic tool choice", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "mistral" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. 
Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only" + ], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 810, + "description": "FP8 weights on 8xH200 (recommended for fine-tuning; up to 256K context)", + "extra_args": [ + "--tensor-parallel-size", + "8" + ] + }, + "nvfp4": { + "model_id": "mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 405, + "description": "NVFP4 weights on 4xB200 (use for <64K context; B200-native, Marlin fallback on A100/H100)", + "extra_args": [ + "--tensor-parallel-size", + "4" + ] + }, + "fp8": { + "model_id": "mistralai/Mistral-Large-3-675B-Instruct-2512-FP8", + "precision": "fp8", + "vram_minimum_gb": 810, + "description": "FP8 quantized weights for Hopper/Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep", + "pd_cluster" + ], + "hardware_overrides": { + "amd": { + "extra_args": [ + "--no-enable-prefix-caching" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "SAFETENSORS_FAST_GPU": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nMistral-Large-3-675B-Instruct-2512 is available in FP8 and NVFP4 formats:\n\n- **FP8** (`mistralai/Mistral-Large-3-675B-Instruct-2512`): up to 256K context, 8xH200\n- **NVFP4** (`mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4`): best for <64K context, 4xB200\n\nNVFP4 gives significant speed-up on B200 (native FP4 support). 
On older GPUs (A100/H100),\nvLLM falls back to Marlin FP4, which matches FP8 speed while saving memory.\n\nFor large contexts (>64K) we observed a performance regression on NVFP4 \u2014 use FP8 in\nthose cases. A minor regression on vision datasets is expected with NVFP4 (calibration\nwas mainly on text).\n\n## Prerequisites\n\n- Hardware: 8xH200 for FP8, 4xB200 for NVFP4\n- vLLM >= 0.11.0\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n## Launch commands\n\nFP8 on 8xH200:\n\n```bash\nvllm serve mistralai/Mistral-Large-3-675B-Instruct-2512 \\\n --tensor-parallel-size 8 \\\n --tokenizer_mode mistral --config_format mistral --load_format mistral \\\n --enable-auto-tool-choice --tool-call-parser mistral\n```\n\nNVFP4 on 4xB200:\n\n```bash\nvllm serve mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4 \\\n --tensor-parallel-size 4 \\\n --tokenizer_mode mistral --config_format mistral --load_format mistral \\\n --enable-auto-tool-choice --tool-call-parser mistral\n```\n\nAdditional flags:\n- `--max-model-len`: default 262144; reduce to save memory\n- `--max-num-batched-tokens`: balance throughput vs. 
latency\n- `--limit-mm-per-prompt.image 0`: skip vision encoder for text-only tasks\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresponse = client.chat.completions.create(\n model=\"mistralai/Mistral-Large-3-675B-Instruct-2512\",\n messages=[{\"role\": \"user\", \"content\": \"Write a sentence...\"}],\n temperature=0.15, max_tokens=262144,\n)\nprint(response.choices[0].message.content)\n```\n\n## Troubleshooting\n\n- Accuracy regression with NVFP4 at long context: switch to FP8 variant.\n- OOM: reduce `--max-model-len` or adjust TP size.\n\n## References\n\n- [Mistral-Large-3 FP8](https://huggingface.co/mistralai/Mistral-Large-3-675B-Instruct-2512)\n- [Mistral-Large-3 NVFP4](https://huggingface.co/mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4)\n" + } + }, + "mistralai/Mistral-Medium-3.5-128B": { + "hf_id": "mistralai/Mistral-Medium-3.5-128B", + "meta": { + "title": "Mistral-Medium-3.5", + "provider": "Mistral AI", + "description": "Mistral Medium 3.5 (128B) dense vision-language model with native FP8 weights and 256K context", + "tasks": [ + "multimodal" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "128B" + }, + "recipe": { + "meta": { + "title": "Mistral-Medium-3.5", + "slug": "mistral-medium-3.5", + "provider": "Mistral AI", + "description": "Mistral Medium 3.5 (128B) dense vision-language model with native FP8 weights and 256K context", + "date_updated": "2026-04-30", + "difficulty": "intermediate", + "tasks": [ + "multimodal" + ], + "related_recipes": [ + "mistralai/Mistral-Large-3-675B-Instruct-2512", + "mistralai/Ministral-3-8B-Reasoning-2512" + ] + }, + "model": { + "model_id": "mistralai/Mistral-Medium-3.5-128B", + "min_vllm_version": "nightly", + "nightly_required": true, + "architecture": "dense", + "parameter_count": "128B", + "active_parameters": "128B", + "context_length": 262144, + "base_args": [ + 
"--tokenizer_mode", + "mistral", + "--config_format", + "mistral", + "--load_format", + "mistral" + ], + "base_env": {} + }, + "dependencies": [ + { + "note": "Mistral tokenizer / chat-template runtime \u2014 Mistral 3.5 needs >= 1.11.1 (auto-installed by vLLM nightly, pin if you hit an older cached wheel)", + "command": "uv pip install -U \"mistral_common>=1.11.1\"" + } + ], + "features": { + "tool_calling": { + "description": "Mistral tool-call parser with automatic tool choice", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "mistral" + ] + }, + "reasoning": { + "description": "Mistral reasoning parser extracts [THINK]...[/THINK] into message.reasoning (emitted when reasoning_effort='high')", + "args": [ + "--reasoning-parser", + "mistral" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. 
Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + }, + "spec_decoding": { + "description": "EAGLE speculative decoding via mistralai/Mistral-Medium-3.5-128B-EAGLE draft head", + "args": [ + "--speculative-config", + "{\"model\":\"mistralai/Mistral-Medium-3.5-128B-EAGLE\",\"num_speculative_tokens\":3,\"method\":\"eagle\",\"max_model_len\":\"65536\"}" + ] + } + }, + "opt_in_features": [ + "text_only", + "encoder_parallel", + "spec_decoding" + ], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 154, + "description": "Native FP8 weights (vision tower / projector / lm_head kept in BF16); recommended on 8xH200 or 4xB200", + "extra_args": [ + "--tensor-parallel-size", + "8" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": { + "amd": { + "extra_args": [ + "--no-enable-prefix-caching" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "SAFETENSORS_FAST_GPU": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nMistral-Medium-3.5 is a 128B dense vision-language model from Mistral AI. The\nweights ship pre-quantized to FP8 (E4M3) with the vision tower, multimodal\nprojector, and `lm_head` retained in BF16. Image input is supported up to\n1540x1540 (Pixtral-style encoder, patch size 14). Context length is 256K\nvia YaRN scaling (factor 64x over the 4K base).\n\nReasoning is opt-in per request via `reasoning_effort: \"high\"` \u2014 when set,\nthe model emits `[THINK]...[/THINK]` blocks that the Mistral reasoning\nparser surfaces as `message.reasoning_content`. 
Tool calling uses the\n`[AVAILABLE_TOOLS]` / `[TOOL_CALLS]` chat-template tokens.\n\n## Prerequisites\n\n- Hardware: 8xH200 (recommended) or 4xB200; single B200 / MI300X also fits the\n weights (~134 GB raw) but leaves little room for the 256K KV cache.\n- vLLM **nightly** (Mistral 3.5 architecture support has not yet shipped in a\n stable release).\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend=auto \\\n --extra-index-url https://wheels.vllm.ai/nightly\n```\n\nThis pulls in `mistral_common >= 1.11.1` and `transformers >= 5.4.0` automatically.\n\n## Launch command\n\n8xH200 (or 8xB200):\n\n```bash\nvllm serve mistralai/Mistral-Medium-3.5-128B \\\n --tensor-parallel-size 8 \\\n --tokenizer_mode mistral --config_format mistral --load_format mistral \\\n --enable-auto-tool-choice --tool-call-parser mistral \\\n --reasoning-parser mistral\n```\n\nUseful flags:\n\n- `--max-model-len`: default 262144; lower it (e.g. 65536) to free VRAM for\n larger batch sizes on tighter GPU pools.\n- `--language-model-only`: skip the vision encoder entirely for text-only\n workloads.\n- `--mm-encoder-tp-mode data`: run the small vision encoder data-parallel\n instead of tensor-parallel \u2014 avoids the all-reduce overhead.\n- `--limit-mm-per-prompt.image N`: cap images per request.\n\n## EAGLE speculative decoding\n\nMistral ships a dedicated EAGLE draft head at\n[`mistralai/Mistral-Medium-3.5-128B-EAGLE`](https://huggingface.co/mistralai/Mistral-Medium-3.5-128B-EAGLE).\nIt is **not** included in the default config \u2014 toggle the `spec_decoding` feature.\n\nMistral's recommended serve command (from the EAGLE model card):\n\n```bash\nvllm serve mistralai/Mistral-Medium-3.5-128B --tensor-parallel-size 8 \\\n --tool-call-parser mistral --enable-auto-tool-choice --reasoning-parser mistral \\\n --max_num_batched_tokens 16384 --max_num_seqs 128 --gpu_memory_utilization 0.8 \\\n --speculative_config 
'{\"model\":\"mistralai/Mistral-Medium-3.5-128B-EAGLE\",\"num_speculative_tokens\":3,\"method\":\"eagle\",\"max_model_len\":\"65536\"}'\n```\n\nThe draft model is a 2-layer Mistral-style head trained on the 128B target;\nit shares the tokenizer and runs at TP=8 alongside the target.\n\n## Client usage\n\nReasoning + tool calling against the OpenAI-compatible endpoint:\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"mistralai/Mistral-Medium-3.5-128B\",\n messages=[{\"role\": \"user\", \"content\": \"Plan a 3-day Paris trip.\"}],\n extra_body={\"reasoning_effort\": \"high\"},\n temperature=0.7, max_tokens=4096,\n)\nmsg = resp.choices[0].message\nprint(\"reasoning:\", getattr(msg, \"reasoning_content\", None))\nprint(\"answer:\", msg.content)\n```\n\nImage input (vision):\n\n```python\nresp = client.chat.completions.create(\n model=\"mistralai/Mistral-Medium-3.5-128B\",\n messages=[{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://...\"}},\n {\"type\": \"text\", \"text\": \"Describe this image.\"},\n ],\n }],\n max_tokens=512,\n)\n```\n\n## Troubleshooting\n\n- OOM at full 256K context on H200: drop `--max-model-len` to 131072 or 65536,\n or set `--language-model-only` if you don't need vision.\n- `reasoning_effort` rejected: only `\"none\"` and `\"high\"` are accepted by the\n chat template \u2014 anything else raises an exception.\n\n## References\n\n- [Model card](https://huggingface.co/mistralai/Mistral-Medium-3.5-128B)\n" + } + }, + "mistralai/Mistral-Small-4-119B-2603": { + "hf_id": "mistralai/Mistral-Small-4-119B-2603", + "meta": { + "title": "Mistral-Small-4-119B", + "provider": "Mistral AI", + "description": "Mistral Small 4 (119B MoE, 6.5B active) \u2014 multimodal hybrid instruct + reasoning model with native FP8 weights and 256K context", + "tasks": [ + "multimodal" + ], + "hardware": 
{} + }, + "model_info": { + "architecture": "moe", + "parameter_count": "119B" + }, + "recipe": { + "meta": { + "title": "Mistral-Small-4-119B", + "slug": "mistral-small-4-119b", + "provider": "Mistral AI", + "description": "Mistral Small 4 (119B MoE, 6.5B active) \u2014 multimodal hybrid instruct + reasoning model with native FP8 weights and 256K context", + "date_updated": "2026-05-13", + "difficulty": "intermediate", + "tasks": [ + "multimodal" + ], + "related_recipes": [ + "mistralai/Mistral-Medium-3.5-128B", + "mistralai/Mistral-Large-3-675B-Instruct-2512" + ] + }, + "model": { + "model_id": "mistralai/Mistral-Small-4-119B-2603", + "min_vllm_version": "0.20.0", + "architecture": "moe", + "parameter_count": "119B", + "active_parameters": "6.5B", + "context_length": 262144, + "base_args": [ + "--max-model-len", + "262144", + "--attention-backend", + "FLASH_ATTN_MLA" + ], + "base_env": {} + }, + "dependencies": [ + { + "note": "Mistral tokenizer / chat-template runtime \u2014 Mistral Small 4 needs >= 1.11.0 (vLLM 0.20.1+ bundles it, pin explicitly if you hit an older cached wheel)", + "command": "uv pip install -U \"mistral_common>=1.11.0\"" + }, + { + "note": "Transformers v5 silences YaRN warnings and is required for the latest Mistral 4 chat template", + "command": "uv pip install -U transformers" + } + ], + "features": { + "tool_calling": { + "description": "Mistral tool-call parser with automatic tool choice \u2014 emits [TOOL_CALLS] / [ARGS] from the chat template", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "mistral" + ] + }, + "reasoning": { + "description": "Mistral reasoning parser extracts [THINK]...[/THINK] into message.reasoning_content (emitted when reasoning_effort='high')", + "args": [ + "--reasoning-parser", + "mistral" + ] + }, + "spec_decoding": { + "description": "EAGLE speculative decoding via the mistralai/Mistral-Small-4-119B-2603-eagle 2-layer draft head", + "args": [ + "--speculative-config", + 
"{\"model\":\"mistralai/Mistral-Small-4-119B-2603-eagle\",\"num_speculative_tokens\":3,\"method\":\"eagle\",\"max_model_len\":\"65536\"}" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 143, + "description": "Native FP8 E4M3 weights (vision tower / projector / lm_head kept in BF16); recommended on 2xB200/H200 or MI300X with FLASH_ATTN_MLA" + }, + "nvfp4": { + "model_id": "mistralai/Mistral-Small-4-119B-2603-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 72, + "description": "NVFP4 4-bit weights for B200-class GPUs (Marlin fallback on Hopper); overrides the attention backend to TRITON_MLA", + "extra_args": [ + "--attention-backend", + "TRITON_MLA" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "pd_cluster" + ], + "hardware_overrides": { + "amd": { + "extra_args": [ + "--no-enable-prefix-caching" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "SAFETENSORS_FAST_GPU": "1" + } + } + }, + "strategy_overrides": { + "single_node_tp": { + "tp": 2 + } + }, + "guide": "## Overview\n\nMistral Small 4 119B is a hybrid Mixture-of-Experts model from Mistral AI:\n**128 experts, 4 active per token** (plus 1 shared expert), 119B total\nparameters with **6.5B activated per token**. It unifies the capabilities of\nthree earlier Mistral families \u2014 **Instruct**, **Reasoning** (formerly\nMagistral), and **Devstral** \u2014 into a single checkpoint, with per-request\ntoggling between fast instant-reply and step-by-step reasoning via\n`reasoning_effort`.\n\nThe weights ship pre-quantized to FP8 E4M3 (the vision tower, multimodal\nprojector, and `lm_head` are kept in BF16). Multimodal input accepts text\nand image (Pixtral-style encoder, 1540x1540, patch size 14). 
Context length\nis **256K** via YaRN scaling (factor 128x over the 8K base) \u2014 the config\nadvertises 1M positions but Mistral recommends serving at 256K.\n\nMistral Small 4 also ships two companion checkpoints:\n\n- **NVFP4** ([`mistralai/Mistral-Small-4-119B-2603-NVFP4`](https://huggingface.co/mistralai/Mistral-Small-4-119B-2603-NVFP4)) \u2014\n 4-bit `compressed-tensors` weights (~72 GB raw); served with the\n `TRITON_MLA` backend.\n- **EAGLE draft head** ([`mistralai/Mistral-Small-4-119B-2603-eagle`](https://huggingface.co/mistralai/Mistral-Small-4-119B-2603-eagle)) \u2014\n a 2-layer Mistral-style draft trained on the 119B target; enable via the\n `spec_decoding` feature.\n\n## Prerequisites\n\n- Hardware: 2xB200 / 2xH200 / 2xMI300X (FP8 default) or 1xB200 with reduced\n context (NVFP4).\n- vLLM **>= 0.20.0** \u2014 earlier releases load the model but trip on the\n Mistral tool-call / reasoning parsers fixed in PR #39217 (in 0.19.1+)\n and the grammar factory landed in 0.20.0.\n\n### Install vLLM\n\n```bash\nuv venv && source .venv/bin/activate\nuv pip install -U vllm --torch-backend=auto\nuv pip install -U \"mistral_common>=1.11.0\" transformers\n```\n\nVerify with `python -c \"import mistral_common; print(mistral_common.__version__)\"`.\n\n## Launch command\n\nFP8 default (2xB200 / 2xH200):\n\n```bash\nvllm serve mistralai/Mistral-Small-4-119B-2603 \\\n --max-model-len 262144 \\\n --tensor-parallel-size 2 \\\n --attention-backend FLASH_ATTN_MLA \\\n --tool-call-parser mistral --enable-auto-tool-choice \\\n --reasoning-parser mistral \\\n --max_num_batched_tokens 16384 --max_num_seqs 128 \\\n --gpu_memory_utilization 0.8\n```\n\nNVFP4 variant (single B200, or 2xB200 for full 256K context):\n\n```bash\nvllm serve mistralai/Mistral-Small-4-119B-2603-NVFP4 \\\n --max-model-len 262144 \\\n --tensor-parallel-size 2 \\\n --attention-backend TRITON_MLA \\\n --tool-call-parser mistral --enable-auto-tool-choice \\\n --reasoning-parser mistral \\\n 
--max_num_batched_tokens 16384 --max_num_seqs 128 \\\n --gpu_memory_utilization 0.8\n```\n\nMistral publishes a custom Docker image\n[`mistralllm/vllm-ms4:latest`](https://hub.docker.com/r/mistralllm/vllm-ms4)\nwith patched tool-call / reasoning parsing \u2014 use it if you're pinned to a\nvLLM version below 0.20.0 and hit either of those issues.\n\n## EAGLE speculative decoding\n\nThe EAGLE draft head is **not** included in the default config \u2014 toggle the\n`spec_decoding` feature (or pass `--speculative_config` directly):\n\n```bash\nvllm serve mistralai/Mistral-Small-4-119B-2603 \\\n --max-model-len 262144 \\\n --tensor-parallel-size 2 \\\n --attention-backend FLASH_ATTN_MLA \\\n --tool-call-parser mistral --enable-auto-tool-choice \\\n --reasoning-parser mistral \\\n --max_num_batched_tokens 16384 --max_num_seqs 128 \\\n --gpu_memory_utilization 0.8 \\\n --speculative_config '{\n \"model\": \"mistralai/Mistral-Small-4-119B-2603-eagle\",\n \"num_speculative_tokens\": 3,\n \"method\": \"eagle\",\n \"max_model_len\": \"65536\"\n }'\n```\n\n## Client usage\n\nReasoning is opt-in per request via `reasoning_effort`. Only `\"none\"` and\n`\"high\"` are accepted \u2014 anything else raises an exception in the chat\ntemplate. 
Recommended sampling:\n\n- `reasoning_effort=\"none\"`: `temperature` 0.0\u20130.7 depending on task.\n- `reasoning_effort=\"high\"`: `temperature=0.7`.\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"mistralai/Mistral-Small-4-119B-2603\",\n messages=[{\"role\": \"user\", \"content\": \"Plan a 3-day Paris trip.\"}],\n extra_body={\"reasoning_effort\": \"high\"},\n temperature=0.7, max_tokens=4096,\n)\nmsg = resp.choices[0].message\nprint(\"reasoning:\", getattr(msg, \"reasoning_content\", None))\nprint(\"answer:\", msg.content)\n```\n\nImage input (vision):\n\n```python\nresp = client.chat.completions.create(\n model=\"mistralai/Mistral-Small-4-119B-2603\",\n messages=[{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://...\"}},\n {\"type\": \"text\", \"text\": \"Describe this image.\"},\n ],\n }],\n max_tokens=512,\n)\n```\n\nTool calling follows the standard OpenAI schema \u2014 the chat template emits\n`[AVAILABLE_TOOLS]` / `[TOOL_CALLS]` tokens which the `mistral` tool-call\nparser surfaces as `message.tool_calls`.\n\n## Troubleshooting\n\n- **OOM at full 256K context** on 2xH200: drop `--max-model-len` to 131072\n or 65536, or move to NVFP4.\n- **`reasoning_effort` rejected**: the chat template only accepts `\"none\"`\n and `\"high\"`.\n- **NVFP4 weight-loader errors** on older wheels: try Mistral's\n `mistralllm/vllm-ms4:latest` Docker image, which carries the parser /\n weight-loader fixes Mistral ships ahead of the upstream merge.\n\n## References\n\n- [Model card](https://huggingface.co/mistralai/Mistral-Small-4-119B-2603)\n- [NVFP4 variant](https://huggingface.co/mistralai/Mistral-Small-4-119B-2603-NVFP4)\n- [EAGLE draft head](https://huggingface.co/mistralai/Mistral-Small-4-119B-2603-eagle)\n" + } + }, + "mistralai/Voxtral-Mini-4B-Realtime-2602": { + "hf_id": 
"mistralai/Voxtral-Mini-4B-Realtime-2602", + "meta": { + "title": "Voxtral-Mini-4B-Realtime-2602", + "provider": "Mistral AI", + "description": "Multilingual realtime speech transcription (13 languages) with a natively streaming causal audio encoder; configurable 80ms\u20132.4s transcription delay served via vLLM's Realtime API", + "tasks": [ + "multimodal" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "4.4B" + }, + "recipe": { + "meta": { + "title": "Voxtral-Mini-4B-Realtime-2602", + "slug": "voxtral-mini-4b-realtime-2602", + "provider": "Mistral AI", + "description": "Multilingual realtime speech transcription (13 languages) with a natively streaming causal audio encoder; configurable 80ms\u20132.4s transcription delay served via vLLM's Realtime API", + "date_updated": "2026-05-13", + "difficulty": "beginner", + "tasks": [ + "multimodal" + ], + "performance_headline": "Matches offline open-source ASR accuracy at 480ms delay; >12.5 tok/s on a single 16GB GPU", + "related_recipes": [] + }, + "model": { + "model_id": "mistralai/Voxtral-Mini-4B-Realtime-2602", + "min_vllm_version": "0.20.0", + "architecture": "dense", + "parameter_count": "4.4B", + "active_parameters": "4.4B", + "context_length": 131072, + "base_args": [ + "--tokenizer-mode", + "mistral", + "--compilation_config", + "{\"cudagraph_mode\": \"PIECEWISE\"}" + ], + "base_env": { + "VLLM_DISABLE_COMPILE_CACHE": "1" + } + }, + "dependencies": [ + { + "note": "mistral-common's audio extras bundle soxr/librosa/soundfile plus the Voxtral Realtime tokenizer (>= 1.9.0)", + "command": "uv pip install -U \"mistral-common[audio]>=1.9.0\"" + }, + { + "note": "Transformers v5 silences a barrage of warnings emitted when serving Voxtral on v4 (see vllm-project/vllm#34642)", + "command": "uv pip install -U transformers" + } + ], + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 16, + "description": "BF16 
weights \u2014 single 16GB+ GPU, full 131072 (~3h audio) context" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": { + "single_node_tp": { + "tp": 1 + } + }, + "guide": "## Overview\n\nVoxtral Mini 4B Realtime is Mistral AI's multilingual realtime speech\ntranscription model \u2014 among the first open-source ASR systems to hit\naccuracy comparable to offline models with a **<500ms** end-to-end delay.\n\n- **Architecture**: \u22483.4B Mistral text LM + \u2248970M custom **causal** audio\n encoder, both using sliding-window attention to support \"infinite\" streaming.\n- **Languages**: 13 (English, French, Spanish, German, Russian, Chinese,\n Japanese, Italian, Portuguese, Dutch, Arabic, Hindi, Korean).\n- **Configurable delay**: any multiple of 80ms between 80ms and 1200ms,\n plus 2400ms as a standalone value. Default is **480ms**, the sweet spot\n Mistral identified between latency and accuracy.\n- **Context**: 131072 tokens (\u22483h of audio at 80ms/token).\n\n## Prerequisites\n\n- **Hardware**: a single GPU with **\u2265 16 GB** VRAM (BF16 weights only).\n- **vLLM**: **>= 0.20.0** \u2014 the Voxtral Realtime architecture has been\n registered since v0.16.0, but v0.20.0 is the first stable release with\n the architecture documented in the supported-models list.\n\n### Install vLLM and audio dependencies\n\n```bash\nuv venv && source .venv/bin/activate\nuv pip install -U vllm --torch-backend=auto\nuv pip install -U \"mistral-common[audio]>=1.9.0\" transformers\n```\n\nVerify the audio extras pulled `mistral_common >= 1.9.0`:\n\n```bash\npython -c \"import mistral_common; print(mistral_common.__version__)\"\n```\n\n## Launch command\n\n```bash\nVLLM_DISABLE_COMPILE_CACHE=1 vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \\\n --tokenizer-mode mistral \\\n --compilation_config '{\"cudagraph_mode\": \"PIECEWISE\"}'\n```\n\n> `--tokenizer-mode mistral` is **required**: Voxtral 
Realtime's tokenizer\n> only loads through `mistral_common`. Omitting it raises a tokenizer\n> initialization error at startup.\n\nOnce it starts you should see the Realtime API route registered:\n\n```\nRoute: /v1/realtime, Endpoint: realtime_endpoint\n```\n\n### Tuning flags\n\n- `--max-num-batched-tokens` \u2014 balance throughput vs latency (higher means\n more throughput at the cost of per-request latency).\n- `--max-model-len` \u2014 defaults to 131072 (\u22483h). Reduce it if you know your\n sessions are shorter; this cuts the memory reserved for pre-computed\n RoPE frequencies. As a rule of thumb, one text token \u2248 80ms of audio,\n so a 1h meeting needs `--max-model-len >= 3600/0.08 = 45000`.\n\n## Client usage\n\n### Recommended settings\n\n- Always set `temperature=0.0`.\n- Use **WebSockets** against `/v1/realtime` for streaming audio sessions.\n- Adjust the transcription delay by editing the `transcription_delay_ms`\n field in the model's\n [`tekken.json`](https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602/blob/main/tekken.json)\n to any multiple of 80ms in `[80, 1200]`, or to `2400`.\n\n### Stream an audio file\n\nSee vLLM's\n[Realtime audio file client example](https://docs.vllm.ai/en/latest/examples/online_serving/openai_realtime_client/).\n\n### Live microphone (Gradio demo)\n\nSee vLLM's\n[Realtime microphone client example](https://docs.vllm.ai/en/latest/examples/online_serving/openai_realtime_microphone_client/)\nfor an end-to-end live-transcription UI.\n\n## Benchmarks (Fleurs, average WER)\n\n| Delay | AVG | English | French | Chinese | Japanese |\n|---------|--------|---------|--------|---------|----------|\n| 160ms | 12.60% | 6.46% | 9.75% | 17.67% | 19.17% |\n| 240ms | 10.80% | 5.91% | 8.00% | 13.84% | 15.17% |\n| **480ms** | **8.72%** | **4.90%** | **6.42%** | **10.45%** | **9.59%** |\n| 960ms | 7.70% | 4.34% | 5.68% | 8.99% | 6.80% |\n| 2400ms | 6.73% | 4.05% | 5.23% | 8.48% | 5.50% |\n\nAt **480ms** Voxtral Mini Realtime 
matches Mistral's offline Voxtral\nTranscribe 2.0 on Long-form English and Short-form English benchmarks\n(within 1 WER point on TEDLIUM, Meanwhile, AMI IHM, etc.).\n\n## Troubleshooting\n\n- **`Route: /v1/realtime` not registered** \u2014 your vLLM is < 0.16.0.\n Upgrade to 0.20.0+.\n- **Tokenizer initialization error** \u2014 you forgot `--tokenizer-mode mistral`.\n Voxtral Realtime's tokenizer can only load through `mistral_common`.\n- **`mistral_common` import error / wrong version** \u2014 install with the audio\n extras: `uv pip install -U \"mistral-common[audio]>=1.9.0\"`.\n- **Transformers v4 warning spam** \u2014 upgrade to Transformers v5\n (`uv pip install -U transformers`), tracked in\n [vllm-project/vllm#34642](https://github.com/vllm-project/vllm/issues/34642).\n- **Hangs / crashes on long sessions** \u2014 known upstream issue, see\n [#39996](https://github.com/vllm-project/vllm/issues/39996) (encoder KV\n cache eviction) and [#38233](https://github.com/vllm-project/vllm/issues/38233)\n (multi-session encode). 
Restart sessions periodically as a workaround.\n\n## References\n\n- [Mistral blog: Voxtral Transcribe 2](https://mistral.ai/news/voxtral-transcribe-2)\n- [vLLM Realtime API docs](https://docs.vllm.ai/en/latest/serving/openai_compatible_server/#realtime-api)\n- [vLLM streaming-input blog post](https://vllm.ai/blog/streaming-realtime.html)\n- [HuggingFace demo space](https://huggingface.co/spaces/mistralai/Voxtral-Mini-Realtime)\n" + } + }, + "moonshotai/Kimi-K2-Instruct": { + "hf_id": "moonshotai/Kimi-K2-Instruct", + "meta": { + "title": "Kimi-K2-Instruct", + "provider": "Moonshot AI", + "description": "Moonshot AI's Kimi-K2 is a trillion-parameter MoE instruction model (~32B active) with native FP8 weights and strong tool-calling capabilities.", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "1T" + }, + "recipe": { + "meta": { + "title": "Kimi-K2-Instruct", + "slug": "kimi-k2-instruct", + "provider": "Moonshot AI", + "description": "Moonshot AI's Kimi-K2 is a trillion-parameter MoE instruction model (~32B active) with native FP8 weights and strong tool-calling capabilities.", + "date_updated": "2026-04-17", + "difficulty": "advanced", + "tasks": [ + "text" + ], + "performance_headline": "Open-weights 1T-parameter MoE with native FP8 and Kimi K2 tool calling", + "related_recipes": [ + "moonshotai/Kimi-K2-Thinking", + "moonshotai/Kimi-K2.5" + ], + "hardware": { + "h200": "verified" + } + }, + "model": { + "model_id": "moonshotai/Kimi-K2-Instruct", + "min_vllm_version": "0.12.0", + "architecture": "moe", + "parameter_count": "1T", + "active_parameters": "32B", + "context_length": 131072, + "supports_dcp": true, + "base_args": [ + "--trust-remote-code", + "--tokenizer-mode", + "auto" + ], + "base_env": {} + }, + "dependencies": [ + { + "note": "Optional: DeepEP + DeepGEMM for the DP+EP deployment path on H800/H200", + "command": "uv pip install 
git+https://github.com/deepseek-ai/DeepGEMM.git@v2.1.1.post3 --no-build-isolation", + "optional": true + } + ], + "features": { + "tool_calling": { + "description": "Enable Kimi K2 tool calling with the kimi_k2 tool-call parser.", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "kimi_k2" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "fp8", + "vram_minimum_gb": 1200, + "description": "Native FP8 weights on 16xH800 / 16xH200 (smallest deployment for 128k seqlen)" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": {}, + "strategy_overrides": { + "multi_node_tp_pp": { + "vllm_args": [ + "--dtype", + "bfloat16", + "--quantization", + "fp8", + "--kv-cache-dtype", + "fp8", + "--decode-context-parallel-size", + "8", + "--enable-chunked-prefill", + "--max-model-len", + "65536", + "--max-num-batched-tokens", + "1024", + "--max-num-seqs", + "1", + "--disable-log-requests" + ] + } + }, + "guide": "## Overview\n\nKimi-K2-Instruct is Moonshot AI's trillion-parameter Mixture-of-Experts instruction\nmodel (approximately 32B activated per token) shipped with native FP8 weights. The\nsmallest deployment unit for Kimi-K2 FP8 weights with 128k seqlen on mainstream H800\nplatforms is a 16-GPU cluster using either Tensor Parallel (TP) or Data Parallel +\nExpert Parallel (DP+EP). 
This guide is partially adapted from the official\n[Kimi-K2-Instruct Deployment Guidance](https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/docs/deploy_guidance.md).\n\n## Prerequisites\n\n- **Hardware (FP8)**: 16x H800 or 16x H200 GPUs (verified)\n- **vLLM**: Current stable release\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n## Running Kimi-K2 with FP8 on 16xH800\n\n### Tensor Parallelism + Pipeline Parallelism (TP8+PP2)\n\n```bash\n# node 0 (start Ray on both nodes first)\nvllm serve moonshotai/Kimi-K2-Instruct \\\n --trust-remote-code \\\n --tokenizer-mode auto \\\n --tensor-parallel-size 8 \\\n --pipeline-parallel-size 2 \\\n --dtype bfloat16 \\\n --quantization fp8 \\\n --max-model-len 2048 \\\n --max-num-seqs 1 \\\n --max-num-batched-tokens 1024 \\\n --enable-chunked-prefill \\\n --disable-log-requests \\\n --kv-cache-dtype fp8 \\\n -dcp 8\n```\n\nKey parameter notes:\n- `--enable-auto-tool-choice`: required when enabling tool usage.\n- `--tool-call-parser kimi_k2`: required when enabling tool usage.\n\n### Data Parallelism + Expert Parallelism (DP16+EP)\n\nYou can install libraries like DeepEP and DeepGEMM as needed. 
Then run the command\n(example on H800):\n\n```bash\n# node 0\nvllm serve moonshotai/Kimi-K2-Instruct \\\n --port 8000 --served-model-name kimi-k2 \\\n --trust-remote-code \\\n --data-parallel-size 16 \\\n --data-parallel-size-local 8 \\\n --data-parallel-address $MASTER_IP \\\n --data-parallel-rpc-port $PORT \\\n --enable-expert-parallel \\\n --max-num-batched-tokens 8192 \\\n --max-num-seqs 256 \\\n --gpu-memory-utilization 0.85 \\\n --enable-auto-tool-choice \\\n --tool-call-parser kimi_k2\n\n# node 1\nvllm serve moonshotai/Kimi-K2-Instruct \\\n --headless \\\n --data-parallel-start-rank 8 \\\n --port 8000 --served-model-name kimi-k2 \\\n --trust-remote-code \\\n --data-parallel-size 16 \\\n --data-parallel-size-local 8 \\\n --data-parallel-address $MASTER_IP \\\n --data-parallel-rpc-port $PORT \\\n --enable-expert-parallel \\\n --max-num-batched-tokens 8192 \\\n --max-num-seqs 256 \\\n --gpu-memory-utilization 0.85 \\\n --enable-auto-tool-choice \\\n --tool-call-parser kimi_k2\n```\n\nAdditional flags:\n- `--max-model-len` preserves memory; `--max-model-len=65536` is usually good for most scenarios.\n- `--max-num-batched-tokens` balances throughput vs latency. `32768` is good for prompt-heavy\n workloads; reduce to 16k or 8k to cut activation memory and decrease latency.\n- vLLM conservatively uses 90% of GPU memory. 
Set `--gpu-memory-utilization=0.95` to\n maximize KV cache.\n\n## Benchmarking\n\n### FP8 Benchmark on 16xH800\n\n```bash\nvllm bench serve \\\n --model moonshotai/Kimi-K2-Instruct \\\n --dataset-name random \\\n --random-input-len 1000 \\\n --random-output-len 512 \\\n --request-rate 1.0 \\\n --num-prompts 8 \\\n --ignore-eos \\\n --trust-remote-code\n```\n\n### FP8 Benchmark on 16xH200\n\n```bash\nvllm bench serve \\\n --model moonshotai/Kimi-K2-Instruct \\\n --dataset-name random \\\n --random-input-len 8000 \\\n --random-output-len 1000 \\\n --request-rate 10000 \\\n --num-prompts 16 \\\n --ignore-eos \\\n --trust-remote-code\n```\n\nAdding `-dcp 8` at launch can further improve throughput on H200 (observed ~33% lower\nmean TTFT and higher tok/s in internal benchmarks).\n\nTest different batch sizes by changing `--num-prompts`, e.g. 1, 16, 32, 64, 128, 256, 512.\n\n## References\n\n- [Kimi-K2-Instruct on Hugging Face](https://huggingface.co/moonshotai/Kimi-K2-Instruct)\n- [Official Kimi-K2 Deployment Guidance](https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/docs/deploy_guidance.md)\n- [vLLM Expert Parallelism docs](https://docs.vllm.ai/en/latest/serving/expert_parallel_deployment.html)\n" + } + }, + "moonshotai/Kimi-K2-Thinking": { + "hf_id": "moonshotai/Kimi-K2-Thinking", + "meta": { + "title": "Kimi-K2-Thinking", + "provider": "Moonshot AI", + "description": "Kimi-K2-Thinking is an advanced reasoning MoE model with native INT4 QAT weights, designed for long-horizon agent workflows interleaving chain-of-thought reasoning with tool calls.", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "1T" + }, + "recipe": { + "meta": { + "title": "Kimi-K2-Thinking", + "slug": "kimi-k2-thinking", + "provider": "Moonshot AI", + "description": "Kimi-K2-Thinking is an advanced reasoning MoE model with native INT4 QAT weights, designed for long-horizon agent workflows 
interleaving chain-of-thought reasoning with tool calls.", + "date_updated": "2026-04-17", + "difficulty": "advanced", + "tasks": [ + "text" + ], + "performance_headline": "1T MoE thinking model with native INT4 QAT for 2x low-latency speed-up", + "related_recipes": [ + "moonshotai/Kimi-K2-Instruct", + "moonshotai/Kimi-K2.5" + ], + "hardware": { + "h200": "verified" + } + }, + "model": { + "model_id": "moonshotai/Kimi-K2-Thinking", + "min_vllm_version": "0.12.0", + "architecture": "moe", + "parameter_count": "1T", + "active_parameters": "32B", + "context_length": 262144, + "supports_dcp": true, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable Kimi K2 tool calling with the kimi_k2 tool-call parser.", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "kimi_k2" + ] + }, + "reasoning": { + "description": "Kimi K2 reasoning parser for extracting chain-of-thought content.", + "args": [ + "--reasoning-parser", + "kimi_k2" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "int4", + "vram_minimum_gb": 600, + "description": "Native INT4 (QAT) weights on 8xH200 / 8xH20; 2x low-latency speed-up vs FP8" + }, + "nvfp4": { + "model_id": "nvidia/Kimi-K2-Thinking-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 600, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Kimi-K2-Thinking](https://huggingface.co/moonshotai/Kimi-K2-Thinking) is an advanced\ntrillion-parameter MoE created by Moonshot AI with these highlights:\n\n- **Deep Thinking & 
Tool Orchestration**: End-to-end trained to interleave\n chain-of-thought reasoning with function calls, enabling autonomous research, coding,\n and writing workflows that last hundreds of steps without drift.\n- **Native INT4 Quantization**: Quantization-Aware Training (QAT) delivers lossless 2x\n speed-up in low-latency mode.\n- **Stable Long-Horizon Agency**: Maintains coherent goal-directed behavior across up to\n 200-300 consecutive tool invocations, surpassing prior models that degrade after\n 30-50 steps.\n\n## Prerequisites\n\n- **Hardware**: 8x H200 or 8x H20 GPUs\n- **vLLM**: Current stable release\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n## Launching Kimi-K2-Thinking with vLLM\n\n### Low-Latency Scenarios (TP8)\n\n```bash\nvllm serve moonshotai/Kimi-K2-Thinking \\\n --tensor-parallel-size 8 \\\n --enable-auto-tool-choice \\\n --tool-call-parser kimi_k2 \\\n --reasoning-parser kimi_k2 \\\n --trust-remote-code\n```\n\nThe `--reasoning-parser` flag specifies the parser used to extract reasoning content\nfrom the model output.\n\n### High-Throughput Scenarios (TP8+DCP8)\n\nvLLM supports [Decode Context Parallel](https://docs.vllm.ai/en/latest/serving/context_parallel_deployment.html#decode-context-parallel),\nwhich provides significant benefits in high-throughput scenarios. 
Enable DCP by adding\n`--decode-context-parallel-size 8`:\n\n```bash\nvllm serve moonshotai/Kimi-K2-Thinking \\\n --tensor-parallel-size 8 \\\n --decode-context-parallel-size 8 \\\n --enable-auto-tool-choice \\\n --tool-call-parser kimi_k2 \\\n --reasoning-parser kimi_k2 \\\n --trust-remote-code\n```\n\n## Metrics (GSM8K)\n\n| Config | exact_match (flexible) | exact_match (strict) |\n|--------|-----------------------|----------------------|\n| TP8 | 0.9416 | 0.9409 |\n| TP8+DCP8 | 0.9386 | 0.9371 |\n\n## Benchmarking\n\nWe used the following script to benchmark `moonshotai/Kimi-K2-Thinking` on 8xH200:\n\n```bash\nvllm bench serve \\\n --model moonshotai/Kimi-K2-Thinking \\\n --dataset-name random \\\n --random-input-len 8000 \\\n --random-output-len 4000 \\\n --request-rate 100 \\\n --num-prompts 1000 \\\n --trust-remote-code\n```\n\n### DCP Gain Analysis\n\n| Metric | TP8 | TP8+DCP8 | Change | Improvement |\n|--------|-----|----------|--------|-------------|\n| Request throughput (req/s) | 1.25 | 1.57 | +0.32 | +25.6% |\n| Output token throughput (tok/s) | 485.78 | 695.13 | +209.35 | +43.1% |\n| Mean TTFT (s) | 271.2 | 227.8 | -43.4 | +16.0% |\n\nDCP multiplies the GPU KV cache size by `dcp_world_size`:\n- TP8 KV cache: `715,072` tokens\n- TP8+DCP8 KV cache: `5,721,088` tokens (8x)\n\nEnabling DCP delivers strong advantages (43% faster token generation, 26% higher\nthroughput) with minimal drawbacks. 
Read the\n[DCP doc](https://docs.vllm.ai/en/latest/serving/context_parallel_deployment.html#decode-context-parallel)\nand try it in your LLM workloads.\n\n## References\n\n- [Kimi-K2-Thinking on Hugging Face](https://huggingface.co/moonshotai/Kimi-K2-Thinking)\n- [vLLM Decode Context Parallel docs](https://docs.vllm.ai/en/latest/serving/context_parallel_deployment.html#decode-context-parallel)\n" + } + }, + "moonshotai/Kimi-K2.5": { + "hf_id": "moonshotai/Kimi-K2.5", + "meta": { + "title": "Kimi-K2.5", + "provider": "Moonshot AI", + "description": "Open-source native multimodal agentic MoE model with vision-language understanding, tool calling, and thinking modes", + "tasks": [ + "multimodal", + "text" + ], + "hardware": { + "h200": "verified", + "gb200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "1T" + }, + "recipe": { + "meta": { + "title": "Kimi-K2.5", + "slug": "kimi-k2.5", + "provider": "Moonshot AI", + "description": "Open-source native multimodal agentic MoE model with vision-language understanding, tool calling, and thinking modes", + "date_updated": "2026-05-14", + "difficulty": "intermediate", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Multimodal agentic MoE model with DeepSeek-V3 backbone and MLA attention", + "related_recipes": [], + "hardware": { + "h200": "verified", + "gb200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "moonshotai/Kimi-K2.5", + "min_vllm_version": "0.19.1", + "architecture": "moe", + "parameter_count": "1T", + "active_parameters": "32B", + "context_length": 262144, + "supports_dcp": true, + "base_args": [ + "--trust-remote-code" + ] + }, + "features": { + "tool_calling": { + "description": "Kimi K2 tool-call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "kimi_k2", + "--enable-auto-tool-choice" 
+ ] + }, + "reasoning": { + "description": "Kimi K2 reasoning parser for extracting chain-of-thought content", + "args": [ + "--reasoning-parser", + "kimi_k2" + ] + }, + "spec_decoding": { + "description": "Eagle3 speculative decoding for accelerated inference (requires vLLM >= 0.18.0)", + "args": [ + "--speculative-config", + "{\"model\":\"lightseekorg/kimi-k2.5-eagle3-mla\",\"method\":\"eagle3\",\"num_speculative_tokens\":3}" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only" + ], + "hardware_opt_in_features": { + "gb200": [ + "encoder_parallel" + ] + }, + "variants": { + "default": { + "precision": "int4", + "vram_minimum_gb": 714, + "description": "Packed INT4 via compressed-tensors (~595 GB on disk); fits 8\u00d7H200" + }, + "nvfp4": { + "model_id": "nvidia/Kimi-K2.5-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 600, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs (e.g. 
GB200)", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": { + "blackwell": { + "extra_args": [ + "--attention-config.use_trtllm_ragged_deepseek_prefill=True" + ] + }, + "amd": { + "extra_args": [ + "--block-size=1" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": "INT4", + "VLLM_ROCM_USE_AITER_RMSNORM": "0" + } + } + }, + "strategy_overrides": { + "single_node_dep": { + "extra_args": [], + "extra_env": {} + }, + "single_node_tep": { + "extra_args": [], + "extra_env": {} + }, + "pd_cluster": { + "env": { + "VLLM_USE_NCCL_SYMM_MEM": "1", + "NCCL_CUMEM_ENABLE": "1", + "NCCL_MNNVL_ENABLE": "1", + "NCCL_NVLS_ENABLE": "1" + }, + "prefill": { + "nodes": 1, + "parallelism": "dep", + "vllm_args": [ + "--enforce-eager", + "--max-num-batched-tokens", + "16384", + "--block-size", + "64" + ], + "env": {} + }, + "decode": { + "nodes": 1, + "parallelism": "dep", + "vllm_args": [ + "--compilation-config", + "{\"cudagraph_mode\":\"FULL_DECODE_ONLY\"}", + "--block-size", + "64", + "--all2all-backend", + "flashinfer_nvlink_one_sided" + ], + "env": {} + } + } + }, + "guide": "## Overview\n\nKimi K2.5 is an open-source, native multimodal agentic model built through continual\npretraining on approximately 15 trillion mixed visual and text tokens atop Kimi-K2-Base.\nIt seamlessly integrates vision and language understanding with advanced agentic\ncapabilities, instant and thinking modes, as well as conversational and agentic paradigms.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.15.0 (speculative decoding with Eagle3 requires >= 0.18.0)\n- **Hardware (INT4):** 8x H200 GPUs (verified), or equivalent aggregate VRAM (~640 GB)\n- **Hardware (NVFP4):** 4x 
Blackwell GPUs (e.g. GB200)\n- **AMD support:** 8x MI300X / MI325X / MI355X with ROCm 7.2.1 and Python 3.12\n\n### Install vLLM\n\n**Pip (NVIDIA):**\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install vllm --torch-backend auto\n```\n\n**Pip (AMD ROCm):**\n```bash\nuv venv --python 3.12\nsource .venv/bin/activate\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm\n```\n\n**Docker (NVIDIA):**\n```bash\ndocker pull vllm/vllm-openai:latest\n```\n\n## Client Usage\n\nOnce the vLLM server is running, consume it via the OpenAI-compatible API:\n\n```python\nimport time\nfrom openai import OpenAI\n\nclient = OpenAI(\n api_key=\"EMPTY\",\n base_url=\"http://localhost:8000/v1\",\n timeout=3600\n)\n\nmessages = [\n {\n \"role\": \"user\",\n \"content\": [\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": \"https://ofasys-multimodal-wlcb-3-toshanghai.oss-accelerate.aliyuncs.com/wpf272043/keepme/image/receipt.png\"\n }\n },\n {\n \"type\": \"text\",\n \"text\": \"Read all the text in the image.\"\n }\n ]\n }\n]\n\nstart = time.time()\nresponse = client.chat.completions.create(\n model=\"moonshotai/Kimi-K2.5\",\n messages=messages,\n max_tokens=2048\n)\nprint(f\"Response costs: {time.time() - start:.2f}s\")\nprint(f\"Generated text: {response.choices[0].message.content}\")\n```\n\n## Troubleshooting\n\n- **OOM errors:** Lower `--gpu-memory-utilization` or adjust TP/EP to match your GPU count.\n- **Vision encoder performance:** Use `--mm-encoder-tp-mode data` to run the vision encoder\n in data-parallel mode. 
The encoder is small, so TP adds communication overhead with little gain.\n- **Unique multimodal inputs:** Pass `--mm-processor-cache-gb 0` to avoid caching overhead.\n For repeated inputs, `--mm-processor-cache-type shm` uses host shared memory for better\n performance at high TP settings.\n- **MoE kernel tuning:** Use the `benchmark_moe` script from vLLM to tune Triton kernels\n for your specific hardware.\n- **Async scheduling:** Enabled by default for better throughput. Disable if you encounter\n issues, and file a bug report to vLLM.\n\n## References\n\n- [Kimi-K2.5 on Hugging Face](https://huggingface.co/moonshotai/Kimi-K2.5)\n- [NVIDIA Kimi-K2.5-NVFP4 on Hugging Face](https://huggingface.co/nvidia/Kimi-K2.5-NVFP4)\n- [Eagle3 MLA speculative decoding model](https://huggingface.co/lightseekorg/kimi-k2.5-eagle3-mla)\n- [vLLM multimodal inputs guide](https://docs.vllm.ai/en/latest/features/multimodal_inputs.html)\n- [vLLM Expert Parallelism docs](https://docs.vllm.ai/en/latest/serving/expert_parallel_deployment.html)\n- [vLLM NixlConnector usage guide](https://docs.vllm.ai/en/latest/features/nixl_connector_usage.html)\n" + } + }, + "moonshotai/Kimi-K2.6": { + "hf_id": "moonshotai/Kimi-K2.6", + "meta": { + "title": "Kimi-K2.6", + "provider": "Moonshot AI", + "description": "Open-source native multimodal agentic MoE model with vision-language understanding, tool calling, and thinking modes", + "tasks": [ + "multimodal", + "text" + ], + "hardware": { + "h200": "verified", + "gb200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "1T" + }, + "recipe": { + "meta": { + "title": "Kimi-K2.6", + "slug": "kimi-k2.6", + "provider": "Moonshot AI", + "description": "Open-source native multimodal agentic MoE model with vision-language understanding, tool calling, and thinking modes", + "date_updated": "2026-05-14", + "difficulty": "intermediate", + "tasks": [ + "multimodal", + "text" + ], + "performance_headline": "Multimodal agentic MoE 
model with DeepSeek-V3 backbone and MLA attention", + "related_recipes": [], + "hardware": { + "h200": "verified", + "gb200": "verified" + } + }, + "model": { + "model_id": "moonshotai/Kimi-K2.6", + "min_vllm_version": "0.19.1", + "architecture": "moe", + "parameter_count": "1T", + "active_parameters": "32B", + "context_length": 262144, + "supports_dcp": true, + "base_args": [ + "--trust-remote-code" + ] + }, + "features": { + "tool_calling": { + "description": "Kimi K2 tool-call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "kimi_k2", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "Kimi K2 reasoning parser for extracting chain-of-thought content", + "args": [ + "--reasoning-parser", + "kimi_k2" + ] + }, + "spec_decoding": { + "description": "Eagle3 speculative decoding for accelerated inference", + "args": [ + "--speculative-config", + "{\"model\":\"lightseekorg/kimi-k2.6-eagle3-mla\",\"method\":\"eagle3\",\"num_speculative_tokens\":3}" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only" + ], + "hardware_opt_in_features": { + "gb200": [ + "encoder_parallel" + ] + }, + "variants": { + "default": { + "precision": "int4", + "vram_minimum_gb": 714, + "description": "Packed INT4 via compressed-tensors (~595 GB on disk); fits 8\u00d7H200" + }, + "nvfp4": { + "model_id": "nvidia/Kimi-K2.6-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 600, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs (e.g. 
GB200)", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": { + "blackwell": { + "extra_args": [ + "--attention-config.use_trtllm_ragged_deepseek_prefill=True" + ] + }, + "amd": { + "extra_args": [ + "--block-size=1" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": "INT4", + "VLLM_ROCM_USE_AITER_RMSNORM": "0" + } + } + }, + "strategy_overrides": { + "single_node_dep": { + "extra_args": [], + "extra_env": {} + }, + "single_node_tep": { + "extra_args": [], + "extra_env": {} + }, + "pd_cluster": { + "env": { + "VLLM_USE_NCCL_SYMM_MEM": "1", + "NCCL_CUMEM_ENABLE": "1", + "NCCL_MNNVL_ENABLE": "1", + "NCCL_NVLS_ENABLE": "1" + }, + "prefill": { + "nodes": 1, + "parallelism": "dep", + "vllm_args": [ + "--enforce-eager", + "--max-num-batched-tokens", + "16384", + "--block-size", + "64" + ], + "env": {} + }, + "decode": { + "nodes": 1, + "parallelism": "dep", + "vllm_args": [ + "--compilation-config", + "{\"cudagraph_mode\":\"FULL_DECODE_ONLY\"}", + "--block-size", + "64", + "--all2all-backend", + "flashinfer_nvlink_one_sided" + ], + "env": {} + } + } + }, + "guide": "## Overview\n\nKimi K2.6 is an open-source, native multimodal agentic model built through continual\npretraining on approximately 15 trillion mixed visual and text tokens atop Kimi-K2-Base.\nIt seamlessly integrates vision and language understanding with advanced agentic\ncapabilities, instant and thinking modes, as well as conversational and agentic paradigms.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.19.1\n- **Hardware (INT4):** 8x H200 GPUs (verified), or equivalent aggregate VRAM (~640 GB)\n- **AMD support:** 8x MI300X / MI325X / MI355X with ROCm 7.2.1 and Python 3.12\n\n\n## 
Client Usage\n\nOnce the vLLM server is running, consume it via the OpenAI-compatible API:\n\n```python\nimport time\nfrom openai import OpenAI\n\nclient = OpenAI(\n api_key=\"EMPTY\",\n base_url=\"http://localhost:8000/v1\",\n timeout=3600\n)\n\nmessages = [\n {\n \"role\": \"user\",\n \"content\": [\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": \"https://ofasys-multimodal-wlcb-3-toshanghai.oss-accelerate.aliyuncs.com/wpf272043/keepme/image/receipt.png\"\n }\n },\n {\n \"type\": \"text\",\n \"text\": \"Read all the text in the image.\"\n }\n ]\n }\n]\n\nstart = time.time()\nresponse = client.chat.completions.create(\n model=\"moonshotai/Kimi-K2.6\",\n messages=messages,\n max_tokens=2048\n)\nprint(f\"Response costs: {time.time() - start:.2f}s\")\nprint(f\"Generated text: {response.choices[0].message.content}\")\n```\n\n## Troubleshooting\n\n- **OOM errors:** Lower `--gpu-memory-utilization` or adjust TP/EP to match your GPU count.\n- **Vision encoder performance:** Use `--mm-encoder-tp-mode data` to run the vision encoder\n in data-parallel mode. The encoder is small, so TP adds communication overhead with little gain.\n- **Unique multimodal inputs:** Pass `--mm-processor-cache-gb 0` to avoid caching overhead.\n For repeated inputs, `--mm-processor-cache-type shm` uses host shared memory for better\n performance at high TP settings.\n- **MoE kernel tuning:** Use the `benchmark_moe` script from vLLM to tune Triton kernels\n for your specific hardware.\n- **Async scheduling:** Enabled by default for better throughput. 
Disable if you encounter\n issues, and file a bug report to vLLM.\n\n## References\n\n- [Kimi-K2.6 on Hugging Face](https://huggingface.co/moonshotai/Kimi-K2.6)\n- [NVIDIA Kimi-K2.6-NVFP4 on Hugging Face](https://huggingface.co/nvidia/Kimi-K2.6-NVFP4)\n- [vLLM multimodal inputs guide](https://docs.vllm.ai/en/latest/features/multimodal_inputs.html)\n- [vLLM Expert Parallelism docs](https://docs.vllm.ai/en/latest/serving/expert_parallel_deployment.html)\n- [vLLM NixlConnector usage guide](https://docs.vllm.ai/en/latest/features/nixl_connector_usage.html)\n" + } + }, + "moonshotai/Kimi-Linear-48B-A3B-Instruct": { + "hf_id": "moonshotai/Kimi-Linear-48B-A3B-Instruct", + "meta": { + "title": "Kimi-Linear-48B-A3B-Instruct", + "provider": "Moonshot AI", + "description": "Kimi-Linear is a 48B-parameter instruction-tuned MoE model (~3B activated) with a linear-attention variant supporting very long context (1M tokens).", + "tasks": [ + "text" + ], + "hardware": {} + }, + "model_info": { + "architecture": "moe", + "parameter_count": "48B" + }, + "recipe": { + "meta": { + "title": "Kimi-Linear-48B-A3B-Instruct", + "slug": "kimi-linear-48b-a3b-instruct", + "provider": "Moonshot AI", + "description": "Kimi-Linear is a 48B-parameter instruction-tuned MoE model (~3B activated) with a linear-attention variant supporting very long context (1M tokens).", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "Linear-attention MoE with 1M-token context on a single node", + "related_recipes": [] + }, + "model": { + "model_id": "moonshotai/Kimi-Linear-48B-A3B-Instruct", + "min_vllm_version": "0.11.2", + "architecture": "moe", + "parameter_count": "48B", + "active_parameters": "3B", + "context_length": 1048576, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "dependencies": [ + { + "note": "Pin vllm==0.11.2 \u2014 0.12.0 has a known Kimi-Linear regression", + "command": "uv pip install vllm==0.11.2 
--torch-backend auto" + } + ], + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 115, + "description": "Full precision BF16 on 4 or 8 GPUs (single node)" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nKimi-Linear is Moonshot AI's 48B-parameter instruction-tuned MoE model (`A3B` indicates\n~3B active parameters per token) featuring a linear-attention variant that enables very\nlong context windows (up to 1,048,576 tokens).\n\n## Prerequisites\n\n- **Hardware**: 4 or 8 GPUs on a single node\n- **vLLM**: `0.11.2` recommended. Avoid vLLM `0.12.0`, which has a known bug\n `MLAModules.__init__() missing 1 required positional argument: 'indexer_rotary_emb'`\n that affects Kimi-Linear.\n\n```bash\nuv venv\nsource .venv/bin/activate\n# Install a stable version (avoid 0.12.0)\nuv pip install vllm==0.11.2 --torch-backend auto\n```\n\n## Running Kimi-Linear\n\nThe following snippets assume 4 or 8 GPUs on a single node.\n\n### 4-GPU Tensor Parallel\n\n```bash\nvllm serve moonshotai/Kimi-Linear-48B-A3B-Instruct \\\n --port 8000 \\\n --tensor-parallel-size 4 \\\n --max-model-len 1048576 \\\n --trust-remote-code\n```\n\n### 8-GPU Tensor Parallel\n\n```bash\nvllm serve moonshotai/Kimi-Linear-48B-A3B-Instruct \\\n --port 8000 \\\n --tensor-parallel-size 8 \\\n --max-model-len 1048576 \\\n --trust-remote-code\n```\n\nIf you see OOM, reduce `--max-model-len` (e.g. 
65536) or increase\n`--gpu-memory-utilization` (<= 0.95).\n\n## Client Usage\n\nOnce the server is up, test it with:\n\n```bash\ncurl http://localhost:8000/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\"model\":\"moonshotai/Kimi-Linear-48B-A3B-Instruct\",\"messages\":[{\"role\":\"user\",\"content\":\"Hello!\"}]}'\n```\n\n## Troubleshooting\n\n- **`MLAModules.__init__() missing 1 required positional argument: 'indexer_rotary_emb'`**:\n Known bug in vLLM 0.12.0 affecting Kimi-Linear. Pin to `vllm==0.11.2` instead.\n- **OOM**: Reduce `--max-model-len` or increase `--gpu-memory-utilization` up to 0.95.\n\n## References\n\n- [Kimi-Linear-48B-A3B-Instruct on Hugging Face](https://huggingface.co/moonshotai/Kimi-Linear-48B-A3B-Instruct)\n" + } + }, + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": { + "hf_id": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "meta": { + "title": "NVIDIA Nemotron-3-Nano-30B-A3B", + "provider": "NVIDIA", + "description": "NVIDIA Nemotron-3-Nano Mamba-hybrid MoE (30B total / ~3B active) with BF16 and FP8 variants", + "tasks": [ + "text" + ], + "hardware": { + "h100": "verified", + "h200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "30B" + }, + "recipe": { + "meta": { + "title": "NVIDIA Nemotron-3-Nano-30B-A3B", + "slug": "nemotron-3-nano-30b-a3b", + "provider": "NVIDIA", + "description": "NVIDIA Nemotron-3-Nano Mamba-hybrid MoE (30B total / ~3B active) with BF16 and FP8 variants", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "related_recipes": [ + "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16" + ], + "hardware": { + "h100": "verified", + "h200": "verified" + } + }, + "model": { + "model_id": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "min_vllm_version": "0.11.2", + "architecture": "moe", + "parameter_count": "30B", + "active_parameters": "3B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code", + 
"--async-scheduling" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Qwen3 Coder tool-call parser with automatic tool choice", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "Custom Nano v3 reasoning parser (download plugin: nano_v3_reasoning_parser.py)", + "args": [ + "--reasoning-parser-plugin", + "nano_v3_reasoning_parser.py", + "--reasoning-parser", + "nano_v3" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 72, + "description": "BF16 weights", + "extra_args": [ + "--kv-cache-dtype", + "auto" + ] + }, + "fp8": { + "model_id": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8", + "precision": "fp8", + "vram_minimum_gb": 35, + "description": "FP8 weights + FP8 KV cache", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP8": "1", + "VLLM_FLASHINFER_MOE_BACKEND": "throughput" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nNVIDIA Nemotron-3-Nano-30B-A3B is a hybrid-Mamba MoE model (30B total, ~3B active) with\nFP8 and BF16 variants. 
It supports DGX Spark and Jetson Thor in addition to standard\nHopper/Blackwell servers.\n\n## Prerequisites\n\n- Hardware: 1x H100/H200 or comparable; DGX Spark and Jetson Thor supported\n- vLLM >= 0.11.2 (0.12.0 recommended for full support)\n- Docker with NVIDIA Container Toolkit (recommended)\n\n### Pull Docker Image\n\n```bash\ndocker pull --platform linux/amd64 vllm/vllm-openai:v0.12.0\ndocker tag vllm/vllm-openai:v0.12.0 vllm/vllm-openai:deploy\n```\n\nDGX Spark users can build from source (see README) or use the NGC image:\n\n```bash\ndocker pull nvcr.io/nvidia/vllm:25.12.post1-py3\n```\n\nJetson Thor:\n\n```bash\ndocker pull ghcr.io/nvidia-ai-iot/vllm:latest-jetson-thor\n```\n\n## Launch commands\n\nFP8 with FlashInfer MoE backend (Blackwell/Hopper):\n\n```bash\nexport VLLM_USE_FLASHINFER_MOE_FP8=1\nexport VLLM_FLASHINFER_MOE_BACKEND=throughput\n\nvllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 \\\n --trust-remote-code \\\n --async-scheduling \\\n --kv-cache-dtype fp8 \\\n --tensor-parallel-size 1\n```\n\nBF16 (with reasoning + tool parsers \u2014 typical for Spark/Thor):\n\n```bash\nwget https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/resolve/main/nano_v3_reasoning_parser.py\n\nvllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 \\\n --max-num-seqs 8 \\\n --tensor-parallel-size 1 \\\n --max-model-len 262144 \\\n --trust-remote-code \\\n --enable-auto-tool-choice \\\n --tool-call-parser qwen3_coder \\\n --reasoning-parser-plugin nano_v3_reasoning_parser.py \\\n --reasoning-parser nano_v3\n```\n\nKey flags:\n- `kv-cache-dtype fp8` for FP8 variant, `auto` for BF16\n- `async-scheduling` reduces host overhead between decode steps\n- `mamba-ssm-cache-dtype float32` for best accuracy, `float16` for speed\n- `max-num-seqs` cap to match client concurrency for lower per-user latency\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 \\\n --trust-remote-code \\\n --dataset-name random \\\n 
--random-input-len 1024 --random-output-len 1024 \\\n --num-warmups 20 \\\n --ignore-eos \\\n --max-concurrency 1024 \\\n --num-prompts 2048\n```\n\n## Troubleshooting\n\n- Use `--kv-cache-dtype fp8` only with the FP8 checkpoint.\n- Balance TP and `--max-num-seqs` for throughput vs. per-user latency.\n\n## References\n\n- [Nemotron-3-Nano-30B-A3B-BF16](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16)\n- [Nemotron-3-Nano-30B-A3B-FP8](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8)\n" + } + }, + "nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16": { + "hf_id": "nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16", + "meta": { + "title": "NVIDIA Nemotron-3-Nano-4B", + "provider": "NVIDIA", + "description": "NVIDIA Nemotron-3-Nano 4B (Mamba-hybrid dense) \u2014 compact reasoning + tool-use model with BF16 and FP8 variants", + "tasks": [ + "text" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "b200": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "4B" + }, + "recipe": { + "meta": { + "title": "NVIDIA Nemotron-3-Nano-4B", + "slug": "nemotron-3-nano-4b", + "provider": "NVIDIA", + "description": "NVIDIA Nemotron-3-Nano 4B (Mamba-hybrid dense) \u2014 compact reasoning + tool-use model with BF16 and FP8 variants", + "date_updated": "2026-04-28", + "difficulty": "beginner", + "tasks": [ + "text" + ], + "related_recipes": [ + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "nvidia/NVIDIA-Nemotron-Nano-9B-v2" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "b200": "verified" + } + }, + "model": { + "model_id": "nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16", + "min_vllm_version": "0.11.2", + "architecture": "dense", + "parameter_count": "4B", + "active_parameters": "4B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code", + "--async-scheduling" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Qwen3 Coder tool-call parser with automatic tool choice", 
+ "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "Custom Nano v3 reasoning parser (download plugin: nano_v3_reasoning_parser.py)", + "args": [ + "--reasoning-parser-plugin", + "nano_v3_reasoning_parser.py", + "--reasoning-parser", + "nano_v3" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 10, + "description": "BF16 weights", + "extra_args": [ + "--kv-cache-dtype", + "auto" + ] + }, + "fp8": { + "model_id": "nvidia/NVIDIA-Nemotron-3-Nano-4B-FP8", + "precision": "fp8", + "vram_minimum_gb": 5, + "description": "FP8 weights + FP8 KV cache", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nNemotron-3-Nano-4B is the smallest member of the Nemotron-3 hybrid-Mamba family.\nIt's tuned for low-latency reasoning, tool calling, and edge deployments \u2014 DGX Spark\nand Jetson Thor are both supported alongside standard Hopper/Blackwell servers.\n\n## Prerequisites\n\n- Hardware: 1x H100/H200/B200, DGX Spark, or Jetson Thor\n- vLLM >= 0.11.2 (0.12.0 recommended)\n- Docker with NVIDIA Container Toolkit (recommended)\n\n### Pull Docker Image\n\n```bash\ndocker pull --platform linux/amd64 vllm/vllm-openai:v0.12.0\n```\n\nJetson Thor:\n\n```bash\ndocker pull ghcr.io/nvidia-ai-iot/vllm:latest-jetson-thor\n```\n\n## Launch commands\n\nBF16:\n\n```bash\nwget https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16/resolve/main/nano_v3_reasoning_parser.py\n\nvllm serve nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16 \\\n --trust-remote-code \\\n --async-scheduling \\\n --max-model-len 262144 \\\n --tensor-parallel-size 1 \\\n --enable-auto-tool-choice \\\n --tool-call-parser qwen3_coder \\\n --reasoning-parser-plugin nano_v3_reasoning_parser.py \\\n 
--reasoning-parser nano_v3\n```\n\nFP8:\n\n```bash\nvllm serve nvidia/NVIDIA-Nemotron-3-Nano-4B-FP8 \\\n --trust-remote-code \\\n --async-scheduling \\\n --kv-cache-dtype fp8 \\\n --tensor-parallel-size 1\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --model nvidia/NVIDIA-Nemotron-3-Nano-4B-FP8 \\\n --trust-remote-code \\\n --dataset-name random \\\n --random-input-len 1024 --random-output-len 1024 \\\n --num-warmups 20 \\\n --ignore-eos \\\n --max-concurrency 256 \\\n --num-prompts 1024\n```\n\n## References\n\n- [Nemotron-3-Nano-4B-BF16](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16)\n- [Nemotron-3-Nano-4B-FP8](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-4B-FP8)\n" + } + }, + "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": { + "hf_id": "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", + "meta": { + "title": "NVIDIA Nemotron-3-Super-120B-A12B", + "provider": "NVIDIA", + "description": "NVIDIA Nemotron-3-Super Mamba-hybrid latent-MoE (~120B total / ~12B active) with BF16, FP8, and NVFP4 variants", + "tasks": [ + "text" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "b200": "verified", + "dgx_station_gb300": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "120B" + }, + "recipe": { + "meta": { + "title": "NVIDIA Nemotron-3-Super-120B-A12B", + "slug": "nemotron-3-super-120b-a12b", + "provider": "NVIDIA", + "description": "NVIDIA Nemotron-3-Super Mamba-hybrid latent-MoE (~120B total / ~12B active) with BF16, FP8, and NVFP4 variants", + "date_updated": "2026-04-28", + "difficulty": "advanced", + "tasks": [ + "text" + ], + "related_recipes": [ + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "b200": "verified", + "dgx_station_gb300": "verified" + } + }, + "model": { + "model_id": "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", + "min_vllm_version": "0.17.1", + "architecture": "moe", + "parameter_count": "120B", + 
"active_parameters": "12B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Qwen3 Coder tool-call parser with automatic tool choice", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "Built-in nemotron_v3 reasoning parser (vLLM >= 0.17.1)", + "args": [ + "--reasoning-parser", + "nemotron_v3" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 298, + "description": "BF16 weights (FP8 KV cache recommended)", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + }, + "fp8": { + "model_id": "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8", + "precision": "fp8", + "vram_minimum_gb": 149, + "description": "FP8 weights + FP8 KV cache", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + }, + "nvfp4": { + "model_id": "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 75, + "description": "NVFP4 weights for Blackwell" + }, + "base_bf16": { + "model_id": "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-Base-BF16", + "precision": "bf16", + "vram_minimum_gb": 298, + "description": "Pre-RL base checkpoint (BF16)" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nNVIDIA Nemotron-3-Super-120B-A12B is a hybrid-Mamba latent-MoE model (~120B total,\n~12B active per token) trained for general reasoning, tool use, and agentic workflows.\nIt supports a 1M-token context window and Multi-Token Prediction (MTP). Variants ship\nin BF16, FP8, and NVFP4 (Blackwell). 
A pre-RL Base BF16 checkpoint is also available\nfor downstream fine-tuning.\n\n## Prerequisites\n\n- Hardware: 4-8x H100/H200/B200/RTX Pro 6000, or DGX Spark\n- vLLM >= 0.17.1\n- Docker with NVIDIA Container Toolkit (recommended)\n\n## Launch commands\n\nReference command from the [vLLM blog](https://vllm.ai/blog/nemotron-3-super) (BF16, 4x H100, FP8 KV cache):\n\n```bash\nvllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16 \\\n --kv-cache-dtype fp8 \\\n --tensor-parallel-size 4 \\\n --trust-remote-code \\\n --served-model-name nemotron \\\n --enable-auto-tool-choice \\\n --tool-call-parser qwen3_coder \\\n --reasoning-parser nemotron_v3\n```\n\nFP8 weights:\n\n```bash\nvllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8 \\\n --kv-cache-dtype fp8 \\\n --tensor-parallel-size 4 \\\n --trust-remote-code \\\n --enable-auto-tool-choice \\\n --tool-call-parser qwen3_coder \\\n --reasoning-parser nemotron_v3\n```\n\nNVFP4 (Blackwell only):\n\n```bash\nvllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \\\n --tensor-parallel-size 2 \\\n --trust-remote-code \\\n --enable-auto-tool-choice \\\n --tool-call-parser qwen3_coder \\\n --reasoning-parser nemotron_v3\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --model nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8 \\\n --trust-remote-code \\\n --dataset-name random \\\n --random-input-len 1024 --random-output-len 1024 \\\n --num-warmups 20 \\\n --ignore-eos \\\n --max-concurrency 1024 \\\n --num-prompts 2048\n```\n\n## References\n\n- [vLLM blog: Nemotron-3-Super](https://vllm.ai/blog/nemotron-3-super)\n- [Nemotron-3-Super-120B-A12B-BF16](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16)\n- [Nemotron-3-Super-120B-A12B-FP8](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8)\n- [Nemotron-3-Super-120B-A12B-NVFP4](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4)\n- 
[Nemotron-3-Super-120B-A12B-Base-BF16](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-Base-BF16)\n" + } + }, + "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16": { + "hf_id": "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16", + "meta": { + "title": "NVIDIA Nemotron-Nano-12B-v2-VL", + "provider": "NVIDIA", + "description": "NVIDIA Nemotron-Nano 12B vision-language model with video support and Efficient Video Sampling (EVS)", + "tasks": [ + "multimodal" + ], + "hardware": { + "h100": "verified", + "b200": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "12B" + }, + "recipe": { + "meta": { + "title": "NVIDIA Nemotron-Nano-12B-v2-VL", + "slug": "nemotron-nano-12b-v2-vl", + "provider": "NVIDIA", + "description": "NVIDIA Nemotron-Nano 12B vision-language model with video support and Efficient Video Sampling (EVS)", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "multimodal" + ], + "related_recipes": [ + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" + ], + "hardware": { + "h100": "verified", + "b200": "verified" + } + }, + "model": { + "model_id": "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16", + "min_vllm_version": "0.11.1", + "architecture": "dense", + "parameter_count": "12B", + "active_parameters": "12B", + "context_length": 131072, + "base_args": [ + "--trust-remote-code" + ], + "base_env": { + "VLLM_VIDEO_LOADER_BACKEND": "opencv" + } + }, + "features": { + "video_compression": { + "description": "Efficient Video Sampling (EVS) prunes video tokens; 0.75 means 75% pruning", + "args": [ + "--video-pruning-rate", + "0.75" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. 
Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only", + "encoder_parallel" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 29, + "description": "BF16 weights on 1 GPU" + }, + "fp8": { + "model_id": "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8", + "precision": "fp8", + "vram_minimum_gb": 14, + "description": "FP8 weights on 1 GPU" + }, + "nvfp4": { + "model_id": "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD", + "precision": "nvfp4", + "vram_minimum_gb": 8, + "description": "NVFP4 (QAD) weights for Blackwell" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nNemotron-Nano-12B-v2-VL is a vision-language model with image and video support. It\nincludes Efficient Video Sampling (EVS) to prune video tokens and reduce compute. The\nmodel is available in BF16, FP8, and NVFP4 (QAD) precisions.\n\n## Prerequisites\n\n- Hardware: 1x GPU (A100/H100/B200, etc.)\n- vLLM: 0.11.0 does NOT include this model; use latest nightly or install from source\n- DGX Spark: use `nvcr.io/nvidia/vllm:25.12.post1-py3`\n\n### Install vLLM\n\n```bash\ndocker pull vllm/vllm-openai:nightly-8bff831f0aa239006f34b721e63e1340e3472067\n# or for DGX Spark:\ndocker pull nvcr.io/nvidia/vllm:25.12.post1-py3\n```\n\n## Launch command\n\n```bash\nexport VLLM_VIDEO_LOADER_BACKEND=opencv\nexport CHECKPOINT_PATH=\"nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16\"\nexport CUDA_VISIBLE_DEVICES=0\n\npython3 -m vllm.entrypoints.openai.api_server \\\n --model ${CHECKPOINT_PATH} \\\n --trust-remote-code \\\n --media-io-kwargs '{\"video\": {\"fps\": 2, \"num_frames\": 128}}' \\\n --max-model-len 131072 \\\n --data-parallel-size 1 \\\n --port 5566 \\\n --allowed-local-media-path / \\\n --video-pruning-rate 0.75 \\\n --served-model-name 
\"nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16\"\n```\n\nFlags:\n- `--max-model-len`: reduce for shorter contexts to save memory\n- `--allowed-local-media-path `: limit local-file access\n- `--video-pruning-rate <0..1>`: EVS compression; higher prunes more video tokens\n\n## Client Usage\n\nDescribe a video:\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:5566/v1\", api_key=\"\")\ncompletion = client.chat.completions.create(\n model=\"nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16\",\n messages=[{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": \"Describe the video.\"},\n {\"type\": \"video_url\", \"video_url\": {\"url\": \"file:///path/to/video.mp4\"}},\n ],\n }],\n)\nprint(completion.choices[0].message.content)\n```\n\n## Offline / LLM API\n\n```python\nfrom vllm import LLM, SamplingParams\n\nllm = LLM(\n \"nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16\",\n trust_remote_code=True,\n max_model_len=2**17,\n allowed_local_media_path=\"/\",\n video_pruning_rate=0.75,\n media_io_kwargs=dict(video=dict(fps=2, num_frames=128)),\n)\n```\n\n## Troubleshooting\n\n- Set `VLLM_VIDEO_LOADER_BACKEND=opencv` (required for video inputs).\n- OOM: lower `--max-model-len` or increase `--video-pruning-rate`.\n\n## References\n\n- [Nemotron-Nano-12B-v2-VL-BF16](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16)\n- [Nemotron-Nano-12B-v2-VL-FP8](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8)\n- [Nemotron-Nano-12B-v2-VL-NVFP4-QAD](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD)\n- [EVS paper](https://arxiv.org/abs/2510.14624)\n" + } + }, + "nvidia/NVIDIA-Nemotron-Nano-9B-v2": { + "hf_id": "nvidia/NVIDIA-Nemotron-Nano-9B-v2", + "meta": { + "title": "NVIDIA Nemotron-Nano-9B-v2", + "provider": "NVIDIA", + "description": "NVIDIA Nemotron-Nano 9B (Mamba-hybrid dense) reasoning + tool-use model with FP8 / NVFP4 / Japanese variants", + "tasks": [ + "text" + ], + "hardware": { + 
"h100": "verified", + "h200": "verified", + "b200": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "9B" + }, + "recipe": { + "meta": { + "title": "NVIDIA Nemotron-Nano-9B-v2", + "slug": "nemotron-nano-9b-v2", + "provider": "NVIDIA", + "description": "NVIDIA Nemotron-Nano 9B (Mamba-hybrid dense) reasoning + tool-use model with FP8 / NVFP4 / Japanese variants", + "date_updated": "2026-04-28", + "difficulty": "beginner", + "tasks": [ + "text" + ], + "related_recipes": [ + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "b200": "verified" + } + }, + "model": { + "model_id": "nvidia/NVIDIA-Nemotron-Nano-9B-v2", + "min_vllm_version": "0.10.1", + "architecture": "dense", + "parameter_count": "9B", + "active_parameters": "9B", + "context_length": 131072, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Custom Nemotron tool-call parser plugin (download: nemotron_toolcall_parser_no_streaming.py)", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser-plugin", + "nemotron_toolcall_parser_no_streaming.py", + "--tool-call-parser", + "nemotron_json" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 22, + "description": "BF16 weights on 1 GPU" + }, + "fp8": { + "model_id": "nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8", + "precision": "fp8", + "vram_minimum_gb": 11, + "description": "FP8 weights" + }, + "nvfp4": { + "model_id": "nvidia/NVIDIA-Nemotron-Nano-9B-v2-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 6, + "description": "NVFP4 weights for Blackwell" + }, + "base": { + "model_id": "nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base", + "precision": "bf16", + "vram_minimum_gb": 22, + "description": "Pre-RL base checkpoint (BF16)" + }, + "japanese": { + "model_id": 
"nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese", + "precision": "bf16", + "vram_minimum_gb": 22, + "description": "Japanese-specialized fine-tune (BF16)" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nNemotron-Nano-9B-v2 is a 9B Mamba-hybrid dense reasoning model that runs on a single\nH100/H200/B200. Variants ship in BF16, FP8, and NVFP4; a Japanese-specialized\nfine-tune and a pre-RL Base checkpoint are also available.\n\n## Prerequisites\n\n- Hardware: 1x H100/H200/B200 (or comparable)\n- vLLM >= 0.10.1 (`pip install -U vllm`)\n- Docker with NVIDIA Container Toolkit (recommended)\n\n## Launch command\n\n```bash\nwget https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2/resolve/main/nemotron_toolcall_parser_no_streaming.py\n\nvllm serve nvidia/NVIDIA-Nemotron-Nano-9B-v2 \\\n --trust-remote-code \\\n --max-model-len 131072 \\\n --tensor-parallel-size 1 \\\n --enable-auto-tool-choice \\\n --tool-call-parser-plugin nemotron_toolcall_parser_no_streaming.py \\\n --tool-call-parser nemotron_json\n```\n\nFP8:\n\n```bash\nvllm serve nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8 \\\n --trust-remote-code \\\n --tensor-parallel-size 1\n```\n\nNVFP4 (Blackwell only):\n\n```bash\nvllm serve nvidia/NVIDIA-Nemotron-Nano-9B-v2-NVFP4 \\\n --trust-remote-code \\\n --tensor-parallel-size 1\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --model nvidia/NVIDIA-Nemotron-Nano-9B-v2 \\\n --trust-remote-code \\\n --dataset-name random \\\n --random-input-len 1024 --random-output-len 1024 \\\n --num-warmups 20 \\\n --ignore-eos \\\n --max-concurrency 256 \\\n --num-prompts 1024\n```\n\n## References\n\n- [Nemotron-Nano-9B-v2](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2)\n- [Nemotron-Nano-9B-v2-FP8](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8)\n- 
[Nemotron-Nano-9B-v2-NVFP4](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2-NVFP4)\n- [Nemotron-Nano-9B-v2-Base](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base)\n- [Nemotron-Nano-9B-v2-Japanese](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese)\n" + } + }, + "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16": { + "hf_id": "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16", + "meta": { + "title": "Nemotron-3-Nano-Omni-30B-A3B-Reasoning", + "provider": "NVIDIA", + "description": "Mamba2-Transformer hybrid MoE omnimodal model (31B total / 3B active) with unified video, audio, image, and text understanding; reasoning + tool calling; BF16, FP8, and NVFP4 variants", + "tasks": [ + "multimodal", + "text" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "b200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "31B" + }, + "recipe": { + "meta": { + "title": "Nemotron-3-Nano-Omni-30B-A3B-Reasoning", + "slug": "nemotron-3-nano-omni-30b-a3b-reasoning", + "provider": "NVIDIA", + "description": "Mamba2-Transformer hybrid MoE omnimodal model (31B total / 3B active) with unified video, audio, image, and text understanding; reasoning + tool calling; BF16, FP8, and NVFP4 variants", + "date_updated": "2026-04-29", + "difficulty": "advanced", + "tasks": [ + "multimodal", + "text" + ], + "related_recipes": [ + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "b200": "verified" + } + }, + "model": { + "model_id": "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16", + "min_vllm_version": "0.20.0", + "install": { + "pip": { + "command": "uv pip install \"vllm[audio]==0.20.0\"", + "note": "Pinned to 0.20.0 with the audio extra (required for audio + use_audio_in_video)." 
+ } + }, + "architecture": "moe", + "parameter_count": "31B", + "active_parameters": "3B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code", + "--max-model-len", + "131072", + "--media-io-kwargs", + "{\"video\": {\"num_frames\": 512, \"fps\": 1}}", + "--video-pruning-rate", + "0.5" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Qwen3 Coder tool-call parser with automatic tool choice", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "qwen3_coder" + ] + }, + "reasoning": { + "description": "Nemotron v3 reasoning parser (chain-of-thought with <think> tags)", + "args": [ + "--reasoning-parser", + "nemotron_v3" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 75, + "description": "BF16 weights \u2014 full-precision reference" + }, + "fp8": { + "model_id": "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8", + "precision": "fp8", + "vram_minimum_gb": 38, + "description": "ModelOpt FP8 weights + FP8 KV cache", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + }, + "nvfp4": { + "model_id": "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 28, + "description": "ModelOpt NVFP4 weights \u2014 Blackwell-only; smallest footprint", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nNVIDIA Nemotron-3-Nano-Omni-30B-A3B-Reasoning is a Mamba2-Transformer hybrid\nMoE omnimodal model (31B total / 3B active) that unifies video, audio, image,\nand text understanding. 
It is built on the Nemotron-3-Nano-30B-A3B LLM\nbackbone with a CRADIO v4-H vision encoder and a Parakeet speech encoder, and\nships in BF16, ModelOpt FP8, and ModelOpt NVFP4 variants.\n\nCapabilities:\n- Video (mp4, up to 2 minutes, sampled at 1\u20132 FPS / 128\u2013256 frames)\n- Audio (wav, mp3, up to 1 hour, \u22658 kHz)\n- Image (jpeg, png)\n- Text (English, up to 256K context)\n- Reasoning with chain-of-thought (`<think>` tags)\n- Tool calling\n- Word-level timestamps for transcription\n\n## Prerequisites\n\n- vLLM **0.20.0** (pinned: `pip install vllm[audio]==0.20.0`,\n or pull `vllm/vllm-openai:v0.20.0`)\n- Hardware: 1\u00d7 B200 / H200 / H100 (single-GPU TP1 is the documented profile)\n- The `audio` extra is required for any audio input, including\n `use_audio_in_video=true`\n\n### Pull the Docker image\n\n```bash\n# CUDA 13:\ndocker pull vllm/vllm-openai:v0.20.0\n# CUDA 12.9:\ndocker pull vllm/vllm-openai:v0.20.0-cu129\n```\n\n## Launch command\n\nGeneral single-GPU invocation (B200 / H200 / H100):\n\n```bash\nvllm serve nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16 \\\n --served-model-name nemotron \\\n --host 0.0.0.0 \\\n --port 5000 \\\n --tensor-parallel-size 1 \\\n --max-model-len 131072 \\\n --trust-remote-code \\\n --video-pruning-rate 0.5 \\\n --media-io-kwargs '{\"video\": {\"num_frames\": 512, \"fps\": 1}}' \\\n --reasoning-parser nemotron_v3 \\\n --enable-auto-tool-choice \\\n --tool-call-parser qwen3_coder\n```\n\nSwap the model id for the FP8 or NVFP4 checkpoint and add `--kv-cache-dtype fp8`:\n\n```bash\nvllm serve nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 \\\n --kv-cache-dtype fp8 \\\n ...\n```\n\n### Platform-specific notes\n\n- **RTX Pro:** append `--moe-backend triton` (FlashInfer + RTX Pro bug).\n- **NVFP4 + TP>1:** append `--moe-backend flashinfer_cutlass` (TRTLLM_GEN MoE\n kernel bug at TP>1 on NVFP4).\n- **DGX Spark (aarch64):** unified LPDDR5X memory; lower\n `--gpu-memory-utilization` to 0.70 and reduce `--max-model-len` 
(e.g. 32768)\n if you hit OOM. Use `--max-num-seqs 8`.\n\n## Recommended sampling\n\n| Mode | temperature | top_p | top_k | max_tokens | reasoning_budget |\n|------|------------:|------:|------:|-----------:|------------------:|\n| Thinking | 0.6 | 0.95 | \u2014 | 20480 | 16384 |\n| Instruct | 0.2 | \u2014 | 1 | 1024 | \u2014 |\n\nToggle thinking mode via `chat_template_kwargs={\"enable_thinking\": true}`\n(default) or `false` to disable.\n\n## Client usage\n\n```python\nfrom openai import OpenAI\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"\")\nresp = client.chat.completions.create(\n model=\"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4\",\n messages=[{\"role\": \"user\", \"content\": \"Hello!\"}],\n extra_body={\"chat_template_kwargs\": {\"enable_thinking\": True}},\n)\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --model nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8 \\\n --trust-remote-code \\\n --dataset-name random \\\n --random-input-len 1024 --random-output-len 1024 \\\n --num-warmups 20 --ignore-eos \\\n --max-concurrency 1024 --num-prompts 2048\n```\n\n## References\n\n- [BF16 model card](https://huggingface.co/nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16)\n- [FP8 model card](https://huggingface.co/nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8)\n- [NVFP4 model card](https://huggingface.co/nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4)\n- [Nemotron 3 Nano LLM backbone](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16)\n" + } + }, + "openai/gpt-oss-120b": { + "hf_id": "openai/gpt-oss-120b", + "meta": { + "title": "GPT-OSS", + "provider": "OpenAI", + "description": "OpenAI's gpt-oss family (20B / 120B) with MXFP4 MoE, attention-sinks, built-in tools via Responses API", + "tasks": [ + "text" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "b200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + 
"architecture": "moe", + "parameter_count": "120B" + }, + "recipe": { + "meta": { + "title": "GPT-OSS", + "slug": "gpt-oss", + "provider": "OpenAI", + "description": "OpenAI's gpt-oss family (20B / 120B) with MXFP4 MoE, attention-sinks, built-in tools via Responses API", + "date_updated": "2026-05-10", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "related_recipes": [], + "hardware": { + "h100": "verified", + "h200": "verified", + "b200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "openai/gpt-oss-120b", + "min_vllm_version": "0.10.0", + "architecture": "moe", + "parameter_count": "120B", + "active_parameters": "5.1B", + "context_length": 131072, + "base_args": [], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "OpenAI harmony tool-call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "openai", + "--enable-auto-tool-choice" + ] + }, + "spec_decoding": { + "description": "EAGLE3 speculative decoding for accelerated inference", + "args": [ + "--speculative-config", + "{\"model\":\"nvidia/gpt-oss-120b-Eagle3-v3\",\"num_speculative_tokens\":7,\"method\":\"eagle3\",\"draft_tensor_parallel_size\":1}" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "mxfp4", + "vram_minimum_gb": 96, + "description": "gpt-oss-120b with MXFP4 MoE; fits on 1xA100 80GB, scales to TP 2/4/8" + }, + "amd_fp8": { + "model_id": "amd/gpt-oss-120b-w-mxfp4-a-fp8", + "precision": "mxfp4", + "vram_minimum_gb": 80, + "description": "Quark-quantized MXFP4 weights with FP8 activations for MI355X (gfx950)" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": { + "blackwell": { + "extra_args": [ + "--kv-cache-dtype", + "fp8", + "--no-enable-prefix-caching", + "--max-cudagraph-capture-size", + "2048", 
+ "--max-num-batched-tokens", + "8192", + "--stream-interval", + "20" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": "1" + } + }, + "hopper": { + "extra_args": [ + "--no-enable-prefix-caching", + "--max-cudagraph-capture-size", + "2048", + "--max-num-batched-tokens", + "8192", + "--stream-interval", + "20" + ], + "extra_env": {} + }, + "amd": { + "extra_args": [ + "--attention-backend", + "ROCM_AITER_UNIFIED_ATTN", + "-cc.pass_config.fuse_rope_kvcache=True", + "-cc.use_inductor_graph_partition=True", + "--gpu-memory-utilization", + "0.95", + "--block-size=64" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": "INT4", + "HSA_NO_SCRATCH_RECLAIM": "1", + "AMDGCN_USE_BUFFER_OPS": "0" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\n`gpt-oss-20b` and `gpt-oss-120b` are open-weight reasoning models from OpenAI. vLLM\nsupports NVIDIA H100/H200/B200, AMD MI300X/MI325X/MI355X, and Radeon AI PRO R9700,\nwith ongoing work for Ampere/Ada/RTX 5090.\n\nOptimizations:\n- Flexible parallelism (TP 2/4/8)\n- Attention kernels for attention-sinks and sliding-window shapes\n- Asynchronous scheduling for CPU/GPU overlap\n\n## Prerequisites\n\n- Hardware: NVIDIA H100/H200/B200 (or A100 80GB for single-GPU), AMD MI300+\n- vLLM >= 0.10.0\n- CUDA >= 12.8 if building from source (must match between install and serving)\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install vllm --torch-backend=auto\n```\n\nDocker quickstart:\n\n```bash\ndocker run --gpus all -p 8000:8000 --ipc=host vllm/vllm-openai --model openai/gpt-oss-20b\n```\n\nAMD ROCm wheels:\n\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm\n```\n\n## Launch commands\n\nA100 (single card, default TRITON_ATTN + Marlin MXFP4 MoE):\n\n```bash\nvllm serve openai/gpt-oss-120b\nvllm serve openai/gpt-oss-120b --tensor-parallel-size 4\n```\n\nBlackwell (B200) with FlashInfer MXFP4+MXFP8 MoE:\n\n```bash\nexport 
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1\n\n# GPT-OSS_Blackwell.yaml\n# kv-cache-dtype: fp8\n# no-enable-prefix-caching: true\n# max-cudagraph-capture-size: 2048\n# max-num-batched-tokens: 8192\n# stream-interval: 20\nvllm serve openai/gpt-oss-120b --config GPT-OSS_Blackwell.yaml --tensor-parallel-size 1\n```\n\nHopper (H100/H200): same as Blackwell without `kv-cache-dtype` and without the env var.\n\nAMD MI300X/MI325X:\n\n```bash\nexport HSA_NO_SCRATCH_RECLAIM=1\nexport AMDGCN_USE_BUFFER_OPS=0\nexport VLLM_ROCM_USE_AITER=1\nexport VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4\n\nvllm serve openai/gpt-oss-120b \\\n --tensor-parallel-size 8 \\\n --attention-backend ROCM_AITER_UNIFIED_ATTN \\\n -cc.pass_config.fuse_rope_kvcache=True \\\n -cc.use_inductor_graph_partition=True \\\n --gpu-memory-utilization 0.95 \\\n --block-size 64\n```\n\n## Tool Use\n\n`/v1/responses` endpoint supports built-in tools (browsing, python, MCP). Setup\nrequires `uv pip install gpt-oss` and either docker (for Python sandbox) or\n`PYTHON_EXECUTION_BACKEND=dangerously_use_uv`. For demo tools:\n\n```bash\nvllm serve ... --tool-server demo\n```\n\nFor user-defined function calling:\n\n```bash\nvllm serve ... 
--tool-call-parser openai --enable-auto-tool-choice\n```\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresponse = client.chat.completions.create(\n model=\"openai/gpt-oss-120b\",\n messages=[{\"role\": \"user\", \"content\": \"Explain sinks attention.\"}],\n)\nprint(response.choices[0].message.content)\n```\n\n## Accuracy Evaluation\n\nOpenAI recommends evaluating with the gpt-oss reference library:\n\n```bash\nvllm serve openai/gpt-oss-120b \\\n --tensor_parallel_size 8 --max-model-len 131072 \\\n --max-num-batched-tokens 10240 --max-num-seqs 128 \\\n --gpu-memory-utilization 0.85 --no-enable-prefix-caching\n\nmkdir -p /tmp/gpqa_openai\nOPENAI_API_KEY=empty python -m gpt_oss.evals \\\n --model openai/gpt-oss-120b --eval gpqa --n-threads 128\n```\n\nReproduced scores (120B): Low 65.3 / 51.2; Mid 72.4 / 79.6; High 79.4 / 93.0 (GPQA / AIME25).\n\n## Troubleshooting\n\n- **Attention sinks dtype error on Blackwell:** ensure env vars above are set.\n- **`tl.language not defined`:** make sure no extra Triton (e.g., pytorch-triton) is installed.\n- **H100 TP1 OOM:** `--gpu-memory-utilization 0.95 --max-num-batched-tokens 1024`.\n- **Harmony vocab download failure:** pre-download tiktoken files and set `TIKTOKEN_ENCODINGS_BASE`.\n\n## Known Limitations\n\n- Responses API: streaming is basic, annotations/citations unsupported, usage accounting returns zeros.\n- Function calling currently supports only `tool_choice=\"auto\"`.\n\n## References\n\n- [gpt-oss-120b](https://huggingface.co/openai/gpt-oss-120b)\n- [gpt-oss-20b](https://huggingface.co/openai/gpt-oss-20b)\n- [amd/gpt-oss-120b-w-mxfp4-a-fp8](https://huggingface.co/amd/gpt-oss-120b-w-mxfp4-a-fp8)\n- [Eagle3 draft model](https://huggingface.co/nvidia/gpt-oss-120b-Eagle3-v3)\n" + } + }, + "openai/gpt-oss-20b": { + "hf_id": "openai/gpt-oss-20b", + "meta": { + "title": "GPT-OSS 20B", + "provider": "OpenAI", + "description": 
"OpenAI's gpt-oss-20b \u2014 21B-total / 3.6B-active MoE reasoning model with native MXFP4 quant; fits in 16GB VRAM", + "tasks": [ + "text" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "b200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "21B" + }, + "recipe": { + "meta": { + "title": "GPT-OSS 20B", + "slug": "gpt-oss-20b", + "provider": "OpenAI", + "description": "OpenAI's gpt-oss-20b \u2014 21B-total / 3.6B-active MoE reasoning model with native MXFP4 quant; fits in 16GB VRAM", + "date_updated": "2026-05-08", + "difficulty": "beginner", + "tasks": [ + "text" + ], + "performance_headline": "21B/3.6B-A MoE reasoning model with native MXFP4 \u2014 runs on 16GB", + "related_recipes": [ + "openai/gpt-oss-120b" + ], + "hardware": { + "h100": "verified", + "h200": "verified", + "b200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "openai/gpt-oss-20b", + "min_vllm_version": "0.10.0", + "architecture": "moe", + "parameter_count": "21B", + "active_parameters": "3.6B", + "context_length": 131072, + "base_args": [], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "OpenAI harmony tool-call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "openai", + "--enable-auto-tool-choice" + ] + } + }, + "opt_in_features": [], + "variants": { + "default": { + "precision": "mxfp4", + "vram_minimum_gb": 16, + "description": "MXFP4 MoE weights \u2014 fits in 16GB VRAM on a single consumer or datacenter GPU" + } + }, + "compatible_strategies": [ + "single_node_tp" + ], + "hardware_overrides": { + "blackwell": { + "extra_args": [ + "--kv-cache-dtype", + "fp8", + "--no-enable-prefix-caching", + "--max-cudagraph-capture-size", + "2048", + "--max-num-batched-tokens", + "8192", + "--stream-interval", + "20" + ], + "extra_env": { + 
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": "1" + } + }, + "hopper": { + "extra_args": [ + "--no-enable-prefix-caching", + "--max-cudagraph-capture-size", + "2048", + "--max-num-batched-tokens", + "8192", + "--stream-interval", + "20" + ], + "extra_env": {} + }, + "amd": { + "extra_args": [ + "--attention-backend", + "ROCM_AITER_UNIFIED_ATTN", + "-cc.pass_config.fuse_rope_kvcache=True", + "-cc.use_inductor_graph_partition=True", + "--gpu-memory-utilization", + "0.95", + "--block-size=64" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": "INT4", + "HSA_NO_SCRATCH_RECLAIM": "1", + "AMDGCN_USE_BUFFER_OPS": "0" + } + } + }, + "strategy_overrides": { + "single_node_tp": { + "tp": 1 + } + }, + "guide": "## Overview\n\n[`gpt-oss-20b`](https://huggingface.co/openai/gpt-oss-20b) is OpenAI's smaller open-weight reasoning model: 21B total parameters with 3.6B activated per token across 32 experts (top-4 routing), shipped with native MXFP4 quantization on the MoE weights. 
It targets lower-latency and on-device use cases \u2014 the model loads in ~16GB of VRAM, runs on a single H100/H200/B200 or AMD MI300X/MI325X/MI355X, and supports the same harmony chat format, configurable reasoning effort (low / medium / high), and built-in tools (browser, python, function calling) as its larger sibling [`gpt-oss-120b`](https://huggingface.co/openai/gpt-oss-120b).\n\nArchitectural notes:\n- 24 layers alternating sliding-window (window=128) and full attention.\n- YaRN rope scaling (factor=32) extending 4K \u2192 131K context.\n- MXFP4 quant on `model.layers.*.mlp` experts; attention, router, embeddings stay in BF16.\n\n## Prerequisites\n\n- Hardware: NVIDIA H100/H200/B200 or AMD MI300X/MI325X/MI355X (also runs on Ada/Ampere consumer cards with sufficient VRAM).\n- vLLM >= 0.10.0.\n- CUDA >= 12.8 if building from source (must match between install and serving).\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install vllm --torch-backend=auto\n```\n\nDocker quickstart:\n\n```bash\ndocker run --gpus all -p 8000:8000 --ipc=host vllm/vllm-openai --model openai/gpt-oss-20b\n```\n\nAMD ROCm wheels:\n\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm\n```\n\n## Launch commands\n\nSingle GPU (default \u2014 works on any 16GB+ card):\n\n```bash\nvllm serve openai/gpt-oss-20b\n```\n\nBlackwell (B200) with FlashInfer MXFP4+MXFP8 MoE:\n\n```bash\nexport VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1\n\nvllm serve openai/gpt-oss-20b \\\n --kv-cache-dtype fp8 \\\n --no-enable-prefix-caching \\\n --max-cudagraph-capture-size 2048 \\\n --max-num-batched-tokens 8192 \\\n --stream-interval 20\n```\n\nHopper (H100/H200): same as Blackwell minus `--kv-cache-dtype fp8` and the FlashInfer env var.\n\nAMD MI300X/MI325X/MI355X:\n\n```bash\nexport HSA_NO_SCRATCH_RECLAIM=1\nexport AMDGCN_USE_BUFFER_OPS=0\nexport VLLM_ROCM_USE_AITER=1\nexport VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4\n\nvllm serve openai/gpt-oss-20b \\\n 
--attention-backend ROCM_AITER_UNIFIED_ATTN \\\n -cc.pass_config.fuse_rope_kvcache=True \\\n -cc.use_inductor_graph_partition=True \\\n --gpu-memory-utilization 0.95 \\\n --block-size 64\n```\n\n## Tool use\n\nThe `/v1/responses` endpoint supports built-in tools (browsing, python, MCP). Setup requires `uv pip install gpt-oss` and either Docker (for the Python sandbox) or `PYTHON_EXECUTION_BACKEND=dangerously_use_uv`. For demo tools:\n\n```bash\nvllm serve openai/gpt-oss-20b --tool-server demo\n```\n\nFor user-defined function calling (toggle the **Tool Calling** feature above, or pass manually):\n\n```bash\nvllm serve openai/gpt-oss-20b --tool-call-parser openai --enable-auto-tool-choice\n```\n\n## Reasoning effort\n\ngpt-oss exposes three reasoning levels \u2014 low, medium, high \u2014 selected via the system prompt:\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresponse = client.chat.completions.create(\n model=\"openai/gpt-oss-20b\",\n messages=[\n {\"role\": \"system\", \"content\": \"Reasoning: high\"},\n {\"role\": \"user\", \"content\": \"Explain why eigenvalues matter.\"},\n ],\n)\nprint(response.choices[0].message.content)\n```\n\n## Troubleshooting\n\n- **Attention sinks dtype error on Blackwell:** ensure `VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1` and `--kv-cache-dtype fp8`.\n- **`tl.language not defined`:** make sure no extra Triton (e.g., `pytorch-triton`) is installed alongside vLLM's bundled Triton.\n- **Harmony vocab download failure:** pre-download tiktoken files and set `TIKTOKEN_ENCODINGS_BASE`.\n\n## References\n\n- [Model card \u2014 gpt-oss-20b](https://huggingface.co/openai/gpt-oss-20b)\n- [Sibling \u2014 gpt-oss-120b](https://huggingface.co/openai/gpt-oss-120b)\n- [OpenAI announcement](https://openai.com/index/introducing-gpt-oss/)\n- [gpt-oss model card paper (arXiv:2508.10925)](https://arxiv.org/abs/2508.10925)\n- [vLLM gpt-oss 
cookbook](https://cookbook.openai.com/articles/gpt-oss/run-vllm)\n" + } + }, + "poolside/Laguna-XS.2": { + "hf_id": "poolside/Laguna-XS.2", + "meta": { + "title": "Laguna XS.2", + "provider": "Poolside", + "description": "Poolside's 33B total / 3B activated MoE coding model with mixed sliding-window + global attention, native interleaved reasoning, and 128K context \u2014 designed for agentic coding.", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "33B" + }, + "recipe": { + "meta": { + "title": "Laguna XS.2", + "slug": "laguna-xs.2", + "provider": "Poolside", + "description": "Poolside's 33B total / 3B activated MoE coding model with mixed sliding-window + global attention, native interleaved reasoning, and 128K context \u2014 designed for agentic coding.", + "date_updated": "2026-04-29", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "33B/3B-A MoE for agentic coding with interleaved thinking and tool use", + "related_recipes": [], + "hardware": { + "h200": "verified" + } + }, + "model": { + "model_id": "poolside/Laguna-XS.2", + "min_vllm_version": "nightly", + "nightly_required": true, + "architecture": "moe", + "parameter_count": "33B", + "active_parameters": "3B", + "context_length": 131072, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Enable automatic tool choice with Poolside's tool-call parser", + "args": [ + "--enable-auto-tool-choice", + "--tool-call-parser", + "poolside_v1" + ] + }, + "reasoning": { + "description": "Enable interleaved thinking with Poolside's reasoning parser", + "args": [ + "--reasoning-parser", + "poolside_v1" + ] + }, + "spec_decoding": { + "description": "DFlash speculative decoding with the Laguna-XS.2 draft model (7 tokens, greedy)", + "args": [ + "--speculative-config", + 
"{\"model\":\"poolside/Laguna-XS.2-speculator.dflash\",\"num_speculative_tokens\":7,\"method\":\"dflash\"}" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 80, + "description": "BF16 weights \u2014 fits on a single 80GB+ GPU (H100/H200/B200)" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep" + ], + "hardware_overrides": {}, + "strategy_overrides": { + "single_node_tp": { + "tp": 1 + } + }, + "guide": "## Overview\n\n[Laguna XS.2](https://huggingface.co/poolside/Laguna-XS.2) is Poolside's 33B-total / 3B-activated Mixture-of-Experts model purpose-built for agentic coding and long-horizon work. It combines mixed sliding-window + global attention (3:1 across 40 layers) with sigmoid per-head gating and FP8 KV cache, so it stays compact enough to run locally while supporting a 131K-token context.\n\n### Key features\n- **Mixed SWA + global attention**: 30 sliding-window layers (window=512) interleaved with 10 global-attention layers, each with per-layer rotary scaling.\n- **Native FP8 KV cache**: KV cache is quantized to FP8 to reduce memory per token.\n- **Interleaved reasoning**: thinking blocks emitted between tool calls; toggled per-request via `enable_thinking`.\n- **Tool calling**: Poolside-specific XML-style tool-call protocol, parsed via `poolside_v1`.\n- **256 experts + 1 shared expert** with top-8 routing.\n\n## Prerequisites\n\nLaguna XS.2 support is on the open vLLM PR ([vllm-project/vllm#41129](https://github.com/vllm-project/vllm/pull/41129)) \u2014 install from a nightly wheel or the pinned Docker image below until the PR lands in a stable release.\n\n### Docker (recommended)\n```bash\ndocker pull vllm/vllm-openai:laguna\n```\n\n### pip (nightly)\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --pre \\\n --extra-index-url https://wheels.vllm.ai/nightly/cu130 \\\n --extra-index-url 
https://download.pytorch.org/whl/cu130 \\\n --index-strategy unsafe-best-match\n```\n\n## Launch command\n\n### Single GPU (H100/H200/B200, BF16)\n```bash\nvllm serve poolside/Laguna-XS.2 \\\n --trust-remote-code \\\n --max-model-len 131072 \\\n --enable-auto-tool-choice \\\n --tool-call-parser poolside_v1 \\\n --reasoning-parser poolside_v1\n```\n\n### Docker\n```bash\ndocker run -itd --name laguna-xs2 \\\n --ipc=host --network host --shm-size 16G --gpus all \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai:laguna \\\n --model poolside/Laguna-XS.2 \\\n --trust-remote-code \\\n --max-model-len 131072 \\\n --enable-auto-tool-choice \\\n --tool-call-parser poolside_v1 \\\n --reasoning-parser poolside_v1 \\\n --host 0.0.0.0 --port 8000\n```\n\n## Controlling reasoning\n\nReasoning is **off by default** in the chat template. Enable it per-request:\n\n```python\nfrom openai import OpenAI\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\n\nresp = client.chat.completions.create(\n model=\"poolside/Laguna-XS.2\",\n messages=[{\"role\": \"user\", \"content\": \"Write a Python retry wrapper with exponential backoff.\"}],\n extra_body={\"chat_template_kwargs\": {\"enable_thinking\": True}},\n temperature=0.7,\n top_p=1.0,\n extra_query={\"top_k\": 20},\n)\nprint(resp.choices[0].message.reasoning_content)\nprint(resp.choices[0].message.content)\n```\n\nOr default-on with `--default-chat-template-kwargs '{\"enable_thinking\": true}'`.\n\n## Speculative decoding (DFlash)\n\nEnable the **Spec Decoding** toggle above to attach Poolside's [DFlash draft model](https://huggingface.co/poolside/Laguna-XS.2-speculator.dflash) \u2014 a 5-layer Llama-style speculator that proposes up to 7 tokens per step. 
Reported per-position acceptance with reasoning enabled is ~70% at position 1 across coding, math, QA, and writing workloads.\n\nRequires:\n- vLLM built from [PR #41880](https://github.com/vllm-project/vllm/pull/41880) (extends the base Laguna PR with DFlash support).\n- `VLLM_USE_DEEP_GEMM=0` in the launch environment \u2014 DeepGEMM is currently incompatible with the DFlash draft path.\n\nExample:\n\n```bash\nVLLM_USE_DEEP_GEMM=0 vllm serve poolside/Laguna-XS.2 \\\n --trust-remote-code \\\n --max-model-len 16384 \\\n --enable-auto-tool-choice \\\n --tool-call-parser poolside_v1 \\\n --reasoning-parser poolside_v1 \\\n --speculative-config '{\"model\":\"poolside/Laguna-XS.2-speculator.dflash\",\"num_speculative_tokens\":7,\"method\":\"dflash\"}'\n```\n\n## References\n\n- [Model card](https://huggingface.co/poolside/Laguna-XS.2)\n- [Release blog post](https://poolside.ai/blog/laguna-a-deeper-dive)\n- [vLLM support PR #41129](https://github.com/vllm-project/vllm/pull/41129)\n- [vLLM DFlash spec-decoding PR #41880](https://github.com/vllm-project/vllm/pull/41880)\n- [DFlash draft model](https://huggingface.co/poolside/Laguna-XS.2-speculator.dflash)\n" + } + }, + "stabilityai/stable-audio-open-1.0": { + "hf_id": "stabilityai/stable-audio-open-1.0", + "meta": { + "title": "Stable Audio Open", + "provider": "Stability AI", + "description": "Text-to-audio generation model (1.2B params) producing up to ~47 s stereo audio at 44.1 kHz, served via vLLM-Omni", + "tasks": [ + "omni" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "1.2B" + }, + "recipe": { + "meta": { + "title": "Stable Audio Open", + "slug": "stable-audio-open", + "provider": "Stability AI", + "description": "Text-to-audio generation model (1.2B params) producing up to ~47 s stereo audio at 44.1 kHz, served via vLLM-Omni", + "date_updated": "2026-04-17", + "difficulty": "beginner", + "tasks": [ + "omni" + ], + "related_recipes": [ + 
"stabilityai/stable-diffusion-3.5-medium" + ] + }, + "model": { + "model_id": "stabilityai/stable-audio-open-1.0", + "min_vllm_version": "0.14.1", + "architecture": "dense", + "parameter_count": "1.2B", + "active_parameters": "1.2B", + "context_length": 0, + "base_args": [ + "--trust-remote-code", + "--enforce-eager", + "--gpu-memory-utilization", + "0.9" + ], + "base_env": {} + }, + "omni": { + "serve_binary": "vllm-omni serve", + "tasks": [ + "t2a" + ] + }, + "dependencies": [ + { + "note": "Pin vllm==0.14.1 for Stable Audio Open", + "command": "uv pip install vllm==0.14.1" + }, + { + "note": "vllm-omni provides the audio generation backend", + "command": "uv pip install git+https://github.com/vllm-project/vllm-omni.git" + }, + { + "note": "soundfile (recommended) or scipy for WAV output", + "command": "uv pip install soundfile" + } + ], + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 19, + "description": "BF16 weights for text-to-audio generation (via vLLM-Omni)" + } + }, + "compatible_strategies": [], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Stable Audio Open 1.0](https://huggingface.co/stabilityai/stable-audio-open-1.0) is\nStability AI's text-to-audio generation model (~1.2B parameters). It produces stereo\naudio at 44.1 kHz, up to ~47 seconds. 
Served via **vLLM-Omni** (not standard vLLM).\n\nLimitations:\n- No realistic vocals (no singing or speech).\n- English-only training data.\n- Better at sound effects than complex music.\n\n## Prerequisites\n\n- vLLM-Omni on top of vLLM 0.14.1\n- `soundfile` or `scipy` for saving audio\n\n## Installation\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install vllm==0.14.1\nuv pip install git+https://github.com/vllm-project/vllm-omni.git\n\n# Audio saving\nuv pip install soundfile\n```\n\n## Python Usage\n\n```python\nimport torch\nimport soundfile as sf\nfrom vllm_omni.entrypoints.omni import Omni\n\nomni = Omni(model=\"stabilityai/stable-audio-open-1.0\")\ngenerator = torch.Generator(device=\"cuda\").manual_seed(42)\n\naudio = omni.generate(\n \"The sound of a dog barking\",\n negative_prompt=\"Low quality.\",\n generator=generator,\n guidance_scale=7.0,\n num_inference_steps=100,\n extra={\"audio_start_in_s\": 0.0, \"audio_end_in_s\": 10.0},\n)\n\naudio_data = audio[0].cpu().float().numpy().T # [samples, channels]\nsf.write(\"output.wav\", audio_data, 44100)\n```\n\n## CLI Usage (from vLLM-Omni repo)\n\n```bash\npython examples/offline_inference/text_to_audio/text_to_audio.py \\\n --model stabilityai/stable-audio-open-1.0 \\\n --prompt \"The sound of a dog barking\" \\\n --audio-length 10.0 \\\n --num-inference-steps 100 \\\n --guidance-scale 7.0 \\\n --output dog_barking.wav\n```\n\n## Key Parameters\n\n| Parameter | Default | Description |\n|-----------|---------|-------------|\n| `audio_start_in_s` | 0.0 | Start time in seconds |\n| `audio_end_in_s` | 10.0 | End time in seconds |\n| `num_inference_steps` | 100 | Denoising steps (higher = better quality, slower) |\n| `guidance_scale` | 7.0 | Classifier-free guidance scale |\n| `negative_prompt` | \"Low quality.\" | Text to avoid |\n| `num_waveforms` | 1 | Samples per prompt |\n| `sample_rate` | 44100 | Output sample rate (Hz) |\n\n## License\n\nReleased under the Stability AI Community License. 
Commercial use requires a separate license.\n\n## References\n\n- [Stable Audio Open on Hugging Face](https://huggingface.co/stabilityai/stable-audio-open-1.0)\n- [vLLM-Omni text-to-audio example](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/text_to_audio/text_to_audio.py)\n" + } + }, + "stabilityai/stable-diffusion-3.5-medium": { + "hf_id": "stabilityai/stable-diffusion-3.5-medium", + "meta": { + "title": "Stable Diffusion 3.5", + "provider": "Stability AI", + "description": "Stability AI's Stable Diffusion 3.5 text-to-image family (medium 2.5B, large 8.1B, large-turbo) via vLLM-Omni with Cache-DiT acceleration", + "tasks": [ + "omni" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "2.5B" + }, + "recipe": { + "meta": { + "title": "Stable Diffusion 3.5", + "slug": "stable-diffusion-3.5", + "provider": "Stability AI", + "description": "Stability AI's Stable Diffusion 3.5 text-to-image family (medium 2.5B, large 8.1B, large-turbo) via vLLM-Omni with Cache-DiT acceleration", + "date_updated": "2026-04-17", + "difficulty": "beginner", + "tasks": [ + "omni" + ], + "related_recipes": [ + "stabilityai/stable-audio-open-1.0" + ] + }, + "model": { + "model_id": "stabilityai/stable-diffusion-3.5-medium", + "min_vllm_version": "0.12.0", + "architecture": "dense", + "parameter_count": "2.5B", + "active_parameters": "2.5B", + "context_length": 0, + "base_args": [], + "base_env": {} + }, + "omni": { + "tasks": [ + "t2i" + ] + }, + "dependencies": [ + { + "note": "Pin vllm==0.12.0 for Stable Diffusion 3.5", + "command": "uv pip install vllm==0.12.0" + }, + { + "note": "vllm-omni provides the image generation backend", + "command": "uv pip install git+https://github.com/vllm-project/vllm-omni.git" + } + ], + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "label": "Medium", + "precision": "bf16", + "vram_minimum_gb": 44, + "description": "Stable Diffusion 3.5 medium (2.5B)" + 
}, + "large": { + "label": "Large", + "model_id": "stabilityai/stable-diffusion-3.5-large", + "precision": "bf16", + "vram_minimum_gb": 24, + "description": "Stable Diffusion 3.5 large (8.1B)" + }, + "large_turbo": { + "label": "Large Turbo", + "model_id": "stabilityai/stable-diffusion-3.5-large-turbo", + "precision": "bf16", + "vram_minimum_gb": 24, + "description": "Stable Diffusion 3.5 large-turbo (8.1B, timestep-distilled for few-step inference)" + } + }, + "compatible_strategies": [], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nStable Diffusion 3.5 text-to-image generation models, served via **vLLM-Omni** with\noptional **Cache-DiT** acceleration.\n\nSupported variants:\n- `stabilityai/stable-diffusion-3.5-medium` \u2014 2.5B params\n- `stabilityai/stable-diffusion-3.5-large` \u2014 8.1B params\n- `stabilityai/stable-diffusion-3.5-large-turbo` \u2014 8.1B params (timestep-distilled for few-step inference)\n\n## Prerequisites\n\n- vLLM-Omni on top of vLLM 0.12.0\n- `diffusers` library\n\n## Installation\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install vllm==0.12.0\nuv pip install git+https://github.com/vllm-project/vllm-omni.git\n```\n\n## Python Usage\n\n```python\nfrom vllm_omni.entrypoints.omni import Omni\n\nomni = Omni(model=\"stabilityai/stable-diffusion-3.5-medium\")\n\nimages = omni.generate(\n prompt=\"a cat wearing sunglasses, cyberpunk style\",\n negative_prompt=\"blurry, low quality\",\n height=1024, width=1024,\n num_inference_steps=28,\n guidance_scale=7.5,\n num_outputs_per_prompt=2,\n)\n```\n\n## CLI Usage\n\n```bash\npython examples/offline_inference/text_to_image/text_to_image.py \\\n --model stabilityai/stable-diffusion-3.5-medium \\\n --prompt \"a cat wearing sunglasses, cyberpunk style\" \\\n --negative-prompt \"blurry, low quality\" \\\n --height 1024 --width 1024 \\\n --num-inference-steps 28 \\\n --guidance-scale 7.5\n```\n\n## Cache-DiT Acceleration\n\nEnable caching for significant 
speed-ups:\n\n```python\nomni = Omni(\n model=\"stabilityai/stable-diffusion-3.5-medium\",\n cache_backend=\"cache_dit\",\n cache_config={\n \"Fn_compute_blocks\": 8,\n \"Bn_compute_blocks\": 0,\n \"max_warmup_steps\": 4,\n \"residual_diff_threshold\": 0.12,\n },\n)\n```\n\n## Key Parameters\n\n| Parameter | Default | Description |\n|-----------|---------|-------------|\n| `height` | 1024 | Image height (multiples of 16) |\n| `width` | 1024 | Image width (multiples of 16) |\n| `num_inference_steps` | 28 | Denoising steps |\n| `guidance_scale` | 1.0 | Classifier-free guidance scale |\n\n## References\n\n- [SD3.5 Medium](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium)\n- [SD3.5 Large](https://huggingface.co/stabilityai/stable-diffusion-3.5-large)\n- [SD3.5 Large Turbo](https://huggingface.co/stabilityai/stable-diffusion-3.5-large-turbo)\n- [Cache-DiT Acceleration](https://github.com/vipshop/cache-dit)\n" + } + }, + "stepfun-ai/Step-3.5-Flash": { + "hf_id": "stepfun-ai/Step-3.5-Flash", + "meta": { + "title": "Step-3.5-Flash", + "provider": "StepFun", + "description": "Production-grade reasoning MoE (~196B total / 11B active parameters) with hybrid attention schedules, SWA compensation, and multi-token prediction for low-latency long-context inference", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified", + "b200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "196B" + }, + "recipe": { + "meta": { + "title": "Step-3.5-Flash", + "slug": "step-3.5-flash", + "provider": "StepFun", + "description": "Production-grade reasoning MoE (~196B total / 11B active parameters) with hybrid attention schedules, SWA compensation, and multi-token prediction for low-latency long-context inference", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "Sparse MoE reasoning model with hybrid attention and step3p5 MTP speculative decoding", + "related_recipes": 
[], + "hardware": { + "h200": "verified", + "b200": "verified" + } + }, + "model": { + "model_id": "stepfun-ai/Step-3.5-Flash", + "min_vllm_version": "0.11.0", + "architecture": "moe", + "parameter_count": "196B", + "active_parameters": "11B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Step-3.5 tool call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "step3p5", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "Step-3.5 reasoning parser for chain-of-thought extraction", + "args": [ + "--reasoning-parser", + "step3p5" + ] + }, + "spec_decoding": { + "description": "Multi-Token Prediction speculative decoding with the step3p5_mtp method", + "args": [ + "--hf-overrides", + "{\"num_nextn_predict_layers\": 1}", + "--speculative-config", + "{\"method\": \"step3p5_mtp\", \"num_speculative_tokens\": 1}" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 470, + "description": "Full precision BF16 \u2014 runs on 4xH200/H20/B200" + }, + "fp8": { + "model_id": "stepfun-ai/Step-3.5-Flash-FP8", + "precision": "fp8", + "vram_minimum_gb": 235, + "tp": 2, + "description": "Native FP8 checkpoint (TP not supported beyond 2 \u2014 use DP4)" + }, + "int4": { + "model_id": "stepfun-ai/Step-3.5-Flash-INT4", + "precision": "int4", + "vram_minimum_gb": 118, + "description": "INT4 quantized weights" + }, + "int8": { + "model_id": "stepfun-ai/Step-3.5-Flash-INT8", + "precision": "int8", + "vram_minimum_gb": 235, + "description": "INT8 quantized weights" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep", + "pd_cluster" + ], + "hardware_overrides": { + "blackwell": { + "extra_args": [], + "extra_env": { + 
"VLLM_USE_FLASHINFER_MOE_FP8": "0" + } + }, + "amd": { + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_USE_AITER_MOE": "0" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Step-3.5-Flash](https://huggingface.co/stepfun-ai/Step-3.5-Flash) is an advanced\nreasoning engine from [StepFun](https://www.stepfun.com/company). Highlights:\n\n- Hybrid attention schedules with compensation for sliding-window attention (SWA)\n- Sparse MoE structure (196B total parameters, 11B active)\n- Multi-token prediction mechanism for faster inference\n\nAvailable precisions:\n\n- [stepfun-ai/Step-3.5-Flash](https://huggingface.co/stepfun-ai/Step-3.5-Flash) (BF16)\n- [stepfun-ai/Step-3.5-Flash-FP8](https://huggingface.co/stepfun-ai/Step-3.5-Flash-FP8)\n- [stepfun-ai/Step-3.5-Flash-Int4](https://huggingface.co/stepfun-ai/Step-3.5-Flash-Int4) (not yet supported by vLLM)\n\n## Prerequisites\n\n- **vLLM version:** latest stable\n- **Hardware:** 4x H200/H20/B200\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install vllm --torch-backend auto\n```\n\n## Launching the Server\n\n### Tensor Parallel\n\n```bash\nvllm serve stepfun-ai/Step-3.5-Flash \\\n --tensor-parallel-size 4 \\\n --reasoning-parser step3p5 \\\n --tool-call-parser step3p5 \\\n --enable-auto-tool-choice \\\n --trust-remote-code\n```\n\nNote: The FP8 version cannot use TP4 \u2014 use DP4 instead.\n\n### Data Parallel + Expert Parallel (recommended for FP8)\n\n```bash\nvllm serve stepfun-ai/Step-3.5-Flash \\\n --data-parallel-size 4 \\\n --enable-expert-parallel \\\n --reasoning-parser step3p5 \\\n --tool-call-parser step3p5 \\\n --enable-auto-tool-choice \\\n --trust-remote-code\n```\n\n### Enabling MTP Speculative Decoding\n\n```bash\nvllm serve stepfun-ai/Step-3.5-Flash \\\n --tensor-parallel-size 4 \\\n --reasoning-parser step3p5 \\\n --tool-call-parser step3p5 \\\n --enable-auto-tool-choice \\\n --trust-remote-code \\\n --hf-overrides 
'{\"num_nextn_predict_layers\": 1}' \\\n --speculative-config '{\"method\": \"step3p5_mtp\", \"num_speculative_tokens\": 1}'\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --backend vllm \\\n --model stepfun-ai/Step-3.5-Flash \\\n --endpoint /v1/completions \\\n --dataset-name random \\\n --random-input 2048 \\\n --random-output 1024 \\\n --max-concurrency 10 \\\n --num-prompt 100\n```\n\n## Troubleshooting\n\n- **MoE kernel tuning:** See [tune-moe-kernel](https://github.com/vllm-project/recipes/blob/main/Qwen/Qwen3-Next.md#tune-moe-kernel)\n to tune Triton kernels for your hardware.\n- **FP8 DeepGEMM:** For FP8, install DeepGEMM via [install_deepgemm.sh](https://github.com/vllm-project/vllm/blob/v0.16.0rc0/tools/install_deepgemm.sh).\n- **B200 FlashInfer FP8 MoE error:** If you see\n `routing_logits must be bfloat16` when serving FP8 on B200, set\n `export VLLM_USE_FLASHINFER_MOE_FP8=0` as a workaround.\n- **FP8 + TP4 incompatibility:** Use DP4+EP instead.\n\n## References\n\n- [Model card](https://huggingface.co/stepfun-ai/Step-3.5-Flash)\n- [FP8 checkpoint](https://huggingface.co/stepfun-ai/Step-3.5-Flash-FP8)\n- [StepFun](https://www.stepfun.com/company)\n" + } + }, + "stepfun-ai/Step-3.7-Flash": { + "hf_id": "stepfun-ai/Step-3.7-Flash", + "meta": { + "title": "Step-3.7-Flash", + "provider": "StepFun", + "description": "Production-grade vision-language MoE (~198B total / 11B active parameters) combining a 196B sparse language backbone with a 1.8B perception encoder, hybrid SWA/Global attention, and 3-way Multi-Token Prediction", + "tasks": [ + "multimodal" + ], + "hardware": { + "h200": "verified", + "b200": "verified", + "dgx_station_gb300": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "198B" + }, + "recipe": { + "meta": { + "title": "Step-3.7-Flash", + "slug": "step-3.7-flash", + "provider": "StepFun", + "description": "Production-grade vision-language MoE (~198B total / 11B active parameters) combining a 196B 
sparse language backbone with a 1.8B perception encoder, hybrid SWA/Global attention, and 3-way Multi-Token Prediction", + "date_updated": "2026-05-30", + "difficulty": "intermediate", + "tasks": [ + "multimodal" + ], + "performance_headline": "Sparse MoE VLM with hybrid attention and 3-layer MTP speculative decoding", + "related_recipes": [ + "stepfun-ai/Step-3.5-Flash" + ], + "hardware": { + "h200": "verified", + "b200": "verified", + "dgx_station_gb300": "verified" + } + }, + "model": { + "model_id": "stepfun-ai/Step-3.7-Flash", + "min_vllm_version": "nightly", + "nightly_required": true, + "docker_image": "vllm/vllm-openai:stepfun37", + "install": { + "docker": { + "note": "Dedicated Step-3.7 image \u2014 preferred over the nightly pip wheel until support lands in vllm:latest." + } + }, + "architecture": "moe", + "parameter_count": "198B", + "active_parameters": "11B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code", + "--enable-expert-parallel", + "--disable-cascade-attn" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Step-3.5 tool call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "step3p5", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "Step-3.5 reasoning parser for chain-of-thought extraction", + "args": [ + "--reasoning-parser", + "step3p5" + ] + }, + "spec_decoding": { + "description": "3-layer Multi-Token Prediction speculative decoding (MTP-3)", + "args": [ + "--speculative-config", + "{\"method\": \"mtp\", \"num_speculative_tokens\": 3}" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 475, + "description": "Full precision BF16 \u2014 recommended on 8xH200/B200 with TP8+EP" + }, + "fp8": { + "model_id": "stepfun-ai/Step-3.7-Flash-FP8", + "precision": "fp8", + "vram_minimum_gb": 238, + "description": "Native FP8 checkpoint \u2014 runs on 8xH200/B200 with TP8+EP" 
+ }, + "nvfp4": { + "model_id": "stepfun-ai/Step-3.7-Flash-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 119, + "tp": 4, + "extra_args": [ + "--quantization", + "modelopt", + "--kv-cache-dtype", + "fp8", + "--gpu-memory-utilization", + "0.9", + "--async-scheduling" + ], + "description": "NVFP4 quantized \u2014 Blackwell only; TP4+EP with FP8 KV cache" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tep", + "multi_node_dep", + "pd_cluster" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Step-3.7-Flash](https://huggingface.co/stepfun-ai/Step-3.7-Flash) is a 198B-parameter\nsparse Mixture-of-Experts vision-language model from\n[StepFun](https://www.stepfun.com/company), pairing a 196B language backbone with a\n1.8B perception encoder. It activates ~11B parameters per token and supports a\n256k context window with three selectable reasoning levels (low / medium / high).\n\nKey highlights:\n\n- **Multimodal Understanding**: Native vision encoder for single and multi-image inputs alongside text\n- **Hybrid Attention Architecture**: Interleaves Sliding Window Attention (512-token window) and Global Attention at a 3:1 ratio\n- **Sparse MoE**: 11B active parameters out of 198B total\n- **Multi-Layer MTP**: 3-way Multi-Token Prediction (MTP-3) for low-latency reasoning chains\n\nAvailable precisions:\n\n- [stepfun-ai/Step-3.7-Flash](https://huggingface.co/stepfun-ai/Step-3.7-Flash) (BF16)\n- [stepfun-ai/Step-3.7-Flash-FP8](https://huggingface.co/stepfun-ai/Step-3.7-Flash-FP8)\n- [stepfun-ai/Step-3.7-Flash-NVFP4](https://huggingface.co/stepfun-ai/Step-3.7-Flash-NVFP4) (Blackwell only)\n\n## Prerequisites\n\n- **vLLM version:** nightly (the model registry hasn't shipped in a stable release yet)\n- **Hardware:** 8xH200/B200 for BF16 and FP8; 4xB200 for NVFP4\n\n### Install vLLM (nightly)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip 
install -U vllm --pre \\\n --extra-index-url https://wheels.vllm.ai/nightly\n```\n\nOr via Docker:\n\n```bash\ndocker pull vllm/vllm-openai:stepfun37\n```\n\n## Launching the Server\n\n### BF16\n\n```bash\nvllm serve stepfun-ai/Step-3.7-Flash \\\n --served-model-name step3p7-flash \\\n --tensor-parallel-size 8 \\\n --enable-expert-parallel \\\n --disable-cascade-attn \\\n --reasoning-parser step3p5 \\\n --tool-call-parser step3p5 \\\n --enable-auto-tool-choice \\\n --speculative-config '{\"method\": \"mtp\", \"num_speculative_tokens\": 3}' \\\n --trust-remote-code\n```\n\n### FP8\n\n```bash\nvllm serve stepfun-ai/Step-3.7-Flash-FP8 \\\n --served-model-name step3p7-flash \\\n --tensor-parallel-size 8 \\\n --enable-expert-parallel \\\n --disable-cascade-attn \\\n --reasoning-parser step3p5 \\\n --tool-call-parser step3p5 \\\n --enable-auto-tool-choice \\\n --speculative-config '{\"method\": \"mtp\", \"num_speculative_tokens\": 3}' \\\n --trust-remote-code\n```\n\n### NVFP4 (Blackwell only)\n\nRequires modelopt quantization and FP8 KV cache alignment.\n\n```bash\nvllm serve stepfun-ai/Step-3.7-Flash-NVFP4 \\\n --served-model-name step3p7 \\\n --tensor-parallel-size 4 \\\n --gpu-memory-utilization 0.9 \\\n --enable-expert-parallel \\\n --quantization modelopt \\\n --kv-cache-dtype fp8 \\\n --reasoning-parser step3p5 \\\n --tool-call-parser step3p5 \\\n --enable-auto-tool-choice \\\n --async-scheduling \\\n --trust-remote-code\n```\n\n### DGX Station Single-GPU\n\nThe [DGX Station](https://www.nvidia.com/en-us/products/workstations/dgx-station/)\nships a single GB300 Grace-Blackwell Ultra Superchip with 252 GB of HBM3e, so the\nFP8 and NVFP4 checkpoints both fit entirely in VRAM on one GPU (BF16 at ~475 GB does\nnot). 
Use the dedicated `vllm/vllm-openai:stepfun37` image and serve on a single GPU:\n\n```bash\nvllm serve stepfun-ai/Step-3.7-Flash-FP8 \\\n --served-model-name step3p7-flash \\\n --tensor-parallel-size 1 \\\n --gpu-memory-utilization 0.95 \\\n --kv-cache-dtype fp8 \\\n --reasoning-parser step3p5 \\\n --tool-call-parser step3p5 \\\n --enable-auto-tool-choice \\\n --trust-remote-code\n```\n\nSwap in `stepfun-ai/Step-3.7-Flash-NVFP4` to run the NVFP4 checkpoint instead. Or\nlaunch the prebuilt container directly:\n\n```bash\ndocker run -d --name vllm-server \\\n --gpus all --ipc host \\\n --ulimit memlock=-1 --ulimit stack=67108864 \\\n -p 8000:8000 \\\n -e HF_TOKEN=\"$HF_TOKEN\" \\\n -v \"$HOME/.cache/huggingface/hub:/root/.cache/huggingface/hub\" \\\n vllm/vllm-openai:stepfun37 \\\n stepfun-ai/Step-3.7-Flash-FP8 \\\n --gpu-memory-utilization 0.95 \\\n --trust-remote-code \\\n --reasoning-parser step3p5 \\\n --enable-auto-tool-choice \\\n --tool-call-parser step3p5 \\\n --kv-cache-dtype fp8\n```\n\nSee the [NVIDIA build DGX Station instructions](https://build.nvidia.com/station/vllm/instructions)\nfor the full container setup.\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --backend vllm \\\n --model stepfun-ai/Step-3.7-Flash \\\n --endpoint /v1/completions \\\n --dataset-name random \\\n --random-input 2048 \\\n --random-output 1024 \\\n --max-concurrency 10 \\\n --num-prompt 100\n```\n\n## Troubleshooting\n\n- **MoE kernel tuning:** See [tune-moe-kernel](https://github.com/vllm-project/recipes/blob/main/Qwen/Qwen3-Next.md#tune-moe-kernel) to tune Triton kernels for your hardware.\n- **NVFP4 + TP > 4:** The author recommends TP4+EP for NVFP4. 
Higher TP isn't validated.\n- **Cascade attention:** Always pass `--disable-cascade-attn` \u2014 the hybrid SWA/GA schedule is not compatible with cascade attention in vLLM.\n\n## References\n\n- [Model card](https://huggingface.co/stepfun-ai/Step-3.7-Flash)\n- [FP8 checkpoint](https://huggingface.co/stepfun-ai/Step-3.7-Flash-FP8)\n- [NVFP4 checkpoint](https://huggingface.co/stepfun-ai/Step-3.7-Flash-NVFP4)\n- [StepFun](https://www.stepfun.com/company)\n" + } + }, + "tencent/Hunyuan-A13B-Instruct": { + "hf_id": "tencent/Hunyuan-A13B-Instruct", + "meta": { + "title": "Hunyuan-A13B-Instruct", + "provider": "Hunyuan (Tencent)", + "description": "Tencent Hunyuan A13B instruct-tuned MoE language model with AITER-accelerated AMD ROCm deployment", + "tasks": [ + "text" + ], + "hardware": { + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "80B" + }, + "recipe": { + "meta": { + "title": "Hunyuan-A13B-Instruct", + "slug": "hunyuan-a13b-instruct", + "provider": "Hunyuan (Tencent)", + "description": "Tencent Hunyuan A13B instruct-tuned MoE language model with AITER-accelerated AMD ROCm deployment", + "date_updated": "2026-04-17", + "difficulty": "beginner", + "tasks": [ + "text" + ], + "performance_headline": "Hunyuan-A13B MoE with AITER acceleration on AMD MI300X/MI325X/MI355X", + "related_recipes": [], + "hardware": { + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "tencent/Hunyuan-A13B-Instruct", + "min_vllm_version": "0.11.0", + "architecture": "moe", + "parameter_count": "80B", + "active_parameters": "13B", + "context_length": 32768, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 580, + "description": "Full precision BF16 \u2014 2x GPU (TP=2) on AMD MI300X/MI325X/MI355X" + }, + "fp8": 
{ + "model_id": "tencent/Hunyuan-A13B-Instruct-FP8", + "precision": "fp8", + "vram_minimum_gb": 96, + "description": "FP8 quantized weights for Hopper/Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ] + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "multi_node_tp", + "multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": { + "amd": { + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nHunyuan-A13B-Instruct is Tencent's instruct-tuned Hunyuan MoE model. This recipe\ncovers deployment on AMD ROCm GPUs (MI300X / MI325X / MI355X) with\n[AITER](https://github.com/ROCm/aiter) acceleration enabled via\n`VLLM_ROCM_USE_AITER=1`.\n\n## Prerequisites\n\n- **vLLM version:** ROCm build\n- **Python:** 3.12\n- **Hardware:** AMD MI300X / MI325X / MI355X\n- **ROCm:** 7.0+, glibc >= 2.35 (or use Docker)\n\n### Install vLLM (ROCm)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/\n```\n\nIf the environment does not meet the Python/ROCm/glibc requirements, use the\nDocker-based setup from the vLLM install docs.\n\n## Launching the Server\n\n```bash\nexport VLLM_ROCM_USE_AITER=1\nvllm serve tencent/Hunyuan-A13B-Instruct \\\n --tensor-parallel-size 2 \\\n --trust-remote-code\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --model \"tencent/Hunyuan-A13B-Instruct\" \\\n --dataset-name random \\\n --random-input-len 8000 \\\n --random-output-len 1000 \\\n --request-rate 10000 \\\n --num-prompts 16 \\\n --ignore-eos\n```\n\n## Troubleshooting\n\n- **First launch delay:** AITER JIT-compiles optimized kernels on first launch, which\n can take several minutes. 
Subsequent runs use cached kernels.\n- **Environment mismatch:** If wheel install fails, fall back to the vLLM ROCm Docker image.\n\n## References\n\n- [Model card](https://huggingface.co/tencent/Hunyuan-A13B-Instruct)\n- [AITER](https://github.com/ROCm/aiter)\n- [vLLM install docs](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/)\n" + } + }, + "tencent/HunyuanOCR": { + "hf_id": "tencent/HunyuanOCR", + "meta": { + "title": "HunyuanOCR", + "provider": "Hunyuan (Tencent)", + "description": "Tencent Hunyuan end-to-end OCR expert VLM (~1B) for online OCR serving with an OpenAI-compatible API", + "tasks": [ + "multimodal" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "1B" + }, + "recipe": { + "meta": { + "title": "HunyuanOCR", + "slug": "hunyuan-ocr", + "provider": "Hunyuan (Tencent)", + "description": "Tencent Hunyuan end-to-end OCR expert VLM (~1B) for online OCR serving with an OpenAI-compatible API", + "date_updated": "2026-04-17", + "difficulty": "beginner", + "tasks": [ + "multimodal" + ], + "performance_headline": "Compact 1B end-to-end OCR VLM from the Hunyuan native multimodal family", + "related_recipes": [] + }, + "model": { + "model_id": "tencent/HunyuanOCR", + "min_vllm_version": "0.11.0", + "architecture": "dense", + "parameter_count": "1B", + "active_parameters": "1B", + "context_length": 32768, + "base_args": [ + "--no-enable-prefix-caching", + "--mm-processor-cache-gb", + "0" + ], + "base_env": {} + }, + "features": { + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. 
Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only", + "encoder_parallel" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 2, + "description": "Full precision BF16 \u2014 single-GPU deployment" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[HunyuanOCR](https://huggingface.co/tencent/HunyuanOCR) is a leading end-to-end OCR\nexpert VLM powered by Hunyuan's native multimodal architecture. This recipe covers\nonline serving with the OpenAI-compatible API.\n\n## Prerequisites\n\n- **vLLM version:** latest stable\n- **Hardware:** single GPU (1B model)\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n## Launching the Server\n\n```bash\nvllm serve tencent/HunyuanOCR \\\n --no-enable-prefix-caching \\\n --mm-processor-cache-gb 0\n```\n\n### Configuration Tips\n\n- Use greedy sampling (`temperature=0.0`) or low temperature for optimal OCR accuracy.\n- OCR tasks generally do not benefit from prefix caching or image reuse; disabling\n them (as above) removes hashing/caching overhead.\n- Adjust `--max-num-batched-tokens` for throughput based on your hardware.\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\", timeout=3600)\n\nmessages = [\n {\"role\": \"system\", \"content\": \"\"},\n {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/chat-ui/tools-dark.png\"}},\n {\n \"type\": \"text\",\n \"text\": (\n \"Extract all information from the main body of the document image \"\n \"and represent it in markdown format, ignoring headers and footers. 
\"\n \"Tables should be expressed in HTML format, formulas in the document \"\n \"should be represented using LaTeX format, and the parsing should be \"\n \"organized according to the reading order.\"\n )\n }\n ]\n }\n]\n\nresponse = client.chat.completions.create(\n model=\"tencent/HunyuanOCR\",\n messages=messages,\n temperature=0.0,\n extra_body={\"top_k\": 1, \"repetition_penalty\": 1.0},\n)\nprint(response.choices[0].message.content)\n```\n\n## Troubleshooting\n\n- **Accuracy:** Use `temperature=0.0` and `top_k=1` for deterministic OCR output.\n- **Application-oriented prompts:** See the [official model card](https://huggingface.co/tencent/HunyuanOCR#%F0%9F%92%AC-application-oriented-prompts)\n for prompts tuned to various document parsing tasks.\n\n## References\n\n- [Model card](https://huggingface.co/tencent/HunyuanOCR)\n" + } + }, + "tencent/Hy3-preview": { + "hf_id": "tencent/Hy3-preview", + "meta": { + "title": "Hy3-preview", + "provider": "Hunyuan (Tencent)", + "description": "Tencent Hunyuan Hy3-preview \u2014 scaled-up MoE language model (295B total / 21B active) with a 3.8B MTP layer for speculative decoding, 256K context, and hy_v3 tool/reasoning parsers", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi350x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "295B" + }, + "recipe": { + "meta": { + "title": "Hy3-preview", + "slug": "hy3-preview", + "provider": "Hunyuan (Tencent)", + "description": "Tencent Hunyuan Hy3-preview \u2014 scaled-up MoE language model (295B total / 21B active) with a 3.8B MTP layer for speculative decoding, 256K context, and hy_v3 tool/reasoning parsers", + "date_updated": "2026-04-23", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "Hunyuan Hy3-preview MoE \u2014 295B/21B on 8\u00d7H200, 8\u00d7H20-3e(141GB), or 8\u00d7AMD MI300X/MI355X with MTP", + 
"related_recipes": [], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi350x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "tencent/Hy3-preview", + "min_vllm_version": "0.20.0", + "install": { + "docker": { + "note": "Use the dedicated hy3-preview image until changes land in vllm:latest." + } + }, + "architecture": "moe", + "parameter_count": "295B", + "active_parameters": "21B", + "context_length": 262144, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "Hunyuan v3 tool call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "hy_v3", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "Hunyuan v3 reasoning parser for thinking-mode chain-of-thought extraction", + "args": [ + "--reasoning-parser", + "hy_v3" + ] + }, + "spec_decoding": { + "description": "Multi-Token Prediction speculative decoding using the model's built-in MTP layer", + "args": [ + "--speculative-config", + "{\"method\":\"mtp\",\"num_speculative_tokens\":1}" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 708, + "description": "Full precision BF16 \u2014 8\u00d7H200 or 8\u00d7H20-3e(141GB) minimum for weights + KV cache" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": { + "amd": { + "install_note": "Hy3-preview model code is being added in PR #40681. 
Until it merges, build\nvLLM editable from the PR branch in rocm/vllm-dev:nightly:\n\n docker run -it --device=/dev/kfd --device=/dev/dri --network=host \\\n --ipc=host --shm-size=128g --group-add video --cap-add SYS_PTRACE \\\n --security-opt seccomp=unconfined -v ~/work:/work -w /work \\\n -e PYTHONPATH=/work/vllm rocm/vllm-dev:nightly bash\n git clone -b feature/support_hy_v3 \\\n https://github.com/stevenkuang-tencent/vllm.git\n cd vllm && pip uninstall -y vllm\n SETUPTOOLS_SCM_PRETEND_VERSION=0.20.0.dev0 VLLM_TARGET_DEVICE=rocm \\\n pip install --editable . --no-build-isolation\n\nSetting PYTHONPATH avoids a known editable-install conflict with the\nempty /app/vllm namespace directory shipped in the base image.\n", + "extra_args": [], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "VLLM_ROCM_USE_AITER_MOE": "1", + "VLLM_ROCM_USE_AITER_MHA": "1", + "VLLM_ROCM_USE_AITER_RMSNORM": "1", + "VLLM_ROCM_USE_AITER_LINEAR": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "# Hy3-preview Usage Guide\n\nHy3-preview is Tencent Hunyuan's latest open-source Mixture-of-Experts language model:\n295B total parameters with 21B activated per token, plus a 3.8B MTP layer for\nspeculative decoding. 
80 transformer layers, 192 routed experts (top-8) + 1 shared\nexpert, GQA with 64 heads over 8 KV heads, 256K context.\n\nA pretrained base checkpoint is published at\n[tencent/Hy3-preview-Base](https://huggingface.co/tencent/Hy3-preview-Base); this\nrecipe covers the instruct model.\n\n## Setup\n\nChoose one of the following setup methods.\n\n### Using Docker\n\n```bash\ndocker run --gpus all \\\n -p 8000:8000 \\\n --ipc=host \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai:hy3-preview tencent/Hy3-preview \\\n --tensor-parallel-size 8 \\\n --tool-call-parser hy_v3 \\\n --reasoning-parser hy_v3 \\\n --enable-auto-tool-choice \\\n --served-model-name hy3-preview\n```\n\n### Installing from source\n\n```bash\nuv venv --python 3.12 --seed --managed-python\nsource .venv/bin/activate\ngit clone https://github.com/vllm-project/vllm.git\ncd vllm\nuv pip install --editable . --torch-backend=auto\n```\n\n## Model Deployment\n\nTo serve Hy3-preview on 8 GPUs, use H20-3e(141GB), H200, AMD MI300X (192 GB),\nAMD MI325X (256 GB), AMD MI350X/MI355X (288 GB), or other GPUs with larger memory capacity. Smaller-memory\n8-GPU configurations (8\u00d7H100 80GB, 8\u00d7A100 80GB) do not fit the BF16 weights plus KV\ncache \u2014 use multi-node TP for those.\n\n### Serving on 8\u00d7AMD MI300X / MI325X / MI350X / MI355X\n\nHy3-preview support is being added in vLLM PR\n[#40681](https://github.com/vllm-project/vllm/pull/40681). Until it merges, AMD users\nmust build vLLM from the PR branch inside the published ROCm vLLM nightly image\n(`rocm/vllm-dev:nightly`). 
See the AMD install note in `hardware_overrides.amd`\nabove for the full reproducer.\n\nOnce vLLM is installed, serve with the standard launcher plus the AITER environment\nvariables (the recipe's `hardware_overrides.amd.extra_env` applies these\nautomatically when the AMD profile is selected on the recipe site):\n\n```bash\nexport VLLM_ROCM_USE_AITER=1\nexport VLLM_ROCM_USE_AITER_MOE=1\nexport VLLM_ROCM_USE_AITER_MHA=1\nexport VLLM_ROCM_USE_AITER_RMSNORM=1\nexport VLLM_ROCM_USE_AITER_LINEAR=1\n\nvllm serve tencent/Hy3-preview \\\n --tensor-parallel-size 8 \\\n --tool-call-parser hy_v3 \\\n --reasoning-parser hy_v3 \\\n --enable-auto-tool-choice \\\n --served-model-name hy3-preview \\\n --gpu-memory-utilization 0.90\n```\n\nMTP (recommended on AMD for lower latency, same flags as the NVIDIA path):\n\n```bash\nvllm serve tencent/Hy3-preview \\\n --tensor-parallel-size 8 \\\n --speculative-config.method mtp \\\n --speculative-config.num_speculative_tokens 1 \\\n --tool-call-parser hy_v3 \\\n --reasoning-parser hy_v3 \\\n --enable-auto-tool-choice \\\n --served-model-name hy3-preview \\\n --gpu-memory-utilization 0.90\n```\n\n### Serving on 8\u00d7H200 or 8\u00d7H20-3e(141GB)\n\nWithout Multi-Token Prediction (MTP):\n\n```bash\nvllm serve tencent/Hy3-preview \\\n --tensor-parallel-size 8 \\\n --tool-call-parser hy_v3 \\\n --reasoning-parser hy_v3 \\\n --enable-auto-tool-choice \\\n --served-model-name hy3-preview\n```\n\nWith MTP (recommended for lower latency):\n\n```bash\nvllm serve tencent/Hy3-preview \\\n --tensor-parallel-size 8 \\\n --speculative-config.method mtp \\\n --speculative-config.num_speculative_tokens 1 \\\n --tool-call-parser hy_v3 \\\n --reasoning-parser hy_v3 \\\n --enable-auto-tool-choice \\\n --served-model-name hy3-preview\n```\n\n## Sampling and Reasoning Modes\n\nTencent's recommended sampling parameters: `temperature=0.9`, `top_p=1.0`.\n\nReasoning is controlled via `chat_template_kwargs.reasoning_effort`:\n\n| Value | Behavior 
|\n|:---|:---|\n| `no_think` (default) | Direct response, no chain-of-thought |\n| `low` | Light reasoning |\n| `high` | Deep chain-of-thought for math/coding/complex reasoning |\n\nWhen tools are registered, set `interleaved_thinking: true` to allow the model to\nthink between tool calls.\n\n### OpenAI Client Example\n\n```bash\nuv pip install -U openai\n```\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\nmessages = [\n {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n {\"role\": \"user\", \"content\": \"Hello.\"},\n]\n\n# Direct response (default).\nresp = client.chat.completions.create(\n model=\"hy3-preview\",\n messages=messages,\n temperature=0.9,\n top_p=1.0,\n max_tokens=4096,\n)\nprint(resp.choices[0].message.content)\n\n# Deep reasoning: set reasoning_effort (and interleaved_thinking if using tools).\nresp_think = client.chat.completions.create(\n model=\"hy3-preview\",\n messages=messages,\n temperature=0.9,\n top_p=1.0,\n max_tokens=4096,\n extra_body={\n \"chat_template_kwargs\": {\n \"reasoning_effort\": \"high\",\n \"interleaved_thinking\": True,\n },\n },\n)\noutput_msg = resp_think.choices[0].message\nprint(output_msg.reasoning_content) # chain-of-thought\nprint(output_msg.content) # final answer\n```\n\n### cURL Usage\n\n```bash\ncurl http://localhost:8000/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"hy3-preview\",\n \"messages\": [\n {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n {\"role\": \"user\", \"content\": \"Hello.\"}\n ],\n \"temperature\": 0.9,\n \"top_p\": 1.0,\n \"max_tokens\": 4096\n }'\n```\n\n## Benchmarking\n\nFor benchmarking, disable prefix caching by adding `--no-enable-prefix-caching` to\nthe server command.\n\nThe following uses 8\u00d7H20-3e(141GB) as an example.\n\n```bash\nvllm bench serve \\\n --model tencent/Hy3-preview \\\n --dataset-name random \\\n 
--random-input-len 8192 \\\n --random-output-len 1024 \\\n --max-concurrency 32 \\\n --num-prompts 160 \\\n --served-model-name hy3-preview\n```\n\nRepresentative output:\n\n```shell\n============ Serving Benchmark Result ============\nSuccessful requests: 160 \nFailed requests: 0 \nMaximum request concurrency: 32 \nBenchmark duration (s): 280.58 \nTotal input tokens: 1310720 \nTotal generated tokens: 163840 \nRequest throughput (req/s): 0.57 \nOutput token throughput (tok/s): 583.93 \nPeak output token throughput (tok/s): 1024.00 \nPeak concurrent requests: 36.00 \nTotal token throughput (tok/s): 5255.36 \n---------------Time to First Token----------------\nMean TTFT (ms): 4542.41 \nMedian TTFT (ms): 2762.17 \nP99 TTFT (ms): 21062.96 \n-----Time per Output Token (excl. 1st token)------\nMean TPOT (ms): 50.34 \nMedian TPOT (ms): 51.77 \nP99 TPOT (ms): 54.07 \n---------------Inter-token Latency----------------\nMean ITL (ms): 50.34 \nMedian ITL (ms): 34.32 \nP99 ITL (ms): 689.10 \n==================================================\n```\n## References\n\n- [Hugging Face model card](https://huggingface.co/tencent/Hy3-preview)\n- [Hy3-preview-Base (pretrained)](https://huggingface.co/tencent/Hy3-preview-Base)\n- [GitHub: Tencent-Hunyuan/Hy3-preview](https://github.com/Tencent-Hunyuan/Hy3-preview)\n- [Tencent Hunyuan](https://hy.tencent.com/research/hy3)\n" + } + }, + "zai-org/GLM-4.5": { + "hf_id": "zai-org/GLM-4.5", + "meta": { + "title": "GLM-4.5", + "provider": "GLM (Z-AI)", + "description": "GLM-4.5 MoE language model (~358B total parameters, BF16) with built-in MTP layers for speculative decoding and native tool calling", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "358B" + }, + "recipe": { + "meta": { + "title": "GLM-4.5", + "slug": "glm-4.5", + "provider": "GLM (Z-AI)", + "description": "GLM-4.5 MoE 
language model (~358B total parameters, BF16) with built-in MTP layers for speculative decoding and native tool calling", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "GLM-4.X series MoE model with native FP8 and BF16 support and MTP speculative decoding", + "related_recipes": [], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "zai-org/GLM-4.5", + "min_vllm_version": "0.11.0", + "architecture": "moe", + "parameter_count": "358B", + "active_parameters": "32B", + "context_length": 131072, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "GLM-4.5 tool call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "glm45", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "GLM-4.5 reasoning parser for chain-of-thought extraction", + "args": [ + "--reasoning-parser", + "glm45" + ] + }, + "spec_decoding": { + "description": "Multi-Token Prediction speculative decoding using the model's built-in MTP layers", + "args": [ + "--speculative-config.method", + "mtp", + "--speculative-config.num_speculative_tokens", + "1" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 859, + "description": "Full precision BF16 on 8xH200 or equivalent" + }, + "fp8": { + "model_id": "zai-org/GLM-4.5-FP8", + "precision": "fp8", + "vram_minimum_gb": 430, + "description": "Native FP8 checkpoint with minimal accuracy loss \u2014 recommended for cost-efficient serving" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## 
Overview\n\nGLM-4.5 is a Mixture-of-Experts language model from Z-AI with ~358B total parameters.\nThe checkpoint ships in both BF16 and native FP8 formats. FP8 models have minimal\naccuracy loss, so unless you need strict reproducibility for benchmarking, FP8 is the\nrecommended precision for lower-cost serving. All GLM-4.X models include built-in\nMulti-Token Prediction (MTP) layers that enable speculative decoding for higher\ngeneration throughput.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.11.0 (latest stable recommended)\n- **Hardware:** 8x H200 (BF16) or 4x-8x H200 (FP8), AMD MI300X / MI325X / MI355X for ROCm\n- **Python:** 3.10 - 3.13 (3.12 required for ROCm wheels)\n\n### Install vLLM (NVIDIA)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n### Install vLLM (AMD ROCm)\n\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm\n```\n\nThe vLLM wheel for ROCm requires Python 3.12, ROCm 7.0, and glibc >= 2.35.\n\n## Launching the Server\n\n### Tensor Parallel (FP8 on 8 GPUs)\n\n```bash\nvllm serve zai-org/GLM-4.5-FP8 \\\n --tensor-parallel-size 8 \\\n --tool-call-parser glm45 \\\n --reasoning-parser glm45 \\\n --enable-auto-tool-choice\n```\n\n### Enabling MTP Speculative Decoding\n\n```bash\nvllm serve zai-org/GLM-4.5-FP8 \\\n --tensor-parallel-size 4 \\\n --speculative-config.method mtp \\\n --speculative-config.num_speculative_tokens 1 \\\n --tool-call-parser glm45 \\\n --reasoning-parser glm45 \\\n --enable-auto-tool-choice\n```\n\nUse `--speculative-config.num_speculative_tokens 1` for optimal throughput. 
Higher\nvalues increase mean acceptance length but drop acceptance rate significantly.\n\n### Tuning Tips\n\n- `--max-model-len=65536` works well for most scenarios; max is 128K.\n- `--max-num-batched-tokens=32768` is a good default for prompt-heavy workloads.\n Reduce to 16K/8K to cut activation memory and latency.\n- Set `--gpu-memory-utilization=0.95` to maximize KV cache headroom.\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"zai-org/GLM-4.5-FP8\",\n messages=[{\"role\": \"user\", \"content\": \"Explain MTP speculative decoding.\"}],\n max_tokens=512,\n)\nprint(resp.choices[0].message.content)\n```\n\n## Benchmarking\n\nDisable prefix caching with `--no-enable-prefix-caching` on the server command,\nthen:\n\n```bash\nvllm bench serve \\\n --model zai-org/GLM-4.5-FP8 \\\n --dataset-name random \\\n --random-input-len 8000 \\\n --random-output-len 1000 \\\n --request-rate 10000 \\\n --num-prompts 16 \\\n --ignore-eos\n```\n\n## Troubleshooting\n\n- **Tool calling not firing:** Ensure `--tool-call-parser glm45 --enable-auto-tool-choice`\n are both present.\n- **MTP memory overhead:** MTP adds memory for draft computations. 
Monitor GPU memory\n and reduce `--max-model-len` or `--max-num-batched-tokens` if you OOM.\n- **Low MTP acceptance:** If acceptance rate is below ~90%, drop speculative tokens to 1.\n\n## References\n\n- [Model card](https://huggingface.co/zai-org/GLM-4.5)\n- [FP8 checkpoint](https://huggingface.co/zai-org/GLM-4.5-FP8)\n- [vLLM docs](https://docs.vllm.ai/)\n" + } + }, + "zai-org/GLM-4.5V": { + "hf_id": "zai-org/GLM-4.5V", + "meta": { + "title": "GLM-4.5V", + "provider": "GLM (Z-AI)", + "description": "GLM-4.5 vision-language MoE model (~107B parameters, BF16) with image-text-to-text capability, 64K context, expert parallelism, and native FP8", + "tasks": [ + "multimodal" + ], + "hardware": { + "h100": "verified", + "h200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "107B" + }, + "recipe": { + "meta": { + "title": "GLM-4.5V", + "slug": "glm-4.5v", + "provider": "GLM (Z-AI)", + "description": "GLM-4.5 vision-language MoE model (~107B parameters, BF16) with image-text-to-text capability, 64K context, expert parallelism, and native FP8", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "multimodal" + ], + "performance_headline": "Multimodal GLM-4.5V with native FP8 and expert parallelism, deploys on 4xH100", + "related_recipes": [], + "hardware": { + "h100": "verified", + "h200": "verified" + } + }, + "model": { + "model_id": "zai-org/GLM-4.5V", + "min_vllm_version": "0.12.0", + "architecture": "moe", + "parameter_count": "107B", + "active_parameters": "12B", + "context_length": 65536, + "base_args": [ + "--trust-remote-code", + "--enable-expert-parallel", + "--allowed-local-media-path", + "/", + "--mm-encoder-tp-mode", + "data", + "--mm-processor-cache-type", + "shm" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "GLM-4.5 tool call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "glm45", + "--enable-auto-tool-choice" + ] + }, + 
"reasoning": { + "description": "GLM-4.5 reasoning parser", + "args": [ + "--reasoning-parser", + "glm45" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 257, + "description": "Full precision BF16 \u2014 runs on 4xH100/H200" + }, + "fp8": { + "model_id": "zai-org/GLM-4.5V-FP8", + "precision": "fp8", + "vram_minimum_gb": 128, + "description": "Native FP8 checkpoint with minimal accuracy loss \u2014 recommended for cost-efficient serving" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": { + "amd": { + "extra_args": [ + "--mm-encoder-tp-mode", + "data", + "--allowed-local-media-path", + "/" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "SAFETENSORS_FAST_GPU": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nGLM-4.5V is the vision-language variant of GLM-4.5. It is an MoE model with ~107B\ntotal parameters that accepts image and text inputs. FP8 models have minimal accuracy\nloss versus BF16 and are recommended for cost-efficient serving. 
GLM-4.5V supports a\n64K context length (use GLM-4.6V for 128K).\n\n## Prerequisites\n\n- **vLLM version:** >= 0.12.0\n- **Hardware:** 4x H100/H200 (BF16 or FP8)\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n## Launching the Server\n\n### Tensor Parallel + Expert Parallel (FP8 on 4 GPUs)\n\n```bash\nvllm serve zai-org/GLM-4.5V-FP8 \\\n --tensor-parallel-size 4 \\\n --tool-call-parser glm45 \\\n --reasoning-parser glm45 \\\n --enable-auto-tool-choice \\\n --enable-expert-parallel \\\n --allowed-local-media-path / \\\n --mm-encoder-tp-mode data \\\n --mm-processor-cache-type shm\n```\n\n### Tuning Tips\n\n- `--max-model-len=65536` is near the model's max context (64K).\n- `--max-num-batched-tokens=32768` for prompt-heavy workloads.\n- `--gpu-memory-utilization=0.95` maximizes KV cache.\n- `--mm-encoder-tp-mode data` runs the vision encoder data-parallel \u2014 preferable to TP\n since the encoder is small and TP adds communication overhead.\n- `--mm-processor-cache-type shm` enables shared-memory caching for repeated image inputs.\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"zai-org/GLM-4.5V-FP8\",\n messages=[{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://example.com/image.png\"}},\n {\"type\": \"text\", \"text\": \"Describe the image.\"}\n ]\n }],\n max_tokens=512,\n)\nprint(resp.choices[0].message.content)\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --backend openai-chat \\\n --endpoint /v1/chat/completions \\\n --model zai-org/GLM-4.5V-FP8 \\\n --dataset-name hf \\\n --dataset-path lmarena-ai/VisionArena-Chat \\\n --num-prompts 1000 \\\n --request-rate 20\n```\n\n## Troubleshooting\n\n- **Vision encoder overhead:** Use `--mm-encoder-tp-mode data` unless TP-sharded encoder is 
known-good for your config.\n- **Context length errors:** GLM-4.5V max is 64K; use GLM-4.6V if you need 128K.\n\n## References\n\n- [Model card](https://huggingface.co/zai-org/GLM-4.5V)\n- [FP8 checkpoint](https://huggingface.co/zai-org/GLM-4.5V-FP8)\n- [vLLM multimodal inputs guide](https://docs.vllm.ai/en/latest/features/multimodal_inputs.html)\n" + } + }, + "zai-org/GLM-4.6": { + "hf_id": "zai-org/GLM-4.6", + "meta": { + "title": "GLM-4.6", + "provider": "GLM (Z-AI)", + "description": "GLM-4.6 MoE language model (~357B total parameters, BF16) with MTP speculative decoding, native tool calling and reasoning", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "357B" + }, + "recipe": { + "meta": { + "title": "GLM-4.6", + "slug": "glm-4.6", + "provider": "GLM (Z-AI)", + "description": "GLM-4.6 MoE language model (~357B total parameters, BF16) with MTP speculative decoding, native tool calling and reasoning", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "Updated GLM-4.X series MoE model with native FP8 and BF16, MTP speculative decoding", + "related_recipes": [], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "zai-org/GLM-4.6", + "min_vllm_version": "0.11.0", + "architecture": "moe", + "parameter_count": "357B", + "active_parameters": "32B", + "context_length": 202752, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "GLM-4.5 tool call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "glm45", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "GLM-4.5 reasoning parser for chain-of-thought extraction", + "args": [ + 
"--reasoning-parser", + "glm45" + ] + }, + "spec_decoding": { + "description": "Multi-Token Prediction speculative decoding using the model's built-in MTP layers", + "args": [ + "--speculative-config.method", + "mtp", + "--speculative-config.num_speculative_tokens", + "1" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 857, + "description": "Full precision BF16 on 8xH200 or equivalent" + }, + "fp8": { + "model_id": "zai-org/GLM-4.6-FP8", + "precision": "fp8", + "vram_minimum_gb": 428, + "description": "Native FP8 checkpoint with minimal accuracy loss \u2014 recommended for cost-efficient serving" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nGLM-4.6 is the successor to GLM-4.5 with ~357B total parameters. It retains the\nMoE architecture and built-in Multi-Token Prediction (MTP) layers used for\nspeculative decoding. 
FP8 is the recommended precision for cost-efficient\nserving with minimal accuracy loss relative to BF16.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.11.0 (latest stable recommended)\n- **Hardware:** 8x H200 (BF16) or 4x-8x H200 (FP8), AMD MI300X / MI325X / MI355X for ROCm\n- **Python:** 3.10 - 3.13 (3.12 required for ROCm wheels)\n\n### Install vLLM (NVIDIA)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n### Install vLLM (AMD ROCm)\n\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm\n```\n\n## Launching the Server\n\n### Tensor Parallel (FP8)\n\n```bash\nvllm serve zai-org/GLM-4.6-FP8 \\\n --tensor-parallel-size 8 \\\n --tool-call-parser glm45 \\\n --reasoning-parser glm45 \\\n --enable-auto-tool-choice\n```\n\n### Enabling MTP Speculative Decoding\n\n```bash\nvllm serve zai-org/GLM-4.6-FP8 \\\n --tensor-parallel-size 4 \\\n --speculative-config.method mtp \\\n --speculative-config.num_speculative_tokens 1 \\\n --tool-call-parser glm45 \\\n --reasoning-parser glm45 \\\n --enable-auto-tool-choice\n```\n\n### Tuning Tips\n\n- `--max-model-len=65536` works well for most scenarios; max is 202752 (~200K).\n- `--max-num-batched-tokens=32768` is a good default for prompt-heavy workloads.\n- `--gpu-memory-utilization=0.95` maximizes KV cache headroom.\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"zai-org/GLM-4.6-FP8\",\n messages=[{\"role\": \"user\", \"content\": \"Summarize MTP speculative decoding.\"}],\n max_tokens=512,\n)\nprint(resp.choices[0].message.content)\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --model zai-org/GLM-4.6-FP8 \\\n --dataset-name random \\\n --random-input-len 8000 \\\n --random-output-len 1000 \\\n --request-rate 10000 \\\n --num-prompts 16 \\\n --ignore-eos\n```\n\n## Troubleshooting\n\n- **MTP memory overhead:** Monitor 
GPU memory and tune batch size when enabling MTP.\n- **Tool calling not firing:** Ensure `--tool-call-parser glm45 --enable-auto-tool-choice`\n are both present.\n\n## References\n\n- [Model card](https://huggingface.co/zai-org/GLM-4.6)\n- [FP8 checkpoint](https://huggingface.co/zai-org/GLM-4.6-FP8)\n- [vLLM docs](https://docs.vllm.ai/)\n" + } + }, + "zai-org/GLM-4.6V": { + "hf_id": "zai-org/GLM-4.6V", + "meta": { + "title": "GLM-4.6V", + "provider": "GLM (Z-AI)", + "description": "GLM-4.6 vision-language MoE model \u2014 image-text-to-text with 128K context, native FP8 checkpoint, and expert parallelism", + "tasks": [ + "multimodal" + ], + "hardware": { + "h100": "verified", + "h200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "107B" + }, + "recipe": { + "meta": { + "title": "GLM-4.6V", + "slug": "glm-4.6v", + "provider": "GLM (Z-AI)", + "description": "GLM-4.6 vision-language MoE model \u2014 image-text-to-text with 128K context, native FP8 checkpoint, and expert parallelism", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "multimodal" + ], + "performance_headline": "Updated GLM-V series with 128K context length and native FP8", + "related_recipes": [], + "hardware": { + "h100": "verified", + "h200": "verified" + } + }, + "model": { + "model_id": "zai-org/GLM-4.6V", + "min_vllm_version": "0.12.0", + "architecture": "moe", + "parameter_count": "107B", + "active_parameters": "12B", + "context_length": 131072, + "base_args": [ + "--trust-remote-code", + "--enable-expert-parallel", + "--allowed-local-media-path", + "/", + "--mm-encoder-tp-mode", + "data", + "--mm-processor-cache-type", + "shm" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "GLM-4.5 tool call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "glm45", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "GLM-4.5 reasoning parser", + "args": [ + 
"--reasoning-parser", + "glm45" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 257, + "description": "Full precision BF16 \u2014 runs on 4xH100/H200" + }, + "fp8": { + "model_id": "zai-org/GLM-4.6V-FP8", + "precision": "fp8", + "vram_minimum_gb": 128, + "description": "Native FP8 checkpoint with minimal accuracy loss" + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep" + ], + "hardware_overrides": { + "amd": { + "extra_args": [ + "--mm-encoder-tp-mode", + "data", + "--allowed-local-media-path", + "/" + ], + "extra_env": { + "VLLM_ROCM_USE_AITER": "1", + "SAFETENSORS_FAST_GPU": "1" + } + } + }, + "strategy_overrides": {}, + "guide": "## Overview\n\nGLM-4.6V is an updated vision-language MoE model from Z-AI. It supports a 128K\ncontext length (vs 64K for GLM-4.5V). 
Native FP8 is recommended for cost-efficient\nserving, matching BF16 accuracy to within a small margin.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.12.0\n- **Hardware:** 4x H100/H200 (BF16 or FP8)\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n## Launching the Server\n\n### Tensor Parallel + Expert Parallel (FP8 on 4 GPUs)\n\n```bash\nvllm serve zai-org/GLM-4.6V-FP8 \\\n --tensor-parallel-size 4 \\\n --tool-call-parser glm45 \\\n --reasoning-parser glm45 \\\n --enable-auto-tool-choice \\\n --enable-expert-parallel \\\n --allowed-local-media-path / \\\n --mm-encoder-tp-mode data \\\n --mm-processor-cache-type shm\n```\n\n### Tuning Tips\n\n- `--max-model-len=65536` is a common default; you can push to 131072.\n- `--max-num-batched-tokens=32768` for prompt-heavy workloads.\n- `--mm-encoder-tp-mode data` + `--mm-processor-cache-type shm` for efficient vision processing.\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"zai-org/GLM-4.6V-FP8\",\n messages=[{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://example.com/image.png\"}},\n {\"type\": \"text\", \"text\": \"Describe the image.\"}\n ]\n }],\n max_tokens=512,\n)\nprint(resp.choices[0].message.content)\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --backend openai-chat \\\n --endpoint /v1/chat/completions \\\n --model zai-org/GLM-4.6V-FP8 \\\n --dataset-name hf \\\n --dataset-path lmarena-ai/VisionArena-Chat \\\n --num-prompts 1000 \\\n --request-rate 20\n```\n\n## Troubleshooting\n\n- **Vision encoder overhead:** Prefer `--mm-encoder-tp-mode data` over TP for the encoder.\n- **Long-context memory:** At 128K context, tune `--max-num-batched-tokens` and\n `--gpu-memory-utilization` to prevent OOM.\n\n## References\n\n- [Model 
card](https://huggingface.co/zai-org/GLM-4.6V)\n- [vLLM multimodal inputs guide](https://docs.vllm.ai/en/latest/features/multimodal_inputs.html)\n" + } + }, + "zai-org/GLM-4.7": { + "hf_id": "zai-org/GLM-4.7", + "meta": { + "title": "GLM-4.7", + "provider": "GLM (Z-AI)", + "description": "GLM-4.7 MoE language model (~358B total parameters) with MTP speculative decoding, updated tool call parser, and reasoning support", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "358B" + }, + "recipe": { + "meta": { + "title": "GLM-4.7", + "slug": "glm-4.7", + "provider": "GLM (Z-AI)", + "description": "GLM-4.7 MoE language model (~358B total parameters) with MTP speculative decoding, updated tool call parser, and reasoning support", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "text" + ], + "performance_headline": "Latest GLM-4.X release with updated glm47 tool call parser and MTP speculative decoding", + "related_recipes": [], + "hardware": { + "h200": "verified", + "mi300x": "verified", + "mi325x": "verified", + "mi355x": "verified" + } + }, + "model": { + "model_id": "zai-org/GLM-4.7", + "min_vllm_version": "0.11.0", + "architecture": "moe", + "parameter_count": "358B", + "active_parameters": "32B", + "context_length": 202752, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "dependencies": [ + { + "note": "GLM-4.7 requires the nightly vllm wheel", + "command": "uv pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly" + }, + { + "note": "transformers from source \u2014 GLM-4.7 tokenizer is newer than any release", + "command": "uv pip install git+https://github.com/huggingface/transformers.git" + } + ], + "features": { + "tool_calling": { + "description": "GLM-4.7 tool call parser with automatic tool choice", + "args": [ + 
"--tool-call-parser", + "glm47", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "GLM-4.5 reasoning parser for chain-of-thought extraction", + "args": [ + "--reasoning-parser", + "glm45" + ] + }, + "spec_decoding": { + "description": "Multi-Token Prediction speculative decoding using the model's built-in MTP layers", + "args": [ + "--speculative-config.method", + "mtp", + "--speculative-config.num_speculative_tokens", + "1" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 859, + "description": "Full precision BF16 on 8xH200 or equivalent" + }, + "fp8": { + "model_id": "zai-org/GLM-4.7-FP8", + "precision": "fp8", + "vram_minimum_gb": 430, + "description": "Native FP8 checkpoint with minimal accuracy loss" + }, + "nvfp4": { + "model_id": "nvidia/GLM-4.7-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 215, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "single_node_dep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_dep", + "multi_node_tep", + "pd_cluster" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nGLM-4.7 is the latest GLM-4.X MoE release from Z-AI. It introduces the `glm47`\ntool call parser while retaining the GLM-4.5 reasoning parser. 
Built-in\nMulti-Token Prediction (MTP) layers enable speculative decoding for throughput\ngains on decode-heavy workloads.\n\nA smaller `zai-org/GLM-4.7-Flash` variant is also available for lower-latency\nscenarios.\n\n## Prerequisites\n\n- **vLLM version:** nightly recommended for GLM-4.7 (until packaged in a stable release)\n- **Hardware:** 4x-8x H200 (FP8), AMD MI300X / MI325X / MI355X for ROCm\n- **Python:** 3.10 - 3.13 (3.12 required for ROCm wheels)\n\n### Install vLLM (NVIDIA, nightly)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly\nuv pip install git+https://github.com/huggingface/transformers.git\n```\n\n### Install vLLM (AMD ROCm)\n\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm\n```\n\n## Launching the Server\n\n### Tensor Parallel + MTP (FP8 on 4xH200)\n\n```bash\nvllm serve zai-org/GLM-4.7-FP8 \\\n --tensor-parallel-size 4 \\\n --speculative-config.method mtp \\\n --speculative-config.num_speculative_tokens 1 \\\n --tool-call-parser glm47 \\\n --reasoning-parser glm45 \\\n --enable-auto-tool-choice\n```\n\n### AMD ROCm\n\n```bash\nSAFETENSORS_FAST_GPU=1 \\\nvllm serve zai-org/GLM-4.7 \\\n --tensor-parallel-size 8 \\\n --gpu-memory-utilization 0.9 \\\n --disable-log-requests \\\n --no-enable-prefix-caching \\\n --trust-remote-code\n```\n\n### Tuning Tips\n\n- `--max-model-len=65536` is a sensible default; max is 128K.\n- `--max-num-batched-tokens=32768` for prompt-heavy workloads; reduce to 8K-16K for latency-sensitive.\n- Use `--gpu-memory-utilization=0.95` to maximize KV cache.\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"zai-org/GLM-4.7-FP8\",\n messages=[{\"role\": \"user\", \"content\": \"Hello!\"}],\n max_tokens=512,\n)\nprint(resp.choices[0].message.content)\n```\n\n## Benchmarking\n\n```bash\nvllm 
bench serve \\\n --model zai-org/GLM-4.7-FP8 \\\n --dataset-name random \\\n --random-input-len 8000 \\\n --random-output-len 1000 \\\n --request-rate 10000 \\\n --num-prompts 16 \\\n --ignore-eos\n```\n\n## Troubleshooting\n\n- **Parser mismatch:** GLM-4.7 uses `--tool-call-parser glm47` (not `glm45`).\n- **MTP acceptance:** 1 speculative token gives ~90%+ acceptance and best throughput.\n\n## References\n\n- [Model card](https://huggingface.co/zai-org/GLM-4.7)\n- [FP8 checkpoint](https://huggingface.co/zai-org/GLM-4.7-FP8)\n- [GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash)\n- [vLLM docs](https://docs.vllm.ai/)\n" + } + }, + "zai-org/GLM-5.1": { + "hf_id": "zai-org/GLM-5.1", + "meta": { + "title": "GLM-5.1", + "provider": "GLM (Z-AI)", + "description": "GLM-5.1 refreshed version of GLM-5 \u2014 frontier-scale MoE language model (~744B total parameters) with MTP speculative decoding and thinking mode", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "744B" + }, + "recipe": { + "meta": { + "title": "GLM-5.1", + "slug": "glm-5.1", + "provider": "GLM (Z-AI)", + "description": "GLM-5.1 refreshed version of GLM-5 \u2014 frontier-scale MoE language model (~744B total parameters) with MTP speculative decoding and thinking mode", + "date_updated": "2026-05-21", + "difficulty": "advanced", + "tasks": [ + "text" + ], + "performance_headline": "Refreshed GLM-5 series MoE with improved reasoning, coding, and agentic performance", + "related_recipes": [], + "hardware": { + "h200": "verified" + } + }, + "model": { + "model_id": "zai-org/GLM-5.1", + "min_vllm_version": "0.19.1", + "architecture": "moe", + "parameter_count": "744B", + "active_parameters": "40B", + "context_length": 202752, + "base_args": [ + "--trust-remote-code", + "--chat-template-content-format=string" + ], + "base_env": {} + }, + "features": { + "tool_calling": { + "description": "GLM-4.7 tool call parser 
with automatic tool choice", + "args": [ + "--tool-call-parser", + "glm47", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "GLM-4.5 reasoning parser \u2014 thinking mode enabled by default on requests", + "args": [ + "--reasoning-parser", + "glm45" + ] + }, + "spec_decoding": { + "description": "Multi-Token Prediction speculative decoding (3 draft tokens)", + "args": [ + "--speculative-config.method", + "mtp", + "--speculative-config.num_speculative_tokens", + "3" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 1786, + "description": "Full precision BF16 \u2014 requires multi-node deployment" + }, + "fp8": { + "model_id": "zai-org/GLM-5.1-FP8", + "precision": "fp8", + "vram_minimum_gb": 893, + "description": "Native FP8 checkpoint \u2014 8xH200/H20 (141GB \u00d7 8) single-node serving" + }, + "nvfp4": { + "model_id": "nvidia/GLM-5.1-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 446, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep", + "pd_cluster" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nGLM-5.1 is a refreshed version of GLM-5, the 744B parameter frontier MoE model from Z-AI.\nIt keeps the asynchronous RL training recipe and delivers best-in-class open-source\nperformance on reasoning, coding, and agentic benchmarks. 
Both BF16 and native FP8\ncheckpoints are published.\n\nThinking mode is enabled by default; disable it by passing\n`\"chat_template_kwargs\": {\"enable_thinking\": false}` in request extras.\n\n## Prerequisites\n\n- **vLLM version:** 0.19.0 (stable \u2014 preferred over nightly for model performance).\n Use the latest main branch if you need tool calling + MTP simultaneously.\n- **Hardware (FP8):** 8xH200 or 8xH20 (141GB \u00d7 8)\n- **DeepGEMM (FP8):** install via `install_deepgemm.sh` from vLLM repo\n\n### Using Docker\n\n```bash\ndocker run --gpus all \\\n -p 8000:8000 \\\n --ipc=host \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai:latest zai-org/GLM-5.1-FP8 \\\n --tensor-parallel-size 8 \\\n --tool-call-parser glm47 \\\n --reasoning-parser glm45 \\\n --enable-auto-tool-choice \\\n --chat-template-content-format=string \\\n --served-model-name glm-5.1-fp8\n```\n\nUse `vllm/vllm-openai:latest-cu129` for CUDA 12.x.\n\n### Install vLLM from Source\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install \"vllm==0.19.0\" --torch-backend=auto\nuv pip install \"transformers>=5.4.0\"\n```\n\n## Launching the Server\n\n### FP8 on 8xH200 with MTP\n\n```bash\nvllm serve zai-org/GLM-5.1-FP8 \\\n --tensor-parallel-size 8 \\\n --speculative-config.method mtp \\\n --speculative-config.num_speculative_tokens 3 \\\n --tool-call-parser glm47 \\\n --reasoning-parser glm45 \\\n --enable-auto-tool-choice \\\n --chat-template-content-format=string \\\n --served-model-name glm-5.1-fp8\n```\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\n\n# Thinking ON (default)\nresp_on = client.chat.completions.create(\n model=\"glm-5.1-fp8\",\n messages=[{\"role\": \"user\", \"content\": \"Summarize GLM-5.1 in one sentence.\"}],\n temperature=1,\n max_tokens=4096,\n)\nprint(resp_on.choices[0].message.reasoning)\n\n# Thinking OFF\nresp_off = client.chat.completions.create(\n 
model=\"glm-5.1-fp8\",\n messages=[{\"role\": \"user\", \"content\": \"Summarize GLM-5.1 in one sentence.\"}],\n temperature=1,\n max_tokens=4096,\n extra_body={\"chat_template_kwargs\": {\"enable_thinking\": False}},\n)\n```\n\n### cURL (Thinking ON)\n\n```bash\ncurl http://localhost:8000/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"glm-5.1-fp8\",\n \"messages\": [\n {\"role\": \"user\", \"content\": \"Summarize GLM-5.1 in one sentence.\"}\n ],\n \"temperature\": 1,\n \"max_tokens\": 4096\n }'\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --model zai-org/GLM-5.1-FP8 \\\n --dataset-name random \\\n --random-input 8000 \\\n --random-output 1024 \\\n --request-rate 10 \\\n --num-prompts 32 \\\n --ignore-eos\n```\n\n## Troubleshooting\n\n- **Accuracy drift:** Prefer the 0.19.0 stable release for best accuracy.\n- **Tool calling + MTP:** If both are needed, use the latest vLLM main branch.\n- **FP8 installation:** DeepGEMM required for FP8 performance.\n\n## References\n\n- [Model card](https://huggingface.co/zai-org/GLM-5.1)\n- [FP8 checkpoint](https://huggingface.co/zai-org/GLM-5.1-FP8)\n- [DeepGEMM install script](https://github.com/vllm-project/vllm/blob/v0.16.0rc0/tools/install_deepgemm.sh)\n" + } + }, + "zai-org/GLM-5": { + "hf_id": "zai-org/GLM-5", + "meta": { + "title": "GLM-5", + "provider": "GLM (Z-AI)", + "description": "GLM-5 frontier-scale MoE language model (~744B total parameters, 28.5T training tokens) with asynchronous RL infrastructure for reasoning, coding, and agentic tasks", + "tasks": [ + "text" + ], + "hardware": { + "h200": "verified" + } + }, + "model_info": { + "architecture": "moe", + "parameter_count": "744B" + }, + "recipe": { + "meta": { + "title": "GLM-5", + "slug": "glm-5", + "provider": "GLM (Z-AI)", + "description": "GLM-5 frontier-scale MoE language model (~744B total parameters, 28.5T training tokens) with asynchronous RL infrastructure for reasoning, coding, and agentic tasks", 
+ "date_updated": "2026-04-17", + "difficulty": "advanced", + "tasks": [ + "text" + ], + "performance_headline": "Frontier-scale MoE with 744B parameters, best-in-class open-source performance on reasoning/coding/agents", + "related_recipes": [], + "hardware": { + "h200": "verified" + } + }, + "model": { + "model_id": "zai-org/GLM-5", + "min_vllm_version": "0.16.0", + "architecture": "moe", + "parameter_count": "744B", + "active_parameters": "40B", + "context_length": 202752, + "base_args": [ + "--trust-remote-code", + "--chat-template-content-format=string" + ], + "base_env": {} + }, + "dependencies": [ + { + "note": "Pin vllm==0.19.0 (avoid nightly)", + "command": "uv pip install \"vllm==0.19.0\" --torch-backend=auto" + }, + { + "note": "GLM-5 requires transformers >= 5.4.0", + "command": "uv pip install \"transformers>=5.4.0\"" + }, + { + "note": "Optional: DeepGEMM for FP8 MoE kernels (FP8 variant only)", + "command": "bash install_deepgemm.sh", + "optional": true + } + ], + "features": { + "tool_calling": { + "description": "GLM-4.7 tool call parser with automatic tool choice", + "args": [ + "--tool-call-parser", + "glm47", + "--enable-auto-tool-choice" + ] + }, + "reasoning": { + "description": "GLM-4.5 reasoning parser \u2014 thinking mode enabled by default on requests", + "args": [ + "--reasoning-parser", + "glm45" + ] + }, + "spec_decoding": { + "description": "Multi-Token Prediction speculative decoding (3 draft tokens)", + "args": [ + "--speculative-config.method", + "mtp", + "--speculative-config.num_speculative_tokens", + "3" + ] + } + }, + "opt_in_features": [ + "spec_decoding" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 1786, + "description": "Full precision BF16 \u2014 requires multi-node deployment" + }, + "fp8": { + "model_id": "zai-org/GLM-5-FP8", + "precision": "fp8", + "vram_minimum_gb": 893, + "description": "Native FP8 checkpoint \u2014 8xH200/H20 (141GB x 8) single-node serving" + }, + "nvfp4": { + 
"model_id": "nvidia/GLM-5-NVFP4", + "precision": "nvfp4", + "vram_minimum_gb": 446, + "description": "NVIDIA NVFP4 quantized weights for Blackwell GPUs", + "extra_args": [ + "--kv-cache-dtype", + "fp8" + ], + "extra_env": { + "VLLM_USE_FLASHINFER_MOE_FP4": "1" + } + } + }, + "compatible_strategies": [ + "single_node_tp", + "single_node_tep", + "multi_node_tp", + "multi_node_tp_pp", + "multi_node_tep", + "multi_node_dep", + "pd_cluster" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nGLM-5 is a significantly scaled-up language model with 744B parameters, trained on\n28.5T tokens using novel asynchronous RL infrastructure. It delivers best-in-class\nopen-source performance on reasoning, coding, and agentic tasks, rivaling frontier\nclosed-source models. GLM-5 is available in both BF16 and native FP8 precisions.\n\nThinking mode is enabled by default; disable it by passing\n`\"chat_template_kwargs\": {\"enable_thinking\": false}` in request extras.\n\n## Prerequisites\n\n- **vLLM version:** 0.19.0 (stable \u2014 preferred over nightly for model performance)\n- **Hardware (FP8):** 8xH200 or 8xH20 (141GB \u00d7 8)\n- **DeepGEMM (FP8):** install via `install_deepgemm.sh` from the vLLM repo\n\n### Using Docker\n\n```bash\ndocker run --gpus all \\\n -p 8000:8000 \\\n --ipc=host \\\n -v ~/.cache/huggingface:/root/.cache/huggingface \\\n vllm/vllm-openai:latest zai-org/GLM-5-FP8 \\\n --tensor-parallel-size 8 \\\n --tool-call-parser glm47 \\\n --reasoning-parser glm45 \\\n --enable-auto-tool-choice \\\n --chat-template-content-format=string\n```\n\nUse `vllm/vllm-openai:latest-cu129` for CUDA 12.x.\n\n### Install vLLM from Source\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install \"vllm==0.19.0\" --torch-backend=auto\nuv pip install \"transformers>=5.4.0\"\n```\n\nInstall DeepGEMM using `install_deepgemm.sh` from the vLLM tools directory.\n\n## Launching the Server\n\n### FP8 on 8xH200 with MTP\n\n```bash\nvllm serve 
zai-org/GLM-5-FP8 \\\n --tensor-parallel-size 8 \\\n --speculative-config.method mtp \\\n --speculative-config.num_speculative_tokens 3 \\\n --tool-call-parser glm47 \\\n --reasoning-parser glm45 \\\n --enable-auto-tool-choice \\\n --chat-template-content-format=string \\\n --served-model-name glm-5-fp8\n```\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\n\n# Thinking ON (default)\nresp_on = client.chat.completions.create(\n model=\"glm-5-fp8\",\n messages=[{\"role\": \"user\", \"content\": \"Summarize GLM-5 in one sentence.\"}],\n temperature=1,\n max_tokens=4096,\n)\nprint(\"thinking=on, think content:\\n\", resp_on.choices[0].message.reasoning)\n\n# Thinking OFF\nresp_off = client.chat.completions.create(\n model=\"glm-5-fp8\",\n messages=[{\"role\": \"user\", \"content\": \"Summarize GLM-5 in one sentence.\"}],\n temperature=1,\n max_tokens=4096,\n extra_body={\"chat_template_kwargs\": {\"enable_thinking\": False}},\n)\nprint(\"thinking=off:\\n\", resp_off.choices[0].message.reasoning)\n```\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --model zai-org/GLM-5-FP8 \\\n --dataset-name random \\\n --random-input 8000 \\\n --random-output 1024 \\\n --request-rate 10 \\\n --num-prompts 32 \\\n --ignore-eos\n```\n\nThe MTP acceptance rate can be relatively low in pure benchmarks; measured throughput\nmay underestimate real-world speed.\n\n## Troubleshooting\n\n- **Accuracy drift:** Prefer the 0.19.0 stable release over nightly for best accuracy.\n- **Tool calling + MTP:** If you need both, use the latest vLLM main branch.\n- **FP8 installation:** DeepGEMM is required for FP8 performance.\n\n## References\n\n- [Model card](https://huggingface.co/zai-org/GLM-5)\n- [FP8 checkpoint](https://huggingface.co/zai-org/GLM-5-FP8)\n- [DeepGEMM install script](https://github.com/vllm-project/vllm/blob/v0.16.0rc0/tools/install_deepgemm.sh)\n" + } + }, + "zai-org/GLM-ASR-Nano-2512": { + 
"hf_id": "zai-org/GLM-ASR-Nano-2512", + "meta": { + "title": "GLM-ASR-Nano-2512", + "provider": "GLM (Z-AI)", + "description": "Open-source speech recognition model (~2B) with strong dialect support (Cantonese and others) and robust low-volume speech transcription", + "tasks": [ + "multimodal" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "2.3B" + }, + "recipe": { + "meta": { + "title": "GLM-ASR-Nano-2512", + "slug": "glm-asr-nano-2512", + "provider": "GLM (Z-AI)", + "description": "Open-source speech recognition model (~2B) with strong dialect support (Cantonese and others) and robust low-volume speech transcription", + "date_updated": "2026-04-17", + "difficulty": "beginner", + "tasks": [ + "multimodal" + ], + "performance_headline": "Outperforms Whisper V3 on multiple benchmarks at compact 1.5B active / 2B total size", + "related_recipes": [] + }, + "model": { + "model_id": "zai-org/GLM-ASR-Nano-2512", + "min_vllm_version": "0.14.1", + "architecture": "dense", + "parameter_count": "2.3B", + "active_parameters": "1.5B", + "context_length": 8192, + "base_args": [], + "base_env": {} + }, + "dependencies": [ + { + "note": "Audio extras required for ASR (requires vllm>=0.14.1)", + "command": "uv pip install -U \"vllm[audio]\" --torch-backend auto" + }, + { + "note": "Install transformers from source for GLM-ASR tokenizer support", + "command": "uv pip install git+https://github.com/huggingface/transformers.git" + } + ], + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 11, + "description": "Full precision BF16 \u2014 single-GPU deployment" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nGLM-ASR-Nano-2512 is an open-source automatic speech recognition model with 1.5B\nactive parameters (2B total). 
It outperforms OpenAI Whisper V3 on multiple benchmarks\nwhile remaining compact enough for single-GPU deployment.\n\n### Key Capabilities\n\n- **Dialect support:** Beyond standard Mandarin and English, strong on Cantonese\n (\u7ca4\u8bed) and other Chinese dialects.\n- **Low-volume speech:** Specifically trained for \"whisper/quiet speech\" scenarios.\n- **SOTA accuracy:** Lowest average error rate (4.10) among comparable open-source\n models, strong on Wenet Meeting, Aishell-1, and similar Chinese benchmarks.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.14.1 (with `[audio]` extras)\n- **Transformers:** install from source for latest\n\n### Install Dependencies\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install git+https://github.com/huggingface/transformers.git\nuv pip install -U \"vllm[audio]\" --torch-backend auto\n```\n\n## Launching the Server\n\n```bash\nvllm serve zai-org/GLM-ASR-Nano-2512\n```\n\n## Client Usage\n\n### OpenAI SDK (Audio URL)\n\n```python\nimport base64\nimport httpx\nfrom openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\n\naudio_url = \"https://github.com/zai-org/GLM-ASR/raw/main/examples/example_en.wav\"\naudio_data = base64.b64encode(httpx.get(audio_url).content).decode(\"utf-8\")\n\nresponse = client.chat.completions.create(\n model=\"zai-org/GLM-ASR-Nano-2512\",\n messages=[{\n \"role\": \"user\",\n \"content\": [{\n \"type\": \"input_audio\",\n \"input_audio\": {\"data\": audio_data, \"format\": \"wav\"}\n }]\n }],\n max_tokens=500,\n)\nprint(response.choices[0].message.content)\n```\n\n### Transcribe Endpoint\n\n```python\nimport httpx\nfrom openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\n\naudio_file = httpx.get(\"https://github.com/zai-org/GLM-ASR/raw/main/examples/example_en.wav\").content\n\nresponse = client.audio.transcriptions.create(\n model=\"zai-org/GLM-ASR-Nano-2512\",\n file=(\"audio.wav\", 
audio_file),\n)\nprint(response.text)\n```\n\n### cURL (Transcribe)\n\n```bash\ncurl http://localhost:8000/v1/audio/transcriptions \\\n -H \"Authorization: Bearer EMPTY\" \\\n -F \"model=zai-org/GLM-ASR-Nano-2512\" \\\n -F \"file=@your_audio.wav\"\n```\n\n### Local Audio File (chat API)\n\n```python\nimport base64\nfrom openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\n\nwith open(\"your_audio.mp3\", \"rb\") as f:\n audio_data = base64.b64encode(f.read()).decode(\"utf-8\")\n\nresponse = client.chat.completions.create(\n model=\"zai-org/GLM-ASR-Nano-2512\",\n messages=[{\n \"role\": \"user\",\n \"content\": [{\"type\": \"input_audio\", \"input_audio\": {\"data\": audio_data, \"format\": \"mp3\"}}]\n }],\n max_tokens=500,\n)\nprint(response.choices[0].message.content)\n```\n\n## Troubleshooting\n\n- **Transformers version:** Requires `transformers >= 5.0.0` for best compatibility.\n- **Audio formats:** Supports wav, mp3, flac, and other common formats.\n\n## References\n\n- [Model card](https://huggingface.co/zai-org/GLM-ASR-Nano-2512)\n- [GitHub repo](https://github.com/zai-org/GLM-ASR)\n" + } + }, + "zai-org/GLM-Image": { + "hf_id": "zai-org/GLM-Image", + "meta": { + "title": "GLM-Image", + "provider": "GLM (Z-AI)", + "description": "Hybrid autoregressive + diffusion image generation model \u2014 text-to-image and image-to-image with strong text rendering and knowledge-intensive generation", + "tasks": [ + "omni" + ], + "hardware": { + "h100": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "16B" + }, + "recipe": { + "meta": { + "title": "GLM-Image", + "slug": "glm-image", + "provider": "GLM (Z-AI)", + "description": "Hybrid autoregressive + diffusion image generation model \u2014 text-to-image and image-to-image with strong text rendering and knowledge-intensive generation", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "omni" + ], + 
"performance_headline": "9B AR generator + 7B DiT decoder, state-of-the-art text rendering in generated images", + "related_recipes": [], + "hardware": { + "h100": "verified" + } + }, + "model": { + "model_id": "zai-org/GLM-Image", + "min_vllm_version": "0.11.0", + "architecture": "dense", + "parameter_count": "16B", + "active_parameters": "16B", + "context_length": 4096, + "base_args": [ + "--trust-remote-code" + ], + "base_env": {} + }, + "omni": { + "tasks": [ + "t2i", + "i2i" + ] + }, + "dependencies": [ + { + "note": "vllm-omni provides the diffusion decoder path", + "command": "uv pip install vllm-omni" + }, + { + "note": "transformers from source (GLM-Image tokenizer)", + "command": "uv pip install git+https://github.com/huggingface/transformers.git" + }, + { + "note": "diffusers from source \u2014 required for the DiT decoder", + "command": "uv pip install git+https://github.com/huggingface/diffusers.git" + } + ], + "features": {}, + "opt_in_features": [], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 38, + "description": "Single-GPU deployment (~33 GB for model weights, plus activation headroom)" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nGLM-Image is an image generation model with a hybrid architecture:\n\n- **Autoregressive Generator (9B):** initialized from GLM-4-9B-0414 with an expanded\n vocabulary for visual tokens. 
Produces a compact encoding (~256 tokens), then\n expands to 1K\u20134K tokens corresponding to 1K\u20132K resolution images.\n- **Diffusion Decoder (7B):** single-stream DiT that decodes latents into pixels.\n Includes a Glyph Encoder text module for accurate in-image text rendering.\n\nServed via vLLM-Omni for OpenAI-compatible online inference.\n\n### Key Capabilities\n\n- Text-to-image and image-to-image (editing, style transfer, identity-preserving)\n- Exceptional text rendering inside generated images\n- Strong knowledge-intensive generation\n\n## Prerequisites\n\n- **vLLM version:** latest (with `vllm-omni` extension)\n- **Transformers:** >= 5.0.0 (use source install for latest)\n- **Hardware:** single H100-class GPU (approx. 33 GB for weights)\n\n### Install Dependencies\n\n```bash\nuv venv --python 3.12 --seed\nsource .venv/bin/activate\n\nuv pip install -U vllm --torch-backend auto\nuv pip install vllm-omni\n\npip install git+https://github.com/huggingface/transformers.git\npip install git+https://github.com/huggingface/diffusers.git\n```\n\n## Online Serving\n\n```bash\nvllm serve zai-org/GLM-Image --omni\n```\n\n## Client Usage\n\n### OpenAI SDK (Text-to-Image)\n\n```python\nimport base64\nfrom openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\n\nresponse = client.chat.completions.create(\n model=\"zai-org/GLM-Image\",\n messages=[{\"role\": \"user\", \"content\": \"A beautiful landscape painting with mountains and a lake at sunset\"}],\n)\n\nimage_url = response.choices[0].message.content[0].image_url.url\nimage_data = base64.b64decode(image_url.split(\",\")[1])\nwith open(\"output.png\", \"wb\") as f:\n f.write(image_data)\n```\n\n### cURL (Text-to-Image)\n\n```bash\ncurl -s http://localhost:8000/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"messages\": [{\"role\": \"user\", \"content\": \"A beautiful landscape painting\"}]\n }' | jq -r 
'.choices[0].message.content[0].image_url.url' | cut -d',' -f2- | base64 -d > output.png\n```\n\n### Image-to-Image\n\n```python\nimport base64\nfrom openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\n\nwith open(\"input.png\", \"rb\") as f:\n image_base64 = base64.b64encode(f.read()).decode(\"utf-8\")\n\nresponse = client.chat.completions.create(\n model=\"zai-org/GLM-Image\",\n messages=[{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": f\"data:image/png;base64,{image_base64}\"}},\n {\"type\": \"text\", \"text\": \"Replace the background with a sunset beach scene\"}\n ]\n }],\n)\n\nimage_url = response.choices[0].message.content[0].image_url.url\nimage_data = base64.b64decode(image_url.split(\",\")[1])\nwith open(\"output.png\", \"wb\") as f:\n f.write(image_data)\n```\n\n## Offline Inference\n\n```bash\n# Text to Image\ncd examples/offline_inference/text_to_image\npython3 text_to_image.py --model zai-org/GLM-Image --output t2i_output.png\n\n# Image to Image\ncd examples/offline_inference/image_to_image\nwget https://vllm-public-assets.s3.us-west-2.amazonaws.com/omni-assets/qwen-bear.png\npython3 image_to_image.py --model zai-org/GLM-Image --image qwen-bear.png --output i2i_output.png\n```\n\n## Troubleshooting\n\n- **Resolution errors:** Target image dimensions must be divisible by 32.\n- **Text rendering:** Wrap text that should appear in the image with quotation marks\n in the prompt.\n- **Output stability:** Default `temperature=0.9`, `top_p=0.75`. 
Higher temperature\n gives more diverse outputs but may reduce stability.\n- **Transformers version:** Requires `transformers >= 5.0.0`.\n\n## References\n\n- [Model card](https://huggingface.co/zai-org/GLM-Image)\n- [Technical blog](https://z.ai/blog/glm-image)\n- [GitHub repo](https://github.com/zai-org/GLM-Image)\n" + } + }, + "zai-org/GLM-OCR": { + "hf_id": "zai-org/GLM-OCR", + "meta": { + "title": "GLM-OCR", + "provider": "GLM (Z-AI)", + "description": "GLM-OCR image-to-text model with built-in MTP speculative decoding for high-throughput OCR serving", + "tasks": [ + "multimodal" + ], + "hardware": {} + }, + "model_info": { + "architecture": "dense", + "parameter_count": "0.9B" + }, + "recipe": { + "meta": { + "title": "GLM-OCR", + "slug": "glm-ocr", + "provider": "GLM (Z-AI)", + "description": "GLM-OCR image-to-text model with built-in MTP speculative decoding for high-throughput OCR serving", + "date_updated": "2026-04-17", + "difficulty": "beginner", + "tasks": [ + "multimodal" + ], + "performance_headline": "Multilingual end-to-end OCR VLM with MTP-accelerated decoding", + "related_recipes": [] + }, + "model": { + "model_id": "zai-org/GLM-OCR", + "min_vllm_version": "0.12.0", + "architecture": "dense", + "parameter_count": "0.9B", + "active_parameters": "0.9B", + "context_length": 131072, + "base_args": [], + "base_env": {} + }, + "dependencies": [ + { + "note": "GLM-OCR requires the nightly vllm wheel", + "command": "uv pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly" + }, + { + "note": "transformers from source for GLM-OCR tokenizer support", + "command": "uv pip install git+https://github.com/huggingface/transformers.git" + } + ], + "features": { + "spec_decoding": { + "description": "Multi-Token Prediction speculative decoding using the model's built-in MTP layers", + "args": [ + "--speculative-config.method", + "mtp", + "--speculative-config.num_speculative_tokens", + "1" + ] + }, + "text_only": { + "description": "Skip 
loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only", + "encoder_parallel" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 2, + "description": "Full precision BF16 \u2014 single-GPU deployment" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\nGLM-OCR is a vision-language model for end-to-end OCR. It includes built-in\nMulti-Token Prediction (MTP) layers enabling speculative decoding for higher\nthroughput generation.\n\n## Prerequisites\n\n- **vLLM version:** nightly recommended (or latest stable with MTP support)\n- **Transformers:** >= 5.0.0 (install from source for latest)\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\nOr nightly:\n\n```bash\nuv pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly\nuv pip install git+https://github.com/huggingface/transformers.git\n```\n\n## Launching the Server\n\n### With MTP Speculative Decoding\n\n```bash\nvllm serve zai-org/GLM-OCR \\\n --speculative-config.method mtp \\\n --speculative-config.num_speculative_tokens 1\n```\n\n## Client Usage\n\n### OpenAI SDK\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\", timeout=3600)\n\nmessages = [{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": 
\"https://ofasys-multimodal-wlcb-3-toshanghai.oss-accelerate.aliyuncs.com/wpf272043/keepme/image/receipt.png\"}},\n {\"type\": \"text\", \"text\": \"Text Recognition:\"}\n ]\n}]\n\nresponse = client.chat.completions.create(\n model=\"zai-org/GLM-OCR\",\n messages=messages,\n max_tokens=2048,\n temperature=0.0,\n)\nprint(response.choices[0].message.content)\n```\n\n### cURL\n\n```bash\ncurl -s http://localhost:8000/v1/chat/completions \\\n -H \"Content-Type: application/json\" \\\n -d '{\n \"model\": \"zai-org/GLM-OCR\",\n \"messages\": [{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://example.com/receipt.png\"}},\n {\"type\": \"text\", \"text\": \"Text Recognition:\"}\n ]\n }],\n \"max_tokens\": 2048,\n \"temperature\": 0.0\n }'\n```\n\n## Troubleshooting\n\n- **Greedy sampling recommended:** Use `temperature=0.0` for optimal OCR accuracy.\n- **Transformers version:** Requires `transformers >= 5.0.0`.\n\n## References\n\n- [Model card](https://huggingface.co/zai-org/GLM-OCR)\n" + } + }, + "zai-org/Glyph": { + "hf_id": "zai-org/Glyph", + "meta": { + "title": "Glyph", + "provider": "GLM (Z-AI)", + "description": "Visual-text compression framework that renders long text into images and processes them with a reasoning VLM, scaling effective context length", + "tasks": [ + "multimodal" + ], + "hardware": { + "h100": "verified", + "mi300x": "verified", + "mi325x": "verified" + } + }, + "model_info": { + "architecture": "dense", + "parameter_count": "10B" + }, + "recipe": { + "meta": { + "title": "Glyph", + "slug": "glyph", + "provider": "GLM (Z-AI)", + "description": "Visual-text compression framework that renders long text into images and processes them with a reasoning VLM, scaling effective context length", + "date_updated": "2026-04-17", + "difficulty": "intermediate", + "tasks": [ + "multimodal" + ], + "performance_headline": "Reasoning multimodal model for visual-text compression, single-GPU deployable", + 
"related_recipes": [], + "hardware": { + "h100": "verified", + "mi300x": "verified", + "mi325x": "verified" + } + }, + "model": { + "model_id": "zai-org/Glyph", + "min_vllm_version": "0.11.0", + "architecture": "dense", + "parameter_count": "10B", + "active_parameters": "10B", + "context_length": 131072, + "base_args": [ + "--no-enable-prefix-caching", + "--mm-processor-cache-gb", + "0", + "--limit-mm-per-prompt.video", + "0" + ], + "base_env": {} + }, + "features": { + "reasoning": { + "description": "GLM-4.5 reasoning parser for extracting reasoning traces from Glyph outputs", + "args": [ + "--reasoning-parser", + "glm45" + ] + }, + "text_only": { + "description": "Skip loading the vision encoder for text-only workloads \u2014 frees VRAM for KV cache. Mutually exclusive with encoder_parallel.", + "args": [ + "--language-model-only" + ] + }, + "encoder_parallel": { + "description": "Run the vision encoder in data-parallel mode \u2014 avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.", + "args": [ + "--mm-encoder-tp-mode", + "data" + ] + } + }, + "opt_in_features": [ + "text_only", + "encoder_parallel" + ], + "variants": { + "default": { + "precision": "bf16", + "vram_minimum_gb": 24, + "description": "Full precision BF16 \u2014 single-GPU deployment on 1xH100" + } + }, + "compatible_strategies": [ + "single_node_tp", + "multi_node_tp" + ], + "hardware_overrides": {}, + "strategy_overrides": {}, + "guide": "## Overview\n\n[Glyph](https://github.com/thu-coai/Glyph) is a framework from Zhipu AI for scaling\ncontext length via visual-text compression. It renders long textual sequences into\nimages and processes them with a vision-language model. 
This recipe covers the vLLM\ndeployment of the `zai-org/Glyph` VLM as a component in that framework.\n\nGlyph is a reasoning multimodal model, so `--reasoning-parser glm45` is recommended\nto parse reasoning traces from outputs.\n\n## Prerequisites\n\n- **vLLM version:** latest stable\n- **Hardware:** 1x H100 or 1x MI300X/MI325X\n\n### Install vLLM (NVIDIA)\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend auto\n```\n\n### Install vLLM (AMD ROCm)\n\n```bash\nuv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm\n```\n\nROCm wheel requires Python 3.12, ROCm 7.0, glibc >= 2.35.\n\n## Launching the Server\n\n### Single H100 GPU\n\n```bash\nvllm serve zai-org/Glyph \\\n --no-enable-prefix-caching \\\n --mm-processor-cache-gb 0 \\\n --reasoning-parser glm45 \\\n --limit-mm-per-prompt.video 0\n```\n\n### Single MI300X / MI325X\n\n```bash\nVLLM_ROCM_USE_AITER=1 \\\nSAFETENSORS_FAST_GPU=1 \\\nvllm serve zai-org/Glyph \\\n --no-enable-prefix-caching \\\n --mm-processor-cache-gb 0 \\\n --reasoning-parser glm45 \\\n --limit-mm-per-prompt.video 0\n```\n\n### Configuration Tips\n\n- `--no-enable-prefix-caching` and `--mm-processor-cache-gb 0` are recommended for\n OCR-like workloads where image reuse is uncommon; they avoid unnecessary hashing\n and caching overhead.\n- Adjust `--max-num-batched-tokens` for throughput according to your hardware.\n\n## Benchmarking\n\n```bash\nvllm bench serve \\\n --model zai-org/Glyph \\\n --dataset-name random \\\n --random-input-len 8192 \\\n --random-output-len 512 \\\n --request-rate 10000 \\\n --num-prompts 16 \\\n --ignore-eos\n```\n\n## Troubleshooting\n\n- **Reasoning traces:** Use `--reasoning-parser glm45` to extract reasoning content.\n- **Slow first inference:** Disabling prefix caching and multimodal processor caching\n is intentional for Glyph's use case and trades off first-request latency for predictable throughput.\n\n## References\n\n- [Model 
card](https://huggingface.co/zai-org/Glyph)\n- [Glyph framework on GitHub](https://github.com/thu-coai/Glyph)\n- [vLLM deployment section of Glyph docs](https://github.com/thu-coai/Glyph?tab=readme-ov-file#model-deployment-vllm-acceleration)\n" + } + } + } +} \ No newline at end of file diff --git a/skills/serving-llms-on-instinct/reference.md b/skills/serving-llms-on-instinct/reference.md new file mode 100644 index 0000000..277e44e --- /dev/null +++ b/skills/serving-llms-on-instinct/reference.md @@ -0,0 +1,107 @@ +# serving-llms-on-instinct -- Reference + +## Table of Contents +1. [Precision Compatibility](#precision-compatibility) +2. [Docker Flags](#docker-flags) +3. [Known Quirks](#known-quirks) + +--- + +## Precision Compatibility + +| Format | gfx942 (MI300X) | gfx950 (MI350X) | Notes | +|---|---|---|---| +| BF16 / FP16 | Native | Native | Default for all models | +| FP8 (FNUZ) | Native | Emulated | MI300X uses E4M3FNUZ dialect | +| FP8 (OCP) | Emulated | Native | MI350X uses E4M3FN (OCP standard) | +| INT8 | Native | Native | | +| MXFP4 | Emulated | Native | On gfx942: compute dequants to BF16, weights stay compressed | +| MXFP6 | Emulated | Native | On gfx942: compute dequants to BF16, weights stay compressed | +| NVFP4 | Not supported | Not supported | NVIDIA-specific, no dequant kernel on ROCm | + +"Emulated" means compute is handled via dequantization to BF16 during matmul. +Weights stay in their compressed format in VRAM, so quantized models still +benefit from reduced memory. vLLM auto-converts between FP8 dialects +(FNUZ/OCP) transparently. NVFP4 models (e.g. `nvidia/*-NVFP4`) will not +load on AMD GPUs -- use FP8 or MXFP4 alternatives instead. 
+ +### VRAM Estimation + +Use `scripts/estimate_vram.py` to estimate weight memory and KV cache +requirements from the HuggingFace Hub API (no model download): +```bash +python3 scripts/estimate_vram.py --model-id <model-id> --vram-gb <vram-gb> +``` +Returns JSON with `weight_memory_gb`, `kv_cache_bytes_per_token`, +achievable context length, and fit status. The script reserves ~4 GB for +vLLM's runtime overhead (activation profiling, HIP graph capture, internal +buffers). Weight memory is derived from safetensors metadata (tested: +GPT-OSS-120B reports 65 GB, vLLM logs show 68.7 GB actual load on MI300X). +KV cache per token is calculated from the model's `config.json` architecture +parameters. MLA models (DeepSeek-R1/V3) are detected and use their compressed +KV dimensions. + +--- + +## Docker Flags + +### Mandatory (all AMD Instinct) + +| Flag | Why | +|---|---| +| `--group-add=video` | amdgpu exposes GPUs to the `video` group | +| `--group-add=render` | GPU render nodes require the `render` group on many hosts | +| `--cap-add=SYS_PTRACE` | ROCm JIT compilation requires ptrace | +| `--security-opt seccomp=unconfined` | ROCm mmap variants blocked by default seccomp | +| `--device /dev/kfd` | Kernel Fusion Driver -- primary GPU access | +| `--device /dev/dri` | Render nodes for GPU command submission | +| `--ipc=host` | ROCm shared memory needs host IPC namespace | + +### Docker image + +`vllm/vllm-openai-rocm:<tag>` -- tag is auto-resolved from Docker Hub +during recipe sync. Includes gfx942 and gfx950 kernels. +Do NOT use `vllm/vllm-openai` (CUDA-only). + +### GPU visibility + +| Variable | Rule | +|---|---| +| `CUDA_VISIBLE_DEVICES` | ROCm maps this to `HIP_VISIBLE_DEVICES`. Works with explicit indices (e.g. `0,1`). **Never set to empty string** -- hides all GPUs. | +| `HIP_VISIBLE_DEVICES` | Canonical AMD variable. Use to restrict visible GPUs by index on multi-GPU hosts.
| + +--- + +## Known Quirks + +**vLLM #34641 -- FP4BMM crash on gfx942** +Segfault or illegal instruction during model warmup on MI300X/MI325X/MI300A. +Triggered when `VLLM_ROCM_USE_AITER_FP4BMM=1` on gfx942. +Fix: always set `VLLM_ROCM_USE_AITER_FP4BMM=0` on gfx942. +This is set correctly in `data/gpu_overrides.json` for gfx942. + +**CUDA_VISIBLE_DEVICES empty string** +ROCm maps `CUDA_VISIBLE_DEVICES` to `HIP_VISIBLE_DEVICES`. Setting it to an +empty string hides all GPUs. Setting it to explicit indices (e.g. `0,1`) works +correctly. If the host has it set to empty, unset it: `unset CUDA_VISIBLE_DEVICES`. +Do not pass `--env CUDA_VISIBLE_DEVICES=` (empty) into Docker. + +**NUMA balancing latency spikes** +`/proc/sys/kernel/numa_balancing=1` periodically migrates pages between NUMA +nodes. For GPU workloads this causes latency spikes as GPU DMA must follow +moved pages. Disable: `echo 0 | sudo tee /proc/sys/kernel/numa_balancing` +Non-persistent -- resets on reboot. + +**First-token warmup delay** +vLLM compiles and caches HIP kernels on first use per input shape. +First inference after model load: ~40-45 seconds on gfx942. +Send a warmup request immediately after `/health` returns 200. + +**"Engine core initialization failed"** +This opaque error covers many root causes. Check early container logs +(`docker logs <container> 2>&1 | head -50`). Common causes: +- Gated model: HF license not accepted (not just missing token) +- Unsupported architecture on this vLLM version +- OOM during weight loading +- Missing `--trust-remote-code` for custom model architectures +- vLLM version too old (check `min_vllm_version` in the recipe) diff --git a/skills/serving-llms-on-instinct/scripts/detect.py b/skills/serving-llms-on-instinct/scripts/detect.py new file mode 100644 index 0000000..7cb77dc --- /dev/null +++ b/skills/serving-llms-on-instinct/scripts/detect.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +""" +Detect AMD GPU hardware via amd-smi.
+ +Usage: + python scripts/detect.py + python scripts/detect.py --host root@10.0.0.5 + +Output: JSON with gpu_count, gfx_version (first GPU), rocm_version, full GPU list. +Exits 0 on success, 1 on failure. + +Env vars (used when --host is not given): + ROCM_SSH_HOST -- remote host + ROCM_SSH_USER -- SSH user (default: root) + ROCM_SSH_PORT -- SSH port (default: 22) +""" + +import argparse +import json +import os +import subprocess +import sys + + +def _is_local(host): + return not host or host in ("local", "localhost", "127.0.0.1") + + +def _run(cmd, host, user, port, timeout=30): + try: + if _is_local(host): + r = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=timeout) + else: + ssh_target = f"{user}@{host}" if user else host + ssh = [ + "ssh", + "-o", "StrictHostKeyChecking=accept-new", + "-o", "ConnectTimeout=15", + "-o", "BatchMode=yes", + "-o", "LogLevel=ERROR", + "-p", str(port), + ssh_target, cmd, + ] + r = subprocess.run(ssh, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=timeout) + return r.returncode, r.stdout, r.stderr + except subprocess.TimeoutExpired: + target = f"{user}@{host}" if user else host + return 1, "", f"Command timed out after {timeout}s on {target}" + + +def main(): + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--host", default="", help="[user@]host (default: local or ROCM_SSH_HOST)") + parser.add_argument("--user", default="", help="SSH user (default: root)") + parser.add_argument("--port", type=int, default=0) + args = parser.parse_args() + + host = args.host + user = args.user + if "@" in host: + user, host = host.split("@", 1) + + host = host or os.environ.get("ROCM_SSH_HOST", "") + user = user or os.environ.get("ROCM_SSH_USER", "") + port = args.port or int(os.environ.get("ROCM_SSH_PORT", "22")) + + rc, out, err = _run("amd-smi static --asic --vram --json", host, user, port) + 
if rc != 0 and "required groups" in err: + # User not in video/render group -- retry with sudo + rc, out, err = _run("sudo amd-smi static --asic --vram --json", host, user, port) + if rc != 0: + print(json.dumps({ + "error": "amd-smi failed", + "detail": err.strip() or f"exit code {rc}", + "hint": "Is amd-smi installed? Is amdgpu kernel module loaded? Try: lsmod | grep amdgpu", + })) + sys.exit(1) + + try: + data = json.loads(out) + except json.JSONDecodeError as e: + print(json.dumps({"error": f"amd-smi JSON parse failed: {e}", "raw": out[:200]})) + sys.exit(1) + + if isinstance(data, list): + gpu_list = data + elif isinstance(data, dict): + gpu_list = data.get("gpu_data", [data]) + else: + gpu_list = [data] + gpus = [] + for entry in gpu_list: + asic = entry.get("asic", {}) + vram = entry.get("vram", {}) + vram_size = vram.get("size", {}) + vram_mb = vram_size.get("value") if isinstance(vram_size, dict) else vram_size + gpus.append({ + "index": entry.get("gpu", len(gpus)), + "market_name": asic.get("market_name", "Unknown"), + "gfx_version": asic.get("target_graphics_version", "unknown").lower(), + "vram_gb": round(vram_mb / 1024, 1) if vram_mb else None, + "vram_type": vram.get("type"), + "compute_units": asic.get("num_compute_units"), + }) + + rocm_version = "unknown" + rc2, out2, err2 = _run("amd-smi version --json", host, user, port, timeout=10) + if rc2 != 0 and "required groups" in err2: + rc2, out2, _ = _run("sudo amd-smi version --json", host, user, port, timeout=10) + if rc2 == 0: + try: + vdata = json.loads(out2) + if isinstance(vdata, list) and vdata: + rocm_version = vdata[0].get("rocm_version", "unknown") + elif isinstance(vdata, dict): + rocm_version = vdata.get("rocm_version", "unknown") + except json.JSONDecodeError: + pass + + print(json.dumps({ + "gpu_count": len(gpus), + "gfx_version": gpus[0]["gfx_version"] if gpus else "unknown", + "rocm_version": rocm_version, + "target": "local" if _is_local(host) else host, + "gpus": gpus, + }, indent=2)) 
+ + +if __name__ == "__main__": + main() diff --git a/skills/serving-llms-on-instinct/scripts/estimate_vram.py b/skills/serving-llms-on-instinct/scripts/estimate_vram.py new file mode 100644 index 0000000..8f606b8 --- /dev/null +++ b/skills/serving-llms-on-instinct/scripts/estimate_vram.py @@ -0,0 +1,254 @@ +#!/usr/bin/env python3 +""" +Estimate VRAM requirements for a HuggingFace model. + +Reports weight memory (from safetensors metadata) and KV cache per token +(from model config). With --vram-gb, estimates achievable context length +and whether the model fits. + +Usage: + python3 scripts/estimate_vram.py --model-id Qwen/Qwen3-32B + python3 scripts/estimate_vram.py --model-id Qwen/Qwen3-32B --vram-gb 192 + python3 scripts/estimate_vram.py --model-id Qwen/Qwen3-32B --vram-gb 192 --tp 2 + +Output: JSON to stdout. Exits 0 on success, 1 on failure. + +Env vars: + HF_TOKEN -- required for gated/private models +""" + +import argparse +import json +import math +import os +import sys +import urllib.request +import urllib.error + +HF_BASE = "https://huggingface.co" + +DTYPE_BYTES = { + "F64": 8, "F32": 4, "F16": 2, "BF16": 2, + "I64": 8, "I32": 4, "I16": 2, "I8": 1, "U8": 1, + "BOOL": 1, "F8_E4M3": 1, "F8_E5M2": 1, +} + + +def _fetch(url, token=None): + headers = {"User-Agent": "estimate-vram/1.0"} + if token: + headers["Authorization"] = f"Bearer {token}" + req = urllib.request.Request(url, headers=headers) + try: + with urllib.request.urlopen(req, timeout=30) as r: + return json.loads(r.read()), None + except urllib.error.HTTPError as e: + if e.code == 401: + return None, "Authentication required. Set HF_TOKEN for gated/private models." + if e.code == 403: + return None, "Access denied. Accept the model license at huggingface.co." + if e.code == 404: + return None, "Not found." + return None, f"HTTP {e.code}: {e.reason}" + except Exception as e: + return None, str(e) + + +def _weight_memory(model_id, revision, token): + """Get weight memory in bytes. 
+ + Uses two signals and picks the more reliable one: + 1. safetensors metadata (dtype x param count) from the model info API + 2. raw .safetensors file sizes from the tree API + + For standard BF16/FP16 checkpoints both agree. For quantized models + (QAT, GPTQ, AWQ) the metadata reports packed INT32 containers while the + actual data is 4-bit, making (1) vastly overestimate. File sizes are + always ground truth because safetensors is uncompressed, so when both + are available we take the smaller value. + """ + metadata_bytes = 0 + file_bytes = 0 + + # Signal 1: safetensors dtype x count from model info API + url = f"{HF_BASE}/api/models/{model_id}?expand[]=safetensors" + info, err = _fetch(url, token) + if info: + params = info.get("safetensors", {}).get("parameters", {}) + if params: + metadata_bytes = sum( + count * DTYPE_BYTES.get(dtype, 2) + for dtype, count in params.items() + ) + + # Signal 2: raw file sizes from tree API + tree_url = f"{HF_BASE}/api/models/{model_id}/tree/{revision}" + entries, tree_err = _fetch(tree_url, token) + if entries and isinstance(entries, list): + file_bytes = sum( + e.get("size", 0) for e in entries + if e.get("type") == "file" + and e.get("path", "").endswith(".safetensors") + ) + + # Pick the best estimate + if metadata_bytes and file_bytes: + if file_bytes < metadata_bytes * 0.8: + # Large gap means quantized weights packed in wider containers. + return file_bytes, "file_sizes", None + return metadata_bytes, "safetensors_metadata", None + if metadata_bytes: + return metadata_bytes, "safetensors_metadata", None + if file_bytes: + return file_bytes, "file_sizes", None + + return 0, None, err or tree_err or "No safetensors files found" + + +def _model_config(model_id, revision, token): + """Fetch config.json. 
Handles nested configs (VLMs, multimodal).""" + url = f"{HF_BASE}/{model_id}/resolve/{revision}/config.json" + config, err = _fetch(url, token) + if not config: + return None, err + + # Some multimodal models nest the LLM config under a sub-key + if "num_hidden_layers" not in config: + for key in ("text_config", "language_config", "llm_config"): + sub = config.get(key, {}) + if "num_hidden_layers" in sub: + # Merge sub-config but keep top-level max_position_embeddings + max_seq = config.get("max_position_embeddings", + sub.get("max_position_embeddings")) + merged = dict(sub) + if max_seq: + merged["max_position_embeddings"] = max_seq + return merged, None + + return config, None + + +def _kv_per_token(config): + """KV cache bytes per token at BF16. Returns (bytes, details).""" + if not config: + return 0, {} + + n_layers = config.get("num_hidden_layers", 0) + if not n_layers: + return 0, {} + + n_kv = config.get("num_key_value_heads", + config.get("num_attention_heads", 0)) + hdim = config.get("head_dim", 0) + if not hdim: + hsz = config.get("hidden_size", 0) + n_heads = config.get("num_attention_heads", 1) + hdim = hsz // n_heads if n_heads else 0 + + details = {"num_layers": n_layers, "num_kv_heads": n_kv, "head_dim": hdim} + + # MLA (DeepSeek-R1/V3): compressed KV via latent projection + if "kv_lora_rank" in config: + kv_rank = config["kv_lora_rank"] + rope_dim = config.get("qk_rope_head_dim", 0) + kv = 2 * n_layers * (kv_rank + rope_dim) * 2 + details.update(mla=True, kv_lora_rank=kv_rank, qk_rope_head_dim=rope_dim) + return kv, details + + # Standard: 2 (K+V) * layers * kv_heads * head_dim * 2 bytes (bf16) + return 2 * n_layers * n_kv * hdim * 2, details + + +def main(): + p = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + p.add_argument("--model-id", required=True, help="HuggingFace model ID") + p.add_argument("--revision", default="main") + p.add_argument("--vram-gb", type=float, default=0, + 
help="Per-GPU VRAM in GB (enables fit check)") + p.add_argument("--tp", type=int, default=1, help="Tensor parallelism") + p.add_argument("--gpu-memory-utilization", type=float, default=0.9) + args = p.parse_args() + + token = os.environ.get("HF_TOKEN", "") + + # Weight memory + w_bytes, source, err = _weight_memory(args.model_id, args.revision, token) + if not w_bytes: + print(json.dumps({"error": f"Cannot estimate weight memory: {err}", + "model_id": args.model_id})) + sys.exit(1) + + w_gb = round(w_bytes / (1024**3), 1) + + # Model config and KV cache + config, _ = _model_config(args.model_id, args.revision, token) + kv_bytes, kv_details = _kv_per_token(config) + max_seq = config.get("max_position_embeddings") if config else None + + result = { + "model_id": args.model_id, + "weight_memory_gb": w_gb, + "source": source, + } + + if kv_bytes: + result["kv_cache_bytes_per_token"] = kv_bytes + result["kv_cache"] = kv_details + if max_seq: + result["model_max_seq_len"] = max_seq + + # Fit estimation + if args.vram_gb > 0: + tp = args.tp + util = args.gpu_memory_utilization + w_per_gpu = round(w_gb / tp, 1) + usable = round(args.vram_gb * util, 1) + # Reserve for activations, HIP graph capture, internal buffers. + # vLLM profiles peak activation memory then captures HIP graphs; + # together these use ~4 GB beyond model weights on typical models. 
+ overhead = 4.0 + remaining = round(max(0, usable - w_per_gpu - overhead), 1) + + fit = { + "gpu_vram_gb": args.vram_gb, + "tp": tp, + "gpu_memory_utilization": util, + "weight_per_gpu_gb": w_per_gpu, + "usable_vram_gb": usable, + "overhead_gb": overhead, + "remaining_for_kv_gb": remaining, + "weights_fit": w_per_gpu < usable, + } + + if not fit["weights_fit"]: + raw = math.ceil(w_gb / (usable - overhead)) + min_tp = 1 + while min_tp < raw: + min_tp *= 2 + fit["min_tp_required"] = min_tp + + if kv_bytes > 0 and remaining > 0: + kv_per_gpu = kv_bytes / tp + rem_bytes = remaining * (1024**3) + ctx_bf16 = int(rem_bytes / kv_per_gpu) + ctx_fp8 = int(rem_bytes / (kv_per_gpu / 2)) + + fit["max_seq_len_bf16_kv"] = ctx_bf16 + fit["max_seq_len_fp8_kv"] = ctx_fp8 + + if max_seq: + rec = min(ctx_bf16, max_seq) + rec = (rec // 1024) * 1024 + fit["recommended_max_model_len"] = rec + fit["context_limited"] = ctx_bf16 < max_seq + + result["fit"] = fit + + print(json.dumps(result, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/skills/serving-llms-on-instinct/scripts/sync_recipes.py b/skills/serving-llms-on-instinct/scripts/sync_recipes.py new file mode 100644 index 0000000..02bd420 --- /dev/null +++ b/skills/serving-llms-on-instinct/scripts/sync_recipes.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 +""" +Sync vLLM recipes and Docker Hub tags into a local cache. + +Fetches: + 1. Shallow clone of vllm-project/recipes from GitHub + 2. Reads all model YAML files from models//.yaml + 3. Latest stable Docker image tag from Docker Hub API + +Writes output to: data/recipes_cache.json + +Usage: + python3 scripts/sync_recipes.py # refresh cache + python3 scripts/sync_recipes.py --verbose # show progress + +Exit 0 always (callers fall back to existing cache on failure). 
def _clone_recipes(verbose=False):
    """Shallow-clone the recipes repo into a fresh temp directory.

    Args:
        verbose: When true, progress is logged via ``_log``.

    Returns:
        Path of the temporary directory holding the clone. The caller owns
        it and is responsible for removing it.

    Raises:
        RuntimeError: ``git clone`` exited with a non-zero status.
        subprocess.TimeoutExpired: the clone did not finish within 30 s.
        OSError: ``git`` could not be started (e.g. not installed).

    The temp directory is removed on every failure path, so a failed sync
    leaves nothing behind.
    """
    tmpdir = tempfile.mkdtemp(prefix="vllm-recipes-")
    _log(f"Cloning {REPO_URL} (shallow)...", verbose)
    try:
        r = subprocess.run(
            ["git", "clone", "--depth=1", "--single-branch", "--filter=blob:none",
             REPO_URL, tmpdir],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=30,
        )
    except Exception:
        # Timeout or missing git: do not leak the temp directory.
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise
    if r.returncode != 0:
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise RuntimeError(f"git clone failed: {r.stderr[:200]}")
    return tmpdir
def _fetch_docker_tag(verbose=False):
    """Pick a vllm-openai-rocm image tag from the Docker Hub tag listing.

    Walks the listing returned for ``ordering=last_updated`` and returns the
    first tag that is either a ``v<major>.<minor>...`` release or ``latest``,
    skipping any tag whose name contains "nightly" or "base".

    Returns:
        ``(tag_name, last_updated)``; ``("latest", "")`` when curl fails or
        no tag qualifies.
    """
    _log("Fetching Docker Hub tags...", verbose)
    listing_url = f"{DOCKERHUB_URL}?page_size=50&ordering=last_updated"
    curl = subprocess.run(
        ["curl", "-sf", "--max-time", "5", listing_url],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=10,
    )
    fallback = ("latest", "")
    if curl.returncode != 0:
        return fallback

    payload = json.loads(curl.stdout)
    for entry in payload.get("results", []):
        tag_name = entry["name"]
        if "nightly" in tag_name or "base" in tag_name:
            continue
        is_release = tag_name.startswith("v") and re.match(r"v\d+\.\d+", tag_name)
        if is_release or tag_name == "latest":
            return tag_name, entry.get("last_updated", "")

    return fallback
def main():
    """CLI entry point: refresh the recipe cache and print a JSON status line.

    Prints ``{"status": "ok"|"partial", "cache": ...}`` depending on what
    ``sync`` returns, or ``{"status": "failed", "error": ...}`` if it raises.
    Always exits with code 0.
    """
    cli = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    cli.add_argument("--verbose", "-v", action="store_true")
    options = cli.parse_args()

    try:
        status = "ok" if sync(verbose=options.verbose) else "partial"
        print(json.dumps({"status": status, "cache": CACHE_FILE}))
    except Exception as exc:
        print(f"WARN: sync_recipes failed: {exc}", file=sys.stderr)
        print(json.dumps({"status": "failed", "error": str(exc)}))
    sys.exit(0)
+ +Env vars: + ROCM_SSH_HOST, ROCM_SSH_USER, ROCM_SSH_PORT +""" + +import argparse +import json +import os +import subprocess +import sys + + +def _is_local(host): + return not host or host in ("local", "localhost", "127.0.0.1") + + +def _run(cmd, host, user, port, timeout=20): + try: + if _is_local(host): + r = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=timeout) + else: + ssh_target = f"{user}@{host}" if user else host + ssh = [ + "ssh", + "-o", "StrictHostKeyChecking=accept-new", + "-o", "ConnectTimeout=15", + "-o", "BatchMode=yes", + "-o", "LogLevel=ERROR", + "-p", str(port), + ssh_target, cmd, + ] + r = subprocess.run(ssh, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=timeout) + return r.returncode, r.stdout.strip(), r.stderr.strip() + except subprocess.TimeoutExpired: + target = f"{user}@{host}" if user else host + return 1, "", f"Command timed out after {timeout}s on {target}" + + +def main(): + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--host", default="", help="[user@]host (default: local or ROCM_SSH_HOST)") + parser.add_argument("--user", default="", help="SSH user (default: root)") + parser.add_argument("--port", type=int, default=0) + parser.add_argument("--auto-fix", action="store_true", help="Apply safe fixes without prompting") + args = parser.parse_args() + + host = args.host + user = args.user + if "@" in host: + user, host = host.split("@", 1) + + host = host or os.environ.get("ROCM_SSH_HOST", "") + user = user or os.environ.get("ROCM_SSH_USER", "") + port = args.port or int(os.environ.get("ROCM_SSH_PORT", "22")) + + issues = [] + fixes_applied = [] + + # /dev/kfd + rc, out, _ = _run("test -e /dev/kfd && echo exists || echo missing", host, user, port) + if "missing" in out: + issues.append({ + "check": "dev_kfd", + "severity": "error", + "message": "/dev/kfd not found. 
The amdgpu kernel module is not loaded or the driver is not installed.", + "fix": "sudo modprobe amdgpu # or install ROCm driver", + }) + else: + rc2, out2, _ = _run("test -r /dev/kfd && echo ok || echo denied", host, user, port) + if "denied" in out2: + # Docker passes --device /dev/kfd directly, so host user permissions + # don't block containerized workloads. Downgrade to warning. + issues.append({ + "check": "dev_kfd", + "severity": "warning", + "message": "/dev/kfd exists but current user is not in video/render group. Docker containers will still work.", + "fix": "sudo usermod -aG video,render $USER # then re-login (only needed for non-Docker use)", + }) + + # /dev/dri + rc, out, _ = _run("ls /dev/dri/renderD* 2>/dev/null | wc -l", host, user, port) + try: + render_count = int(out) + except ValueError: + render_count = 0 + if render_count == 0: + issues.append({ + "check": "dev_dri", + "severity": "error", + "message": "No /dev/dri/renderD* nodes found. GPU render nodes not present.", + "fix": "Check that the amdgpu driver is loaded: lsmod | grep amdgpu", + }) + + # Docker + rc, out, err = _run("docker ps -q 2>&1 | head -1", host, user, port) + if rc != 0 or "permission denied" in err.lower() or "cannot connect" in err.lower(): + issues.append({ + "check": "docker", + "severity": "error", + "message": f"Docker not accessible: {err or 'docker ps failed'}", + "fix": "Start Docker: sudo systemctl start docker | Or add user to docker group: sudo usermod -aG docker $USER", + }) + + # NUMA balancing + rc, out, _ = _run("cat /proc/sys/kernel/numa_balancing 2>/dev/null || echo 0", host, user, port) + numa_val = out.strip() + if numa_val == "1": + if args.auto_fix: + rc2, _, _ = _run("echo 0 | sudo tee /proc/sys/kernel/numa_balancing > /dev/null", host, user, port) + if rc2 == 0: + fixes_applied.append("NUMA balancing disabled (non-persistent, resets on reboot)") + else: + issues.append({ + "check": "numa_balancing", + "severity": "warning", + "message": "NUMA 
balancing is enabled. Causes latency spikes during GPU inference.", + "fix": "echo 0 | sudo tee /proc/sys/kernel/numa_balancing", + }) + else: + issues.append({ + "check": "numa_balancing", + "severity": "warning", + "message": "NUMA balancing is enabled. Causes latency spikes during GPU inference.", + "fix": "echo 0 | sudo tee /proc/sys/kernel/numa_balancing (or run with --auto-fix)", + }) + + # hipBLASLt + rc, out, _ = _run("ls /opt/rocm/lib/libhipblaslt* 2>/dev/null | head -1", host, user, port) + if not out.strip(): + issues.append({ + "check": "hipblaslt", + "severity": "warning", + "message": "hipBLASLt not found at /opt/rocm/lib/. GEMM performance may be reduced.", + "fix": "Ensure ROCm is fully installed: sudo apt install hipblaslt or reinstall ROCm", + }) + + # HF_TOKEN + rc, out, _ = _run("printenv HF_TOKEN | head -c 4", host, user, port) + if not out.strip(): + issues.append({ + "check": "hf_token", + "severity": "advisory", + "message": "HF_TOKEN not set. Required for gated models (Llama, Gemma). Not needed for Qwen3.", + "fix": "export HF_TOKEN=hf_...", + }) + + # vLLM Docker image + rc, out, _ = _run("docker images vllm/vllm-openai-rocm --format '{{.Tag}}' 2>/dev/null | head -1", host, user, port) + if not out.strip(): + issues.append({ + "check": "vllm_image", + "severity": "advisory", + "message": "vllm/vllm-openai-rocm image not pulled yet. First launch will download ~20GB.", + "fix": "docker pull vllm/vllm-openai-rocm:latest", + }) + + # CUDA_VISIBLE_DEVICES footgun -- empty string hides all GPUs, explicit indices are OK + rc, out, _ = _run("env | grep -c '^CUDA_VISIBLE_DEVICES=' || true", host, user, port) + if out.strip() and out.strip() != "0": + rc2, val, _ = _run("printenv CUDA_VISIBLE_DEVICES", host, user, port) + raw_val = val.strip() + if raw_val == "": + issues.append({ + "check": "cuda_visible_devices", + "severity": "error", + "message": "CUDA_VISIBLE_DEVICES is set to '' (empty string). 
This hides all GPUs from the ROCm runtime.", + "fix": "unset CUDA_VISIBLE_DEVICES", + }) + else: + issues.append({ + "check": "cuda_visible_devices", + "severity": "advisory", + "message": f"CUDA_VISIBLE_DEVICES is set to {raw_val}. ROCm maps this to HIP_VISIBLE_DEVICES. Only the listed GPUs will be visible.", + "fix": "unset CUDA_VISIBLE_DEVICES # to use all GPUs", + }) + + errors = [i for i in issues if i["severity"] == "error"] + warnings = [i for i in issues if i["severity"] == "warning"] + advisories = [i for i in issues if i["severity"] == "advisory"] + + result = { + "ready": len(errors) == 0, + "target": "local" if _is_local(host) else host, + "errors": errors, + "warnings": warnings, + "advisories": advisories, + "fixes_applied": fixes_applied, + } + print(json.dumps(result, indent=2)) + sys.exit(0 if len(errors) == 0 else 1) + + +if __name__ == "__main__": + main()