NVIDIA · ChenhanYu · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
@@ -63,10 +63,27 @@ def __init__(self, model_dir, max_concurrent_requests, sampling_kwargs, **kwargs
                 specdec["disable_padded_drafter_batch"] = True
                 specdec["parallel_draft_block_sizes"] = kwargs.get("parallel_draft_block_sizes")
         elif kwargs.get("speculative_algorithm") == "MTP":
-            specdec = {
-                "method": "mtp",
-                "num_speculative_tokens": kwargs.get("speculative_num_steps", 3),
-            }
+            draft_model_dir = kwargs.get("draft_model_dir")
+            if draft_model_dir:
+                # Assistant-model MTP (e.g. Gemma 4): vLLM's Gemma4 MTP
+                # support (vllm-project/vllm#41745) expects
+                # ``speculative_config={"model": <assistant>, ...}`` with
+                # no ``method`` key — vLLM auto-detects Gemma4 from the
+                # assistant model. Passing ``method: "mtp"`` here triggers
+                # ``NotImplementedError: Unsupported speculative method:
+                # 'mtp'`` on Gemma4 even on a container that has the
+                # support (e.g. ``vllm/vllm-openai:v0.22.1``+).
+                specdec = {
+                    "model": draft_model_dir,
+                    "num_speculative_tokens": kwargs.get("speculative_num_steps", 3),
+                }
+            else:
+                # Generic MTP path (Qwen3.5 etc.) — model carries its
+                # own MTP layer; no separate draft / assistant model.
+                specdec = {
+                    "method": "mtp",
+                    "num_speculative_tokens": kwargs.get("speculative_num_steps", 3),
+                }
         elif kwargs.get("speculative_algorithm") == "DFLASH":
             specdec = {
                 "method": "dflash",

@@ -35,7 +35,39 @@
 
 
 def get_tokenizer(path, trust_remote_code=False):
-    return AutoTokenizer.from_pretrained(path, trust_remote_code=trust_remote_code)
+    """Load the tokenizer stored at ``path``.
+
+    If ``<path>/tokenizer_config.json`` exists and its ``extra_special_tokens``
+    entry is a list, the list is converted to a ``{name: token}`` dict (for
+    example ``"<|image|>"`` is stored under the key ``"image_token"``) and passed
+    to ``AutoTokenizer.from_pretrained`` as ``extra_special_tokens``. A missing
+    config file, or any non-list value, leaves the load arguments unchanged.
+
+    Args:
+        path: Tokenizer location handed to ``from_pretrained``; also probed
+            locally for ``tokenizer_config.json``.
+        trust_remote_code: Forwarded to ``AutoTokenizer.from_pretrained``.
+
+    Returns:
+        The object returned by ``AutoTokenizer.from_pretrained``.
+    """
+    extra_special_tokens = None
+    tokenizer_config_path = os.path.join(path, "tokenizer_config.json")
+    if os.path.isfile(tokenizer_config_path):
+        # JSON is UTF-8 by spec; do not depend on the locale's default encoding.
+        with open(tokenizer_config_path, encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+        extra_special_tokens = tokenizer_config.get("extra_special_tokens")
+
+    kwargs = {"trust_remote_code": trust_remote_code}
+    if isinstance(extra_special_tokens, list):
+        # NOTE(review): presumably the loader expects a name -> token mapping
+        # here rather than a list — confirm against the transformers version in use.
+        kwargs["extra_special_tokens"] = {
+            token.strip("<|>").replace("|", "_") + "_token": token for token in extra_special_tokens
+        }
+
+    return AutoTokenizer.from_pretrained(path, **kwargs)
 
 
 def encode_chat(tokenizer, messages, chat_template_args={}, completions=False):

@@ -0,0 +1,83 @@
+# SPEED-bench MTP speculative-decoding run for gemma-4-E4B-it via vLLM.
+#
+# Gemma 4 MTP support landed in vLLM PR vllm-project/vllm#41745 (2026-05-06)
+# and is in ``vllm/vllm-openai:v0.22.1`` (and later). Gemma 4 MTP uses a
+# separate assistant model passed via ``--draft_model_dir``; vLLM
+# auto-detects Gemma 4 from the assistant and does NOT take a ``method``
+# key in ``speculative_config``. The wrapper at
+# ``examples/specdec_bench/specdec_bench/models/vllm.py`` routes to the
+# assistant-model config shape when ``--speculative_algorithm MTP`` is
+# paired with ``--draft_model_dir``.
+#
+# Assistant model: ``google/gemma-4-E4B-it-assistant`` (public, ungated).
+#
+# Slurm run on cw_dfw. Individual cells override their per-cell knobs by
+# appending to a task's args via ``pipeline.task_N.args+=[...]``:
+#
+#   uv run slurm.py \
+#     --yaml modules/Model-Optimizer/tools/launcher/examples/gemma-4/gemma-4-E4B-it/specdec_bench_mtp_vllm.yaml \
+#     --yes detach=true \
+#     pipeline.task_0.args+=["--temperature 0","--max_seq_len 65536","--save_dir /scratchspace/<sweep>/qualitative","--draft_length 3"] \
+#     pipeline.task_1.args+=["--temperature 0","--max_seq_len 65536","--save_dir /scratchspace/<sweep>/throughput_32k","--num_requests 80","--draft_length 3"]
+
+job_name: gemma-4-E4B-it_specdec_bench_mtp_vllm
+
+pipeline:
+  global_vars:
+    hf_model: /hf-local/google/gemma-4-E4B-it
+    draft_model: /hf-local/google/gemma-4-E4B-it-assistant
+
+  # task_0: SPEED qualitative split
+  task_0:
+    script: common/specdec_bench/run.sh
+    args:
+      - --dataset speed
+      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative
+      - --engine VLLM
+      - --speculative_algorithm MTP
+      - --draft_model_dir <<global_vars.draft_model>>
+      - --draft_length 3
+      - --tp_size 1
+      - --ep_size 1
+      - --concurrency 32
+      - --output_length 4096
+      - --aa_timing
+      - --show_progress
+      - --save_dir /scratchspace/{sweep_name_default}/qualitative
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 1
+      container: vllm/vllm-openai:v0.22.1
+
+  # task_1: SPEED throughput_32k split
+  task_1:
+    script: common/specdec_bench/run.sh
+    args:
+      - --dataset speed
+      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k
+      - --engine VLLM
+      - --speculative_algorithm MTP
+      - --draft_model_dir <<global_vars.draft_model>>
+      - --draft_length 3
+      - --tp_size 1
+      - --ep_size 1
+      - --concurrency 8
+      - --num_requests 80
+      - --output_length 4096
+      - --aa_timing
+      - --show_progress
+      - --save_dir /scratchspace/{sweep_name_default}/throughput_32k
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 1
+      container: vllm/vllm-openai:v0.22.1