NVIDIA · ChenhanYu · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_dflash_vllm_t0_d3.yaml b/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_dflash_vllm_t0_d3.yaml
@@ -0,0 +1,118 @@
+# SPEED-bench DFlash speculative-decoding run for Qwen3.5-4B via vLLM,
+# matrix cell t0_d3 (temperature=0, draft_length=3 → block_size=4).
+#
+# Companion to specdec_bench_mtp.yaml. This variant exercises the
+# z-lab/Qwen3.5-4B-DFlash external draft model. DFlash ignores
+# --draft_length (which maps to vLLM's speculative_num_steps); it reads
+# `speculative_num_draft_tokens` instead, which we pass via --block_size
+# = draft_length + 1.
+#
+# Two-task pipeline:
+#   task_0  Qualitative quality split (nvidia/SPEED-Bench-Internal/qualitative)
+#   task_1  Long-context throughput split (nvidia/SPEED-Bench-Internal/throughput_32k)
+#
+# Results write to /scratchspace/qwen35_4b_dflash_vllm_t0_d3/<split>/.
+# The pensieve-intern `specdec_bench` workflow's wrap_up stage owns
+# publishing these to s3://team-specdec-workgroup/results/qwen35_4b_dflash_vllm_t0_d3/<split>/
+# with provenance stamps (jira_ticket + huggingface_model_id).
+# Sweep-name convention: <model>_<algorithm>_<engine>_<cell_tag> so
+# multi-model / multi-engine / multi-cell records don't collide in S3.
+#
+# Container: vllm/vllm-openai:v0.22.1+ is required. The `dflash`
+# speculative method landed in vLLM v0.22.0; the qwen3_5-cu130 image
+# used by sibling MTP/NONE YAMLs predates this and rejects
+# `--speculative_algorithm DFLASH` with "Input should be 'ngram', ...,
+# 'mtp'".
+#
+# Slurm run on cw_dfw:
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_dflash_vllm_t0_d3.yaml --yes
+
+job_name: Qwen3.5-4B_specdec_bench_dflash_vllm_t0_d3  # cell t0_d3: temperature=0, draft_length=3
+
+pipeline:
+  global_vars:  # referenced by the tasks below as <<global_vars.NAME>>
+    hf_model: /hf-local/Qwen/Qwen3.5-4B  # target model; passed to each task as HF_MODEL_CKPT
+
+  # Step 1: qualitative split — quality / acceptance-rate numbers with
+  # DFlash block_size=4 (draft_length=3 + 1). tp_size=2 + concurrency=32
+  # trades aa_timing fidelity for ~30x wall-clock speedup;
+  # acceptance-length (AL) is concurrency-independent and is the primary
+  # metric we care about for this split.
+  #
+  # No --temperature: run.py defaults sampling_kwargs to
+  # {"temperature": 0} when --temperature is not supplied, which is
+  # exactly what this cell (t0_*) wants. Cells with non-zero temperature
+  # (t1_d3 / t1_d7) will pass `--temperature 1` on the args list.
+  task_0:
+    script: common/specdec_bench/run.sh
+    args:
+      - --dataset speed
+      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative
+      - --engine VLLM
+      - --speculative_algorithm DFLASH
+      - --draft_model_dir /hf-local/z-lab/Qwen3.5-4B-DFlash  # external DFlash draft model
+      - --block_size 4  # draft_length (3) + 1
+      - --tp_size 2  # paired with gpus_per_node: 2 below
+      - --ep_size 1
+      - --concurrency 32  # trades aa_timing fidelity for wall-clock speed; AL is the primary metric
+      - --output_length 4096
+      - --aa_timing
+      - --show_progress
+      - --save_dir /scratchspace/qwen35_4b_dflash_vllm_t0_d3/qualitative
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 2  # sized for --tp_size 2
+      container: vllm/vllm-openai:v0.22.1  # image must accept --speculative_algorithm DFLASH
+
+  # Step 2: throughput_32k split — long-context throughput with DFlash
+  # block_size=4. `--num_requests 80` caps the run at 80 samples (split
+  # has 1,536) so it fits in the 4h Slurm time-limit; each 32K-input
+  # sample takes ~60-90s. tp_size=2 doubles the KV-cache budget across
+  # 2 GPUs; concurrency=8 keeps 8 * 32K = 256K tokens of in-flight KV
+  # under that doubled budget.
+  #
+  # --max_seq_len 40960 pins the engine's sequence-length cap for the
+  # 32K input + 4K output + 4K headroom; vLLM's auto-derivation from
+  # gpu_memory_utilization can otherwise cap below the 32K input we
+  # need. Generic CLI flag (run.py maps it to engine-specific kwarg —
+  # max_model_len for vLLM here).
+  task_1:
+    script: common/specdec_bench/run.sh
+    args:
+      - --dataset speed
+      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k
+      - --engine VLLM
+      - --speculative_algorithm DFLASH
+      - --draft_model_dir /hf-local/z-lab/Qwen3.5-4B-DFlash  # external DFlash draft model
+      - --block_size 4  # draft_length (3) + 1
+      - --max_seq_len 40960  # 32K input + 4K output + 4K headroom
+      - --tp_size 2  # paired with gpus_per_node: 2 below
+      - --ep_size 1
+      - --concurrency 8  # 8 * 32K = 256K tokens of in-flight KV
+      - --num_requests 80  # cap (split has 1,536 samples) so the run fits the 4h Slurm limit
+      - --output_length 4096
+      - --aa_timing
+      - --show_progress
+      - --save_dir /scratchspace/qwen35_4b_dflash_vllm_t0_d3/throughput_32k
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 2  # sized for --tp_size 2
+      container: vllm/vllm-openai:v0.22.1  # image must accept --speculative_algorithm DFLASH
+
+
+# S3 upload is intentionally not a task in this YAML — the bench
+# pipeline only writes results to /scratchspace/qwen35_4b_dflash_vllm_t0_d3/<split>/.
+# The pensieve-intern specdec_bench workflow's wrap_up stage owns
+# harvesting these from lustre and publishing them to the team S3 vault
+# with provenance stamps (jira_ticket + huggingface_model_id) for the
+# "official record" tracking.