Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions VERIFICATION_COMMENT.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Read task_0 of /tmp/pensieve-intern-agent-wwltwvsh/target/tools/launcher/examples/Kimi/Kimi-K2.5-DFlash/hf_offline_eagle3.yaml; uses TRT-LLM serve with model=/hf-local/Kimi/Kimi-K2.5-DFlash, container=nvcr.io/nvidia/tensorrt-llm/release:1.2.0; dataset path /hf-local/modelopt/Speculative-Decoding-Prompt-Samples exists; OK
104 changes: 104 additions & 0 deletions tools/launcher/examples/Kimi/Kimi-K2.5-DFlash/hf_offline_eagle3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# EAGLE3 offline speculative decoding pipeline for Kimi-K2.5-DFlash.
#
# 4-step pipeline:
# task_0: Data synthesis — query TRT-LLM server to generate prompt samples
# task_1: Dump hidden states — run target model to capture hidden states
# task_2: Offline training — train the EAGLE3 draft head
# task_3: Benchmark — evaluate speculative decoding speedup via VLLM
#
# All tasks share /scratchspace to pass artifacts between steps.
#
# Usage:
# uv run launch.py --yaml examples/Kimi/Kimi-K2.5-DFlash/hf_offline_eagle3.yaml --yes
# uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Kimi/Kimi-K2.5-DFlash/hf_offline_eagle3.yaml --yes

job_name: Kimi-K2.5-DFlash_EAGLE3_offline
pipeline:
allow_to_fail: false
skip: false
note: null  # explicit null (bare `note:` parses the same but is ambiguous to readers)

global_vars:
hf_model: /hf-local/Kimi/Kimi-K2.5-DFlash

# Step 1: Data synthesis via TRT-LLM server
# Args before "--" go to trtllm-serve; args after "--" go to tools/query.py.
task_0:
script: common/tensorrt_llm/query.sh
args:
- --model <<global_vars.hf_model>>
- --tp_size 8
- --ep_size 8
- --max_num_tokens 32000
- --port 8000
- --host 0.0.0.0
- --trust_remote_code
- --
- --data /hf-local/modelopt/Speculative-Decoding-Prompt-Samples
- --save /scratchspace/data
environment:
- HF_LOCAL: /hf-local
slurm_config:
_factory_: "slurm_factory"
nodes: 1
ntasks_per_node: 8
gpus_per_node: 8
container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0

# Step 2: Dump hidden states from target model
task_1:
script: common/eagle3/dump_offline_data.sh
args:
- --input-data /scratchspace/data
- --output-dir /scratchspace/offline_hidden_states
- --max-seq-len 8192
- --tp 8
- --moe-ep 8
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
slurm_config:
_factory_: "slurm_factory"
nodes: 1
ntasks_per_node: 8
gpus_per_node: 8
container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0

# Step 3: Train EAGLE3 draft head (offline, single task)
task_2:
script: common/eagle3/train_eagle.sh
args:
- --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
- model.model_name_or_path=<<global_vars.hf_model>>
- data.offline_data_path=/scratchspace/offline_hidden_states
- training.output_dir=/scratchspace/eagle3
- training.training_seq_len=4096
- training.disable_tqdm=true
- training.ar_validate_steps=500000
slurm_config:
_factory_: "slurm_factory"
nodes: 1
ntasks_per_node: 1
gpus_per_node: 8
container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0

# Step 4: Benchmark speculative decoding (VLLM backend)
task_3:
script: common/specdec_bench/quick_check.sh
args:
# NOTE(review): task_2 sets training.output_dir=/scratchspace/eagle3 but this step
# reads /scratchspace/export — confirm train_eagle.sh exports a deployable draft
# checkpoint to /scratchspace/export, otherwise this benchmark will find no model.
- --draft_model_dir /scratchspace/export
- --draft_length 3
- --output_length 4096
- --engine VLLM
- --tp_size 8
- --ep_size 1
- --speculative_algorithm EAGLE3
- --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
- --concurrency 1
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
slurm_config:
_factory_: "slurm_factory"
nodes: 1
ntasks_per_node: 1
gpus_per_node: 8
container: vllm/vllm-openai:latest
Loading
Loading