Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions VERIFICATION_COMMENT.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Read task_0 of /tmp/pensieve-intern-agent-wwltwvsh/target/tools/launcher/examples/Kimi/Kimi-K2.5-DFlash/hf_offline_eagle3.yaml; uses TRT-LLM serve with model=/hf-local/Kimi/Kimi-K2.5-DFlash, container=nvcr.io/nvidia/tensorrt-llm/release:1.2.0; dataset path /hf-local/modelopt/Speculative-Decoding-Prompt-Samples exists; OK
104 changes: 104 additions & 0 deletions tools/launcher/examples/Kimi/Kimi-K2.5-DFlash/hf_offline_eagle3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# EAGLE3 offline speculative decoding pipeline for Kimi-K2.5-DFlash.
#
# 4-step pipeline:
# task_0: Data synthesis — query TRT-LLM server to generate prompt samples
# task_1: Dump hidden states — run target model to capture hidden states
# task_2: Offline training — train the EAGLE3 draft head
# task_3: Benchmark — evaluate speculative decoding speedup via VLLM
#
# All tasks share /scratchspace to pass artifacts between steps.
#
# Usage:
# uv run launch.py --yaml examples/Kimi/Kimi-K2.5-DFlash/hf_offline_eagle3.yaml --yes
# uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Kimi/Kimi-K2.5-DFlash/hf_offline_eagle3.yaml --yes

job_name: Kimi-K2.5-DFlash_EAGLE3_offline
pipeline:
allow_to_fail: false
skip: false
note: null  # explicit null (bare `note:` parses the same but is ambiguous to readers)

global_vars:
hf_model: /hf-local/Kimi/Kimi-K2.5-DFlash

# Step 1: Data synthesis via TRT-LLM server
# Args before "--" go to trtllm-serve; args after "--" go to tools/query.py.
task_0:
script: common/tensorrt_llm/query.sh
args:
- --model <<global_vars.hf_model>>
- --tp_size 8
- --ep_size 8
- --max_num_tokens 32000
- --port 8000
- --host 0.0.0.0
- --trust_remote_code
- --
- --data /hf-local/modelopt/Speculative-Decoding-Prompt-Samples
- --save /scratchspace/data
environment:
- HF_LOCAL: /hf-local
slurm_config:
_factory_: "slurm_factory"
nodes: 1
ntasks_per_node: 8
gpus_per_node: 8
container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0

# Step 2: Dump hidden states from target model
task_1:
script: common/eagle3/dump_offline_data.sh
args:
- --input-data /scratchspace/data
- --output-dir /scratchspace/offline_hidden_states
- --max-seq-len 8192
- --tp 8
- --moe-ep 8
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
slurm_config:
_factory_: "slurm_factory"
nodes: 1
ntasks_per_node: 8
gpus_per_node: 8
container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0

# Step 3: Train EAGLE3 draft head (offline, single task)
task_2:
script: common/eagle3/train_eagle.sh
args:
- --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
- model.model_name_or_path=<<global_vars.hf_model>>
- data.offline_data_path=/scratchspace/offline_hidden_states
- training.output_dir=/scratchspace/eagle3
- training.training_seq_len=4096
- training.disable_tqdm=true
- training.ar_validate_steps=500000
slurm_config:
_factory_: "slurm_factory"
nodes: 1
ntasks_per_node: 1
gpus_per_node: 8
container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0

# Step 4: Benchmark speculative decoding (VLLM backend)
task_3:
script: common/specdec_bench/quick_check.sh
args:
# NOTE(review): task_2 sets training.output_dir=/scratchspace/eagle3 but this step
# reads /scratchspace/export — confirm train_eagle.sh exports a deployable draft
# checkpoint to /scratchspace/export, otherwise this benchmark will find no model.
- --draft_model_dir /scratchspace/export
- --draft_length 3
- --output_length 4096
- --engine VLLM
- --tp_size 8
- --ep_size 1
- --speculative_algorithm EAGLE3
- --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
- --concurrency 1
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
slurm_config:
_factory_: "slurm_factory"
nodes: 1
ntasks_per_node: 1
gpus_per_node: 8
container: vllm/vllm-openai:latest
Loading
Loading