File tree Expand file tree Collapse file tree
examples/Qwen/qwen3-v0339a-demo Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1818native_mpi_rank=$OMPI_COMM_WORLD_RANK
1919native_mpi_local_rank=$OMPI_COMM_WORLD_LOCAL_RANK
# Resolve this process's global and node-local rank. For each of the two
# values, the first variable that is set and non-empty wins:
#   1. PMIX_RANK / PMIX_LOCAL_RANK   (Slurm launching with `--mpi=pmix`)
#   2. native_mpi_rank / native_mpi_local_rank
#      (assigned above from OMPI_COMM_WORLD_RANK / OMPI_COMM_WORLD_LOCAL_RANK)
#   3. SLURM_PROCID / SLURM_LOCALID
#   4. 0, so later numeric comparisons never see an empty string
# No whitespace is allowed inside the nested ${...:-...} expansions: a space
# after "${" is not a parameter expansion, and a space inside the default word
# would become part of the resulting value.
mpi_rank=${PMIX_RANK:-${native_mpi_rank:-${SLURM_PROCID:-0}}}
mpi_local_rank=${PMIX_LOCAL_RANK:-${native_mpi_local_rank:-${SLURM_LOCALID:-0}}}
2323
# Failure-tracking state; both values start at 0.
# NOTE(review): presumably updated by report_result — confirm against the full file.
2424FAIL=0
2525FAIL_EXIT=0
@@ -48,8 +48,23 @@ function report_result {
4848}
4949
#######################################
# Install the extra Python dependencies (diskcache and
# nvidia-resiliency-ext) once, coordinating local ranks via a marker file.
#
# Local rank 0 performs the installation and creates the marker only after
# every step succeeded, so a failed install is never advertised as done.
# Every other local rank polls for the marker once per second.
#
# Globals:
#   mpi_local_rank              (read) node-local rank; unset/empty counts as 0
#   NMM_EXTRA_DEP_MARKER        (read, optional) marker file path;
#                               default /tmp/.nmm_extra_dep_installed
#   NMM_EXTRA_DEP_WAIT_TIMEOUT  (read, optional) seconds a waiting rank polls;
#                               default 600
# Arguments:
#   None
# Returns:
#   0 if the marker exists on return; non-zero if the installation failed
#   (exit status of the failing command) or the wait timed out (1).
#######################################
function util_install_extra_dep {
    local _marker=${NMM_EXTRA_DEP_MARKER:-/tmp/.nmm_extra_dep_installed}
    local _timeout=${NMM_EXTRA_DEP_WAIT_TIMEOUT:-600}

    # Already installed by an earlier call or by another rank.
    if [[ -f "$_marker" ]]; then
        return 0
    fi

    if [[ "${mpi_local_rank:-0}" -eq 0 ]]; then
        local _tmp_root _nvrx_dir _rc=0
        _tmp_root=$(mktemp -d) || return 1
        _nvrx_dir="${_tmp_root}/nvidia-resiliency-ext"

        # Stop at the first failing step and remember its exit status.
        pip install diskcache \
            && git clone --depth 1 https://github.com/NVIDIA/nvidia-resiliency-ext "${_nvrx_dir}" \
            && pip install "${_nvrx_dir}" \
            || _rc=$?

        # The clone is only needed while pip builds/installs from it.
        rm -rf -- "${_tmp_root:?}"

        if (( _rc != 0 )); then
            echo "util_install_extra_dep: installation failed (exit ${_rc})" >&2
            return "$_rc"
        fi
        touch "$_marker"
    else
        local _waited=0
        while [[ ! -f "$_marker" ]] && (( _waited < _timeout )); do
            sleep 1
            _waited=$(( _waited + 1 ))
        done
        # Distinguish "marker appeared" from "gave up waiting".
        if [[ ! -f "$_marker" ]]; then
            echo "util_install_extra_dep: timed out after ${_timeout}s waiting for ${_marker}" >&2
            return 1
        fi
    fi
}
5570
Original file line number Diff line number Diff line change 1+ # EAGLE3 offline speculative decoding pipeline — Step 1: Data synthesis
2+ # for qwen3-v0339a-demo
3+
4+ job_name : qwen3-v0339a-demo_EAGLE3_offline_step1
5+
6+ pipeline :
7+ allow_to_fail : false
8+ skip : false
9+ note :
10+
11+ global_vars :
12+ hf_model : /hf-local/Qwen/qwen3-v0339a-demo
13+
14+ task_0 :
15+ script : common/tensorrt_llm/query.sh
16+ args :
17+ - --model <<global_vars.hf_model>>
18+ - --tp_size 8
19+ - --ep_size 8
20+ - --max_num_tokens 32000
21+ - --port 8000
22+ - --host 0.0.0.0
23+ - --trust_remote_code
24+ - --
25+ - --data /hf-local/modelopt/Speculative-Decoding-Prompt-Samples
26+ - --save /scratchspace/data
27+ environment :
28+ - HF_LOCAL : /hf-local
29+ slurm_config :
30+ _factory_ : " slurm_factory"
31+ nodes : 1
32+ ntasks_per_node : 8
33+ gpus_per_node : 8
34+ container : nvcr.io/nvidia/tensorrt-llm/release:1.2.0
Original file line number Diff line number Diff line change 1+ # EAGLE3 offline speculative decoding pipeline — Step 2: Dump hidden states
2+ # for qwen3-v0339a-demo
3+
4+ job_name : qwen3-v0339a-demo_EAGLE3_offline_step2
5+
6+ pipeline :
7+ allow_to_fail : false
8+ skip : false
9+ note :
10+
11+ global_vars :
12+ hf_model : /hf-local/Qwen/qwen3-v0339a-demo
13+
14+ task_0 :
15+ script : common/eagle3/dump_offline_data.sh
16+ args :
17+ - --input-data /scratchspace/data
18+ - --output-dir /scratchspace/offline_hidden_states
19+ - --max-seq-len 8192
20+ - --tp 8
21+ - --moe-ep 8
22+ environment :
23+ - HF_MODEL_CKPT : <<global_vars.hf_model>>
24+ slurm_config :
25+ _factory_ : " slurm_factory"
26+ nodes : 1
27+ ntasks_per_node : 8
28+ gpus_per_node : 8
29+ container : nvcr.io/nvidia/tensorrt-llm/release:1.2.0
Original file line number Diff line number Diff line change 1+ # EAGLE3 offline speculative decoding pipeline — Step 3: Train EAGLE3 draft head
2+ # for qwen3-v0339a-demo
3+
4+ job_name : qwen3-v0339a-demo_EAGLE3_offline_step3
5+
6+ pipeline :
7+ allow_to_fail : false
8+ skip : false
9+ note :
10+
11+ global_vars :
12+ hf_model : /hf-local/Qwen/qwen3-v0339a-demo
13+
14+ task_0 :
15+ script : common/eagle3/train_eagle.sh
16+ args :
17+ - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
18+ - model.model_name_or_path=<<global_vars.hf_model>>
19+ - data.offline_data_path=/scratchspace/offline_hidden_states
20+ - training.output_dir=/scratchspace/eagle3
21+ - training.training_seq_len=4096
22+ - training.disable_tqdm=true
23+ - training.ar_validate_steps=500000
24+ slurm_config :
25+ _factory_ : " slurm_factory"
26+ nodes : 1
27+ ntasks_per_node : 1
28+ gpus_per_node : 8
29+ container : nvcr.io/nvidia/tensorrt-llm/release:1.2.0
Original file line number Diff line number Diff line change 1+ # EAGLE3 offline speculative decoding pipeline — Step 4: Benchmark
2+ # for qwen3-v0339a-demo
3+
4+ job_name : qwen3-v0339a-demo_EAGLE3_offline_step4
5+
6+ pipeline :
7+ allow_to_fail : false
8+ skip : false
9+ note :
10+
11+ global_vars :
12+ hf_model : /hf-local/Qwen/qwen3-v0339a-demo
13+
# Single task of step 4: runs common/specdec_bench/quick_check.sh with the
# arguments below on one node with 8 GPUs.
14+ task_0 :
15+ script : common/specdec_bench/quick_check.sh
16+ args :
# NOTE(review): the step 3 config writes training.output_dir=/scratchspace/eagle3
# and no step shown here writes /scratchspace/export — confirm which step
# produces this directory before relying on it.
17+ - --draft_model_dir /scratchspace/export
18+ - --draft_length 3
19+ - --output_length 4096
20+ - --engine VLLM
21+ - --tp_size 8
22+ - --ep_size 1
23+ - --speculative_algorithm EAGLE3
24+ - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
25+ - --concurrency 1
26+ environment :
27+ - HF_MODEL_CKPT : <<global_vars.hf_model>>
28+ slurm_config :
29+ _factory_ : " slurm_factory"
30+ nodes : 1
31+ ntasks_per_node : 1
32+ gpus_per_node : 8
# NOTE(review): `latest` is a floating tag, unlike the pinned
# tensorrt-llm/release:1.2.0 image used by steps 1-3; consider pinning a
# specific vLLM version so benchmark results are reproducible.
33+ container : vllm/vllm-openai:latest
You can’t perform that action at this time.
0 commit comments