Skip to content

Commit 311f683

Browse files
author
pensieve-intern
committed
[OMNIML-4333] training_support — pensieve-intern agent draft
1 parent d30ebbd commit 311f683

5 files changed

Lines changed: 142 additions & 2 deletions

File tree

tools/launcher/common/service_utils.sh

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
native_mpi_rank=$OMPI_COMM_WORLD_RANK
1919
native_mpi_local_rank=$OMPI_COMM_WORLD_LOCAL_RANK
2020
# Rank resolution order: PMIx (Slurm launched with `--mpi=pmix`), then Open MPI, then plain Slurm (SLURM_PROCID / SLURM_LOCALID), else 0
21-
mpi_rank=${PMIX_RANK:-$native_mpi_rank}
22-
mpi_local_rank=${PMIX_LOCAL_RANK:-$native_mpi_local_rank}
21+
mpi_rank=${PMIX_RANK:-${native_mpi_rank:-${SLURM_PROCID:-0}}}
22+
mpi_local_rank=${PMIX_LOCAL_RANK:-${native_mpi_local_rank:-${SLURM_LOCALID:-0}}}
2323

2424
FAIL=0
2525
FAIL_EXIT=0
@@ -48,8 +48,23 @@ function report_result {
4848
}
4949

5050
#######################################
# Install the extra Python dependencies (diskcache and nvidia-resiliency-ext)
# once, coordinated through a marker file.
#
# The process whose local rank is 0 performs the installation and creates the
# marker only after every step succeeded. All other local ranks poll for the
# marker, one second at a time, until it appears or the timeout elapses.
#
# Globals:
#   mpi_local_rank         (read) local rank of this process; empty counts as 0
#   NMM_EXTRA_DEP_MARKER   (read, optional) marker file path
#                          (default: /tmp/.nmm_extra_dep_installed)
#   NMM_EXTRA_DEP_TIMEOUT  (read, optional) seconds a non-zero local rank waits
#                          for the marker (default: 600)
# Outputs:
#   Diagnostics on stderr when a step fails or the wait times out.
# Returns:
#   0 if the marker exists (dependencies installed), non-zero otherwise.
#######################################
function util_install_extra_dep {
    local _marker="${NMM_EXTRA_DEP_MARKER:-/tmp/.nmm_extra_dep_installed}"
    local _timeout="${NMM_EXTRA_DEP_TIMEOUT:-600}"

    # Already installed: nothing to do.
    if [[ -f "$_marker" ]]; then
        return 0
    fi

    if [[ "${mpi_local_rank:-0}" -eq 0 ]]; then
        local _status=0
        local _clone_root
        local _nvrx_dir

        if ! pip install diskcache; then
            echo "util_install_extra_dep: 'pip install diskcache' failed" >&2
            _status=1
        fi

        # Check mktemp explicitly so a failure cannot turn the clone target
        # into "/nvidia-resiliency-ext".
        if _clone_root="$(mktemp -d)"; then
            _nvrx_dir="${_clone_root}/nvidia-resiliency-ext"
            if ! { git clone --depth 1 https://github.com/NVIDIA/nvidia-resiliency-ext "${_nvrx_dir}" \
                && pip install "${_nvrx_dir}"; }; then
                echo "util_install_extra_dep: installing nvidia-resiliency-ext failed" >&2
                _status=1
            fi
            # The checkout is only needed while pip builds and installs it.
            rm -rf -- "${_clone_root:?}"
        else
            echo "util_install_extra_dep: mktemp -d failed" >&2
            _status=1
        fi

        # Publish the marker only on full success; otherwise waiting ranks and
        # later calls would skip the install and fail later on a missing module.
        if [[ "$_status" -ne 0 ]]; then
            return 1
        fi
        touch "$_marker"
    else
        local _waited=0
        while [[ ! -f "$_marker" && "$_waited" -lt "$_timeout" ]]; do
            sleep 1
            _waited=$((_waited + 1))
        done
        # Report a timeout instead of silently continuing without the deps.
        if [[ ! -f "$_marker" ]]; then
            echo "util_install_extra_dep: timed out after ${_timeout}s waiting for ${_marker}" >&2
            return 1
        fi
    fi
}
5570

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# EAGLE3 offline speculative decoding pipeline — Step 1: Data synthesis
2+
# for qwen3-v0339a-demo
3+
4+
job_name: qwen3-v0339a-demo_EAGLE3_offline_step1
5+
6+
pipeline:
7+
allow_to_fail: false
8+
skip: false
9+
note:
10+
11+
global_vars:
12+
hf_model: /hf-local/Qwen/qwen3-v0339a-demo
13+
14+
task_0:
15+
script: common/tensorrt_llm/query.sh
16+
args:
17+
- --model <<global_vars.hf_model>>
18+
- --tp_size 8
19+
- --ep_size 8
20+
- --max_num_tokens 32000
21+
- --port 8000
22+
- --host 0.0.0.0
23+
- --trust_remote_code
24+
- --
25+
- --data /hf-local/modelopt/Speculative-Decoding-Prompt-Samples
26+
- --save /scratchspace/data
27+
environment:
28+
- HF_LOCAL: /hf-local
29+
slurm_config:
30+
_factory_: "slurm_factory"
31+
nodes: 1
32+
ntasks_per_node: 8
33+
gpus_per_node: 8
34+
container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# EAGLE3 offline speculative decoding pipeline — Step 2: Dump hidden states
2+
# for qwen3-v0339a-demo
3+
4+
job_name: qwen3-v0339a-demo_EAGLE3_offline_step2
5+
6+
pipeline:
7+
allow_to_fail: false
8+
skip: false
9+
note:
10+
11+
global_vars:
12+
hf_model: /hf-local/Qwen/qwen3-v0339a-demo
13+
14+
task_0:
15+
script: common/eagle3/dump_offline_data.sh
16+
args:
17+
- --input-data /scratchspace/data
18+
- --output-dir /scratchspace/offline_hidden_states
19+
- --max-seq-len 8192
20+
- --tp 8
21+
- --moe-ep 8
22+
environment:
23+
- HF_MODEL_CKPT: <<global_vars.hf_model>>
24+
slurm_config:
25+
_factory_: "slurm_factory"
26+
nodes: 1
27+
ntasks_per_node: 8
28+
gpus_per_node: 8
29+
container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# EAGLE3 offline speculative decoding pipeline — Step 3: Train EAGLE3 draft head
2+
# for qwen3-v0339a-demo
3+
4+
job_name: qwen3-v0339a-demo_EAGLE3_offline_step3
5+
6+
pipeline:
7+
allow_to_fail: false
8+
skip: false
9+
note:
10+
11+
global_vars:
12+
hf_model: /hf-local/Qwen/qwen3-v0339a-demo
13+
14+
task_0:
15+
script: common/eagle3/train_eagle.sh
16+
args:
17+
- --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
18+
- model.model_name_or_path=<<global_vars.hf_model>>
19+
- data.offline_data_path=/scratchspace/offline_hidden_states
20+
- training.output_dir=/scratchspace/eagle3
21+
- training.training_seq_len=4096
22+
- training.disable_tqdm=true
23+
- training.ar_validate_steps=500000
24+
slurm_config:
25+
_factory_: "slurm_factory"
26+
nodes: 1
27+
ntasks_per_node: 1
28+
gpus_per_node: 8
29+
container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# EAGLE3 offline speculative decoding pipeline — Step 4: Benchmark
2+
# for qwen3-v0339a-demo
3+
4+
job_name: qwen3-v0339a-demo_EAGLE3_offline_step4
5+
6+
pipeline:
7+
allow_to_fail: false
8+
skip: false
9+
note:
10+
11+
global_vars:
12+
hf_model: /hf-local/Qwen/qwen3-v0339a-demo
13+
14+
task_0:
15+
script: common/specdec_bench/quick_check.sh
16+
args:
17+
- --draft_model_dir /scratchspace/export
18+
- --draft_length 3
19+
- --output_length 4096
20+
- --engine VLLM
21+
- --tp_size 8
22+
- --ep_size 1
23+
- --speculative_algorithm EAGLE3
24+
- --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
25+
- --concurrency 1
26+
environment:
27+
- HF_MODEL_CKPT: <<global_vars.hf_model>>
28+
slurm_config:
29+
_factory_: "slurm_factory"
30+
nodes: 1
31+
ntasks_per_node: 1
32+
gpus_per_node: 8
33+
container: vllm/vllm-openai:latest

0 commit comments

Comments
 (0)