Skip to content

Commit d39231b

Browse files
author
pensieve-intern
committed
[OMNIML-4652] training_support — pensieve-intern agent draft
1 parent 62401e1 commit d39231b

1 file changed

Lines changed: 33 additions & 0 deletions

File tree

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
# Step3 offline EAGLE3 draft-head training for moonshotai/Kimi-K2.5-DFlash.
#
# Standalone task extracted from the 4-task hf_offline_eagle3 pipeline.
# Reads hidden states produced by step2 from /scratchspace/offline_hidden_states.
#
# Usage:
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/moonshotai/Kimi-K2.5-DFlash/step3_train.yaml --yes

# Job name deliberately contains no "/" so it stays a single path component
# wherever it is used to name a job or a directory.
job_name: Kimi-K2.5-DFlash_EAGLE3_train
pipeline:
  allow_to_fail: false
  skip: false
  # Explicit null instead of a bare empty value (same parsed result).
  note: null

  global_vars:
    # NOTE(review): assumes /hf-local follows an <org>/<model> layout, i.e. no
    # extra "Qwen/" segment in front of "moonshotai/" - confirm on the mount.
    hf_model: /hf-local/moonshotai/Kimi-K2.5-DFlash

  task_0:
    script: common/eagle3/train_eagle.sh
    args:
      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
      # <<global_vars.hf_model>> refers to the hf_model entry defined above.
      - model.model_name_or_path=<<global_vars.hf_model>>
      # Input: hidden states written by step2 (see header comment).
      - data.offline_data_path=/scratchspace/offline_hidden_states
      - training.output_dir=/scratchspace/eagle3
      - training.training_seq_len=4096
      - training.disable_tqdm=true
      # NOTE(review): presumably set very high to effectively skip AR
      # validation during this run - confirm against train_eagle.sh.
      - training.ar_validate_steps=500000
    slurm_config:
      _factory_: "slurm_factory"
      nodes: 1
      ntasks_per_node: 1
      gpus_per_node: 8
      container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0

0 commit comments

Comments
 (0)