Commit 9a98d01

PR for primus/megatron v25.7 release (#145)
Authored by vidushi8, clairesonglee, and Your Name
Co-authored-by: clairesonglee <[email protected]>
Co-authored-by: Your Name <[email protected]>
1 parent 6df74ac, commit 9a98d01

13 files changed: +294 lines added, -4 lines removed

examples/megatron/configs/deepseek_v2-pretrain.yaml

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ modules:
       # 20250321: need latest megatron docker image
       moe_permute_fusion: false
       # fused wgrad gemm and accumulation
-      gradient_accumulation_fusion: true
+      gradient_accumulation_fusion: false
       # recommend set `false` in fp8
       moe_use_legacy_grouped_gemm: true
       # fused topk router with aux score

examples/megatron/configs/deepseek_v2_lite-pretrain.yaml

Lines changed: 3 additions & 1 deletion
@@ -55,13 +55,15 @@ modules:
       # 20250321: need latest megatron docker image
       moe_permute_fusion: false
       # fused wgrad gemm and accumulation
-      gradient_accumulation_fusion: true
+      gradient_accumulation_fusion: false
       # recommend set `false` in fp8
       moe_use_legacy_grouped_gemm: true
       # fused topk router with aux score
       moe_use_fused_router_with_aux_score: false
       # pad 192/128 for deepseek attention
       fused_padded_mla_attention: false
+
+      multi_latent_attention: false

       # ckpt
       finetune: false

examples/megatron/configs/deepseek_v3-pretrain.yaml (new file)

Lines changed: 82 additions & 0 deletions

work_group: ${TEAM:amd}
user_name: ${USER:root}
exp_name: ${EXP_NAME:deepseek_v3-pretrain}
workspace: ./output

modules:
  pre_trainer:
    framework: megatron
    config: pre_trainer.yaml

    # model to run
    model: ${PRIMUS_MODEL:deepseek_v3}.yaml
    overrides:
      # log
      wandb_project: "Primus_DeepSeek_Pretrain"
      stderr_sink_level: DEBUG

      # debug
      moe_router_force_load_balancing: true
      log_avg_skip_iterations: 2
      log_avg_reset_interval: 50

      # hyber parameters
      train_iters: 50
      micro_batch_size: 4
      global_batch_size: 256
      seq_length: ${PRIMUS_SEQ_LENGTH:4096}
      max_position_embeddings: ${PRIMUS_MAX_POSITION_EMBEDDINGS:4096}
      lr: 1.0e-5
      min_lr: 0.0
      lr_warmup_iters: 2
      lr_decay_iters: null
      lr_decay_style: cosine
      weight_decay: 0.1
      adam_beta1: 0.9
      adam_beta2: 0.95
      eod_mask_loss: true
      init_method_std: 0.008
      norm_epsilon: 1.0e-6

      # parallel
      tensor_model_parallel_size: ${PRIMUS_TP:1}
      pipeline_model_parallel_size: ${PRIMUS_PP:1}
      expert_model_parallel_size: ${PRIMUS_EP:8}
      overlap_grad_reduce: true
      overlap_param_gather: true

      # data
      mock_data: true
      train_data_path: ${TOKENIZED_DATA_PATH:null}
      valid_data_path: null
      test_data_path: null

      # fusion
      # 20250321: need latest megatron docker image
      moe_permute_fusion: false
      # fused wgrad gemm and accumulation
      gradient_accumulation_fusion: false
      # recommend set `false` in fp8
      moe_use_legacy_grouped_gemm: true
      # fused topk router with aux score
      moe_use_fused_router_with_aux_score: false
      # pad 192/128 for deepseek attention
      fused_padded_mla_attention: false

      # Performance toggles
      #multi_latent_attention: false
      #apply_rope_fusion: true

      # ckpt
      finetune: false
      auto_continue_train: false
      load: null
      no_load_optim: null
      no_load_rng: null
      save: null
      save_interval: 20000
      no_save_optim: null
      no_save_rng: null
      disable_last_saving: true
      ckpt_format: torch
      eval_iters: 0
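
The new DeepSeek-V3 recipe parameterizes its key knobs with ${VAR:default} placeholders (PRIMUS_MODEL, PRIMUS_TP, PRIMUS_PP, PRIMUS_EP, PRIMUS_SEQ_LENGTH, TOKENIZED_DATA_PATH), so parallelism and sequence length can be overridden from the environment without editing the YAML. Below is a minimal Python sketch of how such placeholders can be expanded; the resolver and its name are illustrative assumptions, not Primus's own implementation.

import os
import re

# Matches ${VAR:default} placeholders like those used in the example configs.
# Hypothetical helper: a sketch of the substitution convention, not Primus code.
_PLACEHOLDER = re.compile(r"\$\{(?P<name>[A-Za-z_][A-Za-z_0-9]*):(?P<default>[^}]*)\}")

def resolve_placeholders(text: str) -> str:
    """Replace ${VAR:default} with the environment value of VAR, or the default."""
    return _PLACEHOLDER.sub(
        lambda m: os.environ.get(m.group("name"), m.group("default")), text
    )

# Example: with PRIMUS_EP=16 exported, the expert-parallel line resolves to 16,
# while seq_length falls back to its default of 4096.
print(resolve_placeholders("expert_model_parallel_size: ${PRIMUS_EP:8}"))
print(resolve_placeholders("seq_length: ${PRIMUS_SEQ_LENGTH:4096}"))

Under this convention, exporting PRIMUS_EP=16 before launch would override the expert-parallel default of 8, while every unset variable keeps the value written in the config.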

examples/megatron/configs/llama2_7B-pretrain.yaml

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ modules:
       expert_model_parallel_size: 1
       overlap_grad_reduce: true
       overlap_param_gather: true
+      gradient_accumulation_fusion: false

       # data
       mock_data: true

examples/megatron/configs/llama3.1_8B-pretrain.yaml

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ modules:
       expert_model_parallel_size: 1
       overlap_grad_reduce: true
       overlap_param_gather: true
+      gradient_accumulation_fusion: false

       # data
       mock_data: true

examples/megatron/configs/llama3.3_70B-pretrain.yaml

Lines changed: 2 additions & 2 deletions
@@ -7,7 +7,7 @@ modules:
   pre_trainer:
     framework: megatron
     config: pre_trainer.yaml
-    model: llama3.1_70B.yaml
+    model: llama3.3_70B.yaml
     overrides:
       # log
       wandb_project: "Primus_DeepSeek_Pretrain"
@@ -22,7 +22,7 @@ modules:

       seq_length: 8192
       max_position_embeddings: 8192
-
+
       lr: 1.0e-5
       min_lr: 0.0
       lr_warmup_iters: 2

examples/megatron/configs/llama3_8B-pretrain.yaml

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ modules:
       expert_model_parallel_size: 1
       overlap_grad_reduce: true
       overlap_param_gather: true
+      gradient_accumulation_fusion: false

       # data
       mock_data: true

examples/megatron/configs/qwen2.5_72B-pretrain.yaml (new file)

Lines changed: 76 additions & 0 deletions

work_group: ${TEAM:amd}
user_name: ${USER:root}
exp_name: ${EXP_NAME:qwen2.5_72B-pretrain}
workspace: ./output

modules:
  pre_trainer:
    framework: megatron
    config: pre_trainer.yaml

    # model to run
    model: qwen2.5_72B.yaml
    overrides:
      # log
      wandb_project: "Primus_Qwen2.5_72B_Pretrain"
      # disable_wandb: false
      # disable_tensorboard: false
      stderr_sink_level: DEBUG

      log_avg_skip_iterations: 2
      log_avg_reset_interval: 50

      train_iters: 50
      micro_batch_size: 4
      global_batch_size: 32

      seq_length: ${SEQ_LENGTH:2048}
      max_position_embeddings: ${MAX_POSITION_EMBEDDINGS:131072}

      lr: 1.0e-4
      min_lr: 1.0e-5
      lr_warmup_iters: 2
      lr_decay_iters: 320000
      lr_decay_style: cosine
      weight_decay: 1.0e-1
      adam_beta1: 0.9
      adam_beta2: 0.95
      eod_mask_loss: true
      init_method_std: 0.008
      norm_epsilon: 1.0e-6

      # parallel
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      sequence_parallel: 1
      overlap_grad_reduce: true

      overlap_param_gather: false
      use_torch_fsdp2: true
      use_distributed_optimizer: false
      gradient_accumulation_fusion: false
      ckpt_format: torch_dist

      # data
      mock_data: true
      train_data_path: null
      valid_data_path: null
      test_data_path: null

      # ckpt
      finetune: false
      auto_continue_train: false
      load: null
      no_load_optim: null
      no_load_rng: null
      save: null
      save_interval: 20000
      no_save_optim: null
      no_save_rng: null
      disable_last_saving: true

      # recompute
      recompute_granularity: full # full, selective
      recompute_method: block # uniform, block
      recompute_num_layers: 80 # int
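
The 72B recipe pairs torch FSDP2 sharding (use_torch_fsdp2: true, use_distributed_optimizer: false) with full, block-wise activation recomputation across all 80 layers, trading an extra forward pass per block for the activation memory needed to fit the model. The sketch below illustrates the recomputation idea with torch.utils.checkpoint; it is a generic example with arbitrary layer sizes, not Megatron's recompute implementation.

import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

# Generic illustration of block-wise activation recomputation: activations
# inside each checkpointed block are discarded after the forward pass and
# recomputed during backward, cutting peak memory at the cost of one extra
# forward per block. Layer count and width are placeholder values.
class CheckpointedStack(nn.Module):
    def __init__(self, num_layers: int = 4, hidden: int = 1024):
        super().__init__()
        self.layers = nn.ModuleList(
            nn.Sequential(nn.Linear(hidden, hidden), nn.GELU())
            for _ in range(num_layers)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for layer in self.layers:
            # use_reentrant=False selects the non-reentrant checkpoint path
            x = checkpoint(layer, x, use_reentrant=False)
        return x

model = CheckpointedStack()
out = model(torch.randn(8, 1024, requires_grad=True))
out.sum().backward()  # activations are recomputed block by block here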

examples/megatron/configs/qwen2.5_7B-pretrain.yaml (new file)

Lines changed: 69 additions & 0 deletions

work_group: ${TEAM:amd}
user_name: ${USER:root}
exp_name: ${EXP_NAME:qwen2.5_7B-pretrain}
workspace: ./output

modules:
  pre_trainer:
    framework: megatron
    config: pre_trainer.yaml
    # data_path: ./data

    # model to run
    model: qwen2.5_7B.yaml
    overrides:
      # log
      wandb_project: "Primus_Qwen2.5_7B_Pretrain"
      # disable_wandb: false
      # disable_tensorboard: false
      stderr_sink_level: DEBUG

      log_avg_skip_iterations: 2
      log_avg_reset_interval: 50

      train_iters: 50
      micro_batch_size: 10
      global_batch_size: 640

      seq_length: ${SEQ_LENGTH:2048}
      max_position_embeddings: ${MAX_POSITION_EMBEDDINGS:131072}

      lr: 1.0e-5
      min_lr: 0.0
      lr_warmup_iters: 2
      lr_decay_iters: 320000
      lr_decay_style: cosine
      weight_decay: 0.1
      adam_beta1: 0.9
      adam_beta2: 0.95
      eod_mask_loss: true
      init_method_std: 0.008
      norm_epsilon: 1.0e-6

      # parallel
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      overlap_grad_reduce: true
      overlap_param_gather: true

      # data
      mock_data: true
      train_data_path: null
      valid_data_path: null
      test_data_path: null

      # ckpt
      finetune: false
      auto_continue_train: ${CONTI_PARAMS:0}
      load: null
      no_load_optim: null
      no_load_rng: null
      save: null
      save_interval: 20000
      no_save_optim: null
      no_save_rng: null
      disable_last_saving: true
      ckpt_format: torch

      gradient_accumulation_fusion: false

llama3.3_70B.yaml (new model config)

Lines changed: 7 additions & 0 deletions

bases:
  - llama3_70B.yaml

tokenizer_type: Llama3Tokenizer
tokenizer_model: meta-llama/Llama-3.3-70B-Instruct

max_position_embeddings: 131072
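
This model file is a thin overlay: bases: pulls in llama3_70B.yaml, and the file then overrides only the tokenizer settings and the maximum context length (131072). A minimal sketch of that base-plus-override layering is below; the load_model_config helper and its deep-merge behaviour are assumptions for illustration, not Primus's actual config loader.

from pathlib import Path
import yaml  # PyYAML

# Hypothetical sketch of 'bases:'-style config layering: load each listed base
# file first, then overlay the current file's keys on top. Not Primus code.
def deep_merge(base: dict, overlay: dict) -> dict:
    merged = dict(base)
    for key, value in overlay.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

def load_model_config(path: Path) -> dict:
    cfg = yaml.safe_load(path.read_text())
    result: dict = {}
    for base_name in cfg.pop("bases", []):
        result = deep_merge(result, load_model_config(path.parent / base_name))
    return deep_merge(result, cfg)

# Under these assumptions, llama3.3_70B.yaml would resolve to the llama3_70B
# settings with tokenizer_model and max_position_embeddings overridden.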
