---
# Primus experiment config: DeepSeek-V3 pretraining via the Megatron backend.
# `${VAR:default}` placeholders are resolved by Primus env substitution at launch.
work_group: ${TEAM:amd}
user_name: ${USER:root}
exp_name: ${EXP_NAME:deepseek_v3-pretrain}
workspace: ./output

modules:
  pre_trainer:
    framework: megatron
    config: pre_trainer.yaml

    # model to run
    model: ${PRIMUS_MODEL:deepseek_v3}.yaml
    overrides:
      # log
      wandb_project: "Primus_DeepSeek_Pretrain"
      stderr_sink_level: DEBUG

      # debug
      moe_router_force_load_balancing: true
      log_avg_skip_iterations: 2
      log_avg_reset_interval: 50

      # hyper parameters
      train_iters: 50
      micro_batch_size: 4
      global_batch_size: 256
      seq_length: ${PRIMUS_SEQ_LENGTH:4096}
      max_position_embeddings: ${PRIMUS_MAX_POSITION_EMBEDDINGS:4096}
      lr: 1.0e-5
      min_lr: 0.0
      lr_warmup_iters: 2
      lr_decay_iters: null
      lr_decay_style: cosine
      weight_decay: 0.1
      adam_beta1: 0.9
      adam_beta2: 0.95
      eod_mask_loss: true
      init_method_std: 0.008
      norm_epsilon: 1.0e-6

      # parallel
      tensor_model_parallel_size: ${PRIMUS_TP:1}
      pipeline_model_parallel_size: ${PRIMUS_PP:1}
      expert_model_parallel_size: ${PRIMUS_EP:8}
      overlap_grad_reduce: true
      overlap_param_gather: true

      # data
      mock_data: true
      train_data_path: ${TOKENIZED_DATA_PATH:null}
      valid_data_path: null
      test_data_path: null

      # fusion
      # 20250321: need latest megatron docker image
      moe_permute_fusion: false
      # fused wgrad gemm and accumulation
      gradient_accumulation_fusion: false
      # recommend set `false` in fp8
      moe_use_legacy_grouped_gemm: true
      # fused topk router with aux score
      moe_use_fused_router_with_aux_score: false
      # pad 192/128 for deepseek attention
      fused_padded_mla_attention: false

      # Performance toggles
      # multi_latent_attention: false
      # apply_rope_fusion: true

      # ckpt
      finetune: false
      auto_continue_train: false
      load: null
      no_load_optim: null
      no_load_rng: null
      save: null
      save_interval: 20000
      no_save_optim: null
      no_save_rng: null
      disable_last_saving: true
      ckpt_format: torch
      eval_iters: 0