Commit e16b27b (1 parent: 5fcbe85)

update common perf arguments: cross-entropy (CE) fusion, MoE grouped GEMMs

13 files changed: 57 additions (+), 5 deletions (-)
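Every config touched by this commit picks up the same performance-related changes: in the MoE configs (DeepSeek V2 Lite, DeepSeek V3, Mixtral 8x22B) `moe_use_legacy_grouped_gemm` flips from `true` to `false`, and a new "Cross entropy flags" block is appended to all files. A minimal sketch of the combined settings, taken from the diffs below (indentation and placement under `modules:` follow each individual file):

    # recommend set `false` in fp8
    moe_use_legacy_grouped_gemm: false

    # Cross entropy flags
    cross_entropy_fusion_impl: "te"
    cross_entropy_loss_fusion: true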

examples/megatron/configs/deepseek_v2_lite-pretrain.yaml (6 additions, 2 deletions)

@@ -57,7 +57,7 @@ modules:
 # fused wgrad gemm and accumulation
 gradient_accumulation_fusion: false
 # recommend set `false` in fp8
-moe_use_legacy_grouped_gemm: true
+moe_use_legacy_grouped_gemm: false
 # fused topk router with aux score
 moe_use_fused_router_with_aux_score: false
 # pad 192/128 for deepseek attention
@@ -82,4 +82,8 @@ modules:
 # Turbo
 enable_primus_turbo: true
 use_turbo_attention: true
-use_turbo_grouped_mlp: true
+use_turbo_grouped_mlp: true
+
+# Cross entropy flags
+cross_entropy_fusion_impl: "te"
+cross_entropy_loss_fusion: true

examples/megatron/configs/deepseek_v3-pretrain.yaml (5 additions, 1 deletion)

@@ -57,7 +57,7 @@ modules:
 # fused wgrad gemm and accumulation
 gradient_accumulation_fusion: false
 # recommend set `false` in fp8
-moe_use_legacy_grouped_gemm: true
+moe_use_legacy_grouped_gemm: false
 # fused topk router with aux score
 moe_use_fused_router_with_aux_score: false
 # pad 192/128 for deepseek attention
@@ -85,3 +85,7 @@ modules:
 enable_primus_turbo: true
 use_turbo_attention: true
 use_turbo_grouped_mlp: true
+
+# Cross entropy flags
+cross_entropy_fusion_impl: "te"
+cross_entropy_loss_fusion: true

examples/megatron/configs/llama2_70B-pretrain.yaml (4 additions, 0 deletions)

@@ -75,3 +75,7 @@ modules:
 enable_primus_turbo: true
 use_turbo_attention: true
 use_turbo_grouped_mlp: true
+
+# Cross entropy flags
+cross_entropy_fusion_impl: "te"
+cross_entropy_loss_fusion: true

examples/megatron/configs/llama2_7B-pretrain.yaml (4 additions, 0 deletions)

@@ -78,3 +78,7 @@ modules:
 # overlap_param_gather: false
 # ckpt_format: torch
 # sequence_parallel: 1
+
+# Cross entropy flags
+cross_entropy_fusion_impl: "te"
+cross_entropy_loss_fusion: true

examples/megatron/configs/llama3.1_70B-pretrain.yaml (4 additions, 0 deletions)

@@ -74,3 +74,7 @@ modules:
 # Turbo
 enable_primus_turbo: true
 use_turbo_attention: true
+
+# Cross entropy flags
+cross_entropy_fusion_impl: "te"
+cross_entropy_loss_fusion: true

examples/megatron/configs/llama3.1_8B-pretrain.yaml (4 additions, 0 deletions)

@@ -65,3 +65,7 @@ modules:
 no_save_rng: null
 disable_last_saving: true
 ckpt_format: torch
+
+# Cross entropy flags
+cross_entropy_fusion_impl: "te"
+cross_entropy_loss_fusion: true

examples/megatron/configs/llama3.3_70B-pretrain.yaml (4 additions, 0 deletions)

@@ -75,3 +75,7 @@ modules:
 enable_primus_turbo: true
 use_turbo_attention: true
 use_turbo_grouped_mlp: true
+
+# Cross entropy flags
+cross_entropy_fusion_impl: "te"
+cross_entropy_loss_fusion: true

examples/megatron/configs/llama3_70B-pretrain.yaml (4 additions, 0 deletions)

@@ -75,3 +75,7 @@ modules:
 enable_primus_turbo: true
 use_turbo_attention: true
 use_turbo_grouped_mlp: true
+
+# Cross entropy flags
+cross_entropy_fusion_impl: "te"
+cross_entropy_loss_fusion: true

examples/megatron/configs/llama3_8B-pretrain.yaml (4 additions, 0 deletions)

@@ -71,3 +71,7 @@ modules:
 enable_primus_turbo: true
 use_turbo_attention: true
 use_turbo_grouped_mlp: true
+
+# Cross entropy flags
+cross_entropy_fusion_impl: "te"
+cross_entropy_loss_fusion: true

examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml (5 additions, 1 deletion)

@@ -59,7 +59,7 @@ modules:
 # fusion
 moe_permute_fusion: false
 gradient_accumulation_fusion: false
-moe_use_legacy_grouped_gemm: true
+moe_use_legacy_grouped_gemm: false

 # ckpt
 finetune: false
@@ -73,3 +73,7 @@ modules:
 no_save_rng: null
 disable_last_saving: true
 ckpt_format: torch
+
+# Cross entropy flags
+cross_entropy_fusion_impl: "te"
+cross_entropy_loss_fusion: true
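Taken together, the tail of a Turbo-enabled example config ends up reading roughly like the sketch below after this commit (assembled from the context and added lines above; the exact keys preceding the new block vary per file):

    # Turbo
    enable_primus_turbo: true
    use_turbo_attention: true
    use_turbo_grouped_mlp: true

    # Cross entropy flags
    cross_entropy_fusion_impl: "te"
    cross_entropy_loss_fusion: true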
