
Commit 7861d66

- Updated dmoe config
- Fixed merge issues
- Fixed moe sequence parallelism bug
- Added assertions for zero stages 2 and 3 with moe
- Updated moe requirements
1 parent 7b9679a commit 7861d66

File tree

5 files changed: +32 −22 lines

- configs/125M-dmoe.yml
- megatron/model/transformer.py
- megatron/mpu/__init__.py
- megatron/neox_arguments/arguments.py
- requirements/requirements-moe.txt

Diff for: configs/125M-dmoe.yml (+24 −14)

@@ -2,18 +2,20 @@
 {
   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
   # across the node boundaries )
-  "pipe_parallel_size": 2, # MoE supports PP
-  "model_parallel_size": 2, # MoE uses model parallel group to split both experts and attention weights
+  "pipe_parallel_size": 1, # MoE supports PP
+  "model_parallel_size": 1, # MoE uses model parallel group to split both experts and attention weights

   # model settings
   "num_layers": 12,
-  "hidden_size": 1024,
-  "num_attention_heads": 16,
+  "hidden_size": 768,
+  "num_attention_heads": 12,
   "seq_length": 2048,
   "max_position_embeddings": 2048,
   "norm": "layernorm",
   "pos_emb": "rotary",
   "no_weight_tying": true,
+  "gpt_j_residual": false,
+  "output_layer_parallelism": "column",

   # moe settings
   "moe_num_experts": 8,
@@ -24,19 +26,24 @@
   "rope_fusion": false,
   "layernorm_fusion": false,

-
+  # init methods
+  "init_method": "small_init",
+  "output_layer_init_method": "wang_init",
+
   # optimizer settings
   "optimizer": {
     "type": "Adam",
     "params": {
       "lr": 0.0006,
-      "betas": [0.9, 0.999],
+      "betas": [0.9, 0.95],
       "eps": 1.0e-8,
     }
   },
+  "min_lr": 0.00006,
+
   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
   "zero_optimization": {
-    "stage": 0,
+    "stage": 1,
     "allgather_partitions": True,
     "allgather_bucket_size": 500000000,
     "overlap_comm": True,
@@ -48,7 +55,6 @@
   # batch / data settings
   "train_micro_batch_size_per_gpu": 4,
   "data_impl": "mmap",
-  "split": "949,50,1",

   # activation checkpointing
   "checkpoint_activations": true,
@@ -58,26 +64,30 @@

   # regularization
   "gradient_clipping": 1.0,
-  "weight_decay": 0.0,
+  "weight_decay": 0.1,
   "hidden_dropout": 0.0,
   "attention_dropout": 0.0,

   "precision": "bfloat16",

   "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32
+
   # misc. training settings
-  "train_iters": 5,
+  "train_iters": 320000,
   "lr_decay_iters": 320000,
   "distributed_backend": "nccl",
-  "min_lr": 0.0006,
-  "warmup": 0.0,
+  "lr_decay_style": "cosine",
+  "warmup": 0.1,
   "checkpoint_factor": 10000,
   "eval_interval": 1000,
   "eval_iters": 10,

   # logging
-  "log_interval": 1,
-  "steps_per_print": 1,
+  "log_interval": 100,
+  "steps_per_print": 10,
   "keep_last_n_checkpoints": 4,
   "wall_clock_breakdown": true,
+
+  # networking
+  "hostfile": "/mock_path"
 }
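The updated config shrinks the model to a standard 125M shape and switches to a full cosine-decay training schedule. As a quick sanity check on the new values, here is a minimal standalone Python sketch (values copied from the diff above; treating warmup as a fraction of lr_decay_iters is an assumption, not something the diff states):

# Sanity-check the updated 125M-dmoe.yml values (standalone sketch, not repository code).
hidden_size = 768
num_attention_heads = 12
lr = 0.0006
min_lr = 0.00006
lr_decay_iters = 320000
warmup = 0.1

head_dim = hidden_size // num_attention_heads  # 64, the usual head size for a 125M model
floor_ratio = min_lr / lr                      # 0.1: LR decays to 10% of its peak
warmup_iters = int(warmup * lr_decay_iters)    # 32000, assuming warmup is a fraction of the decay schedule
print(head_dim, floor_ratio, warmup_iters)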

Diff for: megatron/model/transformer.py (+1 −6)

@@ -928,6 +928,7 @@ def __init__(

         super().__init__()
         self.layer_number = layer_number
+        self.neox_args = neox_args

         norm, eps = get_norm(neox_args)

@@ -1014,12 +1015,6 @@ def get_te_lnmlp(**kw):
                 **kw,
             )

-        self.num_experts = (
-            neox_args.moe_num_experts
-            if layer_number % neox_args.expert_interval == 0
-            else 1
-        )
-
         if self.num_experts <= 1:
             if neox_args.te_layernorm_mlp:
                 self.mlp = get_te_lnmlp()
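For context, the deleted block chose the per-layer expert count from expert_interval; the attribute is presumably still assigned elsewhere in __init__ (outside this hunk), since the surviving branch keeps reading self.num_experts. A standalone sketch of that selection logic, using the names from the removed lines:

# Interval-based expert selection as in the removed lines: every expert_interval-th
# layer becomes an MoE layer, the rest stay dense (a single "expert").
def experts_for_layer(layer_number: int, moe_num_experts: int, expert_interval: int) -> int:
    return moe_num_experts if layer_number % expert_interval == 0 else 1

print([experts_for_layer(n, 8, 2) for n in range(6)])  # [8, 1, 8, 1, 8, 1]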

Diff for: megatron/mpu/__init__.py (+1 −0)

@@ -39,6 +39,7 @@
 from .initialize import get_expert_token_counts_for_rank
 from .initialize import initialize_model_parallel
 from .initialize import model_parallel_is_initialized
+from .initialize import get_fp32_allreduce

 from .layers import ColumnParallelLinear
 from .layers import RowParallelLinear
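The newly re-exported get_fp32_allreduce presumably reports whether the fp32_allreduce setting is active (compare the bf16 comment in the config diff above). A hypothetical sketch of how such a flag could be consumed; the helper below is illustrative and is not the repository's actual gradient path:

import torch
import torch.distributed as dist

def allreduce_grad(grad: torch.Tensor, fp32_allreduce: bool) -> torch.Tensor:
    # Hypothetical helper: upcast bf16 gradients so the all-reduce runs in fp32,
    # then cast back; otherwise reduce in the native dtype.
    if fp32_allreduce and grad.dtype == torch.bfloat16:
        buf = grad.float()
        dist.all_reduce(buf)
        return buf.to(grad.dtype)
    dist.all_reduce(grad)
    return grad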

Diff for: megatron/neox_arguments/arguments.py (+4 −0)

@@ -1084,6 +1084,10 @@ def calculate_derived(self):
         # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs
         self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1)

+        # MoE config
+        if self.moe_num_experts > 1:
+            assert self.zero_optimization["stage"] < 2, "MoE is not compatible with zero stages 2 and 3"
+
         # Attention config
         if self.attention_config is None:
             self.update_value("attention_config", [[["global"], self.num_layers]])
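The new guard makes MoE runs fail fast when the DeepSpeed ZeRO stage is set to 2 or 3. A minimal standalone sketch of the same check, with the config values from 125M-dmoe.yml above hard-coded for illustration:

# Mirrors the assertion added in calculate_derived(): MoE requires ZeRO stage 0 or 1.
def check_moe_zero_compat(moe_num_experts: int, zero_stage: int) -> None:
    if moe_num_experts > 1:
        assert zero_stage < 2, "MoE is not compatible with zero stages 2 and 3"

check_moe_zero_compat(moe_num_experts=8, zero_stage=1)    # passes, matches the config above
# check_moe_zero_compat(moe_num_experts=8, zero_stage=2)  # would raise AssertionError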

Diff for: requirements/requirements-moe.txt (+2 −2)

@@ -1,2 +1,2 @@
-grouped-gemm==0.1.4
-megablocks==0.5.1
+grouped-gemm==0.1.6
+megablocks==0.7.0
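To check that an environment matches the new pins, package metadata can be queried directly; a small sketch using only the distribution names from the requirements file above:

from importlib.metadata import PackageNotFoundError, version

for name, expected in (("grouped-gemm", "0.1.6"), ("megablocks", "0.7.0")):
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "not installed"
    print(f"{name}: expected {expected}, found {installed}")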
