
Commit 143ee71

fix moe freq config (#12)
1 parent dbb8f97 commit 143ee71

File tree: 9 files changed (+39, -16 lines)


examples/deepseek/exp_pretrain.yaml (16 additions, 3 deletions)

@@ -17,20 +17,33 @@ modules:
 # log
 wandb_project: "Primus_DeepSeek_Pretrain"
 # disable_wandb: false
+# disable_tensorboard: false
 stderr_sink_level: DEBUG

 # debug
-num_layers: 5
-optimizer: adam
+# num_layers: 5
+# optimizer: adam
 moe_router_force_load_balancing: true
 moe_router_dtype: fp32
 log_avg_skip_iterations: 2
 log_avg_reset_interval: 5
+# ddp_bucket_size: 629145600
+
+# recompute
+# recompute_granularity: full # full, selective
+# recompute_method: block # uniform, block
+# recompute_num_layers: 1 # int
+
+# profile
+# profile: true
+# use_pytorch_profiler: true
+# profile_step_end: 7
+# profile_step_start: 6

 # hyber parameters
 train_iters: 10
 micro_batch_size: 1
-global_batch_size: 16
+global_batch_size: 64
 seq_length: 4096
 max_position_embeddings: 4096
 lr: 1.0e-5
examples/deepseek/run_pretrain.sh (5 additions, 2 deletions)

@@ -42,8 +42,10 @@ export NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7
 export NCCL_IB_GID_INDEX=3
 export NCCL_CROSS_NIC=0
 export HSA_ENABLE_SDMA=0
-export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-ens51f0}
-export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-ens51f0}
+IP_INTERFACE=$(ip -o -4 addr show | awk -v ip="$(hostname -I | awk '{print $1}')" '$4 ~ ip {print $2}')
+export IP_INTERFACE
+export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-${IP_INTERFACE}}
+export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-${IP_INTERFACE}}
 export CUDA_DEVICE_MAX_CONNECTIONS=1 # Reducing to 1 ensures no PCIE traffic (even on single node)
 export NCCL_PROTO=Simple
 export RCCL_MSCCL_ENABLE=0
@@ -108,6 +110,7 @@ export HIP_VISIBLE_DEVICES=$gpus

 echo "[NODE-$NODE_RANK] MASTER_ADDR: $MASTER_ADDR"
 echo "[NODE-$NODE_RANK] MASTER_PORT: $MASTER_PORT"
+echo "[NODE-$NODE_RANK] IP_INTERFACE: $IP_INTERFACE"
 echo "[NODE-$NODE_RANK] NNODES: $NNODES"
 echo "[NODE-$NODE_RANK] NODE_RANK: $NODE_RANK"
 echo "[NODE-$NODE_RANK] GPUS_PER_NODE: $GPUS_PER_NODE"

examples/deepseek/run_slurm_pretrain.sh (1 addition, 2 deletions)

@@ -8,8 +8,7 @@ SCRIPT_DIR=$(dirname "$(realpath "${BASH_SOURCE[0]}")")
 echo "Current script path: $SCRIPT_DIR"

 export RUN_ENV=slurm
-export NCCL_SOCKET_IFNAME=bond0
-export GLOO_SOCKET_IFNAME=bond0
+export MODEL_CONFIG=deepseek_v2_lite

 srun -N 2 \
     --gres=gpu:8 \

primus/configs/models/megatron/deepseek_v3.yaml (1 addition, 1 deletion)

@@ -24,7 +24,7 @@ kv_channels: 128
 mtp_num_layers: 1
 mtp_loss_scaling_factor: 0.1
 # moe
-moe_layer_freq: 3
+moe_layer_freq: "([0]*3+[1]*58)"
 num_experts: 256
 moe_router_topk: 8
 # num_shared_experts: 1
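An integer moe_layer_freq can only express a uniform "every N-th layer is MoE" cadence, which cannot describe DeepSeek-V3's layout of three leading dense layers followed by 58 MoE layers; hence the switch to an explicit pattern string. A minimal sketch of what the pattern expands to, assuming Megatron's convention that the string is a Python list expression with one 0 (dense) or 1 (MoE) entry per transformer layer:

# Expanding the pattern string, assuming Megatron's 0/1-per-layer convention.
pattern = eval("([0]*3+[1]*58)")   # -> [0, 0, 0, 1, 1, ..., 1]
assert len(pattern) == 61          # DeepSeek-V3 has 61 transformer layers
assert pattern[:3] == [0, 0, 0]    # the first three layers stay dense
assert all(pattern[3:])            # every remaining layer routes to experts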

primus/configs/models/megatron/deepseek_v3_17B.yaml (1 addition, 1 deletion)

@@ -20,7 +20,7 @@ qk_pos_emb_head_dim: 0
 v_head_dim: 128
 kv_channels: 128
 # moe
-moe_layer_freq: 2
+moe_layer_freq: "([0]*2+[1]*26)"
 num_experts: 96
 moe_router_topk: 6
 # num_shared_experts: 1

primus/configs/models/megatron/deepseek_v3_393B.yaml (1 addition, 1 deletion)

@@ -20,7 +20,7 @@ qk_pos_emb_head_dim: 0
 v_head_dim: 128
 kv_channels: 128
 # moe
-moe_layer_freq: 2
+moe_layer_freq: "([0]*2+[1]*62)"
 num_experts: 256
 moe_router_topk: 8
 # num_shared_experts: 1

primus/configs/models/megatron/deepseek_v3_45B.yaml (1 addition, 1 deletion)

@@ -20,7 +20,7 @@ qk_pos_emb_head_dim: 64
 v_head_dim: 128
 kv_channels: 128
 # moe
-moe_layer_freq: 2
+moe_layer_freq: "([0]*2+[1]*30)"
 num_experts: 96
 moe_router_topk: 4
 # num_shared_experts: 1

primus/configs/modules/megatron/trainer_base.yaml (5 additions, 4 deletions)

@@ -184,9 +184,9 @@ adlr_autoresume_interval: 1000

 # activation recomputation
 recompute_activations: false
-recompute_granularity: null
-recompute_method: null
-recompute_num_layers: null
+recompute_granularity: null # full, selective
+recompute_method: null # uniform, block
+recompute_num_layers: null # int
 distribute_saved_activations: false
 checkpoint_activations: false # deprecated

@@ -224,6 +224,7 @@ mmap_bin_files: true

 #profile:
 profile: false
+use_pytorch_profiler: false
 profile_ranks: [0]
 profile_step_end: 12
 profile_step_start: 10

@@ -329,7 +330,7 @@ straggler_minmax_count: 1
 inference_batch_times_seqlen_threshold: -1
 inference_dynamic_batching: false
 inference_dynamic_batching_buffer_size_gb: 40.0 # float
-inference_dynamic_batching_buffer_guaranteed_fraction: 0.2 # float
+inference_dynamic_batching_buffer_guaranteed_fraction: 0.2 # float
 inference_dynamic_batching_buffer_overflow_factor: null # float
 inference_dynamic_batching_max_requests_override: null # int
 inference_dynamic_batching_max_tokens_override: null # int

primus/modules/trainer/megatron/trainer.py (8 additions, 1 deletion)

@@ -77,7 +77,11 @@
     global_vars,
     one_logger_utils,
 )
-from megatron.training.arguments import core_transformer_config_from_args, validate_args
+from megatron.training.arguments import (
+    core_transformer_config_from_args,
+    moe_freq_type,
+    validate_args,
+)
 from megatron.training.async_utils import (
     init_persistent_async_worker,
     maybe_finalize_async_save,
@@ -404,6 +408,9 @@ def update_primus_config(
         if args.iterations_to_skip is None:
             args.iterations_to_skip = []

+        # support moe_freq_type
+        args.moe_layer_freq = moe_freq_type(args.moe_layer_freq)
+
     def vocab_size_with_padding(self, orig_vocab_size, args):
         """Pad vocab size so it is divisible by model parallel size and
         still having GPU friendly size."""
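Primus reads moe_layer_freq from YAML as a raw value, so update_primus_config re-parses it with Megatron's moe_freq_type before validation; without this step the quoted pattern string would reach validate_args as plain text. A simplified stand-in for that helper, written from its apparent contract (the real implementation lives in megatron.training.arguments and may differ in detail):

# Simplified stand-in for megatron.training.arguments.moe_freq_type (assumed
# behavior): integers and digit strings become an int frequency; anything else
# is evaluated as a Python list expression giving the per-layer 0/1 pattern.
def moe_freq_type(value):
    if isinstance(value, int):
        return value
    if value.isdigit():
        return int(value)
    return eval(value)

assert moe_freq_type(3) == 3
assert moe_freq_type("2") == 2
assert moe_freq_type("([0]*2+[1]*26)") == [0] * 2 + [1] * 26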
