Commit 9940a17

moe config and megatron init (#5)
1 parent 5ab247e commit 9940a17

29 files changed: +597 -87 lines

.github/workflows/ci.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-name: Megatron-LM-X-CI
+name: Primus-CI

 on:
   workflow_dispatch:
@@ -16,7 +16,7 @@ jobs:
       matrix:
         python-version: ["3.12"]
     steps:
-      - run: echo "🎉 Begin Megatron-LM-X Python Lint."
+      - run: echo "🎉 Begin Primus Python Lint."
       - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!"
       - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
       - uses: actions/checkout@v4

examples/deepseek_v3/exp_pretrain.yaml

Lines changed: 30 additions & 9 deletions
@@ -14,28 +14,49 @@ modules:
     config: pre_trainer.yaml
     model: deepseek_v3_45BA3B.yaml
     overrides:
+      # log
       wandb_project: "Primus_DeepSeekV3_Pretrain"
-      # TODO
-      disable_wandb: true
+      # disable_wandb: false
       stderr_sink_level: DEBUG

-      # TODO
-      build_tokenizer: false
+      # debug
+      num_layers: 4

+      # hyber parameters
       train_iters: 10
       micro_batch_size: 1
       global_batch_size: 16
       seq_length: 4096
-      tensor_model_parallel_size: 1
-      pipeline_model_parallel_size: 1
-      # TODO(wenx)
-      expert_model_parallel_size: 1
-
+      max_position_embeddings: 4096
       lr: 1.0e-5
       min_lr: 0.0
       lr_warmup_iters: 2
       lr_decay_iters: null
       lr_decay_style: cosine

+      # parallel
+      tensor_model_parallel_size: 1
+      pipeline_model_parallel_size: 1
+      expert_model_parallel_size: 8
+
+      # data
+      train_data_path: /home/azureuser/tas-public/data/deepseek-datasets/mmap_deepseekv2_datasets_text_document
+      valid_data_path: /home/azureuser/tas-public/data/deepseek-datasets/mmap_deepseekv2_datasets_text_document
+      test_data_path: /home/azureuser/tas-public/data/deepseek-datasets/mmap_deepseekv2_datasets_text_document
+
+      # fusion
+      # 20250317: need latest apex in docker image
+      gradient_accumulation_fusion: false
+      # 20250317: TE grouped gemm has numerical issue
+      moe_use_legacy_grouped_gemm: true
+
+      # ckpt
       finetune: false
       auto_continue_train: true
+      load: null
+      no_load_optim: null
+      no_load_rng: null
+      save: null
+      save_interval: 20000
+      no_save_optim: null
+      no_save_rng: null
examples/deepseek_v3/run_pretrain.sh

Lines changed: 3 additions & 3 deletions
@@ -11,6 +11,8 @@ export PYTHONPATH=${SITE_PACKAGES}:${MEGATRON_PATH}:${PRIMUS_PATH}:${PYTHONPATH}
     echo "MEGATRON_PATH path is not set"
     exit 1
 }
+# build helper_cpp
+pushd "${MEGATRON_PATH}/megatron/core/datasets" && make && popd || exit 1

 export EXP_CONFIG=${EXP_CONFIG:-examples/deepseek_v3/exp_pretrain.yaml}

@@ -32,9 +34,6 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 # export AMD_SERIALIZE_KERNEL=3
 # export HSA_NO_SCRATCH_RECLAIM=1

-# TODO(wenx)
-export GPUS_PER_NODE=2
-
 # cluster node envs
 RUN_ENV="${RUN_ENV:-torchrun}"
 if [ "$RUN_ENV" = "torchrun" ]; then
@@ -71,6 +70,7 @@ echo "GPUS_PER_NODE: $GPUS_PER_NODE"
 echo "HIP_VISIBLE_DEVICES: $HIP_VISIBLE_DEVICES"
 echo ""

+
 DISTRIBUTED_ARGS=(
     --nproc_per_node "${GPUS_PER_NODE}"
     --nnodes "${NNODES}"
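The new # build helper_cpp step compiles Megatron's C++ dataset helpers before training starts, and the DISTRIBUTED_ARGS array is what eventually gets handed to torchrun. For orientation, a minimal single-node invocation might look like the sketch below; the entry-point script (TRAIN_SCRIPT) and the address/port values are placeholders, not taken from this repo, and only standard torchrun flags are used.

# Hedged single-node launch sketch; TRAIN_SCRIPT and the master address/port
# are hypothetical placeholders, while the torchrun flags are standard.
export GPUS_PER_NODE=8
export NNODES=1
export NODE_RANK=0
export MASTER_ADDR=localhost
export MASTER_PORT=29500

DISTRIBUTED_ARGS=(
    --nproc_per_node "${GPUS_PER_NODE}"
    --nnodes "${NNODES}"
    --node_rank "${NODE_RANK}"
    --master_addr "${MASTER_ADDR}"
    --master_port "${MASTER_PORT}"
)

EXP_CONFIG=examples/deepseek_v3/exp_pretrain.yaml \
    torchrun "${DISTRIBUTED_ARGS[@]}" "${TRAIN_SCRIPT}"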
3 files renamed without changes.
