 #!/bin/bash
+# shellcheck disable=SC2086
 
-# python path
-SITE_PACKAGES=$(python -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")
+
+# available model configs:
+# deepseek_v2_lite, deepseek_v2
+# deepseek_v3, deepseek_v3_17B, deepseek_v3_45B
+export MODEL_CONFIG=${MODEL_CONFIG:-deepseek_v2_lite}
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+# framework path
 PRIMUS_PATH=$(realpath "$(dirname "$0")/../..")
-export MEGATRON_PATH=${PRIMUS_PATH}/../Megatron-LM
-export PYTHONPATH=${SITE_PACKAGES}:${MEGATRON_PATH}:${PRIMUS_PATH}:${PYTHONPATH}
+export PRIMUS_PATH
+export MEGATRON_PATH=${MEGATRON_PATH:-${PRIMUS_PATH}/../Megatron-LM}
+echo "PRIMUS_PATH: $PRIMUS_PATH"
+echo "MEGATRON_PATH: $MEGATRON_PATH"
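+# NOTE: by default this assumes a Megatron-LM checkout sits next to the Primus
+# repository; set MEGATRON_PATH explicitly if it lives elsewhere.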
 
-# check the path
+# check megatron path
 [[ -z "${MEGATRON_PATH}" ]] && {
     echo "MEGATRON_PATH is not set"
     exit 1
 }
-# build helper_cpp
-pushd "${MEGATRON_PATH}/megatron/core/datasets" && make && popd || exit 1
 
-# available model configs:
-# deepseek_v2_lite, deepseek_v2
-# deepseek_v3, deepseek_v3_17B, deepseek_v3_45B
-export MODEL_CONFIG=deepseek_v2_lite
+# data
+mkdir -p "${PRIMUS_PATH}"/data/deepseek-datasets
+export HF_HOME="${PRIMUS_PATH}"/data/huggingface
+export DATA_PATH="${PRIMUS_PATH}"/data/deepseek-datasets/mmap_deepseekv2_datasets_text_document
+echo "HF_HOME: $HF_HOME"
+echo "DATA_PATH: $DATA_PATH"
+if [[ ! -f "${DATA_PATH}.bin" || ! -f "${DATA_PATH}.idx" ]]; then
+    echo "Error: Missing required DeepSeek dataset files. \
+Please follow the README.md and download ${DATA_PATH}.bin and ${DATA_PATH}.idx."
+    exit 1
+fi
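+# NOTE: DATA_PATH refers to a Megatron mmap-indexed dataset: the .bin file holds the
+# tokenized documents and the .idx file the index, both produced offline per the README.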
 
 # network envs
 export OMP_NUM_THREADS=1
@@ -28,49 +42,76 @@ export NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7
 export NCCL_IB_GID_INDEX=3
 export NCCL_CROSS_NIC=0
 export HSA_ENABLE_SDMA=0
-export GLOO_SOCKET_IFNAME=eth0
+export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
+export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export CUDA_DEVICE_MAX_CONNECTIONS=1 # Reducing to 1 ensures no PCIE traffic (even on single node)
 export NCCL_PROTO=Simple
 export RCCL_MSCCL_ENABLE=0
-export CUDA_DEVICE_MAX_CONNECTIONS=1
 # export AMD_LOG_LEVEL=3
 # export AMD_SERIALIZE_KERNEL=3
 # export HSA_NO_SCRATCH_RECLAIM=1
 
+export GEMM_TUNING=0
+export NVTE_CK_USES_BWD_V3=1
+echo "GEMM_TUNING: $GEMM_TUNING"
+echo "NVTE_CK_USES_BWD_V3: $NVTE_CK_USES_BWD_V3"
+
+# gemm tuning, https://github.com/ROCm/TransformerEngine
+if [ "$GEMM_TUNING" -eq 1 ]; then
+    export TE_HIPBLASLT_TUNING_RUN_COUNT=10
+    export TE_HIPBLASLT_TUNING_ALGO_COUNT=50
+else
+    unset TE_HIPBLASLT_TUNING_RUN_COUNT
+    unset TE_HIPBLASLT_TUNING_ALGO_COUNT
+fi
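+# NOTE (assumption): when enabled, these TE knobs ask hipBLASLt to benchmark up to
+# ALGO_COUNT candidate algorithms, RUN_COUNT times each, the first time a GEMM shape
+# is seen, so expect extra warm-up time on the first iterations; see the ROCm
+# TransformerEngine README for the exact semantics.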
+
 # cluster node envs
 RUN_ENV="${RUN_ENV:-torchrun}"
+echo "RUN_ENV: $RUN_ENV"
 if [ "$RUN_ENV" = "torchrun" ]; then
     export MASTER_ADDR=${MASTER_ADDR:-localhost}
     export MASTER_PORT=${MASTER_PORT:-$(shuf -n 1 -i 10000-65535)}
     export NNODES=${NNODES:-1}
     export NODE_RANK=${NODE_RANK:-0}
     export GPUS_PER_NODE=${GPUS_PER_NODE:-8}
 elif [ "$RUN_ENV" = "slurm" ]; then
+    # use the first node as the master node
+    node_list=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+    mapfile -t node_array <<< "$node_list"
+    HEAD_NODE=${node_array[0]}
+
+    export SLURM_MASTER_ADDR=$HEAD_NODE
+    export SLURM_MASTER_PORT=29509
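+    # NOTE: fixed rendezvous port; this assumes 29509 is free on the head node, so
+    # change it if another job on that node already uses it.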
+    export SLURM_WORLD_SIZE=$((SLURM_NNODES * SLURM_GPUS_ON_NODE))
+
+    echo "[NODE-$SLURM_NODEID] NODELIST=${node_array[*]}"
+    echo "[NODE-$SLURM_NODEID] NODENAME=$SLURMD_NODENAME"
+    echo "[NODE-$SLURM_NODEID] SLURM_MASTER_ADDR=$SLURM_MASTER_ADDR"
+    echo "[NODE-$SLURM_NODEID] SLURM_MASTER_PORT=$SLURM_MASTER_PORT"
+    echo "[NODE-$SLURM_NODEID] SLURM_NNODES=$SLURM_NNODES"
+    echo "[NODE-$SLURM_NODEID] SLURM_GPUS_ON_NODE=$SLURM_GPUS_ON_NODE"
+    echo "[NODE-$SLURM_NODEID] SLURM_WORLD_SIZE=$SLURM_WORLD_SIZE"
+    echo "[NODE-$SLURM_NODEID] SLURM_CPUS_PER_TASK: $SLURM_CPUS_PER_TASK"
+    echo "[NODE-$SLURM_NODEID] SLURM_PROCID: $SLURM_PROCID"
+
     export MASTER_ADDR=${SLURM_MASTER_ADDR}
     export MASTER_PORT=${SLURM_MASTER_PORT}
-    export NNODES=$SLURM_NNODES
+    export NNODES=${SLURM_NNODES}
     export NODE_RANK=${SLURM_NODEID}
     export GPUS_PER_NODE=$((SLURM_WORLD_SIZE / SLURM_NNODES))
-    echo "Error: SLURM mode is not implemented yet!"
-    exit 1
 else
     echo "Error: Unknown RUN_ENV value: $RUN_ENV"
     exit 1
 fi
 gpus=$(seq -s, 0 $((GPUS_PER_NODE - 1)))
 export HIP_VISIBLE_DEVICES=$gpus
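+# expose GPUs 0..GPUS_PER_NODE-1 to this rank (e.g. "0,1,2,3,4,5,6,7" for 8 GPUs)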
 
-echo "RUN_ENV: $RUN_ENV"
-echo "PRIMUS_PATH: $PRIMUS_PATH"
-echo "MEGATRON_PATH: $MEGATRON_PATH"
-echo "SITE_PACKAGES: $SITE_PACKAGES"
-echo "MODEL_CONFIG: $MODEL_CONFIG"
-echo "MASTER_ADDR: $MASTER_ADDR"
-echo "MASTER_PORT: $MASTER_PORT"
-echo "NNODES: $NNODES"
-echo "NODE_RANK: $NODE_RANK"
-echo "GPUS_PER_NODE: $GPUS_PER_NODE"
-echo "HIP_VISIBLE_DEVICES: $HIP_VISIBLE_DEVICES"
+echo "[NODE-$NODE_RANK] MASTER_ADDR: $MASTER_ADDR"
+echo "[NODE-$NODE_RANK] MASTER_PORT: $MASTER_PORT"
+echo "[NODE-$NODE_RANK] NNODES: $NNODES"
+echo "[NODE-$NODE_RANK] NODE_RANK: $NODE_RANK"
+echo "[NODE-$NODE_RANK] GPUS_PER_NODE: $GPUS_PER_NODE"
+echo "[NODE-$NODE_RANK] HIP_VISIBLE_DEVICES: $HIP_VISIBLE_DEVICES"
 echo ""
 
 DISTRIBUTED_ARGS=(
@@ -84,6 +125,80 @@ DISTRIBUTED_ARGS=(
 mkdir -p output
 TRAIN_LOG=output/log_torchrun_pretrain_${MODEL_CONFIG}.txt
 
-torchrun "${DISTRIBUTED_ARGS[@]}" examples/deepseek/pretrain.py \
-    --exp examples/deepseek/exp_pretrain.yaml \
-    2>&1 | tee $TRAIN_LOG
+if [ "$RUN_ENV" = "torchrun" ]; then
+    SITE_PACKAGES=$(python -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")
+    export PYTHONPATH=${SITE_PACKAGES}:${MEGATRON_PATH}:${PRIMUS_PATH}:${PYTHONPATH}
+
+    # build helper_cpp of megatron
+    pushd "${MEGATRON_PATH}/megatron/core/datasets" && make && popd || exit 1
+
+    torchrun "${DISTRIBUTED_ARGS[@]}" examples/deepseek/pretrain.py \
+        --exp examples/deepseek/exp_pretrain.yaml \
+        2>&1 | tee $TRAIN_LOG
+
+elif [ "$RUN_ENV" = "slurm" ]; then
+    export DOCKER_IMAGE="docker.io/rocm/megatron-lm:latest"
+    # podman pull $DOCKER_IMAGE
+    echo "[NODE-$NODE_RANK] stop all podman containers..."
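+    # NOTE: the quoted command passed to bash -c below is expanded on the host, which
+    # works because MEGATRON_PATH and PRIMUS_PATH are bind-mounted at identical paths
+    # inside the container.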
+    podman stop -a && \
+        module load rocm && \
+        podman run \
+        --rm \
+        --env SLURM_MASTER_ADDR=$SLURM_MASTER_ADDR \
+        --env SLURM_MASTER_PORT=$SLURM_MASTER_PORT \
+        --env SLURM_PROCID=$SLURM_PROCID \
+        --env SLURM_WORLD_SIZE=$SLURM_WORLD_SIZE \
+        --env SLURM_NODEID=$SLURM_NODEID \
+        --env SLURM_NNODES=$SLURM_NNODES \
+        --env MASTER_ADDR=${MASTER_ADDR} \
+        --env MASTER_PORT=${MASTER_PORT} \
+        --env NNODES=${NNODES} \
+        --env NODE_RANK=${NODE_RANK} \
+        --env GPUS_PER_NODE=${GPUS_PER_NODE} \
+        --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES \
+        --env OMP_NUM_THREADS=$OMP_NUM_THREADS \
+        --env GPU_MAX_HW_QUEUES=$GPU_MAX_HW_QUEUES \
+        --env TORCH_NCCL_HIGH_PRIORITY=$TORCH_NCCL_HIGH_PRIORITY \
+        --env NCCL_CHECKS_DISABLE=$NCCL_CHECKS_DISABLE \
+        --env NCCL_IB_HCA=$NCCL_IB_HCA \
+        --env NCCL_IB_GID_INDEX=$NCCL_IB_GID_INDEX \
+        --env NCCL_CROSS_NIC=$NCCL_CROSS_NIC \
+        --env HSA_ENABLE_SDMA=$HSA_ENABLE_SDMA \
+        --env NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
+        --env GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
+        --env CUDA_DEVICE_MAX_CONNECTIONS=$CUDA_DEVICE_MAX_CONNECTIONS \
+        --env NCCL_PROTO=$NCCL_PROTO \
+        --env RCCL_MSCCL_ENABLE=$RCCL_MSCCL_ENABLE \
+        --env HF_HOME=$HF_HOME \
+        --env DATA_PATH=$DATA_PATH \
+        --env MODEL_CONFIG=$MODEL_CONFIG \
+        --env TE_HIPBLASLT_TUNING_RUN_COUNT=$TE_HIPBLASLT_TUNING_RUN_COUNT \
+        --env TE_HIPBLASLT_TUNING_ALGO_COUNT=$TE_HIPBLASLT_TUNING_ALGO_COUNT \
+        --env NVTE_CK_USES_BWD_V3=$NVTE_CK_USES_BWD_V3 \
+        --ipc=host --network=host \
+        --device=/dev/kfd --device=/dev/dri \
+        --cap-add=SYS_PTRACE --cap-add=CAP_SYS_ADMIN \
+        --security-opt seccomp=unconfined --group-add video \
+        --privileged --device=/dev/infiniband \
+        -v $MEGATRON_PATH:$MEGATRON_PATH \
+        -v $PRIMUS_PATH:$PRIMUS_PATH \
+        $DOCKER_IMAGE /bin/bash -c \
+        "echo $(date) && \
+        pip install -q loguru wandb && \
+        cd ${MEGATRON_PATH}/megatron/core/datasets && make && \
+        cd $PRIMUS_PATH && \
+        PYTHONPATH=${MEGATRON_PATH}:${PRIMUS_PATH}:${PYTHONPATH} \
+        torchrun \
+            --nproc_per_node ${GPUS_PER_NODE} \
+            --nnodes ${NNODES} \
+            --node_rank ${NODE_RANK} \
+            --master_addr ${MASTER_ADDR} \
+            --master_port ${MASTER_PORT} \
+            examples/deepseek/pretrain.py \
+            --exp examples/deepseek/exp_pretrain.yaml \
+            2>&1 | tee $TRAIN_LOG && \
+        echo $(date)"
+else
+    echo "Error: Unknown RUN_ENV value: $RUN_ENV"
+    exit 1
+fi