@@ -159,14 +159,6 @@ if [ "$NODE_RANK" = "0" ]; then
159159 echo " "
160160fi
161161
162- DISTRIBUTED_ARGS=(
163- --nproc_per_node " ${GPUS_PER_NODE} "
164- --nnodes " ${NNODES} "
165- --node_rank " ${NODE_RANK} "
166- --master_addr " ${MASTER_ADDR} "
167- --master_port " ${MASTER_PORT} "
168- )
169-
170162mkdir -p output
171163TRAIN_LOG=output/log_torchrun_pretrain_${MODEL_CONFIG} .txt
172164if [ " $NODE_RANK " = " 0" ]; then
@@ -179,6 +171,14 @@ if [ "$RUN_ENV" = "torchrun" ]; then
179171 SITE_PACKAGES=$( python -c " import sysconfig; print(sysconfig.get_paths()['purelib'])" )
180172 export PYTHONPATH=${SITE_PACKAGES} :${MEGATRON_PATH} :${PRIMUS_PATH} :${PYTHONPATH}
181173
174+ DISTRIBUTED_ARGS=(
175+ --nproc_per_node " ${GPUS_PER_NODE} "
176+ --nnodes " ${NNODES} "
177+ --node_rank " ${NODE_RANK} "
178+ --master_addr " ${MASTER_ADDR} "
179+ --master_port " ${MASTER_PORT} "
180+ )
181+
182182 # build helper_cpp of megatron
183183 pushd " ${MEGATRON_PATH} /megatron/core/datasets" && make && popd || exit 1
184184
@@ -228,6 +228,7 @@ elif [ "$RUN_ENV" = "slurm" ]; then
228228 --privileged --device=/dev/infiniband \
229229 -v $MEGATRON_PATH :$MEGATRON_PATH \
230230 -v $PRIMUS_PATH :$PRIMUS_PATH \
231+ -v $DATA_PATH :$DATA_PATH \
231232 $DOCKER_IMAGE /bin/bash -c \
232233 " echo '[NODE-${NODE_RANK} ]: begin, time=$( date +" %Y.%m.%d %H:%M:%S" ) ' && \
233234 pip install -q loguru wandb && \
@@ -240,8 +241,8 @@ elif [ "$RUN_ENV" = "slurm" ]; then
240241 --node_rank ${NODE_RANK} \
241242 --master_addr ${MASTER_ADDR} \
242243 --master_port ${MASTER_PORT} \
243- examples/deepseek /pretrain.py \
244- --exp examples/deepseek /exp_pretrain.yaml \
244+ examples/megatron /pretrain.py \
245+ --exp examples/megatron /exp_pretrain.yaml \
245246 2>&1 | tee $TRAIN_LOG && \
246247 echo '[NODE-${NODE_RANK} ]: end time=$( date +" %Y.%m.%d %H:%M:%S" ) '"
247248else
0 commit comments