Skip to content

Commit cc6985e

Browse files
authored
fix datapath issue of slurm run (#18)
1 parent 7003718 commit cc6985e

File tree

1 file changed

+11
-10
lines changed

1 file changed

+11
-10
lines changed

examples/megatron/run_pretrain.sh

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -159,14 +159,6 @@ if [ "$NODE_RANK" = "0" ]; then
159159
echo ""
160160
fi
161161

162-
DISTRIBUTED_ARGS=(
163-
--nproc_per_node "${GPUS_PER_NODE}"
164-
--nnodes "${NNODES}"
165-
--node_rank "${NODE_RANK}"
166-
--master_addr "${MASTER_ADDR}"
167-
--master_port "${MASTER_PORT}"
168-
)
169-
170162
mkdir -p output
171163
TRAIN_LOG=output/log_torchrun_pretrain_${MODEL_CONFIG}.txt
172164
if [ "$NODE_RANK" = "0" ]; then
@@ -179,6 +171,14 @@ if [ "$RUN_ENV" = "torchrun" ]; then
179171
SITE_PACKAGES=$(python -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")
180172
export PYTHONPATH=${SITE_PACKAGES}:${MEGATRON_PATH}:${PRIMUS_PATH}:${PYTHONPATH}
181173

174+
DISTRIBUTED_ARGS=(
175+
--nproc_per_node "${GPUS_PER_NODE}"
176+
--nnodes "${NNODES}"
177+
--node_rank "${NODE_RANK}"
178+
--master_addr "${MASTER_ADDR}"
179+
--master_port "${MASTER_PORT}"
180+
)
181+
182182
# build helper_cpp of megatron
183183
pushd "${MEGATRON_PATH}/megatron/core/datasets" && make && popd || exit 1
184184

@@ -228,6 +228,7 @@ elif [ "$RUN_ENV" = "slurm" ]; then
228228
--privileged --device=/dev/infiniband \
229229
-v $MEGATRON_PATH:$MEGATRON_PATH \
230230
-v $PRIMUS_PATH:$PRIMUS_PATH \
231+
-v $DATA_PATH:$DATA_PATH \
231232
$DOCKER_IMAGE /bin/bash -c \
232233
"echo '[NODE-${NODE_RANK}]: begin, time=$(date +"%Y.%m.%d %H:%M:%S")' && \
233234
pip install -q loguru wandb && \
@@ -240,8 +241,8 @@ elif [ "$RUN_ENV" = "slurm" ]; then
240241
--node_rank ${NODE_RANK} \
241242
--master_addr ${MASTER_ADDR} \
242243
--master_port ${MASTER_PORT} \
243-
examples/deepseek/pretrain.py \
244-
--exp examples/deepseek/exp_pretrain.yaml \
244+
examples/megatron/pretrain.py \
245+
--exp examples/megatron/exp_pretrain.yaml \
245246
2>&1 | tee $TRAIN_LOG && \
246247
echo '[NODE-${NODE_RANK}]: end time=$(date +"%Y.%m.%d %H:%M:%S")'"
247248
else

0 commit comments

Comments
 (0)