Commit f06200c

slurm run (#10)
1 parent 4e44a0d commit f06200c

3 files changed, +168 -31 lines changed

examples/deepseek/exp_pretrain.yaml

Lines changed: 4 additions & 1 deletion
@@ -21,6 +21,7 @@ modules:
 
 # debug
 num_layers: 5
+optimizer: adam
 moe_router_force_load_balancing: true
 moe_router_dtype: fp32
 log_avg_skip_iterations: 2
@@ -52,11 +53,13 @@ modules:
 overlap_param_gather: true
 
 # data
-train_data_path: /home/azureuser/tas-public/data/deepseek-datasets/mmap_deepseekv2_datasets_text_document
+train_data_path: ${DATA_PATH:/home/azureuser/tas-public/data/deepseek-datasets/mmap_deepseekv2_datasets_text_document}
 valid_data_path: null
 test_data_path: null
 
 # fusion
+# 20250321: need latest megatron docker image
+moe_permute_fusion: false
 # 20250317: need latest apex in docker image
 gradient_accumulation_fusion: false
 # 20250317: TE grouped gemm has numerical issue
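
Note on the ${DATA_PATH:...} change: the YAML now reads the DATA_PATH environment variable that run_pretrain.sh exports (see the next file), with the literal path after the colon as the fallback. Assuming the config loader mirrors shell-style fallback expansion, the resolution behaves like this bash sketch:

# Sketch only: the real resolver lives in the training framework.
train_data_path=${DATA_PATH:-/home/azureuser/tas-public/data/deepseek-datasets/mmap_deepseekv2_datasets_text_document}
echo "resolved train_data_path: ${train_data_path}"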

examples/deepseek/run_pretrain.sh

Lines changed: 145 additions & 30 deletions
@@ -1,23 +1,37 @@
 #!/bin/bash
+# shellcheck disable=SC2086
 
-# python path
-SITE_PACKAGES=$(python -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")
+
+# available model configs:
+# deepseek_v2_lite, deepseek_v2
+# deepseek_v3, deepseek_v3_17B, deepseek_v3_45B
+export MODEL_CONFIG=${MODEL_CONFIG:-deepseek_v2_lite}
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+# framework path
 PRIMUS_PATH=$(realpath "$(dirname "$0")/../..")
-export MEGATRON_PATH=${PRIMUS_PATH}/../Megatron-LM
-export PYTHONPATH=${SITE_PACKAGES}:${MEGATRON_PATH}:${PRIMUS_PATH}:${PYTHONPATH}
+export PRIMUS_PATH
+export MEGATRON_PATH=${MEGATRON_PATH:-${PRIMUS_PATH}/../Megatron-LM}
+echo "PRIMUS_PATH: $PRIMUS_PATH"
+echo "MEGATRON_PATH: $MEGATRON_PATH"
 
-# check the path
+# check megatron path
 [[ -z "${MEGATRON_PATH}" ]] && {
     echo "MEGATRON_PATH is not set"
     exit 1
 }
-# build helper_cpp
-pushd "${MEGATRON_PATH}/megatron/core/datasets" && make && popd || exit 1
 
-# available model configs:
-# deepseek_v2_lite, deepseek_v2
-# deepseek_v3, deepseek_v3_17B, deepseek_v3_45B
-export MODEL_CONFIG=deepseek_v2_lite
+# data
+mkdir -p "${PRIMUS_PATH}"/data/deepseek-datasets
+export HF_HOME="${PRIMUS_PATH}"/data/huggingface
+export DATA_PATH="${PRIMUS_PATH}"/data/deepseek-datasets/mmap_deepseekv2_datasets_text_document
+echo "HF_HOME: $HF_HOME"
+echo "DATA_PATH: $DATA_PATH"
+if [[ ! -f "${DATA_PATH}.bin" || ! -f "${DATA_PATH}.idx" ]]; then
+    echo "Error: Missing required deepseek files. \
+Please follow the README.md and download ${DATA_PATH}.bin and ${DATA_PATH}.idx."
+    exit 1
+fi
 
 # network envs
 export OMP_NUM_THREADS=1
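
With MODEL_CONFIG and MEGATRON_PATH now using ${VAR:-default} fallbacks, both can be overridden from the caller's environment instead of by editing the script. A usage sketch (the Megatron-LM path here is a placeholder):

# Override the defaults for a single run; any config name listed above works.
MODEL_CONFIG=deepseek_v3 \
MEGATRON_PATH=/opt/Megatron-LM \
    bash examples/deepseek/run_pretrain.sh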
@@ -28,49 +42,76 @@ export NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7
 export NCCL_IB_GID_INDEX=3
 export NCCL_CROSS_NIC=0
 export HSA_ENABLE_SDMA=0
-export GLOO_SOCKET_IFNAME=eth0
+export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
+export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
 export CUDA_DEVICE_MAX_CONNECTIONS=1 # Reducing to 1 ensures no PCIE traffic (even on single node)
 export NCCL_PROTO=Simple
 export RCCL_MSCCL_ENABLE=0
-export CUDA_DEVICE_MAX_CONNECTIONS=1
 # export AMD_LOG_LEVEL=3
 # export AMD_SERIALIZE_KERNEL=3
 # export HSA_NO_SCRATCH_RECLAIM=1
 
+export GEMM_TUNING=0
+export NVTE_CK_USES_BWD_V3=1
+echo "GEMM_TUNING: $GEMM_TUNING"
+echo "NVTE_CK_USES_BWD_V3: $NVTE_CK_USES_BWD_V3"
+
+# gemm tuning, https://github.com/ROCm/TransformerEngine
+if [ "$GEMM_TUNING" -eq 1 ]; then
+    export TE_HIPBLASLT_TUNING_RUN_COUNT=10
+    export TE_HIPBLASLT_TUNING_ALGO_COUNT=50
+else
+    unset TE_HIPBLASLT_TUNING_RUN_COUNT
+    unset TE_HIPBLASLT_TUNING_ALGO_COUNT
+fi
+
 # cluster node envs
 RUN_ENV="${RUN_ENV:-torchrun}"
+echo "RUN_ENV: $RUN_ENV"
 if [ "$RUN_ENV" = "torchrun" ]; then
     export MASTER_ADDR=${MASTER_ADDR:-localhost}
     export MASTER_PORT=${MASTER_PORT:-$(shuf -n 1 -i 10000-65535)}
     export NNODES=${NNODES:-1}
     export NODE_RANK=${NODE_RANK:-0}
     export GPUS_PER_NODE=${GPUS_PER_NODE:-8}
 elif [ "$RUN_ENV" = "slurm" ]; then
+    # use the first node as the master node
+    node_list=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+    mapfile -t node_array <<< "$node_list"
+    HEAD_NODE=${node_array[0]}
+
+    export SLURM_MASTER_ADDR=$HEAD_NODE
+    export SLURM_MASTER_PORT=29509
+    export SLURM_WORLD_SIZE=$((SLURM_NNODES * SLURM_GPUS_ON_NODE))
+
+    echo "[NODE-$SLURM_NODEID] NODELIST=${node_array[*]}"
+    echo "[NODE-$SLURM_NODEID] NODENAME=$SLURMD_NODENAME"
+    echo "[NODE-$SLURM_NODEID] SLURM_MASTER_ADDR=$SLURM_MASTER_ADDR"
+    echo "[NODE-$SLURM_NODEID] SLURM_MASTER_PORT=$SLURM_MASTER_PORT"
+    echo "[NODE-$SLURM_NODEID] SLURM_NNODES=$SLURM_NNODES"
+    echo "[NODE-$SLURM_NODEID] SLURM_GPUS_ON_NODE=$SLURM_GPUS_ON_NODE"
+    echo "[NODE-$SLURM_NODEID] SLURM_WORLD_SIZE=$SLURM_WORLD_SIZE"
+    echo "[NODE-$SLURM_NODEID] SLURM_CPUS_PER_TASK: $SLURM_CPUS_PER_TASK"
+    echo "[NODE-$SLURM_NODEID] SLURM_PROCID: $SLURM_PROCID"
+
     export MASTER_ADDR=${SLURM_MASTER_ADDR}
     export MASTER_PORT=${SLURM_MASTER_PORT}
-    export NNODES=$SLURM_NNODES
+    export NNODES=${SLURM_NNODES}
     export NODE_RANK=${SLURM_NODEID}
     export GPUS_PER_NODE=$((SLURM_WORLD_SIZE / SLURM_NNODES))
-    echo "Error: SLURM mode is not implemented yet!"
-    exit 1
 else
     echo "Error: Unknown RUN_ENV value: $RUN_ENV"
     exit 1
 fi
 gpus=$(seq -s, 0 $((GPUS_PER_NODE - 1)))
 export HIP_VISIBLE_DEVICES=$gpus
 
-echo "RUN_ENV: $RUN_ENV"
-echo "PRIMUS_PATH: $PRIMUS_PATH"
-echo "MEGATRON_PATH: $MEGATRON_PATH"
-echo "SITE_PACKAGES: $SITE_PACKAGES"
-echo "MODEL_CONFIG: $MODEL_CONFIG"
-echo "MASTER_ADDR: $MASTER_ADDR"
-echo "MASTER_PORT: $MASTER_PORT"
-echo "NNODES: $NNODES"
-echo "NODE_RANK: $NODE_RANK"
-echo "GPUS_PER_NODE: $GPUS_PER_NODE"
-echo "HIP_VISIBLE_DEVICES: $HIP_VISIBLE_DEVICES"
+echo "[NODE-$NODE_RANK] MASTER_ADDR: $MASTER_ADDR"
+echo "[NODE-$NODE_RANK] MASTER_PORT: $MASTER_PORT"
+echo "[NODE-$NODE_RANK] NNODES: $NNODES"
+echo "[NODE-$NODE_RANK] NODE_RANK: $NODE_RANK"
+echo "[NODE-$NODE_RANK] GPUS_PER_NODE: $GPUS_PER_NODE"
+echo "[NODE-$NODE_RANK] HIP_VISIBLE_DEVICES: $HIP_VISIBLE_DEVICES"
 echo ""
 
 DISTRIBUTED_ARGS=(
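
The slurm branch now performs real rendezvous setup instead of exiting with "not implemented": the first hostname in the allocation becomes the master address, and the per-node GPU count falls out of SLURM_WORLD_SIZE / SLURM_NNODES. A worked example under an assumed 2-node, 8-GPU-per-node allocation (hostnames are illustrative):

# SLURM would provide roughly:
#   SLURM_JOB_NODELIST=gpu-[01-02], SLURM_NNODES=2, SLURM_GPUS_ON_NODE=8
scontrol show hostnames "gpu-[01-02]"   # -> gpu-01 and gpu-02, one per line
# HEAD_NODE=gpu-01 becomes MASTER_ADDR on every node
echo $(( (2 * 8) / 2 ))                 # SLURM_WORLD_SIZE / SLURM_NNODES = 8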
@@ -84,6 +125,80 @@ DISTRIBUTED_ARGS=(
 mkdir -p output
 TRAIN_LOG=output/log_torchrun_pretrain_${MODEL_CONFIG}.txt
 
-torchrun "${DISTRIBUTED_ARGS[@]}" examples/deepseek/pretrain.py \
-    --exp examples/deepseek/exp_pretrain.yaml \
-    2>&1 | tee $TRAIN_LOG
+if [ "$RUN_ENV" = "torchrun" ]; then
+    SITE_PACKAGES=$(python -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")
+    export PYTHONPATH=${SITE_PACKAGES}:${MEGATRON_PATH}:${PRIMUS_PATH}:${PYTHONPATH}
+
+    # build helper_cpp of megatron
+    pushd "${MEGATRON_PATH}/megatron/core/datasets" && make && popd || exit 1
+
+    torchrun "${DISTRIBUTED_ARGS[@]}" examples/deepseek/pretrain.py \
+        --exp examples/deepseek/exp_pretrain.yaml \
+        2>&1 | tee $TRAIN_LOG
+
+elif [ "$RUN_ENV" = "slurm" ]; then
+    export DOCKER_IMAGE="docker.io/rocm/megatron-lm:latest"
+    # podman pull $DOCKER_IMAGE;
+    echo "[NODE-$NODE_RANK] stop all podman containers..."
+    podman stop -a && \
+    module load rocm && \
+    podman run \
+        --rm \
+        --env SLURM_MASTER_ADDR=$SLURM_MASTER_ADDR \
+        --env SLURM_MASTER_PORT=$SLURM_MASTER_PORT \
+        --env SLURM_PROCID=$SLURM_PROCID \
+        --env SLURM_WORLD_SIZE=$SLURM_WORLD_SIZE \
+        --env SLURM_NODEID=$SLURM_NODEID \
+        --env SLURM_NNODES=$SLURM_NNODES \
+        --env MASTER_ADDR=${MASTER_ADDR} \
+        --env MASTER_PORT=${MASTER_PORT} \
+        --env NNODES=${NNODES} \
+        --env NODE_RANK=${NODE_RANK} \
+        --env GPUS_PER_NODE=${GPUS_PER_NODE} \
+        --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES \
+        --env OMP_NUM_THREADS=$OMP_NUM_THREADS \
+        --env GPU_MAX_HW_QUEUES=$GPU_MAX_HW_QUEUES \
+        --env TORCH_NCCL_HIGH_PRIORITY=$TORCH_NCCL_HIGH_PRIORITY \
+        --env NCCL_CHECKS_DISABLE=$NCCL_CHECKS_DISABLE \
+        --env NCCL_IB_HCA=$NCCL_IB_HCA \
+        --env NCCL_IB_GID_INDEX=$NCCL_IB_GID_INDEX \
+        --env NCCL_CROSS_NIC=$NCCL_CROSS_NIC \
+        --env HSA_ENABLE_SDMA=$HSA_ENABLE_SDMA \
+        --env NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
+        --env GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
+        --env CUDA_DEVICE_MAX_CONNECTIONS=$CUDA_DEVICE_MAX_CONNECTIONS \
+        --env NCCL_PROTO=$NCCL_PROTO \
+        --env RCCL_MSCCL_ENABLE=$RCCL_MSCCL_ENABLE \
+        --env HF_HOME=$HF_HOME \
+        --env DATA_PATH=$DATA_PATH \
+        --env MODEL_CONFIG=$MODEL_CONFIG \
+        --env TE_HIPBLASLT_TUNING_RUN_COUNT=$TE_HIPBLASLT_TUNING_RUN_COUNT \
+        --env TE_HIPBLASLT_TUNING_ALGO_COUNT=$TE_HIPBLASLT_TUNING_ALGO_COUNT \
+        --env NVTE_CK_USES_BWD_V3=$NVTE_CK_USES_BWD_V3 \
+        --ipc=host --network=host \
+        --device=/dev/kfd --device=/dev/dri \
+        --cap-add=SYS_PTRACE --cap-add=CAP_SYS_ADMIN \
+        --security-opt seccomp=unconfined --group-add video \
+        --privileged --device=/dev/infiniband \
+        -v $MEGATRON_PATH:$MEGATRON_PATH \
+        -v $PRIMUS_PATH:$PRIMUS_PATH \
+        $DOCKER_IMAGE /bin/bash -c \
+        "echo \$(date) && \
+        pip install -q loguru wandb && \
+        cd ${MEGATRON_PATH}/megatron/core/datasets && make && \
+        cd $PRIMUS_PATH && \
+        PYTHONPATH=${MEGATRON_PATH}:${PRIMUS_PATH}:${PYTHONPATH} \
+        torchrun \
+        --nproc_per_node ${GPUS_PER_NODE} \
+        --nnodes ${NNODES} \
+        --node_rank ${NODE_RANK} \
+        --master_addr ${MASTER_ADDR} \
+        --master_port ${MASTER_PORT} \
+        examples/deepseek/pretrain.py \
+        --exp examples/deepseek/exp_pretrain.yaml \
+        2>&1 | tee $TRAIN_LOG && \
+        echo \$(date)"
+else
+    echo "Error: Unknown RUN_ENV value: $RUN_ENV"
+    exit 1
+fi
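
podman run starts the container with a clean environment, so every host-side NCCL/RCCL/HSA/TE variable has to be forwarded explicitly, hence the long --env list. One possible refactor (not what this commit does, and the variable filter below is an assumption) is to generate an env file instead:

# Sketch: collect the forwarded variables rather than hand-listing them.
printenv | grep -E '^(NCCL_|RCCL_|HSA_|TE_|NVTE_|SLURM_|MASTER_|HIP_|OMP_|GPU_|TORCH_|CUDA_|HF_HOME=|DATA_PATH=|MODEL_CONFIG=|NNODES=|NODE_RANK=|GPUS_PER_NODE=)' \
    > /tmp/pretrain.env
podman run --env-file /tmp/pretrain.env ...   # remaining flags as in the diff above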
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+#!/bin/bash
+# shellcheck disable=SC2086
+
+# salloc --reservation=gpu-40_gpu-41_gpu-43_gpu-44_gpu-46_gpu-47_gpu-50_gpu-55_reservation --exclusive --mem=0 -N 8
+# salloc --nodelist=gpu-56 --exclusive --mem=0 -N 8
+
+SCRIPT_DIR=$(dirname "$(realpath "${BASH_SOURCE[0]}")")
+echo "Current script path: $SCRIPT_DIR"
+
+export RUN_ENV=slurm
+export NCCL_SOCKET_IFNAME=bond0
+export GLOO_SOCKET_IFNAME=bond0
+
+srun -N 2 \
+    --gres=gpu:8 \
+    --exclusive \
+    --ntasks-per-node=1 \
+    --cpus-per-task=64 \
+    bash ${SCRIPT_DIR}/run_pretrain.sh
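
This new launcher assumes it runs inside an existing allocation (see the salloc examples in its header) and hard-codes srun -N 2, so the node count must match what was allocated. An assumed end-to-end flow; the launcher's own filename is not preserved in this view:

# 1. Request two exclusive nodes to match srun -N 2 above:
salloc --exclusive --mem=0 -N 2
# 2. From the allocation shell, run the launcher; srun fans run_pretrain.sh
#    out to each node with RUN_ENV=slurm set:
bash examples/deepseek/<launcher>.sh   # placeholder name; the file is new in this commit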
