File tree Expand file tree Collapse file tree 1 file changed +4
-1
lines changed Expand file tree Collapse file tree 1 file changed +4
-1
lines changed Original file line number Diff line number Diff line change 105105
106106export NCCL_IB_GID_INDEX=3
107107export NCCL_CROSS_NIC=0
108- export HSA_ENABLE_SDMA=0
108+ export HSA_ENABLE_SDMA=1
109+ export HSA_NO_SCRATCH_RECLAIM=1
109110NCCL_IB_HCA=$(bash "${PRIMUS_PATH}"/examples/scripts/get_nccl_ib_hca.sh)
110111export NCCL_IB_HCA
111112export NCCL_IB_GDR_LEVEL=2
@@ -195,6 +196,7 @@ elif [ "$PRIMUS_HIPBLASLT_TUNING_STAGE" -eq 2 ]; then
195196 --env TORCH_NCCL_HIGH_PRIORITY=$TORCH_NCCL_HIGH_PRIORITY \
196197 --env OMP_NUM_THREADS=$OMP_NUM_THREADS \
197198 --env HSA_ENABLE_SDMA=$HSA_ENABLE_SDMA \
199+ --env HSA_NO_SCRATCH_RECLAIM=$HSA_NO_SCRATCH_RECLAIM \
198200 --env CUDA_DEVICE_MAX_CONNECTIONS=$CUDA_DEVICE_MAX_CONNECTIONS \
199201 --env MODEL_CONFIG=$MODEL_CONFIG \
200202 --ipc=host --network=host \
@@ -323,6 +325,7 @@ elif [ "$RUN_ENV" = "slurm" ]; then
323325 --env NCCL_IB_GID_INDEX=$NCCL_IB_GID_INDEX \
324326 --env NCCL_CROSS_NIC=$NCCL_CROSS_NIC \
325327 --env HSA_ENABLE_SDMA=$HSA_ENABLE_SDMA \
328+ --env HSA_NO_SCRATCH_RECLAIM=$HSA_NO_SCRATCH_RECLAIM \
326329 --env NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME \
327330 --env GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME \
328331 --env CUDA_DEVICE_MAX_CONNECTIONS=$CUDA_DEVICE_MAX_CONNECTIONS \
You can’t perform that action at this time.
0 commit comments