Skip to content

Commit fe24e1b

Browse files
committed
TL/CUDA: Add new data types into NVLS MPI CI
This commit adds new data types to the NVLS MPI tests and enables the reduce_scatter MPI test in the NVLS CI. Signed-off-by: Juee14Desai <jueehimalbha@nvidia.com>
1 parent d949707 commit fe24e1b

File tree

3 files changed

+36
-6
lines changed

3 files changed

+36
-6
lines changed

.ci/pipeline/test_nvls_matrix.yaml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,22 @@ steps:
8080
onfail: |
8181
sudo -E -u svcnbu-swx-hpcx ${WORKSPACE}/.ci/scripts/stop_slurm_allocation.sh
8282
83-
- name: Run UCC NVLS MPI tests
83+
- name: Run UCC NVLS MPI tests (allreduce)
8484
containerSelector: "{name: 'build_helper'}"
8585
timeout: "${TEST_TIMEOUT_MINUTES}"
8686
run: |
8787
set -x
8888
export DOCKER_IMAGE_NAME="${registry_host}#torch-ucc/${UCC_URI_SUFFIX}:${DOCKER_IMAGE_TAG}"
8989
export SLURM_JOB_ID=$(cat ${WORKSPACE}/job_id.txt)
9090
sudo -E -u svcnbu-swx-hpcx ${WORKSPACE}/.ci/scripts/run_nvls_slurm.sh '/opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_nvls_mpi.sh' ${NVLS_MPI_PPN:-4}
91+
92+
- name: Run UCC NVLS MPI tests (reduce_scatter)
93+
containerSelector: "{name: 'build_helper'}"
94+
timeout: "${TEST_TIMEOUT_MINUTES}"
95+
run: |
96+
set -x
97+
export DOCKER_IMAGE_NAME="${registry_host}#torch-ucc/${UCC_URI_SUFFIX}:${DOCKER_IMAGE_TAG}"
98+
export SLURM_JOB_ID=$(cat ${WORKSPACE}/job_id.txt)
99+
sudo -E -u svcnbu-swx-hpcx ${WORKSPACE}/.ci/scripts/run_nvls_slurm.sh '/opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_nvls_mpi_reduce_scatter.sh' ${NVLS_MPI_PPN:-4}
91100
always: |
92101
sudo -E -u svcnbu-swx-hpcx ${WORKSPACE}/.ci/scripts/stop_slurm_allocation.sh

.ci/scripts/run_tests_ucc_nvls_mpi.sh

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,8 @@ export UCC_TL_CUDA_NVLS_SM_COUNT=4
1515

1616
EXE="/opt/nvidia/src/ucc/build/test/mpi/ucc_test_mpi"
1717
EXE+=" --set_device 2 --mtypes cuda"
18+
DTYPES="float32,int32,uint32,int64,uint64"
1819

1920
echo "INFO: NVLS MPI tests (allreduce) ..."
20-
UCC_TL_CUDA_TUNE="allreduce:cuda:@0" $EXE -c allreduce -d float32 -o sum -m 1024:33554432
21+
UCC_TL_CUDA_TUNE="allreduce:cuda:@0" $EXE -c allreduce -d ${DTYPES} -o sum -m 1024:33554432
2122
echo "INFO: NVLS MPI tests (allreduce) ... DONE"
22-
23-
# echo "INFO: NVLS MPI tests (reduce_scatter) ..."
24-
# UCC_TL_CUDA_TUNE="reduce_scatter:cuda:@3" $EXE -c reduce_scatter -d float32 -o sum -m 1024:33554432
25-
# echo "INFO: NVLS MPI tests (reduce_scatter) ... DONE"
.ci/scripts/run_tests_ucc_nvls_mpi_reduce_scatter.sh

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/bash -xe
2+
# NVLS reduce_scatter only. Run as a separate srun step (separate MPI job).
3+
# -m 1024:33554432:4 keeps per-rank counts NVLS-aligned.
4+
5+
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
6+
source "${SCRIPT_DIR}/env.sh"
7+
8+
if [ "${SLURM_LOCALID:-0}" = "0" ]; then
9+
"${SCRIPT_DIR}/check_nvls_fabric.sh"
10+
fi
11+
12+
export OMPI_MCA_coll=^hcoll
13+
export OMPI_MCA_coll_ucc_enable=0
14+
export UCC_TLS=cuda,ucp
15+
export UCC_LOG_LEVEL=info
16+
export UCC_TL_CUDA_NVLS_SM_COUNT=4
17+
18+
EXE="/opt/nvidia/src/ucc/build/test/mpi/ucc_test_mpi"
19+
EXE+=" --set_device 2 --mtypes cuda"
20+
DTYPES="float32,int32,uint32,int64,uint64"
21+
22+
echo "INFO: NVLS MPI tests (reduce_scatter) ..."
23+
UCC_TL_CUDA_TUNE="reduce_scatter:cuda:@3" $EXE -c reduce_scatter -d ${DTYPES} -o sum -m 1024:33554432:4
24+
echo "INFO: NVLS MPI tests (reduce_scatter) ... DONE"

0 commit comments

Comments
 (0)