Commit b7d4f76

CI: enhance NVLS tests (#1269)
## What

1. Added correctness tests for the NVLS collectives
2. Updated CUDA to 13.1.1 for the NVLS path
3. Use the official CUDA base image `nvcr.io/nvidia/cuda:${CUDA_VER}-devel-ubuntu24.04` (more lightweight)
4. Use the official HPC-X release instead of building from source
5. Added an NVLink smoke test on the allocated nodes to separate node issues from UCC issues

---------

Signed-off-by: Ilya Kryukov <ikryukov@nvidia.com>
1 parent d0c6697 commit b7d4f76

File tree

7 files changed (+190, -16 lines)

.ci/Dockerfile.nvls

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
ARG CUDA_VER='13.1.1'
FROM nvcr.io/nvidia/cuda:${CUDA_VER}-devel-ubuntu24.04

ARG _UID=6213
ARG _GID=11429
ARG _LOGIN=swx-jenkins
ARG _GROUP=swx-jenkins
ARG _HOME=/labhome
ARG UCC_ENABLE_NVLS=yes
ARG UCC_ENABLE_GTEST=no
ARG UCC_BUILD_TLS=cuda,ucp

#==============================================================================
# Build tools
#==============================================================================
RUN apt-get update && apt-get install -y --no-install-recommends \
    sudo \
    build-essential \
    autoconf \
    automake \
    libtool \
    numactl \
    libnuma-dev \
    wget \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

#==============================================================================
# Install HPC-X (provides UCX + OpenMPI)
#==============================================================================
ARG HPCX_VERSION=v2.26
ARG HPCX_CUDA=cuda13
ARG HPCX_OS=ubuntu24.04
RUN cd /tmp && \
    HPCX_ARCH=$(uname -m) && \
    HPCX_FILENAME="hpcx-${HPCX_VERSION}-gcc-inbox-${HPCX_OS}-${HPCX_CUDA}-${HPCX_ARCH}" && \
    wget -q "https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}_${HPCX_CUDA}/${HPCX_FILENAME}.tbz" && \
    tar xf "${HPCX_FILENAME}.tbz" && \
    mv "${HPCX_FILENAME}" /opt/hpcx && \
    rm -f "${HPCX_FILENAME}.tbz"

#==============================================================================
# Environment
#==============================================================================
ENV CUDA_HOME=/usr/local/cuda
ENV SRC_DIR=/opt/nvidia/src
ENV UCX_INSTALL_DIR=/opt/hpcx/ucx
ENV UCC_INSTALL_DIR=/opt/nvidia/bin/ucc/build
ENV PATH=/opt/hpcx/ompi/bin:${PATH}
ENV LD_LIBRARY_PATH=/opt/hpcx/ompi/lib:/opt/hpcx/ucx/lib:${LD_LIBRARY_PATH}
ENV OPAL_PREFIX=/opt/hpcx/ompi

#==============================================================================
# Build UCC
#==============================================================================
RUN rm -rf ${SRC_DIR}/ucc
COPY . ${SRC_DIR}/ucc

ENV UCC_ENABLE_NVLS=${UCC_ENABLE_NVLS}
ENV UCC_ENABLE_GTEST=${UCC_ENABLE_GTEST}
ENV UCC_BUILD_TLS=${UCC_BUILD_TLS}
RUN ${SRC_DIR}/ucc/.ci/scripts/build_ucc.sh

#==============================================================================
# User setup
#==============================================================================
RUN echo "${_LOGIN} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
RUN chown -R ${_UID}:${_GID} /opt/nvidia
RUN if ! getent group "${_GID}" > /dev/null 2>&1; then \
        groupadd -g "${_GID}" "${_GROUP}"; \
    fi && \
    useradd --no-create-home --uid ${_UID} --gid ${_GID} --home ${_HOME}/${_LOGIN} ${_LOGIN}

USER ${_LOGIN}
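
For local debugging of the image itself, the same build can be reproduced from a UCC checkout. A minimal sketch, assuming Docker is available; the `ucc-nvls:dev` tag is a placeholder, and the build args mirror the ARG defaults above:

# Sketch: build the NVLS CI image from a local UCC checkout (tag is a placeholder).
docker build \
    -f .ci/Dockerfile.nvls \
    --build-arg CUDA_VER=13.1.1 \
    --build-arg UCC_ENABLE_NVLS=yes \
    --build-arg UCC_BUILD_TLS=cuda,ucp \
    -t ucc-nvls:dev .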

.ci/pipeline/test_nvls_matrix.yaml

Lines changed: 18 additions & 6 deletions
@@ -18,7 +18,7 @@ volumes:
   - { mountPath: "/home/svcnbu-swx-hpcx", hostPath: "/labhome/svcnbu-swx-hpcx" }

 env:
-  CUDA_VER: 13.0
+  CUDA_VER: 13.1.1
   UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/aarch64/ubuntu24.04/cuda${CUDA_VER}"
   DOCKER_IMAGE_TAG: "${BUILD_NUMBER}"
   SLURM_NODES: 2
@@ -28,6 +28,7 @@ env:
   SLURM_JOB_NAME: 'ucc-ci-test-nvls-${BUILD_NUMBER}'
   SLURM_JOB_TIMEOUT: '00:40:00'
   TEST_TIMEOUT_MINUTES: 35
+  NVLS_MPI_PPN: 4

 kubernetes:
   cloud: il-ipp-blossom-prod
@@ -38,12 +39,12 @@ kubernetes:
 # cloud pod to build the shared docker image
 runs_on_dockers:
   - {
-      file: ".ci/Dockerfile.ngc_pytorch",
-      name: "ngc_pytorch",
+      file: ".ci/Dockerfile.nvls",
+      name: "nvls",
       tag: "${DOCKER_IMAGE_TAG}",
       arch: "aarch64",
       uri: "${UCC_URI_SUFFIX}",
-      build_args: "--no-cache --build-arg ARCH=aarch64 --build-arg OS=ubuntu24.04 --build-arg CUDA_VER=${CUDA_VER} --build-arg _UID=149917 --build-arg _GID=30 --build-arg _LOGIN=svcnbu-swx-hpcx --build-arg _GROUP=svcnbu-swx-hpcx --build-arg UCC_ENABLE_NVLS=yes --build-arg UCC_ENABLE_GTEST=no",
+      build_args: "--no-cache --build-arg CUDA_VER=${CUDA_VER} --build-arg _UID=149917 --build-arg _GID=30 --build-arg _LOGIN=svcnbu-swx-hpcx --build-arg _GROUP=svcnbu-swx-hpcx",
     }
   - {
       file: ".ci/dockerfiles/Dockerfile.build_helper",
@@ -68,13 +69,24 @@ steps:
       touch job_id.txt && chown svcnbu-swx-hpcx job_id.txt
       sudo -E -u svcnbu-swx-hpcx ${WORKSPACE}/.ci/scripts/run_slurm_allocation.sh

-  - name: Run UCC NVLS tests
+  - name: Run UCC NVLS perftest
     containerSelector: "{name: 'build_helper'}"
     timeout: "${TEST_TIMEOUT_MINUTES}"
     run: |
       set -x
       export DOCKER_IMAGE_NAME="${registry_host}#torch-ucc/${UCC_URI_SUFFIX}:${DOCKER_IMAGE_TAG}"
       export SLURM_JOB_ID=$(cat ${WORKSPACE}/job_id.txt)
-      sudo -E -u svcnbu-swx-hpcx ${WORKSPACE}/.ci/scripts/run_tests_ucc_nvls_slurm.sh
+      sudo -E -u svcnbu-swx-hpcx ${WORKSPACE}/.ci/scripts/run_nvls_slurm.sh '/opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_nvls_all.sh' ${NVLS_MPI_PPN:-4}
+    onfail: |
+      sudo -E -u svcnbu-swx-hpcx ${WORKSPACE}/.ci/scripts/stop_slurm_allocation.sh
+
+  - name: Run UCC NVLS MPI tests
+    containerSelector: "{name: 'build_helper'}"
+    timeout: "${TEST_TIMEOUT_MINUTES}"
+    run: |
+      set -x
+      export DOCKER_IMAGE_NAME="${registry_host}#torch-ucc/${UCC_URI_SUFFIX}:${DOCKER_IMAGE_TAG}"
+      export SLURM_JOB_ID=$(cat ${WORKSPACE}/job_id.txt)
+      sudo -E -u svcnbu-swx-hpcx ${WORKSPACE}/.ci/scripts/run_nvls_slurm.sh '/opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_nvls_mpi.sh' ${NVLS_MPI_PPN:-4}
     always: |
       sudo -E -u svcnbu-swx-hpcx ${WORKSPACE}/.ci/scripts/stop_slurm_allocation.sh

.ci/scripts/build_ucc.sh

Lines changed: 2 additions & 1 deletion
@@ -5,6 +5,7 @@ export CXXFLAGS="-Wno-error=maybe-uninitialized"

 export UCC_ENABLE_GTEST=${UCC_ENABLE_GTEST:-yes}
 export UCC_ENABLE_NVLS=${UCC_ENABLE_NVLS:-no}
+export UCC_BUILD_TLS=${UCC_BUILD_TLS:-cuda,nccl,self,sharp,shm,ucp,mlx5}

 # In containers, calculate based on memory limits to avoid OOM
 # Determine number of parallel build jobs based on available system memory if running inside a container/Kubernetes
@@ -39,7 +40,7 @@ cd "${UCC_SRC_DIR}/build"
 # Build base configure flags
 CONFIGURE_FLAGS="--with-ucx=${UCX_INSTALL_DIR} --with-cuda=${CUDA_HOME} \
     --prefix=${UCC_INSTALL_DIR} --with-mpi \
-    --with-tls=cuda,nccl,self,sharp,shm,ucp,mlx5"
+    --with-tls=${UCC_BUILD_TLS}"

 # Add NVLS support if enabled
 if [ "${UCC_ENABLE_NVLS}" = "yes" ] || [ "${UCC_ENABLE_NVLS}" = "true" ] || [ "${UCC_ENABLE_NVLS}" = "1" ]; then

.ci/scripts/check_nvls_fabric.sh

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
#!/bin/bash -xe

echo "===== NVLS Fabric Smoke Test ($(hostname)) ====="

echo "INFO: Checking GPU driver ..."
nvidia-smi --query-gpu=index,name,uuid --format=csv,noheader
NGPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
if [ "$NGPUS" -eq 0 ]; then
    echo "ERROR: No GPUs found"
    exit 1
fi
echo "INFO: Found $NGPUS GPUs"

echo "INFO: Checking NVLink fabric registration ..."
FABRIC_OUTPUT=$(nvidia-smi -q | grep 'Fabric' -A 4)
echo "$FABRIC_OUTPUT"

COMPLETED_COUNT=$(echo "$FABRIC_OUTPUT" | grep -c 'State.*:.*Completed' || true)
if [ "$COMPLETED_COUNT" -ne "$NGPUS" ]; then
    echo "ERROR: Expected $NGPUS GPUs with Fabric State 'Completed', found $COMPLETED_COUNT"
    exit 1
fi

FAILURES=$(echo "$FABRIC_OUTPUT" | grep 'Status' | grep -cv 'Success' || true)
if [ "$FAILURES" -ne 0 ]; then
    echo "ERROR: Some GPUs have Fabric Status != 'Success'"
    exit 1
fi
echo "INFO: All $NGPUS GPUs registered to NVLink fabric successfully"

echo "INFO: Checking NVLink link status ..."
nvidia-smi nvlink --status
echo "INFO: NVLink link status ... DONE"

echo "INFO: Checking GPU P2P topology ..."
nvidia-smi topo -p2p n
echo "INFO: GPU P2P topology ... DONE"

echo "===== NVLS Fabric Smoke Test PASSED ($(hostname)) ====="
.ci/scripts/run_tests_ucc_nvls_slurm.sh → .ci/scripts/run_nvls_slurm.sh

Lines changed: 7 additions & 9 deletions
@@ -1,35 +1,33 @@
 #!/bin/bash -xe

+# Generic NVLS Slurm test runner.
+# Usage: run_nvls_slurm.sh <container_script> [ntasks_per_node]

 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 source "${SCRIPT_DIR}/env.sh"

-readonly SLURM_COMMAND="srun --jobid=${SLURM_JOB_ID} --nodes=${SLURM_NODES} --mpi=pmi2 --ntasks-per-node=1 --container-image=${DOCKER_IMAGE_NAME} '/opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_nvls.sh'"
+CONTAINER_SCRIPT=${1:?"Usage: run_nvls_slurm.sh <container_script> [ntasks_per_node]"}
+NTASKS_PER_NODE=${2:-1}
+
+readonly SLURM_COMMAND="srun --jobid=${SLURM_JOB_ID} --nodes=${SLURM_NODES} --mpi=pmix --ntasks-per-node=${NTASKS_PER_NODE} --container-image=${DOCKER_IMAGE_NAME} '${CONTAINER_SCRIPT}'"

-# Validate SLURM_HEAD_NODE is set
 if [ -z "${SLURM_HEAD_NODE}" ]; then
     echo "ERROR: SLURM_HEAD_NODE is not set or empty"
     exit 1
 fi

-# Execute based on head node type
 case "${SLURM_HEAD_NODE}" in
     scctl)
         echo "Using scctl client to connect and execute slurm command"
         scctl client connect -- "${SLURM_COMMAND}"
         ;;
     dlcluster*)
         echo "Connecting to SLURM head node: ${SLURM_HEAD_NODE}"
-        # Escape the # character in the command for SSH transmission
         SLURM_COMMAND_ESCAPED="${SLURM_COMMAND//\#/\\#}"
         eval "${SSH_CMD} ${SLURM_HEAD_NODE} \"${SLURM_COMMAND_ESCAPED}\""
         ;;
-    "")
-        echo "ERROR: Invalid SLURM_HEAD_NODE value: ${SLURM_HEAD_NODE}"
-        exit 1
-        ;;
     *)
         echo "Connecting to SLURM head node: ${SLURM_HEAD_NODE}"
-        eval "${SSH_CMD} ${SLURM_HEAD_NODE} ${SLURM_COMMAND}"
+        eval "${SSH_CMD} ${SLURM_HEAD_NODE} \"${SLURM_COMMAND}\""
         ;;
 esac
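
As wired up in test_nvls_matrix.yaml, the runner is called once per test flavor; the perftest call expands to roughly the srun command below (job ID and image name are illustrative placeholders):

# From the pipeline (perftest step):
run_nvls_slurm.sh '/opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_nvls_all.sh' ${NVLS_MPI_PPN:-4}

# Resulting SLURM_COMMAND, with a placeholder job ID:
srun --jobid=12345 --nodes=2 --mpi=pmix --ntasks-per-node=4 \
    --container-image=${DOCKER_IMAGE_NAME} '/opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_nvls_all.sh'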
.ci/scripts/run_tests_ucc_nvls_all.sh

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
#!/bin/bash -xe

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
source "${SCRIPT_DIR}/env.sh"

if [ "${SLURM_LOCALID:-0}" = "0" ]; then
    "${SCRIPT_DIR}/check_nvls_fabric.sh"
fi

export OMPI_MCA_coll=^hcoll
export OMPI_MCA_coll_ucc_enable=0
export UCC_LOG_LEVEL=info
export UCC_TL_CUDA_NVLS_SM_COUNT=4
export UCC_TLS=cuda,ucp

PERFTEST=/opt/nvidia/bin/ucc/build/bin/ucc_perftest

echo "INFO: NVLS perftest (allreduce) ..."
UCC_TL_CUDA_TUNE=allreduce:cuda:@0 $PERFTEST -c allreduce -F -m cuda -b 1k -e 32M -d bfloat16 -o sum
echo "INFO: NVLS perftest (allreduce) ... DONE"

# Disabled: reduce_scatter NVLS is tested via MPI tests instead.
#echo "INFO: NVLS perftest (reduce_scatter) ..."
#UCC_TL_CUDA_TUNE=reduce_scatter:cuda:@3 $PERFTEST -c reduce_scatter -F -m cuda -b 1k -e 32M -d bfloat16 -o sum
#echo "INFO: NVLS perftest (reduce_scatter) ... DONE"
.ci/scripts/run_tests_ucc_nvls_mpi.sh

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
#!/bin/bash -xe

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
source "${SCRIPT_DIR}/env.sh"

if [ "${SLURM_LOCALID:-0}" = "0" ]; then
    "${SCRIPT_DIR}/check_nvls_fabric.sh"
fi

export OMPI_MCA_coll=^hcoll
export OMPI_MCA_coll_ucc_enable=0
export UCC_TLS=cuda,ucp
export UCC_LOG_LEVEL=info
export UCC_TL_CUDA_NVLS_SM_COUNT=4

EXE="/opt/nvidia/src/ucc/build/test/mpi/ucc_test_mpi"
EXE+=" --set_device 2 --mtypes cuda"

echo "INFO: NVLS MPI tests (allreduce) ..."
UCC_TL_CUDA_TUNE="allreduce:cuda:@0" $EXE -c allreduce -d float32 -o sum -m 1024:33554432
echo "INFO: NVLS MPI tests (allreduce) ... DONE"

# echo "INFO: NVLS MPI tests (reduce_scatter) ..."
# UCC_TL_CUDA_TUNE="reduce_scatter:cuda:@3" $EXE -c reduce_scatter -d float32 -o sum -m 1024:33554432
# echo "INFO: NVLS MPI tests (reduce_scatter) ... DONE"
