microsoft · dpower4 · Mar 21, 2025 · Jan 29, 2025 · Jan 29, 2025 · Feb 1, 2025
@@ -25,12 +25,24 @@ jobs:
     strategy:
       matrix:
         include:
+        # cuda12.8 dockerfile, one matrix entry per platform (linux/arm64 and
+        # linux/amd64). Both entries use the same tag, superbench/main:cuda12.8.
+        # NOTE(review): confirm later steps combine the per-platform images
+        # under that tag rather than overwrite one with the other.
+        - name: cuda12.8 arm64
+          dockerfile: cuda12.8
+          tags: superbench/main:cuda12.8
+          platforms: linux/arm64
+          runner: [self-hosted, linux/arm64]
+          build_args: "NUM_MAKE_JOBS=16"
+        - name: cuda12.8
+          dockerfile: cuda12.8
+          tags: superbench/main:cuda12.8
+          platforms: linux/amd64
+          runner: [self-hosted, linux/amd64]
+          build_args: "NUM_MAKE_JOBS=16"
         - name: cuda12.4 arm64
           dockerfile: cuda12.4
           tags: superbench/main:cuda12.4
           platforms: linux/arm64
           runner: [self-hosted, linux/arm64]
-          build_args: "NUM_MAKE_JOBS=8"
+          build_args: "NUM_MAKE_JOBS=16"
         - name: cuda12.4
           dockerfile: cuda12.4
           tags: superbench/main:cuda12.4

@@ -0,0 +1,171 @@
+FROM nvcr.io/nvidia/pytorch:25.02-py3
+
+# OS:
+#   - Ubuntu: 24.04
+#   - OpenMPI: 4.1.7+
+#   - Docker Client: 20.10.8
+# NVIDIA:
+#   - CUDA: 12.8.0.38
+#   - cuDNN: 9.7.1.26
+#   - cuBLAS: 12.8.3.14
+#   - NCCL: v2.25.1
+#   - TransformerEngine 2.0
+# Mellanox:
+#   - MOFED_VERSION: 5.4-rdmacore39.0 (NOTE: the "Install OFED" step below installs OFED_VERSION 24.10-1.1.4.0)
+#   - HPC-X: v2.21.0-CUDA12.x
+# Intel:
+#   - mlc: v3.11
+
+LABEL maintainer="SuperBench"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install the OS packages used by the rest of this file (build toolchain,
+# development libraries, SSH client/server, networking and inspection tools)
+# in a single layer, then remove apt caches and /tmp contents.
+# `autoremove` is given -y so it cannot stop at a confirmation prompt during a
+# non-interactive build.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    autoconf \
+    automake \
+    bc \
+    build-essential \
+    curl \
+    dmidecode \
+    ffmpeg \
+    git \
+    iproute2 \
+    jq \
+    libaio-dev \
+    libavcodec-dev \
+    libavformat-dev \
+    libavutil-dev \
+    libboost-program-options-dev \
+    libcap2 \
+    libcurl4-openssl-dev \
+    libnuma-dev \
+    libpci-dev \
+    libswresample-dev \
+    libncurses-dev \
+    libtool \
+    lshw \
+    python3-mpi4py \
+    net-tools \
+    nlohmann-json3-dev \
+    openssh-client \
+    openssh-server \
+    pciutils \
+    sudo \
+    util-linux \
+    vim \
+    wget \
+    rsync \
+    && \
+    apt-get autoremove -y && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/*
+
+# Build arguments:
+#   NUM_MAKE_JOBS  - passed to `make -j` in the NCCL and UCX build steps; empty by default.
+#   TARGETPLATFORM - declared here; not referenced by the steps visible in this file.
+#   TARGETARCH     - compared against "amd64" to gate the amd64-only installs.
+ARG NUM_MAKE_JOBS=
+ARG TARGETPLATFORM
+ARG TARGETARCH
+
+# Install the static Docker binaries that match the build machine's CPU
+# architecture (as reported by `uname -m`) into /usr/local/bin.
+ENV DOCKER_VERSION=20.10.8
+RUN HW_ARCH="$(uname -m)" && \
+    DOCKER_URL="https://download.docker.com/linux/static/stable/${HW_ARCH}/docker-${DOCKER_VERSION}.tgz" && \
+    wget -q -O docker.tgz "${DOCKER_URL}" && \
+    tar -xf docker.tgz --strip-components=1 -C /usr/local/bin/ && \
+    rm docker.tgz
+
+# Update system config:
+#   - create root's SSH directory/authorized_keys and the sshd runtime directory,
+#   - allow root login and user environment in sshd_config and set the port to 22,
+#   - append nofile limits of 1048576 (soft and hard) for all users and for root.
+# printf is used for the limits because whether `echo` turns "\n" into a newline
+# depends on the shell running the RUN step; printf always writes one line per
+# argument.
+RUN mkdir -p /root/.ssh && \
+    touch /root/.ssh/authorized_keys && \
+    mkdir -p /var/run/sshd && \
+    sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
+    sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
+    sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
+    printf '%s\n' "* soft nofile 1048576" "* hard nofile 1048576" >> /etc/security/limits.conf && \
+    printf '%s\n' "root soft nofile 1048576" "root hard nofile 1048576" >> /etc/security/limits.conf
+
+# Install OFED
+# Downloads the MLNX_OFED bundle for Ubuntu 24.04 and the `uname -m`
+# architecture into /tmp, runs its installer with the flags listed below, and
+# then deletes the downloaded archive and the extracted directory.
+ENV OFED_VERSION=24.10-1.1.4.0
+RUN TARGETARCH_HW=$(uname -m) && \
+    cd /tmp && \
+    wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}.tgz && \
+    tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}.tgz && \
+    MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
+    rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
+
+# Install HPC-X under /opt/hpcx, replacing any existing /opt/hpcx directory.
+# The distribution name is built once and reused for the download URL and for
+# the extracted directory that gets renamed.
+ENV HPCX_VERSION=v2.21
+RUN HPCX_DIST="hpcx-${HPCX_VERSION}-gcc-doca_ofed-ubuntu24.04-cuda12-$(uname -m)" && \
+    cd /opt && \
+    rm -rf hpcx && \
+    wget -O hpcx.tbz "https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/${HPCX_DIST}.tbz" && \
+    tar -xf hpcx.tbz && \
+    mv "${HPCX_DIST}" hpcx && \
+    rm hpcx.tbz
+
+# Installs specific to amd64 platform: Intel MLC, the AOCC compiler and AMD BLIS.
+# For any other TARGETARCH the step only prints a notice.
+RUN if [ "$TARGETARCH" = "amd64" ]; then \
+    # Install Intel MLC
+    cd /tmp && \
+    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    tar xzf mlc.tgz Linux/mlc && \
+    cp ./Linux/mlc /usr/local/bin/ && \
+    rm -rf ./Linux mlc.tgz && \
+    # Install AOCC compiler (apt-get, because the apt front end is not meant for scripts)
+    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
+    apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
+    rm -rf aocc-compiler-4.0.0_1_amd64.deb && \
+    # Install AMD BLIS
+    wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
+    tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
+    mv amd-blis /opt/AMD && \
+    rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \
+    else \
+    echo "Skipping Intel MLC, AOCC and AMD BLIS installations for non-amd64 architecture: $TARGETARCH"; \
+    fi
+
+# Install NCCL 2.25.1
+# Clones tag v2.25.1-1, builds it for compute_100/sm_100, installs it and
+# removes the source tree. A single -gencode entry is enough; repeating the
+# same architecture adds nothing.
+# NOTE(review): only sm_100 is targeted here — confirm whether other
+# architectures should be listed as well.
+RUN cd /tmp && \
+    git clone -b v2.25.1-1 https://github.com/NVIDIA/nccl.git && \
+    cd nccl && \
+    make -j ${NUM_MAKE_JOBS} src.build \
+    NVCC_GENCODE="-gencode=arch=compute_100,code=sm_100" && \
+    make install && \
+    rm -rf /tmp/nccl
+
+# Install UCX with multi-threading support
+# Builds the release tarball with contrib/configure-release-mt, installs it
+# under /usr/local, then removes the source tree and tarball from /tmp so they
+# do not stay in the image layer.
+# NOTE(review): the tarball is fetched from the v${UCX_VERSION}-rc1 release
+# tag — confirm this is intended rather than the final release.
+ENV UCX_VERSION=1.18.0
+RUN cd /tmp && \
+    wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}-rc1/ucx-${UCX_VERSION}.tar.gz && \
+    tar xzf ucx-${UCX_VERSION}.tar.gz && \
+    cd ucx-${UCX_VERSION} && \
+    ./contrib/configure-release-mt --prefix=/usr/local && \
+    make -j ${NUM_MAKE_JOBS} && \
+    make install && \
+    rm -rf /tmp/ucx-${UCX_VERSION} /tmp/ucx-${UCX_VERSION}.tar.gz
+
+# Runtime environment: PATH is re-declared unchanged, LD_LIBRARY_PATH gets
+# /usr/local/lib and /usr/local/mpi/lib prepended, and the SuperBench and
+# Ansible variables are set.
+ENV PATH="${PATH}" \
+    LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \
+    SB_HOME=/opt/superbench \
+    SB_MICRO_PATH=/opt/superbench \
+    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
+    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
+
+# Write PATH, LD_LIBRARY_PATH and SB_MICRO_PATH to /etc/environment (the first
+# echo truncates the file), and append the HPC-X init/load command to both
+# /etc/bash.bashrc and /etc/profile.d/10-hpcx.sh.
+RUN echo PATH="$PATH" > /etc/environment && \
+    echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
+    echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment && \
+    echo "source /opt/hpcx/hpcx-init.sh && hpcx_load" | tee -a /etc/bash.bashrc >> /etc/profile.d/10-hpcx.sh
+
+# Add config files
+# COPY is used for all local paths below: nothing here needs ADD's extra
+# behaviour (remote URLs, automatic archive extraction).
+COPY dockerfile/etc /opt/microsoft/
+
+WORKDIR ${SB_HOME}
+
+# Copy and build the third-party components before the rest of the sources.
+COPY third_party third_party
+RUN make -C third_party cuda_with_msccl
+
+# Copy the remaining sources, install the Python package with the nvworker
+# extra, run the cppbuild and postinstall make targets, and drop the .git
+# directory. Both pip invocations skip the pip cache.
+COPY . .
+RUN python3 -m pip install --no-cache-dir --upgrade setuptools==70.3.0 && \
+    python3 -m pip install --no-cache-dir .[nvworker] && \
+    make cppbuild && \
+    make postinstall && \
+    rm -rf .git
@@ -35,4 +35,7 @@ if(NOT DEFINED NVCC_ARCHS_SUPPORTED)
     if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.8)
       list(APPEND NVCC_ARCHS_SUPPORTED 89 90)
     endif()
+    # CUDA compiler 12.8 or newer: additionally append architecture 100.
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8)
+      list(APPEND NVCC_ARCHS_SUPPORTED 100)
+    endif()
 endif()
@@ -38,12 +38,18 @@ sb_micro_path:
 	mkdir -p $(SB_MICRO_PATH)/lib
 
 # Build cutlass.
+# For CUDA 12.8 and later, build from commit 389e493 (the 3.8 release commit) for Blackwell support.
 cuda_cutlass:
-ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
+ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
+	$(eval ARCHS := "75;80;86;89;90a;100;100a")
+	if [ -d cutlass ]; then rm -rf cutlass; fi
+	git clone --single-branch --branch main https://github.com/NVIDIA/cutlass.git && cd cutlass && git checkout 389e493
+else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
 	$(eval ARCHS := "70;75;80;86;89;90")
 else
 	$(eval ARCHS := "70;75;80;86")
 endif
+
 ifneq (,$(wildcard cutlass/CMakeLists.txt))
 	cmake -DCMAKE_INSTALL_BINDIR=$(SB_MICRO_PATH)/bin -DCMAKE_INSTALL_LIBDIR=$(SB_MICRO_PATH)/lib -DCMAKE_BUILD_TYPE=Release \
 		-DCUTLASS_NVCC_ARCHS=$(ARCHS) -DCUTLASS_ENABLE_EXAMPLES=OFF -DCUTLASS_ENABLE_TESTS=OFF -S ./cutlass -B ./cutlass/build
@@ -55,19 +61,26 @@ endif
 # The version we use is the released tag of cuda-samples which is consistent with the cuda version in the environment or docker.
 # The Makefile of bandwidthTest does not have 'install' target, so need to copy bin to $(SB_MICRO_PATH)/bin/ and create $(SB_MICRO_PATH)/bin/ if not existing.
 cuda_bandwidthTest: sb_micro_path
-ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
+# Always start from a fresh shallow clone of the cuda-samples tag v$(CUDA_VER).
+	if [ -d cuda-samples ]; then rm -rf cuda-samples; fi
+	git clone --depth 1 -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git
+ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
+# CUDA >= 12.8: configure with cmake in a build/ subdirectory and copy the binary from there.
+	$(eval TEST_PATH := "./cuda-samples/Samples/1_Utilities/bandwidthTest")
+	cd ./$(TEST_PATH) && mkdir build && cd build && cmake .. && make
+	cp -v ./$(TEST_PATH)/build/bandwidthTest $(SB_MICRO_PATH)/bin/
+else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
+# 11.8 <= CUDA < 12.8: build with the sample's own Makefile, passing the arch list as SMS.
 	$(eval TEST_PATH := "./cuda-samples/Samples/1_Utilities/bandwidthTest")
 	$(eval ARCHS := "70 75 80 86 90")
+	cd ./$(TEST_PATH) && make clean && make SMS=$(ARCHS)
+	cp -v ./$(TEST_PATH)/bandwidthTest $(SB_MICRO_PATH)/bin/
 else
 	$(eval TEST_PATH := "./cuda-samples/Samples/bandwidthTest")
 	$(eval ARCHS := "70 75 80 86")
-endif
-	if [ -d cuda-samples ]; then rm -rf cuda-samples; fi
-	git clone -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git
 	cd ./$(TEST_PATH) && make clean && make SMS=$(ARCHS)
 	cp -v ./$(TEST_PATH)/bandwidthTest $(SB_MICRO_PATH)/bin/
+endif
 
-# Build nccl-tests from commit 8274cb4 of default branch.
+# Build nccl-tests.
+# The version we use is the tag v2.13.13
 cuda_nccl_tests: sb_micro_path
 ifneq (,$(wildcard nccl-tests/Makefile))
 	cd ./nccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) -j
@@ -219,9 +232,21 @@ apex_rocm:
 
 # Build MSCCL for CUDA
 cuda_msccl: sb_micro_path
+# Select the GPU architecture list from the CUDA version.
+ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
+# CUDA >= 12.8: re-clone msccl at commit 87048bd to support the updated nccl and sm_100.
+	$(eval ARCHS := 75 80 86 89 90 100)
+	if [ -d msccl ]; then rm -rf msccl; fi; \
+	git clone --single-branch --branch main https://github.com/Azure/msccl.git \
+    && git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
+else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
+	$(eval ARCHS := 70 75 80 86 89 90)
+else
+	$(eval ARCHS := 70 75 80 86)
+endif
+# Turn ARCHS into one -gencode flag per architecture; the quoted result is
+# passed to the msccl builds as NVCC_GENCODE.
+# NOTE(review): the foreach body also expands the previous $(NVCC_GENCODE)
+# once per architecture — confirm that is intended when it is already set.
+	$(eval NVCC_GENCODE := "$(foreach arch, $(ARCHS), $(NVCC_GENCODE) -gencode=arch=compute_$(arch),code=sm_$(arch))")
 ifneq (,$(wildcard msccl/executor/msccl-executor-nccl/Makefile))
 	cd ./msccl/executor/msccl-executor-nccl && \
-	make -j ${NUM_MAKE_JOBS} src.build && \
+	make -j ${NUM_MAKE_JOBS} src.build NVCC_GENCODE=$(NVCC_GENCODE) && \
 	cd ../../..
 	mkdir -p $(SB_MICRO_PATH)/lib/msccl-executor-nccl && \
 	cp -r -v ./msccl/executor/msccl-executor-nccl/build/* $(SB_MICRO_PATH)/lib/msccl-executor-nccl/
@@ -235,7 +260,7 @@ ifneq (,$(wildcard msccl/scheduler/msccl-scheduler/Makefile))
 endif
 ifneq (,$(wildcard msccl/tests/msccl-tests-nccl/Makefile))
 	cd ./msccl/tests/msccl-tests-nccl && \
-	make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl -j ${NUM_MAKE_JOBS} && cd ../../..
+	make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl NVCC_GENCODE=$(NVCC_GENCODE) -j ${NUM_MAKE_JOBS} && cd ../../..
 	mkdir -p $(SB_MICRO_PATH)/bin/msccl-tests-nccl && \
 	cp -r -v ./msccl/tests/msccl-tests-nccl/build/* $(SB_MICRO_PATH)/bin/msccl-tests-nccl/
 endif
+9 −4		README.md
+1 −0		doc/PERFORMANCE.md
+18 −3		src/Makefile
+3 −5		src/all_gather.cu
+2 −2		src/all_reduce.cu
+5 −5		src/alltoall.cu
+2 −2		src/broadcast.cu
+81 −6		src/common.cu
+2 −2		src/common.h
+6 −6		src/gather.cu
+3 −3		src/hypercube.cu
+2 −2		src/reduce.cu
+3 −5		src/reduce_scatter.cu
+6 −6		src/scatter.cu
+2 −2		src/sendrecv.cu