Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dockerfile - Support cuda12.8 for Blackwell arch #682

Merged
merged 19 commits into from
Mar 21, 2025
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,24 @@ jobs:
strategy:
matrix:
include:
- name: cuda12.8 arm64
dockerfile: cuda12.8
tags: superbench/main:cuda12.8
platforms: linux/arm64
runner: [self-hosted, linux/arm64]
build_args: "NUM_MAKE_JOBS=16"
- name: cuda12.8
dockerfile: cuda12.8
tags: superbench/main:cuda12.8
platforms: linux/amd64
runner: [self-hosted, linux/amd64]
build_args: "NUM_MAKE_JOBS=16"
- name: cuda12.4 arm64
dockerfile: cuda12.4
tags: superbench/main:cuda12.4
platforms: linux/arm64
runner: [self-hosted, linux/arm64]
build_args: "NUM_MAKE_JOBS=8"
build_args: "NUM_MAKE_JOBS=16"
- name: cuda12.4
dockerfile: cuda12.4
tags: superbench/main:cuda12.4
Expand Down
171 changes: 171 additions & 0 deletions dockerfile/cuda12.8.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
FROM nvcr.io/nvidia/pytorch:25.02-py3

# OS:
# - Ubuntu: 24.04
# - OpenMPI: 4.1.7+
# - Docker Client: 20.10.8
# NVIDIA:
# - CUDA: 12.8.0.38
# - cuDNN: 9.7.1.26
# - cuBLAS: 12.8.3.14
# - NCCL: v2.25.1
# - TransformerEngine 2.0
# Mellanox:
# - MLNX_OFED: 24.10-1.1.4.0 (see OFED_VERSION below)
# - HPC-X: v2.21.0-CUDA12.x
# Intel:
# - mlc: v3.11

LABEL maintainer="SuperBench"

ENV DEBIAN_FRONTEND=noninteractive

# Install the OS-level build tools, development libraries and utilities in a
# single layer, then drop the apt caches and /tmp so they do not end up in the image.
# Package names are kept in alphabetical order so later additions diff cleanly.
# `autoremove` is given -y: the build has no terminal, so a confirmation prompt
# would abort the step whenever there is something to remove.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        autoconf \
        automake \
        bc \
        build-essential \
        curl \
        dmidecode \
        ffmpeg \
        git \
        iproute2 \
        jq \
        libaio-dev \
        libavcodec-dev \
        libavformat-dev \
        libavutil-dev \
        libboost-program-options-dev \
        libcap2 \
        libcurl4-openssl-dev \
        libncurses-dev \
        libnuma-dev \
        libpci-dev \
        libswresample-dev \
        libtool \
        lshw \
        net-tools \
        nlohmann-json3-dev \
        openssh-client \
        openssh-server \
        pciutils \
        python3-mpi4py \
        rsync \
        sudo \
        util-linux \
        vim \
        wget \
    && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/*

ARG NUM_MAKE_JOBS=
ARG TARGETPLATFORM
ARG TARGETARCH

# Install Docker: fetch the static release archive for DOCKER_VERSION and
# unpack its contents (top-level directory stripped) into /usr/local/bin.
# The architecture path component is whatever `uname -m` reports in the build container.
ENV DOCKER_VERSION=20.10.8
RUN HW_ARCH=$(uname -m) && \
    wget -q -O docker.tgz https://download.docker.com/linux/static/stable/${HW_ARCH}/docker-${DOCKER_VERSION}.tgz && \
    tar -x -f docker.tgz --strip-components=1 -C /usr/local/bin/ && \
    rm docker.tgz

# Update system config:
#   - create root's SSH directory/authorized_keys and the sshd runtime directory,
#   - enable PermitRootLogin and PermitUserEnvironment and set the sshd port to 22,
#   - append soft/hard open-file limits of 1048576 for all users and for root.
# The limits are written with printf (one argument per line) instead of
# `echo "...\n..."`, whose handling of backslash escapes differs between shells.
RUN mkdir -p /root/.ssh && \
    touch /root/.ssh/authorized_keys && \
    mkdir -p /var/run/sshd && \
    sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
    printf '%s\n' \
        '* soft nofile 1048576' \
        '* hard nofile 1048576' \
        'root soft nofile 1048576' \
        'root hard nofile 1048576' \
        >> /etc/security/limits.conf

# Install OFED: download the MLNX_OFED bundle for Ubuntu 24.04 matching
# OFED_VERSION and the container's `uname -m`, run its installer with the
# user-space-only options below, then delete the unpacked bundle and tarball.
ENV OFED_VERSION=24.10-1.1.4.0
RUN HW_ARCH=$(uname -m) && \
    OFED_BUNDLE=MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${HW_ARCH} && \
    cd /tmp && \
    wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/${OFED_BUNDLE}.tgz && \
    tar xzf ${OFED_BUNDLE}.tgz && \
    ${OFED_BUNDLE}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
    rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*

# Install HPC-X: replace any existing /opt/hpcx with the HPCX_VERSION
# distribution (gcc / doca_ofed / Ubuntu 24.04 / CUDA 12 flavour) for the
# architecture reported by `uname -m`.
ENV HPCX_VERSION=v2.21
RUN HW_ARCH=$(uname -m) && \
    HPCX_DIST=hpcx-${HPCX_VERSION}-gcc-doca_ofed-ubuntu24.04-cuda12-${HW_ARCH} && \
    cd /opt && \
    rm -rf hpcx && \
    wget -O hpcx.tbz https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/${HPCX_DIST}.tbz && \
    tar xf hpcx.tbz && \
    mv ${HPCX_DIST} hpcx && \
    rm hpcx.tbz

# Installs specific to the amd64 platform.
# TARGETARCH is the build ARG declared earlier in this file; the three tools
# below are only installed when it equals "amd64", otherwise a skip message is
# printed and the layer is effectively empty.
RUN if [ "$TARGETARCH" = "amd64" ]; then \
# Install Intel MLC: extract only Linux/mlc from the archive and copy it to /usr/local/bin
cd /tmp && \
wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz && \
# Install AOCC compiler 4.0.0 from its .deb package (still in /tmp), then remove the package file
wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
rm -rf aocc-compiler-4.0.0_1_amd64.deb && \
# Install AMD BLIS 4.0: unpack the archive and move the extracted amd-blis directory to /opt/AMD
wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
mv amd-blis /opt/AMD && \
rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \
else \
echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \
fi

# Install NCCL 2.25.1: build tag v2.25.1-1 from source for compute capability
# 10.0 (sm_100), install it, and remove the source tree afterwards.
# NUM_MAKE_JOBS is the build ARG declared earlier; it sets make's parallelism.
# NOTE(review): this step previously passed the same sm_100 -gencode flag three
# times. The duplicates are collapsed here without changing what gets compiled;
# if additional architectures (e.g. sm_80/sm_90) were intended, add them to
# NVCC_GENCODE - confirm with the image owners.
RUN cd /tmp && \
    git clone -b v2.25.1-1 https://github.com/NVIDIA/nccl.git && \
    cd nccl && \
    make -j ${NUM_MAKE_JOBS} src.build \
        NVCC_GENCODE="-gencode=arch=compute_100,code=sm_100" && \
    make install && \
    rm -rf /tmp/nccl

# Install UCX with multi-threading support: configure with the release-mt
# script, install under /usr/local, then delete the tarball and build tree so
# they are not baked into the image layer.
# NOTE(review): the archive is fetched from the v${UCX_VERSION}-rc1 release
# page - confirm whether the final release tag should be used instead.
ENV UCX_VERSION=1.18.0
RUN cd /tmp && \
    wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}-rc1/ucx-${UCX_VERSION}.tar.gz && \
    tar xzf ucx-${UCX_VERSION}.tar.gz && \
    cd ucx-${UCX_VERSION} && \
    ./contrib/configure-release-mt --prefix=/usr/local && \
    make -j ${NUM_MAKE_JOBS} && \
    make install && \
    cd /tmp && \
    rm -rf /tmp/ucx-${UCX_VERSION} /tmp/ucx-${UCX_VERSION}.tar.gz

# Runtime environment for SuperBench: library search path, install locations
# and Ansible settings.
ENV PATH="${PATH}" \
    LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \
    SB_HOME=/opt/superbench \
    SB_MICRO_PATH=/opt/superbench \
    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections

# Write PATH, LD_LIBRARY_PATH and SB_MICRO_PATH to /etc/environment
# (overwriting the file), and append the HPC-X init line to both
# /etc/bash.bashrc and /etc/profile.d/10-hpcx.sh.
RUN { \
        echo PATH="$PATH"; \
        echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH"; \
        echo SB_MICRO_PATH="$SB_MICRO_PATH"; \
    } > /etc/environment && \
    echo "source /opt/hpcx/hpcx-init.sh && hpcx_load" | tee -a /etc/bash.bashrc >> /etc/profile.d/10-hpcx.sh

# Add config files.
# COPY is used instead of ADD throughout: every source here is a plain local
# directory, so none of ADD's extra behaviour (archive extraction, URL fetch)
# is needed.
COPY dockerfile/etc /opt/microsoft/

WORKDIR ${SB_HOME}

# Copy and build the third-party benchmarks before the rest of the tree is
# copied in, as a separate layer from the project sources.
COPY third_party third_party
RUN make -C third_party cuda_with_msccl

# Copy the project, install it with the nvworker extras, build the C++
# benchmarks, run the post-install step and drop the git metadata.
# Both pip invocations use --no-cache-dir so no wheel cache is left in the layer.
COPY . .
RUN python3 -m pip install --no-cache-dir --upgrade setuptools==70.3.0 && \
    python3 -m pip install --no-cache-dir .[nvworker] && \
    make cppbuild && \
    make postinstall && \
    rm -rf .git
3 changes: 3 additions & 0 deletions superbench/benchmarks/micro_benchmarks/cuda_common.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,7 @@ if(NOT DEFINED NVCC_ARCHS_SUPPORTED)
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.8)
list(APPEND NVCC_ARCHS_SUPPORTED 89 90)
endif()
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8)
list(APPEND NVCC_ARCHS_SUPPORTED 100)
endif()
endif()
41 changes: 33 additions & 8 deletions third_party/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,18 @@ sb_micro_path:
mkdir -p $(SB_MICRO_PATH)/lib

# Build cutlass.
# For CUDA 12.8 and later, build from commit 389e493 (the 3.8 release commit) for Blackwell support.
cuda_cutlass:
ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
$(eval ARCHS := "75;80;86;89;90a;100;100a")
if [ -d cutlass ]; then rm -rf cutlass; fi
git clone --single-branch --branch main https://github.com/NVIDIA/cutlass.git && cd cutlass && git checkout 389e493
else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
$(eval ARCHS := "70;75;80;86;89;90")
else
$(eval ARCHS := "70;75;80;86")
endif

ifneq (,$(wildcard cutlass/CMakeLists.txt))
cmake -DCMAKE_INSTALL_BINDIR=$(SB_MICRO_PATH)/bin -DCMAKE_INSTALL_LIBDIR=$(SB_MICRO_PATH)/lib -DCMAKE_BUILD_TYPE=Release \
-DCUTLASS_NVCC_ARCHS=$(ARCHS) -DCUTLASS_ENABLE_EXAMPLES=OFF -DCUTLASS_ENABLE_TESTS=OFF -S ./cutlass -B ./cutlass/build
Expand All @@ -55,19 +61,26 @@ endif
# The version we use is the released tag of cuda-samples which is consistent with the cuda version in the environment or docker.
# The Makefile of bandwidthTest does not have 'install' target, so need to copy bin to $(SB_MICRO_PATH)/bin/ and create $(SB_MICRO_PATH)/bin/ if not existing.
cuda_bandwidthTest: sb_micro_path
ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
if [ -d cuda-samples ]; then rm -rf cuda-samples; fi
git clone --depth 1 -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git
ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
$(eval TEST_PATH := "./cuda-samples/Samples/1_Utilities/bandwidthTest")
cd ./$(TEST_PATH) && mkdir build && cd build && cmake .. && make
cp -v ./$(TEST_PATH)/build/bandwidthTest $(SB_MICRO_PATH)/bin/
else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
$(eval TEST_PATH := "./cuda-samples/Samples/1_Utilities/bandwidthTest")
$(eval ARCHS := "70 75 80 86 90")
cd ./$(TEST_PATH) && make clean && make SMS=$(ARCHS)
cp -v ./$(TEST_PATH)/bandwidthTest $(SB_MICRO_PATH)/bin/
else
$(eval TEST_PATH := "./cuda-samples/Samples/bandwidthTest")
$(eval ARCHS := "70 75 80 86")
endif
if [ -d cuda-samples ]; then rm -rf cuda-samples; fi
git clone -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git
cd ./$(TEST_PATH) && make clean && make SMS=$(ARCHS)
cp -v ./$(TEST_PATH)/bandwidthTest $(SB_MICRO_PATH)/bin/
endif

# Build nccl-tests from commit 8274cb4 of default branch.
# Build nccl-tests.
# The version we use is the tag v2.13.13
cuda_nccl_tests: sb_micro_path
ifneq (,$(wildcard nccl-tests/Makefile))
cd ./nccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) -j
Expand Down Expand Up @@ -219,9 +232,21 @@ apex_rocm:

# Build MSCCL for CUDA
cuda_msccl: sb_micro_path
# Select the GPU architectures MSCCL is built for, based on CUDA_VER, and turn
# them into the NVCC_GENCODE flag string handed to the msccl sub-makes.
ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
# CUDA >= 12.8: re-clone msccl and check out commit 87048bd to support the
# updated nccl and sm_100.
	$(eval ARCHS := 75 80 86 89 90 100)
	if [ -d msccl ]; then rm -rf msccl; fi; \
	git clone --single-branch --branch main https://github.com/Azure/msccl.git \
	&& git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
	$(eval ARCHS := 70 75 80 86 89 90)
else
# No trailing quote here: a stray `"` would become part of the last arch token
# and leak into the generated -gencode flags.
	$(eval ARCHS := 70 75 80 86)
endif
# One -gencode entry per architecture; the whole list is wrapped in double
# quotes so it reaches the sub-make as a single NVCC_GENCODE=... argument.
	$(eval NVCC_GENCODE := "$(foreach arch, $(ARCHS), $(NVCC_GENCODE) -gencode=arch=compute_$(arch),code=sm_$(arch))")
ifneq (,$(wildcard msccl/executor/msccl-executor-nccl/Makefile))
cd ./msccl/executor/msccl-executor-nccl && \
make -j ${NUM_MAKE_JOBS} src.build && \
make -j ${NUM_MAKE_JOBS} src.build NVCC_GENCODE=$(NVCC_GENCODE) && \
cd ../../..
mkdir -p $(SB_MICRO_PATH)/lib/msccl-executor-nccl && \
cp -r -v ./msccl/executor/msccl-executor-nccl/build/* $(SB_MICRO_PATH)/lib/msccl-executor-nccl/
Expand All @@ -235,7 +260,7 @@ ifneq (,$(wildcard msccl/scheduler/msccl-scheduler/Makefile))
endif
ifneq (,$(wildcard msccl/tests/msccl-tests-nccl/Makefile))
cd ./msccl/tests/msccl-tests-nccl && \
make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl -j ${NUM_MAKE_JOBS} && cd ../../..
make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl NVCC_GENCODE=$(NVCC_GENCODE) -j ${NUM_MAKE_JOBS} && cd ../../..
mkdir -p $(SB_MICRO_PATH)/bin/msccl-tests-nccl && \
cp -r -v ./msccl/tests/msccl-tests-nccl/build/* $(SB_MICRO_PATH)/bin/msccl-tests-nccl/
endif
Expand Down
Loading