Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 7 additions & 68 deletions .github/container/Dockerfile.base
Original file line number Diff line number Diff line change
@@ -1,27 +1,10 @@
# syntax=docker/dockerfile:1-labs
ARG BASE_IMAGE=nvidia/cuda:12.6.3-devel-ubuntu24.04
ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:24.12-cuda12.6-devel-ubuntu24.04
ARG GIT_USER_NAME="JAX Toolbox"
ARG [email protected]
ARG CLANG_VERSION=18
ARG JAX_TOOLBOX_REF

###############################################################################
## Obtain GCP's NCCL TCPx plugin
###############################################################################

FROM us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx:v3.1.10 AS tcpx-installer-amd64

# make a stub arm64 container because GCP does not provide an arm64 version of the plugin
FROM ubuntu AS tcpx-installer-arm64
RUN <<"OUTEREOF" bash -ex
mkdir -p /scripts /var/lib/tcpx/lib64
echo '#!/bin/bash' > /scripts/container_entry.sh
chmod +x /scripts/container_entry.sh
OUTEREOF

FROM tcpx-installer-${TARGETARCH} AS tcpx-installer
RUN /scripts/container_entry.sh install

###############################################################################
## Build base image
###############################################################################
Expand Down Expand Up @@ -153,50 +136,18 @@ ENV PIP_BREAK_SYSTEM_PACKAGES=1
RUN pip install --upgrade --ignore-installed --no-cache-dir -e /opt/pip pip-tools && rm -rf ~/.cache/*

###############################################################################
## Install TCPx
###############################################################################

ENV TCPX_LIBRARY_PATH=/usr/local/tcpx/lib64
COPY --from=tcpx-installer /var/lib/tcpx/lib64 ${TCPX_LIBRARY_PATH}

###############################################################################
## Install the latest versions of Nsight Systems and Nsight Compute
###############################################################################

ADD install-nsight.sh /usr/local/bin
RUN install-nsight.sh

###############################################################################
## Install cuDNN
## Symlink for cuDNN
###############################################################################

ADD install-cudnn.sh /usr/local/bin
RUN install-cudnn.sh
ADD symlnk-cudnn.sh /usr/local/bin
RUN symlnk-cudnn.sh

###############################################################################
## Install NCCL
## Symlink for NCCL
###############################################################################

ADD install-nccl.sh /usr/local/bin
RUN install-nccl.sh

###############################################################################
## RoCE and InfiniBand support
###############################################################################

ADD install-ofed.sh /usr/local/bin
RUN install-ofed.sh

##############################################################################
## Amazon EFA support (need to run it inside container separately)
##############################################################################

ADD --chmod=777 \
install-efa.sh \
test-aws-efa.sh \
/usr/local/bin/
ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:${LD_LIBRARY_PATH}
ENV PATH=/opt/amazon/efa/bin:${PATH}
ADD symlnk-nccl.sh /usr/local/bin
RUN symlnk-nccl.sh

##############################################################################
## NCCL sanity check utility
Expand All @@ -207,18 +158,6 @@ ADD nccl-sanity-check.cu /opt
RUN install-nccl-sanity-check.sh
ADD jax-nccl-test parallel-launch /usr/local/bin/

###############################################################################
## Add the systemcheck to the entrypoint.
###############################################################################

COPY check-shm.sh /opt/nvidia/entrypoint.d/

###############################################################################
## Add the GCP - TCPX check to the entrypoint.
###############################################################################

# TODO(chaserileyroberts): Reenable once fully tested on GCP.
# COPY gcp-autoconfig.sh /opt/nvidia/entrypoint.d/

###############################################################################
## Install the nsys-jax JAX/XLA-aware profiling scripts, patch Nsight Systems
Expand Down
19 changes: 0 additions & 19 deletions .github/container/check-shm.sh

This file was deleted.

72 changes: 0 additions & 72 deletions .github/container/install-cudnn.sh

This file was deleted.

37 changes: 0 additions & 37 deletions .github/container/install-efa.sh

This file was deleted.

58 changes: 0 additions & 58 deletions .github/container/install-nccl.sh

This file was deleted.

18 changes: 0 additions & 18 deletions .github/container/install-nsight.sh

This file was deleted.

42 changes: 0 additions & 42 deletions .github/container/install-ofed.sh

This file was deleted.

41 changes: 41 additions & 0 deletions .github/container/symlnk-cudnn.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash
#
# Build a "link farm" under /opt/nvidia/cudnn: a prefix with include/ and lib/
# directories whose entries are symlinks to the files owned by the libcudnn*
# packages that dpkg reports as installed. Pointing XLA at this prefix avoids
# it fetching its own copy of cuDNN.
#
# Exit status:
#   0  link farm created
#   1  the prefix already exists, or no libcudnn package is installed
#   non-zero (via `set -e`) if any dpkg/mkdir/ln command fails

set -ex

CUDNN_MAJOR_VERSION=9

prefix=/opt/nvidia/cudnn
if [[ -d "${prefix}" ]]; then
  # NOTE(review): the message says "Skipping" but the script exits non-zero,
  # which fails a Dockerfile RUN step -- confirm that failing is intended.
  echo "Skipping link farm creation"
  exit 1
fi

# Multiarch directory name used under /usr/include and /usr/lib.
arch=$(uname -m)-linux-gnu

# Names of all fully installed ("ii") libcudnn* packages, one array element per
# package. dpkg -l exits non-zero when nothing matches; that status is ignored
# here so the explicit emptiness check below can report the problem.
mapfile -t libcudnn_pkgs < <(dpkg -l 'libcudnn*' | awk '/^ii/ {print $2}')
if (( ${#libcudnn_pkgs[@]} == 0 )); then
  echo "No libcudnn packages installed."
  exit 1
fi

# Capture the file list in a plain assignment so that a dpkg -L failure aborts
# the script under `set -e` instead of being hidden behind a pipeline.
cudnn_files=$(dpkg -L "${libcudnn_pkgs[@]}")

# Read the list line by line (no word-splitting or globbing of path names).
while IFS= read -r cudnn_file; do
  # dpkg -L separates the listings of multiple packages with blank lines.
  [[ -n "${cudnn_file}" ]] || continue
  # Real files and symlinks are linked into $prefix
  if [[ -f "${cudnn_file}" || -h "${cudnn_file}" ]]; then
    # Strip the leading /usr/ so the path can be re-rooted under $prefix
    nosysprefix="${cudnn_file#"/usr/"}"
    # include/x86_64-linux-gnu -> include/
    noarchinclude="${nosysprefix/#"include/${arch}"/include}"
    # cudnn_v9.h -> cudnn.h
    noverheader="${noarchinclude/%"_v${CUDNN_MAJOR_VERSION}.h"/.h}"
    # lib/x86_64-linux-gnu -> lib/
    noarchlib="${noverheader/#"lib/${arch}"/lib}"
    link_name="${prefix}/${noarchlib}"
    link_dir=$(dirname "${link_name}")
    mkdir -p "${link_dir}"
    ln -s "${cudnn_file}" "${link_name}"
  else
    # Directories and other non-file entries listed by dpkg -L are not linked.
    echo "Skipping ${cudnn_file}"
  fi
done < <(sort -u <<< "${cudnn_files}")
Loading
Loading