Skip to content

Commit 1a13844

Browse files
Steboss and STEFANO BOSISIO authored
Update the dockerfile base image to cuda-dl-base (#1248)
Update the base docker image to `cuda-dl-base` --------- Co-authored-by: STEFANO BOSISIO <[email protected]>
1 parent e57ade9 commit 1a13844

File tree

10 files changed

+82
-326
lines changed

10 files changed

+82
-326
lines changed

.github/container/Dockerfile.base

Lines changed: 7 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,10 @@
11
# syntax=docker/dockerfile:1-labs
2-
ARG BASE_IMAGE=nvidia/cuda:12.6.3-devel-ubuntu24.04
2+
ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04
33
ARG GIT_USER_NAME="JAX Toolbox"
44
55
ARG CLANG_VERSION=18
66
ARG JAX_TOOLBOX_REF
77

8-
###############################################################################
9-
## Obtain GCP's NCCL TCPx plugin
10-
###############################################################################
11-
12-
FROM us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx:v3.1.10 AS tcpx-installer-amd64
13-
14-
# make a stub arm64 container because GCP does not provide an arm64 version of the plugin
15-
FROM ubuntu AS tcpx-installer-arm64
16-
RUN <<"OUTEREOF" bash -ex
17-
mkdir -p /scripts /var/lib/tcpx/lib64
18-
echo '#!/bin/bash' > /scripts/container_entry.sh
19-
chmod +x /scripts/container_entry.sh
20-
OUTEREOF
21-
22-
FROM tcpx-installer-${TARGETARCH} AS tcpx-installer
23-
RUN /scripts/container_entry.sh install
24-
258
###############################################################################
269
## Build base image
2710
###############################################################################
@@ -153,50 +136,18 @@ ENV PIP_BREAK_SYSTEM_PACKAGES=1
153136
RUN pip install --upgrade --ignore-installed --no-cache-dir -e /opt/pip pip-tools && rm -rf ~/.cache/*
154137

155138
###############################################################################
156-
## Install TCPx
157-
###############################################################################
158-
159-
ENV TCPX_LIBRARY_PATH=/usr/local/tcpx/lib64
160-
COPY --from=tcpx-installer /var/lib/tcpx/lib64 ${TCPX_LIBRARY_PATH}
161-
162-
###############################################################################
163-
## Install the latest versions of Nsight Systems and Nsight Compute
164-
###############################################################################
165-
166-
ADD install-nsight.sh /usr/local/bin
167-
RUN install-nsight.sh
168-
169-
###############################################################################
170-
## Install cuDNN
139+
## Symlink for cuDNN
171140
###############################################################################
172141

173-
ADD install-cudnn.sh /usr/local/bin
174-
RUN install-cudnn.sh
142+
ADD symlnk-cudnn.sh /usr/local/bin
143+
RUN symlnk-cudnn.sh
175144

176145
###############################################################################
177-
## Install NCCL
146+
## Symlink for NCCL
178147
###############################################################################
179148

180-
ADD install-nccl.sh /usr/local/bin
181-
RUN install-nccl.sh
182-
183-
###############################################################################
184-
## RoCE and InfiniBand support
185-
###############################################################################
186-
187-
ADD install-ofed.sh /usr/local/bin
188-
RUN install-ofed.sh
189-
190-
##############################################################################
191-
## Amazon EFA support (need to run it inside container separately)
192-
##############################################################################
193-
194-
ADD --chmod=777 \
195-
install-efa.sh \
196-
test-aws-efa.sh \
197-
/usr/local/bin/
198-
ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:${LD_LIBRARY_PATH}
199-
ENV PATH=/opt/amazon/efa/bin:${PATH}
149+
ADD symlnk-nccl.sh /usr/local/bin
150+
RUN symlnk-nccl.sh
200151

201152
##############################################################################
202153
## NCCL sanity check utility
@@ -207,18 +158,6 @@ ADD nccl-sanity-check.cu /opt
207158
RUN install-nccl-sanity-check.sh
208159
ADD jax-nccl-test parallel-launch /usr/local/bin/
209160

210-
###############################################################################
211-
## Add the systemcheck to the entrypoint.
212-
###############################################################################
213-
214-
COPY check-shm.sh /opt/nvidia/entrypoint.d/
215-
216-
###############################################################################
217-
## Add the GCP - TCPX check to the entrypoint.
218-
###############################################################################
219-
220-
# TODO(chaserileyroberts): Reenable once fully tested on GCP.
221-
# COPY gcp-autoconfig.sh /opt/nvidia/entrypoint.d/
222161

223162
###############################################################################
224163
## Install the nsys-jax JAX/XLA-aware profiling scripts, patch Nsight Systems

.github/container/check-shm.sh

Lines changed: 0 additions & 19 deletions
This file was deleted.

.github/container/install-cudnn.sh

Lines changed: 0 additions & 72 deletions
This file was deleted.

.github/container/install-efa.sh

Lines changed: 0 additions & 37 deletions
This file was deleted.

.github/container/install-nccl.sh

Lines changed: 0 additions & 58 deletions
This file was deleted.

.github/container/install-nsight.sh

Lines changed: 0 additions & 18 deletions
This file was deleted.

.github/container/install-ofed.sh

Lines changed: 0 additions & 42 deletions
This file was deleted.

.github/container/symlnk-cudnn.sh

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/bin/bash
#
# Build a "link farm" under /opt/nvidia/cudnn: a tree of symlinks that point at
# the files owned by the installed libcudnn* Debian packages, with the
# architecture-specific directory component and the versioned header suffix
# stripped. The resulting prefix can be passed to XLA so that it does not fetch
# its own copy of cuDNN.

set -ex

CUDNN_MAJOR_VERSION=9
prefix=/opt/nvidia/cudnn

# An already-existing prefix stops the script with a non-zero status.
# NOTE(review): the message says "Skipping" while the exit code signals failure
# (which aborts a Dockerfile RUN step) -- confirm which of the two is intended.
if [[ -d "${prefix}" ]]; then
  echo "Skipping link farm creation"
  exit 1
fi

# Architecture directory component used in the package paths,
# e.g. x86_64-linux-gnu
multiarch="$(uname -m)-linux-gnu"

# Package names from the `dpkg -l` lines that start with "ii"
cudnn_packages=$(dpkg -l 'libcudnn*' | awk '/^ii/ {print $2}')
if [[ -z "${cudnn_packages}" ]]; then
  echo "No libcudnn packages installed."
  exit 1
fi

# ${cudnn_packages} is intentionally unquoted: each package name must reach
# `dpkg -L` as a separate argument.
for src in $(dpkg -L ${cudnn_packages} | sort -u); do
  # Only regular files and symlinks are linked; anything else is reported
  # and left alone.
  if [[ ! -f "${src}" && ! -h "${src}" ]]; then
    echo "Skipping ${src}"
    continue
  fi

  # Strip the leading /usr/ so the remainder can be appended to ${prefix}
  rel="${src#"/usr/"}"
  # include/x86_64-linux-gnu -> include/
  rel="${rel/#"include/${multiarch}"/include}"
  # cudnn_v9.h -> cudnn.h
  rel="${rel/%"_v${CUDNN_MAJOR_VERSION}.h"/.h}"
  # lib/x86_64-linux-gnu -> lib/
  rel="${rel/#"lib/${multiarch}"/lib}"

  dest="${prefix}/${rel}"
  dest_dir=$(dirname "${dest}")
  mkdir -p "${dest_dir}"
  ln -s "${src}" "${dest}"
done

0 commit comments

Comments
 (0)