Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions base/buildspec-cu128-ubuntu22.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Buildspec for the CUDA 12.8 base image on Ubuntu 22.04 (x86_64, GPU).
# Top-level scalars are anchored (&NAME) so later sections can reuse them
# through aliases (*NAME).

# The angle-bracket values are placeholders; as their text says, the real
# account id and region are expected to come from the environment.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
# Unquoted and all digits, so a YAML parser loads this as an integer.
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK base
# "12.8.0" has two dots and therefore loads as a string. The short form is
# quoted because an unquoted 12.8 would load as a float.
version: &VERSION 12.8.0
short_version: &SHORT_VERSION "12.8"
arch_type: &ARCH_TYPE x86_64
# Quoted, so this is the string "False" rather than a YAML boolean.
# NOTE(review): confirm the consumer expects a string here.
autopatch_build: "False"

# Repository names/URIs. `!join` is an application-specific tag; presumably the
# loader concatenates the listed items into one string — TODO confirm.
repository_info:
base_repository: &BASE_REPOSITORY
image_type: &IMAGE_TYPE gpu
root: .
repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK ]
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK ]
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

# Files made available to the image build. Each entry pairs a `source` path
# with a `target` file name; the target names are the ones the Dockerfile's
# COPY instructions refer to.
context:
base_context: &BASE_CONTEXT
deep_learning_container:
source: src/deep_learning_container.py
target: deep_learning_container.py
install_python:
source: scripts/install_python.sh
target: install_python.sh
install_cuda:
source: scripts/install_cuda.sh
target: install_cuda.sh
install_efa:
source: scripts/install_efa.sh
target: install_efa.sh

# Image definitions. `<<:` merges the anchored mappings defined above.
images:
base_x86_64_gpu_cuda128:
<<: *BASE_REPOSITORY
context:
<<: *BASE_CONTEXT
# NOTE(review): unit is not stated in this file — presumably MB; confirm.
image_size_baseline: 11000
device_type: &DEVICE_TYPE gpu
cuda_version: &CUDA_VERSION cu128
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py312
os_version: &OS_VERSION ubuntu22.04
# `tag` and `latest_release_tag` are assembled from exactly the same parts.
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
# With the anchors above, the parts are base, x86_64, gpu, cu128 and
# ubuntu22.04, separated by "/" and followed by "/Dockerfile".
docker_file: !join [ *FRAMEWORK, /, *ARCH_TYPE, /, *DEVICE_TYPE, /, *CUDA_VERSION, /, *OS_VERSION, /Dockerfile ]
# Names the Dockerfile stage to build (the Dockerfile declares `AS final`).
target: final
build: true
enable_common_stage_build: false
test_configs:
test_platforms:
- sanity
- security
126 changes: 126 additions & 0 deletions base/x86_64/gpu/cu128/ubuntu22.04/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# Base image for CUDA 12.8 on Ubuntu 22.04: Python, the CUDA toolkit with
# cuDNN and NCCL, and EFA, assembled with a multi-stage build.
#
# ARGs declared before the first FROM are global. They may be used in FROM
# lines, but a stage has to re-declare (`ARG NAME`) each one it reads in its
# own instructions.
ARG PYTHON="python3"
ARG PYTHON_VERSION="3.12.10"
ARG PYTHON_SHORT_VERSION="3.12"
ARG CUDA_MAJOR="12"
ARG CUDA_MINOR="8"
# Patch level of the nvidia/cuda base image. Every stage derives its base tag
# from CUDA_MAJOR/CUDA_MINOR/CUDA_PATCH so builder and final stages cannot
# drift apart (the final stage previously used 12.8.1 while the builder stage
# and the toolkit installer used 12.8.0).
ARG CUDA_PATCH="0"
ARG EFA_VERSION="1.40.0"
ARG OS_VERSION="ubuntu22.04"
FROM nvidia/cuda:${CUDA_MAJOR}.${CUDA_MINOR}.${CUDA_PATCH}-base-${OS_VERSION} AS base-builder

# Park the base image's compat directory at /usr/local/compat (the CUDA 12.8
# install function in install_cuda.sh removes /usr/local/cuda* and then moves
# it back), and install the toolchain the builder stages need.
RUN mv /usr/local/cuda/compat /usr/local \
    && apt-get update \
    && apt-get -y upgrade --only-upgrade systemd \
    && apt-get install -y --allow-change-held-packages --no-install-recommends \
        automake \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        emacs \
        git \
        jq \
        libcurl4-openssl-dev \
        libglib2.0-0 \
        libegl1 \
        libgl1 \
        libsm6 \
        libssl-dev \
        libxext6 \
        libxrender-dev \
        zlib1g-dev \
        unzip \
        vim \
        wget \
        libhwloc-dev \
        libgomp1 \
        libibverbs-dev \
        libnuma1 \
        libnuma-dev \
        libtool \
        openssl \
        python3-dev \
        autoconf \
        pkg-config \
        check \
        libsubunit0 \
        libsubunit-dev \
        libffi-dev \
        libbz2-dev \
        liblzma-dev \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

##############################################################################
# Python: runs install_python.sh; the final stage copies the result out of
# /usr/local.
FROM base-builder AS python-builder
ARG PYTHON_VERSION
COPY install_python.sh install_python.sh
RUN bash install_python.sh ${PYTHON_VERSION} && rm install_python.sh

##############################################################################
# CUDA: install_cuda.sh takes "<major>.<minor>" and the OS version and selects
# the matching install function.
FROM base-builder AS cuda-builder
ARG CUDA_MAJOR
ARG CUDA_MINOR
ARG OS_VERSION
COPY install_cuda.sh install_cuda.sh
RUN bash install_cuda.sh "${CUDA_MAJOR}.${CUDA_MINOR}" "${OS_VERSION}" && rm install_cuda.sh

##############################################################################
# Final image: starts again from the same nvidia/cuda base tag as the builder
# stages and copies in only the build results.
FROM nvidia/cuda:${CUDA_MAJOR}.${CUDA_MINOR}.${CUDA_PATCH}-base-${OS_VERSION} AS final
ARG PYTHON
ARG PYTHON_SHORT_VERSION
ARG CUDA_MAJOR
ARG CUDA_MINOR
ARG EFA_VERSION
LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"
# PYTHONDONTWRITEBYTECODE: Python won't try to write .pyc or .pyo files on the
# import of source modules.
# PYTHONUNBUFFERED: force stdin, stdout and stderr to be totally unbuffered.
# Good for logging.
ENV DEBIAN_FRONTEND=noninteractive \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    DLC_CONTAINER_TYPE=base \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    CUDA_HOME="/usr/local/cuda" \
    PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" \
    LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:/opt/amazon/ofi-nccl/lib:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:${LD_LIBRARY_PATH}"

WORKDIR /

# + python and pip packages (awscli, boto3, requests)
COPY --from=python-builder /usr/local/lib/python${PYTHON_SHORT_VERSION} /usr/local/lib/python${PYTHON_SHORT_VERSION}
COPY --from=python-builder /usr/local/include/python${PYTHON_SHORT_VERSION} /usr/local/include/python${PYTHON_SHORT_VERSION}
COPY --from=python-builder /usr/local/bin /usr/local/bin
# + cuda-toolkit, cudnn, nccl
COPY --from=cuda-builder /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR} /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR}
COPY install_efa.sh install_efa.sh
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
RUN chmod +x /usr/local/bin/deep_learning_container.py && \
    chmod +x /usr/local/bin/bash_telemetry.sh && \
    echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc && \
    # Install EFA
    bash install_efa.sh ${EFA_VERSION} && \
    rm install_efa.sh && \
    # OSS compliance
    apt-get update && \
    apt-get upgrade -y && \
    # curl and ca-certificates are listed explicitly because the download
    # below needs them; install_efa.sh may already have installed them, in
    # which case this is a no-op.
    apt-get install -y --allow-change-held-packages --no-install-recommends \
        ca-certificates \
        curl \
        unzip \
        wget && \
    apt-get clean && \
    HOME_DIR=/root && \
    # -f: fail on an HTTP error instead of saving the error page as the zip.
    curl -f -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \
    unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \
    cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \
    chmod +x /usr/local/bin/testOSSCompliance && \
    chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
    ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} && \
    rm -rf ${HOME_DIR}/oss_compliance* && \
    rm -rf /tmp/tmp* && \
    rm -rf /var/lib/apt/lists/* && \
    # Cache removal is best-effort. The braces keep `|| true` scoped to this
    # one command: `&&` and `||` have equal precedence, so an unbraced
    # `... && rm ... || true` would also hide failures of the steps above.
    { rm -rf /root/.cache || true; }

CMD ["/bin/bash"]
8 changes: 5 additions & 3 deletions base/x86_64/gpu/cu128/ubuntu24.04/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ ARG PYTHON_SHORT_VERSION="3.12"
ARG CUDA_MAJOR="12"
ARG CUDA_MINOR="8"
ARG EFA_VERSION="1.42.0"
FROM nvidia/cuda:12.8.1-base-ubuntu24.04 AS base-builder
ARG OS_VERSION="ubuntu24.04"
FROM nvidia/cuda:12.8.1-base-${OS_VERSION} AS base-builder


RUN mv /usr/local/cuda/compat /usr/local \
Expand Down Expand Up @@ -61,11 +62,12 @@ RUN bash install_python.sh ${PYTHON_VERSION} && rm install_python.sh
FROM base-builder AS cuda-builder
ARG CUDA_MAJOR
ARG CUDA_MINOR
ARG OS_VERSION
COPY install_cuda.sh install_cuda.sh
RUN bash install_cuda.sh "${CUDA_MAJOR}.${CUDA_MINOR}" && rm install_cuda.sh
RUN bash install_cuda.sh "${CUDA_MAJOR}.${CUDA_MINOR}" "${OS_VERSION}" && rm install_cuda.sh

##############################################################################
FROM nvidia/cuda:12.8.1-base-ubuntu24.04 AS final
FROM nvidia/cuda:12.8.1-base-${OS_VERSION} AS final
ARG PYTHON
ARG PYTHON_SHORT_VERSION
ARG CUDA_MAJOR
Expand Down
8 changes: 5 additions & 3 deletions base/x86_64/gpu/cu129/ubuntu22.04/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ ARG PYTHON_SHORT_VERSION="3.12"
ARG CUDA_MAJOR="12"
ARG CUDA_MINOR="9"
ARG EFA_VERSION="1.43.1"
FROM nvidia/cuda:12.9.1-base-ubuntu22.04 AS base-builder
ARG OS_VERSION="ubuntu22.04"
FROM nvidia/cuda:12.9.1-base-${OS_VERSION} AS base-builder


RUN mv /usr/local/cuda/compat /usr/local \
Expand Down Expand Up @@ -61,11 +62,12 @@ RUN bash install_python.sh ${PYTHON_VERSION} && rm install_python.sh
FROM base-builder AS cuda-builder
ARG CUDA_MAJOR
ARG CUDA_MINOR
ARG OS_VERSION
COPY install_cuda.sh install_cuda.sh
RUN bash install_cuda.sh "${CUDA_MAJOR}.${CUDA_MINOR}" && rm install_cuda.sh
RUN bash install_cuda.sh "${CUDA_MAJOR}.${CUDA_MINOR}" "${OS_VERSION}" && rm install_cuda.sh

##############################################################################
FROM nvidia/cuda:12.9.1-base-ubuntu22.04 AS final
FROM nvidia/cuda:12.9.1-base-${OS_VERSION} AS final
ARG PYTHON
ARG PYTHON_SHORT_VERSION
ARG CUDA_MAJOR
Expand Down
66 changes: 57 additions & 9 deletions scripts/install_cuda.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ function install_nvjpeg_for_cuda_below_129 {
}


function install_cuda128_stack {
function install_cuda128_stack_ul24 {
CUDNN_VERSION="9.8.0.87"
NCCL_VERSION="v2.26.2-1"
CUDA_HOME="/usr/local/cuda"
Expand Down Expand Up @@ -87,7 +87,47 @@ function install_cuda128_stack {
ldconfig
}

function install_cuda129_stack {
function install_cuda128_stack_ul22 {
    # Installs the CUDA 12.8.0 toolkit, cuDNN and NCCL under /usr/local/cuda
    # for the Ubuntu 22.04 image, then runs the shared nvjpeg/prune helpers.
    # Takes no arguments. Leaves the working directory inside the NCCL clone.
    CUDNN_VERSION="9.7.1.26"
    NCCL_VERSION="v2.26.2-1"
    CUDA_HOME="/usr/local/cuda"

    # File names used more than once below.
    local cuda_runfile="cuda_12.8.0_570.86.10_linux.run"
    local cudnn_archive="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive"

    # Remove the CUDA directories that come with the nvidia/cuda:*-base-* image
    # (cuda-compat is expected to have been moved to /usr/local/compat already).
    rm -rf /usr/local/cuda-* /usr/local/cuda

    # CUDA toolkit: download the runfile, install the toolkit only, clean up.
    wget -q "https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/${cuda_runfile}"
    chmod +x "${cuda_runfile}"
    "./${cuda_runfile}" --toolkit --silent
    rm -f "${cuda_runfile}"
    ln -s /usr/local/cuda-12.8 /usr/local/cuda
    # Put cuda-compat back under the freshly installed toolkit.
    mv /usr/local/compat /usr/local/cuda/compat

    # cuDNN: unpack the redistributable archive and copy its headers and
    # libraries into the toolkit tree.
    mkdir -p /tmp/cudnn
    cd /tmp/cudnn
    wget -q "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${cudnn_archive}.tar.xz" -O "${cudnn_archive}.tar.xz"
    tar xf "${cudnn_archive}.tar.xz"
    cp -a "${cudnn_archive}"/include/* /usr/local/cuda/include/
    cp -a "${cudnn_archive}"/lib/* /usr/local/cuda/lib64/

    # NCCL: build the tagged release from source and copy the build output
    # into the toolkit tree.
    mkdir -p /tmp/nccl
    cd /tmp/nccl
    git clone --depth 1 -b "${NCCL_VERSION}" https://github.com/NVIDIA/nccl.git
    cd nccl
    make -j src.build
    cp -a build/include/* /usr/local/cuda/include/
    cp -a build/lib/* /usr/local/cuda/lib64/

    # Helpers defined elsewhere in this script.
    install_nvjpeg_for_cuda_below_129
    prune_cuda
    ldconfig
}

function install_cuda129_stack_ul22 {
CUDNN_VERSION="9.10.2.21"
NCCL_VERSION="v2.27.3-1"
CUDA_HOME="/usr/local/cuda"
Expand Down Expand Up @@ -130,12 +170,20 @@ function install_cuda129_stack {
while test $# -gt 0
do
case "$1" in
12.8) install_cuda128_stack;
;;
12.9) install_cuda129_stack;
;;
*) echo "bad argument $1"; exit 1
;;
12.8)
case "$2" in
"ubuntu22.04") install_cuda128_stack_ul22 ;;
"ubuntu24.04") install_cuda128_stack_ul24 ;;
*) echo "bad OS version $2"; exit 1 ;;
esac
;;
12.9)
case "$2" in
"ubuntu22.04") install_cuda129_stack_ul22 ;;
*) echo "bad OS version $2"; exit 1 ;;
esac
;;
*) echo "bad CUDA version $1"; exit 1 ;;
esac
shift
shift 2 # Skip both arguments at once
done