Skip to content

Commit 56c7466

Browse files
authored
Base DLC (#4822)
base dlc code changes
1 parent 11f52f9 commit 56c7466

File tree

38 files changed

+819
-183
lines changed

38 files changed

+819
-183
lines changed

base/x86_64/gpu/buildspec.yml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Top-level build parameters. Each `&NAME` anchor defined here is reused through `*NAME` aliases further down.
# Placeholder value: per its own text, it is meant to be supplied from $ACCOUNT_ID in the environment.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
2+
# Account id used to build the release repository URI (release_repository below).
prod_account_id: &PROD_ACCOUNT_ID 763104351884
3+
# Placeholder value: per its own text, it is meant to be supplied from $REGION in the environment.
region: &REGION <set-$REGION-in-environment>
4+
# Used in the repository names and as the first path component of docker_file.
framework: &FRAMEWORK base
5+
# Same version string as the nvidia/cuda:12.8.1 base image tag in the Dockerfile — presumably the CUDA version; confirm.
version: &VERSION 12.8.1
6+
# Quoted so YAML keeps it a string instead of parsing it as the float 12.8.
short_version: &SHORT_VERSION "12.8"
7+
# Used in the image tag and in the docker_file path.
arch_type: &ARCH_TYPE x86_64
8+
# Note: a quoted string, not a YAML boolean.
autopatch_build: "False"
9+
10+
# Repository settings. The mapping is anchored as BASE_REPOSITORY and merged into the image entry with `<<:`.
# `!join` is a custom tag handled by the build tooling; presumably it concatenates the list items — TODO confirm.
repository_info:
11+
base_repository: &BASE_REPOSITORY
12+
image_type: &IMAGE_TYPE gpu
13+
root: .
14+
# If !join concatenates, this is "pr-base".
repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK ]
15+
# If !join concatenates, this is <account_id>.dkr.ecr.<region>.amazonaws.com/pr-base.
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
16+
# Single-element join: the release repository name is just the framework name.
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK ]
17+
# Same URI shape as `repository`, but built from prod_account_id and the release repository name.
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]
18+
19+
# Files for the Docker build. Each entry maps a repository `source` path to a `target` file name.
# The targets match the COPY sources in base/x86_64/gpu/cu128/Dockerfile; presumably the build tooling
# copies them into the build context — TODO confirm.
# NOTE(review): that Dockerfile also COPYs sitecustomize.py, which is not listed here — confirm the tooling supplies it.
context:
20+
base_context: &BASE_CONTEXT
21+
deep_learning_container:
22+
source: src/deep_learning_container.py
23+
target: deep_learning_container.py
24+
# Used by the python-builder stage.
install_python:
25+
source: scripts/install_python.sh
26+
target: install_python.sh
27+
# Used by the cuda-builder stage.
install_cuda:
28+
source: scripts/install_cuda.sh
29+
target: install_cuda.sh
30+
# Used by the final stage.
install_efa:
31+
source: scripts/install_efa.sh
32+
target: install_efa.sh
33+
34+
# Image definitions. This file defines a single image: base, x86_64, GPU, CUDA 12.8.
images:
35+
base_x86_64_gpu_cuda128:
36+
# Merge in the repository settings anchored under repository_info.
<<: *BASE_REPOSITORY
37+
context:
38+
# Merge in the file list anchored under context.base_context.
<<: *BASE_CONTEXT
39+
# Units are not stated here — presumably megabytes; verify against the size check that reads it.
image_size_baseline: 11000
40+
device_type: &DEVICE_TYPE gpu
41+
cuda_version: &CUDA_VERSION cu128
42+
python_version: &DOCKER_PYTHON_VERSION py3
43+
# More specific than python_version; this is the value used in the tag.
tag_python_version: &TAG_PYTHON_VERSION py312
44+
os_version: &OS_VERSION ubuntu24.04
45+
# If !join concatenates, this is "gpu-cu128-py312-ubuntu24.04-x86_64-ec2".
tag: !join [ *DEVICE_TYPE, "-", *CUDA_VERSION, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-", *ARCH_TYPE, "-ec2" ]
46+
# Built from the same parts as `tag`, so the two values are identical.
latest_release_tag: !join [ *DEVICE_TYPE, "-", *CUDA_VERSION, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-", *ARCH_TYPE, "-ec2" ]
47+
# If !join concatenates, this is base/x86_64/gpu/cu128/Dockerfile.
docker_file: !join [ *FRAMEWORK, /, *ARCH_TYPE, /, *DEVICE_TYPE, /, *CUDA_VERSION, /Dockerfile ]
48+
# Name of the Dockerfile stage to build (the Dockerfile declares `AS final`).
target: final
49+
build: true
50+
enable_common_stage_build: false
51+
test_configs:
52+
# Test suites to run for this image.
test_platforms:
53+
- sanity
54+
- security

base/x86_64/gpu/cu128/Dockerfile

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# Global build arguments. They are declared before the first FROM, so every
# stage that needs one re-declares it with a bare `ARG NAME`.
ARG PYTHON="python3"
ARG PYTHON_VERSION="3.12.10"
ARG PYTHON_SHORT_VERSION="3.12"
ARG CUDA_MAJOR="12"
ARG CUDA_MINOR="8"
ARG EFA_VERSION="1.40.0"

# Stage base-builder: common toolchain image that python-builder and
# cuda-builder both start from.
FROM nvidia/cuda:12.8.1-base-ubuntu24.04 AS base-builder

# One layer: refresh the package index, upgrade systemd only, install the
# build dependencies (listed alphabetically), then drop apt caches and lists.
RUN apt-get update \
 && apt-get -y upgrade --only-upgrade systemd \
 && apt-get install -y --allow-change-held-packages --no-install-recommends \
        autoconf \
        automake \
        build-essential \
        ca-certificates \
        check \
        cmake \
        curl \
        emacs \
        git \
        jq \
        libcurl4-openssl-dev \
        libegl1 \
        libffi-dev \
        libgl1 \
        libglib2.0-0 \
        libgomp1 \
        libhwloc-dev \
        libibverbs-dev \
        libnuma-dev \
        libnuma1 \
        libsm6 \
        libssl-dev \
        libsubunit-dev \
        libsubunit0 \
        libtool \
        libxext6 \
        libxrender-dev \
        openssl \
        pkg-config \
        python3-dev \
        unzip \
        vim \
        wget \
        zlib1g-dev \
 && apt-get autoremove -y \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*
49+
50+
##############################################################################
51+
# Stage python-builder: runs install_python.sh for the requested version.
# The final stage copies the Python files from /usr/local of this stage.
FROM base-builder AS python-builder

# Bring the global default into this stage's scope.
ARG PYTHON_VERSION

COPY install_python.sh install_python.sh

# Run the installer, then remove the script in the same layer.
RUN bash install_python.sh ${PYTHON_VERSION} \
 && rm install_python.sh
55+
56+
##############################################################################
57+
# Stage cuda-builder: runs install_cuda.sh for "<major>.<minor>".
# The final stage copies /usr/local/cuda-<major>.<minor> from this stage.
FROM base-builder AS cuda-builder

# Bring the global defaults into this stage's scope.
ARG CUDA_MAJOR
ARG CUDA_MINOR

COPY install_cuda.sh install_cuda.sh

# The script selects its install routine from the version argument; the
# script itself is removed in the same layer.
RUN bash install_cuda.sh "${CUDA_MAJOR}.${CUDA_MINOR}" \
 && rm install_cuda.sh
62+
63+
##############################################################################
64+
# Stage final: the shipped image. It starts again from the CUDA base image and
# takes only the artifacts produced by the builder stages, so the build
# toolchain from base-builder is not carried over.
FROM nvidia/cuda:12.8.1-base-ubuntu24.04 AS final

# Bring the global defaults into this stage's scope.
ARG PYTHON
ARG PYTHON_SHORT_VERSION
ARG CUDA_MAJOR
ARG CUDA_MINOR
ARG EFA_VERSION

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

# PYTHONDONTWRITEBYTECODE: Python won't try to write .pyc or .pyo files on the
#   import of source modules.
# PYTHONUNBUFFERED: force stdin, stdout and stderr to be totally unbuffered.
#   Good for logging.
# PATH / LD_LIBRARY_PATH: put the EFA, Open MPI, OFI-NCCL and CUDA locations
#   ahead of whatever the base image already defines.
ENV DEBIAN_FRONTEND=noninteractive \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    DLC_CONTAINER_TYPE=base \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    CUDA_HOME="/usr/local/cuda" \
    PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" \
    LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:/opt/amazon/ofi-nccl/lib:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:${LD_LIBRARY_PATH}"

WORKDIR /

# + python and pip packages (awscli, boto3, requests)
COPY --from=python-builder /usr/local/lib/python${PYTHON_SHORT_VERSION} /usr/local/lib/python${PYTHON_SHORT_VERSION}
COPY --from=python-builder /usr/local/include/python${PYTHON_SHORT_VERSION} /usr/local/include/python${PYTHON_SHORT_VERSION}
COPY --from=python-builder /usr/local/bin /usr/local/bin
# + cuda-toolkit, cudnn, nccl
COPY --from=cuda-builder /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR} /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR}
COPY install_efa.sh install_efa.sh
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
COPY sitecustomize.py /usr/local/lib/python${PYTHON_SHORT_VERSION}/sitecustomize.py

# Single layer: EFA install, OSS compliance report, then cleanup.
# NOTE(review): curl is used below but is not installed in this stage;
# presumably install_efa.sh (or the base image) provides it — confirm.
RUN chmod +x /usr/local/bin/deep_learning_container.py && \
    # Install EFA
    bash install_efa.sh ${EFA_VERSION} && \
    rm install_efa.sh && \
    # OSS compliance
    apt-get update && \
    apt-get install -y --allow-change-held-packages --no-install-recommends unzip && \
    HOME_DIR=/root && \
    curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \
    unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \
    cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \
    chmod +x /usr/local/bin/testOSSCompliance && \
    chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
    ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} && \
    rm -rf ${HOME_DIR}/oss_compliance* && \
    rm -rf /tmp/tmp* && \
    # `rm -rf` already succeeds when the path is missing, so no `| true` or
    # `|| true` guard is needed. A trailing `|| true` would also hide a
    # failure of any earlier command in this && chain.
    rm -rf /root/.cache && \
    # Drop the apt cache and package lists in the same layer that created
    # them, as the base-builder stage does.
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

CMD ["/bin/bash"]

dlc_developer_config.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ deep_canary_mode = false
3636

3737
[build]
3838
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
39-
# available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
39+
# available frameworks - ["base", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
4040
build_frameworks = []
4141

4242

scripts/install_cuda.sh

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
#!/bin/bash
2+
3+
set -ex
4+
5+
# https://raw.githubusercontent.com/pytorch/pytorch/842d51500be144d53f4d046d31169e8f46c063f6/.ci/docker/common/install_cuda.sh
6+
7+
function prune_cuda {
    # Delete CUDA components that the image does not need, to reduce its size:
    #   - documentation and manual pages
    #   - sample code, demos and example projects
    #   - IDE integration (Nsight Eclipse Edition)
    #   - debugging tools (compute-sanitizer, debugger)
    #   - profiling tools (Nsight Compute, Nsight Systems)
    #   - legacy tools (Visual Profiler)
    # Runtime libraries, headers and development tools are left in place.
    #
    # Glob patterns are expanded when the array is built; a pattern with no
    # match stays literal and is ignored by `rm -rf`.
    local -a prune_targets=(
        /usr/local/cuda/compute-sanitizer/docs
        /usr/local/cuda/nsight-compute-****.*.*/docs
        /usr/local/cuda/nsight-systems-****.*.*/documentation
        /usr/local/cuda/extras/demo_suite
        /usr/local/cuda/extras/CUPTI/samples
        /usr/local/cuda/nsight-compute-****.*.*/extras/samples
        /usr/local/cuda/libnvvp
        /usr/local/cuda/nsightee_plugins
        /usr/local/cuda/compute-sanitizer
        /usr/local/cuda/extras/Debugger
        /usr/local/cuda/nsight-compute-****.*.*
        /usr/local/cuda/nsight-systems-****.*.*
        /usr/local/cuda/doc
        /usr/local/cuda/samples
        /usr/local/cuda/share/doc
    )

    rm -rf "${prune_targets[@]}"
}
32+
33+
# Shared implementation behind the install_cuda<NNN>_stack entry points.
#
# Replaces the CUDA directory that ships in the nvidia/cuda:*-base-* image with
# a toolkit installed from the NVIDIA runfile, then adds cuDNN (from the redist
# archive) and NCCL (built from source) into /usr/local/cuda.
#
# Arguments:
#   $1  runfile release directory on developer.download.nvidia.com (e.g. 12.8.1)
#   $2  runfile name (e.g. cuda_12.8.1_570.124.06_linux.run)
#   $3  toolkit <major>.<minor>, i.e. the suffix of /usr/local/cuda-<major>.<minor>
#   $4  CUDA major version used in the cuDNN archive name (11 or 12)
# Reads:
#   CUDNN_VERSION, NCCL_VERSION (set by the calling entry point)
#
# All downloads and builds happen under /tmp. Previously the 12.6 and 12.8
# variants downloaded the runfile into the caller's working directory.
function install_cuda_stack {
    local cuda_release="$1"
    local cuda_runfile="$2"
    local cuda_short="$3"
    local cudnn_cuda_major="$4"
    local cudnn_archive="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${cudnn_cuda_major}-archive"

    # move cuda-compat aside and remove the existing cuda dir from nvidia/cuda:**.*.*-base-*
    mv /usr/local/cuda/compat /usr/local
    rm -rf /usr/local/cuda-*
    rm -rf /usr/local/cuda

    # install cuda
    cd /tmp
    wget -q "https://developer.download.nvidia.com/compute/cuda/${cuda_release}/local_installers/${cuda_runfile}"
    chmod +x "${cuda_runfile}"
    "./${cuda_runfile}" --toolkit --silent
    rm -f "${cuda_runfile}"
    ln -s "/usr/local/cuda-${cuda_short}" /usr/local/cuda
    # bring back cuda-compat
    mv /usr/local/compat /usr/local/cuda/compat

    # install cudnn
    mkdir -p /tmp/cudnn
    cd /tmp/cudnn
    wget -q "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${cudnn_archive}.tar.xz" -O "${cudnn_archive}.tar.xz"
    tar xf "${cudnn_archive}.tar.xz"
    cp -a "${cudnn_archive}"/include/* /usr/local/cuda/include/
    cp -a "${cudnn_archive}"/lib/* /usr/local/cuda/lib64/

    # install nccl
    mkdir -p /tmp/nccl
    cd /tmp/nccl
    git clone -b "${NCCL_VERSION}" --depth 1 https://github.com/NVIDIA/nccl.git
    cd nccl
    make -j src.build
    cp -a build/include/* /usr/local/cuda/include/
    cp -a build/lib/* /usr/local/cuda/lib64/

    # Remove the scratch trees. Without this, a second stack requested in the
    # same run fails at `git clone` because /tmp/nccl/nccl already exists.
    cd /tmp
    rm -rf /tmp/cudnn /tmp/nccl

    prune_cuda
    ldconfig
}

# CUDA 11.8.0 + cuDNN 9.1.0.70 + NCCL v2.21.5-1
function install_cuda118_stack {
    CUDNN_VERSION=9.1.0.70
    NCCL_VERSION=v2.21.5-1
    CUDA_HOME="/usr/local/cuda"

    install_cuda_stack 11.8.0 cuda_11.8.0_520.61.05_linux.run 11.8 11
}


# CUDA 12.4.1 + cuDNN 9.1.0.70 + NCCL v2.23.4-1
function install_cuda124_stack {
    CUDNN_VERSION="9.1.0.70"
    NCCL_VERSION="v2.23.4-1"
    CUDA_HOME="/usr/local/cuda"

    install_cuda_stack 12.4.1 cuda_12.4.1_550.54.15_linux.run 12.4 12
}


# CUDA 12.6.3 + cuDNN 9.7.0.66 + NCCL v2.23.4-1
function install_cuda126_stack {
    CUDNN_VERSION="9.7.0.66"
    NCCL_VERSION="v2.23.4-1"
    CUDA_HOME="/usr/local/cuda"

    install_cuda_stack 12.6.3 cuda_12.6.3_560.35.05_linux.run 12.6 12
}


# CUDA 12.8.1 + cuDNN 9.8.0.87 + NCCL v2.26.2-1
function install_cuda128_stack {
    CUDNN_VERSION="9.8.0.87"
    NCCL_VERSION="v2.26.2-1"
    CUDA_HOME="/usr/local/cuda"

    install_cuda_stack 12.8.1 cuda_12.8.1_570.124.06_linux.run 12.8 12
}
197+
198+
# Walk the command-line arguments in order. Each one names a CUDA
# <major>.<minor> version and triggers the matching install routine; any
# other value prints an error and aborts with exit status 1.
for requested_version in "$@"; do
    case "${requested_version}" in
        11.8)
            install_cuda118_stack
            ;;
        12.4)
            install_cuda124_stack
            ;;
        12.6)
            install_cuda126_stack
            ;;
        12.8)
            install_cuda128_stack
            ;;
        *)
            echo "bad argument ${requested_version}"
            exit 1
            ;;
    esac
done

0 commit comments

Comments
 (0)