
Commit 413f52c

Author: Yadan Wei

add tf 2.18 x86 file
1 parent 5f15f85 commit 413f52c

File tree

4 files changed, +366 -4 lines changed

dlc_developer_config.toml

Lines changed: 3 additions & 3 deletions
@@ -37,11 +37,11 @@ deep_canary_mode = false
[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"]
-build_frameworks = []
+build_frameworks = ["tensorflow"]

# By default we build both training and inference containers. Set true/false values to determine which to build.
build_training = true
-build_inference = true
+build_inference = false

# Set do_build to "false" to skip builds and test the latest image built by this PR
# Note: at least one build is required to set do_build to "false"
@@ -106,7 +106,7 @@ use_scheduler = false
# Standard Framework Training
dlc-pr-mxnet-training = ""
dlc-pr-pytorch-training = ""
-dlc-pr-tensorflow-2-training = ""
+dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-ec2.yml"
dlc-pr-autogluon-training = ""

# HuggingFace Training
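
For reference, a minimal sketch of how this configuration could be sanity-checked locally before opening the PR. It is not part of the commit and assumes Python 3.11+ (for the stdlib tomllib) with dlc_developer_config.toml at the repository root; the hunk above does not show which TOML table holds the dlc-pr-* keys, so the sketch searches all tables rather than assuming a name.

# Illustrative sanity check for the config change above (not part of this commit).
import tomllib

with open("dlc_developer_config.toml", "rb") as f:
    cfg = tomllib.load(f)

# The [build] table drives which frameworks and job types get built.
assert cfg["build"]["build_frameworks"] == ["tensorflow"]
assert cfg["build"]["build_training"] is True
assert cfg["build"]["build_inference"] is False

# The dlc-pr-* keys point PR builds at a specific buildspec; the owning table
# name is not shown in this hunk, so search every table for the key.
overrides = {
    k: v
    for table in cfg.values()
    if isinstance(table, dict)
    for k, v in table.items()
    if k.startswith("dlc-pr-")
}
assert overrides["dlc-pr-tensorflow-2-training"] == "tensorflow/training/buildspec-2-18-ec2.yml"
print("dlc_developer_config.toml selects the TF 2.18 EC2 training buildspec")
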
tensorflow/training/buildspec-2-18-ec2.yml

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK tensorflow
version: &VERSION 2.18.0
short_version: &SHORT_VERSION "2.18"
arch_type: x86
# autopatch_build: "True"

repository_info:
  training_repository: &TRAINING_REPOSITORY
    image_type: &TRAINING_IMAGE_TYPE training
    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
    repository_name: &REPOSITORY_NAME !join [pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/,
                                                    *RELEASE_REPOSITORY_NAME ]

context:
  training_context: &TRAINING_CONTEXT
    dockerd-entrypoint:
      source: docker/build_artifacts/dockerd-entrypoint.py
      target: dockerd-entrypoint.py
    deep_learning_container:
      source: ../../src/deep_learning_container.py
      target: deep_learning_container.py

images:
  BuildTensorflowEC2CpuPy310TrainingDockerImage:
    <<: *TRAINING_REPOSITORY
    build: &TENSORFLOW_CPU_TRAINING_PY3 false
    image_size_baseline: &IMAGE_SIZE_BASELINE 4489
    device_type: &DEVICE_TYPE cpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py310
    os_version: &OS_VERSION ubuntu20.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION,
                                "-ec2" ]
    # build_tag_override: "beta:2.16.2-cpu-py310-ubuntu20.04-ec2"
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
    target: ec2
    enable_test_promotion: true
    context:
      <<: *TRAINING_CONTEXT
  BuildTensorflowEC2GpuCu123Py310TrainingDockerImage:
    <<: *TRAINING_REPOSITORY
    build: &TENSORFLOW_GPU_TRAINING_PY3 false
    image_size_baseline: &IMAGE_SIZE_BASELINE 9307
    device_type: &DEVICE_TYPE gpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py310
    cuda_version: &CUDA_VERSION cu125
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-",
                                *OS_VERSION, "-ec2" ]
    # build_tag_override: "beta:2.16.2-gpu-py310-cu123-ubuntu20.04-ec2"
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
                         *DEVICE_TYPE ]
    target: ec2
    enable_test_promotion: true
    context:
      <<: *TRAINING_CONTEXT
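
The buildspec relies on YAML anchors plus a custom !join tag. Below is a rough sketch of how that tag could be resolved with PyYAML, assuming !join simply concatenates the items of the sequence into one string after anchor substitution; the repository's own buildspec loader may register the constructor differently, so treat this as illustrative only.

# Illustrative resolver for the !join tag used in the buildspec above.
import yaml

def join_constructor(loader, node):
    # Concatenate the scalar items of the sequence (anchors like *FRAMEWORK
    # and *SHORT_VERSION have already been substituted by this point).
    seq = loader.construct_sequence(node)
    return "".join(str(item) for item in seq)

yaml.SafeLoader.add_constructor("!join", join_constructor)

with open("tensorflow/training/buildspec-2-18-ec2.yml") as f:
    spec = yaml.safe_load(f)

cpu_image = spec["images"]["BuildTensorflowEC2CpuPy310TrainingDockerImage"]
print(cpu_image["docker_file"])  # expected: docker/2.18/py3/Dockerfile.cpu
print(cpu_image["tag"])          # expected: 2.18.0-cpu-py310-ubuntu20.04-ec2
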

tensorflow/training/buildspec.yml

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-buildspec_pointer: buildspec-2-16-sm.yml
+buildspec_pointer: buildspec-2-18-ec2.yml
tensorflow/training/docker/2.18/py3/Dockerfile.cpu

Lines changed: 297 additions & 0 deletions
@@ -0,0 +1,297 @@
FROM ubuntu:22.04 AS base_image

ENV DEBIAN_FRONTEND=noninteractive \
    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"

RUN apt-get update \
 && apt-get upgrade -y \
 && apt-get autoremove -y \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

FROM base_image AS common

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

# TensorFlow major.minor version
ENV TF_VERSION=2.18

# prevent stopping by user interaction
ENV DEBIAN_FRONTEND noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN true

# Set environment variables for MKL
# For more about MKL with TensorFlow see:
# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn

ENV KMP_AFFINITY=granularity=fine,compact,1,0
ENV KMP_BLOCKTIME=1
ENV KMP_SETTINGS=0

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING=UTF-8
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8

ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.14

ARG PIP=pip3

ARG OMPI_VERSION=4.1.6

# To be passed to ec2 and sagemaker stages
ENV PYTHON=${PYTHON}
ENV PYTHON_VERSION=${PYTHON_VERSION}
ENV PIP=${PIP}

RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    openssh-client \
    openssh-server \
    ca-certificates \
    curl \
    emacs \
    git \
    libtemplate-perl \
    libssl1.1 \
    openssl \
    protobuf-compiler \
    unzip \
    wget \
    vim \
    zlib1g-dev \
    # Install dependent library for OpenCV
    libgtk2.0-dev \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

# Install Open MPI
RUN mkdir /tmp/openmpi \
 && cd /tmp/openmpi \
 && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
 && tar zxf openmpi-${OMPI_VERSION}.tar.gz \
 && cd openmpi-${OMPI_VERSION} \
 && ./configure --enable-orterun-prefix-by-default \
 && make -j $(nproc) all \
 && make install \
 && ldconfig \
 && rm -rf /tmp/openmpi

# Create a wrapper for OpenMPI to allow running as root by default
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
 && echo '#!/bin/bash' > /usr/local/bin/mpirun \
 && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
 && chmod a+x /usr/local/bin/mpirun

RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
 && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf

ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
ENV PATH /usr/local/openmpi/bin/:$PATH

# SSH login fix. Otherwise user is kicked off after login
RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd

# Create SSH key.
RUN mkdir -p /root/.ssh/ \
 && mkdir -p /var/run/sshd \
 && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
 && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
 && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config

WORKDIR /

RUN apt-get update \
 && apt-get install -y --no-install-recommends \
    libbz2-dev \
    libc6-dev \
    libcurl4-openssl-dev \
    libffi-dev \
    libgdbm-dev \
    liblzma-dev \
    libncursesw5-dev \
    libreadline-gplv2-dev \
    libsqlite3-dev \
    libssl-dev \
    tk-dev \
    ffmpeg \
    libsm6 \
    libxext6 \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
 && tar -xvf Python-$PYTHON_VERSION.tgz \
 && cd Python-$PYTHON_VERSION \
 && ./configure \
 && make -j $(nproc) \
 && make install \
 && rm -rf ../Python-$PYTHON_VERSION*

RUN ${PIP} --no-cache-dir install --upgrade \
    pip \
    setuptools

# Some TF tools expect a "python" binary
RUN ln -s $(which ${PYTHON}) /usr/local/bin/python \
 && ln -s $(which ${PIP}) /usr/bin/pip

RUN ${PIP} install --no-cache-dir -U \
    pybind11 \
    cmake \
    scipy \
    Pillow \
    python-dateutil \
    requests \
    "awscli<2" \
    urllib3 \
    mpi4py \
 # Let's install TensorFlow separately in the end to avoid
 # the library version being overwritten
 && ${PIP} install --no-cache-dir -U \
    h5py \
    absl-py \
    opencv-python \
    werkzeug \
    psutil \
    "protobuf<4"

ADD https://raw.githubusercontent.com/aws/deep-learning-containers/master/src/deep_learning_container.py /usr/local/bin/deep_learning_container.py

RUN chmod +x /usr/local/bin/deep_learning_container.py

RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-${TF_VERSION}/license.txt -o /license.txt

########################################################
#  _____ ____ ____   ___
# | ____/ ___|___ \ |_ _|_ __ ___   __ _  __ _  ___
# |  _|| |     __) | | || '_ ` _ \ / _` |/ _` |/ _ \
# | |__| |___ / __/  | || | | | | | (_| | (_| |  __/
# |_____\____|_____||___|_| |_| |_|\__,_|\__, |\___|
#                                         |___/
#  ____           _
# |  _ \ ___  ___(_)_ __   ___
# | |_) / _ \/ __| | '_ \ / _ \
# |  _ <  __/ (__| | |_) |  __/
# |_| \_\___|\___|_| .__/ \___|
#                  |_|
########################################################

FROM common AS ec2
ARG TF_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/tensorflow/r2.16_aws/cpu/2024-07-12-00-09/tensorflow_cpu-2.16.2-cp310-cp310-linux_x86_64.whl

RUN ${PIP} install --no-cache-dir -U \
    ${TF_URL} \
    "tensorflow-io==0.37.*" \
    tensorflow-datasets

RUN HOME_DIR=/root \
 && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance*

# remove tmp files
RUN rm -rf /tmp/*

CMD ["/bin/bash"]

#################################################################
#  ____                   __  __       _
# / ___|  __ _  __ _  ___|  \/  | __ _| | _____ _ __
# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__|
#  ___) | (_| | (_| |  __/ |  | | (_| |   <  __/ |
# |____/ \__,_|\__, |\___|_|  |_|\__,_|_|\_\___|_|
#              |___/
#  ___                                ____           _
# |_ _|_ __ ___   __ _  __ _  ___   |  _ \ ___  ___(_)_ __   ___
#  | || '_ ` _ \ / _` |/ _` |/ _ \  | |_) / _ \/ __| | '_ \ / _ \
#  | || | | | | | (_| | (_| |  __/  |  _ <  __/ (__| | |_) |  __/
# |___|_| |_| |_|\__,_|\__, |\___|  |_| \_\___|\___|_| .__/ \___|
#                      |___/                         |_|
#################################################################

FROM common AS sagemaker

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

ARG TF_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/tensorflow/r2.16_aws/cpu/2024-07-12-00-09/tensorflow_cpu-2.16.2-cp310-cp310-linux_x86_64.whl

# sagemaker-specific environment variable
ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main

# dependencies for opencv
# these dependencies are not needed for gpu image
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
    libgtk2.0-dev \
 && apt-get install -y -qq libkrb5-dev \
 && apt-get install -y -qq libsasl2-dev libsasl2-modules \
 && apt-get install -y -qq krb5-user \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

# https://github.com/yaml/pyyaml/issues/601
# PyYaml less than 6.0.1 fails to build with cython v3 and above.
# tf-models-official uses older versions, breaking the install.
# going to install the older pyyaml and cython to get tf-models-official
# the sagemaker package will revert pyyaml back to 6 for its requirement
# and this is fine since sagemaker is more important than the models and
# the models still work on pyyaml 6 in this context.
# Need to install wheel before we can fix the pyyaml issue below
RUN pip install wheel \
 && pip install "cython<3" "pyyaml<6" --no-build-isolation

# https://github.com/tensorflow/models/issues/9267
# tf-models does not respect existing installations of TF and always installs open source TF
RUN ${PIP} install --no-cache-dir -U \
    tf-models-official==2.16.0 \
    tensorflow-text==2.16.1 \
 && ${PIP} uninstall -y tensorflow tensorflow-gpu \
 && ${PIP} install --no-cache-dir -U \
    ${TF_URL} \
    "tensorflow-io==0.37.*" \
    tensorflow-datasets

RUN $PYTHON -m pip install --no-cache-dir -U \
    numba \
    bokeh \
    imageio \
    opencv-python \
    plotly \
    seaborn \
    shap

RUN $PYTHON -m pip install --no-cache-dir -U \
    "sagemaker<3" \
    sagemaker-experiments==0.* \
    sagemaker-tensorflow-training \
    sagemaker-training \
    "sagemaker-studio-analytics-extension<1" \
    "sparkmagic<1" \
    "sagemaker-studio-sparkmagic-lib<1" \
    smclarify

# Remove python kernel installed by sparkmagic
RUN /usr/local/bin/jupyter-kernelspec remove -f python3

# remove tmp files
RUN rm -rf /tmp/*

RUN HOME_DIR=/root \
 && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance*

CMD ["/bin/bash"]

0 commit comments
