
Commit 413f52c

Author: Yadan Wei

add tf 2.18 x86 file
1 parent 5f15f85 commit 413f52c

File tree

4 files changed, +366 -4 lines changed

dlc_developer_config.toml

Lines changed: 3 additions & 3 deletions
@@ -37,11 +37,11 @@ deep_canary_mode = false
[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"]
-build_frameworks = []
+build_frameworks = ["tensorflow"]

# By default we build both training and inference containers. Set true/false values to determine which to build.
build_training = true
-build_inference = true
+build_inference = false

# Set do_build to "false" to skip builds and test the latest image built by this PR
# Note: at least one build is required to set do_build to "false"
@@ -106,7 +106,7 @@ use_scheduler = false
# Standard Framework Training
dlc-pr-mxnet-training = ""
dlc-pr-pytorch-training = ""
-dlc-pr-tensorflow-2-training = ""
+dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-ec2.yml"
dlc-pr-autogluon-training = ""

# HuggingFace Training
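
For reference, a minimal sketch of how this configuration could be sanity-checked locally before opening the PR. It is not part of the commit and assumes Python 3.11+ (for the stdlib tomllib) with dlc_developer_config.toml at the repository root; the hunk above does not show which TOML table holds the dlc-pr-* keys, so the sketch searches all tables rather than assuming a name.

# Illustrative sanity check for the config change above (not part of this commit).
import tomllib

with open("dlc_developer_config.toml", "rb") as f:
    cfg = tomllib.load(f)

# The [build] table drives which frameworks and job types get built.
assert cfg["build"]["build_frameworks"] == ["tensorflow"]
assert cfg["build"]["build_training"] is True
assert cfg["build"]["build_inference"] is False

# The dlc-pr-* keys point PR builds at a specific buildspec; the owning table
# name is not shown in this hunk, so search every table for the key.
overrides = {
    k: v
    for table in cfg.values()
    if isinstance(table, dict)
    for k, v in table.items()
    if k.startswith("dlc-pr-")
}
assert overrides["dlc-pr-tensorflow-2-training"] == "tensorflow/training/buildspec-2-18-ec2.yml"
print("dlc_developer_config.toml selects the TF 2.18 EC2 training buildspec")
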
tensorflow/training/buildspec-2-18-ec2.yml

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK tensorflow
version: &VERSION 2.18.0
short_version: &SHORT_VERSION "2.18"
arch_type: x86
# autopatch_build: "True"

repository_info:
  training_repository: &TRAINING_REPOSITORY
    image_type: &TRAINING_IMAGE_TYPE training
    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
    repository_name: &REPOSITORY_NAME !join [pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/,
                                                    *RELEASE_REPOSITORY_NAME ]

context:
  training_context: &TRAINING_CONTEXT
    dockerd-entrypoint:
      source: docker/build_artifacts/dockerd-entrypoint.py
      target: dockerd-entrypoint.py
    deep_learning_container:
      source: ../../src/deep_learning_container.py
      target: deep_learning_container.py

images:
  BuildTensorflowEC2CpuPy310TrainingDockerImage:
    <<: *TRAINING_REPOSITORY
    build: &TENSORFLOW_CPU_TRAINING_PY3 false
    image_size_baseline: &IMAGE_SIZE_BASELINE 4489
    device_type: &DEVICE_TYPE cpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py310
    os_version: &OS_VERSION ubuntu20.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION,
                                "-ec2" ]
    # build_tag_override: "beta:2.16.2-cpu-py310-ubuntu20.04-ec2"
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
    target: ec2
    enable_test_promotion: true
    context:
      <<: *TRAINING_CONTEXT
  BuildTensorflowEC2GpuCu123Py310TrainingDockerImage:
    <<: *TRAINING_REPOSITORY
    build: &TENSORFLOW_GPU_TRAINING_PY3 false
    image_size_baseline: &IMAGE_SIZE_BASELINE 9307
    device_type: &DEVICE_TYPE gpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py310
    cuda_version: &CUDA_VERSION cu125
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-",
                                *OS_VERSION, "-ec2" ]
    # build_tag_override: "beta:2.16.2-gpu-py310-cu123-ubuntu20.04-ec2"
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
                         *DEVICE_TYPE ]
    target: ec2
    enable_test_promotion: true
    context:
      <<: *TRAINING_CONTEXT
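
The buildspec relies on YAML anchors plus a custom !join tag. Below is a rough sketch of how that tag could be resolved with PyYAML, assuming !join simply concatenates the items of the sequence into one string after anchor substitution; the repository's own buildspec loader may register the constructor differently, so treat this as illustrative only.

# Illustrative resolver for the !join tag used in the buildspec above.
import yaml

def join_constructor(loader, node):
    # Concatenate the scalar items of the sequence (anchors like *FRAMEWORK
    # and *SHORT_VERSION have already been substituted by this point).
    seq = loader.construct_sequence(node)
    return "".join(str(item) for item in seq)

yaml.SafeLoader.add_constructor("!join", join_constructor)

with open("tensorflow/training/buildspec-2-18-ec2.yml") as f:
    spec = yaml.safe_load(f)

cpu_image = spec["images"]["BuildTensorflowEC2CpuPy310TrainingDockerImage"]
print(cpu_image["docker_file"])  # expected: docker/2.18/py3/Dockerfile.cpu
print(cpu_image["tag"])          # expected: 2.18.0-cpu-py310-ubuntu20.04-ec2
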

tensorflow/training/buildspec.yml

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-buildspec_pointer: buildspec-2-16-sm.yml
+buildspec_pointer: buildspec-2-18-ec2.yml
tensorflow/training/docker/2.18/py3/Dockerfile.cpu

Lines changed: 297 additions & 0 deletions
@@ -0,0 +1,297 @@
FROM ubuntu:22.04 AS base_image

ENV DEBIAN_FRONTEND=noninteractive \
    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"

RUN apt-get update \
 && apt-get upgrade -y \
 && apt-get autoremove -y \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

FROM base_image AS common

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

# TensorFlow major.minor version
ENV TF_VERSION=2.18

# prevent stopping by user interaction
ENV DEBIAN_FRONTEND noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN true

# Set environment variables for MKL
# For more about MKL with TensorFlow see:
# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn

ENV KMP_AFFINITY=granularity=fine,compact,1,0
ENV KMP_BLOCKTIME=1
ENV KMP_SETTINGS=0

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING=UTF-8
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8

ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.14

ARG PIP=pip3

ARG OMPI_VERSION=4.1.6

# To be passed to ec2 and sagemaker stages
ENV PYTHON=${PYTHON}
ENV PYTHON_VERSION=${PYTHON_VERSION}
ENV PIP=${PIP}

RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    openssh-client \
    openssh-server \
    ca-certificates \
    curl \
    emacs \
    git \
    libtemplate-perl \
    libssl1.1 \
    openssl \
    protobuf-compiler \
    unzip \
    wget \
    vim \
    zlib1g-dev \
    # Install dependent library for OpenCV
    libgtk2.0-dev \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

# Install Open MPI
RUN mkdir /tmp/openmpi \
 && cd /tmp/openmpi \
 && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
 && tar zxf openmpi-${OMPI_VERSION}.tar.gz \
 && cd openmpi-${OMPI_VERSION} \
 && ./configure --enable-orterun-prefix-by-default \
 && make -j $(nproc) all \
 && make install \
 && ldconfig \
 && rm -rf /tmp/openmpi

# Create a wrapper for OpenMPI to allow running as root by default
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
 && echo '#!/bin/bash' > /usr/local/bin/mpirun \
 && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
 && chmod a+x /usr/local/bin/mpirun

RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
 && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf

ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
ENV PATH /usr/local/openmpi/bin/:$PATH

# SSH login fix. Otherwise user is kicked off after login
RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd

# Create SSH key.
RUN mkdir -p /root/.ssh/ \
 && mkdir -p /var/run/sshd \
 && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
 && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
 && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config

WORKDIR /

RUN apt-get update \
 && apt-get install -y --no-install-recommends \
    libbz2-dev \
    libc6-dev \
    libcurl4-openssl-dev \
    libffi-dev \
    libgdbm-dev \
    liblzma-dev \
    libncursesw5-dev \
    libreadline-gplv2-dev \
    libsqlite3-dev \
    libssl-dev \
    tk-dev \
    ffmpeg \
    libsm6 \
    libxext6 \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
 && tar -xvf Python-$PYTHON_VERSION.tgz \
 && cd Python-$PYTHON_VERSION \
 && ./configure \
 && make -j $(nproc) \
 && make install \
 && rm -rf ../Python-$PYTHON_VERSION*

RUN ${PIP} --no-cache-dir install --upgrade \
    pip \
    setuptools

# Some TF tools expect a "python" binary
RUN ln -s $(which ${PYTHON}) /usr/local/bin/python \
 && ln -s $(which ${PIP}) /usr/bin/pip

RUN ${PIP} install --no-cache-dir -U \
    pybind11 \
    cmake \
    scipy \
    Pillow \
    python-dateutil \
    requests \
    "awscli<2" \
    urllib3 \
    mpi4py \
 # Let's install TensorFlow separately in the end to avoid
 # the library version being overwritten
 && ${PIP} install --no-cache-dir -U \
    h5py \
    absl-py \
    opencv-python \
    werkzeug \
    psutil \
    "protobuf<4"

ADD https://raw.githubusercontent.com/aws/deep-learning-containers/master/src/deep_learning_container.py /usr/local/bin/deep_learning_container.py

RUN chmod +x /usr/local/bin/deep_learning_container.py

RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-${TF_VERSION}/license.txt -o /license.txt

########################################################
#  _____ ____ ____   ___
# | ____/ ___|___ \ |_ _|_ __ ___   __ _  __ _  ___
# |  _|| |     __) | | || '_ ` _ \ / _` |/ _` |/ _ \
# | |__| |___ / __/  | || | | | | | (_| | (_| |  __/
# |_____\____|_____||___|_| |_| |_|\__,_|\__, |\___|
#                                         |___/
#  ____           _
# |  _ \ ___  ___(_)_ __   ___
# | |_) / _ \/ __| | '_ \ / _ \
# |  _ <  __/ (__| | |_) |  __/
# |_| \_\___|\___|_| .__/ \___|
#                  |_|
########################################################

FROM common AS ec2
ARG TF_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/tensorflow/r2.16_aws/cpu/2024-07-12-00-09/tensorflow_cpu-2.16.2-cp310-cp310-linux_x86_64.whl

RUN ${PIP} install --no-cache-dir -U \
    ${TF_URL} \
    "tensorflow-io==0.37.*" \
    tensorflow-datasets

RUN HOME_DIR=/root \
 && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance*

# remove tmp files
RUN rm -rf /tmp/*

CMD ["/bin/bash"]

#################################################################
#  ____                   __  __       _
# / ___|  __ _  __ _  ___|  \/  | __ _| | _____ _ __
# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__|
#  ___) | (_| | (_| |  __/ |  | | (_| |   <  __/ |
# |____/ \__,_|\__, |\___|_|  |_|\__,_|_|\_\___|_|
#              |___/
#  ___                                ____           _
# |_ _|_ __ ___   __ _  __ _  ___   |  _ \ ___  ___(_)_ __   ___
#  | || '_ ` _ \ / _` |/ _` |/ _ \  | |_) / _ \/ __| | '_ \ / _ \
#  | || | | | | | (_| | (_| |  __/  |  _ <  __/ (__| | |_) |  __/
# |___|_| |_| |_|\__,_|\__, |\___|  |_| \_\___|\___|_| .__/ \___|
#                      |___/                         |_|
#################################################################

FROM common AS sagemaker

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

ARG TF_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/tensorflow/r2.16_aws/cpu/2024-07-12-00-09/tensorflow_cpu-2.16.2-cp310-cp310-linux_x86_64.whl

# sagemaker-specific environment variable
ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main

# dependencies for opencv
# these dependencies are not needed for gpu image
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
    libgtk2.0-dev \
 && apt-get install -y -qq libkrb5-dev \
 && apt-get install -y -qq libsasl2-dev libsasl2-modules \
 && apt-get install -y -qq krb5-user \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

# https://github.com/yaml/pyyaml/issues/601
# PyYaml less than 6.0.1 fails to build with cython v3 and above.
# tf-models-official uses older versions, breaking the install.
# going to install the older pyyaml and cython to get tf-models-official
# the sagemaker package will revert pyyaml back to 6 for its requirement
# and this is fine since sagemaker is more important than the models and
# the models still work on pyyaml 6 in this context.
# Need to install wheel before we can fix the pyyaml issue below
RUN pip install wheel \
 && pip install "cython<3" "pyyaml<6" --no-build-isolation

# https://github.com/tensorflow/models/issues/9267
# tf-models does not respect existing installations of TF and always installs open source TF
RUN ${PIP} install --no-cache-dir -U \
    tf-models-official==2.16.0 \
    tensorflow-text==2.16.1 \
 && ${PIP} uninstall -y tensorflow tensorflow-gpu \
 && ${PIP} install --no-cache-dir -U \
    ${TF_URL} \
    "tensorflow-io==0.37.*" \
    tensorflow-datasets

RUN $PYTHON -m pip install --no-cache-dir -U \
    numba \
    bokeh \
    imageio \
    opencv-python \
    plotly \
    seaborn \
    shap

RUN $PYTHON -m pip install --no-cache-dir -U \
    "sagemaker<3" \
    sagemaker-experiments==0.* \
    sagemaker-tensorflow-training \
    sagemaker-training \
    "sagemaker-studio-analytics-extension<1" \
    "sparkmagic<1" \
    "sagemaker-studio-sparkmagic-lib<1" \
    smclarify

# Remove python kernel installed by sparkmagic
RUN /usr/local/bin/jupyter-kernelspec remove -f python3

# remove tmp files
RUN rm -rf /tmp/*

RUN HOME_DIR=/root \
 && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance*

CMD ["/bin/bash"]

0 commit comments
