aws · arjraman · Nov 6, 2025 · Oct 27, 2025 · Oct 27, 2025 · Oct 27, 2025
@@ -2,8 +2,8 @@ account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
 region: &REGION <set-$REGION-in-environment>
 base_framework: &BASE_FRAMEWORK pytorch
 framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
-version: &VERSION 2.7.0
-short_version: &SHORT_VERSION "2.7"
+version: &VERSION 2.8.0
+short_version: &SHORT_VERSION "2.8"
 contributor: huggingface
 arch_type: x86
 
@@ -25,19 +25,28 @@ context:
     deep_learning_container:
       source: ../../../src/deep_learning_container.py
       target: deep_learning_container.py
+    apex_setup:
+      source: docker/build_artifacts/apex_setup.py
+      target: apex_setup.py
+    nxdt_install_setup:
+      source: docker/build_artifacts/nxdt_install_setup.sh
+      target: nxdt_install_setup.sh
+    nxdt_requirements:
+      source: docker/build_artifacts/nxdt_requirements.txt
+      target: nxdt_requirements.txt
 
 images:
   BuildNeuronHFPytorchPy310TrainingDockerImage:
     <<: *TRAINING_REPOSITORY
     build: &HUGGINGFACE_PYTORCH_INF_TRAINING_PY3 false
-    image_size_baseline: 28000
+    image_size_baseline: 40000
     device_type: &DEVICE_TYPE neuronx
     python_version: &DOCKER_PYTHON_VERSION py3
     tag_python_version: &TAG_PYTHON_VERSION py310
-    neuron_sdk_version: &NEURON_SDK_VERSION sdk2.24.1
+    neuron_sdk_version: &NEURON_SDK_VERSION sdk2.26.0
     os_version: &OS_VERSION ubuntu22.04
-    transformers_version: &TRANSFORMERS_VERSION 4.51.0
-    datasets_version: &DATASETS_VERSION 2.18.0
+    transformers_version: &TRANSFORMERS_VERSION 4.55.4
+    datasets_version: &DATASETS_VERSION 4.1.1
     tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION,"-", *NEURON_SDK_VERSION, '-', *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *NEURON_SDK_VERSION, /Dockerfile., *DEVICE_TYPE ]
     context:

@@ -0,0 +1,333 @@
+ARG BUILD_STAGE=prod
+
+FROM public.ecr.aws/docker/library/ubuntu:22.04 AS base
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="2"
+
+ARG PYTHON=python3.10
+ARG PYTHON_VERSION=3.10.12
+ARG PIP=pip3
+ARG OMPI_VERSION=4.1.5
+
+# This ARG is required to stop the docker build from waiting for region configuration while installing tzdata on Ubuntu 22.04
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Python won't try to write .pyc or .pyo files on the import of source modules
+# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
+ENV PATH="/opt/aws/neuron/bin:${PATH}"
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
+ENV DGLBACKEND=pytorch
+
+RUN apt-get update \
+ && apt-get upgrade -y \
+ && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    emacs \
+    git \
+    gnupg2 \
+    gpg-agent \
+    jq \
+    libopencv-dev \
+    libglib2.0-0 \
+    libgl1-mesa-glx \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libssl-dev \
+    libsqlite3-dev \
+    libgdbm-dev \
+    libc6-dev \
+    libbz2-dev \
+    libncurses-dev \
+    libffi-dev \
+    libcap-dev \
+    libhwloc-dev \
+    openjdk-8-jdk-headless \
+    openjdk-8-jdk \
+    openjdk-8-jre \
+    openjdk-11-jdk \
+    openssl \
+    software-properties-common \
+    tk-dev \
+    unzip \
+    wget \
+    vim \
+    zlib1g-dev \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+# Build Open MPI ${OMPI_VERSION} from the upstream v4.1 source tarball under /tmp/openmpi, install it, refresh the linker cache, then delete the build tree
+RUN mkdir -p /tmp/openmpi \
+ && cd /tmp/openmpi \
+ && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
+ && tar zxf openmpi-${OMPI_VERSION}.tar.gz \
+ && cd openmpi-${OMPI_VERSION} \
+ && ./configure --enable-orterun-prefix-by-default \
+ && make -j $(nproc) all \
+ && make install \
+ && ldconfig \
+ && rm -rf /tmp/openmpi
+
+# Install openmpi-bin and openssh-server, then disable SSH host-key checking and sshd StrictModes for the MPI operator in k8s
+RUN apt-get update && apt-get install -y openmpi-bin openssh-server \
+ && mkdir -p /var/run/sshd \
+ && echo "    UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
+ && echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
+ && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+# Build CPython ${PYTHON_VERSION} from source into /usr/local, add pip/python symlinks, then remove the tarball and build tree (cleanup path is relative to the download directory)
+RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
+ && tar -xzf Python-$PYTHON_VERSION.tgz \
+ && cd Python-$PYTHON_VERSION \
+ && ./configure --enable-shared --prefix=/usr/local \
+ && make -j $(nproc) && make install \
+ && cd .. && rm -rf Python-$PYTHON_VERSION* \
+ && ln -s /usr/local/bin/pip3 /usr/bin/pip \
+ && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
+ && ${PIP} --no-cache-dir install --upgrade pip \
+ && rm -rf ~/.cache/pip/*
+
+WORKDIR /
+
+# The ENV variables declared below are changed in the previous section
+# Grouping these ENV variables in the first section causes
+# ompi_info to fail. This is only observed in CPU containers
+ENV PATH="$PATH:/home/.openmpi/bin"
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
+RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
+
+RUN ${PIP} install --no-cache-dir -U \
+    "bokeh>=2.3,<3" \
+    "awscli<2" \
+    scipy \
+    click \
+    "cryptography" \
+    "sagemaker>=2,<3" \
+    "sagemaker-pytorch-training" \
+    psutil==5.6.7 \
+    dataset \
+    Pillow \
+ && rm -rf ~/.cache/pip/*
+
+RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
+
+# attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0
+# protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3
+# awscli 1.25.47 has requirement docutils<0.17,>=0.10
+# etcd for kubernetes installation
+# awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9.
+# awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2
+RUN ${PIP} install --no-cache-dir -U \
+    "attrs<24,>=23.1.0" \
+    "docutils>=0.10,<0.17" \
+    "rsa<4.8,>=3.1.2" \
+    "python-etcd" \
+    "urllib3>=1.26.0,<1.27" \
+ # Install extra packages needed by sagemaker (for passing test_utility_packages_using_import)
+ && ${PIP} install --no-cache-dir -U \
+    "bokeh>=3.0.1,<4" \
+    "imageio>=2.22,<3" \
+    "opencv-python>=4.8.1.78" \
+    "plotly>=5.11,<6" \
+    "seaborn>=0.12,<1" \
+    "shap>=0.41,<1" \
+ && rm -rf ~/.cache/pip/*
+
+# The EFA installer runs apt-get itself, so refresh the package index first. The tarball signature is checked with gpg before unpacking, and the downloaded installer files are removed afterwards so they do not stay in the image layer
+RUN apt-get update \
+ && cd $HOME \
+ && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
+ && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
+ && cat aws-efa-installer.key | gpg --fingerprint \
+ && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
+ && tar -xf aws-efa-installer-latest.tar.gz \
+ && cd aws-efa-installer \
+ && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
+ && cd $HOME && rm -rf aws-efa-installer* \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+# Install some common packages used by training scripts
+# torchvision needed for MLP. since it depends on torch and torch neuron/torch
+# is already installed install it with nodeps
+RUN ${PIP} install --no-cache-dir --no-deps -U \
+    torchvision==0.23.0 \
+ # Needed for running bert training scripts
+ && ${PIP} install --no-cache-dir -U \
+    graphviz \
+    tensorboard==2.6 \
+    accelerate \
+ # Install NxDT dependencies
+ && ${PIP} install --no-cache-dir \
+    Cython \
+    wheel \
+ && rm -rf ~/.cache/pip/*
+
+# Copy workaround script for incorrect hostname
+COPY changehostname.c /
+COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/
+
+RUN HOME_DIR=/root \
+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+ && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+ && chmod +x /usr/local/bin/testOSSCompliance \
+ && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+ && rm -rf ${HOME_DIR}/oss_compliance* \
+ && rm -rf /tmp/tmp*
+
+RUN curl -o /license.txt  https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt
+
+# Set up the APT and pip repositories for Neuron artifacts; when NEURON_APT_REPO_KEY / NEURON_PIP_REPO_KEY are provided they are embedded as credentials in the repository URLs
+ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com
+ARG NEURON_APT_REPO_KEY
+ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com
+ARG NEURON_PIP_REPO_KEY
+RUN mkdir -p /etc/apt/keyrings \
+ && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
+ && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} focal main" > /etc/apt/sources.list.d/neuron.list \
+ && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") -sSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg \
+ && PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
+ && ${PIP} config set global.extra-index-url "${PIP_REPO_URL}"
+
+# Neuron SDK component versions; the prod stage installs exactly these apt and pip package versions
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58
+ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008
+ARG NEURONX_TOOLS_VERSION=2.26.14.0
+ARG NEURONX_FRAMEWORK_VERSION=2.8.0.2.10.13553+1e4dd6ca
+ARG NEURONX_CC_VERSION=2.21.18209.0+043b1bf7
+ARG NEURONX_DISTRIBUTED_VERSION=0.15.22404+1f27bddf
+
+FROM base AS repo
+
+# Install Neuron components from the apt and pip repos (latest versions)
+RUN apt-get update \
+ && apt-get install -y \
+    aws-neuronx-tools \
+    aws-neuronx-collectives \
+    aws-neuronx-runtime-lib \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+RUN ${PIP} install --no-cache-dir --force-reinstall \
+    torch-neuronx \
+    neuronx-cc \
+    neuronx_distributed \
+ && rm -rf ~/.cache/pip/*
+
+FROM base AS prod
+
+# Install Neuron components pinned to the versions given by the NEURONX_* build args; pip runs with --no-cache-dir like the repo stage
+RUN apt-get update \
+ && apt-get install -y \
+   aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
+   aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
+   aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+RUN ${PIP} install --no-cache-dir --force-reinstall \
+   torch-neuronx==$NEURONX_FRAMEWORK_VERSION \
+   neuronx-cc==$NEURONX_CC_VERSION \
+   neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \
+ && rm -rf ~/.cache/pip/*
+
+FROM ${BUILD_STAGE} AS final
+
+# Hugging Face version args
+ARG OPTIMUM_NEURON_VERSION=0.4.1
+ARG TRANSFORMERS_VERSION=4.55.4
+ARG DATASETS_VERSION=4.1.1
+ARG GEVENT_VERSION=24.10.3
+
+RUN apt-get remove -y --purge emacs && \
+apt-get autoremove -y
+
+# We need to set this environment variable to avoid the following error when building KenLM:
+# https://github.com/kpu/kenlm/issues/462
+ENV CMAKE_POLICY_VERSION_MINIMUM=3.5
+
+# Install Hugging Face libraries and their dependencies
+# optimum-neuron is installed with its "training" extra, pinned to OPTIMUM_NEURON_VERSION.
+# Requirement specifiers that contain [extras] are quoted so the shell cannot glob-expand them.
+RUN ${PIP} install --no-cache-dir \
+    evaluate \
+    "transformers[sklearn,sentencepiece,audio,vision]==${TRANSFORMERS_VERSION}" \
+    datasets==${DATASETS_VERSION} \
+    "optimum-neuron[training]==${OPTIMUM_NEURON_VERSION}" \
+    gevent==${GEVENT_VERSION} \
+ && rm -rf ~/.cache/pip/*
+
+# NOTE(review): numpy is listed without a version specifier below; confirm whether neuronx-cc still needs a pin
+# Update Pillow, urllib3, wandb versions to fix high and critical vulnerabilities
+# neuronx-cc has requirement networkx~=2.6
+RUN ${PIP} install --no-cache-dir -U \
+    "sagemaker>=2.237.0" \
+    sagemaker-training \
+    "sagemaker-pytorch-training<3.0.0" \
+    "tensorboard>=2.11.0" \
+    "numpy" \
+    "numba" \
+    "Pillow==10.3.0" \
+    "requests" \
+    wandb \
+    pytorch-lightning \
+    Jinja2 \
+    mlflow \
+    tornado \
+    "awscli<2" \
+    "boto3<2.0" \
+    "botocore<1.35.94,>=1.35.74" \
+    google-auth \
+    "urllib3>=1.26.17,<1.27" \
+    "networkx==2.8.8" \
+    bokeh \
+    "opencv-python<4.12.0" \
+    "fsspec==2025.9.0" \
+    "protobuf<4" \
+    "multiprocess<0.70.17" \
+ && rm -rf ~/.cache/pip/*
+
+# Install git-lfs and runtime shared libraries with apt-get (the apt front end has no stable scripting interface), run apt-get upgrade for apparmor, then clear apt lists and temp files
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+    git-lfs \
+    libgssapi-krb5-2 \
+    libexpat1 \
+    expat \
+    libarchive13 \
+    libgstreamer1.0-0 \
+    libgstreamer-plugins-base1.0-0 \
+ && apt-get upgrade -y apparmor \
+ && rm -rf /var/lib/apt/lists/* /tmp/tmp* \
+ && apt-get clean
+
+ENV WANDB_MODE=disabled
+
+# Starts framework
+ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
+CMD ["/bin/bash"]
+
+HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
@@ -0,0 +1,6 @@
+{
+    "76839": "[pkg: gevent] [installed: 24.10.3]",
+    "71691": "[pkg: mlflow] [installed: 3.4.0]",
+    "77740": "[pkg: protobuf] [installed: 3.20.3]",
+    "77744": "[pkg: urllib3] [installed: 1.26.20]"
+}