Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
b0c591e
update dockerfile
JingyaHuang Oct 27, 2025
5e6cae3
revertme: dlc_developer_config
JingyaHuang Oct 27, 2025
d0a6c55
Fix test errors
arjraman Oct 27, 2025
2e403b4
Merge branch 'master' into update-hf-neuronx-dlc-pt2.8-train
arjraman Oct 27, 2025
4b7550c
Fix test errors
arjraman Oct 27, 2025
7dca14e
Fix test errors
arjraman Oct 27, 2025
adfe88e
Fix test errors
arjraman Oct 27, 2025
ac80005
Fix test errors
arjraman Oct 27, 2025
9c62a23
Fix test errors
arjraman Oct 28, 2025
c93c2d3
Change instance type to ml.trn1.32xlarge
arjraman Oct 28, 2025
154bd50
Modify dockerfile to use py310
arjraman Oct 28, 2025
68cb8b0
Fix test errors
arjraman Oct 28, 2025
c5c8c3e
Fix test errors
arjraman Oct 28, 2025
ffb8f7f
Fix test errors
arjraman Oct 28, 2025
b33ba95
Fix test errors
arjraman Oct 28, 2025
c6131cd
Fix test errors
arjraman Oct 29, 2025
8a49160
Fix test errors
arjraman Oct 29, 2025
26ecdff
Update image_size_baseline to 40000
arjraman Oct 30, 2025
140c5b0
Merge branch 'master' into update-hf-neuronx-dlc-pt2.8-train
arjraman Oct 31, 2025
746cbd8
remove NxDT
JingyaHuang Nov 4, 2025
c8d1d74
remove NxDT
JingyaHuang Nov 4, 2025
4e473a2
Fix test errors
arjraman Nov 4, 2025
09321b5
Fix test errors
arjraman Nov 4, 2025
f24cd92
Fix test errors
arjraman Nov 4, 2025
a2fecbe
Fix test errors
arjraman Nov 4, 2025
1d022c6
Merge branch 'master' into update-hf-neuronx-dlc-pt2.8-train
arjraman Nov 5, 2025
e6624ae
Fix test errors
arjraman Nov 6, 2025
857fdf3
Merge branch 'master' into update-hf-neuronx-dlc-pt2.8-train
arjraman Nov 6, 2025
7525344
Fix test errors
arjraman Nov 6, 2025
eefa608
Fix test errors
arjraman Nov 6, 2025
d6abca7
Fix test errors
arjraman Nov 6, 2025
d544da8
Fix test errors
arjraman Nov 6, 2025
5033121
Fix test errors
arjraman Nov 6, 2025
8850b2a
Update developer configuration settings
arjraman Nov 6, 2025
c608668
Merge branch 'master' into update-hf-neuronx-dlc-pt2.8-train
arjraman Nov 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions huggingface/pytorch/training/buildspec-neuronx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
region: &REGION <set-$REGION-in-environment>
base_framework: &BASE_FRAMEWORK pytorch
framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
version: &VERSION 2.7.0
short_version: &SHORT_VERSION "2.7"
version: &VERSION 2.8.0
short_version: &SHORT_VERSION "2.8"
contributor: huggingface
arch_type: x86

Expand All @@ -25,19 +25,28 @@ context:
deep_learning_container:
source: ../../../src/deep_learning_container.py
target: deep_learning_container.py
apex_setup:
source: docker/build_artifacts/apex_setup.py
target: apex_setup.py
nxdt_install_setup:
source: docker/build_artifacts/nxdt_install_setup.sh
target: nxdt_install_setup.sh
nxdt_requirements:
source: docker/build_artifacts/nxdt_requirements.txt
target: nxdt_requirements.txt

images:
BuildNeuronHFPytorchPy310TrainingDockerImage:
<<: *TRAINING_REPOSITORY
build: &HUGGINGFACE_PYTORCH_INF_TRAINING_PY3 false
image_size_baseline: 28000
image_size_baseline: 40000
device_type: &DEVICE_TYPE neuronx
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py310
neuron_sdk_version: &NEURON_SDK_VERSION sdk2.24.1
neuron_sdk_version: &NEURON_SDK_VERSION sdk2.26.0
os_version: &OS_VERSION ubuntu22.04
transformers_version: &TRANSFORMERS_VERSION 4.51.0
datasets_version: &DATASETS_VERSION 2.18.0
transformers_version: &TRANSFORMERS_VERSION 4.55.4
datasets_version: &DATASETS_VERSION 4.1.1
tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION,"-", *NEURON_SDK_VERSION, '-', *OS_VERSION ]
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *NEURON_SDK_VERSION, /Dockerfile., *DEVICE_TYPE ]
context:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,333 @@
# Selects which stage the final image is derived from (consumed by the last FROM
# in this file); the default is the version-pinned "prod" stage.
ARG BUILD_STAGE=prod

FROM public.ecr.aws/docker/library/ubuntu:22.04 AS base

LABEL maintainer="Amazon AI" \
      dlc_major_version="2"

# Interpreter, pip executable and source versions used by the RUN steps below.
ARG OMPI_VERSION=4.1.5
ARG PYTHON_VERSION=3.10.12
ARG PYTHON=python3.10
ARG PIP=pip3

# Build-time only: stops apt from waiting on interactive prompts, such as the
# region question asked while installing tzdata on Ubuntu 22.
ARG DEBIAN_FRONTEND=noninteractive

# Python runtime behaviour:
#   PYTHONDONTWRITEBYTECODE - do not write .pyc/.pyo files when importing source modules
#   PYTHONUNBUFFERED        - keep stdin, stdout and stderr unbuffered, which is good for logging
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING=UTF-8
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8

# Shared-library search path for the Neuron, EFA, Open MPI and /usr/local libraries.
# "${VAR:+${VAR}:}" emits the previous value plus a separator only when the variable
# is already set. Appending to an unset variable would produce a leading ":", i.e. an
# empty entry, which the dynamic loader treats as the current working directory.
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/opt/aws/neuron/lib:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/opt/amazon/openmpi/lib64:/usr/local/lib"
ENV PATH="/opt/aws/neuron/bin:${PATH}"
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
ENV DGLBACKEND=pytorch

# Refresh the package index, apply pending upgrades and install the OS-level
# build tools, libraries, JDKs and utilities. The package list is kept in
# alphabetical order; apt lists and temp files are removed in the same layer.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        emacs \
        git \
        gnupg2 \
        gpg-agent \
        jq \
        libbz2-dev \
        libc6-dev \
        libcap-dev \
        libffi-dev \
        libgdbm-dev \
        libgl1-mesa-glx \
        libglib2.0-0 \
        libhwloc-dev \
        libncurses-dev \
        libopencv-dev \
        libsm6 \
        libsqlite3-dev \
        libssl-dev \
        libxext6 \
        libxrender-dev \
        openjdk-11-jdk \
        openjdk-8-jdk \
        openjdk-8-jdk-headless \
        openjdk-8-jre \
        openssl \
        software-properties-common \
        tk-dev \
        unzip \
        vim \
        wget \
        zlib1g-dev && \
    rm -rf /var/lib/apt/lists/* && \
    rm -rf /tmp/tmp* && \
    apt-get clean

# Build Open MPI ${OMPI_VERSION} from the upstream source tarball and install it
# (no --prefix is passed to ./configure). The scratch directory under /tmp is
# deleted in the same layer.
RUN mkdir -p /tmp/openmpi && \
    cd /tmp/openmpi && \
    wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz && \
    tar -xzf openmpi-${OMPI_VERSION}.tar.gz && \
    cd openmpi-${OMPI_VERSION} && \
    ./configure --enable-orterun-prefix-by-default && \
    make -j "$(nproc)" all && \
    make install && \
    ldconfig && \
    rm -rf /tmp/openmpi

# SSH setup for the MPI operator in k8s: install the server, create its runtime
# directory, stop the client from recording or checking host keys, and switch
# StrictModes off in the daemon configuration.
RUN apt-get update && \
    apt-get install -y openmpi-bin openssh-server && \
    mkdir -p /var/run/sshd && \
    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config && \
    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \
    rm -rf /var/lib/apt/lists/* && \
    rm -rf /tmp/tmp* && \
    apt-get clean

# Build CPython ${PYTHON_VERSION} from source into /usr/local (shared library
# enabled), expose it as `python` and `pip`, then upgrade pip itself.
# After `cd ..` the shell is back in the directory that holds the tarball and the
# extracted tree, so both are removed relative to that directory. The previous
# `../Python-...` path only matched because the build happened to run in `/`.
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
 && tar -xzf Python-$PYTHON_VERSION.tgz \
 && cd Python-$PYTHON_VERSION \
 && ./configure --enable-shared --prefix=/usr/local \
 && make -j $(nproc) && make install \
 && cd .. && rm -rf Python-$PYTHON_VERSION* \
 && ln -s /usr/local/bin/pip3 /usr/bin/pip \
 && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
 && ${PIP} --no-cache-dir install --upgrade pip \
 && rm -rf ~/.cache/pip/*

WORKDIR /

# PATH and LD_LIBRARY_PATH are extended here, after the install steps above,
# rather than in the ENV group at the top of the file: declaring them there
# makes ompi_info fail (only observed in CPU containers).
ENV PATH="${PATH}:/home/.openmpi/bin" \
    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/home/.openmpi/lib/"

# Build-time check: the step fails unless ompi_info reports the
# mpi_built_with_cuda_support value line.
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value

# General-purpose Python packages plus the SageMaker SDK and training toolkit.
RUN ${PIP} install --no-cache-dir -U \
        "bokeh>=2.3,<3" \
        "awscli<2" \
        "scipy" \
        "click" \
        "cryptography" \
        "sagemaker>=2,<3" \
        "sagemaker-pytorch-training" \
        "psutil==5.6.7" \
        "dataset" \
        "Pillow" && \
    rm -rf ~/.cache/pip/*

# Make the system CA bundle also available at /etc/pki/tls/certs/ca-bundle.crt.
# NOTE(review): presumably for tools that look for the bundle at that path — confirm.
RUN mkdir -p /etc/pki/tls/certs && \
    cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt

# Dependency constraints recorded for this step:
#   attrs       - neuronx-cc requires >=19.2.0; sagemaker requires >=23.1.0,<24
#   protobuf    - neuronx-cc requires <4; sagemaker-training requires >=3.9.2,<=3.20.3
#   docutils    - awscli 1.25.47 requires >=0.10,<0.17
#   rsa         - awscli 1.27.127 requires >=3.1.2,<4.8 (rsa 4.9 does not satisfy it)
#   urllib3     - awscli 1.27.127 requires <1.27; python-etcd requires >=1.7;
#                 the latest urllib3 release is 2.0.2
#   python-etcd - etcd, for the kubernetes installation
RUN ${PIP} install --no-cache-dir -U \
        "attrs<24,>=23.1.0" \
        "docutils>=0.10,<0.17" \
        "rsa<4.8,>=3.1.2" \
        "python-etcd" \
        "urllib3>=1.26.0,<1.27" && \
    # Extra packages needed by sagemaker (for passing test_utility_packages_using_import)
    ${PIP} install --no-cache-dir -U \
        "bokeh>=3.0.1,<4" \
        "imageio>=2.22,<3" \
        "opencv-python>=4.8.1.78" \
        "plotly>=5.11,<6" \
        "seaborn>=0.12,<1" \
        "shap>=0.41,<1" && \
    rm -rf ~/.cache/pip/*

# Install EFA. The installer itself calls apt-get, so the package index is
# refreshed first. The tarball is checked against its detached GPG signature
# before it is unpacked.
# The downloaded tarball, signature, public key and the extracted installer tree
# are removed in this same layer so they do not remain in the image under $HOME.
# NOTE(review): "aws-efa-installer-latest" is not a pinned version, so rebuilds
# may pick up a different installer — confirm this is intended.
RUN apt-get update \
 && cd $HOME \
 && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
 && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
 && cat aws-efa-installer.key | gpg --fingerprint \
 && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
 && tar -xf aws-efa-installer-latest.tar.gz \
 && cd aws-efa-installer \
 && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
 && cd $HOME \
 && rm -rf aws-efa-installer \
        aws-efa-installer-latest.tar.gz \
        aws-efa-installer-latest.tar.gz.sig \
        aws-efa-installer.key \
 && rm -rf /var/lib/apt/lists/* \
 && rm -rf /tmp/tmp* \
 && apt-get clean

# Common packages used by training scripts.
# torchvision (needed for MLP) depends on torch; it is installed with --no-deps
# so that pip does not resolve torch itself.
RUN ${PIP} install --no-cache-dir --no-deps -U \
        "torchvision==0.23.0" && \
    # Needed for running bert training scripts
    ${PIP} install --no-cache-dir -U \
        "graphviz" \
        "tensorboard==2.6" \
        "accelerate" && \
    # NxDT dependencies
    ${PIP} install --no-cache-dir \
        "Cython" \
        "wheel" && \
    rm -rf ~/.cache/pip/*

# Workaround for an incorrect hostname: the C source goes to the filesystem
# root, while the launcher script and deep_learning_container.py are placed in
# /usr/local/bin with mode 755.
COPY changehostname.c /
COPY --chmod=755 \
     start_with_right_hostname.sh \
     deep_learning_container.py \
     /usr/local/bin/

# OSS compliance: download the helper bundle, install its testOSSCompliance
# script into /usr/local/bin, run the generator for ${PYTHON}, then delete the
# bundle and temp files within the same layer.
RUN HOME_DIR=/root && \
    curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \
    unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \
    cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \
    chmod +x /usr/local/bin/testOSSCompliance && \
    chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
    ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} && \
    rm -rf ${HOME_DIR}/oss_compliance* && \
    rm -rf /tmp/tmp*

# License text for the PyTorch 2.8 image, stored at /license.txt.
RUN curl --output /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt

# Configure the APT and pip repositories that serve the Neuron artifacts.
#   NEURON_APT_REPO / NEURON_PIP_REPO         - repository host names
#   NEURON_APT_REPO_KEY / NEURON_PIP_REPO_KEY - optional; when non-empty the value is placed
#                                               in front of the host name as "<key>@<host>"
# Steps: create the keyring directory, write the APT source entry, download and
# de-armor the repository signing key, then register the pip repository as an
# extra index URL.
# NOTE(review): a non-empty key ends up in /etc/apt/sources.list.d/neuron.list and in
# pip's configuration, so it would persist in the image — confirm that is acceptable.
# NOTE(review): the APT suite is hard-coded to "focal" although the base image is
# ubuntu:22.04 — confirm this is intentional.
# NOTE(review): the curl | gpg pipeline runs without pipefail, so a failed download is
# only caught if gpg itself rejects the input — verify.
ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com
ARG NEURON_APT_REPO_KEY
ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com
ARG NEURON_PIP_REPO_KEY
RUN mkdir -p /etc/apt/keyrings \
&& APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
&& echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} focal main" > /etc/apt/sources.list.d/neuron.list \
&& curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") -sSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg \
&& PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
&& ${PIP} config set global.extra-index-url "${PIP_REPO_URL}"

# Neuron SDK component versions, consumed by the version-pinned "prod" stage.

# Installed with apt
ARG NEURONX_TOOLS_VERSION=2.26.14.0
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58
ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008

# Installed with pip
ARG NEURONX_FRAMEWORK_VERSION=2.8.0.2.10.13553+1e4dd6ca
ARG NEURONX_CC_VERSION=2.21.18209.0+043b1bf7
ARG NEURONX_DISTRIBUTED_VERSION=0.15.22404+1f27bddf

FROM base AS repo

# Unpinned variant: take whatever versions of the Neuron components the
# configured apt and pip repositories currently serve.
RUN apt-get update && \
    apt-get install -y \
        aws-neuronx-collectives \
        aws-neuronx-runtime-lib \
        aws-neuronx-tools && \
    rm -rf /var/lib/apt/lists/* && \
    rm -rf /tmp/tmp* && \
    apt-get clean

RUN ${PIP} install --no-cache-dir --force-reinstall \
        "torch-neuronx" \
        "neuronx-cc" \
        "neuronx_distributed" && \
    rm -rf ~/.cache/pip/*

FROM base AS prod

# Pinned variant: install the Neuron apt packages at the exact versions given by
# the NEURONX_*_VERSION build arguments.
RUN apt-get update \
 && apt-get install -y \
    aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
    aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
    aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
 && rm -rf /var/lib/apt/lists/* \
 && rm -rf /tmp/tmp* \
 && apt-get clean

# Pinned Neuron Python packages. --no-cache-dir matches the other pip steps in
# this file and keeps pip from writing a download cache during the step.
RUN ${PIP} install --no-cache-dir --force-reinstall \
    torch-neuronx==$NEURONX_FRAMEWORK_VERSION \
    neuronx-cc==$NEURONX_CC_VERSION \
    neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \
 && rm -rf ~/.cache/pip/*

FROM ${BUILD_STAGE} AS final

# Versions of the Hugging Face stack (and gevent) installed in this stage.
ARG OPTIMUM_NEURON_VERSION=0.4.1
ARG TRANSFORMERS_VERSION=4.55.4
ARG DATASETS_VERSION=4.1.1
ARG GEVENT_VERSION=24.10.3

# Purge emacs together with any packages that are no longer required.
RUN apt-get remove -y --purge emacs \
 && apt-get autoremove -y

# Avoids a KenLM build error; see https://github.com/kpu/kenlm/issues/462
ENV CMAKE_POLICY_VERSION_MINIMUM=3.5

# Install the Hugging Face libraries and their dependencies at the versions
# given by the build arguments; optimum-neuron is installed with its "training"
# extra.
# Requirement specifiers that contain [extras] are quoted: unquoted square
# brackets are shell glob characters and could be expanded against file names
# in the working directory.
RUN ${PIP} install --no-cache-dir \
    evaluate \
    "transformers[sklearn,sentencepiece,audio,vision]==${TRANSFORMERS_VERSION}" \
    "datasets==${DATASETS_VERSION}" \
    "optimum-neuron[training]==${OPTIMUM_NEURON_VERSION}" \
    "gevent==${GEVENT_VERSION}" \
 && rm -rf ~/.cache/pip/*

# Final upgrade/constraint pass over the Python environment.
#   - Pillow, urllib3 and wandb are updated to fix high and critical vulnerabilities
#   - neuronx-cc has requirement networkx~=2.6
# NOTE(review): an earlier comment here said numpy is pinned to the version required
# by neuronx-cc, but numpy carries no version specifier below — confirm whether a
# pin is still needed.
# --no-cache-dir matches the other pip steps in this file.
RUN ${PIP} install --no-cache-dir -U \
    "sagemaker>=2.237.0" \
    sagemaker-training \
    "sagemaker-pytorch-training<3.0.0" \
    "tensorboard>=2.11.0" \
    "numpy" \
    "numba" \
    "Pillow==10.3.0" \
    "requests" \
    wandb \
    pytorch-lightning \
    Jinja2 \
    mlflow \
    tornado \
    "awscli<2" \
    "boto3<2.0" \
    "botocore<1.35.94,>=1.35.74" \
    google-auth \
    "urllib3>=1.26.17,<1.27" \
    "networkx==2.8.8" \
    bokeh \
    "opencv-python<4.12.0" \
    "fsspec==2025.9.0" \
    "protobuf<4" \
    "multiprocess<0.70.17" \
 && rm -rf ~/.cache/pip/*

# Extra runtime packages for the final image, followed by an upgrade of apparmor.
# apt-get is used instead of apt: the apt front end is meant for interactive use
# and warns that its command-line interface is not stable for scripts.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
    git-lfs \
    libgssapi-krb5-2 \
    libexpat1 \
    expat \
    libarchive13 \
    libgstreamer1.0-0 \
    libgstreamer-plugins-base1.0-0 \
 && apt-get upgrade -y apparmor \
 && rm -rf /var/lib/apt/lists/* \
 && rm -rf /tmp/tmp* \
 && apt-get clean

# Weights & Biases is switched off by default through its mode variable.
ENV WANDB_MODE=disabled

# Container start-up: bash runs start_with_right_hostname.sh (copied to
# /usr/local/bin earlier in this file) with the -m option; the default command
# is an interactive shell.
# NOTE(review): presumably the script is resolved through PATH and receives CMD
# as its arguments — confirm against the script.
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
CMD ["/bin/bash"]

# Health probe: an HTTP GET on localhost:8080/ping; a non-success response marks
# the container unhealthy.
# NOTE(review): nothing in this file starts a server on port 8080 — confirm the
# probe is meaningful for a training image.
HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"76839": "[pkg: gevent] [installed: 24.10.3]",
"71691": "[pkg: mlflow] [installed: 3.4.0]",
"77740": "[pkg: protobuf] [installed: 3.20.3]",
"77744": "[pkg: urllib3] [installed: 1.26.20]"
}
Loading