Hf pt 2 8 cu129 tr4 56 2 training (#5330)

fgbelidji · EC2 Default User · web-flow · commit d0c54fb5368e · 2025-10-02T20:03:09.000Z
* Added Dockerfile for pt 2.8 cu129 * Updated buildspecs * updated dlc_developer_config.toml * missing lib for docker-compose * formatting * force build * base image to us-west-2 * formatting * fix cve-77744 * Removed sigopt * Revert "updated dlc_developer_config.toml" This reverts commit d24262d. --------- Co-authored-by: EC2 Default User <ec2-user@ip-10-90-0-235.ec2.internal>
diff --git a/huggingface/pytorch/training/buildspec-2-7.yml b/huggingface/pytorch/training/buildspec-2-7.yml
@@ -0,0 +1,41 @@
+account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
+region: &REGION <set-$REGION-in-environment>
+base_framework: &BASE_FRAMEWORK pytorch
+framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
+
+version: &VERSION 2.7.1  # first element of the image `tag` join below
+short_version: &SHORT_VERSION "2.7"  # path segment in the `docker_file` join below
+contributor: huggingface
+arch_type: x86
+
+repository_info:
+  training_repository: &TRAINING_REPOSITORY  # merged (<<) into the image entry under `images`
+    image_type: &TRAINING_IMAGE_TYPE training
+    root: !join [ "huggingface/", *BASE_FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
+    repository_name: &REPOSITORY_NAME !join ["pr", "-", "huggingface", "-", *BASE_FRAMEWORK, "-", *TRAINING_IMAGE_TYPE]
+    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]  # built from the account/region placeholders above
+
+context:
+  training_context: &TRAINING_CONTEXT  # merged (<<) into the image entry's `context` mapping
+    cuda-compatibility-lib:
+      source: ../../build_artifacts/training/cuda-compatibility-lib.sh
+      target: cuda-compatibility-lib.sh
+
+images:
+  BuildHuggingFacePytorchGpuPy312Cu128TrainingDockerImage:
+    <<: *TRAINING_REPOSITORY  # image_type, root, repository_name, repository
+    build: &HUGGINGFACE_PYTORCH_GPU_TRAINING_PY3 false
+    image_size_baseline: &IMAGE_SIZE_BASELINE 25000
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3  # used in the docker_file path
+    tag_python_version: &TAG_PYTHON_VERSION py312  # used in the image tag
+    cuda_version: &CUDA_VERSION cu128
+    os_version: &OS_VERSION ubuntu22.04
+    transformers_version: &TRANSFORMERS_VERSION 4.55.0
+    datasets_version: &DATASETS_VERSION 4.0.0
+    tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-',
+                 *TAG_PYTHON_VERSION, '-', *CUDA_VERSION, '-', *OS_VERSION ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION,
+                         /Dockerfile., *DEVICE_TYPE ]
+    context:
+      <<: *TRAINING_CONTEXT
diff --git a/huggingface/pytorch/training/buildspec.yml b/huggingface/pytorch/training/buildspec.yml
@@ -3,8 +3,8 @@ region: &REGION <set-$REGION-in-environment>
 base_framework: &BASE_FRAMEWORK pytorch
 framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
 
-version: &VERSION 2.7.1
-short_version: &SHORT_VERSION "2.7"
+version: &VERSION 2.8.0
+short_version: &SHORT_VERSION "2.8"
 contributor: huggingface
 arch_type: x86
 
@@ -22,17 +22,17 @@ context:
       target: cuda-compatibility-lib.sh
 
 images:
-  BuildHuggingFacePytorchGpuPy312Cu128TrainingDockerImage:
+  BuildHuggingFacePytorchGpuPy312Cu129TrainingDockerImage:
     <<: *TRAINING_REPOSITORY
     build: &HUGGINGFACE_PYTORCH_GPU_TRAINING_PY3 false
     image_size_baseline: &IMAGE_SIZE_BASELINE 25000
     device_type: &DEVICE_TYPE gpu
     python_version: &DOCKER_PYTHON_VERSION py3
     tag_python_version: &TAG_PYTHON_VERSION py312
-    cuda_version: &CUDA_VERSION cu128
+    cuda_version: &CUDA_VERSION cu129
     os_version: &OS_VERSION ubuntu22.04
-    transformers_version: &TRANSFORMERS_VERSION 4.55.0
-    datasets_version: &DATASETS_VERSION 4.0.0
+    transformers_version: &TRANSFORMERS_VERSION 4.56.2
+    datasets_version: &DATASETS_VERSION 4.1.0
     tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-',
                  *CUDA_VERSION, '-', *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /,
diff --git a/huggingface/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/huggingface/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -0,0 +1,110 @@
+# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
+# Refer to the page above to find the latest PyTorch image to build from.
+
+# The base image is pulled from the us-west-2 ECR registry.
+FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.8.0-gpu-py312-cu129-ubuntu22.04-sagemaker
+
+# Purge the emacs shipped in the base image (emacs is installed again by a later apt-get step in this file)
+RUN apt-get remove -y --purge emacs \
+ && apt-get autoremove -y
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="1"
+
+# Build args: pinned package versions for the pip installs below; PYTHON is passed to the OSS compliance script
+ARG TRANSFORMERS_VERSION=4.56.2
+ARG DATASETS_VERSION=4.1.0
+ARG HUGGINGFACE_HUB_VERSION=0.35.3
+ARG DIFFUSERS_VERSION=0.35.1
+ARG EVALUATE_VERSION=0.4.3
+ARG ACCELERATE_VERSION=1.10.1
+ARG TRL_VERSION=0.23.0
+ARG PEFT_VERSION=0.17.1
+ARG FLASH_ATTN_VERSION=2.8.3
+ARG NINJA_VERSION=1.13.0
+ARG KERNELS_VERSION=0.9.0
+ARG PYTHON=python3
+
+# TODO: remove this step once the base image is updated
+RUN pip install -U pip \
+ && pip uninstall --yes transformer-engine flash-attn pyarrow cryptography \
+ && pip install --no-cache-dir --upgrade pyarrow cryptography pyopenssl Pillow \
+ && pip install --no-cache-dir --upgrade wheel setuptools \
+ && pip install --no-cache-dir --upgrade "werkzeug==3.0.6"
+
+# Pre-install kenlm with build isolation disabled so its build uses the tools already in the image (system cmake)
+RUN pip install --no-cache-dir --no-build-isolation kenlm
+
+# Install Hugging Face libraries and dependencies (pinned specifiers are quoted so the shell cannot glob the [extras])
+RUN pip install --no-cache-dir \
+    "huggingface_hub[hf_transfer,hf_xet]==${HUGGINGFACE_HUB_VERSION}" \
+    "transformers[torch,sentencepiece,tokenizers,torch-speech,vision,integrations,timm,torch-vision,video,codecarbon,accelerate,mistral-common,chat-template,hub-kernels,sklearn,speech,audio,tiktoken,hf_xet,sagemaker]==${TRANSFORMERS_VERSION}" \
+    "datasets==${DATASETS_VERSION}" \
+    "diffusers==${DIFFUSERS_VERSION}" \
+    Jinja2 \
+    tensorboard \
+    bitsandbytes \
+    "kernels==${KERNELS_VERSION}" \
+    "evaluate==${EVALUATE_VERSION}" \
+    "accelerate==${ACCELERATE_VERSION}" \
+    "ninja==${NINJA_VERSION}" \
+    "trl==${TRL_VERSION}" \
+    "peft==${PEFT_VERSION}" \
+    "flash-attn==${FLASH_ATTN_VERSION}"
+
+# Pin dill/multiprocess for datasets, then edit pathos' METADATA so text after "dill"/"multiprocess" on a line is dropped
+RUN pip install --no-cache-dir dill==0.3.8 multiprocess==0.70.16 \
+ && pip install --no-cache-dir --no-deps pathos==0.3.3 \
+ && PATHOS_META=$(find /usr/local/lib -type f -path "*pathos-0.3.3.dist-info/METADATA") \
+ && sed -i \
+      -e 's/dill.*/dill/' -e 's/multiprocess.*/multiprocess/' $PATHOS_META
+
+# Security fix (recorded as "CVE-77744"; not a standard CVE id - TODO confirm the identifier): require urllib3 >= 2.5.0.
+# sigopt is removed to avoid a dependency conflict; only that removal is best-effort, a failed upgrade fails the build.
+RUN pip install --no-cache-dir -U "urllib3>=2.5.0" \
+ && (pip uninstall -y sigopt || true)
+
+# Mitigate CVE-2023-48022 by uninstalling Ray from the image
+RUN pip uninstall -y ray
+
+# TODO: remove HF_HUB_ENABLE_HF_TRANSFER once hf_transfer becomes a built-in feature
+ENV HF_HUB_ENABLE_HF_TRANSFER="1"
+ENV HF_HUB_USER_AGENT_ORIGIN="aws:sagemaker:gpu-cuda:training"
+
+# OS-level build tools, libraries and utilities (listed alphabetically); the apt lists are removed afterwards
+RUN apt-get update && apt-get install -y --allow-change-held-packages --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    check \
+    curl \
+    emacs \
+    git \
+    jq \
+    libcrypt1 \
+    libgl1-mesa-glx \
+    openssl \
+    pkg-config \
+    python3-dev \
+    unzip \
+    vim \
+    wget \
+    zlib1g-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# Copy cuda-compatibility-lib.sh from the build context into /usr/local/bin and make it executable
+COPY cuda-compatibility-lib.sh /usr/local/bin/cuda-compatibility-lib.sh
+RUN chmod +x /usr/local/bin/cuda-compatibility-lib.sh
+
+# Upgrade installed OS packages, then drop unused packages and the apt caches
+RUN apt-get update && apt-get upgrade -y \
+ && apt-get autoremove -y && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+# Fetch the OSS compliance bundle, install testOSSCompliance, run generate_oss_compliance.sh, then delete the bundle
+RUN HOME_DIR=/root \
+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+ && unzip -o ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+ && chmod +x /usr/local/bin/testOSSCompliance ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+ && rm -rf ${HOME_DIR}/oss_compliance*