Skip to content

Commit d0c54fb

Browse files
fgbelidji and EC2 Default User authored
Hf pt 2 8 cu129 tr4 56 2 training (#5330)
* Added Dockerfile for pt 2.8 cu129 * Updated buildspecs * updated dlc_developer_config.toml * missing lib for docker-compose * formatting * force build * base image to us-west-2 * formatting * fix cve-77744 * Removed sigopt * Revert "updated dlc_developer_config.toml" This reverts commit d24262d. --------- Co-authored-by: EC2 Default User <ec2-user@ip-10-90-0-235.ec2.internal>
1 parent 93ff6b4 commit d0c54fb

File tree

3 files changed

+157
-6
lines changed

3 files changed

+157
-6
lines changed
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
2+
region: &REGION <set-$REGION-in-environment>
3+
base_framework: &BASE_FRAMEWORK pytorch
4+
framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
5+
6+
version: &VERSION 2.7.1
7+
short_version: &SHORT_VERSION "2.7"
8+
contributor: huggingface
9+
arch_type: x86
10+
11+
repository_info:
12+
training_repository: &TRAINING_REPOSITORY
13+
image_type: &TRAINING_IMAGE_TYPE training
14+
root: !join [ "huggingface/", *BASE_FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
15+
repository_name: &REPOSITORY_NAME !join ["pr", "-", "huggingface", "-", *BASE_FRAMEWORK, "-", *TRAINING_IMAGE_TYPE]
16+
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
17+
18+
context:
19+
training_context: &TRAINING_CONTEXT
20+
cuda-compatibility-lib:
21+
source: ../../build_artifacts/training/cuda-compatibility-lib.sh
22+
target: cuda-compatibility-lib.sh
23+
24+
images:
25+
BuildHuggingFacePytorchGpuPy312Cu128TrainingDockerImage:
26+
<<: *TRAINING_REPOSITORY
27+
build: &HUGGINGFACE_PYTORCH_GPU_TRAINING_PY3 false
28+
image_size_baseline: &IMAGE_SIZE_BASELINE 25000
29+
device_type: &DEVICE_TYPE gpu
30+
python_version: &DOCKER_PYTHON_VERSION py3
31+
tag_python_version: &TAG_PYTHON_VERSION py312
32+
cuda_version: &CUDA_VERSION cu128
33+
os_version: &OS_VERSION ubuntu22.04
34+
transformers_version: &TRANSFORMERS_VERSION 4.55.0
35+
datasets_version: &DATASETS_VERSION 4.0.0
36+
tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-',
37+
*CUDA_VERSION, '-', *OS_VERSION ]
38+
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /,
39+
*CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
40+
context:
41+
<<: *TRAINING_CONTEXT

huggingface/pytorch/training/buildspec.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ region: &REGION <set-$REGION-in-environment>
33
base_framework: &BASE_FRAMEWORK pytorch
44
framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
55

6-
version: &VERSION 2.7.1
7-
short_version: &SHORT_VERSION "2.7"
6+
version: &VERSION 2.8.0
7+
short_version: &SHORT_VERSION "2.8"
88
contributor: huggingface
99
arch_type: x86
1010

@@ -22,17 +22,17 @@ context:
2222
target: cuda-compatibility-lib.sh
2323

2424
images:
25-
BuildHuggingFacePytorchGpuPy312Cu128TrainingDockerImage:
25+
BuildHuggingFacePytorchGpuPy312Cu129TrainingDockerImage:
2626
<<: *TRAINING_REPOSITORY
2727
build: &HUGGINGFACE_PYTORCH_GPU_TRAINING_PY3 false
2828
image_size_baseline: &IMAGE_SIZE_BASELINE 25000
2929
device_type: &DEVICE_TYPE gpu
3030
python_version: &DOCKER_PYTHON_VERSION py3
3131
tag_python_version: &TAG_PYTHON_VERSION py312
32-
cuda_version: &CUDA_VERSION cu128
32+
cuda_version: &CUDA_VERSION cu129
3333
os_version: &OS_VERSION ubuntu22.04
34-
transformers_version: &TRANSFORMERS_VERSION 4.55.0
35-
datasets_version: &DATASETS_VERSION 4.0.0
34+
transformers_version: &TRANSFORMERS_VERSION 4.56.2
35+
datasets_version: &DATASETS_VERSION 4.1.0
3636
tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-',
3737
*CUDA_VERSION, '-', *OS_VERSION ]
3838
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /,
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
2+
# Refer to the page above to pull the latest PyTorch image.
3+
4+
# docker image region us-west-2
5+
FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.8.0-gpu-py312-cu129-ubuntu22.04-sagemaker
6+
7+
# Purge the emacs package from the base image, then auto-remove packages that are no longer needed.
# NOTE(review): emacs appears again in the apt-get install list later in this file — confirm the re-install is intended.
RUN apt-get remove -y --purge emacs && \
8+
9+
apt-get autoremove -y
10+
11+
LABEL maintainer="Amazon AI"
12+
LABEL dlc_major_version="1"
13+
14+
# version args
15+
ARG TRANSFORMERS_VERSION=4.56.2
16+
ARG DATASETS_VERSION=4.1.0
17+
ARG HUGGINGFACE_HUB_VERSION=0.35.3
18+
ARG DIFFUSERS_VERSION=0.35.1
19+
ARG EVALUATE_VERSION=0.4.3
20+
ARG ACCELERATE_VERSION=1.10.1
21+
ARG TRL_VERSION=0.23.0
22+
ARG PEFT_VERSION=0.17.1
23+
ARG FLASH_ATTN_VERSION=2.8.3
24+
ARG NINJA_VERSION=1.13.0
25+
ARG KERNELS_VERSION=0.9.0
26+
ARG PYTHON=python3
27+
28+
# TODO: Remove when the base image is updated
29+
# Upgrade pip before touching any packages; --no-cache-dir keeps pip's download cache out of the image layer,
# consistent with the other pip install commands chained in this RUN.
RUN pip install --no-cache-dir --upgrade pip \
30+
&& pip uninstall -y transformer-engine flash-attn pyarrow cryptography \
31+
&& pip install --no-cache-dir -U pyarrow cryptography pyopenssl Pillow \
32+
&& pip --no-cache-dir install --upgrade wheel setuptools \
33+
&& pip install --no-cache-dir -U "werkzeug==3.0.6"
34+
35+
# Pre-install kenlm without build isolation so it uses system cmake
36+
RUN pip install --no-cache-dir --no-build-isolation kenlm
37+
38+
# Install Hugging Face libraries and dependencies
39+
RUN pip install --no-cache-dir \
40+
huggingface_hub[hf_transfer,hf_xet]==${HUGGINGFACE_HUB_VERSION} \
41+
transformers[torch,sentencepiece,tokenizers,torch-speech,vision,integrations,timm,torch-vision,video,codecarbon,accelerate,mistral-common,chat-template,hub-kernels,sklearn,speech,audio,tiktoken,hf_xet,sagemaker]==${TRANSFORMERS_VERSION} \
42+
datasets==${DATASETS_VERSION} \
43+
diffusers==${DIFFUSERS_VERSION} \
44+
Jinja2 \
45+
tensorboard \
46+
bitsandbytes \
47+
kernels==${KERNELS_VERSION} \
48+
evaluate==${EVALUATE_VERSION} \
49+
accelerate==${ACCELERATE_VERSION} \
50+
ninja==${NINJA_VERSION} \
51+
trl==${TRL_VERSION} \
52+
peft==${PEFT_VERSION} \
53+
flash-attn==${FLASH_ATTN_VERSION}
54+
55+
# Override conflicting versions to satisfy datasets requirements
56+
RUN pip install --no-cache-dir dill==0.3.8 multiprocess==0.70.16 \
57+
&& pip install --no-cache-dir pathos==0.3.3 --no-deps \
58+
&& PATHOS_META=$(find /usr/local/lib -type f -path "*pathos-0.3.3.dist-info/METADATA") \
59+
&& sed -i 's/dill.*/dill/' $PATHOS_META \
60+
&& sed -i 's/multiprocess.*/multiprocess/' $PATHOS_META
61+
62+
# Fix CVE-77744 (NOTE(review): not in the standard CVE-YYYY-NNNN format — confirm the advisory ID): upgrade urllib3 to version 2.5.0 or higher
63+
# Remove sigopt to avoid dependency conflict (it's not essential for core functionality)
64+
RUN pip install --no-cache-dir -U "urllib3>=2.5.0" \
65+
# Grouped in a subshell so "|| true" only tolerates a failed sigopt uninstall;
# ungrouped, it would also turn a failed urllib3 upgrade into a successful build step.
&& (pip uninstall -y sigopt || true)
66+
67+
# Fix CVE-2023-48022: Remove Ray to eliminate vulnerability
68+
RUN pip uninstall -y ray
69+
70+
# TODO: remove this env variable once hf_transfer becomes a built-in feature
71+
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
72+
ENV HF_HUB_USER_AGENT_ORIGIN="aws:sagemaker:gpu-cuda:training"
73+
74+
RUN apt-get update \
75+
&& apt-get install -y --allow-change-held-packages --no-install-recommends \
76+
libgl1-mesa-glx \
77+
build-essential \
78+
ca-certificates \
79+
zlib1g-dev \
80+
openssl \
81+
python3-dev \
82+
pkg-config \
83+
check \
84+
curl \
85+
emacs \
86+
git \
87+
jq \
88+
unzip \
89+
vim \
90+
wget \
91+
libcrypt1 \
92+
&& rm -rf /var/lib/apt/lists/*
93+
94+
COPY cuda-compatibility-lib.sh /usr/local/bin/cuda-compatibility-lib.sh
95+
RUN chmod +x /usr/local/bin/cuda-compatibility-lib.sh
96+
97+
RUN apt-get update \
98+
&& apt-get upgrade -y \
99+
&& apt-get autoremove -y \
100+
&& apt-get clean \
101+
&& rm -rf /var/lib/apt/lists/*
102+
103+
RUN HOME_DIR=/root \
104+
# -f makes curl exit non-zero on an HTTP error instead of saving the error page as the zip;
# -sS hides the progress meter but still prints errors.
&& curl -fsS -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
105+
&& unzip -o ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
106+
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
107+
&& chmod +x /usr/local/bin/testOSSCompliance \
108+
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
109+
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
110+
&& rm -rf ${HOME_DIR}/oss_compliance*

0 commit comments

Comments
 (0)