Skip to content

Commit f692aae

Browse files
Merge branch 'master' into patch-hf-images
2 parents 3a77772 + 3294e1b commit f692aae

File tree

7 files changed

+834
-333
lines changed

7 files changed

+834
-333
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
# Build specification for the Hugging Face PyTorch 2.5.1 GPU training image.
#
# `!join` is a custom tag that the consuming loader must define; presumably it
# concatenates the items of the list it is applied to — confirm against the
# loader. The account_id and region values are placeholders which, as their
# text says, are meant to be set from the environment.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
region: &REGION <set-$REGION-in-environment>
base_framework: &BASE_FRAMEWORK pytorch
framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
version: &VERSION 2.5.1
short_version: &SHORT_VERSION "2.5"
contributor: huggingface
arch_type: x86

# Repository settings; merged into the image entry below through `<<`.
repository_info:
  training_repository: &TRAINING_REPOSITORY
    image_type: &TRAINING_IMAGE_TYPE training
    root: !join ["huggingface/", *BASE_FRAMEWORK, "/", *TRAINING_IMAGE_TYPE]
    repository_name: &REPOSITORY_NAME !join ["pr", "-", "huggingface", "-", *BASE_FRAMEWORK, "-", *TRAINING_IMAGE_TYPE]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]

# Extra files made available to the image build, as source/target pairs.
context:
  training_context: &TRAINING_CONTEXT
    cuda-compatibility-lib:
      source: ../../build_artifacts/training/cuda-compatibility-lib.sh
      target: cuda-compatibility-lib.sh

images:
  BuildHuggingFacePytorchGpuPy311Cu124TrainingDockerImage:
    <<: *TRAINING_REPOSITORY
    build: &HUGGINGFACE_PYTORCH_GPU_TRAINING_PY3 false
    image_size_baseline: &IMAGE_SIZE_BASELINE 21500
    device_type: &DEVICE_TYPE gpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py311
    cuda_version: &CUDA_VERSION cu124
    os_version: &OS_VERSION ubuntu22.04
    transformers_version: &TRANSFORMERS_VERSION 4.49.0
    datasets_version: &DATASETS_VERSION 3.3.2
    # Image tag assembled from the version, transformers version, device type,
    # Python tag, CUDA version and OS version defined above.
    tag: !join [
      *VERSION, "-", "transformers", *TRANSFORMERS_VERSION, "-",
      *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-",
      *CUDA_VERSION, "-", *OS_VERSION ]
    # Dockerfile path assembled from short version, Python version, CUDA
    # version and device type.
    docker_file: !join [
      "docker/", *SHORT_VERSION, "/", *DOCKER_PYTHON_VERSION, "/",
      *CUDA_VERSION, "/Dockerfile.", *DEVICE_TYPE ]
    context:
      <<: *TRAINING_CONTEXT
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
# Build specification for the Hugging Face PyTorch 2.6.0 GPU training image.
#
# `!join` is a custom tag that the consuming loader must define; presumably it
# concatenates the items of the list it is applied to — confirm against the
# loader. The account_id and region values are placeholders which, as their
# text says, are meant to be set from the environment.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
region: &REGION <set-$REGION-in-environment>
base_framework: &BASE_FRAMEWORK pytorch
framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]

version: &VERSION 2.6.0
short_version: &SHORT_VERSION "2.6"
contributor: huggingface
arch_type: x86

# Repository settings; merged into the image entry below through `<<`.
repository_info:
  training_repository: &TRAINING_REPOSITORY
    image_type: &TRAINING_IMAGE_TYPE training
    root: !join ["huggingface/", *BASE_FRAMEWORK, "/", *TRAINING_IMAGE_TYPE]
    repository_name: &REPOSITORY_NAME !join ["pr", "-", "huggingface", "-", *BASE_FRAMEWORK, "-", *TRAINING_IMAGE_TYPE]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]

# Extra files made available to the image build, as source/target pairs.
context:
  training_context: &TRAINING_CONTEXT
    cuda-compatibility-lib:
      source: ../../build_artifacts/training/cuda-compatibility-lib.sh
      target: cuda-compatibility-lib.sh

images:
  BuildHuggingFacePytorchGpuPy312Cu126TrainingDockerImage:
    <<: *TRAINING_REPOSITORY
    build: &HUGGINGFACE_PYTORCH_GPU_TRAINING_PY3 false
    image_size_baseline: &IMAGE_SIZE_BASELINE 21500
    device_type: &DEVICE_TYPE gpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    cuda_version: &CUDA_VERSION cu126
    os_version: &OS_VERSION ubuntu22.04
    transformers_version: &TRANSFORMERS_VERSION 4.51.3
    datasets_version: &DATASETS_VERSION 3.5.0
    # Image tag assembled from the version, transformers version, device type,
    # Python tag, CUDA version and OS version defined above.
    tag: !join [
      *VERSION, "-", "transformers", *TRANSFORMERS_VERSION, "-",
      *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-",
      *CUDA_VERSION, "-", *OS_VERSION ]
    # Dockerfile path assembled from short version, Python version, CUDA
    # version and device type.
    docker_file: !join [
      "docker/", *SHORT_VERSION, "/", *DOCKER_PYTHON_VERSION, "/",
      *CUDA_VERSION, "/Dockerfile.", *DEVICE_TYPE ]
    context:
      <<: *TRAINING_CONTEXT

huggingface/pytorch/training/buildspec.yml

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@ account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
22
region: &REGION <set-$REGION-in-environment>
33
base_framework: &BASE_FRAMEWORK pytorch
44
framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
5-
version: &VERSION 2.6.0
6-
short_version: &SHORT_VERSION "2.6"
5+
6+
version: &VERSION 2.7.1
7+
short_version: &SHORT_VERSION "2.7"
78
contributor: huggingface
89
arch_type: x86
910

@@ -21,17 +22,17 @@ context:
2122
target: cuda-compatibility-lib.sh
2223

2324
images:
24-
BuildHuggingFacePytorchGpuPy312Cu126TrainingDockerImage:
25+
BuildHuggingFacePytorchGpuPy312Cu128TrainingDockerImage:
2526
<<: *TRAINING_REPOSITORY
2627
build: &HUGGINGFACE_PYTORCH_GPU_TRAINING_PY3 false
27-
image_size_baseline: &IMAGE_SIZE_BASELINE 21500
28+
image_size_baseline: &IMAGE_SIZE_BASELINE 25000
2829
device_type: &DEVICE_TYPE gpu
2930
python_version: &DOCKER_PYTHON_VERSION py3
3031
tag_python_version: &TAG_PYTHON_VERSION py312
31-
cuda_version: &CUDA_VERSION cu126
32+
cuda_version: &CUDA_VERSION cu128
3233
os_version: &OS_VERSION ubuntu22.04
33-
transformers_version: &TRANSFORMERS_VERSION 4.51.3
34-
datasets_version: &DATASETS_VERSION 3.5.0
34+
transformers_version: &TRANSFORMERS_VERSION 4.55.0
35+
datasets_version: &DATASETS_VERSION 4.0.0
3536
tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-',
3637
*CUDA_VERSION, '-', *OS_VERSION ]
3738
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /,
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
# Hugging Face PyTorch GPU training image, layered on the AWS Deep Learning
# Containers PyTorch 2.7.1 / CUDA 12.8 / Python 3.12 SageMaker training image.
#
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
# refer to the above page to pull latest Pytorch image

# docker image region us-west-2
FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.7.1-gpu-py312-cu128-ubuntu22.04-sagemaker

# Purge the emacs package shipped with the base image. emacs is installed again
# by the apt-get install step further down.
# NOTE(review): presumably this is done to pick up a fresh emacs build — confirm.
# The continuation must not contain an empty line: Docker deprecates empty
# continuation lines and plans to turn them into errors.
RUN apt-get remove -y --purge emacs \
 && apt-get autoremove -y

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

# version args
ARG TRANSFORMERS_VERSION=4.55.0
ARG DATASETS_VERSION=4.0.0
ARG HUGGINGFACE_HUB_VERSION=0.34.0
ARG DIFFUSERS_VERSION=0.34.0
ARG EVALUATE_VERSION=0.4.3
ARG ACCELERATE_VERSION=1.4.0
ARG TRL_VERSION=0.21.0
ARG PEFT_VERSION=0.17.0
ARG FLASH_ATTN_VERSION=2.8.2
ARG NINJA_VERSION=1.11.1.4
ARG KERNELS_VERSION=0.9.0
# Interpreter name handed to the OSS compliance script in the final step.
ARG PYTHON=python3

# TODO: Remove when the base image is updated
# Drops transformer-engine and flash-attn, and reinstalls pyarrow and
# cryptography at their latest versions, before the pinned installs below.
RUN pip install --upgrade pip \
 && pip uninstall -y transformer-engine flash-attn pyarrow cryptography \
 && pip install --no-cache-dir -U pyarrow cryptography pyopenssl Pillow \
 && pip --no-cache-dir install --upgrade wheel setuptools \
 && pip install --no-cache-dir -U "werkzeug==3.0.6"

# Pre-install kenlm without build isolation so it uses system cmake
RUN pip install --no-cache-dir --no-build-isolation kenlm

# Install Hugging Face libraries and dependencies
# Requirement specifiers that carry extras are quoted so the shell never treats
# the square brackets as a glob pattern.
RUN pip install --no-cache-dir \
    "huggingface_hub[hf_transfer]==${HUGGINGFACE_HUB_VERSION}" \
    "transformers[sklearn,sentencepiece,audio,vision,pipelines]==${TRANSFORMERS_VERSION}" \
    datasets==${DATASETS_VERSION} \
    diffusers==${DIFFUSERS_VERSION} \
    Jinja2 \
    tensorboard \
    bitsandbytes \
    kernels==${KERNELS_VERSION} \
    evaluate==${EVALUATE_VERSION} \
    accelerate==${ACCELERATE_VERSION} \
    ninja==${NINJA_VERSION} \
    trl==${TRL_VERSION} \
    peft==${PEFT_VERSION} \
    flash-attn==${FLASH_ATTN_VERSION}

# Override conflicting versions to satisfy datasets requirements
# pathos is installed without its dependencies, then the dill / multiprocess
# entries in its installed METADATA are rewritten to bare package names.
# NOTE(review): presumably this keeps dependency checks from flagging the pinned
# dill / multiprocess versions — confirm.
RUN pip install --no-cache-dir dill==0.3.8 multiprocess==0.70.16 \
 && pip install --no-cache-dir pathos==0.3.3 --no-deps \
 && PATHOS_META=$(find /usr/local/lib -type f -path "*pathos-0.3.3.dist-info/METADATA") \
 && sed -i 's/dill.*/dill/' $PATHOS_META \
 && sed -i 's/multiprocess.*/multiprocess/' $PATHOS_META


# hf_transfer will be a built-in feature, remove the env variable then
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
ENV HF_HUB_USER_AGENT_ORIGIN="aws:sagemaker:gpu-cuda:training"

# System packages; the apt lists are removed in the same layer.
RUN apt-get update \
 && apt-get install -y --allow-change-held-packages --no-install-recommends \
    libgl1-mesa-glx \
    build-essential \
    ca-certificates \
    zlib1g-dev \
    openssl \
    python3-dev \
    pkg-config \
    check \
    curl \
    emacs \
    git \
    jq \
    unzip \
    vim \
    wget \
 && rm -rf /var/lib/apt/lists/*

COPY cuda-compatibility-lib.sh /usr/local/bin/cuda-compatibility-lib.sh
RUN chmod +x /usr/local/bin/cuda-compatibility-lib.sh

# Upgrade all installed OS packages, then clean the apt caches.
RUN apt-get update \
 && apt-get upgrade -y \
 && apt-get autoremove -y \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# Download the OSS compliance bundle, install its test script, run the
# generator, then delete the bundle. --fail makes curl exit non-zero on an HTTP
# error instead of saving the error response as the zip file.
RUN HOME_DIR=/root \
 && curl --fail -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip -o ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance*

release_images_inference.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,4 +261,4 @@ release_images:
261261
cuda_version: "cu128"
262262
example: False
263263
disable_sm_tag: True
264-
force_release: False
264+
force_release: True

0 commit comments

Comments
 (0)