# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
# Refer to the page above to pick the latest PyTorch training image tag.

# The base image is pulled from the ECR registry in region us-west-2.
FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.8.0-gpu-py312-cu129-ubuntu22.04-sagemaker

# Purge emacs from the base image, then remove packages that are no longer
# needed. The two commands are chained without an empty continuation line,
# which Docker deprecates and warns about.
# NOTE(review): emacs is installed again by a later apt-get install step in
# this file — confirm that the purge/reinstall sequence is intentional.
RUN apt-get remove -y --purge emacs \
 && apt-get autoremove -y

# Image metadata, declared in a single LABEL instruction.
LABEL maintainer="Amazon AI" \
      dlc_major_version="1"

# Version args.
# Each *_VERSION value pins the matching Python package in the pip install
# step further down; any of them can be overridden at build time with
# --build-arg NAME=value.
ARG TRANSFORMERS_VERSION=4.56.2
ARG DATASETS_VERSION=4.1.0
ARG HUGGINGFACE_HUB_VERSION=0.35.3
ARG DIFFUSERS_VERSION=0.35.1
ARG EVALUATE_VERSION=0.4.3
ARG ACCELERATE_VERSION=1.10.1
ARG TRL_VERSION=0.23.0
ARG PEFT_VERSION=0.17.1
ARG FLASH_ATTN_VERSION=2.8.3
ARG NINJA_VERSION=1.13.0
ARG KERNELS_VERSION=0.9.0
# Interpreter name passed to generate_oss_compliance.sh in the last RUN step.
ARG PYTHON=python3

# TODO: Remove when the base image is updated
# Steps, in order:
#   1. upgrade pip itself;
#   2. uninstall transformer-engine, flash-attn, pyarrow and cryptography
#      (flash-attn is installed again, pinned, with the Hugging Face libraries);
#   3. install the latest pyarrow, cryptography, pyopenssl and Pillow;
#   4. upgrade wheel and setuptools;
#   5. pin werkzeug to 3.0.6.
# Every install uses --no-cache-dir so no pip cache ends up in the layer.
RUN pip install --no-cache-dir --upgrade pip \
 && pip uninstall -y transformer-engine flash-attn pyarrow cryptography \
 && pip install --no-cache-dir -U pyarrow cryptography pyopenssl Pillow \
 && pip install --no-cache-dir --upgrade wheel setuptools \
 && pip install --no-cache-dir -U "werkzeug==3.0.6"

# Pre-install kenlm without build isolation so it uses system cmake.
# With --no-build-isolation pip builds the package in the current environment
# instead of a temporary isolated one.
RUN pip install --no-cache-dir --no-build-isolation kenlm

# Install Hugging Face libraries and dependencies.
# Versions come from the *_VERSION build args declared above; Jinja2,
# tensorboard and bitsandbytes are installed unpinned.
# Requirement specifiers are double-quoted so the shell never treats the
# "[extras]" brackets as a glob pattern; ${VAR} still expands inside the quotes.
RUN pip install --no-cache-dir \
    "huggingface_hub[hf_transfer,hf_xet]==${HUGGINGFACE_HUB_VERSION}" \
    "transformers[torch,sentencepiece,tokenizers,torch-speech,vision,integrations,timm,torch-vision,video,codecarbon,accelerate,mistral-common,chat-template,hub-kernels,sklearn,speech,audio,tiktoken,hf_xet,sagemaker]==${TRANSFORMERS_VERSION}" \
    "datasets==${DATASETS_VERSION}" \
    "diffusers==${DIFFUSERS_VERSION}" \
    Jinja2 \
    tensorboard \
    bitsandbytes \
    "kernels==${KERNELS_VERSION}" \
    "evaluate==${EVALUATE_VERSION}" \
    "accelerate==${ACCELERATE_VERSION}" \
    "ninja==${NINJA_VERSION}" \
    "trl==${TRL_VERSION}" \
    "peft==${PEFT_VERSION}" \
    "flash-attn==${FLASH_ATTN_VERSION}"

# Override conflicting versions to satisfy datasets requirements.
#   - dill and multiprocess are pinned explicitly;
#   - pathos 0.3.3 is installed with --no-deps so it cannot change those pins;
#   - the installed pathos METADATA file is then edited in place: on every line
#     that mentions dill or multiprocess, everything from the package name to
#     the end of the line is replaced by the bare name, dropping any version
#     constraint.
# The variable is expanded unquoted so that, should find return several paths,
# each one is passed to sed as a separate file argument.
RUN pip install --no-cache-dir dill==0.3.8 multiprocess==0.70.16 \
 && pip install --no-cache-dir --no-deps pathos==0.3.3 \
 && pathos_metadata=$(find /usr/local/lib -type f -path "*pathos-0.3.3.dist-info/METADATA") \
 && sed -i -e 's/dill.*/dill/' -e 's/multiprocess.*/multiprocess/' $pathos_metadata

# Fix CVE-77744: Upgrade urllib3 to version 2.5.0 or higher.
# NOTE(review): "CVE-77744" is not in the usual CVE-YYYY-NNNN form; presumably
# it is a vulnerability-scanner advisory ID — confirm and correct the reference.
# Remove sigopt to avoid dependency conflict (it's not essential for core functionality).
# "|| true" is grouped with the sigopt uninstall only: that step stays
# best-effort, while a failed urllib3 upgrade still fails the build. Ungrouped,
# "A && B || true" would also swallow a failure of the security upgrade.
RUN pip install --no-cache-dir -U "urllib3>=2.5.0" \
 && { pip uninstall -y sigopt || true; }

# Fix CVE-2023-48022: remove Ray from the image to eliminate the vulnerability.
# -y skips pip's interactive confirmation prompt.
RUN pip uninstall -y ray

# TODO: remove HF_HUB_ENABLE_HF_TRANSFER once hf_transfer becomes a built-in feature.
# Both variables persist into the runtime environment of the container.
ENV HF_HUB_ENABLE_HF_TRANSFER="1" \
    HF_HUB_USER_AGENT_ORIGIN="aws:sagemaker:gpu-cuda:training"

# Install OS-level build tools, libraries and command-line utilities.
# Packages are listed alphabetically, one per line, to keep diffs small.
# The apt package lists are deleted in the same layer.
# NOTE(review): emacs is purged by the first RUN step of this file and
# installed again here — confirm that this is intentional.
RUN apt-get update \
 && apt-get install -y --allow-change-held-packages --no-install-recommends \
        build-essential \
        ca-certificates \
        check \
        curl \
        emacs \
        git \
        jq \
        libcrypt1 \
        libgl1-mesa-glx \
        openssl \
        pkg-config \
        python3-dev \
        unzip \
        vim \
        wget \
        zlib1g-dev \
 && rm -rf /var/lib/apt/lists/*

# Copy cuda-compatibility-lib.sh from the build context into /usr/local/bin
# and mark it executable.
COPY cuda-compatibility-lib.sh /usr/local/bin/cuda-compatibility-lib.sh
RUN chmod +x /usr/local/bin/cuda-compatibility-lib.sh

# Upgrade all installed OS packages, then remove packages that are no longer
# needed and clear the apt caches and package lists in the same layer.
RUN apt-get update \
 && apt-get upgrade -y \
 && apt-get autoremove -y \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# OSS compliance step:
#   1. download oss_compliance.zip into /root and unpack it there;
#   2. install the bundled testOSSCompliance script into /usr/local/bin;
#   3. run generate_oss_compliance.sh with the home directory and the
#      interpreter named by the PYTHON build arg;
#   4. delete the archive and the unpacked directory in the same layer.
# curl runs with -f so an HTTP error aborts the build at the download step
# instead of saving the error body as the zip file.
RUN HOME_DIR=/root \
 && curl -f -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip -o ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance*
0 commit comments