test pytorch with uv

Jyothirmaikottu · Jyothirmaikottu · commit a5ceb18be5c4 · 2025-09-29T15:42:26.000-07:00
diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
@@ -37,7 +37,7 @@ deep_canary_mode = false
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
 # available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
-build_frameworks = ["tensorflow"]
+build_frameworks = ["pytorch"]
 
 
 # By default we build both training and inference containers. Set true/false values to determine which to build.
@@ -122,8 +122,8 @@ use_scheduler = false
 dlc-pr-base = ""
 
 # Standard Framework Training
-dlc-pr-pytorch-training = ""
-dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-sm.yml"
+dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml"
+dlc-pr-tensorflow-2-training = ""
 dlc-pr-autogluon-training = ""
 
 # ARM64 Training
diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -53,6 +53,8 @@ ENV PYTHONUNBUFFERED=1
 ENV PYTHONIOENCODING=UTF-8
 ENV LANG=C.UTF-8
 ENV LC_ALL=C.UTF-8
+ENV UV_VERSION=0.8.22 
+ENV UV_SYSTEM_PYTHON=1
 
 ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
 
@@ -80,8 +82,15 @@ ENV LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/o
 # Python Path
 ENV PATH="/usr/local/bin:${PATH}"
 
+
+ENV UV_CACHE_DIR=/root/.cache/uv \
+    UV_SYSTEM_PYTHON=1
+# Install the uv release pinned by UV_VERSION (versioned installer URL) so image builds are reproducible.
+RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh \
+    && ln -s /root/.local/bin/uv /usr/local/bin/uv
+
 # Install common conda packages
-RUN pip install --no-cache-dir \
+RUN uv pip install --no-cache-dir \
     cython \
     cryptography \
     pyOpenSSL \
@@ -114,11 +123,11 @@ RUN pip install --no-cache-dir \
     tornado>=6.5.1
 
 # Install PyTorch
-RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \
+RUN uv pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \
     torchvision==${TORCHVISION_VERSION} \
     torchaudio==${TORCHAUDIO_VERSION} \
     --index-url https://download.pytorch.org/whl/cu129 \
-    && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \
+    && uv pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \
     torchdata==${TORCHDATA_VERSION} \
     triton \
     s3torchconnector \
@@ -131,18 +140,18 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \
     thinc==8.3.4 \
     blis \
     numpy \
- && pip uninstall -y dataclasses
+ && uv pip uninstall dataclasses
 
 # Install flash attn and NVIDIA transformer engine.
 # Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install
 ENV NVTE_FRAMEWORK=pytorch
 
 RUN curl -LO https://github.com/Dao-AILab/flash-attention/releases/download/v${FLASH_ATTN_VERSION}/flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl \
-    && pip install flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl --no-build-isolation \
+    && uv pip install --no-cache-dir flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl --no-build-isolation \
     && rm flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
 
 # Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
-RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation
+RUN uv pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation
 
 RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt
 
@@ -245,15 +254,15 @@ ARG PYTHON
 WORKDIR /
 
 # Install SM packages
-RUN pip install --no-cache-dir -U \
+RUN uv pip install --no-cache-dir -U \
     smclarify \
     "sagemaker>=2" \
     sagemaker-experiments \
     sagemaker-pytorch-training \
     sagemaker-training
 
 # Install extra packages
-RUN pip install --no-cache-dir -U \
+RUN uv pip install --no-cache-dir -U \
     bokeh \
     imageio \
     numba \
diff --git a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu
@@ -28,8 +28,7 @@ ENV PYTHONUNBUFFERED=1
 ENV PYTHONIOENCODING=UTF-8
 ENV LANG=C.UTF-8
 ENV LC_ALL=C.UTF-8
-ENV UV_VERSION=0.8.22 
-ENV UV_SYSTEM_PYTHON=1
+
 # Set environment variables for MKL
 # For more about MKL with TensorFlow see:
 # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
@@ -200,20 +199,16 @@ RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSIO
    && make install \
    && rm -rf ../Python-$PYTHON_VERSION*
 
-ENV UV_CACHE_DIR=/root/.cache/uv \
-    UV_SYSTEM_PYTHON=1
-
-RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
-    && ln -s /root/.local/bin/uv /usr/local/bin/uv \
-    && uv pip install --system \
-        setuptools \
-        wheel
+RUN ${PIP} --no-cache-dir install --upgrade \
+   pip \
+   setuptools \
+   wheel
 
 # Some TF tools expect a "python" binary
 RUN ln -s $(which ${PYTHON}) /usr/local/bin/python \
    && ln -s $(which ${PIP}) /usr/bin/pip
 
-RUN uv pip install --system \
+RUN ${PIP} install --no-cache-dir -U \
    pybind11 \
    cmake \
    scipy \
@@ -357,20 +352,22 @@ ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main
 # and this is fine since sagemaker is more important than the models and
 # the models still work on pyyaml 6 in this context.
 # Need to install wheel before we can fix the pyyaml issue below
-RUN uv pip install --system \
-    "cython<3" "pyyaml<6" --no-build-isolation
+RUN pip install wheel \
+ && pip install "cython<3" "pyyaml<6" --no-build-isolation
 
-RUN uv pip install --system \
+# https://github.com/tensorflow/models/issues/9267
+# tf-models does not respect existing installations of TF and always installs open source TF
+RUN ${PIP} install --no-cache-dir -U \
     tf-models-official==2.18.0 \
     tensorflow-text==2.18.1 \
- && uv pip uninstall tensorflow tensorflow-gpu --no-confirm \
- && uv pip install --system \
+ && ${PIP} uninstall -y tensorflow tensorflow-gpu \
+ && ${PIP} install --no-cache-dir -U \
     ${TF_URL} \
     "tensorflow-io==0.37.*" \
     "tensorflow-datasets==4.9.7"
 
-RUN uv pip install --system \
-    numba \
+RUN $PYTHON -m pip install --no-cache-dir -U \
+    numba==0.61.0 \
     bokeh \
     imageio \
     opencv-python \
@@ -379,15 +376,25 @@ RUN uv pip install --system \
     shap \
     numpy
 
-RUN uv pip install --system \
-    "sagemaker<3" \
-    sagemaker-experiments==0.1.45 \
-    sagemaker-training==4.8.4 
+RUN $PYTHON -m pip install --no-cache-dir -U \
+    "sagemaker<3"
+
+RUN $PYTHON -m pip install --no-cache-dir -U \
+    sagemaker-experiments==0.1.45
+
+RUN $PYTHON -m pip install --no-cache-dir -U \
+    sagemaker-training==4.8.4
+
+RUN $PYTHON -m pip install --no-cache-dir -U \
+    sagemaker-tensorflow-training==20.4.1
+
+RUN $PYTHON -m pip install --no-cache-dir -U \
+    sagemaker-studio-analytics-extension==0.1.4
+
+RUN $PYTHON -m pip install --no-cache-dir -U \
+    sagemaker-studio-sparkmagic-lib==0.2.0
 
-RUN uv pip install --system \
-    sagemaker-tensorflow-training==20.4.1 \
-    sagemaker-studio-analytics-extension==0.1.4 \
-    sagemaker-studio-sparkmagic-lib==0.2.0 \
+RUN $PYTHON -m pip install --no-cache-dir -U \
     sparkmagic==0.21.0 \
     smclarify