Disable PT 2.7 Autopatch and Onboard BuildX (#5184)

sirutBuasai · web-flow · commit a62e100c4cb0 · 2025-08-21T10:53:27.000-07:00
* Test PR disable build

* temp comment pytorch

* add back sm as target

* install pytorch

* enable ap

* fix build comment

* disable autopatch

* add additional image size

* revert build

* revert dockerfile

* build using buildx

* fix test

* add ec2 heavy tests

* specify buildspec

* revert toml
diff --git a/pytorch/training/buildspec-2-7-sm.yml b/pytorch/training/buildspec-2-7-sm.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.7.1
 short_version: &SHORT_VERSION "2.7"
 arch_type: x86
-autopatch_build: "True"
+autopatch_build: "False"
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
diff --git a/pytorch/training/docker/2.7/py3/Dockerfile.cpu b/pytorch/training/docker/2.7/py3/Dockerfile.cpu
@@ -322,7 +322,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \
 # Install SM packages
 RUN pip install --no-cache-dir -U \
     smclarify \
-    "sagemaker>=2,<3" \
+    "sagemaker>=2.9.0,<3" \
     "sagemaker-experiments<1" \
     sagemaker-pytorch-training \
     sagemaker-training
@@ -338,7 +338,7 @@ RUN pip install --no-cache-dir -U \
     seaborn \
     shap \
     # pinned for sagemaker==2.233.0
-    cloudpickle 
+    cloudpickle
 
 # Copy workaround script for incorrect hostname
 COPY changehostname.c /
@@ -361,4 +361,3 @@ RUN rm -rf /root/.cache | true
 
 ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
 CMD ["/bin/bash"]
-
diff --git a/pytorch/training/docker/2.7/py3/Dockerfile.sagemaker.cpu.core_packages.json b/pytorch/training/docker/2.7/py3/Dockerfile.sagemaker.cpu.core_packages.json
@@ -18,13 +18,13 @@
     "version_specifier": "==0.2.4",
     "skip": "True"
   },
-    "s3torchconnector": {
-    "version_specifier": "==1.4.2",
-     "skip": "True"
+  "s3torchconnector": {
+    "version_specifier": "==1.4.3",
+    "skip": "True"
   },
   "accelerate": {
-    "version_specifier": "==1.9.0",
-     "skip": "True"
+    "version_specifier": "==1.10.0",
+    "skip": "True"
   },
   "thinc": {
     "version_specifier": "==8.3.4"
@@ -41,7 +41,7 @@
   "sagemaker": {
     "version_specifier": ">=2,<3"
   },
-   "sagemaker-experiments": {
+  "sagemaker-experiments": {
     "version_specifier": "<1"
   },
   "sagemaker-training": {
@@ -59,7 +59,7 @@
   "urllib3": {
     "version_specifier": ">=2.5.0"
   },
-   "awscli": {
+  "awscli": {
     "version_specifier": "<2"
   },
   "opencv-python": {
diff --git a/pytorch/training/docker/2.7/py3/cu128/Dockerfile.gpu b/pytorch/training/docker/2.7/py3/cu128/Dockerfile.gpu
@@ -143,13 +143,13 @@ RUN mkdir -p /tmp/nvjpeg \
 && rm -rf /tmp/nvjpeg \
 # patch cuobjdump and nvdisasm
 && rm -rf /usr/local/cuda/bin/cuobjdump* \
-&& rm -rf /usr/local/cuda/bin/nvdisasm* 
+&& rm -rf /usr/local/cuda/bin/nvdisasm*
 
 # For EFA, below flags are needed to install EFA on docker image
 #  -n, --no-verify       Skip EFA device verification and test
 #  -l, --skip-limit-conf Skip EFA limit configuration
 #  -k, --skip-kmod       Skip EFA kmod installation
-# start from 0.38.0 EFA now bundles the AWS OFI NCCL plugin, 
+# Starting from 0.38.0, EFA bundles the AWS OFI NCCL plugin,
 # which can now be found in /opt/amazon/ofi-nccl/lib/x86_64-linux-gnu rather than the original /opt/aws-ofi-nccl/.
 RUN mkdir /tmp/efa \
  && cd /tmp/efa \
@@ -424,7 +424,7 @@ RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.g
 # Install SM packages
 RUN pip install --no-cache-dir -U \
     smclarify \
-    "sagemaker>=2,<3" \
+    "sagemaker>=2.9.0,<3" \
     "sagemaker-experiments<1" \
     sagemaker-pytorch-training \
     sagemaker-training
@@ -440,7 +440,7 @@ RUN pip install --no-cache-dir -U \
     scikit-learn \
     seaborn \
     # pinned for sagemaker==2.233.0
-    cloudpickle 
+    cloudpickle
 
 RUN HOME_DIR=/root \
  && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
diff --git a/pytorch/training/docker/2.7/py3/cu128/Dockerfile.sagemaker.gpu.core_packages.json b/pytorch/training/docker/2.7/py3/cu128/Dockerfile.sagemaker.gpu.core_packages.json
@@ -1,5 +1,5 @@
 {
- "torch": {
+  "torch": {
     "version_specifier": "==2.7.1"
   },
   "torchvision": {
@@ -22,12 +22,12 @@
     "version_specifier": "==2.3",
     "skip": "True"
   },
-    "s3torchconnector": {
-    "version_specifier": "==1.4.2",
+  "s3torchconnector": {
+    "version_specifier": "==1.4.3",
     "skip": "True"
   },
   "accelerate": {
-    "version_specifier": "==1.9.0",
+    "version_specifier": "==1.10.0",
     "skip": "True"
   },
   "thinc": {
@@ -45,13 +45,13 @@
   "tornado": {
     "version_specifier": ">=6.5.1"
   },
-   "sagemaker-training": {
+  "sagemaker-training": {
     "version_specifier": ">=4.8.3"
   },
-   "sagemaker": {
+  "sagemaker": {
     "version_specifier": ">=2,<3"
   },
-   "sagemaker-experiments": {
+  "sagemaker-experiments": {
     "version_specifier": "<1"
   },
   "idna": {
@@ -69,7 +69,7 @@
   "urllib3": {
     "version_specifier": ">=2.5.0"
   },
-   "awscli": {
+  "awscli": {
     "version_specifier": "<2"
   },
   "sagemaker-pytorch-training": {
diff --git a/src/image.py b/src/image.py
@@ -13,14 +13,13 @@
 language governing permissions and limitations under the License.
 """
 
+import logging
+import subprocess
 from datetime import datetime
 
-from docker import APIClient
-from docker import DockerClient
+from docker import APIClient, DockerClient
 
 import constants
-import logging
-import subprocess
 
 LOGGER = logging.getLogger(__name__)
 LOGGER.setLevel(logging.INFO)
@@ -205,11 +204,15 @@ def docker_build(self, context_path, custom_context=False):
         :param custom_context: bool, Whether to use custom context from stdin (default: False)
         :return: int, Build status
         """
-        if self._is_vllm_image():
-            LOGGER.info(f"Using Buildx for vLLM image: {self.repository}:{self.tag}")
+        if self._is_vllm_image() or self._is_pytorch_training_image():
+            LOGGER.info(
+                f"Using Buildx for vLLM and PyTorch Training image: {self.repository}:{self.tag}"
+            )
             return self._buildx_build(context_path, custom_context)
         else:
-            LOGGER.info(f"Using legacy Docker API for non-vLLM image: {self.repository}:{self.tag}")
+            LOGGER.info(
+                f"Using legacy Docker API for non-vLLM and non-PyTorch Training image: {self.repository}:{self.tag}"
+            )
             return self._legacy_docker_build(context_path, custom_context)
 
     def _is_vllm_image(self):
@@ -224,6 +227,14 @@ def _is_vllm_image(self):
             or "vllm" in str(self.info.get("name", "")).lower()
         )
 
+    def _is_pytorch_training_image(self):
+        """
+        Determine if the current image is a PyTorch Training image, as recorded in self.info
+
+        :return: bool, True if framework is "pytorch" and image_type is "training"
+        """
+        return self.info.get("framework") == "pytorch" and self.info.get("image_type") == "training"
+
     def _buildx_build(self, context_path, custom_context=False):
         """
         Uses Docker Buildx CLI for building with real-time streaming and advanced caching.