Skip to content

Commit a62e100

Browse files
authored
Disable PT 2.7 Autopatch and Onboard BuildX (#5184)
* Test PR disable build * temp comment pytorch * add back sm as target * install pytorch * enable ap * fix build comment * disable autopatch * add additional image size * revert build * revert dockerfile * build using buildx * fix test * add ec2 heavy tests * specify buildspec * revert toml
1 parent 1ff1164 commit a62e100

File tree

6 files changed

+40
-30
lines changed

6 files changed

+40
-30
lines changed

pytorch/training/buildspec-2-7-sm.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
55
version: &VERSION 2.7.1
66
short_version: &SHORT_VERSION "2.7"
77
arch_type: x86
8-
autopatch_build: "True"
8+
autopatch_build: "False"
99

1010
repository_info:
1111
training_repository: &TRAINING_REPOSITORY

pytorch/training/docker/2.7/py3/Dockerfile.cpu

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \
322322
# Install SM packages
323323
RUN pip install --no-cache-dir -U \
324324
smclarify \
325-
"sagemaker>=2,<3" \
325+
"sagemaker>=2.9.0,<3" \
326326
"sagemaker-experiments<1" \
327327
sagemaker-pytorch-training \
328328
sagemaker-training
@@ -338,7 +338,7 @@ RUN pip install --no-cache-dir -U \
338338
seaborn \
339339
shap \
340340
# pinned for sagemaker==2.233.0
341-
cloudpickle
341+
cloudpickle
342342

343343
# Copy workaround script for incorrect hostname
344344
COPY changehostname.c /
@@ -361,4 +361,3 @@ RUN rm -rf /root/.cache | true
361361

362362
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
363363
CMD ["/bin/bash"]
364-

pytorch/training/docker/2.7/py3/Dockerfile.sagemaker.cpu.core_packages.json

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@
1818
"version_specifier": "==0.2.4",
1919
"skip": "True"
2020
},
21-
"s3torchconnector": {
22-
"version_specifier": "==1.4.2",
23-
"skip": "True"
21+
"s3torchconnector": {
22+
"version_specifier": "==1.4.3",
23+
"skip": "True"
2424
},
2525
"accelerate": {
26-
"version_specifier": "==1.9.0",
27-
"skip": "True"
26+
"version_specifier": "==1.10.0",
27+
"skip": "True"
2828
},
2929
"thinc": {
3030
"version_specifier": "==8.3.4"
@@ -41,7 +41,7 @@
4141
"sagemaker": {
4242
"version_specifier": ">=2,<3"
4343
},
44-
"sagemaker-experiments": {
44+
"sagemaker-experiments": {
4545
"version_specifier": "<1"
4646
},
4747
"sagemaker-training": {
@@ -59,7 +59,7 @@
5959
"urllib3": {
6060
"version_specifier": ">=2.5.0"
6161
},
62-
"awscli": {
62+
"awscli": {
6363
"version_specifier": "<2"
6464
},
6565
"opencv-python": {

pytorch/training/docker/2.7/py3/cu128/Dockerfile.gpu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -143,13 +143,13 @@ RUN mkdir -p /tmp/nvjpeg \
143143
&& rm -rf /tmp/nvjpeg \
144144
# patch cuobjdump and nvdisasm
145145
&& rm -rf /usr/local/cuda/bin/cuobjdump* \
146-
&& rm -rf /usr/local/cuda/bin/nvdisasm*
146+
&& rm -rf /usr/local/cuda/bin/nvdisasm*
147147

148148
# For EFA, below flags are needed to install EFA on docker image
149149
# -n, --no-verify Skip EFA device verification and test
150150
# -l, --skip-limit-conf Skip EFA limit configuration
151151
# -k, --skip-kmod Skip EFA kmod installation
152-
# start from 0.38.0 EFA now bundles the AWS OFI NCCL plugin,
152+
# start from 0.38.0 EFA now bundles the AWS OFI NCCL plugin,
153153
# which can now be found in /opt/amazon/ofi-nccl/lib/x86_64-linux-gnu rather than the original /opt/aws-ofi-nccl/.
154154
RUN mkdir /tmp/efa \
155155
&& cd /tmp/efa \
@@ -424,7 +424,7 @@ RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.g
424424
# Install SM packages
425425
RUN pip install --no-cache-dir -U \
426426
smclarify \
427-
"sagemaker>=2,<3" \
427+
"sagemaker>=2.9.0,<3" \
428428
"sagemaker-experiments<1" \
429429
sagemaker-pytorch-training \
430430
sagemaker-training
@@ -440,7 +440,7 @@ RUN pip install --no-cache-dir -U \
440440
scikit-learn \
441441
seaborn \
442442
# pinned for sagemaker==2.233.0
443-
cloudpickle
443+
cloudpickle
444444

445445
RUN HOME_DIR=/root \
446446
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \

pytorch/training/docker/2.7/py3/cu128/Dockerfile.sagemaker.gpu.core_packages.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"torch": {
2+
"torch": {
33
"version_specifier": "==2.7.1"
44
},
55
"torchvision": {
@@ -22,12 +22,12 @@
2222
"version_specifier": "==2.3",
2323
"skip": "True"
2424
},
25-
"s3torchconnector": {
26-
"version_specifier": "==1.4.2",
25+
"s3torchconnector": {
26+
"version_specifier": "==1.4.3",
2727
"skip": "True"
2828
},
2929
"accelerate": {
30-
"version_specifier": "==1.9.0",
30+
"version_specifier": "==1.10.0",
3131
"skip": "True"
3232
},
3333
"thinc": {
@@ -45,13 +45,13 @@
4545
"tornado": {
4646
"version_specifier": ">=6.5.1"
4747
},
48-
"sagemaker-training": {
48+
"sagemaker-training": {
4949
"version_specifier": ">=4.8.3"
5050
},
51-
"sagemaker": {
51+
"sagemaker": {
5252
"version_specifier": ">=2,<3"
5353
},
54-
"sagemaker-experiments": {
54+
"sagemaker-experiments": {
5555
"version_specifier": "<1"
5656
},
5757
"idna": {
@@ -69,7 +69,7 @@
6969
"urllib3": {
7070
"version_specifier": ">=2.5.0"
7171
},
72-
"awscli": {
72+
"awscli": {
7373
"version_specifier": "<2"
7474
},
7575
"sagemaker-pytorch-training": {

src/image.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,13 @@
1313
language governing permissions and limitations under the License.
1414
"""
1515

16+
import logging
17+
import subprocess
1618
from datetime import datetime
1719

18-
from docker import APIClient
19-
from docker import DockerClient
20+
from docker import APIClient, DockerClient
2021

2122
import constants
22-
import logging
23-
import subprocess
2423

2524
LOGGER = logging.getLogger(__name__)
2625
LOGGER.setLevel(logging.INFO)
@@ -205,11 +204,15 @@ def docker_build(self, context_path, custom_context=False):
205204
:param custom_context: bool, Whether to use custom context from stdin (default: False)
206205
:return: int, Build status
207206
"""
208-
if self._is_vllm_image():
209-
LOGGER.info(f"Using Buildx for vLLM image: {self.repository}:{self.tag}")
207+
if self._is_vllm_image() or self._is_pytorch_training_image():
208+
LOGGER.info(
209+
f"Using Buildx for vLLM and PyTorch Training image: {self.repository}:{self.tag}"
210+
)
210211
return self._buildx_build(context_path, custom_context)
211212
else:
212-
LOGGER.info(f"Using legacy Docker API for non-vLLM image: {self.repository}:{self.tag}")
213+
LOGGER.info(
214+
f"Using legacy Docker API for non-vLLM and non-PyTorch Training image: {self.repository}:{self.tag}"
215+
)
213216
return self._legacy_docker_build(context_path, custom_context)
214217

215218
def _is_vllm_image(self):
@@ -224,6 +227,14 @@ def _is_vllm_image(self):
224227
or "vllm" in str(self.info.get("name", "")).lower()
225228
)
226229

230+
def _is_pytorch_training_image(self):
231+
"""
232+
Determine if current image is a PyTorch Training image
233+
234+
:return: bool, True if this is a PyTorch Training image
235+
"""
236+
return self.info.get("framework") == "pytorch" and self.info.get("image_type") == "training"
237+
227238
def _buildx_build(self, context_path, custom_context=False):
228239
"""
229240
Uses Docker Buildx CLI for building with real-time streaming and advanced caching.

0 commit comments

Comments
 (0)