From 87568844a2c16da45a31a6eff08886c27934cf9d Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Fri, 28 Nov 2025 14:09:48 -0800 Subject: [PATCH 01/19] run tests --- dlc_developer_config.toml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 2ddfe8ccb932..553bb3045b5c 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,12 +37,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -98,11 +98,11 @@ ipv6_vpc_name = "" # run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = false +sagemaker_efa_tests = true # run release_candidate_integration tests -sagemaker_rc_tests = false +sagemaker_rc_tests = true # run sagemaker benchmark tests -sagemaker_benchmark_tests = false +sagemaker_benchmark_tests = true # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" From 937054a74eafe663cd2462d654ec0833de967011 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Fri, 28 Nov 2025 14:13:18 -0800 Subject: [PATCH 02/19] rebuild and run tests --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 553bb3045b5c..7537ae16d525 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" From 4769818f843f2146e394c9af077d1eb982f70e6f Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Fri, 28 Nov 2025 17:19:33 -0800 Subject: [PATCH 03/19] update sagemaker pin and rebuild image --- pytorch/training/docker/2.8/py3/Dockerfile.cpu | 2 +- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 9e305de1b54a..b70a631950ba 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -282,7 +282,7 @@ WORKDIR / # Install SM packages RUN pip install --no-cache-dir -U \ smclarify \ - "sagemaker>=2" \ + "sagemaker>=2.200.0" \ sagemaker-experiments \ sagemaker-pytorch-training \ sagemaker-training diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 0b90890db2fb..3b2842695fdb 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -250,7 +250,7 @@ RUN pip install --no-cache-dir -U \ "awscli<1.42.50" \ "boto3<1.40.50" \ smclarify \ - "sagemaker>=2" \ + "sagemaker>=2.200.0" \ sagemaker-experiments \ sagemaker-pytorch-training \ sagemaker-training From 80adedf75069b94c93be5fa504da86f16eba981e Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Sun, 30 Nov 2025 16:21:07 -0800 Subject: [PATCH 04/19] rebuild image with sm sdk --- pytorch/training/buildspec.yml | 2 +- pytorch/training/docker/2.8/py3/Dockerfile.cpu | 3 +++ pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml index e7a0d5614f66..e9f328177b4b 100644 --- a/pytorch/training/buildspec.yml +++ b/pytorch/training/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-9-ec2.yml +buildspec_pointer: buildspec-2-8-sm.yml diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index b70a631950ba..66126eec6b7d 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -279,6 +279,9 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main WORKDIR / +# Force rebuild of SageMaker packages layer +ARG CACHEBUST=1 + # Install SM packages RUN pip install --no-cache-dir -U \ smclarify \ diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 3b2842695fdb..c5562bfc7ba4 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -244,6 +244,9 @@ ARG PYTHON WORKDIR / +# Force rebuild of SageMaker packages layer +ARG CACHEBUST=1 + # Install SM packages RUN pip install --no-cache-dir -U \ # address package regression caused by smclarify depedency s3fs" From b107c51fff128f339a3330d802c73e64f2440c08 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Sun, 30 Nov 2025 20:59:07 -0800 Subject: [PATCH 05/19] pin versions and rebuild image --- .../training/docker/2.8/py3/Dockerfile.cpu | 20 +++++++++++-------- .../docker/2.8/py3/cu129/Dockerfile.gpu | 16 +++++++++------ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 66126eec6b7d..0a3a89b66731 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -172,7 +172,7 @@ RUN pip install --no-cache-dir \ typing \ charset-normalizer \ packaging \ - boto3 \ + "boto3<1.40.50" \ PyYAML \ numpy \ scipy \ @@ -188,7 +188,7 @@ RUN pip install --no-cache-dir \ "requests>=2.32.0" \ "setuptools>=70.0.0" \ "urllib3>=2.5.0" \ - "awscli" \ + "awscli<1.42.50" \ opencv-python==4.11.0.86 \ mpi4py \ jinja2>=3.1.6 \ @@ -279,16 +279,15 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main WORKDIR / -# Force rebuild of SageMaker packages layer -ARG CACHEBUST=1 - # Install SM packages RUN pip install --no-cache-dir -U \ smclarify \ "sagemaker>=2.200.0" \ sagemaker-experiments \ - sagemaker-pytorch-training \ - sagemaker-training + "sagemaker-pytorch-training>=3.0.0" \ + "sagemaker-training>=5.1.1" \ + "protobuf>=6.31.1" \ + "rich>=14.2.0" # Install extra packages RUN pip install --no-cache-dir -U \ @@ -300,7 +299,12 @@ RUN pip install --no-cache-dir -U \ scikit-learn \ seaborn \ shap \ - cloudpickle + cloudpickle \ + google-pasta \ + multiprocess \ + pathos \ + pox \ + ppft # Copy workaround script for incorrect hostname COPY changehostname.c / diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index c5562bfc7ba4..ed32a649523a 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -244,9 +244,6 @@ ARG PYTHON WORKDIR / -# Force rebuild of SageMaker packages layer -ARG CACHEBUST=1 - # Install SM packages RUN pip install --no-cache-dir -U \ # address package regression caused by smclarify depedency s3fs" @@ -255,8 +252,10 @@ RUN pip install --no-cache-dir -U \ smclarify \ "sagemaker>=2.200.0" \ sagemaker-experiments \ - sagemaker-pytorch-training \ - sagemaker-training + "sagemaker-pytorch-training>=3.0.0" \ + "sagemaker-training>=5.1.1" \ + "protobuf>=6.31.1" \ + "rich>=14.2.0" # Install extra packages RUN pip install --no-cache-dir -U \ @@ -268,7 +267,12 @@ RUN pip install --no-cache-dir -U \ shap \ scikit-learn \ seaborn \ - cloudpickle + cloudpickle \ + google-pasta \ + multiprocess \ + pathos \ + pox \ + ppft COPY setup_oss_compliance.sh setup_oss_compliance.sh RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh From 826a9436a4bf4183dd81b276d270acf14bade6cc Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 1 Dec 2025 00:10:40 -0800 Subject: [PATCH 06/19] Fix version pins and rebuild image --- pytorch/training/docker/2.8/py3/Dockerfile.cpu | 8 +++++--- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 0a3a89b66731..7db1e830a86e 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -172,7 +172,7 @@ RUN pip install --no-cache-dir \ typing \ charset-normalizer \ packaging \ - "boto3<1.40.50" \ + "boto3>=1.41.0" \ PyYAML \ numpy \ scipy \ @@ -188,7 +188,7 @@ RUN pip install --no-cache-dir \ "requests>=2.32.0" \ "setuptools>=70.0.0" \ "urllib3>=2.5.0" \ - "awscli<1.42.50" \ + "awscli>=1.43.0" \ opencv-python==4.11.0.86 \ mpi4py \ jinja2>=3.1.6 \ @@ -287,7 +287,9 @@ RUN pip install --no-cache-dir -U \ "sagemaker-pytorch-training>=3.0.0" \ "sagemaker-training>=5.1.1" \ "protobuf>=6.31.1" \ - "rich>=14.2.0" + "rich>=14.2.0" \ + "sniffio>=1.3.0" \ + "wrapt>=2.0.1" # Install extra packages RUN pip install --no-cache-dir -U \ diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index ed32a649523a..6f62e20c4497 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -247,15 +247,17 @@ WORKDIR / # Install SM packages RUN pip install --no-cache-dir -U \ # address package regression caused by smclarify depedency s3fs" - "awscli<1.42.50" \ - "boto3<1.40.50" \ smclarify \ "sagemaker>=2.200.0" \ sagemaker-experiments \ "sagemaker-pytorch-training>=3.0.0" \ "sagemaker-training>=5.1.1" \ "protobuf>=6.31.1" \ - "rich>=14.2.0" + "rich>=14.2.0" \ + "awscli>=1.43.0" \ + "boto3>=1.41.0" \ + "sniffio>=1.3.0" \ + "wrapt>=2.0.1" # Install extra packages RUN pip install --no-cache-dir -U \ From c65ba08410f0124c85fbfe9394dda744d852f8af Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 1 Dec 2025 01:17:03 -0800 Subject: [PATCH 07/19] pins s3fs to prevent the downgrade and rebuild --- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 6f62e20c4497..5961c797dad7 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -257,7 +257,10 @@ RUN pip install --no-cache-dir -U \ "awscli>=1.43.0" \ "boto3>=1.41.0" \ "sniffio>=1.3.0" \ - "wrapt>=2.0.1" + "wrapt>=2.0.1" \ + "s3fs>=2025.10.0" \ + "aiobotocore>=2.15.0" \ + "aiohttp>=3.10.0" # Install extra packages RUN pip install --no-cache-dir -U \ From 57a767b797eeece3ee61a95a322d9750b04b8d58 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 1 Dec 2025 01:54:15 -0800 Subject: [PATCH 08/19] resolve incompatibilty issue --- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 5961c797dad7..f8a4a335da77 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -254,14 +254,15 @@ RUN pip install --no-cache-dir -U \ "sagemaker-training>=5.1.1" \ "protobuf>=6.31.1" \ "rich>=14.2.0" \ - "awscli>=1.43.0" \ - "boto3>=1.41.0" \ "sniffio>=1.3.0" \ "wrapt>=2.0.1" \ "s3fs>=2025.10.0" \ "aiobotocore>=2.15.0" \ "aiohttp>=3.10.0" +# Install AWS CLI and boto3 separately to get compatible versions +RUN pip install --no-cache-dir -U awscli boto3 + # Install extra packages RUN pip install --no-cache-dir -U \ bokeh \ From 9c87c39cada42365080e9ce75305455f3cb28fc6 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 1 Dec 2025 02:41:06 -0800 Subject: [PATCH 09/19] retry tests --- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index f8a4a335da77..1dc46712fc00 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -254,11 +254,7 @@ RUN pip install --no-cache-dir -U \ "sagemaker-training>=5.1.1" \ "protobuf>=6.31.1" \ "rich>=14.2.0" \ - "sniffio>=1.3.0" \ - "wrapt>=2.0.1" \ - "s3fs>=2025.10.0" \ - "aiobotocore>=2.15.0" \ - "aiohttp>=3.10.0" + "sniffio>=1.3.0" # Install AWS CLI and boto3 separately to get compatible versions RUN pip install --no-cache-dir -U awscli boto3 From 0dc80eb583bd038d067503883656c9095739ff89 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 1 Dec 2025 03:55:09 -0800 Subject: [PATCH 10/19] install newer version of s3fs --- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 1dc46712fc00..3a8c41511e55 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -247,6 +247,7 @@ WORKDIR / # Install SM packages RUN pip install --no-cache-dir -U \ # address package regression caused by smclarify depedency s3fs" + "s3fs>=2025.10.0" \ smclarify \ "sagemaker>=2.200.0" \ sagemaker-experiments \ From 0b0f708bb7985c7705ad9eb234524c891b789255 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 1 Dec 2025 09:42:08 -0800 Subject: [PATCH 11/19] revert config file --- dlc_developer_config.toml | 12 ++++++------ pytorch/training/buildspec.yml | 2 +- pytorch/training/docker/2.8/py3/Dockerfile.cpu | 3 +++ pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 3 +++ 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 7537ae16d525..2ddfe8ccb932 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,12 +37,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = ["pytorch"] +build_frameworks = [] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = false +build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -98,11 +98,11 @@ ipv6_vpc_name = "" # run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = true +sagemaker_efa_tests = false # run release_candidate_integration tests -sagemaker_rc_tests = true +sagemaker_rc_tests = false # run sagemaker benchmark tests -sagemaker_benchmark_tests = true +sagemaker_benchmark_tests = false # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml" +dlc-pr-pytorch-training = "" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml index e9f328177b4b..e7a0d5614f66 100644 --- a/pytorch/training/buildspec.yml +++ b/pytorch/training/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-8-sm.yml +buildspec_pointer: buildspec-2-9-ec2.yml diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 7db1e830a86e..6c39a229e4ba 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -161,6 +161,7 @@ ENV PATH="/usr/local/bin:${PATH}" RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org # Install common packages +# Updated boto3 and awscli versions to match production baseline RUN pip install --no-cache-dir \ cython \ cryptography \ @@ -280,6 +281,7 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main WORKDIR / # Install SM packages +# Version pins to prevent package regressions and fix protobuf CVE vulnerabilities RUN pip install --no-cache-dir -U \ smclarify \ "sagemaker>=2.200.0" \ @@ -292,6 +294,7 @@ RUN pip install --no-cache-dir -U \ "wrapt>=2.0.1" # Install extra packages +# Required packages missing from earlier builds to match production baseline RUN pip install --no-cache-dir -U \ bokeh \ imageio \ diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 3a8c41511e55..6d1d1b3f9fca 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -247,6 +247,8 @@ WORKDIR / # Install SM packages RUN pip install --no-cache-dir -U \ # address package regression caused by smclarify depedency s3fs" + # Pin s3fs before smclarify to prevent downgrade to 0.4.2 which lacks async dependencies + # Version pins to prevent package regressions and fix protobuf CVE vulnerabilities "s3fs>=2025.10.0" \ smclarify \ "sagemaker>=2.200.0" \ @@ -261,6 +263,7 @@ RUN pip install --no-cache-dir -U \ RUN pip install --no-cache-dir -U awscli boto3 # Install extra packages +# Required packages missing from earlier builds to match production baseline RUN pip install --no-cache-dir -U \ bokeh \ imageio \ From 797fc9af3759851d1f84dd8ba2d6f68776675c2f Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 1 Dec 2025 10:51:39 -0800 Subject: [PATCH 12/19] revert version pin changes and pin sm version --- .../training/docker/2.8/py3/Dockerfile.cpu | 23 +++++------------ .../docker/2.8/py3/cu129/Dockerfile.gpu | 25 +++++-------------- 2 files changed, 12 insertions(+), 36 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 6c39a229e4ba..77a922b8c48f 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -173,7 +173,7 @@ RUN pip install --no-cache-dir \ typing \ charset-normalizer \ packaging \ - "boto3>=1.41.0" \ + boto3 \ PyYAML \ numpy \ scipy \ @@ -189,7 +189,7 @@ RUN pip install --no-cache-dir \ "requests>=2.32.0" \ "setuptools>=70.0.0" \ "urllib3>=2.5.0" \ - "awscli>=1.43.0" \ + awscli \ opencv-python==4.11.0.86 \ mpi4py \ jinja2>=3.1.6 \ @@ -281,20 +281,14 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main WORKDIR / # Install SM packages -# Version pins to prevent package regressions and fix protobuf CVE vulnerabilities RUN pip install --no-cache-dir -U \ smclarify \ - "sagemaker>=2.200.0" \ + "sagemaker>=2.254.1" \ sagemaker-experiments \ - "sagemaker-pytorch-training>=3.0.0" \ - "sagemaker-training>=5.1.1" \ - "protobuf>=6.31.1" \ - "rich>=14.2.0" \ - "sniffio>=1.3.0" \ - "wrapt>=2.0.1" + sagemaker-pytorch-training \ + sagemaker-training # Install extra packages -# Required packages missing from earlier builds to match production baseline RUN pip install --no-cache-dir -U \ bokeh \ imageio \ @@ -304,12 +298,7 @@ RUN pip install --no-cache-dir -U \ scikit-learn \ seaborn \ shap \ - cloudpickle \ - google-pasta \ - multiprocess \ - pathos \ - pox \ - ppft + cloudpickle # Copy workaround script for incorrect hostname COPY changehostname.c / diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 6d1d1b3f9fca..d86aa9ff3903 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -247,23 +247,15 @@ WORKDIR / # Install SM packages RUN pip install --no-cache-dir -U \ # address package regression caused by smclarify depedency s3fs" - # Pin s3fs before smclarify to prevent downgrade to 0.4.2 which lacks async dependencies - # Version pins to prevent package regressions and fix protobuf CVE vulnerabilities - "s3fs>=2025.10.0" \ + awscli \ + boto3 \ smclarify \ - "sagemaker>=2.200.0" \ + "sagemaker>=2.254.1" \ sagemaker-experiments \ - "sagemaker-pytorch-training>=3.0.0" \ - "sagemaker-training>=5.1.1" \ - "protobuf>=6.31.1" \ - "rich>=14.2.0" \ - "sniffio>=1.3.0" - -# Install AWS CLI and boto3 separately to get compatible versions -RUN pip install --no-cache-dir -U awscli boto3 + sagemaker-pytorch-training \ + sagemaker-training # Install extra packages -# Required packages missing from earlier builds to match production baseline RUN pip install --no-cache-dir -U \ bokeh \ imageio \ @@ -273,12 +265,7 @@ RUN pip install --no-cache-dir -U \ shap \ scikit-learn \ seaborn \ - cloudpickle \ - google-pasta \ - multiprocess \ - pathos \ - pox \ - ppft + cloudpickle COPY setup_oss_compliance.sh setup_oss_compliance.sh RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh From 1a979a8b5ded0a5a2d280129b57a5cf008506e71 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 1 Dec 2025 10:53:11 -0800 Subject: [PATCH 13/19] build image --- dlc_developer_config.toml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 2ddfe8ccb932..7537ae16d525 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,12 +37,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -98,11 +98,11 @@ ipv6_vpc_name = "" # run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = false +sagemaker_efa_tests = true # run release_candidate_integration tests -sagemaker_rc_tests = false +sagemaker_rc_tests = true # run sagemaker benchmark tests -sagemaker_benchmark_tests = false +sagemaker_benchmark_tests = true # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" From 6128b85bd37c15d1b9854615f52b22fa6256c351 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 1 Dec 2025 11:53:44 -0800 Subject: [PATCH 14/19] put back awscli and boto3 pins --- pytorch/training/docker/2.8/py3/Dockerfile.cpu | 1 - pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 77a922b8c48f..8ce958f77b10 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -161,7 +161,6 @@ ENV PATH="/usr/local/bin:${PATH}" RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org # Install common packages -# Updated boto3 and awscli versions to match production baseline RUN pip install --no-cache-dir \ cython \ cryptography \ diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index d86aa9ff3903..e7a830017f1c 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -247,8 +247,8 @@ WORKDIR / # Install SM packages RUN pip install --no-cache-dir -U \ # address package regression caused by smclarify depedency s3fs" - awscli \ - boto3 \ + "awscli<1.42.50" \ + "boto3<1.40.50" \ smclarify \ "sagemaker>=2.254.1" \ sagemaker-experiments \ From 55ee780a15e808adce340fb41cee36c37cc46c08 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 1 Dec 2025 13:33:15 -0800 Subject: [PATCH 15/19] Empty commit to trigger rebuild From 1c78f58bedb7a44a4e6b399d2138390dc5d84e95 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 1 Dec 2025 15:54:52 -0800 Subject: [PATCH 16/19] fix typo & rerun --- pytorch/training/docker/2.8/py3/Dockerfile.cpu | 2 +- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 8ce958f77b10..9f3a07505d2a 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -282,7 +282,7 @@ WORKDIR / # Install SM packages RUN pip install --no-cache-dir -U \ smclarify \ - "sagemaker>=2.254.1" \ + "sagemaker<=2.254.1" \ sagemaker-experiments \ sagemaker-pytorch-training \ sagemaker-training diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index e7a830017f1c..d77402164575 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -250,7 +250,7 @@ RUN pip install --no-cache-dir -U \ "awscli<1.42.50" \ "boto3<1.40.50" \ smclarify \ - "sagemaker>=2.254.1" \ + "sagemaker<=2.254.1" \ sagemaker-experiments \ sagemaker-pytorch-training \ sagemaker-training From 55b15e3118894a8f475c3a88002753071e9664b5 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 1 Dec 2025 20:25:45 -0800 Subject: [PATCH 17/19] fix SM version pin & rerun --- pytorch/training/docker/2.8/py3/Dockerfile.cpu | 2 +- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 9f3a07505d2a..526f9857c2af 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -282,7 +282,7 @@ WORKDIR / # Install SM packages RUN pip install --no-cache-dir -U \ smclarify \ - "sagemaker<=2.254.1" \ + "sagemaker==2.254.1" \ sagemaker-experiments \ sagemaker-pytorch-training \ sagemaker-training diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index d77402164575..3d8a33cb9bd0 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -250,7 +250,7 @@ RUN pip install --no-cache-dir -U \ "awscli<1.42.50" \ "boto3<1.40.50" \ smclarify \ - "sagemaker<=2.254.1" \ + "sagemaker==2.254.1" \ sagemaker-experiments \ sagemaker-pytorch-training \ sagemaker-training From 7d8e24ea430fca2c065f960133a90f334547624e Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 1 Dec 2025 22:03:09 -0800 Subject: [PATCH 18/19] add sniffio explicitly & rerun --- pytorch/training/docker/2.8/py3/Dockerfile.cpu | 4 +++- pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu index 526f9857c2af..c97508902096 100644 --- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu @@ -285,7 +285,9 @@ RUN pip install --no-cache-dir -U \ "sagemaker==2.254.1" \ sagemaker-experiments \ sagemaker-pytorch-training \ - sagemaker-training + sagemaker-training \ + # Add sniffio explicitly as it's not included in sagemaker==2.254.1 dependencies + sniffio # Install extra packages RUN pip install --no-cache-dir -U \ diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu index 3d8a33cb9bd0..5e24300d78d3 100644 --- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu +++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu @@ -253,7 +253,9 @@ RUN pip install --no-cache-dir -U \ "sagemaker==2.254.1" \ sagemaker-experiments \ sagemaker-pytorch-training \ - sagemaker-training + sagemaker-training \ + # Add sniffio explicitly as it's not included in sagemaker==2.254.1 dependencies + sniffio # Install extra packages RUN pip install --no-cache-dir -U \ From c4034a78a33ce473633257fceff2c55d7b76aa23 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Tue, 2 Dec 2025 09:35:36 -0800 Subject: [PATCH 19/19] revert toml file --- dlc_developer_config.toml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 7537ae16d525..2ddfe8ccb932 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,12 +37,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = ["pytorch"] +build_frameworks = [] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = false +build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -98,11 +98,11 @@ ipv6_vpc_name = "" # run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = true +sagemaker_efa_tests = false # run release_candidate_integration tests -sagemaker_rc_tests = true +sagemaker_rc_tests = false # run sagemaker benchmark tests -sagemaker_benchmark_tests = true +sagemaker_benchmark_tests = false # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml" +dlc-pr-pytorch-training = "" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = ""