From 87568844a2c16da45a31a6eff08886c27934cf9d Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Fri, 28 Nov 2025 14:09:48 -0800
Subject: [PATCH 01/19] run tests

---
 dlc_developer_config.toml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 2ddfe8ccb932..553bb3045b5c 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -37,12 +37,12 @@ deep_canary_mode = false
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
 # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
-build_frameworks = []
+build_frameworks = ["pytorch"]
 
 
 # By default we build both training and inference containers. Set true/false values to determine which to build.
 build_training = true
-build_inference = true
+build_inference = false
 
 # Set do_build to "false" to skip builds and test the latest image built by this PR
 # Note: at least one build is required to set do_build to "false"
@@ -98,11 +98,11 @@ ipv6_vpc_name = ""
 # run standard sagemaker remote tests from test/sagemaker_tests
 sagemaker_remote_tests = true
 # run efa sagemaker tests
-sagemaker_efa_tests = false
+sagemaker_efa_tests = true
 # run release_candidate_integration tests
-sagemaker_rc_tests = false
+sagemaker_rc_tests = true
 # run sagemaker benchmark tests
-sagemaker_benchmark_tests = false
+sagemaker_benchmark_tests = true
 
 # SM remote EFA test instance type
 sagemaker_remote_efa_instance_type = ""

From 937054a74eafe663cd2462d654ec0833de967011 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Fri, 28 Nov 2025 14:13:18 -0800
Subject: [PATCH 02/19] rebuild and run tests

---
 dlc_developer_config.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 553bb3045b5c..7537ae16d525 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -124,7 +124,7 @@ nightly_pr_test_mode = false
 dlc-pr-base = ""
 
 # Standard Framework Training
-dlc-pr-pytorch-training = ""
+dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml"
 dlc-pr-tensorflow-2-training = ""
 dlc-pr-autogluon-training = ""
 

From 4769818f843f2146e394c9af077d1eb982f70e6f Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Fri, 28 Nov 2025 17:19:33 -0800
Subject: [PATCH 03/19] update sagemaker pin and rebuild image

---
 pytorch/training/docker/2.8/py3/Dockerfile.cpu       | 2 +-
 pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
index 9e305de1b54a..b70a631950ba 100644
--- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu
+++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
@@ -282,7 +282,7 @@ WORKDIR /
 # Install SM packages
 RUN pip install --no-cache-dir -U \
     smclarify \
-    "sagemaker>=2" \
+    "sagemaker>=2.200.0" \
     sagemaker-experiments \
     sagemaker-pytorch-training \
     sagemaker-training
diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
index 0b90890db2fb..3b2842695fdb 100644
--- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
+++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -250,7 +250,7 @@ RUN pip install --no-cache-dir -U \
     "awscli<1.42.50" \
     "boto3<1.40.50" \
     smclarify \
-    "sagemaker>=2" \
+    "sagemaker>=2.200.0" \
     sagemaker-experiments \
     sagemaker-pytorch-training \
     sagemaker-training

From 80adedf75069b94c93be5fa504da86f16eba981e Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Sun, 30 Nov 2025 16:21:07 -0800
Subject: [PATCH 04/19] rebuild image with sm sdk

---
 pytorch/training/buildspec.yml                       | 2 +-
 pytorch/training/docker/2.8/py3/Dockerfile.cpu       | 3 +++
 pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 3 +++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml
index e7a0d5614f66..e9f328177b4b 100644
--- a/pytorch/training/buildspec.yml
+++ b/pytorch/training/buildspec.yml
@@ -1 +1 @@
-buildspec_pointer: buildspec-2-9-ec2.yml
+buildspec_pointer: buildspec-2-8-sm.yml
diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
index b70a631950ba..66126eec6b7d 100644
--- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu
+++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
@@ -279,6 +279,9 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
 
 WORKDIR /
 
+# Force rebuild of SageMaker packages layer
+ARG CACHEBUST=1
+
 # Install SM packages
 RUN pip install --no-cache-dir -U \
     smclarify \
diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
index 3b2842695fdb..c5562bfc7ba4 100644
--- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
+++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -244,6 +244,9 @@ ARG PYTHON
 
 WORKDIR /
 
+# Force rebuild of SageMaker packages layer
+ARG CACHEBUST=1
+
 # Install SM packages
 RUN pip install --no-cache-dir -U \
     # address package regression caused by smclarify depedency s3fs"

From b107c51fff128f339a3330d802c73e64f2440c08 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Sun, 30 Nov 2025 20:59:07 -0800
Subject: [PATCH 05/19] pin versions and rebuild image

---
 .../training/docker/2.8/py3/Dockerfile.cpu    | 20 +++++++++++--------
 .../docker/2.8/py3/cu129/Dockerfile.gpu       | 16 +++++++++------
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
index 66126eec6b7d..0a3a89b66731 100644
--- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu
+++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
@@ -172,7 +172,7 @@ RUN pip install --no-cache-dir \
     typing \
     charset-normalizer \
     packaging \
-    boto3 \
+    "boto3<1.40.50" \
     PyYAML \
     numpy \
     scipy \
@@ -188,7 +188,7 @@ RUN pip install --no-cache-dir \
     "requests>=2.32.0" \
     "setuptools>=70.0.0" \
     "urllib3>=2.5.0" \
-    "awscli" \
+    "awscli<1.42.50" \
     opencv-python==4.11.0.86 \
     mpi4py \
     jinja2>=3.1.6 \
@@ -279,16 +279,15 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
 
 WORKDIR /
 
-# Force rebuild of SageMaker packages layer
-ARG CACHEBUST=1
-
 # Install SM packages
 RUN pip install --no-cache-dir -U \
     smclarify \
     "sagemaker>=2.200.0" \
     sagemaker-experiments \
-    sagemaker-pytorch-training \
-    sagemaker-training
+    "sagemaker-pytorch-training>=3.0.0" \
+    "sagemaker-training>=5.1.1" \
+    "protobuf>=6.31.1" \
+    "rich>=14.2.0"
 
 # Install extra packages
 RUN pip install --no-cache-dir -U \
@@ -300,7 +299,12 @@ RUN pip install --no-cache-dir -U \
     scikit-learn \
     seaborn \
     shap \
-    cloudpickle
+    cloudpickle \
+    google-pasta \
+    multiprocess \
+    pathos \
+    pox \
+    ppft
 
 # Copy workaround script for incorrect hostname
 COPY changehostname.c /
diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
index c5562bfc7ba4..ed32a649523a 100644
--- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
+++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -244,9 +244,6 @@ ARG PYTHON
 
 WORKDIR /
 
-# Force rebuild of SageMaker packages layer
-ARG CACHEBUST=1
-
 # Install SM packages
 RUN pip install --no-cache-dir -U \
     # address package regression caused by smclarify depedency s3fs"
@@ -255,8 +252,10 @@ RUN pip install --no-cache-dir -U \
     smclarify \
     "sagemaker>=2.200.0" \
     sagemaker-experiments \
-    sagemaker-pytorch-training \
-    sagemaker-training
+    "sagemaker-pytorch-training>=3.0.0" \
+    "sagemaker-training>=5.1.1" \
+    "protobuf>=6.31.1" \
+    "rich>=14.2.0"
 
 # Install extra packages
 RUN pip install --no-cache-dir -U \
@@ -268,7 +267,12 @@ RUN pip install --no-cache-dir -U \
     shap \
     scikit-learn \
     seaborn \
-    cloudpickle
+    cloudpickle \
+    google-pasta \
+    multiprocess \
+    pathos \
+    pox \
+    ppft
 
 COPY setup_oss_compliance.sh setup_oss_compliance.sh
 RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh

From 826a9436a4bf4183dd81b276d270acf14bade6cc Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Mon, 1 Dec 2025 00:10:40 -0800
Subject: [PATCH 06/19] Fix version pins and rebuild image

---
 pytorch/training/docker/2.8/py3/Dockerfile.cpu       | 8 +++++---
 pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 8 +++++---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
index 0a3a89b66731..7db1e830a86e 100644
--- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu
+++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
@@ -172,7 +172,7 @@ RUN pip install --no-cache-dir \
     typing \
     charset-normalizer \
     packaging \
-    "boto3<1.40.50" \
+    "boto3>=1.41.0" \
     PyYAML \
     numpy \
     scipy \
@@ -188,7 +188,7 @@ RUN pip install --no-cache-dir \
     "requests>=2.32.0" \
     "setuptools>=70.0.0" \
     "urllib3>=2.5.0" \
-    "awscli<1.42.50" \
+    "awscli>=1.43.0" \
     opencv-python==4.11.0.86 \
     mpi4py \
     jinja2>=3.1.6 \
@@ -287,7 +287,9 @@ RUN pip install --no-cache-dir -U \
     "sagemaker-pytorch-training>=3.0.0" \
     "sagemaker-training>=5.1.1" \
     "protobuf>=6.31.1" \
-    "rich>=14.2.0"
+    "rich>=14.2.0" \
+    "sniffio>=1.3.0" \
+    "wrapt>=2.0.1"
 
 # Install extra packages
 RUN pip install --no-cache-dir -U \
diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
index ed32a649523a..6f62e20c4497 100644
--- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
+++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -247,15 +247,17 @@ WORKDIR /
 # Install SM packages
 RUN pip install --no-cache-dir -U \
     # address package regression caused by smclarify depedency s3fs"
-    "awscli<1.42.50" \
-    "boto3<1.40.50" \
     smclarify \
     "sagemaker>=2.200.0" \
     sagemaker-experiments \
     "sagemaker-pytorch-training>=3.0.0" \
     "sagemaker-training>=5.1.1" \
     "protobuf>=6.31.1" \
-    "rich>=14.2.0"
+    "rich>=14.2.0" \
+    "awscli>=1.43.0" \
+    "boto3>=1.41.0" \
+    "sniffio>=1.3.0" \
+    "wrapt>=2.0.1"
 
 # Install extra packages
 RUN pip install --no-cache-dir -U \

From c65ba08410f0124c85fbfe9394dda744d852f8af Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Mon, 1 Dec 2025 01:17:03 -0800
Subject: [PATCH 07/19] pins s3fs to prevent the downgrade and rebuild

---
 pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
index 6f62e20c4497..5961c797dad7 100644
--- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
+++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -257,7 +257,10 @@ RUN pip install --no-cache-dir -U \
     "awscli>=1.43.0" \
     "boto3>=1.41.0" \
     "sniffio>=1.3.0" \
-    "wrapt>=2.0.1"
+    "wrapt>=2.0.1" \
+    "s3fs>=2025.10.0" \
+    "aiobotocore>=2.15.0" \
+    "aiohttp>=3.10.0"
 
 # Install extra packages
 RUN pip install --no-cache-dir -U \

From 57a767b797eeece3ee61a95a322d9750b04b8d58 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Mon, 1 Dec 2025 01:54:15 -0800
Subject: [PATCH 08/19] resolve incompatibility issue

---
 pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
index 5961c797dad7..f8a4a335da77 100644
--- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
+++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -254,14 +254,15 @@ RUN pip install --no-cache-dir -U \
     "sagemaker-training>=5.1.1" \
     "protobuf>=6.31.1" \
     "rich>=14.2.0" \
-    "awscli>=1.43.0" \
-    "boto3>=1.41.0" \
     "sniffio>=1.3.0" \
     "wrapt>=2.0.1" \
     "s3fs>=2025.10.0" \
     "aiobotocore>=2.15.0" \
     "aiohttp>=3.10.0"
 
+# Install AWS CLI and boto3 separately to get compatible versions
+RUN pip install --no-cache-dir -U awscli boto3
+
 # Install extra packages
 RUN pip install --no-cache-dir -U \
     bokeh \

From 9c87c39cada42365080e9ce75305455f3cb28fc6 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Mon, 1 Dec 2025 02:41:06 -0800
Subject: [PATCH 09/19] retry tests

---
 pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
index f8a4a335da77..1dc46712fc00 100644
--- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
+++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -254,11 +254,7 @@ RUN pip install --no-cache-dir -U \
     "sagemaker-training>=5.1.1" \
     "protobuf>=6.31.1" \
     "rich>=14.2.0" \
-    "sniffio>=1.3.0" \
-    "wrapt>=2.0.1" \
-    "s3fs>=2025.10.0" \
-    "aiobotocore>=2.15.0" \
-    "aiohttp>=3.10.0"
+    "sniffio>=1.3.0" 
 
 # Install AWS CLI and boto3 separately to get compatible versions
 RUN pip install --no-cache-dir -U awscli boto3

From 0dc80eb583bd038d067503883656c9095739ff89 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Mon, 1 Dec 2025 03:55:09 -0800
Subject: [PATCH 10/19] install newer version of s3fs

---
 pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
index 1dc46712fc00..3a8c41511e55 100644
--- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
+++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -247,6 +247,7 @@ WORKDIR /
 # Install SM packages
 RUN pip install --no-cache-dir -U \
     # address package regression caused by smclarify depedency s3fs"
+    "s3fs>=2025.10.0" \
     smclarify \
     "sagemaker>=2.200.0" \
     sagemaker-experiments \

From 0b0f708bb7985c7705ad9eb234524c891b789255 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Mon, 1 Dec 2025 09:42:08 -0800
Subject: [PATCH 11/19] revert config file

---
 dlc_developer_config.toml                            | 12 ++++++------
 pytorch/training/buildspec.yml                       |  2 +-
 pytorch/training/docker/2.8/py3/Dockerfile.cpu       |  3 +++
 pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu |  3 +++
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 7537ae16d525..2ddfe8ccb932 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -37,12 +37,12 @@ deep_canary_mode = false
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
 # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
-build_frameworks = ["pytorch"]
+build_frameworks = []
 
 
 # By default we build both training and inference containers. Set true/false values to determine which to build.
 build_training = true
-build_inference = false
+build_inference = true
 
 # Set do_build to "false" to skip builds and test the latest image built by this PR
 # Note: at least one build is required to set do_build to "false"
@@ -98,11 +98,11 @@ ipv6_vpc_name = ""
 # run standard sagemaker remote tests from test/sagemaker_tests
 sagemaker_remote_tests = true
 # run efa sagemaker tests
-sagemaker_efa_tests = true
+sagemaker_efa_tests = false
 # run release_candidate_integration tests
-sagemaker_rc_tests = true
+sagemaker_rc_tests = false
 # run sagemaker benchmark tests
-sagemaker_benchmark_tests = true
+sagemaker_benchmark_tests = false
 
 # SM remote EFA test instance type
 sagemaker_remote_efa_instance_type = ""
@@ -124,7 +124,7 @@ nightly_pr_test_mode = false
 dlc-pr-base = ""
 
 # Standard Framework Training
-dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml"
+dlc-pr-pytorch-training = ""
 dlc-pr-tensorflow-2-training = ""
 dlc-pr-autogluon-training = ""
 
diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml
index e9f328177b4b..e7a0d5614f66 100644
--- a/pytorch/training/buildspec.yml
+++ b/pytorch/training/buildspec.yml
@@ -1 +1 @@
-buildspec_pointer: buildspec-2-8-sm.yml
+buildspec_pointer: buildspec-2-9-ec2.yml
diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
index 7db1e830a86e..6c39a229e4ba 100644
--- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu
+++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
@@ -161,6 +161,7 @@ ENV PATH="/usr/local/bin:${PATH}"
 RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org
 
 # Install common packages
+# Updated boto3 and awscli versions to match production baseline
 RUN pip install --no-cache-dir \
     cython \
     cryptography \
@@ -280,6 +281,7 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
 WORKDIR /
 
 # Install SM packages
+# Version pins to prevent package regressions and fix protobuf CVE vulnerabilities
 RUN pip install --no-cache-dir -U \
     smclarify \
     "sagemaker>=2.200.0" \
@@ -292,6 +294,7 @@ RUN pip install --no-cache-dir -U \
     "wrapt>=2.0.1"
 
 # Install extra packages
+# Required packages missing from earlier builds to match production baseline
 RUN pip install --no-cache-dir -U \
     bokeh \
     imageio \
diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
index 3a8c41511e55..6d1d1b3f9fca 100644
--- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
+++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -247,6 +247,8 @@ WORKDIR /
 # Install SM packages
 RUN pip install --no-cache-dir -U \
     # address package regression caused by smclarify depedency s3fs"
+    # Pin s3fs before smclarify to prevent downgrade to 0.4.2 which lacks async dependencies
+    # Version pins to prevent package regressions and fix protobuf CVE vulnerabilities
     "s3fs>=2025.10.0" \
     smclarify \
     "sagemaker>=2.200.0" \
@@ -261,6 +263,7 @@ RUN pip install --no-cache-dir -U \
 RUN pip install --no-cache-dir -U awscli boto3
 
 # Install extra packages
+# Required packages missing from earlier builds to match production baseline
 RUN pip install --no-cache-dir -U \
     bokeh \
     imageio \

From 797fc9af3759851d1f84dd8ba2d6f68776675c2f Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Mon, 1 Dec 2025 10:51:39 -0800
Subject: [PATCH 12/19] revert version pin changes and pin sm version

---
 .../training/docker/2.8/py3/Dockerfile.cpu    | 23 +++++------------
 .../docker/2.8/py3/cu129/Dockerfile.gpu       | 25 +++++--------------
 2 files changed, 12 insertions(+), 36 deletions(-)

diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
index 6c39a229e4ba..77a922b8c48f 100644
--- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu
+++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
@@ -173,7 +173,7 @@ RUN pip install --no-cache-dir \
     typing \
     charset-normalizer \
     packaging \
-    "boto3>=1.41.0" \
+    boto3 \
     PyYAML \
     numpy \
     scipy \
@@ -189,7 +189,7 @@ RUN pip install --no-cache-dir \
     "requests>=2.32.0" \
     "setuptools>=70.0.0" \
     "urllib3>=2.5.0" \
-    "awscli>=1.43.0" \
+    awscli \
     opencv-python==4.11.0.86 \
     mpi4py \
     jinja2>=3.1.6 \
@@ -281,20 +281,14 @@ ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
 WORKDIR /
 
 # Install SM packages
-# Version pins to prevent package regressions and fix protobuf CVE vulnerabilities
 RUN pip install --no-cache-dir -U \
     smclarify \
-    "sagemaker>=2.200.0" \
+    "sagemaker>=2.254.1" \
     sagemaker-experiments \
-    "sagemaker-pytorch-training>=3.0.0" \
-    "sagemaker-training>=5.1.1" \
-    "protobuf>=6.31.1" \
-    "rich>=14.2.0" \
-    "sniffio>=1.3.0" \
-    "wrapt>=2.0.1"
+    sagemaker-pytorch-training \
+    sagemaker-training 
 
 # Install extra packages
-# Required packages missing from earlier builds to match production baseline
 RUN pip install --no-cache-dir -U \
     bokeh \
     imageio \
@@ -304,12 +298,7 @@ RUN pip install --no-cache-dir -U \
     scikit-learn \
     seaborn \
     shap \
-    cloudpickle \
-    google-pasta \
-    multiprocess \
-    pathos \
-    pox \
-    ppft
+    cloudpickle 
 
 # Copy workaround script for incorrect hostname
 COPY changehostname.c /
diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
index 6d1d1b3f9fca..d86aa9ff3903 100644
--- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
+++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -247,23 +247,15 @@ WORKDIR /
 # Install SM packages
 RUN pip install --no-cache-dir -U \
     # address package regression caused by smclarify depedency s3fs"
-    # Pin s3fs before smclarify to prevent downgrade to 0.4.2 which lacks async dependencies
-    # Version pins to prevent package regressions and fix protobuf CVE vulnerabilities
-    "s3fs>=2025.10.0" \
+    awscli \
+    boto3 \
     smclarify \
-    "sagemaker>=2.200.0" \
+    "sagemaker>=2.254.1" \
     sagemaker-experiments \
-    "sagemaker-pytorch-training>=3.0.0" \
-    "sagemaker-training>=5.1.1" \
-    "protobuf>=6.31.1" \
-    "rich>=14.2.0" \
-    "sniffio>=1.3.0" 
-
-# Install AWS CLI and boto3 separately to get compatible versions
-RUN pip install --no-cache-dir -U awscli boto3
+    sagemaker-pytorch-training \
+    sagemaker-training 
 
 # Install extra packages
-# Required packages missing from earlier builds to match production baseline
 RUN pip install --no-cache-dir -U \
     bokeh \
     imageio \
@@ -273,12 +265,7 @@ RUN pip install --no-cache-dir -U \
     shap \
     scikit-learn \
     seaborn \
-    cloudpickle \
-    google-pasta \
-    multiprocess \
-    pathos \
-    pox \
-    ppft
+    cloudpickle 
 
 COPY setup_oss_compliance.sh setup_oss_compliance.sh
 RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh

From 1a979a8b5ded0a5a2d280129b57a5cf008506e71 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Mon, 1 Dec 2025 10:53:11 -0800
Subject: [PATCH 13/19] build image

---
 dlc_developer_config.toml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 2ddfe8ccb932..7537ae16d525 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -37,12 +37,12 @@ deep_canary_mode = false
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
 # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
-build_frameworks = []
+build_frameworks = ["pytorch"]
 
 
 # By default we build both training and inference containers. Set true/false values to determine which to build.
 build_training = true
-build_inference = true
+build_inference = false
 
 # Set do_build to "false" to skip builds and test the latest image built by this PR
 # Note: at least one build is required to set do_build to "false"
@@ -98,11 +98,11 @@ ipv6_vpc_name = ""
 # run standard sagemaker remote tests from test/sagemaker_tests
 sagemaker_remote_tests = true
 # run efa sagemaker tests
-sagemaker_efa_tests = false
+sagemaker_efa_tests = true
 # run release_candidate_integration tests
-sagemaker_rc_tests = false
+sagemaker_rc_tests = true
 # run sagemaker benchmark tests
-sagemaker_benchmark_tests = false
+sagemaker_benchmark_tests = true
 
 # SM remote EFA test instance type
 sagemaker_remote_efa_instance_type = ""
@@ -124,7 +124,7 @@ nightly_pr_test_mode = false
 dlc-pr-base = ""
 
 # Standard Framework Training
-dlc-pr-pytorch-training = ""
+dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml"
 dlc-pr-tensorflow-2-training = ""
 dlc-pr-autogluon-training = ""
 

From 6128b85bd37c15d1b9854615f52b22fa6256c351 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Mon, 1 Dec 2025 11:53:44 -0800
Subject: [PATCH 14/19] put back awscli and boto3 pins

---
 pytorch/training/docker/2.8/py3/Dockerfile.cpu       | 1 -
 pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
index 77a922b8c48f..8ce958f77b10 100644
--- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu
+++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
@@ -161,7 +161,6 @@ ENV PATH="/usr/local/bin:${PATH}"
 RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org
 
 # Install common packages
-# Updated boto3 and awscli versions to match production baseline
 RUN pip install --no-cache-dir \
     cython \
     cryptography \
diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
index d86aa9ff3903..e7a830017f1c 100644
--- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
+++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -247,8 +247,8 @@ WORKDIR /
 # Install SM packages
 RUN pip install --no-cache-dir -U \
     # address package regression caused by smclarify depedency s3fs"
-    awscli \
-    boto3 \
+    "awscli<1.42.50" \
+    "boto3<1.40.50" \
     smclarify \
     "sagemaker>=2.254.1" \
     sagemaker-experiments \

From 55ee780a15e808adce340fb41cee36c37cc46c08 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Mon, 1 Dec 2025 13:33:15 -0800
Subject: [PATCH 15/19] Empty commit to trigger rebuild


From 1c78f58bedb7a44a4e6b399d2138390dc5d84e95 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Mon, 1 Dec 2025 15:54:52 -0800
Subject: [PATCH 16/19] fix typo & rerun

---
 pytorch/training/docker/2.8/py3/Dockerfile.cpu       | 2 +-
 pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
index 8ce958f77b10..9f3a07505d2a 100644
--- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu
+++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
@@ -282,7 +282,7 @@ WORKDIR /
 # Install SM packages
 RUN pip install --no-cache-dir -U \
     smclarify \
-    "sagemaker>=2.254.1" \
+    "sagemaker<=2.254.1" \
     sagemaker-experiments \
     sagemaker-pytorch-training \
     sagemaker-training 
diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
index e7a830017f1c..d77402164575 100644
--- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
+++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -250,7 +250,7 @@ RUN pip install --no-cache-dir -U \
     "awscli<1.42.50" \
     "boto3<1.40.50" \
     smclarify \
-    "sagemaker>=2.254.1" \
+    "sagemaker<=2.254.1" \
     sagemaker-experiments \
     sagemaker-pytorch-training \
     sagemaker-training 

From 55b15e3118894a8f475c3a88002753071e9664b5 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Mon, 1 Dec 2025 20:25:45 -0800
Subject: [PATCH 17/19] fix SM version pin & rerun

---
 pytorch/training/docker/2.8/py3/Dockerfile.cpu       | 2 +-
 pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
index 9f3a07505d2a..526f9857c2af 100644
--- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu
+++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
@@ -282,7 +282,7 @@ WORKDIR /
 # Install SM packages
 RUN pip install --no-cache-dir -U \
     smclarify \
-    "sagemaker<=2.254.1" \
+    "sagemaker==2.254.1" \
     sagemaker-experiments \
     sagemaker-pytorch-training \
     sagemaker-training 
diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
index d77402164575..3d8a33cb9bd0 100644
--- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
+++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -250,7 +250,7 @@ RUN pip install --no-cache-dir -U \
     "awscli<1.42.50" \
     "boto3<1.40.50" \
     smclarify \
-    "sagemaker<=2.254.1" \
+    "sagemaker==2.254.1" \
     sagemaker-experiments \
     sagemaker-pytorch-training \
     sagemaker-training 

From 7d8e24ea430fca2c065f960133a90f334547624e Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Mon, 1 Dec 2025 22:03:09 -0800
Subject: [PATCH 18/19] add sniffio explicitly & rerun

---
 pytorch/training/docker/2.8/py3/Dockerfile.cpu       | 4 +++-
 pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/pytorch/training/docker/2.8/py3/Dockerfile.cpu b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
index 526f9857c2af..c97508902096 100644
--- a/pytorch/training/docker/2.8/py3/Dockerfile.cpu
+++ b/pytorch/training/docker/2.8/py3/Dockerfile.cpu
@@ -285,7 +285,9 @@ RUN pip install --no-cache-dir -U \
     "sagemaker==2.254.1" \
     sagemaker-experiments \
     sagemaker-pytorch-training \
-    sagemaker-training 
+    sagemaker-training \
+    # Add sniffio explicitly as it's not included in sagemaker==2.254.1 dependencies
+    sniffio  
 
 # Install extra packages
 RUN pip install --no-cache-dir -U \
diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
index 3d8a33cb9bd0..5e24300d78d3 100644
--- a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
+++ b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -253,7 +253,9 @@ RUN pip install --no-cache-dir -U \
     "sagemaker==2.254.1" \
     sagemaker-experiments \
     sagemaker-pytorch-training \
-    sagemaker-training 
+    sagemaker-training \
+    # Add sniffio explicitly as it's not included in sagemaker==2.254.1 dependencies
+    sniffio   
 
 # Install extra packages
 RUN pip install --no-cache-dir -U \

From c4034a78a33ce473633257fceff2c55d7b76aa23 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni <devakib@amazon.com>
Date: Tue, 2 Dec 2025 09:35:36 -0800
Subject: [PATCH 19/19] revert toml file

---
 dlc_developer_config.toml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 7537ae16d525..2ddfe8ccb932 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -37,12 +37,12 @@ deep_canary_mode = false
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
 # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
-build_frameworks = ["pytorch"]
+build_frameworks = []
 
 
 # By default we build both training and inference containers. Set true/false values to determine which to build.
 build_training = true
-build_inference = false
+build_inference = true
 
 # Set do_build to "false" to skip builds and test the latest image built by this PR
 # Note: at least one build is required to set do_build to "false"
@@ -98,11 +98,11 @@ ipv6_vpc_name = ""
 # run standard sagemaker remote tests from test/sagemaker_tests
 sagemaker_remote_tests = true
 # run efa sagemaker tests
-sagemaker_efa_tests = true
+sagemaker_efa_tests = false
 # run release_candidate_integration tests
-sagemaker_rc_tests = true
+sagemaker_rc_tests = false
 # run sagemaker benchmark tests
-sagemaker_benchmark_tests = true
+sagemaker_benchmark_tests = false
 
 # SM remote EFA test instance type
 sagemaker_remote_efa_instance_type = ""
@@ -124,7 +124,7 @@ nightly_pr_test_mode = false
 dlc-pr-base = ""
 
 # Standard Framework Training
-dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml"
+dlc-pr-pytorch-training = ""
 dlc-pr-tensorflow-2-training = ""
 dlc-pr-autogluon-training = ""