aws · sirutBuasai · Feb 8, 2025 · Feb 8, 2025 · Feb 8, 2025 · Feb 8, 2025
@@ -1,162 +1,71 @@
 [dev]
-# Set to "huggingface", for example, if you are a huggingface developer. Default is ""
 partner_developer = ""  # e.g. "huggingface" for a huggingface developer; default is ""
-# Please only set it to true if you are preparing an EI related PR
-# Do remember to revert it back to false before merging any PR (including EI dedicated PR)
 ei_mode = false  # true only while preparing an EI PR; revert to false before merging
-# Please only set it to true if you are preparing a NEURON related PR
-# Do remember to revert it back to false before merging any PR (including NEURON dedicated PR)
 neuron_mode = false  # true only while preparing a NEURON PR; revert to false before merging
-# Please only set it to true if you are preparing a NEURONX related PR
-# Do remember to revert it back to false before merging any PR (including NEURONX dedicated PR)
 neuronx_mode = false  # true only while preparing a NEURONX PR; revert to false before merging
-# Please only set it to true if you are preparing a GRAVITON related PR
-# Do remember to revert it back to false before merging any PR (including GRAVITON dedicated PR)
 graviton_mode = false  # true only while preparing a GRAVITON PR; revert to false before merging
-# Please only set it to true if you are preparing a ARM64 related PR
-# Do remember to revert it back to false before merging any PR (including ARM64 dedicated PR)
 arm64_mode = false  # true only while preparing an ARM64 PR; revert to false before merging
-# Please only set it to True if you are preparing a HABANA related PR
-# Do remember to revert it back to False before merging any PR (including HABANA dedicated PR)
 habana_mode = false  # true only while preparing a HABANA PR; revert to false before merging
-# Please only set it to True if you are preparing a HUGGINGFACE TRCOMP related PR
-# Do remember to revert it back to False before merging any PR (including HUGGINGFACE TRCOMP dedicated PR)
-# This mode is used to build TF 2.6 and PT1.11 DLC
 huggingface_trcomp_mode = false  # HUGGINGFACE TRCOMP PRs only (builds TF 2.6 and PT1.11 DLC); revert before merging
-# Please only set it to True if you are preparing a TRCOMP related PR
-# Do remember to revert it back to False before merging any PR (including TRCOMP dedicated PR)
-# This mode is used to build PT1.12 and above DLC
 trcomp_mode = false  # TRCOMP PRs only (builds PT1.12 and above DLC); revert before merging
-# Set deep_canary_mode to true to simulate Deep Canary Test conditions on PR for all frameworks in the
-# build_frameworks list below. This will cause all image builds and non-deep-canary tests on the PR to be skipped,
-# regardless of whether they are enabled or disabled below.
-# Set graviton_mode/arm64_mode to true to run Deep Canaries on Graviton/ARM64 images.
-# Do remember to revert it back to false before merging any PR.
 deep_canary_mode = false  # true simulates Deep Canary tests and skips all builds and other tests; revert before merging
 
 [build]
-# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
-# available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
-build_frameworks = []
-
-
-# By default we build both training and inference containers. Set true/false values to determine which to build.
+build_frameworks = ["pytorch"]  # frameworks to build; an empty list leaves builds disabled
 build_training = true
-build_inference = true
-
-# Set do_build to "false" to skip builds and test the latest image built by this PR
-# Note: at least one build is required to set do_build to "false"
+build_inference = false  # with build_training = true, only training containers are built
 do_build = true  # false skips builds and tests the latest image built by this PR (needs at least one prior build)
 
 [notify]
-### Notify on test failures
-### Off by default
 notify_test_failures = false  # notify on test failures; off by default
-  # Valid values: medium or high
-  notification_severity = "medium"
+notification_severity = "medium"  # valid values: "medium" or "high"
 
 [test]
-### On by default
 sanity_tests = true
 security_tests = true
-  safety_check_test = false
-  ecr_scan_allowlist_feature = false
+safety_check_test = false
+ecr_scan_allowlist_feature = false
 ecs_tests = true
 eks_tests = true
 ec2_tests = true  # must be true whenever ec2_tests_on_heavy_instances is true
-# Set it to true if you are preparing a Benchmark related PR
-ec2_benchmark_tests = false
-
-### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by
-### default. If false, these types of tests will be skipped while other tests will run as usual.
-### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true.
-### Off by default (set to false)
-ec2_tests_on_heavy_instances = false
-
-### SM specific tests
-### On by default
-sagemaker_local_tests = true
-
-# run standard sagemaker remote tests from test/sagemaker_tests
-sagemaker_remote_tests = true
-# run efa sagemaker tests
+ec2_benchmark_tests = true  # intended for benchmark-related PRs only; NOTE(review): confirm whether to reset to false before merging
+ec2_tests_on_heavy_instances = true  # runs EC2 tests on large/expensive instance types; off (false) by default
+sagemaker_local_tests = false  # SM local tests; on (true) by default
+sagemaker_remote_tests = false  # standard sagemaker remote tests from test/sagemaker_tests
 sagemaker_efa_tests = false  # efa sagemaker tests
-# run release_candidate_integration tests
 sagemaker_rc_tests = false  # release_candidate_integration tests
-# run sagemaker benchmark tests
 sagemaker_benchmark_tests = false  # sagemaker benchmark tests
-
-# SM remote EFA test instance type
 sagemaker_remote_efa_instance_type = ""  # instance type for SM remote EFA tests
-
-# Run CI tests for nightly images
-# false by default
 nightly_pr_test_mode = false  # runs CI tests for nightly images; false by default
-
 use_scheduler = false
 
 [buildspec_override]
-# Assign the path to the required buildspec file from the deep-learning-containers folder
-# For example:
-# dlc-pr-tensorflow-2-habana-training = "habana/tensorflow/training/buildspec-2-10.yml"
-# dlc-pr-pytorch-inference = "pytorch/inference/buildspec-1-12.yml"
-# Setting the buildspec file path to "" allows the image builder to choose the default buildspec file.
-
-### TRAINING PR JOBS ###
-
-# Standard Framework Training
-dlc-pr-pytorch-training = ""
+dlc-pr-pytorch-training = "pytorch/training/buildspec-2-6-ec2.yml"  # path is relative to the deep-learning-containers folder
 dlc-pr-tensorflow-2-training = ""  # "" lets the image builder choose the default buildspec file
 dlc-pr-autogluon-training = ""
-
-# HuggingFace Training
 dlc-pr-huggingface-tensorflow-training = ""
 dlc-pr-huggingface-pytorch-training = ""
-
-# Training Compiler
 dlc-pr-huggingface-pytorch-trcomp-training = ""
 dlc-pr-huggingface-tensorflow-2-trcomp-training = ""
 dlc-pr-pytorch-trcomp-training = ""
-
-# Neuron Training
 dlc-pr-pytorch-neuron-training = ""
 dlc-pr-tensorflow-2-neuron-training = ""
-
-# Stability AI Training
 dlc-pr-stabilityai-pytorch-training = ""
-
-# Habana Training
 dlc-pr-pytorch-habana-training = ""
 dlc-pr-tensorflow-2-habana-training = ""
-
-### INFERENCE PR JOBS ###
-
-# Standard Framework Inference
 dlc-pr-pytorch-inference = ""
 dlc-pr-tensorflow-2-inference = ""
 dlc-pr-autogluon-inference = ""
-
-# Neuron Inference
 dlc-pr-pytorch-neuron-inference = ""
 dlc-pr-tensorflow-1-neuron-inference = ""
 dlc-pr-tensorflow-2-neuron-inference = ""
-
-# HuggingFace Inference
 dlc-pr-huggingface-tensorflow-inference = ""
 dlc-pr-huggingface-pytorch-inference = ""
 dlc-pr-huggingface-pytorch-neuron-inference = ""
-
-# Stability AI Inference
 dlc-pr-stabilityai-pytorch-inference = ""
-
-# Graviton Inference
 dlc-pr-pytorch-graviton-inference = ""
 dlc-pr-tensorflow-2-graviton-inference = ""
-
-# ARM64 Inference
 dlc-pr-pytorch-arm64-inference = ""
 dlc-pr-tensorflow-2-arm64-inference = ""
-
-# EIA Inference
 dlc-pr-pytorch-eia-inference = ""
 dlc-pr-tensorflow-2-eia-inference = ""
@@ -0,0 +1,66 @@
+account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>  # placeholder; presumably filled from $ACCOUNT_ID by the build tooling — confirm
+prod_account_id: &PROD_ACCOUNT_ID 763104351884  # unquoted, so a standard YAML loader reads it as an integer
+region: &REGION <set-$REGION-in-environment>  # placeholder; presumably filled from $REGION by the build tooling — confirm
+framework: &FRAMEWORK pytorch
+version: &VERSION 2.6.0  # has two dots, so it stays a string even without quotes
+short_version: &SHORT_VERSION "2.6"  # quoted so it is not parsed as the float 2.6
+arch_type: x86
+# autopatch_build: "True"
+
+repository_info:
+  training_repository: &TRAINING_REPOSITORY  # merged into every image entry through `<<: *TRAINING_REPOSITORY`
+    image_type: &TRAINING_IMAGE_TYPE training
+    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]  # !join is a custom tag, not core YAML; presumably concatenates the items — confirm in the loader
+    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]  # PR repository name, prefixed with "pr-"
+    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]  # built from account_id and region
+    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]  # same name without the "pr-" prefix
+    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]  # uses prod_account_id instead of account_id
+
+context:
+  training_context: &TRAINING_CONTEXT  # reused by each image's `context` through `<<: *TRAINING_CONTEXT`
+    changehostname:
+      source: docker/build_artifacts/changehostname.c
+      target: changehostname.c
+    start_with_right_hostname:
+      source: docker/build_artifacts/start_with_right_hostname.sh
+      target: start_with_right_hostname.sh
+    example_mnist_file:
+      source: docker/build_artifacts/mnist.py
+      target: mnist.py
+    deep_learning_container:
+      source: ../../src/deep_learning_container.py  # only source starting with ../; NOTE(review): base directory for these paths is not shown here — confirm
+      target: deep_learning_container.py
+
+images:
+  BuildEC2CPUPTTrainPy3DockerImage:
+    <<: *TRAINING_REPOSITORY  # merge key: pulls in image_type, root and the repository fields
+    build: &PYTORCH_CPU_TRAINING_PY3 false
+    image_size_baseline: 6500  # NOTE(review): units are not stated in this file (presumably MB) — confirm
+    device_type: &DEVICE_TYPE cpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py312
+    os_version: &OS_VERSION ubuntu22.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
+    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]  # same expression as tag
+    # build_tag_override: "beta:2.6.0-cpu-py312-ubuntu22.04-ec2"
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
+    target: ec2
+    context:
+      <<: *TRAINING_CONTEXT
+  BuildEC2GPUPTTrainPy3cu126DockerImage:  # name tracks cuda_version (cu126); the previous cu121 suffix was stale
+    <<: *TRAINING_REPOSITORY  # merge key: pulls in image_type, root and the repository fields
+    build: &PYTORCH_GPU_TRAINING_PY3 false
+    image_size_baseline: 19700  # NOTE(review): units are not stated in this file (presumably MB) — confirm
+    device_type: &DEVICE_TYPE gpu  # re-defines the DEVICE_TYPE anchor; aliases below resolve to this latest definition
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py312
+    cuda_version: &CUDA_VERSION cu126
+    os_version: &OS_VERSION ubuntu22.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
+    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]  # same expression as tag
+    # build_tag_override: "beta:2.6.0-gpu-py312-cu126-ubuntu22.04-ec2"
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
+                         *DEVICE_TYPE ]  # GPU Dockerfile sits under a per-CUDA-version directory
+    target: ec2
+    context:
+      <<: *TRAINING_CONTEXT