Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions dlc_developer_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ deep_canary_mode = false
[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = []
build_frameworks = ["pytorch"]


# By default we build both training and inference containers. Set true/false values to determine which to build.
build_training = true
build_inference = true
build_inference = false

# Set do_build to "false" to skip builds and test the latest image built by this PR
# Note: at least one build is required to set do_build to "false"
Expand Down Expand Up @@ -124,7 +124,7 @@ nightly_pr_test_mode = false
dlc-pr-base = ""

# Standard Framework Training
dlc-pr-pytorch-training = ""
dlc-pr-pytorch-training = "pytorch/training/buildspec-2-10-ec2.yml"
dlc-pr-tensorflow-2-training = ""
dlc-pr-autogluon-training = ""

Expand Down
75 changes: 75 additions & 0 deletions pytorch/training/buildspec-2-10-ec2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK pytorch
version: &VERSION 2.10.0
short_version: &SHORT_VERSION "2.10"
arch_type: x86
# autopatch_build: "True"

repository_info:
training_repository: &TRAINING_REPOSITORY
image_type: &TRAINING_IMAGE_TYPE training
root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

context:
training_context: &TRAINING_CONTEXT
start_cuda_compat:
source: docker/build_artifacts/start_cuda_compat.sh
target: start_cuda_compat.sh
dockerd_entrypoint:
source: docker/build_artifacts/dockerd_entrypoint.sh
target: dockerd_entrypoint.sh
changehostname:
source: docker/build_artifacts/changehostname.c
target: changehostname.c
start_with_right_hostname:
source: docker/build_artifacts/start_with_right_hostname.sh
target: start_with_right_hostname.sh
example_mnist_file:
source: docker/build_artifacts/mnist.py
target: mnist.py
deep_learning_container:
source: ../../src/deep_learning_container.py
target: deep_learning_container.py
setup_oss_compliance:
source: ../../scripts/setup_oss_compliance.sh
target: setup_oss_compliance.sh

images:
BuildEC2CPUPTTrainPy3DockerImage:
<<: *TRAINING_REPOSITORY
build: &PYTORCH_CPU_TRAINING_PY3 false
image_size_baseline: 7200
device_type: &DEVICE_TYPE cpu
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py313
os_version: &OS_VERSION ubuntu22.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
# skip_build: "False"
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
target: ec2
context:
<<: *TRAINING_CONTEXT
BuildEC2GPUPTTrainPy3cu130DockerImage:
<<: *TRAINING_REPOSITORY
build: &PYTORCH_GPU_TRAINING_PY3 false
image_size_baseline: 28000
device_type: &DEVICE_TYPE gpu
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py313
cuda_version: &CUDA_VERSION cu130
os_version: &OS_VERSION ubuntu22.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
# skip_build: "False"
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
*DEVICE_TYPE ]
target: ec2
context:
<<: *TRAINING_CONTEXT
75 changes: 75 additions & 0 deletions pytorch/training/buildspec-2-10-sm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK pytorch
version: &VERSION 2.10.0
short_version: &SHORT_VERSION "2.10"
arch_type: x86
# autopatch_build: "True"

repository_info:
training_repository: &TRAINING_REPOSITORY
image_type: &TRAINING_IMAGE_TYPE training
root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

context:
training_context: &TRAINING_CONTEXT
start_cuda_compat:
source: docker/build_artifacts/start_cuda_compat.sh
target: start_cuda_compat.sh
dockerd_entrypoint:
source: docker/build_artifacts/dockerd_entrypoint.sh
target: dockerd_entrypoint.sh
changehostname:
source: docker/build_artifacts/changehostname.c
target: changehostname.c
start_with_right_hostname:
source: docker/build_artifacts/start_with_right_hostname.sh
target: start_with_right_hostname.sh
example_mnist_file:
source: docker/build_artifacts/mnist.py
target: mnist.py
deep_learning_container:
source: ../../src/deep_learning_container.py
target: deep_learning_container.py
setup_oss_compliance:
source: ../../scripts/setup_oss_compliance.sh
target: setup_oss_compliance.sh

images:
BuildSageMakerCPUPTTrainPy3DockerImage:
<<: *TRAINING_REPOSITORY
build: &PYTORCH_CPU_TRAINING_PY3 false
image_size_baseline: 7200
device_type: &DEVICE_TYPE cpu
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py313
os_version: &OS_VERSION ubuntu22.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
# skip_build: "False"
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
target: sagemaker
context:
<<: *TRAINING_CONTEXT
BuildSageMakerGPUPTTrainPy3DockerImage:
<<: *TRAINING_REPOSITORY
build: &PYTORCH_GPU_TRAINING_PY3 false
image_size_baseline: 28000
device_type: &DEVICE_TYPE gpu
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py313
cuda_version: &CUDA_VERSION cu130
os_version: &OS_VERSION ubuntu22.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
# skip_build: "False"
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
*DEVICE_TYPE ]
target: sagemaker
context:
<<: *TRAINING_CONTEXT
Loading