Skip to content

Commit d27e523

Browse files
committed
pt 2.8 training ec2
1 parent a66b896 commit d27e523

File tree

6 files changed

+978
-6
lines changed

6 files changed

+978
-6
lines changed

dlc_developer_config.toml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,12 @@ deep_canary_mode = false
3737
[build]
3838
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
3939
# available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
40-
build_frameworks = []
40+
build_frameworks = ["pytorch"]
4141

4242

4343
# By default we build both training and inference containers. Set true/false values to determine which to build.
4444
build_training = true
45-
build_inference = true
45+
build_inference = false
4646

4747
# Set do_build to "false" to skip builds and test the latest image built by this PR
4848
# Note: at least one build is required to set do_build to "false"
@@ -65,13 +65,13 @@ ecs_tests = true
6565
eks_tests = true
6666
ec2_tests = true
6767
# Set it to true if you are preparing a Benchmark related PR
68-
ec2_benchmark_tests = false
68+
ec2_benchmark_tests = true
6969

7070
### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by
7171
### default. If false, these types of tests will be skipped while other tests will run as usual.
7272
### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true.
7373
### Off by default (set to false)
74-
ec2_tests_on_heavy_instances = false
74+
ec2_tests_on_heavy_instances = true
7575
### SM specific tests
7676
### On by default
7777
sagemaker_local_tests = true
@@ -119,7 +119,7 @@ use_scheduler = false
119119
### TRAINING PR JOBS ###
120120

121121
# Standard Framework Training
122-
dlc-pr-pytorch-training = ""
122+
dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml"
123123
dlc-pr-tensorflow-2-training = ""
124124
dlc-pr-autogluon-training = ""
125125

pytorch/training/buildspec-2-8-ec2.yml

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
2+
prod_account_id: &PROD_ACCOUNT_ID 763104351884
3+
region: &REGION <set-$REGION-in-environment>
4+
framework: &FRAMEWORK pytorch
5+
version: &VERSION 2.8.0
6+
short_version: &SHORT_VERSION "2.8"
7+
arch_type: x86
8+
# autopatch_build: "True"
9+
10+
repository_info:
11+
training_repository: &TRAINING_REPOSITORY
12+
image_type: &TRAINING_IMAGE_TYPE training
13+
root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
14+
repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
15+
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
16+
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
17+
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]
18+
19+
context:
20+
training_context: &TRAINING_CONTEXT
21+
start_cuda_compat:
22+
source: docker/build_artifacts/start_cuda_compat.sh
23+
target: start_cuda_compat.sh
24+
dockerd_entrypoint:
25+
source: docker/build_artifacts/dockerd_entrypoint.sh
26+
target: dockerd_entrypoint.sh
27+
changehostname:
28+
source: docker/build_artifacts/changehostname.c
29+
target: changehostname.c
30+
start_with_right_hostname:
31+
source: docker/build_artifacts/start_with_right_hostname.sh
32+
target: start_with_right_hostname.sh
33+
example_mnist_file:
34+
source: docker/build_artifacts/mnist.py
35+
target: mnist.py
36+
deep_learning_container:
37+
source: ../../src/deep_learning_container.py
38+
target: deep_learning_container.py
39+
40+
images:
41+
BuildEC2CPUPTTrainPy3DockerImage:
42+
<<: *TRAINING_REPOSITORY
43+
build: &PYTORCH_CPU_TRAINING_PY3 false
44+
image_size_baseline: 6500
45+
device_type: &DEVICE_TYPE cpu
46+
python_version: &DOCKER_PYTHON_VERSION py3
47+
tag_python_version: &TAG_PYTHON_VERSION py312
48+
os_version: &OS_VERSION ubuntu22.04
49+
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
50+
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
51+
# skip_build: "False"
52+
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
53+
target: ec2
54+
context:
55+
<<: *TRAINING_CONTEXT
56+
BuildEC2GPUPTTrainPy3cu128DockerImage:
57+
<<: *TRAINING_REPOSITORY
58+
build: &PYTORCH_GPU_TRAINING_PY3 false
59+
image_size_baseline: 24000
60+
device_type: &DEVICE_TYPE gpu
61+
python_version: &DOCKER_PYTHON_VERSION py3
62+
tag_python_version: &TAG_PYTHON_VERSION py312
63+
cuda_version: &CUDA_VERSION cu128
64+
os_version: &OS_VERSION ubuntu22.04
65+
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
66+
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
67+
# skip_build: "False"
68+
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
69+
*DEVICE_TYPE ]
70+
target: ec2
71+
context:
72+
<<: *TRAINING_CONTEXT
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
2+
prod_account_id: &PROD_ACCOUNT_ID 763104351884
3+
region: &REGION <set-$REGION-in-environment>
4+
framework: &FRAMEWORK pytorch
5+
version: &VERSION 2.8.0
6+
short_version: &SHORT_VERSION "2.8"
7+
arch_type: x86
8+
# autopatch_build: "True"
9+
10+
repository_info:
11+
training_repository: &TRAINING_REPOSITORY
12+
image_type: &TRAINING_IMAGE_TYPE training
13+
root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
14+
repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
15+
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
16+
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
17+
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]
18+
19+
context:
20+
training_context: &TRAINING_CONTEXT
21+
start_cuda_compat:
22+
source: docker/build_artifacts/start_cuda_compat.sh
23+
target: start_cuda_compat.sh
24+
dockerd_entrypoint:
25+
source: docker/build_artifacts/dockerd_entrypoint.sh
26+
target: dockerd_entrypoint.sh
27+
changehostname:
28+
source: docker/build_artifacts/changehostname.c
29+
target: changehostname.c
30+
start_with_right_hostname:
31+
source: docker/build_artifacts/start_with_right_hostname.sh
32+
target: start_with_right_hostname.sh
33+
example_mnist_file:
34+
source: docker/build_artifacts/mnist.py
35+
target: mnist.py
36+
deep_learning_container:
37+
source: ../../src/deep_learning_container.py
38+
target: deep_learning_container.py
39+
40+
images:
41+
BuildSageMakerCPUPTTrainPy3DockerImage:
42+
<<: *TRAINING_REPOSITORY
43+
build: &PYTORCH_CPU_TRAINING_PY3 false
44+
image_size_baseline: 6500
45+
device_type: &DEVICE_TYPE cpu
46+
python_version: &DOCKER_PYTHON_VERSION py3
47+
tag_python_version: &TAG_PYTHON_VERSION py312
48+
os_version: &OS_VERSION ubuntu22.04
49+
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
50+
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
51+
# skip_build: "False"
52+
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
53+
target: sagemaker
54+
context:
55+
<<: *TRAINING_CONTEXT
56+
BuildSageMakerGPUPTTrainPy3DockerImage:
57+
<<: *TRAINING_REPOSITORY
58+
build: &PYTORCH_GPU_TRAINING_PY3 false
59+
image_size_baseline: 24000
60+
device_type: &DEVICE_TYPE gpu
61+
python_version: &DOCKER_PYTHON_VERSION py3
62+
tag_python_version: &TAG_PYTHON_VERSION py312
63+
cuda_version: &CUDA_VERSION cu128
64+
os_version: &OS_VERSION ubuntu22.04
65+
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
66+
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
67+
# skip_build: "False"
68+
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
69+
*DEVICE_TYPE ]
70+
target: sagemaker
71+
context:
72+
<<: *TRAINING_CONTEXT

pytorch/training/buildspec.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
buildspec_pointer: buildspec-2-7-sm.yml
1+
buildspec_pointer: buildspec-2-8-ec2.yml

0 commit comments

Comments
 (0)