Skip to content

Commit fa82f11

Browse files
committed
update comments and rebuild ec2 image
1 parent 94f3e4e commit fa82f11

File tree

5 files changed

+21
-20
lines changed

5 files changed

+21
-20
lines changed

dlc_developer_config.toml

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,12 @@ build_frameworks = ["pytorch"]
4141

4242

4343
# By default we build both training and inference containers. Set true/false values to determine which to build.
44-
build_training = true
44+
build_training = true
4545
build_inference = false
4646

4747
# Set do_build to "false" to skip builds and test the latest image built by this PR
4848
# Note: at least one build is required to set do_build to "false"
49-
do_build = true
49+
do_build = true
5050

5151
[notify]
5252
### Notify on test failures
@@ -57,24 +57,24 @@ notify_test_failures = false
5757

5858
[test]
5959
### On by default
60-
sanity_tests = true
61-
security_tests = true
62-
safety_check_test = true
63-
ecr_scan_allowlist_feature = true
64-
ecs_tests = true
65-
eks_tests = true
66-
ec2_tests = true
60+
sanity_tests = true
61+
security_tests = true
62+
safety_check_test = true
63+
ecr_scan_allowlist_feature = true
64+
ecs_tests = true
65+
eks_tests = true
66+
ec2_tests = true
6767
# Set ec2_benchmark_tests to true if you are preparing a benchmark-related PR
68-
ec2_benchmark_tests = true
68+
ec2_benchmark_tests = true
6969

7070
### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by
7171
### default. If false, these types of tests will be skipped while other tests will run as usual.
7272
### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true.
7373
### Off by default (set to false)
74-
ec2_tests_on_heavy_instances = true
74+
ec2_tests_on_heavy_instances = true
7575
### SM specific tests
7676
### On by default
77-
sagemaker_local_tests = true
77+
sagemaker_local_tests = true
7878
### Set enable_ipv6 = true to run tests with IPv6-enabled resources
7979
### Off by default (set to false)
8080
enable_ipv6 = false
@@ -92,13 +92,13 @@ enable_ipv6 = false
9292
ipv6_vpc_name = ""
9393

9494
# run standard sagemaker remote tests from test/sagemaker_tests
95-
sagemaker_remote_tests = true
95+
sagemaker_remote_tests = true
9696
# run efa sagemaker tests
97-
sagemaker_efa_tests = false
97+
sagemaker_efa_tests = true
9898
# run release_candidate_integration tests
99-
sagemaker_rc_tests = true
99+
sagemaker_rc_tests = true
100100
# run sagemaker benchmark tests
101-
sagemaker_benchmark_tests = true
101+
sagemaker_benchmark_tests = true
102102

103103
# SM remote EFA test instance type
104104
sagemaker_remote_efa_instance_type = ""
@@ -119,7 +119,7 @@ use_scheduler = false
119119
### TRAINING PR JOBS ###
120120

121121
# Standard Framework Training
122-
dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml"
122+
dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml"
123123
dlc-pr-tensorflow-2-training = ""
124124
dlc-pr-autogluon-training = ""
125125

pytorch/training/buildspec.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
buildspec_pointer: buildspec-2-8-sm.yml
1+
buildspec_pointer: buildspec-2-8-ec2.yml

pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ RUN apt-get update \
8383
check \
8484
llvm \
8585
xz-utils \
86+
libsqlite3-dev \
8687
&& rm -rf /var/lib/apt/lists/* \
8788
&& apt-get clean
8889

test/dlc_tests/container_tests/bin/efa/testEFA

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ check_efa_nccl_all_reduce(){
8989

9090
RETURN_VAL=${PIPESTATUS[0]}
9191
# If you would like to see the logs, uncomment the line below
92-
RESULT=$(cat ${TRAINING_LOG})
92+
# RESULT=$(cat ${TRAINING_LOG})
9393

9494
if [ ${RETURN_VAL} -eq 0 ]; then
9595
echo "***************************** check_efa_nccl_all_reduce passed *****************************"

test/dlc_tests/ec2/pytorch/training/common_cases.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,7 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region):
357357
"""
358358
Test cuDNN Package
359359
PT 2.1 reintroduces a dependency on cuDNN to support NVIDIA TransformerEngine. This test ensures that the torch cuDNN matches the system cuDNN in the container.
360-
Checks both /usr/include/ and /usr/include/x86_64-linux-gnu/ paths to support different cuDNN package installations.
360+
Checks both /usr/include/ and /usr/local/cuda/include/ paths to support different cuDNN package installations.
361361
"""
362362
container_name = "pytorch_cudnn"
363363
account_id = get_account_id_from_image_uri(pytorch_training)

0 commit comments

Comments
 (0)