Skip to content

Commit 6b0aaf0

Browse files
author
Bhanu Teja Goshikonda
committed
reverted all changes and enabled logs of check_efa_nccl_all_reduce
1 parent 092ab8a commit 6b0aaf0

File tree

3 files changed

+17
-17
lines changed

3 files changed

+17
-17
lines changed

dlc_developer_config.toml

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -37,16 +37,16 @@ deep_canary_mode = false
3737
[build]
3838
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
3939
# available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
40-
build_frameworks = ["tensorflow"]
40+
build_frameworks = []
4141

4242

4343
# By default we build both training and inference containers. Set true/false values to determine which to build.
4444
build_training = true
45-
build_inference = false
45+
build_inference = true
4646

4747
# Set do_build to "false" to skip builds and test the latest image built by this PR
4848
# Note: at least one build is required to set do_build to "false"
49-
do_build = false
49+
do_build = true
5050

5151
[notify]
5252
### Notify on test failures
@@ -57,24 +57,24 @@ notify_test_failures = false
5757

5858
[test]
5959
### On by default
60-
sanity_tests = false
61-
security_tests = false
60+
sanity_tests = true
61+
security_tests = true
6262
safety_check_test = false
6363
ecr_scan_allowlist_feature = false
64-
ecs_tests = false
65-
eks_tests = false
64+
ecs_tests = true
65+
eks_tests = true
6666
ec2_tests = true
6767
# Set to true if you are preparing a benchmark-related PR
68-
ec2_benchmark_tests = true
68+
ec2_benchmark_tests = false
6969

7070
### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by
7171
### default. If false, these types of tests will be skipped while other tests will run as usual.
7272
### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true.
7373
### Off by default (set to false)
74-
ec2_tests_on_heavy_instances = true
74+
ec2_tests_on_heavy_instances = false
7575
### SM specific tests
7676
### On by default
77-
sagemaker_local_tests = false
77+
sagemaker_local_tests = true
7878
### Set enable_ipv6 = true to run tests with IPv6-enabled resources
7979
### Off by default (set to false)
8080
enable_ipv6 = false
@@ -92,13 +92,13 @@ enable_ipv6 = false
9292
ipv6_vpc_name = ""
9393

9494
# run standard sagemaker remote tests from test/sagemaker_tests
95-
sagemaker_remote_tests = false
95+
sagemaker_remote_tests = true
9696
# run efa sagemaker tests
97-
sagemaker_efa_tests = true
97+
sagemaker_efa_tests = false
9898
# run release_candidate_integration tests
99-
sagemaker_rc_tests = true
99+
sagemaker_rc_tests = false
100100
# run sagemaker benchmark tests
101-
sagemaker_benchmark_tests = true
101+
sagemaker_benchmark_tests = false
102102

103103
# SM remote EFA test instance type
104104
sagemaker_remote_efa_instance_type = ""
@@ -123,7 +123,7 @@ dlc-pr-base = ""
123123

124124
# Standard Framework Training
125125
dlc-pr-pytorch-training = ""
126-
dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-sm.yml"
126+
dlc-pr-tensorflow-2-training = ""
127127
dlc-pr-autogluon-training = ""
128128

129129
# ARM64 Training

tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ RUN mkdir /tmp/efa-ofi-nccl \
237237
&& ./configure --with-libfabric=/opt/amazon/efa \
238238
--with-mpi=/opt/amazon/openmpi \
239239
--with-cuda=/usr/local/cuda \
240-
--with-nccl=/usr \
240+
--with-nccl=/usr/local \
241241
--prefix=/usr/local \
242242
--disable-tests \
243243
&& make -j $(nproc) \

test/dlc_tests/container_tests/bin/efa/testEFA

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ check_efa_nccl_all_reduce(){
8989

9090
RETURN_VAL=${PIPESTATUS[0]}
9191
# If you would like to see the logs, uncomment the line below
92-
# RESULT=$(cat ${TRAINING_LOG})
92+
RESULT=$(cat ${TRAINING_LOG})
9393

9494
if [ ${RETURN_VAL} -eq 0 ]; then
9595
echo "***************************** check_efa_nccl_all_reduce passed *****************************"

0 commit comments

Comments
 (0)