File tree Expand file tree Collapse file tree 3 files changed +6
-4
lines changed
Expand file tree Collapse file tree 3 files changed +6
-4
lines changed Original file line number Diff line number Diff line change 2929validate_all_reduce_performance_logs (){
3030 grep " aws-ofi-nccl" ${TRAINING_LOG} || { echo " aws-ofi-nccl is not working, please check if it is installed correctly" ; exit 1; }
3131 grep " NET/OFI Selected Provider is efa" ${TRAINING_LOG} || { echo " efa is not working, please check if it is installed correctly" ; exit 1; }
32- grep " Using network AWS Libfabric" ${TRAINING_LOG}
32+ # EFA 1.37.0 logs "Using network Libfabric" instead of "Using network AWS Libfabric", so accept both forms
33+ grep -E " Using network (AWS )?Libfabric" ${TRAINING_LOG} || { echo " efa is not working, please check if it is installed correctly" ; exit 1; }
3334 if [[ ${INSTANCE_TYPE} == p4d* || ${INSTANCE_TYPE} == p5* ]]; then
3435 grep " Setting NCCL_TOPO_FILE environment variable to" ${TRAINING_LOG}
35- grep " NET/AWS Libfabric/0/GDRDMA" ${TRAINING_LOG}
36+ # EFA 1.37.0 changed the log line from "NET/AWS Libfabric/0/GDRDMA" to "NET/Libfabric/0/GDRDMA", so accept both forms
37+ grep -E " NET/(AWS )?Libfabric/0/GDRDMA" ${TRAINING_LOG}
3638 fi
3739}
3840
Original file line number Diff line number Diff line change @@ -103,7 +103,7 @@ def test_pytorch_efa(
103103@pytest .mark .multinode (2 )
104104@pytest .mark .parametrize ("ec2_instance_type,region" , EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION )
105105@pytest .mark .skipif (
106- is_pr_context () and not is_efa_dedicated (),
106+ is_pr_context () and not is_efa_dedicated () and not are_heavy_instance_ec2_tests_enabled () ,
107107 reason = "Skip EFA test in PR context unless explicitly enabled" ,
108108)
109109@pytest .mark .team ("frameworks" )
Original file line number Diff line number Diff line change @@ -1666,7 +1666,7 @@ def install_python_in_instance(context, python_version="3.9"):
16661666 (
16671667 "sudo apt install -y make build-essential libssl-dev zlib1g-dev "
16681668 "libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm "
1669- "libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev"
1669+ "libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev --fix-missing "
16701670 ),
16711671 hide = True ,
16721672 )
You can’t perform that action at this time.
0 commit comments