Skip to content

Commit 6ac1b7e

Browse files
Yadan-Wei and Yadan Wei authored
Check tf218 training efa test (#4504)
* check tf218 efa test * enable efa tests * change AWS Libfabric grep and exit message if error exists * match EFA log change for p4d and p5 * add --fix-missing to fix cpu test * revert toml * revert toml --------- Co-authored-by: Yadan Wei <yadanwei@amazon.com>
1 parent 19410ab commit 6ac1b7e

File tree

3 files changed

+6
-4
lines changed

3 files changed

+6
-4
lines changed

test/dlc_tests/container_tests/bin/efa/testEFA

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,12 @@ fi
2929
validate_all_reduce_performance_logs(){
3030
grep "aws-ofi-nccl" ${TRAINING_LOG} || { echo "aws-ofi-nccl is not working, please check if it is installed correctly"; exit 1; }
3131
grep "NET/OFI Selected Provider is efa" ${TRAINING_LOG} || { echo "efa is not working, please check if it is installed correctly"; exit 1; }
32-
grep "Using network AWS Libfabric" ${TRAINING_LOG}
32+
# EFA 1.37.0 logs "Using network Libfabric" instead of "Using network AWS Libfabric"
33+
grep -E "Using network (AWS )?Libfabric" ${TRAINING_LOG} || { echo "efa is not working, please check if it is installed correctly"; exit 1; }
3334
if [[ ${INSTANCE_TYPE} == p4d* || ${INSTANCE_TYPE} == p5* ]]; then
3435
grep "Setting NCCL_TOPO_FILE environment variable to" ${TRAINING_LOG}
35-
grep "NET/AWS Libfabric/0/GDRDMA" ${TRAINING_LOG}
36+
# EFA 1.37.0 changed the log line from NET/AWS Libfabric/0/GDRDMA to NET/Libfabric/0/GDRDMA
37+
grep -E "NET/(AWS )?Libfabric/0/GDRDMA" ${TRAINING_LOG}
3638
fi
3739
}
3840

test/dlc_tests/ec2/test_efa.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def test_pytorch_efa(
103103
@pytest.mark.multinode(2)
104104
@pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION)
105105
@pytest.mark.skipif(
106-
is_pr_context() and not is_efa_dedicated(),
106+
is_pr_context() and not is_efa_dedicated() and not are_heavy_instance_ec2_tests_enabled(),
107107
reason="Skip EFA test in PR context unless explicitly enabled",
108108
)
109109
@pytest.mark.team("frameworks")

test/test_utils/ec2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1666,7 +1666,7 @@ def install_python_in_instance(context, python_version="3.9"):
16661666
(
16671667
"sudo apt install -y make build-essential libssl-dev zlib1g-dev "
16681668
"libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm "
1669-
"libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev"
1669+
"libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev --fix-missing"
16701670
),
16711671
hide=True,
16721672
)

0 commit comments

Comments
 (0)