-
Notifications
You must be signed in to change notification settings - Fork 100
Expand file tree
/
Copy pathrun_tests.sh
More file actions
146 lines (127 loc) · 6.13 KB
/
run_tests.sh
File metadata and controls
146 lines (127 loc) · 6.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# Abort the script as soon as any command fails.
set -e

# Hostfile handed to mpirun; its first line is used as the head host.
HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi
HEAD_HOST=$(head -1 "${HOSTFILE}")

# Resolve HEAD_HOST to an IP address on eth0 to ensure bootstrap uses the correct interface.
# The lookup is best effort: without the `||` branch, `set -e` would abort the
# script whenever ssh exits non-zero (for example on a connection failure) and
# the hostname fallback below could never run. Resetting HEAD_IP also discards
# any partial output from a failed lookup.
# ${HEAD_HOST} is left unquoted, exactly as before, so the hostfile line is
# word-split the same way it always was.
HEAD_IP=$(ssh -o StrictHostKeyChecking=no -p 22345 -i /root/mscclpp/sshkey ${HEAD_HOST} "ip -4 addr show eth0 | grep -oP 'inet \K[0-9.]+' | head -1" 2>/dev/null) || HEAD_IP=""
if [ -z "${HEAD_IP}" ]; then
  # No address could be obtained; fall back to the name from the hostfile.
  HEAD_IP=${HEAD_HOST}
fi

# Option strings shared by every mpirun invocation below. They are expanded
# unquoted at the call sites so that each option becomes its own argument.
MPI_ARGS="--allow-run-as-root --bind-to numa -hostfile ${HOSTFILE} -mca btl_tcp_if_include eth0"
# The backslash keeps "$LD_LIBRARY_PATH" literal inside this string instead of
# expanding it here.
# NOTE(review): confirm the launched processes end up with the intended
# library path, since the literal text is what mpirun receives.
MSCCLPP_ENV="-x MSCCLPP_DEBUG=WARN -x MSCCLPP_SOCKET_IFNAME=eth0 -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH"

# Select perf baseline based on GPU type: the name of GPU 0 picks the ndmv5
# baseline when it contains "H100" (case-insensitive), otherwise ndmv4.
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader -i 0 2>/dev/null | head -1)
if echo "${GPU_NAME}" | grep -qi "H100"; then
  PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv5.jsonl
else
  PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv4.jsonl
fi
#######################################
# Runs the allgather, allreduce and alltoall perf binaries through mpirun
# (16 ranks, 8 per node), then runs check_perf_result.py on the results file
# against the selected baseline.
# Globals:
#   MPI_ARGS, MSCCLPP_ENV, PERF_BASELINE (read)
# Outputs:
#   Progress banners on stdout, plus whatever the launched programs print.
#######################################
run_mscclpp_test() {
  local bin_dir=/root/mscclpp/build/bin/mscclpp-test
  local perf_file=/root/mscclpp/output.jsonl
  # Prefix common to every launch. MPI_ARGS and MSCCLPP_ENV are intentionally
  # unquoted so that each option inside them becomes a separate argument.
  local -a launch=(mpirun ${MPI_ARGS} -np 16 ${MSCCLPP_ENV} -npernode 8)

  echo "=================Run allgather_test_perf on 2 nodes========================="
  "${launch[@]}" "${bin_dir}/allgather_test_perf" -b 1K -e 1G -f 2 -k 0 -o "${perf_file}"
  # Kernel 2 requires a message size that is divisible by 3.
  "${launch[@]}" "${bin_dir}/allgather_test_perf" -b 3K -e 3G -f 2 -k 2 -o "${perf_file}"
  "${launch[@]}" "${bin_dir}/allgather_test_perf" -b 1K -e 1G -f 2 -k 3 -o "${perf_file}"

  echo "==================Run allreduce_test_perf on 2 nodes========================="
  "${launch[@]}" "${bin_dir}/allreduce_test_perf" -b 1K -e 1G -f 2 -k 0 -o "${perf_file}"
  "${launch[@]}" "${bin_dir}/allreduce_test_perf" -b 1K -e 1G -f 2 -k 1 -o "${perf_file}"
  "${launch[@]}" "${bin_dir}/allreduce_test_perf" -b 1K -e 1M -f 2 -k 2 -o "${perf_file}"
  "${launch[@]}" "${bin_dir}/allreduce_test_perf" -b 3K -e 3G -f 2 -k 3 -o "${perf_file}"
  "${launch[@]}" "${bin_dir}/allreduce_test_perf" -b 3K -e 3G -f 2 -k 4 -o "${perf_file}"

  echo "==================Run alltoall_test_perf on 2 nodes========================="
  "${launch[@]}" "${bin_dir}/alltoall_test_perf" -b 1K -e 1G -f 2 -k 0 -o "${perf_file}"

  echo "========================Run performance check==============================="
  python3 /root/mscclpp/test/mscclpp-test/check_perf_result.py \
    --perf-file "${perf_file}" \
    --baseline-file ${PERF_BASELINE}
}
#######################################
# Runs the mp_unit_tests binary through mpirun twice: first with 2 ranks
# (1 per node), then with 16 ranks (8 per node). Both runs pass
# "${HEAD_IP}:20003" as the -ip_port argument.
# Globals:
#   MPI_ARGS, MSCCLPP_ENV, HEAD_IP (read)
# Outputs:
#   A banner per run on stdout, plus whatever the launched programs print.
#######################################
run_mp_ut() {
  local ut_bin=/root/mscclpp/build/bin/mp_unit_tests
  local layout total_ranks ranks_per_node

  # Each layout is "<total ranks>:<ranks per node>".
  for layout in 2:1 16:8; do
    total_ranks=${layout%%:*}
    ranks_per_node=${layout##*:}
    echo "============Run multi-process unit tests on 2 nodes (np=${total_ranks}, npernode=${ranks_per_node})========================="
    # MPI_ARGS / MSCCLPP_ENV stay unquoted so their options split into words.
    mpirun ${MPI_ARGS} -tag-output -np "${total_ranks}" \
      ${MSCCLPP_ENV} \
      -npernode "${ranks_per_node}" "${ut_bin}" -ip_port ${HEAD_IP}:20003
  done
}
#######################################
# Launches test/deploy/pytest.sh under bash through mpirun with 16 ranks
# (8 per node), exporting MSCCLPP_HOME=/root/mscclpp to the ranks.
# Globals:
#   MPI_ARGS, MSCCLPP_ENV (read)
# Outputs:
#   A banner on stdout, plus whatever the launched programs print.
#######################################
run_pytests() {
  echo "==================Run python tests================================"
  # MPI_ARGS / MSCCLPP_ENV are intentionally unquoted so that the options they
  # hold are split into separate array elements.
  local -a launch=(
    mpirun ${MPI_ARGS} -tag-output -np 16
    ${MSCCLPP_ENV}
    -x MSCCLPP_HOME=/root/mscclpp -npernode 8
    bash /root/mscclpp/test/deploy/pytest.sh
  )
  "${launch[@]}"
}
#######################################
# Launches python/mscclpp_benchmark/allreduce_bench.py through mpirun with
# 16 ranks (8 per node), passing extra -mca options and exporting a fixed set
# of NCCL / CUDA variables plus MSCCLPP_HOME to the ranks.
# Globals:
#   MPI_ARGS, MSCCLPP_ENV (read)
# Outputs:
#   A banner on stdout, plus whatever the launched programs print.
#######################################
run_py_benchmark() {
  echo "==================Run python benchmark================================"

  # Variables exported to every rank, in the order they are passed to mpirun.
  # NOTE(review): NCCL_TOPO_FILE always points at the ndv4 topology file, even
  # though the perf baseline elsewhere is chosen per GPU type - confirm this is
  # intended on non-ndv4 machines.
  local -a exported=()
  local setting
  for setting in \
    NCCL_IB_PCI_RELAXED_ORDERING=1 \
    NCCL_SOCKET_IFNAME=eth0 \
    CUDA_DEVICE_ORDER=PCI_BUS_ID \
    NCCL_NET_GDR_LEVEL=5 \
    NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml \
    NCCL_NET_PLUGIN=none \
    NCCL_IB_DISABLE=0 \
    NCCL_MIN_NCHANNELS=32 \
    NCCL_DEBUG=WARN \
    NCCL_P2P_DISABLE=0 \
    NCCL_SHM_DISABLE=0 \
    MSCCLPP_HOME=/root/mscclpp; do
    exported+=(-x "${setting}")
  done

  # MPI_ARGS / MSCCLPP_ENV stay unquoted so their options split into words.
  mpirun ${MPI_ARGS} -np 16 \
    ${MSCCLPP_ENV} \
    -mca pml ob1 -mca btl ^openib \
    "${exported[@]}" \
    -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py
}
#######################################
# Generates the multi_node_transfer and multi_node_transfer_pkt execution
# plans (each generator's stdout is redirected into a .json file under
# PLANS_DIR), then runs executor_test.py on each plan through mpirun with
# 2 ranks (1 per node), --size 1M and --in_place.
# Globals:
#   MPI_ARGS, MSCCLPP_ENV (read)
#   ALGOS_DIR, PLANS_DIR, TEST_SCRIPT, PYTHON_BIN (written)
# Outputs:
#   Progress messages on stdout, plus whatever the launched programs print.
#######################################
run_executor_tests() {
  echo "==================Run multi-node executor tests======================"

  # These are assigned without `local`, so they remain set after the function
  # returns.
  ALGOS_DIR=/root/mscclpp/test/executor-tests/algos
  PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans
  TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py
  PYTHON_BIN=/root/venv/bin/python3

  local plan

  echo "Generating execution plans"
  for plan in multi_node_transfer multi_node_transfer_pkt; do
    "${PYTHON_BIN}" "${ALGOS_DIR}/${plan}.py" --name "${plan}" > "${PLANS_DIR}/${plan}.json"
  done

  echo "Running multi-node transfer test with in-place buffers"
  for plan in multi_node_transfer multi_node_transfer_pkt; do
    # MPI_ARGS / MSCCLPP_ENV stay unquoted so their options split into words.
    mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} \
      "${PYTHON_BIN}" "${TEST_SCRIPT}" -path "${PLANS_DIR}/${plan}.json" --size 1M --in_place
  done
}
# Entry point: the first command-line argument selects which test suite to run.
# The names listed in the usage text match the case labels below exactly.
# Exits with status 1 when the argument is missing or not recognised;
# both diagnostics are written to stderr.
if [ $# -lt 1 ]; then
  echo "Usage: $0 <mscclpp-test/mp-ut/pytests/py-benchmark/executor-tests>" >&2
  exit 1
fi

test_name=$1
case "${test_name}" in
  mscclpp-test)
    echo "==================Run mscclpp-test on 2 nodes========================="
    run_mscclpp_test
    ;;
  mp-ut)
    echo "==================Run mp-ut on 2 nodes================================"
    run_mp_ut
    ;;
  pytests)
    echo "==================Run python tests===================================="
    run_pytests
    ;;
  py-benchmark)
    echo "==================Run python benchmark================================"
    run_py_benchmark
    ;;
  executor-tests)
    echo "==================Run executor tests================================="
    run_executor_tests
    ;;
  *)
    echo "Unknown test name: $test_name" >&2
    exit 1
    ;;
esac