# mscclpp/test/deploy/run_tests.sh (microsoft/mscclpp at commit 19bc5a29ec5f8bc28abb0f7f377732a95b91086b)
# Abort as soon as any command fails without being handled.
set -e

# MPI hostfile; its first line is used as the head node.
HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi
HEAD_HOST=$(head -n 1 "${HOSTFILE}")

# Resolve HEAD_HOST to an IP address on eth0 to ensure bootstrap uses the correct interface.
# The lookup is best-effort: without the trailing `|| HEAD_IP=""`, a failing ssh (for example
# a connection error) would abort the whole script under `set -e`, and the hostname fallback
# below could never run.
# ${HEAD_HOST} is left unquoted, exactly as before, so its word-splitting behaviour is unchanged.
HEAD_IP=$(ssh -o StrictHostKeyChecking=no -p 22345 -i /root/mscclpp/sshkey ${HEAD_HOST} "ip -4 addr show eth0 | grep -oP 'inet \K[0-9.]+' | head -1" 2>/dev/null) || HEAD_IP=""
if [ -z "${HEAD_IP}" ]; then
    # No address could be obtained: fall back to whatever the hostfile lists.
    HEAD_IP=${HEAD_HOST}
fi

# Launcher options shared by every mpirun call in this script. The value is expanded
# unquoted by its users so that it splits into separate arguments.
MPI_ARGS="--allow-run-as-root --bind-to numa -hostfile ${HOSTFILE} -mca btl_tcp_if_include eth0"
# Environment variables handed to mpirun through `-x`; also expanded unquoted by its users.
# NOTE(review): `\$LD_LIBRARY_PATH` is stored as literal text (this shell does not expand it);
# confirm that mpirun's `-x` handling yields the intended library search path.
MSCCLPP_ENV="-x MSCCLPP_DEBUG=WARN -x MSCCLPP_SOCKET_IFNAME=eth0 -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH"

# Select perf baseline based on GPU type.
# GPU 0's name is queried; a name containing "H100" (case-insensitive) selects the ndmv5
# baseline. Anything else, including an empty name when the query produces no output,
# selects the ndmv4 baseline.
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader -i 0 2>/dev/null | head -1)
if echo "${GPU_NAME}" | grep -qi "H100"; then
    PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv5.jsonl
else
    PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv4.jsonl
fi

# Runs the mscclpp-test perf binaries (allgather, allreduce, alltoall) through mpirun with
# 16 ranks, 8 per node, passing the same -o results file to every run, and then checks that
# file against ${PERF_BASELINE} with check_perf_result.py.
# ${MPI_ARGS} and ${MSCCLPP_ENV} are expanded unquoted on purpose: each one holds several
# whitespace-separated mpirun arguments.
function run_mscclpp_test()
{
  local perf_bin_dir=/root/mscclpp/build/bin/mscclpp-test
  local results_file=/root/mscclpp/output.jsonl
  local sweep kernel_id min_bytes max_bytes

  echo "=================Run allgather_test_perf on 2 nodes========================="
  # Each entry reads "<kernel> <smallest size> <largest size>".
  # For kernel 2, the message size must be divisible by 3, hence the 3K..3G sweep.
  for sweep in "0 1K 1G" "2 3K 3G" "3 1K 1G"; do
    read -r kernel_id min_bytes max_bytes <<< "${sweep}"
    mpirun ${MPI_ARGS} -np 16 ${MSCCLPP_ENV} -npernode 8 \
      "${perf_bin_dir}/allgather_test_perf" \
      -b "${min_bytes}" -e "${max_bytes}" -f 2 -k "${kernel_id}" -o "${results_file}"
  done

  echo "==================Run allreduce_test_perf on 2 nodes========================="
  # Same entry layout as above; kernel 2 is only swept up to 1M, kernels 3 and 4 use 3K..3G.
  for sweep in "0 1K 1G" "1 1K 1G" "2 1K 1M" "3 3K 3G" "4 3K 3G"; do
    read -r kernel_id min_bytes max_bytes <<< "${sweep}"
    mpirun ${MPI_ARGS} -np 16 ${MSCCLPP_ENV} -npernode 8 \
      "${perf_bin_dir}/allreduce_test_perf" \
      -b "${min_bytes}" -e "${max_bytes}" -f 2 -k "${kernel_id}" -o "${results_file}"
  done

  echo "==================Run alltoall_test_perf on 2 nodes========================="
  mpirun ${MPI_ARGS} -np 16 ${MSCCLPP_ENV} -npernode 8 \
    "${perf_bin_dir}/alltoall_test_perf" \
    -b 1K -e 1G -f 2 -k 0 -o "${results_file}"

  echo "========================Run performance check==============================="
  python3 /root/mscclpp/test/mscclpp-test/check_perf_result.py \
    --perf-file "${results_file}" --baseline-file ${PERF_BASELINE}
}

# Runs the multi-process unit test binary twice through mpirun: first with 2 ranks
# (1 per node), then with 16 ranks (8 per node). Both runs receive ${HEAD_IP}:20003
# as the -ip_port argument.
function run_mp_ut()
{
  # Pieces shared by both launches. The unquoted expansions are intentional so that
  # multi-word values split into separate arguments.
  local -a launcher=(mpirun ${MPI_ARGS} -tag-output)
  local -a unit_test=(/root/mscclpp/build/bin/mp_unit_tests -ip_port ${HEAD_IP}:20003)

  echo "============Run multi-process unit tests on 2 nodes (np=2, npernode=1)========================="
  "${launcher[@]}" -np 2 ${MSCCLPP_ENV} -npernode 1 "${unit_test[@]}"

  echo "============Run multi-process unit tests on 2 nodes (np=16, npernode=8)========================="
  "${launcher[@]}" -np 16 ${MSCCLPP_ENV} -npernode 8 "${unit_test[@]}"
}

# Launches /root/mscclpp/test/deploy/pytest.sh under mpirun with 16 ranks (8 per node),
# exporting MSCCLPP_HOME=/root/mscclpp in addition to the shared ${MSCCLPP_ENV} settings.
function run_pytests()
{
  local mscclpp_home=/root/mscclpp

  echo "==================Run python tests================================"
  # ${MPI_ARGS} and ${MSCCLPP_ENV} stay unquoted so they split into individual arguments.
  local -a launch_cmd=(
    mpirun ${MPI_ARGS} -tag-output -np 16
    ${MSCCLPP_ENV}
    -x "MSCCLPP_HOME=${mscclpp_home}"
    -npernode 8
    bash "${mscclpp_home}/test/deploy/pytest.sh"
  )
  "${launch_cmd[@]}"
}

# Launches the Python allreduce benchmark under mpirun with 16 ranks (8 per node),
# adding MCA transport options and a set of NCCL/CUDA environment exports on top of the
# shared ${MPI_ARGS} / ${MSCCLPP_ENV} settings.
function run_py_benchmark()
{
  echo "==================Run python benchmark================================"

  # MCA component selection passed to mpirun.
  local -a mca_opts=(-mca pml ob1 -mca btl '^openib')

  # Extra environment exported to every rank, one variable per line.
  # NOTE(review): the topology file path names ndv4 regardless of the detected GPU type;
  # confirm this is intended.
  local -a exported_env=(
    -x NCCL_IB_PCI_RELAXED_ORDERING=1
    -x NCCL_SOCKET_IFNAME=eth0
    -x CUDA_DEVICE_ORDER=PCI_BUS_ID
    -x NCCL_NET_GDR_LEVEL=5
    -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml
    -x NCCL_NET_PLUGIN=none
    -x NCCL_IB_DISABLE=0
    -x NCCL_MIN_NCHANNELS=32
    -x NCCL_DEBUG=WARN
    -x NCCL_P2P_DISABLE=0
    -x NCCL_SHM_DISABLE=0
    -x MSCCLPP_HOME=/root/mscclpp
  )

  # ${MPI_ARGS} and ${MSCCLPP_ENV} stay unquoted so they split into individual arguments.
  mpirun ${MPI_ARGS} -np 16 ${MSCCLPP_ENV} \
    "${mca_opts[@]}" "${exported_env[@]}" \
    -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py
}

# Generates the multi-node execution plans (multi_node_transfer and
# multi_node_transfer_pkt) with the venv Python interpreter, then runs executor_test.py
# on each plan through mpirun with 2 ranks (1 per node), 1M size, in-place buffers.
function run_executor_tests()
{
  echo "==================Run multi-node executor tests======================"
  # Kept local so these helper paths do not leak into the rest of the script.
  local algos_dir=/root/mscclpp/test/executor-tests/algos
  local plans_dir=/root/mscclpp/test/executor-tests/execution-plans
  local test_script=/root/mscclpp/python/test/executor_test.py
  local python_bin=/root/venv/bin/python3
  local plan

  echo "Generating execution plans"
  # The plans are written through shell redirection, which fails when the target
  # directory is missing, so make sure it exists first.
  mkdir -p "${plans_dir}"
  for plan in multi_node_transfer multi_node_transfer_pkt; do
    "${python_bin}" "${algos_dir}/${plan}.py" --name "${plan}" > "${plans_dir}/${plan}.json"
  done

  echo "Running multi-node transfer test with in-place buffers"
  # ${MPI_ARGS} and ${MSCCLPP_ENV} stay unquoted so they split into individual arguments.
  for plan in multi_node_transfer multi_node_transfer_pkt; do
    mpirun ${MPI_ARGS} -np 2 -npernode 1 ${MSCCLPP_ENV} "${python_bin}" "${test_script}" \
      -path "${plans_dir}/${plan}.json" --size 1M --in_place
  done
}

# Entry point: the first command-line argument selects which test suite to run.
# The usage text lists exactly the names accepted by the case statement below.
if [ $# -lt 1 ]; then
  echo "Usage: $0 <mscclpp-test/mp-ut/pytests/py-benchmark/executor-tests>" >&2
  exit 1
fi
test_name=$1
case "${test_name}" in
  mscclpp-test)
    echo "==================Run mscclpp-test on 2 nodes========================="
    run_mscclpp_test
    ;;
  mp-ut)
    echo "==================Run mp-ut on 2 nodes================================"
    run_mp_ut
    ;;
  pytests)
    echo "==================Run python tests===================================="
    run_pytests
    ;;
  py-benchmark)
    echo "==================Run python benchmark================================"
    run_py_benchmark
    ;;
  executor-tests)
    echo "==================Run executor tests================================="
    run_executor_tests
    ;;
  *)
    # Anything else is a caller error; report it on stderr and fail.
    echo "Unknown test name: ${test_name}" >&2
    exit 1
    ;;
esac