Skip to content

Commit 702de29

Browse files
authored
Merge branch 'main' into binyli/rocm
2 parents dcf1371 + 08589bf commit 702de29

170 files changed

Lines changed: 7327 additions & 4865 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.azure-pipelines/integration-test-rocm.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -94,7 +94,7 @@ jobs:
9494
script: |
9595
set -e
9696
export PATH=/usr/local/mpi/bin:$PATH
97-
sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/apps/nccl/libmscclpp_nccl.so" \
97+
sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/lib/libmscclpp_nccl.so" \
9898
-x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
9999
workingDirectory: '$(System.DefaultWorkingDirectory)'
100100

@@ -106,7 +106,7 @@ jobs:
106106
script: |
107107
set -e
108108
export PATH=/usr/local/mpi/bin:$PATH
109-
sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \
109+
sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \
110110
-x ALLREDUCEPKT_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_packet.json \
111111
-x ALLREDUCE_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_sm_mscclpp.json \
112112
-x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \

.azure-pipelines/integration-test.yml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,7 @@ jobs:
4646
subscription: mscclpp-ci
4747
vmssName: mscclpp-ci
4848
sshKeySecureFile: mscclpp.pem
49+
gpuArch: '80'
4950

5051
- job: IntegrationTestH100
5152
displayName: Integration test H100
@@ -66,3 +67,4 @@ jobs:
6667
vmssName: mscclpp-h100-ci
6768
sshKeySecureFile: mscclpp.pem
6869
perfBaselineFile: test/deploy/perf_ndmv5.jsonl
70+
gpuArch: '90'

.azure-pipelines/templates/integration-test.yaml

Lines changed: 25 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,8 @@ parameters:
88
- name: perfBaselineFile
99
type: string
1010
default: 'test/deploy/perf_ndmv4.jsonl'
11-
11+
- name: gpuArch
12+
type: string
1213

1314
steps:
1415
- task: Bash@3
@@ -18,7 +19,7 @@ steps:
1819
targetType: inline
1920
script: |
2021
mkdir build && cd build
21-
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
22+
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
2223
make -j
2324
workingDirectory: '$(System.DefaultWorkingDirectory)'
2425

@@ -73,13 +74,13 @@ steps:
7374
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
7475
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
7576
export PATH=/usr/local/mpi/bin:\$PATH; \
76-
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
77+
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
7778
cd /root/mscclpp; \
7879
set -e; \
79-
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
80-
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
81-
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
82-
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
80+
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
81+
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
82+
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
83+
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
8384
kill $CHILD_PID
8485
workingDirectory: '$(System.DefaultWorkingDirectory)'
8586

@@ -100,9 +101,9 @@ steps:
100101
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
101102
set -e; \
102103
export PATH=/usr/local/mpi/bin:\$PATH; \
103-
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
104+
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
104105
cd /root/mscclpp; \
105-
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
106+
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
106107
kill $CHILD_PID
107108
workingDirectory: '$(System.DefaultWorkingDirectory)'
108109

@@ -123,15 +124,15 @@ steps:
123124
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
124125
set -e; \
125126
export PATH=/usr/local/mpi/bin:\$PATH; \
126-
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
127+
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
127128
cd /root/mscclpp; \
128-
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
129-
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
130-
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
131-
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
132-
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
133-
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
134-
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
129+
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
130+
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
131+
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
132+
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
133+
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
134+
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
135+
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
135136
kill $CHILD_PID
136137
workingDirectory: '$(System.DefaultWorkingDirectory)'
137138

@@ -151,10 +152,10 @@ steps:
151152
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
152153
set -e; \
153154
export PATH=/usr/local/mpi/bin:\$PATH; \
154-
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
155+
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
155156
cd /root/mscclpp; \
156-
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
157-
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
157+
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
158+
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
158159
kill $CHILD_PID
159160
workingDirectory: '$(System.DefaultWorkingDirectory)'
160161

@@ -176,7 +177,7 @@ steps:
176177
set -e; \
177178
cd /root/mscclpp; \
178179
export PATH=/usr/local/mpi/bin:\$PATH; \
179-
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
180+
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
180181
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"'
181182
kill $CHILD_PID
182183
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -199,7 +200,7 @@ steps:
199200
set -e; \
200201
cd /root/mscclpp; \
201202
export PATH=/usr/local/mpi/bin:\$PATH; \
202-
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
203+
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
203204
python3 -m pip install .; \
204205
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"'
205206
kill $CHILD_PID
@@ -222,9 +223,9 @@ steps:
222223
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
223224
set -e; \
224225
export PATH=/usr/local/mpi/bin:\$PATH; \
225-
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
226+
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
226227
cd /root/mscclpp; \
227-
./build/test/perf/fifo_test"'
228+
./build/bin/perf/fifo_test"'
228229
kill $CHILD_PID
229230
workingDirectory: '$(System.DefaultWorkingDirectory)'
230231

0 commit comments

Comments
 (0)