-
Notifications
You must be signed in to change notification settings - Fork 100
Expand file tree
/
Copy pathnccl-test.yml
More file actions
80 lines (72 loc) · 4.48 KB
/
nccl-test.yml
File metadata and controls
80 lines (72 loc) · 4.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# .azure-pipelines/templates/nccl-test.yml
# ----------------------------------------
# A step‐template that runs the entire MSCCLPP→NCCL test suite on one pool/container.
#
# Parameters:
# subscription – Azure subscription to use for VMSS start/stop
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: gpuArch
type: string
default: '80'
- name: nvccGencode
type: string
default: "-gencode=arch=compute_80,code=sm_80"
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'nccltest-single-node true cuda'
- template: run-remote-task.yml
parameters:
name: InstallNcclTests
displayName: Install NCCL Tests
remoteScript: |
cd
git clone https://github.com/NVIDIA/nccl-tests.git
cd nccl-tests
MPI=1 MPI_HOME=/usr/local/mpi make -j
- template: run-remote-task.yml
parameters:
name: InstallNccl
displayName: Install NCCL
remoteScript: |
LATEST_TAG=$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\" -f4)
if [ -z "$LATEST_TAG" ]; then
echo "Failed to fetch latest NCCL tag"
exit 1
fi
cd
git clone --branch $LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl
make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}
- template: run-remote-task.yml
parameters:
name: RunNcclAllGatherFallbaclkToNcclTest
displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
remoteScript: |
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: run-remote-task.yml
parameters:
name: RunNcclAllReduceFallbaclkToNcclTest
displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
remoteScript: |
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: run-remote-task.yml
parameters:
name: RunNcclBroadcastFallbaclkToNcclTest
displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
remoteScript: |
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}