-
Notifications
You must be signed in to change notification settings - Fork 100
Expand file tree
/
Copy pathmulti-nodes-test.yml
More file actions
132 lines (116 loc) · 4.14 KB
/
multi-nodes-test.yml
File metadata and controls
132 lines (116 loc) · 4.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# CI trigger filters: which branches start this pipeline and which paths are
# ignored when deciding whether a change should start it.
trigger:
  branches:
    include:
      - 'main'
      - 'release/*'
  paths:
    exclude:
      - '.devcontainer/**'
      - '.github/**'
      - 'docker/**'
      - 'docs/**'
      - '**/*.md'

# Do not run multi-nodes-test for PRs; it can be triggered manually.
pr: none
parameters:
  # Base name of the VM scale set. The job appends 000000 / 000001 to it to
  # form the two node host names, and passes it to the deploy/stop templates.
  - name: vmssName
    type: string
    default: 'mscclpp-h100-multinode-ci'
  # One "<ip> <hostname>" entry per line; the "Add HostEntry" step appends
  # each line that is not already present to /etc/hosts.
  - name: hostEntries
    type: string
    default: |
      10.0.0.5 mscclpp-h100-multinode-ci000000
      10.0.0.4 mscclpp-h100-multinode-ci000001
jobs:
  # Single job: prepares host/SSH files for the two nodes, invokes the deploy
  # template, runs the remote test suites, then invokes the stop template.
  - job: MultiNodesTest
    displayName: Multi nodes test
    strategy:
      matrix:
        cuda12:
          containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
    pool:
      name: mscclpp-multi-node
    container:
      # Runtime expression: resolves to the matrix entry's containerImage.
      image: $[ variables['containerImage'] ]
    steps:
      # Make the node host names resolvable by appending any missing
      # hostEntries lines to /etc/hosts.
      - task: Bash@3
        displayName: Add HostEntry
        inputs:
          targetType: 'inline'
          script: |
            # Abort on the first failed command (as the deploy-file step does)
            # so a failed write to /etc/hosts is not masked by a later,
            # successful loop iteration.
            set -e
            while IFS= read -r line; do
              # Skip blank lines (e.g. the trailing one from the block scalar).
              [ -z "$line" ] && continue
              # -x -F: only an exact, whole-line literal match counts as present.
              if ! grep -qxF "$line" /etc/hosts; then
                echo "Adding to /etc/hosts: $line"
                echo "$line" | sudo tee -a /etc/hosts
              else
                echo "Entry already exists: $line"
              fi
            done <<< "${{ parameters.hostEntries }}"
      # Write the SSH client config and the two host files into test/deploy.
      - task: Bash@3
        displayName: Generate deploy files
        inputs:
          targetType: 'inline'
          script: |
            set -e
            VMSS="${{ parameters.vmssName }}"
            DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy"
            # Node host names are the scale-set name plus a 6-digit suffix.
            NODE0="${VMSS}000000"
            NODE1="${VMSS}000001"
            # SSH client config: one Host stanza per node.
            echo "Host ${NODE0}
            Port 22345
            IdentityFile /root/mscclpp/sshkey
            StrictHostKeyChecking no
            Host ${NODE1}
            Port 22345
            IdentityFile /root/mscclpp/sshkey
            StrictHostKeyChecking no" > "${DEPLOY_DIR}/config"
            # hostfile: user@host per line; hostfile_mpi: bare host names.
            printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile"
            printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi"
      # NOTE(review): presumably starts the VMSS and deploys the build to the
      # nodes - confirm against templates/deploy.yml.
      - template: templates/deploy.yml
        parameters:
          subscription: mscclpp-ci-h100
          vmssName: ${{ parameters.vmssName }}
          resourceGroup: mscclpp
          gpuArch: '90'
      # continueOnError is forwarded to the template; presumably it lets the
      # following steps run even if this suite fails - confirm in
      # templates/run-remote-task.yml.
      - template: templates/run-remote-task.yml
        parameters:
          name: RunMscclppTest
          displayName: Run multi-nodes mscclpp-test
          continueOnError: true
          runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
          remoteScript: |
            bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
      # The three suites below are currently disabled.
      # - template: templates/run-remote-task.yml
      #   parameters:
      #     name: RunMultiNodeUnitTest
      #     displayName: Run multi-nodes unit tests
      #     runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
      #     remoteScript: |
      #       bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
      # - template: templates/run-remote-task.yml
      #   parameters:
      #     name: RunMultiNodePythonTests
      #     displayName: Run multi-nodes python tests
      #     runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
      #     remoteScript: |
      #       bash /root/mscclpp/test/deploy/run_tests.sh pytests
      # - template: templates/run-remote-task.yml
      #   parameters:
      #     name: RunMultiNodePythonBenchmark
      #     displayName: Run multi-nodes python benchmark
      #     runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
      #     remoteScript: |
      #       bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
      - template: templates/run-remote-task.yml
        parameters:
          name: RunMultiNodeExecutorTests
          displayName: Run multi-nodes executor tests
          runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
          remoteScript: |
            bash /root/mscclpp/test/deploy/run_tests.sh executor-tests
      # NOTE(review): presumably stops/deallocates the VMSS - confirm against
      # templates/stop.yml, including whether it runs after a failed step.
      - template: templates/stop.yml
        parameters:
          subscription: mscclpp-ci-h100
          vmssName: ${{ parameters.vmssName }}
          resourceGroup: mscclpp