Skip to content

Commit 29d5beb

Browse files
EmpyreusCopilotCopilot
authored
Adding Support for SGLang CI Tests (#800)
Adds Azure DevOps pipelines, templates, and supporting scripts to run SGLang end-to-end and benchmark tests against MSCCL++ on H100 GPU nodes, plus the Docker image and small infrastructure tweaks needed to make those pipelines runnable. --------- Co-authored-by: Copilot <copilot@github.com> Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
1 parent 379d0e5 commit 29d5beb

14 files changed

Lines changed: 471 additions & 42 deletions

.azure-pipelines/integration-test.yml

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@ pr:
1919
drafts: false
2020
paths:
2121
exclude:
22-
- .devcontainer/**
23-
- .github/**
24-
- docker/**
25-
- docs/**
26-
- '**/*.md'
22+
- .devcontainer/**
23+
- .github/**
24+
- docker/**
25+
- docs/**
26+
- '**/*.md'
2727

2828
jobs:
2929
- job: IntegrationTestA100
@@ -43,9 +43,9 @@ jobs:
4343
steps:
4444
- template: templates/integration-test.yml
4545
parameters:
46-
subscription: mscclpp-ci
47-
vmssName: mscclpp-ci
48-
gpuArch: '80'
46+
subscription: mscclpp-ci
47+
vmssName: mscclpp-ci
48+
gpuArch: '80'
4949

5050
- job: IntegrationTestH100
5151
displayName: Integration test H100
@@ -62,7 +62,7 @@ jobs:
6262
steps:
6363
- template: templates/integration-test.yml
6464
parameters:
65-
subscription: mscclpp-ci-h100
66-
vmssName: mscclpp-h100-ci
65+
subscription: mscclpp-ci-h100
66+
vmssName: mscclpp-h100-ci
6767
perfBaselineFile: test/deploy/perf_ndmv5.jsonl
68-
gpuArch: '90'
68+
gpuArch: '90'

.azure-pipelines/multi-nodes-test.yml

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ trigger:
1414
# Do not run multi-nodes-test for PR, we can trigger it manually
1515
pr: none
1616

17-
1817
parameters:
1918
- name: vmssName
2019
type: string
@@ -79,10 +78,10 @@ jobs:
7978
8079
- template: templates/deploy.yml
8180
parameters:
82-
subscription: mscclpp-ci-h100
83-
vmssName: ${{ parameters.vmssName }}
81+
subscription: mscclpp-ci-h100
82+
vmssName: ${{ parameters.vmssName }}
8483
resourceGroup: mscclpp
85-
gpuArch: '90'
84+
gpuArch: '90'
8685

8786
- template: templates/run-remote-task.yml
8887
parameters:
@@ -119,6 +118,6 @@ jobs:
119118
120119
- template: templates/stop.yml
121120
parameters:
122-
subscription: mscclpp-ci-h100
123-
vmssName: ${{ parameters.vmssName }}
121+
subscription: mscclpp-ci-h100
122+
vmssName: ${{ parameters.vmssName }}
124123
resourceGroup: mscclpp
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
# =============================================================================
2+
# Multi-node SGLang integration test pipeline.
3+
#
4+
# This pipeline runs MSCCL++ SGLang tests across two H100 VMSS GPU nodes.
5+
# High-level flow:
6+
# 1. The pipeline agent runs inside a container on the `mscclpp-multi-node`
7+
# pool. The agent itself has no GPUs.
8+
# 2. SSH/host configuration is generated so the agent can reach the two
9+
# pre-provisioned VMSS GPU nodes.
10+
# 3. `templates/deploy.yml` builds and ships MSCCL++ to the GPU nodes.
11+
# 4. `templates/sglang-multi-test.yml` runs the SGLang multi-node tests.
12+
# 5. `templates/stop.yml` tears down / stops the VMSS nodes.
13+
#
14+
# Docs / non-code changes are excluded from triggering this pipeline.
15+
# =============================================================================
16+
17+
# CI trigger: queue a run for pushes to `main` and `release/*`.
# Paths under `exclude` (dev-container, GitHub metadata, docker, docs and
# Markdown files) are left out of the trigger's path filter.
trigger:
  paths:
    exclude:
    - '.devcontainer/**'
    - '.github/**'
    - 'docker/**'
    - 'docs/**'
    - '**/*.md'
  branches:
    include:
    - 'main'
    - 'release/*'
29+
30+
# PR trigger: validate pull requests that target `main` or `release/*`.
# Draft PRs are skipped (`drafts: false`), and the same path exclusions as the
# CI trigger are applied.
pr:
  drafts: false
  paths:
    exclude:
    - '.devcontainer/**'
    - '.github/**'
    - 'docker/**'
    - 'docs/**'
    - '**/*.md'
  branches:
    include:
    - 'main'
    - 'release/*'
43+
44+
parameters:
# vmssName: the pre-provisioned Azure VMSS hosting the GPU test nodes.
# The two node hostnames are formed by appending "000000" and "000001"
# to this value.
- name: vmssName
  type: string
  default: 'mscclpp-h100-multinode-ci'

# hostEntries: static "<private-ip> <hostname>" lines that are appended to
# /etc/hosts on the pipeline agent. The addresses belong to the VMSS named
# above, so change both parameters together when the VMSS is reprovisioned
# or renamed.
- name: hostEntries
  type: string
  default: |
    10.0.0.5 mscclpp-h100-multinode-ci000000
    10.0.0.4 mscclpp-h100-multinode-ci000001

# sglangImage: Docker image for the SGLang test container on the GPU nodes.
- name: sglangImage
  type: string
  default: 'lmsysorg/sglang:latest-cu129'
62+
63+
jobs:
# Single job that prepares host/SSH configuration on the agent, deploys
# MSCCL++ to the two VMSS GPU nodes, runs the SGLang multi-node tests and
# finally stops the VMSS.
- job: SGLangTestMultiNode
  displayName: SGLang Test Multi Node
  strategy:
    matrix:
      cuda12:
        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
  pool:
    name: mscclpp-multi-node
  container:
    image: $(containerImage)

  steps:
  # Ensure the VMSS node hostnames resolve from the pipeline agent container.
  # Idempotent: only appends lines that are not already present in /etc/hosts.
  #
  # The parameter is handed to the script through the step environment
  # instead of being spliced into the script text, so quotes, `$` or
  # backticks in a queue-time value cannot be interpreted as shell code.
  - task: Bash@3
    displayName: Add HostEntry
    inputs:
      targetType: 'inline'
      script: |
        # Fail the step if an entry cannot be written, rather than letting
        # later steps fail on an unresolved hostname.
        set -euo pipefail
        while IFS= read -r line; do
          # Skip blank lines (the block-scalar default ends with a newline).
          if [ -z "$line" ]; then
            continue
          fi
          if ! grep -qxF "$line" /etc/hosts; then
            echo "Adding to /etc/hosts: $line"
            # tee's own stdout is discarded; the line is already logged above.
            echo "$line" | sudo tee -a /etc/hosts > /dev/null
          else
            echo "Entry already exists: $line"
          fi
        done <<< "$HOST_ENTRIES"
    env:
      HOST_ENTRIES: ${{ parameters.hostEntries }}

  # Generate the SSH config and hostfile consumed by the deploy / test
  # templates below:
  #   - config   : SSH client config (custom port + key) for each node
  #   - hostfile : user@host list used by deploy / test scripts (parallel-ssh)
  - task: Bash@3
    displayName: Generate deploy files
    inputs:
      targetType: 'inline'
      script: |
        set -euo pipefail
        DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy"
        NODE0="${VMSS_NAME}000000"
        NODE1="${VMSS_NAME}000001"

        # NOTE(review): host-key checking is disabled for both nodes;
        # confirm this is acceptable for the CI network.
        cat > "${DEPLOY_DIR}/config" <<EOF
        Host ${NODE0}
          Port 22345
          IdentityFile /root/mscclpp/sshkey
          StrictHostKeyChecking no
        Host ${NODE1}
          Port 22345
          IdentityFile /root/mscclpp/sshkey
          StrictHostKeyChecking no
        EOF

        printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile"
    env:
      # Passed via the environment for the same reason as HOST_ENTRIES above.
      VMSS_NAME: ${{ parameters.vmssName }}

  # Build MSCCL++ and deploy it onto the VMSS GPU nodes.
  - template: templates/deploy.yml
    parameters:
      subscription: mscclpp-ci-h100
      vmssName: ${{ parameters.vmssName }}
      resourceGroup: mscclpp
      gpuArch: '90'
      deployArgs: 'multi-node-test true cuda'
      containerName: 'sglang-mscclpp-test'
      sglangImage: ${{ parameters.sglangImage }}

  # Run the SGLang multi-node tests across the two GPU nodes.
  - template: templates/sglang-multi-test.yml
    parameters:
      subscription: mscclpp-ci-h100
      vmssName: ${{ parameters.vmssName }}

  # Stop/deallocate the VMSS GPU nodes to release resources.
  # NOTE(review): whether this still runs after a failed test step depends on
  # the step conditions inside templates/stop.yml — confirm.
  - template: templates/stop.yml
    parameters:
      subscription: mscclpp-ci-h100
      vmssName: ${{ parameters.vmssName }}
      resourceGroup: mscclpp

.azure-pipelines/sglang-test.yml

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# =============================================================================
2+
# Single-node SGLang integration test pipeline.
3+
#
4+
# Runs MSCCL++ SGLang tests on a single H100 GPU node from the `msccl-ci-h100`
5+
# pool. All deploy / run / teardown logic is delegated to
6+
# `templates/sglang-test.yml`.
7+
#
8+
# Docs / non-code changes are excluded from triggering this pipeline.
9+
# =============================================================================
10+
11+
# CI trigger: queue a run for pushes to `main` and `release/*`.
# Paths under `exclude` (dev-container, GitHub metadata, docker, docs and
# Markdown files) are left out of the trigger's path filter.
trigger:
  paths:
    exclude:
    - '.devcontainer/**'
    - '.github/**'
    - 'docker/**'
    - 'docs/**'
    - '**/*.md'
  branches:
    include:
    - 'main'
    - 'release/*'
23+
24+
# PR trigger: validate pull requests that target `main` or `release/*`.
# Draft PRs are skipped (`drafts: false`), and the same path exclusions as the
# CI trigger are applied.
pr:
  drafts: false
  paths:
    exclude:
    - '.devcontainer/**'
    - '.github/**'
    - 'docker/**'
    - 'docs/**'
    - '**/*.md'
  branches:
    include:
    - 'main'
    - 'release/*'
37+
38+
parameters:
# sglangImage: Docker image for the SGLang test container on the GPU node.
# It is forwarded unchanged to templates/sglang-test.yml.
- name: sglangImage
  type: string
  default: 'lmsysorg/sglang:latest-cu129'
43+
44+
jobs:
# Single job running in the MSCCL++ CUDA 12.9 dev container on the
# `msccl-ci-h100` agent pool.
- job: SGLangTest
  displayName: 'SGLang Test'
  pool:
    name: 'msccl-ci-h100'
  container:
    image: $(containerImage)
  strategy:
    matrix:
      cuda12:
        containerImage: 'ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9'

  steps:
  # The shared template receives the subscription, VMSS name, GPU
  # architecture and SGLang image; per the file header it handles deploy,
  # test run and teardown.
  - template: templates/sglang-test.yml
    parameters:
      sglangImage: ${{ parameters.sglangImage }}
      gpuArch: '90'
      vmssName: 'mscclpp-h100-ci'
      subscription: 'mscclpp-ci-h100'

.azure-pipelines/templates/deploy.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ parameters:
3232
- name: deployArgs
3333
type: string
3434
default: ''
35+
- name: containerName
36+
type: string
37+
default: 'mscclpp-test'
38+
- name: sglangImage
39+
type: string
40+
default: ''
3541

3642
steps:
3743
# 0. Ensure Azure CLI exists before running AzureCLI@2 tasks.
@@ -147,5 +153,5 @@ steps:
147153
inputs:
148154
targetType: filePath
149155
filePath: test/deploy/deploy.sh
150-
arguments: ${{ parameters.deployArgs }}
156+
arguments: ${{ parameters.deployArgs }} ${{ parameters.containerName }} ${{ parameters.sglangImage }}
151157
workingDirectory: '$(System.DefaultWorkingDirectory)'

.azure-pipelines/templates/integration-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ steps:
1515
subscription: ${{ parameters.subscription }}
1616
vmssName: ${{ parameters.vmssName }}
1717
gpuArch: ${{ parameters.gpuArch }}
18-
deployArgs: 'single-node-test'
18+
deployArgs: 'single-node-test true cuda'
1919

2020
- template: run-remote-task.yml
2121
parameters:

.azure-pipelines/templates/nccl-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ steps:
2323
subscription: ${{ parameters.subscription }}
2424
vmssName: ${{ parameters.vmssName }}
2525
gpuArch: ${{ parameters.gpuArch }}
26-
deployArgs: 'nccltest-single-node'
26+
deployArgs: 'nccltest-single-node true cuda'
2727

2828
- template: run-remote-task.yml
2929
parameters:

0 commit comments

Comments
 (0)