-
Notifications
You must be signed in to change notification settings - Fork 100
Expand file tree
/
Copy pathsglang-multi-test.yml
More file actions
104 lines (97 loc) · 4.45 KB
/
sglang-multi-test.yml
File metadata and controls
104 lines (97 loc) · 4.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# =============================================================================
# SGLang multi-node test template.
#
# Runs on the pipeline agent and dispatches remote steps to the two VMSS GPU
# nodes (via run-remote-task.yml + the SSH config / hostfile produced by the
# caller pipeline). No step in this template builds or installs MSCCL++; it
# is presumably provided by an earlier step of the caller pipeline or by the
# container image (TODO: confirm). Steps in this template:
#   1. Install a (currently forked) SGLang on each node, replacing any
#      pre-baked copy from the base image.
#   2. Run a 2-node sglang.bench_one_batch smoke test with MSCCL++ enabled.
#   3. Run the MSCCL++ all-reduce micro-benchmark via torchrun across both
#      nodes.
# =============================================================================
parameters:
# Not referenced by any step in this template; kept so that callers which
# already pass it keep working.
- name: subscription
  type: string
# VMSS name. The remote scripts append 000000 / 000001 to it to form the
# hostnames of the two nodes.
- name: vmssName
  type: string
# Container name forwarded to run-remote-task.yml via `--container`.
- name: containerName
  type: string
  default: sglang-mscclpp-test
steps:
# TODO: Point this at the official upstream sglang repo once Caio's PR is
# merged. The fork used below (`caiomcbr/sglang` @ caiorocha/mscclpp) is a
# personal branch and should not remain a long-term CI dependency.
- template: run-remote-task.yml
  parameters:
    name: InstallSGLang
    displayName: Install SGLang
    # Folded scalar: the three lines below are joined with single spaces into
    # one argument string.
    runRemoteArgs: >-
      --container ${{ parameters.containerName }}
      --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile
      --user azureuser
    remoteScript: |
      # Every node must import the same sglang build. Drop the copy baked
      # into the container image first; otherwise rank 0 imports
      # /sgl-workspace/sglang while rank 1 imports the fork, which causes
      # version mismatch and NCCL/CUDA errors.
      pip uninstall --yes sglang sglang-router || true
      rm -rf /sgl-workspace/sglang || true
      # Start from a clean checkout of the fork and install it in editable
      # mode from its python/ subdirectory.
      rm -rf sglang
      git clone --branch caiorocha/mscclpp https://github.com/caiomcbr/sglang.git
      cd sglang
      pip install --editable python
      # Sanity check: the assert fails if sglang still resolves to a path
      # under /sgl-workspace (i.e. the stock copy) on this node.
      python -c "import sglang, os; p=os.path.dirname(sglang.__file__); print('sglang from:', p); assert '/sgl-workspace' not in p, 'stock sglang still active'"
# Smoke test: 2-node tensor-parallel benchmark of Qwen3-8B with MSCCL++
# enabled. SGLang's distributed-init rendezvous uses port 20003; the number is
# arbitrary but must match across ranks and be free on node 0.
- template: run-remote-task.yml
  parameters:
    name: RunSGLangMultiBenchOneBatch
    displayName: Run SGLang Multi-Node Bench One Batch
    # Folded scalar: the three lines below are joined with single spaces into
    # one argument string.
    runRemoteArgs: >-
      --container ${{ parameters.containerName }}
      --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile
      --user azureuser
    remoteScript: |
      export FLASHINFER_DISABLE_VERSION_CHECK=1
      VMSS="${{ parameters.vmssName }}"
      HOSTNAME=$(hostname)
      # Explicit 2-node mapping from hostname to SGLang node rank:
      # <vmssName>000000 is rank 0 (and the rendezvous address below),
      # <vmssName>000001 is rank 1. Any other host aborts the step.
      case "$HOSTNAME" in
        "${VMSS}000000")
          NODE_RANK=0
          ;;
        "${VMSS}000001")
          NODE_RANK=1
          ;;
        *)
          echo "Unknown hostname: $HOSTNAME"
          exit 1
          ;;
      esac
      # --tp-size 16 spans both nodes (--nnodes 2); every rank must be given
      # the same --dist-init-addr.
      python -m sglang.bench_one_batch \
        --model-path Qwen/Qwen3-8B \
        --batch 1 2 4 8 16 32 64 128 256 512 \
        --input-len 256 \
        --output-len 256 \
        --tp-size 16 \
        --dist-init-addr ${VMSS}000000:20003 \
        --nnodes 2 \
        --node-rank $NODE_RANK \
        --enable-mscclpp
# MSCCL++ all-reduce micro-benchmark launched with torchrun on both nodes.
# Depends on the `sglang/` source tree cloned by the InstallSGLang step
# (steps on the same remote share a working directory).
- template: run-remote-task.yml
  parameters:
    name: RunSGLangMultiTestAllReduce
    displayName: Run SGLang Multi-Node Test All Reduce
    # Folded scalar: the three lines below are joined with single spaces into
    # one argument string.
    runRemoteArgs: >-
      --container ${{ parameters.containerName }}
      --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile
      --user azureuser
    remoteScript: |
      export FLASHINFER_DISABLE_VERSION_CHECK=1
      VMSS="${{ parameters.vmssName }}"
      HOSTNAME=$(hostname)
      # Explicit 2-node mapping: hostname suffix -> torchrun node rank.
      if [ "$HOSTNAME" = "${VMSS}000000" ]; then
        NODE_RANK=0
      elif [ "$HOSTNAME" = "${VMSS}000001" ]; then
        NODE_RANK=1
      else
        echo "Unknown hostname: $HOSTNAME"
        exit 1
      fi
      # Number of nodes in the job. Still exported, as before, in case the
      # benchmark script reads it from the environment (TODO: confirm).
      export NODE_SIZE=2
      # Worker processes per node. This was previously exported as
      # WORLD_SIZE, but torchrun itself sets WORLD_SIZE for every worker to
      # the total number of workers in the job (here 2 nodes x 8 = 16), so an
      # exported WORLD_SIZE=8 was misleading and overwritten in the workers.
      NPROC_PER_NODE=8
      cd sglang
      # Port 20004 is the torchrun rendezvous port (arbitrary, must match
      # across ranks and be free on node 0). Distinct from 20003 used by
      # the sglang.bench_one_batch step.
      torchrun --nproc_per_node "$NPROC_PER_NODE" \
        --nnodes "$NODE_SIZE" \
        --node_rank "$NODE_RANK" \
        --master_addr "${VMSS}000000" \
        --master_port 20004 \
        benchmark/kernels/all_reduce/benchmark_mscclpp.py