# Source: .azure-pipelines/templates/sglang-multi-test.yml (microsoft/mscclpp @ 2a44522cf6f8ddee404ef102a83226698bd1c0f1)
# =============================================================================
# SGLang multi-node test template.
#
# Runs on the pipeline agent and dispatches remote steps to the two VMSS GPU
# nodes (via run-remote-task.yml + the SSH config / hostfile produced by the
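# caller pipeline). Steps in this template:
#   1. Install a (currently forked) SGLang on each node, replacing any
#      pre-baked copy from the base image.
#   2. Run a 2-node sglang.bench_one_batch smoke test with MSCCL++ enabled.
#   3. Run the MSCCL++ all-reduce micro-benchmark via torchrun across both
#      nodes.
# NOTE(review): building and installing MSCCL++ on each node is not performed
# by any step below; it is presumably a prerequisite handled by the caller
# pipeline before this template runs - confirm.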
# =============================================================================

# Template inputs supplied by the caller pipeline.
parameters:
# Not referenced by any step visible in this template.
- name: subscription
  type: string
# VMSS name; the remote scripts build node hostnames from it as
# <vmssName>000000 (rank 0) and <vmssName>000001 (rank 1).
- name: vmssName
  type: string
# Container name handed to the remote runner through `--container`.
- name: containerName
  type: string
  default: sglang-mscclpp-test

steps:
# TODO: Switch to the official upstream sglang repo once Caio's PR is merged.
# Tracking: the fork below (`caiomcbr/sglang` @ caiorocha/mscclpp) is a personal
# branch and should not remain a long-term CI dependency.
- template: run-remote-task.yml
  parameters:
    name: InstallSGLang
    displayName: Install SGLang
    runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
    remoteScript: |
      # Fail fast: without this, a failed `git clone` or `cd sglang` would
      # still let `pip install -e "python"` run from the wrong directory.
      # Best-effort cleanup commands below stay guarded with `|| true`.
      set -e
      # Remove any pre-baked sglang from the container image so all nodes
      # use the freshly cloned fork (otherwise rank 0 imports
      # /sgl-workspace/sglang while rank 1 imports our fork, causing
      # version mismatch and NCCL/CUDA errors).
      # `python -m pip` ties the (un)install to the same interpreter that
      # runs the sanity check below.
      python -m pip uninstall -y sglang sglang-router || true
      rm -rf /sgl-workspace/sglang || true
      # Drop any checkout left by a previous run so the clone starts clean.
      rm -rf sglang
      git clone -b caiorocha/mscclpp https://github.com/caiomcbr/sglang.git
      cd sglang
      # Editable install of the package under the clone's `python/` directory.
      python -m pip install -e "python"
      # Sanity check: confirm sglang resolves to our fork on every node.
      python -c "import sglang, os; p=os.path.dirname(sglang.__file__); print('sglang from:', p); assert '/sgl-workspace' not in p, 'stock sglang still active'"

# Smoke test: 2-node tensor-parallel benchmark of Qwen3-8B with MSCCL++.
# Port 20003 is the SGLang distributed-init rendezvous port (arbitrary, must
# match across ranks and be free on node 0).
- template: run-remote-task.yml
  parameters:
    name: RunSGLangMultiBenchOneBatch
    displayName: Run SGLang Multi-Node Bench One Batch
    runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
    remoteScript: |
      export FLASHINFER_DISABLE_VERSION_CHECK=1
      VMSS="${{ parameters.vmssName }}"
      HOSTNAME=$(hostname)
      # Derive this node's SGLang rank from its hostname; only the two
      # expected VMSS instances are accepted, anything else aborts.
      case "$HOSTNAME" in
        "${VMSS}000000")
          NODE_RANK=0
          ;;
        "${VMSS}000001")
          NODE_RANK=1
          ;;
        *)
          echo "Unknown hostname: $HOSTNAME"
          exit 1
          ;;
      esac
      # Both ranks rendezvous on node 0 (instance 000000), port 20003.
      python -m sglang.bench_one_batch \
        --model-path Qwen/Qwen3-8B \
        --batch 1 2 4 8 16 32 64 128 256 512 \
        --input-len 256 \
        --output-len 256 \
        --tp-size 16 \
        --dist-init-addr ${VMSS}000000:20003 \
        --nnodes 2 \
        --node-rank $NODE_RANK \
        --enable-mscclpp

# Depends on the `sglang/` source tree cloned by the InstallSGLang step above
# (steps on the same remote share a working directory).
- template: run-remote-task.yml
  parameters:
    name: RunSGLangMultiTestAllReduce
    displayName: Run SGLang Multi-Node Test All Reduce
    runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
    remoteScript: |
      export FLASHINFER_DISABLE_VERSION_CHECK=1
      VMSS="${{ parameters.vmssName }}"
      HOSTNAME=$(hostname)
      # Explicit 2-node mapping: hostname suffix -> torchrun node rank.
      if [ "$HOSTNAME" = "${VMSS}000000" ]; then
        NODE_RANK=0
      elif [ "$HOSTNAME" = "${VMSS}000001" ]; then
        NODE_RANK=1
      else
        echo "Unknown hostname: $HOSTNAME"
        exit 1
      fi

      # Number of nodes in the job. Kept exported as before.
      # NOTE(review): unclear whether the benchmark script reads NODE_SIZE
      # from the environment - confirm before dropping the export.
      export NODE_SIZE=2
      # Worker processes launched on EACH node. This is not the world size:
      # the job has NODE_SIZE * NPROC_PER_NODE workers in total, and torchrun
      # sets WORLD_SIZE for the workers itself, so it is not exported here.
      NPROC_PER_NODE=8

      # The benchmark path below is relative to the sglang checkout; stop
      # with a clear message if the checkout is missing.
      cd sglang || { echo "sglang/ source tree not found in $(pwd); was InstallSGLang run?" >&2; exit 1; }

      # Port 20004 is the torchrun rendezvous port (arbitrary, must match
      # across ranks and be free on node 0). Distinct from 20003 used by
      # sglang.bench_one_batch above.
      torchrun --nproc_per_node "$NPROC_PER_NODE" \
        --nnodes "$NODE_SIZE" \
        --node_rank "$NODE_RANK" \
        --master_addr "${VMSS}000000" \
        --master_port 20004 \
        benchmark/kernels/all_reduce/benchmark_mscclpp.py