microsoft
diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml
Lines changed: 11 additions & 11 deletions
diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml
Lines changed: 5 additions & 6 deletions
diff --git a/.azure-pipelines/sglang-multi-node-test.yml b/.azure-pipelines/sglang-multi-node-test.yml
Lines changed: 141 additions & 0 deletions
diff --git a/.azure-pipelines/sglang-test.yml b/.azure-pipelines/sglang-test.yml
Lines changed: 63 additions & 0 deletions
diff --git a/.azure-pipelines/templates/deploy.yml b/.azure-pipelines/templates/deploy.yml
Lines changed: 7 additions & 1 deletion
diff --git a/.azure-pipelines/templates/integration-test.yml b/.azure-pipelines/templates/integration-test.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.azure-pipelines/templates/nccl-test.yml b/.azure-pipelines/templates/nccl-test.yml
Lines changed: 1 addition & 1 deletion
@@ -19,11 +19,11 @@ pr:
   drafts: false
   paths:
     exclude:
-      - .devcontainer/**
-      - .github/**
-      - docker/**
-      - docs/**
-      - '**/*.md'
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
 
 jobs:
 - job: IntegrationTestA100
@@ -43,9 +43,9 @@ jobs:
   steps:
   - template: templates/integration-test.yml
     parameters:
-      subscription:     mscclpp-ci
-      vmssName:         mscclpp-ci
-      gpuArch:          '80'
+      subscription: mscclpp-ci
+      vmssName: mscclpp-ci
+      gpuArch: '80'
 
 - job: IntegrationTestH100
   displayName: Integration test H100
@@ -62,7 +62,7 @@ jobs:
   steps:
   - template: templates/integration-test.yml
     parameters:
-      subscription:     mscclpp-ci-h100
-      vmssName:         mscclpp-h100-ci
+      subscription: mscclpp-ci-h100
+      vmssName: mscclpp-h100-ci
       perfBaselineFile: test/deploy/perf_ndmv5.jsonl
-      gpuArch:          '90'
+      gpuArch: '90'
@@ -14,7 +14,6 @@ trigger:
 # Do not run multi-nodes-test for PR, we can trigger it manually
 pr: none
 
-
 parameters:
 - name: vmssName
   type: string
@@ -79,10 +78,10 @@ jobs:
 
   - template: templates/deploy.yml
     parameters:
-      subscription:  mscclpp-ci-h100
-      vmssName:      ${{ parameters.vmssName }}
+      subscription: mscclpp-ci-h100
+      vmssName: ${{ parameters.vmssName }}
       resourceGroup: mscclpp
-      gpuArch:       '90'
+      gpuArch: '90'
 
   - template: templates/run-remote-task.yml
     parameters:
@@ -119,6 +118,6 @@ jobs:
 
   - template: templates/stop.yml
     parameters:
-      subscription:  mscclpp-ci-h100
-      vmssName:      ${{ parameters.vmssName }}
+      subscription: mscclpp-ci-h100
+      vmssName: ${{ parameters.vmssName }}
       resourceGroup: mscclpp
@@ -0,0 +1,141 @@
+# =============================================================================
+# Multi-node SGLang integration test pipeline.
+#
+# This pipeline runs MSCCL++ SGLang tests across two H100 VMSS GPU nodes.
+# High-level flow:
+#   1. The pipeline agent runs inside a container on the `mscclpp-multi-node`
+#      pool. The agent itself has no GPUs.
+#   2. SSH/host configuration is generated so the agent can reach the two
+#      pre-provisioned VMSS GPU nodes.
+#   3. `templates/deploy.yml` builds and ships MSCCL++ to the GPU nodes.
+#   4. `templates/sglang-multi-test.yml` runs the SGLang multi-node tests.
+#   5. `templates/stop.yml` stops / deallocates the VMSS nodes to release resources.
+#
+# Changes touching only .devcontainer/, .github/, docker/, docs/ or *.md files do not trigger this pipeline.
+# =============================================================================
+
+trigger:
+  branches:
+    include:
+    - main
+    - release/*
+  paths:
+    exclude:
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
+
+pr:
+  branches:
+    include:
+    - main
+    - release/*
+  drafts: false
+  paths:
+    exclude:
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
+
+parameters:
+# Name of the pre-provisioned Azure VMSS that hosts the GPU test nodes.
+# Node hostnames are derived as "${vmssName}000000" and "${vmssName}000001".
+- name: vmssName
+  type: string
+  default: mscclpp-h100-multinode-ci
+# Static /etc/hosts entries mapping VMSS node hostnames to their private IPs.
+# These IPs are tied to the specific VMSS above; update both together if the
+# VMSS is reprovisioned or renamed.
+- name: hostEntries
+  type: string
+  default: |
+    10.0.0.5 mscclpp-h100-multinode-ci000000
+    10.0.0.4 mscclpp-h100-multinode-ci000001
+# Docker image used for the SGLang test container on the GPU nodes.
+- name: sglangImage
+  type: string
+  default: lmsysorg/sglang:latest-cu129
+
+jobs:
+- job: SGLangTestMultiNode
+  displayName: SGLang Test Multi Node
+  strategy:
+    matrix:
+      cuda12:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
+  pool:
+    name: mscclpp-multi-node
+  container:
+    image: $(containerImage)
+
+  steps:
+  # Ensure the VMSS node hostnames resolve from the pipeline agent container.
+  # Idempotent: only appends lines that are not already present in /etc/hosts.
+  - task: Bash@3
+    displayName: Add HostEntry
+    inputs:
+      targetType: 'inline'
+      script: |
+        while IFS= read -r line; do
+          [ -z "$line" ] && continue
+          if ! grep -qxF "$line" /etc/hosts; then
+            echo "Adding to /etc/hosts: $line"
+            echo "$line" | sudo tee -a /etc/hosts
+          else
+            echo "Entry already exists: $line"
+          fi
+        done <<< "${{ parameters.hostEntries }}"
+
+  # Generate the SSH config and hostfile consumed by the deploy / test
+  # templates below:
+  #   - config   : SSH client config (custom port + key) for each node
+  #   - hostfile : user@host list used by deploy / test scripts (parallel-ssh)
+  - task: Bash@3
+    displayName: Generate deploy files
+    inputs:
+      targetType: 'inline'
+      script: |
+        set -e
+        VMSS="${{ parameters.vmssName }}"
+        DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy"
+        NODE0="${VMSS}000000"
+        NODE1="${VMSS}000001"
+
+        echo "Host ${NODE0}
+          Port 22345
+          IdentityFile /root/mscclpp/sshkey
+          StrictHostKeyChecking no
+        Host ${NODE1}
+          Port 22345
+          IdentityFile /root/mscclpp/sshkey
+          StrictHostKeyChecking no" > "${DEPLOY_DIR}/config"
+
+        printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile"
+
+  # Build MSCCL++ and deploy it onto the VMSS GPU nodes.
+  - template: templates/deploy.yml
+    parameters:
+      subscription: mscclpp-ci-h100
+      vmssName: ${{ parameters.vmssName }}
+      resourceGroup: mscclpp
+      gpuArch: '90'
+      deployArgs: 'multi-node-test true cuda'
+      containerName: 'sglang-mscclpp-test'
+      sglangImage: ${{ parameters.sglangImage }}
+
+  # Run the SGLang multi-node tests across the two GPU nodes.
+  - template: templates/sglang-multi-test.yml
+    parameters:
+      subscription: mscclpp-ci-h100
+      vmssName: ${{ parameters.vmssName }}
+
+  # Stop/deallocate the VMSS GPU nodes to release resources.
+  - template: templates/stop.yml
+    parameters:
+      subscription: mscclpp-ci-h100
+      vmssName: ${{ parameters.vmssName }}
+      resourceGroup: mscclpp
@@ -0,0 +1,63 @@
+# =============================================================================
+# Single-node SGLang integration test pipeline.
+#
+# Runs MSCCL++ SGLang tests on a single H100 GPU node from the `msccl-ci-h100`
+# pool. All deploy / run / teardown logic is delegated to
+# `templates/sglang-test.yml`.
+#
+# Changes touching only .devcontainer/, .github/, docker/, docs/ or *.md files do not trigger this pipeline.
+# =============================================================================
+
+trigger:
+  branches:
+    include:
+    - main
+    - release/*
+  paths:
+    exclude:
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
+
+pr:
+  branches:
+    include:
+    - main
+    - release/*
+  drafts: false
+  paths:
+    exclude:
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
+
+parameters:
+# Docker image used for the SGLang test container on the GPU node.
+- name: sglangImage
+  type: string
+  default: lmsysorg/sglang:latest-cu129
+
+jobs:
+- job: SGLangTest
+  displayName: SGLang Test
+  strategy:
+    matrix:
+      cuda12:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
+  pool:
+    name: msccl-ci-h100
+  container:
+    image: $(containerImage)
+
+  steps:
+  # Deploy MSCCL++ to the GPU node and run the SGLang single-node tests.
+  - template: templates/sglang-test.yml
+    parameters:
+      subscription: mscclpp-ci-h100
+      vmssName: mscclpp-h100-ci
+      gpuArch: '90'
+      sglangImage: ${{ parameters.sglangImage }}
@@ -32,6 +32,12 @@ parameters:
 - name: deployArgs
   type: string
   default: ''
+- name: containerName
+  type: string
+  default: 'mscclpp-test'
+- name: sglangImage
+  type: string
+  default: ''
 
 steps:
 # 0. Ensure Azure CLI exists before running AzureCLI@2 tasks.
@@ -147,5 +153,5 @@ steps:
   inputs:
     targetType: filePath
     filePath: test/deploy/deploy.sh
-    arguments: ${{ parameters.deployArgs }}
+    arguments: ${{ parameters.deployArgs }} ${{ parameters.containerName }} ${{ parameters.sglangImage }}
     workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -15,7 +15,7 @@ steps:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
     gpuArch:          ${{ parameters.gpuArch }}
-    deployArgs:       'single-node-test'
+    deployArgs:       'single-node-test true cuda'
 
 - template: run-remote-task.yml
   parameters:
 
@@ -23,7 +23,7 @@ steps:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
     gpuArch:          ${{ parameters.gpuArch }}
-    deployArgs:       'nccltest-single-node'
+    deployArgs:       'nccltest-single-node true cuda'
 
 - template: run-remote-task.yml
   parameters: