diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml
index d5d5f9bde..45bb1e96d 100644
--- a/.azure-pipelines/integration-test.yml
+++ b/.azure-pipelines/integration-test.yml
@@ -19,11 +19,11 @@ pr:
   drafts: false
   paths:
     exclude:
-      - .devcontainer/**
-      - .github/**
-      - docker/**
-      - docs/**
-      - '**/*.md'
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
 
 jobs:
 - job: IntegrationTestA100
@@ -43,9 +43,9 @@ jobs:
   steps:
   - template: templates/integration-test.yml
     parameters:
-      subscription:     mscclpp-ci
-      vmssName:         mscclpp-ci
-      gpuArch:          '80'
+      subscription: mscclpp-ci
+      vmssName: mscclpp-ci
+      gpuArch: '80'
 
 - job: IntegrationTestH100
   displayName: Integration test H100
@@ -62,7 +62,7 @@ jobs:
   steps:
   - template: templates/integration-test.yml
     parameters:
-      subscription:     mscclpp-ci-h100
-      vmssName:         mscclpp-h100-ci
+      subscription: mscclpp-ci-h100
+      vmssName: mscclpp-h100-ci
       perfBaselineFile: test/deploy/perf_ndmv5.jsonl
-      gpuArch:          '90'
+      gpuArch: '90'
diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml
index 3b3ebe1ff..ee2766fd7 100644
--- a/.azure-pipelines/multi-nodes-test.yml
+++ b/.azure-pipelines/multi-nodes-test.yml
@@ -14,7 +14,6 @@ trigger:
 # Do not run multi-nodes-test for PR, we can trigger it manually
 pr: none
 
-
 parameters:
 - name: vmssName
   type: string
@@ -79,10 +78,10 @@ jobs:
 
   - template: templates/deploy.yml
     parameters:
-      subscription:  mscclpp-ci-h100
-      vmssName:      ${{ parameters.vmssName }}
+      subscription: mscclpp-ci-h100
+      vmssName: ${{ parameters.vmssName }}
       resourceGroup: mscclpp
-      gpuArch:       '90'
+      gpuArch: '90'
 
   - template: templates/run-remote-task.yml
     parameters:
@@ -119,6 +118,6 @@ jobs:
 
   - template: templates/stop.yml
     parameters:
-      subscription:  mscclpp-ci-h100
-      vmssName:      ${{ parameters.vmssName }}
+      subscription: mscclpp-ci-h100
+      vmssName: ${{ parameters.vmssName }}
       resourceGroup: mscclpp
diff --git a/.azure-pipelines/sglang-multi-node-test.yml b/.azure-pipelines/sglang-multi-node-test.yml
new file mode 100644
index 000000000..bf640db20
--- /dev/null
+++ b/.azure-pipelines/sglang-multi-node-test.yml
@@ -0,0 +1,141 @@
+# =============================================================================
+# Multi-node SGLang integration test pipeline.
+#
+# This pipeline runs MSCCL++ SGLang tests across two H100 VMSS GPU nodes.
+# High-level flow:
+#   1. The pipeline agent runs inside a container on the `mscclpp-multi-node`
+#      pool. The agent itself has no GPUs.
+#   2. SSH/host configuration is generated so the agent can reach the two
+#      pre-provisioned VMSS GPU nodes.
+#   3. `templates/deploy.yml` builds and ships MSCCL++ to the GPU nodes.
+#   4. `templates/sglang-multi-test.yml` runs the SGLang multi-node tests.
+#   5. `templates/stop.yml` tears down / stops the VMSS nodes.
+#
+# Docs / non-code changes are excluded from triggering this pipeline.
+# =============================================================================
+
+trigger:
+  branches:
+    include:
+    - main
+    - release/*
+  paths:
+    exclude:
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
+
+pr:
+  branches:
+    include:
+    - main
+    - release/*
+  drafts: false
+  paths:
+    exclude:
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
+
+parameters:
+# Pre-provisioned Azure VMSS hosting the GPU test nodes; the two node
+# hostnames are "${vmssName}000000" and "${vmssName}000001".
+- name: vmssName
+  type: string
+  default: 'mscclpp-h100-multinode-ci'
+# /etc/hosts lines (private IP, then hostname) for those nodes. They belong
+# to the VMSS named above: change both parameters together whenever the
+# VMSS is reprovisioned or renamed.
+- name: hostEntries
+  type: string
+  default: |
+    10.0.0.5 mscclpp-h100-multinode-ci000000
+    10.0.0.4 mscclpp-h100-multinode-ci000001
+# Image for the SGLang test container started on the GPU nodes.
+- name: sglangImage
+  type: string
+  default: 'lmsysorg/sglang:latest-cu129'
+
+jobs:
+- job: SGLangTestMultiNode
+  displayName: SGLang Test Multi Node
+  strategy:
+    matrix:
+      cuda12:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
+  pool:
+    name: mscclpp-multi-node
+  container:
+    image: $(containerImage)
+
+  steps:
+  # Make the VMSS node hostnames resolvable from inside the agent container.
+  # Safe to re-run: a line is appended only when /etc/hosts lacks an exact match.
+  - task: Bash@3
+    displayName: Add HostEntry
+    inputs:
+      targetType: inline
+      script: |
+        while IFS= read -r entry; do
+          if [ -z "$entry" ]; then continue; fi
+          if grep -qxF "$entry" /etc/hosts; then
+            echo "Entry already exists: $entry"
+          else
+            echo "Adding to /etc/hosts: $entry"
+            echo "$entry" | sudo tee -a /etc/hosts
+          fi
+        done <<< "${{ parameters.hostEntries }}"
+
+  # Write the two files that the deploy / test templates below read from
+  # test/deploy:
+  #   - config   : SSH client settings (custom port + key) for each node
+  #   - hostfile : user@host list used by deploy / test scripts (parallel-ssh)
+  - task: Bash@3
+    displayName: Generate deploy files
+    inputs:
+      targetType: inline
+      script: |
+        set -e
+        VMSS="${{ parameters.vmssName }}"
+        DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy"
+        NODE0="${VMSS}000000"
+        NODE1="${VMSS}000001"
+
+        # One "Host" stanza per node; each option line is indented two spaces
+        # under its "Host" line.
+        for node in "${NODE0}" "${NODE1}"; do
+          printf 'Host %s\n' "${node}"
+          printf '  %s\n' 'Port 22345' 'IdentityFile /root/mscclpp/sshkey' \
+            'StrictHostKeyChecking no'
+        done > "${DEPLOY_DIR}/config"
+
+        # Host list: one user@host entry per line.
+        printf '%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile"
+
+  # Ship MSCCL++ to the VMSS GPU nodes and start the SGLang test container.
+  - template: templates/deploy.yml
+    parameters:
+      vmssName: ${{ parameters.vmssName }}
+      subscription: 'mscclpp-ci-h100'
+      resourceGroup: 'mscclpp'
+      gpuArch: '90'
+      sglangImage: ${{ parameters.sglangImage }}
+      containerName: sglang-mscclpp-test
+      deployArgs: multi-node-test true cuda
+
+  # Execute the SGLang multi-node tests on both GPU nodes.
+  - template: templates/sglang-multi-test.yml
+    parameters:
+      vmssName: ${{ parameters.vmssName }}
+      subscription: 'mscclpp-ci-h100'
+
+  # Hand the VMSS GPU nodes back via the stop template.
+  - template: templates/stop.yml
+    parameters:
+      vmssName: ${{ parameters.vmssName }}
+      subscription: 'mscclpp-ci-h100'
+      resourceGroup: 'mscclpp'
diff --git a/.azure-pipelines/sglang-test.yml b/.azure-pipelines/sglang-test.yml
new file mode 100644
index 000000000..70e30d353
--- /dev/null
+++ b/.azure-pipelines/sglang-test.yml
@@ -0,0 +1,63 @@
+# =============================================================================
+# Single-node SGLang integration test pipeline.
+#
+# Runs MSCCL++ SGLang tests on a single H100 GPU node from the `msccl-ci-h100`
+# pool. All deploy / run / teardown logic is delegated to
+# `templates/sglang-test.yml`.
+#
+# Docs / non-code changes are excluded from triggering this pipeline.
+# =============================================================================
+
+trigger:
+  branches:
+    include:
+    - main
+    - release/*
+  paths:
+    exclude:
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
+
+pr:
+  branches:
+    include:
+    - main
+    - release/*
+  drafts: false
+  paths:
+    exclude:
+    - .devcontainer/**
+    - .github/**
+    - docker/**
+    - docs/**
+    - '**/*.md'
+
+parameters:
+# Docker image used for the SGLang test container on the GPU node.
+- name: sglangImage
+  type: string
+  default: lmsysorg/sglang:latest-cu129
+
+jobs:
+- job: SGLangTest
+  displayName: SGLang Test
+  # Agent pool and agent-side container; the image name comes from the
+  # matrix entry below.
+  pool: msccl-ci-h100
+  container: $(containerImage)
+  strategy:
+    matrix:
+      cuda12:
+        containerImage: 'ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9'
+
+  steps:
+  # All deploy / test / stop steps live in the shared single-node template.
+  - template: templates/sglang-test.yml
+    parameters:
+      vmssName: 'mscclpp-h100-ci'
+      subscription: 'mscclpp-ci-h100'
+      gpuArch: '90'
+      sglangImage: ${{ parameters.sglangImage }}
diff --git a/.azure-pipelines/templates/deploy.yml b/.azure-pipelines/templates/deploy.yml
index 2f642f1d5..9eb46d8b9 100644
--- a/.azure-pipelines/templates/deploy.yml
+++ b/.azure-pipelines/templates/deploy.yml
@@ -32,6 +32,12 @@ parameters:
 - name: deployArgs
   type: string
   default: ''
+- name: containerName
+  type: string
+  default: 'mscclpp-test'
+- name: sglangImage
+  type: string
+  default: ''
 
 steps:
 # 0. Ensure Azure CLI exists before running AzureCLI@2 tasks.
@@ -147,5 +153,5 @@ steps:
   inputs:
     targetType: filePath
     filePath: test/deploy/deploy.sh
-    arguments: ${{ parameters.deployArgs }}
+    arguments: ${{ parameters.deployArgs }} ${{ parameters.containerName }} ${{ parameters.sglangImage }}
     workingDirectory: '$(System.DefaultWorkingDirectory)'
diff --git a/.azure-pipelines/templates/integration-test.yml b/.azure-pipelines/templates/integration-test.yml
index b686e4f21..ad95cbc2b 100644
--- a/.azure-pipelines/templates/integration-test.yml
+++ b/.azure-pipelines/templates/integration-test.yml
@@ -15,7 +15,7 @@ steps:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
     gpuArch:          ${{ parameters.gpuArch }}
-    deployArgs:       'single-node-test'
+    deployArgs:       'single-node-test true cuda'
 
 - template: run-remote-task.yml
   parameters:
diff --git a/.azure-pipelines/templates/nccl-test.yml b/.azure-pipelines/templates/nccl-test.yml
index fa3900f1e..585f5b48f 100644
--- a/.azure-pipelines/templates/nccl-test.yml
+++ b/.azure-pipelines/templates/nccl-test.yml
@@ -23,7 +23,7 @@ steps:
     subscription:     ${{ parameters.subscription }}
     vmssName:         ${{ parameters.vmssName }}
     gpuArch:          ${{ parameters.gpuArch }}
-    deployArgs:       'nccltest-single-node'
+    deployArgs:       'nccltest-single-node true cuda'
 
 - template: run-remote-task.yml
   parameters:
diff --git a/.azure-pipelines/templates/sglang-multi-test.yml b/.azure-pipelines/templates/sglang-multi-test.yml
new file mode 100644
index 000000000..80e729268
--- /dev/null
+++ b/.azure-pipelines/templates/sglang-multi-test.yml
@@ -0,0 +1,95 @@
+# =============================================================================
+# SGLang multi-node test template.
+#
+# Runs on the pipeline agent and dispatches remote steps to the two VMSS GPU
+# nodes (via run-remote-task.yml + the SSH config / hostfile produced by the
+# caller pipeline). Steps:
+#   1. Clone a (currently forked) SGLang on each node and `pip install -e`
+#      it inside the test container.
+#   2. Run a 2-node sglang.bench_one_batch smoke test with MSCCL++ enabled.
+#   3. Run the MSCCL++ all-reduce micro-benchmark via torchrun across both
+#      nodes.
+#   MSCCL++ itself is deployed beforehand by the caller (templates/deploy.yml).
+# =============================================================================
+
+parameters:
+- name: subscription
+  type: string
+- name: vmssName
+  type: string
+- name: containerName
+  type: string
+  default: 'sglang-mscclpp-test'
+
+steps:
+# TODO: Switch to the official upstream sglang repo once Caio's PR is merged.
+# The clone below tracks `main` of the personal fork `caiomcbr/sglang` (confirm
+# `caiorocha/mscclpp` is not the intended branch); do not keep it long-term.
+- template: run-remote-task.yml
+  parameters:
+    name: InstallSGLang
+    displayName: Install SGLang
+    runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
+    remoteScript: |
+      git clone -b main https://github.com/caiomcbr/sglang.git
+      cd sglang/python
+      pip install -e .
+
+# Smoke test: Qwen3-8B tensor-parallel benchmark spanning both nodes with
+# MSCCL++ enabled. Port 20003 is the SGLang distributed-init rendezvous port
+# (arbitrary; must match across ranks and be free on node 0).
+- template: run-remote-task.yml
+  parameters:
+    name: RunSGLangMultiBenchOneBatch
+    displayName: Run SGLang Multi-Node Bench One Batch
+    runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
+    remoteScript: |
+      export FLASHINFER_DISABLE_VERSION_CHECK=1
+      VMSS="${{ parameters.vmssName }}"
+      HOSTNAME=$(hostname)
+      # Node rank is chosen from the hostname suffix; any other host aborts.
+      case "$HOSTNAME" in
+        "${VMSS}000000") NODE_RANK=0 ;;
+        "${VMSS}000001") NODE_RANK=1 ;;
+        *) echo "Unknown hostname: $HOSTNAME"; exit 1 ;;
+      esac
+      python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B \
+        --batch 1 2 4 8 16 32 64 128 256 512 --input-len 256 --output-len 256 \
+        --tp-size 16 --dist-init-addr ${VMSS}000000:20003 --nnodes 2 \
+        --node-rank $NODE_RANK --enable-mscclpp
+
+# Relies on the `sglang/` checkout created by the InstallSGLang step above
+# (assumes remote steps start in the same working directory; TODO confirm).
+- template: run-remote-task.yml
+  parameters:
+    name: RunSGLangMultiTestAllReduce
+    displayName: Run SGLang Multi-Node Test All Reduce
+    runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser'
+    remoteScript: |
+      export FLASHINFER_DISABLE_VERSION_CHECK=1
+      VMSS="${{ parameters.vmssName }}"
+      HOSTNAME=$(hostname)
+      # Node rank is chosen from the hostname suffix; any other host aborts.
+      case "$HOSTNAME" in
+        "${VMSS}000000") NODE_RANK=0 ;;
+        "${VMSS}000001") NODE_RANK=1 ;;
+        *) echo "Unknown hostname: $HOSTNAME"; exit 1 ;;
+      esac
+
+      # NODE_SIZE feeds --nnodes. WORLD_SIZE feeds --nproc_per_node, i.e. it
+      # is the number of processes launched on each node, not the total
+      # across both nodes.
+      export NODE_SIZE=2
+      export WORLD_SIZE=8
+
+      cd sglang
+
+      # Port 20004 is the torchrun rendezvous port (arbitrary; it must match
+      # across ranks and be free on node 0). It differs from 20003, which
+      # sglang.bench_one_batch uses in the previous step.
+      torchrun --nproc_per_node=$WORLD_SIZE \
+        --nnodes=$NODE_SIZE \
+        --node_rank=$NODE_RANK \
+        --master_addr=${VMSS}000000 \
+        --master_port=20004 \
+        benchmark/kernels/all_reduce/benchmark_mscclpp.py
diff --git a/.azure-pipelines/templates/sglang-test.yml b/.azure-pipelines/templates/sglang-test.yml
new file mode 100644
index 000000000..0d663b71e
--- /dev/null
+++ b/.azure-pipelines/templates/sglang-test.yml
@@ -0,0 +1,87 @@
+# =============================================================================
+# SGLang single-node test template.
+#
+# Runs on the pipeline agent and dispatches remote steps to a single VMSS GPU
+# node (via run-remote-task.yml). Steps:
+#   1. Deploy (deploy.yml): ship the sources to the VMSS node and start the
+#      SGLang test container there.
+#   2. Clone a (currently forked) SGLang inside the container and install it
+#      with `pip install -e`.
+#   3. Run sglang.bench_one_batch at several batch sizes with MSCCL++
+#      enabled.
+#   4. Run the MSCCL++ all-reduce micro-benchmark via torchrun.
+#   5. Stop the VMSS node (stop.yml).
+# =============================================================================
+
+parameters:
+- name: subscription
+  type: string
+- name: vmssName
+  type: string
+- name: gpuArch
+  type: string
+- name: containerName
+  type: string
+  default: 'sglang-mscclpp-test'
+- name: sglangImage
+  type: string
+  default: 'lmsysorg/sglang:latest'
+
+steps:
+# deployArgs positional fields: <test-name> <ib-environment> <cuda|rocm>
+- template: deploy.yml
+  parameters:
+    subscription: ${{ parameters.subscription }}
+    vmssName: ${{ parameters.vmssName }}
+    gpuArch: ${{ parameters.gpuArch }}
+    deployArgs: 'single-node-test true cuda'
+    containerName: ${{ parameters.containerName }}
+    sglangImage: ${{ parameters.sglangImage }}
+
+# TODO: Switch to the official upstream sglang repo once Caio's PR is merged.
+# The clone below tracks `main` of the personal fork `caiomcbr/sglang` (confirm
+# `caiorocha/mscclpp` is not the intended branch); do not keep it long-term.
+# Also consider pinning to a release branch or commit SHA for reproducibility.
+- template: run-remote-task.yml
+  parameters:
+    name: InstallSGLang
+    displayName: Install SGLang
+    runRemoteArgs: '--container ${{ parameters.containerName }}'
+    remoteScript: |
+      git clone -b main https://github.com/caiomcbr/sglang.git
+      cd sglang/python
+      pip install -e .
+
+- template: run-remote-task.yml
+  parameters:
+    displayName: Run SGLang Bench One Batch
+    name: RunSGLangBenchOneBatch
+    runRemoteArgs: '--container ${{ parameters.containerName }}'
+    remoteScript: |
+      FLASHINFER_DISABLE_VERSION_CHECK=1 python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B \
+        --batch 1 2 4 8 16 32 64 128 256 512 --input-len 256 --output-len 256 --tp-size 8 --enable-mscclpp
+
+# Relies on the `sglang/` checkout created by the InstallSGLang step above
+# (assumes remote steps start in the same working directory; TODO confirm).
+- template: run-remote-task.yml
+  parameters:
+    name: RunSGLangTestAllReduce
+    displayName: Run SGLang Test All Reduce
+    runRemoteArgs: '--container ${{ parameters.containerName }}'
+    remoteScript: |
+      cd sglang
+
+      # NODE_SIZE feeds --nnodes, RANK feeds --node_rank, and WORLD_SIZE feeds
+      # --nproc_per_node (the number of processes started on this node).
+      export FLASHINFER_DISABLE_VERSION_CHECK=1
+      export NODE_SIZE=1 WORLD_SIZE=8 RANK=0
+
+      torchrun --nproc_per_node=$WORLD_SIZE \
+        --nnodes=$NODE_SIZE \
+        --node_rank=$RANK \
+        benchmark/kernels/all_reduce/benchmark_mscclpp.py
+
+- template: stop.yml
+  parameters:
+    subscription: ${{ parameters.subscription }}
+    vmssName: ${{ parameters.vmssName }}
diff --git a/.azure-pipelines/templates/ut-no-ib-env.yml b/.azure-pipelines/templates/ut-no-ib-env.yml
index a62f1a77a..cc7d20182 100644
--- a/.azure-pipelines/templates/ut-no-ib-env.yml
+++ b/.azure-pipelines/templates/ut-no-ib-env.yml
@@ -13,7 +13,7 @@ steps:
     vmssName:         ${{ parameters.vmssName }}
     gpuArch:          ${{ parameters.gpuArch }}
     cmakeArgs:        '-DMSCCLPP_USE_IB=OFF'
-    deployArgs:       'single-node-test false'
+    deployArgs:       'single-node-test false cuda'
 
 - template: run-remote-task.yml
   parameters:
diff --git a/.azure-pipelines/templates/ut-npkit.yml b/.azure-pipelines/templates/ut-npkit.yml
index 1bd89caf4..18934e6b2 100644
--- a/.azure-pipelines/templates/ut-npkit.yml
+++ b/.azure-pipelines/templates/ut-npkit.yml
@@ -14,7 +14,7 @@ steps:
     vmssName:         ${{ parameters.vmssName }}
     gpuArch:          ${{ parameters.gpuArch }}
     cmakeArgs:        '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"'
-    deployArgs:       'single-node-test'
+    deployArgs:       'single-node-test true cuda'
 
 - template: run-remote-task.yml
   parameters:
diff --git a/docker/build.sh b/docker/build.sh
index 651a61222..b84eac9ad 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -75,6 +75,7 @@ docker build -t ${TAG_BASE_DEV} \
     --build-arg BASE_IMAGE=${TAG_BASE} \
     --build-arg TARGET=${TARGET} .
 
+
 GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
 GHCR_TAG_BASE_DEV=${GHCR}:base-dev-${TARGET}
 GHCR_TAG_BASE_DEV_ARCH=${GHCR}:base-dev-${TARGET}-${OS_ARCH}
diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh
index 6358787bf..02fe4fd25 100644
--- a/test/deploy/deploy.sh
+++ b/test/deploy/deploy.sh
@@ -1,17 +1,34 @@
+#!/bin/bash
+# deploy.sh — Provisions remote hosts, copies sources, and launches Docker containers
+# for mscclpp CI/CD test environments.
+#
+# Usage: deploy.sh <test_name> [ib_environment] [platform] [container_name] [sglang_image]
+#   Arguments are positional: to set a later one, pass every one before it.
+#   test_name       : Test suite to deploy (e.g. single-node-test, nccltest-single-node)
+#   ib_environment  : Enable InfiniBand networking (default: true)
+#   platform        : Target GPU platform — "cuda" or "rocm" (default: cuda)
+#   container_name  : Docker container name (default: mscclpp-test)
+#   sglang_image    : Image for the SGLang container (default: lmsysorg/sglang:latest);
+#                     only used when container_name is "sglang-mscclpp-test".
+
 set -ex
 
 TEST_NAME=$1
 IB_ENVIRONMENT="${2:-true}"
 PLATFORM="${3:-cuda}"
+CONTAINER_NAME="${4:-mscclpp-test}"
+SGLANG_IMAGE="${5:-lmsysorg/sglang:latest}"
 
 KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
 ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/"
 DST_DIR="/tmp/mscclpp"
+
 if [ "${TEST_NAME}" == "nccltest-single-node" ] || [ "${TEST_NAME}" == "single-node-test" ]; then
   HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile_ci"
 else
   HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile"
 fi
+
 SSH_OPTION="StrictHostKeyChecking=no"
 
 chmod 400 ${KeyFilePath}
@@ -26,8 +43,8 @@ while true; do
   echo "Waiting for sshd to start..."
   sleep 5
 done
-
 set -e
+
 parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
 tar czf /tmp/mscclpp.tar.gz -C ${ROOT_DIR} .
 parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION /tmp/mscclpp.tar.gz /tmp/mscclpp.tar.gz
@@ -57,25 +74,38 @@ if [ "${PLATFORM}" == "cuda" ]; then
     fi"
 fi
 
-# force to pull the latest image
-parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
-  "sudo docker pull ${CONTAINERIMAGE}"
+if [ "${CONTAINER_NAME}" == "sglang-mscclpp-test" ]; then
+  # SGLang container: always re-pull SGLANG_IMAGE; IB_ENVIRONMENT and PLATFORM are not consulted in this branch
+  parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
+    "sudo docker pull ${SGLANG_IMAGE}"
 
-LAUNCH_OPTION="--gpus=all"
-if [ "${PLATFORM}" == "rocm" ]; then
-  LAUNCH_OPTION="--device=/dev/kfd --device=/dev/dri --group-add=video"
-fi
-if [ "${IB_ENVIRONMENT}" == "true" ]; then
   parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
-    "sudo docker run --rm -itd --privileged --net=host --ipc=host ${LAUNCH_OPTION} \
-    -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
-    --entrypoint /bin/bash ${CONTAINERIMAGE}"
+    "sudo docker run --rm -itd --name=${CONTAINER_NAME} --privileged --net=host --ipc=host --gpus=all -w /root -v ${DST_DIR}:/root/mscclpp --entrypoint /bin/bash ${SGLANG_IMAGE}"
 else
+  # force to pull the latest image
   parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
-    "sudo docker run --rm -itd --net=host --ipc=host ${LAUNCH_OPTION} --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \
-    -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
-    --entrypoint /bin/bash ${CONTAINERIMAGE}"
+    "sudo docker pull ${CONTAINERIMAGE}"
+
+  # Set GPU passthrough flags based on platform
+  LAUNCH_OPTION="--gpus=all"
+  if [ "${PLATFORM}" == "rocm" ]; then
+    LAUNCH_OPTION="--device=/dev/kfd --device=/dev/dri --group-add=video"
+  fi
+
+  if [ "${IB_ENVIRONMENT}" == "true" ]; then
+    # InfiniBand: use --privileged for RDMA device access
+    parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
+      "sudo docker run --rm -itd --privileged --net=host --ipc=host ${LAUNCH_OPTION} \
+      -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=${CONTAINER_NAME} \
+      --entrypoint /bin/bash ${CONTAINERIMAGE}"
+  else
+    # Non-IB: grant SYS_ADMIN and disable seccomp instead of full --privileged
+    parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
+      "sudo docker run --rm -itd --net=host --ipc=host ${LAUNCH_OPTION} --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \
+      -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=${CONTAINER_NAME} \
+      --entrypoint /bin/bash ${CONTAINERIMAGE}"
+  fi
 fi
-parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
-  "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}"
 
+parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
+  "sudo docker exec -t --user root ${CONTAINER_NAME} bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}"
diff --git a/test/deploy/run-remote.sh b/test/deploy/run-remote.sh
index 2468243ea..9607664fc 100755
--- a/test/deploy/run-remote.sh
+++ b/test/deploy/run-remote.sh
@@ -11,6 +11,7 @@
 #   --hostfile    Override hostfile path (default: test/deploy/hostfile_ci)
 #   --host        Run command on a single host (uses parallel-ssh -H)
 #   --user        SSH user when using --host or custom hostfile
+#   --container   Docker container name to exec into (default: mscclpp-test)
 
 set -e
 
@@ -23,9 +24,10 @@ USE_DOCKER=true
 USE_LOG=true
 TARGET_HOST=""
 REMOTE_USER=""
+CONTAINER_NAME="mscclpp-test"
 
 usage() {
-    echo "Usage: $0 [--no-docker] [--no-log] [--hostfile <path>] [--host <name>] [--user <name>] < <command_script>" >&2
+    echo "Usage: $0 [--no-docker] [--no-log] [--hostfile <path>] [--host <name>] [--user <name>] [--container <name>] < <command_script>" >&2
 }
 
 require_value() {
@@ -56,6 +58,11 @@ while [[ "$1" == --* ]]; do
             REMOTE_USER="$2"
             shift 2
             ;;
+        --container)
+            require_value "--container" "${2-}"
+            CONTAINER_NAME="$2"
+            shift 2
+            ;;
         *) echo "Unknown option: $1" >&2; exit 1 ;;
     esac
 done
@@ -103,7 +110,7 @@ if $USE_DOCKER; then
     INNER+=" rm -f \\\"\\\$TMP\\\""
 
     parallel-ssh -i "${PSSH_COMMON[@]}" \
-        "sudo docker exec mscclpp-test bash -c \"${INNER}\""
+        "sudo docker exec ${CONTAINER_NAME} bash -c \"${INNER}\""
 else
     parallel-ssh -i "${PSSH_COMMON[@]}" \
         "set -euxo pipefail; CMD_B64='${CMD_B64}'; TMP=\$(mktemp); printf '%s' \"\$CMD_B64\" | base64 -d > \"\$TMP\"; bash -euxo pipefail \"\$TMP\"; rm -f \"\$TMP\""