diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index d5d5f9bde..45bb1e96d 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -19,11 +19,11 @@ pr: drafts: false paths: exclude: - - .devcontainer/** - - .github/** - - docker/** - - docs/** - - '**/*.md' + - .devcontainer/** + - .github/** + - docker/** + - docs/** + - '**/*.md' jobs: - job: IntegrationTestA100 @@ -43,9 +43,9 @@ jobs: steps: - template: templates/integration-test.yml parameters: - subscription: mscclpp-ci - vmssName: mscclpp-ci - gpuArch: '80' + subscription: mscclpp-ci + vmssName: mscclpp-ci + gpuArch: '80' - job: IntegrationTestH100 displayName: Integration test H100 @@ -62,7 +62,7 @@ jobs: steps: - template: templates/integration-test.yml parameters: - subscription: mscclpp-ci-h100 - vmssName: mscclpp-h100-ci + subscription: mscclpp-ci-h100 + vmssName: mscclpp-h100-ci perfBaselineFile: test/deploy/perf_ndmv5.jsonl - gpuArch: '90' + gpuArch: '90' diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 3b3ebe1ff..ee2766fd7 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -14,7 +14,6 @@ trigger: # Do not run multi-nodes-test for PR, we can trigger it manually pr: none - parameters: - name: vmssName type: string @@ -79,10 +78,10 @@ jobs: - template: templates/deploy.yml parameters: - subscription: mscclpp-ci-h100 - vmssName: ${{ parameters.vmssName }} + subscription: mscclpp-ci-h100 + vmssName: ${{ parameters.vmssName }} resourceGroup: mscclpp - gpuArch: '90' + gpuArch: '90' - template: templates/run-remote-task.yml parameters: @@ -119,6 +118,6 @@ jobs: - template: templates/stop.yml parameters: - subscription: mscclpp-ci-h100 - vmssName: ${{ parameters.vmssName }} + subscription: mscclpp-ci-h100 + vmssName: ${{ parameters.vmssName }} resourceGroup: mscclpp diff --git a/.azure-pipelines/sglang-multi-node-test.yml 
b/.azure-pipelines/sglang-multi-node-test.yml new file mode 100644 index 000000000..bf640db20 --- /dev/null +++ b/.azure-pipelines/sglang-multi-node-test.yml @@ -0,0 +1,141 @@ +# ============================================================================= +# Multi-node SGLang integration test pipeline. +# +# This pipeline runs MSCCL++ SGLang tests across two H100 VMSS GPU nodes. +# High-level flow: +# 1. The pipeline agent runs inside a container on the `mscclpp-multi-node` +# pool. The agent itself has no GPUs. +# 2. SSH/host configuration is generated so the agent can reach the two +# pre-provisioned VMSS GPU nodes. +# 3. `templates/deploy.yml` builds and ships MSCCL++ to the GPU nodes. +# 4. `templates/sglang-multi-test.yml` runs the SGLang multi-node tests. +# 5. `templates/stop.yml` tears down / stops the VMSS nodes. +# +# Docs / non-code changes are excluded from triggering this pipeline. +# ============================================================================= + +trigger: + branches: + include: + - main + - release/* + paths: + exclude: + - .devcontainer/** + - .github/** + - docker/** + - docs/** + - '**/*.md' + +pr: + branches: + include: + - main + - release/* + drafts: false + paths: + exclude: + - .devcontainer/** + - .github/** + - docker/** + - docs/** + - '**/*.md' + +parameters: +# Name of the pre-provisioned Azure VMSS that hosts the GPU test nodes. +# Node hostnames are derived as "${vmssName}000000" and "${vmssName}000001". +- name: vmssName + type: string + default: mscclpp-h100-multinode-ci +# Static /etc/hosts entries mapping VMSS node hostnames to their private IPs. +# These IPs are tied to the specific VMSS above; update both together if the +# VMSS is reprovisioned or renamed. +- name: hostEntries + type: string + default: | + 10.0.0.5 mscclpp-h100-multinode-ci000000 + 10.0.0.4 mscclpp-h100-multinode-ci000001 +# Docker image used for the SGLang test container on the GPU nodes. 
+- name: sglangImage + type: string + default: lmsysorg/sglang:latest-cu129 + +jobs: +- job: SGLangTestMultiNode + displayName: SGLang Test Multi Node + strategy: + matrix: + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + pool: + name: mscclpp-multi-node + container: + image: $(containerImage) + + steps: + # Ensure the VMSS node hostnames resolve from the pipeline agent container. + # Idempotent: only appends lines that are not already present in /etc/hosts. + - task: Bash@3 + displayName: Add HostEntry + inputs: + targetType: 'inline' + script: | + while IFS= read -r line; do + [ -z "$line" ] && continue + if ! grep -qxF "$line" /etc/hosts; then + echo "Adding to /etc/hosts: $line" + echo "$line" | sudo tee -a /etc/hosts + else + echo "Entry already exists: $line" + fi + done <<< "${{ parameters.hostEntries }}" + + # Generate the SSH config and hostfile consumed by the deploy / test + # templates below: + # - config : SSH client config (custom port + key) for each node + # - hostfile : user@host list used by deploy / test scripts (parallel-ssh) + - task: Bash@3 + displayName: Generate deploy files + inputs: + targetType: 'inline' + script: | + set -e + VMSS="${{ parameters.vmssName }}" + DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy" + NODE0="${VMSS}000000" + NODE1="${VMSS}000001" + + echo "Host ${NODE0} + Port 22345 + IdentityFile /root/mscclpp/sshkey + StrictHostKeyChecking no + Host ${NODE1} + Port 22345 + IdentityFile /root/mscclpp/sshkey + StrictHostKeyChecking no" > "${DEPLOY_DIR}/config" + + printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile" + + # Build MSCCL++ and deploy it onto the VMSS GPU nodes. 
+ - template: templates/deploy.yml + parameters: + subscription: mscclpp-ci-h100 + vmssName: ${{ parameters.vmssName }} + resourceGroup: mscclpp + gpuArch: '90' + deployArgs: 'multi-node-test true cuda' + containerName: 'sglang-mscclpp-test' + sglangImage: ${{ parameters.sglangImage }} + + # Run the SGLang multi-node tests across the two GPU nodes. + - template: templates/sglang-multi-test.yml + parameters: + subscription: mscclpp-ci-h100 + vmssName: ${{ parameters.vmssName }} + + # Stop/deallocate the VMSS GPU nodes to release resources. + - template: templates/stop.yml + parameters: + subscription: mscclpp-ci-h100 + vmssName: ${{ parameters.vmssName }} + resourceGroup: mscclpp diff --git a/.azure-pipelines/sglang-test.yml b/.azure-pipelines/sglang-test.yml new file mode 100644 index 000000000..70e30d353 --- /dev/null +++ b/.azure-pipelines/sglang-test.yml @@ -0,0 +1,63 @@ +# ============================================================================= +# Single-node SGLang integration test pipeline. +# +# Runs MSCCL++ SGLang tests on a single H100 GPU node from the `msccl-ci-h100` +# pool. All deploy / run / teardown logic is delegated to +# `templates/sglang-test.yml`. +# +# Docs / non-code changes are excluded from triggering this pipeline. +# ============================================================================= + +trigger: + branches: + include: + - main + - release/* + paths: + exclude: + - .devcontainer/** + - .github/** + - docker/** + - docs/** + - '**/*.md' + +pr: + branches: + include: + - main + - release/* + drafts: false + paths: + exclude: + - .devcontainer/** + - .github/** + - docker/** + - docs/** + - '**/*.md' + +parameters: +# Docker image used for the SGLang test container on the GPU node. 
+- name: sglangImage + type: string + default: lmsysorg/sglang:latest-cu129 + +jobs: +- job: SGLangTest + displayName: SGLang Test + strategy: + matrix: + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + pool: + name: msccl-ci-h100 + container: + image: $(containerImage) + + steps: + # Deploy MSCCL++ to the GPU node and run the SGLang single-node tests. + - template: templates/sglang-test.yml + parameters: + subscription: mscclpp-ci-h100 + vmssName: mscclpp-h100-ci + gpuArch: '90' + sglangImage: ${{ parameters.sglangImage }} diff --git a/.azure-pipelines/templates/deploy.yml b/.azure-pipelines/templates/deploy.yml index 2f642f1d5..9eb46d8b9 100644 --- a/.azure-pipelines/templates/deploy.yml +++ b/.azure-pipelines/templates/deploy.yml @@ -32,6 +32,12 @@ parameters: - name: deployArgs type: string default: '' +- name: containerName + type: string + default: 'mscclpp-test' +- name: sglangImage + type: string + default: '' steps: # 0. Ensure Azure CLI exists before running AzureCLI@2 tasks. 
@@ -147,5 +153,5 @@ steps: inputs: targetType: filePath filePath: test/deploy/deploy.sh - arguments: ${{ parameters.deployArgs }} + arguments: ${{ parameters.deployArgs }} ${{ parameters.containerName }} ${{ parameters.sglangImage }} workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/templates/integration-test.yml b/.azure-pipelines/templates/integration-test.yml index b686e4f21..ad95cbc2b 100644 --- a/.azure-pipelines/templates/integration-test.yml +++ b/.azure-pipelines/templates/integration-test.yml @@ -15,7 +15,7 @@ steps: subscription: ${{ parameters.subscription }} vmssName: ${{ parameters.vmssName }} gpuArch: ${{ parameters.gpuArch }} - deployArgs: 'single-node-test' + deployArgs: 'single-node-test true cuda' - template: run-remote-task.yml parameters: diff --git a/.azure-pipelines/templates/nccl-test.yml b/.azure-pipelines/templates/nccl-test.yml index fa3900f1e..585f5b48f 100644 --- a/.azure-pipelines/templates/nccl-test.yml +++ b/.azure-pipelines/templates/nccl-test.yml @@ -23,7 +23,7 @@ steps: subscription: ${{ parameters.subscription }} vmssName: ${{ parameters.vmssName }} gpuArch: ${{ parameters.gpuArch }} - deployArgs: 'nccltest-single-node' + deployArgs: 'nccltest-single-node true cuda' - template: run-remote-task.yml parameters: diff --git a/.azure-pipelines/templates/sglang-multi-test.yml b/.azure-pipelines/templates/sglang-multi-test.yml new file mode 100644 index 000000000..80e729268 --- /dev/null +++ b/.azure-pipelines/templates/sglang-multi-test.yml @@ -0,0 +1,95 @@ +# ============================================================================= +# SGLang multi-node test template. +# +# Runs on the pipeline agent and dispatches remote steps to the two VMSS GPU +# nodes (via run-remote-task.yml + the SSH config / hostfile produced by the +# caller pipeline). Steps: +# 1. Prerequisite, done by the caller's deploy.yml step (not here): deploy MSCCL++ to each node. +# 2.
Install a (currently forked) SGLang on each node, replacing any +# pre-baked copy from the base image. +# 3. Run a 2-node sglang.bench_one_batch smoke test with MSCCL++ enabled. +# 4. Run the MSCCL++ all-reduce micro-benchmark via torchrun across both +# nodes. +# ============================================================================= + +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: containerName + type: string + default: 'sglang-mscclpp-test' + +steps: +# TODO: Switch to the official upstream sglang repo once Caio's PR is merged. +# Tracking: the fork below (`caiomcbr/sglang`, cloned at `main`; confirm whether the +# `caiorocha/mscclpp` branch was intended) is personal and should not remain a long-term CI dependency. +- template: run-remote-task.yml + parameters: + name: InstallSGLang + displayName: Install SGLang + runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser' + remoteScript: | + git clone -b main https://github.com/caiomcbr/sglang.git + cd sglang/python + pip install -e . + +# Smoke test: 2-node tensor-parallel benchmark of Qwen3-8B with MSCCL++. +# Port 20003 is the SGLang distributed-init rendezvous port (arbitrary, must +# match across ranks and be free on node 0). +- template: run-remote-task.yml + parameters: + name: RunSGLangMultiBenchOneBatch + displayName: Run SGLang Multi-Node Bench One Batch + runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser' + remoteScript: | + export FLASHINFER_DISABLE_VERSION_CHECK=1 + VMSS="${{ parameters.vmssName }}" + HOSTNAME=$(hostname) + # Explicit 2-node mapping: hostname suffix -> SGLang node rank.
+ if [ "$HOSTNAME" = "${VMSS}000000" ]; then + NODE_RANK=0 + elif [ "$HOSTNAME" = "${VMSS}000001" ]; then + NODE_RANK=1 + else + echo "Unknown hostname: $HOSTNAME" + exit 1 + fi + python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 2 4 8 16 32 64 128 256 512 --input-len 256 --output-len 256 --tp-size 16 --dist-init-addr ${VMSS}000000:20003 --nnodes 2 --node-rank $NODE_RANK --enable-mscclpp + +# Depends on the `sglang/` source tree cloned by the InstallSGLang step above +# (steps on the same remote share a working directory). +- template: run-remote-task.yml + parameters: + name: RunSGLangMultiTestAllReduce + displayName: Run SGLang Multi-Node Test All Reduce + runRemoteArgs: '--container ${{ parameters.containerName }} --hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --user azureuser' + remoteScript: | + export FLASHINFER_DISABLE_VERSION_CHECK=1 + VMSS="${{ parameters.vmssName }}" + HOSTNAME=$(hostname) + # Explicit 2-node mapping: hostname suffix -> torchrun node rank. + if [ "$HOSTNAME" = "${VMSS}000000" ]; then + NODE_RANK=0 + elif [ "$HOSTNAME" = "${VMSS}000001" ]; then + NODE_RANK=1 + else + echo "Unknown hostname: $HOSTNAME" + exit 1 + fi + + export NODE_SIZE=2 + export WORLD_SIZE=8 + + cd sglang + + # Port 20004 is the torchrun rendezvous port (arbitrary, must match + # across ranks and be free on node 0). Distinct from 20003 used by + # sglang.bench_one_batch above. + torchrun --nproc_per_node $WORLD_SIZE \ + --nnodes $NODE_SIZE \ + --node_rank $NODE_RANK \ + --master_addr ${VMSS}000000 \ + --master_port 20004 \ + benchmark/kernels/all_reduce/benchmark_mscclpp.py diff --git a/.azure-pipelines/templates/sglang-test.yml b/.azure-pipelines/templates/sglang-test.yml new file mode 100644 index 000000000..0d663b71e --- /dev/null +++ b/.azure-pipelines/templates/sglang-test.yml @@ -0,0 +1,87 @@ +# ============================================================================= +# SGLang single-node test template. 
+# +# Runs on the pipeline agent and dispatches remote steps to a single VMSS GPU +# node (via run-remote-task.yml). Steps: +# 1. Deploy (deploy.yml): bring the VMSS node online, ship the sources, start +# the SGLang test container and run test/deploy/setup.sh inside it. +# 2. Install a (currently forked) SGLang. +# 3. Run sglang.bench_one_batch at several batch sizes. +# 4. Run the MSCCL++ all-reduce micro-benchmark via torchrun. +# 5. Stop / deallocate the VMSS node. +# NOTE: there is no separate MSCCL++ build step in this template, and no +# sglang server / sglang.bench_serving end-to-end run is wired up here. +# ============================================================================= + +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: gpuArch + type: string +- name: containerName + type: string + default: 'sglang-mscclpp-test' +- name: sglangImage + type: string + default: 'lmsysorg/sglang:latest-cu129' + +steps: +# deployArgs positional fields: <test_name> <ib_environment> <platform> +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + gpuArch: ${{ parameters.gpuArch }} + deployArgs: 'single-node-test true cuda' + containerName: ${{ parameters.containerName }} + sglangImage: ${{ parameters.sglangImage }} + +# TODO: Switch to the official upstream sglang repo once Caio's PR is merged. +# Tracking: the fork below (`caiomcbr/sglang`, cloned at `main`; confirm whether the +# `caiorocha/mscclpp` branch was intended) is personal and should not remain a long-term CI dependency. Also consider pinning to a +# release branch or commit SHA for reproducibility. +- template: run-remote-task.yml + parameters: + name: InstallSGLang + displayName: Install SGLang + runRemoteArgs: '--container ${{ parameters.containerName }}' + remoteScript: | + git clone -b main https://github.com/caiomcbr/sglang.git + cd sglang/python + pip install -e .
+ +- template: run-remote-task.yml + parameters: + name: RunSGLangBenchOneBatch + displayName: Run SGLang Bench One Batch + runRemoteArgs: '--container ${{ parameters.containerName }}' + remoteScript: | + export FLASHINFER_DISABLE_VERSION_CHECK=1 + python -m sglang.bench_one_batch --model-path Qwen/Qwen3-8B --batch 1 2 4 8 16 32 64 128 256 512 --input-len 256 --output-len 256 --tp-size 8 --enable-mscclpp + +# Depends on the `sglang/` source tree cloned by the InstallSGLang step above +# (steps on the same remote share a working directory). +- template: run-remote-task.yml + parameters: + name: RunSGLangTestAllReduce + displayName: Run SGLang Test All Reduce + runRemoteArgs: '--container ${{ parameters.containerName }}' + remoteScript: | + export FLASHINFER_DISABLE_VERSION_CHECK=1 + export NODE_SIZE=1 + export WORLD_SIZE=8 + export RANK=0 + + cd sglang + + torchrun --nproc_per_node $WORLD_SIZE \ + --nnodes $NODE_SIZE \ + --node_rank $RANK \ + benchmark/kernels/all_reduce/benchmark_mscclpp.py + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/ut-no-ib-env.yml b/.azure-pipelines/templates/ut-no-ib-env.yml index a62f1a77a..cc7d20182 100644 --- a/.azure-pipelines/templates/ut-no-ib-env.yml +++ b/.azure-pipelines/templates/ut-no-ib-env.yml @@ -13,7 +13,7 @@ steps: vmssName: ${{ parameters.vmssName }} gpuArch: ${{ parameters.gpuArch }} cmakeArgs: '-DMSCCLPP_USE_IB=OFF' - deployArgs: 'single-node-test false' + deployArgs: 'single-node-test false cuda' - template: run-remote-task.yml parameters: diff --git a/.azure-pipelines/templates/ut-npkit.yml b/.azure-pipelines/templates/ut-npkit.yml index 1bd89caf4..18934e6b2 100644 --- a/.azure-pipelines/templates/ut-npkit.yml +++ b/.azure-pipelines/templates/ut-npkit.yml @@ -14,7 +14,7 @@ steps: vmssName: ${{ parameters.vmssName }} gpuArch: ${{ parameters.gpuArch }} cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT 
-DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"' - deployArgs: 'single-node-test' + deployArgs: 'single-node-test true cuda' - template: run-remote-task.yml parameters: diff --git a/docker/build.sh b/docker/build.sh index 651a61222..b84eac9ad 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -75,6 +75,7 @@ docker build -t ${TAG_BASE_DEV} \ --build-arg BASE_IMAGE=${TAG_BASE} \ --build-arg TARGET=${TARGET} . + GHCR="ghcr.io/microsoft/mscclpp/mscclpp" GHCR_TAG_BASE_DEV=${GHCR}:base-dev-${TARGET} GHCR_TAG_BASE_DEV_ARCH=${GHCR}:base-dev-${TARGET}-${OS_ARCH} @@ -107,4 +108,4 @@ echo "" echo " docker buildx imagetools create \\" echo " --tag ${GHCR_TAG_BASE_DEV} \\" echo " ${GHCR_TAG_BASE_DEV_ARCH}" -echo "" +echo "" \ No newline at end of file diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh index 6358787bf..02fe4fd25 100644 --- a/test/deploy/deploy.sh +++ b/test/deploy/deploy.sh @@ -1,17 +1,34 @@ +#!/bin/bash +# deploy.sh — Provisions remote hosts, copies sources, and launches Docker containers +# for mscclpp CI/CD test environments. +# +# Usage: deploy.sh <test_name> [ib_environment] [platform] [container_name] [sglang_image] +# test_name : Test suite to deploy (e.g. single-node-test, nccltest-single-node) +# ib_environment : Enable InfiniBand networking (default: true) +# platform : Target GPU platform — "cuda" or "rocm" (default: cuda) +# container_name : Docker container name (default: mscclpp-test) +# sglang_image : Docker image used for the SGLang test container +# (default: lmsysorg/sglang:latest). Only used when +# container_name is "sglang-mscclpp-test".
+ set -ex TEST_NAME=$1 IB_ENVIRONMENT="${2:-true}" PLATFORM="${3:-cuda}" +CONTAINER_NAME="${4:-mscclpp-test}" +SGLANG_IMAGE="${5:-lmsysorg/sglang:latest}" KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/" DST_DIR="/tmp/mscclpp" + if [ "${TEST_NAME}" == "nccltest-single-node" ] || [ "${TEST_NAME}" == "single-node-test" ]; then HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile_ci" else HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile" fi + SSH_OPTION="StrictHostKeyChecking=no" chmod 400 ${KeyFilePath} @@ -26,8 +43,8 @@ while true; do echo "Waiting for sshd to start..." sleep 5 done - set -e + parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}" tar czf /tmp/mscclpp.tar.gz -C ${ROOT_DIR} . parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION /tmp/mscclpp.tar.gz /tmp/mscclpp.tar.gz @@ -57,25 +74,38 @@ if [ "${PLATFORM}" == "cuda" ]; then fi" fi -# force to pull the latest image -parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ - "sudo docker pull ${CONTAINERIMAGE}" +if [ "${CONTAINER_NAME}" == "sglang-mscclpp-test" ]; then + # force to pull the latest image + parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ + "sudo docker pull ${SGLANG_IMAGE}" -LAUNCH_OPTION="--gpus=all" -if [ "${PLATFORM}" == "rocm" ]; then - LAUNCH_OPTION="--device=/dev/kfd --device=/dev/dri --group-add=video" -fi -if [ "${IB_ENVIRONMENT}" == "true" ]; then parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ - "sudo docker run --rm -itd --privileged --net=host --ipc=host ${LAUNCH_OPTION} \ - -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \ - --entrypoint /bin/bash ${CONTAINERIMAGE}" + "sudo docker run --rm -itd --name=${CONTAINER_NAME} --privileged --net=host --ipc=host --gpus=all -w /root -v ${DST_DIR}:/root/mscclpp --entrypoint 
/bin/bash ${SGLANG_IMAGE}" else + # force to pull the latest image parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ - "sudo docker run --rm -itd --net=host --ipc=host ${LAUNCH_OPTION} --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \ - -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \ - --entrypoint /bin/bash ${CONTAINERIMAGE}" + "sudo docker pull ${CONTAINERIMAGE}" + + # Set GPU passthrough flags based on platform + LAUNCH_OPTION="--gpus=all" + if [ "${PLATFORM}" == "rocm" ]; then + LAUNCH_OPTION="--device=/dev/kfd --device=/dev/dri --group-add=video" + fi + + if [ "${IB_ENVIRONMENT}" == "true" ]; then + # InfiniBand: use --privileged for RDMA device access + parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ + "sudo docker run --rm -itd --privileged --net=host --ipc=host ${LAUNCH_OPTION} \ + -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=${CONTAINER_NAME} \ + --entrypoint /bin/bash ${CONTAINERIMAGE}" + else + # Non-IB: grant SYS_ADMIN and disable seccomp instead of full --privileged + parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ + "sudo docker run --rm -itd --net=host --ipc=host ${LAUNCH_OPTION} --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \ + -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=${CONTAINER_NAME} \ + --entrypoint /bin/bash ${CONTAINERIMAGE}" + fi fi -parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ - "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}" +parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ + "sudo docker exec -t --user root ${CONTAINER_NAME} bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}" diff --git a/test/deploy/run-remote.sh b/test/deploy/run-remote.sh index 
2468243ea..9607664fc 100755 --- a/test/deploy/run-remote.sh +++ b/test/deploy/run-remote.sh @@ -11,6 +11,7 @@ # --hostfile Override hostfile path (default: test/deploy/hostfile_ci) # --host Run command on a single host (uses parallel-ssh -H) # --user SSH user when using --host or custom hostfile +# --container Docker container name to exec into (default: mscclpp-test) set -e @@ -23,9 +24,10 @@ USE_DOCKER=true USE_LOG=true TARGET_HOST="" REMOTE_USER="" +CONTAINER_NAME="mscclpp-test" usage() { - echo "Usage: $0 [--no-docker] [--no-log] [--hostfile ] [--host ] [--user ] < " >&2 + echo "Usage: $0 [--no-docker] [--no-log] [--hostfile ] [--host ] [--user ] [--container ] < " >&2 } require_value() { @@ -56,6 +58,11 @@ while [[ "$1" == --* ]]; do REMOTE_USER="$2" shift 2 ;; + --container) + require_value "--container" "${2-}" + CONTAINER_NAME="$2" + shift 2 + ;; *) echo "Unknown option: $1" >&2; exit 1 ;; esac done @@ -103,7 +110,7 @@ if $USE_DOCKER; then INNER+=" rm -f \\\"\\\$TMP\\\"" parallel-ssh -i "${PSSH_COMMON[@]}" \ - "sudo docker exec mscclpp-test bash -c \"${INNER}\"" + "sudo docker exec ${CONTAINER_NAME} bash -c \"${INNER}\"" else parallel-ssh -i "${PSSH_COMMON[@]}" \ "set -euxo pipefail; CMD_B64='${CMD_B64}'; TMP=\$(mktemp); printf '%s' \"\$CMD_B64\" | base64 -d > \"\$TMP\"; bash -euxo pipefail \"\$TMP\"; rm -f \"\$TMP\""