# PR Test
# Workflow file for run: PR Test #143

name: PR Test

# Triggers: a 6-hourly schedule, pull requests targeting main, manual dispatch,
# and reuse from other workflows via workflow_call.
on:
  schedule:
    - cron: "0 */6 * * *"  # Run every 6 hours
  pull_request:
    branches:
      - main
  workflow_dispatch:
    inputs:
      version:
        description: 'FlashInfer version'
        type: choice
        required: true
        default: 'release'
        options:
          - 'release'
          - 'nightly'
      target_stage:
        description: 'Specific stage to run (optional, for quick testing)'
        type: string
        required: false
        default: ''
      force_continue_on_error:
        description: 'Force continue-on-error (test scheduled CI behavior)'
        type: boolean
        required: false
        default: false
      pr_head_sha:
        description: 'PR head SHA to checkout (for /rerun-stage on fork PRs)'
        type: string
        required: false
        default: ''
  workflow_call:
    inputs:
      ref:
        description: "Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch."
        type: string
        required: false
        default: ""
      run_all_tests:
        description: 'Run all tests (for releasing or testing purpose)'
        type: boolean
        required: false
        default: false
concurrency:
  # Include pr_head_sha in group for /rerun-stage dispatches to avoid collisions with main branch runs
  group: pr-test-${{ inputs.pr_head_sha || inputs.ref || github.ref }}
  # In a called (reusable) workflow, github.event_name is the caller's event and is
  # never 'workflow_call', so comparing against it enabled cancellation for every run.
  # `ref` and `run_all_tests` are declared only on the workflow_call trigger: when
  # either is set the run came from a caller and must not be cancelled by a newer run.
  cancel-in-progress: ${{ !inputs.ref && !inputs.run_all_tests }}
# Workflow-wide environment, visible to every job and step.
env:
  SGLANG_IS_IN_CI: "true"
jobs:
# =============================================== check changes ====================================================
check-changes:
  # Works out which components changed and publishes the knobs later jobs read:
  # component flags, matrix parallelism, B200 runner label, retry and continue-on-error.
  runs-on: ubuntu-latest
  outputs:
    main_package: ${{ steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }}
    sgl_kernel: ${{ steps.filter.outputs.sgl_kernel }}  # sgl-kernel tests only run when kernels are rebuilt
    jit_kernel: ${{ steps.filter.outputs.jit_kernel || steps.run-mode.outputs.run_all_tests }}
    multimodal_gen: ${{ steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }}
    max_parallel: ${{ steps.set-parallel.outputs.max_parallel }}
    b200_runner: ${{ steps.set-runner.outputs.b200_runner }}
    enable_retry: ${{ steps.set-retry.outputs.enable_retry }}
    continue_on_error: ${{ steps.set-continue-on-error.outputs.continue_on_error }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    - name: Determine run mode
      id: run-mode
      run: |
        # Run all tests for scheduled runs, or when the run_all_tests input is set.
        # Note: github.event_name is inherited from the caller, so a workflow_call
        # cannot be recognised by event name; the run_all_tests input is checked instead.
        if [[ "${{ github.event_name }}" == "schedule" || "${{ inputs.run_all_tests }}" == "true" ]]; then
          echo "run_all_tests=true" >> "$GITHUB_OUTPUT"
          echo "Run mode: ALL TESTS (schedule=${{ github.event_name == 'schedule' }}, run_all_tests=${{ inputs.run_all_tests }})"
        else
          echo "run_all_tests=false" >> "$GITHUB_OUTPUT"
          echo "Run mode: FILTERED (triggered by ${{ github.event_name }})"
        fi
    # Skipped in run-all mode; its outputs are then empty and the `||` fallbacks above apply.
    - name: Detect file changes
      id: filter
      uses: dorny/paths-filter@v3
      if: steps.run-mode.outputs.run_all_tests != 'true'
      with:
        filters: |
          main_package:
            - "python/sglang/!(multimodal_gen)/**"
            - "python/*.toml"
            - "scripts/ci/**"
            - "test/**"
            - ".github/workflows/pr-test.yml"
          sgl_kernel:
            - "sgl-kernel/**"
          jit_kernel:
            - "python/sglang/jit_kernel/**"
            - "python/*.toml"
            - ".github/workflows/pr-test.yml"
          multimodal_gen:
            - "python/sglang/multimodal_gen/**"
            - "python/sglang/cli/**"
            - "python/*.toml"
            - ".github/workflows/pr-test.yml"
    - name: Set max-parallel based on high-priority label
      id: set-parallel
      run: |
        if [[ "${{ github.event_name }}" == "pull_request" && "${{ contains(github.event.pull_request.labels.*.name, 'high priority') }}" == "true" ]]; then
          echo "max_parallel=15" >> "$GITHUB_OUTPUT"
          echo "High priority PR detected, setting max_parallel to 15"
        else
          echo "max_parallel=4" >> "$GITHUB_OUTPUT"
          echo "Using default max_parallel of 4"
        fi
    - name: Set B200 runner tag
      id: set-runner
      run: |
        # NOTE(review): this also selects the kernel runner in run-all mode, although the
        # sgl_kernel job output above is filter-only; confirm that is intended.
        sgl_kernel="${{ steps.filter.outputs.sgl_kernel || steps.run-mode.outputs.run_all_tests }}"
        if [[ "$sgl_kernel" == "true" ]]; then
          echo "b200_runner=4-gpu-b200-kernel" >> "$GITHUB_OUTPUT"
        else
          echo "b200_runner=4-gpu-b200" >> "$GITHUB_OUTPUT"
        fi
    - name: Enable retry for CI
      id: set-retry
      run: |
        echo "enable_retry=true" >> "$GITHUB_OUTPUT"
        echo "Retry logic enabled for CI"
    - name: Set continue-on-error for full test runs
      id: set-continue-on-error
      run: |
        if [[ "${{ steps.run-mode.outputs.run_all_tests }}" == "true" || "${{ inputs.force_continue_on_error }}" == "true" ]]; then
          echo "continue_on_error=true" >> "$GITHUB_OUTPUT"
          echo "Full test run or force flag detected, enabling continue-on-error to run all tests"
        else
          echo "continue_on_error=false" >> "$GITHUB_OUTPUT"
          echo "Filtered run, continue-on-error disabled"
        fi
    - name: Show filter results in summary (table)
      run: |
        {
          echo "## Change Detection"
          echo ""
          echo "| Component | Changed |"
          echo "|-------------------|---------|"
          echo "| main_package | ${{ steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }} |"
          # Mirror the sgl_kernel job output (filter-only) so the table matches what
          # downstream jobs actually see; it is empty in run-all mode, hence 'false'.
          echo "| sgl_kernel | ${{ steps.filter.outputs.sgl_kernel || 'false' }} |"
          echo "| jit_kernel | ${{ steps.filter.outputs.jit_kernel || steps.run-mode.outputs.run_all_tests }} |"
          echo "| multimodal_gen | ${{ steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }} |"
          echo "| max_parallel | ${{ steps.set-parallel.outputs.max_parallel }} |"
          echo "| b200_runner | ${{ steps.set-runner.outputs.b200_runner }} |"
          echo "| enable_retry | ${{ steps.set-retry.outputs.enable_retry }} |"
          echo "| continue_on_error | ${{ steps.set-continue-on-error.outputs.continue_on_error }} |"
        } >> "$GITHUB_STEP_SUMMARY"
# =============================================== PR Gate ====================================================
call-gate:
  # Invokes the pr-gate.yml reusable workflow, but only when check-changes
  # reports at least one component as changed.
  uses: ./.github/workflows/pr-gate.yml
  secrets: inherit
  needs: check-changes
  if: |
    needs.check-changes.outputs.main_package == 'true' ||
    needs.check-changes.outputs.sgl_kernel == 'true' ||
    needs.check-changes.outputs.jit_kernel == 'true' ||
    needs.check-changes.outputs.multimodal_gen == 'true'
# =============================================== sgl-kernel ====================================================
sgl-kernel-build-wheels:
  # Builds the x64 sgl-kernel wheel and uploads it as an artifact; runs only
  # when check-changes reports sgl_kernel == 'true'.
  name: Build Wheel
  needs: [check-changes, call-gate]
  if: needs.check-changes.outputs.sgl_kernel == 'true'
  runs-on: x64-kernel-build-node
  strategy:
    matrix:
      include:
        - python-version: "3.10"
          cuda-version: "12.9"
        # Add back when CUDA 13.0 is supported on CI
        # - python-version: "3.10"
        #   cuda-version: "13.0"
  steps:
    - name: Cleanup
      run: |
        # Guard and quote the workspace path (same form as the ARM build job) so an
        # unset or space-containing GITHUB_WORKSPACE can never expand to `rm -rf /*`.
        if [ -d "$GITHUB_WORKSPACE" ]; then
          sudo rm -rf "$GITHUB_WORKSPACE"/* || true
        else
          echo "$GITHUB_WORKSPACE does not exist, nothing to clean"
        fi
    - uses: actions/checkout@v4
      with:
        submodules: "recursive"
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
      run: |
        cd sgl-kernel
        ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
      env:
        USE_CCACHE: 1
    # The second `ls` fails the step when no .whl file was produced.
    - name: Verify wheel artifacts
      run: |
        ls -alh sgl-kernel/dist
        ls -alh sgl-kernel/dist/*.whl
    - name: Upload artifacts
      uses: actions/upload-artifact@v4
      with:
        name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
        path: sgl-kernel/dist/*
        if-no-files-found: error
sgl-kernel-build-wheels-arm:
  # ARM counterpart of the wheel build; the artifact name carries an -aarch64 suffix.
  name: Build Wheel Arm
  runs-on: arm-kernel-build-node
  strategy:
    matrix:
      include:
        - cuda-version: "12.9"
          python-version: "3.10"
  needs:
    - check-changes
    - call-gate
  if: needs.check-changes.outputs.sgl_kernel == 'true'
  steps:
    # Empty the workspace only if the directory actually exists.
    - name: Cleanup
      run: |
        if [ -d "$GITHUB_WORKSPACE" ]; then
          sudo rm -rf "$GITHUB_WORKSPACE"/* || true
        else
          echo "$GITHUB_WORKSPACE does not exist, nothing to clean"
        fi
    - uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
        submodules: "recursive"
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
      env:
        USE_CCACHE: 1
      run: |
        cd sgl-kernel
        ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
    # Listing *.whl makes the step fail when the build produced no wheel.
    - name: Verify wheel artifacts
      run: |
        ls -alh sgl-kernel/dist
        ls -alh sgl-kernel/dist/*.whl
    - name: Upload artifacts
      uses: actions/upload-artifact@v4
      with:
        name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}-aarch64
        if-no-files-found: error
        path: sgl-kernel/dist/*
sgl-kernel-unit-test:
  # Runs the sgl-kernel pytest suite after downloading the
  # wheel-python3.10-cuda12.9 artifact. Skipped when a target_stage is given.
  runs-on: 1-gpu-runner
  env:
    RUNNER_LABELS: 1-gpu-runner
  needs:
    - check-changes
    - call-gate
    - sgl-kernel-build-wheels
  if: |
    !inputs.target_stage &&
    needs.check-changes.outputs.sgl_kernel == 'true'
  steps:
    - uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # Clear any wheels left in dist/ before the download step.
    - name: Cleanup
      run: |
        ls -alh sgl-kernel/dist || true
        rm -rf sgl-kernel/dist/* || true
    - name: Download artifacts
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} bash scripts/ci/ci_install_dependency.sh diffusion
    - name: Run test
      timeout-minutes: 30
      run: |
        cd sgl-kernel
        pytest tests/
sgl-kernel-mla-test:
  # Runs test/registered/mla/test_mla_deepseek_v3.py after downloading the
  # wheel-python3.10-cuda12.9 artifact. Skipped when a target_stage is given.
  runs-on: 1-gpu-runner
  env:
    RUNNER_LABELS: 1-gpu-runner
  needs:
    - check-changes
    - call-gate
    - sgl-kernel-build-wheels
  if: |
    !inputs.target_stage &&
    needs.check-changes.outputs.sgl_kernel == 'true'
  steps:
    - uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # Clear any wheels left in dist/ before the download step.
    - name: Cleanup
      run: |
        ls -alh sgl-kernel/dist || true
        rm -rf sgl-kernel/dist/* || true
    - name: Download artifacts
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} bash scripts/ci/ci_install_dependency.sh
    - name: Run test
      timeout-minutes: 30
      run: |
        cd test/registered/mla
        python3 test_mla_deepseek_v3.py
sgl-kernel-benchmark-test:
  # Smoke-runs every sgl-kernel/benchmark/bench_*.py script with a 60 s cap each.
  # A script that fails or times out only prints a warning; the loop keeps going.
  runs-on: 1-gpu-runner
  env:
    RUNNER_LABELS: 1-gpu-runner
    CI: true
  needs:
    - check-changes
    - call-gate
    - sgl-kernel-build-wheels
  if: |
    !inputs.target_stage &&
    needs.check-changes.outputs.sgl_kernel == 'true'
  steps:
    - uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # Clear any wheels left in dist/ before the download step.
    - name: Cleanup
      run: |
        ls -alh sgl-kernel/dist || true
        rm -rf sgl-kernel/dist/* || true
    - name: Download artifacts
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} bash scripts/ci/ci_install_dependency.sh
    - name: Run benchmark tests
      timeout-minutes: 45
      run: |
        cd sgl-kernel/benchmark
        echo "Running sgl-kernel benchmark tests in CI mode..."
        echo "CI environment variable: $CI"
        echo "GITHUB_ACTIONS environment variable: $GITHUB_ACTIONS"
        for bench_file in bench_*.py; do
          echo "Testing $bench_file..."
          timeout 60 python3 "$bench_file" || echo "Warning: $bench_file timed out or failed, continuing..."
          echo "Completed $bench_file"
          echo "---"
        done
        echo "All benchmark tests completed!"
sgl-kernel-b200-test:
  # Runs the sgl-kernel pytest suite on the B200 runner chosen by check-changes,
  # with IS_BLACKWELL=1 set for dependency installation.
  runs-on: ${{ needs.check-changes.outputs.b200_runner }}
  env:
    RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }}
  needs:
    - check-changes
    - sgl-kernel-build-wheels
  if: |
    !inputs.target_stage &&
    needs.check-changes.outputs.sgl_kernel == 'true'
  steps:
    - uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # Clear any wheels left in dist/ before the download step.
    - name: Cleanup
      run: |
        ls -alh sgl-kernel/dist || true
        rm -rf sgl-kernel/dist/* || true
    - name: Download artifacts
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh diffusion
    - name: Run sgl-kernel unit tests on B200
      timeout-minutes: 30
      run: |
        cd sgl-kernel
        pytest tests/
# Adding a single CUDA13 smoke test to verify that the kernel builds and runs
# TODO: Add back this test when it can pass on CI
# cuda13-kernel-smoke-test:
# needs: [check-changes, sgl-kernel-build-wheels]
# if: needs.check-changes.outputs.sgl_kernel == 'true'
# runs-on: x64-cu13-kernel-tests
# steps:
# - uses: actions/checkout@v4
# - name: Cleanup
# run: |
# ls -alh sgl-kernel/dist || true
# rm -rf sgl-kernel/dist/* || true
# - name: Download CUDA 13.0 artifacts
# uses: actions/download-artifact@v4
# with:
# path: sgl-kernel/dist/
# merge-multiple: true
# pattern: wheel-python3.10-cuda13.0
# - name: Install dependencies
# run: |
# CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
# - name: Run kernel unit tests
# timeout-minutes: 30
# run: |
# cd sgl-kernel
# pytest tests/
# =============================================== jit-kernel ====================================================
jit-kernel-unit-test:
  # Runs pytest over python/sglang/jit_kernel/tests when jit_kernel is flagged
  # and no target_stage was requested.
  runs-on: 1-gpu-runner
  env:
    RUNNER_LABELS: 1-gpu-runner
  needs:
    - check-changes
    - call-gate
  if: |
    !inputs.target_stage &&
    needs.check-changes.outputs.jit_kernel == 'true'
  steps:
    - uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    - name: Install dependencies
      run: |
        bash scripts/ci/ci_install_dependency.sh
    - name: Run test
      timeout-minutes: 30
      run: |
        cd python/sglang/jit_kernel
        pytest tests/
# =============================================== primary ====================================================
stage-a-test-1:
  # First GPU stage. Runs when explicitly targeted via target_stage, or on an
  # untargeted run when main_package or sgl_kernel is flagged. Scheduled runs
  # proceed even if an upstream job failed or was cancelled.
  runs-on: 1-gpu-runner
  env:
    RUNNER_LABELS: 1-gpu-runner
  needs:
    - check-changes
    - call-gate
    - sgl-kernel-build-wheels
  if: |
    always() &&
    (
      (inputs.target_stage == 'stage-a-test-1') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # The custom wheel is fetched only when the kernels were rebuilt.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} bash scripts/ci/ci_install_dependency.sh
    - name: Run test
      timeout-minutes: 10
      run: |
        cd test/
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 run_suite.py --hw cuda --suite stage-a-test-1 $CONTINUE_ON_ERROR_FLAG
        # temporarily put backend-independent cpu tests here
        python3 run_suite.py --hw cpu --suite default $CONTINUE_ON_ERROR_FLAG
stage-a-cpu-only:
  # CPU-only suite on a GitHub-hosted runner. Runs when targeted via
  # target_stage, or on an untargeted run when main_package is flagged.
  runs-on: ubuntu-latest
  needs:
    - check-changes
    - call-gate
  if: |
    always() &&
    (
      (inputs.target_stage == 'stage-a-cpu-only') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        (needs.check-changes.outputs.main_package == 'true')
      )
    )
  steps:
    # Remove preinstalled toolchains, then print the remaining disk usage.
    - name: Free disk space
      run: |
        sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
        df -h
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.10"
    - name: Install dependencies
      run: |
        pip install -e "python/[dev]"
    - name: Run test
      timeout-minutes: 10
      run: |
        cd test/
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 run_suite.py --hw cpu --suite stage-a-cpu-only $CONTINUE_ON_ERROR_FLAG
stage-b-test-small-1-gpu:
  # Stage-B small suite, sharded across 11 matrix partitions; the partition
  # count passed to run_suite.py (11) matches the length of the matrix list.
  runs-on: 1-gpu-runner
  env:
    RUNNER_LABELS: 1-gpu-runner
  strategy:
    fail-fast: false
    matrix:
      partition:
        - 0
        - 1
        - 2
        - 3
        - 4
        - 5
        - 6
        - 7
        - 8
        - 9
        - 10
  needs:
    - check-changes
    - call-gate
    - stage-a-test-1
    - sgl-kernel-build-wheels
  if: |
    always() &&
    (
      (inputs.target_stage == 'stage-b-test-small-1-gpu') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # The custom wheel is fetched only when the kernels were rebuilt.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} bash scripts/ci/ci_install_dependency.sh
    - name: Run test
      timeout-minutes: 30
      run: |
        cd test/
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 run_suite.py --hw cuda --suite stage-b-test-small-1-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 11 $CONTINUE_ON_ERROR_FLAG
stage-b-test-large-1-gpu:
  # Stage-B large suite on a single-GPU runner; scheduled after stage-a-test-1.
  runs-on: 1-gpu-runner
  env:
    RUNNER_LABELS: 1-gpu-runner
  needs:
    - check-changes
    - call-gate
    - stage-a-test-1
    - sgl-kernel-build-wheels
  if: |
    always() &&
    (
      (inputs.target_stage == 'stage-b-test-large-1-gpu') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # The custom wheel is fetched only when the kernels were rebuilt.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} bash scripts/ci/ci_install_dependency.sh
    - name: Run test
      timeout-minutes: 30
      run: |
        cd test/
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 run_suite.py --hw cuda --suite stage-b-test-large-1-gpu $CONTINUE_ON_ERROR_FLAG
stage-b-test-large-2-gpu:
  # Stage-B large suite on a two-GPU runner; scheduled after stage-a-test-1.
  runs-on: 2-gpu-runner
  env:
    RUNNER_LABELS: 2-gpu-runner
  needs:
    - check-changes
    - call-gate
    - stage-a-test-1
    - sgl-kernel-build-wheels
  if: |
    always() &&
    (
      (inputs.target_stage == 'stage-b-test-large-2-gpu') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # The custom wheel is fetched only when the kernels were rebuilt.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} bash scripts/ci/ci_install_dependency.sh
    - name: Run test
      timeout-minutes: 30
      run: |
        cd test/
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu $CONTINUE_ON_ERROR_FLAG
stage-c-test-large-4-gpu:
  # Stage-C large suite on the 4-GPU H100 runner; scheduled after all three
  # non-B200 stage-B jobs.
  runs-on: 4-gpu-h100
  env:
    RUNNER_LABELS: 4-gpu-h100
  needs:
    - check-changes
    - call-gate
    - stage-b-test-small-1-gpu
    - stage-b-test-large-1-gpu
    - stage-b-test-large-2-gpu
    - sgl-kernel-build-wheels
  if: |
    always() &&
    (
      (inputs.target_stage == 'stage-c-test-large-4-gpu') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # The custom wheel is fetched only when the kernels were rebuilt.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} bash scripts/ci/ci_install_dependency.sh
    - name: Run test
      timeout-minutes: 30
      run: |
        cd test/
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 run_suite.py --hw cuda --suite stage-c-test-large-4-gpu $CONTINUE_ON_ERROR_FLAG
stage-c-test-large-4-gpu-b200:
  # Stage-C large suite on the B200 runner chosen by check-changes; scheduled
  # after every stage-B job, including stage-b-test-4-gpu-b200.
  needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200, sgl-kernel-build-wheels]
  if: |
    always() &&
    (
      (inputs.target_stage == 'stage-c-test-large-4-gpu-b200') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: ${{ needs.check-changes.outputs.b200_runner }}
  env:
    RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # NOTE(review): the non-B200 jobs in this workflow pin download-artifact@v4;
    # confirm @v6 is intentional for the B200 runners.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v6
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh
    - name: Run test
      timeout-minutes: 30
      run: |
        cd test/
        # Honour continue_on_error like every other run_suite.py stage, so
        # full/scheduled runs report all failures instead of stopping at the first.
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-c-test-large-4-gpu-b200 $CONTINUE_ON_ERROR_FLAG
multimodal-gen-test-1-gpu:
  # Diffusion server tests (suite 1-gpu), split into two partitions. Runs when
  # targeted via target_stage, or on an untargeted run when multimodal_gen is flagged.
  runs-on: 1-gpu-runner
  strategy:
    fail-fast: false
    matrix:
      part:
        - 0
        - 1
  needs:
    - check-changes
    - call-gate
    - sgl-kernel-build-wheels
  if: |
    always() &&
    (
      (inputs.target_stage == 'multimodal-gen-test-1-gpu') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        needs.check-changes.outputs.multimodal_gen == 'true'
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # The custom wheel is fetched only when the kernels were rebuilt.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} bash scripts/ci/ci_install_dependency.sh diffusion
    - name: Run diffusion server tests
      timeout-minutes: 60
      run: |
        cd python
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 sglang/multimodal_gen/test/run_suite.py \
          --suite 1-gpu \
          --partition-id ${{ matrix.part }} \
          --total-partitions 2 \
          $CONTINUE_ON_ERROR_FLAG
multimodal-gen-test-2-gpu:
  # Diffusion server tests (suite 2-gpu), split into two partitions. Runs when
  # targeted via target_stage, or on an untargeted run when multimodal_gen is flagged.
  runs-on: 2-gpu-runner
  strategy:
    fail-fast: false
    matrix:
      part:
        - 0
        - 1
  needs:
    - check-changes
    - call-gate
    - sgl-kernel-build-wheels
  if: |
    always() &&
    (
      (inputs.target_stage == 'multimodal-gen-test-2-gpu') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        needs.check-changes.outputs.multimodal_gen == 'true'
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # The custom wheel is fetched only when the kernels were rebuilt.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} bash scripts/ci/ci_install_dependency.sh diffusion
    - name: Run diffusion server tests
      timeout-minutes: 60
      run: |
        cd python
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 sglang/multimodal_gen/test/run_suite.py \
          --suite 2-gpu \
          --partition-id ${{ matrix.part }} \
          --total-partitions 2 \
          $CONTINUE_ON_ERROR_FLAG
quantization-test:
  # Runs the quantization_test suite from test/srt, passing the retry and
  # continue-on-error flags published by check-changes.
  runs-on: 1-gpu-runner
  needs:
    - check-changes
    - call-gate
    - stage-a-test-1
  if: |
    always() &&
    (
      (inputs.target_stage == 'quantization-test') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # The custom wheel is fetched only when the kernels were rebuilt.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} bash scripts/ci/ci_install_dependency.sh
    - name: Run test
      timeout-minutes: 30
      run: |
        cd test/srt
        RETRY_FLAG=""
        if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
          RETRY_FLAG="--enable-retry"
        fi
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 run_suite.py --suite quantization_test $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
unit-test-backend-1-gpu:
  # per-commit-1-gpu suite in two partitions; matrix parallelism is taken from
  # the max_parallel output of check-changes.
  runs-on: 1-gpu-runner
  env:
    RUNNER_LABELS: 1-gpu-runner
  strategy:
    fail-fast: false
    max-parallel: ${{ fromJson(needs.check-changes.outputs.max_parallel) }}
    matrix:
      part:
        - 0
        - 1
  needs:
    - check-changes
    - call-gate
    - stage-a-test-1
  if: |
    always() &&
    (
      (inputs.target_stage == 'unit-test-backend-1-gpu') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # The custom wheel is fetched only when the kernels were rebuilt.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} bash scripts/ci/ci_install_dependency.sh
    - name: Run test
      timeout-minutes: 30
      run: |
        cd test/srt
        RETRY_FLAG=""
        if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
          RETRY_FLAG="--enable-retry"
        fi
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 run_suite.py --suite per-commit-1-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
stage-b-test-4-gpu-b200:
  # Stage-B suite on the B200 runner chosen by check-changes, with
  # IS_BLACKWELL=1 set for both installation and the test run.
  runs-on: ${{ needs.check-changes.outputs.b200_runner }}
  env:
    RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }}
  strategy:
    fail-fast: false
  needs:
    - check-changes
    - call-gate
    - stage-a-test-1
    - sgl-kernel-build-wheels
  if: |
    always() &&
    (
      (inputs.target_stage == 'stage-b-test-4-gpu-b200') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # NOTE(review): the non-B200 jobs in this workflow pin download-artifact@v4;
    # confirm @v6 is intentional for the B200 runners.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v6
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh
    - name: Run test
      timeout-minutes: 30
      run: |
        cd test
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-b-test-4-gpu-b200 $CONTINUE_ON_ERROR_FLAG
unit-test-backend-2-gpu:
  # per-commit-2-gpu suite in two partitions; scheduled after unit-test-backend-1-gpu.
  runs-on: 2-gpu-runner
  env:
    RUNNER_LABELS: 2-gpu-runner
  strategy:
    fail-fast: false
    matrix:
      part:
        - 0
        - 1
  needs:
    - check-changes
    - call-gate
    - unit-test-backend-1-gpu
  if: |
    always() &&
    (
      (inputs.target_stage == 'unit-test-backend-2-gpu') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # The custom wheel is fetched only when the kernels were rebuilt.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} bash scripts/ci/ci_install_dependency.sh
    - name: Run test
      timeout-minutes: 30
      run: |
        cd test/srt
        RETRY_FLAG=""
        if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
          RETRY_FLAG="--enable-retry"
        fi
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 run_suite.py --suite per-commit-2-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
unit-test-backend-4-gpu:
  # per-commit-4-gpu suite in three partitions on the 4-GPU H100 runner.
  runs-on: 4-gpu-h100
  env:
    RUNNER_LABELS: 4-gpu-h100
  strategy:
    fail-fast: false
    matrix:
      part:
        - 0
        - 1
        - 2
  needs:
    - check-changes
    - call-gate
    - unit-test-backend-1-gpu
    - stage-b-test-4-gpu-b200
  if: |
    always() &&
    (
      (inputs.target_stage == 'unit-test-backend-4-gpu') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # The custom wheel is fetched only when the kernels were rebuilt.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} bash scripts/ci/ci_install_dependency.sh
    - name: Run test
      timeout-minutes: 20
      run: |
        cd test/srt
        RETRY_FLAG=""
        if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
          RETRY_FLAG="--enable-retry"
        fi
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 run_suite.py --suite per-commit-4-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
unit-test-backend-8-gpu-h200:
  # per-commit-8-gpu-h200 suite in four partitions on the 8-GPU H200 runner.
  runs-on: 8-gpu-h200
  env:
    RUNNER_LABELS: 8-gpu-h200
  strategy:
    fail-fast: false
    matrix:
      part:
        - 0
        - 1
        - 2
        - 3
  needs:
    - check-changes
    - call-gate
    - unit-test-backend-1-gpu
    - stage-b-test-4-gpu-b200
  if: |
    always() &&
    (
      (inputs.target_stage == 'unit-test-backend-8-gpu-h200') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    # The custom wheel is fetched only when the kernels were rebuilt.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{ needs.check-changes.outputs.sgl_kernel }} bash scripts/ci/ci_install_dependency.sh
    # - name: Warmup Weights and JIT Compilation
    #   timeout-minutes: 20
    #   run: |
    #     # An example command for testing the warmup. TODO: make this more general and move them to python scripts.
    #     python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code
    - name: Run test
      timeout-minutes: 20
      run: |
        cd test/srt
        RETRY_FLAG=""
        if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
          RETRY_FLAG="--enable-retry"
        fi
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 run_suite.py --suite per-commit-8-gpu-h200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 4 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
unit-test-backend-8-gpu-h20:
  # Runs the per-commit-8-gpu-h20 suite on the 8-gpu-h20 runner in two
  # partitions. Dependencies are installed through ci_install_deepep.sh.
  needs:
    - check-changes
    - call-gate
    - unit-test-backend-1-gpu
    - stage-b-test-4-gpu-b200
  # Run when this stage is requested via target_stage, or when no stage is
  # requested, the run is scheduled or nothing upstream failed / was cancelled,
  # and main_package or sgl_kernel changed.
  if: |
    always() &&
    (
      (inputs.target_stage == 'unit-test-backend-8-gpu-h20') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: 8-gpu-h20
  env:
    SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4"
    RUNNER_LABELS: 8-gpu-h20
  strategy:
    # Let both partitions finish even if one of them fails.
    fail-fast: false
    matrix:
      part: [0, 1]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Dispatched PR head SHA first, then the caller's ref, then the
        # triggering commit.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    # Only fetched when check-changes reports that sgl_kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_deepep.sh

    - name: Run test
      timeout-minutes: 20
      run: |
        cd test/srt
        RETRY_FLAG=""
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
          RETRY_FLAG="--enable-retry"
        fi
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 run_suite.py \
          --suite per-commit-8-gpu-h20 \
          --auto-partition-id ${{ matrix.part }} \
          --auto-partition-size 2 \
          $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
performance-test-1-gpu-part-1:
  # Single-GPU benchmarks, part 1: one-batch latency plus serving
  # latency/throughput cases (default, non-streaming, EAGLE, LoRA).
  needs:
    - check-changes
    - call-gate
    - stage-a-test-1
  # Run when this stage is requested via target_stage, or when no stage is
  # requested, the run is scheduled or nothing upstream failed / was cancelled,
  # and main_package or sgl_kernel changed.
  if: |
    always() &&
    (
      (inputs.target_stage == 'performance-test-1-gpu-part-1') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: 1-gpu-runner
  env:
    RUNNER_LABELS: 1-gpu-runner
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Dispatched PR head SHA first, then the caller's ref, then the
        # triggering commit.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    # Only fetched when check-changes reports that sgl_kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    # Each benchmark below is its own step with its own 10-minute timeout.
    - name: Benchmark single latency
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default

    - name: Benchmark online latency
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default

    - name: Benchmark offline throughput
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default

    - name: Benchmark offline throughput (Non-streaming, small batch size)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size

    - name: Benchmark online latency (EAGLE)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle

    - name: Benchmark online latency (LoRA)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency
        python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates
performance-test-1-gpu-part-2:
  # Single-GPU benchmarks, part 2: offline throughput variants (no radix
  # cache, Triton attention backend, FP8) and the VLM serving benchmarks.
  needs:
    - check-changes
    - call-gate
    - stage-a-test-1
  # Run when this stage is requested via target_stage, or when no stage is
  # requested, the run is scheduled or nothing upstream failed / was cancelled,
  # and main_package or sgl_kernel changed.
  if: |
    always() &&
    (
      (inputs.target_stage == 'performance-test-1-gpu-part-2') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: 1-gpu-runner
  env:
    RUNNER_LABELS: 1-gpu-runner
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Dispatched PR head SHA first, then the caller's ref, then the
        # triggering commit.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    # Only fetched when check-changes reports that sgl_kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    # Each benchmark below is its own step with its own 10-minute timeout.
    - name: Benchmark offline throughput (w/o RadixAttention)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache

    - name: Benchmark offline throughput (w/ Triton)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend

    - name: Benchmark offline throughput (w/ FP8)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8

    - name: Benchmark VLM offline throughput
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_offline_throughput

    - name: Benchmark VLM online latency
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency
performance-test-1-gpu-part-3:
  # Single-GPU benchmarks, part 3: score API and embeddings API
  # latency/throughput, each with a batch-size scaling variant.
  needs:
    - check-changes
    - call-gate
    - stage-a-test-1
  # Run when this stage is requested via target_stage, or when no stage is
  # requested, the run is scheduled or nothing upstream failed / was cancelled,
  # and main_package or sgl_kernel changed.
  if: |
    always() &&
    (
      (inputs.target_stage == 'performance-test-1-gpu-part-3') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: 1-gpu-runner
  env:
    RUNNER_LABELS: 1-gpu-runner
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Dispatched PR head SHA first, then the caller's ref, then the
        # triggering commit.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    # Only fetched when check-changes reports that sgl_kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    # Each benchmark below is its own step with its own 10-minute timeout.
    - name: Benchmark Scores online latency and throughput
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_latency_throughput

    - name: Benchmark Scores online latency and throughput (batch size scaling)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_batch_scaling

    - name: Benchmark Embeddings online latency and throughput
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_embeddings_api_latency_throughput

    - name: Benchmark Embeddings online latency and throughput (batch size scaling)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_embeddings_api_batch_scaling
performance-test-2-gpu:
  # Two-GPU benchmarks on the 2-gpu-runner: TP=2 one-batch latency and MoE
  # offline throughput, plus PP=2 decode and long-context prefill throughput.
  needs:
    - check-changes
    - call-gate
    - unit-test-backend-1-gpu
    - stage-b-test-4-gpu-b200
  # Run when this stage is requested via target_stage, or when no stage is
  # requested, the run is scheduled or nothing upstream failed / was cancelled,
  # and main_package or sgl_kernel changed.
  if: |
    always() &&
    (
      (inputs.target_stage == 'performance-test-2-gpu') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: 2-gpu-runner
  env:
    RUNNER_LABELS: 2-gpu-runner
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Dispatched PR head SHA first, then the caller's ref, then the
        # triggering commit.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    # Only fetched when check-changes reports that sgl_kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    # Each benchmark below is its own step with its own 10-minute timeout.
    - name: Benchmark single latency (TP=2)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1

    - name: Benchmark single latency + torch.compile (TP=2)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1

    - name: Benchmark offline throughput (TP=2)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default

    - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache

    - name: Benchmark offline PP decode throughput (PP=2)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode

    - name: Benchmark offline PP prefill throughput (PP=2)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_pp_long_context_prefill
accuracy-test-1-gpu:
  # Single-GPU accuracy check: runs test_eval_accuracy_large.py through the
  # sglang.test.ci.run_with_retry wrapper on the 1-gpu-runner.
  needs:
    - check-changes
    - call-gate
    - stage-a-test-1
  # Run when this stage is requested via target_stage, or when no stage is
  # requested, the run is scheduled or nothing upstream failed / was cancelled,
  # and main_package or sgl_kernel changed.
  if: |
    always() &&
    (
      (inputs.target_stage == 'accuracy-test-1-gpu') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: 1-gpu-runner
  env:
    RUNNER_LABELS: 1-gpu-runner
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Dispatched PR head SHA first, then the caller's ref, then the
        # triggering commit.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    # Only fetched when check-changes reports that sgl_kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    # Besides the regular CI dependencies, this clones the human-eval
    # repository and installs it in editable mode.
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
        git clone https://github.com/merrymercy/human-eval.git
        cd human-eval
        pip install -e .

    - name: Evaluate accuracy
      timeout-minutes: 25
      run: |
        cd test/srt
        python3 -m sglang.test.ci.run_with_retry test_eval_accuracy_large.py
accuracy-test-2-gpu:
  # Two-GPU accuracy check: runs test_moe_eval_accuracy_large.py through the
  # sglang.test.ci.run_with_retry wrapper. Ordered after accuracy-test-1-gpu.
  needs:
    - check-changes
    - call-gate
    - accuracy-test-1-gpu
  # Run when this stage is requested via target_stage, or when no stage is
  # requested, the run is scheduled or nothing upstream failed / was cancelled,
  # and main_package or sgl_kernel changed.
  if: |
    always() &&
    (
      (inputs.target_stage == 'accuracy-test-2-gpu') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: 2-gpu-runner
  env:
    RUNNER_LABELS: 2-gpu-runner
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Dispatched PR head SHA first, then the caller's ref, then the
        # triggering commit.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    # Only fetched when check-changes reports that sgl_kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    # Besides the regular CI dependencies, this clones the human-eval
    # repository and installs it in editable mode.
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
        git clone https://github.com/merrymercy/human-eval.git
        cd human-eval
        pip install -e .

    - name: Evaluate accuracy (TP=2)
      timeout-minutes: 25
      run: |
        cd test/srt
        python3 -m sglang.test.ci.run_with_retry test_moe_eval_accuracy_large.py
unit-test-deepep-4-gpu:
  # Runs the per-commit-4-gpu-deepep suite on the 4-gpu-h100 runner as a
  # single, unpartitioned job. Dependencies come from ci_install_deepep.sh.
  needs:
    - check-changes
    - call-gate
    - unit-test-backend-1-gpu
    - stage-b-test-4-gpu-b200
  # Run when this stage is requested via target_stage, or when no stage is
  # requested, the run is scheduled or nothing upstream failed / was cancelled,
  # and main_package or sgl_kernel changed.
  if: |
    always() &&
    (
      (inputs.target_stage == 'unit-test-deepep-4-gpu') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: 4-gpu-h100
  env:
    RUNNER_LABELS: 4-gpu-h100
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Dispatched PR head SHA first, then the caller's ref, then the
        # triggering commit.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    # Only fetched when check-changes reports that sgl_kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_deepep.sh

    - name: Run test
      timeout-minutes: 20
      run: |
        cd test/srt
        RETRY_FLAG=""
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
          RETRY_FLAG="--enable-retry"
        fi
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 run_suite.py \
          --suite per-commit-4-gpu-deepep \
          $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
unit-test-deepep-8-gpu:
  # Runs the per-commit-8-gpu-h200-deepep suite on the 8-gpu-h200 runner as a
  # single, unpartitioned job. Dependencies come from ci_install_deepep.sh.
  needs:
    - check-changes
    - call-gate
    - unit-test-backend-1-gpu
    - stage-b-test-4-gpu-b200
  # Run when this stage is requested via target_stage, or when no stage is
  # requested, the run is scheduled or nothing upstream failed / was cancelled,
  # and main_package or sgl_kernel changed.
  if: |
    always() &&
    (
      (inputs.target_stage == 'unit-test-deepep-8-gpu') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: 8-gpu-h200
  env:
    RUNNER_LABELS: 8-gpu-h200
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Dispatched PR head SHA first, then the caller's ref, then the
        # triggering commit.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    # Only fetched when check-changes reports that sgl_kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_deepep.sh

    - name: Run test
      timeout-minutes: 20
      run: |
        cd test/srt
        RETRY_FLAG=""
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
          RETRY_FLAG="--enable-retry"
        fi
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 run_suite.py \
          --suite per-commit-8-gpu-h200-deepep \
          $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
unit-test-backend-4-gpu-b200:
  # Runs the per-commit-4-gpu-b200 suite in three partitions. The runner label
  # is not hard-coded: it comes from the check-changes b200_runner output.
  needs:
    - check-changes
    - call-gate
    - unit-test-backend-1-gpu
    - stage-b-test-4-gpu-b200
  # Run when this stage is requested via target_stage, or when no stage is
  # requested, the run is scheduled or nothing upstream failed / was cancelled,
  # and main_package or sgl_kernel changed.
  if: |
    always() &&
    (
      (inputs.target_stage == 'unit-test-backend-4-gpu-b200') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: ${{ needs.check-changes.outputs.b200_runner }}
  env:
    RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }}
  strategy:
    # Let every partition finish even if one of them fails.
    fail-fast: false
    matrix:
      part: [0, 1, 2]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Dispatched PR head SHA first, then the caller's ref, then the
        # triggering commit.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    # Only fetched when check-changes reports that sgl_kernel changed.
    # NOTE(review): this job pins download-artifact@v6 while the neighbouring
    # jobs use @v4 - confirm the difference is intentional.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v6
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    # IS_BLACKWELL=1 is exported to both the install script and the test run.
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        cd test/srt
        RETRY_FLAG=""
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
          RETRY_FLAG="--enable-retry"
        fi
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        IS_BLACKWELL=1 python3 run_suite.py \
          --suite per-commit-4-gpu-b200 \
          --auto-partition-id ${{ matrix.part }} \
          --auto-partition-size 3 \
          --timeout-per-file 1800 \
          $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
unit-test-backend-4-gpu-gb200:
  # Runs the per-commit-4-gpu-gb200 suite on the 4-gpu-gb200 runner as a single
  # partition (id 0 of 1). Additionally waits for sgl-kernel-build-wheels-arm
  # and downloads the aarch64 wheel artifact pattern.
  needs:
    - check-changes
    - call-gate
    - unit-test-backend-1-gpu
    - stage-b-test-4-gpu-b200
    - sgl-kernel-build-wheels-arm
  # Run when this stage is requested via target_stage, or when no stage is
  # requested, the run is scheduled or nothing upstream failed / was cancelled,
  # and main_package or sgl_kernel changed.
  if: |
    always() &&
    (
      (inputs.target_stage == 'unit-test-backend-4-gpu-gb200') ||
      (
        !inputs.target_stage &&
        (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: 4-gpu-gb200
  env:
    RUNNER_LABELS: 4-gpu-gb200
  # NOTE(review): no matrix is declared for this job, so fail-fast presumably
  # has nothing to act on - confirm whether this block is still needed.
  strategy:
    fail-fast: false
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Dispatched PR head SHA first, then the caller's ref, then the
        # triggering commit.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    # Only fetched when check-changes reports that sgl_kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9-aarch64

    # IS_BLACKWELL=1 and GRACE_BLACKWELL=1 are passed to the install script only.
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 GRACE_BLACKWELL=1 bash scripts/ci/ci_install_deepep.sh

    - name: Run test
      timeout-minutes: 45
      run: |
        cd test/srt
        RETRY_FLAG=""
        CONTINUE_ON_ERROR_FLAG=""
        if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
          RETRY_FLAG="--enable-retry"
        fi
        if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
          CONTINUE_ON_ERROR_FLAG="--continue-on-error"
        fi
        python3 run_suite.py \
          --suite per-commit-4-gpu-gb200 \
          --auto-partition-id 0 \
          --auto-partition-size 1 \
          --timeout-per-file 3600 \
          $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
pr-test-finish:
  # Final aggregation job. It always runs, inspects the result of every job
  # listed in `needs`, and fails if any of them ended as "failure" or
  # "cancelled". Any other result (e.g. "success" or "skipped") passes.
  needs:
    - call-gate
    - check-changes
    - sgl-kernel-build-wheels
    - sgl-kernel-unit-test
    - sgl-kernel-mla-test
    - sgl-kernel-benchmark-test
    - sgl-kernel-b200-test
    - jit-kernel-unit-test
    - multimodal-gen-test-1-gpu
    - multimodal-gen-test-2-gpu
    - stage-a-test-1
    - stage-a-cpu-only
    - stage-b-test-small-1-gpu
    - stage-b-test-large-1-gpu
    - stage-b-test-large-2-gpu
    - stage-c-test-large-4-gpu
    - quantization-test
    - unit-test-backend-1-gpu
    - unit-test-backend-2-gpu
    - stage-b-test-4-gpu-b200
    - unit-test-backend-4-gpu
    - unit-test-backend-8-gpu-h20
    - unit-test-backend-8-gpu-h200
    - performance-test-1-gpu-part-1
    - performance-test-1-gpu-part-2
    - performance-test-1-gpu-part-3
    - performance-test-2-gpu
    - accuracy-test-1-gpu
    - accuracy-test-2-gpu
    - unit-test-deepep-4-gpu
    - unit-test-deepep-8-gpu
    - unit-test-backend-4-gpu-b200
    - unit-test-backend-4-gpu-gb200
  if: always()
  runs-on: ubuntu-latest
  steps:
    - name: Check all dependent job statuses
      env:
        # Hand the `needs` context to the script through the environment
        # instead of interpolating it into the shell source: the JSON also
        # carries job outputs, and a single quote inside any of them would
        # terminate a '...'-quoted literal and break (or inject into) the
        # script.
        NEEDS_JSON: ${{ toJson(needs) }}
      run: |
        # Get a list of all job names from the JSON keys
        job_names=$(jq -r 'keys_unsorted[]' <<< "$NEEDS_JSON")
        for job in $job_names; do
          # For each job, extract its result
          result=$(jq -r --arg j "$job" '.[$j].result' <<< "$NEEDS_JSON")
          # Print the job name and its result
          echo "$job: $result"
          # Check for failure or cancellation and exit if found
          if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
            echo "The above jobs failed."
            exit 1
          fi
        done
        # If the loop completes, no job failed or was cancelled
        # (skipped jobs are treated as passing)
        echo "All jobs completed successfully"
        exit 0