# GitHub Actions workflow: Nightly Test (Nvidia)
# (page-chrome text from a web scrape removed; YAML definition follows)
name: Nightly Test (Nvidia)

on:
  # Daily run at midnight UTC.
  schedule:
    - cron: '0 0 * * *'
  # Manual trigger with an optional single-job filter.
  workflow_dispatch:
    inputs:
      job_filter:
        description: 'Select which job to run (leave empty or "all" to run all jobs)'
        required: false
        type: choice
        default: 'all'
        options:
          - 'all'
          - 'nightly-test-general-1-gpu-h100'
          - 'nightly-test-general-4-gpu-h100'
          - 'nightly-test-general-8-gpu-h200'
          - 'nightly-test-general-8-gpu-h20'
          - 'nightly-test-general-8-gpu-b200'
          - 'nightly-test-text-accuracy-2-gpu-h100'
          - 'nightly-test-text-perf-2-gpu-h100'
          - 'nightly-test-vlm-accuracy-2-gpu-h100'
          - 'nightly-test-vlm-perf-2-gpu-h100'
          - 'nightly-test-multimodal-server-1-gpu'
          - 'nightly-test-multimodal-server-2-gpu'
          - 'nightly-test-perf-4-gpu-b200'
          - 'nightly-test-perf-8-gpu-b200'
          - 'nightly-test-specialized-8-gpu-b200'
          - 'nightly-test-kernel-1-gpu-h100'
          - 'nightly-test-diffusion-comparison'
          - 'nightly-test-kernel-8-gpu-h200'
  # Reusable-workflow entry point; `type: choice` is not allowed here, so
  # job_filter is a plain string with the same semantics as above.
  workflow_call:
    inputs:
      ref:
        description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.'
        required: false
        type: string
        default: ''
      job_filter:
        description: 'Select which job to run (leave empty or "all" to run all jobs)'
        required: false
        type: string
        default: 'all'

concurrency:
  group: nightly-test-nvidia-${{ inputs.ref || github.ref }}
  # NOTE(review): for workflow_call runs, github.event_name reflects the
  # CALLER's trigger (e.g. "schedule"), not "workflow_call", so this may
  # still evaluate to true for called runs — confirm intended behavior.
  cancel-in-progress: ${{ github.event_name != 'workflow_call' }}

env:
  # Quote values so consumers always see strings, not YAML booleans/ints.
  SGLANG_IS_IN_CI: "true"
  SGLANG_CUDA_COREDUMP: "1"
  HF_HUB_DOWNLOAD_TIMEOUT: "300"
  HF_HUB_ETAG_TIMEOUT: "300"
jobs:
  # General tests - 1 GPU
  nightly-test-general-1-gpu-h100:
    if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-general-1-gpu-h100')
    runs-on: 1-gpu-h100
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Test the caller-provided ref when invoked via workflow_call.
          ref: ${{ inputs.ref || github.ref }}
      - uses: ./.github/actions/check-maintenance
      - name: Install dependencies
        run: |
          bash scripts/ci/cuda/ci_install_dependency.sh
      - name: Run test
        timeout-minutes: 60
        run: |
          cd test
          python3 run_suite.py --hw cuda --suite nightly-1-gpu --nightly --continue-on-error
      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()
# JIT kernel full unit tests (expanded parameter ranges via SGLANG_JIT_KERNEL_RUN_FULL_TESTS)
nightly-test-kernel-1-gpu-h100:
if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-kernel-1-gpu-h100')
runs-on: 1-gpu-h100
timeout-minutes: 240
env:
# Full jit_kernel test grids (see sglang.jit_kernel.utils.should_run_full_tests)
SGLANG_JIT_KERNEL_RUN_FULL_TESTS: "1"
# Match pr-test-jit-kernel workflow for consistent JIT warmup behavior
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: true
# Allow maintenance bypass on default branch (same semantics as PR JIT workflow)
SGLANG_PR_TEST_BYPASS_MAINTENANCE_ON_MAIN: ${{ github.ref == 'refs/heads/main' && 'true' || 'false' }}
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- uses: ./.github/actions/check-maintenance
- name: Install dependencies
timeout-minutes: 20
run: |
bash scripts/ci/cuda/ci_install_dependency.sh
- name: Run jit kernel nightly suite
timeout-minutes: 60
run: |
cd test
python3 run_suite.py --hw cuda --suite nightly-kernel-1-gpu --nightly --continue-on-error
- uses: ./.github/actions/upload-cuda-coredumps
if: always()
nightly-test-kernel-8-gpu-h200:
if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-kernel-8-gpu-h200')
runs-on: 8-gpu-h200
timeout-minutes: 240
env:
SGLANG_JIT_KERNEL_RUN_FULL_TESTS: "1"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: true
SGLANG_PR_TEST_BYPASS_MAINTENANCE_ON_MAIN: ${{ github.ref == 'refs/heads/main' && 'true' || 'false' }}
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- uses: ./.github/actions/check-maintenance
- name: Install dependencies
timeout-minutes: 20
run: |
bash scripts/ci/cuda/ci_install_dependency.sh
- name: Run multi-GPU jit kernel nightly suite
timeout-minutes: 90
run: |
cd test
python3 run_suite.py --hw cuda --suite nightly-kernel-8-gpu-h200 --nightly --continue-on-error
- uses: ./.github/actions/upload-cuda-coredumps
if: always()
# General tests - 4 GPU H100
nightly-test-general-4-gpu-h100:
if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-general-4-gpu-h100')
runs-on: 4-gpu-h100
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- uses: ./.github/actions/check-maintenance
- name: Install dependencies
run: |
bash scripts/ci/cuda/ci_install_dependency.sh
- name: Run test
timeout-minutes: 30
run: |
cd test
python3 run_suite.py --hw cuda --suite nightly-4-gpu --nightly --continue-on-error
- uses: ./.github/actions/upload-cuda-coredumps
if: always()
# General tests - 8 GPU H200
nightly-test-general-8-gpu-h200:
if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-general-8-gpu-h200')
runs-on: 8-gpu-h200
strategy:
fail-fast: false
matrix:
partition: [0, 1, 2, 3]
env:
RUNNER_LABELS: 8-gpu-h200
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- uses: ./.github/actions/check-maintenance
- name: Install dependencies
run: |
bash scripts/ci/cuda/ci_install_dependency.sh
- name: Run common 8-GPU model tests
if: always()
timeout-minutes: 300
env:
TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
GPU_CONFIG: "8-gpu-h200"
IS_H200: "1"
run: |
cd test
python3 run_suite.py --hw cuda --suite nightly-8-gpu-common --nightly --timeout-per-file=18000 --continue-on-error --auto-partition-id=${{ matrix.partition }} --auto-partition-size=4
- name: Publish traces to storage repo
if: always()
continue-on-error: true
env:
GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
run: |
TRACE_ARGS=""
for dir in test/performance_profiles_*/; do
[ -d "$dir" ] && TRACE_ARGS="$TRACE_ARGS --traces-dir $dir"
done
if [ -n "$TRACE_ARGS" ]; then
python3 scripts/ci/utils/publish_traces.py $TRACE_ARGS
find test/performance_profiles_*/ -name '*.json.gz' -delete
else
echo "No trace directories found, skipping publish"
fi
- name: Run test
timeout-minutes: 30
env:
GPU_CONFIG: "8-gpu-h200"
run: |
cd test
python3 run_suite.py --hw cuda --suite nightly-8-gpu-h200 --nightly --continue-on-error
- name: Collect performance metrics
if: always()
run: |
python3 scripts/ci/utils/save_metrics.py \
--gpu-config 8-gpu-h200 \
--partition ${{ matrix.partition }} \
--run-id ${{ github.run_id }} \
--output test/metrics-8gpu-h200-partition-${{ matrix.partition }}.json \
--search-dir test/performance_profiles_8_gpu \
--search-dir test
- name: Upload partition metrics
if: always()
uses: actions/upload-artifact@v4
with:
name: metrics-8gpu-h200-partition-${{ matrix.partition }}
path: test/metrics-8gpu-h200-partition-${{ matrix.partition }}.json
retention-days: 5
if-no-files-found: ignore
- uses: ./.github/actions/upload-cuda-coredumps
if: always()
with:
artifact-suffix: ${{ matrix.partition }}
# General tests - 8 GPU H20
nightly-test-general-8-gpu-h20:
if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-general-8-gpu-h20')
runs-on: 8-gpu-h20
env:
SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4"
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- uses: ./.github/actions/check-maintenance
- name: Install dependencies
run: |
bash scripts/ci/cuda/ci_install_dependency.sh
- name: Run test
timeout-minutes: 30
env:
GPU_CONFIG: "8-gpu-h20"
run: |
cd test
python3 run_suite.py --hw cuda --suite nightly-8-gpu-h20 --nightly --continue-on-error
- uses: ./.github/actions/upload-cuda-coredumps
if: always()
# General tests - 8 GPU B200
nightly-test-general-8-gpu-b200:
if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-general-8-gpu-b200')
runs-on: 8-gpu-b200
strategy:
fail-fast: false
matrix:
partition: [0, 1, 2, 3]
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- uses: ./.github/actions/check-maintenance
- name: Install dependencies
run: |
bash scripts/ci/cuda/ci_install_dependency.sh
- name: Run common 8-GPU model tests
if: always()
timeout-minutes: 300
env:
TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
GPU_CONFIG: "8-gpu-b200"
run: |
cd test
python3 run_suite.py --hw cuda --suite nightly-8-gpu-common --nightly --timeout-per-file=12000 --continue-on-error --auto-partition-id=${{ matrix.partition }} --auto-partition-size=4
- name: Publish traces to storage repo
if: always()
continue-on-error: true
env:
GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
run: |
TRACE_ARGS=""
for dir in test/performance_profiles_*/; do
[ -d "$dir" ] && TRACE_ARGS="$TRACE_ARGS --traces-dir $dir"
done
if [ -n "$TRACE_ARGS" ]; then
python3 scripts/ci/utils/publish_traces.py $TRACE_ARGS
find test/performance_profiles_*/ -name '*.json.gz' -delete
else
echo "No trace directories found, skipping publish"
fi
- name: Collect performance metrics
if: always()
run: |
python3 scripts/ci/utils/save_metrics.py \
--gpu-config 8-gpu-b200 \
--partition ${{ matrix.partition }} \
--run-id ${{ github.run_id }} \
--output test/metrics-8gpu-b200-partition-${{ matrix.partition }}.json \
--search-dir test/performance_profiles_8_gpu \
--search-dir test
- name: Upload partition metrics
if: always()
uses: actions/upload-artifact@v4
with:
name: metrics-8gpu-b200-partition-${{ matrix.partition }}
path: test/metrics-8gpu-b200-partition-${{ matrix.partition }}.json
retention-days: 5
if-no-files-found: ignore
- uses: ./.github/actions/upload-cuda-coredumps
if: always()
with:
artifact-suffix: ${{ matrix.partition }}
# Text model accuracy tests
nightly-test-text-accuracy-2-gpu-h100:
if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-text-accuracy-2-gpu-h100')
runs-on: 2-gpu-h100
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- uses: ./.github/actions/check-maintenance
- name: Install dependencies
run: |
bash scripts/ci/cuda/ci_install_dependency.sh
- name: Run eval test for text models
timeout-minutes: 120
run: |
cd test
python3 run_suite.py --hw cuda --suite nightly-eval-text-2-gpu --nightly --continue-on-error --timeout-per-file 4500
- uses: ./.github/actions/upload-cuda-coredumps
if: always()
# Text model performance tests
nightly-test-text-perf-2-gpu-h100:
if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-text-perf-2-gpu-h100')
runs-on: 2-gpu-h100
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- uses: ./.github/actions/check-maintenance
- name: Install dependencies
run: |
bash scripts/ci/cuda/ci_install_dependency.sh
- name: Run performance test for text models
timeout-minutes: 180
env:
TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
GPU_CONFIG: "2-gpu-h100"
run: |
cd test
rm -rf performance_profiles_text_models/
python3 run_suite.py --hw cuda --suite nightly-perf-text-2-gpu --nightly --continue-on-error --timeout-per-file 3600
- name: Publish traces to storage repo
env:
GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
run: |
python3 scripts/ci/utils/publish_traces.py --traces-dir test/performance_profiles_text_models
- uses: ./.github/actions/upload-cuda-coredumps
if: always()
# VLM accuracy tests
nightly-test-vlm-accuracy-2-gpu-h100:
if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-vlm-accuracy-2-gpu-h100')
runs-on: 2-gpu-h100
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- uses: ./.github/actions/check-maintenance
- name: Install dependencies
run: |
bash scripts/ci/cuda/ci_install_dependency.sh
- name: Run eval test for VLM models (fixed MMMU-100)
timeout-minutes: 240
run: |
cd test
python3 run_suite.py --hw cuda --suite nightly-eval-vlm-2-gpu --nightly --continue-on-error --timeout-per-file 9000
- uses: ./.github/actions/upload-cuda-coredumps
if: always()
# VLM performance tests
nightly-test-vlm-perf-2-gpu-h100:
if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-vlm-perf-2-gpu-h100')
runs-on: 2-gpu-h100
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- uses: ./.github/actions/check-maintenance
- name: Install dependencies
run: |
bash scripts/ci/cuda/ci_install_dependency.sh
- name: Run perf test for VLM models (MMMU)
timeout-minutes: 240
env:
TRACE_BASE_URL: https://raw.githubusercontent.com/sglang-bot/sglang-ci-data/main/traces/${{ github.run_id }}
PERFETTO_RELAY_URL: ${{ vars.PERFETTO_RELAY_URL }}
GPU_CONFIG: "2-gpu-h100"
run: |
cd test
rm -rf performance_profiles_vlms/
python3 run_suite.py --hw cuda --suite nightly-perf-vlm-2-gpu --nightly --continue-on-error --timeout-per-file 3600
- name: Publish traces to storage repo
env:
GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
run: |
python3 scripts/ci/utils/publish_traces.py --traces-dir test/performance_profiles_vlms
- uses: ./.github/actions/upload-cuda-coredumps
if: always()
# diffusion performance tests
nightly-test-multimodal-server-1-gpu:
if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-multimodal-server-1-gpu')
runs-on: 1-gpu-h100
strategy:
fail-fast: false
max-parallel: 5
matrix:
part: [0, 1]
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- uses: ./.github/actions/check-maintenance
- name: Install dependencies
run: |
bash scripts/ci/cuda/ci_install_dependency.sh diffusion
pip install slack_sdk
- name: Run diffusion server tests
env:
SGLANG_DIFFUSION_SLACK_TOKEN: ${{ secrets.SGLANG_DIFFUSION_SLACK_TOKEN }}
GITHUB_RUN_ID: ${{ github.run_id }}
GPU_CONFIG: "1-gpu-h100"
timeout-minutes: 90
run: |
cd python
python3 sglang/multimodal_gen/test/run_suite.py \
--suite 1-gpu \
--partition-id ${{ matrix.part }} \
--total-partitions 2
- name: Collect diffusion performance metrics
if: always()
run: |
python3 scripts/ci/utils/diffusion/save_diffusion_metrics.py \
--gpu-config 1-gpu-h100 \
--run-id ${{ github.run_id }} \
--output python/diffusion-metrics-1gpu-partition-${{ matrix.part }}.json \
--results-json python/diffusion-results.json
- name: Upload diffusion metrics
if: always()
uses: actions/upload-artifact@v4
with:
name: diffusion-metrics-1gpu-partition-${{ matrix.part }}
path: python/diffusion-metrics-1gpu-partition-${{ matrix.part }}.json
retention-days: 90
if-no-files-found: ignore
- uses: ./.github/actions/upload-cuda-coredumps
if: always()
with:
artifact-suffix: ${{ matrix.part }}
nightly-test-multimodal-server-2-gpu:
if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-multimodal-server-2-gpu')
runs-on: 2-gpu-h100
strategy:
fail-fast: false
max-parallel: 5
matrix:
part: [0, 1]
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- uses: ./.github/actions/check-maintenance
- name: Install dependencies
run: |
bash scripts/ci/cuda/ci_install_dependency.sh diffusion
pip install slack_sdk
- name: Run diffusion server tests
env:
SGLANG_DIFFUSION_SLACK_TOKEN: ${{ secrets.SGLANG_DIFFUSION_SLACK_TOKEN }}
GITHUB_RUN_ID: ${{ github.run_id }}
GPU_CONFIG: "2-gpu-h100"
timeout-minutes: 90
run: |
cd python
python3 sglang/multimodal_gen/test/run_suite.py \
--suite 2-gpu \
--partition-id ${{ matrix.part }} \
--total-partitions 2
- name: Collect diffusion performance metrics
if: always()
run: |
python3 scripts/ci/utils/diffusion/save_diffusion_metrics.py \
--gpu-config 2-gpu-h100 \
--run-id ${{ github.run_id }} \
--output python/diffusion-metrics-2gpu-partition-${{ matrix.part }}.json \
--results-json python/diffusion-results.json
- name: Upload diffusion metrics
if: always()
uses: actions/upload-artifact@v4
with:
name: diffusion-metrics-2gpu-partition-${{ matrix.part }}
path: python/diffusion-metrics-2gpu-partition-${{ matrix.part }}.json
retention-days: 90
if-no-files-found: ignore
- uses: ./.github/actions/upload-cuda-coredumps
if: always()
with:
artifact-suffix: ${{ matrix.part }}
# B200 Performance tests - 4 GPU
nightly-test-perf-4-gpu-b200:
if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-perf-4-gpu-b200')
runs-on: 4-gpu-b200
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- uses: ./.github/actions/check-maintenance
- name: Install dependencies
run: |
bash scripts/ci/cuda/ci_install_dependency.sh
- name: Run test
timeout-minutes: 300
run: |
cd test
python3 run_suite.py --hw cuda --suite nightly-4-gpu-b200 --nightly --continue-on-error --timeout-per-file 12000
- uses: ./.github/actions/upload-cuda-coredumps
if: always()
# Specialized B200 tests - 8 GPU, for specific backends and configs
nightly-test-specialized-8-gpu-b200:
if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-perf-8-gpu-b200' || inputs.job_filter == 'nightly-test-specialized-8-gpu-b200')
runs-on: 8-gpu-b200
env:
RUNNER_LABELS: 8-gpu-b200
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- uses: ./.github/actions/check-maintenance
- name: Install dependencies
run: |
bash scripts/ci/cuda/ci_install_dependency.sh
- name: Run test
timeout-minutes: 120
env:
GPU_CONFIG: "8-gpu-b200"
run: |
cd test
python3 run_suite.py --hw cuda --suite nightly-8-gpu-b200 --nightly --continue-on-error --timeout-per-file 2400
- uses: ./.github/actions/upload-cuda-coredumps
if: always()
# Diffusion cross-framework comparison
nightly-test-diffusion-comparison:
if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-diffusion-comparison')
runs-on: 4-gpu-h100
timeout-minutes: 240
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- name: Install dependencies
run: |
bash scripts/ci/cuda/ci_install_dependency.sh diffusion
- name: Run cross-framework comparison
env:
GITHUB_SHA: ${{ github.sha }}
GITHUB_RUN_ID: ${{ github.run_id }}
PYTHONUNBUFFERED: "1"
timeout-minutes: 210
run: |
python3 -u scripts/ci/utils/diffusion/run_comparison.py \
--output comparison-results.json
- name: Generate dashboard
if: always()
env:
GH_PAT_FOR_NIGHTLY_CI_DATA: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
GH_TOKEN: ${{ github.token }}
run: |
python3 scripts/ci/utils/diffusion/generate_diffusion_dashboard.py \
--results comparison-results.json \
--output dashboard.md \
--charts-dir comparison-charts \
--fetch-history \
--step-summary
- name: Publish to sglang-ci-data
if: always()
env:
GH_PAT_FOR_NIGHTLY_CI_DATA: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
run: |
python3 scripts/ci/utils/diffusion/publish_comparison_results.py \
--results comparison-results.json \
--dashboard dashboard.md \
--charts-dir comparison-charts
- name: Upload comparison artifacts
if: always()
uses: actions/upload-artifact@v4
with:
name: diffusion-comparison-${{ github.run_id }}
path: |
comparison-results.json
dashboard.md
comparison-charts/
comparison-logs/
retention-days: 90
if-no-files-found: ignore
- uses: ./.github/actions/upload-cuda-coredumps
if: always()
# Consolidate performance metrics from all jobs
consolidate-metrics:
if: github.repository == 'sgl-project/sglang' && always()
needs:
- nightly-test-general-8-gpu-h200
- nightly-test-general-8-gpu-b200
- nightly-test-multimodal-server-1-gpu
- nightly-test-multimodal-server-2-gpu
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- name: Download all partition metrics
uses: actions/download-artifact@v4
with:
pattern: "*metrics-*"
path: metrics/
merge-multiple: true
- name: List downloaded metrics
run: |
echo "Downloaded metrics files:"
find metrics/ -name "*.json" -type f 2>/dev/null || echo "No metrics files found"
- name: Merge metrics
run: |
python3 scripts/ci/utils/merge_metrics.py \
--input-dir metrics/ \
--output consolidated-metrics-${{ github.run_id }}.json \
--run-id ${{ github.run_id }} \
--commit-sha ${{ github.sha }} \
--branch ${{ github.ref_name }}
- name: Upload consolidated metrics
uses: actions/upload-artifact@v4
with:
name: consolidated-metrics-${{ github.run_id }}
path: consolidated-metrics-${{ github.run_id }}.json
retention-days: 90
if-no-files-found: warn
# Final check job
check-all-jobs:
if: github.repository == 'sgl-project/sglang' && always()
needs:
- nightly-test-general-1-gpu-h100
- nightly-test-general-4-gpu-h100
- nightly-test-general-8-gpu-h200
- nightly-test-general-8-gpu-h20
- nightly-test-general-8-gpu-b200
- nightly-test-text-accuracy-2-gpu-h100
- nightly-test-text-perf-2-gpu-h100
- nightly-test-vlm-accuracy-2-gpu-h100
- nightly-test-vlm-perf-2-gpu-h100
- nightly-test-multimodal-server-1-gpu
- nightly-test-multimodal-server-2-gpu
- nightly-test-perf-4-gpu-b200
- nightly-test-specialized-8-gpu-b200
- nightly-test-diffusion-comparison
- consolidate-metrics
runs-on: ubuntu-latest
steps:
- name: Check if any job failed
run: |
if [[ "${{ contains(needs.*.result, 'failure') }}" == "true" ]]; then
echo "One or more nightly test jobs failed"
exit 1
fi
if [[ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]]; then
echo "One or more nightly test jobs were cancelled"
exit 1
fi
echo "All nightly test jobs passed"