Skip to content

Nightly CI Pipeline #62

Nightly CI Pipeline

Nightly CI Pipeline #62

Workflow file for this run

# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Workflow identity, triggers, default token permissions and shared environment.
name: Nightly CI Pipeline
on:
# Scheduled trigger. GitHub evaluates cron expressions in UTC, so the local
# wall-clock time shifts by one hour when daylight saving is in effect.
schedule:
- cron: '0 8 * * *' # Every day at 08:00 UTC (12:00 AM PST, 1:00 AM PDT)
workflow_dispatch: # Allow manual triggering for testing
# Inputs exist only on manual runs; on schedule the compute-release-mode job
# decides the effective values.
inputs:
release:
description: 'Stage NGC images, Artifactory wheels, and trigger the GitLab release pipeline. Schedule always sets release=true.'
required: false
type: boolean
default: false
run_tests:
description: 'Run vllm/sglang/trtllm tests + dynamo-pipeline checks. Schedule always runs them.'
required: false
type: boolean
default: true
skip_gitlab_pipeline:
description: 'Skip the GitLab release automation pipeline trigger. Emergency use only.'
required: false
type: boolean
default: false
# Workflow-level token permissions; each job below restates its own.
permissions:
contents: read
env:
# Builder name derived from the run id and the run attempt, so a re-run
# attempt evaluates to a different name than the first attempt.
BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}
jobs:
# ============================================================================
# PRE-WARM K8S BUILDER
# ============================================================================
create-fresh-builder:
  # Bootstraps the BuildKit builder and publishes its name as a job output so
  # the build jobs and the cleanup job can all reference the same builder.
  name: Create fresh K8s builder
  runs-on: prod-default-small-v2
  permissions:
    contents: read
  outputs:
    builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
    - name: Export builder name
      id: export-builder-name
      run: |
        # BUILDER_NAME is defined in the workflow-level env block, so it is
        # already present in the step environment; read it from there instead
        # of templating it into the script text. The output file path is
        # quoted so the redirect is safe for any runner temp path.
        echo "builder_name=${BUILDER_NAME}" >> "$GITHUB_OUTPUT"
    - name: Create and bootstrap fresh K8s builder
      uses: ./.github/actions/bootstrap-buildkit
      with:
        builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
        buildkit_worker_addresses: ''
        suppress_fallback_warning: 'true'
# ============================================================================
# RESOLVE SOURCE SHA
# ============================================================================
# Single SHA every downstream job (shared-build-image, dynamo-pipeline,
# shared-test, release.yml) builds, tags, and releases against. On schedule
# and workflow_dispatch from main, this is just github.sha.
resolve-source-sha:
  # Publishes the commit SHA of the triggering event as the `source_sha`
  # output, giving downstream jobs one place to read it from.
  name: Resolve source SHA
  runs-on: prod-default-v2
  outputs:
    source_sha: ${{ steps.resolve.outputs.source_sha }}
  permissions:
    contents: read
  steps:
    - id: resolve
      shell: bash
      run: |
        set -euo pipefail
        resolved_sha="${GITHUB_SHA}"
        printf 'Using caller SHA: %s\n' "${resolved_sha}"
        printf 'source_sha=%s\n' "${resolved_sha}" >> "$GITHUB_OUTPUT"
# ============================================================================
# COMPUTE NIGHTLY DEV VERSION
# ============================================================================
# Emits a PEP 440 dev suffix (e.g. .dev20260423) forwarded to every
# pipeline below. At the leaf, the suffix is stamped into pyproject / Cargo
# versions on the runner before docker build, so wheels produced by the
# wheel_builder stage carry the dev version.
compute-dev-version:
  # Emits `dev_suffix`, a ".dev<YYYYMMDD>" string built from the current UTC
  # date.
  name: Compute dev version suffix
  runs-on: prod-default-v2
  permissions:
    contents: read
  outputs:
    dev_suffix: ${{ steps.compute.outputs.dev_suffix }}
  steps:
    - id: compute
      shell: bash
      run: |
        # Strict mode, matching the other inline scripts in this workflow.
        set -euo pipefail
        # -u pins the date to UTC so the suffix does not depend on the
        # runner's time zone.
        DATE=$(date -u +%Y%m%d)
        echo "dev_suffix=.dev${DATE}" >> "$GITHUB_OUTPUT"
# ============================================================================
# COMPUTE RELEASE MODE
# ============================================================================
# schedule → always release. workflow_dispatch → honor the `release` input.
# Output is consumed by the `release` job below to gate the workflow_call
# into release.yml.
compute-release-mode:
  # Decides, per trigger type, whether this run publishes (`release`) and
  # whether the test jobs run (`run_tests`). Both outputs are the strings
  # "true" or "false".
  name: Compute release mode
  runs-on: prod-default-v2
  permissions:
    contents: read
  outputs:
    release: ${{ steps.compute.outputs.release }}
    run_tests: ${{ steps.compute.outputs.run_tests }}
  steps:
    - id: compute
      shell: bash
      env:
        # Empty on non-dispatch events, where `inputs` carries no values.
        DISPATCH_RELEASE: ${{ inputs.release }}
        DISPATCH_RUN_TESTS: ${{ inputs.run_tests }}
      run: |
        set -euo pipefail
        case "${GITHUB_EVENT_NAME}" in
          schedule)
            # cron must keep tests on so a failing nightly blocks the release.
            release=true
            run_tests=true
            ;;
          workflow_dispatch)
            # Publishing fails closed: only an explicit "true" releases, so an
            # empty or unexpected value can never trigger a publish.
            if [ "${DISPATCH_RELEASE:-}" = "true" ]; then
              release=true
            else
              release=false
            fi
            # Testing fails open: only an explicit "false" skips the tests.
            if [ "${DISPATCH_RUN_TESTS:-}" = "false" ]; then
              run_tests=false
            else
              run_tests=true
            fi
            ;;
          *)
            # Any other trigger: test, never release.
            release=false
            run_tests=true
            ;;
        esac
        echo "release=${release}" >> "$GITHUB_OUTPUT"
        echo "run_tests=${run_tests}" >> "$GITHUB_OUTPUT"
# Seeds the nightly Slack thread; exposes ts so downstream jobs (and the
# GitLab pipeline) can reply. continue-on-error so a Slack outage doesn't
# block the nightly — downstream jobs skip on empty outputs.
notify-slack-start:
  # Posts the "nightly started" message and exposes the message timestamp as
  # `thread_ts` for later replies.
  name: Notify Slack — nightly started
  runs-on: prod-default-v2
  needs:
    - compute-release-mode
    - resolve-source-sha
    - compute-dev-version
  if: ${{ needs.compute-release-mode.outputs.release == 'true' }}
  permissions:
    contents: read
  outputs:
    thread_ts: ${{ steps.post.outputs.thread_ts }}
  steps:
    - name: Post start message
      id: post
      # Best effort: if posting fails the job still succeeds and thread_ts
      # stays empty.
      continue-on-error: true
      shell: bash
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_RELEASE_BOT_TOKEN }}
        SLACK_CHANNEL_ID: ${{ secrets.SLACK_RELEASE_CHANNEL_ID }}
        SOURCE_SHA: ${{ needs.resolve-source-sha.outputs.source_sha }}
        DEV_SUFFIX: ${{ needs.compute-dev-version.outputs.dev_suffix }}
        RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
      run: |
        set -euo pipefail
        today=$(date -u +%Y-%m-%d)
        short_sha="${SOURCE_SHA:0:7}"
        # Build the request body with jq so every value is JSON-escaped.
        payload=$(jq -n \
          --arg channel "${SLACK_CHANNEL_ID}" \
          --arg date "${today}" \
          --arg sha "${short_sha}" \
          --arg run_url "${RUN_URL}" \
          --arg dev "${DEV_SUFFIX}" \
          '{
            channel: $channel,
            text: ("Dynamo Nightly build started — " + $date),
            blocks: [
              { type: "header",
                text: { type: "plain_text", text: (":crescent_moon: Dynamo Nightly build started — " + $date) } },
              { type: "section",
                fields: [
                  { type: "mrkdwn", text: ("*Commit:*\n`" + $sha + "`") },
                  { type: "mrkdwn", text: ("*Dev version suffix:*\n`" + $dev + "`") }
                ] },
              { type: "context",
                elements: [ { type: "mrkdwn", text: ("<" + $run_url + "|GitHub Actions run>") } ] }
            ]
          }')
        response=$(curl -fsSL -X POST https://slack.com/api/chat.postMessage \
          -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \
          -H "Content-type: application/json; charset=utf-8" \
          --data "${payload}")
        ok=$(jq -r '.ok' <<<"${response}")
        if [ "${ok}" != "true" ]; then
          # The response can also hold channel/ts/warning fields; log .error only.
          slack_error=$(jq -r '.error // "unknown"' <<<"${response}")
          echo "::error::Slack chat.postMessage failed: ${slack_error}"
          exit 1
        fi
        message_ts=$(jq -r '.ts' <<<"${response}")
        # The ts is a per-message runtime identifier, not a secret, and it is
        # the only value other jobs need. The channel ID is read by downstream
        # Slack steps straight from secrets.SLACK_RELEASE_CHANNEL_ID, which
        # keeps it auto-masked. Do NOT ::add-mask:: the ts: masking it before
        # writing GITHUB_OUTPUT made the value arrive empty across the
        # workflow_call boundary, silently skipping downstream reply steps.
        echo "thread_ts=${message_ts}" >> "${GITHUB_OUTPUT}"
        echo "Posted nightly start message"
# ============================================================================
# MANUAL RELEASE APPROVAL GATE
# ============================================================================
# Manual workflow_dispatch that would publish (release=true) must be approved
# by a reviewer of the `manual-release-approval` environment before the
# `release` job runs release.yml. Scheduled runs skip this job; the `release`
# job below accepts `skipped` as success.
manual-release-approval:
  # Gate for manually dispatched release runs. The job is bound to the
  # `manual-release-approval` environment; any required-reviewer rule is
  # configured on that environment, outside this file.
  name: Manual release approval gate
  needs: [compute-release-mode]
  if: ${{ github.event_name == 'workflow_dispatch' && needs.compute-release-mode.outputs.release == 'true' }}
  runs-on: prod-default-small-v2
  environment: manual-release-approval
  permissions:
    contents: read
  steps:
    - name: Record approval
      env:
        # github.actor is the user who triggered the run, which is not
        # necessarily the reviewer who approved the environment, so the log
        # line names it as the trigger rather than the approver. It is passed
        # through env instead of being templated into the script.
        TRIGGERED_BY: ${{ github.actor }}
      run: |
        echo "Manual nightly release approved (run triggered by ${TRIGGERED_BY})"
        echo "Run: ${GITHUB_RUN_ID} attempt ${GITHUB_RUN_ATTEMPT}"
# ============================================================================
# BUILD JOBS
# ============================================================================
vllm-build:
  # The display name is shared with the other vllm jobs so the UI groups them.
  name: vllm-runtime
  uses: ./.github/workflows/shared-build-image.yml
  needs:
    - create-fresh-builder
    - resolve-source-sha
  secrets: inherit
  with:
    # Source and builder, both taken from upstream job outputs.
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
    # Image selection.
    framework: vllm
    target: runtime
    cuda_version: '["13.0"]'
    platform: 'linux/amd64,linux/arm64'
    image_tag_suffix: '-nightly'
    build_timeout_minutes: 120
sglang-build:
  # The display name is shared with the other sglang jobs so the UI groups them.
  name: sglang-runtime
  uses: ./.github/workflows/shared-build-image.yml
  needs:
    - create-fresh-builder
    - resolve-source-sha
  secrets: inherit
  with:
    # Source and builder, both taken from upstream job outputs.
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
    # Image selection.
    framework: sglang
    target: runtime
    cuda_version: '["13.0"]'
    platform: 'linux/amd64,linux/arm64'
    image_tag_suffix: '-nightly'
    build_timeout_minutes: 120
trtllm-build:
  # The display name is shared with the other trtllm jobs so the UI groups them.
  name: trtllm-runtime
  uses: ./.github/workflows/shared-build-image.yml
  needs:
    - create-fresh-builder
    - resolve-source-sha
  secrets: inherit
  with:
    # Source and builder, both taken from upstream job outputs.
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
    # Image selection. This is the only build job in the file on CUDA 13.1.
    framework: trtllm
    target: runtime
    cuda_version: '["13.1"]'
    platform: 'linux/amd64,linux/arm64'
    image_tag_suffix: '-nightly'
    build_timeout_minutes: 120
# ============================================================================
# TEST JOBS
# ============================================================================
vllm-test:
  # The display name is shared with the other vllm jobs so the UI groups them.
  name: vllm-runtime
  uses: ./.github/workflows/shared-test.yml
  needs:
    - vllm-build
    - resolve-source-sha
    - compute-release-mode
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  secrets: inherit
  with:
    test_suite_name: vllm
    test_type: Test
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.vllm-build.outputs.target_tag_plain }}
    image_tag_suffix: '-nightly'
    cuda_version: '["13.0"]'
    # arm64 is for CPU tests; single-GPU tests are skipped there.
    platform: '["amd64", "arm64"]'
    # This runner is overridden for the ARM platform.
    amd_runner: prod-tester-amd-gpu-v2
    dd_env: nightly
    dd_flaky_retry_enabled: 'false'
    enable_coverage: true
    run_cpu_only_tests: true
    cpu_only_test_markers: 'vllm and gpu_0'
    gpu_test_markers: 'vllm and gpu_1'
    gpu_test_timeout_minutes: 240
    # Profiled tests run in the parallel stage; unprofiled ones fall through
    # to the sequential stage. A 24 GiB cap admits all currently profiled
    # vLLM tests (max is ~20.4 GiB) on a 48 GiB GPU.
    run_gpu_parallel_tests: true
    gpu_parallel_max_vram_gib: '24'
vllm-multi-gpu-test:
  # The display name is shared with the other vllm jobs so the UI groups them.
  name: vllm-runtime
  uses: ./.github/workflows/shared-test.yml
  needs:
    - vllm-build
    - resolve-source-sha
    - compute-release-mode
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  secrets: inherit
  with:
    test_suite_name: vllm
    test_type: 'Multi-GPU Test'
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.vllm-build.outputs.target_tag_plain }}
    image_tag_suffix: '-nightly'
    cuda_version: '["13.0"]'
    # amd64 only: no ARM GPUs are available.
    platform: '["amd64"]'
    amd_runner: prod-tester-amd-gpu-4-v2
    dd_env: nightly
    dd_flaky_retry_enabled: 'false'
    enable_coverage: true
    run_sanity_check: false
    gpu_test_markers: 'vllm and (gpu_2 or gpu_4)'
    gpu_test_timeout_minutes: 45
sglang-test:
  # The display name is shared with the other sglang jobs so the UI groups them.
  name: sglang-runtime
  uses: ./.github/workflows/shared-test.yml
  needs:
    - sglang-build
    - resolve-source-sha
    - compute-release-mode
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  secrets: inherit
  with:
    test_suite_name: sglang
    test_type: Test
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.sglang-build.outputs.target_tag_plain }}
    image_tag_suffix: '-nightly'
    cuda_version: '["13.0"]'
    # arm64 is for CPU tests; single-GPU tests are skipped there.
    platform: '["amd64", "arm64"]'
    # This runner is overridden for the ARM platform.
    amd_runner: prod-tester-amd-gpu-v2
    dd_env: nightly
    dd_flaky_retry_enabled: 'false'
    enable_coverage: true
    run_cpu_only_tests: true
    cpu_only_test_markers: 'sglang and gpu_0'
    gpu_test_markers: 'sglang and gpu_1'
    gpu_test_timeout_minutes: 240
    # Profiled tests run in the VRAM-aware GPU stage; unprofiled ones fall
    # through to the sequential stage. Current single-GPU runners are 24 GiB,
    # so this cap admits the profiled SGLang pool while yielding one auto slot
    # today. Larger runners will get more slots from the same markers.
    run_gpu_parallel_tests: true
    gpu_parallel_max_vram_gib: '24'
sglang-multi-gpu-test:
  # The display name is shared with the other sglang jobs so the UI groups them.
  name: sglang-runtime
  uses: ./.github/workflows/shared-test.yml
  needs:
    - sglang-build
    - resolve-source-sha
    - compute-release-mode
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  secrets: inherit
  with:
    test_suite_name: sglang
    test_type: 'Multi-GPU Test'
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.sglang-build.outputs.target_tag_plain }}
    image_tag_suffix: '-nightly'
    cuda_version: '["13.0"]'
    # amd64 only: no ARM GPUs are available.
    platform: '["amd64"]'
    amd_runner: prod-tester-amd-gpu-4-v2
    dd_env: nightly
    dd_flaky_retry_enabled: 'false'
    enable_coverage: true
    run_sanity_check: false
    gpu_test_markers: 'sglang and (gpu_2 or gpu_4)'
    gpu_test_timeout_minutes: 45
trtllm-test:
  # The display name is shared with the other trtllm jobs so the UI groups them.
  name: trtllm-runtime
  uses: ./.github/workflows/shared-test.yml
  needs:
    - trtllm-build
    - resolve-source-sha
    - compute-release-mode
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  secrets: inherit
  with:
    test_suite_name: trtllm
    test_type: Test
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.trtllm-build.outputs.target_tag_plain }}
    image_tag_suffix: '-nightly'
    cuda_version: '["13.1"]'
    # arm64 is for CPU tests; single-GPU tests are skipped there.
    platform: '["amd64", "arm64"]'
    # This runner is overridden for the ARM platform.
    amd_runner: prod-tester-amd-gpu-v2
    dd_env: nightly
    dd_flaky_retry_enabled: 'false'
    enable_coverage: true
    run_cpu_only_tests: true
    cpu_only_test_markers: 'trtllm and gpu_0'
    gpu_test_markers: 'trtllm and gpu_1'
    gpu_test_timeout_minutes: 240
    # Profiled tests run in the VRAM-aware GPU stage; unprofiled ones fall
    # through to the sequential stage. Current single-GPU runners are 24 GiB,
    # so this cap admits the profiled TRT-LLM pool while yielding one auto
    # slot today. Larger runners will get more slots from the same markers.
    run_gpu_parallel_tests: true
    gpu_parallel_max_vram_gib: '24'
trtllm-multi-gpu-test:
  # The display name is shared with the other trtllm jobs so the UI groups them.
  name: trtllm-runtime
  uses: ./.github/workflows/shared-test.yml
  needs:
    - trtllm-build
    - resolve-source-sha
    - compute-release-mode
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  secrets: inherit
  with:
    test_suite_name: trtllm
    test_type: 'Multi-GPU Test'
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.trtllm-build.outputs.target_tag_plain }}
    image_tag_suffix: '-nightly'
    cuda_version: '["13.1"]'
    # amd64 only: no ARM GPUs are available.
    platform: '["amd64"]'
    amd_runner: prod-tester-amd-gpu-4-v2
    dd_env: nightly
    dd_flaky_retry_enabled: 'false'
    enable_coverage: true
    run_sanity_check: false
    gpu_test_markers: 'trtllm and (gpu_2 or gpu_4)'
    gpu_test_timeout_minutes: 45
# ============================================================================
# XPU TEST JOBS
# ============================================================================
vllm-test-xpu:
  # Calls the XPU workflow for vllm at the resolved source SHA. Unlike the
  # other test jobs it does not depend on a build job in this file.
  name: vllm-xpu
  uses: ./.github/workflows/xpu-ci.yaml
  needs:
    - resolve-source-sha
    - compute-release-mode
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  with:
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    framework: vllm
    pytest_markers: 'vllm and xpu_1'
# ============================================================================
# DYNAMO RUNTIME PIPELINE
# ============================================================================
dynamo-pipeline:
  # Calls the dynamo pipeline workflow on the shared builder with caching
  # disabled, forwarding the nightly dev version suffix.
  name: dynamo-runtime
  uses: ./.github/workflows/dynamo-pipeline.yml
  needs:
    - create-fresh-builder
    - compute-dev-version
    - resolve-source-sha
  secrets: inherit
  with:
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    dev_version_suffix: ${{ needs.compute-dev-version.outputs.dev_suffix }}
    image_tag_suffix: '-nightly'
    builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
    fresh_builder: true
    no_cache: true
    build_timeout_minutes: 120
    # TODO: widen the marker sets beyond `pre_merge`. Today that picks up
    # tests (e.g. fault_tolerance/deploy/*) that fail in this container-only
    # context. The current selection matches the coverage of the old
    # container-validation-dynamo workflow.
    cpu_parallel_test_markers: 'pre_merge and parallel and not (vllm or sglang or trtllm) and (gpu_0)'
    cpu_sequential_test_markers: 'pre_merge and not parallel and not (vllm or sglang or trtllm) and (gpu_0)'
    gpu_test_markers: 'pre_merge and none and gpu_1'
# ============================================================================
# RUST COVERAGE
# ============================================================================
rust-tests:
  # Runs the Rust unit tests with coverage for each workspace directory in the
  # matrix and uploads the text and lcov reports as one artifact per directory.
  name: rust-${{ matrix.dir == '.' && 'root' || matrix.dir }}-coverage
  needs: [compute-release-mode]
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  runs-on:
    group: Fastchecker
  strategy:
    fail-fast: false
    matrix:
      dir: ['.', 'lib/bindings/python', 'lib/bindings/kvbm']
  permissions:
    contents: read
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      with:
        lfs: true
    # Computed before any build or test step so the `always()` upload below
    # still has a unique, non-empty artifact name when the tests fail.
    - name: Compute coverage artifact name
      env:
        MATRIX_DIR: ${{ matrix.dir }}
      run: |
        # "." becomes "root"; slashes become dashes so the name is flat.
        SAFE_DIR=$(echo "${MATRIX_DIR}" | sed 's|^\.$|root|' | tr '/' '-')
        echo "RUST_COV_ARTIFACT_NAME=coverage-rust-${SAFE_DIR}-${GITHUB_RUN_ID}" >> "$GITHUB_ENV"
    - name: Set up system dependencies
      run: |
        # Install protoc for Rust build dependencies (NOTE: much faster than apt install)
        PB_REL="https://github.com/protocolbuffers/protobuf/releases"
        PROTOC_VER="30.2"
        PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip"
        PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f"
        curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
        # Verify the download against the pinned checksum before unpacking.
        echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c -
        # -o overwrites without prompting, so a runner that already holds a
        # previous extraction does not stall or fail on the replace prompt.
        unzip -o "${PROTOC_ZIP}" -d "$HOME/.local"
        rm "${PROTOC_ZIP}"
        # `export` only affects this step; GITHUB_PATH makes protoc visible to
        # the cargo steps that follow.
        export PATH="$PATH:$HOME/.local/bin"
        echo "$HOME/.local/bin" >> "$GITHUB_PATH"
        protoc --version
    - name: Cache cargo artifacts
      uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
      with:
        path: |
          ~/.cargo/bin/
          ~/.cargo/registry
        key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
        restore-keys: ${{ runner.os }}-cargo-
    - name: Set up Rust Toolchain Components
      run: rustup component add llvm-tools-preview
    - name: Install cargo-llvm-cov
      # Skip the install when the binary is already available (e.g. from cache).
      run: cargo-llvm-cov --version 2>/dev/null || cargo install cargo-llvm-cov --locked
    # Have an explicit step to build tests first to separate time spent on build vs execution.
    - name: Compile Tests
      working-directory: ${{ matrix.dir }}
      run: cargo test --locked --no-run
    - name: Run Unit Tests with Coverage
      working-directory: ${{ matrix.dir }}
      # NOTE: --all-targets doesn't run doc tests.
      # cargo llvm-cov is a drop-in for cargo test; --no-report defers output
      # so we can generate multiple formats without re-running the tests.
      run: |
        cargo llvm-cov --locked --all-targets --no-report
        cargo llvm-cov report --output-path coverage-rust.txt
        cargo llvm-cov report --lcov --output-path coverage-rust.lcov
        echo "Coverage summary:"
        grep "^TOTAL" coverage-rust.txt || tail -3 coverage-rust.txt
    - name: Upload Rust Coverage Data
      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
      # Upload even after a failure, but never with an empty artifact name.
      if: ${{ always() && env.RUST_COV_ARTIFACT_NAME != '' }}
      with:
        name: ${{ env.RUST_COV_ARTIFACT_NAME }}
        path: |
          ${{ matrix.dir }}/coverage-rust.txt
          ${{ matrix.dir }}/coverage-rust.lcov
        retention-days: 7
# ============================================================================
# RELEASE (workflow_call into release.yml)
# ============================================================================
# commit_sha is the SHA `resolve-source-sha` picked (github.sha). release.yml
# requires post-merge CI to have already pushed images to ECR for that SHA;
# on schedule this is true because post-merge runs on every merge to main
# and nightly fires later.
release:
  # Hands off to release.yml for the resolved SHA once the gating conditions
  # in `if` hold.
  name: Release Nightly
  uses: ./.github/workflows/release.yml
  needs:
    # Inputs to the release call and its gates.
    - resolve-source-sha
    - compute-release-mode
    - notify-slack-start
    - manual-release-approval
    # Listed so the release waits for them to finish; see `if` below for
    # which results actually gate it.
    - vllm-test
    - vllm-multi-gpu-test
    - sglang-test
    - sglang-multi-gpu-test
    - trtllm-test
    - trtllm-multi-gpu-test
    - dynamo-pipeline
    - rust-tests
  # !cancelled() replaces the implicit success() check, so framework and rust
  # test failures do not block the release. dynamo-pipeline must succeed
  # because stage-wheels-artifactory extracts wheels from its image.
  # manual-release-approval is `skipped` on cron and `success` on an approved
  # dispatch; both are accepted.
  if: ${{ !cancelled()
    && needs.compute-release-mode.outputs.release == 'true'
    && needs.dynamo-pipeline.result == 'success'
    && (needs.manual-release-approval.result == 'success'
    || needs.manual-release-approval.result == 'skipped') }}
  secrets: inherit
  with:
    nightly: true
    commit_sha: ${{ needs.resolve-source-sha.outputs.source_sha }}
    slack_thread_ts: ${{ needs.notify-slack-start.outputs.thread_ts }}
    # `inputs` is empty on schedule, so fall back to false there.
    skip_gitlab_pipeline: ${{ inputs.skip_gitlab_pipeline || false }}
# ============================================================================
# COVERAGE REPORT
# ============================================================================
coverage-report:
  # Collects the Python and Rust coverage artifacts from the test jobs, merges
  # them, writes a Markdown summary to the run summary page and uploads the
  # combined reports.
  name: Generate Coverage Report
  runs-on: ubuntu-latest
  needs: [vllm-test, vllm-multi-gpu-test, sglang-test, sglang-multi-gpu-test, trtllm-test, trtllm-multi-gpu-test, rust-tests]
  # Runs even when test jobs failed or were skipped so partial data is reported.
  if: always()
  permissions:
    contents: read
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
    - name: Set up Python
      uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
      with:
        python-version: '3.12'
    - name: Install Coverage Tools
      run: |
        python -m pip install "coverage[toml]==7.13.1"
        python -m coverage --version
        echo "✅ Coverage tools installed"
    - name: Download All Python Coverage Artifacts
      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
      with:
        pattern: coverage-python-*
        path: coverage-artifacts/
        merge-multiple: false
    - name: Download All Rust Coverage Artifacts
      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
      continue-on-error: true
      with:
        pattern: coverage-rust-*
        path: coverage-rust-artifacts/
        merge-multiple: false
    - name: List Downloaded Artifacts
      run: |
        # When no artifact matched (tests skipped or failed before upload) the
        # download directories may not exist. Create them so this listing and
        # the later steps reach their "no coverage data" branches instead of
        # failing on a missing path.
        mkdir -p coverage-artifacts coverage-rust-artifacts
        echo "📦 Downloaded coverage artifacts:"
        echo "==== Directory structure (including hidden files) ===="
        ls -Ra coverage-artifacts/
        echo ""
        echo "==== Coverage files found ===="
        find coverage-artifacts/ -type f \( -name "*.xml" -o -name ".coverage*" \) | sort
        echo ""
        echo "==== Specifically looking for .coverage files ===="
        find coverage-artifacts/ -name ".coverage" -type f
        echo ""
    - name: Merge All Coverage Data
      run: |
        set -x
        echo "📊 Merging all test coverage..."
        # Expand $GITHUB_WORKSPACE in the config
        sed -i "s|\$GITHUB_WORKSPACE|${GITHUB_WORKSPACE}|g" .coveragerc
        echo "Updated .coveragerc with path remapping:"
        cat .coveragerc
        # Find all .coverage files and copy them with unique names. The name
        # is the artifact-relative path with "/" flattened to "_". -type f
        # keeps directories out, which a plain `cp` could not copy.
        mkdir -p coverage-combined
        find coverage-artifacts/ -type f -name ".coverage*" 2>/dev/null | while read -r file; do
          unique_name=$(echo "$file" | tr '/' '_' | sed 's/coverage-artifacts_//')
          cp "$file" "coverage-combined/.coverage.${unique_name}"
          echo "Copied: $file -> coverage-combined/.coverage.${unique_name}"
        done
        # Check if we have any coverage files
        if ls coverage-combined/.coverage* 1> /dev/null 2>&1; then
          echo "✅ Found coverage files to merge"
          echo "Files to merge:"
          ls -lh coverage-combined/
          # Combine all coverage data from the workspace root so that the
          # relative canonical path "components/src/dynamo" in .coveragerc
          # resolves to $GITHUB_WORKSPACE/components/src/dynamo (where the
          # source actually lives) rather than to the non-existent
          # coverage-combined/components/src/dynamo subdirectory.
          echo "Running coverage combine with path remapping..."
          set +e # Don't exit on error
          COVERAGE_RCFILE=.coveragerc coverage combine --keep coverage-combined/.coverage* 2>&1 | tee combine.log
          set -e
          if [ -f .coverage ]; then
            echo "✅ Combined .coverage file created"
            # Generate reports (continue even if some fail)
            echo "📊 Generating coverage reports..."
            set +e
            COVERAGE_RCFILE=.coveragerc coverage report --show-missing --data-file=.coverage 2>&1 | tee coverage-report.txt
            # The last field of the TOTAL row is the overall percentage.
            TOTAL_COVERAGE=$(awk '/^TOTAL/ {print $NF}' coverage-report.txt | tail -1)
            echo "TOTAL_COVERAGE=${TOTAL_COVERAGE:-0%}" >> "$GITHUB_ENV"
            COVERAGE_RCFILE=.coveragerc coverage html --data-file=.coverage -d coverage-html/ 2>&1 || echo "HTML generation failed"
            COVERAGE_RCFILE=.coveragerc coverage xml --data-file=.coverage -o coverage-merged.xml 2>&1 || echo "XML generation failed"
            set -e
          else
            echo "❌ Failed to create combined .coverage file"
            echo "No coverage data available" > coverage-report.txt
            echo "TOTAL_COVERAGE=0%" >> "$GITHUB_ENV"
          fi
        else
          echo "⚠️ No coverage data found"
          echo "No coverage data available" > coverage-report.txt
          echo "TOTAL_COVERAGE=0%" >> "$GITHUB_ENV"
        fi
    - name: Process Rust Coverage
      run: |
        RUST_TOTAL_COVERAGE="N/A"
        echo "No Rust coverage data available yet." > coverage-rust-report.txt
        if find coverage-rust-artifacts/ -name "coverage-rust.txt" -type f 2>/dev/null | grep -q .; then
          echo "📊 Processing Rust coverage reports..."
          # Concatenate full per-file coverage from every workspace dir
          : > coverage-rust-report.txt
          find coverage-rust-artifacts/ -name "coverage-rust.txt" | sort | while read -r f; do
            # Inner substitution is quoted so paths with spaces stay intact.
            DIR_NAME=$(basename "$(dirname "$f")")
            echo "=== ${DIR_NAME} ===" >> coverage-rust-report.txt
            cat "$f" >> coverage-rust-report.txt
            echo "" >> coverage-rust-report.txt
          done
          # Aggregate line coverage across all crates by summing total/missed lines
          # from every TOTAL row. Columns on a `cargo llvm-cov report` TOTAL line:
          # $2 regions $3 missed_regions $4 region%
          # $5 funcs $6 missed_funcs $7 func%
          # $8 lines $9 missed_lines $10 line%
          RUST_TOTAL_COVERAGE=$(find coverage-rust-artifacts/ -name "coverage-rust.txt" -exec grep -h "^TOTAL" {} + \
            | awk '{ total += $8; missed += $9 } END { if (total > 0) printf "%.2f%%", (total - missed) * 100 / total; else print "N/A" }')
          RUST_TOTAL_COVERAGE="${RUST_TOTAL_COVERAGE:-N/A}"
          echo "Rust line coverage (aggregated across $(find coverage-rust-artifacts/ -name "coverage-rust.txt" | wc -l | tr -d ' ') crates): ${RUST_TOTAL_COVERAGE}"
        else
          echo "ℹ️ No Rust coverage artifacts found"
        fi
        echo "RUST_TOTAL_COVERAGE=${RUST_TOTAL_COVERAGE}" >> "$GITHUB_ENV"
    - name: Create Coverage Summary
      run: |
        # -u makes the timestamp actually UTC, matching the "UTC" label and
        # the other date calls in this workflow.
        DATE=$(date -u +"%Y-%m-%d %H:%M:%S UTC")
        # Unquoted heredoc delimiter: variables and $(...) are expanded, so
        # literal backticks must stay backslash-escaped.
        cat > coverage-summary.md << EOF
        # 📊 Test Coverage Report
        **Date:** ${DATE}
        **Run ID:** ${{ github.run_id }}
        **Workflow:** ${{ github.workflow }}
        | Language | Total Coverage |
        |----------|---------------|
        | Python | ${TOTAL_COVERAGE} |
        | Rust | ${RUST_TOTAL_COVERAGE} |
        ---
        ## Python Coverage Details
        \`\`\`
        $(cat coverage-report.txt 2>/dev/null || echo "No Python coverage data available")
        \`\`\`
        ---
        ## Rust Coverage Details
        \`\`\`
        $(cat coverage-rust-report.txt 2>/dev/null || echo "No Rust coverage data available")
        \`\`\`
        ---
        ## 📁 Artifacts
        - Full HTML Report: Download \`coverage-reports-${{ github.run_id }}\` artifact
        - Python Coverage XML: \`coverage-merged.xml\`
        - Rust Coverage LCov: \`coverage-rust.lcov\` (per workspace dir)
        EOF
        echo "📄 Coverage summary generated"
        cat coverage-summary.md
    - name: Post to Workflow Summary
      run: cat coverage-summary.md >> "$GITHUB_STEP_SUMMARY"
    - name: Upload Coverage Reports
      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
      if: always()
      with:
        name: coverage-reports-${{ github.run_id }}
        path: |
          coverage-html/
          coverage-merged.xml
          coverage-report.txt
          coverage-rust-report.txt
          coverage-rust-artifacts/
          coverage-summary.md
          .coverage
        retention-days: 30
# ============================================================================
# CLEANUP
# ============================================================================
clean-k8s-builder:
  # Removes the run's BuildKit builder once every job that uses it has
  # finished, whatever their outcome.
  name: Clean K8s builder if exists
  runs-on: prod-default-small-v2
  if: always()
  needs: [vllm-build, sglang-build, trtllm-build, dynamo-pipeline, create-fresh-builder]
  permissions:
    contents: read
  steps:
    - name: Checkout repository
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
    - name: Register K8s builder context (skip bootstrap)
      uses: ./.github/actions/bootstrap-buildkit
      continue-on-error: true
      with:
        # Fall back to the env-derived name if the creating job produced no
        # output (e.g. it failed before exporting it).
        builder_name: ${{ needs.create-fresh-builder.outputs.builder_name || env.BUILDER_NAME }}
        buildkit_worker_addresses: ''
        skip_bootstrap: 'true'
    - name: Remove K8s builder
      shell: bash
      env:
        # Remove the builder under the name it was created with.
        # env.BUILDER_NAME embeds github.run_attempt, so on a "re-run failed
        # jobs" attempt it no longer matches the name that
        # create-fresh-builder exported during the earlier attempt; using it
        # directly would leave the real builder behind.
        TARGET_BUILDER: ${{ needs.create-fresh-builder.outputs.builder_name || env.BUILDER_NAME }}
      run: |
        # Best effort: the builder may already be gone.
        docker buildx rm "${TARGET_BUILDER}" || true
############################## SLACK NOTIFICATION ##############################
notify-slack:
  # After the test jobs finish, lists the failed jobs of this run and, if
  # there are any, posts a failure alert to Slack.
  name: Notify Slack
  runs-on: prod-default-v2
  if: always()
  needs: [vllm-test, vllm-multi-gpu-test, sglang-test, sglang-multi-gpu-test, trtllm-test, trtllm-multi-gpu-test, rust-tests]
  permissions:
    contents: read
    # Needed to list this run's jobs through the Actions REST API.
    actions: read
  steps:
    - name: Get Failed jobs
      shell: bash
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        JOBS_JSON=$(mktemp)
        # -f turns an HTTP error into a failing step instead of handing an
        # error body to jq.
        # NOTE(review): only the first 100 jobs are inspected; paginate if
        # the run can exceed that.
        curl -fsSL \
          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github+json" \
          "https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100" \
          > "$JOBS_JSON"
        # One line per failed job. Names with more than two " / " segments
        # are shortened to "<first> > <last>"; otherwise only the last
        # segment is kept. Each entry ends in a literal \n for the payload.
        FAILED_JOBS=$(jq -r '.jobs[] | select(.conclusion == "failure") | .name | split(" / ") | if length > 2 then ":failed: " + .[0] + " > " + .[-1] else ":failed: " + .[-1] end | . + "\\n"' "$JOBS_JSON")
        echo "$FAILED_JOBS"
        # Multi-line value, so use the delimiter form of GITHUB_ENV.
        {
          echo "FAILED_JOBS<<EOF"
          echo "$FAILED_JOBS"
          echo "EOF"
        } >> "$GITHUB_ENV"
    - name: Notify Slack
      # The message is a failure alert, so send it only when at least one job
      # actually failed rather than on every run.
      if: ${{ env.FAILED_JOBS != '' }}
      uses: slackapi/slack-github-action@91efab103c0de0a537f72a35f6b8cda0ee76bf0a #v2.1.1
      with:
        webhook: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
        webhook-type: incoming-webhook
        payload: |
          blocks:
            - type: "section"
              text:
                type: mrkdwn
                text: ":alert: *Github Nightly Pipeline Failure*"
            - type: "section"
              text:
                type: mrkdwn
                text: "<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|Workflow Summary>"
            - type: "section"
              text:
                type: mrkdwn
                text: "${{ env.FAILED_JOBS }}"
            - type: "section"
              text:
                type: mrkdwn
                text: "@ops-support Please investigate the failures above."