Nightly CI Pipeline #62

Workflow file for this run

.github/workflows/nightly-ci.yml at 722e720

	# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	# SPDX-License-Identifier: Apache-2.0

	# Display name of the workflow in the Actions UI.
	name: Nightly CI Pipeline

	# Triggers: one daily cron run plus manual dispatch with three boolean inputs.
	on:
	schedule:
	- cron: '0 8 * * *' # Daily at 08:00 UTC (12:00 AM PST; 1:00 AM local while PDT is in effect)
	workflow_dispatch: # Allow manual triggering for testing
	inputs:
	# Consumed by compute-release-mode (as DISPATCH_RELEASE) to decide whether the run publishes.
	release:
	description: 'Stage NGC images, Artifactory wheels, and trigger the GitLab release pipeline. Schedule always sets release=true.'
	required: false
	type: boolean
	default: false
	# Consumed by compute-release-mode (as DISPATCH_RUN_TESTS) to gate the test jobs.
	run_tests:
	description: 'Run vllm/sglang/trtllm tests + dynamo-pipeline checks. Schedule always runs them.'
	required: false
	type: boolean
	default: true
	# Forwarded unchanged to release.yml by the `release` job.
	skip_gitlab_pipeline:
	description: 'Skip the GitLab release automation pipeline trigger. Emergency use only.'
	required: false
	type: boolean
	default: false

	# Workflow-level default for the job token: read-only access to repository contents.
	permissions:
	contents: read


	env:
	# Builder name built from the run ID and run attempt. create-fresh-builder
	# exports it as `builder_name`; clean-k8s-builder removes the builder by this name.
	BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}

	jobs:

	# ============================================================================
	# PRE-WARM K8S BUILDER
	# ============================================================================
	# Bootstraps the buildx builder up front and publishes its name as the
	# `builder_name` job output, which the build jobs and dynamo-pipeline pass on.
	create-fresh-builder:
	name: Create fresh K8s builder
	runs-on: prod-default-small-v2
	permissions:
	contents: read
	outputs:
	builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
	steps:
	# The bootstrap action below is referenced by local path, so the repository
	# has to be checked out before it can be used.
	- name: Checkout repository
	uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	# Copies the workflow-level BUILDER_NAME into a step output so it can be
	# surfaced as a job output.
	- name: Export builder name
	id: export-builder-name
	run: \|
	echo "builder_name=${{ env.BUILDER_NAME }}" >> $GITHUB_OUTPUT
	# NOTE(review): the inputs of this local action are defined outside this file;
	# confirm there what an empty worker-address list and suppress_fallback_warning do.
	- name: Create and bootstrap fresh K8s builder
	uses: ./.github/actions/bootstrap-buildkit
	with:
	builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
	buildkit_worker_addresses: ''
	suppress_fallback_warning: 'true'

	# ============================================================================
	# RESOLVE SOURCE SHA
	# ============================================================================
	# Single SHA every downstream job (shared-build-image, dynamo-pipeline,
	# shared-test, release.yml) builds, tags, and releases against. On schedule
	# and workflow_dispatch from main, this is just github.sha.
	# Publishes the commit SHA of this run as the `source_sha` job output so every
	# consumer reads the same value from one place.
	resolve-source-sha:
	name: Resolve source SHA
	runs-on: prod-default-v2
	permissions:
	contents: read
	outputs:
	source_sha: ${{ steps.resolve.outputs.source_sha }}
	steps:
	- id: resolve
	shell: bash
	run: \|
	# Abort on any error, unset variable, or failed pipeline stage.
	set -euo pipefail
	# GITHUB_SHA is the default environment variable holding the commit that triggered the run.
	SHA="${GITHUB_SHA}"
	echo "Using caller SHA: ${SHA}"
	echo "source_sha=${SHA}" >> "$GITHUB_OUTPUT"

	# ============================================================================
	# COMPUTE NIGHTLY DEV VERSION
	# ============================================================================
	# Emits a PEP 440 dev suffix (e.g. .dev20260423) forwarded to every
	# pipeline below. At the leaf, the suffix is stamped into pyproject / Cargo
	# versions on the runner before docker build, so wheels produced by the
	# wheel_builder stage carry the dev version.
	# Emits `dev_suffix` (".dev" + the current UTC date as YYYYMMDD) as a job
	# output; notify-slack-start and dynamo-pipeline read it.
	compute-dev-version:
	  name: Compute dev version suffix
	  runs-on: prod-default-v2
	  permissions:
	    contents: read
	  outputs:
	    dev_suffix: ${{ steps.compute.outputs.dev_suffix }}
	  steps:
	    - id: compute
	      shell: bash
	      run: |
	        # Strict mode, matching the resolve-source-sha step: a failing `date`
	        # must not silently produce an empty suffix.
	        set -euo pipefail
	        # -u pins the date to UTC so the suffix does not depend on the runner's time zone.
	        DATE=$(date -u +%Y%m%d)
	        # Quoted so a path containing spaces cannot split the redirection target.
	        echo "dev_suffix=.dev${DATE}" >> "$GITHUB_OUTPUT"

	# ============================================================================
	# COMPUTE RELEASE MODE
	# ============================================================================
	# schedule → always release. workflow_dispatch → honor the `release` input.
	# Output is consumed by the `release` job below to gate the workflow_call
	# into release.yml.
	# Decides, per trigger type, whether this run publishes (`release`) and
	# whether the test jobs run (`run_tests`). Both outputs are the strings
	# 'true' or 'false'.
	compute-release-mode:
	  name: Compute release mode
	  runs-on: prod-default-v2
	  permissions:
	    contents: read
	  outputs:
	    release: ${{ steps.compute.outputs.release }}
	    run_tests: ${{ steps.compute.outputs.run_tests }}
	  steps:
	    - id: compute
	      shell: bash
	      env:
	        # Passed through the environment rather than interpolated into the script.
	        DISPATCH_RELEASE: ${{ inputs.release }}
	        DISPATCH_RUN_TESTS: ${{ inputs.run_tests }}
	      run: |
	        case "${GITHUB_EVENT_NAME}" in
	          schedule)
	            # cron must keep tests on so a failing nightly blocks the release.
	            # NOTE(review): the `release` job's `if:` only hard-gates on
	            # dynamo-pipeline; confirm that test failures are meant to block.
	            echo "release=true" >> "$GITHUB_OUTPUT"
	            echo "run_tests=true" >> "$GITHUB_OUTPUT"
	            ;;
	          workflow_dispatch)
	            # Fail closed: publish only on an explicit "true". An empty or
	            # unexpected value must never turn a manual run into a release.
	            if [ "${DISPATCH_RELEASE}" = "true" ]; then
	              echo "release=true" >> "$GITHUB_OUTPUT"
	            else
	              echo "release=false" >> "$GITHUB_OUTPUT"
	            fi
	            # Tests stay on unless explicitly disabled, so an empty or
	            # unexpected value errs on the side of running them.
	            if [ "${DISPATCH_RUN_TESTS}" = "false" ]; then
	              echo "run_tests=false" >> "$GITHUB_OUTPUT"
	            else
	              echo "run_tests=true" >> "$GITHUB_OUTPUT"
	            fi
	            ;;
	          *)
	            # Any other trigger: never release, always test.
	            echo "release=false" >> "$GITHUB_OUTPUT"
	            echo "run_tests=true" >> "$GITHUB_OUTPUT"
	            ;;
	        esac

	# Seeds the nightly Slack thread; exposes ts so downstream jobs (and the
	# GitLab pipeline) can reply. continue-on-error so a Slack outage doesn't
	# block the nightly — downstream jobs skip on empty outputs.
	# Posts the "nightly started" message via Slack chat.postMessage and exposes
	# the message timestamp as `thread_ts`. Runs only when compute-release-mode
	# reports release == 'true'.
	notify-slack-start:
	name: Notify Slack — nightly started
	runs-on: prod-default-v2
	needs: [compute-release-mode, resolve-source-sha, compute-dev-version]
	if: ${{ needs.compute-release-mode.outputs.release == 'true' }}
	permissions:
	contents: read
	outputs:
	thread_ts: ${{ steps.post.outputs.thread_ts }}
	steps:
	- name: Post start message
	id: post
	# A failure of this step does not fail the job; thread_ts is then left unset.
	continue-on-error: true
	env:
	SLACK_BOT_TOKEN: ${{ secrets.SLACK_RELEASE_BOT_TOKEN }}
	SLACK_CHANNEL_ID: ${{ secrets.SLACK_RELEASE_CHANNEL_ID }}
	SOURCE_SHA: ${{ needs.resolve-source-sha.outputs.source_sha }}
	DEV_SUFFIX: ${{ needs.compute-dev-version.outputs.dev_suffix }}
	RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
	shell: bash
	run: \|
	set -euo pipefail
	DATE_UTC=$(date -u +%Y-%m-%d)
	# First seven characters of the commit SHA, for display only.
	SHORT_SHA="${SOURCE_SHA:0:7}"

	# Build the JSON payload with jq so every value is escaped as a JSON string.
	PAYLOAD=$(jq -n \
	--arg channel "${SLACK_CHANNEL_ID}" \
	--arg date "${DATE_UTC}" \
	--arg sha "${SHORT_SHA}" \
	--arg run_url "${RUN_URL}" \
	--arg dev "${DEV_SUFFIX}" \
	'{
	channel: $channel,
	text: ("Dynamo Nightly build started — " + $date),
	blocks: [
	{ type: "header",
	text: { type: "plain_text", text: (":crescent_moon: Dynamo Nightly build started — " + $date) } },
	{ type: "section",
	fields: [
	{ type: "mrkdwn", text: ("Commit:\n`" + $sha + "`") },
	{ type: "mrkdwn", text: ("Dev version suffix:\n`" + $dev + "`") }
	] },
	{ type: "context",
	elements: [ { type: "mrkdwn", text: ("<" + $run_url + "\|GitHub Actions run>") } ] }
	]
	}')

	# -f makes curl exit non-zero on an HTTP error status.
	RESPONSE=$(curl -fsSL -X POST https://slack.com/api/chat.postMessage \
	-H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \
	-H "Content-type: application/json; charset=utf-8" \
	--data "${PAYLOAD}")

	# The script treats the post as successful only when the response's `.ok` is true.
	OK=$(echo "${RESPONSE}" \| jq -r '.ok')
	if [ "${OK}" != "true" ]; then
	# RESPONSE may carry channel/ts/warning fields; surface only .error.
	ERR=$(echo "${RESPONSE}" \| jq -r '.error // "unknown"')
	echo "::error::Slack chat.postMessage failed: ${ERR}"
	exit 1
	fi
	TS=$(echo "${RESPONSE}" \| jq -r '.ts')
	# ts is a runtime per-message identifier (not a secret) and is the only
	# value other jobs need; channel ID is read by every downstream Slack
	# step directly from secrets.SLACK_RELEASE_CHANNEL_ID so it stays
	# auto-masked. Do NOT ::add-mask:: the ts — masking it before writing
	# GITHUB_OUTPUT caused the value to arrive empty across the
	# workflow_call boundary, silently skipping downstream reply steps.
	echo "thread_ts=${TS}" >> "${GITHUB_OUTPUT}"
	echo "Posted nightly start message"

	# ============================================================================
	# MANUAL RELEASE APPROVAL GATE
	# ============================================================================
	# Manual workflow_dispatch that would publish (release=true) must be approved
	# by a reviewer of the `manual-release-approval` environment before the
	# `release` job runs release.yml. Scheduled runs skip this job; the `release`
	# job below accepts `skipped` as success.
	# Approval gate for manually dispatched releases. Runs only for
	# workflow_dispatch with release == 'true'; the `release` job accepts this
	# job's result being either 'success' or 'skipped'.
	manual-release-approval:
	  name: Manual release approval gate
	  needs: [compute-release-mode]
	  if: ${{ github.event_name == 'workflow_dispatch' && needs.compute-release-mode.outputs.release == 'true' }}
	  runs-on: prod-default-small-v2
	  # The job is bound to this environment; its steps start only after the
	  # environment's protection rules have been satisfied.
	  environment: manual-release-approval
	  permissions:
	    contents: read
	  steps:
	    - name: Record approval
	      env:
	        # github.actor is the user who triggered the run, NOT the reviewer who
	        # approved the environment, so it is logged as the requester.
	        # Passing it via env keeps the expression out of the script text.
	        TRIGGERED_BY: ${{ github.actor }}
	      run: |
	        echo "Manual nightly release approval gate passed (run triggered by ${TRIGGERED_BY})"
	        echo "Run: ${GITHUB_RUN_ID} attempt ${GITHUB_RUN_ATTEMPT}"

	# ============================================================================
	# BUILD JOBS
	# ============================================================================

	# Calls the reusable shared-build-image.yml workflow for the vllm `runtime`
	# target on linux/amd64 and linux/arm64, using the builder created by
	# create-fresh-builder and the SHA from resolve-source-sha.
	vllm-build:
	name: vllm-runtime # This name overlaps with other vllm jobs to group them in the UI
	needs: [create-fresh-builder, resolve-source-sha]
	uses: ./.github/workflows/shared-build-image.yml
	with:
	framework: vllm
	target: runtime
	# Passed as a string containing a JSON array.
	cuda_version: '["13.0"]'
	platform: 'linux/amd64,linux/arm64'
	builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
	build_timeout_minutes: 120
	source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
	image_tag_suffix: '-nightly'
	# Makes all of this workflow's secrets available to the called workflow.
	secrets: inherit

	# Calls the reusable shared-build-image.yml workflow for the sglang `runtime`
	# target on linux/amd64 and linux/arm64, using the builder created by
	# create-fresh-builder and the SHA from resolve-source-sha.
	sglang-build:
	name: sglang-runtime # This name overlaps with other sglang jobs to group them in the UI
	needs: [create-fresh-builder, resolve-source-sha]
	uses: ./.github/workflows/shared-build-image.yml
	with:
	framework: sglang
	target: runtime
	# Passed as a string containing a JSON array.
	cuda_version: '["13.0"]'
	platform: 'linux/amd64,linux/arm64'
	builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
	build_timeout_minutes: 120
	source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
	image_tag_suffix: '-nightly'
	# Makes all of this workflow's secrets available to the called workflow.
	secrets: inherit

	# Calls the reusable shared-build-image.yml workflow for the trtllm `runtime`
	# target on linux/amd64 and linux/arm64, using the builder created by
	# create-fresh-builder and the SHA from resolve-source-sha.
	trtllm-build:
	name: trtllm-runtime # This name overlaps with other trtllm jobs to group them in the UI
	needs: [create-fresh-builder, resolve-source-sha]
	uses: ./.github/workflows/shared-build-image.yml
	with:
	framework: trtllm
	target: runtime
	# Differs from the vllm/sglang builds, which pass 13.0.
	cuda_version: '["13.1"]'
	platform: 'linux/amd64,linux/arm64'
	builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
	build_timeout_minutes: 120
	source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
	image_tag_suffix: '-nightly'
	# Makes all of this workflow's secrets available to the called workflow.
	secrets: inherit

	# ============================================================================
	# TEST JOBS
	# ============================================================================

	# Calls shared-test.yml for the vllm suite (CPU-only `gpu_0` and single-GPU
	# `gpu_1` markers) with the image tag output by vllm-build. Runs only when
	# compute-release-mode reports run_tests == 'true'.
	vllm-test:
	name: vllm-runtime # This name overlaps with other vllm jobs to group them in the UI
	needs: [vllm-build, resolve-source-sha, compute-release-mode]
	if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
	uses: ./.github/workflows/shared-test.yml
	with:
	dd_env: nightly
	dd_flaky_retry_enabled: 'false'
	test_suite_name: vllm
	test_type: Test
	amd_runner: prod-tester-amd-gpu-v2 # This runner is overridden for ARM platform
	target_tag_plain: ${{ needs.vllm-build.outputs.target_tag_plain }}
	cuda_version: '["13.0"]'
	platform: '["amd64", "arm64"]' # arm64 for CPU tests, single GPU tests are skipped
	enable_coverage: true
	run_cpu_only_tests: true
	cpu_only_test_markers: vllm and gpu_0
	gpu_test_markers: vllm and gpu_1
	gpu_test_timeout_minutes: 240
	# Profiled tests run in the parallel stage; unprofiled fall through to sequential.
	# 24 GiB admits all currently profiled vLLM tests (max is ~20.4 GiB) on a 48 GiB GPU.
	# NOTE(review): the sglang-test and trtllm-test comments describe the same
	# runner label as 24 GiB; confirm which GPU size is current.
	run_gpu_parallel_tests: true
	gpu_parallel_max_vram_gib: '24'
	source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
	image_tag_suffix: '-nightly'
	secrets: inherit

	# Calls shared-test.yml for the vllm `gpu_2`/`gpu_4` markers on the
	# prod-tester-amd-gpu-4-v2 runner, amd64 only. Runs only when
	# compute-release-mode reports run_tests == 'true'.
	vllm-multi-gpu-test:
	name: vllm-runtime # This name overlaps with other vllm jobs to group them in the UI
	needs: [vllm-build, resolve-source-sha, compute-release-mode]
	if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
	uses: ./.github/workflows/shared-test.yml
	with:
	dd_env: nightly
	dd_flaky_retry_enabled: 'false'
	test_suite_name: vllm
	test_type: Multi-GPU Test
	amd_runner: prod-tester-amd-gpu-4-v2
	target_tag_plain: ${{ needs.vllm-build.outputs.target_tag_plain }}
	cuda_version: '["13.0"]'
	platform: '["amd64"]' # No ARM GPUs available
	enable_coverage: true
	# Unlike the single-GPU job, this call sets run_sanity_check and passes no CPU-only inputs.
	run_sanity_check: false
	gpu_test_markers: vllm and (gpu_2 or gpu_4)
	gpu_test_timeout_minutes: 45
	source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
	image_tag_suffix: '-nightly'
	secrets: inherit

	# Calls shared-test.yml for the sglang suite (CPU-only `gpu_0` and single-GPU
	# `gpu_1` markers) with the image tag output by sglang-build. Runs only when
	# compute-release-mode reports run_tests == 'true'.
	sglang-test:
	name: sglang-runtime # This name overlaps with other sglang jobs to group them in the UI
	needs: [sglang-build, resolve-source-sha, compute-release-mode]
	if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
	uses: ./.github/workflows/shared-test.yml
	with:
	dd_env: nightly
	dd_flaky_retry_enabled: 'false'
	test_suite_name: sglang
	test_type: Test
	amd_runner: prod-tester-amd-gpu-v2 # This runner is overridden for ARM platform
	target_tag_plain: ${{ needs.sglang-build.outputs.target_tag_plain }}
	cuda_version: '["13.0"]'
	platform: '["amd64", "arm64"]' # arm64 for CPU tests, single GPU tests are skipped
	enable_coverage: true
	run_cpu_only_tests: true
	cpu_only_test_markers: sglang and gpu_0
	gpu_test_markers: sglang and gpu_1
	gpu_test_timeout_minutes: 240
	# Profiled tests run in the VRAM-aware GPU stage; unprofiled fall through to sequential.
	# Current single-GPU runners are 24 GiB, so this cap admits the profiled SGLang pool
	# while yielding one auto slot today. Larger runners will get more slots from the same markers.
	run_gpu_parallel_tests: true
	gpu_parallel_max_vram_gib: '24'
	source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
	image_tag_suffix: '-nightly'
	secrets: inherit

	# Calls shared-test.yml for the sglang `gpu_2`/`gpu_4` markers on the
	# prod-tester-amd-gpu-4-v2 runner, amd64 only. Runs only when
	# compute-release-mode reports run_tests == 'true'.
	sglang-multi-gpu-test:
	name: sglang-runtime # This name overlaps with other sglang jobs to group them in the UI
	needs: [sglang-build, resolve-source-sha, compute-release-mode]
	if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
	uses: ./.github/workflows/shared-test.yml
	with:
	dd_env: nightly
	dd_flaky_retry_enabled: 'false'
	test_suite_name: sglang
	test_type: Multi-GPU Test
	amd_runner: prod-tester-amd-gpu-4-v2
	target_tag_plain: ${{ needs.sglang-build.outputs.target_tag_plain }}
	cuda_version: '["13.0"]'
	platform: '["amd64"]' # No ARM GPUs available
	enable_coverage: true
	# Unlike the single-GPU job, this call sets run_sanity_check and passes no CPU-only inputs.
	run_sanity_check: false
	gpu_test_markers: sglang and (gpu_2 or gpu_4)
	gpu_test_timeout_minutes: 45
	source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
	image_tag_suffix: '-nightly'
	secrets: inherit

	# Calls shared-test.yml for the trtllm suite (CPU-only `gpu_0` and single-GPU
	# `gpu_1` markers) with the image tag output by trtllm-build. Runs only when
	# compute-release-mode reports run_tests == 'true'.
	trtllm-test:
	name: trtllm-runtime # This name overlaps with other trtllm jobs to group them in the UI
	needs: [trtllm-build, resolve-source-sha, compute-release-mode]
	if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
	uses: ./.github/workflows/shared-test.yml
	with:
	dd_env: nightly
	dd_flaky_retry_enabled: 'false'
	test_suite_name: trtllm
	test_type: Test
	amd_runner: prod-tester-amd-gpu-v2 # This runner is overridden for ARM platform
	target_tag_plain: ${{ needs.trtllm-build.outputs.target_tag_plain }}
	# Matches the CUDA version passed by trtllm-build.
	cuda_version: '["13.1"]'
	platform: '["amd64", "arm64"]' # arm64 for CPU tests, single GPU tests are skipped
	enable_coverage: true
	run_cpu_only_tests: true
	cpu_only_test_markers: trtllm and gpu_0
	gpu_test_markers: trtllm and gpu_1
	gpu_test_timeout_minutes: 240
	# Profiled tests run in the VRAM-aware GPU stage; unprofiled fall through to sequential.
	# Current single-GPU runners are 24 GiB, so this cap admits the profiled TRT-LLM pool
	# while yielding one auto slot today. Larger runners will get more slots from the same markers.
	run_gpu_parallel_tests: true
	gpu_parallel_max_vram_gib: '24'
	source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
	image_tag_suffix: '-nightly'
	secrets: inherit

	# Calls shared-test.yml for the trtllm `gpu_2`/`gpu_4` markers on the
	# prod-tester-amd-gpu-4-v2 runner, amd64 only. Runs only when
	# compute-release-mode reports run_tests == 'true'.
	trtllm-multi-gpu-test:
	name: trtllm-runtime # This name overlaps with other trtllm jobs to group them in the UI
	needs: [trtllm-build, resolve-source-sha, compute-release-mode]
	if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
	uses: ./.github/workflows/shared-test.yml
	with:
	dd_env: nightly
	dd_flaky_retry_enabled: 'false'
	test_suite_name: trtllm
	test_type: Multi-GPU Test
	amd_runner: prod-tester-amd-gpu-4-v2
	target_tag_plain: ${{ needs.trtllm-build.outputs.target_tag_plain }}
	# Matches the CUDA version passed by trtllm-build.
	cuda_version: '["13.1"]'
	platform: '["amd64"]' # No ARM GPUs available
	enable_coverage: true
	# Unlike the single-GPU job, this call sets run_sanity_check and passes no CPU-only inputs.
	run_sanity_check: false
	gpu_test_markers: trtllm and (gpu_2 or gpu_4)
	gpu_test_timeout_minutes: 45
	source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
	image_tag_suffix: '-nightly'
	secrets: inherit

	# ============================================================================
	# XPU TEST JOBS
	# ============================================================================

	# Calls the reusable xpu-ci.yaml workflow for the `vllm and xpu_1` pytest
	# markers at the resolved source SHA. It does not depend on any build job in
	# this file and runs only when run_tests == 'true'.
	# NOTE(review): unlike the other reusable-workflow calls here, this one passes
	# no `secrets: inherit`, and no other job in this file lists it in `needs`
	# (release, coverage-report, notify-slack) — confirm both are intentional.
	vllm-test-xpu:
	name: vllm-xpu
	needs: [resolve-source-sha, compute-release-mode]
	if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
	uses: ./.github/workflows/xpu-ci.yaml
	with:
	framework: vllm
	pytest_markers: 'vllm and xpu_1'
	source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}

	# ============================================================================
	# DYNAMO RUNTIME PIPELINE
	# ============================================================================
	# Calls the reusable dynamo-pipeline.yml workflow with the fresh builder, the
	# dev version suffix and the resolved SHA. This job is not conditioned on
	# run_tests, and the `release` job requires its result to be 'success'.
	dynamo-pipeline:
	name: dynamo-runtime
	needs: [create-fresh-builder, compute-dev-version, resolve-source-sha]
	uses: ./.github/workflows/dynamo-pipeline.yml
	with:
	builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
	fresh_builder: true
	no_cache: true
	build_timeout_minutes: 120
	# TODO: widen beyond `pre_merge` — today it picks up tests
	# (e.g. fault_tolerance/deploy/*) that fail in this container-only
	# context. Matches the coverage of the old container-validation-dynamo
	# workflow.
	cpu_parallel_test_markers: 'pre_merge and parallel and not (vllm or sglang or trtllm) and (gpu_0)'
	cpu_sequential_test_markers: 'pre_merge and not parallel and not (vllm or sglang or trtllm) and (gpu_0)'
	# NOTE(review): `none` presumably names a marker that no test carries, making
	# the GPU selection empty on purpose — confirm against the called workflow.
	gpu_test_markers: 'pre_merge and none and gpu_1'
	dev_version_suffix: ${{ needs.compute-dev-version.outputs.dev_suffix }}
	source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
	image_tag_suffix: '-nightly'
	secrets: inherit

	# ============================================================================
	# RUST COVERAGE
	# ============================================================================
	# Runs the Rust tests under cargo-llvm-cov once per matrix directory and
	# uploads the text and lcov coverage reports as a per-directory artifact.
	# Runs only when compute-release-mode reports run_tests == 'true'.
	rust-tests:
	  name: rust-${{ matrix.dir == '.' && 'root' || matrix.dir }}-coverage
	  needs: [compute-release-mode]
	  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
	  runs-on:
	    group: Fastchecker
	  strategy:
	    # One failing directory must not cancel the other matrix entries.
	    fail-fast: false
	    matrix:
	      dir: ['.', 'lib/bindings/python', 'lib/bindings/kvbm']
	  permissions:
	    contents: read
	  steps:
	    # Computed first so the `always()` upload step has a valid, unique
	    # artifact name even when compilation or the tests fail. Previously the
	    # name was only exported after a successful test run.
	    - name: Compute coverage artifact name
	      env:
	        MATRIX_DIR: ${{ matrix.dir }}
	      run: |
	        # "." becomes "root"; path separators become "-".
	        SAFE_DIR=$(echo "${MATRIX_DIR}" | sed 's|^\.$|root|' | tr '/' '-')
	        echo "RUST_COV_ARTIFACT_NAME=coverage-rust-${SAFE_DIR}-${GITHUB_RUN_ID}" >> "$GITHUB_ENV"
	    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	      with:
	        lfs: true
	    - name: Set up system dependencies
	      run: |
	        # Install protoc for Rust build dependencies (NOTE: much faster than apt install)
	        PB_REL="https://github.com/protocolbuffers/protobuf/releases"
	        PROTOC_VER="30.2"
	        PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip"
	        PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f"
	        curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
	        # Verify the download against the pinned checksum before unpacking.
	        echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c -
	        # -o overwrites files left by an earlier run instead of prompting.
	        unzip -o "${PROTOC_ZIP}" -d "$HOME/.local"
	        rm "${PROTOC_ZIP}"
	        # `export` only affects this step; GITHUB_PATH makes protoc visible to
	        # the later cargo steps regardless of the runner's default PATH.
	        export PATH="$PATH:$HOME/.local/bin"
	        echo "$HOME/.local/bin" >> "$GITHUB_PATH"
	        protoc --version
	    - name: Cache cargo artifacts
	      uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
	      with:
	        path: |
	          ~/.cargo/bin/
	          ~/.cargo/registry
	        key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
	        restore-keys: ${{ runner.os }}-cargo-
	    - name: Set up Rust Toolchain Components
	      run: rustup component add llvm-tools-preview
	    - name: Install cargo-llvm-cov
	      # Reuse a cached binary when present; otherwise install it.
	      run: cargo-llvm-cov --version 2>/dev/null || cargo install cargo-llvm-cov --locked
	    # Have an explicit step to build tests first to separate time spent on build vs execution.
	    - name: Compile Tests
	      working-directory: ${{ matrix.dir }}
	      run: cargo test --locked --no-run
	    - name: Run Unit Tests with Coverage
	      working-directory: ${{ matrix.dir }}
	      # NOTE: --all-targets doesn't run doc tests.
	      # cargo llvm-cov is a drop-in for cargo test; --no-report defers output
	      # so we can generate multiple formats without re-running the tests.
	      run: |
	        cargo llvm-cov --locked --all-targets --no-report
	        cargo llvm-cov report --output-path coverage-rust.txt
	        cargo llvm-cov report --lcov --output-path coverage-rust.lcov
	        echo "Coverage summary:"
	        grep "^TOTAL" coverage-rust.txt || tail -3 coverage-rust.txt
	    - name: Upload Rust Coverage Data
	      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
	      if: always()
	      with:
	        name: ${{ env.RUST_COV_ARTIFACT_NAME }}
	        path: |
	          ${{ matrix.dir }}/coverage-rust.txt
	          ${{ matrix.dir }}/coverage-rust.lcov
	        retention-days: 7

	# ============================================================================
	# RELEASE (workflow_call into release.yml)
	# ============================================================================
	# commit_sha is the SHA `resolve-source-sha` picked (github.sha). release.yml
	# requires post-merge CI to have already pushed images to ECR for that SHA;
	# on schedule this is true because post-merge runs on every merge to main
	# and nightly fires later.
	# Calls the reusable release.yml workflow in nightly mode with the resolved
	# SHA and the Slack thread timestamp from notify-slack-start.
	release:
	name: Release Nightly
	# Listing the test jobs here makes the release wait for them to finish; the
	# `if:` below does not inspect their results.
	needs:
	- resolve-source-sha
	- compute-release-mode
	- notify-slack-start
	- manual-release-approval
	- vllm-test
	- vllm-multi-gpu-test
	- sglang-test
	- sglang-multi-gpu-test
	- trtllm-test
	- trtllm-multi-gpu-test
	- dynamo-pipeline
	- rust-tests
	# !cancelled() lets framework/rust failures fall through; dynamo-pipeline
	# is gated strictly because stage-wheels-artifactory extracts wheels from
	# its image. manual-release-approval is `skipped` on cron, `success` on dispatch.
	# NOTE(review): the comment in compute-release-mode says a failing nightly
	# blocks the release, which this condition does not enforce for the
	# framework/rust test jobs — confirm which behavior is intended.
	if: ${{ !cancelled()
	&& needs.compute-release-mode.outputs.release == 'true'
	&& needs.dynamo-pipeline.result == 'success'
	&& (needs.manual-release-approval.result == 'success'
	\|\| needs.manual-release-approval.result == 'skipped') }}
	uses: ./.github/workflows/release.yml
	with:
	commit_sha: ${{ needs.resolve-source-sha.outputs.source_sha }}
	nightly: true
	# `inputs` is empty on scheduled runs, so this falls back to false there.
	skip_gitlab_pipeline: ${{ inputs.skip_gitlab_pipeline \|\| false }}
	slack_thread_ts: ${{ needs.notify-slack-start.outputs.thread_ts }}
	secrets: inherit

	# ============================================================================
	# COVERAGE REPORT
	# ============================================================================
	# Collects the Python and Rust coverage artifacts of this run, merges them,
	# and publishes a summary. `if: always()` makes it run even when the needed
	# test jobs failed or were skipped.
	coverage-report:
	name: Generate Coverage Report
	runs-on: ubuntu-latest
	needs: [vllm-test, vllm-multi-gpu-test, sglang-test, sglang-multi-gpu-test, trtllm-test, trtllm-multi-gpu-test, rust-tests]
	if: always()
	permissions:
	contents: read
	steps:
	# The merge step edits and reads .coveragerc from the checked-out repository.
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

	- name: Set up Python
	uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
	with:
	python-version: '3.12'

	# coverage is pinned to an exact version.
	- name: Install Coverage Tools
	run: \|
	python -m pip install "coverage[toml]==7.13.1"
	python -m coverage --version
	echo "✅ Coverage tools installed"

	# merge-multiple: false keeps each artifact in its own subdirectory of `path`.
	- name: Download All Python Coverage Artifacts
	uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
	with:
	pattern: coverage-python-*
	path: coverage-artifacts/
	merge-multiple: false

	# A failure of this download does not fail the job (continue-on-error).
	- name: Download All Rust Coverage Artifacts
	uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
	continue-on-error: true
	with:
	pattern: coverage-rust-*
	path: coverage-rust-artifacts/
	merge-multiple: false

	# Diagnostic listing of what the Python coverage download produced.
	- name: List Downloaded Artifacts
	  run: |
	    # Guard: `ls` would abort the step if nothing was downloaded and the
	    # directory is therefore missing (e.g. when the test jobs were skipped).
	    mkdir -p coverage-artifacts
	    echo "📦 Downloaded coverage artifacts:"
	    echo "==== Directory structure (including hidden files) ===="
	    ls -Ra coverage-artifacts/

	    echo ""
	    echo "==== Coverage files found ===="
	    # The escaped parentheses group the two -name tests so that -type f
	    # applies to both; the globs match XML reports and .coverage data files
	    # (same ".coverage*" pattern the merge step searches for).
	    find coverage-artifacts/ -type f \( -name "*.xml" -o -name ".coverage*" \) | sort
	    echo ""
	    echo "==== Specifically looking for .coverage files ===="
	    find coverage-artifacts/ -name ".coverage" -type f
	    echo ""

	# Copies every downloaded .coverage* file under a unique name, combines them
	# with `coverage combine`, and writes the text/HTML/XML reports. Always
	# exports TOTAL_COVERAGE to GITHUB_ENV ("0%" when nothing could be merged).
	- name: Merge All Coverage Data
	run: \|
	# Trace every command in the log.
	set -x
	echo "📊 Merging all test coverage..."

	# Expand $GITHUB_WORKSPACE in the config
	sed -i "s\|\$GITHUB_WORKSPACE\|${GITHUB_WORKSPACE}\|g" .coveragerc

	echo "Updated .coveragerc with path remapping:"
	cat .coveragerc

	# Find all .coverage files and copy them with unique names
	mkdir -p coverage-combined
	find coverage-artifacts/ -name ".coverage*" 2>/dev/null \| while read -r file; do
	# Flatten the path into one file name and drop the leading directory prefix.
	unique_name=$(echo "$file" \| tr '/' '_' \| sed 's/coverage-artifacts_//')
	cp "$file" "coverage-combined/.coverage.${unique_name}"
	echo "Copied: $file -> coverage-combined/.coverage.${unique_name}"
	done

	# Check if we have any coverage files
	if ls coverage-combined/.coverage* 1> /dev/null 2>&1; then
	echo "✅ Found coverage files to merge"
	echo "Files to merge:"
	ls -lh coverage-combined/

	# Combine all coverage data from the workspace root so that the
	# relative canonical path "components/src/dynamo" in .coveragerc
	# resolves to $GITHUB_WORKSPACE/components/src/dynamo (where the
	# source actually lives) rather than to the non-existent
	# coverage-combined/components/src/dynamo subdirectory.
	echo "Running coverage combine with path remapping..."
	set +e # Don't exit on error
	# --keep leaves the copied input files in place after combining.
	COVERAGE_RCFILE=.coveragerc coverage combine --keep coverage-combined/.coverage* 2>&1 \| tee combine.log
	set -e

	# Success is judged by the combined data file existing, not by the exit code above.
	if [ -f .coverage ]; then
	echo "✅ Combined .coverage file created"

	# Generate reports (continue even if some fail)
	echo "📊 Generating coverage reports..."
	set +e
	COVERAGE_RCFILE=.coveragerc coverage report --show-missing --data-file=.coverage 2>&1 \| tee coverage-report.txt
	# Last field of the TOTAL row of the text report; falls back to 0% when absent.
	TOTAL_COVERAGE=$(awk '/^TOTAL/ {print $NF}' coverage-report.txt \| tail -1)
	echo "TOTAL_COVERAGE=${TOTAL_COVERAGE:-0%}" >> $GITHUB_ENV
	COVERAGE_RCFILE=.coveragerc coverage html --data-file=.coverage -d coverage-html/ 2>&1 \|\| echo "HTML generation failed"
	COVERAGE_RCFILE=.coveragerc coverage xml --data-file=.coverage -o coverage-merged.xml 2>&1 \|\| echo "XML generation failed"
	set -e

	else
	echo "❌ Failed to create combined .coverage file"
	echo "No coverage data available" > coverage-report.txt
	echo "TOTAL_COVERAGE=0%" >> $GITHUB_ENV
	fi
	else
	echo "⚠️ No coverage data found"
	echo "No coverage data available" > coverage-report.txt
	echo "TOTAL_COVERAGE=0%" >> $GITHUB_ENV
	fi

	# Concatenates the per-directory Rust text reports into
	# coverage-rust-report.txt and exports an aggregated line-coverage
	# percentage as RUST_TOTAL_COVERAGE ("N/A" when no report was downloaded).
	- name: Process Rust Coverage
	run: \|
	# Defaults used when no Rust artifact is present.
	RUST_TOTAL_COVERAGE="N/A"
	echo "No Rust coverage data available yet." > coverage-rust-report.txt

	if find coverage-rust-artifacts/ -name "coverage-rust.txt" -type f 2>/dev/null \| grep -q .; then
	echo "📊 Processing Rust coverage reports..."
	# Concatenate full per-file coverage from every workspace dir
	# Truncate the placeholder written above before appending.
	> coverage-rust-report.txt
	find coverage-rust-artifacts/ -name "coverage-rust.txt" \| sort \| while read -r f; do
	# The section heading is the name of the directory containing the report.
	DIR_NAME=$(basename $(dirname "$f"))
	echo "=== ${DIR_NAME} ===" >> coverage-rust-report.txt
	cat "$f" >> coverage-rust-report.txt
	echo "" >> coverage-rust-report.txt
	done

	# Aggregate line coverage across all crates by summing total/missed lines
	# from every TOTAL row. Columns on a `cargo llvm-cov report` TOTAL line:
	# $2 regions $3 missed_regions $4 region%
	# $5 funcs $6 missed_funcs $7 func%
	# $8 lines $9 missed_lines $10 line%
	RUST_TOTAL_COVERAGE=$(find coverage-rust-artifacts/ -name "coverage-rust.txt" -exec grep -h "^TOTAL" {} + \
	\| awk '{ total += $8; missed += $9 } END { if (total > 0) printf "%.2f%%", (total - missed) * 100 / total; else print "N/A" }')
	RUST_TOTAL_COVERAGE="${RUST_TOTAL_COVERAGE:-N/A}"
	echo "Rust line coverage (aggregated across $(find coverage-rust-artifacts/ -name "coverage-rust.txt" \| wc -l \| tr -d ' ') crates): ${RUST_TOTAL_COVERAGE}"
	else
	echo "ℹ️ No Rust coverage artifacts found"
	fi

	echo "RUST_TOTAL_COVERAGE=${RUST_TOTAL_COVERAGE}" >> $GITHUB_ENV

	# Renders coverage-summary.md from the totals exported by the two previous
	# steps (TOTAL_COVERAGE, RUST_TOTAL_COVERAGE) and the two text reports.
	- name: Create Coverage Summary
	  run: |
	    # -u: the timestamp is labelled "UTC", so it must be taken in UTC rather
	    # than in whatever time zone the runner happens to use.
	    DATE=$(date -u +"%Y-%m-%d %H:%M:%S UTC")
	    # Unquoted heredoc delimiter: variables and $(...) are expanded, which is
	    # why the literal Markdown backticks below are backslash-escaped.
	    cat > coverage-summary.md << EOF
	    # 📊 Test Coverage Report

	    Date: ${DATE}
	    Run ID: ${{ github.run_id }}
	    Workflow: ${{ github.workflow }}

	    | Language | Total Coverage |
	    |----------|---------------|
	    | Python | ${TOTAL_COVERAGE} |
	    | Rust | ${RUST_TOTAL_COVERAGE} |

	    ---

	    ## Python Coverage Details

	    \`\`\`
	    $(cat coverage-report.txt 2>/dev/null || echo "No Python coverage data available")
	    \`\`\`

	    ---

	    ## Rust Coverage Details

	    \`\`\`
	    $(cat coverage-rust-report.txt 2>/dev/null || echo "No Rust coverage data available")
	    \`\`\`

	    ---

	    ## 📁 Artifacts
	    - Full HTML Report: Download \`coverage-reports-${{ github.run_id }}\` artifact
	    - Python Coverage XML: \`coverage-merged.xml\`
	    - Rust Coverage LCov: \`coverage-rust.lcov\` (per workspace dir)

	    EOF

	    echo "📄 Coverage summary generated"
	    cat coverage-summary.md

	# Appends the generated Markdown to the job summary file of this run.
	- name: Post to Workflow Summary
	run: cat coverage-summary.md >> $GITHUB_STEP_SUMMARY

	# Uploads every generated report as one artifact, even if an earlier step
	# failed (`if: always()`).
	- name: Upload Coverage Reports
	  uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
	  if: always()
	  with:
	    name: coverage-reports-${{ github.run_id }}
	    path: |
	      coverage-html/
	      coverage-merged.xml
	      coverage-report.txt
	      coverage-rust-report.txt
	      coverage-rust-artifacts/
	      coverage-summary.md
	      .coverage
	    # upload-artifact v4.4+ skips dot-files by default; without this the
	    # combined `.coverage` data file listed above is silently left out.
	    include-hidden-files: true
	    retention-days: 30

	# ============================================================================
	# CLEANUP
	# ============================================================================
	# Removes the buildx builder after all jobs that use it have finished.
	# `if: always()` makes the cleanup run even when builds failed or were cancelled.
	clean-k8s-builder:
	  name: Clean K8s builder if exists
	  runs-on: prod-default-small-v2
	  if: always()
	  needs: [vllm-build, sglang-build, trtllm-build, dynamo-pipeline, create-fresh-builder]
	  permissions:
	    contents: read
	  steps:
	    # Needed because the next step uses an action referenced by local path.
	    - name: Checkout repository
	      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	    - name: Register K8s builder context (skip bootstrap)
	      uses: ./.github/actions/bootstrap-buildkit
	      # Best effort: the removal below is still attempted if registration fails.
	      continue-on-error: true
	      with:
	        # Prefer the name create-fresh-builder actually exported; fall back to
	        # the env-derived name if that job produced no output.
	        builder_name: ${{ needs.create-fresh-builder.outputs.builder_name || env.BUILDER_NAME }}
	        buildkit_worker_addresses: ''
	        skip_bootstrap: 'true'
	    - name: Remove K8s builder
	      shell: bash
	      env:
	        # Same expression as the registration step. env.BUILDER_NAME embeds
	        # the current run attempt, so on a partial re-run it can differ from
	        # the name under which the builder was created.
	        TARGET_BUILDER: ${{ needs.create-fresh-builder.outputs.builder_name || env.BUILDER_NAME }}
	      run: |
	        # `|| true`: a builder that no longer exists is not an error.
	        docker buildx rm "${TARGET_BUILDER}" || true

	############################## SLACK NOTIFICATION ##############################
	# After the test jobs finish (whatever their result), lists the failed jobs
	# of this run and posts a failure alert to Slack. The alert is sent only
	# when at least one failed job was found.
	notify-slack:
	  name: Notify Slack
	  runs-on: prod-default-v2
	  if: always()
	  needs: [vllm-test, vllm-multi-gpu-test, sglang-test, sglang-multi-gpu-test, trtllm-test, trtllm-multi-gpu-test, rust-tests]
	  permissions:
	    contents: read
	  steps:
	    - name: Get Failed jobs
	      shell: bash
	      env:
	        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	      run: |
	        JOBS_JSON=$(mktemp)
	        # -f: fail on an HTTP error instead of handing an error body to jq.
	        curl -fsSL \
	          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
	          -H "Accept: application/vnd.github+json" \
	          "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" \
	          >"$JOBS_JSON"

	        # One line per failed job. Names of nested jobs are " / "-separated;
	        # for more than two segments only the first and last are shown. The
	        # trailing literal \n is interpreted later by the double-quoted YAML
	        # string in the Slack payload.
	        FAILED_JOBS=$(jq -r '.jobs[] | select(.conclusion == "failure") | .name | split(" / ") | if length > 2 then ":failed: " + .[0] + " > " + .[-1] else ":failed: " + .[-1] end | . + "\\n"' "$JOBS_JSON")
	        echo "$FAILED_JOBS"
	        # Multi-line value: use the delimiter syntax of GITHUB_ENV.
	        {
	          echo "FAILED_JOBS<<EOF"
	          echo "$FAILED_JOBS"
	          echo "EOF"
	        } >> "$GITHUB_ENV"
	    - name: Notify Slack
	      # Skip the "Pipeline Failure" alert when no job failed, so a green
	      # nightly neither pages anyone nor posts a section with empty text.
	      if: ${{ env.FAILED_JOBS != '' }}
	      uses: slackapi/slack-github-action@91efab103c0de0a537f72a35f6b8cda0ee76bf0a #v2.1.1
	      with:
	        webhook: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
	        webhook-type: incoming-webhook
	        payload: |
	          blocks:
	            - type: "section"
	              text:
	                type: mrkdwn
	                text: ":alert: Github Nightly Pipeline Failure"
	            - type: "section"
	              text:
	                type: mrkdwn
	                text: "<https://github.com/ai-dynamo/dynamo/actions/runs/${{ github.run_id }}|Workflow Summary>"
	            - type: "section"
	              text:
	                type: mrkdwn
	                text: "${{ env.FAILED_JOBS }}"
	            - type: "section"
	              text:
	                type: mrkdwn
	                text: "@ops-support Please investigate the failures above."

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Nightly CI Pipeline #62

Workflow file

Nightly CI Pipeline #62

Uh oh!

Workflow file for this run