Skip to content

CICD NeMo

CICD NeMo #16306

Workflow file for this run

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: CICD NeMo

# Triggers: a daily cron, pushes to main / pull-request/<n> / deploy-release/*
# branches, merge-queue check requests, and manual (or remote) dispatch.
on:
  schedule:
    - cron: "0 0 * * *"
  push:
    branches:
      - main
      - 'pull-request/[0-9]+'
      - 'deploy-release/*'
  merge_group:
    types:
      - checks_requested
  # Allow MCore to trigger this workflow remotely for compatibility testing
  workflow_dispatch:
    inputs:
      mcore_ref:
        type: string
        required: false
        description: 'MCore commit SHA to test against'
      mcore_repo:
        type: string
        required: false
        default: 'https://github.com/NVIDIA/Megatron-LM.git'
        description: 'MCore repository URL (for fetching from forks)'
      test_suite:
        type: choice
        required: false
        default: 'all'
        description: 'Test suite to run'
        options:
          - 'all'
          - 'L0'
          - 'L1'
          - 'L2'
          - 'unit-only'
          - 'functional-only'
      triggered_by:
        type: string
        required: false
        default: 'manual'
        description: 'Trigger source (for tracking)'
# Runs that share the same workflow / PR-or-ref / label / event key belong to
# one group; starting a new run cancels the one already in progress.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
# Token scopes granted to every job in this workflow.
permissions:
  # Read-only access to repository contents.
  contents: read
  # Allows jobs to request an OIDC token.
  id-token: write
env:
  # Registry used for the GCP build-matrix entry and for the images pulled by
  # the gb200 functional-test jobs.
  container-registry-gb200: "us-east4-docker.pkg.dev/nv-projdgxchipp-20260113193621/megatron-bridge"
jobs:
# Calls the shared pre-flight reusable workflow. Its outputs (docs_only,
# is_deployment_workflow, is_ci_workload, force_run_all, is_member,
# runner_prefix, registry, test_data_path) are read by the jobs below.
pre-flight:
  uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.80.1
  secrets:
    NVIDIA_MANAGEMENT_ORG_PAT: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
  with:
    # Runner prefixes
    default_runner_prefix: ${{ vars.DEFAULT_RUNNER_PREFIX }}
    non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_RUNNER_PREFIX }}
    # Test data locations
    default_test_data_path: ${{ vars.DEFAULT_TEST_DATA_PATH }}
    non_nvidia_test_data_path: ${{ vars.NON_NVIDIA_TEST_DATA_PATH }}
    # Container registries
    default_registry: ${{ vars.DEFAULT_CONTAINER_REGISTRY }}
    non_nvidia_registry: ${{ vars.NON_NVIDIA_CONTAINER_REGISTRY }}
    sso_users_filename: ${{ vars.SSO_USERS_FILENAME }}
# Derives run-wide flags from PR labels, changed files, and the trigger:
#   needs_more_tests / full_test_suite - presence of the matching PR label
#   perf_scripts_only                  - every changed file is under scripts/performance/
#   expect_l0 / expect_l1 / expect_l2  - which functional tiers should run
# It also writes a human-readable decision table to the step summary.
configure:
  runs-on: ubuntu-latest
  needs: [pre-flight]
  outputs:
    needs_more_tests: ${{ steps.configure.outputs.needs_more_tests }}
    full_test_suite: ${{ steps.configure.outputs.full_test_suite }}
    expect_l0: ${{ steps.configure.outputs.expect_l0 }}
    expect_l1: ${{ steps.configure.outputs.expect_l1 }}
    expect_l2: ${{ steps.configure.outputs.expect_l2 }}
    perf_scripts_only: ${{ steps.configure.outputs.perf_scripts_only }}
  steps:
    - name: Get PR info
      id: get-pr-info
      if: startsWith(github.ref, 'refs/heads/pull-request/')
      uses: nv-gha-runners/get-pr-info@main
    - name: Configure
      id: configure
      shell: bash -x -e -u -o pipefail {0}
      env:
        GH_TOKEN: ${{ github.token }}
        DOCS_ONLY: ${{ needs.pre-flight.outputs.docs_only }}
        IS_DEPLOYMENT: ${{ needs.pre-flight.outputs.is_deployment_workflow }}
        EVENT_NAME: ${{ github.event_name }}
        REF: ${{ github.ref }}
        TEST_SUITE: ${{ github.event.inputs.test_suite }}
        # Passed through the environment instead of being spliced into the
        # script text; empty when the run is not for a pull-request branch.
        PR_NUMBER: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
        REPO: ${{ github.repository }}
      run: |
        # Fetch labels; fall back to empty list if not a PR or if the lookup fails
        LABELS='[]'
        if [[ -n "$PR_NUMBER" ]]; then
          LABELS=$(gh pr view "$PR_NUMBER" --repo "$REPO" --json labels --jq '[.labels[].name]' 2>/dev/null) || LABELS='[]'
        fi
        NEEDS_MORE_TESTS=$(echo "$LABELS" | jq 'any(. == "needs-more-tests")')
        FULL_TEST_SUITE=$(echo "$LABELS" | jq 'any(. == "full-test-suite")')
        # Detect if every changed file lives under scripts/performance/
        PERF_SCRIPTS_ONLY=false
        if [[ -n "$PR_NUMBER" ]]; then
          CHANGED_FILES=$(gh pr diff "$PR_NUMBER" --repo "$REPO" --name-only 2>/dev/null) || CHANGED_FILES=""
          if [[ -n "$CHANGED_FILES" ]]; then
            NON_PERF=$(echo "$CHANGED_FILES" | grep -v '^scripts/performance/' || true)
            [[ -z "$NON_PERF" ]] && PERF_SCRIPTS_ONLY=true
          fi
        fi
        # Tests are expected on every run except docs-only, deployment, and perf-scripts-only
        if [[ "$DOCS_ONLY" == "true" || "$IS_DEPLOYMENT" == "true" || "$PERF_SCRIPTS_ONLY" == "true" ]]; then
          RUN_TESTS=false
        else
          RUN_TESTS=true
        fi
        # L0/L1/L2 functional tests are expected when test_suite is '' (default), 'all', or 'functional-only'
        EXPECT_FUNCTIONAL=false
        [[ "$TEST_SUITE" == "" || "$TEST_SUITE" == "all" || "$TEST_SUITE" == "functional-only" ]] && EXPECT_FUNCTIONAL=true
        # EXPECT_L0: any non-docs/deployment run that includes functional tests
        EXPECT_L0=false
        [[ "$RUN_TESTS" == "true" && "$EXPECT_FUNCTIONAL" == "true" ]] && EXPECT_L0=true
        # EXPECT_L1: L0 conditions + event/label gate.
        # The full-test-suite label is included because the L1 jobs in this
        # workflow also run when that label is present.
        EXPECT_L1=false
        if [[ "$EXPECT_L0" == "true" ]]; then
          if [[ "$REF" == "refs/heads/main" || "$EVENT_NAME" == "schedule" || "$EVENT_NAME" == "workflow_dispatch" || "$EVENT_NAME" == "merge_group" || "$NEEDS_MORE_TESTS" == "true" || "$FULL_TEST_SUITE" == "true" ]]; then
            EXPECT_L1=true
          fi
        fi
        # EXPECT_L2: schedule, workflow_dispatch, or full-test-suite label
        EXPECT_L2=false
        if [[ "$EXPECT_L0" == "true" && ("$EVENT_NAME" == "schedule" || "$EVENT_NAME" == "workflow_dispatch" || "$FULL_TEST_SUITE" == "true") ]]; then
          EXPECT_L2=true
        fi
        echo "needs_more_tests=$NEEDS_MORE_TESTS" | tee -a "$GITHUB_OUTPUT"
        echo "full_test_suite=$FULL_TEST_SUITE" | tee -a "$GITHUB_OUTPUT"
        echo "expect_l0=$EXPECT_L0" | tee -a "$GITHUB_OUTPUT"
        echo "expect_l1=$EXPECT_L1" | tee -a "$GITHUB_OUTPUT"
        echo "expect_l2=$EXPECT_L2" | tee -a "$GITHUB_OUTPUT"
        echo "perf_scripts_only=$PERF_SCRIPTS_ONLY" | tee -a "$GITHUB_OUTPUT"
        # Active row markers for step summary decision tree
        _L0=$( [[ "$EXPECT_L0" == "true" ]] && echo "**→**" || echo "" )
        _L1=$( [[ "$EXPECT_L1" == "true" ]] && echo "**→**" || echo "" )
        _L2=$( [[ "$EXPECT_L2" == "true" ]] && echo "**→**" || echo "" )
        _SKIP_DOCS=$( [[ "$DOCS_ONLY" == "true" ]] && echo "**→**" || echo "" )
        _SKIP_DEPLOY=$([[ "$IS_DEPLOYMENT" == "true" ]] && echo "**→**" || echo "" )
        _SKIP_PERF=$( [[ "$PERF_SCRIPTS_ONLY" == "true" ]] && echo "**→**" || echo "" )
        _MG=$( [[ "$EVENT_NAME" == "merge_group" ]] && echo "**→**" || echo "" )
        _MAIN=$( [[ "$REF" == "refs/heads/main" ]] && echo "**→**" || echo "" )
        _SCHED=$( [[ "$EVENT_NAME" == "schedule" ]] && echo "**→**" || echo "" )
        _WD=$( [[ "$EVENT_NAME" == "workflow_dispatch" ]] && echo "**→**" || echo "" )
        _NMT=$( [[ "$NEEDS_MORE_TESTS" == "true" ]] && echo "**→**" || echo "" )
        _FTS=$( [[ "$FULL_TEST_SUITE" == "true" ]] && echo "**→**" || echo "" )
        cat <<SUMMARY >> "$GITHUB_STEP_SUMMARY"
        ## CI Configuration
        **Event:** \`$EVENT_NAME\` | **Ref:** \`$REF\` | **Test suite:** \`${TEST_SUITE:-all}\`
        | Setting | Value |
        |---|---|
        | \`docs_only\` | \`$DOCS_ONLY\` |
        | \`is_deployment_workflow\` | \`$IS_DEPLOYMENT\` |
        | \`perf_scripts_only\` | \`$PERF_SCRIPTS_ONLY\` |
        | \`needs_more_tests\` | \`$NEEDS_MORE_TESTS\` |
        | \`full_test_suite\` | \`$FULL_TEST_SUITE\` |
        ### Expected test tiers
        | | Tier | Condition |
        |---|---|---|
        | $_L0 | **L0** | any non-docs/deployment/perf-scripts run |
        | $_L1 | **L1** | \`main\` / \`schedule\` / \`workflow_dispatch\` / \`merge_group\` / label _needs-more-tests_ / label _full-test-suite_ |
        | $_L2 | **L2** | \`schedule\` / \`workflow_dispatch\` / label _full-test-suite_ |
        ### Decision tree
        **Why tests may be skipped**
        | | Reason |
        |---|---|
        | $_SKIP_DOCS | Docs-only change (no src files modified) |
        | $_SKIP_DEPLOY | Deployment workflow (\`deploy-release/*\` branch) |
        | $_SKIP_PERF | Perf-scripts-only change (all changes under \`scripts/performance/\`) |
        **L1/L2 active trigger**
        | | Trigger |
        |---|---|
        | $_MAIN | Push to \`main\` |
        | $_SCHED | \`schedule\` |
        | $_WD | \`workflow_dispatch\` |
        | $_MG | \`merge_group\` |
        | $_NMT | Label: _needs-more-tests_ |
        | $_FTS | Label: _full-test-suite_ |
        SUMMARY
# Runs the repository's pre-commit hooks over all files. Skipped for
# deployment workflows unless the run was started via workflow_dispatch.
lint-check:
  name: Lint check
  runs-on: ubuntu-latest
  needs: [pre-flight]
  if: |
    needs.pre-flight.outputs.is_deployment_workflow == 'false'
    || github.event_name == 'workflow_dispatch'
  steps:
    - name: Checkout repository
      uses: actions/checkout@v6
      with:
        submodules: "recursive"
    - name: Update MCore submodule (if triggered from MCore)
      if: ${{ github.event.inputs.mcore_ref != '' }}
      env:
        # Dispatch inputs are handed over as environment variables so their
        # contents are never parsed as shell source.
        MCORE_REF: ${{ github.event.inputs.mcore_ref }}
        MCORE_REPO: ${{ github.event.inputs.mcore_repo }}
        TRIGGERED_BY: ${{ github.event.inputs.triggered_by }}
      run: |
        echo "🔄 Updating MCore submodule to commit: ${MCORE_REF}"
        echo "📍 MCore repo: ${MCORE_REPO:-https://github.com/NVIDIA/Megatron-LM.git}"
        echo "🎯 Triggered by: ${TRIGGERED_BY}"
        cd 3rdparty/Megatron-LM
        # Fetch from the given repository, or from the submodule's origin when none is given.
        git fetch "${MCORE_REPO:-origin}" "${MCORE_REF}"
        git checkout "${MCORE_REF}"
    - name: Set environment for MCore testing
      if: ${{ github.event.inputs.mcore_ref != '' }}
      run: |
        echo "MCORE_TRIGGERED_TESTING=true" | tee -a "$GITHUB_ENV"
        echo "⚙️ MCore testing mode: skipping --locked flag because lockfile was generated with different MCore version"
    - name: Check lint
      run: |
        pip install pre-commit==3.6.0
        pre-commit install
        pre-commit run --all-files --show-diff-on-failure --color=always
# Gate job bound to the `test` environment. It is skipped for CI workloads,
# deployment workflows, docs-only and perf-scripts-only changes, and for
# merge_group events.
# NOTE(review): presumably the `test` environment carries protection rules
# that hold the run until approved - confirm in the repository settings.
cicd-wait-in-queue:
  runs-on: ubuntu-latest
  environment: test
  needs:
    - pre-flight
    - lint-check
    - configure
  if: |
    !(
      needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true'
      || needs.pre-flight.outputs.docs_only == 'true'
      || needs.configure.outputs.perf_scripts_only == 'true'
    )
    && github.event_name != 'merge_group'
  steps:
    - name: Running CI tests
      run: echo "Running CI tests"
# Emits the container-build matrix as compact JSON: always one AWS entry
# (pre-flight registry and runner), plus one GCP entry when pre-flight
# reports is_member == 'true'.
cicd-compute-build-matrix:
  runs-on: ubuntu-latest
  needs:
    - pre-flight
    - configure
    - cicd-wait-in-queue
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || github.event_name == 'merge_group'
    )
    && !cancelled()
    && needs.configure.outputs.perf_scripts_only != 'true'
  outputs:
    matrix: ${{ steps.compute.outputs.matrix }}
  steps:
    - name: Compute build matrix
      id: compute
      env:
        IS_MEMBER: ${{ needs.pre-flight.outputs.is_member }}
        RUNNER_PREFIX: ${{ needs.pre-flight.outputs.runner_prefix }}
        REGISTRY_AWS: ${{ needs.pre-flight.outputs.registry }}
        REGISTRY_GCP: ${{ env.container-registry-gb200 }}
      run: |
        # Start with the AWS entry, which is always present.
        entries=$(jq -nc --arg registry "$REGISTRY_AWS" --arg runner "${RUNNER_PREFIX}" \
          '[{"cloud": "aws", "registry": $registry, "runner": $runner}]')
        # Members additionally get the GCP entry appended.
        if [[ "$IS_MEMBER" == "true" ]]; then
          entries=$(jq -c --arg registry "$REGISTRY_GCP" --arg runner "nemo-ci-gcp-gpu-x2" \
            '. + [{"cloud": "gcp", "registry": $registry, "runner": $runner}]' <<<"$entries")
        fi
        matrix_json=$(jq -c '{"include": .}' <<<"$entries")
        echo "matrix=$matrix_json" | tee -a "$GITHUB_OUTPUT"
# Builds and pushes the CI image once per build-matrix entry (cloud /
# registry / runner). The image is tagged with the cache key and with
# github.sha; downstream test jobs pull the github.sha tag.
cicd-container-build:
  needs: [pre-flight, configure, cicd-wait-in-queue, cicd-compute-build-matrix]
  strategy:
    fail-fast: false
    matrix: ${{ fromJson(needs.cicd-compute-build-matrix.outputs.matrix) }}
  runs-on: ${{ matrix.runner }}
  environment: ${{ contains(matrix.registry, 'azure') && 'nemo-ci' || '' }}
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || github.event_name == 'merge_group'
    )
    && !cancelled()
    && needs.configure.outputs.perf_scripts_only != 'true'
  steps:
    - name: Get PR info
      id: get-pr-info
      if: startsWith(github.ref, 'refs/heads/pull-request/')
      uses: nv-gha-runners/get-pr-info@main
    - name: Get merge commit sha
      shell: bash -x -e -u -o pipefail {0}
      id: sha
      env:
        IS_PR: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }}
      run: |
        # PR branches build the PR's merge commit; everything else builds the pushed commit.
        if [[ "$IS_PR" == "true" ]]; then
          SHA=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').merge_commit_sha }}
        else
          SHA=${GITHUB_SHA}
        fi
        echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT"
    - name: Checkout
      uses: actions/checkout@v6
      with:
        submodules: recursive
        ref: ${{ steps.sha.outputs.main }}
    - name: Update MCore submodule (if triggered from MCore)
      if: ${{ github.event.inputs.mcore_ref != '' }}
      env:
        # Dispatch inputs are handed over as environment variables so their
        # contents are never parsed as shell source.
        MCORE_REF: ${{ github.event.inputs.mcore_ref }}
        MCORE_REPO: ${{ github.event.inputs.mcore_repo }}
        TRIGGERED_BY: ${{ github.event.inputs.triggered_by }}
      run: |
        echo "🔄 Updating MCore submodule to commit: ${MCORE_REF}"
        echo "📍 MCore repo: ${MCORE_REPO:-https://github.com/NVIDIA/Megatron-LM.git}"
        echo "🎯 Triggered by: ${TRIGGERED_BY}"
        cd 3rdparty/Megatron-LM
        # Fetch from the given repository, or from the submodule's origin when none is given.
        git fetch "${MCORE_REPO:-origin}" "${MCORE_REF}"
        git checkout "${MCORE_REF}"
        # Verify the checkout was successful
        ACTUAL_COMMIT=$(git rev-parse HEAD)
        EXPECTED_COMMIT="${MCORE_REF}"
        echo "✅ MCore submodule updated successfully"
        echo "Expected: ${EXPECTED_COMMIT}"
        echo "Actual: ${ACTUAL_COMMIT}"
        if [ "${ACTUAL_COMMIT}" != "${EXPECTED_COMMIT}" ]; then
          echo "❌ ERROR: MCore commit mismatch!"
          exit 1
        fi
        git log -1 --pretty=format:"📝 Commit: %H%n👤 Author: %an%n📅 Date: %ad%n💬 Message: %s" --date=short
        cd ../..
        # Store for Docker build arg
        echo "MCORE_COMMIT_SHA=${EXPECTED_COMMIT}" | tee -a "$GITHUB_ENV"
    - name: Set environment for MCore testing
      if: ${{ github.event.inputs.mcore_ref != '' }}
      run: |
        echo "MCORE_TRIGGERED_TESTING=true" | tee -a "$GITHUB_ENV"
        echo "⚙️ MCore testing mode: skipping --locked flag because lockfile was generated with different MCore version"
    - name: Setup python
      uses: actions/setup-python@v6
      with:
        # Quoted so the version is a string rather than a YAML float.
        python-version: "3.12"
    - name: Install GH CLI
      shell: bash
      run: |
        # Up to three install attempts, 10s apart.
        # NOTE(review): if every attempt fails the loop still ends with
        # status 0, so this step does not fail on its own.
        for i in 1 2 3; do
          apt-get update && apt-get install -y gh && break
          echo "Attempt $i failed, retrying in 10s..."
          sleep 10
        done
    - name: Get last merged PR
      id: cache_from
      env:
        GH_TOKEN: ${{ github.token }}
      run: |
        # One cache-from line per recently merged PR (up to 100, most recently updated first).
        LAST_PRS=$(gh api graphql -f query='
          query {
            repository(owner: "NVIDIA-NeMo", name: "Megatron-Bridge") {
              pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
                nodes {
                  number
                }
              }
            }
          }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
            echo "type=registry,ref=${{ matrix.registry }}/megatron-bridge:$number-buildcache,mode=max"
          done)
        # Multi-line output uses the heredoc-style delimiter syntax.
        echo "LAST_PRS<<EOF" | tee -a "$GITHUB_OUTPUT"
        echo "$LAST_PRS" | tee -a "$GITHUB_OUTPUT"
        echo "EOF" | tee -a "$GITHUB_OUTPUT"
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
    - name: Compute cache keys
      id: cache_keys
      shell: bash
      env:
        PR_NUMBER: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
      run: |
        # Key precedence: "main" on the main branch, else the PR number, else
        # the sanitized branch name. The ref is read from the runner-provided
        # GITHUB_REF / GITHUB_REF_NAME variables instead of being spliced
        # into the script text.
        BRANCH_SANITIZED=$(echo "${GITHUB_REF_NAME}" | tr '/' '-' | tr -cd '[:alnum:]._-')
        if [[ "${GITHUB_REF}" == "refs/heads/main" ]]; then
          KEY="main"
        elif [[ -n "$PR_NUMBER" ]]; then
          KEY="$PR_NUMBER"
        else
          KEY="$BRANCH_SANITIZED"
        fi
        echo "key=$KEY" | tee -a "$GITHUB_OUTPUT"
        echo "cache-to=type=registry,ref=${{ matrix.registry }}/megatron-bridge:${KEY}-buildcache,mode=max" | tee -a "$GITHUB_OUTPUT"
    - name: Compute platform
      id: platform
      run: |
        # The gcp matrix entry builds for arm64; every other entry builds for amd64.
        if [[ "${{ matrix.cloud }}" == "gcp" ]]; then
          echo "platforms=linux/arm64" | tee -a "$GITHUB_OUTPUT"
        else
          echo "platforms=linux/amd64" | tee -a "$GITHUB_OUTPUT"
        fi
    - name: Build and push
      uses: docker/build-push-action@v5
      with:
        file: ./docker/Dockerfile.ci
        push: true
        context: .
        platforms: ${{ steps.platform.outputs.platforms }}
        build-args: |
          FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:25.09-py3
          MCORE_TRIGGERED_TESTING=${{ env.MCORE_TRIGGERED_TESTING || 'false' }}
          MCORE_COMMIT_SHA=${{ env.MCORE_COMMIT_SHA || 'unknown' }}
        cache-from: |
          type=registry,ref=${{ matrix.registry }}/megatron-bridge:${{ steps.cache_keys.outputs.key }}-buildcache,mode=max
          type=registry,ref=${{ matrix.registry }}/megatron-bridge:main-buildcache,mode=max
          ${{ steps.cache_from.outputs.LAST_PRS }}
        cache-to: ${{ steps.cache_keys.outputs.cache-to }}
        no-cache: false
        tags: |
          ${{ matrix.registry }}/megatron-bridge:${{ steps.cache_keys.outputs.key }}
          ${{ matrix.registry }}/megatron-bridge:${{ github.sha }}
        secrets: |
          GH_TOKEN=${{ secrets.PAT }}
# Runs docker/common/import_check.py inside the freshly built image
# (github.sha tag), with the skip list mounted read-only from the checkout.
cicd-import-check:
  name: Launch_Import_Check
  needs:
    - pre-flight
    - configure
    - cicd-wait-in-queue
    - cicd-container-build
  runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || github.event_name == 'merge_group'
    )
    && !cancelled()
    && needs.configure.outputs.perf_scripts_only != 'true'
  steps:
    - name: Checkout
      uses: actions/checkout@v6
    - name: Run venv import check
      shell: bash -e -u -o pipefail {0}
      env:
        IMAGE: ${{ needs.pre-flight.outputs.registry }}/megatron-bridge:${{ github.sha }}
      run: |
        docker run --rm \
          -v "${{ github.workspace }}/docker/common:/opt/import-check:ro" \
          "$IMAGE" \
          python /opt/import-check/import_check.py \
          --jobs 16 \
          --skip-file /opt/import-check/import_check_skip.txt
# Core unit tests, run through the local test-template action against the
# image tagged with github.sha. Selected when test_suite is unset, 'all',
# 'unit-only', or one of L0/L1/L2.
cicd-unit-tests-core:
  name: Launch_Unit_Tests_Core
  needs:
    - pre-flight
    - cicd-wait-in-queue
    - cicd-container-build
  runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || github.event_name == 'merge_group'
    )
    && !cancelled()
    && (
      github.event.inputs.test_suite == ''
      || github.event.inputs.test_suite == 'all'
      || github.event.inputs.test_suite == 'unit-only'
      || contains('L0 L1 L2', github.event.inputs.test_suite)
    )
  env:
    HF_HUB_OFFLINE: '1'
    TRANSFORMERS_OFFLINE: '1'
  steps:
    - name: Checkout
      uses: actions/checkout@v6
      with:
        submodules: recursive
    - name: main
      uses: ./.github/actions/test-template
      with:
        script: Launch_Unit_Tests_Core
        is_unit_test: 'true'
        timeout: 18
        runner: ${{ needs.pre-flight.outputs.runner_prefix }}
        container-image: ${{ needs.pre-flight.outputs.registry }}/megatron-bridge:${{ github.sha }}
        test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
        PAT: ${{ secrets.PAT }}
# Diffusion unit tests, run through the local test-template action against
# the image tagged with github.sha. Selected when test_suite is unset, 'all',
# 'unit-only', or one of L0/L1/L2.
cicd-unit-tests-diffusion:
  name: Launch_Unit_Tests_Diffusion
  needs:
    - pre-flight
    - cicd-wait-in-queue
    - cicd-container-build
  runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || github.event_name == 'merge_group'
    )
    && !cancelled()
    && (
      github.event.inputs.test_suite == ''
      || github.event.inputs.test_suite == 'all'
      || github.event.inputs.test_suite == 'unit-only'
      || contains('L0 L1 L2', github.event.inputs.test_suite)
    )
  env:
    HF_HUB_OFFLINE: '1'
    TRANSFORMERS_OFFLINE: '1'
  steps:
    - name: Checkout
      uses: actions/checkout@v6
      with:
        submodules: recursive
    - name: main
      uses: ./.github/actions/test-template
      with:
        script: Launch_Unit_Tests_Diffusion
        is_unit_test: 'true'
        timeout: 18
        runner: ${{ needs.pre-flight.outputs.runner_prefix }}
        container-image: ${{ needs.pre-flight.outputs.registry }}/megatron-bridge:${{ github.sha }}
        test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
        PAT: ${{ secrets.PAT }}
# Scans the H100 launch-script directories and publishes one JSON matrix per
# tier (L0/L1/L2) plus one for the flaky scripts. Each entry carries the
# script name (file name without .sh), its timeout, and its runner label.
generate-test-matrix:
  needs: [pre-flight, cicd-container-build]
  runs-on: ubuntu-latest
  outputs:
    matrix_l0: ${{ steps.scan.outputs.matrix_l0 }}
    matrix_l1: ${{ steps.scan.outputs.matrix_l1 }}
    matrix_l2: ${{ steps.scan.outputs.matrix_l2 }}
    matrix_flaky: ${{ steps.scan.outputs.matrix_flaky }}
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || github.event_name == 'merge_group'
    )
    && !cancelled()
  steps:
    - uses: actions/checkout@v6
    - id: scan
      shell: bash
      env:
        RUNNER_PREFIX: ${{ needs.pre-flight.outputs.runner_prefix }}
      run: |
        # Timeout from the script's first "# CI_TIMEOUT=<n>" line; 30 when absent.
        get_timeout() {
          local f="$1"
          local t
          t=$(grep -m1 '^# CI_TIMEOUT=' "$f" | cut -d= -f2)
          echo "${t:-30}"
        }
        # Runner label: the "gpu-x<N>" part of RUNNER_PREFIX is replaced with
        # "gpu-<GPU_COUNT>" when the script has a "# GPU_COUNT=" line;
        # otherwise RUNNER_PREFIX is used unchanged.
        get_runner() {
          local f="$1"
          local gpu_count
          gpu_count=$(grep -m1 '^# GPU_COUNT=' "$f" | cut -d= -f2)
          if [ -n "$gpu_count" ]; then
            echo "${RUNNER_PREFIX}" | sed "s/gpu-x[0-9]*/gpu-${gpu_count}/"
          else
            echo "${RUNNER_PREFIX}"
          fi
        }
        for tier in L0 L1 L2; do
          entries=""
          for f in tests/functional_tests/launch_scripts/h100/active/${tier}_*.sh; do
            # An unmatched glob is left as the literal pattern; skip it rather
            # than emit an entry for a script that does not exist (same guard
            # as the flaky loop below).
            [ -f "$f" ] || continue
            name=$(basename "$f" .sh)
            timeout=$(get_timeout "$f")
            runner=$(get_runner "$f")
            entries="${entries}{\"script\":\"${name}\",\"timeout\":${timeout},\"runner\":\"${runner}\"},"
          done
          # ${entries%,} drops the trailing comma so the array is valid JSON.
          matrix="{\"include\":[${entries%,}]}"
          echo "matrix_${tier,,}=${matrix}" | tee -a "$GITHUB_OUTPUT"
        done
        entries=""
        for f in tests/functional_tests/launch_scripts/h100/flaky/L*.sh; do
          [ -f "$f" ] || continue
          name=$(basename "$f" .sh)
          timeout=$(get_timeout "$f")
          runner=$(get_runner "$f")
          entries="${entries}{\"script\":\"${name}\",\"timeout\":${timeout},\"runner\":\"${runner}\"},"
        done
        echo "matrix_flaky={\"include\":[${entries%,}]}" | tee -a "$GITHUB_OUTPUT"
# L0 functional tests: one job per entry of matrix_l0. There is no event or
# label gate; the tier is selected when test_suite is unset, 'all',
# 'functional-only', or one of L0/L1/L2.
cicd-functional-tests-l0:
  name: ${{ matrix.script }}
  needs:
    - pre-flight
    - generate-test-matrix
    - cicd-unit-tests-core
    - cicd-unit-tests-diffusion
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    max-parallel: 16
    matrix: ${{ fromJSON(needs.generate-test-matrix.outputs.matrix_l0) }}
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || github.event_name == 'merge_group'
    )
    && !cancelled()
    && (
      github.event.inputs.test_suite == ''
      || github.event.inputs.test_suite == 'all'
      || github.event.inputs.test_suite == 'functional-only'
      || contains('L0 L1 L2', github.event.inputs.test_suite)
    )
  env:
    HF_HOME: /home/TestData/HF_HOME
    HF_HUB_OFFLINE: '1'
    TRANSFORMERS_OFFLINE: '1'
  steps:
    - name: Checkout
      uses: actions/checkout@v6
      with:
        submodules: recursive
    - name: main
      uses: ./.github/actions/test-template
      with:
        script: ${{ matrix.script }}
        is_unit_test: 'false'
        # Falls back to 30 when the matrix entry carries no timeout.
        timeout: ${{ fromJSON(matrix.timeout || '30') }}
        runner: ${{ matrix.runner }}
        container-image: ${{ needs.pre-flight.outputs.registry }}/megatron-bridge:${{ github.sha }}
        test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
        PAT: ${{ secrets.PAT }}
# L1 functional tests: one job per entry of matrix_l1. Runs on pushes to
# main, schedule, workflow_dispatch, merge_group, and on PRs labeled
# "needs-more-tests" or "full-test-suite"; the tier is selected when
# test_suite is unset, 'all', 'functional-only', 'L1', or 'L2'.
cicd-functional-tests-l1:
  name: ${{ matrix.script }}
  needs:
    - pre-flight
    - configure
    - generate-test-matrix
    - cicd-unit-tests-core
    - cicd-unit-tests-diffusion
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    max-parallel: 16
    matrix: ${{ fromJSON(needs.generate-test-matrix.outputs.matrix_l1) }}
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || github.event_name == 'merge_group'
    )
    && !cancelled()
    && (
      github.ref == 'refs/heads/main'
      || github.event_name == 'schedule'
      || github.event_name == 'workflow_dispatch'
      || github.event_name == 'merge_group'
      || needs.configure.outputs.needs_more_tests == 'true'
      || needs.configure.outputs.full_test_suite == 'true'
    )
    && (
      github.event.inputs.test_suite == ''
      || github.event.inputs.test_suite == 'all'
      || github.event.inputs.test_suite == 'functional-only'
      || contains('L1 L2', github.event.inputs.test_suite)
    )
  env:
    HF_HOME: /home/TestData/HF_HOME
    HF_HUB_OFFLINE: '1'
    TRANSFORMERS_OFFLINE: '1'
  steps:
    - name: Checkout
      uses: actions/checkout@v6
      with:
        submodules: recursive
    - name: main
      uses: ./.github/actions/test-template
      with:
        script: ${{ matrix.script }}
        is_unit_test: 'false'
        # Falls back to 30 when the matrix entry carries no timeout.
        timeout: ${{ fromJSON(matrix.timeout || '30') }}
        runner: ${{ matrix.runner }}
        container-image: ${{ needs.pre-flight.outputs.registry }}/megatron-bridge:${{ github.sha }}
        test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
        PAT: ${{ secrets.PAT }}
# L2 functional tests: one job per entry of matrix_l2. Runs on schedule,
# workflow_dispatch, and PRs labeled `full-test-suite`; the tier is selected
# when test_suite is unset, 'all', 'functional-only', or 'L2'.
cicd-functional-tests-l2:
  name: ${{ matrix.script }}
  needs:
    - pre-flight
    - configure
    - generate-test-matrix
    - cicd-unit-tests-core
    - cicd-unit-tests-diffusion
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    max-parallel: 16
    matrix: ${{ fromJSON(needs.generate-test-matrix.outputs.matrix_l2) }}
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || github.event_name == 'merge_group'
    )
    && !cancelled()
    && (
      github.event_name == 'schedule'
      || github.event_name == 'workflow_dispatch'
      || needs.configure.outputs.full_test_suite == 'true'
    )
    && (
      github.event.inputs.test_suite == ''
      || github.event.inputs.test_suite == 'all'
      || github.event.inputs.test_suite == 'functional-only'
      || contains('L2', github.event.inputs.test_suite)
    )
  env:
    HF_HOME: /home/TestData/HF_HOME
    HF_HUB_OFFLINE: '1'
    TRANSFORMERS_OFFLINE: '1'
  steps:
    - name: Checkout
      uses: actions/checkout@v6
      with:
        submodules: recursive
    - name: main
      uses: ./.github/actions/test-template
      with:
        script: ${{ matrix.script }}
        is_unit_test: 'false'
        # Falls back to 30 when the matrix entry carries no timeout.
        timeout: ${{ fromJSON(matrix.timeout || '30') }}
        runner: ${{ matrix.runner }}
        container-image: ${{ needs.pre-flight.outputs.registry }}/megatron-bridge:${{ github.sha }}
        test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
        PAT: ${{ secrets.PAT }}
# Flaky H100 scripts (h100/flaky): run only for a workflow_dispatch whose
# test_suite input is 'all'.
cicd-functional-tests-flaky:
  name: ${{ matrix.script }}
  needs:
    - pre-flight
    - generate-test-matrix
    - cicd-unit-tests-core
    - cicd-unit-tests-diffusion
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    max-parallel: 16
    matrix: ${{ fromJSON(needs.generate-test-matrix.outputs.matrix_flaky) }}
  if: github.event_name == 'workflow_dispatch' && github.event.inputs.test_suite == 'all'
  env:
    HF_HOME: /home/TestData/HF_HOME
    HF_HUB_OFFLINE: '1'
    TRANSFORMERS_OFFLINE: '1'
  steps:
    - name: Checkout
      uses: actions/checkout@v6
      with:
        submodules: recursive
    - name: main
      uses: ./.github/actions/test-template
      with:
        script_dir: h100/flaky
        script: ${{ matrix.script }}
        is_unit_test: 'false'
        # Falls back to 30 when the matrix entry carries no timeout.
        timeout: ${{ fromJSON(matrix.timeout || '30') }}
        runner: ${{ matrix.runner }}
        container-image: ${{ needs.pre-flight.outputs.registry }}/megatron-bridge:${{ github.sha }}
        test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
        PAT: ${{ secrets.PAT }}
# GB200 counterpart of the test-matrix scan: walks gb200/active and
# gb200/flaky and publishes one JSON matrix per tier plus one for flaky
# scripts. Only runs when pre-flight reports is_member == 'true'.
generate-gb200-test-matrix:
  runs-on: ubuntu-latest
  needs:
    - pre-flight
    - cicd-container-build
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || github.event_name == 'merge_group'
    )
    && !cancelled()
    && needs.pre-flight.outputs.is_member == 'true'
  outputs:
    matrix_gb200_l0: ${{ steps.scan.outputs.matrix_gb200_l0 }}
    matrix_gb200_l1: ${{ steps.scan.outputs.matrix_gb200_l1 }}
    matrix_gb200_l2: ${{ steps.scan.outputs.matrix_gb200_l2 }}
    matrix_gb200_flaky: ${{ steps.scan.outputs.matrix_gb200_flaky }}
  steps:
    - uses: actions/checkout@v6
    - id: scan
      shell: bash
      run: |
        # Timeout from the script's first "# CI_TIMEOUT=<n>" line; 30 when absent.
        get_timeout() {
          local script_path="$1"
          local value
          value=$(grep -m1 '^# CI_TIMEOUT=' "$script_path" | cut -d= -f2)
          echo "${value:-30}"
        }
        # Runner label built from the script's "# GPU_COUNT=" line;
        # nemo-ci-gcp-gpu-x2 when the line is absent.
        get_runner() {
          local script_path="$1"
          local gpus
          gpus=$(grep -m1 '^# GPU_COUNT=' "$script_path" | cut -d= -f2)
          if [ -z "$gpus" ]; then
            echo "nemo-ci-gcp-gpu-x2"
          else
            echo "nemo-ci-gcp-gpu-${gpus}"
          fi
        }
        for tier in L0 L1 L2; do
          items=""
          for script_path in tests/functional_tests/launch_scripts/gb200/active/${tier}_*.sh; do
            # Skip the literal pattern left behind by an unmatched glob.
            if [ ! -f "$script_path" ]; then
              continue
            fi
            script_name=$(basename "$script_path" .sh)
            script_timeout=$(get_timeout "$script_path")
            script_runner=$(get_runner "$script_path")
            items="${items}{\"script\":\"${script_name}\",\"timeout\":${script_timeout},\"runner\":\"${script_runner}\"},"
          done
          # ${items%,} drops the trailing comma so the array is valid JSON.
          echo "matrix_gb200_${tier,,}={\"include\":[${items%,}]}" | tee -a "$GITHUB_OUTPUT"
        done
        items=""
        for script_path in tests/functional_tests/launch_scripts/gb200/flaky/L*.sh; do
          if [ ! -f "$script_path" ]; then
            continue
          fi
          script_name=$(basename "$script_path" .sh)
          script_timeout=$(get_timeout "$script_path")
          script_runner=$(get_runner "$script_path")
          items="${items}{\"script\":\"${script_name}\",\"timeout\":${script_timeout},\"runner\":\"${script_runner}\"},"
        done
        echo "matrix_gb200_flaky={\"include\":[${items%,}]}" | tee -a "$GITHUB_OUTPUT"
# GB200-L0: same trigger conditions as the H100 L0 job plus the is_member
# requirement; scripts come from gb200/active and the image from the GB200
# registry.
cicd-functional-tests-gb200-l0:
  name: gb200_${{ matrix.script }}
  needs:
    - pre-flight
    - generate-gb200-test-matrix
    - cicd-unit-tests-core
    - cicd-unit-tests-diffusion
  runs-on: ${{ matrix.runner }}
  environment: ${{ contains(needs.pre-flight.outputs.registry, 'azure') && 'nemo-ci' || '' }}
  strategy:
    fail-fast: false
    max-parallel: 16
    matrix: ${{ fromJSON(needs.generate-gb200-test-matrix.outputs.matrix_gb200_l0) }}
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || github.event_name == 'merge_group'
    )
    && !cancelled()
    && needs.pre-flight.outputs.is_member == 'true'
    && (
      github.event.inputs.test_suite == ''
      || github.event.inputs.test_suite == 'all'
      || github.event.inputs.test_suite == 'functional-only'
      || contains('L0 L1 L2', github.event.inputs.test_suite)
    )
  env:
    HF_HOME: /home/TestData/HF_HOME
    HF_HUB_OFFLINE: '1'
    TRANSFORMERS_OFFLINE: '1'
  steps:
    - name: Checkout
      uses: actions/checkout@v6
      with:
        submodules: recursive
    - name: main
      uses: ./.github/actions/test-template
      with:
        script_dir: gb200/active
        script: ${{ matrix.script }}
        is_unit_test: 'false'
        # Falls back to 30 when the matrix entry carries no timeout.
        timeout: ${{ fromJSON(matrix.timeout || '30') }}
        runner: ${{ matrix.runner }}
        container-image: ${{ env.container-registry-gb200 }}/megatron-bridge:${{ github.sha }}
        test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
        PAT: ${{ secrets.PAT }}
        # Azure credentials are flagged as present only for an azure registry.
        has-azure-credentials: ${{ contains(needs.pre-flight.outputs.registry, 'azure') }}
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
# GB200-L1: same trigger conditions as the H100 L1 job plus the is_member
# requirement; scripts come from gb200/active and the image from the GB200
# registry.
cicd-functional-tests-gb200-l1:
  name: gb200_${{ matrix.script }}
  needs:
    - pre-flight
    - configure
    - generate-gb200-test-matrix
    - cicd-unit-tests-core
    - cicd-unit-tests-diffusion
  runs-on: ${{ matrix.runner }}
  environment: ${{ contains(needs.pre-flight.outputs.registry, 'azure') && 'nemo-ci' || '' }}
  strategy:
    fail-fast: false
    max-parallel: 16
    matrix: ${{ fromJSON(needs.generate-gb200-test-matrix.outputs.matrix_gb200_l1) }}
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || github.event_name == 'merge_group'
    )
    && !cancelled()
    && needs.pre-flight.outputs.is_member == 'true'
    && (
      github.ref == 'refs/heads/main'
      || github.event_name == 'schedule'
      || github.event_name == 'workflow_dispatch'
      || github.event_name == 'merge_group'
      || needs.configure.outputs.needs_more_tests == 'true'
      || needs.configure.outputs.full_test_suite == 'true'
    )
    && (
      github.event.inputs.test_suite == ''
      || github.event.inputs.test_suite == 'all'
      || github.event.inputs.test_suite == 'functional-only'
      || contains('L1 L2', github.event.inputs.test_suite)
    )
  env:
    HF_HOME: /home/TestData/HF_HOME
    HF_HUB_OFFLINE: '1'
    TRANSFORMERS_OFFLINE: '1'
  steps:
    - name: Checkout
      uses: actions/checkout@v6
      with:
        submodules: recursive
    - name: main
      uses: ./.github/actions/test-template
      with:
        script_dir: gb200/active
        script: ${{ matrix.script }}
        is_unit_test: 'false'
        # Falls back to 30 when the matrix entry carries no timeout.
        timeout: ${{ fromJSON(matrix.timeout || '30') }}
        runner: ${{ matrix.runner }}
        container-image: ${{ env.container-registry-gb200 }}/megatron-bridge:${{ github.sha }}
        test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
        PAT: ${{ secrets.PAT }}
        # Azure credentials are flagged as present only for an azure registry.
        has-azure-credentials: ${{ contains(needs.pre-flight.outputs.registry, 'azure') }}
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
# GB200-L2: same trigger conditions as the H100 L2 job plus the is_member
# requirement; scripts come from gb200/active and the image from the GB200
# registry.
cicd-functional-tests-gb200-l2:
  name: gb200_${{ matrix.script }}
  needs:
    - pre-flight
    - configure
    - generate-gb200-test-matrix
    - cicd-unit-tests-core
    - cicd-unit-tests-diffusion
  runs-on: ${{ matrix.runner }}
  environment: ${{ contains(needs.pre-flight.outputs.registry, 'azure') && 'nemo-ci' || '' }}
  strategy:
    fail-fast: false
    max-parallel: 16
    matrix: ${{ fromJSON(needs.generate-gb200-test-matrix.outputs.matrix_gb200_l2) }}
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
      || github.event_name == 'merge_group'
    )
    && !cancelled()
    && needs.pre-flight.outputs.is_member == 'true'
    && (
      github.event_name == 'schedule'
      || github.event_name == 'workflow_dispatch'
      || needs.configure.outputs.full_test_suite == 'true'
    )
    && (
      github.event.inputs.test_suite == ''
      || github.event.inputs.test_suite == 'all'
      || github.event.inputs.test_suite == 'functional-only'
      || contains('L2', github.event.inputs.test_suite)
    )
  env:
    HF_HOME: /home/TestData/HF_HOME
    HF_HUB_OFFLINE: '1'
    TRANSFORMERS_OFFLINE: '1'
  steps:
    - name: Checkout
      uses: actions/checkout@v6
      with:
        submodules: recursive
    - name: main
      uses: ./.github/actions/test-template
      with:
        script_dir: gb200/active
        script: ${{ matrix.script }}
        is_unit_test: 'false'
        # Falls back to 30 when the matrix entry carries no timeout.
        timeout: ${{ fromJSON(matrix.timeout || '30') }}
        runner: ${{ matrix.runner }}
        container-image: ${{ env.container-registry-gb200 }}/megatron-bridge:${{ github.sha }}
        test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
        PAT: ${{ secrets.PAT }}
        # Azure credentials are flagged as present only for an azure registry.
        has-azure-credentials: ${{ contains(needs.pre-flight.outputs.registry, 'azure') }}
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
cicd-functional-tests-gb200-flaky:
  # Runs the scripts under gb200/flaky. Only triggered by a manual
  # workflow_dispatch with test_suite == 'all' when pre-flight reports
  # is_member == 'true'.
  name: gb200_${{ matrix.script }}
  needs:
    - pre-flight
    - generate-gb200-test-matrix
    - cicd-unit-tests-core
    - cicd-unit-tests-diffusion
  if: github.event_name == 'workflow_dispatch' && github.event.inputs.test_suite == 'all' && needs.pre-flight.outputs.is_member == 'true'
  runs-on: ${{ matrix.runner }}
  # The 'nemo-ci' environment is only attached when the registry is Azure-based.
  environment: ${{ contains(needs.pre-flight.outputs.registry, 'azure') && 'nemo-ci' || '' }}
  strategy:
    fail-fast: false
    max-parallel: 16
    matrix: ${{ fromJSON(needs.generate-gb200-test-matrix.outputs.matrix_gb200_flaky) }}
  env:
    # Hugging Face libraries are pointed at a local cache and set to offline mode.
    HF_HOME: /home/TestData/HF_HOME
    TRANSFORMERS_OFFLINE: '1'
    HF_HUB_OFFLINE: '1'
  steps:
    - name: Checkout
      uses: actions/checkout@v6
      with:
        submodules: recursive
    - name: main
      uses: ./.github/actions/test-template
      with:
        # What to run
        script_dir: gb200/flaky
        script: ${{ matrix.script }}
        is_unit_test: 'false'
        # Per-script timeout from the matrix; 30 when the entry has none.
        timeout: ${{ fromJSON(matrix.timeout || '30') }}
        # Where to run it
        runner: ${{ matrix.runner }}
        container-image: ${{ env.container-registry-gb200 }}/megatron-bridge:${{ github.sha }}
        test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
        # Credentials
        has-azure-credentials: ${{ contains(needs.pre-flight.outputs.registry, 'azure') }}
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        PAT: ${{ secrets.PAT }}
Nemo_CICD_Test:
  # Aggregating gate for the workflow: fails when any job in this run finished
  # with a conclusion other than success/skipped, or when a test tier that
  # `configure` expected to run was skipped instead.
  needs:
    - pre-flight
    - configure
    - cicd-import-check
    - cicd-unit-tests-core
    - cicd-unit-tests-diffusion
    - cicd-functional-tests-l0
    - cicd-functional-tests-l1
    - cicd-functional-tests-l2
    - cicd-functional-tests-gb200-l0
    - cicd-functional-tests-gb200-l1
    - cicd-functional-tests-gb200-l2
  # Must run even when upstream jobs failed or were skipped, otherwise the gate
  # itself would be skipped and report nothing.
  if: always() && !cancelled()
  runs-on: ubuntu-latest
  # NOTE(review): write-all is broader than the steps visible here appear to
  # need (a checkout plus a read-only `gh run view`). Consider narrowing it
  # after confirming nothing else relies on the wider token.
  permissions: write-all
  steps:
    # Presumably needed so `gh` can resolve the repository from the working
    # tree; verify before removing.
    - name: Checkout
      uses: actions/checkout@v6
    - name: Get workflow result
      id: result
      shell: bash -e -u -o pipefail {0}
      env:
        GH_TOKEN: ${{ github.token }}
        EXPECT_L0: ${{ needs.configure.outputs.expect_l0 }}
        EXPECT_L1: ${{ needs.configure.outputs.expect_l1 }}
        EXPECT_L2: ${{ needs.configure.outputs.expect_l2 }}
        IS_MEMBER: ${{ needs.pre-flight.outputs.is_member }}
      run: |
        # Take a single snapshot of this run's jobs and filter it locally with
        # jq. `gh --jq` only accepts an expression (it has no `--arg`), so
        # parameterised filters have to go through standalone jq.
        # If the API call fails, fall back to an empty job list so the gate
        # stays fail-open, but say so in the log.
        if ! JOBS_JSON=$(gh run view "$GITHUB_RUN_ID" --json jobs); then
          echo "::warning::Could not fetch the job list for run $GITHUB_RUN_ID; assuming no failed or skipped jobs"
          JOBS_JSON='{"jobs":[]}'
        fi

        # A job counts as failed when it completed with anything other than
        # success or skipped.
        FAILED_FILTER='.jobs[] | select(.status == "completed" and .conclusion != "success" and .conclusion != "skipped")'
        FAILED_JOBS=$(jq "[${FAILED_FILTER}] | length" <<<"$JOBS_JSON")
        UNEXPECTED_SKIPS=0

        # check_tier_skips <tier> <expect>
        # When <expect> is "true", counts jobs whose name starts with "<tier>_"
        # and whose conclusion is "skipped", lists them, and adds the count to
        # UNEXPECTED_SKIPS. Does nothing for any other <expect> value.
        check_tier_skips() {
          local tier="$1" expect="$2"
          if [ "$expect" != "true" ]; then
            return 0
          fi
          local n
          n=$(jq --arg p "${tier}_" \
            '[.jobs[] | select(.name | startswith($p)) | select(.conclusion == "skipped")] | length' \
            <<<"$JOBS_JSON")
          if [ "${n:-0}" -gt 0 ]; then
            echo "❌ Found $n unexpectedly skipped ${tier} job(s):"
            jq -r --arg p "${tier}_" \
              '.jobs[] | select(.name | startswith($p)) | select(.conclusion == "skipped") | .name' \
              <<<"$JOBS_JSON"
            UNEXPECTED_SKIPS=$((UNEXPECTED_SKIPS + n))
          fi
        }

        check_tier_skips "L0" "$EXPECT_L0"
        check_tier_skips "L1" "$EXPECT_L1"
        check_tier_skips "L2" "$EXPECT_L2"
        # The GB200 tiers are only checked when pre-flight reports
        # is_member == 'true'.
        if [[ "$IS_MEMBER" == "true" ]]; then
          check_tier_skips "gb200_L0" "$EXPECT_L0"
          check_tier_skips "gb200_L1" "$EXPECT_L1"
          check_tier_skips "gb200_L2" "$EXPECT_L2"
        fi

        if [ "${FAILED_JOBS:-0}" -gt 0 ]; then
          echo "❌ Found $FAILED_JOBS failed job(s):"
          jq -r "${FAILED_FILTER} | .name" <<<"$JOBS_JSON"
        fi
        if [ "${FAILED_JOBS:-0}" -gt 0 ] || [ "${UNEXPECTED_SKIPS:-0}" -gt 0 ]; then
          exit 1
        fi
        echo "✅ All previous jobs completed successfully"
        exit 0
Coverage_Fake:
  # Publishes a passing `codecov/patch` commit status for runs that are not a
  # CI workload and are docs-only, a deployment workflow, or perf-scripts-only.
  needs:
    - Nemo_CICD_Test
    - pre-flight
    - configure
  if: |
    always()
    && !cancelled()
    && needs.pre-flight.outputs.is_ci_workload == 'false'
    && (
      needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true'
      || needs.configure.outputs.perf_scripts_only == 'true'
    )
  runs-on: ubuntu-latest
  steps:
    - name: Generate fake coverage report
      uses: actions/github-script@v8
      with:
        github-token: ${{ secrets.PAT }}
        script: |
          // Mark the current commit's codecov/patch status as successful.
          const { owner, repo } = context.repo;
          await github.rest.repos.createCommitStatus({
            owner,
            repo,
            sha: context.sha,
            state: 'success',
            description: 'No code changes - coverage check skipped',
            context: 'codecov/patch'
          });
Coverage:
  # For each coverage flag (unit-test, e2e): downloads the matching
  # coverage-<flag>-* artifacts, combines them into one .coverage file, uploads
  # the result to Codecov, and re-publishes the combined file as an artifact.
  runs-on: ubuntu-latest
  needs: [Nemo_CICD_Test, pre-flight, configure]
  # Only runs when the gate job succeeded, the change is not docs-only, not a
  # deployment workflow and not perf-scripts-only, and no mcore_ref dispatch
  # input was supplied.
  if: |
    needs.Nemo_CICD_Test.result == 'success'
    && needs.pre-flight.outputs.docs_only == 'false'
    && needs.pre-flight.outputs.is_deployment_workflow == 'false'
    && needs.configure.outputs.perf_scripts_only == 'false'
    && github.event.inputs.mcore_ref == ''
    && !cancelled()
  strategy:
    matrix:
      flag:
        - unit-test
        - e2e
  steps:
    # Only refs under refs/heads/pull-request/ trigger this lookup; its output
    # feeds base_sha in the Codecov upload below.
    - name: Get PR info
      id: get-pr-info
      if: startsWith(github.ref, 'refs/heads/pull-request/')
      uses: nv-gha-runners/get-pr-info@main
    - name: Checkout
      uses: actions/checkout@v6
    - name: Download coverage reports of current branch
      uses: actions/download-artifact@v7
      with:
        pattern: coverage-${{ matrix.flag }}-*
    - name: Get total coverage of current branch
      shell: bash -x -e -u -o pipefail {0}
      if: always()
      run: |
        # Quoted so the shell cannot treat the [toml] extra as a glob pattern.
        pip install "coverage[toml]"
        ls -al .
        ls -al coverage-*/
        # Let the shell expand the data files directly instead of parsing the
        # output of `ls`.
        coverage combine --keep coverage-*/.coverage
        coverage report -i
        # Remove the downloaded per-job directories; the combined .coverage
        # file in the working directory is what gets uploaded.
        rm -rf coverage-*
        ls -al
    - name: Upload coverage reports to Codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        verbose: true
        flags: ${{ matrix.flag }}
        # Falls back to an empty object when the PR-info step was skipped.
        base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').base.sha }}
    - name: Upload artifacts
      uses: actions/upload-artifact@v6
      with:
        name: coverage-${{ matrix.flag }}-aggregated
        path: |
          .coverage
        # .coverage is a dotfile, so hidden files must be included explicitly.
        include-hidden-files: true