CICD NeMo #16306
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| name: CICD NeMo | |
| on: | |
| schedule: | |
| - cron: 0 0 * * * | |
| push: | |
| branches: | |
| - main | |
| - "pull-request/[0-9]+" | |
| - "deploy-release/*" | |
| merge_group: | |
| types: [checks_requested] | |
| # Allow MCore to trigger this workflow remotely for compatibility testing | |
| workflow_dispatch: | |
| inputs: | |
| mcore_ref: | |
| description: "MCore commit SHA to test against" | |
| required: false | |
| type: string | |
| mcore_repo: | |
| description: "MCore repository URL (for fetching from forks)" | |
| required: false | |
| type: string | |
| default: "https://github.com/NVIDIA/Megatron-LM.git" | |
| test_suite: | |
| description: "Test suite to run" | |
| required: false | |
| type: choice | |
| options: | |
| - "all" | |
| - "L0" | |
| - "L1" | |
| - "L2" | |
| - "unit-only" | |
| - "functional-only" | |
| default: "all" | |
| triggered_by: | |
| description: "Trigger source (for tracking)" | |
| required: false | |
| type: string | |
| default: "manual" | |
| concurrency: | |
| group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }} | |
| cancel-in-progress: true | |
| permissions: | |
| id-token: write | |
| contents: read | |
| env: | |
| container-registry-gb200: us-east4-docker.pkg.dev/nv-projdgxchipp-20260113193621/megatron-bridge | |
| jobs: | |
| pre-flight: | |
| uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.80.1 | |
| with: | |
| default_runner_prefix: ${{ vars.DEFAULT_RUNNER_PREFIX }} | |
| non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_RUNNER_PREFIX }} | |
| default_test_data_path: ${{ vars.DEFAULT_TEST_DATA_PATH }} | |
| non_nvidia_test_data_path: ${{ vars.NON_NVIDIA_TEST_DATA_PATH }} | |
| default_registry: ${{ vars.DEFAULT_CONTAINER_REGISTRY }} | |
| non_nvidia_registry: ${{ vars.NON_NVIDIA_CONTAINER_REGISTRY }} | |
| sso_users_filename: ${{ vars.SSO_USERS_FILENAME }} | |
| secrets: | |
| NVIDIA_MANAGEMENT_ORG_PAT: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }} | |
| configure: | |
| runs-on: ubuntu-latest | |
| needs: [pre-flight] | |
| outputs: | |
| needs_more_tests: ${{ steps.configure.outputs.needs_more_tests }} | |
| full_test_suite: ${{ steps.configure.outputs.full_test_suite }} | |
| expect_l0: ${{ steps.configure.outputs.expect_l0 }} | |
| expect_l1: ${{ steps.configure.outputs.expect_l1 }} | |
| expect_l2: ${{ steps.configure.outputs.expect_l2 }} | |
| perf_scripts_only: ${{ steps.configure.outputs.perf_scripts_only }} | |
| steps: | |
| - name: Get PR info | |
| id: get-pr-info | |
| if: startsWith(github.ref, 'refs/heads/pull-request/') | |
| uses: nv-gha-runners/get-pr-info@main | |
| - name: Configure | |
| id: configure | |
| shell: bash -x -e -u -o pipefail {0} | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| DOCS_ONLY: ${{ needs.pre-flight.outputs.docs_only }} | |
| IS_DEPLOYMENT: ${{ needs.pre-flight.outputs.is_deployment_workflow }} | |
| EVENT_NAME: ${{ github.event_name }} | |
| REF: ${{ github.ref }} | |
| TEST_SUITE: ${{ github.event.inputs.test_suite }} | |
| run: | | |
| PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} | |
| # Fetch labels; fall back to empty list if not a PR | |
| LABELS=$(gh pr view $PR_NUMBER --repo ${{ github.repository }} --json labels --jq '[.labels[].name]' 2>/dev/null) || LABELS='[]' | |
| NEEDS_MORE_TESTS=$(echo "$LABELS" | jq 'any(. == "needs-more-tests")') | |
| FULL_TEST_SUITE=$(echo "$LABELS" | jq 'any(. == "full-test-suite")') | |
| # Detect if every changed file lives under scripts/performance/ | |
| PERF_SCRIPTS_ONLY=false | |
| if [[ -n "$PR_NUMBER" ]]; then | |
| CHANGED_FILES=$(gh pr diff "$PR_NUMBER" --repo ${{ github.repository }} --name-only 2>/dev/null) || CHANGED_FILES="" | |
| if [[ -n "$CHANGED_FILES" ]]; then | |
| NON_PERF=$(echo "$CHANGED_FILES" | grep -v '^scripts/performance/' || true) | |
| [[ -z "$NON_PERF" ]] && PERF_SCRIPTS_ONLY=true | |
| fi | |
| fi | |
| # Tests are expected on every run except docs-only, deployment, and perf-scripts-only | |
| if [[ "$DOCS_ONLY" == "true" || "$IS_DEPLOYMENT" == "true" || "$PERF_SCRIPTS_ONLY" == "true" ]]; then | |
| RUN_TESTS=false | |
| else | |
| RUN_TESTS=true | |
| fi | |
| # L0/L1/L2 functional tests are expected when test_suite is '' (default), 'all', or 'functional-only' | |
| EXPECT_FUNCTIONAL=false | |
| [[ "$TEST_SUITE" == "" || "$TEST_SUITE" == "all" || "$TEST_SUITE" == "functional-only" ]] && EXPECT_FUNCTIONAL=true | |
| # EXPECT_L0: any non-docs/deployment run that includes functional tests | |
| EXPECT_L0=false | |
| [[ "$RUN_TESTS" == "true" && "$EXPECT_FUNCTIONAL" == "true" ]] && EXPECT_L0=true | |
| # EXPECT_L1: L0 conditions + event/label gate | |
| EXPECT_L1=false | |
| if [[ "$EXPECT_L0" == "true" ]]; then | |
| if [[ "$REF" == "refs/heads/main" || "$EVENT_NAME" == "schedule" || "$EVENT_NAME" == "workflow_dispatch" || "$EVENT_NAME" == "merge_group" || "$NEEDS_MORE_TESTS" == "true" ]]; then | |
| EXPECT_L1=true | |
| fi | |
| fi | |
| # EXPECT_L2: schedule, workflow_dispatch, or full-test-suite label | |
| EXPECT_L2=false | |
| if [[ "$EXPECT_L0" == "true" && ("$EVENT_NAME" == "schedule" || "$EVENT_NAME" == "workflow_dispatch" || "$FULL_TEST_SUITE" == "true") ]]; then | |
| EXPECT_L2=true | |
| fi | |
| echo "needs_more_tests=$NEEDS_MORE_TESTS" | tee -a "$GITHUB_OUTPUT" | |
| echo "full_test_suite=$FULL_TEST_SUITE" | tee -a "$GITHUB_OUTPUT" | |
| echo "expect_l0=$EXPECT_L0" | tee -a "$GITHUB_OUTPUT" | |
| echo "expect_l1=$EXPECT_L1" | tee -a "$GITHUB_OUTPUT" | |
| echo "expect_l2=$EXPECT_L2" | tee -a "$GITHUB_OUTPUT" | |
| echo "perf_scripts_only=$PERF_SCRIPTS_ONLY" | tee -a "$GITHUB_OUTPUT" | |
| # Active row markers for step summary decision tree | |
| _L0=$( [[ "$EXPECT_L0" == "true" ]] && echo "**→**" || echo "" ) | |
| _L1=$( [[ "$EXPECT_L1" == "true" ]] && echo "**→**" || echo "" ) | |
| _L2=$( [[ "$EXPECT_L2" == "true" ]] && echo "**→**" || echo "" ) | |
| _SKIP_DOCS=$( [[ "$DOCS_ONLY" == "true" ]] && echo "**→**" || echo "" ) | |
| _SKIP_DEPLOY=$([[ "$IS_DEPLOYMENT" == "true" ]] && echo "**→**" || echo "" ) | |
| _SKIP_PERF=$( [[ "$PERF_SCRIPTS_ONLY" == "true" ]] && echo "**→**" || echo "" ) | |
| _MG=$( [[ "$EVENT_NAME" == "merge_group" ]] && echo "**→**" || echo "" ) | |
| _MAIN=$( [[ "$REF" == "refs/heads/main" ]] && echo "**→**" || echo "" ) | |
| _SCHED=$( [[ "$EVENT_NAME" == "schedule" ]] && echo "**→**" || echo "" ) | |
| _WD=$( [[ "$EVENT_NAME" == "workflow_dispatch" ]] && echo "**→**" || echo "" ) | |
| _NMT=$( [[ "$NEEDS_MORE_TESTS" == "true" ]] && echo "**→**" || echo "" ) | |
| _FTS=$( [[ "$FULL_TEST_SUITE" == "true" ]] && echo "**→**" || echo "" ) | |
| cat <<SUMMARY >> "$GITHUB_STEP_SUMMARY" | |
| ## CI Configuration | |
| **Event:** \`$EVENT_NAME\` | **Ref:** \`$REF\` | **Test suite:** \`${TEST_SUITE:-all}\` | |
| | Setting | Value | | |
| |---|---| | |
| | \`docs_only\` | \`$DOCS_ONLY\` | | |
| | \`is_deployment_workflow\` | \`$IS_DEPLOYMENT\` | | |
| | \`perf_scripts_only\` | \`$PERF_SCRIPTS_ONLY\` | | |
| | \`needs_more_tests\` | \`$NEEDS_MORE_TESTS\` | | |
| | \`full_test_suite\` | \`$FULL_TEST_SUITE\` | | |
| ### Expected test tiers | |
| | | Tier | Condition | | |
| |---|---|---| | |
| | $_L0 | **L0** | any non-docs/deployment/perf-scripts run | | |
| | $_L1 | **L1** | \`main\` / \`schedule\` / \`workflow_dispatch\` / \`merge_group\` / label _needs-more-tests_ | | |
| | $_L2 | **L2** | \`schedule\` / \`workflow_dispatch\` / label _full-test-suite_ | | |
| ### Decision tree | |
| **Why tests may be skipped** | |
| | | Reason | | |
| |---|---| | |
| | $_SKIP_DOCS | Docs-only change (no src files modified) | | |
| | $_SKIP_DEPLOY | Deployment workflow (\`deploy-release/*\` branch) | | |
| | $_SKIP_PERF | Perf-scripts-only change (all changes under \`scripts/performance/\`) | | |
| **L1/L2 active trigger** | |
| | | Trigger | | |
| |---|---| | |
| | $_MAIN | Push to \`main\` | | |
| | $_SCHED | \`schedule\` | | |
| | $_WD | \`workflow_dispatch\` | | |
| | $_MG | \`merge_group\` | | |
| | $_NMT | Label: _needs-more-tests_ | | |
| | $_FTS | Label: _full-test-suite_ | | |
| SUMMARY | |
| lint-check: | |
| name: Lint check | |
| runs-on: ubuntu-latest | |
| needs: [pre-flight] | |
| if: | | |
| needs.pre-flight.outputs.is_deployment_workflow == 'false' | |
| || github.event_name == 'workflow_dispatch' | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v6 | |
| with: | |
| submodules: "recursive" | |
| - name: Update MCore submodule (if triggered from MCore) | |
| if: ${{ github.event.inputs.mcore_ref != '' }} | |
| run: | | |
| echo "🔄 Updating MCore submodule to commit: ${{ github.event.inputs.mcore_ref }}" | |
| echo "📍 MCore repo: ${{ github.event.inputs.mcore_repo || 'https://github.com/NVIDIA/Megatron-LM.git' }}" | |
| echo "🎯 Triggered by: ${{ github.event.inputs.triggered_by }}" | |
| cd 3rdparty/Megatron-LM | |
| git fetch ${{ github.event.inputs.mcore_repo || 'origin' }} ${{ github.event.inputs.mcore_ref }} | |
| git checkout ${{ github.event.inputs.mcore_ref }} | |
| - name: Set environment for MCore testing | |
| if: ${{ github.event.inputs.mcore_ref != '' }} | |
| run: | | |
| echo "MCORE_TRIGGERED_TESTING=true" | tee -a "$GITHUB_ENV" | |
| echo "⚙️ MCore testing mode: skipping --locked flag because lockfile was generated with different MCore version" | |
| - name: Check lint | |
| run: | | |
| pip install pre-commit==3.6.0 | |
| pre-commit install | |
| pre-commit run --all-files --show-diff-on-failure --color=always | |
| cicd-wait-in-queue: | |
| needs: [pre-flight, lint-check, configure] | |
| runs-on: ubuntu-latest | |
| environment: test | |
| if: | | |
| !(needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.is_deployment_workflow == 'true' | |
| || needs.pre-flight.outputs.docs_only == 'true' | |
| || needs.configure.outputs.perf_scripts_only == 'true') | |
| && github.event_name != 'merge_group' | |
| steps: | |
| - name: Running CI tests | |
| run: | | |
| echo "Running CI tests" | |
| cicd-compute-build-matrix: | |
| needs: [pre-flight, configure, cicd-wait-in-queue] | |
| runs-on: ubuntu-latest | |
| if: | | |
| ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || github.event_name == 'merge_group' | |
| ) | |
| && !cancelled() | |
| && needs.configure.outputs.perf_scripts_only != 'true' | |
| outputs: | |
| matrix: ${{ steps.compute.outputs.matrix }} | |
| steps: | |
| - name: Compute build matrix | |
| id: compute | |
| env: | |
| RUNNER_PREFIX: ${{ needs.pre-flight.outputs.runner_prefix }} | |
| REGISTRY_AWS: ${{ needs.pre-flight.outputs.registry }} | |
| REGISTRY_GCP: ${{ env.container-registry-gb200 }} | |
| IS_MEMBER: ${{ needs.pre-flight.outputs.is_member }} | |
| run: | | |
| AWS_ENTRY=$(jq -nc --arg registry "$REGISTRY_AWS" --arg runner "${RUNNER_PREFIX}" \ | |
| '{"cloud": "aws", "registry": $registry, "runner": $runner}') | |
| if [[ "$IS_MEMBER" == "true" ]]; then | |
| GCP_ENTRY=$(jq -nc --arg registry "$REGISTRY_GCP" --arg runner "nemo-ci-gcp-gpu-x2" \ | |
| '{"cloud": "gcp", "registry": $registry, "runner": $runner}') | |
| MATRIX=$(jq -nc --argjson aws "$AWS_ENTRY" --argjson gcp "$GCP_ENTRY" \ | |
| '{"include": [$aws, $gcp]}') | |
| else | |
| MATRIX=$(jq -nc --argjson aws "$AWS_ENTRY" '{"include": [$aws]}') | |
| fi | |
| echo "matrix=$MATRIX" | tee -a "$GITHUB_OUTPUT" | |
| cicd-container-build: | |
| needs: [pre-flight, configure, cicd-wait-in-queue, cicd-compute-build-matrix] | |
| strategy: | |
| fail-fast: false | |
| matrix: ${{ fromJson(needs.cicd-compute-build-matrix.outputs.matrix) }} | |
| runs-on: ${{ matrix.runner }} | |
| environment: ${{ contains(matrix.registry, 'azure') && 'nemo-ci' || '' }} | |
| if: | | |
| ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || github.event_name == 'merge_group' | |
| ) | |
| && !cancelled() | |
| && needs.configure.outputs.perf_scripts_only != 'true' | |
| steps: | |
| - name: Get PR info | |
| id: get-pr-info | |
| if: startsWith(github.ref, 'refs/heads/pull-request/') | |
| uses: nv-gha-runners/get-pr-info@main | |
| - name: Get merge commit sha | |
| shell: bash -x -e -u -o pipefail {0} | |
| id: sha | |
| env: | |
| IS_PR: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }} | |
| run: | | |
| if [[ "$IS_PR" == "true" ]]; then | |
| SHA=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').merge_commit_sha }} | |
| else | |
| SHA=${GITHUB_SHA} | |
| fi | |
| echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT" | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| with: | |
| submodules: recursive | |
| ref: ${{ steps.sha.outputs.main }} | |
| - name: Update MCore submodule (if triggered from MCore) | |
| if: ${{ github.event.inputs.mcore_ref != '' }} | |
| run: | | |
| echo "🔄 Updating MCore submodule to commit: ${{ github.event.inputs.mcore_ref }}" | |
| echo "📌 MCore branch: ${{ github.event.inputs.mcore_branch || 'unknown' }}" | |
| echo "📍 MCore repo: ${{ github.event.inputs.mcore_repo || 'https://github.com/NVIDIA/Megatron-LM.git' }}" | |
| echo "🎯 Triggered by: ${{ github.event.inputs.triggered_by }}" | |
| cd 3rdparty/Megatron-LM | |
| git fetch ${{ github.event.inputs.mcore_repo || 'origin' }} ${{ github.event.inputs.mcore_ref }} | |
| git checkout ${{ github.event.inputs.mcore_ref }} | |
| # Verify the checkout was successful | |
| ACTUAL_COMMIT=$(git rev-parse HEAD) | |
| EXPECTED_COMMIT="${{ github.event.inputs.mcore_ref }}" | |
| echo "✅ MCore submodule updated successfully" | |
| echo "Expected: ${EXPECTED_COMMIT}" | |
| echo "Actual: ${ACTUAL_COMMIT}" | |
| if [ "${ACTUAL_COMMIT}" != "${EXPECTED_COMMIT}" ]; then | |
| echo "❌ ERROR: MCore commit mismatch!" | |
| exit 1 | |
| fi | |
| git log -1 --pretty=format:"📝 Commit: %H%n👤 Author: %an%n📅 Date: %ad%n💬 Message: %s" --date=short | |
| cd ../.. | |
| # Store for Docker build arg | |
| echo "MCORE_COMMIT_SHA=${EXPECTED_COMMIT}" | tee -a "$GITHUB_ENV" | |
| - name: Set environment for MCore testing | |
| if: ${{ github.event.inputs.mcore_ref != '' }} | |
| run: | | |
| echo "MCORE_TRIGGERED_TESTING=true" | tee -a "$GITHUB_ENV" | |
| echo "⚙️ MCore testing mode: skipping --locked flag because lockfile was generated with different MCore version" | |
| - name: Setup python | |
| uses: actions/setup-python@v6 | |
| with: | |
| python-version: 3.12 | |
| - name: Install GH CLI | |
| shell: bash | |
| run: | | |
| for i in 1 2 3; do | |
| apt-get update && apt-get install -y gh && break | |
| echo "Attempt $i failed, retrying in 10s..." | |
| sleep 10 | |
| done | |
| - name: Get last merged PR | |
| id: cache_from | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| run: | | |
| LAST_PRS=$(gh api graphql -f query=' | |
| query { | |
| repository(owner: "NVIDIA-NeMo", name: "Megatron-Bridge") { | |
| pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) { | |
| nodes { | |
| number | |
| } | |
| } | |
| } | |
| }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do | |
| echo "type=registry,ref=${{ matrix.registry }}/megatron-bridge:$number-buildcache,mode=max" | |
| done) | |
| echo "LAST_PRS<<EOF" | tee -a $GITHUB_OUTPUT | |
| echo "$LAST_PRS" | tee -a $GITHUB_OUTPUT | |
| echo "EOF" | tee -a $GITHUB_OUTPUT | |
| - name: Set up Docker Buildx | |
| uses: docker/setup-buildx-action@v3 | |
| - name: Compute cache keys | |
| id: cache_keys | |
| shell: bash | |
| run: | | |
| PR_NUMBER="${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}" | |
| BRANCH_SANITIZED=$(echo "${{ github.ref_name }}" | tr '/' '-' | tr -cd '[:alnum:]._-') | |
| if [[ "${{ github.ref }}" == "refs/heads/main" ]]; then | |
| KEY="main" | |
| elif [[ -n "$PR_NUMBER" ]]; then | |
| KEY="$PR_NUMBER" | |
| else | |
| KEY="$BRANCH_SANITIZED" | |
| fi | |
| echo "key=$KEY" | tee -a "$GITHUB_OUTPUT" | |
| echo "cache-to=type=registry,ref=${{ matrix.registry }}/megatron-bridge:${KEY}-buildcache,mode=max" | tee -a "$GITHUB_OUTPUT" | |
| - name: Compute platform | |
| id: platform | |
| run: | | |
| if [[ "${{ matrix.cloud }}" == "gcp" ]]; then | |
| echo "platforms=linux/arm64" | tee -a "$GITHUB_OUTPUT" | |
| else | |
| echo "platforms=linux/amd64" | tee -a "$GITHUB_OUTPUT" | |
| fi | |
| - name: Build and push | |
| uses: docker/build-push-action@v5 | |
| with: | |
| file: ./docker/Dockerfile.ci | |
| push: true | |
| context: . | |
| platforms: ${{ steps.platform.outputs.platforms }} | |
| build-args: | | |
| FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:25.09-py3 | |
| MCORE_TRIGGERED_TESTING=${{ env.MCORE_TRIGGERED_TESTING || 'false' }} | |
| MCORE_COMMIT_SHA=${{ env.MCORE_COMMIT_SHA || 'unknown' }} | |
| cache-from: | | |
| type=registry,ref=${{ matrix.registry }}/megatron-bridge:${{ steps.cache_keys.outputs.key }}-buildcache,mode=max | |
| type=registry,ref=${{ matrix.registry }}/megatron-bridge:main-buildcache,mode=max | |
| ${{ steps.cache_from.outputs.LAST_PRS }} | |
| cache-to: ${{ steps.cache_keys.outputs.cache-to }} | |
| no-cache: false | |
| tags: | | |
| ${{ matrix.registry }}/megatron-bridge:${{ steps.cache_keys.outputs.key }} | |
| ${{ matrix.registry }}/megatron-bridge:${{ github.sha }} | |
| secrets: | | |
| GH_TOKEN=${{ secrets.PAT }} | |
| cicd-import-check: | |
| if: | | |
| ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || github.event_name == 'merge_group' | |
| ) | |
| && !cancelled() | |
| && needs.configure.outputs.perf_scripts_only != 'true' | |
| needs: [pre-flight, configure, cicd-wait-in-queue, cicd-container-build] | |
| runs-on: ${{ needs.pre-flight.outputs.runner_prefix }} | |
| name: Launch_Import_Check | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| - name: Run venv import check | |
| shell: bash -e -u -o pipefail {0} | |
| env: | |
| IMAGE: ${{ needs.pre-flight.outputs.registry }}/megatron-bridge:${{ github.sha }} | |
| run: | | |
| docker run --rm \ | |
| -v "${{ github.workspace }}/docker/common:/opt/import-check:ro" \ | |
| "$IMAGE" \ | |
| python /opt/import-check/import_check.py \ | |
| --jobs 16 \ | |
| --skip-file /opt/import-check/import_check_skip.txt | |
| cicd-unit-tests-core: | |
| if: | | |
| ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || github.event_name == 'merge_group' | |
| ) | |
| && !cancelled() | |
| && (github.event.inputs.test_suite == '' || github.event.inputs.test_suite == 'all' || github.event.inputs.test_suite == 'unit-only' || contains('L0 L1 L2', github.event.inputs.test_suite)) | |
| needs: [pre-flight, cicd-wait-in-queue, cicd-container-build] | |
| runs-on: ${{ needs.pre-flight.outputs.runner_prefix }} | |
| name: Launch_Unit_Tests_Core | |
| env: | |
| TRANSFORMERS_OFFLINE: "1" | |
| HF_HUB_OFFLINE: "1" | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| with: | |
| submodules: recursive | |
| - name: main | |
| uses: ./.github/actions/test-template | |
| with: | |
| script: Launch_Unit_Tests_Core | |
| timeout: 18 | |
| is_unit_test: "true" | |
| PAT: ${{ secrets.PAT }} | |
| container-image: ${{ needs.pre-flight.outputs.registry }}/megatron-bridge:${{ github.sha }} | |
| test-data-path: ${{ needs.pre-flight.outputs.test_data_path }} | |
| runner: ${{ needs.pre-flight.outputs.runner_prefix }} | |
| cicd-unit-tests-diffusion: | |
| if: | | |
| ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || github.event_name == 'merge_group' | |
| ) | |
| && !cancelled() | |
| && (github.event.inputs.test_suite == '' || github.event.inputs.test_suite == 'all' || github.event.inputs.test_suite == 'unit-only' || contains('L0 L1 L2', github.event.inputs.test_suite)) | |
| needs: [pre-flight, cicd-wait-in-queue, cicd-container-build] | |
| runs-on: ${{ needs.pre-flight.outputs.runner_prefix }} | |
| name: Launch_Unit_Tests_Diffusion | |
| env: | |
| TRANSFORMERS_OFFLINE: "1" | |
| HF_HUB_OFFLINE: "1" | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| with: | |
| submodules: recursive | |
| - name: main | |
| uses: ./.github/actions/test-template | |
| with: | |
| script: Launch_Unit_Tests_Diffusion | |
| timeout: 18 | |
| is_unit_test: "true" | |
| PAT: ${{ secrets.PAT }} | |
| container-image: ${{ needs.pre-flight.outputs.registry }}/megatron-bridge:${{ github.sha }} | |
| test-data-path: ${{ needs.pre-flight.outputs.test_data_path }} | |
| runner: ${{ needs.pre-flight.outputs.runner_prefix }} | |
| generate-test-matrix: | |
| needs: [pre-flight, cicd-container-build] | |
| runs-on: ubuntu-latest | |
| outputs: | |
| matrix_l0: ${{ steps.scan.outputs.matrix_l0 }} | |
| matrix_l1: ${{ steps.scan.outputs.matrix_l1 }} | |
| matrix_l2: ${{ steps.scan.outputs.matrix_l2 }} | |
| matrix_flaky: ${{ steps.scan.outputs.matrix_flaky }} | |
| if: | | |
| ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || github.event_name == 'merge_group' | |
| ) | |
| && !cancelled() | |
| steps: | |
| - uses: actions/checkout@v6 | |
| - id: scan | |
| shell: bash | |
| env: | |
| RUNNER_PREFIX: ${{ needs.pre-flight.outputs.runner_prefix }} | |
| run: | | |
| get_timeout() { | |
| local f="$1" | |
| local t | |
| t=$(grep -m1 '^# CI_TIMEOUT=' "$f" | cut -d= -f2) | |
| echo "${t:-30}" | |
| } | |
| get_runner() { | |
| local f="$1" | |
| local gpu_count | |
| gpu_count=$(grep -m1 '^# GPU_COUNT=' "$f" | cut -d= -f2) | |
| if [ -n "$gpu_count" ]; then | |
| echo "${RUNNER_PREFIX}" | sed "s/gpu-x[0-9]*/gpu-${gpu_count}/" | |
| else | |
| echo "${RUNNER_PREFIX}" | |
| fi | |
| } | |
| for tier in L0 L1 L2; do | |
| entries="" | |
| for f in tests/functional_tests/launch_scripts/h100/active/${tier}_*.sh; do | |
| name=$(basename "$f" .sh) | |
| timeout=$(get_timeout "$f") | |
| runner=$(get_runner "$f") | |
| entries="${entries}{\"script\":\"${name}\",\"timeout\":${timeout},\"runner\":\"${runner}\"}," | |
| done | |
| matrix="{\"include\":[${entries%,}]}" | |
| echo "matrix_${tier,,}=${matrix}" | tee -a "$GITHUB_OUTPUT" | |
| done | |
| entries="" | |
| for f in tests/functional_tests/launch_scripts/h100/flaky/L*.sh; do | |
| [ -f "$f" ] || continue | |
| name=$(basename "$f" .sh) | |
| timeout=$(get_timeout "$f") | |
| runner=$(get_runner "$f") | |
| entries="${entries}{\"script\":\"${name}\",\"timeout\":${timeout},\"runner\":\"${runner}\"}," | |
| done | |
| echo "matrix_flaky={\"include\":[${entries%,}]}" | tee -a "$GITHUB_OUTPUT" | |
| # L0: runs on every PR, main push, and schedule | |
| cicd-functional-tests-l0: | |
| strategy: | |
| fail-fast: false | |
| max-parallel: 16 | |
| matrix: ${{ fromJSON(needs.generate-test-matrix.outputs.matrix_l0) }} | |
| needs: [pre-flight, generate-test-matrix, cicd-unit-tests-core, cicd-unit-tests-diffusion] | |
| runs-on: ${{ matrix.runner }} | |
| if: | | |
| ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || github.event_name == 'merge_group' | |
| ) | |
| && !cancelled() | |
| && (github.event.inputs.test_suite == '' || github.event.inputs.test_suite == 'all' || github.event.inputs.test_suite == 'functional-only' || contains('L0 L1 L2', github.event.inputs.test_suite)) | |
| name: ${{ matrix.script }} | |
| env: | |
| HF_HOME: /home/TestData/HF_HOME | |
| TRANSFORMERS_OFFLINE: "1" | |
| HF_HUB_OFFLINE: "1" | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| with: | |
| submodules: recursive | |
| - name: main | |
| uses: ./.github/actions/test-template | |
| with: | |
| script: ${{ matrix.script }} | |
| timeout: ${{ fromJSON(matrix.timeout || '30') }} | |
| is_unit_test: "false" | |
| PAT: ${{ secrets.PAT }} | |
| container-image: ${{ needs.pre-flight.outputs.registry }}/megatron-bridge:${{ github.sha }} | |
| test-data-path: ${{ needs.pre-flight.outputs.test_data_path }} | |
| runner: ${{ matrix.runner }} | |
| # L1: runs on main push, schedule, and PRs with "needs-more-tests" label | |
| cicd-functional-tests-l1: | |
| strategy: | |
| fail-fast: false | |
| max-parallel: 16 | |
| matrix: ${{ fromJSON(needs.generate-test-matrix.outputs.matrix_l1) }} | |
| needs: [pre-flight, configure, generate-test-matrix, cicd-unit-tests-core, cicd-unit-tests-diffusion] | |
| runs-on: ${{ matrix.runner }} | |
| if: | | |
| ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || github.event_name == 'merge_group' | |
| ) | |
| && !cancelled() | |
| && (github.ref == 'refs/heads/main' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'merge_group' || needs.configure.outputs.needs_more_tests == 'true' || needs.configure.outputs.full_test_suite == 'true') | |
| && (github.event.inputs.test_suite == '' || github.event.inputs.test_suite == 'all' || github.event.inputs.test_suite == 'functional-only' || contains('L1 L2', github.event.inputs.test_suite)) | |
| name: ${{ matrix.script }} | |
| env: | |
| HF_HOME: /home/TestData/HF_HOME | |
| TRANSFORMERS_OFFLINE: "1" | |
| HF_HUB_OFFLINE: "1" | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| with: | |
| submodules: recursive | |
| - name: main | |
| uses: ./.github/actions/test-template | |
| with: | |
| script: ${{ matrix.script }} | |
| timeout: ${{ fromJSON(matrix.timeout || '30') }} | |
| is_unit_test: "false" | |
| PAT: ${{ secrets.PAT }} | |
| container-image: ${{ needs.pre-flight.outputs.registry }}/megatron-bridge:${{ github.sha }} | |
| test-data-path: ${{ needs.pre-flight.outputs.test_data_path }} | |
| runner: ${{ matrix.runner }} | |
| # L2: runs on schedule (nightly/weekly), workflow_dispatch, and PRs labeled `full-test-suite` | |
| cicd-functional-tests-l2: | |
| strategy: | |
| fail-fast: false | |
| max-parallel: 16 | |
| matrix: ${{ fromJSON(needs.generate-test-matrix.outputs.matrix_l2) }} | |
| needs: [pre-flight, configure, generate-test-matrix, cicd-unit-tests-core, cicd-unit-tests-diffusion] | |
| runs-on: ${{ matrix.runner }} | |
| if: | | |
| ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || github.event_name == 'merge_group' | |
| ) | |
| && !cancelled() | |
| && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || needs.configure.outputs.full_test_suite == 'true') | |
| && (github.event.inputs.test_suite == '' || github.event.inputs.test_suite == 'all' || github.event.inputs.test_suite == 'functional-only' || contains('L2', github.event.inputs.test_suite)) | |
| name: ${{ matrix.script }} | |
| env: | |
| HF_HOME: /home/TestData/HF_HOME | |
| TRANSFORMERS_OFFLINE: "1" | |
| HF_HUB_OFFLINE: "1" | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| with: | |
| submodules: recursive | |
| - name: main | |
| uses: ./.github/actions/test-template | |
| with: | |
| script: ${{ matrix.script }} | |
| timeout: ${{ fromJSON(matrix.timeout || '30') }} | |
| is_unit_test: "false" | |
| PAT: ${{ secrets.PAT }} | |
| container-image: ${{ needs.pre-flight.outputs.registry }}/megatron-bridge:${{ github.sha }} | |
| test-data-path: ${{ needs.pre-flight.outputs.test_data_path }} | |
| runner: ${{ matrix.runner }} | |
| cicd-functional-tests-flaky: | |
| if: github.event_name == 'workflow_dispatch' && github.event.inputs.test_suite == 'all' | |
| strategy: | |
| fail-fast: false | |
| max-parallel: 16 | |
| matrix: ${{ fromJSON(needs.generate-test-matrix.outputs.matrix_flaky) }} | |
| needs: [pre-flight, generate-test-matrix, cicd-unit-tests-core, cicd-unit-tests-diffusion] | |
| runs-on: ${{ matrix.runner }} | |
| name: ${{ matrix.script }} | |
| env: | |
| HF_HOME: /home/TestData/HF_HOME | |
| TRANSFORMERS_OFFLINE: "1" | |
| HF_HUB_OFFLINE: "1" | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| with: | |
| submodules: recursive | |
| - name: main | |
| uses: ./.github/actions/test-template | |
| with: | |
| script_dir: h100/flaky | |
| script: ${{ matrix.script }} | |
| timeout: ${{ fromJSON(matrix.timeout || '30') }} | |
| is_unit_test: "false" | |
| PAT: ${{ secrets.PAT }} | |
| container-image: ${{ needs.pre-flight.outputs.registry }}/megatron-bridge:${{ github.sha }} | |
| test-data-path: ${{ needs.pre-flight.outputs.test_data_path }} | |
| runner: ${{ matrix.runner }} | |
| generate-gb200-test-matrix: | |
| needs: [pre-flight, cicd-container-build] | |
| runs-on: ubuntu-latest | |
| outputs: | |
| matrix_gb200_l0: ${{ steps.scan.outputs.matrix_gb200_l0 }} | |
| matrix_gb200_l1: ${{ steps.scan.outputs.matrix_gb200_l1 }} | |
| matrix_gb200_l2: ${{ steps.scan.outputs.matrix_gb200_l2 }} | |
| matrix_gb200_flaky: ${{ steps.scan.outputs.matrix_gb200_flaky }} | |
| if: | | |
| ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || github.event_name == 'merge_group' | |
| ) | |
| && !cancelled() | |
| && needs.pre-flight.outputs.is_member == 'true' | |
| steps: | |
| - uses: actions/checkout@v6 | |
| - id: scan | |
| shell: bash | |
| run: | | |
| get_timeout() { | |
| local f="$1" | |
| local t | |
| t=$(grep -m1 '^# CI_TIMEOUT=' "$f" | cut -d= -f2) | |
| echo "${t:-30}" | |
| } | |
| get_runner() { | |
| local f="$1" | |
| local gpu_count | |
| gpu_count=$(grep -m1 '^# GPU_COUNT=' "$f" | cut -d= -f2) | |
| if [ -n "$gpu_count" ]; then | |
| echo "nemo-ci-gcp-gpu-${gpu_count}" | |
| else | |
| echo "nemo-ci-gcp-gpu-x2" | |
| fi | |
| } | |
| for tier in L0 L1 L2; do | |
| entries="" | |
| for f in tests/functional_tests/launch_scripts/gb200/active/${tier}_*.sh; do | |
| [ -f "$f" ] || continue | |
| name=$(basename "$f" .sh) | |
| timeout=$(get_timeout "$f") | |
| runner=$(get_runner "$f") | |
| entries="${entries}{\"script\":\"${name}\",\"timeout\":${timeout},\"runner\":\"${runner}\"}," | |
| done | |
| echo "matrix_gb200_${tier,,}={\"include\":[${entries%,}]}" | tee -a "$GITHUB_OUTPUT" | |
| done | |
| entries="" | |
| for f in tests/functional_tests/launch_scripts/gb200/flaky/L*.sh; do | |
| [ -f "$f" ] || continue | |
| name=$(basename "$f" .sh) | |
| timeout=$(get_timeout "$f") | |
| runner=$(get_runner "$f") | |
| entries="${entries}{\"script\":\"${name}\",\"timeout\":${timeout},\"runner\":\"${runner}\"}," | |
| done | |
| echo "matrix_gb200_flaky={\"include\":[${entries%,}]}" | tee -a "$GITHUB_OUTPUT" | |
| # GB200-L0: mirrors H100-L0 trigger conditions, runs on GB200 hardware | |
| cicd-functional-tests-gb200-l0: | |
| strategy: | |
| fail-fast: false | |
| max-parallel: 16 | |
| matrix: ${{ fromJSON(needs.generate-gb200-test-matrix.outputs.matrix_gb200_l0) }} | |
| needs: [pre-flight, generate-gb200-test-matrix, cicd-unit-tests-core, cicd-unit-tests-diffusion] | |
| runs-on: ${{ matrix.runner }} | |
| if: | | |
| ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || github.event_name == 'merge_group' | |
| ) | |
| && !cancelled() | |
| && needs.pre-flight.outputs.is_member == 'true' | |
| && (github.event.inputs.test_suite == '' || github.event.inputs.test_suite == 'all' || github.event.inputs.test_suite == 'functional-only' || contains('L0 L1 L2', github.event.inputs.test_suite)) | |
| name: gb200_${{ matrix.script }} | |
| environment: ${{ contains(needs.pre-flight.outputs.registry, 'azure') && 'nemo-ci' || '' }} | |
| env: | |
| HF_HOME: /home/TestData/HF_HOME | |
| TRANSFORMERS_OFFLINE: "1" | |
| HF_HUB_OFFLINE: "1" | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| with: | |
| submodules: recursive | |
| - name: main | |
| uses: ./.github/actions/test-template | |
| with: | |
| script_dir: gb200/active | |
| script: ${{ matrix.script }} | |
| timeout: ${{ fromJSON(matrix.timeout || '30') }} | |
| is_unit_test: "false" | |
| has-azure-credentials: ${{ contains(needs.pre-flight.outputs.registry, 'azure') }} | |
| azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} | |
| azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} | |
| azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} | |
| PAT: ${{ secrets.PAT }} | |
| container-image: ${{ env.container-registry-gb200 }}/megatron-bridge:${{ github.sha }} | |
| test-data-path: ${{ needs.pre-flight.outputs.test_data_path }} | |
| runner: ${{ matrix.runner }} | |
| # GB200-L1: mirrors H100-L1 trigger conditions, runs on GB200 hardware | |
| cicd-functional-tests-gb200-l1: | |
| strategy: | |
| fail-fast: false | |
| max-parallel: 16 | |
| matrix: ${{ fromJSON(needs.generate-gb200-test-matrix.outputs.matrix_gb200_l1) }} | |
| needs: [pre-flight, configure, generate-gb200-test-matrix, cicd-unit-tests-core, cicd-unit-tests-diffusion] | |
| runs-on: ${{ matrix.runner }} | |
| if: | | |
| ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || github.event_name == 'merge_group' | |
| ) | |
| && !cancelled() | |
| && needs.pre-flight.outputs.is_member == 'true' | |
| && (github.ref == 'refs/heads/main' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'merge_group' || needs.configure.outputs.needs_more_tests == 'true' || needs.configure.outputs.full_test_suite == 'true') | |
| && (github.event.inputs.test_suite == '' || github.event.inputs.test_suite == 'all' || github.event.inputs.test_suite == 'functional-only' || contains('L1 L2', github.event.inputs.test_suite)) | |
| name: gb200_${{ matrix.script }} | |
| environment: ${{ contains(needs.pre-flight.outputs.registry, 'azure') && 'nemo-ci' || '' }} | |
| env: | |
| HF_HOME: /home/TestData/HF_HOME | |
| TRANSFORMERS_OFFLINE: "1" | |
| HF_HUB_OFFLINE: "1" | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| with: | |
| submodules: recursive | |
| - name: main | |
| uses: ./.github/actions/test-template | |
| with: | |
| script_dir: gb200/active | |
| script: ${{ matrix.script }} | |
| timeout: ${{ fromJSON(matrix.timeout || '30') }} | |
| is_unit_test: "false" | |
| has-azure-credentials: ${{ contains(needs.pre-flight.outputs.registry, 'azure') }} | |
| azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} | |
| azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} | |
| azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} | |
| PAT: ${{ secrets.PAT }} | |
| container-image: ${{ env.container-registry-gb200 }}/megatron-bridge:${{ github.sha }} | |
| test-data-path: ${{ needs.pre-flight.outputs.test_data_path }} | |
| runner: ${{ matrix.runner }} | |
| # GB200-L2: mirrors H100-L2 trigger conditions, runs on GB200 hardware | |
| cicd-functional-tests-gb200-l2: | |
| strategy: | |
| fail-fast: false | |
| max-parallel: 16 | |
| matrix: ${{ fromJSON(needs.generate-gb200-test-matrix.outputs.matrix_gb200_l2) }} | |
| needs: [pre-flight, configure, generate-gb200-test-matrix, cicd-unit-tests-core, cicd-unit-tests-diffusion] | |
| runs-on: ${{ matrix.runner }} | |
| if: | | |
| ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || github.event_name == 'merge_group' | |
| ) | |
| && !cancelled() | |
| && needs.pre-flight.outputs.is_member == 'true' | |
| && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || needs.configure.outputs.full_test_suite == 'true') | |
| && (github.event.inputs.test_suite == '' || github.event.inputs.test_suite == 'all' || github.event.inputs.test_suite == 'functional-only' || contains('L2', github.event.inputs.test_suite)) | |
| name: gb200_${{ matrix.script }} | |
| environment: ${{ contains(needs.pre-flight.outputs.registry, 'azure') && 'nemo-ci' || '' }} | |
| env: | |
| HF_HOME: /home/TestData/HF_HOME | |
| TRANSFORMERS_OFFLINE: "1" | |
| HF_HUB_OFFLINE: "1" | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| with: | |
| submodules: recursive | |
| - name: main | |
| uses: ./.github/actions/test-template | |
| with: | |
| script_dir: gb200/active | |
| script: ${{ matrix.script }} | |
| timeout: ${{ fromJSON(matrix.timeout || '30') }} | |
| is_unit_test: "false" | |
| has-azure-credentials: ${{ contains(needs.pre-flight.outputs.registry, 'azure') }} | |
| azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} | |
| azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} | |
| azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} | |
| PAT: ${{ secrets.PAT }} | |
| container-image: ${{ env.container-registry-gb200 }}/megatron-bridge:${{ github.sha }} | |
| test-data-path: ${{ needs.pre-flight.outputs.test_data_path }} | |
| runner: ${{ matrix.runner }} | |
| cicd-functional-tests-gb200-flaky: | |
| if: github.event_name == 'workflow_dispatch' && github.event.inputs.test_suite == 'all' && needs.pre-flight.outputs.is_member == 'true' | |
| strategy: | |
| fail-fast: false | |
| max-parallel: 16 | |
| matrix: ${{ fromJSON(needs.generate-gb200-test-matrix.outputs.matrix_gb200_flaky) }} | |
| needs: [pre-flight, generate-gb200-test-matrix, cicd-unit-tests-core, cicd-unit-tests-diffusion] | |
| runs-on: ${{ matrix.runner }} | |
| name: gb200_${{ matrix.script }} | |
| environment: ${{ contains(needs.pre-flight.outputs.registry, 'azure') && 'nemo-ci' || '' }} | |
| env: | |
| HF_HOME: /home/TestData/HF_HOME | |
| TRANSFORMERS_OFFLINE: "1" | |
| HF_HUB_OFFLINE: "1" | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| with: | |
| submodules: recursive | |
| - name: main | |
| uses: ./.github/actions/test-template | |
| with: | |
| script_dir: gb200/flaky | |
| script: ${{ matrix.script }} | |
| timeout: ${{ fromJSON(matrix.timeout || '30') }} | |
| is_unit_test: "false" | |
| has-azure-credentials: ${{ contains(needs.pre-flight.outputs.registry, 'azure') }} | |
| azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} | |
| azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} | |
| azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} | |
| PAT: ${{ secrets.PAT }} | |
| container-image: ${{ env.container-registry-gb200 }}/megatron-bridge:${{ github.sha }} | |
| test-data-path: ${{ needs.pre-flight.outputs.test_data_path }} | |
| runner: ${{ matrix.runner }} | |
| Nemo_CICD_Test: | |
| needs: | |
| - pre-flight | |
| - configure | |
| - cicd-import-check | |
| - cicd-unit-tests-core | |
| - cicd-unit-tests-diffusion | |
| - cicd-functional-tests-l0 | |
| - cicd-functional-tests-l1 | |
| - cicd-functional-tests-l2 | |
| - cicd-functional-tests-gb200-l0 | |
| - cicd-functional-tests-gb200-l1 | |
| - cicd-functional-tests-gb200-l2 | |
| if: always() && !cancelled() | |
| runs-on: ubuntu-latest | |
| permissions: write-all | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| - name: Get workflow result | |
| id: result | |
| shell: bash -e -u -o pipefail {0} | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| EXPECT_L0: ${{ needs.configure.outputs.expect_l0 }} | |
| EXPECT_L1: ${{ needs.configure.outputs.expect_l1 }} | |
| EXPECT_L2: ${{ needs.configure.outputs.expect_l2 }} | |
| IS_MEMBER: ${{ needs.pre-flight.outputs.is_member }} | |
| run: | | |
| FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success" and .conclusion != "skipped")] | length') || echo 0 | |
| UNEXPECTED_SKIPS=0 | |
| check_tier_skips() { | |
| local tier="$1" expect="$2" | |
| [ "$expect" != "true" ] && return | |
| local n | |
| n=$(gh run view $GITHUB_RUN_ID --json jobs --jq --arg p "${tier}_" \ | |
| '[.jobs[] | select(.name | startswith($p)) | select(.conclusion == "skipped")] | length') || n=0 | |
| if [ "${n:-0}" -gt 0 ]; then | |
| echo "❌ Found $n unexpectedly skipped ${tier} job(s):" | |
| gh run view $GITHUB_RUN_ID --json jobs --jq --arg p "${tier}_" \ | |
| '.jobs[] | select(.name | startswith($p)) | select(.conclusion == "skipped") | .name' | |
| UNEXPECTED_SKIPS=$((UNEXPECTED_SKIPS + n)) | |
| fi | |
| } | |
| check_tier_skips "L0" "$EXPECT_L0" | |
| check_tier_skips "L1" "$EXPECT_L1" | |
| check_tier_skips "L2" "$EXPECT_L2" | |
| if [[ "$IS_MEMBER" == "true" ]]; then | |
| check_tier_skips "gb200_L0" "$EXPECT_L0" | |
| check_tier_skips "gb200_L1" "$EXPECT_L1" | |
| check_tier_skips "gb200_L2" "$EXPECT_L2" | |
| fi | |
| if [ "${FAILED_JOBS:-0}" -gt 0 ]; then | |
| echo "❌ Found $FAILED_JOBS failed job(s):" | |
| gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success" and .conclusion != "skipped") | .name' | |
| fi | |
| if [ "${FAILED_JOBS:-0}" -gt 0 ] || [ "${UNEXPECTED_SKIPS:-0}" -gt 0 ]; then | |
| exit 1 | |
| fi | |
| echo "✅ All previous jobs completed successfully" | |
| exit 0 | |
| Coverage_Fake: | |
| runs-on: ubuntu-latest | |
| needs: [Nemo_CICD_Test, pre-flight, configure] | |
| if: | | |
| always() | |
| && !cancelled() | |
| && needs.pre-flight.outputs.is_ci_workload == 'false' | |
| && ( | |
| needs.pre-flight.outputs.docs_only == 'true' | |
| || needs.pre-flight.outputs.is_deployment_workflow == 'true' | |
| || needs.configure.outputs.perf_scripts_only == 'true' | |
| ) | |
| steps: | |
| - name: Generate fake coverage report | |
| uses: actions/github-script@v8 | |
| with: | |
| github-token: ${{ secrets.PAT }} | |
| script: | | |
| await github.rest.repos.createCommitStatus({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| sha: context.sha, | |
| state: 'success', | |
| description: 'No code changes - coverage check skipped', | |
| context: 'codecov/patch' | |
| }); | |
| Coverage: | |
| runs-on: ubuntu-latest | |
| needs: [Nemo_CICD_Test, pre-flight, configure] | |
| if: | | |
| needs.Nemo_CICD_Test.result == 'success' | |
| && needs.pre-flight.outputs.docs_only == 'false' | |
| && needs.pre-flight.outputs.is_deployment_workflow == 'false' | |
| && needs.configure.outputs.perf_scripts_only == 'false' | |
| && github.event.inputs.mcore_ref == '' | |
| && !cancelled() | |
| strategy: | |
| matrix: | |
| flag: | |
| - unit-test | |
| - e2e | |
| steps: | |
| - name: Get PR info | |
| id: get-pr-info | |
| if: startsWith(github.ref, 'refs/heads/pull-request/') | |
| uses: nv-gha-runners/get-pr-info@main | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| - name: Download coverage reports of current branch | |
| uses: actions/download-artifact@v7 | |
| with: | |
| pattern: coverage-${{ matrix.flag }}-* | |
| - name: Get total coverage of current branch | |
| shell: bash -x -e -u -o pipefail {0} | |
| if: always() | |
| run: | | |
| pip install coverage[toml] | |
| ls -al . | |
| ls -al coverage-*/ | |
| coverage combine --keep $(ls coverage-*/.coverage) | |
| coverage report -i | |
| rm -rf coverage-* | |
| ls -al | |
| - name: Upload coverage reports to Codecov | |
| uses: codecov/codecov-action@v5 | |
| with: | |
| token: ${{ secrets.CODECOV_TOKEN }} | |
| verbose: true | |
| flags: ${{ matrix.flag }} | |
| base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').base.sha }} | |
| - name: Upload artifacts | |
| uses: actions/upload-artifact@v6 | |
| with: | |
| name: coverage-${{ matrix.flag }}-aggregated | |
| path: | | |
| .coverage | |
| include-hidden-files: true |