NVIDIA-NeMo
diff --git a/‎.claude/commands/bootstrap.md‎
Lines changed: 2 additions & 1 deletion b/‎.claude/commands/bootstrap.md‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎.cursor/rules/repo-navigation.mdc‎
Lines changed: 1 addition & 1 deletion b/‎.cursor/rules/repo-navigation.mdc‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.cursor/setup-worktree.sh‎
Lines changed: 2 additions & 2 deletions b/‎.cursor/setup-worktree.sh‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/actions/setup-gpu-test-env/action.yml‎
Lines changed: 87 additions & 0 deletions b/‎.github/actions/setup-gpu-test-env/action.yml‎
Lines changed: 87 additions & 0 deletions
diff --git a/‎.github/copy-pr-bot.yaml‎
Lines changed: 2 additions & 2 deletions b/‎.github/copy-pr-bot.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/README.md‎
Lines changed: 19 additions & 12 deletions b/‎.github/workflows/README.md‎
Lines changed: 19 additions & 12 deletions
diff --git a/‎.github/workflows/gpu-tests.yml‎
Lines changed: 53 additions & 7 deletions b/‎.github/workflows/gpu-tests.yml‎
Lines changed: 53 additions & 7 deletions
@@ -14,7 +14,8 @@ Set up the development environment from scratch.
    ```bash
    make bootstrap-nss cpu    # CPU-only (macOS or Linux without GPU)
    make bootstrap-nss cuda   # CUDA 12.8 (Linux with NVIDIA GPU)
-   make bootstrap-nss engine # Engine dependencies only (no torch)
+   make bootstrap-nss cu130  # CUDA 13.0 (Linux with NVIDIA GPU)
+   make bootstrap-nss docs   # Documentation dependencies only
    make bootstrap-nss dev    # Minimal dev dependencies only
    ```
 
 
@@ -27,7 +27,7 @@ Cursor discovers skills directly from `.agents/skills/` -- no symlinks needed.
 
 | File | Purpose |
 |------|---------|
-| `pyproject.toml` | Package metadata, dependencies, extras (cpu/cu128/engine), uv config |
+| `pyproject.toml` | Package metadata, dependencies, runtime extras (cpu/cu128/cu130), uv config |
 | `pytest.ini` | Test markers, pytest options, timeout, parallelism |
 | `ruff.toml` | Ruff linting and formatting rules |
 | `mkdocs.yml` | Documentation site config (MkDocs Material) |
 
@@ -16,10 +16,10 @@ fi
 
 # Bare --frozen installs the base environment. For GPU dev work (ty, import
 # checks, GPU tests) run the full command manually after setup:
-#   uv sync --frozen --extra cu128 --extra engine --group dev
+#   uv sync --frozen --extra cu128 --group dev
 uv sync --frozen
 echo "Venv ready: $(pwd)/.venv"
-echo "Note: for GPU extras run: uv sync --frozen --extra cu128 --extra engine --group dev"
+echo "Note: for GPU extras run: uv sync --frozen --extra cu128 --group dev"
 
 for _envfile in .env .env.local mise.local.toml .local.envrc; do
     if [ -f "$ROOT_WORKTREE_PATH/$_envfile" ]; then
 
@@ -67,3 +67,99 @@ runs:
       shell: bash
       run: |
         uv run python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device count:', torch.cuda.device_count())"
+
+# Copyright (c) 2024-2026, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "Setup GPU Test Environment"
+description: "Common setup for GPU test workflows: make, Python, CUDA dependencies, and GPU availability"
+
+inputs:
+  python-version:
+    description: "Python version to use (defaults to .python-version when empty)"
+    required: false
+    default: ""
+  bootstrap-tools:
+    description: "Whether to install dev tools via mise"
+    required: false
+    default: "true"
+  cuda-extra:
+    description: "CUDA dependency extra to bootstrap"
+    required: false
+    default: "cu128"
+
+runs:
+  using: "composite"
+  steps:
+    # Install make through apt only when it is not already on PATH.
+    - name: Install make
+      shell: bash
+      run: |
+        if command -v make >/dev/null 2>&1; then
+          echo "make is already installed: $(command -v make)"
+          exit 0
+        fi
+
+        sudo apt-get update
+        sudo apt-get install -y --no-install-recommends make
+
+    - name: Setup uv cache
+      uses: astral-sh/setup-uv@v6
+      with:
+        enable-cache: true
+        cache-dependency-glob: |
+          .python-version
+          pyproject.toml
+          uv.lock
+
+    # checkout is turned off for the nested action; callers are expected to
+    # have checked out the repository already (the gpu-tests.yml jobs do).
+    - name: Setup Python environment
+      uses: ./.github/actions/setup-python-env
+      with:
+        checkout: "false"
+        python-version: ${{ inputs.python-version }}
+        bootstrap-tools: ${{ inputs.bootstrap-tools }}
+
+    - name: Bootstrap CUDA environment
+      shell: bash
+      env:
+        # Hand the input over through the environment rather than expanding it
+        # inside the script text, so its value is never parsed as shell code.
+        CUDA_EXTRA: ${{ inputs.cuda-extra }}
+      run: make bootstrap-nss "$CUDA_EXTRA"
+
+    # Diagnostic output only: a missing nvidia-smi is reported, not treated as
+    # an error. The Python snippet needs torch to be importable.
+    - name: Report GPU environment
+      shell: bash
+      run: |
+        if command -v nvidia-smi >/dev/null 2>&1; then
+          nvidia-smi --query-gpu=driver_version,name --format=csv,noheader
+        else
+          echo "nvidia-smi: not available"
+        fi
+        uv run python - <<'PY'
+        import platform
+        import torch
+
+        print("python:", platform.python_version())
+        print("machine:", platform.machine())
+        print("torch:", torch.__version__)
+        print("torch cuda:", torch.version.cuda)
+        print("cuda available:", torch.cuda.is_available())
+        if torch.cuda.is_available():
+            print("device count:", torch.cuda.device_count())
+            print("device 0:", torch.cuda.get_device_name(0))
+        PY
@@ -13,8 +13,8 @@
 #
 # On-demand runs: comment `/sync` on any open PR to trigger GPU test runs
 # immediately without pushing a new commit. The bot pushes the current HEAD to
-# pull-request/<number>, which fires gpu-tests.yml and posts the GPU CI Status
-# check result back to the PR. Useful for draft PRs or re-running flaky tests.
+# pull-request/<number>. GPU workflow runs require the pull-request/* push
+# trigger in gpu-tests.yml, which is currently disabled.
 
 enabled: true
 auto_sync_draft: false
 
@@ -11,7 +11,7 @@ All workflows that use `.github/actions/setup-python-env` now default to the ver
 
 | Workflow                                           | Trigger                     | Description                                                                                                |
 | -------------------------------------------------- | --------------------------- | ---------------------------------------------------------------------------------------------------------- |
-| [ci-checks.yml](ci-checks.yml)                     | Push to `main`, PRs, manual | Format, typecheck, unit tests, and CPU smoke tests                                                         |
+| [ci-checks.yml](ci-checks.yml)                     | Push to `main`, PRs, manual | Format, lock/generated dependency checks, typecheck, unit tests, and CPU smoke tests                       |
 | [gpu-tests.yml](gpu-tests.yml)                     | Nightly, manual             | GPU smoke tests (required) and E2E tests                                                                   |
 | [conventional-commit.yml](conventional-commit.yml) | PRs                         | Validates PR titles follow conventional commit format                                                      |
 | [docs.yml](docs.yml)                               | Push to `main` (docs paths) | Publishes `main` docs as the `latest` GitHub Pages version                                                 |
@@ -20,9 +20,9 @@ All workflows that use `.github/actions/setup-python-env` now default to the ver
 
 ## Pull Request Testing (copy-pr-bot)
 
-GPU tests on PRs are currently disabled due to internal constraints. We hope to reenable them asap. The rest of this information is kept for posterity, but it is also relevant to the external tests ran for unit and cpu smoke tests.
+GPU tests on PRs are currently disabled due to internal constraints. The `pull-request/*` push trigger is commented out in `gpu-tests.yml`, so copy-pr-bot syncs do not start GPU workflow runs until that trigger is reenabled.
 
-GPU tests (`gpu-tests.yml`) run on NVIDIA self-hosted runners, which block `pull_request`-triggered jobs. They use the [copy-pr-bot](https://docs.gha-runners.nvidia.com/platform/apps/copy-pr-bot/) pattern instead:
+When PR GPU tests are reenabled, `gpu-tests.yml` should use the [copy-pr-bot](https://docs.gha-runners.nvidia.com/platform/apps/copy-pr-bot/) pattern because NVIDIA self-hosted runners block `pull_request`-triggered jobs:
 
 1. When a PR is opened by a trusted user with trusted changes, `copy-pr-bot` automatically copies the code to a `pull-request/<number>` branch
 2. The push to `pull-request/<number>` triggers the GPU workflow
@@ -35,7 +35,7 @@ CPU checks (`ci-checks.yml`) run on GitHub-hosted `ubuntu-latest` runners and us
 
 ### On-demand GPU test runs
 
-To trigger a GPU test run on an open PR without waiting for the auto-sync, comment `/sync` on the PR. copy-pr-bot will push the current HEAD to `pull-request/<number>`, which fires `gpu-tests.yml` and posts the `GPU CI Status` check result back to the PR -- the same check as the automatic trigger.
+The `/sync` command pushes the current PR HEAD to `pull-request/<number>`. While the `pull-request/*` push trigger is disabled, that push does not fire `gpu-tests.yml` or post a `GPU CI Status` check.
 
 Use `/sync` when:
 
@@ -49,7 +49,7 @@ Use `/sync` when:
 flowchart LR
     subgraph triggers [Triggers]
         push[Push to main]
-        cpb[copy-pr-bot push to pull-request/*]
+        schedule[Nightly schedule]
         pr[Pull Request event]
         manual[Manual Dispatch]
     end
@@ -92,8 +92,9 @@ flowchart LR
         publishArtifactory[Publish to Artifactory/PyPI]
     end
 
-    push --> ci & gpu
-    cpb --> gpu
+    push --> ci
+    schedule --> gpu
+    manual --> gpu
     pr --> ci & conventional & secrets
     tag[Tag push v[0-9]*] --> release
 
@@ -111,7 +112,7 @@ The `ci-checks.yml` workflow runs on every push to `main` and on pull requests.
 | Job | `make` target | What it checks |
 | --- | --- | --- |
 | Format | `format-check` | `ruff format --check` + `ruff check` + SPDX copyright headers |
-| Format (lock) | `lock-check` | `uv.lock` matches `pyproject.toml` |
+| Format (lock) | `lock-check` | `uv.lock` matches `pyproject.toml`; generated CUDA dependency sections match `cuda_deps.toml` |
 | Typecheck | `typecheck` | `ty check` (excludes per `pyproject.toml [tool.ty.src]`) |
 | Unit Tests | `test-ci` | pytest with coverage (excludes slow, e2e, gpu, smoke) |
 | Smoke Tests | `test-smoke` | CPU smoke tests (training/generation hot paths, tiny models) |
@@ -126,7 +127,7 @@ To replicate CI locally:
 
 ```bash
 make check       # format-check + typecheck
-make lock-check  # verify uv.lock
+make lock-check  # verify uv.lock and generated CUDA dependency sections
 make test        # unit tests
 make test-smoke  # CPU smoke tests
 ```
@@ -135,13 +136,13 @@ All jobs run on `ubuntu-latest` (GitHub-hosted).
 
 ## GPU Tests Workflow
 
-The `gpu-tests.yml` workflow runs nightly at 02:00 UTC, and can also be triggered manually via `workflow_dispatch`. Manual dispatch includes a `suite` dropdown with `all`, `smoke`, and `e2e` options. There are several key jobs:
+The `gpu-tests.yml` workflow runs nightly at 02:00 UTC, and can also be triggered manually via `workflow_dispatch`. Manual dispatch includes a `suite` dropdown with `all`, `smoke`, and `e2e` options. GPU jobs run on Python 3.11 and matrix over CUDA runtime extras. There are several key jobs:
 
 - GPU Smoke Tests: Quick smoke tests on a gpu runner with a 30-minute job timeout and 20-minute step timeout. Required for merge.
 - GPU E2E Tests: End-to-end tests on a gpu runner with a 60-minute job timeout and 45-minute step timeout. Informational -- failures produce a warning but don't block merge.
 - GPU CI Status: Aggregation job -- single required check for branch protection. Fails if smoke tests fail; warns if E2E tests fail.
 
-The `changes` (Detect Changes) job is skipped on `workflow_dispatch`. GPU jobs use `always()` in their job conditions so manual runs can bypass the skipped dependency and run the selected suite. On scheduled runs, `changes` gates GPU jobs to source and test changes.
+The `changes` (Detect Changes) job is skipped on `workflow_dispatch`. GPU jobs use `always()` in their job conditions so manual runs can bypass the skipped dependency and run the selected suite. On scheduled runs, `changes` gates GPU jobs to source, test, dependency (`pyproject.toml` or `uv.lock`), and CI workflow/action changes.
 
 GPU jobs use `.github/actions/setup-gpu-test-env` for shared GPU setup: installing `make`, setting up Python from `.python-version`, bootstrapping CUDA dependencies, and checking GPU availability.
 
@@ -151,14 +153,14 @@ To trigger manually from the CLI (produces a run but not a PR status check):
 gh workflow run gpu-tests.yml --ref <branch-name> -f suite=all
 gh workflow run gpu-tests.yml --ref <branch-name> -f suite=smoke
 gh workflow run gpu-tests.yml --ref <branch-name> -f suite=e2e
 ```
 
-To trigger from the PR UI and get a status check result, use `/sync` -- see [On-demand GPU test runs](#on-demand-gpu-test-runs) above.
+PR status checks from `/sync` require the `pull-request/*` push trigger to be reenabled -- see [On-demand GPU test runs](#on-demand-gpu-test-runs) above.
 
 ### Runners
 
 Internal runners and projects are defined in an internal repo, `nv-gha-runners/enterprise-runner-configuration`.
 
 | Workflow | Job | Runner Label | Type |
 | --- | --- | --- | --- |
 | CI Checks | All jobs | `ubuntu-latest` | GitHub-hosted |
 
@@ -13,22 +13,22 @@
 # limitations under the License.
 
 # ---------------------------------------------------------------------------
-# GPU tests run on NVIDIA on-prem self-hosted runners and use the copy-pr-bot
-# pattern: PRs are tested via push events to pull-request/* branches rather
-# than pull_request events.
+# GPU tests run on NVIDIA on-prem self-hosted runners. PR copy-pr-bot testing
+# uses push events to pull-request/* branches rather than pull_request events,
+# but that push trigger is currently disabled below.
 # See: https://docs.gha-runners.nvidia.com/platform/apps/copy-pr-bot/
 # ---------------------------------------------------------------------------
 
 name: GPU Tests
 
 on:
   schedule:
     # Nightly at 02:00 UTC.
     - cron: '0 2 * * *'
   # disabled for now to avoid running on PRs
   # push:
   #   branches:
   #     - "pull-request/[0-9]+"
   workflow_dispatch:
     inputs:
       suite:
@@ -40,6 +45,6 @@ on:
           - all
           - smoke
           - e2e
 
 defaults:
   run:
@@ -53,20 +68,24 @@ jobs:
   changes:
     name: Detect changes
+    # Skipped on manual dispatch; the GPU jobs use always() to run anyway.
     if: github.event_name != 'workflow_dispatch'
     runs-on: linux-amd64-cpu4
     permissions:
       contents: read
     outputs:
       src: ${{ steps.changes.outputs.src }}
       test: ${{ steps.changes.outputs.test }}
+      # deps and ci extend the gate to dependency and CI workflow/action changes.
+      deps: ${{ steps.changes.outputs.deps }}
+      ci: ${{ steps.changes.outputs.ci }}
     steps:
       - uses: actions/checkout@v6
       - name: Detect changes
         id: changes
         uses: ./.github/actions/detect-changes
 
   gpu-smoke-test:
-    name: GPU Smoke Tests
+    name: GPU Smoke Tests (Python 3.11, ${{ matrix.cuda-extra }})
     needs: changes
     # `changes` is intentionally skipped on workflow_dispatch. `always()` lets
     # manual runs bypass that skipped dependency and run the selected GPU suite.
@@ -76,16 +94,27 @@ jobs:
         (
           github.event_name == 'workflow_dispatch' ||
           needs.changes.outputs.src == 'true' ||
-          needs.changes.outputs.test == 'true'
+          needs.changes.outputs.test == 'true' ||
+          needs.changes.outputs.deps == 'true' ||
+          needs.changes.outputs.ci == 'true'
         ) &&
         (
           github.event_name != 'workflow_dispatch' ||
           inputs.suite == 'all' ||
           inputs.suite == 'smoke'
         )
       }}
+    continue-on-error: ${{ matrix.required == false }}
     timeout-minutes: 30
     runs-on: linux-amd64-gpu-a100-latest-1
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda-extra: cu128
+            required: true
+          - cuda-extra: cu130
+            required: false
     steps:
       - name: checkout
         uses: actions/checkout@v6
@@ -94,13 +123,16 @@ jobs:
 
       - name: Setup GPU test environment
         uses: ./.github/actions/setup-gpu-test-env
+        with:
+          python-version: "3.11"
+          cuda-extra: ${{ matrix.cuda-extra }}
 
       - name: Run GPU smoke tests
         timeout-minutes: 20
         run: make test-smoke-gpu
 
   gpu-e2e-test:
-    name: GPU E2E Tests
+    name: GPU E2E Tests (Python 3.11, ${{ matrix.cuda-extra }})
     needs: changes
     # `changes` is intentionally skipped on workflow_dispatch. `always()` lets
     # manual runs bypass that skipped dependency and run the selected GPU suite.
@@ -110,16 +142,27 @@ jobs:
         (
           github.event_name == 'workflow_dispatch' ||
           needs.changes.outputs.src == 'true' ||
-          needs.changes.outputs.test == 'true'
+          needs.changes.outputs.test == 'true' ||
+          needs.changes.outputs.deps == 'true' ||
+          needs.changes.outputs.ci == 'true'
         ) &&
         (
           github.event_name != 'workflow_dispatch' ||
           inputs.suite == 'all' ||
           inputs.suite == 'e2e'
         )
       }}
+    continue-on-error: ${{ matrix.required == false }}
     timeout-minutes: 60
     runs-on: linux-amd64-gpu-a100-latest-1
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda-extra: cu128
+            required: true
+          - cuda-extra: cu130
+            required: false
     steps:
       - name: checkout
         uses: actions/checkout@v6
@@ -128,6 +171,9 @@ jobs:
 
       - name: Setup GPU test environment
         uses: ./.github/actions/setup-gpu-test-env
+        with:
+          python-version: "3.11"
+          cuda-extra: ${{ matrix.cuda-extra }}
 
       - name: Run GPU E2E tests
         timeout-minutes: 45