NVIDIA-NeMo · binaryaaron · May 5, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 name: uv-build
-description: "uv package management, dependency groups, PyTorch index handling, hatch build system, and versioning for this repo. Triggers on: uv, uv sync, uv lock, uv add, uv build, dependency, pyproject.toml, extras, cpu, cu128, hatch, wheel, version, publish."
+description: "uv package management, dependency groups, PyTorch index handling, hatch build system, and versioning for this repo. Triggers on: uv, uv sync, uv lock, uv add, uv build, dependency, pyproject.toml, extras, cpu, cu129, cu130, hatch, wheel, version, publish."
 license: Apache-2.0
 ---
 
@@ -17,24 +17,29 @@ Package management with uv, extras for CPU/CUDA, hatch build, and dynamic versio
 make bootstrap-tools && make bootstrap-nss cpu
 
 # Pick a variant:
-make bootstrap-nss dev       # dev tools only (no engine/torch)
-make bootstrap-nss cpu       # + engine + CPU PyTorch
-make bootstrap-nss cu129     # + engine + CUDA 12.9 PyTorch
-make bootstrap-nss engine    # + engine (no torch)
+make bootstrap-nss dev       # dev tools only
+make bootstrap-nss cpu       # runtime deps + CPU PyTorch
+make bootstrap-nss cu129     # runtime deps + CUDA 12.9 PyTorch
+make bootstrap-nss cu130     # runtime deps + CUDA 13.0 PyTorch
 ```
 
-Under the hood: `uv sync --frozen --extra <extra> [--extra engine] --group dev`
+Under the hood:
+
+```bash
+uv sync --frozen --group dev                  # dev tools only
+uv sync --frozen --extra <runtime> --group dev # cpu, cu129, or cu130
+```
 
 ## Extras and Conflicts
 
 | Extra | What it installs |
 |-------|------------------|
-| `cpu` | PyTorch CPU, faiss-cpu, flashinfer (Linux only) |
-| `cu129` | PyTorch+CUDA 12.9, faiss-gpu, flashinfer-jit-cache |
-| `engine` | ML pipeline deps (outlines, wandb, tiktoken, etc.) -- no torch |
+| `cpu` | Runtime deps, shared Torch-adjacent deps, and CPU PyTorch |
+| `cu129` | Runtime deps, shared Torch-adjacent deps, CUDA deps, and CUDA 12.9 PyTorch |
+| `cu130` | Runtime deps, shared Torch-adjacent deps, CUDA deps, and CUDA 13.0 PyTorch |
 | `microservices` | `nemo-microservices` from local path |
 
-`cpu` and `cu129` conflict -- you must pick one, never both. Enforced in `[tool.uv] conflicts`.
+`cpu`, `cu129`, and `cu130` conflict -- pick exactly one runtime extra. Enforced in `[tool.uv] conflicts`.
 
 ## Index Management
 

@@ -13,8 +13,9 @@ Set up the development environment from scratch.
 2. Install Python dependencies (choose one):
    ```bash
    make bootstrap-nss cpu    # CPU-only (macOS or Linux without GPU)
-   make bootstrap-nss cuda   # CUDA 12.9 (Linux with NVIDIA GPU)
-   make bootstrap-nss engine # Engine dependencies only (no torch)
+   make bootstrap-nss cu129  # CUDA 12.9 (Linux with NVIDIA GPU)
+   make bootstrap-nss cu130  # CUDA 13.0 (Linux with NVIDIA GPU)
+   make bootstrap-nss docs   # Documentation dependencies only
    make bootstrap-nss dev    # Minimal dev dependencies only
    ```
 

@@ -27,7 +27,7 @@ Cursor discovers skills directly from `.agents/skills/` -- no symlinks needed.
 
 | File | Purpose |
 |------|---------|
-| `pyproject.toml` | Package metadata, dependencies, extras (cpu/cu128/engine), uv config |
+| `pyproject.toml` | Package metadata, dependencies, runtime extras (cpu/cu129/cu130), uv config |
 | `pytest.ini` | Test markers, pytest options, timeout, parallelism |
 | `ruff.toml` | Ruff linting and formatting rules |
 | `mkdocs.yml` | Documentation site config (MkDocs Material) |

@@ -16,10 +16,10 @@ fi
 
 # Bare --frozen installs the base environment. For GPU dev work (ty, import
 # checks, GPU tests) run the full command manually after setup:
-#   uv sync --frozen --extra cu129 --extra engine --group dev
+#   uv sync --frozen --extra cu129 --group dev
 uv sync --frozen
 echo "Venv ready: $(pwd)/.venv"
-echo "Note: for GPU extras run: uv sync --frozen --extra cu129 --extra engine --group dev"
+echo "Note: for GPU extras run: uv sync --frozen --extra cu129 --group dev"
 
 for _envfile in .env .env.local mise.local.toml .local.envrc; do
     if [ -f "$ROOT_WORKTREE_PATH/$_envfile" ]; then

@@ -63,7 +63,24 @@ runs:
       shell: bash
       run: make bootstrap-nss ${{ inputs.cuda-extra }}
 
-    - name: Check GPU availability
+    - name: Report GPU environment
       shell: bash
       run: |
-        uv run python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device count:', torch.cuda.device_count())"
+        if command -v nvidia-smi >/dev/null 2>&1; then
+          nvidia-smi --query-gpu=driver_version,name --format=csv,noheader
+        else
+          echo "nvidia-smi: not available"
+        fi
+        uv run python - <<'PY'
+        import platform
+        import torch
+
+        print("python:", platform.python_version())
+        print("machine:", platform.machine())
+        print("torch:", torch.__version__)
+        print("torch cuda:", torch.version.cuda)
+        print("cuda available:", torch.cuda.is_available())
+        if torch.cuda.is_available():
+            print("device count:", torch.cuda.device_count())
+            print("device 0:", torch.cuda.get_device_name(0))
+        PY
@@ -13,8 +13,8 @@
 #
 # On-demand runs: comment `/sync` on any open PR to trigger GPU test runs
 # immediately without pushing a new commit. The bot pushes the current HEAD to
-# pull-request/<number>, which fires gpu-tests.yml and posts the GPU CI Status
-# check result back to the PR. Useful for draft PRs or re-running flaky tests.
+# pull-request/<number>. GPU workflow runs require the pull-request/* push
+# trigger in gpu-tests.yml, which is currently disabled.
 
 enabled: true
 auto_sync_draft: false

@@ -11,18 +11,18 @@ All workflows that use `.github/actions/setup-python-env` now default to the ver
 
 | Workflow                                           | Trigger                     | Description                                                                                                |
 | -------------------------------------------------- | --------------------------- | ---------------------------------------------------------------------------------------------------------- |
-| [ci-checks.yml](ci-checks.yml)                     | Push to `main`, PRs, manual | Format, typecheck, unit tests, and CPU smoke tests                                                         |
-| [gpu-tests.yml](gpu-tests.yml)                     | Nightly, manual             | GPU smoke tests (required) and E2E tests                                                                   |
+| [ci-checks.yml](ci-checks.yml)                     | Push to `main`, PRs, manual | Format, lock/generated dependency checks, typecheck, unit tests, and CPU smoke tests                       |
+| [gpu-tests.yml](gpu-tests.yml)                     | Nightly, manual             | GPU smoke tests (intended to be required when PR GPU checks are re-enabled) and E2E tests                   |
 | [conventional-commit.yml](conventional-commit.yml) | PRs                         | Validates PR titles follow conventional commit format                                                      |
 | [docs.yml](docs.yml)                               | Push to `main` (docs paths) | Publishes `main` docs as the `latest` GitHub Pages version                                                 |
 | [release.yml](release.yml)                         | Push tags to `v*`           | Builds and publishes package to Test PyPI/PyPI, creates a GitHub release, and publishes versioned docs     |
 | [secrets-detector.yml](secrets-detector.yml)       | PRs                         | Scans for accidentally committed secrets                                                                   |
 
 ## Pull Request Testing
 
-GPU tests on PRs are currently disabled due to internal constraints. `gpu-tests.yml` has its `push` trigger commented out, so it runs only on the nightly schedule or manual `workflow_dispatch`.
+GPU tests on PRs are currently disabled due to internal constraints. The `pull-request/*` push trigger is commented out in `gpu-tests.yml`, so copy-pr-bot syncs do not start GPU workflow runs until that trigger is re-enabled.
 
-GPU tests (`gpu-tests.yml`) run on NVIDIA self-hosted runners, which block `pull_request`-triggered jobs. When PR GPU testing is re-enabled, use the [copy-pr-bot](https://docs.gha-runners.nvidia.com/platform/apps/copy-pr-bot/) pattern:
+When PR GPU tests are re-enabled, `gpu-tests.yml` should use the [copy-pr-bot](https://docs.gha-runners.nvidia.com/platform/apps/copy-pr-bot/) pattern because NVIDIA self-hosted runners block `pull_request`-triggered jobs:
 
 1. When a PR is opened by a trusted user with trusted changes, `copy-pr-bot` automatically copies the code to a `pull-request/<number>` branch
 2. The push to `pull-request/<number>` triggers the GPU workflow
@@ -49,7 +49,7 @@ When this path is re-enabled, use `/sync` when:
 flowchart LR
     subgraph triggers [Triggers]
         push[Push to main]
-        schedule[Nightly Schedule]
+        schedule[Nightly schedule]
         pr[Pull Request event]
         manual[Manual Dispatch]
     end
@@ -112,7 +112,7 @@ The `ci-checks.yml` workflow runs on every push to `main` and on pull requests.
 | Job | `make` target | What it checks |
 | --- | --- | --- |
 | Format | `format-check` | `ruff format --check` + `ruff check` + SPDX copyright headers |
-| Format (lock) | `lock-check` | `uv.lock` matches `pyproject.toml` |
+| Format (lock) | `lock-check` | `uv.lock` matches `pyproject.toml`; generated CUDA dependency sections match `cuda_deps.toml` |
 | Typecheck | `typecheck` | `ty check` (excludes per `pyproject.toml [tool.ty.src]`) |
 | Unit Tests | `test-ci` | pytest with coverage (excludes slow, e2e, gpu, smoke) |
 | Smoke Tests | `test-smoke` | CPU smoke tests (training/generation hot paths, tiny models) |
@@ -127,7 +127,7 @@ To replicate CI locally:
 
 ```bash
 make check       # format-check + typecheck
-make lock-check  # verify uv.lock
+make lock-check  # verify uv.lock and generated CUDA dependency sections
 make test        # unit tests
 make test-smoke  # CPU smoke tests
 ```
@@ -138,7 +138,7 @@ All jobs run on `ubuntu-latest` (GitHub-hosted).
 
 The `gpu-tests.yml` workflow runs nightly at 02:00 UTC, and can also be triggered manually via `workflow_dispatch`. Manual dispatch includes a `suite` dropdown with `all`, `smoke`, and `e2e` options. The `push` trigger for `pull-request/*` branches is currently commented out due to internal blockers, so PRs do not automatically produce GPU status checks. We expect to re-enable that path as soon as those blockers are resolved. There are several key jobs:
 
-- GPU Smoke Tests: staged smoke tests on a gpu runner with a 30-minute job timeout. The train-only, generation, resume, structured generation, timeseries, and SmolLM2 lanes run as separate workflow steps. Required for merge when the workflow is part of branch protection.
+- GPU Smoke Tests: Quick smoke tests on a GPU runner with a 30-minute job timeout. Lanes such as train-only and SmolLM2 run as separate workflow steps, with step-level timeouts set in `gpu-tests.yml`. These are intended to be required for merge when PR GPU status checks are re-enabled.
 - GPU E2E Tests: End-to-end tests on a gpu runner with a 60-minute job timeout and 45-minute step timeout. Informational -- failures produce a warning but don't block merge.
 - GPU CI Status: Aggregation job for the GPU workflow. It is not currently a live branch-protection requirement while PR GPU runs are disabled; when re-enabled, it is intended to be the required GPU check. It fails if smoke tests fail and warns if E2E tests fail.
 

@@ -13,9 +13,9 @@
 # limitations under the License.
 
 # ---------------------------------------------------------------------------
-# GPU tests run on NVIDIA on-prem self-hosted runners and use the copy-pr-bot
-# pattern: PRs are tested via push events to pull-request/* branches rather
-# than pull_request events.
+# GPU tests run on NVIDIA on-prem self-hosted runners. PR copy-pr-bot testing
+# uses push events to pull-request/* branches rather than pull_request events,
+# but that push trigger is currently disabled below.
 # See: https://docs.gha-runners.nvidia.com/platform/apps/copy-pr-bot/
 # ---------------------------------------------------------------------------
 
@@ -58,14 +58,16 @@ jobs:
       contents: read
     outputs:
       src_test_deps: ${{ steps.changes.outputs.src_test_deps }}
+      deps: ${{ steps.changes.outputs.deps }}
+      ci: ${{ steps.changes.outputs.ci }}
     steps:
       - uses: actions/checkout@v6
       - name: Detect changes
         id: changes
         uses: ./.github/actions/detect-changes
 
   gpu-smoke-test:
-    name: GPU Smoke Tests
+    name: GPU Smoke Tests (Python 3.11, ${{ matrix.cuda-extra }})
     needs: changes
     # `changes` is intentionally skipped on workflow_dispatch. `always()` lets
     # manual runs bypass that skipped dependency and run the selected GPU suite.
@@ -74,16 +76,27 @@ jobs:
         always() &&
         (
           github.event_name == 'workflow_dispatch' ||
-          needs.changes.outputs.src_test_deps == 'true'
+          needs.changes.outputs.src_test_deps == 'true' ||
+          needs.changes.outputs.deps == 'true' ||
+          needs.changes.outputs.ci == 'true'
         ) &&
         (
           github.event_name != 'workflow_dispatch' ||
           inputs.suite == 'all' ||
           inputs.suite == 'smoke'
         )
       }}
+    continue-on-error: ${{ matrix.required == false }}
     timeout-minutes: 30
     runs-on: linux-amd64-gpu-a100-latest-1
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda-extra: cu129
+            required: true
+          - cuda-extra: cu130
+            required: false
     steps:
       - name: checkout
         uses: actions/checkout@v6
@@ -92,6 +105,9 @@ jobs:
 
       - name: Setup GPU test environment
         uses: ./.github/actions/setup-gpu-test-env
+        with:
+          python-version: "3.11"
+          cuda-extra: ${{ matrix.cuda-extra }}
 
       - name: Run GPU smoke tests - train only
         timeout-minutes: 10
@@ -118,7 +134,7 @@ jobs:
         run: make test-smoke-gpu-smollm2
 
   gpu-e2e-test:
-    name: GPU E2E Tests
+    name: GPU E2E Tests (Python 3.11, ${{ matrix.cuda-extra }})
     needs: changes
     # `changes` is intentionally skipped on workflow_dispatch. `always()` lets
     # manual runs bypass that skipped dependency and run the selected GPU suite.
@@ -127,16 +143,27 @@ jobs:
         always() &&
         (
           github.event_name == 'workflow_dispatch' ||
-          needs.changes.outputs.src_test_deps == 'true'
+          needs.changes.outputs.src_test_deps == 'true' ||
+          needs.changes.outputs.deps == 'true' ||
+          needs.changes.outputs.ci == 'true'
         ) &&
         (
           github.event_name != 'workflow_dispatch' ||
           inputs.suite == 'all' ||
           inputs.suite == 'e2e'
         )
       }}
+    continue-on-error: ${{ matrix.required == false }}
     timeout-minutes: 60
     runs-on: linux-amd64-gpu-a100-latest-1
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda-extra: cu129
+            required: true
+          - cuda-extra: cu130
+            required: false
     steps:
       - name: checkout
         uses: actions/checkout@v6
@@ -145,6 +172,9 @@ jobs:
 
       - name: Setup GPU test environment
         uses: ./.github/actions/setup-gpu-test-env
+        with:
+          python-version: "3.11"
+          cuda-extra: ${{ matrix.cuda-extra }}
 
       - name: Run GPU E2E tests
         timeout-minutes: 45

@@ -24,7 +24,7 @@ Common commands: `make test` (unit tests), `make format` (auto-fix formatting +
 The canonical `uv sync` command for a full GPU/dev environment is:
 
 ```bash
-uv sync --frozen --extra cu129 --extra engine --group dev
+uv sync --frozen --extra cu129 --group dev
 ```
 
 Bare `uv sync --frozen` (without extras) installs an incomplete environment -- `ty`, import checks, and GPU tests will fail.