feat(recipes): add GKE COS training overlays for H100 #330
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: GPU Training Test (nvkind + H100 x2)

on:
  schedule:
    # Twice daily at 06:30 and 18:30 (GitHub evaluates cron in UTC),
    # offset from the inference test.
    - cron: '30 6,18 * * *'
  push:
    branches:
      - 'pull-request/[0-9]+'
    # Trigger on changes to this workflow, to every local action the job
    # invokes, and to the code, recipes, and tests the job exercises.
    paths:
      - '.github/workflows/gpu-h100-training-test.yaml'
      # Local actions used by the job. Keep this list in sync with the
      # `uses: ./.github/actions/...` entries in the steps.
      - '.github/actions/gpu-cluster-setup/**'
      - '.github/actions/aicr-build/**'
      - '.github/actions/gpu-operator-install/**'
      - '.github/actions/gpu-snapshot-validate/**'
      - '.github/actions/install-karpenter-kwok/**'
      - '.github/actions/load-versions/**'
      - '.github/actions/setup-build-tools/**'
      - '.github/actions/gpu-test-cleanup/**'
      # Validator images and the Go packages behind the conformance checks.
      - 'validators/*/Dockerfile'
      - 'pkg/evidence/**'
      - 'pkg/validator/checks/conformance/**'
      - 'pkg/validator/checks/deployment/**'
      # Chainsaw tests and recipes consumed by this run.
      - 'tests/chainsaw/ai-conformance/kind-training/**'
      - 'recipes/overlays/kind.yaml'
      - 'recipes/overlays/h100-kind-training.yaml'
      - 'recipes/components/prometheus-adapter/**'
      # Karpenter + KWOK installation assets.
      - 'kwok/manifests/karpenter/**'
      - 'kwok/scripts/install-karpenter-kwok.sh'
  workflow_dispatch: {}  # Allow manual runs
# Restrict the workflow token to read-only access to repository contents.
permissions:
  contents: read

# At most one active run per workflow + ref; a newer run cancels the one
# already in progress.
concurrency:
  group: '${{ github.workflow }}-${{ github.ref }}'
  cancel-in-progress: true
jobs:
  gpu-training-test:
    name: GPU Training Test (nvkind + H100 x2)
    runs-on: linux-amd64-gpu-h100-latest-2
    timeout-minutes: 45
    env:
      # Shared by the snapshot, Karpenter, debug, and cleanup steps below.
      KIND_CLUSTER_NAME: gpu-training-test
    steps:
      - name: Checkout Code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          persist-credentials: false

      - name: Set up GPU cluster
        uses: ./.github/actions/gpu-cluster-setup

      - name: Build aicr
        uses: ./.github/actions/aicr-build

      - name: Install GPU operator (bundle)
        uses: ./.github/actions/gpu-operator-install
        with:
          method: bundle
          accelerator: h100
          intent: training

      # --- Snapshot and validation ---
      - name: Snapshot and validate GPU
        uses: ./.github/actions/gpu-snapshot-validate
        with:
          gpu_model: H100
          min_gpu_count: '2'
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}

      # --- Karpenter goes in ahead of the conformance run so the
      #     cluster-autoscaling check passes ---
      - name: Install Karpenter + KWOK (setup)
        uses: ./.github/actions/install-karpenter-kwok
        with:
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}

      # --- Health checks (scheduled before conformance to give the metrics
      #     pipeline time to come up) ---
      - name: Load versions
        id: versions
        uses: ./.github/actions/load-versions

      - name: Install chainsaw
        uses: ./.github/actions/setup-build-tools
        with:
          install_chainsaw: 'true'
          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'

      - name: Run chainsaw health checks
        run: |
          chainsaw test \
            --test-dir tests/chainsaw/ai-conformance/kind-training \
            --config tests/chainsaw/chainsaw-config.yaml

      # --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
      # Ordered after chainsaw so the DCGM → Prometheus → adapter pipeline has
      # had time to bootstrap; the pod-autoscaling check needs live metric data.
      # Gang scheduling (PodGroup + 2 GPU pods) is covered by the self-contained
      # gang-scheduling conformance check, so no separate deploy step is needed.
      - name: Validate cluster
        run: |
          AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
          ./aicr validate \
            --recipe recipe.yaml \
            --phase conformance \
            --namespace gpu-operator \
            --kubeconfig="${HOME}/.kube/config" \
            --require-gpu \
            --image=ko.local:smoke-test \
            --timeout=10m \
            --output=validation-result.yaml \
            --evidence-dir=conformance-evidence

      # --- Evidence collection (attempted even when an earlier step failed) ---
      - name: Collect AI conformance evidence
        if: always()
        run: |
          go run ./tests/chainsaw/ai-conformance/ \
            --dir tests/chainsaw/ai-conformance/kind-training \
            --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \
            --kubeconfig="${HOME}/.kube/config" \
            --debug

      - name: Upload conformance evidence
        if: always()
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f  # v7.0.0
        with:
          name: conformance-evidence
          path: |
            conformance-evidence/
            validation-result.yaml
          # Missing files produce a warning rather than failing the step.
          if-no-files-found: warn

      # --- Debug diagnostics (placed before cleanup so resources still exist) ---
      # Only runs on failure. Every command is best-effort: stderr is discarded
      # and a non-zero exit is swallowed with `|| true`.
      - name: Debug diagnostics
        if: failure()
        run: |
          echo "=== KAI scheduler pods ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler get pods -o wide 2>/dev/null || true
          echo "=== KAI scheduler logs ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler \
            logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
          echo "=== KAI scheduler queues ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true
          echo "=== KAI scheduler podgroups ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get podgroups -A 2>/dev/null || true
          echo "=== Non-running pods (all namespaces) ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A \
            --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
          echo "=== GPU Operator pods ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
          echo "=== Node resources ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \
            grep -A 20 "Allocated resources" || true

      # Cleanup runs regardless of the outcome of the steps above.
      - name: GPU Test Cleanup
        if: always()
        uses: ./.github/actions/gpu-test-cleanup
        with:
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
          artifact_name_prefix: gpu-training-test-debug