# feat(recipes): add GKE COS training overlays for H100 (#335)

# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Display name of this workflow.
name: GPU Training Test (nvkind + H100 x2)

# Triggers: a twice-daily schedule, pushes to branches matching
# "pull-request/<number>" that touch one of the listed paths, and manual
# dispatch.
on:
  schedule:
    # Runs at 06:30 and 18:30.
    - cron: '30 6,18 * * *' # Every 12 hours (2x daily), offset from inference test
  push:
    branches:
      - "pull-request/[0-9]+"
    # Only pushes that modify at least one of these paths start a run.
    # NOTE(review): the job also uses the local actions gpu-snapshot-validate,
    # install-karpenter-kwok and setup-build-tools, which are not listed here —
    # confirm whether that omission is intentional.
    paths:
      - '.github/workflows/gpu-h100-training-test.yaml'
      - '.github/actions/gpu-cluster-setup/**'
      - '.github/actions/gpu-operator-install/**'
      - '.github/actions/aicr-build/**'
      - 'validators/*/Dockerfile'
      - 'pkg/evidence/**'
      - 'pkg/validator/checks/conformance/**'
      - 'pkg/validator/checks/deployment/**'
      - '.github/actions/gpu-test-cleanup/**'
      - '.github/actions/load-versions/**'
      - 'tests/chainsaw/ai-conformance/kind-training/**'
      - 'recipes/overlays/kind.yaml'
      - 'recipes/overlays/h100-kind-training.yaml'
      - 'kwok/manifests/karpenter/**'
      - 'kwok/scripts/install-karpenter-kwok.sh'
      - 'recipes/components/prometheus-adapter/**'
  workflow_dispatch: {} # Allow manual runs
# Restrict the workflow token to read-only access to repository contents.
permissions:
  contents: read
# Runs are grouped by workflow name + git ref; starting a new run in a group
# cancels the run already in progress for that group.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  # Single job. Step order: checkout, GPU cluster setup, aicr build, GPU
  # operator install, GPU snapshot validation, Karpenter + KWOK install,
  # chainsaw health checks, aicr conformance validation, evidence collection
  # and upload, failure diagnostics, cleanup.
  gpu-training-test:
    name: GPU Training Test (nvkind + H100 x2)
    runs-on: linux-amd64-gpu-h100-latest-2
    timeout-minutes: 45
    env:
      # Cluster name passed to the local actions; the diagnostics step uses it
      # to build the "kind-<name>" kubectl context.
      KIND_CLUSTER_NAME: gpu-training-test
    steps:
      - name: Checkout Code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Set up GPU cluster
        uses: ./.github/actions/gpu-cluster-setup

      - name: Build aicr
        uses: ./.github/actions/aicr-build

      - name: Install GPU operator (bundle)
        uses: ./.github/actions/gpu-operator-install
        with:
          method: bundle
          accelerator: h100
          intent: training

      # --- Snapshot and validation ---
      - name: Snapshot and validate GPU
        uses: ./.github/actions/gpu-snapshot-validate
        with:
          gpu_model: H100
          # Quoted so the value is passed as a string, not a YAML integer.
          min_gpu_count: '2'
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}

      # --- Install Karpenter before validation so cluster-autoscaling check passes ---
      - name: Install Karpenter + KWOK (setup)
        uses: ./.github/actions/install-karpenter-kwok
        with:
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}

      # --- Health checks (run before conformance to give metrics pipeline time) ---
      - name: Load versions
        id: versions
        uses: ./.github/actions/load-versions

      - name: Install chainsaw
        uses: ./.github/actions/setup-build-tools
        with:
          install_chainsaw: 'true'
          # Version comes from the "versions" step output above.
          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'

      - name: Run chainsaw health checks
        run: |
          chainsaw test \
            --test-dir tests/chainsaw/ai-conformance/kind-training \
            --config tests/chainsaw/chainsaw-config.yaml

      # --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
      # Runs after chainsaw to ensure the DCGM → Prometheus → adapter pipeline
      # has had time to bootstrap (pod-autoscaling check needs live metric data).
      # Gang scheduling (PodGroup + 2 GPU pods) is exercised by the self-contained
      # gang-scheduling conformance check — no separate deploy step needed.
      - name: Validate cluster
        run: |
          AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
            ./aicr validate \
            --recipe recipe.yaml \
            --phase conformance \
            --namespace gpu-operator \
            --kubeconfig="${HOME}/.kube/config" \
            --require-gpu \
            --image=ko.local:smoke-test \
            --timeout=10m \
            --output=validation-result.yaml \
            --evidence-dir=conformance-evidence

      # --- Evidence collection ---
      # if: always() — evidence is collected and uploaded even when an earlier
      # step failed.
      - name: Collect AI conformance evidence
        if: always()
        run: |
          go run ./tests/chainsaw/ai-conformance/ \
            --dir tests/chainsaw/ai-conformance/kind-training \
            --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \
            --kubeconfig="${HOME}/.kube/config" \
            --debug

      - name: Upload conformance evidence
        if: always()
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
        with:
          name: conformance-evidence
          path: |
            conformance-evidence/
            validation-result.yaml
          # Missing files produce a warning rather than failing the step.
          if-no-files-found: warn

      # --- Debug diagnostics (before cleanup so resources still exist) ---
      # Runs only when a previous step failed. Every command is best-effort:
      # stderr is discarded and "|| true" keeps a failing kubectl call from
      # aborting the rest of the dump.
      - name: Debug diagnostics
        if: failure()
        run: |
          echo "=== KAI scheduler pods ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler get pods -o wide 2>/dev/null || true
          echo "=== KAI scheduler logs ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler \
            logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
          echo "=== KAI scheduler queues ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true
          echo "=== KAI scheduler podgroups ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get podgroups -A 2>/dev/null || true
          echo "=== Non-running pods (all namespaces) ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A \
            --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
          echo "=== GPU Operator pods ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
          echo "=== Node resources ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \
            grep -A 20 "Allocated resources" || true

      # Cleanup always runs, regardless of the outcome of earlier steps.
      - name: GPU Test Cleanup
        if: always()
        uses: ./.github/actions/gpu-test-cleanup
        with:
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
          artifact_name_prefix: gpu-training-test-debug