feat(recipes): add GKE COS training overlays for H100 #335

Workflow file for this run

.github/workflows/gpu-h100-training-test.yaml at 5033975

	# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	name: GPU Training Test (nvkind + H100 x2)

	# Triggers:
	#   - schedule: cron below (06:30 and 18:30 UTC).
	#   - push: only to branches matching pull-request/<number>, and only when
	#     one of the listed paths changes.
	#   - workflow_dispatch: manual runs, no inputs.
	on:
	  schedule:
	    - cron: '30 6,18 * * *'  # Every 12 hours (2x daily), offset from inference test
	  push:
	    branches:
	      - "pull-request/[0-9]+"
	    paths:
	      - '.github/workflows/gpu-h100-training-test.yaml'
	      - '.github/actions/gpu-cluster-setup/**'
	      - '.github/actions/gpu-operator-install/**'
	      - '.github/actions/aicr-build/**'
	      - 'validators/*/Dockerfile'
	      - 'pkg/evidence/**'
	      - 'pkg/validator/checks/conformance/**'
	      - 'pkg/validator/checks/deployment/**'
	      - '.github/actions/gpu-test-cleanup/**'
	      - '.github/actions/load-versions/**'
	      - 'tests/chainsaw/ai-conformance/kind-training/**'
	      - 'recipes/overlays/kind.yaml'
	      - 'recipes/overlays/h100-kind-training.yaml'
	      - 'kwok/manifests/karpenter/**'
	      - 'kwok/scripts/install-karpenter-kwok.sh'
	      - 'recipes/components/prometheus-adapter/**'
	  workflow_dispatch: {}  # Allow manual runs

	# Workflow-wide GITHUB_TOKEN scope: read-only access to repository contents.
	permissions:
	  contents: read

	# Runs are grouped per workflow + ref; starting a new run in a group cancels
	# the one already in progress.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.ref }}
	  cancel-in-progress: true

	# Single job. Step order: cluster setup -> build -> GPU operator install ->
	# GPU snapshot -> Karpenter/KWOK -> chainsaw health checks -> conformance
	# validation -> evidence collection/upload -> failure diagnostics -> cleanup.
	jobs:

	  gpu-training-test:
	    name: GPU Training Test (nvkind + H100 x2)
	    runs-on: linux-amd64-gpu-h100-latest-2
	    timeout-minutes: 45

	    env:
	      # Shared by the local actions below and by the kubectl --context
	      # ("kind-<name>") used in the debug step.
	      KIND_CLUSTER_NAME: gpu-training-test

	    steps:

	      - name: Checkout Code
	        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
	        with:
	          persist-credentials: false

	      - name: Set up GPU cluster
	        uses: ./.github/actions/gpu-cluster-setup

	      - name: Build aicr
	        uses: ./.github/actions/aicr-build

	      - name: Install GPU operator (bundle)
	        uses: ./.github/actions/gpu-operator-install
	        with:
	          method: bundle
	          accelerator: h100
	          intent: training

	      # --- Snapshot and validation ---

	      - name: Snapshot and validate GPU
	        uses: ./.github/actions/gpu-snapshot-validate
	        with:
	          gpu_model: H100
	          min_gpu_count: '2'
	          cluster_name: ${{ env.KIND_CLUSTER_NAME }}

	      # --- Install Karpenter before validation so cluster-autoscaling check passes ---

	      - name: Install Karpenter + KWOK (setup)
	        uses: ./.github/actions/install-karpenter-kwok
	        with:
	          cluster_name: ${{ env.KIND_CLUSTER_NAME }}

	      # --- Health checks (run before conformance to give metrics pipeline time) ---

	      - name: Load versions
	        id: versions
	        uses: ./.github/actions/load-versions

	      - name: Install chainsaw
	        uses: ./.github/actions/setup-build-tools
	        with:
	          install_chainsaw: 'true'
	          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'

	      - name: Run chainsaw health checks
	        run: |
	          chainsaw test \
	            --test-dir tests/chainsaw/ai-conformance/kind-training \
	            --config tests/chainsaw/chainsaw-config.yaml

	      # --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
	      # Runs after chainsaw to ensure the DCGM → Prometheus → adapter pipeline
	      # has had time to bootstrap (pod-autoscaling check needs live metric data).
	      # Gang scheduling (PodGroup + 2 GPU pods) is exercised by the self-contained
	      # gang-scheduling conformance check — no separate deploy step needed.

	      - name: Validate cluster
	        run: |
	          AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
	            ./aicr validate \
	            --recipe recipe.yaml \
	            --phase conformance \
	            --namespace gpu-operator \
	            --kubeconfig="${HOME}/.kube/config" \
	            --require-gpu \
	            --image=ko.local:smoke-test \
	            --timeout=10m \
	            --output=validation-result.yaml \
	            --evidence-dir=conformance-evidence

	      # --- Evidence collection ---
	      # always(): evidence is gathered and uploaded even when an earlier step failed.

	      - name: Collect AI conformance evidence
	        if: always()
	        run: |
	          go run ./tests/chainsaw/ai-conformance/ \
	            --dir tests/chainsaw/ai-conformance/kind-training \
	            --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \
	            --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \
	            --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \
	            --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \
	            --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \
	            --kubeconfig="${HOME}/.kube/config" \
	            --debug

	      - name: Upload conformance evidence
	        if: always()
	        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f  # v7.0.0
	        with:
	          name: conformance-evidence
	          path: |
	            conformance-evidence/
	            validation-result.yaml
	          if-no-files-found: warn

	      # --- Debug diagnostics (before cleanup so resources still exist) ---
	      # Every command is best-effort: stderr is discarded and "|| true" keeps a
	      # failing kubectl call from aborting the remaining diagnostics.

	      - name: Debug diagnostics
	        if: failure()
	        run: |
	          echo "=== KAI scheduler pods ==="
	          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler get pods -o wide 2>/dev/null || true
	          echo "=== KAI scheduler logs ==="
	          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler \
	            logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
	          echo "=== KAI scheduler queues ==="
	          kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true
	          echo "=== KAI scheduler podgroups ==="
	          kubectl --context="kind-${KIND_CLUSTER_NAME}" get podgroups -A 2>/dev/null || true
	          echo "=== Non-running pods (all namespaces) ==="
	          kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A \
	            --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
	          echo "=== GPU Operator pods ==="
	          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
	          echo "=== Node resources ==="
	          kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \
	            grep -A 20 "Allocated resources" || true

	      - name: GPU Test Cleanup
	        if: always()
	        uses: ./.github/actions/gpu-test-cleanup
	        with:
	          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
	          artifact_name_prefix: gpu-training-test-debug

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feat(recipes): add GKE COS training overlays for H100 #335

Workflow file

feat(recipes): add GKE COS training overlays for H100 #335

Uh oh!

Workflow file for this run