# Captured from pull request #196:
# feat(validator): add Kubeflow Trainer support to robust-controller check

# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: GPU Conformance Test (nvkind + H100 x2)

# Triggers:
#   - schedule: twice a day at 08:45 and 20:45 (GitHub evaluates cron in UTC).
#   - push: only branches named pull-request/<number>, and only when a file
#     this workflow consumes has changed (see the paths filter).
#   - workflow_dispatch: manual runs from the Actions UI or API.
on:
  schedule:
    - cron: '45 8,20 * * *'  # Every 12 hours (2x daily), 2h15m after training (06:30/18:30)
  push:
    branches:
      - "pull-request/[0-9]+"
    # Keep this list in sync with the local actions, manifests and recipes
    # referenced by the job's steps; a path missing here means changes to it
    # are never exercised on pull-request branches.
    paths:
      - '.github/workflows/gpu-h100-conformance-test.yaml'
      - '.github/actions/gpu-cluster-setup/**'
      - '.github/actions/gpu-operator-install/**'
      - '.github/actions/aicr-build/**'
      - 'validators/*/Dockerfile'
      - 'pkg/evidence/**'
      - 'pkg/validator/checks/conformance/**'
      - 'pkg/validator/checks/deployment/**'
      - '.github/actions/gpu-snapshot-validate/**'
      - '.github/actions/gpu-test-cleanup/**'
      # Local action used by the "Install Karpenter + KWOK (setup)" step.
      - '.github/actions/install-karpenter-kwok/**'
      - '.github/actions/load-versions/**'
      - '.github/actions/setup-build-tools/**'
      - 'tests/manifests/**'
      - 'tests/chainsaw/ai-conformance/**'
      - 'docs/conformance/cncf/**'
      - 'recipes/components/prometheus-adapter/**'
      - 'recipes/overlays/kind.yaml'
      - 'recipes/overlays/kind-inference.yaml'
      - 'recipes/overlays/h100-kind-training.yaml'
      - 'recipes/overlays/h100-kind-conformance.yaml'
      - 'kwok/manifests/karpenter/**'
      - 'kwok/scripts/install-karpenter-kwok.sh'
  workflow_dispatch: {}  # Allow manual runs
# Workflow-wide GITHUB_TOKEN scope: read access to repository contents only.
permissions:
  contents: read
# Runs are grouped per workflow name + git ref; starting a new run in a group
# cancels the run that is already in progress for that group.
concurrency:
  group: '${{ github.workflow }}-${{ github.ref }}'
  cancel-in-progress: true
jobs:
  # Single job: set up the GPU cluster, build aicr, install the GPU operator,
  # run validation and chainsaw checks, collect evidence, then clean up.
  gpu-conformance-test:
    name: GPU Conformance Test (nvkind + H100 x2)
    runs-on: linux-amd64-gpu-h100-latest-2
    timeout-minutes: 60
    env:
      # Passed to the cluster-scoped local actions and used to build the
      # "kind-<name>" kubectl context in the diagnostics step.
      KIND_CLUSTER_NAME: gpu-conformance-test
    steps:
      - name: Checkout Code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          persist-credentials: false

      - name: Set up GPU cluster
        uses: ./.github/actions/gpu-cluster-setup

      - name: Build aicr
        uses: ./.github/actions/aicr-build

      - name: Install GPU operator (bundle)
        uses: ./.github/actions/gpu-operator-install
        with:
          method: bundle
          accelerator: h100
          intent: training

      # Snapshot the cluster and check that the expected GPUs are present.
      - name: Snapshot and validate GPU
        uses: ./.github/actions/gpu-snapshot-validate
        with:
          gpu_model: H100
          min_gpu_count: "2"
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}

      # Karpenter goes in ahead of validation so that the cluster-autoscaling
      # check can pass.
      - name: Install Karpenter + KWOK (setup)
        uses: ./.github/actions/install-karpenter-kwok
        with:
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}

      # Cluster validation: the Go conformance checks run inside K8s Jobs.
      # The DRA and gang-scheduling exercises are self-contained in those
      # checks; they create their own resources and clean up afterwards.
      - name: Validate cluster
        run: |
          AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
            ./aicr validate \
            --recipe recipe.yaml \
            --phase conformance \
            --namespace gpu-operator \
            --kubeconfig="${HOME}/.kube/config" \
            --require-gpu \
            --image=ko.local:smoke-test \
            --timeout=10m \
            --output=validation-result.yaml \
            --evidence-dir=conformance-evidence

      - name: Load versions
        id: versions
        uses: ./.github/actions/load-versions

      # The chainsaw version comes from the "versions" step output.
      - name: Install chainsaw
        uses: ./.github/actions/setup-build-tools
        with:
          install_chainsaw: "true"
          chainsaw_version: "${{ steps.versions.outputs.chainsaw }}"

      - name: Run chainsaw health checks
        run: |
          chainsaw test \
            --test-dir tests/chainsaw/ai-conformance/kind-training \
            --config tests/chainsaw/chainsaw-config.yaml

      # Evidence is collected and uploaded even when an earlier step failed.
      - name: Collect AI conformance evidence
        if: always()
        run: |
          go run ./tests/chainsaw/ai-conformance/ \
            --dir tests/chainsaw/ai-conformance/kind-training \
            --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \
            --kubeconfig="${HOME}/.kube/config" \
            --debug

      - name: Upload conformance evidence
        if: always()
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f  # v6.0.0
        with:
          name: conformance-evidence
          path: |
            conformance-evidence/
            validation-result.yaml
          if-no-files-found: warn

      # Failure-only dump of cluster state. Every command is best-effort:
      # stderr is discarded and "|| true" keeps one failing query from
      # aborting the remaining diagnostics.
      - name: Debug diagnostics
        if: failure()
        run: |
          echo "=== ClusterPolicy status ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true
          echo "=== GPU Operator pods ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
          echo "=== Non-running pods (all namespaces) ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
          echo "=== Recent events (gpu-operator) ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
          echo "=== KAI scheduler pods ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler get pods -o wide 2>/dev/null || true
          echo "=== KAI scheduler logs ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler \
            logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
          echo "=== Custom metrics API ==="
          for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do
            echo "--- ${METRIC} ---"
            for NS in gpu-operator; do
              kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
                "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null | jq . || true
            done
          done
          echo "=== prometheus-adapter pods ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
          echo "=== DCGM Exporter pods ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
            get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true
          echo "=== Monitoring pods ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -o wide 2>/dev/null || true
          echo "=== DRA ResourceSlices ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get resourceslices -o wide 2>/dev/null || true
          echo "=== Node status ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true

      # Cleanup runs regardless of the outcome of the steps above.
      - name: GPU Test Cleanup
        if: always()
        uses: ./.github/actions/gpu-test-cleanup
        with:
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
          artifact_name_prefix: gpu-conformance-test-debug