feat(validator): add Kubeflow Trainer support to robust-controller check #196
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: GPU Conformance Test (nvkind + H100 x2)

# Triggers: a twice-daily schedule, pushes to pull-request mirror branches
# that touch any input of this workflow, and manual dispatch.
on:
  schedule:
    # Every 12 hours (2x daily), 2h15m after training (06:30/18:30).
    - cron: '45 8,20 * * *'
  push:
    branches:
      - 'pull-request/[0-9]+'
    # Keep this list in sync with the job below: every local action the job
    # uses and every file tree it reads should be able to trigger a run.
    paths:
      - '.github/workflows/gpu-h100-conformance-test.yaml'
      - '.github/actions/gpu-cluster-setup/**'
      - '.github/actions/gpu-operator-install/**'
      - '.github/actions/aicr-build/**'
      - 'validators/*/Dockerfile'
      - 'pkg/evidence/**'
      - 'pkg/validator/checks/conformance/**'
      - 'pkg/validator/checks/deployment/**'
      - '.github/actions/gpu-snapshot-validate/**'
      - '.github/actions/gpu-test-cleanup/**'
      - '.github/actions/load-versions/**'
      - '.github/actions/setup-build-tools/**'
      # The job runs ./.github/actions/install-karpenter-kwok, so changes to
      # that action must also trigger this workflow.
      - '.github/actions/install-karpenter-kwok/**'
      - 'tests/manifests/**'
      - 'tests/chainsaw/ai-conformance/**'
      - 'docs/conformance/cncf/**'
      - 'recipes/components/prometheus-adapter/**'
      - 'recipes/overlays/kind.yaml'
      - 'recipes/overlays/kind-inference.yaml'
      - 'recipes/overlays/h100-kind-training.yaml'
      - 'recipes/overlays/h100-kind-conformance.yaml'
      - 'kwok/manifests/karpenter/**'
      - 'kwok/scripts/install-karpenter-kwok.sh'
  workflow_dispatch: {}  # Allow manual runs
# The workflow token is limited to read-only access to repository contents.
permissions:
  contents: read
# One concurrency group per workflow + ref; a newly queued run in the same
# group cancels the run that is still in progress.
concurrency:
  group: '${{ github.workflow }}-${{ github.ref }}'
  cancel-in-progress: true
jobs:
  # Single job: bring up the GPU kind cluster, run the conformance validation
  # and chainsaw health checks, then collect evidence and clean up.
  gpu-conformance-test:
    name: GPU Conformance Test (nvkind + H100 x2)
    runs-on: linux-amd64-gpu-h100-latest-2
    timeout-minutes: 60
    env:
      # Shared by the local actions (cluster_name input) and by the kubectl
      # context name ("kind-<name>") used in the diagnostics step.
      KIND_CLUSTER_NAME: gpu-conformance-test

    steps:
      # --- Cluster bring-up ---
      - name: Checkout Code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          persist-credentials: false

      - name: Set up GPU cluster
        uses: ./.github/actions/gpu-cluster-setup

      - name: Build aicr
        uses: ./.github/actions/aicr-build

      - name: Install GPU operator (bundle)
        uses: ./.github/actions/gpu-operator-install
        with:
          method: bundle
          accelerator: h100
          intent: training

      # --- Snapshot and validation ---
      - name: Snapshot and validate GPU
        uses: ./.github/actions/gpu-snapshot-validate
        with:
          gpu_model: H100
          min_gpu_count: '2'
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}

      # --- Install Karpenter before validation so cluster-autoscaling check passes ---
      - name: Install Karpenter + KWOK (setup)
        uses: ./.github/actions/install-karpenter-kwok
        with:
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}

      # --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
      # DRA and gang scheduling exercises are self-contained within the
      # conformance checks — they create their own resources and clean up.
      - name: Validate cluster
        env:
          # Passed to the aicr process through the step environment.
          AICR_VALIDATOR_IMAGE_REGISTRY: ko.local
        run: |
          ./aicr validate \
            --recipe recipe.yaml \
            --phase conformance \
            --namespace gpu-operator \
            --kubeconfig="${HOME}/.kube/config" \
            --require-gpu \
            --image=ko.local:smoke-test \
            --timeout=10m \
            --output=validation-result.yaml \
            --evidence-dir=conformance-evidence

      # --- Chainsaw health checks ---
      - name: Load versions
        id: versions
        uses: ./.github/actions/load-versions

      - name: Install chainsaw
        uses: ./.github/actions/setup-build-tools
        with:
          install_chainsaw: 'true'
          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'

      - name: Run chainsaw health checks
        run: |
          chainsaw test \
            --test-dir tests/chainsaw/ai-conformance/kind-training \
            --config tests/chainsaw/chainsaw-config.yaml

      # --- Evidence collection ---
      # Both steps run even when an earlier step failed (if: always()).
      - name: Collect AI conformance evidence
        if: always()
        run: |
          go run ./tests/chainsaw/ai-conformance/ \
            --dir tests/chainsaw/ai-conformance/kind-training \
            --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \
            --kubeconfig="${HOME}/.kube/config" \
            --debug

      - name: Upload conformance evidence
        if: always()
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f  # v6.0.0
        with:
          name: conformance-evidence
          path: |
            conformance-evidence/
            validation-result.yaml
          if-no-files-found: warn

      # --- Failure diagnostics ---
      # Best-effort dump of cluster state: stderr is discarded and every
      # command is followed by "|| true" so a failing query never aborts
      # the remaining ones.
      - name: Debug diagnostics
        if: failure()
        run: |
          # All queries go through the kind cluster's kubeconfig context.
          kc() { kubectl --context="kind-${KIND_CLUSTER_NAME}" "$@"; }

          echo "=== ClusterPolicy status ==="
          kc get clusterpolicy -o yaml 2>/dev/null || true

          echo "=== GPU Operator pods ==="
          kc -n gpu-operator get pods -o wide 2>/dev/null || true

          echo "=== Non-running pods (all namespaces) ==="
          kc get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true

          echo "=== Recent events (gpu-operator) ==="
          kc -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true

          echo "=== KAI scheduler pods ==="
          kc -n kai-scheduler get pods -o wide 2>/dev/null || true

          echo "=== KAI scheduler logs ==="
          kc -n kai-scheduler \
            logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true

          echo "=== Custom metrics API ==="
          ns=gpu-operator
          for metric in gpu_utilization gpu_memory_used gpu_power_usage; do
            echo "--- ${metric} ---"
            kc get --raw \
              "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${ns}/pods/*/${metric}" 2>/dev/null | jq . || true
          done

          echo "=== prometheus-adapter pods ==="
          kc -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true

          echo "=== DCGM Exporter pods ==="
          kc -n gpu-operator \
            get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true

          echo "=== Monitoring pods ==="
          kc -n monitoring get pods -o wide 2>/dev/null || true

          echo "=== DRA ResourceSlices ==="
          kc get resourceslices -o wide 2>/dev/null || true

          echo "=== Node status ==="
          kc get nodes -o wide 2>/dev/null || true

      # --- Cleanup ---
      - name: GPU Test Cleanup
        if: always()
        uses: ./.github/actions/gpu-test-cleanup
        with:
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
          artifact_name_prefix: gpu-conformance-test-debug