Commit e6d4137

feat(validator): add Go-based CNCF AI conformance checks
Implement 11 conformance checks for `aicr validate --phase conformance`, mapping to CNCF AI Conformance v1.35 MUST requirements. Each check runs inside a K8s Job with in-cluster service access, replacing bash assertion steps with structured, testable Go code.

Checks: platform-health, gpu-operator-health, dra-support, secure-accelerator-access, accelerator-metrics, ai-service-metrics, inference-gateway, gang-scheduling, robust-controller, pod-autoscaling, cluster-autoscaling.

Infrastructure changes:
- Wire buildTestPattern for the conformance phase in phases.go
- Add DynamicClient to ValidationContext for CRD reads via the dynamic client
- Add 11 read-only RBAC rules to the validator ClusterRole
- Register the conformance package in main.go
- Add conformance.test compilation to Dockerfile.validator
- Switch the aicr-build CI action from ko to docker build (the image now includes all test binaries, test2json, and a shell); keep the ko install for Karpenter
- Update 4 recipe overlays with per-intent conformance check lists

CI workflow changes:
- Remove redundant bash assertion steps from the inference workflow (gateway, metrics, autoscaling, secure access), now covered by Go checks
- Move DRA test pod deployment before `aicr validate` (a prerequisite for the secure-accelerator-access check)
- Add Dockerfile.validator and conformance source paths to GPU workflow triggers
- Add an expectedResources override in the kind overlay for gpu-operator (excludes nvidia-driver-daemonset, since Kind uses host drivers)
1 parent d474757 commit e6d4137
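The "structured, testable Go code" pattern described above can be sketched as a minimal check registry: a conformance package registers named checks for their side effect when imported by main.go, and the test binary runs them in order. All identifiers here (Check, Result, Register, RunAll) are illustrative assumptions, not the actual aicr API.

```go
// Hypothetical sketch of the conformance check-registration pattern.
// Real checks would receive a ValidationContext with clientset and
// dynamic client; this sketch is stdlib-only for illustration.
package main

import (
	"fmt"
	"sort"
)

// Result is a structured pass/fail outcome, replacing bash exit codes
// and ::error:: log lines with data the validator can aggregate.
type Result struct {
	Name    string
	Passed  bool
	Message string
}

// Check is a single conformance check run with in-cluster access.
type Check func() Result

var registry = map[string]Check{}

// Register adds a named check; main.go would import the conformance
// package solely for this side effect.
func Register(name string, c Check) { registry[name] = c }

// RunAll executes all registered checks in a stable, sorted order.
func RunAll() []Result {
	names := make([]string, 0, len(registry))
	for n := range registry {
		names = append(names, n)
	}
	sort.Strings(names)
	results := make([]Result, 0, len(names))
	for _, n := range names {
		results = append(results, registry[n]())
	}
	return results
}

func main() {
	Register("platform-health", func() Result {
		return Result{Name: "platform-health", Passed: true, Message: "API server reachable"}
	})
	for _, r := range RunAll() {
		fmt.Printf("%s: passed=%v (%s)\n", r.Name, r.Passed, r.Message)
	}
}
```

Compiling the checks as a `conformance.test` binary (as the Dockerfile.validator change does) lets the same code run under `go test` locally and inside the validator's K8s Job in CI.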

File tree

50 files changed: +4594 −272 lines changed


.github/actions/aicr-build/action.yml

Lines changed: 7 additions & 5 deletions
@@ -13,20 +13,22 @@
 # limitations under the License.

 name: 'AICR Build'
-description: 'Builds the aicr container image (via ko) and CLI binary, and loads the image into kind.'
+description: 'Builds the aicr validator image (via Dockerfile) and CLI binary, and loads the image into kind.'

 runs:
   using: 'composite'
   steps:
-    - name: Build aicr image and load into kind
+    - name: Install ko
       shell: bash
-      env:
-        GOFLAGS: -mod=vendor
       run: |
         KO_VERSION=$(yq eval '.build_tools.ko' .settings.yaml)
         GOFLAGS= go install "github.com/google/ko@${KO_VERSION}"
-        KO_DOCKER_REPO=ko.local ko build --bare --sbom=none --tags=smoke-test ./cmd/aicr
+
+    - name: Build aicr validator image and load into kind
+      shell: bash
+      run: |
+        docker build -f Dockerfile.validator -t ko.local:smoke-test .
         kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"

     - name: Build aicr binary

.github/workflows/gpu-h100-inference-test.yaml

Lines changed: 36 additions & 237 deletions
@@ -25,6 +25,8 @@ on:
       - '.github/actions/gpu-cluster-setup/**'
       - '.github/actions/gpu-operator-install/**'
       - '.github/actions/aicr-build/**'
+      - 'Dockerfile.validator'
+      - 'pkg/validator/checks/conformance/**'
       - '.github/actions/gpu-test-cleanup/**'
       - '.github/actions/load-versions/**'
       - 'tests/manifests/**'
@@ -107,6 +109,39 @@ jobs:
           fi
           echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"

+      # --- Deploy DRA test pod (prerequisite for secure-accelerator-access check) ---
+
+      - name: Deploy DRA GPU test
+        run: |
+          kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
+            -f docs/conformance/cncf/manifests/dra-gpu-test.yaml
+
+          echo "Waiting for DRA GPU test pod to complete..."
+          if kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
+              wait --for=jsonpath='{.status.phase}'=Succeeded pod/dra-gpu-test --timeout=120s; then
+            echo "DRA GPU allocation test passed."
+          else
+            echo "::error::DRA GPU test pod did not succeed"
+            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
+              logs pod/dra-gpu-test 2>/dev/null || true
+            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
+              get pod/dra-gpu-test -o yaml 2>/dev/null || true
+            exit 1
+          fi
+
+          echo "=== DRA GPU test logs ==="
+          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
+            logs pod/dra-gpu-test
+
+      # --- Install Karpenter before validation so cluster-autoscaling check passes ---
+
+      - name: Install Karpenter + KWOK (setup)
+        run: bash kwok/scripts/validate-cluster-autoscaling.sh --setup
+
+      # --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
+      # Replaces previous bash assertion steps for: inference-gateway,
+      # accelerator-metrics, pod-autoscaling, secure-accelerator-access.
+
       - name: Validate cluster
         run: |
           ./aicr validate \
@@ -131,43 +166,6 @@ jobs:
             --test-dir tests/chainsaw/ai-conformance/kind \
             --config tests/chainsaw/chainsaw-config.yaml

-      # --- Inference Gateway validation (CNCF AI Conformance #6) ---
-
-      - name: Validate inference gateway
-        run: |
-          echo "=== GatewayClass ==="
-          GC_STATUS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
-            get gatewayclass kgateway \
-            -o jsonpath='{.status.conditions[?(@.type=="Accepted")].status}' 2>/dev/null)
-          echo "GatewayClass accepted: ${GC_STATUS}"
-          if [[ "${GC_STATUS}" != "True" ]]; then
-            echo "::error::GatewayClass 'kgateway' not accepted"
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" get gatewayclass -o yaml 2>/dev/null || true
-            exit 1
-          fi
-
-          echo "=== Gateway ==="
-          GW_STATUS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
-            get gateway inference-gateway -n kgateway-system \
-            -o jsonpath='{.status.conditions[?(@.type=="Programmed")].status}' 2>/dev/null)
-          echo "Gateway programmed: ${GW_STATUS}"
-          if [[ "${GW_STATUS}" != "True" ]]; then
-            echo "::error::Gateway 'inference-gateway' not programmed"
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" \
-              get gateway inference-gateway -n kgateway-system -o yaml 2>/dev/null || true
-            exit 1
-          fi
-
-          echo "=== Gateway API CRDs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get crds 2>/dev/null | \
-            grep -E "gateway\.networking\.k8s\.io" || true
-
-          echo "=== Inference extension CRDs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get crds 2>/dev/null | \
-            grep -E "inference\.networking" || true
-
-          echo "Inference gateway validation passed."
-
       # --- Dynamo vLLM inference smoke test ---

       - name: Deploy Dynamo vLLM smoke test
@@ -255,209 +253,10 @@ jobs:
           fi
           echo "Dynamo vLLM inference smoke test passed."

-      # --- Accelerator & AI Service Metrics validation (CNCF AI Conformance #4/#5) ---
-
-      - name: Validate accelerator metrics
-        run: |
-          echo "=== DCGM Exporter pod ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-            get pods -l app=nvidia-dcgm-exporter -o wide
-          DCGM_POD=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-            get pods -l app=nvidia-dcgm-exporter -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
-          if [[ -z "${DCGM_POD}" ]]; then
-            echo "::error::DCGM Exporter pod not found"
-            exit 1
-          fi
-          echo "DCGM Exporter pod: ${DCGM_POD}"
-
-          echo "=== Query DCGM metrics endpoint ==="
-          METRICS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" run dcgm-probe \
-            --rm -i --restart=Never --image=curlimages/curl \
-            -- curl -sf http://nvidia-dcgm-exporter.gpu-operator.svc:9400/metrics 2>/dev/null)
-
-          for METRIC in DCGM_FI_DEV_GPU_UTIL DCGM_FI_DEV_FB_USED DCGM_FI_DEV_GPU_TEMP DCGM_FI_DEV_POWER_USAGE; do
-            if echo "${METRICS}" | grep -q "^${METRIC}"; then
-              echo "${METRIC}: $(echo "${METRICS}" | grep "^${METRIC}" | head -1)"
-            else
-              echo "::warning::Metric ${METRIC} not found in DCGM output"
-            fi
-          done
-
-          echo "=== Prometheus scraping GPU metrics ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring \
-            port-forward svc/kube-prometheus-prometheus 9090:9090 &
-          PF_PID=$!
-          sleep 3
-
-          cleanup_pf() { kill "${PF_PID}" 2>/dev/null || true; }
-          trap cleanup_pf EXIT
-
-          for METRIC in DCGM_FI_DEV_GPU_UTIL DCGM_FI_DEV_FB_USED DCGM_FI_DEV_GPU_TEMP DCGM_FI_DEV_POWER_USAGE; do
-            RESULT=$(curl -sf "http://localhost:9090/api/v1/query?query=${METRIC}" 2>/dev/null)
-            COUNT=$(echo "${RESULT}" | jq -r '.data.result | length' 2>/dev/null)
-            if [[ "${COUNT}" -gt 0 ]]; then
-              echo "${METRIC}: ${COUNT} time series in Prometheus"
-            else
-              echo "::warning::${METRIC} not found in Prometheus (may need more scrape time)"
-            fi
-          done
-
-          kill "${PF_PID}" 2>/dev/null || true
-          trap - EXIT
-
-          echo "=== Custom Metrics API ==="
-          CUSTOM_METRICS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
-            get --raw /apis/custom.metrics.k8s.io/v1beta1 2>/dev/null)
-          if [[ -n "${CUSTOM_METRICS}" ]]; then
-            echo "Custom metrics API is available"
-            echo "${CUSTOM_METRICS}" | jq -r '.resources[].name' 2>/dev/null | head -20 || true
-          else
-            echo "::warning::Custom metrics API not available (prometheus-adapter may need time)"
-          fi
-
-          echo "Accelerator metrics validation passed."
-
-      # --- Pod Autoscaling readiness validation (CNCF AI Conformance #8b) ---
-      # Validates the custom metrics pipeline (DCGM → Prometheus → prometheus-adapter
-      # → custom metrics API) that HPA consumes. Dynamo uses PodCliqueSets (not
-      # Deployments), so we validate the API directly rather than creating an HPA.
-      #
-      # DCGM exporter pod-mapping relabels metrics with the GPU workload's
-      # namespace/pod when a GPU is in use. Metrics may appear in gpu-operator
-      # (idle GPU) or dynamo-system (active workload). prometheus-adapter also
-      # needs relist cycles (30s each) to discover new label combinations, so
-      # we poll with retries.
-
-      - name: Validate custom metrics for pod autoscaling
-        run: |
-          echo "=== Custom metrics API availability ==="
-          RESOURCES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
-            get --raw /apis/custom.metrics.k8s.io/v1beta1 2>/dev/null)
-          if [[ -z "${RESOURCES}" ]]; then
-            echo "::error::Custom metrics API not available"
-            exit 1
-          fi
-          echo "Custom metrics API is available"
-          echo "${RESOURCES}" | jq -r '.resources[].name' 2>/dev/null | head -20
-
-          NAMESPACES="gpu-operator dynamo-system"
-          METRICS="gpu_utilization gpu_memory_used gpu_power_usage"
-
-          # Poll for up to 3 minutes — prometheus-adapter relists every 30s and
-          # avg_over_time(...[2m]) queries need sufficient data points.
-          HAS_METRICS=false
-          for ATTEMPT in $(seq 1 18); do
-            for METRIC in ${METRICS}; do
-              for NS in ${NAMESPACES}; do
-                RESULT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
-                  "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null || true)
-                if [[ -n "${RESULT}" ]] && echo "${RESULT}" | jq -e '.items | length > 0' >/dev/null 2>&1; then
-                  echo "${METRIC} metrics available in ${NS}:"
-                  echo "${RESULT}" | jq '.items[] | {pod: .describedObject.name, value: .value}' 2>/dev/null
-                  HAS_METRICS=true
-                  break 3
-                fi
-              done
-            done
-            echo "Waiting for custom metrics to appear... (${ATTEMPT}/18)"
-            sleep 10
-          done
-
-          if [[ "${HAS_METRICS}" != "true" ]]; then
-            echo "::error::No GPU custom metrics available via custom metrics API (prometheus-adapter pipeline broken)"
-            exit 1
-          fi
-
-          echo "Custom metrics pipeline validated — GPU metrics available for HPA consumption."
-
       # --- Cluster Autoscaling validation ---

       - name: Cluster Autoscaling (Karpenter + KWOK)
-        run: bash kwok/scripts/validate-cluster-autoscaling.sh
-
-      # --- DRA GPU allocation test ---
-
-      - name: Deploy DRA GPU test
-        run: |
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
-            -f docs/conformance/cncf/manifests/dra-gpu-test.yaml
-
-          echo "Waiting for DRA GPU test pod to complete..."
-          if kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-              wait --for=jsonpath='{.status.phase}'=Succeeded pod/dra-gpu-test --timeout=120s; then
-            echo "DRA GPU allocation test passed."
-          else
-            echo "::error::DRA GPU test pod did not succeed"
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-              logs pod/dra-gpu-test 2>/dev/null || true
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-              get pod/dra-gpu-test -o yaml 2>/dev/null || true
-            exit 1
-          fi
-
-          echo "=== DRA GPU test logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-            logs pod/dra-gpu-test
-
-      # --- Secure Accelerator Access validation (CNCF AI Conformance #3) ---
-
-      - name: Validate secure accelerator access
-        run: |
-          echo "=== Verify DRA-mediated access (no hostPath, no device plugin) ==="
-
-          # Check pod uses resourceClaims (DRA), not resources.limits (device plugin)
-          RESOURCE_CLAIMS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-            get pod/dra-gpu-test -o jsonpath='{.spec.resourceClaims}' 2>/dev/null)
-          if [[ -z "${RESOURCE_CLAIMS}" || "${RESOURCE_CLAIMS}" == "null" ]]; then
-            echo "::error::Pod does not use DRA resourceClaims"
-            exit 1
-          fi
-          echo "Pod uses DRA resourceClaims: ${RESOURCE_CLAIMS}"
-
-          # Verify no nvidia.com/gpu in resources.limits (device plugin pattern)
-          GPU_LIMITS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-            get pod/dra-gpu-test \
-            -o jsonpath='{.spec.containers[0].resources.limits.nvidia\.com/gpu}' 2>/dev/null)
-          if [[ -n "${GPU_LIMITS}" && "${GPU_LIMITS}" != "null" ]]; then
-            echo "::error::Pod uses device plugin (nvidia.com/gpu limits) instead of DRA"
-            exit 1
-          fi
-          echo "No device plugin resources.limits — GPU access via DRA only"
-
-          # Verify no hostPath volumes to /dev/nvidia*
-          VOLUMES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-            get pod/dra-gpu-test -o jsonpath='{.spec.volumes}' 2>/dev/null)
-          if echo "${VOLUMES}" | grep -q "hostPath" && echo "${VOLUMES}" | grep -q "/dev/nvidia"; then
-            echo "::error::Pod has hostPath volume mount to /dev/nvidia*"
-            exit 1
-          fi
-          echo "No hostPath volumes to /dev/nvidia* — access is DRA-mediated"
-
-          # Verify container security (no privilege escalation)
-          PRIV_ESC=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-            get pod/dra-gpu-test \
-            -o jsonpath='{.spec.containers[0].securityContext.allowPrivilegeEscalation}' 2>/dev/null)
-          echo "allowPrivilegeEscalation: ${PRIV_ESC}"
-
-          # Verify only 1 GPU visible (allocated count matches)
-          GPU_COUNT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-            logs pod/dra-gpu-test 2>/dev/null | grep -c "/dev/nvidia[0-9]" || echo "0")
-          echo "GPU devices visible in container: ${GPU_COUNT}"
-          if [[ "${GPU_COUNT}" -lt 1 ]]; then
-            echo "::error::No GPU devices visible in container"
-            exit 1
-          fi
-
-          echo "=== ResourceClaim allocation ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-            get resourceclaim gpu-claim -o wide
-
-          echo "Secure accelerator access validation passed."
+        run: bash kwok/scripts/validate-cluster-autoscaling.sh --exercise

       - name: DRA GPU test cleanup
         if: always()

.github/workflows/gpu-h100-training-test.yaml

Lines changed: 24 additions & 14 deletions
@@ -25,6 +25,8 @@ on:
       - '.github/actions/gpu-cluster-setup/**'
      - '.github/actions/gpu-operator-install/**'
      - '.github/actions/aicr-build/**'
+      - 'Dockerfile.validator'
+      - 'pkg/validator/checks/conformance/**'
      - '.github/actions/gpu-test-cleanup/**'
      - '.github/actions/load-versions/**'
      - 'docs/conformance/cncf/manifests/gang-scheduling-test.yaml'
@@ -104,19 +106,12 @@ jobs:
           fi
           echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"

-      - name: Validate cluster
-        run: |
-          ./aicr validate \
-            --recipe recipe.yaml \
-            --phase readiness \
-            --phase deployment \
-            --phase conformance \
-            --namespace gpu-operator \
-            --kubeconfig="${HOME}/.kube/config" \
-            --require-gpu \
-            --image=ko.local:smoke-test
+      # --- Install Karpenter before validation so cluster-autoscaling check passes ---
+
+      - name: Install Karpenter + KWOK (setup)
+        run: bash kwok/scripts/validate-cluster-autoscaling.sh --setup

-      # --- Health checks ---
+      # --- Health checks (run before conformance to give metrics pipeline time) ---

       - name: Install chainsaw
         run: |
@@ -130,6 +125,22 @@ jobs:
             --test-dir tests/chainsaw/ai-conformance/kind-training \
             --config tests/chainsaw/chainsaw-config.yaml

+      # --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
+      # Runs after chainsaw to ensure the DCGM → Prometheus → adapter pipeline
+      # has had time to bootstrap (pod-autoscaling check needs live metric data).
+
+      - name: Validate cluster
+        run: |
+          ./aicr validate \
+            --recipe recipe.yaml \
+            --phase readiness \
+            --phase deployment \
+            --phase conformance \
+            --namespace gpu-operator \
+            --kubeconfig="${HOME}/.kube/config" \
+            --require-gpu \
+            --image=ko.local:smoke-test
+
       # --- Gang scheduling test with PodGroup + KAI scheduler ---

       - name: Deploy gang scheduling test
@@ -179,8 +190,7 @@ jobs:
       # --- Cluster Autoscaling validation ---

       - name: Cluster Autoscaling (Karpenter + KWOK)
-        run: bash kwok/scripts/validate-cluster-autoscaling.sh
+        run: bash kwok/scripts/validate-cluster-autoscaling.sh --exercise

       # --- Evidence collection ---