Commit e87da87
refactor(ci): replace redundant bash assertions with Go conformance checks
Remove bash assertion steps from the inference workflow that are now covered by Go conformance checks running inside `aicr validate --phase conformance`:

- Validate inference gateway (GatewayClass, Gateway, CRDs)
- Validate accelerator metrics (DCGM exporter, Prometheus, custom metrics API)
- Validate custom metrics for pod autoscaling (prometheus-adapter pipeline)
- Validate secure accelerator access (DRA resourceClaims, no hostPath)

Move DRA test pod deployment before `aicr validate` so the secure-accelerator-access Go check can read the pod.

Fix conformance check execution: switch the aicr-build CI action from `ko build` (which only packages the CLI binary) to `docker build` with Dockerfile.validator, which includes the pre-compiled conformance.test binary, test2json, and a shell. Add conformance.test compilation to Dockerfile.validator alongside the existing readiness.test and deployment.test binaries.

Add secure-accelerator-access and pod-autoscaling to the inference recipe overlay check list. Remove secure-accelerator-access from the training recipe overlay, since the training workflow does not deploy the prerequisite DRA test pod.

Add Dockerfile.validator and conformance check source paths to GPU workflow triggers.
1 parent a1c5b5f commit e87da87
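The commit moves the secure-accelerator-access assertions out of bash and into a Go conformance check. As a rough illustration only (the `PodSpec` struct and function below are invented for this sketch, not the actual aicr code), the predicate the removed bash step encoded is: the pod must reference DRA resourceClaims, must not request `nvidia.com/gpu` through the device plugin, and must not mount `/dev/nvidia*` from the host:

```go
package main

import (
	"fmt"
	"strings"
)

// PodSpec holds the minimal pod fields the check cares about
// (illustrative; the real check reads a full corev1.Pod via client-go).
type PodSpec struct {
	ResourceClaims []string          // DRA claims referenced by the pod
	GPULimits      map[string]string // resources.limits, e.g. "nvidia.com/gpu"
	HostPaths      []string          // hostPath volume paths
}

// secureAcceleratorAccess mirrors the removed bash assertions.
func secureAcceleratorAccess(spec PodSpec) error {
	if len(spec.ResourceClaims) == 0 {
		return fmt.Errorf("pod does not use DRA resourceClaims")
	}
	if _, ok := spec.GPULimits["nvidia.com/gpu"]; ok {
		return fmt.Errorf("pod uses device plugin limits instead of DRA")
	}
	for _, p := range spec.HostPaths {
		if strings.HasPrefix(p, "/dev/nvidia") {
			return fmt.Errorf("hostPath volume to %s", p)
		}
	}
	return nil
}

func main() {
	ok := PodSpec{ResourceClaims: []string{"gpu-claim"}}
	bad := PodSpec{GPULimits: map[string]string{"nvidia.com/gpu": "1"}}
	fmt.Println(secureAcceleratorAccess(ok))  // <nil>
	fmt.Println(secureAcceleratorAccess(bad)) // pod does not use DRA resourceClaims
}
```

Running the check in-process like this is why the DRA test pod must exist before `aicr validate` runs: the Go check reads the live pod instead of re-deriving it from workflow state.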

File tree

6 files changed: +40 −245 lines

.github/actions/aicr-build/action.yml

Lines changed: 3 additions & 7 deletions

@@ -13,20 +13,16 @@
 # limitations under the License.
 
 name: 'AICR Build'
-description: 'Builds the aicr container image (via ko) and CLI binary, and loads the image into kind.'
+description: 'Builds the aicr validator image (via Dockerfile) and CLI binary, and loads the image into kind.'
 
 runs:
   using: 'composite'
   steps:
 
-    - name: Build aicr image and load into kind
+    - name: Build aicr validator image and load into kind
       shell: bash
-      env:
-        GOFLAGS: -mod=vendor
       run: |
-        KO_VERSION=$(yq eval '.build_tools.ko' .settings.yaml)
-        GOFLAGS= go install "github.com/google/ko@${KO_VERSION}"
-        KO_DOCKER_REPO=ko.local ko build --bare --sbom=none --tags=smoke-test ./cmd/aicr
+        docker build -f Dockerfile.validator -t ko.local:smoke-test .
        kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"
 
     - name: Build aicr binary

.github/workflows/gpu-h100-inference-test.yaml

Lines changed: 30 additions & 236 deletions

@@ -25,6 +25,8 @@ on:
       - '.github/actions/gpu-cluster-setup/**'
       - '.github/actions/gpu-operator-install/**'
       - '.github/actions/aicr-build/**'
+      - 'Dockerfile.validator'
+      - 'pkg/validator/checks/conformance/**'
       - '.github/actions/gpu-test-cleanup/**'
       - '.github/actions/load-versions/**'
       - 'tests/manifests/**'
@@ -107,6 +109,34 @@
           fi
           echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"
 
+      # --- Deploy DRA test pod (prerequisite for secure-accelerator-access check) ---
+
+      - name: Deploy DRA GPU test
+        run: |
+          kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
+            -f docs/conformance/cncf/manifests/dra-gpu-test.yaml
+
+          echo "Waiting for DRA GPU test pod to complete..."
+          if kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
+            wait --for=jsonpath='{.status.phase}'=Succeeded pod/dra-gpu-test --timeout=120s; then
+            echo "DRA GPU allocation test passed."
+          else
+            echo "::error::DRA GPU test pod did not succeed"
+            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
+              logs pod/dra-gpu-test 2>/dev/null || true
+            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
+              get pod/dra-gpu-test -o yaml 2>/dev/null || true
+            exit 1
+          fi
+
+          echo "=== DRA GPU test logs ==="
+          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
+            logs pod/dra-gpu-test
+
+      # --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
+      # Replaces previous bash assertion steps for: inference-gateway,
+      # accelerator-metrics, pod-autoscaling, secure-accelerator-access.
+
       - name: Validate cluster
         run: |
           ./aicr validate \
@@ -131,43 +161,6 @@
             --test-dir tests/chainsaw/ai-conformance/kind \
             --config tests/chainsaw/chainsaw-config.yaml
 
-      # --- Inference Gateway validation (CNCF AI Conformance #6) ---
-
-      - name: Validate inference gateway
-        run: |
-          echo "=== GatewayClass ==="
-          GC_STATUS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
-            get gatewayclass kgateway \
-            -o jsonpath='{.status.conditions[?(@.type=="Accepted")].status}' 2>/dev/null)
-          echo "GatewayClass accepted: ${GC_STATUS}"
-          if [[ "${GC_STATUS}" != "True" ]]; then
-            echo "::error::GatewayClass 'kgateway' not accepted"
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" get gatewayclass -o yaml 2>/dev/null || true
-            exit 1
-          fi
-
-          echo "=== Gateway ==="
-          GW_STATUS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
-            get gateway inference-gateway -n kgateway-system \
-            -o jsonpath='{.status.conditions[?(@.type=="Programmed")].status}' 2>/dev/null)
-          echo "Gateway programmed: ${GW_STATUS}"
-          if [[ "${GW_STATUS}" != "True" ]]; then
-            echo "::error::Gateway 'inference-gateway' not programmed"
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" \
-              get gateway inference-gateway -n kgateway-system -o yaml 2>/dev/null || true
-            exit 1
-          fi
-
-          echo "=== Gateway API CRDs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get crds 2>/dev/null | \
-            grep -E "gateway\.networking\.k8s\.io" || true
-
-          echo "=== Inference extension CRDs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get crds 2>/dev/null | \
-            grep -E "inference\.networking" || true
-
-          echo "Inference gateway validation passed."
-
       # --- Dynamo vLLM inference smoke test ---
 
       - name: Deploy Dynamo vLLM smoke test
@@ -255,210 +248,11 @@
           fi
           echo "Dynamo vLLM inference smoke test passed."
 
-      # --- Accelerator & AI Service Metrics validation (CNCF AI Conformance #4/#5) ---
-
-      - name: Validate accelerator metrics
-
-        run: |
-          echo "=== DCGM Exporter pod ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-            get pods -l app=nvidia-dcgm-exporter -o wide
-          DCGM_POD=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-            get pods -l app=nvidia-dcgm-exporter -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
-          if [[ -z "${DCGM_POD}" ]]; then
-            echo "::error::DCGM Exporter pod not found"
-            exit 1
-          fi
-          echo "DCGM Exporter pod: ${DCGM_POD}"
-
-          echo "=== Query DCGM metrics endpoint ==="
-          METRICS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" run dcgm-probe \
-            --rm -i --restart=Never --image=curlimages/curl \
-            -- curl -sf http://nvidia-dcgm-exporter.gpu-operator.svc:9400/metrics 2>/dev/null)
-
-          for METRIC in DCGM_FI_DEV_GPU_UTIL DCGM_FI_DEV_FB_USED DCGM_FI_DEV_GPU_TEMP DCGM_FI_DEV_POWER_USAGE; do
-            if echo "${METRICS}" | grep -q "^${METRIC}"; then
-              echo "${METRIC}: $(echo "${METRICS}" | grep "^${METRIC}" | head -1)"
-            else
-              echo "::warning::Metric ${METRIC} not found in DCGM output"
-            fi
-          done
-
-          echo "=== Prometheus scraping GPU metrics ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring \
-            port-forward svc/kube-prometheus-prometheus 9090:9090 &
-          PF_PID=$!
-          sleep 3
-
-          cleanup_pf() { kill "${PF_PID}" 2>/dev/null || true; }
-          trap cleanup_pf EXIT
-
-          for METRIC in DCGM_FI_DEV_GPU_UTIL DCGM_FI_DEV_FB_USED DCGM_FI_DEV_GPU_TEMP DCGM_FI_DEV_POWER_USAGE; do
-            RESULT=$(curl -sf "http://localhost:9090/api/v1/query?query=${METRIC}" 2>/dev/null)
-            COUNT=$(echo "${RESULT}" | jq -r '.data.result | length' 2>/dev/null)
-            if [[ "${COUNT}" -gt 0 ]]; then
-              echo "${METRIC}: ${COUNT} time series in Prometheus"
-            else
-              echo "::warning::${METRIC} not found in Prometheus (may need more scrape time)"
-            fi
-          done
-
-          kill "${PF_PID}" 2>/dev/null || true
-          trap - EXIT
-
-          echo "=== Custom Metrics API ==="
-          CUSTOM_METRICS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
-            get --raw /apis/custom.metrics.k8s.io/v1beta1 2>/dev/null)
-          if [[ -n "${CUSTOM_METRICS}" ]]; then
-            echo "Custom metrics API is available"
-            echo "${CUSTOM_METRICS}" | jq -r '.resources[].name' 2>/dev/null | head -20 || true
-          else
-            echo "::warning::Custom metrics API not available (prometheus-adapter may need time)"
-          fi
-
-          echo "Accelerator metrics validation passed."
-
-      # --- Pod Autoscaling readiness validation (CNCF AI Conformance #8b) ---
-      # Validates the custom metrics pipeline (DCGM → Prometheus → prometheus-adapter
-      # → custom metrics API) that HPA consumes. Dynamo uses PodCliqueSets (not
-      # Deployments), so we validate the API directly rather than creating an HPA.
-      #
-      # DCGM exporter pod-mapping relabels metrics with the GPU workload's
-      # namespace/pod when a GPU is in use. Metrics may appear in gpu-operator
-      # (idle GPU) or dynamo-system (active workload). prometheus-adapter also
-      # needs relist cycles (30s each) to discover new label combinations, so
-      # we poll with retries.
-
-      - name: Validate custom metrics for pod autoscaling
-
-        run: |
-          echo "=== Custom metrics API availability ==="
-          RESOURCES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
-            get --raw /apis/custom.metrics.k8s.io/v1beta1 2>/dev/null)
-          if [[ -z "${RESOURCES}" ]]; then
-            echo "::error::Custom metrics API not available"
-            exit 1
-          fi
-          echo "Custom metrics API is available"
-          echo "${RESOURCES}" | jq -r '.resources[].name' 2>/dev/null | head -20
-
-          NAMESPACES="gpu-operator dynamo-system"
-          METRICS="gpu_utilization gpu_memory_used gpu_power_usage"
-
-          # Poll for up to 3 minutes — prometheus-adapter relists every 30s and
-          # avg_over_time(...[2m]) queries need sufficient data points.
-          HAS_METRICS=false
-          for ATTEMPT in $(seq 1 18); do
-            for METRIC in ${METRICS}; do
-              for NS in ${NAMESPACES}; do
-                RESULT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
-                  "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null || true)
-                if [[ -n "${RESULT}" ]] && echo "${RESULT}" | jq -e '.items | length > 0' >/dev/null 2>&1; then
-                  echo "${METRIC} metrics available in ${NS}:"
-                  echo "${RESULT}" | jq '.items[] | {pod: .describedObject.name, value: .value}' 2>/dev/null
-                  HAS_METRICS=true
-                  break 3
-                fi
-              done
-            done
-            echo "Waiting for custom metrics to appear... (${ATTEMPT}/18)"
-            sleep 10
-          done
-
-          if [[ "${HAS_METRICS}" != "true" ]]; then
-            echo "::error::No GPU custom metrics available via custom metrics API (prometheus-adapter pipeline broken)"
-            exit 1
-          fi
-
-          echo "Custom metrics pipeline validated — GPU metrics available for HPA consumption."
-
       # --- Cluster Autoscaling validation ---
 
       - name: Cluster Autoscaling (Karpenter + KWOK)
-
         run: bash kwok/scripts/validate-cluster-autoscaling.sh
 
-      # --- DRA GPU allocation test ---
-
-      - name: Deploy DRA GPU test
-
-        run: |
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
-            -f docs/conformance/cncf/manifests/dra-gpu-test.yaml
-
-          echo "Waiting for DRA GPU test pod to complete..."
-          if kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-            wait --for=jsonpath='{.status.phase}'=Succeeded pod/dra-gpu-test --timeout=120s; then
-            echo "DRA GPU allocation test passed."
-          else
-            echo "::error::DRA GPU test pod did not succeed"
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-              logs pod/dra-gpu-test 2>/dev/null || true
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-              get pod/dra-gpu-test -o yaml 2>/dev/null || true
-            exit 1
-          fi
-
-          echo "=== DRA GPU test logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-            logs pod/dra-gpu-test
-
-      # --- Secure Accelerator Access validation (CNCF AI Conformance #3) ---
-
-      - name: Validate secure accelerator access
-
-        run: |
-          echo "=== Verify DRA-mediated access (no hostPath, no device plugin) ==="
-
-          # Check pod uses resourceClaims (DRA), not resources.limits (device plugin)
-          RESOURCE_CLAIMS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-            get pod/dra-gpu-test -o jsonpath='{.spec.resourceClaims}' 2>/dev/null)
-          if [[ -z "${RESOURCE_CLAIMS}" || "${RESOURCE_CLAIMS}" == "null" ]]; then
-            echo "::error::Pod does not use DRA resourceClaims"
-            exit 1
-          fi
-          echo "Pod uses DRA resourceClaims: ${RESOURCE_CLAIMS}"
-
-          # Verify no nvidia.com/gpu in resources.limits (device plugin pattern)
-          GPU_LIMITS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-            get pod/dra-gpu-test \
-            -o jsonpath='{.spec.containers[0].resources.limits.nvidia\.com/gpu}' 2>/dev/null)
-          if [[ -n "${GPU_LIMITS}" && "${GPU_LIMITS}" != "null" ]]; then
-            echo "::error::Pod uses device plugin (nvidia.com/gpu limits) instead of DRA"
-            exit 1
-          fi
-          echo "No device plugin resources.limits — GPU access via DRA only"
-
-          # Verify no hostPath volumes to /dev/nvidia*
-          VOLUMES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-            get pod/dra-gpu-test -o jsonpath='{.spec.volumes}' 2>/dev/null)
-          if echo "${VOLUMES}" | grep -q "hostPath" && echo "${VOLUMES}" | grep -q "/dev/nvidia"; then
-            echo "::error::Pod has hostPath volume mount to /dev/nvidia*"
-            exit 1
-          fi
-          echo "No hostPath volumes to /dev/nvidia* — access is DRA-mediated"
-
-          # Verify container security (no privilege escalation)
-          PRIV_ESC=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-            get pod/dra-gpu-test \
-            -o jsonpath='{.spec.containers[0].securityContext.allowPrivilegeEscalation}' 2>/dev/null)
-          echo "allowPrivilegeEscalation: ${PRIV_ESC}"
-
-          # Verify only 1 GPU visible (allocated count matches)
-          GPU_COUNT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-            logs pod/dra-gpu-test 2>/dev/null | grep -c "/dev/nvidia[0-9]" || echo "0")
-          echo "GPU devices visible in container: ${GPU_COUNT}"
-          if [[ "${GPU_COUNT}" -lt 1 ]]; then
-            echo "::error::No GPU devices visible in container"
-            exit 1
-          fi
-
-          echo "=== ResourceClaim allocation ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
-            get resourceclaim gpu-claim -o wide
-
-          echo "Secure accelerator access validation passed."
-
       - name: DRA GPU test cleanup
         if: always()
         run: |
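The removed pod-autoscaling step tested the custom metrics API with jq (`.items | length > 0`); the Go conformance check can do the equivalent in-process. A hedged sketch of that decode step (the struct is an illustrative subset of a custom.metrics.k8s.io/v1beta1 response, not the actual aicr types):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// metricValueList mirrors the shape of a custom.metrics.k8s.io/v1beta1
// response; only the fields this sketch needs.
type metricValueList struct {
	Items []struct {
		DescribedObject struct {
			Name string `json:"name"`
		} `json:"describedObject"`
		Value string `json:"value"`
	} `json:"items"`
}

// hasSeries reports whether the API returned at least one time series,
// i.e. the jq test `.items | length > 0` from the removed bash step.
func hasSeries(raw []byte) (bool, error) {
	var list metricValueList
	if err := json.Unmarshal(raw, &list); err != nil {
		return false, err
	}
	return len(list.Items) > 0, nil
}

func main() {
	raw := []byte(`{"items":[{"describedObject":{"name":"vllm-0"},"value":"73"}]}`)
	ok, _ := hasSeries(raw)
	fmt.Println(ok) // true
}
```

A Go check would still need the same retry loop the bash version had, since prometheus-adapter only discovers new label combinations on its relist cycles.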
.github/workflows/gpu-h100-training-test.yaml

Lines changed: 2 additions & 0 deletions

@@ -25,6 +25,8 @@ on:
       - '.github/actions/gpu-cluster-setup/**'
       - '.github/actions/gpu-operator-install/**'
       - '.github/actions/aicr-build/**'
+      - 'Dockerfile.validator'
+      - 'pkg/validator/checks/conformance/**'
       - '.github/actions/gpu-test-cleanup/**'
       - '.github/actions/load-versions/**'
       - 'docs/conformance/cncf/manifests/gang-scheduling-test.yaml'

Dockerfile.validator

Lines changed: 3 additions & 1 deletion

@@ -50,7 +50,8 @@ RUN set -e; \
 
 # Pre-compile test binaries for in-cluster validation Jobs
 RUN CGO_ENABLED=0 go test -c -o /out/readiness.test ./pkg/validator/checks/readiness && \
-    CGO_ENABLED=0 go test -c -o /out/deployment.test ./pkg/validator/checks/deployment
+    CGO_ENABLED=0 go test -c -o /out/deployment.test ./pkg/validator/checks/deployment && \
+    CGO_ENABLED=0 go test -c -o /out/conformance.test ./pkg/validator/checks/conformance
 
 # Build test2json tool — converts verbose test output to JSON event stream.
 # Compiled test binaries don't support -test.json; they require piping through

@@ -70,6 +71,7 @@ LABEL org.opencontainers.image.title="aicr-validator" \
 COPY --from=builder /out/aicr /usr/local/bin/aicr
 COPY --from=builder /out/readiness.test /usr/local/bin/readiness.test
 COPY --from=builder /out/deployment.test /usr/local/bin/deployment.test
+COPY --from=builder /out/conformance.test /usr/local/bin/conformance.test
 COPY --from=builder /out/test2json /usr/local/bin/test2json
 
 # Copy testdata needed by deployment tests at runtime (loaded via os.ReadFile

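Dockerfile.validator now ships three pre-compiled test binaries plus test2json and a shell, which is why the CI action had to switch from `ko build` to `docker build`. A rough sketch of how an in-cluster validation Job might wire the new binary up (the Job name, image tag, and exact command here are assumptions for illustration, not the actual aicr Job spec):

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: conformance-check        # illustrative name
spec:
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: conformance
          image: ko.local:smoke-test
          command: ["/bin/sh", "-c"]
          # Compiled test binaries can't emit JSON themselves, so verbose
          # output is piped through test2json (both baked into the image).
          args:
            - /usr/local/bin/conformance.test -test.v 2>&1 | /usr/local/bin/test2json
```

The shell in the image matters here: the pipe through test2json needs `/bin/sh`, which the bare ko-built image did not provide.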
recipes/overlays/h100-kind-inference.yaml

Lines changed: 2 additions & 0 deletions

@@ -39,6 +39,8 @@ spec:
     - platform-health
     - gpu-operator-health
     - dra-support
+    - secure-accelerator-access
     - accelerator-metrics
     - ai-service-metrics
     - inference-gateway
+    - pod-autoscaling
recipes/overlays/h100-kind-training.yaml

Lines changed: 0 additions & 1 deletion

@@ -71,6 +71,5 @@ spec:
     - ai-service-metrics
     - gang-scheduling
     - robust-controller
-    - secure-accelerator-access
     - pod-autoscaling
     - cluster-autoscaling
