Skip to content

Commit 3e408d3

Browse files
committed
feat(ci): add metrics-driven cluster autoscaling validation with Karpenter + KWOK
Add CNCF AI Conformance #8a (cluster_autoscaling) validation to both H100 GPU workflows.

This tests the full metrics-driven autoscaling chain: DCGM metrics → Prometheus → prometheus-adapter external metric → HPA scales Deployment → pending GPU pods → Karpenter provisions KWOK nodes → pods schedule → consolidation on cleanup.

Key changes:
- Add Karpenter KWOK provider install script (build from source via ko, side-load into kind, deploy via Helm)
- Add GPU instance types, NodePool, and KWOKNodeClass manifests
- Add Deployment + HPA manifest using external dcgm_gpu_memory_used metric
- Add external metrics rules and workload-attributed pod metrics to prometheus-adapter for HPA consumption
- Add cluster autoscaling step to both inference and training workflows

Signed-off-by: Davanum Srinivas <dsrinivas@nvidia.com>
1 parent e55f9a2 commit 3e408d3

File tree

9 files changed

+877
-0
lines changed

9 files changed

+877
-0
lines changed

.github/workflows/gpu-h100-inference-test.yaml

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ on:
3636
- 'recipes/overlays/kind-inference.yaml'
3737
- 'recipes/overlays/h100-kind-inference.yaml'
3838
- 'recipes/overlays/h100-kind-inference-dynamo.yaml'
39+
- 'kwok/manifests/karpenter/**'
40+
- 'kwok/scripts/install-karpenter-kwok.sh'
41+
- 'recipes/components/prometheus-adapter/**'
3942
workflow_dispatch: {} # Allow manual runs
4043

4144
permissions:
@@ -336,6 +339,145 @@ jobs:
336339
337340
echo "Custom metrics pipeline validated — GPU metrics available for HPA consumption."
338341
342+
# --- Cluster Autoscaling validation (CNCF AI Conformance #8a) ---
# Validates the full metrics-driven autoscaling chain:
#   GPU workload → DCGM metrics → Prometheus → prometheus-adapter (external metric)
#   → HPA scales Deployment → pending pods → Karpenter → KWOK nodes provisioned
#
# Uses dcgm_gpu_memory_used external metric (always > 0 when a GPU exists)
# to trigger HPA scaling, which overflows onto Karpenter-provisioned KWOK nodes.
- name: "CNCF AI Conformance #8a - Cluster Autoscaling (Karpenter + KWOK)"
  run: |
    set -euo pipefail

    echo "=== Installing Karpenter with KWOK provider ==="
    # Ensure the cluster name is exported for the install script even if it
    # was only a plain shell variable in this step.
    export KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME}"
    # Split assignment from export so a yq failure is not masked (SC2155),
    # and fail fast if the version key is missing from .settings.yaml
    # (yq prints the literal string "null" for absent keys).
    KARPENTER_VERSION=$(yq eval '.testing_tools.karpenter' .settings.yaml)
    if [[ -z "${KARPENTER_VERSION}" || "${KARPENTER_VERSION}" == "null" ]]; then
      echo "::error::testing_tools.karpenter is not set in .settings.yaml"
      exit 1
    fi
    export KARPENTER_VERSION
    bash kwok/scripts/install-karpenter-kwok.sh

    echo "=== Creating NodePool and KWOKNodeClass ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
      -f kwok/manifests/karpenter/nodepool.yaml

    echo "=== Verifying external metrics API has GPU metrics ==="
    EXT_AVAILABLE=false
    for i in $(seq 1 12); do
      # '|| EXT_METRICS=""' keeps a transient apiserver error from aborting
      # the retry loop: under 'set -e' a failing command substitution in a
      # plain assignment exits the whole script.
      EXT_METRICS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
        /apis/external.metrics.k8s.io/v1beta1 2>/dev/null) || EXT_METRICS=""
      if [[ -n "${EXT_METRICS}" ]] && echo "${EXT_METRICS}" | jq -e '.resources[]? | select(.name=="dcgm_gpu_memory_used")' >/dev/null 2>&1; then
        echo "External metric dcgm_gpu_memory_used is available"
        EXT_AVAILABLE=true
        break
      fi
      echo "Waiting for external metrics API... (${i}/12)"
      sleep 10
    done
    if [[ "${EXT_AVAILABLE}" != "true" ]]; then
      echo "::error::External metric dcgm_gpu_memory_used not available"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw /apis/external.metrics.k8s.io/v1beta1 2>/dev/null | jq . || true
      exit 1
    fi

    # Query the metric value to confirm it's non-zero (informational only).
    EXT_VALUE=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
      "/apis/external.metrics.k8s.io/v1beta1/namespaces/default/dcgm_gpu_memory_used" 2>/dev/null) || EXT_VALUE=""
    echo "External metric value: $(echo "${EXT_VALUE}" | jq -r '.items[0].value // "N/A"' 2>/dev/null)"

    echo "=== Deploying HPA-driven autoscaling test ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" create namespace autoscaling-test
    kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
      -f kwok/manifests/karpenter/hpa-gpu-scale-test.yaml

    echo "=== Waiting for HPA to read metrics and scale ==="
    HPA_SCALED=false
    for i in $(seq 1 20); do
      # Guarded assignments: see note above about 'set -e' and retry loops.
      DESIRED=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test \
        get hpa gpu-overflow-hpa -o jsonpath='{.status.desiredReplicas}' 2>/dev/null) || DESIRED=""
      CURRENT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test \
        get hpa gpu-overflow-hpa -o jsonpath='{.status.currentReplicas}' 2>/dev/null) || CURRENT=""
      METRICS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test \
        get hpa gpu-overflow-hpa -o jsonpath='{.status.currentMetrics}' 2>/dev/null) || METRICS=""

      # Require a numeric value before the arithmetic test: a non-numeric
      # operand in '[[ ... -gt ... ]]' is an error that would abort the
      # script under 'set -e'.
      if [[ "${DESIRED}" =~ ^[0-9]+$ && "${DESIRED}" -gt 1 ]]; then
        echo "HPA scaled: desired=${DESIRED} current=${CURRENT}"
        echo "HPA metrics: ${METRICS}"
        HPA_SCALED=true
        break
      fi
      echo "Waiting for HPA to compute scaling decision... desired=${DESIRED:-?} (${i}/20)"
      sleep 15
    done
    if [[ "${HPA_SCALED}" != "true" ]]; then
      echo "::error::HPA did not scale beyond 1 replica"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test describe hpa gpu-overflow-hpa 2>/dev/null || true
      exit 1
    fi

    echo "=== Waiting for Karpenter to provision KWOK nodes ==="
    KWOK_NODES=0
    for i in $(seq 1 30); do
      KWOK_NODES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes \
        -l karpenter.sh/nodepool=gpu-autoscaling-test --no-headers 2>/dev/null | wc -l | tr -d ' ') || KWOK_NODES=0
      if [[ "$KWOK_NODES" -gt 0 ]]; then
        echo "Karpenter provisioned ${KWOK_NODES} KWOK GPU node(s)"
        break
      fi
      echo "Waiting for Karpenter to provision nodes... (${i}/30)"
      sleep 10
    done
    if [[ "$KWOK_NODES" -eq 0 ]]; then
      echo "::error::Karpenter did not provision GPU nodes"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n karpenter logs deployment/karpenter --tail=50 2>/dev/null || true
      exit 1
    fi

    echo "=== Verifying nodes have GPU capacity ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes \
      -l karpenter.sh/nodepool=gpu-autoscaling-test \
      -o jsonpath='{range .items[*]}{.metadata.name}: nvidia.com/gpu={.status.capacity.nvidia\.com/gpu}{"\n"}{end}'

    echo "=== Verifying pods scheduled onto KWOK nodes ==="
    SCHEDULED=0
    TOTAL=0
    for i in $(seq 1 20); do
      SCHEDULED=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -n autoscaling-test \
        --field-selector=status.phase!=Pending --no-headers 2>/dev/null | wc -l | tr -d ' ') || SCHEDULED=0
      TOTAL=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -n autoscaling-test \
        --no-headers 2>/dev/null | wc -l | tr -d ' ') || TOTAL=0
      if [[ "$SCHEDULED" -eq "$TOTAL" && "$TOTAL" -gt 1 ]]; then
        echo "All ${TOTAL} GPU pods scheduled successfully (HPA-driven)"
        break
      fi
      echo "Waiting for pods to schedule... (${SCHEDULED}/${TOTAL}, attempt ${i}/20)"
      sleep 5
    done
    if [[ "$TOTAL" -le 1 ]]; then
      echo "::error::HPA did not create additional replicas"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test describe hpa gpu-overflow-hpa 2>/dev/null || true
      exit 1
    fi

    echo "=== Full chain verified ==="
    echo " GPU metrics → Prometheus → external metrics API → HPA → Deployment scaled"
    echo " → pending pods → Karpenter → ${KWOK_NODES} KWOK node(s) → ${TOTAL} pods scheduled"

    echo "=== Testing scale-down (consolidation) ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" delete namespace autoscaling-test --wait=false
    sleep 15
    for i in $(seq 1 12); do
      KWOK_NODES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes \
        -l karpenter.sh/nodepool=gpu-autoscaling-test --no-headers 2>/dev/null | wc -l | tr -d ' ') || KWOK_NODES=0
      if [[ "$KWOK_NODES" -eq 0 ]]; then
        echo "Karpenter consolidated all KWOK nodes (scale to zero)"
        break
      fi
      echo "Waiting for consolidation... (${KWOK_NODES} nodes remaining, ${i}/12)"
      sleep 10
    done
    # Consolidation is deliberately best-effort: surface leftover nodes as a
    # warning annotation without failing the step (matches prior behavior).
    if [[ "$KWOK_NODES" -ne 0 ]]; then
      echo "::warning::${KWOK_NODES} KWOK node(s) still present after consolidation window"
    fi

    echo "=== Cluster autoscaling validation PASSED ==="
480+
339481
# --- DRA GPU allocation test ---
340482

341483
- name: Deploy DRA GPU test

.github/workflows/gpu-h100-training-test.yaml

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ on:
3232
- 'recipes/components/dynamo-platform/**'
3333
- 'recipes/overlays/kind.yaml'
3434
- 'recipes/overlays/h100-kind-training.yaml'
35+
- 'kwok/manifests/karpenter/**'
36+
- 'kwok/scripts/install-karpenter-kwok.sh'
37+
- 'recipes/components/prometheus-adapter/**'
3538
workflow_dispatch: {} # Allow manual runs
3639

3740
permissions:
@@ -172,6 +175,145 @@ jobs:
172175
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
173176
logs gang-worker-1 2>/dev/null || true
174177
178+
# --- Cluster Autoscaling validation (CNCF AI Conformance #8a) ---
# Validates the full metrics-driven autoscaling chain:
#   GPU workload → DCGM metrics → Prometheus → prometheus-adapter (external metric)
#   → HPA scales Deployment → pending pods → Karpenter → KWOK nodes provisioned
#
# Uses dcgm_gpu_memory_used external metric (always > 0 when a GPU exists)
# to trigger HPA scaling, which overflows onto Karpenter-provisioned KWOK nodes.
- name: "CNCF AI Conformance #8a - Cluster Autoscaling (Karpenter + KWOK)"
  run: |
    set -euo pipefail

    echo "=== Installing Karpenter with KWOK provider ==="
    # Ensure the cluster name is exported for the install script even if it
    # was only a plain shell variable in this step.
    export KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME}"
    # Split assignment from export so a yq failure is not masked (SC2155),
    # and fail fast if the version key is missing from .settings.yaml
    # (yq prints the literal string "null" for absent keys).
    KARPENTER_VERSION=$(yq eval '.testing_tools.karpenter' .settings.yaml)
    if [[ -z "${KARPENTER_VERSION}" || "${KARPENTER_VERSION}" == "null" ]]; then
      echo "::error::testing_tools.karpenter is not set in .settings.yaml"
      exit 1
    fi
    export KARPENTER_VERSION
    bash kwok/scripts/install-karpenter-kwok.sh

    echo "=== Creating NodePool and KWOKNodeClass ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
      -f kwok/manifests/karpenter/nodepool.yaml

    echo "=== Verifying external metrics API has GPU metrics ==="
    EXT_AVAILABLE=false
    for i in $(seq 1 12); do
      # '|| EXT_METRICS=""' keeps a transient apiserver error from aborting
      # the retry loop: under 'set -e' a failing command substitution in a
      # plain assignment exits the whole script.
      EXT_METRICS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
        /apis/external.metrics.k8s.io/v1beta1 2>/dev/null) || EXT_METRICS=""
      if [[ -n "${EXT_METRICS}" ]] && echo "${EXT_METRICS}" | jq -e '.resources[]? | select(.name=="dcgm_gpu_memory_used")' >/dev/null 2>&1; then
        echo "External metric dcgm_gpu_memory_used is available"
        EXT_AVAILABLE=true
        break
      fi
      echo "Waiting for external metrics API... (${i}/12)"
      sleep 10
    done
    if [[ "${EXT_AVAILABLE}" != "true" ]]; then
      echo "::error::External metric dcgm_gpu_memory_used not available"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw /apis/external.metrics.k8s.io/v1beta1 2>/dev/null | jq . || true
      exit 1
    fi

    # Query the metric value to confirm it's non-zero (informational only).
    EXT_VALUE=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
      "/apis/external.metrics.k8s.io/v1beta1/namespaces/default/dcgm_gpu_memory_used" 2>/dev/null) || EXT_VALUE=""
    echo "External metric value: $(echo "${EXT_VALUE}" | jq -r '.items[0].value // "N/A"' 2>/dev/null)"

    echo "=== Deploying HPA-driven autoscaling test ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" create namespace autoscaling-test
    kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
      -f kwok/manifests/karpenter/hpa-gpu-scale-test.yaml

    echo "=== Waiting for HPA to read metrics and scale ==="
    HPA_SCALED=false
    for i in $(seq 1 20); do
      # Guarded assignments: see note above about 'set -e' and retry loops.
      DESIRED=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test \
        get hpa gpu-overflow-hpa -o jsonpath='{.status.desiredReplicas}' 2>/dev/null) || DESIRED=""
      CURRENT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test \
        get hpa gpu-overflow-hpa -o jsonpath='{.status.currentReplicas}' 2>/dev/null) || CURRENT=""
      METRICS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test \
        get hpa gpu-overflow-hpa -o jsonpath='{.status.currentMetrics}' 2>/dev/null) || METRICS=""

      # Require a numeric value before the arithmetic test: a non-numeric
      # operand in '[[ ... -gt ... ]]' is an error that would abort the
      # script under 'set -e'.
      if [[ "${DESIRED}" =~ ^[0-9]+$ && "${DESIRED}" -gt 1 ]]; then
        echo "HPA scaled: desired=${DESIRED} current=${CURRENT}"
        echo "HPA metrics: ${METRICS}"
        HPA_SCALED=true
        break
      fi
      echo "Waiting for HPA to compute scaling decision... desired=${DESIRED:-?} (${i}/20)"
      sleep 15
    done
    if [[ "${HPA_SCALED}" != "true" ]]; then
      echo "::error::HPA did not scale beyond 1 replica"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test describe hpa gpu-overflow-hpa 2>/dev/null || true
      exit 1
    fi

    echo "=== Waiting for Karpenter to provision KWOK nodes ==="
    KWOK_NODES=0
    for i in $(seq 1 30); do
      KWOK_NODES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes \
        -l karpenter.sh/nodepool=gpu-autoscaling-test --no-headers 2>/dev/null | wc -l | tr -d ' ') || KWOK_NODES=0
      if [[ "$KWOK_NODES" -gt 0 ]]; then
        echo "Karpenter provisioned ${KWOK_NODES} KWOK GPU node(s)"
        break
      fi
      echo "Waiting for Karpenter to provision nodes... (${i}/30)"
      sleep 10
    done
    if [[ "$KWOK_NODES" -eq 0 ]]; then
      echo "::error::Karpenter did not provision GPU nodes"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n karpenter logs deployment/karpenter --tail=50 2>/dev/null || true
      exit 1
    fi

    echo "=== Verifying nodes have GPU capacity ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes \
      -l karpenter.sh/nodepool=gpu-autoscaling-test \
      -o jsonpath='{range .items[*]}{.metadata.name}: nvidia.com/gpu={.status.capacity.nvidia\.com/gpu}{"\n"}{end}'

    echo "=== Verifying pods scheduled onto KWOK nodes ==="
    SCHEDULED=0
    TOTAL=0
    for i in $(seq 1 20); do
      SCHEDULED=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -n autoscaling-test \
        --field-selector=status.phase!=Pending --no-headers 2>/dev/null | wc -l | tr -d ' ') || SCHEDULED=0
      TOTAL=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -n autoscaling-test \
        --no-headers 2>/dev/null | wc -l | tr -d ' ') || TOTAL=0
      if [[ "$SCHEDULED" -eq "$TOTAL" && "$TOTAL" -gt 1 ]]; then
        echo "All ${TOTAL} GPU pods scheduled successfully (HPA-driven)"
        break
      fi
      echo "Waiting for pods to schedule... (${SCHEDULED}/${TOTAL}, attempt ${i}/20)"
      sleep 5
    done
    if [[ "$TOTAL" -le 1 ]]; then
      echo "::error::HPA did not create additional replicas"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test describe hpa gpu-overflow-hpa 2>/dev/null || true
      exit 1
    fi

    echo "=== Full chain verified ==="
    echo " GPU metrics → Prometheus → external metrics API → HPA → Deployment scaled"
    echo " → pending pods → Karpenter → ${KWOK_NODES} KWOK node(s) → ${TOTAL} pods scheduled"

    echo "=== Testing scale-down (consolidation) ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" delete namespace autoscaling-test --wait=false
    sleep 15
    for i in $(seq 1 12); do
      KWOK_NODES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes \
        -l karpenter.sh/nodepool=gpu-autoscaling-test --no-headers 2>/dev/null | wc -l | tr -d ' ') || KWOK_NODES=0
      if [[ "$KWOK_NODES" -eq 0 ]]; then
        echo "Karpenter consolidated all KWOK nodes (scale to zero)"
        break
      fi
      echo "Waiting for consolidation... (${KWOK_NODES} nodes remaining, ${i}/12)"
      sleep 10
    done
    # Consolidation is deliberately best-effort: surface leftover nodes as a
    # warning annotation without failing the step (matches prior behavior).
    if [[ "$KWOK_NODES" -ne 0 ]]; then
      echo "::warning::${KWOK_NODES} KWOK node(s) still present after consolidation window"
    fi

    echo "=== Cluster autoscaling validation PASSED ==="
316+
175317
# --- Evidence collection ---
176318

177319
- name: Collect AI conformance evidence

.settings.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ testing_tools:
4444
kwok: 'v0.7.0'
4545
chainsaw: 'v0.2.14'
4646
yq: 'v4.52.4'
47+
karpenter: 'v1.8.0'
4748

4849
# Quality Thresholds
4950
quality:

0 commit comments

Comments
 (0)