Skip to content

Commit 503ad00

Browse files
committed
feat(ci): add metrics-driven cluster autoscaling validation with Karpenter + KWOK
Add cluster autoscaling validation to both H100 GPU workflows (inference and training). The test validates the full metrics-driven autoscaling chain: DCGM metrics → Prometheus → prometheus-adapter (external metric) → HPA scales Deployment → pending pods → Karpenter → KWOK nodes. New files: - kwok/scripts/install-karpenter-kwok.sh: builds Karpenter KWOK provider via ko and deploys with Helm into kind clusters - kwok/scripts/validate-cluster-autoscaling.sh: reusable E2E script that verifies external metrics, HPA scaling, node provisioning, pod scheduling, and scale-down consolidation - kwok/manifests/karpenter/: NodePool, KWOKNodeClass, HPA test workload, and GPU instance type definitions. Changed files: - recipes/components/prometheus-adapter/values.yaml: add workload-attributed custom metrics, external metrics rules for cluster-wide GPU metrics (power_usage, memory_used, utilization) with namespaced: false, and 30s metrics relist interval - .github/workflows/gpu-h100-{inference,training}-test.yaml: add cluster autoscaling step and trigger paths for karpenter manifests - .settings.yaml: add karpenter v1.8.0 to testing_tools
1 parent 225d551 commit 503ad00

File tree

9 files changed

+914
-20
lines changed

9 files changed

+914
-20
lines changed

.github/workflows/gpu-h100-inference-test.yaml

Lines changed: 67 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ on:
3636
- 'recipes/overlays/kind-inference.yaml'
3737
- 'recipes/overlays/h100-kind-inference.yaml'
3838
- 'recipes/overlays/h100-kind-inference-dynamo.yaml'
39+
- 'kwok/manifests/karpenter/**'
40+
- 'kwok/scripts/install-karpenter-kwok.sh'
41+
- 'kwok/scripts/validate-cluster-autoscaling.sh'
3942
workflow_dispatch: {} # Allow manual runs
4043

4144
permissions:
@@ -169,6 +172,31 @@ jobs:
169172

170173
- name: Deploy Dynamo vLLM smoke test
171174
run: |
175+
# Create kai-scheduler queue for Dynamo (grove-operator sets kai.scheduler/queue=dynamo).
176+
# The kai-scheduler chart creates default-parent-queue + default-queue on install,
177+
# but Dynamo needs its own queue as a child of the parent.
178+
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF'
179+
apiVersion: scheduling.run.ai/v2
180+
kind: Queue
181+
metadata:
182+
name: dynamo
183+
spec:
184+
parentQueue: default-parent-queue
185+
resources:
186+
gpu:
187+
quota: 0
188+
limit: -1
189+
overQuotaWeight: 1
190+
cpu:
191+
quota: 0
192+
limit: -1
193+
overQuotaWeight: 1
194+
memory:
195+
quota: 0
196+
limit: -1
197+
overQuotaWeight: 1
198+
EOF
199+
172200
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
173201
-f tests/manifests/dynamo-vllm-smoke-test.yaml -n dynamo-system
174202
@@ -228,8 +256,10 @@ jobs:
228256
echo "Dynamo vLLM inference smoke test passed."
229257
230258
# --- Accelerator & AI Service Metrics validation (CNCF AI Conformance #4/#5) ---
259+
# Independent of Dynamo — run even if Dynamo deployment fails.
231260

232261
- name: Validate accelerator metrics
262+
if: success() || failure()
233263
run: |
234264
echo "=== DCGM Exporter pod ==="
235265
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
@@ -294,11 +324,14 @@ jobs:
294324
# → custom metrics API) that HPA consumes. Dynamo uses PodCliqueSets (not
295325
# Deployments), so we validate the API directly rather than creating an HPA.
296326
#
297-
# Note: DCGM exporter runs as a DaemonSet in gpu-operator namespace, so
298-
# Prometheus labels GPU metrics with namespace=gpu-operator. We query that
299-
# namespace to validate the full metrics pipeline.
327+
# DCGM exporter pod-mapping relabels metrics with the GPU workload's
328+
# namespace/pod when a GPU is in use. Metrics may appear in gpu-operator
329+
# (idle GPU) or dynamo-system (active workload). prometheus-adapter also
330+
# needs relist cycles (30s each) to discover new label combinations, so
331+
# we poll with retries.
300332

301333
- name: Validate custom metrics for pod autoscaling
334+
if: success() || failure()
302335
run: |
303336
echo "=== Custom metrics API availability ==="
304337
RESOURCES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
@@ -310,23 +343,27 @@ jobs:
310343
echo "Custom metrics API is available"
311344
echo "${RESOURCES}" | jq -r '.resources[].name' 2>/dev/null | head -20
312345
313-
# DCGM exporter runs in gpu-operator namespace, so custom metrics are
314-
# attributed to the DCGM exporter pod there (not workload pods).
315-
METRICS_NS="gpu-operator"
346+
NAMESPACES="gpu-operator dynamo-system"
347+
METRICS="gpu_utilization gpu_memory_used gpu_power_usage"
316348
317-
# At least one GPU metric must be available via the custom metrics API
349+
# Poll for up to 3 minutes — prometheus-adapter relists every 30s and
350+
# avg_over_time(...[2m]) queries need sufficient data points.
318351
HAS_METRICS=false
319-
for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do
320-
echo "=== Query ${METRIC} ==="
321-
RESULT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
322-
"/apis/custom.metrics.k8s.io/v1beta1/namespaces/${METRICS_NS}/pods/*/${METRIC}" 2>/dev/null)
323-
if [[ -n "${RESULT}" ]] && echo "${RESULT}" | jq -e '.items | length > 0' >/dev/null 2>&1; then
324-
echo "${METRIC} metrics available:"
325-
echo "${RESULT}" | jq '.items[] | {pod: .describedObject.name, value: .value}' 2>/dev/null
326-
HAS_METRICS=true
327-
else
328-
echo "::warning::${METRIC} not available in ${METRICS_NS} namespace"
329-
fi
352+
for ATTEMPT in $(seq 1 18); do
353+
for METRIC in ${METRICS}; do
354+
for NS in ${NAMESPACES}; do
355+
RESULT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
356+
"/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null || true)
357+
if [[ -n "${RESULT}" ]] && echo "${RESULT}" | jq -e '.items | length > 0' >/dev/null 2>&1; then
358+
echo "${METRIC} metrics available in ${NS}:"
359+
echo "${RESULT}" | jq '.items[] | {pod: .describedObject.name, value: .value}' 2>/dev/null
360+
HAS_METRICS=true
361+
break 3
362+
fi
363+
done
364+
done
365+
echo "Waiting for custom metrics to appear... (${ATTEMPT}/18)"
366+
sleep 10
330367
done
331368
332369
if [[ "${HAS_METRICS}" != "true" ]]; then
@@ -336,9 +373,16 @@ jobs:
336373
337374
echo "Custom metrics pipeline validated — GPU metrics available for HPA consumption."
338375
376+
# --- Cluster Autoscaling validation ---
377+
378+
- name: Cluster Autoscaling (Karpenter + KWOK)
379+
if: success() || failure()
380+
run: bash kwok/scripts/validate-cluster-autoscaling.sh
381+
339382
# --- DRA GPU allocation test ---
340383

341384
- name: Deploy DRA GPU test
385+
if: success() || failure()
342386
run: |
343387
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
344388
-f docs/conformance/cncf/manifests/dra-gpu-test.yaml
@@ -363,6 +407,7 @@ jobs:
363407
# --- Secure Accelerator Access validation (CNCF AI Conformance #3) ---
364408

365409
- name: Validate secure accelerator access
410+
if: success() || failure()
366411
run: |
367412
echo "=== Verify DRA-mediated access (no hostPath, no device plugin) ==="
368413
@@ -474,8 +519,10 @@ jobs:
474519
echo "=== Custom metrics API ==="
475520
for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do
476521
echo "--- ${METRIC} ---"
477-
kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
478-
"/apis/custom.metrics.k8s.io/v1beta1/namespaces/gpu-operator/pods/*/${METRIC}" 2>/dev/null | jq . || true
522+
for NS in gpu-operator dynamo-system; do
523+
kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
524+
"/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null | jq . || true
525+
done
479526
done
480527
echo "=== prometheus-adapter pods ==="
481528
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true

.github/workflows/gpu-h100-training-test.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ on:
3232
- 'recipes/components/dynamo-platform/**'
3333
- 'recipes/overlays/kind.yaml'
3434
- 'recipes/overlays/h100-kind-training.yaml'
35+
- 'kwok/manifests/karpenter/**'
36+
- 'kwok/scripts/install-karpenter-kwok.sh'
37+
- 'kwok/scripts/validate-cluster-autoscaling.sh'
38+
- 'recipes/components/prometheus-adapter/**'
3539
workflow_dispatch: {} # Allow manual runs
3640

3741
permissions:
@@ -172,6 +176,12 @@ jobs:
172176
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
173177
logs gang-worker-1 2>/dev/null || true
174178
179+
# --- Cluster Autoscaling validation ---
180+
181+
- name: Cluster Autoscaling (Karpenter + KWOK)
182+
if: success() || failure()
183+
run: bash kwok/scripts/validate-cluster-autoscaling.sh
184+
175185
# --- Evidence collection ---
176186

177187
- name: Collect AI conformance evidence

.settings.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ testing_tools:
4444
kwok: 'v0.7.0'
4545
chainsaw: 'v0.2.14'
4646
yq: 'v4.52.4'
47+
karpenter: 'v1.8.0'
4748

4849
# Quality Thresholds
4950
quality:
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# HPA-driven GPU autoscaling test for CNCF AI Conformance cluster_autoscaling.
#
# This validates the full metrics-driven autoscaling chain:
#   GPU workload → DCGM metrics → Prometheus → prometheus-adapter → HPA
#   → Deployment scales → pending pods → Karpenter → KWOK nodes provisioned
#
# The HPA uses an external GPU metric (dcgm_gpu_power_usage) which is always > 0
# when a GPU exists (idle power draw). When the metric exceeds the low
# threshold, HPA scales the Deployment beyond what the real GPU node can serve,
# causing overflow pods to trigger Karpenter KWOK node provisioning.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-overflow-workers
  namespace: autoscaling-test
  labels:
    app: gpu-overflow-workers
    test: cluster-autoscaling
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gpu-overflow-workers
  template:
    metadata:
      labels:
        app: gpu-overflow-workers
        test: cluster-autoscaling
    spec:
      # Tolerate both the real GPU node taint and the KWOK node taint so
      # overflow replicas can land on Karpenter-provisioned KWOK nodes.
      tolerations:
        - key: nvidia.com/gpu
          operator: Equal
          value: "present"
          effect: NoSchedule
        - key: kwok.x-k8s.io/node
          operator: Exists
          effect: NoSchedule
      # Pin pods to the test NodePool so scale-up pressure is attributed to it.
      nodeSelector:
        karpenter.sh/nodepool: gpu-autoscaling-test
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: gpu-workload
          image: ubuntu:22.04
          # Short-lived sleep keeps the pod schedulable without doing real work;
          # the GPU request alone is what drives Karpenter provisioning.
          command: ["sleep", "120"]
          resources:
            limits:
              nvidia.com/gpu: "1"
            requests:
              nvidia.com/gpu: "1"
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
      restartPolicy: Always
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: gpu-overflow-hpa
  namespace: autoscaling-test
  labels:
    test: cluster-autoscaling
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: gpu-overflow-workers
  minReplicas: 1
  maxReplicas: 4
  metrics:
    # External metric: cluster-wide average GPU power draw (watts).
    # Power usage is always > 0 when a GPU exists (idle H100 draws ~46W).
    # With a low threshold, this reliably triggers scale-up.
    - type: External
      external:
        metric:
          name: dcgm_gpu_power_usage
        target:
          type: AverageValue
          averageValue: "10"
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
[
2+
{
3+
"name": "p5.48xlarge",
4+
"offerings": [
5+
{
6+
"Price": 98.32,
7+
"Available": true,
8+
"Requirements": [
9+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
10+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1a"] }
11+
]
12+
},
13+
{
14+
"Price": 98.32,
15+
"Available": true,
16+
"Requirements": [
17+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
18+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1b"] }
19+
]
20+
}
21+
],
22+
"architecture": "amd64",
23+
"operatingSystems": ["linux"],
24+
"resources": {
25+
"cpu": "192",
26+
"memory": "2048Gi",
27+
"ephemeral-storage": "3800Gi",
28+
"nvidia.com/gpu": "8"
29+
}
30+
},
31+
{
32+
"name": "g5.xlarge",
33+
"offerings": [
34+
{
35+
"Price": 1.006,
36+
"Available": true,
37+
"Requirements": [
38+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
39+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1a"] }
40+
]
41+
},
42+
{
43+
"Price": 1.006,
44+
"Available": true,
45+
"Requirements": [
46+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
47+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1b"] }
48+
]
49+
}
50+
],
51+
"architecture": "amd64",
52+
"operatingSystems": ["linux"],
53+
"resources": {
54+
"cpu": "4",
55+
"memory": "16Gi",
56+
"ephemeral-storage": "250Gi",
57+
"nvidia.com/gpu": "1"
58+
}
59+
},
60+
{
61+
"name": "g5.2xlarge",
62+
"offerings": [
63+
{
64+
"Price": 1.212,
65+
"Available": true,
66+
"Requirements": [
67+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
68+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1a"] }
69+
]
70+
},
71+
{
72+
"Price": 1.212,
73+
"Available": true,
74+
"Requirements": [
75+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
76+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1b"] }
77+
]
78+
}
79+
],
80+
"architecture": "amd64",
81+
"operatingSystems": ["linux"],
82+
"resources": {
83+
"cpu": "8",
84+
"memory": "32Gi",
85+
"ephemeral-storage": "450Gi",
86+
"nvidia.com/gpu": "1"
87+
}
88+
}
89+
]

0 commit comments

Comments
 (0)