|
32 | 32 | - 'recipes/components/dynamo-platform/**' |
33 | 33 | - 'recipes/overlays/kind.yaml' |
34 | 34 | - 'recipes/overlays/h100-kind-training.yaml' |
| 35 | + - 'kwok/manifests/karpenter/**' |
| 36 | + - 'kwok/scripts/install-karpenter-kwok.sh' |
| 37 | + - 'recipes/components/prometheus-adapter/**' |
35 | 38 | workflow_dispatch: {} # Allow manual runs |
36 | 39 |
|
37 | 40 | permissions: |
@@ -172,6 +175,145 @@ jobs: |
172 | 175 | kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \ |
173 | 176 | logs gang-worker-1 2>/dev/null || true |
174 | 177 |
|
| 178 | + # --- Cluster Autoscaling validation (CNCF AI Conformance #8a) --- |
| 179 | + # Validates the full metrics-driven autoscaling chain: |
| 180 | + # GPU workload → DCGM metrics → Prometheus → prometheus-adapter (external metric) |
| 181 | + # → HPA scales Deployment → pending pods → Karpenter → KWOK nodes provisioned |
| 182 | + # |
| 183 | + # Uses dcgm_gpu_memory_used external metric (always > 0 when a GPU exists) |
| 184 | + # to trigger HPA scaling, which overflows onto Karpenter-provisioned KWOK nodes. |
| 185 | + |
- name: "CNCF AI Conformance #8a - Cluster Autoscaling (Karpenter + KWOK)"
  run: |
    # Exercises the metrics-driven autoscaling chain end to end:
    #   1. install Karpenter with the KWOK provider and apply the NodePool manifest
    #   2. wait for the external metric dcgm_gpu_memory_used to be listed
    #   3. apply the HPA test manifest and wait for desiredReplicas > 1
    #   4. wait for nodes labelled karpenter.sh/nodepool=gpu-autoscaling-test
    #   5. wait until more than one pod exists and none is left in Pending
    #   6. delete the test namespace and watch the nodes being removed
    # Steps 2-5 fail the job on timeout; step 6 only emits a warning.
    set -euo pipefail

    : "${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"

    # kubectl pinned to the kind cluster under test.
    kctl() {
      kubectl --context="kind-${KIND_CLUSTER_NAME}" "$@"
    }

    # Prints how many rows `kubectl <args> --no-headers` returns.
    # Returns non-zero (printing nothing) when kubectl itself fails, so the
    # caller decides whether a failed query means "zero" or "unknown".
    count_rows() {
      local rows
      rows=$(kctl "$@" --no-headers 2>/dev/null) || return 1
      if [[ -z "${rows}" ]]; then
        echo 0
      else
        wc -l <<<"${rows}" | tr -d ' '
      fi
    }

    # Prints .status.<field> of the test HPA, or nothing if the query fails.
    hpa_status() {
      kctl -n autoscaling-test get hpa gpu-overflow-hpa \
        -o "jsonpath={.status.$1}" 2>/dev/null || true
    }

    echo "=== Installing Karpenter with KWOK provider ==="
    # Assign before exporting: `export VAR=$(cmd)` would hide a yq failure.
    KARPENTER_VERSION=$(yq eval '.testing_tools.karpenter' .settings.yaml)
    # Guard against an empty value or a missing key (which yq may print as "null").
    if [[ -z "${KARPENTER_VERSION}" || "${KARPENTER_VERSION}" == "null" ]]; then
      echo "::error::testing_tools.karpenter is not set in .settings.yaml"
      exit 1
    fi
    export KIND_CLUSTER_NAME KARPENTER_VERSION
    bash kwok/scripts/install-karpenter-kwok.sh

    echo "=== Creating NodePool and KWOKNodeClass ==="
    kctl apply -f kwok/manifests/karpenter/nodepool.yaml

    echo "=== Verifying external metrics API has GPU metrics ==="
    ext_available=false
    for ((attempt = 1; attempt <= 12; attempt++)); do
      # `|| true`: a failing query means "not ready yet" and must not trip
      # `set -e`, otherwise the retry loop never gets a second attempt.
      ext_metrics=$(kctl get --raw /apis/external.metrics.k8s.io/v1beta1 2>/dev/null || true)
      if [[ -n "${ext_metrics}" ]] \
        && jq -e '.resources[]? | select(.name=="dcgm_gpu_memory_used")' \
          <<<"${ext_metrics}" >/dev/null 2>&1; then
        echo "External metric dcgm_gpu_memory_used is available"
        ext_available=true
        break
      fi
      echo "Waiting for external metrics API... (${attempt}/12)"
      sleep 10
    done
    if [[ "${ext_available}" != "true" ]]; then
      echo "::error::External metric dcgm_gpu_memory_used not available"
      kctl get --raw /apis/external.metrics.k8s.io/v1beta1 2>/dev/null | jq . || true
      exit 1
    fi

    # Informational only: print the current metric value. This does not gate
    # the test, so a failed query is reported as N/A instead of aborting.
    ext_value_json=$(kctl get --raw \
      "/apis/external.metrics.k8s.io/v1beta1/namespaces/default/dcgm_gpu_memory_used" \
      2>/dev/null || true)
    ext_value=$(jq -r '.items[0].value // "N/A"' <<<"${ext_value_json}" 2>/dev/null || true)
    echo "External metric value: ${ext_value:-N/A}"

    echo "=== Deploying HPA-driven autoscaling test ==="
    kctl create namespace autoscaling-test
    kctl apply -f kwok/manifests/karpenter/hpa-gpu-scale-test.yaml

    echo "=== Waiting for HPA to read metrics and scale ==="
    hpa_scaled=false
    for ((attempt = 1; attempt <= 20; attempt++)); do
      desired=$(hpa_status desiredReplicas)
      # The field is empty until the HPA has computed a decision; require a
      # number before comparing.
      if [[ "${desired}" =~ ^[0-9]+$ ]] && ((desired > 1)); then
        echo "HPA scaled: desired=${desired} current=$(hpa_status currentReplicas)"
        echo "HPA metrics: $(hpa_status currentMetrics)"
        hpa_scaled=true
        break
      fi
      echo "Waiting for HPA to compute scaling decision... desired=${desired:-?} (${attempt}/20)"
      sleep 15
    done
    if [[ "${hpa_scaled}" != "true" ]]; then
      echo "::error::HPA did not scale beyond 1 replica"
      kctl -n autoscaling-test describe hpa gpu-overflow-hpa 2>/dev/null || true
      exit 1
    fi

    echo "=== Waiting for Karpenter to provision KWOK nodes ==="
    KWOK_NODES=0
    for ((attempt = 1; attempt <= 30; attempt++)); do
      # A failed query counts as "no nodes yet" and is retried.
      KWOK_NODES=$(count_rows get nodes -l karpenter.sh/nodepool=gpu-autoscaling-test) || KWOK_NODES=0
      if ((KWOK_NODES > 0)); then
        echo "Karpenter provisioned ${KWOK_NODES} KWOK GPU node(s)"
        break
      fi
      echo "Waiting for Karpenter to provision nodes... (${attempt}/30)"
      sleep 10
    done
    if ((KWOK_NODES == 0)); then
      echo "::error::Karpenter did not provision GPU nodes"
      kctl -n karpenter logs deployment/karpenter --tail=50 2>/dev/null || true
      exit 1
    fi

    echo "=== Verifying nodes have GPU capacity ==="
    kctl get nodes \
      -l karpenter.sh/nodepool=gpu-autoscaling-test \
      -o jsonpath='{range .items[*]}{.metadata.name}: nvidia.com/gpu={.status.capacity.nvidia\.com/gpu}{"\n"}{end}'

    echo "=== Verifying pods scheduled onto KWOK nodes ==="
    # "Scheduled" is approximated as "phase is not Pending".
    SCHEDULED=0
    TOTAL=0
    all_scheduled=false
    for ((attempt = 1; attempt <= 20; attempt++)); do
      SCHEDULED=$(count_rows get pods -n autoscaling-test \
        --field-selector=status.phase!=Pending) || SCHEDULED=0
      TOTAL=$(count_rows get pods -n autoscaling-test) || TOTAL=0
      if ((TOTAL > 1 && SCHEDULED == TOTAL)); then
        echo "All ${TOTAL} GPU pods scheduled successfully (HPA-driven)"
        all_scheduled=true
        break
      fi
      echo "Waiting for pods to schedule... (${SCHEDULED}/${TOTAL}, attempt ${attempt}/20)"
      sleep 5
    done
    if ((TOTAL <= 1)); then
      echo "::error::HPA did not create additional replicas"
      kctl -n autoscaling-test describe hpa gpu-overflow-hpa 2>/dev/null || true
      exit 1
    fi
    # Without this check a timeout with pods still Pending would fall through
    # to the "Full chain verified" banner.
    if [[ "${all_scheduled}" != "true" ]]; then
      echo "::error::Only ${SCHEDULED}/${TOTAL} pods left Pending before the timeout"
      kctl -n autoscaling-test get pods -o wide 2>/dev/null || true
      exit 1
    fi

    echo "=== Full chain verified ==="
    echo " GPU metrics → Prometheus → external metrics API → HPA → Deployment scaled"
    echo " → pending pods → Karpenter → ${KWOK_NODES} KWOK node(s) → ${TOTAL} pods scheduled"

    echo "=== Testing scale-down (consolidation) ==="
    kctl delete namespace autoscaling-test --wait=false
    sleep 15
    consolidated=false
    remaining="?"
    for ((attempt = 1; attempt <= 12; attempt++)); do
      # Only a successful query returning zero rows counts as consolidated; a
      # failed query is "unknown" and is retried rather than read as zero.
      if remaining=$(count_rows get nodes -l karpenter.sh/nodepool=gpu-autoscaling-test); then
        if ((remaining == 0)); then
          echo "Karpenter consolidated all KWOK nodes (scale to zero)"
          consolidated=true
          break
        fi
      else
        remaining="?"
      fi
      echo "Waiting for consolidation... (${remaining} nodes remaining, ${attempt}/12)"
      sleep 10
    done
    # Scale-down stays best-effort (it never failed the job before), but a
    # timeout is now surfaced instead of passing silently.
    if [[ "${consolidated}" != "true" ]]; then
      echo "::warning::Karpenter did not remove all KWOK nodes within the timeout (${remaining} remaining)"
    fi

    echo "=== Cluster autoscaling validation PASSED ==="
| 316 | +
|
175 | 317 | # --- Evidence collection --- |
176 | 318 |
|
177 | 319 | - name: Collect AI conformance evidence |
|
0 commit comments