Skip to content

Commit 2cc21ca

Browse files
committed
feat(conformance): capture observed state in evidence artifacts
Replace hardcoded congratulatory strings in conformance evidence artifacts with actual observed cluster state. Each behavioral check now returns a typed report struct capturing real values (HPA desired/current replicas, node counts, scheduling timestamps, webhook rejection codes, etc.).

Also fixes TestSecureAcceleratorAccess flaky failure on GPU Inference CI by:

- Pinning the no-claim isolation pod to the GPU node via NodeName, ensuring isolation is proven on a node that actually has GPUs and bypassing scheduler-level delays
- Adding podStuckReason() helper for fast failure on ImagePullBackOff, CrashLoopBackOff, and Unschedulable states
- Treating K8s client rate limiter errors as retriable
- Pinning busybox image to 1.37 (matching HPA tests)
- Adding diagnostic output (phase, container status, node) to timeout error messages

Deduplicates waitForHPAScalingIntent / waitForClusterAutoHPAScale into a shared waitForHPAScaleUp helper in helpers.go.
1 parent d3a0ad3 commit 2cc21ca

12 files changed

+517
-236
lines changed

.yamllint.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ ignore: |
3232
# KWOK templates (contain shell variable syntax, not valid YAML)
3333
kwok/templates/
3434
35+
# CNCF conformance submission (long prose descriptions exceed line-length)
36+
docs/conformance/cncf/submission/
37+
3538
# Git directory
3639
.git/
3740

pkg/validator/checks/conformance/ai_service_metrics_check.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,34 @@ func checkAIServiceMetricsWithURL(ctx *checks.ValidationContext, promBaseURL str
8383
return errors.New(errors.ErrCodeInternal, "discovery REST client is not available")
8484
}
8585
result := restClient.Get().AbsPath(rawURL).Do(ctx.Context)
86+
var statusCode int
87+
result.StatusCode(&statusCode)
8688
if cmErr := result.Error(); cmErr != nil {
87-
recordArtifact(ctx, "Custom Metrics API", fmt.Sprintf("Status: unavailable\nError: %v", cmErr))
89+
recordArtifact(ctx, "Custom Metrics API",
90+
fmt.Sprintf("Endpoint: %s\nHTTP Status: %d\nStatus: unavailable\nError: %v",
91+
rawURL, statusCode, cmErr))
8892
return errors.Wrap(errors.ErrCodeNotFound,
8993
"custom metrics API not available", cmErr)
9094
}
91-
recordArtifact(ctx, "Custom Metrics API", "Status: available\nEndpoint: /apis/custom.metrics.k8s.io/v1beta1")
95+
96+
groupVersion := "unknown"
97+
resourceCount := 0
98+
discoveryBody, rawErr := result.Raw()
99+
if rawErr == nil {
100+
var discovery struct {
101+
GroupVersion string `json:"groupVersion"`
102+
Resources []json.RawMessage `json:"resources"`
103+
}
104+
if json.Unmarshal(discoveryBody, &discovery) == nil {
105+
if discovery.GroupVersion != "" {
106+
groupVersion = discovery.GroupVersion
107+
}
108+
resourceCount = len(discovery.Resources)
109+
}
110+
}
111+
recordArtifact(ctx, "Custom Metrics API",
112+
fmt.Sprintf("Endpoint: %s\nHTTP Status: %d\nGroupVersion: %s\nAPI Resources: %d\nStatus: available",
113+
rawURL, statusCode, groupVersion, resourceCount))
92114

93115
return nil
94116
}

pkg/validator/checks/conformance/cluster_autoscaling_check.go

Lines changed: 74 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,16 @@ const (
4242
karpenterNodePoolLabel = "karpenter.sh/nodepool"
4343
)
4444

45+
type clusterAutoscalingReport struct {
46+
NodePool string
47+
HPADesired int32
48+
HPACurrent int32
49+
BaselineNodes int
50+
ObservedNodes int
51+
TotalPods int
52+
ScheduledPods int
53+
}
54+
4555
func init() {
4656
checks.RegisterCheck(&checks.Check{
4757
Name: "cluster-autoscaling",
@@ -122,14 +132,19 @@ func CheckClusterAutoscaling(ctx *checks.ValidationContext) error {
122132
var lastErr error
123133
for _, poolName := range gpuNodePoolNames {
124134
slog.Info("attempting behavioral validation with NodePool", "nodePool", poolName)
125-
lastErr = validateClusterAutoscaling(ctx.Context, ctx.Clientset, poolName)
126-
if lastErr == nil {
135+
report, runErr := validateClusterAutoscaling(ctx.Context, ctx.Clientset, poolName)
136+
if runErr == nil {
127137
recordArtifact(ctx, "Cluster Autoscaling Behavioral Test",
128-
fmt.Sprintf("NodePool: %s\nHPA: scaling intent detected\nKarpenter: new node(s) provisioned\nPods: scheduled on new nodes", poolName))
138+
fmt.Sprintf("NodePool: %s\nHPA desired/current: %d/%d\nKarpenter nodes: baseline=%d observed=%d new=%d\nPods scheduled: %d/%d",
139+
report.NodePool,
140+
report.HPADesired, report.HPACurrent,
141+
report.BaselineNodes, report.ObservedNodes, report.ObservedNodes-report.BaselineNodes,
142+
report.ScheduledPods, report.TotalPods))
129143
return nil
130144
}
145+
lastErr = runErr
131146
slog.Debug("behavioral validation failed for NodePool",
132-
"nodePool", poolName, "error", lastErr)
147+
"nodePool", poolName, "error", runErr)
133148
}
134149
return lastErr
135150
}
@@ -138,11 +153,15 @@ func CheckClusterAutoscaling(ctx *checks.ValidationContext) error {
138153
// Deployment + HPA (external metric) → HPA computes scale-up → Karpenter provisions
139154
// KWOK nodes → pods are scheduled. This proves the chain works end-to-end.
140155
// nodePoolName is the discovered GPU NodePool name from the precheck.
141-
func validateClusterAutoscaling(ctx context.Context, clientset kubernetes.Interface, nodePoolName string) error {
156+
func validateClusterAutoscaling(ctx context.Context, clientset kubernetes.Interface, nodePoolName string) (*clusterAutoscalingReport, error) {
157+
report := &clusterAutoscalingReport{
158+
NodePool: nodePoolName,
159+
}
160+
142161
// Generate unique test resource names and namespace (prevents cross-run interference).
143162
b := make([]byte, 4)
144163
if _, err := rand.Read(b); err != nil {
145-
return errors.Wrap(errors.ErrCodeInternal, "failed to generate random suffix", err)
164+
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to generate random suffix", err)
146165
}
147166
suffix := hex.EncodeToString(b)
148167
nsName := clusterAutoTestPrefix + suffix
@@ -154,7 +173,7 @@ func validateClusterAutoscaling(ctx context.Context, clientset kubernetes.Interf
154173
ObjectMeta: metav1.ObjectMeta{Name: nsName},
155174
}
156175
if _, err := clientset.CoreV1().Namespaces().Create(ctx, ns, metav1.CreateOptions{}); k8s.IgnoreAlreadyExists(err) != nil {
157-
return errors.Wrap(errors.ErrCodeInternal, "failed to create cluster autoscaling test namespace", err)
176+
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to create cluster autoscaling test namespace", err)
158177
}
159178

160179
// Cleanup: delete namespace (cascades all resources, triggers Karpenter consolidation).
@@ -175,37 +194,51 @@ func validateClusterAutoscaling(ctx context.Context, clientset kubernetes.Interf
175194
LabelSelector: fmt.Sprintf("%s=%s", karpenterNodePoolLabel, nodePoolName),
176195
})
177196
if err != nil {
178-
return errors.Wrap(errors.ErrCodeInternal, "failed to count baseline Karpenter nodes", err)
197+
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to count baseline Karpenter nodes", err)
179198
}
180199
baselineNodeCount := len(baselineNodes.Items)
200+
report.BaselineNodes = baselineNodeCount
181201
slog.Info("baseline Karpenter node count", "pool", nodePoolName, "count", baselineNodeCount)
182202

183203
// Create Deployment: GPU-requesting pods with Karpenter nodeSelector.
184204
deploy := buildClusterAutoTestDeployment(deployName, nsName, nodePoolName)
185-
if _, err := clientset.AppsV1().Deployments(nsName).Create(
186-
ctx, deploy, metav1.CreateOptions{}); err != nil {
187-
return errors.Wrap(errors.ErrCodeInternal, "failed to create cluster autoscaling test deployment", err)
205+
_, createErr := clientset.AppsV1().Deployments(nsName).Create(
206+
ctx, deploy, metav1.CreateOptions{})
207+
if createErr != nil {
208+
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to create cluster autoscaling test deployment", createErr)
188209
}
189210

190211
// Create HPA targeting external metric dcgm_gpu_power_usage.
191212
hpa := buildClusterAutoTestHPA(hpaName, deployName, nsName)
192-
if _, err := clientset.AutoscalingV2().HorizontalPodAutoscalers(nsName).Create(
193-
ctx, hpa, metav1.CreateOptions{}); err != nil {
194-
return errors.Wrap(errors.ErrCodeInternal, "failed to create cluster autoscaling test HPA", err)
213+
_, createErr = clientset.AutoscalingV2().HorizontalPodAutoscalers(nsName).Create(
214+
ctx, hpa, metav1.CreateOptions{})
215+
if createErr != nil {
216+
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to create cluster autoscaling test HPA", createErr)
195217
}
196218

197219
// Wait for HPA to report scaling intent.
198-
if err := waitForClusterAutoHPAScale(ctx, clientset, nsName, hpaName); err != nil {
199-
return err
220+
desired, current, err := waitForHPAScaleUp(ctx, clientset, nsName, hpaName, "cluster autoscaling")
221+
if err != nil {
222+
return nil, err
200223
}
224+
report.HPADesired = desired
225+
report.HPACurrent = current
201226

202227
// Wait for Karpenter to provision KWOK nodes (above baseline count).
203-
if err := waitForKarpenterNodes(ctx, clientset, nodePoolName, baselineNodeCount); err != nil {
204-
return err
228+
observedNodes, err := waitForKarpenterNodes(ctx, clientset, nodePoolName, baselineNodeCount)
229+
if err != nil {
230+
return nil, err
205231
}
232+
report.ObservedNodes = observedNodes
206233

207234
// Verify pods are scheduled (not Pending) with poll loop.
208-
return verifyPodsScheduled(ctx, clientset, nsName)
235+
totalPods, scheduledPods, err := verifyPodsScheduled(ctx, clientset, nsName)
236+
if err != nil {
237+
return nil, err
238+
}
239+
report.TotalPods = totalPods
240+
report.ScheduledPods = scheduledPods
241+
return report, nil
209242
}
210243

211244
// buildClusterAutoTestDeployment creates a Deployment that requests GPU resources
@@ -317,45 +350,10 @@ func buildClusterAutoTestHPA(name, deployName, namespace string) *autoscalingv2.
317350
}
318351
}
319352

320-
// waitForClusterAutoHPAScale polls the HPA until desiredReplicas > currentReplicas.
321-
func waitForClusterAutoHPAScale(ctx context.Context, clientset kubernetes.Interface, namespace, hpaName string) error {
322-
waitCtx, cancel := context.WithTimeout(ctx, defaults.HPAScaleTimeout)
323-
defer cancel()
324-
325-
err := wait.PollUntilContextCancel(waitCtx, defaults.HPAPollInterval, true,
326-
func(ctx context.Context) (bool, error) {
327-
hpa, getErr := clientset.AutoscalingV2().HorizontalPodAutoscalers(namespace).Get(
328-
ctx, hpaName, metav1.GetOptions{})
329-
if getErr != nil {
330-
slog.Debug("HPA not ready yet", "error", getErr)
331-
return false, nil
332-
}
333-
334-
desired := hpa.Status.DesiredReplicas
335-
current := hpa.Status.CurrentReplicas
336-
slog.Debug("cluster autoscaling HPA status", "desired", desired, "current", current)
337-
338-
if desired > current {
339-
slog.Info("cluster autoscaling HPA scaling intent detected",
340-
"desiredReplicas", desired, "currentReplicas", current)
341-
return true, nil
342-
}
343-
return false, nil
344-
},
345-
)
346-
if err != nil {
347-
if ctx.Err() != nil || waitCtx.Err() != nil {
348-
return errors.Wrap(errors.ErrCodeTimeout,
349-
"HPA did not report scaling intent — external metrics pipeline may be broken", err)
350-
}
351-
return errors.Wrap(errors.ErrCodeInternal, "HPA scaling intent polling failed", err)
352-
}
353-
return nil
354-
}
355-
356353
// waitForKarpenterNodes polls until nodes with the discovered NodePool label exceed the
357354
// baseline count. This proves Karpenter provisioned NEW nodes, not just pre-existing ones.
358-
func waitForKarpenterNodes(ctx context.Context, clientset kubernetes.Interface, nodePoolName string, baselineNodeCount int) error {
355+
func waitForKarpenterNodes(ctx context.Context, clientset kubernetes.Interface, nodePoolName string, baselineNodeCount int) (int, error) {
356+
var observedNodeCount int
359357
waitCtx, cancel := context.WithTimeout(ctx, defaults.KarpenterNodeTimeout)
360358
defer cancel()
361359

@@ -369,29 +367,32 @@ func waitForKarpenterNodes(ctx context.Context, clientset kubernetes.Interface,
369367
return false, nil
370368
}
371369

372-
if len(nodes.Items) > baselineNodeCount {
370+
observedNodeCount = len(nodes.Items)
371+
if observedNodeCount > baselineNodeCount {
373372
slog.Info("Karpenter provisioned new KWOK GPU node(s)",
374-
"total", len(nodes.Items), "baseline", baselineNodeCount,
375-
"new", len(nodes.Items)-baselineNodeCount)
373+
"total", observedNodeCount, "baseline", baselineNodeCount,
374+
"new", observedNodeCount-baselineNodeCount)
376375
return true, nil
377376
}
378377
return false, nil
379378
},
380379
)
381380
if err != nil {
382381
if ctx.Err() != nil || waitCtx.Err() != nil {
383-
return errors.Wrap(errors.ErrCodeTimeout,
382+
return 0, errors.Wrap(errors.ErrCodeTimeout,
384383
"Karpenter did not provision GPU nodes within timeout", err)
385384
}
386-
return errors.Wrap(errors.ErrCodeInternal, "Karpenter node polling failed", err)
385+
return 0, errors.Wrap(errors.ErrCodeInternal, "Karpenter node polling failed", err)
387386
}
388-
return nil
387+
return observedNodeCount, nil
389388
}
390389

391390
// verifyPodsScheduled polls until pods in the unique test namespace are scheduled (not Pending).
392391
// This proves the full chain: HPA → scale → Karpenter → nodes → pods scheduled.
393392
// The namespace is unique per run, so all pods belong to this test — no stale pod interference.
394-
func verifyPodsScheduled(ctx context.Context, clientset kubernetes.Interface, namespace string) error {
393+
func verifyPodsScheduled(ctx context.Context, clientset kubernetes.Interface, namespace string) (int, int, error) {
394+
var observedTotal int
395+
var observedScheduled int
395396
waitCtx, cancel := context.WithTimeout(ctx, defaults.PodScheduleTimeout)
396397
defer cancel()
397398

@@ -403,8 +404,9 @@ func verifyPodsScheduled(ctx context.Context, clientset kubernetes.Interface, na
403404
return false, nil
404405
}
405406

406-
if len(pods.Items) < 2 {
407-
slog.Debug("waiting for HPA-scaled pods", "count", len(pods.Items))
407+
observedTotal = len(pods.Items)
408+
if observedTotal < 2 {
409+
slog.Debug("waiting for HPA-scaled pods", "count", observedTotal)
408410
return false, nil
409411
}
410412

@@ -415,23 +417,24 @@ func verifyPodsScheduled(ctx context.Context, clientset kubernetes.Interface, na
415417
}
416418
}
417419

420+
observedScheduled = scheduled
418421
slog.Debug("cluster autoscaling pod status",
419-
"total", len(pods.Items), "scheduled", scheduled)
422+
"total", observedTotal, "scheduled", observedScheduled)
420423

421-
if scheduled >= 2 {
424+
if observedScheduled >= 2 {
422425
slog.Info("cluster autoscaling pods verified",
423-
"total", len(pods.Items), "scheduled", scheduled)
426+
"total", observedTotal, "scheduled", observedScheduled)
424427
return true, nil
425428
}
426429
return false, nil
427430
},
428431
)
429432
if err != nil {
430433
if ctx.Err() != nil || waitCtx.Err() != nil {
431-
return errors.Wrap(errors.ErrCodeTimeout,
434+
return 0, 0, errors.Wrap(errors.ErrCodeTimeout,
432435
"test pods not scheduled within timeout — Karpenter nodes may not be ready", err)
433436
}
434-
return errors.Wrap(errors.ErrCodeInternal, "pod scheduling verification failed", err)
437+
return 0, 0, errors.Wrap(errors.ErrCodeInternal, "pod scheduling verification failed", err)
435438
}
436-
return nil
439+
return observedTotal, observedScheduled, nil
437440
}

pkg/validator/checks/conformance/cluster_autoscaling_check_unit_test.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,7 @@ func TestValidateClusterAutoscaling(t *testing.T) {
397397
defer cancel()
398398
}
399399

400-
err := validateClusterAutoscaling(ctx, clientset, testNodePool)
400+
report, err := validateClusterAutoscaling(ctx, clientset, testNodePool)
401401

402402
if (err != nil) != tt.wantErr {
403403
t.Errorf("validateClusterAutoscaling() error = %v, wantErr %v", err, tt.wantErr)
@@ -409,6 +409,22 @@ func TestValidateClusterAutoscaling(t *testing.T) {
409409
t.Errorf("validateClusterAutoscaling() error = %v, should contain %q", err, tt.errContains)
410410
}
411411
}
412+
413+
if !tt.wantErr {
414+
if report == nil {
415+
t.Fatal("validateClusterAutoscaling() report = nil, want non-nil")
416+
}
417+
if report.HPADesired != tt.hpaDesired || report.HPACurrent != tt.hpaCurrent {
418+
t.Errorf("report HPA desired/current = %d/%d, want %d/%d",
419+
report.HPADesired, report.HPACurrent, tt.hpaDesired, tt.hpaCurrent)
420+
}
421+
if report.ObservedNodes != tt.kwokNodes {
422+
t.Errorf("report observed nodes = %d, want %d", report.ObservedNodes, tt.kwokNodes)
423+
}
424+
if report.TotalPods != tt.podCount {
425+
t.Errorf("report total pods = %d, want %d", report.TotalPods, tt.podCount)
426+
}
427+
}
412428
})
413429
}
414430
}

0 commit comments

Comments
 (0)