fix: resolve TestSecureAcceleratorAccess flaky failure on GPU Inference CI

dims · dims · commit 55059ffb9acd · 2026-02-24T14:35:19.000-05:00
Pin no-claim isolation pod image from busybox:stable to busybox:1.37
(matches HPA test, better kubelet cache hit). Add podStuckReason()
helper to detect ImagePullBackOff, ErrImagePull, InvalidImageName,
CrashLoopBackOff, and Unschedulable pods and fail fast instead of
polling for 5 minutes. Treat K8s client rate limiter errors as
retriable so they are not misclassified as INTERNAL errors near the
context deadline.
diff --git a/pkg/validator/checks/conformance/helpers.go b/pkg/validator/checks/conformance/helpers.go
@@ -205,6 +205,35 @@ func containsAllMetrics(text string, required []string) []string {
 	return missing
 }
 
+// podStuckReason returns a human-readable reason if pod has a container, then init container, waiting on an
+// image-pull or crash-loop reason, or is marked Unschedulable by its PodScheduled condition; "" if none match.
+// Follows the pattern from pkg/validator/agent/wait.go:getJobFailureReasonFromPod.
+func podStuckReason(pod *corev1.Pod) string {
+	for _, cs := range pod.Status.ContainerStatuses { // regular containers are reported first
+		if w := cs.State.Waiting; w != nil {
+			switch w.Reason {
+			case "ImagePullBackOff", "ErrImagePull", "InvalidImageName", "CrashLoopBackOff":
+				return fmt.Sprintf("%s: %s (image: %s)", w.Reason, w.Message, cs.Image)
+			}
+		}
+	}
+	for _, cs := range pod.Status.InitContainerStatuses { // reached only if no regular container matched
+		if w := cs.State.Waiting; w != nil {
+			switch w.Reason {
+			case "ImagePullBackOff", "ErrImagePull", "InvalidImageName", "CrashLoopBackOff":
+				return fmt.Sprintf("%s: %s (init container, image: %s)", w.Reason, w.Message, cs.Image)
+			}
+		}
+	}
+	for _, cond := range pod.Status.Conditions { // scheduling failure is checked last
+		if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse &&
+			cond.Reason == string(corev1.PodReasonUnschedulable) {
+			return fmt.Sprintf("Unschedulable: %s", cond.Message)
+		}
+	}
+	return ""
+}
+
 // waitForHPAScaleUp polls the HPA until desiredReplicas > currentReplicas.
 // This proves the HPA read metrics and computed a scale-up intent. The logPrefix
 // is prepended to log messages to distinguish callers (e.g. "pod-autoscaling", "cluster-autoscaling").
diff --git a/pkg/validator/checks/conformance/secure_access_check.go b/pkg/validator/checks/conformance/secure_access_check.go
@@ -178,13 +178,22 @@ func waitForDRATestPod(ctx context.Context, clientset kubernetes.Interface, run
 
 	err := wait.PollUntilContextCancel(waitCtx, defaults.PodPollInterval, true,
 		func(ctx context.Context) (bool, error) {
-			pod, err := clientset.CoreV1().Pods(draTestNamespace).Get(
+			pod, getErr := clientset.CoreV1().Pods(draTestNamespace).Get(
 				ctx, run.podName, metav1.GetOptions{})
-			if err != nil {
-				if k8serrors.IsNotFound(err) {
+			if getErr != nil {
+				if k8serrors.IsNotFound(getErr) {
 					return false, nil // pod not yet visible after create, keep polling
 				}
-				return false, errors.Wrap(errors.ErrCodeInternal, "failed to get DRA test pod", err)
+				// K8s client rate limiter fires near context deadline — retry gracefully.
+				if strings.Contains(getErr.Error(), "rate limiter") {
+					return false, nil
+				}
+				return false, errors.Wrap(errors.ErrCodeInternal, "failed to get DRA test pod", getErr)
+			}
+			// Fail fast if pod is stuck in a non-recoverable state (e.g. ImagePullBackOff).
+			if reason := podStuckReason(pod); reason != "" {
+				return false, errors.New(errors.ErrCodeInternal,
+					fmt.Sprintf("DRA test pod stuck: %s", reason))
 			}
 			switch pod.Status.Phase { //nolint:exhaustive // only terminal states matter
 			case corev1.PodSucceeded, corev1.PodFailed:
@@ -275,14 +284,23 @@ func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, r
 
 	err := wait.PollUntilContextCancel(waitCtx, defaults.PodPollInterval, true,
 		func(ctx context.Context) (bool, error) {
-			p, err := clientset.CoreV1().Pods(draTestNamespace).Get(
+			p, getErr := clientset.CoreV1().Pods(draTestNamespace).Get(
 				ctx, run.noClaimPodName, metav1.GetOptions{})
-			if err != nil {
-				if k8serrors.IsNotFound(err) {
+			if getErr != nil {
+				if k8serrors.IsNotFound(getErr) {
 					return false, nil // pod not yet visible after create, keep polling
 				}
+				// K8s client rate limiter fires near context deadline — retry gracefully.
+				if strings.Contains(getErr.Error(), "rate limiter") {
+					return false, nil
+				}
 				return false, errors.Wrap(errors.ErrCodeInternal,
-					"failed to get no-claim isolation test pod", err)
+					"failed to get no-claim isolation test pod", getErr)
+			}
+			// Fail fast if pod is stuck in a non-recoverable state (e.g. ImagePullBackOff).
+			if reason := podStuckReason(p); reason != "" {
+				return false, errors.New(errors.ErrCodeInternal,
+					fmt.Sprintf("no-claim isolation test pod stuck: %s", reason))
 			}
 			switch p.Status.Phase { //nolint:exhaustive // only terminal states matter
 			case corev1.PodSucceeded, corev1.PodFailed:
@@ -420,7 +438,7 @@ func buildNoClaimTestPod(run *draTestRun) *corev1.Pod {
 			Containers: []corev1.Container{
 				{
 					Name:  "isolation-test",
-					Image: "busybox:stable",
+					Image: "busybox:1.37",
 					Command: []string{
 						"sh", "-c",
 						"if ls /dev/nvidia* 2>/dev/null; then echo 'FAIL: GPU visible without DRA claim' && exit 1; else echo 'PASS: GPU isolated' && exit 0; fi",