Skip to content

Commit 55059ff

Browse files
committed
fix: resolve TestSecureAcceleratorAccess flaky failure on GPU Inference CI
Pin the no-claim isolation pod image from busybox:stable to busybox:1.37 (matches the HPA test and improves the kubelet image-cache hit rate). Add a podStuckReason() helper that detects ImagePullBackOff, ErrImagePull, InvalidImageName, CrashLoopBackOff, and Unschedulable pods so the checks fail fast instead of polling for 5 minutes. Treat K8s client rate limiter errors as retryable so they are not misclassified as INTERNAL errors near the context deadline.
1 parent ad6e66b commit 55059ff

File tree

2 files changed

+56
-9
lines changed

2 files changed

+56
-9
lines changed

pkg/validator/checks/conformance/helpers.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,35 @@ func containsAllMetrics(text string, required []string) []string {
205205
return missing
206206
}
207207

208+
// podStuckReason inspects a Pod for non-recoverable stuck states and returns a
209+
// human-readable reason. Returns empty string if the pod is not stuck.
210+
// Follows the pattern from pkg/validator/agent/wait.go:getJobFailureReasonFromPod.
211+
func podStuckReason(pod *corev1.Pod) string {
212+
for _, cs := range pod.Status.ContainerStatuses {
213+
if w := cs.State.Waiting; w != nil {
214+
switch w.Reason {
215+
case "ImagePullBackOff", "ErrImagePull", "InvalidImageName", "CrashLoopBackOff":
216+
return fmt.Sprintf("%s: %s (image: %s)", w.Reason, w.Message, cs.Image)
217+
}
218+
}
219+
}
220+
for _, cs := range pod.Status.InitContainerStatuses {
221+
if w := cs.State.Waiting; w != nil {
222+
switch w.Reason {
223+
case "ImagePullBackOff", "ErrImagePull", "InvalidImageName", "CrashLoopBackOff":
224+
return fmt.Sprintf("%s: %s (init container, image: %s)", w.Reason, w.Message, cs.Image)
225+
}
226+
}
227+
}
228+
for _, cond := range pod.Status.Conditions {
229+
if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse &&
230+
cond.Reason == string(corev1.PodReasonUnschedulable) {
231+
return fmt.Sprintf("Unschedulable: %s", cond.Message)
232+
}
233+
}
234+
return ""
235+
}
236+
208237
// waitForHPAScaleUp polls the HPA until desiredReplicas > currentReplicas.
209238
// This proves the HPA read metrics and computed a scale-up intent. The logPrefix
210239
// is prepended to log messages to distinguish callers (e.g. "pod-autoscaling", "cluster-autoscaling").

pkg/validator/checks/conformance/secure_access_check.go

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -178,13 +178,22 @@ func waitForDRATestPod(ctx context.Context, clientset kubernetes.Interface, run
178178

179179
err := wait.PollUntilContextCancel(waitCtx, defaults.PodPollInterval, true,
180180
func(ctx context.Context) (bool, error) {
181-
pod, err := clientset.CoreV1().Pods(draTestNamespace).Get(
181+
pod, getErr := clientset.CoreV1().Pods(draTestNamespace).Get(
182182
ctx, run.podName, metav1.GetOptions{})
183-
if err != nil {
184-
if k8serrors.IsNotFound(err) {
183+
if getErr != nil {
184+
if k8serrors.IsNotFound(getErr) {
185185
return false, nil // pod not yet visible after create, keep polling
186186
}
187-
return false, errors.Wrap(errors.ErrCodeInternal, "failed to get DRA test pod", err)
187+
// K8s client rate limiter fires near context deadline — retry gracefully.
188+
if strings.Contains(getErr.Error(), "rate limiter") {
189+
return false, nil
190+
}
191+
return false, errors.Wrap(errors.ErrCodeInternal, "failed to get DRA test pod", getErr)
192+
}
193+
// Fail fast if pod is stuck in a non-recoverable state (e.g. ImagePullBackOff).
194+
if reason := podStuckReason(pod); reason != "" {
195+
return false, errors.New(errors.ErrCodeInternal,
196+
fmt.Sprintf("DRA test pod stuck: %s", reason))
188197
}
189198
switch pod.Status.Phase { //nolint:exhaustive // only terminal states matter
190199
case corev1.PodSucceeded, corev1.PodFailed:
@@ -275,14 +284,23 @@ func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, r
275284

276285
err := wait.PollUntilContextCancel(waitCtx, defaults.PodPollInterval, true,
277286
func(ctx context.Context) (bool, error) {
278-
p, err := clientset.CoreV1().Pods(draTestNamespace).Get(
287+
p, getErr := clientset.CoreV1().Pods(draTestNamespace).Get(
279288
ctx, run.noClaimPodName, metav1.GetOptions{})
280-
if err != nil {
281-
if k8serrors.IsNotFound(err) {
289+
if getErr != nil {
290+
if k8serrors.IsNotFound(getErr) {
282291
return false, nil // pod not yet visible after create, keep polling
283292
}
293+
// K8s client rate limiter fires near context deadline — retry gracefully.
294+
if strings.Contains(getErr.Error(), "rate limiter") {
295+
return false, nil
296+
}
284297
return false, errors.Wrap(errors.ErrCodeInternal,
285-
"failed to get no-claim isolation test pod", err)
298+
"failed to get no-claim isolation test pod", getErr)
299+
}
300+
// Fail fast if pod is stuck in a non-recoverable state (e.g. ImagePullBackOff).
301+
if reason := podStuckReason(p); reason != "" {
302+
return false, errors.New(errors.ErrCodeInternal,
303+
fmt.Sprintf("no-claim isolation test pod stuck: %s", reason))
286304
}
287305
switch p.Status.Phase { //nolint:exhaustive // only terminal states matter
288306
case corev1.PodSucceeded, corev1.PodFailed:
@@ -420,7 +438,7 @@ func buildNoClaimTestPod(run *draTestRun) *corev1.Pod {
420438
Containers: []corev1.Container{
421439
{
422440
Name: "isolation-test",
423-
Image: "busybox:stable",
441+
Image: "busybox:1.37",
424442
Command: []string{
425443
"sh", "-c",
426444
"if ls /dev/nvidia* 2>/dev/null; then echo 'FAIL: GPU visible without DRA claim' && exit 1; else echo 'PASS: GPU isolated' && exit 0; fi",

0 commit comments

Comments
 (0)