@@ -178,13 +178,22 @@ func waitForDRATestPod(ctx context.Context, clientset kubernetes.Interface, run
178178
179179 err := wait .PollUntilContextCancel (waitCtx , defaults .PodPollInterval , true ,
180180 func (ctx context.Context ) (bool , error ) {
181- pod , err := clientset .CoreV1 ().Pods (draTestNamespace ).Get (
181+ pod , getErr := clientset .CoreV1 ().Pods (draTestNamespace ).Get (
182182 ctx , run .podName , metav1.GetOptions {})
183- if err != nil {
184- if k8serrors .IsNotFound (err ) {
183+ if getErr != nil {
184+ if k8serrors .IsNotFound (getErr ) {
185185 return false , nil // pod not yet visible after create, keep polling
186186 }
187- return false , errors .Wrap (errors .ErrCodeInternal , "failed to get DRA test pod" , err )
187+ // K8s client rate limiter fires near context deadline — retry gracefully.
188+ if strings .Contains (getErr .Error (), "rate limiter" ) {
189+ return false , nil
190+ }
191+ return false , errors .Wrap (errors .ErrCodeInternal , "failed to get DRA test pod" , getErr )
192+ }
193+ // Fail fast if pod is stuck in a non-recoverable state (e.g. ImagePullBackOff).
194+ if reason := podStuckReason (pod ); reason != "" {
195+ return false , errors .New (errors .ErrCodeInternal ,
196+ fmt .Sprintf ("DRA test pod stuck: %s" , reason ))
188197 }
189198 switch pod .Status .Phase { //nolint:exhaustive // only terminal states matter
190199 case corev1 .PodSucceeded , corev1 .PodFailed :
@@ -275,14 +284,23 @@ func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, r
275284
276285 err := wait .PollUntilContextCancel (waitCtx , defaults .PodPollInterval , true ,
277286 func (ctx context.Context ) (bool , error ) {
278- p , err := clientset .CoreV1 ().Pods (draTestNamespace ).Get (
287+ p , getErr := clientset .CoreV1 ().Pods (draTestNamespace ).Get (
279288 ctx , run .noClaimPodName , metav1.GetOptions {})
280- if err != nil {
281- if k8serrors .IsNotFound (err ) {
289+ if getErr != nil {
290+ if k8serrors .IsNotFound (getErr ) {
282291 return false , nil // pod not yet visible after create, keep polling
283292 }
293+ // K8s client rate limiter fires near context deadline — retry gracefully.
294+ if strings .Contains (getErr .Error (), "rate limiter" ) {
295+ return false , nil
296+ }
284297 return false , errors .Wrap (errors .ErrCodeInternal ,
285- "failed to get no-claim isolation test pod" , err )
298+ "failed to get no-claim isolation test pod" , getErr )
299+ }
300+ // Fail fast if pod is stuck in a non-recoverable state (e.g. ImagePullBackOff).
301+ if reason := podStuckReason (p ); reason != "" {
302+ return false , errors .New (errors .ErrCodeInternal ,
303+ fmt .Sprintf ("no-claim isolation test pod stuck: %s" , reason ))
286304 }
287305 switch p .Status .Phase { //nolint:exhaustive // only terminal states matter
288306 case corev1 .PodSucceeded , corev1 .PodFailed :
@@ -420,7 +438,7 @@ func buildNoClaimTestPod(run *draTestRun) *corev1.Pod {
420438 Containers : []corev1.Container {
421439 {
422440 Name : "isolation-test" ,
423- Image : "busybox:stable " ,
441+ Image : "busybox:1.37 " ,
424442 Command : []string {
425443 "sh" , "-c" ,
426444 "if ls /dev/nvidia* 2>/dev/null; then echo 'FAIL: GPU visible without DRA claim' && exit 1; else echo 'PASS: GPU isolated' && exit 0; fi" ,
0 commit comments