@@ -132,7 +132,9 @@ func CheckSecureAcceleratorAccess(ctx *checks.ValidationContext) error {
132132 len (pod .Spec .ResourceClaims )))
133133
134134 // Validate isolation: a pod without DRA claims cannot access GPU devices.
135- report , err := validateDRAIsolation (ctx .Context , ctx .Clientset , run )
135+ // Target the same node as the DRA test pod — isolation must be proven on the
136+ // GPU node, not a control-plane node that has no GPUs in the first place.
137+ report , err := validateDRAIsolation (ctx .Context , ctx .Clientset , run , pod .Spec .NodeName )
136138 if err != nil {
137139 return err
138140 }
@@ -260,10 +262,12 @@ func validateDRAPatterns(ctx context.Context, dynClient dynamic.Interface, pod *
260262
261263// validateDRAIsolation verifies that a pod WITHOUT DRA ResourceClaims cannot see GPU devices.
262264// This proves GPU access is truly mediated by DRA — the scheduler does not expose devices
263- // to pods that lack claims.
264- func validateDRAIsolation (ctx context.Context , clientset kubernetes.Interface , run * draTestRun ) (* draIsolationReport , error ) {
265- // Create no-claim pod.
266- pod := buildNoClaimTestPod (run )
265+ // to pods that lack claims. gpuNodeName pins the pod to the same GPU node where the DRA
266+ // test ran, ensuring isolation is proven on a node that actually has GPUs and bypassing
267+ // scheduler-level delays.
268+ func validateDRAIsolation (ctx context.Context , clientset kubernetes.Interface , run * draTestRun , gpuNodeName string ) (* draIsolationReport , error ) {
269+ // Create no-claim pod pinned to the GPU node.
270+ pod := buildNoClaimTestPod (run , gpuNodeName )
267271 if _ , err := clientset .CoreV1 ().Pods (draTestNamespace ).Create (ctx , pod , metav1.CreateOptions {}); err != nil {
268272 return nil , errors .Wrap (errors .ErrCodeInternal , "failed to create no-claim isolation test pod" , err )
269273 }
@@ -279,6 +283,8 @@ func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, r
279283
280284 // Wait for no-claim pod to reach terminal state.
281285 var resultPod * corev1.Pod
286+ var lastPhase corev1.PodPhase
287+ var lastContainerStatus string
282288 waitCtx , cancel := context .WithTimeout (ctx , defaults .DRATestPodTimeout )
283289 defer cancel ()
284290
@@ -297,6 +303,9 @@ func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, r
297303 return false , errors .Wrap (errors .ErrCodeInternal ,
298304 "failed to get no-claim isolation test pod" , getErr )
299305 }
306+ // Track last known state for diagnostics on timeout.
307+ lastPhase = p .Status .Phase
308+ lastContainerStatus = podWaitingStatus (p )
300309 // Fail fast if pod is stuck in a non-recoverable state (e.g. ImagePullBackOff).
301310 if reason := podStuckReason (p ); reason != "" {
302311 return false , errors .New (errors .ErrCodeInternal ,
@@ -314,7 +323,8 @@ func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, r
314323 if err != nil {
315324 if ctx .Err () != nil || waitCtx .Err () != nil {
316325 return nil , errors .Wrap (errors .ErrCodeTimeout ,
317- "no-claim isolation test pod did not complete in time" , err )
326+ fmt .Sprintf ("no-claim isolation test pod did not complete in time (last phase=%s, status=%s, node=%s)" ,
327+ lastPhase , lastContainerStatus , gpuNodeName ), err )
318328 }
319329 return nil , errors .Wrap (errors .ErrCodeInternal ,
320330 "no-claim isolation test pod polling failed" , err )
@@ -424,13 +434,16 @@ func buildDRATestPod(run *draTestRun) *corev1.Pod {
424434// If the cluster properly mediates GPU access through DRA, this pod will not see GPU devices.
425435// Uses a lightweight image (busybox) since no CUDA libraries are needed — only checking
426436// whether /dev/nvidia* device files are visible.
427- func buildNoClaimTestPod (run * draTestRun ) * corev1.Pod {
437+ // gpuNodeName pins the pod to the GPU node via NodeName, bypassing the scheduler to ensure
438+ // the isolation test runs on a node that actually has GPUs and avoiding scheduler delays.
439+ func buildNoClaimTestPod (run * draTestRun , gpuNodeName string ) * corev1.Pod {
428440 return & corev1.Pod {
429441 ObjectMeta : metav1.ObjectMeta {
430442 Name : run .noClaimPodName ,
431443 Namespace : draTestNamespace ,
432444 },
433445 Spec : corev1.PodSpec {
446+ NodeName : gpuNodeName ,
434447 RestartPolicy : corev1 .RestartPolicyNever ,
435448 Tolerations : []corev1.Toleration {
436449 {Operator : corev1 .TolerationOpExists },