Skip to content

Commit bbdac96

Browse files
committed
fix: pin no-claim isolation pod to GPU node for reliable scheduling
The no-claim DRA isolation pod was timing out after 5 minutes on CI because it could be scheduled on the control-plane node (via TolerationOpExists), where busybox:1.37 might not be cached and Docker Hub pulls are rate-limited. This also made the test semantically wrong — testing GPU isolation on a node without GPUs proves nothing.

Fix by setting NodeName on the no-claim pod to the same GPU worker where the DRA test pod ran. This:

- Ensures isolation is proven on the actual GPU node
- Bypasses scheduler-level delays (kai-scheduler, DRA extensions)
- Leverages images already cached on the worker from prior tests

Also adds diagnostic output (phase, container status, node) to the timeout error message for easier triage of future failures.
1 parent 7cf8253 commit bbdac96

File tree

3 files changed

+32
-8
lines changed

3 files changed

+32
-8
lines changed

pkg/validator/checks/conformance/helpers.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,17 @@ func podStuckReason(pod *corev1.Pod) string {
235235
return ""
236236
}
237237

238+
// podWaitingStatus returns the first container's waiting reason and message, or "none"
239+
// if no container is in a waiting state. Used for diagnostic output on timeout.
240+
func podWaitingStatus(pod *corev1.Pod) string {
241+
for _, cs := range pod.Status.ContainerStatuses {
242+
if w := cs.State.Waiting; w != nil {
243+
return fmt.Sprintf("%s: %s", w.Reason, w.Message)
244+
}
245+
}
246+
return "none"
247+
}
248+
238249
// waitForHPAScaleUp polls the HPA until desiredReplicas > currentReplicas.
239250
// This proves the HPA read metrics and computed a scale-up intent. The logPrefix
240251
// is prepended to log messages to distinguish callers (e.g. "pod-autoscaling", "cluster-autoscaling").

pkg/validator/checks/conformance/secure_access_check.go

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,9 @@ func CheckSecureAcceleratorAccess(ctx *checks.ValidationContext) error {
132132
len(pod.Spec.ResourceClaims)))
133133

134134
// Validate isolation: a pod without DRA claims cannot access GPU devices.
135-
report, err := validateDRAIsolation(ctx.Context, ctx.Clientset, run)
135+
// Target the same node as the DRA test pod — isolation must be proven on the
136+
// GPU node, not a control-plane node that has no GPUs in the first place.
137+
report, err := validateDRAIsolation(ctx.Context, ctx.Clientset, run, pod.Spec.NodeName)
136138
if err != nil {
137139
return err
138140
}
@@ -260,10 +262,12 @@ func validateDRAPatterns(ctx context.Context, dynClient dynamic.Interface, pod *
260262

261263
// validateDRAIsolation verifies that a pod WITHOUT DRA ResourceClaims cannot see GPU devices.
262264
// This proves GPU access is truly mediated by DRA — the scheduler does not expose devices
263-
// to pods that lack claims.
264-
func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, run *draTestRun) (*draIsolationReport, error) {
265-
// Create no-claim pod.
266-
pod := buildNoClaimTestPod(run)
265+
// to pods that lack claims. gpuNodeName pins the pod to the same GPU node where the DRA
266+
// test ran, ensuring isolation is proven on a node that actually has GPUs and bypassing
267+
// scheduler-level delays.
268+
func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, run *draTestRun, gpuNodeName string) (*draIsolationReport, error) {
269+
// Create no-claim pod pinned to the GPU node.
270+
pod := buildNoClaimTestPod(run, gpuNodeName)
267271
if _, err := clientset.CoreV1().Pods(draTestNamespace).Create(ctx, pod, metav1.CreateOptions{}); err != nil {
268272
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to create no-claim isolation test pod", err)
269273
}
@@ -279,6 +283,8 @@ func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, r
279283

280284
// Wait for no-claim pod to reach terminal state.
281285
var resultPod *corev1.Pod
286+
var lastPhase corev1.PodPhase
287+
var lastContainerStatus string
282288
waitCtx, cancel := context.WithTimeout(ctx, defaults.DRATestPodTimeout)
283289
defer cancel()
284290

@@ -297,6 +303,9 @@ func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, r
297303
return false, errors.Wrap(errors.ErrCodeInternal,
298304
"failed to get no-claim isolation test pod", getErr)
299305
}
306+
// Track last known state for diagnostics on timeout.
307+
lastPhase = p.Status.Phase
308+
lastContainerStatus = podWaitingStatus(p)
300309
// Fail fast if pod is stuck in a non-recoverable state (e.g. ImagePullBackOff).
301310
if reason := podStuckReason(p); reason != "" {
302311
return false, errors.New(errors.ErrCodeInternal,
@@ -314,7 +323,8 @@ func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, r
314323
if err != nil {
315324
if ctx.Err() != nil || waitCtx.Err() != nil {
316325
return nil, errors.Wrap(errors.ErrCodeTimeout,
317-
"no-claim isolation test pod did not complete in time", err)
326+
fmt.Sprintf("no-claim isolation test pod did not complete in time (last phase=%s, status=%s, node=%s)",
327+
lastPhase, lastContainerStatus, gpuNodeName), err)
318328
}
319329
return nil, errors.Wrap(errors.ErrCodeInternal,
320330
"no-claim isolation test pod polling failed", err)
@@ -424,13 +434,16 @@ func buildDRATestPod(run *draTestRun) *corev1.Pod {
424434
// If the cluster properly mediates GPU access through DRA, this pod will not see GPU devices.
425435
// Uses a lightweight image (busybox) since no CUDA libraries are needed — only checking
426436
// whether /dev/nvidia* device files are visible.
427-
func buildNoClaimTestPod(run *draTestRun) *corev1.Pod {
437+
// gpuNodeName pins the pod to the GPU node via NodeName, bypassing the scheduler to ensure
438+
// the isolation test runs on a node that actually has GPUs and avoiding scheduler delays.
439+
func buildNoClaimTestPod(run *draTestRun, gpuNodeName string) *corev1.Pod {
428440
return &corev1.Pod{
429441
ObjectMeta: metav1.ObjectMeta{
430442
Name: run.noClaimPodName,
431443
Namespace: draTestNamespace,
432444
},
433445
Spec: corev1.PodSpec{
446+
NodeName: gpuNodeName,
434447
RestartPolicy: corev1.RestartPolicyNever,
435448
Tolerations: []corev1.Toleration{
436449
{Operator: corev1.TolerationOpExists},

pkg/validator/checks/conformance/secure_access_check_unit_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ func TestCheckSecureAcceleratorAccess(t *testing.T) {
117117
Name: ga.GetName(),
118118
Namespace: draTestNamespace,
119119
},
120-
Spec: *buildNoClaimTestPod(&draTestRun{noClaimPodName: ga.GetName()}).Spec.DeepCopy(),
120+
Spec: *buildNoClaimTestPod(&draTestRun{noClaimPodName: ga.GetName()}, "test-node").Spec.DeepCopy(),
121121
Status: corev1.PodStatus{
122122
Phase: corev1.PodSucceeded,
123123
},

0 commit comments

Comments (0)