Skip to content

Commit bbdac96

Browse files
committed
fix: pin no-claim isolation pod to GPU node for reliable scheduling
The no-claim DRA isolation pod was timing out after 5 minutes on CI because it could be scheduled on the control-plane node (via TolerationOpExists), where busybox:1.37 might not be cached and Docker Hub pulls are rate-limited. This also made the test semantically wrong — testing GPU isolation on a node without GPUs proves nothing.

Fix by setting NodeName on the no-claim pod to the same GPU worker where the DRA test pod ran. This:

- Ensures isolation is proven on the actual GPU node
- Bypasses scheduler-level delays (kai-scheduler, DRA extensions)
- Leverages images already cached on the worker from prior tests

Also adds diagnostic output (phase, container status, node) to the timeout error message for easier triage of future failures.
1 parent 7cf8253 commit bbdac96

File tree

3 files changed

+32
-8
lines changed

3 files changed

+32
-8
lines changed

pkg/validator/checks/conformance/helpers.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,17 @@ func podStuckReason(pod *corev1.Pod) string {
235235
return ""
236236
}
237237

238+
// podWaitingStatus returns the first container's waiting reason and message, or "none"
239+
// if no container is in a waiting state. Used for diagnostic output on timeout.
240+
func podWaitingStatus(pod *corev1.Pod) string {
241+
for _, cs := range pod.Status.ContainerStatuses {
242+
if w := cs.State.Waiting; w != nil {
243+
return fmt.Sprintf("%s: %s", w.Reason, w.Message)
244+
}
245+
}
246+
return "none"
247+
}
248+
238249
// waitForHPAScaleUp polls the HPA until desiredReplicas > currentReplicas.
239250
// This proves the HPA read metrics and computed a scale-up intent. The logPrefix
240251
// is prepended to log messages to distinguish callers (e.g. "pod-autoscaling", "cluster-autoscaling").

pkg/validator/checks/conformance/secure_access_check.go

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,9 @@ func CheckSecureAcceleratorAccess(ctx *checks.ValidationContext) error {
132132
len(pod.Spec.ResourceClaims)))
133133

134134
// Validate isolation: a pod without DRA claims cannot access GPU devices.
135-
report, err := validateDRAIsolation(ctx.Context, ctx.Clientset, run)
135+
// Target the same node as the DRA test pod — isolation must be proven on the
136+
// GPU node, not a control-plane node that has no GPUs in the first place.
137+
report, err := validateDRAIsolation(ctx.Context, ctx.Clientset, run, pod.Spec.NodeName)
136138
if err != nil {
137139
return err
138140
}
@@ -260,10 +262,12 @@ func validateDRAPatterns(ctx context.Context, dynClient dynamic.Interface, pod *
260262

261263
// validateDRAIsolation verifies that a pod WITHOUT DRA ResourceClaims cannot see GPU devices.
262264
// This proves GPU access is truly mediated by DRA — the scheduler does not expose devices
263-
// to pods that lack claims.
264-
func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, run *draTestRun) (*draIsolationReport, error) {
265-
// Create no-claim pod.
266-
pod := buildNoClaimTestPod(run)
265+
// to pods that lack claims. gpuNodeName pins the pod to the same GPU node where the DRA
266+
// test ran, ensuring isolation is proven on a node that actually has GPUs and bypassing
267+
// scheduler-level delays.
268+
func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, run *draTestRun, gpuNodeName string) (*draIsolationReport, error) {
269+
// Create no-claim pod pinned to the GPU node.
270+
pod := buildNoClaimTestPod(run, gpuNodeName)
267271
if _, err := clientset.CoreV1().Pods(draTestNamespace).Create(ctx, pod, metav1.CreateOptions{}); err != nil {
268272
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to create no-claim isolation test pod", err)
269273
}
@@ -279,6 +283,8 @@ func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, r
279283

280284
// Wait for no-claim pod to reach terminal state.
281285
var resultPod *corev1.Pod
286+
var lastPhase corev1.PodPhase
287+
var lastContainerStatus string
282288
waitCtx, cancel := context.WithTimeout(ctx, defaults.DRATestPodTimeout)
283289
defer cancel()
284290

@@ -297,6 +303,9 @@ func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, r
297303
return false, errors.Wrap(errors.ErrCodeInternal,
298304
"failed to get no-claim isolation test pod", getErr)
299305
}
306+
// Track last known state for diagnostics on timeout.
307+
lastPhase = p.Status.Phase
308+
lastContainerStatus = podWaitingStatus(p)
300309
// Fail fast if pod is stuck in a non-recoverable state (e.g. ImagePullBackOff).
301310
if reason := podStuckReason(p); reason != "" {
302311
return false, errors.New(errors.ErrCodeInternal,
@@ -314,7 +323,8 @@ func validateDRAIsolation(ctx context.Context, clientset kubernetes.Interface, r
314323
if err != nil {
315324
if ctx.Err() != nil || waitCtx.Err() != nil {
316325
return nil, errors.Wrap(errors.ErrCodeTimeout,
317-
"no-claim isolation test pod did not complete in time", err)
326+
fmt.Sprintf("no-claim isolation test pod did not complete in time (last phase=%s, status=%s, node=%s)",
327+
lastPhase, lastContainerStatus, gpuNodeName), err)
318328
}
319329
return nil, errors.Wrap(errors.ErrCodeInternal,
320330
"no-claim isolation test pod polling failed", err)
@@ -424,13 +434,16 @@ func buildDRATestPod(run *draTestRun) *corev1.Pod {
424434
// If the cluster properly mediates GPU access through DRA, this pod will not see GPU devices.
425435
// Uses a lightweight image (busybox) since no CUDA libraries are needed — only checking
426436
// whether /dev/nvidia* device files are visible.
427-
func buildNoClaimTestPod(run *draTestRun) *corev1.Pod {
437+
// gpuNodeName pins the pod to the GPU node via NodeName, bypassing the scheduler to ensure
438+
// the isolation test runs on a node that actually has GPUs and avoiding scheduler delays.
439+
func buildNoClaimTestPod(run *draTestRun, gpuNodeName string) *corev1.Pod {
428440
return &corev1.Pod{
429441
ObjectMeta: metav1.ObjectMeta{
430442
Name: run.noClaimPodName,
431443
Namespace: draTestNamespace,
432444
},
433445
Spec: corev1.PodSpec{
446+
NodeName: gpuNodeName,
434447
RestartPolicy: corev1.RestartPolicyNever,
435448
Tolerations: []corev1.Toleration{
436449
{Operator: corev1.TolerationOpExists},

pkg/validator/checks/conformance/secure_access_check_unit_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ func TestCheckSecureAcceleratorAccess(t *testing.T) {
117117
Name: ga.GetName(),
118118
Namespace: draTestNamespace,
119119
},
120-
Spec: *buildNoClaimTestPod(&draTestRun{noClaimPodName: ga.GetName()}).Spec.DeepCopy(),
120+
Spec: *buildNoClaimTestPod(&draTestRun{noClaimPodName: ga.GetName()}, "test-node").Spec.DeepCopy(),
121121
Status: corev1.PodStatus{
122122
Phase: corev1.PodSucceeded,
123123
},

0 commit comments

Comments (0)