Skip to content

Commit a94382b

Browse files
committed
fix: address code review findings in KubeRay e2e tests
- Use NestedFieldNoCopy+BeNumerically instead of NestedInt64 for minCount assertion (JSON numbers are float64, not int64)
- Strengthen gang scheduling assertion to verify all-or-nothing: at most 1 worker Running, >= 19 Pending (not just pendingCount > 0)
- Hardcode apiServer feature-gates consistently with controller-manager and scheduler (remove K8S_FEATURE_GATES override that could cause component feature gate mismatch)
1 parent f16dc8c commit a94382b

2 files changed

Lines changed: 18 additions & 11 deletions

File tree

templates/test/ci/cluster-template-prow-ci-version-native-scheduling.yaml

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

test/e2e/azure_kuberay.go

Lines changed: 17 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -487,17 +487,17 @@ func KubeRayNativeSchedulingSpec(ctx context.Context, inputGetter func() KubeRay
487487
workerPGName = rayClusterName + "-worker-small-group"
488488

489489
By("verifying the worker PodGroup minCount is updated to 20")
490-
Eventually(func() int64 {
490+
Eventually(func() interface{} {
491491
pg, err := dynamicClient.Resource(podGroupGVR).Namespace(corev1.NamespaceDefault).Get(ctx, workerPGName, metav1.GetOptions{})
492492
if err != nil {
493-
return 0
493+
return nil
494494
}
495-
minCount, found, _ := unstructured.NestedInt64(pg.Object, "spec", "schedulingPolicy", "gang", "minCount")
495+
minCount, found, _ := unstructured.NestedFieldNoCopy(pg.Object, "spec", "schedulingPolicy", "gang", "minCount")
496496
if !found {
497-
return 0
497+
return nil
498498
}
499499
return minCount
500-
}, e2eConfig.GetIntervals(specName, "wait-workload-ready")...).Should(Equal(int64(20)), "worker PodGroup minCount should be updated to 20")
500+
}, e2eConfig.GetIntervals(specName, "wait-workload-ready")...).Should(BeNumerically("==", 20), "worker PodGroup minCount should be updated to 20")
501501

502502
By("verifying worker pods are Pending due to gang scheduling (all-or-nothing)")
503503
Eventually(func() bool {
@@ -507,16 +507,23 @@ func KubeRayNativeSchedulingSpec(ctx context.Context, inputGetter func() KubeRay
507507
if err != nil {
508508
return false
509509
}
510+
runningCount := 0
510511
pendingCount := 0
511512
for _, pod := range pods.Items {
512-
if pod.Status.Phase == corev1.PodPending {
513+
switch pod.Status.Phase {
514+
case corev1.PodRunning:
515+
runningCount++
516+
case corev1.PodPending:
513517
pendingCount++
514518
}
515519
}
516-
Logf("Worker pods: %d total, %d Pending", len(pods.Items), pendingCount)
517-
return pendingCount > 0
518-
}, e2eConfig.GetIntervals(specName, "wait-workload-ready")...).Should(BeTrue(), "expected Pending worker pods when scaled beyond cluster capacity")
519-
Logf("Gang scheduling verified: worker pods are Pending when replicas exceed available resources")
520+
Logf("Worker pods: %d total, %d Running, %d Pending", len(pods.Items), runningCount, pendingCount)
521+
// Gang scheduling all-or-nothing: with minCount=20 and insufficient resources,
522+
// the scheduler should not schedule any new pods in the gang. At most 1 worker
523+
// may remain Running from before the scale-up.
524+
return runningCount <= 1 && pendingCount >= 19
525+
}, e2eConfig.GetIntervals(specName, "wait-workload-ready")...).Should(BeTrue(), "expected gang scheduling to prevent new workers from Running (all-or-nothing)")
526+
Logf("Gang scheduling verified: at most 1 worker Running, rest Pending (all-or-nothing)")
520527

521528
By("scaling workers back to 1 replica to verify recovery")
522529
scaleDownPatch := []byte(`[

0 commit comments

Comments
 (0)