Commit 036b210

[release-0.10] cherry-pick #4813 (#4823)
* Move Integ Test Helpers from FairSharing to Util (#4812)
* Copy MakeWorkloadWithGeneratedName from #4695
* Admit Borrowing Cohort Workloads when Reclaim Guaranteed (#4813)
* Admit Borrowing Cohort Workloads when Reclaim Guaranteed
* Update Metrics Test
1 parent f692c87 commit 036b210

File tree

7 files changed: +367 -49 lines changed


pkg/scheduler/preemption/policy.go

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+/*
+Copyright The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package preemption
+
+import (
+	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
+	"sigs.k8s.io/kueue/pkg/cache"
+)
+
+// CanAlwaysReclaim indicates that the CQ is guaranteed to
+// be able to reclaim the capacity of workloads borrowing
+// its capacity.
+func CanAlwaysReclaim(cq *cache.ClusterQueueSnapshot) bool {
+	return cq.Preemption.ReclaimWithinCohort == kueue.PreemptionPolicyAny
+}
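
The only preemption configuration that makes this predicate true is ReclaimWithinCohort: Any. As an illustration that is not part of this commit's diff, such a ClusterQueue can be built with the wrappers from sigs.k8s.io/kueue/pkg/util/testing, mirroring the "ClusterQueueA" fixture used by the new scheduler test below; the queue, cohort, flavor, and resource names are simply the ones that test happens to use:

// Sketch: a ClusterQueue whose snapshot satisfies CanAlwaysReclaim,
// because ReclaimWithinCohort is set to PreemptionPolicyAny.
cq := utiltesting.MakeClusterQueue("ClusterQueueA").
	Cohort("root").
	ResourceGroup(
		utiltesting.MakeFlavorQuotas("on-demand").Resource("gpu", "2").FlavorQuotas,
	).
	Preemption(kueue.ClusterQueuePreemption{
		ReclaimWithinCohort: kueue.PreemptionPolicyAny,
	}).
	Obj()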

pkg/scheduler/scheduler.go

Lines changed: 12 additions & 4 deletions
@@ -225,10 +225,18 @@ func (s *Scheduler) schedule(ctx context.Context) wait.SpeedSignal {
 
 		if mode == flavorassigner.Preempt && len(e.preemptionTargets) == 0 {
 			log.V(2).Info("Workload requires preemption, but there are no candidate workloads allowed for preemption", "preemption", cq.Preemption)
-			// we use resourcesToReserve to block capacity up to either the nominal capacity,
-			// or the borrowing limit when borrowing, so that a lower priority workload cannot
-			// admit before us.
-			cq.AddUsage(resourcesToReserve(e, cq))
+			// we reserve capacity if we are uncertain
+			// whether we can reclaim the capacity
+			// later. Otherwise, we allow other workloads
+			// in the Cohort to borrow this capacity,
+			// confident we can reclaim it later.
+			if !preemption.CanAlwaysReclaim(cq) {
+				// reserve capacity up to the
+				// borrowing limit, so that
+				// lower-priority workloads in another
+				// Cohort cannot admit before us.
+				cq.AddUsage(resourcesToReserve(e, cq))
+			}
 			continue
 		}

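Concretely, the new guard only skips the capacity reservation when the ClusterQueue's ReclaimWithinCohort policy is Any; for every other value it keeps the previous behavior of blocking the capacity. Below is a small, self-contained sketch of that rule, simplified so the policy value is passed directly instead of a ClusterQueueSnapshot; the "Never" entry is just another illustrative policy value:

package main

import "fmt"

// canAlwaysReclaim mirrors preemption.CanAlwaysReclaim from this commit,
// but takes the ReclaimWithinCohort value directly rather than a
// ClusterQueueSnapshot (a simplification for this sketch).
func canAlwaysReclaim(reclaimWithinCohort string) bool {
	return reclaimWithinCohort == "Any"
}

func main() {
	for _, policy := range []string{"Any", "LowerPriority", "Never"} {
		if canAlwaysReclaim(policy) {
			fmt.Println(policy + ": leave the capacity borrowable; it can be reclaimed later via preemption")
		} else {
			fmt.Println(policy + ": block the capacity (AddUsage) so lower-priority workloads cannot admit first")
		}
	}
}
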
pkg/scheduler/scheduler_test.go

Lines changed: 98 additions & 0 deletions
@@ -2557,6 +2557,104 @@ func TestSchedule(t *testing.T) {
 			"eng-gamma/Admitted-Workload-3": *utiltesting.MakeAdmission("CQ3").Assignment("gpu", "on-demand", "5").Obj(),
 		},
 	},
+	"capacity not blocked when lending clusterqueue can reclaim": {
+		additionalClusterQueues: []kueue.ClusterQueue{
+			*utiltesting.MakeClusterQueue("ClusterQueueA").
+				Cohort("root").
+				ResourceGroup(
+					utiltesting.MakeFlavorQuotas("on-demand").Resource("gpu", "2").FlavorQuotas,
+				).
+				Preemption(kueue.ClusterQueuePreemption{
+					ReclaimWithinCohort: kueue.PreemptionPolicyAny,
+				}).
+				Obj(),
+			*utiltesting.MakeClusterQueue("ClusterQueueB").
+				Cohort("root").
+				ResourceGroup(
+					utiltesting.MakeFlavorQuotas("on-demand").Resource("gpu", "0").FlavorQuotas,
+				).
+				Obj(),
+		},
+		additionalLocalQueues: []kueue.LocalQueue{
+			*utiltesting.MakeLocalQueue("lq", "eng-alpha").ClusterQueue("ClusterQueueA").Obj(),
+			*utiltesting.MakeLocalQueue("lq", "eng-beta").ClusterQueue("ClusterQueueB").Obj(),
+		},
+		workloads: []kueue.Workload{
+			*utiltesting.MakeWorkload("a1-admitted", "eng-alpha").
+				Queue("lq").
+				Request("gpu", "1").
+				SimpleReserveQuota("ClusterQueueA", "on-demand", now).
+				Obj(),
+			*utiltesting.MakeWorkload("a2-pending", "eng-alpha").
+				Queue("lq").
+				Request("gpu", "2").
+				Obj(),
+			*utiltesting.MakeWorkload("b1-pending", "eng-beta").
+				Creation(now).
+				Queue("lq").
+				Request("gpu", "1").
+				Obj(),
+		},
+		wantLeft: nil,
+		wantInadmissibleLeft: map[string][]string{
+			"ClusterQueueA": {"eng-alpha/a2-pending"},
+		},
+		wantScheduled: []string{
+			"eng-beta/b1-pending",
+		},
+		wantAssignments: map[string]kueue.Admission{
+			"eng-alpha/a1-admitted": *utiltesting.MakeAdmission("ClusterQueueA").Assignment("gpu", "on-demand", "1").Obj(),
+			"eng-beta/b1-pending":   *utiltesting.MakeAdmission("ClusterQueueB").Assignment("gpu", "on-demand", "1").Obj(),
+		},
+	},
+	"capacity blocked when lending clusterqueue not guaranteed to reclaim": {
+		additionalClusterQueues: []kueue.ClusterQueue{
+			*utiltesting.MakeClusterQueue("ClusterQueueA").
+				Cohort("root").
+				ResourceGroup(
+					utiltesting.MakeFlavorQuotas("on-demand").Resource("gpu", "2").FlavorQuotas,
+				).
+				Preemption(kueue.ClusterQueuePreemption{
+					ReclaimWithinCohort: kueue.PreemptionPolicyLowerPriority,
+				}).
+				Obj(),
+			*utiltesting.MakeClusterQueue("ClusterQueueB").
+				Cohort("root").
+				ResourceGroup(
+					utiltesting.MakeFlavorQuotas("on-demand").Resource("gpu", "0").FlavorQuotas,
+				).
+				Obj(),
+		},
+		additionalLocalQueues: []kueue.LocalQueue{
+			*utiltesting.MakeLocalQueue("lq", "eng-alpha").ClusterQueue("ClusterQueueA").Obj(),
+			*utiltesting.MakeLocalQueue("lq", "eng-beta").ClusterQueue("ClusterQueueB").Obj(),
+		},
+		workloads: []kueue.Workload{
+			*utiltesting.MakeWorkload("a1-admitted", "eng-alpha").
+				Queue("lq").
+				Request("gpu", "1").
+				SimpleReserveQuota("ClusterQueueA", "on-demand", now).
+				Obj(),
+			*utiltesting.MakeWorkload("a2-pending", "eng-alpha").
+				Queue("lq").
+				Request("gpu", "2").
+				Obj(),
+			*utiltesting.MakeWorkload("b1-pending", "eng-beta").
+				Creation(now).
+				Queue("lq").
+				Request("gpu", "1").
+				Obj(),
+		},
+		wantLeft: map[string][]string{
+			"ClusterQueueB": {"eng-beta/b1-pending"},
+		},
+		wantInadmissibleLeft: map[string][]string{
+			"ClusterQueueA": {"eng-alpha/a2-pending"},
+		},
+		wantAssignments: map[string]kueue.Admission{
+			"eng-alpha/a1-admitted": *utiltesting.MakeAdmission("ClusterQueueA").Assignment("gpu", "on-demand", "1").Obj(),
+		},
+	},
 	}
 
 	for name, tc := range cases {

pkg/util/testing/wrappers.go

Lines changed: 8 additions & 0 deletions
@@ -75,6 +75,14 @@ func MakeWorkload(name, ns string) *WorkloadWrapper {
 	}}
 }
 
+// MakeWorkloadWithGeneratedName creates a wrapper for a Workload with a single pod
+// with a single container.
+func MakeWorkloadWithGeneratedName(namePrefix, ns string) *WorkloadWrapper {
+	wl := MakeWorkload("", ns)
+	wl.GenerateName = namePrefix
+	return wl
+}
+
 func (w *WorkloadWrapper) Obj() *kueue.Workload {
 	return &w.Workload
 }
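
A rough usage sketch, not part of the diff: the new wrapper chains like the existing MakeWorkload one, so a test can let the API server generate the final name from a prefix. The queue name, namespace, and resource amount below are illustrative only:

// Build a Workload whose metadata.generateName is "sample-"; the server
// appends a random suffix on creation. Queue and Request values are
// illustrative only.
wl := utiltesting.MakeWorkloadWithGeneratedName("sample-", "eng-alpha").
	Queue("lq").
	Request("gpu", "1").
	Obj()
gomega.Expect(k8sClient.Create(ctx, wl)).To(gomega.Succeed())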

test/integration/scheduler/fairsharing/fair_sharing_test.go

Lines changed: 4 additions & 45 deletions
@@ -18,20 +18,15 @@ package fairsharing
 
 import (
 	"fmt"
-	"time"
 
 	"github.com/onsi/ginkgo/v2"
 	"github.com/onsi/gomega"
 	corev1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/types"
-	"k8s.io/apimachinery/pkg/util/sets"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
 	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
 	"sigs.k8s.io/kueue/pkg/util/testing"
-	"sigs.k8s.io/kueue/pkg/workload"
 	"sigs.k8s.io/kueue/test/integration/framework"
 	"sigs.k8s.io/kueue/test/util"
 )
@@ -130,7 +125,7 @@ var _ = ginkgo.Describe("Scheduler", func() {
 			util.ExpectClusterQueueWeightedShareMetric(cqShared, 0)
 
 			ginkgo.By("Terminating 4 running workloads in cqA: shared quota is fair-shared")
-			finishRunningWorkloadsInCQ(cqA, 4)
+			util.FinishRunningWorkloadsInCQ(ctx, k8sClient, cqA, 4)
 
 			// Admits 1 from cqA and 3 from cqB.
 			util.ExpectReservingActiveWorkloadsMetric(cqA, 5)
@@ -142,7 +137,7 @@ var _ = ginkgo.Describe("Scheduler", func() {
 			util.ExpectClusterQueueWeightedShareMetric(cqShared, 0)
 
 			ginkgo.By("Terminating 2 more running workloads in cqA: cqB starts to take over shared quota")
-			finishRunningWorkloadsInCQ(cqA, 2)
+			util.FinishRunningWorkloadsInCQ(ctx, k8sClient, cqA, 2)
 
 			// Admits last 1 from cqA and 1 from cqB.
 			util.ExpectReservingActiveWorkloadsMetric(cqA, 4)
@@ -263,7 +258,7 @@ var _ = ginkgo.Describe("Scheduler", func() {
 			util.ExpectPendingWorkloadsMetric(cqB, 5, 0)
 
 			ginkgo.By("Finishing eviction of 4 running workloads in cqA: shared quota is fair-shared")
-			finishEvictionOfWorkloadsInCQ(cqA, 4)
+			util.FinishEvictionOfWorkloadsInCQ(ctx, k8sClient, cqA, 4)
 			util.ExpectReservingActiveWorkloadsMetric(cqB, 4)
 			util.ExpectClusterQueueWeightedShareMetric(cqA, 222)
 			util.ExpectClusterQueueWeightedShareMetric(cqB, 111)
@@ -278,7 +273,7 @@ var _ = ginkgo.Describe("Scheduler", func() {
 			util.ExpectClusterQueueWeightedShareMetric(cqC, 0)
 
 			ginkgo.By("Finishing eviction of 1 running workloads in the CQ with highest usage: cqA")
-			finishEvictionOfWorkloadsInCQ(cqA, 1)
+			util.FinishEvictionOfWorkloadsInCQ(ctx, k8sClient, cqA, 1)
 			util.ExpectReservingActiveWorkloadsMetric(cqC, 1)
 			util.ExpectClusterQueueWeightedShareMetric(cqA, 111)
 			util.ExpectClusterQueueWeightedShareMetric(cqB, 111)
@@ -295,39 +290,3 @@ var _ = ginkgo.Describe("Scheduler", func() {
 		})
 	})
 })
-
-func finishRunningWorkloadsInCQ(cq *kueue.ClusterQueue, n int) {
-	var wList kueue.WorkloadList
-	gomega.ExpectWithOffset(1, k8sClient.List(ctx, &wList)).To(gomega.Succeed())
-	finished := 0
-	for i := 0; i < len(wList.Items) && finished < n; i++ {
-		wl := wList.Items[i]
-		if wl.Status.Admission != nil && string(wl.Status.Admission.ClusterQueue) == cq.Name && !meta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadFinished) {
-			util.FinishWorkloads(ctx, k8sClient, &wl)
-			finished++
-		}
-	}
-	gomega.ExpectWithOffset(1, finished).To(gomega.Equal(n), "Not enough workloads finished")
-}
-
-func finishEvictionOfWorkloadsInCQ(cq *kueue.ClusterQueue, n int) {
-	finished := sets.New[types.UID]()
-	gomega.EventuallyWithOffset(1, func(g gomega.Gomega) {
-		var wList kueue.WorkloadList
-		g.Expect(k8sClient.List(ctx, &wList)).To(gomega.Succeed())
-		for i := 0; i < len(wList.Items) && finished.Len() < n; i++ {
-			wl := wList.Items[i]
-			if wl.Status.Admission == nil || string(wl.Status.Admission.ClusterQueue) != cq.Name {
-				continue
-			}
-			evicted := meta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadEvicted)
-			quotaReserved := meta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadQuotaReserved)
-			if evicted && quotaReserved {
-				workload.UnsetQuotaReservationWithCondition(&wl, "Pending", "Eviction finished by test", time.Now())
-				g.Expect(workload.ApplyAdmissionStatus(ctx, k8sClient, &wl, true)).To(gomega.Succeed())
-				finished.Insert(wl.UID)
-			}
-		}
-		g.Expect(finished.Len()).Should(gomega.Equal(n), "Not enough workloads evicted")
-	}, util.Timeout, util.Interval).Should(gomega.Succeed())
-}
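
The replacement helpers now live in the shared test/util package, moved there by #4812 (included in this cherry-pick). Their exported form is not shown in this diff; the following is only a plausible sketch of FinishRunningWorkloadsInCQ, inferred from the deleted local helper above and from the new call sites util.FinishRunningWorkloadsInCQ(ctx, k8sClient, cqA, n):

package util

import (
	"context"

	"github.com/onsi/gomega"
	"k8s.io/apimachinery/pkg/api/meta"
	"sigs.k8s.io/controller-runtime/pkg/client"

	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
)

// FinishRunningWorkloadsInCQ marks up to n admitted, unfinished Workloads of
// the given ClusterQueue as finished. Sketch only: the body mirrors the
// deleted local helper, with ctx and the client now passed in explicitly.
func FinishRunningWorkloadsInCQ(ctx context.Context, c client.Client, cq *kueue.ClusterQueue, n int) {
	var wList kueue.WorkloadList
	gomega.ExpectWithOffset(1, c.List(ctx, &wList)).To(gomega.Succeed())
	finished := 0
	for i := 0; i < len(wList.Items) && finished < n; i++ {
		wl := wList.Items[i]
		if wl.Status.Admission != nil && string(wl.Status.Admission.ClusterQueue) == cq.Name &&
			!meta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadFinished) {
			FinishWorkloads(ctx, c, &wl)
			finished++
		}
	}
	gomega.ExpectWithOffset(1, finished).To(gomega.Equal(n), "Not enough workloads finished")
}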
