Skip to content

Commit 7d879a3

Browse files
committed
wip3
1 parent 73d7f23 commit 7d879a3

File tree

3 files changed

+41
-34
lines changed

3 files changed

+41
-34
lines changed

pkg/controller/admissionchecks/multikueue/workload.go

Lines changed: 27 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -365,31 +365,35 @@ func (w *wlReconciler) reconcileGroup(ctx context.Context, group *wlGroup) (reco
365365
// We'll retry this in the next reconciling.
366366
return reconcile.Result{}, err
367367
}
368-
} else {
369-
// workload eviction on worker cluster
370-
log.V(3).Info("Workload gets evicted in the remote cluster", "cluster", evictedRemote)
368+
return reconcile.Result{}, nil
369+
}
371370

372-
for cluster := range group.remotes {
373-
err := group.RemoveRemoteObjects(ctx, cluster)
374-
if err != nil {
375-
return reconcile.Result{}, err
376-
}
371+
// workload eviction on worker cluster
372+
log.V(3).Info("Workload gets evicted in the remote cluster", "cluster", evictedRemote)
373+
374+
needsACUpdate := acs.State == kueue.CheckStateReady
375+
if err := workload.PatchAdmissionStatus(ctx, w.client, group.local, w.clock, func(wl *kueue.Workload) (bool, error) {
376+
if needsACUpdate {
377+
acs.State = kueue.CheckStatePending
378+
acs.Message = fmt.Sprintf("Workload evicted on worker cluster: %q, resetting for re-admission", *group.local.Status.ClusterName)
379+
acs.LastTransitionTime = metav1.NewTime(w.clock.Now())
380+
workload.SetAdmissionCheckState(&wl.Status.AdmissionChecks, *acs, w.clock)
381+
wl.Status.ClusterName = nil
382+
wl.Status.NominatedClusterNames = nil
377383
}
378-
needsACUpdate := acs.State == kueue.CheckStateReady
379-
if err := workload.PatchAdmissionStatus(ctx, w.client, group.local, w.clock, func(wl *kueue.Workload) (bool, error) {
380-
if needsACUpdate {
381-
acs.State = kueue.CheckStatePending
382-
// update the message
383-
acs.Message = fmt.Sprintf("Workload evicted on worker cluster: %q, resetting for re-admission", *group.local.Status.ClusterName)
384-
// update the transition time since is used to detect the lost worker state.
385-
acs.LastTransitionTime = metav1.NewTime(w.clock.Now())
386-
workload.SetAdmissionCheckState(&group.local.Status.AdmissionChecks, *acs, w.clock)
387-
group.local.Status.ClusterName = nil
388-
group.local.Status.NominatedClusterNames = nil
389-
}
390-
return true, nil
391-
}); err != nil {
392-
log.Error(err, "Failed to patch workload status")
384+
return true, nil
385+
}); err != nil {
386+
log.Error(err, "Failed to patch workload status")
387+
return reconcile.Result{}, err
388+
}
389+
390+
if needsACUpdate {
391+
w.recorder.Eventf(group.local, corev1.EventTypeNormal, "MultiKueue", acs.Message)
392+
}
393+
394+
for cluster := range group.remotes {
395+
if err := client.IgnoreNotFound(group.RemoveRemoteObjects(ctx, cluster)); err != nil {
396+
log.Error(err, "Failed to remove cluster remote objects", "cluster", cluster)
393397
return reconcile.Result{}, err
394398
}
395399
}

pkg/controller/admissionchecks/multikueue/workload_test.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -697,8 +697,8 @@ func TestWlReconcile(t *testing.T) {
697697
*baseWorkloadBuilder.Clone().
698698
AdmissionCheck(kueue.AdmissionCheckState{
699699
Name: "ac1",
700-
State: kueue.CheckStateReady,
701-
Message: `The workload got reservation on "worker1"`,
700+
State: kueue.CheckStatePending,
701+
Message: `Workload evicted on worker cluster: "worker1", resetting for re-admission`,
702702
}).
703703
ControllerReference(batchv1.SchemeGroupVersion.WithKind("Job"), "job1", "uid1").
704704
ReserveQuotaAt(utiltestingapi.MakeAdmission("q1").Obj(), now).
@@ -711,6 +711,14 @@ func TestWlReconcile(t *testing.T) {
711711
wantWorker1Workloads: []kueue.Workload{},
712712
wantWorker1Jobs: []batchv1.Job{},
713713
wantWorker2Workloads: []kueue.Workload{},
714+
wantEvents: []utiltesting.EventRecord{
715+
{
716+
Key: client.ObjectKeyFromObject(baseWorkloadBuilder.Clone().Obj()),
717+
EventType: "Normal",
718+
Reason: "MultiKueue",
719+
Message: `Workload evicted on worker cluster: "worker1", resetting for re-admission`,
720+
},
721+
},
714722
},
715723
"remote wl with reservation (withoutJobManagedBy)": {
716724
features: map[featuregate.Feature]bool{features.MultiKueueBatchJobWithManagedBy: false},
@@ -748,7 +756,6 @@ func TestWlReconcile(t *testing.T) {
748756
}).
749757
ControllerReference(batchv1.SchemeGroupVersion.WithKind("Job"), "job1", "uid1").
750758
ReserveQuotaAt(utiltestingapi.MakeAdmission("q1").Obj(), now).
751-
ClusterName("worker1").
752759
Obj(),
753760
},
754761
wantManagersJobs: []batchv1.Job{
@@ -1760,7 +1767,7 @@ func TestWlReconcile(t *testing.T) {
17601767
// However, other important Status fields (e.g. Conditions) still reflect the change,
17611768
// so we deliberately ignore the Admission field here.
17621769
if features.Enabled(features.WorkloadRequestUseMergePatch) {
1763-
objCheckOpts = append(objCheckOpts, cmpopts.IgnoreFields(kueue.WorkloadStatus{}, "Admission"))
1770+
objCheckOpts = append(objCheckOpts, cmpopts.IgnoreFields(kueue.WorkloadStatus{}, "Admission", "ClusterName"))
17641771
}
17651772

17661773
gotManagersWorkloads := &kueue.WorkloadList{}

test/integration/multikueue/jobs_test.go

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1964,13 +1964,9 @@ var _ = ginkgo.Describe("MultiKueue", ginkgo.Label("area:multikueue", "feature:m
19641964
gomega.Expect(managerTestCluster.client.Get(managerTestCluster.ctx, wlLookupKey, managerWl)).To(gomega.Succeed())
19651965
gomega.Eventually(func(g gomega.Gomega) {
19661966
createdWorkload := &kueue.Workload{}
1967-
err := worker1TestCluster.client.Get(worker1TestCluster.ctx, wlLookupKey, createdWorkload)
1968-
ginkgo.GinkgoLogr.Info("Checking worker1 workload", "workload", createdWorkload != nil, "err", err)
1969-
// g.Expect(err).To(gomega.Succeed())
1970-
// g.Expect(createdWorkload.Spec).To(gomega.BeComparableTo(managerWl.Spec))
1971-
err = worker2TestCluster.client.Get(worker2TestCluster.ctx, wlLookupKey, createdWorkload)
1972-
ginkgo.GinkgoLogr.Info("Checking worker2 workload", "workload", createdWorkload != nil, "err", err)
1973-
g.Expect(err).To(gomega.Succeed())
1967+
g.Expect(worker1TestCluster.client.Get(worker1TestCluster.ctx, wlLookupKey, createdWorkload)).To(gomega.Succeed())
1968+
g.Expect(createdWorkload.Spec).To(gomega.BeComparableTo(managerWl.Spec))
1969+
g.Expect(worker2TestCluster.client.Get(worker2TestCluster.ctx, wlLookupKey, createdWorkload)).To(gomega.Succeed())
19741970
g.Expect(createdWorkload.Spec).To(gomega.BeComparableTo(managerWl.Spec))
19751971
}, util.LongTimeout, util.Interval).Should(gomega.Succeed())
19761972
})

0 commit comments

Comments
 (0)