Skip to content

Commit 702fed9

Browse files
no longer add terminating NoExecute taint for the deleting Cluster
Signed-off-by: changzhen <[email protected]>
1 parent 307163d commit 702fed9

File tree

7 files changed

+70
-80
lines changed

7 files changed

+70
-80
lines changed

cmd/controller-manager/app/controllermanager.go

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -248,15 +248,13 @@ func startClusterController(ctx controllerscontext.Context) (enabled bool, err e
248248
opts := ctx.Opts
249249

250250
clusterController := &cluster.Controller{
251-
Client: mgr.GetClient(),
252-
EventRecorder: mgr.GetEventRecorderFor(cluster.ControllerName),
253-
ClusterMonitorPeriod: opts.ClusterMonitorPeriod.Duration,
254-
ClusterMonitorGracePeriod: opts.ClusterMonitorGracePeriod.Duration,
255-
ClusterStartupGracePeriod: opts.ClusterStartupGracePeriod.Duration,
256-
EnableTaintManager: ctx.Opts.EnableTaintManager,
257-
ClusterTaintEvictionRetryFrequency: 10 * time.Second,
258-
ExecutionSpaceRetryFrequency: 10 * time.Second,
259-
RateLimiterOptions: ctx.Opts.RateLimiterOptions,
251+
Client: mgr.GetClient(),
252+
EventRecorder: mgr.GetEventRecorderFor(cluster.ControllerName),
253+
ClusterMonitorPeriod: opts.ClusterMonitorPeriod.Duration,
254+
ClusterMonitorGracePeriod: opts.ClusterMonitorGracePeriod.Duration,
255+
ClusterStartupGracePeriod: opts.ClusterStartupGracePeriod.Duration,
256+
CleanupCheckInterval: 10 * time.Second,
257+
RateLimiterOptions: ctx.Opts.RateLimiterOptions,
260258
}
261259
if err := clusterController.SetupWithManager(mgr); err != nil {
262260
return false, err

pkg/apis/cluster/v1alpha1/well_known_constants.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,6 @@ const (
2727
// (corresponding to ClusterConditionReady status ConditionUnknown)
2828
// and removed when cluster becomes reachable (ClusterConditionReady status ConditionTrue).
2929
TaintClusterUnreachable = "cluster.karmada.io/unreachable"
30-
// TaintClusterTerminating will be added when cluster is terminating.
31-
TaintClusterTerminating = "cluster.karmada.io/terminating"
3230

3331
// CacheSourceAnnotationKey is the annotation that added to a resource to
3432
// represent which cluster it cached from.

pkg/controllers/cluster/cluster_controller.go

Lines changed: 14 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -75,20 +75,12 @@ var (
7575
Key: clusterv1alpha1.TaintClusterNotReady,
7676
Effect: corev1.TaintEffectNoSchedule,
7777
}
78-
79-
// TerminatingTaintTemplate is the taint for when a cluster is terminating executing resources.
80-
// Used for taint based eviction.
81-
TerminatingTaintTemplate = &corev1.Taint{
82-
Key: clusterv1alpha1.TaintClusterTerminating,
83-
Effect: corev1.TaintEffectNoExecute,
84-
}
8578
)
8679

8780
// Controller is to sync Cluster.
8881
type Controller struct {
89-
client.Client // used to operate Cluster resources.
90-
EventRecorder record.EventRecorder
91-
EnableTaintManager bool
82+
client.Client // used to operate Cluster resources.
83+
EventRecorder record.EventRecorder
9284

9385
// ClusterMonitorPeriod represents cluster-controller monitoring period, i.e. how often does
9486
// cluster-controller check cluster health signal posted from cluster-status-controller.
@@ -100,9 +92,10 @@ type Controller struct {
10092
ClusterMonitorGracePeriod time.Duration
10193
// When cluster is just created, e.g. agent bootstrap or cluster join, we give a longer grace period.
10294
ClusterStartupGracePeriod time.Duration
103-
104-
ClusterTaintEvictionRetryFrequency time.Duration
105-
ExecutionSpaceRetryFrequency time.Duration
95+
// CleanupCheckInterval defines the fixed interval for polling resource deletion status during cluster removal.
96+
// The fixed interval bypasses exponential backoff mechanism to ensure the check frequency remains balanced
97+
// - neither too frequent to risk system overload nor too sparse to cause delays.
98+
CleanupCheckInterval time.Duration
10699

107100
// Per Cluster map stores last observed health together with a local time when it was observed.
108101
clusterHealthMap *clusterHealthMap
@@ -225,12 +218,6 @@ func (c *Controller) syncCluster(ctx context.Context, cluster *clusterv1alpha1.C
225218
}
226219

227220
func (c *Controller) removeCluster(ctx context.Context, cluster *clusterv1alpha1.Cluster) (controllerruntime.Result, error) {
228-
// add terminating taint before cluster is deleted
229-
if err := c.updateClusterTaints(ctx, []*corev1.Taint{TerminatingTaintTemplate}, nil, cluster); err != nil {
230-
klog.ErrorS(err, "Failed to update terminating taint", "cluster", cluster.Name)
231-
return controllerruntime.Result{}, err
232-
}
233-
234221
if err := c.removeExecutionSpace(ctx, cluster); err != nil {
235222
klog.Errorf("Failed to remove execution space %s: %v", cluster.Name, err)
236223
c.EventRecorder.Event(cluster, corev1.EventTypeWarning, events.EventReasonRemoveExecutionSpaceFailed, err.Error())
@@ -244,21 +231,19 @@ func (c *Controller) removeCluster(ctx context.Context, cluster *clusterv1alpha1
244231
return controllerruntime.Result{}, err
245232
} else if exist {
246233
klog.Infof("Requeuing operation until the cluster(%s) execution space deleted", cluster.Name)
247-
return controllerruntime.Result{RequeueAfter: c.ExecutionSpaceRetryFrequency}, nil
234+
return controllerruntime.Result{RequeueAfter: c.CleanupCheckInterval}, nil
248235
}
249236

250-
// delete the health data from the map explicitly after we removing the cluster.
237+
// delete the health data from the map explicitly when we're removing the cluster.
251238
c.clusterHealthMap.delete(cluster.Name)
252239

253240
// check if target cluster is removed from all bindings.
254-
if c.EnableTaintManager {
255-
if done, err := c.isTargetClusterRemoved(ctx, cluster); err != nil {
256-
klog.ErrorS(err, "Failed to check whether target cluster is removed from bindings", "cluster", cluster.Name)
257-
return controllerruntime.Result{}, err
258-
} else if !done {
259-
klog.InfoS("Terminating taint eviction process has not finished yet, will try again later", "cluster", cluster.Name)
260-
return controllerruntime.Result{RequeueAfter: c.ClusterTaintEvictionRetryFrequency}, nil
261-
}
241+
if done, err := c.isTargetClusterRemoved(ctx, cluster); err != nil {
242+
klog.ErrorS(err, "Failed to check target cluster is removed from all bindings", "cluster", cluster.Name)
243+
return controllerruntime.Result{}, err
244+
} else if !done {
245+
klog.InfoS("The cluster is still waiting to be removed from all bindings, will try again later", "cluster", cluster.Name)
246+
return controllerruntime.Result{RequeueAfter: c.CleanupCheckInterval}, nil
262247
}
263248

264249
return c.removeFinalizer(ctx, cluster)

pkg/controllers/cluster/cluster_controller_test.go

Lines changed: 0 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ import (
3838
"github.com/karmada-io/karmada/pkg/util"
3939
"github.com/karmada-io/karmada/pkg/util/gclient"
4040
"github.com/karmada-io/karmada/pkg/util/indexregistry"
41-
"github.com/karmada-io/karmada/pkg/util/names"
4241
)
4342

4443
func newClusterController() *Controller {
@@ -65,7 +64,6 @@ func newClusterController() *Controller {
6564
Client: client,
6665
EventRecorder: record.NewFakeRecorder(1024),
6766
clusterHealthMap: newClusterHealthMap(),
68-
EnableTaintManager: true,
6967
ClusterMonitorGracePeriod: 40 * time.Second,
7068
}
7169
}
@@ -249,41 +247,6 @@ func TestController_Reconcile(t *testing.T) {
249247
want: controllerruntime.Result{},
250248
wantErr: false,
251249
},
252-
{
253-
name: "remove cluster failed",
254-
cluster: &clusterv1alpha1.Cluster{
255-
ObjectMeta: controllerruntime.ObjectMeta{
256-
Name: "test-cluster",
257-
Finalizers: []string{util.ClusterControllerFinalizer},
258-
},
259-
Spec: clusterv1alpha1.ClusterSpec{
260-
SyncMode: clusterv1alpha1.Pull,
261-
},
262-
Status: clusterv1alpha1.ClusterStatus{
263-
Conditions: []metav1.Condition{
264-
{
265-
Type: clusterv1alpha1.ClusterConditionReady,
266-
Status: metav1.ConditionFalse,
267-
},
268-
},
269-
},
270-
},
271-
ns: &corev1.Namespace{
272-
ObjectMeta: metav1.ObjectMeta{
273-
Name: names.GenerateExecutionSpaceName("test-cluster"),
274-
},
275-
},
276-
work: &workv1alpha1.Work{
277-
ObjectMeta: metav1.ObjectMeta{
278-
Name: "test-work",
279-
Namespace: names.GenerateExecutionSpaceName("test-cluster"),
280-
Finalizers: []string{util.ExecutionControllerFinalizer},
281-
},
282-
},
283-
del: true,
284-
want: controllerruntime.Result{},
285-
wantErr: true,
286-
},
287250
}
288251
for _, tt := range tests {
289252
t.Run(tt.name, func(t *testing.T) {

pkg/scheduler/core/generic_scheduler.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,13 @@ func (g *genericScheduler) findClustersThatFit(
132132
// DO NOT filter unhealthy cluster, let users make decisions by using ClusterTolerations of Placement.
133133
clusters := clusterInfo.GetClusters()
134134
for _, c := range clusters {
135+
// When cluster is deleting, we will clean up the scheduled results in the cluster.
136+
// So we should not schedule resource to the deleting cluster.
137+
if !c.Cluster().DeletionTimestamp.IsZero() {
138+
klog.V(4).Infof("Cluster %q is deleting, skip it", c.Cluster().Name)
139+
continue
140+
}
141+
135142
if result := g.scheduleFramework.RunFilterPlugins(ctx, bindingSpec, bindingStatus, c.Cluster()); !result.IsSuccess() {
136143
klog.V(4).Infof("Cluster %q is not fit, reason: %v", c.Cluster().Name, result.AsError())
137144
diagnosis.ClusterToResultMap[c.Cluster().Name] = result

pkg/scheduler/event_handler.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,10 +268,12 @@ func (s *Scheduler) updateCluster(oldObj, newObj interface{}) {
268268
}
269269

270270
switch {
271+
case oldCluster.DeletionTimestamp.IsZero() && !newCluster.DeletionTimestamp.IsZero():
272+
s.clusterReconcileWorker.Add(newCluster)
271273
case !equality.Semantic.DeepEqual(oldCluster.Labels, newCluster.Labels):
272274
fallthrough
273275
case oldCluster.Generation != newCluster.Generation:
274-
// To distinguish the obd and new cluster objects, we need to add the entire object
276+
// To distinguish the old and new cluster objects, we need to add the entire object
275277
// to the worker. Therefore, call Add func instead of Enqueue func.
276278
s.clusterReconcileWorker.Add(oldCluster)
277279
s.clusterReconcileWorker.Add(newCluster)

pkg/scheduler/scheduler.go

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
apierrors "k8s.io/apimachinery/pkg/api/errors"
3030
"k8s.io/apimachinery/pkg/api/meta"
3131
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
32+
"k8s.io/apimachinery/pkg/labels"
3233
"k8s.io/apimachinery/pkg/types"
3334
utilerrors "k8s.io/apimachinery/pkg/util/errors"
3435
"k8s.io/apimachinery/pkg/util/wait"
@@ -428,7 +429,13 @@ func (s *Scheduler) doScheduleBinding(namespace, name string) (err error) {
428429
metrics.BindingSchedule(string(ReconcileSchedule), utilmetrics.DurationInSeconds(start), err)
429430
return err
430431
}
431-
// TODO(dddddai): reschedule bindings on cluster change
432+
// TODO: reschedule binding on cluster change other than cluster deletion, such as cluster labels changed.
433+
if s.HasTerminatingTargetClusters(&rb.Spec) {
434+
klog.Infof("Reschedule ResourceBinding(%s/%s) as some scheduled clusters are deleted", namespace, name)
435+
err = s.scheduleResourceBinding(rb)
436+
metrics.BindingSchedule(string(ReconcileSchedule), utilmetrics.DurationInSeconds(start), err)
437+
return err
438+
}
432439
klog.V(3).Infof("Don't need to schedule ResourceBinding(%s/%s)", rb.Namespace, rb.Name)
433440

434441
// If no scheduling is required, we need to ensure that binding.Generation is equal to
@@ -498,7 +505,13 @@ func (s *Scheduler) doScheduleClusterBinding(name string) (err error) {
498505
metrics.BindingSchedule(string(ReconcileSchedule), utilmetrics.DurationInSeconds(start), err)
499506
return err
500507
}
501-
// TODO(dddddai): reschedule bindings on cluster change
508+
// TODO: reschedule binding on cluster change other than cluster deletion, such as cluster labels changed.
509+
if s.HasTerminatingTargetClusters(&crb.Spec) {
510+
klog.Infof("Reschedule ClusterResourceBinding(%s) as some scheduled clusters are deleted", name)
511+
err = s.scheduleClusterResourceBinding(crb)
512+
metrics.BindingSchedule(string(ReconcileSchedule), utilmetrics.DurationInSeconds(start), err)
513+
return err
514+
}
502515
klog.Infof("Don't need to schedule ClusterResourceBinding(%s)", name)
503516

504517
// If no scheduling is required, we need to ensure that binding.Generation is equal to
@@ -512,6 +525,30 @@ func (s *Scheduler) doScheduleClusterBinding(name string) (err error) {
512525
return nil
513526
}
514527

528+
// HasTerminatingTargetClusters checks whether any cluster in the ResourceBinding's target list
529+
// is marked for deletion (i.e., has a non-zero DeletionTimestamp).
530+
//
531+
// This is used to trigger rescheduling when bound clusters are being terminated, ensuring
532+
// workloads get migrated before cluster resources become unavailable.
533+
func (s *Scheduler) HasTerminatingTargetClusters(bindingSpec *workv1alpha2.ResourceBindingSpec) bool {
534+
clusters, err := s.clusterLister.List(labels.Everything())
535+
if err != nil {
536+
klog.Errorf("Failed to list clusters: %v", err)
537+
return false
538+
}
539+
540+
for _, cluster := range clusters {
541+
if cluster.DeletionTimestamp.IsZero() {
542+
continue
543+
}
544+
545+
if bindingSpec.TargetContains(cluster.Name) {
546+
return true
547+
}
548+
}
549+
return false
550+
}
551+
515552
func (s *Scheduler) scheduleResourceBinding(rb *workv1alpha2.ResourceBinding) (err error) {
516553
defer func() {
517554
condition, ignoreErr := getConditionByError(err)

0 commit comments

Comments
 (0)