From d1d68ad344d0ee22a052bd483329573835fc6326 Mon Sep 17 00:00:00 2001 From: Jason Deal Date: Sun, 24 Nov 2024 07:56:28 -0800 Subject: [PATCH 1/5] feat: drain and volume attachment observability --- pkg/apis/v1/nodeclaim_status.go | 2 + .../node/termination/controller.go | 217 ++++-------------- pkg/controllers/node/termination/drain.go | 110 +++++++++ .../node/termination/instancetermination.go | 71 ++++++ .../node/termination/suite_test.go | 31 +-- .../termination/terminator/events/events.go | 16 +- .../node/termination/volumedetachment.go | 138 +++++++++++ pkg/utils/nodeclaim/nodeclaim.go | 25 ++ 8 files changed, 404 insertions(+), 206 deletions(-) create mode 100644 pkg/controllers/node/termination/drain.go create mode 100644 pkg/controllers/node/termination/instancetermination.go create mode 100644 pkg/controllers/node/termination/volumedetachment.go diff --git a/pkg/apis/v1/nodeclaim_status.go b/pkg/apis/v1/nodeclaim_status.go index aca25a4e77..ec0bf2f102 100644 --- a/pkg/apis/v1/nodeclaim_status.go +++ b/pkg/apis/v1/nodeclaim_status.go @@ -28,6 +28,8 @@ const ( ConditionTypeInitialized = "Initialized" ConditionTypeConsolidatable = "Consolidatable" ConditionTypeDrifted = "Drifted" + ConditionTypeDrained = "Drained" + ConditionTypeVolumesDetached = "VolumesDetached" ConditionTypeInstanceTerminating = "InstanceTerminating" ConditionTypeConsistentStateFound = "ConsistentStateFound" ) diff --git a/pkg/controllers/node/termination/controller.go b/pkg/controllers/node/termination/controller.go index 67e12c6ebf..fe28053f4f 100644 --- a/pkg/controllers/node/termination/controller.go +++ b/pkg/controllers/node/termination/controller.go @@ -21,14 +21,12 @@ import ( "fmt" "time" - "github.com/samber/lo" "golang.org/x/time/rate" corev1 "k8s.io/api/core/v1" - storagev1 "k8s.io/api/storage/v1" "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/util/workqueue" + "k8s.io/klog/v2" "k8s.io/utils/clock" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" @@ -44,14 +42,10 @@ import ( v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider" "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator" - terminatorevents "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator/events" "sigs.k8s.io/karpenter/pkg/events" "sigs.k8s.io/karpenter/pkg/metrics" "sigs.k8s.io/karpenter/pkg/operator/injection" nodeutils "sigs.k8s.io/karpenter/pkg/utils/node" - "sigs.k8s.io/karpenter/pkg/utils/pod" - "sigs.k8s.io/karpenter/pkg/utils/termination" - volumeutil "sigs.k8s.io/karpenter/pkg/utils/volume" ) // Controller for the resource @@ -60,7 +54,12 @@ type Controller struct { kubeClient client.Client cloudProvider cloudprovider.CloudProvider terminator *terminator.Terminator - recorder events.Recorder + reconcilers []terminationReconciler +} + +// TODO (jmdeal@): Split subreconcilers into individual controllers +type terminationReconciler interface { + Reconcile(context.Context, *corev1.Node, *v1.NodeClaim) (reconcile.Result, error) } // NewController constructs a controller instance @@ -70,176 +69,63 @@ func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider clou kubeClient: kubeClient, cloudProvider: cloudProvider, terminator: terminator, - recorder: recorder, + reconcilers: []terminationReconciler{ + &DrainReconciler{kubeClient, cloudProvider, recorder, terminator}, + &VolumeDetachmentReconciler{kubeClient, clk, recorder}, + 
&InstanceTerminationReconciler{kubeClient, cloudProvider, clk}, + }, } } +// nolint:gocyclo func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node) (reconcile.Result, error) { ctx = injection.WithControllerName(ctx, "node.termination") - - if !n.GetDeletionTimestamp().IsZero() { - return c.finalize(ctx, n) - } - return reconcile.Result{}, nil -} - -//nolint:gocyclo -func (c *Controller) finalize(ctx context.Context, node *corev1.Node) (reconcile.Result, error) { - if !controllerutil.ContainsFinalizer(node, v1.TerminationFinalizer) { + ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef(n.Namespace, n.Name))) + if !nodeutils.IsManaged(n, c.cloudProvider) { return reconcile.Result{}, nil } - if !nodeutils.IsManaged(node, c.cloudProvider) { + if n.GetDeletionTimestamp().IsZero() { return reconcile.Result{}, nil } - - nodeClaims, err := nodeutils.GetNodeClaims(ctx, c.kubeClient, node) - if err != nil { - return reconcile.Result{}, fmt.Errorf("listing nodeclaims, %w", err) - } - - if err = c.deleteAllNodeClaims(ctx, nodeClaims...); err != nil { - return reconcile.Result{}, fmt.Errorf("deleting nodeclaims, %w", err) - } - - nodeTerminationTime, err := c.nodeTerminationTime(node, nodeClaims...) - if err != nil { - return reconcile.Result{}, err + if !controllerutil.ContainsFinalizer(n, v1.TerminationFinalizer) { + return reconcile.Result{}, nil } - if err = c.terminator.Taint(ctx, node, v1.DisruptedNoScheduleTaint); err != nil { + if err := c.terminator.Taint(ctx, n, v1.DisruptedNoScheduleTaint); err != nil { + if errors.IsNotFound(err) { + return reconcile.Result{}, nil + } if errors.IsConflict(err) { return reconcile.Result{Requeue: true}, nil } - return reconcile.Result{}, client.IgnoreNotFound(fmt.Errorf("tainting node with %s, %w", pretty.Taint(v1.DisruptedNoScheduleTaint), err)) - } - if err = c.terminator.Drain(ctx, node, nodeTerminationTime); err != nil { - if !terminator.IsNodeDrainError(err) { - return reconcile.Result{}, fmt.Errorf("draining node, %w", err) - } - c.recorder.Publish(terminatorevents.NodeFailedToDrain(node, err)) - // If the underlying NodeClaim no longer exists, we want to delete to avoid trying to gracefully draining - // on nodes that are no longer alive. We do a check on the Ready condition of the node since, even - // though the CloudProvider says the instance is not around, we know that the kubelet process is still running - // if the Node Ready condition is true - // Similar logic to: https://github.com/kubernetes/kubernetes/blob/3a75a8c8d9e6a1ebd98d8572132e675d4980f184/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go#L144 - if nodeutils.GetCondition(node, corev1.NodeReady).Status != corev1.ConditionTrue { - if _, err = c.cloudProvider.Get(ctx, node.Spec.ProviderID); err != nil { - if cloudprovider.IsNodeClaimNotFoundError(err) { - return reconcile.Result{}, c.removeFinalizer(ctx, node) - } - return reconcile.Result{}, fmt.Errorf("getting nodeclaim, %w", err) - } - } - - return reconcile.Result{RequeueAfter: 1 * time.Second}, nil - } - NodesDrainedTotal.Inc(map[string]string{ - metrics.NodePoolLabel: node.Labels[v1.NodePoolLabelKey], - }) - // In order for Pods associated with PersistentVolumes to smoothly migrate from the terminating Node, we wait - // for VolumeAttachments of drain-able Pods to be cleaned up before terminating Node and removing its finalizer. - // However, if TerminationGracePeriod is configured for Node, and we are past that period, we will skip waiting. 
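// Illustrative sketch of the sub-reconciler chaining used by the termination controller above:
// drain, volume detachment, and instance termination run in order, and the loop stops as soon as
// a stage requests a requeue or returns an error. The types below are simplified stand-ins for the
// controller-runtime and Karpenter types, not the real implementation.
package main

import (
	"context"
	"fmt"
	"time"
)

type result struct {
	Requeue      bool
	RequeueAfter time.Duration
}

type stage interface {
	Reconcile(ctx context.Context) (result, error)
}

type drainStage struct{ drained bool }

func (d drainStage) Reconcile(context.Context) (result, error) {
	if !d.drained {
		// Pods are still evicting; come back later instead of moving on to volume detachment.
		return result{RequeueAfter: 5 * time.Second}, nil
	}
	return result{}, nil
}

type detachStage struct{}

func (detachStage) Reconcile(context.Context) (result, error) { return result{}, nil }

func main() {
	stages := []stage{drainStage{drained: false}, detachStage{}}
	for _, s := range stages {
		res, err := s.Reconcile(context.Background())
		if res.Requeue || res.RequeueAfter != 0 || err != nil {
			fmt.Printf("stopping early: requeue after %s, err=%v\n", res.RequeueAfter, err)
			return
		}
	}
	fmt.Println("all termination stages completed")
}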
- if nodeTerminationTime == nil || c.clock.Now().Before(*nodeTerminationTime) { - areVolumesDetached, err := c.ensureVolumesDetached(ctx, node) - if err != nil { - return reconcile.Result{}, fmt.Errorf("ensuring no volume attachments, %w", err) - } - if !areVolumesDetached { - return reconcile.Result{RequeueAfter: 1 * time.Second}, nil - } + return reconcile.Result{}, fmt.Errorf("tainting node with %s, %w", pretty.Taint(v1.DisruptedNoScheduleTaint), err) } - nodeClaims, err = nodeutils.GetNodeClaims(ctx, c.kubeClient, node) + nc, err := nodeutils.NodeClaimForNode(ctx, c.kubeClient, n) if err != nil { - return reconcile.Result{}, fmt.Errorf("deleting nodeclaims, %w", err) - } - for _, nodeClaim := range nodeClaims { - isInstanceTerminated, err := termination.EnsureTerminated(ctx, c.kubeClient, nodeClaim, c.cloudProvider) - if err != nil { - // 404 = the nodeClaim no longer exists - if errors.IsNotFound(err) { - continue - } - // 409 - The nodeClaim exists, but its status has already been modified - if errors.IsConflict(err) { - return reconcile.Result{Requeue: true}, nil - } - return reconcile.Result{}, fmt.Errorf("ensuring instance termination, %w", err) - } - if !isInstanceTerminated { - return reconcile.Result{RequeueAfter: 5 * time.Second}, nil + if nodeutils.IsDuplicateNodeClaimError(err) || nodeutils.IsNodeClaimNotFoundError(err) { + log.FromContext(ctx).Error(err, "failed to terminate node") + return reconcile.Result{}, nil } - } - if err := c.removeFinalizer(ctx, node); err != nil { return reconcile.Result{}, err } - return reconcile.Result{}, nil -} - -func (c *Controller) deleteAllNodeClaims(ctx context.Context, nodeClaims ...*v1.NodeClaim) error { - for _, nodeClaim := range nodeClaims { - // If we still get the NodeClaim, but it's already marked as terminating, we don't need to call Delete again - if nodeClaim.DeletionTimestamp.IsZero() { - if err := c.kubeClient.Delete(ctx, nodeClaim); err != nil { - return client.IgnoreNotFound(err) - } + ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KRef(nc.Namespace, nc.Name))) + if nc.DeletionTimestamp.IsZero() { + if err := c.kubeClient.Delete(ctx, nc); err != nil { + return reconcile.Result{}, client.IgnoreNotFound(err) } } - return nil -} - -func (c *Controller) ensureVolumesDetached(ctx context.Context, node *corev1.Node) (volumesDetached bool, err error) { - volumeAttachments, err := nodeutils.GetVolumeAttachments(ctx, c.kubeClient, node) - if err != nil { - return false, err - } - // Filter out VolumeAttachments associated with not drain-able Pods - filteredVolumeAttachments, err := filterVolumeAttachments(ctx, c.kubeClient, node, volumeAttachments, c.clock) - if err != nil { - return false, err - } - return len(filteredVolumeAttachments) == 0, nil -} -// filterVolumeAttachments filters out storagev1.VolumeAttachments that should not block the termination -// of the passed corev1.Node -func filterVolumeAttachments(ctx context.Context, kubeClient client.Client, node *corev1.Node, volumeAttachments []*storagev1.VolumeAttachment, clk clock.Clock) ([]*storagev1.VolumeAttachment, error) { - // No need to filter empty VolumeAttachments list - if len(volumeAttachments) == 0 { - return volumeAttachments, nil - } - // Create list of non-drain-able Pods associated with Node - pods, err := nodeutils.GetPods(ctx, kubeClient, node) - if err != nil { - return nil, err - } - unDrainablePods := lo.Reject(pods, func(p *corev1.Pod, _ int) bool { - return pod.IsDrainable(p, clk) - }) - // Filter out VolumeAttachments 
associated with non-drain-able Pods - // Match on Pod -> PersistentVolumeClaim -> PersistentVolume Name <- VolumeAttachment - shouldFilterOutVolume := sets.New[string]() - for _, p := range unDrainablePods { - for _, v := range p.Spec.Volumes { - pvc, err := volumeutil.GetPersistentVolumeClaim(ctx, kubeClient, p, v) - if errors.IsNotFound(err) { - continue - } - if err != nil { - return nil, err - } - if pvc != nil { - shouldFilterOutVolume.Insert(pvc.Spec.VolumeName) - } + for _, r := range c.reconcilers { + res, err := r.Reconcile(ctx, n, nc) + if res.Requeue || res.RequeueAfter != 0 || err != nil { + return res, err } } - filteredVolumeAttachments := lo.Reject(volumeAttachments, func(v *storagev1.VolumeAttachment, _ int) bool { - pvName := v.Spec.Source.PersistentVolumeName - return pvName == nil || shouldFilterOutVolume.Has(*pvName) - }) - return filteredVolumeAttachments, nil + + return reconcile.Result{}, nil } -func (c *Controller) removeFinalizer(ctx context.Context, n *corev1.Node) error { +func removeFinalizer(ctx context.Context, kubeClient client.Client, n *corev1.Node) error { stored := n.DeepCopy() controllerutil.RemoveFinalizer(n, v1.TerminationFinalizer) if !equality.Semantic.DeepEqual(stored, n) { @@ -247,44 +133,27 @@ func (c *Controller) removeFinalizer(ctx context.Context, n *corev1.Node) error // a strategic merge patch represents the finalizer list as a keyed "set" so removing // an item from the list doesn't replace the full list // https://github.com/kubernetes/kubernetes/issues/111643#issuecomment-2016489732 - if err := c.kubeClient.Patch(ctx, n, client.StrategicMergeFrom(stored)); err != nil { - return client.IgnoreNotFound(fmt.Errorf("removing finalizer, %w", err)) + if err := kubeClient.Patch(ctx, n, client.StrategicMergeFrom(stored)); err != nil { + if errors.IsNotFound(err) { + return nil + } + return fmt.Errorf("removing finalizer, %w", err) } - metrics.NodesTerminatedTotal.Inc(map[string]string{ metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], }) - // We use stored.DeletionTimestamp since the api-server may give back a node after the patch without a deletionTimestamp DurationSeconds.Observe(time.Since(stored.DeletionTimestamp.Time).Seconds(), map[string]string{ metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], }) - NodeLifetimeDurationSeconds.Observe(time.Since(n.CreationTimestamp.Time).Seconds(), map[string]string{ metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], }) - log.FromContext(ctx).Info("deleted node") } return nil } -func (c *Controller) nodeTerminationTime(node *corev1.Node, nodeClaims ...*v1.NodeClaim) (*time.Time, error) { - if len(nodeClaims) == 0 { - return nil, nil - } - expirationTimeString, exists := nodeClaims[0].ObjectMeta.Annotations[v1.NodeClaimTerminationTimestampAnnotationKey] - if !exists { - return nil, nil - } - c.recorder.Publish(terminatorevents.NodeTerminationGracePeriodExpiring(node, expirationTimeString)) - expirationTime, err := time.Parse(time.RFC3339, expirationTimeString) - if err != nil { - return nil, fmt.Errorf("parsing %s annotation, %w", v1.NodeClaimTerminationTimestampAnnotationKey, err) - } - return &expirationTime, nil -} - func (c *Controller) Register(_ context.Context, m manager.Manager) error { return controllerruntime.NewControllerManagedBy(m). Named("node.termination"). 
diff --git a/pkg/controllers/node/termination/drain.go b/pkg/controllers/node/termination/drain.go new file mode 100644 index 0000000000..e18c13f7fc --- /dev/null +++ b/pkg/controllers/node/termination/drain.go @@ -0,0 +1,110 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package termination + +import ( + "context" + "fmt" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/cloudprovider" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator" + terminatorevents "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator/events" + "sigs.k8s.io/karpenter/pkg/events" + "sigs.k8s.io/karpenter/pkg/metrics" + nodeutils "sigs.k8s.io/karpenter/pkg/utils/node" + "sigs.k8s.io/karpenter/pkg/utils/nodeclaim" +) + +type DrainReconciler struct { + kubeClient client.Client + cloudProvider cloudprovider.CloudProvider + recorder events.Recorder + terminator *terminator.Terminator +} + +// nolint:gocyclo +func (d *DrainReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { + if nc.StatusConditions().IsTrue(v1.ConditionTypeDrained) { + return reconcile.Result{}, nil + } + + tgpExpirationTime, err := nodeclaim.TerminationGracePeriodExpirationTime(nc) + if err != nil { + log.FromContext(ctx).Error(err, "failed to terminate node") + return reconcile.Result{}, nil + } + if tgpExpirationTime != nil { + d.recorder.Publish(terminatorevents.NodeTerminationGracePeriodExpiring(n, *tgpExpirationTime)) + } + + if err := d.terminator.Drain(ctx, n, tgpExpirationTime); err != nil { + if !terminator.IsNodeDrainError(err) { + return reconcile.Result{}, fmt.Errorf("draining node, %w", err) + } + // If the underlying NodeClaim no longer exists, we want to delete to avoid trying to gracefully draining + // on nodes that are no longer alive. 
We do a check on the Ready condition of the node since, even + // though the CloudProvider says the instance is not around, we know that the kubelet process is still running + // if the Node Ready condition is true + // Similar logic to: https://github.com/kubernetes/kubernetes/blob/3a75a8c8d9e6a1ebd98d8572132e675d4980f184/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go#L144 + if nodeutils.GetCondition(n, corev1.NodeReady).Status != corev1.ConditionTrue { + if _, err = d.cloudProvider.Get(ctx, n.Spec.ProviderID); err != nil { + if cloudprovider.IsNodeClaimNotFoundError(err) { + return reconcile.Result{}, removeFinalizer(ctx, d.kubeClient, n) + } + return reconcile.Result{}, fmt.Errorf("getting nodeclaim, %w", err) + } + } + stored := nc.DeepCopy() + if nc.StatusConditions().SetFalse(v1.ConditionTypeDrained, "Draining", "Draining") { + if err := d.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { + if errors.IsNotFound(err) { + return reconcile.Result{}, nil + } + if errors.IsConflict(err) { + return reconcile.Result{Requeue: true}, nil + } + return reconcile.Result{}, err + } + } + d.recorder.Publish(terminatorevents.NodeFailedToDrain(n, err)) + return reconcile.Result{RequeueAfter: 5 * time.Second}, nil + } + + stored := nc.DeepCopy() + _ = nc.StatusConditions().SetTrue(v1.ConditionTypeDrained) + if err := d.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); client.IgnoreNotFound(err) != nil { + if errors.IsNotFound(err) { + return reconcile.Result{}, nil + } + if errors.IsConflict(err) { + return reconcile.Result{Requeue: true}, nil + } + return reconcile.Result{}, err + } + NodesDrainedTotal.Inc(map[string]string{ + metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], + }) + return reconcile.Result{}, nil +} diff --git a/pkg/controllers/node/termination/instancetermination.go b/pkg/controllers/node/termination/instancetermination.go new file mode 100644 index 0000000000..2e97c4edd0 --- /dev/null +++ b/pkg/controllers/node/termination/instancetermination.go @@ -0,0 +1,71 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
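// Illustrative sketch of the status-patch error handling repeated throughout the new
// sub-reconcilers: a NotFound means the NodeClaim is already gone (nothing left to do), a Conflict
// means another actor updated the status first (requeue and retry), and anything else bubbles up.
// Only apimachinery helpers are used; the Patch call itself is elided, and the group/resource
// values are illustrative.
package main

import (
	"fmt"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime/schema"
)

func classifyPatchError(err error) string {
	switch {
	case err == nil:
		return "patched"
	case apierrors.IsNotFound(err):
		return "drop: NodeClaim no longer exists"
	case apierrors.IsConflict(err):
		return "requeue: conflicting status update"
	default:
		return "error: " + err.Error()
	}
}

func main() {
	gr := schema.GroupResource{Group: "karpenter.sh", Resource: "nodeclaims"}
	fmt.Println(classifyPatchError(nil))
	fmt.Println(classifyPatchError(apierrors.NewNotFound(gr, "default-abc12")))
	fmt.Println(classifyPatchError(apierrors.NewConflict(gr, "default-abc12", fmt.Errorf("the object has been modified"))))
}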
+*/ + +package termination + +import ( + "context" + "fmt" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/utils/clock" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/cloudprovider" + nodeclaimutils "sigs.k8s.io/karpenter/pkg/utils/nodeclaim" + "sigs.k8s.io/karpenter/pkg/utils/termination" +) + +type InstanceTerminationReconciler struct { + kubeClient client.Client + cloudProvider cloudprovider.CloudProvider + clk clock.Clock +} + +func (i *InstanceTerminationReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { + elapsed, err := nodeclaimutils.HasTerminationGracePeriodElapsed(i.clk, nc) + if err != nil { + log.FromContext(ctx).Error(err, "failed to terminate node") + return reconcile.Result{}, nil + } + if !nc.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue() && !elapsed { + return reconcile.Result{}, nil + } + isInstanceTerminated, err := termination.EnsureTerminated(ctx, i.kubeClient, nc, i.cloudProvider) + if err != nil { + // 404 = the nodeClaim no longer exists + if errors.IsNotFound(err) { + return reconcile.Result{}, nil + } + // 409 - The nodeClaim exists, but its status has already been modified + if errors.IsConflict(err) { + return reconcile.Result{Requeue: true}, nil + } + return reconcile.Result{}, fmt.Errorf("ensuring instance termination, %w", err) + } + if !isInstanceTerminated { + return reconcile.Result{RequeueAfter: 5 * time.Second}, nil + } + if err := removeFinalizer(ctx, i.kubeClient, n); err != nil { + return reconcile.Result{}, err + } + return reconcile.Result{}, nil +} diff --git a/pkg/controllers/node/termination/suite_test.go b/pkg/controllers/node/termination/suite_test.go index 5758579227..24680048d2 100644 --- a/pkg/controllers/node/termination/suite_test.go +++ b/pkg/controllers/node/termination/suite_test.go @@ -18,7 +18,6 @@ package termination_test import ( "context" - "sync" "testing" "time" @@ -141,34 +140,6 @@ var _ = Describe("Termination", func() { ExpectFinalizersRemoved(ctx, env.Client, nodeClaim) ExpectNotFound(ctx, env.Client, node, nodeClaim) }) - It("should not race if deleting nodes in parallel", func() { - nodes := lo.Times(10, func(_ int) *corev1.Node { - return test.NodeClaimLinkedNode(nodeClaim) - }) - for _, node := range nodes { - ExpectApplied(ctx, env.Client, node, nodeClaim) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - *node = *ExpectNodeExists(ctx, env.Client, node.Name) - } - - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
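// Illustrative sketch of the termination-grace-period helpers this reconciler relies on (added to
// pkg/utils/nodeclaim later in this patch): the termination-timestamp annotation holds an RFC3339
// time, and the grace period has elapsed once the clock passes it. The annotation key is written
// out literally here for illustration; the real code references the v1 constant.
package main

import (
	"fmt"
	"time"
)

func terminationGracePeriodExpirationTime(annotations map[string]string) (*time.Time, error) {
	raw, ok := annotations["karpenter.sh/nodeclaim-termination-timestamp"]
	if !ok {
		return nil, nil // no terminationGracePeriod configured for this NodeClaim
	}
	t, err := time.Parse(time.RFC3339, raw)
	if err != nil {
		return nil, fmt.Errorf("parsing termination timestamp annotation, %w", err)
	}
	return &t, nil
}

func main() {
	annotations := map[string]string{
		"karpenter.sh/nodeclaim-termination-timestamp": time.Now().Add(30 * time.Minute).Format(time.RFC3339),
	}
	expiration, err := terminationGracePeriodExpirationTime(annotations)
	if err != nil {
		panic(err)
	}
	fmt.Println("grace period elapsed:", expiration != nil && time.Now().After(*expiration))
}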
- for range 2 { - var wg sync.WaitGroup - // this is enough to trip the race detector - for i := range nodes { - wg.Add(1) - go func(node *corev1.Node) { - defer GinkgoRecover() - defer wg.Done() - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - }(nodes[i]) - } - wg.Wait() - } - for _, node := range nodes { - ExpectNotFound(ctx, env.Client, node) - } - }) It("should exclude nodes from load balancers when terminating", func() { labels := map[string]string{"foo": "bar"} pod := test.Pod(test.PodOptions{ @@ -882,7 +853,7 @@ var _ = Describe("Termination", func() { // Don't let any pod evict MinAvailable: &minAvailable, }) - ExpectApplied(ctx, env.Client, pdb, node) + ExpectApplied(ctx, env.Client, pdb, node, nodeClaim) pods := test.Pods(5, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{ OwnerReferences: defaultOwnerRefs, Labels: labelSelector, diff --git a/pkg/controllers/node/termination/terminator/events/events.go b/pkg/controllers/node/termination/terminator/events/events.go index d626173d56..3c39e3cc1e 100644 --- a/pkg/controllers/node/termination/terminator/events/events.go +++ b/pkg/controllers/node/termination/terminator/events/events.go @@ -22,6 +22,8 @@ import ( corev1 "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/events" ) @@ -56,12 +58,22 @@ func NodeFailedToDrain(node *corev1.Node, err error) events.Event { } } -func NodeTerminationGracePeriodExpiring(node *corev1.Node, terminationTime string) events.Event { +func NodeAwaitingVolumeDetachment(node *corev1.Node, volumeAttachments ...*storagev1.VolumeAttachment) events.Event { + return events.Event{ + InvolvedObject: node, + Type: corev1.EventTypeWarning, + Reason: "AwaitingVolumeDetachment", + Message: fmt.Sprintf("Awaiting deletion of %d VolumeAttachments bound to node", len(volumeAttachments)), + DedupeValues: []string{node.Name}, + } +} + +func NodeTerminationGracePeriodExpiring(node *corev1.Node, t time.Time) events.Event { return events.Event{ InvolvedObject: node, Type: corev1.EventTypeWarning, Reason: "TerminationGracePeriodExpiring", - Message: fmt.Sprintf("All pods will be deleted by %s", terminationTime), + Message: fmt.Sprintf("All pods will be deleted by %s", t.Format(time.RFC3339)), DedupeValues: []string{node.Name}, } } diff --git a/pkg/controllers/node/termination/volumedetachment.go b/pkg/controllers/node/termination/volumedetachment.go new file mode 100644 index 0000000000..109d80d1a3 --- /dev/null +++ b/pkg/controllers/node/termination/volumedetachment.go @@ -0,0 +1,138 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package termination + +import ( + "context" + "time" + + "github.com/samber/lo" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/utils/clock" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + terminatorevents "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator/events" + "sigs.k8s.io/karpenter/pkg/events" + + storagev1 "k8s.io/api/storage/v1" + + nodeutils "sigs.k8s.io/karpenter/pkg/utils/node" + "sigs.k8s.io/karpenter/pkg/utils/nodeclaim" + podutils "sigs.k8s.io/karpenter/pkg/utils/pod" + volumeutil "sigs.k8s.io/karpenter/pkg/utils/volume" +) + +type VolumeDetachmentReconciler struct { + kubeClient client.Client + clk clock.Clock + recorder events.Recorder +} + +// nolint:gocyclo +func (v *VolumeDetachmentReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { + if !nc.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue() { + return reconcile.Result{}, nil + } + if nc.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue() { + return reconcile.Result{}, nil + } + if elapsed, err := nodeclaim.HasTerminationGracePeriodElapsed(v.clk, nc); err != nil { + log.FromContext(ctx).Error(err, "failed to terminate node") + return reconcile.Result{}, nil + } else if elapsed { + return reconcile.Result{}, nil + } + + blockingVAs, err := v.blockingVolumeAttachments(ctx, n) + if err != nil { + return reconcile.Result{}, err + } + if len(blockingVAs) != 0 { + stored := nc.DeepCopy() + _ = nc.StatusConditions().SetFalse(v1.ConditionTypeVolumesDetached, "AwaitingVolumeDetachment", "AwaitingVolumeDetachment") + if err := v.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { + if errors.IsNotFound(err) { + return reconcile.Result{}, nil + } + if errors.IsConflict(err) { + return reconcile.Result{Requeue: true}, nil + } + return reconcile.Result{}, err + } + v.recorder.Publish(terminatorevents.NodeAwaitingVolumeDetachment(n, blockingVAs...)) + return reconcile.Result{RequeueAfter: 5 * time.Second}, nil + } + + stored := nc.DeepCopy() + _ = nc.StatusConditions().SetTrue(v1.ConditionTypeVolumesDetached) + if err := v.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { + if errors.IsNotFound(err) { + return reconcile.Result{}, nil + } + if errors.IsConflict(err) { + return reconcile.Result{Requeue: true}, nil + } + return reconcile.Result{}, err + } + return reconcile.Result{}, nil +} + +func (v *VolumeDetachmentReconciler) blockingVolumeAttachments(ctx context.Context, n *corev1.Node) ([]*storagev1.VolumeAttachment, error) { + vas, err := nodeutils.GetVolumeAttachments(ctx, v.kubeClient, n) + if err != nil { + return nil, err + } + if len(vas) == 0 { + return nil, nil + } + + pods, err := nodeutils.GetPods(ctx, v.kubeClient, n) + if err != nil { + return nil, err + } + pods = lo.Reject(pods, func(p *corev1.Pod, _ int) bool { + return podutils.IsDrainable(p, v.clk) + }) + + // Determine the VolumeAttachments associated with non-drainable pods. We consider these non-blocking since they + // will never be detached without intervention (since the pods aren't drained). 
+ nonBlockingVolumes := sets.New[string]() + for _, p := range pods { + for _, vol := range p.Spec.Volumes { + pvc, err := volumeutil.GetPersistentVolumeClaim(ctx, v.kubeClient, p, vol) + if errors.IsNotFound(err) { + continue + } + if err != nil { + return nil, err + } + if pvc != nil { + nonBlockingVolumes.Insert(pvc.Spec.VolumeName) + } + } + } + blockingVAs := lo.Reject(vas, func(v *storagev1.VolumeAttachment, _ int) bool { + pvName := v.Spec.Source.PersistentVolumeName + return pvName == nil || nonBlockingVolumes.Has(*pvName) + }) + return blockingVAs, nil +} diff --git a/pkg/utils/nodeclaim/nodeclaim.go b/pkg/utils/nodeclaim/nodeclaim.go index b229b6404d..43a82c5e2a 100644 --- a/pkg/utils/nodeclaim/nodeclaim.go +++ b/pkg/utils/nodeclaim/nodeclaim.go @@ -20,6 +20,7 @@ import ( "context" "errors" "fmt" + "time" "github.com/awslabs/operatorpkg/object" "github.com/awslabs/operatorpkg/status" @@ -27,6 +28,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/clock" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/predicate" @@ -236,3 +238,26 @@ func UpdateNodeOwnerReferences(nodeClaim *v1.NodeClaim, node *corev1.Node) *core }) return node } + +func TerminationGracePeriodExpirationTime(nc *v1.NodeClaim) (*time.Time, error) { + expirationTimeString, exists := nc.ObjectMeta.Annotations[v1.NodeClaimTerminationTimestampAnnotationKey] + if !exists { + return nil, nil + } + expirationTime, err := time.Parse(time.RFC3339, expirationTimeString) + if err != nil { + return nil, fmt.Errorf("parsing %s annotation, %w", v1.NodeClaimTerminationTimestampAnnotationKey, err) + } + return &expirationTime, nil +} + +func HasTerminationGracePeriodElapsed(clk clock.Clock, nc *v1.NodeClaim) (bool, error) { + expirationTime, err := TerminationGracePeriodExpirationTime(nc) + if err != nil { + return false, err + } + if expirationTime == nil { + return false, nil + } + return clk.Now().After(*expirationTime), nil +} From d92392d2aa07ae82b7fc365b8d71aba196e74e24 Mon Sep 17 00:00:00 2001 From: Jason Deal Date: Sun, 24 Nov 2024 15:31:14 -0800 Subject: [PATCH 2/5] test updates --- .../node/termination/suite_test.go | 53 ++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/pkg/controllers/node/termination/suite_test.go b/pkg/controllers/node/termination/suite_test.go index 24680048d2..b193deba72 100644 --- a/pkg/controllers/node/termination/suite_test.go +++ b/pkg/controllers/node/termination/suite_test.go @@ -106,25 +106,45 @@ var _ = Describe("Termination", func() { ExpectApplied(ctx, env.Client, node, nodeClaim) Expect(env.Client.Delete(ctx, node)).To(Succeed()) node = ExpectNodeExists(ctx, env.Client, node.Name) + + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsUnknown()).To(BeTrue()) + // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
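// Illustrative sketch of the blocking-VolumeAttachment filter added in volumedetachment.go above:
// collect the PersistentVolume names used by non-drainable pods, then treat any VolumeAttachment
// whose PV is in that set (or that has no PV source) as non-blocking. Types are trimmed to plain
// structs so the mapping logic stands alone; the real code works with storagev1 objects.
package main

import "fmt"

type volumeAttachment struct {
	name   string
	pvName *string // nil when the attachment has no PersistentVolume source
}

func blockingVolumeAttachments(vas []volumeAttachment, nonDrainablePodPVs []string) []volumeAttachment {
	nonBlocking := map[string]struct{}{}
	for _, pv := range nonDrainablePodPVs {
		nonBlocking[pv] = struct{}{}
	}
	var blocking []volumeAttachment
	for _, va := range vas {
		if va.pvName == nil {
			continue
		}
		if _, ok := nonBlocking[*va.pvName]; ok {
			continue
		}
		blocking = append(blocking, va)
	}
	return blocking
}

func main() {
	pv := "pvc-1234"
	vas := []volumeAttachment{{name: "csi-abc", pvName: &pv}, {name: "csi-def", pvName: nil}}
	// With no non-drainable pods using pvc-1234, the first attachment still blocks termination.
	fmt.Println(len(blockingVolumeAttachments(vas, nil)), "blocking attachment(s)")
}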
ExpectObjectReconciled(ctx, env.Client, terminationController, node) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue()).To(BeTrue()) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue()) + ExpectObjectReconciled(ctx, env.Client, terminationController, node) ExpectNotFound(ctx, env.Client, node) }) It("should ignore nodes not managed by this Karpenter instance", func() { delete(node.Labels, "karpenter.test.sh/testnodeclass") node.Labels = lo.Assign(node.Labels, map[string]string{"karpenter.test.sh/unmanagednodeclass": "default"}) - ExpectApplied(ctx, env.Client, node) + ExpectApplied(ctx, env.Client, node, nodeClaim) Expect(env.Client.Delete(ctx, node)).To(Succeed()) node = ExpectNodeExists(ctx, env.Client, node.Name) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsUnknown()).To(BeTrue()) // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). ExpectObjectReconciled(ctx, env.Client, terminationController, node) ExpectObjectReconciled(ctx, env.Client, terminationController, node) ExpectExists(ctx, env.Client, node) + + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsUnknown()).To(BeTrue()) }) It("should delete nodeclaims associated with nodes", func() { - ExpectApplied(ctx, env.Client, node, nodeClaim, nodeClaim) + ExpectApplied(ctx, env.Client, node, nodeClaim) Expect(env.Client.Delete(ctx, node)).To(Succeed()) node = ExpectNodeExists(ctx, env.Client, node.Name) @@ -169,6 +189,8 @@ var _ = Describe("Termination", func() { ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, }) ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podSkip) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) // Trigger Termination Controller Expect(env.Client.Delete(ctx, node)).To(Succeed()) @@ -176,6 +198,8 @@ var _ = Describe("Termination", func() { ExpectObjectReconciled(ctx, env.Client, terminationController, node) Expect(queue.Has(podSkip)).To(BeFalse()) ExpectSingletonReconciled(ctx, queue) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) // Expect node to exist and be draining ExpectNodeWithNodeClaimDraining(env.Client, node.Name) @@ -188,6 +212,9 @@ var _ = Describe("Termination", func() { node = ExpectNodeExists(ctx, env.Client, node.Name) // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
ExpectObjectReconciled(ctx, env.Client, terminationController, node) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + ExpectObjectReconciled(ctx, env.Client, terminationController, node) ExpectNotFound(ctx, env.Client, node) }) @@ -740,13 +767,20 @@ var _ = Describe("Termination", func() { }) ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool, va) Expect(env.Client.Delete(ctx, node)).To(Succeed()) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) ExpectObjectReconciled(ctx, env.Client, terminationController, node) ExpectObjectReconciled(ctx, env.Client, terminationController, node) ExpectExists(ctx, env.Client, node) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue()) ExpectDeleted(ctx, env.Client, va) ExpectObjectReconciled(ctx, env.Client, terminationController, node) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue()).To(BeTrue()) + ExpectObjectReconciled(ctx, env.Client, terminationController, node) ExpectNotFound(ctx, env.Client, node) }) @@ -775,13 +809,20 @@ var _ = Describe("Termination", func() { ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool, vaDrainable, vaNonDrainable, pod, pvc) ExpectManualBinding(ctx, env.Client, pod, node) Expect(env.Client.Delete(ctx, node)).To(Succeed()) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) ExpectObjectReconciled(ctx, env.Client, terminationController, node) ExpectObjectReconciled(ctx, env.Client, terminationController, node) ExpectExists(ctx, env.Client, node) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue()) ExpectDeleted(ctx, env.Client, vaDrainable) ExpectObjectReconciled(ctx, env.Client, terminationController, node) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue()).To(BeTrue()) + ExpectObjectReconciled(ctx, env.Client, terminationController, node) ExpectNotFound(ctx, env.Client, node) }) @@ -795,13 +836,21 @@ var _ = Describe("Termination", func() { } ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool, va) Expect(env.Client.Delete(ctx, node)).To(Succeed()) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) ExpectObjectReconciled(ctx, env.Client, terminationController, node) ExpectObjectReconciled(ctx, env.Client, terminationController, node) ExpectExists(ctx, env.Client, node) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue()) fakeClock.Step(5 * time.Minute) ExpectObjectReconciled(ctx, env.Client, terminationController, node) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + // Since TGP expired before the VolumeAttachments were deleted, this status condition should have never transitioned + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue()) + ExpectObjectReconciled(ctx, 
env.Client, terminationController, node) ExpectNotFound(ctx, env.Client, node) }) From 0f76cdc40f99765b47b5e195b5d7d31204c4eeb7 Mon Sep 17 00:00:00 2001 From: Jason Deal Date: Wed, 4 Dec 2024 13:41:27 -0800 Subject: [PATCH 3/5] remaining refactor --- pkg/controllers/controllers.go | 6 +- pkg/controllers/node/health/suite_test.go | 6 +- .../node/termination/controller.go | 112 +++++++---- pkg/controllers/node/termination/drain.go | 129 +++++++++++-- .../{terminator/events => }/events.go | 27 +-- .../node/termination/eviction/events.go | 43 +++++ .../{terminator => eviction}/eviction.go | 25 +-- .../{terminator => eviction}/metrics.go | 2 +- .../{terminator => eviction}/suite_test.go | 67 ++----- .../node/termination/instancetermination.go | 2 +- .../node/termination/suite_test.go | 10 +- .../node/termination/terminator/terminator.go | 177 ------------------ .../node/termination/volumedetachment.go | 5 +- .../nodeclaim/lifecycle/controller.go | 5 +- pkg/controllers/nodeclaim/lifecycle/events.go | 10 + pkg/events/suite_test.go | 45 ++--- 16 files changed, 309 insertions(+), 362 deletions(-) rename pkg/controllers/node/termination/{terminator/events => }/events.go (74%) create mode 100644 pkg/controllers/node/termination/eviction/events.go rename pkg/controllers/node/termination/{terminator => eviction}/eviction.go (89%) rename pkg/controllers/node/termination/{terminator => eviction}/metrics.go (98%) rename pkg/controllers/node/termination/{terminator => eviction}/suite_test.go (59%) delete mode 100644 pkg/controllers/node/termination/terminator/terminator.go diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index a7a5d4e403..f3e8b69b8a 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -39,7 +39,7 @@ import ( "sigs.k8s.io/karpenter/pkg/controllers/node/health" nodehydration "sigs.k8s.io/karpenter/pkg/controllers/node/hydration" "sigs.k8s.io/karpenter/pkg/controllers/node/termination" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" nodeclaimconsistency "sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/consistency" nodeclaimdisruption "sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/disruption" "sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/expiration" @@ -68,7 +68,7 @@ func NewControllers( ) []controller.Controller { cluster := state.NewCluster(clock, kubeClient, cloudProvider) p := provisioning.NewProvisioner(kubeClient, recorder, cloudProvider, cluster, clock) - evictionQueue := terminator.NewQueue(kubeClient, recorder) + evictionQueue := eviction.NewQueue(kubeClient, recorder) disruptionQueue := orchestration.NewQueue(kubeClient, recorder, cluster, clock, p) controllers := []controller.Controller{ @@ -83,7 +83,7 @@ func NewControllers( informer.NewPodController(kubeClient, cluster), informer.NewNodePoolController(kubeClient, cloudProvider, cluster), informer.NewNodeClaimController(kubeClient, cloudProvider, cluster), - termination.NewController(clock, kubeClient, cloudProvider, terminator.NewTerminator(clock, kubeClient, evictionQueue, recorder), recorder), + termination.NewController(clock, kubeClient, cloudProvider, recorder, evictionQueue), metricspod.NewController(kubeClient, cluster), metricsnodepool.NewController(kubeClient, cloudProvider), metricsnode.NewController(cluster), diff --git a/pkg/controllers/node/health/suite_test.go b/pkg/controllers/node/health/suite_test.go index ea2a368b20..cf856c9fe9 100644 --- 
a/pkg/controllers/node/health/suite_test.go +++ b/pkg/controllers/node/health/suite_test.go @@ -33,7 +33,7 @@ import ( "sigs.k8s.io/karpenter/pkg/cloudprovider" "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" "sigs.k8s.io/karpenter/pkg/controllers/node/health" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" "sigs.k8s.io/karpenter/pkg/metrics" "sigs.k8s.io/karpenter/pkg/test" . "sigs.k8s.io/karpenter/pkg/test/expectations" @@ -48,7 +48,7 @@ var env *test.Environment var fakeClock *clock.FakeClock var cloudProvider *fake.CloudProvider var recorder *test.EventRecorder -var queue *terminator.Queue +var queue *eviction.Queue func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) @@ -66,7 +66,7 @@ var _ = BeforeSuite(func() { cloudProvider = fake.NewCloudProvider() cloudProvider = fake.NewCloudProvider() recorder = test.NewEventRecorder() - queue = terminator.NewTestingQueue(env.Client, recorder) + queue = eviction.NewTestingQueue(env.Client, recorder) healthController = health.NewController(env.Client, cloudProvider, fakeClock, recorder) }) diff --git a/pkg/controllers/node/termination/controller.go b/pkg/controllers/node/termination/controller.go index fe28053f4f..3880eddd7c 100644 --- a/pkg/controllers/node/termination/controller.go +++ b/pkg/controllers/node/termination/controller.go @@ -21,6 +21,7 @@ import ( "fmt" "time" + "github.com/samber/lo" "golang.org/x/time/rate" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" @@ -41,7 +42,7 @@ import ( v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" "sigs.k8s.io/karpenter/pkg/events" "sigs.k8s.io/karpenter/pkg/metrics" "sigs.k8s.io/karpenter/pkg/operator/injection" @@ -53,53 +54,60 @@ type Controller struct { clock clock.Clock kubeClient client.Client cloudProvider cloudprovider.CloudProvider - terminator *terminator.Terminator - reconcilers []terminationReconciler + reconcilers []reconciler } -// TODO (jmdeal@): Split subreconcilers into individual controllers -type terminationReconciler interface { +type reconciler interface { Reconcile(context.Context, *corev1.Node, *v1.NodeClaim) (reconcile.Result, error) } // NewController constructs a controller instance -func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider cloudprovider.CloudProvider, terminator *terminator.Terminator, recorder events.Recorder) *Controller { +func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider cloudprovider.CloudProvider, recorder events.Recorder, evictionQueue *eviction.Queue) *Controller { return &Controller{ clock: clk, kubeClient: kubeClient, cloudProvider: cloudProvider, - terminator: terminator, - reconcilers: []terminationReconciler{ - &DrainReconciler{kubeClient, cloudProvider, recorder, terminator}, - &VolumeDetachmentReconciler{kubeClient, clk, recorder}, - &InstanceTerminationReconciler{kubeClient, cloudProvider, clk}, + reconcilers: []reconciler{ + &DrainReconciler{clk, kubeClient, cloudProvider, recorder, evictionQueue}, + &VolumeDetachmentReconciler{clk, kubeClient, recorder}, + &InstanceTerminationReconciler{clk, kubeClient, cloudProvider}, }, } } +func (c *Controller) Register(_ context.Context, m manager.Manager) error { + return controllerruntime.NewControllerManagedBy(m). + Named("node.termination"). 
+ For(&corev1.Node{}, builder.WithPredicates(nodeutils.IsManagedPredicateFuncs(c.cloudProvider))). + Watches(&v1.NodeClaim{}, nodeutils.NodeClaimEventHandler(c.kubeClient, c.cloudProvider)). + WithOptions( + controller.Options{ + RateLimiter: workqueue.NewTypedMaxOfRateLimiter[reconcile.Request]( + workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](100*time.Millisecond, 10*time.Second), + // 10 qps, 100 bucket size + &workqueue.TypedBucketRateLimiter[reconcile.Request]{Limiter: rate.NewLimiter(rate.Limit(10), 100)}, + ), + MaxConcurrentReconciles: 100, + }, + ). + Complete(reconcile.AsReconciler(m.GetClient(), c)) +} + // nolint:gocyclo func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node) (reconcile.Result, error) { ctx = injection.WithControllerName(ctx, "node.termination") ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef(n.Namespace, n.Name))) - if !nodeutils.IsManaged(n, c.cloudProvider) { - return reconcile.Result{}, nil - } + if n.GetDeletionTimestamp().IsZero() { return reconcile.Result{}, nil } if !controllerutil.ContainsFinalizer(n, v1.TerminationFinalizer) { return reconcile.Result{}, nil } - - if err := c.terminator.Taint(ctx, n, v1.DisruptedNoScheduleTaint); err != nil { - if errors.IsNotFound(err) { - return reconcile.Result{}, nil - } - if errors.IsConflict(err) { - return reconcile.Result{Requeue: true}, nil - } - return reconcile.Result{}, fmt.Errorf("tainting node with %s, %w", pretty.Taint(v1.DisruptedNoScheduleTaint), err) + if !nodeutils.IsManaged(n, c.cloudProvider) { + return reconcile.Result{}, nil } + nc, err := nodeutils.NodeClaimForNode(ctx, c.kubeClient, n) if err != nil { if nodeutils.IsDuplicateNodeClaimError(err) || nodeutils.IsNodeClaimNotFoundError(err) { @@ -115,16 +123,55 @@ func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node) (reconcile.R } } + if err := c.prepareNode(ctx, n); err != nil { + if errors.IsNotFound(err) { + return reconcile.Result{}, nil + } + if errors.IsConflict(err) { + return reconcile.Result{Requeue: true}, nil + } + return reconcile.Result{}, fmt.Errorf("tainting node with %s, %w", pretty.Taint(v1.DisruptedNoScheduleTaint), err) + } + for _, r := range c.reconcilers { res, err := r.Reconcile(ctx, n, nc) if res.Requeue || res.RequeueAfter != 0 || err != nil { return res, err } } - return reconcile.Result{}, nil } +// prepareNode ensures that the node is ready to begin the drain / termination process. This includes ensuring that it +// is tainted appropriately and annotated to be ignored by external load balancers. +func (c *Controller) prepareNode(ctx context.Context, n *corev1.Node) error { + stored := n.DeepCopy() + + // Add the karpenter.sh/disrupted:NoSchedule taint to ensure no additional pods schedule to the Node during the + // drain process. + if !lo.ContainsBy(n.Spec.Taints, func(t corev1.Taint) bool { + return t.MatchTaint(&v1.DisruptedNoScheduleTaint) + }) { + n.Spec.Taints = append(n.Spec.Taints, v1.DisruptedNoScheduleTaint) + } + // Adding this label to the node ensures that the node is removed from the load-balancer target group while it is + // draining and before it is terminated. This prevents 500s coming prior to health check when the load balancer + // controller hasn't yet determined that the node and underlying connections are gone. 
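// Illustrative sketch of the node preparation step introduced below: compare against a deep copy so
// the mutation is idempotent, add the disrupted:NoSchedule taint only if it is missing, and label
// the node so external load balancers stop routing to it while it drains. The taint key is written
// out literally for illustration; the real code uses v1.DisruptedNoScheduleTaint and issues a patch
// through the client.
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/equality"
)

func prepareNode(n *corev1.Node) (changed bool) {
	stored := n.DeepCopy()
	taint := corev1.Taint{Key: "karpenter.sh/disrupted", Effect: corev1.TaintEffectNoSchedule}
	hasTaint := false
	for _, t := range n.Spec.Taints {
		if t.MatchTaint(&taint) {
			hasTaint = true
			break
		}
	}
	if !hasTaint {
		n.Spec.Taints = append(n.Spec.Taints, taint)
	}
	if n.Labels == nil {
		n.Labels = map[string]string{}
	}
	n.Labels[corev1.LabelNodeExcludeBalancers] = "karpenter"
	// Only issue a patch (omitted here) when something actually changed.
	return !equality.Semantic.DeepEqual(stored, n)
}

func main() {
	n := &corev1.Node{}
	fmt.Println("first call changed node:", prepareNode(n))
	fmt.Println("second call changed node:", prepareNode(n))
}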
+ // https://github.com/aws/aws-node-termination-handler/issues/316 + // https://github.com/aws/karpenter/pull/2518 + n.Labels = lo.Assign(n.Labels, map[string]string{ + corev1.LabelNodeExcludeBalancers: "karpenter", + }) + + if equality.Semantic.DeepEqual(n, stored) { + return nil + } + // We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch can cause races due + // to the fact that it fully replaces the list on a change. Here, we are updating the taint list. + return c.kubeClient.Patch(ctx, n, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})) +} + +// removeFinalizer removes Karpenter's termination finalizer from the given Node, updating the termination metrics in the process. func removeFinalizer(ctx context.Context, kubeClient client.Client, n *corev1.Node) error { stored := n.DeepCopy() controllerutil.RemoveFinalizer(n, v1.TerminationFinalizer) @@ -153,20 +200,3 @@ func removeFinalizer(ctx context.Context, kubeClient client.Client, n *corev1.No } return nil } - -func (c *Controller) Register(_ context.Context, m manager.Manager) error { - return controllerruntime.NewControllerManagedBy(m). - Named("node.termination"). - For(&corev1.Node{}, builder.WithPredicates(nodeutils.IsManagedPredicateFuncs(c.cloudProvider))). - WithOptions( - controller.Options{ - RateLimiter: workqueue.NewTypedMaxOfRateLimiter[reconcile.Request]( - workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](100*time.Millisecond, 10*time.Second), - // 10 qps, 100 bucket size - &workqueue.TypedBucketRateLimiter[reconcile.Request]{Limiter: rate.NewLimiter(rate.Limit(10), 100)}, - ), - MaxConcurrentReconciles: 100, - }, - ). - Complete(reconcile.AsReconciler(m.GetClient(), c)) -} diff --git a/pkg/controllers/node/termination/drain.go b/pkg/controllers/node/termination/drain.go index e18c13f7fc..d95c7a3183 100644 --- a/pkg/controllers/node/termination/drain.go +++ b/pkg/controllers/node/termination/drain.go @@ -18,30 +18,50 @@ package termination import ( "context" + "errors" "fmt" "time" + "github.com/samber/lo" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/utils/clock" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator" - terminatorevents "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator/events" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" "sigs.k8s.io/karpenter/pkg/events" "sigs.k8s.io/karpenter/pkg/metrics" nodeutils "sigs.k8s.io/karpenter/pkg/utils/node" "sigs.k8s.io/karpenter/pkg/utils/nodeclaim" + podutils "sigs.k8s.io/karpenter/pkg/utils/pod" ) +type nodeDrainError struct { + error +} + +func newNodeDrainError(err error) *nodeDrainError { + return &nodeDrainError{error: err} +} + +func isNodeDrainError(err error) bool { + if err == nil { + return false + } + var nodeDrainErr *nodeDrainError + return errors.As(err, &nodeDrainErr) +} + type DrainReconciler struct { + clock clock.Clock kubeClient client.Client cloudProvider cloudprovider.CloudProvider recorder events.Recorder - terminator *terminator.Terminator + evictionQueue *eviction.Queue } // nolint:gocyclo @@ -56,11 +76,11 @@ func (d *DrainReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1. 
return reconcile.Result{}, nil } if tgpExpirationTime != nil { - d.recorder.Publish(terminatorevents.NodeTerminationGracePeriodExpiring(n, *tgpExpirationTime)) + d.recorder.Publish(NodeTerminationGracePeriodExpiringEvent(n, *tgpExpirationTime)) } - if err := d.terminator.Drain(ctx, n, tgpExpirationTime); err != nil { - if !terminator.IsNodeDrainError(err) { + if err := d.drain(ctx, n, tgpExpirationTime); err != nil { + if !isNodeDrainError(err) { return reconcile.Result{}, fmt.Errorf("draining node, %w", err) } // If the underlying NodeClaim no longer exists, we want to delete to avoid trying to gracefully draining @@ -79,26 +99,26 @@ func (d *DrainReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1. stored := nc.DeepCopy() if nc.StatusConditions().SetFalse(v1.ConditionTypeDrained, "Draining", "Draining") { if err := d.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { - if errors.IsNotFound(err) { + if apierrors.IsNotFound(err) { return reconcile.Result{}, nil } - if errors.IsConflict(err) { + if apierrors.IsConflict(err) { return reconcile.Result{Requeue: true}, nil } return reconcile.Result{}, err } } - d.recorder.Publish(terminatorevents.NodeFailedToDrain(n, err)) + d.recorder.Publish(NodeDrainFailedEvent(n, err)) return reconcile.Result{RequeueAfter: 5 * time.Second}, nil } stored := nc.DeepCopy() _ = nc.StatusConditions().SetTrue(v1.ConditionTypeDrained) if err := d.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); client.IgnoreNotFound(err) != nil { - if errors.IsNotFound(err) { + if apierrors.IsNotFound(err) { return reconcile.Result{}, nil } - if errors.IsConflict(err) { + if apierrors.IsConflict(err) { return reconcile.Result{Requeue: true}, nil } return reconcile.Result{}, err @@ -108,3 +128,88 @@ func (d *DrainReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1. }) return reconcile.Result{}, nil } + +// Drain evicts pods from the node and returns true when all pods are evicted +// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown +func (d *DrainReconciler) drain(ctx context.Context, node *corev1.Node, nodeGracePeriodExpirationTime *time.Time) error { + pods, err := nodeutils.GetPods(ctx, d.kubeClient, node) + if err != nil { + return fmt.Errorf("listing pods on node, %w", err) + } + podsToDelete := lo.Filter(pods, func(p *corev1.Pod, _ int) bool { + return podutils.IsWaitingEviction(p, d.clock) && !podutils.IsTerminating(p) + }) + if err := d.DeleteExpiringPods(ctx, podsToDelete, nodeGracePeriodExpirationTime); err != nil { + return fmt.Errorf("deleting expiring pods, %w", err) + } + // Monitor pods in pod groups that either haven't been evicted or are actively evicting + podGroups := d.groupPodsByPriority(lo.Filter(pods, func(p *corev1.Pod, _ int) bool { return podutils.IsWaitingEviction(p, d.clock) })) + for _, group := range podGroups { + if len(group) > 0 { + // Only add pods to the eviction queue that haven't been evicted yet + d.evictionQueue.Add(lo.Filter(group, func(p *corev1.Pod, _ int) bool { return podutils.IsEvictable(p) })...) + return newNodeDrainError(fmt.Errorf("%d pods are waiting to be evicted", lo.SumBy(podGroups, func(pods []*corev1.Pod) int { return len(pods) }))) + } + } + return nil +} + +func (d *DrainReconciler) groupPodsByPriority(pods []*corev1.Pod) [][]*corev1.Pod { + // 1. 
Prioritize noncritical pods, non-daemon pods https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown + var nonCriticalNonDaemon, nonCriticalDaemon, criticalNonDaemon, criticalDaemon []*corev1.Pod + for _, pod := range pods { + if pod.Spec.PriorityClassName == "system-cluster-critical" || pod.Spec.PriorityClassName == "system-node-critical" { + if podutils.IsOwnedByDaemonSet(pod) { + criticalDaemon = append(criticalDaemon, pod) + } else { + criticalNonDaemon = append(criticalNonDaemon, pod) + } + } else { + if podutils.IsOwnedByDaemonSet(pod) { + nonCriticalDaemon = append(nonCriticalDaemon, pod) + } else { + nonCriticalNonDaemon = append(nonCriticalNonDaemon, pod) + } + } + } + return [][]*corev1.Pod{nonCriticalNonDaemon, nonCriticalDaemon, criticalNonDaemon, criticalDaemon} +} + +func (d *DrainReconciler) DeleteExpiringPods(ctx context.Context, pods []*corev1.Pod, nodeGracePeriodTerminationTime *time.Time) error { + for _, pod := range pods { + // check if the node has an expiration time and the pod needs to be deleted + deleteTime := d.podDeleteTimeWithGracePeriod(nodeGracePeriodTerminationTime, pod) + if deleteTime != nil && time.Now().After(*deleteTime) { + // delete pod proactively to give as much of its terminationGracePeriodSeconds as possible for deletion + // ensure that we clamp the maximum pod terminationGracePeriodSeconds to the node's remaining expiration time in the delete command + gracePeriodSeconds := lo.ToPtr(int64(time.Until(*nodeGracePeriodTerminationTime).Seconds())) + d.recorder.Publish(PodDeletedEvent(pod, gracePeriodSeconds, nodeGracePeriodTerminationTime)) + opts := &client.DeleteOptions{ + GracePeriodSeconds: gracePeriodSeconds, + } + if err := d.kubeClient.Delete(ctx, pod, opts); err != nil && !apierrors.IsNotFound(err) { // ignore 404, not a problem + return fmt.Errorf("deleting pod, %w", err) // otherwise, bubble up the error + } + log.FromContext(ctx).WithValues( + "namespace", pod.Namespace, + "name", pod.Name, + "pod.terminationGracePeriodSeconds", *pod.Spec.TerminationGracePeriodSeconds, + "delete.gracePeriodSeconds", *gracePeriodSeconds, + "nodeclaim.terminationTime", *nodeGracePeriodTerminationTime, + ).V(1).Info("deleting pod") + } + } + return nil +} + +// if a pod should be deleted to give it the full terminationGracePeriodSeconds of time before the node will shut down, return the time the pod should be deleted +func (*DrainReconciler) podDeleteTimeWithGracePeriod(nodeGracePeriodExpirationTime *time.Time, pod *corev1.Pod) *time.Time { + if nodeGracePeriodExpirationTime == nil || pod.Spec.TerminationGracePeriodSeconds == nil { // k8s defaults to 30s, so we should never see a nil TerminationGracePeriodSeconds + return nil + } + + // calculate the time the pod should be deleted to allow it's full grace period for termination, equal to its terminationGracePeriodSeconds before the node's expiration time + // eg: if a node will be force terminated in 30m, but the current pod has a grace period of 45m, we return a time of 15m ago + deleteTime := nodeGracePeriodExpirationTime.Add(time.Duration(*pod.Spec.TerminationGracePeriodSeconds) * time.Second * -1) + return &deleteTime +} diff --git a/pkg/controllers/node/termination/terminator/events/events.go b/pkg/controllers/node/termination/events.go similarity index 74% rename from pkg/controllers/node/termination/terminator/events/events.go rename to pkg/controllers/node/termination/events.go index 3c39e3cc1e..91ffbc26d5 100644 --- 
a/pkg/controllers/node/termination/terminator/events/events.go +++ b/pkg/controllers/node/termination/events.go @@ -14,41 +14,30 @@ See the License for the specific language governing permissions and limitations under the License. */ -package events +package termination import ( "fmt" "time" corev1 "k8s.io/api/core/v1" - storagev1 "k8s.io/api/storage/v1" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/events" ) -func EvictPod(pod *corev1.Pod) events.Event { - return events.Event{ - InvolvedObject: pod, - Type: corev1.EventTypeNormal, - Reason: "Evicted", - Message: "Evicted pod", - DedupeValues: []string{pod.Name}, - } -} - -func DisruptPodDelete(pod *corev1.Pod, gracePeriodSeconds *int64, nodeGracePeriodTerminationTime *time.Time) events.Event { +func PodDeletedEvent(pod *corev1.Pod, gracePeriodSeconds *int64, nodeGracePeriodTerminationTime *time.Time) events.Event { return events.Event{ InvolvedObject: pod, Type: corev1.EventTypeNormal, Reason: "Disrupted", Message: fmt.Sprintf("Deleting the pod to accommodate the terminationTime %v of the node. The pod was granted %v seconds of grace-period of its %v terminationGracePeriodSeconds. This bypasses the PDB of the pod and the do-not-disrupt annotation.", *nodeGracePeriodTerminationTime, *gracePeriodSeconds, pod.Spec.TerminationGracePeriodSeconds), - DedupeValues: []string{pod.Name}, + DedupeValues: []string{pod.Namespace, pod.Name}, } } -func NodeFailedToDrain(node *corev1.Node, err error) events.Event { +func NodeDrainFailedEvent(node *corev1.Node, err error) events.Event { return events.Event{ InvolvedObject: node, Type: corev1.EventTypeWarning, @@ -58,17 +47,17 @@ func NodeFailedToDrain(node *corev1.Node, err error) events.Event { } } -func NodeAwaitingVolumeDetachment(node *corev1.Node, volumeAttachments ...*storagev1.VolumeAttachment) events.Event { +func NodeAwaitingVolumeDetachmentEvent(node *corev1.Node, volumeAttachments ...*storagev1.VolumeAttachment) events.Event { return events.Event{ InvolvedObject: node, - Type: corev1.EventTypeWarning, + Type: corev1.EventTypeNormal, Reason: "AwaitingVolumeDetachment", Message: fmt.Sprintf("Awaiting deletion of %d VolumeAttachments bound to node", len(volumeAttachments)), DedupeValues: []string{node.Name}, } } -func NodeTerminationGracePeriodExpiring(node *corev1.Node, t time.Time) events.Event { +func NodeTerminationGracePeriodExpiringEvent(node *corev1.Node, t time.Time) events.Event { return events.Event{ InvolvedObject: node, Type: corev1.EventTypeWarning, @@ -78,7 +67,7 @@ func NodeTerminationGracePeriodExpiring(node *corev1.Node, t time.Time) events.E } } -func NodeClaimTerminationGracePeriodExpiring(nodeClaim *v1.NodeClaim, terminationTime string) events.Event { +func NodeClaimTerminationGracePeriodExpiringEvent(nodeClaim *v1.NodeClaim, terminationTime string) events.Event { return events.Event{ InvolvedObject: nodeClaim, Type: corev1.EventTypeWarning, diff --git a/pkg/controllers/node/termination/eviction/events.go b/pkg/controllers/node/termination/eviction/events.go new file mode 100644 index 0000000000..ff53cb0924 --- /dev/null +++ b/pkg/controllers/node/termination/eviction/events.go @@ -0,0 +1,43 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package eviction + +import ( + corev1 "k8s.io/api/core/v1" + + "sigs.k8s.io/karpenter/pkg/events" +) + +func PodEvictedEvent(pod *corev1.Pod) events.Event { + return events.Event{ + InvolvedObject: pod, + Type: corev1.EventTypeNormal, + Reason: "Evicted", + Message: "Evicted pod", + DedupeValues: []string{pod.Namespace, pod.Name}, + } +} + +func PodEvictionFailedEvent(pod *corev1.Pod, message string) events.Event { + return events.Event{ + InvolvedObject: pod, + Type: corev1.EventTypeWarning, + Reason: "FailedEviction", + Message: message, + DedupeValues: []string{pod.Namespace, pod.Name}, + } +} diff --git a/pkg/controllers/node/termination/terminator/eviction.go b/pkg/controllers/node/termination/eviction/eviction.go similarity index 89% rename from pkg/controllers/node/termination/terminator/eviction.go rename to pkg/controllers/node/termination/eviction/eviction.go index 4401837067..1682b67a74 100644 --- a/pkg/controllers/node/termination/terminator/eviction.go +++ b/pkg/controllers/node/termination/eviction/eviction.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package terminator +package eviction import ( "context" @@ -40,7 +40,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" - terminatorevents "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator/events" "sigs.k8s.io/karpenter/pkg/events" "sigs.k8s.io/karpenter/pkg/operator/injection" ) @@ -50,22 +49,6 @@ const ( evictionQueueMaxDelay = 10 * time.Second ) -type NodeDrainError struct { - error -} - -func NewNodeDrainError(err error) *NodeDrainError { - return &NodeDrainError{error: err} -} - -func IsNodeDrainError(err error) bool { - if err == nil { - return false - } - var nodeDrainErr *NodeDrainError - return errors.As(err, &nodeDrainErr) -} - type QueueKey struct { types.NamespacedName UID types.UID @@ -195,16 +178,16 @@ func (q *Queue) Evict(ctx context.Context, key QueueKey) bool { return true } if apierrors.IsTooManyRequests(err) { // 429 - PDB violation - q.recorder.Publish(terminatorevents.NodeFailedToDrain(&corev1.Node{ObjectMeta: metav1.ObjectMeta{ + q.recorder.Publish(PodEvictionFailedEvent(&corev1.Pod{ObjectMeta: metav1.ObjectMeta{ Name: key.Name, Namespace: key.Namespace, - }}, fmt.Errorf("evicting pod %s/%s violates a PDB", key.Namespace, key.Name))) + }}, fmt.Sprintf("evicting pod %s/%s violated a PDB", key.Namespace, key.Name))) return false } log.FromContext(ctx).Error(err, "failed evicting pod") return false } NodesEvictionRequestsTotal.Inc(map[string]string{CodeLabel: "200"}) - q.recorder.Publish(terminatorevents.EvictPod(&corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: key.Name, Namespace: key.Namespace}})) + q.recorder.Publish(PodEvictedEvent(&corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: key.Name, Namespace: key.Namespace}})) return true } diff --git a/pkg/controllers/node/termination/terminator/metrics.go b/pkg/controllers/node/termination/eviction/metrics.go similarity index 98% rename from pkg/controllers/node/termination/terminator/metrics.go rename to 
pkg/controllers/node/termination/eviction/metrics.go index c2c2fb381b..aecc80c115 100644 --- a/pkg/controllers/node/termination/terminator/metrics.go +++ b/pkg/controllers/node/termination/eviction/metrics.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package terminator +package eviction import ( opmetrics "github.com/awslabs/operatorpkg/metrics" diff --git a/pkg/controllers/node/termination/terminator/suite_test.go b/pkg/controllers/node/termination/eviction/suite_test.go similarity index 59% rename from pkg/controllers/node/termination/terminator/suite_test.go rename to pkg/controllers/node/termination/eviction/suite_test.go index 2b0e3a72d4..db4363e30e 100644 --- a/pkg/controllers/node/termination/terminator/suite_test.go +++ b/pkg/controllers/node/termination/eviction/suite_test.go @@ -14,13 +14,12 @@ See the License for the specific language governing permissions and limitations under the License. */ -package terminator_test +package eviction_test import ( "context" "sync" "testing" - "time" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -30,11 +29,10 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/uuid" - clock "k8s.io/utils/clock/testing" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/karpenter/pkg/apis" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" "sigs.k8s.io/karpenter/pkg/operator/options" "sigs.k8s.io/karpenter/pkg/test" . "sigs.k8s.io/karpenter/pkg/test/expectations" @@ -45,11 +43,9 @@ import ( var ctx context.Context var env *test.Environment var recorder *test.EventRecorder -var queue *terminator.Queue +var queue *eviction.Queue var pdb *policyv1.PodDisruptionBudget var pod *corev1.Pod -var fakeClock *clock.FakeClock -var terminatorInstance *terminator.Terminator func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) @@ -61,8 +57,7 @@ var _ = BeforeSuite(func() { env = test.NewEnvironment(test.WithCRDs(apis.CRDs...), test.WithCRDs(v1alpha1.CRDs...)) ctx = options.ToContext(ctx, test.Options()) recorder = test.NewEventRecorder() - queue = terminator.NewTestingQueue(env.Client, recorder) - terminatorInstance = terminator.NewTerminator(fakeClock, env.Client, queue, recorder) + queue = eviction.NewTestingQueue(env.Client, recorder) }) var _ = AfterSuite(func() { @@ -72,7 +67,7 @@ var _ = AfterSuite(func() { var _ = BeforeEach(func() { recorder.Reset() // Reset the events that we captured during the run // Shut down the queue and restart it to ensure no races - *queue = lo.FromPtr(terminator.NewTestingQueue(env.Client, recorder)) + *queue = lo.FromPtr(eviction.NewTestingQueue(env.Client, recorder)) }) var _ = AfterEach(func() { @@ -92,24 +87,24 @@ var _ = Describe("Eviction/Queue", func() { Labels: testLabels, }, }) - terminator.NodesEvictionRequestsTotal.Reset() + eviction.NodesEvictionRequestsTotal.Reset() }) Context("Eviction API", func() { It("should succeed with no event when the pod is not found", func() { - Expect(queue.Evict(ctx, terminator.NewQueueKey(pod))).To(BeTrue()) + Expect(queue.Evict(ctx, eviction.NewQueueKey(pod))).To(BeTrue()) Expect(recorder.Events()).To(HaveLen(0)) }) It("should succeed with no event when the pod UID conflicts", func() { ExpectApplied(ctx, env.Client, pod) - Expect(queue.Evict(ctx, terminator.QueueKey{NamespacedName: client.ObjectKeyFromObject(pod), UID: 
uuid.NewUUID()})).To(BeTrue()) - ExpectMetricCounterValue(terminator.NodesEvictionRequestsTotal, 1, map[string]string{terminator.CodeLabel: "409"}) + Expect(queue.Evict(ctx, eviction.QueueKey{NamespacedName: client.ObjectKeyFromObject(pod), UID: uuid.NewUUID()})).To(BeTrue()) + ExpectMetricCounterValue(eviction.NodesEvictionRequestsTotal, 1, map[string]string{eviction.CodeLabel: "409"}) Expect(recorder.Events()).To(HaveLen(0)) }) It("should succeed with an evicted event when there are no PDBs", func() { ExpectApplied(ctx, env.Client, pod) - Expect(queue.Evict(ctx, terminator.NewQueueKey(pod))).To(BeTrue()) - ExpectMetricCounterValue(terminator.NodesEvictionRequestsTotal, 1, map[string]string{terminator.CodeLabel: "200"}) + Expect(queue.Evict(ctx, eviction.NewQueueKey(pod))).To(BeTrue()) + ExpectMetricCounterValue(eviction.NodesEvictionRequestsTotal, 1, map[string]string{eviction.CodeLabel: "200"}) Expect(recorder.Calls("Evicted")).To(Equal(1)) }) It("should succeed with no event when there are PDBs that allow an eviction", func() { @@ -118,13 +113,13 @@ var _ = Describe("Eviction/Queue", func() { MaxUnavailable: &intstr.IntOrString{IntVal: 1}, }) ExpectApplied(ctx, env.Client, pod) - Expect(queue.Evict(ctx, terminator.NewQueueKey(pod))).To(BeTrue()) + Expect(queue.Evict(ctx, eviction.NewQueueKey(pod))).To(BeTrue()) Expect(recorder.Calls("Evicted")).To(Equal(1)) }) - It("should return a NodeDrainError event when a PDB is blocking", func() { + It("should emit a FailedEviction event when a PDB is blocking", func() { ExpectApplied(ctx, env.Client, pdb, pod) - Expect(queue.Evict(ctx, terminator.NewQueueKey(pod))).To(BeFalse()) - Expect(recorder.Calls("FailedDraining")).To(Equal(1)) + Expect(queue.Evict(ctx, eviction.NewQueueKey(pod))).To(BeFalse()) + Expect(recorder.Calls("FailedEviction")).To(Equal(1)) }) It("should fail when two PDBs refer to the same pod", func() { pdb2 := test.PodDisruptionBudget(test.PDBOptions{ @@ -132,8 +127,8 @@ var _ = Describe("Eviction/Queue", func() { MaxUnavailable: &intstr.IntOrString{IntVal: 0}, }) ExpectApplied(ctx, env.Client, pdb, pdb2, pod) - Expect(queue.Evict(ctx, terminator.NewQueueKey(pod))).To(BeFalse()) - ExpectMetricCounterValue(terminator.NodesEvictionRequestsTotal, 1, map[string]string{terminator.CodeLabel: "500"}) + Expect(queue.Evict(ctx, eviction.NewQueueKey(pod))).To(BeFalse()) + ExpectMetricCounterValue(eviction.NodesEvictionRequestsTotal, 1, map[string]string{eviction.CodeLabel: "500"}) }) It("should ensure that calling Evict() is valid while making Add() calls", func() { cancelCtx, cancel := context.WithCancel(ctx) @@ -162,32 +157,4 @@ var _ = Describe("Eviction/Queue", func() { } }) }) - - Context("Pod Deletion API", func() { - It("should not delete a pod with no nodeTerminationTime", func() { - ExpectApplied(ctx, env.Client, pod) - - Expect(terminatorInstance.DeleteExpiringPods(ctx, []*corev1.Pod{pod}, nil)).To(Succeed()) - ExpectExists(ctx, env.Client, pod) - Expect(recorder.Calls("Disrupted")).To(Equal(0)) - }) - It("should not delete a pod with terminationGracePeriodSeconds still remaining before nodeTerminationTime", func() { - pod.Spec.TerminationGracePeriodSeconds = lo.ToPtr[int64](60) - ExpectApplied(ctx, env.Client, pod) - - nodeTerminationTime := time.Now().Add(time.Minute * 5) - Expect(terminatorInstance.DeleteExpiringPods(ctx, []*corev1.Pod{pod}, &nodeTerminationTime)).To(Succeed()) - ExpectExists(ctx, env.Client, pod) - Expect(recorder.Calls("Disrupted")).To(Equal(0)) - }) - It("should delete a pod with less than 
terminationGracePeriodSeconds remaining before nodeTerminationTime", func() { - pod.Spec.TerminationGracePeriodSeconds = lo.ToPtr[int64](120) - ExpectApplied(ctx, env.Client, pod) - - nodeTerminationTime := time.Now().Add(time.Minute * 1) - Expect(terminatorInstance.DeleteExpiringPods(ctx, []*corev1.Pod{pod}, &nodeTerminationTime)).To(Succeed()) - ExpectNotFound(ctx, env.Client, pod) - Expect(recorder.Calls("Disrupted")).To(Equal(1)) - }) - }) }) diff --git a/pkg/controllers/node/termination/instancetermination.go b/pkg/controllers/node/termination/instancetermination.go index 2e97c4edd0..5ea7086e43 100644 --- a/pkg/controllers/node/termination/instancetermination.go +++ b/pkg/controllers/node/termination/instancetermination.go @@ -35,9 +35,9 @@ import ( ) type InstanceTerminationReconciler struct { + clk clock.Clock kubeClient client.Client cloudProvider cloudprovider.CloudProvider - clk clock.Clock } func (i *InstanceTerminationReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { diff --git a/pkg/controllers/node/termination/suite_test.go b/pkg/controllers/node/termination/suite_test.go index b193deba72..deb6a5c629 100644 --- a/pkg/controllers/node/termination/suite_test.go +++ b/pkg/controllers/node/termination/suite_test.go @@ -34,7 +34,7 @@ import ( v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" "sigs.k8s.io/karpenter/pkg/controllers/node/termination" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" "sigs.k8s.io/karpenter/pkg/metrics" "sigs.k8s.io/karpenter/pkg/test" . "sigs.k8s.io/karpenter/pkg/test/expectations" @@ -49,7 +49,7 @@ var defaultOwnerRefs = []metav1.OwnerReference{{Kind: "ReplicaSet", APIVersion: var fakeClock *clock.FakeClock var cloudProvider *fake.CloudProvider var recorder *test.EventRecorder -var queue *terminator.Queue +var queue *eviction.Queue func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) @@ -67,8 +67,8 @@ var _ = BeforeSuite(func() { cloudProvider = fake.NewCloudProvider() recorder = test.NewEventRecorder() - queue = terminator.NewTestingQueue(env.Client, recorder) - terminationController = termination.NewController(fakeClock, env.Client, cloudProvider, terminator.NewTerminator(fakeClock, env.Client, queue, recorder), recorder) + queue = eviction.NewTestingQueue(env.Client, recorder) + terminationController = termination.NewController(fakeClock, env.Client, cloudProvider, recorder, queue) }) var _ = AfterSuite(func() { @@ -83,7 +83,7 @@ var _ = Describe("Termination", func() { BeforeEach(func() { fakeClock.SetTime(time.Now()) cloudProvider.Reset() - *queue = lo.FromPtr(terminator.NewTestingQueue(env.Client, recorder)) + *queue = lo.FromPtr(eviction.NewTestingQueue(env.Client, recorder)) nodePool = test.NodePool() nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{ObjectMeta: metav1.ObjectMeta{Finalizers: []string{v1.TerminationFinalizer}}}) diff --git a/pkg/controllers/node/termination/terminator/terminator.go b/pkg/controllers/node/termination/terminator/terminator.go deleted file mode 100644 index 82940aa8ef..0000000000 --- a/pkg/controllers/node/termination/terminator/terminator.go +++ /dev/null @@ -1,177 +0,0 @@ -/* -Copyright The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package terminator - -import ( - "context" - "fmt" - "time" - - "github.com/samber/lo" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/equality" - apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/utils/clock" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" - - terminatorevents "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator/events" - "sigs.k8s.io/karpenter/pkg/events" - nodeutils "sigs.k8s.io/karpenter/pkg/utils/node" - podutil "sigs.k8s.io/karpenter/pkg/utils/pod" -) - -type Terminator struct { - clock clock.Clock - kubeClient client.Client - evictionQueue *Queue - recorder events.Recorder -} - -func NewTerminator(clk clock.Clock, kubeClient client.Client, eq *Queue, recorder events.Recorder) *Terminator { - return &Terminator{ - clock: clk, - kubeClient: kubeClient, - evictionQueue: eq, - recorder: recorder, - } -} - -// Taint idempotently adds a given taint to a node with a NodeClaim -func (t *Terminator) Taint(ctx context.Context, node *corev1.Node, taint corev1.Taint) error { - stored := node.DeepCopy() - // If the node already has the correct taint (key and effect), do nothing. - if _, ok := lo.Find(node.Spec.Taints, func(t corev1.Taint) bool { - return t.MatchTaint(&taint) - }); !ok { - // Otherwise, if the taint key exists (but with a different effect), remove it. - node.Spec.Taints = lo.Reject(node.Spec.Taints, func(t corev1.Taint, _ int) bool { - return t.Key == taint.Key - }) - node.Spec.Taints = append(node.Spec.Taints, taint) - } - // Adding this label to the node ensures that the node is removed from the load-balancer target group - // while it is draining and before it is terminated. 
This prevents 500s coming prior to health check - // when the load balancer controller hasn't yet determined that the node and underlying connections are gone - // https://github.com/aws/aws-node-termination-handler/issues/316 - // https://github.com/aws/karpenter/pull/2518 - node.Labels = lo.Assign(node.Labels, map[string]string{ - corev1.LabelNodeExcludeBalancers: "karpenter", - }) - if !equality.Semantic.DeepEqual(node, stored) { - // We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch - // can cause races due to the fact that it fully replaces the list on a change - // Here, we are updating the taint list - if err := t.kubeClient.Patch(ctx, node, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { - return err - } - taintValues := []any{ - "taint.Key", taint.Key, - "taint.Value", taint.Value, - } - if len(string(taint.Effect)) > 0 { - taintValues = append(taintValues, "taint.Effect", taint.Effect) - } - log.FromContext(ctx).WithValues(taintValues...).Info("tainted node") - } - return nil -} - -// Drain evicts pods from the node and returns true when all pods are evicted -// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown -func (t *Terminator) Drain(ctx context.Context, node *corev1.Node, nodeGracePeriodExpirationTime *time.Time) error { - pods, err := nodeutils.GetPods(ctx, t.kubeClient, node) - if err != nil { - return fmt.Errorf("listing pods on node, %w", err) - } - podsToDelete := lo.Filter(pods, func(p *corev1.Pod, _ int) bool { - return podutil.IsWaitingEviction(p, t.clock) && !podutil.IsTerminating(p) - }) - if err := t.DeleteExpiringPods(ctx, podsToDelete, nodeGracePeriodExpirationTime); err != nil { - return fmt.Errorf("deleting expiring pods, %w", err) - } - // Monitor pods in pod groups that either haven't been evicted or are actively evicting - podGroups := t.groupPodsByPriority(lo.Filter(pods, func(p *corev1.Pod, _ int) bool { return podutil.IsWaitingEviction(p, t.clock) })) - for _, group := range podGroups { - if len(group) > 0 { - // Only add pods to the eviction queue that haven't been evicted yet - t.evictionQueue.Add(lo.Filter(group, func(p *corev1.Pod, _ int) bool { return podutil.IsEvictable(p) })...) - return NewNodeDrainError(fmt.Errorf("%d pods are waiting to be evicted", lo.SumBy(podGroups, func(pods []*corev1.Pod) int { return len(pods) }))) - } - } - return nil -} - -func (t *Terminator) groupPodsByPriority(pods []*corev1.Pod) [][]*corev1.Pod { - // 1. 
Prioritize noncritical pods, non-daemon pods https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown - var nonCriticalNonDaemon, nonCriticalDaemon, criticalNonDaemon, criticalDaemon []*corev1.Pod - for _, pod := range pods { - if pod.Spec.PriorityClassName == "system-cluster-critical" || pod.Spec.PriorityClassName == "system-node-critical" { - if podutil.IsOwnedByDaemonSet(pod) { - criticalDaemon = append(criticalDaemon, pod) - } else { - criticalNonDaemon = append(criticalNonDaemon, pod) - } - } else { - if podutil.IsOwnedByDaemonSet(pod) { - nonCriticalDaemon = append(nonCriticalDaemon, pod) - } else { - nonCriticalNonDaemon = append(nonCriticalNonDaemon, pod) - } - } - } - return [][]*corev1.Pod{nonCriticalNonDaemon, nonCriticalDaemon, criticalNonDaemon, criticalDaemon} -} - -func (t *Terminator) DeleteExpiringPods(ctx context.Context, pods []*corev1.Pod, nodeGracePeriodTerminationTime *time.Time) error { - for _, pod := range pods { - // check if the node has an expiration time and the pod needs to be deleted - deleteTime := t.podDeleteTimeWithGracePeriod(nodeGracePeriodTerminationTime, pod) - if deleteTime != nil && time.Now().After(*deleteTime) { - // delete pod proactively to give as much of its terminationGracePeriodSeconds as possible for deletion - // ensure that we clamp the maximum pod terminationGracePeriodSeconds to the node's remaining expiration time in the delete command - gracePeriodSeconds := lo.ToPtr(int64(time.Until(*nodeGracePeriodTerminationTime).Seconds())) - t.recorder.Publish(terminatorevents.DisruptPodDelete(pod, gracePeriodSeconds, nodeGracePeriodTerminationTime)) - opts := &client.DeleteOptions{ - GracePeriodSeconds: gracePeriodSeconds, - } - if err := t.kubeClient.Delete(ctx, pod, opts); err != nil && !apierrors.IsNotFound(err) { // ignore 404, not a problem - return fmt.Errorf("deleting pod, %w", err) // otherwise, bubble up the error - } - log.FromContext(ctx).WithValues( - "namespace", pod.Namespace, - "name", pod.Name, - "pod.terminationGracePeriodSeconds", *pod.Spec.TerminationGracePeriodSeconds, - "delete.gracePeriodSeconds", *gracePeriodSeconds, - "nodeclaim.terminationTime", *nodeGracePeriodTerminationTime, - ).V(1).Info("deleting pod") - } - } - return nil -} - -// if a pod should be deleted to give it the full terminationGracePeriodSeconds of time before the node will shut down, return the time the pod should be deleted -func (t *Terminator) podDeleteTimeWithGracePeriod(nodeGracePeriodExpirationTime *time.Time, pod *corev1.Pod) *time.Time { - if nodeGracePeriodExpirationTime == nil || pod.Spec.TerminationGracePeriodSeconds == nil { // k8s defaults to 30s, so we should never see a nil TerminationGracePeriodSeconds - return nil - } - - // calculate the time the pod should be deleted to allow it's full grace period for termination, equal to its terminationGracePeriodSeconds before the node's expiration time - // eg: if a node will be force terminated in 30m, but the current pod has a grace period of 45m, we return a time of 15m ago - deleteTime := nodeGracePeriodExpirationTime.Add(time.Duration(*pod.Spec.TerminationGracePeriodSeconds) * time.Second * -1) - return &deleteTime -} diff --git a/pkg/controllers/node/termination/volumedetachment.go b/pkg/controllers/node/termination/volumedetachment.go index 109d80d1a3..9b445ee02a 100644 --- a/pkg/controllers/node/termination/volumedetachment.go +++ b/pkg/controllers/node/termination/volumedetachment.go @@ -30,7 +30,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" v1 
"sigs.k8s.io/karpenter/pkg/apis/v1" - terminatorevents "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator/events" "sigs.k8s.io/karpenter/pkg/events" storagev1 "k8s.io/api/storage/v1" @@ -42,8 +41,8 @@ import ( ) type VolumeDetachmentReconciler struct { - kubeClient client.Client clk clock.Clock + kubeClient client.Client recorder events.Recorder } @@ -78,7 +77,7 @@ func (v *VolumeDetachmentReconciler) Reconcile(ctx context.Context, n *corev1.No } return reconcile.Result{}, err } - v.recorder.Publish(terminatorevents.NodeAwaitingVolumeDetachment(n, blockingVAs...)) + v.recorder.Publish(NodeAwaitingVolumeDetachmentEvent(n, blockingVAs...)) return reconcile.Result{RequeueAfter: 5 * time.Second}, nil } diff --git a/pkg/controllers/nodeclaim/lifecycle/controller.go b/pkg/controllers/nodeclaim/lifecycle/controller.go index 6293af6b94..ea87ac0fea 100644 --- a/pkg/controllers/nodeclaim/lifecycle/controller.go +++ b/pkg/controllers/nodeclaim/lifecycle/controller.go @@ -40,8 +40,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" - terminatorevents "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator/events" - v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider" "sigs.k8s.io/karpenter/pkg/events" @@ -279,7 +277,6 @@ func (c *Controller) annotateTerminationGracePeriodTerminationTime(ctx context.C return client.IgnoreNotFound(err) } log.FromContext(ctx).WithValues(v1.NodeClaimTerminationTimestampAnnotationKey, terminationTime).Info("annotated nodeclaim") - c.recorder.Publish(terminatorevents.NodeClaimTerminationGracePeriodExpiring(nodeClaim, terminationTime)) - + c.recorder.Publish(NodeClaimTerminationGracePeriodExpiringEvent(nodeClaim, terminationTime)) return nil } diff --git a/pkg/controllers/nodeclaim/lifecycle/events.go b/pkg/controllers/nodeclaim/lifecycle/events.go index 2697b1cdc0..4480057471 100644 --- a/pkg/controllers/nodeclaim/lifecycle/events.go +++ b/pkg/controllers/nodeclaim/lifecycle/events.go @@ -44,3 +44,13 @@ func NodeClassNotReadyEvent(nodeClaim *v1.NodeClaim, err error) events.Event { DedupeValues: []string{string(nodeClaim.UID)}, } } + +func NodeClaimTerminationGracePeriodExpiringEvent(nodeClaim *v1.NodeClaim, terminationTime string) events.Event { + return events.Event{ + InvolvedObject: nodeClaim, + Type: corev1.EventTypeWarning, + Reason: "TerminationGracePeriodExpiring", + Message: fmt.Sprintf("All pods will be deleted by %s", terminationTime), + DedupeValues: []string{nodeClaim.Name}, + } +} diff --git a/pkg/events/suite_test.go b/pkg/events/suite_test.go index fec8d1462a..2c53df92ef 100644 --- a/pkg/events/suite_test.go +++ b/pkg/events/suite_test.go @@ -30,8 +30,9 @@ import ( "k8s.io/client-go/util/flowcontrol" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" - terminatorevents "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator/events" - schedulingevents "sigs.k8s.io/karpenter/pkg/controllers/provisioning/scheduling" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" + "sigs.k8s.io/karpenter/pkg/controllers/provisioning/scheduling" "sigs.k8s.io/karpenter/pkg/events" "sigs.k8s.io/karpenter/pkg/test" ) @@ -78,26 +79,26 @@ func TestRecorder(t *testing.T) { var _ = BeforeEach(func() { internalRecorder = NewInternalRecorder() eventRecorder = events.NewRecorder(internalRecorder) - schedulingevents.PodNominationRateLimiter = flowcontrol.NewTokenBucketRateLimiter(5, 10) + 
scheduling.PodNominationRateLimiter = flowcontrol.NewTokenBucketRateLimiter(5, 10) }) var _ = Describe("Event Creation", func() { It("should create a NominatePod event", func() { - eventRecorder.Publish(schedulingevents.NominatePodEvent(PodWithUID(), NodeWithUID(), NodeClaimWithUID())) - Expect(internalRecorder.Calls(schedulingevents.NominatePodEvent(PodWithUID(), NodeWithUID(), NodeClaimWithUID()).Reason)).To(Equal(1)) + eventRecorder.Publish(scheduling.NominatePodEvent(PodWithUID(), NodeWithUID(), NodeClaimWithUID())) + Expect(internalRecorder.Calls(scheduling.NominatePodEvent(PodWithUID(), NodeWithUID(), NodeClaimWithUID()).Reason)).To(Equal(1)) }) It("should create a EvictPod event", func() { - eventRecorder.Publish(terminatorevents.EvictPod(PodWithUID())) - Expect(internalRecorder.Calls(terminatorevents.EvictPod(PodWithUID()).Reason)).To(Equal(1)) + eventRecorder.Publish(eviction.PodEvictedEvent(PodWithUID())) + Expect(internalRecorder.Calls(eviction.PodEvictedEvent(PodWithUID()).Reason)).To(Equal(1)) }) It("should create a PodFailedToSchedule event", func() { - eventRecorder.Publish(schedulingevents.PodFailedToScheduleEvent(PodWithUID(), fmt.Errorf(""))) - Expect(internalRecorder.Calls(schedulingevents.PodFailedToScheduleEvent(PodWithUID(), fmt.Errorf("")).Reason)).To(Equal(1)) + eventRecorder.Publish(scheduling.PodFailedToScheduleEvent(PodWithUID(), fmt.Errorf(""))) + Expect(internalRecorder.Calls(scheduling.PodFailedToScheduleEvent(PodWithUID(), fmt.Errorf("")).Reason)).To(Equal(1)) }) It("should create a NodeFailedToDrain event", func() { - eventRecorder.Publish(terminatorevents.NodeFailedToDrain(NodeWithUID(), fmt.Errorf(""))) - Expect(internalRecorder.Calls(terminatorevents.NodeFailedToDrain(NodeWithUID(), fmt.Errorf("")).Reason)).To(Equal(1)) + eventRecorder.Publish(termination.NodeDrainFailedEvent(NodeWithUID(), fmt.Errorf(""))) + Expect(internalRecorder.Calls(termination.NodeDrainFailedEvent(NodeWithUID(), fmt.Errorf("")).Reason)).To(Equal(1)) }) }) @@ -105,49 +106,49 @@ var _ = Describe("Dedupe", func() { It("should only create a single event when many events are created quickly", func() { pod := PodWithUID() for i := 0; i < 100; i++ { - eventRecorder.Publish(terminatorevents.EvictPod(pod)) + eventRecorder.Publish(eviction.PodEvictedEvent(pod)) } - Expect(internalRecorder.Calls(terminatorevents.EvictPod(PodWithUID()).Reason)).To(Equal(1)) + Expect(internalRecorder.Calls(eviction.PodEvictedEvent(PodWithUID()).Reason)).To(Equal(1)) }) It("should allow the dedupe timeout to be overridden", func() { pod := PodWithUID() - evt := terminatorevents.EvictPod(pod) + evt := eviction.PodEvictedEvent(pod) evt.DedupeTimeout = time.Second * 2 // Generate a set of events within the dedupe timeout for i := 0; i < 10; i++ { eventRecorder.Publish(evt) } - Expect(internalRecorder.Calls(terminatorevents.EvictPod(PodWithUID()).Reason)).To(Equal(1)) + Expect(internalRecorder.Calls(eviction.PodEvictedEvent(PodWithUID()).Reason)).To(Equal(1)) // Wait until after the overridden dedupe timeout time.Sleep(time.Second * 3) eventRecorder.Publish(evt) - Expect(internalRecorder.Calls(terminatorevents.EvictPod(PodWithUID()).Reason)).To(Equal(2)) + Expect(internalRecorder.Calls(eviction.PodEvictedEvent(PodWithUID()).Reason)).To(Equal(2)) }) It("should allow events with different entities to be created", func() { for i := 0; i < 100; i++ { - eventRecorder.Publish(terminatorevents.EvictPod(PodWithUID())) + eventRecorder.Publish(eviction.PodEvictedEvent(PodWithUID())) } - 
Expect(internalRecorder.Calls(terminatorevents.EvictPod(PodWithUID()).Reason)).To(Equal(100)) + Expect(internalRecorder.Calls(eviction.PodEvictedEvent(PodWithUID()).Reason)).To(Equal(100)) }) }) var _ = Describe("Rate Limiting", func() { It("should only create max-burst when many events are created quickly", func() { for i := 0; i < 100; i++ { - eventRecorder.Publish(schedulingevents.NominatePodEvent(PodWithUID(), NodeWithUID(), NodeClaimWithUID())) + eventRecorder.Publish(scheduling.NominatePodEvent(PodWithUID(), NodeWithUID(), NodeClaimWithUID())) } - Expect(internalRecorder.Calls(schedulingevents.NominatePodEvent(PodWithUID(), NodeWithUID(), NodeClaimWithUID()).Reason)).To(Equal(10)) + Expect(internalRecorder.Calls(scheduling.NominatePodEvent(PodWithUID(), NodeWithUID(), NodeClaimWithUID()).Reason)).To(Equal(10)) }) It("should allow many events over time due to smoothed rate limiting", func() { for i := 0; i < 3; i++ { for j := 0; j < 5; j++ { - eventRecorder.Publish(schedulingevents.NominatePodEvent(PodWithUID(), NodeWithUID(), NodeClaimWithUID())) + eventRecorder.Publish(scheduling.NominatePodEvent(PodWithUID(), NodeWithUID(), NodeClaimWithUID())) } time.Sleep(time.Second) } - Expect(internalRecorder.Calls(schedulingevents.NominatePodEvent(PodWithUID(), NodeWithUID(), NodeClaimWithUID()).Reason)).To(Equal(15)) + Expect(internalRecorder.Calls(scheduling.NominatePodEvent(PodWithUID(), NodeWithUID(), NodeClaimWithUID()).Reason)).To(Equal(15)) }) }) From 6df1a3c99a53d6aa25027ef92b50a407b6229f04 Mon Sep 17 00:00:00 2001 From: Jason Deal Date: Mon, 9 Dec 2024 09:15:48 -0800 Subject: [PATCH 4/5] finalizer hydration --- pkg/apis/v1/labels.go | 10 ++ pkg/controllers/node/hydration/controller.go | 55 +++++++---- pkg/controllers/node/hydration/suite_test.go | 89 ++++++++++++------ .../nodeclaim/hydration/controller.go | 47 ++++++---- .../nodeclaim/hydration/suite_test.go | 69 ++++++++------ pkg/utils/node/node.go | 66 ++++---------- pkg/utils/node/types.go | 91 +++++++++++++++++++ pkg/utils/nodeclaim/nodeclaim.go | 58 ++---------- pkg/utils/nodeclaim/types.go | 71 +++++++++++++++ 9 files changed, 367 insertions(+), 189 deletions(-) create mode 100644 pkg/utils/node/types.go create mode 100644 pkg/utils/nodeclaim/types.go diff --git a/pkg/apis/v1/labels.go b/pkg/apis/v1/labels.go index 9364936dfe..a627779474 100644 --- a/pkg/apis/v1/labels.go +++ b/pkg/apis/v1/labels.go @@ -50,11 +50,14 @@ const ( NodePoolHashAnnotationKey = apis.Group + "/nodepool-hash" NodePoolHashVersionAnnotationKey = apis.Group + "/nodepool-hash-version" NodeClaimTerminationTimestampAnnotationKey = apis.Group + "/nodeclaim-termination-timestamp" + HydrationAnnotationKey = apis.Group + "/hydrated-by" ) // Karpenter specific finalizers const ( TerminationFinalizer = apis.Group + "/termination" + DrainFinalizer = apis.Group + "/drain-protection" + VolumeFinalizer = apis.Group + "/volume-protection" ) var ( @@ -102,6 +105,13 @@ var ( v1.LabelInstanceType: v1.LabelInstanceTypeStable, v1.LabelFailureDomainBetaRegion: v1.LabelTopologyRegion, } + + // HydratedFinailzers contains the finalizers which must be applied to a resource by the hydration contrller. If + // the resource is not hydrated, Karpenter can not depend on the lack of one of these finalizers as a signal. + HydratedFinailzers = sets.New( + DrainFinalizer, + VolumeFinalizer, + ) ) // IsRestrictedLabel returns an error if the label is restricted. 
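The drain-protection and volume-protection finalizers above are meant to work in tandem with the Drained and VolumesDetached NodeClaim status conditions used by the termination flow: a phase-specific reconciler records its progress on the NodeClaim, and the matching finalizer keeps the Node object around until that phase reports success. A minimal sketch of that hand-off, assuming only the names added in this patch; the helper below is illustrative and is not part of the change itself:

package termination

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
)

// removeDrainFinalizerIfDrained is a hypothetical helper, shown only to
// illustrate the intended contract: once the NodeClaim's Drained condition is
// true, the drain-protection finalizer no longer needs to block Node deletion.
func removeDrainFinalizerIfDrained(ctx context.Context, kubeClient client.Client, n *corev1.Node, nc *v1.NodeClaim) error {
	if !nc.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue() {
		// Drain has not completed; keep the finalizer so the Node cannot be deleted yet.
		return nil
	}
	stored := n.DeepCopy()
	if controllerutil.RemoveFinalizer(n, v1.DrainFinalizer) {
		// Only issue a patch when the finalizer set actually changed.
		return kubeClient.Patch(ctx, n, client.MergeFrom(stored))
	}
	return nil
}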
diff --git a/pkg/controllers/node/hydration/controller.go b/pkg/controllers/node/hydration/controller.go index 53b091e802..5a72e3ca37 100644 --- a/pkg/controllers/node/hydration/controller.go +++ b/pkg/controllers/node/hydration/controller.go @@ -26,6 +26,7 @@ import ( "k8s.io/klog/v2" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -35,6 +36,7 @@ import ( v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider" + "sigs.k8s.io/karpenter/pkg/operator" "sigs.k8s.io/karpenter/pkg/operator/injection" nodeutils "sigs.k8s.io/karpenter/pkg/utils/node" nodeclaimutils "sigs.k8s.io/karpenter/pkg/utils/nodeclaim" @@ -54,10 +56,33 @@ func NewController(kubeClient client.Client, cloudProvider cloudprovider.CloudPr } } +func (c *Controller) Name() string { + return "node.hydration" +} + +func (c *Controller) Register(_ context.Context, m manager.Manager) error { + return controllerruntime.NewControllerManagedBy(m). + Named(c.Name()). + For(&corev1.Node{}). + Watches(&v1.NodeClaim{}, nodeutils.NodeClaimEventHandler(c.kubeClient)). + WithOptions(controller.Options{ + RateLimiter: reasonable.RateLimiter(), + MaxConcurrentReconciles: 1000, + }). + Complete(reconcile.AsReconciler(m.GetClient(), c)) +} + func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node) (reconcile.Result, error) { ctx = injection.WithControllerName(ctx, c.Name()) ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef(n.Namespace, n.Name))) + if nodeutils.IsHydrated(n) { + return reconcile.Result{}, nil + } + n.Annotations = lo.Assign(n.Annotations, map[string]string{ + v1.HydrationAnnotationKey: operator.Version, + }) + nc, err := nodeutils.NodeClaimForNode(ctx, c.kubeClient, n) if err != nil { if nodeutils.IsDuplicateNodeClaimError(err) || nodeutils.IsNodeClaimNotFoundError(err) { @@ -70,29 +95,27 @@ func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node) (reconcile.R } stored := n.DeepCopy() - n.Labels = lo.Assign(n.Labels, map[string]string{ - v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()): nc.Spec.NodeClassRef.Name, - }) + c.hydrateNodeClassLabel(n, nc) + c.hydrateFinalizers(n, nc) if !equality.Semantic.DeepEqual(stored, n) { - if err := c.kubeClient.Patch(ctx, n, client.MergeFrom(stored)); err != nil { + if err := c.kubeClient.Patch(ctx, n, client.StrategicMergeFrom(stored)); err != nil { return reconcile.Result{}, client.IgnoreNotFound(err) } } return reconcile.Result{}, nil } -func (c *Controller) Name() string { - return "node.hydration" +func (c *Controller) hydrateNodeClassLabel(n *corev1.Node, nc *v1.NodeClaim) { + n.Labels = lo.Assign(n.Labels, map[string]string{ + v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()): nc.Spec.NodeClassRef.Name, + }) } -func (c *Controller) Register(_ context.Context, m manager.Manager) error { - return controllerruntime.NewControllerManagedBy(m). - Named(c.Name()). - For(&corev1.Node{}). - Watches(&v1.NodeClaim{}, nodeutils.NodeClaimEventHandler(c.kubeClient)). - WithOptions(controller.Options{ - RateLimiter: reasonable.RateLimiter(), - MaxConcurrentReconciles: 1000, - }). 
- Complete(reconcile.AsReconciler(m.GetClient(), c)) +func (c *Controller) hydrateFinalizers(n *corev1.Node, nc *v1.NodeClaim) { + if !nc.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue() { + controllerutil.AddFinalizer(n, v1.DrainFinalizer) + } + if !nc.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue() { + controllerutil.AddFinalizer(n, v1.VolumeFinalizer) + } } diff --git a/pkg/controllers/node/hydration/suite_test.go b/pkg/controllers/node/hydration/suite_test.go index e01aad3580..16b0661d16 100644 --- a/pkg/controllers/node/hydration/suite_test.go +++ b/pkg/controllers/node/hydration/suite_test.go @@ -24,10 +24,13 @@ import ( . "github.com/onsi/gomega" "github.com/samber/lo" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/karpenter/pkg/apis" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" "sigs.k8s.io/karpenter/pkg/controllers/node/hydration" + "sigs.k8s.io/karpenter/pkg/operator" "sigs.k8s.io/karpenter/pkg/operator/options" "sigs.k8s.io/karpenter/pkg/test" . "sigs.k8s.io/karpenter/pkg/test/expectations" @@ -64,36 +67,66 @@ var _ = AfterEach(func() { }) var _ = Describe("Hydration", func() { + It("should hydrate the NodeClass label", func() { + nc, n := test.NodeClaimAndNode() + delete(n.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nc, n) + ExpectObjectReconciled(ctx, env.Client, hydrationController, n) + n = ExpectExists(ctx, env.Client, n) + value := n.Labels[v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())] + Expect(value).To(Equal(nc.Spec.NodeClassRef.Name)) + }) DescribeTable( - "Hydration", - func(isNodeClaimManaged bool) { - nodeClassRef := lo.Ternary(isNodeClaimManaged, &v1.NodeClassReference{ - Group: "karpenter.test.sh", - Kind: "TestNodeClass", - Name: "default", - }, &v1.NodeClassReference{ - Group: "karpenter.test.sh", - Kind: "UnmanagedNodeClass", - Name: "default", - }) - nodeClaim, node := test.NodeClaimAndNode(v1.NodeClaim{ - Spec: v1.NodeClaimSpec{ - NodeClassRef: nodeClassRef, - }, - }) - delete(node.Labels, v1.NodeClassLabelKey(nodeClassRef.GroupKind())) - ExpectApplied(ctx, env.Client, nodeClaim, node) - ExpectObjectReconciled(ctx, env.Client, hydrationController, node) - - // The missing NodeClass label should have been propagated to the Node - node = ExpectExists(ctx, env.Client, node) - value, ok := node.Labels[v1.NodeClassLabelKey(nodeClassRef.GroupKind())] - Expect(ok).To(Equal(isNodeClaimManaged)) - if isNodeClaimManaged { - Expect(value).To(Equal(nodeClassRef.Name)) + "Finalizers", + func(nodeClaimConditions []string, expectedFinailzers []string) { + nc, n := test.NodeClaimAndNode() + for _, cond := range nodeClaimConditions { + nc.StatusConditions().SetTrue(cond) + } + ExpectApplied(ctx, env.Client, nc, n) + ExpectObjectReconciled(ctx, env.Client, hydrationController, n) + n = ExpectExists(ctx, env.Client, n) + Expect(len(n.Finalizers)).To(Equal(len(expectedFinailzers))) + for _, finalizer := range expectedFinailzers { + Expect(controllerutil.ContainsFinalizer(n, finalizer)) } }, - Entry("should hydrate missing metadata onto the Node", true), - Entry("should ignore Nodes which aren't managed by this Karpenter instance", false), + Entry("should hydrate all finalizers when none of the requisite status conditions are true", nil, []string{v1.DrainFinalizer, v1.VolumeFinalizer}), + Entry("should hydrate the volume finalizer when only the drain status condition is true", 
[]string{v1.ConditionTypeDrained}, []string{v1.VolumeFinalizer}),
+		Entry("should hydrate the drain finalizer when only the volume status condition is true", []string{v1.ConditionTypeVolumesDetached}, []string{v1.DrainFinalizer}),
+		Entry("shouldn't hydrate finalizers when all requisite conditions are true", []string{v1.ConditionTypeDrained, v1.ConditionTypeVolumesDetached}, nil),
 	)
+
+	It("shouldn't hydrate nodes which have already been hydrated", func() {
+		nc, n := test.NodeClaimAndNode(v1.NodeClaim{
+			ObjectMeta: metav1.ObjectMeta{
+				Annotations: map[string]string{
+					v1.HydrationAnnotationKey: operator.Version,
+				},
+			},
+		})
+		delete(n.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()))
+		ExpectApplied(ctx, env.Client, nc, n)
+		ExpectObjectReconciled(ctx, env.Client, hydrationController, n)
+		n = ExpectExists(ctx, env.Client, n)
+		Expect(lo.Keys(n.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())))
+		Expect(len(n.Finalizers)).To(Equal(0))
+	})
+	It("shouldn't hydrate nodes which are not managed by this instance of Karpenter", func() {
+		nc, n := test.NodeClaimAndNode(v1.NodeClaim{
+			Spec: v1.NodeClaimSpec{
+				NodeClassRef: &v1.NodeClassReference{
+					Group: "karpenter.test.sh",
+					Kind:  "UnmanagedNodeClass",
+					Name:  "default",
+				},
+			},
+		})
+		delete(n.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()))
+		ExpectApplied(ctx, env.Client, nc, n)
+		ExpectObjectReconciled(ctx, env.Client, hydrationController, n)
+		n = ExpectExists(ctx, env.Client, n)
+		Expect(lo.Keys(n.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())))
+		Expect(len(n.Finalizers)).To(Equal(0))
+	})
 })
diff --git a/pkg/controllers/nodeclaim/hydration/controller.go b/pkg/controllers/nodeclaim/hydration/controller.go
index 5c9d08837c..40b8931a1a 100644
--- a/pkg/controllers/nodeclaim/hydration/controller.go
+++ b/pkg/controllers/nodeclaim/hydration/controller.go
@@ -34,6 +34,7 @@ import (
 	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
 	"sigs.k8s.io/karpenter/pkg/cloudprovider"
+	"sigs.k8s.io/karpenter/pkg/operator"
 	"sigs.k8s.io/karpenter/pkg/operator/injection"
 	nodeclaimutils "sigs.k8s.io/karpenter/pkg/utils/nodeclaim"
 )
@@ -52,36 +53,48 @@ func NewController(kubeClient client.Client, cloudProvider cloudprovider.CloudPr
 	}
 }
+func (c *Controller) Name() string {
+	return "nodeclaim.hydration"
+}
+
+func (c *Controller) Register(_ context.Context, m manager.Manager) error {
+	return controllerruntime.NewControllerManagedBy(m).
+		Named(c.Name()).
+		For(&v1.NodeClaim{}, builder.WithPredicates(nodeclaimutils.IsManagedPredicateFuncs(c.cloudProvider))).
+		WithOptions(controller.Options{
+			RateLimiter:             reasonable.RateLimiter(),
+			MaxConcurrentReconciles: 1000,
+		}).
+ Complete(reconcile.AsReconciler(m.GetClient(), c)) +} + func (c *Controller) Reconcile(ctx context.Context, nc *v1.NodeClaim) (reconcile.Result, error) { ctx = injection.WithControllerName(ctx, c.Name()) ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KRef(nc.Namespace, nc.Name))) + if !nodeclaimutils.IsManaged(nc, c.cloudProvider) { return reconcile.Result{}, nil } - stored := nc.DeepCopy() - nc.Labels = lo.Assign(nc.Labels, map[string]string{ - v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()): nc.Spec.NodeClassRef.Name, + if version := nc.Annotations[v1.HydrationAnnotationKey]; version == operator.Version { + return reconcile.Result{}, nil + } + nc.Annotations = lo.Assign(nc.Annotations, map[string]string{ + v1.HydrationAnnotationKey: operator.Version, }) + + stored := nc.DeepCopy() + c.hydrateNodeClassLabel(nc) if !equality.Semantic.DeepEqual(stored, nc) { - if err := c.kubeClient.Patch(ctx, nc, client.MergeFrom(stored)); err != nil { + if err := c.kubeClient.Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { return reconcile.Result{}, client.IgnoreNotFound(err) } } return reconcile.Result{}, nil } -func (c *Controller) Name() string { - return "nodeclaim.hydration" -} - -func (c *Controller) Register(_ context.Context, m manager.Manager) error { - return controllerruntime.NewControllerManagedBy(m). - Named(c.Name()). - For(&v1.NodeClaim{}, builder.WithPredicates(nodeclaimutils.IsManagedPredicateFuncs(c.cloudProvider))). - WithOptions(controller.Options{ - RateLimiter: reasonable.RateLimiter(), - MaxConcurrentReconciles: 1000, - }). - Complete(reconcile.AsReconciler(m.GetClient(), c)) +func (c *Controller) hydrateNodeClassLabel(nc *v1.NodeClaim) { + nc.Labels = lo.Assign(nc.Labels, map[string]string{ + v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()): nc.Spec.NodeClassRef.Name, + }) } diff --git a/pkg/controllers/nodeclaim/hydration/suite_test.go b/pkg/controllers/nodeclaim/hydration/suite_test.go index 559eee4061..86af3235bd 100644 --- a/pkg/controllers/nodeclaim/hydration/suite_test.go +++ b/pkg/controllers/nodeclaim/hydration/suite_test.go @@ -24,10 +24,12 @@ import ( . "github.com/onsi/gomega" "github.com/samber/lo" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/karpenter/pkg/apis" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" "sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/hydration" + "sigs.k8s.io/karpenter/pkg/operator" "sigs.k8s.io/karpenter/pkg/operator/options" "sigs.k8s.io/karpenter/pkg/test" . 
"sigs.k8s.io/karpenter/pkg/test/expectations" @@ -64,35 +66,42 @@ var _ = AfterEach(func() { }) var _ = Describe("Hydration", func() { - DescribeTable( - "Hydration", - func(isNodeClaimManaged bool) { - nodeClassRef := lo.Ternary(isNodeClaimManaged, &v1.NodeClassReference{ - Group: "karpenter.test.sh", - Kind: "TestNodeClass", - Name: "default", - }, &v1.NodeClassReference{ - Group: "karpenter.test.sh", - Kind: "UnmanagedNodeClass", - Name: "default", - }) - nodeClaim, _ := test.NodeClaimAndNode(v1.NodeClaim{ - Spec: v1.NodeClaimSpec{ - NodeClassRef: nodeClassRef, + It("should hydrate the NodeClass label", func() { + nc, _ := test.NodeClaimAndNode() + delete(nc.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nc) + ExpectObjectReconciled(ctx, env.Client, hydrationController, nc) + nc = ExpectExists(ctx, env.Client, nc) + Expect(nc.Labels[v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())]).To(Equal(nc.Spec.NodeClassRef.Name)) + }) + It("shouldn't hydrate nodes which have already been hydrated", func() { + nc, _ := test.NodeClaimAndNode(v1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + v1.HydrationAnnotationKey: operator.Version, }, - }) - delete(nodeClaim.Labels, v1.NodeClassLabelKey(nodeClassRef.GroupKind())) - ExpectApplied(ctx, env.Client, nodeClaim) - ExpectObjectReconciled(ctx, env.Client, hydrationController, nodeClaim) - - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - value, ok := nodeClaim.Labels[v1.NodeClassLabelKey(nodeClassRef.GroupKind())] - Expect(ok).To(Equal(isNodeClaimManaged)) - if isNodeClaimManaged { - Expect(value).To(Equal(nodeClassRef.Name)) - } - }, - Entry("should hydrate missing metadata onto the NodeClaim", true), - Entry("should ignore NodeClaims which aren't managed by this Karpenter instance", false), - ) + }, + }) + delete(nc.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nc) + ExpectObjectReconciled(ctx, env.Client, hydrationController, nc) + nc = ExpectExists(ctx, env.Client, nc) + Expect(lo.Keys(nc.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()))) + }) + It("shouldn't hydrate NodeClaims which aren't managed by this instance of Karpenter", func() { + nc, _ := test.NodeClaimAndNode(v1.NodeClaim{ + Spec: v1.NodeClaimSpec{ + NodeClassRef: &v1.NodeClassReference{ + Group: "karpenter.test.sh", + Kind: "UnmanagedNodeClass", + Name: "default", + }, + }, + }) + delete(nc.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nc) + ExpectObjectReconciled(ctx, env.Client, hydrationController, nc) + nc = ExpectExists(ctx, env.Client, nc) + Expect(lo.Keys(nc.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()))) + }) }) diff --git a/pkg/utils/node/node.go b/pkg/utils/node/node.go index c308589dce..ddcc6156b5 100644 --- a/pkg/utils/node/node.go +++ b/pkg/utils/node/node.go @@ -18,7 +18,6 @@ package node import ( "context" - "errors" "fmt" "github.com/awslabs/operatorpkg/object" @@ -27,64 +26,18 @@ import ( corev1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider" + 
"sigs.k8s.io/karpenter/pkg/operator" nodeclaimutils "sigs.k8s.io/karpenter/pkg/utils/nodeclaim" "sigs.k8s.io/karpenter/pkg/utils/pod" ) -// NodeClaimNotFoundError is an error returned when no v1.NodeClaims are found matching the passed providerID -type NodeClaimNotFoundError struct { - ProviderID string -} - -func (e *NodeClaimNotFoundError) Error() string { - return fmt.Sprintf("no nodeclaims found for provider id '%s'", e.ProviderID) -} - -func IsNodeClaimNotFoundError(err error) bool { - if err == nil { - return false - } - nnfErr := &NodeClaimNotFoundError{} - return errors.As(err, &nnfErr) -} - -func IgnoreNodeClaimNotFoundError(err error) error { - if !IsNodeClaimNotFoundError(err) { - return err - } - return nil -} - -// DuplicateNodeClaimError is an error returned when multiple v1.NodeClaims are found matching the passed providerID -type DuplicateNodeClaimError struct { - ProviderID string -} - -func (e *DuplicateNodeClaimError) Error() string { - return fmt.Sprintf("multiple found for provider id '%s'", e.ProviderID) -} - -func IsDuplicateNodeClaimError(err error) bool { - if err == nil { - return false - } - dnErr := &DuplicateNodeClaimError{} - return errors.As(err, &dnErr) -} - -func IgnoreDuplicateNodeClaimError(err error) error { - if !IsDuplicateNodeClaimError(err) { - return err - } - return nil -} - // GetPods grabs all pods that are currently bound to the passed nodes func GetPods(ctx context.Context, kubeClient client.Client, nodes ...*corev1.Node) ([]*corev1.Pod, error) { var pods []*corev1.Pod @@ -196,3 +149,18 @@ func NodeClaimEventHandler(c client.Client) handler.EventHandler { }) }) } + +func IsHydrated(n *corev1.Node) bool { + version, ok := n.Annotations[v1.HydrationAnnotationKey] + if !ok { + return false + } + return version == operator.Version +} + +func ContainsFinalizer(n *corev1.Node, finalizer string) (bool, error) { + if v1.HydratedFinailzers.Has(finalizer) && !IsHydrated(n) { + return false, &HydrationError{} + } + return controllerutil.ContainsFinalizer(n, finalizer), nil +} diff --git a/pkg/utils/node/types.go b/pkg/utils/node/types.go new file mode 100644 index 0000000000..895f71eb65 --- /dev/null +++ b/pkg/utils/node/types.go @@ -0,0 +1,91 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package node + +import ( + "errors" + "fmt" +) + +// NodeClaimNotFoundError is an error returned when no v1.NodeClaims are found matching the passed providerID +type NodeClaimNotFoundError struct { + ProviderID string +} + +func (e *NodeClaimNotFoundError) Error() string { + return fmt.Sprintf("no nodeclaims found for provider id '%s'", e.ProviderID) +} + +func IsNodeClaimNotFoundError(err error) bool { + if err == nil { + return false + } + nnfErr := &NodeClaimNotFoundError{} + return errors.As(err, &nnfErr) +} + +func IgnoreNodeClaimNotFoundError(err error) error { + if !IsNodeClaimNotFoundError(err) { + return err + } + return nil +} + +// DuplicateNodeClaimError is an error returned when multiple v1.NodeClaims are found matching the passed providerID +type DuplicateNodeClaimError struct { + ProviderID string +} + +func (e *DuplicateNodeClaimError) Error() string { + return fmt.Sprintf("multiple found for provider id '%s'", e.ProviderID) +} + +func IsDuplicateNodeClaimError(err error) bool { + if err == nil { + return false + } + dnErr := &DuplicateNodeClaimError{} + return errors.As(err, &dnErr) +} + +func IgnoreDuplicateNodeClaimError(err error) error { + if !IsDuplicateNodeClaimError(err) { + return err + } + return nil +} + +type HydrationError struct{} + +func (e *HydrationError) Error() string { + return fmt.Sprintf("resource has not been hydrated") +} + +func IsHydrationError(err error) bool { + if err == nil { + return false + } + hErr := &HydrationError{} + return errors.As(err, &hErr) +} + +func IgnoreHydrationError(err error) error { + if !IsHydrationError(err) { + return err + } + return nil +} diff --git a/pkg/utils/nodeclaim/nodeclaim.go b/pkg/utils/nodeclaim/nodeclaim.go index 43a82c5e2a..689d3f744c 100644 --- a/pkg/utils/nodeclaim/nodeclaim.go +++ b/pkg/utils/nodeclaim/nodeclaim.go @@ -18,7 +18,6 @@ package nodeclaim import ( "context" - "errors" "fmt" "time" @@ -36,6 +35,7 @@ import ( v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider" + "sigs.k8s.io/karpenter/pkg/operator" ) func IsManaged(nodeClaim *v1.NodeClaim, cp cloudprovider.CloudProvider) bool { @@ -147,54 +147,6 @@ func NodeClassEventHandler(c client.Client) handler.EventHandler { }) } -// NodeNotFoundError is an error returned when no corev1.Nodes are found matching the passed providerID -type NodeNotFoundError struct { - ProviderID string -} - -func (e *NodeNotFoundError) Error() string { - return fmt.Sprintf("no nodes found for provider id '%s'", e.ProviderID) -} - -func IsNodeNotFoundError(err error) bool { - if err == nil { - return false - } - nnfErr := &NodeNotFoundError{} - return errors.As(err, &nnfErr) -} - -func IgnoreNodeNotFoundError(err error) error { - if !IsNodeNotFoundError(err) { - return err - } - return nil -} - -// DuplicateNodeError is an error returned when multiple corev1.Nodes are found matching the passed providerID -type DuplicateNodeError struct { - ProviderID string -} - -func (e *DuplicateNodeError) Error() string { - return fmt.Sprintf("multiple found for provider id '%s'", e.ProviderID) -} - -func IsDuplicateNodeError(err error) bool { - if err == nil { - return false - } - dnErr := &DuplicateNodeError{} - return errors.As(err, &dnErr) -} - -func IgnoreDuplicateNodeError(err error) error { - if !IsDuplicateNodeError(err) { - return err - } - return nil -} - // NodeForNodeClaim is a helper function that takes a v1.NodeClaim and attempts to find the matching corev1.Node by its providerID // This function will return errors if: // 1. 
No corev1.Nodes match the v1.NodeClaim providerID @@ -261,3 +213,11 @@ func HasTerminationGracePeriodElapsed(clk clock.Clock, nc *v1.NodeClaim) (bool, } return clk.Now().After(*expirationTime), nil } + +func IsHydrated(nc *v1.NodeClaim) bool { + version, ok := nc.Annotations[v1.HydrationAnnotationKey] + if !ok { + return false + } + return version == operator.Version +} diff --git a/pkg/utils/nodeclaim/types.go b/pkg/utils/nodeclaim/types.go new file mode 100644 index 0000000000..9266aa2364 --- /dev/null +++ b/pkg/utils/nodeclaim/types.go @@ -0,0 +1,71 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodeclaim + +import ( + "errors" + "fmt" +) + +// NodeNotFoundError is an error returned when no corev1.Nodes are found matching the passed providerID +type NodeNotFoundError struct { + ProviderID string +} + +func (e *NodeNotFoundError) Error() string { + return fmt.Sprintf("no nodes found for provider id '%s'", e.ProviderID) +} + +func IsNodeNotFoundError(err error) bool { + if err == nil { + return false + } + nnfErr := &NodeNotFoundError{} + return errors.As(err, &nnfErr) +} + +func IgnoreNodeNotFoundError(err error) error { + if !IsNodeNotFoundError(err) { + return err + } + return nil +} + +// DuplicateNodeError is an error returned when multiple corev1.Nodes are found matching the passed providerID +type DuplicateNodeError struct { + ProviderID string +} + +func (e *DuplicateNodeError) Error() string { + return fmt.Sprintf("multiple found for provider id '%s'", e.ProviderID) +} + +func IsDuplicateNodeError(err error) bool { + if err == nil { + return false + } + dnErr := &DuplicateNodeError{} + return errors.As(err, &dnErr) +} + +func IgnoreDuplicateNodeError(err error) error { + if !IsDuplicateNodeError(err) { + return err + } + return nil +} + From 0fdc2a285b9b230ba87db9596d338c4683b73c0c Mon Sep 17 00:00:00 2001 From: Jason Deal Date: Tue, 10 Dec 2024 00:53:20 -0800 Subject: [PATCH 5/5] complete --- pkg/controllers/controllers.go | 8 +- pkg/controllers/node/hydration/controller.go | 2 +- pkg/controllers/node/hydration/suite_test.go | 86 +- .../node/termination/controller.go | 202 ---- .../{drain.go => drain/controller.go} | 154 ++- .../node/termination/{ => drain}/events.go | 44 +- .../node/termination/drain/metrics.go | 38 + .../node/termination/drain/suite_test.go | 599 +++++++++++ .../node/termination/instancetermination.go | 71 -- .../instancetermination/controller.go | 140 +++ .../{ => instancetermination}/metrics.go | 12 +- .../instancetermination/suite_test.go | 188 ++++ .../node/termination/reconcile/reconcile.go | 161 +++ .../node/termination/reconcile/suite_test.go | 292 ++++++ .../node/termination/suite_test.go | 930 ------------------ .../controller.go} | 102 +- .../termination/volumedetachment/events.go | 36 + .../volumedetachment/suite_test.go | 185 ++++ .../nodeclaim/hydration/suite_test.go | 63 +- .../nodeclaim/lifecycle/termination_test.go | 2 +- pkg/events/suite_test.go | 6 +- pkg/test/nodeclaim.go | 4 + pkg/test/nodes.go | 5 + 
pkg/utils/node/node.go | 5 +- pkg/utils/node/types.go | 4 +- pkg/utils/nodeclaim/types.go | 1 - 26 files changed, 1942 insertions(+), 1398 deletions(-) delete mode 100644 pkg/controllers/node/termination/controller.go rename pkg/controllers/node/termination/{drain.go => drain/controller.go} (53%) rename pkg/controllers/node/termination/{ => drain}/events.go (68%) create mode 100644 pkg/controllers/node/termination/drain/metrics.go create mode 100644 pkg/controllers/node/termination/drain/suite_test.go delete mode 100644 pkg/controllers/node/termination/instancetermination.go create mode 100644 pkg/controllers/node/termination/instancetermination/controller.go rename pkg/controllers/node/termination/{ => instancetermination}/metrics.go (87%) create mode 100644 pkg/controllers/node/termination/instancetermination/suite_test.go create mode 100644 pkg/controllers/node/termination/reconcile/reconcile.go create mode 100644 pkg/controllers/node/termination/reconcile/suite_test.go delete mode 100644 pkg/controllers/node/termination/suite_test.go rename pkg/controllers/node/termination/{volumedetachment.go => volumedetachment/controller.go} (50%) create mode 100644 pkg/controllers/node/termination/volumedetachment/events.go create mode 100644 pkg/controllers/node/termination/volumedetachment/suite_test.go diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index f3e8b69b8a..828255b361 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -38,8 +38,10 @@ import ( metricspod "sigs.k8s.io/karpenter/pkg/controllers/metrics/pod" "sigs.k8s.io/karpenter/pkg/controllers/node/health" nodehydration "sigs.k8s.io/karpenter/pkg/controllers/node/hydration" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/drain" "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/instancetermination" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/volumedetachment" nodeclaimconsistency "sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/consistency" nodeclaimdisruption "sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/disruption" "sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/expiration" @@ -83,7 +85,9 @@ func NewControllers( informer.NewPodController(kubeClient, cluster), informer.NewNodePoolController(kubeClient, cloudProvider, cluster), informer.NewNodeClaimController(kubeClient, cloudProvider, cluster), - termination.NewController(clock, kubeClient, cloudProvider, recorder, evictionQueue), + drain.NewController(clock, kubeClient, cloudProvider, recorder, evictionQueue), + volumedetachment.NewController(clock, kubeClient, cloudProvider, recorder), + instancetermination.NewController(clock, kubeClient, cloudProvider), metricspod.NewController(kubeClient, cluster), metricsnodepool.NewController(kubeClient, cloudProvider), metricsnode.NewController(cluster), diff --git a/pkg/controllers/node/hydration/controller.go b/pkg/controllers/node/hydration/controller.go index 5a72e3ca37..1140571a75 100644 --- a/pkg/controllers/node/hydration/controller.go +++ b/pkg/controllers/node/hydration/controller.go @@ -64,7 +64,7 @@ func (c *Controller) Register(_ context.Context, m manager.Manager) error { return controllerruntime.NewControllerManagedBy(m). Named(c.Name()). For(&corev1.Node{}). - Watches(&v1.NodeClaim{}, nodeutils.NodeClaimEventHandler(c.kubeClient)). 
+ Watches(&v1.NodeClaim{}, nodeutils.NodeClaimEventHandler(c.kubeClient, c.cloudProvider)). WithOptions(controller.Options{ RateLimiter: reasonable.RateLimiter(), MaxConcurrentReconciles: 1000, diff --git a/pkg/controllers/node/hydration/suite_test.go b/pkg/controllers/node/hydration/suite_test.go index 16b0661d16..9025c1f061 100644 --- a/pkg/controllers/node/hydration/suite_test.go +++ b/pkg/controllers/node/hydration/suite_test.go @@ -24,8 +24,10 @@ import ( . "github.com/onsi/gomega" "github.com/samber/lo" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/karpenter/pkg/apis" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" @@ -67,28 +69,37 @@ var _ = AfterEach(func() { }) var _ = Describe("Hydration", func() { + var nodeClaim *v1.NodeClaim + var node *corev1.Node + + BeforeEach(func() { + nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{v1.HydrationAnnotationKey: "not-hydrated"}, + }, + }) + }) + It("should hydrate the NodeClass label", func() { - nc, n := test.NodeClaimAndNode() - delete(n.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) - ExpectApplied(ctx, env.Client, nc, n) - ExpectObjectReconciled(ctx, env.Client, hydrationController, n) - n = ExpectExists(ctx, env.Client, n) - value := n.Labels[v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())] - Expect(value).To(Equal(nc.Spec.NodeClassRef.Name)) + delete(node.Labels, v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nodeClaim, node) + ExpectObjectReconciled(ctx, env.Client, hydrationController, node) + node = ExpectExists(ctx, env.Client, node) + value := node.Labels[v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())] + Expect(value).To(Equal(nodeClaim.Spec.NodeClassRef.Name)) }) DescribeTable( "Finalizers", func(nodeClaimConditions []string, expectedFinailzers []string) { - nc, n := test.NodeClaimAndNode() for _, cond := range nodeClaimConditions { - nc.StatusConditions().SetTrue(cond) + nodeClaim.StatusConditions().SetTrue(cond) } - ExpectApplied(ctx, env.Client, nc, n) - ExpectObjectReconciled(ctx, env.Client, hydrationController, n) - n = ExpectExists(ctx, env.Client, n) - Expect(len(n.Finalizers)).To(Equal(len(expectedFinailzers))) + ExpectApplied(ctx, env.Client, nodeClaim, node) + ExpectObjectReconciled(ctx, env.Client, hydrationController, node) + node = ExpectExists(ctx, env.Client, node) + Expect(len(node.Finalizers)).To(Equal(len(expectedFinailzers))) for _, finalizer := range expectedFinailzers { - Expect(controllerutil.ContainsFinalizer(n, finalizer)) + Expect(controllerutil.ContainsFinalizer(node, finalizer)) } }, Entry("should hydrate all finalizers when none of the requisite status conditions are true", nil, []string{v1.DrainFinalizer, v1.VolumeFinalizer}), @@ -96,37 +107,26 @@ var _ = Describe("Hydration", func() { Entry("should hydrate the drain finalizer when only the volume status condition is true", []string{v1.ConditionTypeVolumesDetached}, []string{v1.VolumeFinalizer}), Entry("shouldn't hydrate finalizers when all requisite conditions are true", []string{v1.ConditionTypeDrained, v1.ConditionTypeVolumesDetached}, nil), ) - It("shouldn't hydrate nodes which have already been hydrated", func() { - nc, n := test.NodeClaimAndNode(v1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - 
v1.HydrationAnnotationKey: operator.Version, - }, - }, - }) - delete(n.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) - ExpectApplied(ctx, env.Client, nc, n) - ExpectObjectReconciled(ctx, env.Client, hydrationController, n) - n = ExpectExists(ctx, env.Client, n) - Expect(lo.Keys(n.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()))) - Expect(len(n.Finalizers)).To(Equal(0)) + node.Annotations[v1.HydrationAnnotationKey] = operator.Version + delete(node.Labels, v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nodeClaim, node) + ExpectObjectReconciled(ctx, env.Client, hydrationController, node) + node = ExpectExists(ctx, env.Client, node) + Expect(lo.Keys(node.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind()))) + Expect(len(node.Finalizers)).To(Equal(0)) }) It("shouldn't hydrate nodes which are not managed by this instance of Karpenter", func() { - nc, n := test.NodeClaimAndNode(v1.NodeClaim{ - Spec: v1.NodeClaimSpec{ - NodeClassRef: &v1.NodeClassReference{ - Group: "karpenter.test.sh", - Kind: "UnmanagedNodeClass", - Name: "default", - }, - }, - }) - delete(n.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) - ExpectApplied(ctx, env.Client, nc, n) - ExpectObjectReconciled(ctx, env.Client, hydrationController, n) - n = ExpectExists(ctx, env.Client, n) - Expect(lo.Keys(n.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()))) - Expect(len(n.Finalizers)).To(Equal(0)) + nodeClaim.Spec.NodeClassRef = &v1.NodeClassReference{ + Group: "karpenter.test.sh", + Kind: "UnmanagedNodeClass", + Name: "default", + } + delete(node.Labels, v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nodeClaim, node) + ExpectObjectReconciled(ctx, env.Client, hydrationController, node) + node = ExpectExists(ctx, env.Client, node) + Expect(lo.Keys(node.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind()))) + Expect(len(node.Finalizers)).To(Equal(0)) }) }) diff --git a/pkg/controllers/node/termination/controller.go b/pkg/controllers/node/termination/controller.go deleted file mode 100644 index 3880eddd7c..0000000000 --- a/pkg/controllers/node/termination/controller.go +++ /dev/null @@ -1,202 +0,0 @@ -/* -Copyright The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package termination - -import ( - "context" - "fmt" - "time" - - "github.com/samber/lo" - "golang.org/x/time/rate" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/equality" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/client-go/util/workqueue" - "k8s.io/klog/v2" - "k8s.io/utils/clock" - controllerruntime "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/reconcile" - - "sigs.k8s.io/karpenter/pkg/utils/pretty" - - v1 "sigs.k8s.io/karpenter/pkg/apis/v1" - "sigs.k8s.io/karpenter/pkg/cloudprovider" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" - "sigs.k8s.io/karpenter/pkg/events" - "sigs.k8s.io/karpenter/pkg/metrics" - "sigs.k8s.io/karpenter/pkg/operator/injection" - nodeutils "sigs.k8s.io/karpenter/pkg/utils/node" -) - -// Controller for the resource -type Controller struct { - clock clock.Clock - kubeClient client.Client - cloudProvider cloudprovider.CloudProvider - reconcilers []reconciler -} - -type reconciler interface { - Reconcile(context.Context, *corev1.Node, *v1.NodeClaim) (reconcile.Result, error) -} - -// NewController constructs a controller instance -func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider cloudprovider.CloudProvider, recorder events.Recorder, evictionQueue *eviction.Queue) *Controller { - return &Controller{ - clock: clk, - kubeClient: kubeClient, - cloudProvider: cloudProvider, - reconcilers: []reconciler{ - &DrainReconciler{clk, kubeClient, cloudProvider, recorder, evictionQueue}, - &VolumeDetachmentReconciler{clk, kubeClient, recorder}, - &InstanceTerminationReconciler{clk, kubeClient, cloudProvider}, - }, - } -} - -func (c *Controller) Register(_ context.Context, m manager.Manager) error { - return controllerruntime.NewControllerManagedBy(m). - Named("node.termination"). - For(&corev1.Node{}, builder.WithPredicates(nodeutils.IsManagedPredicateFuncs(c.cloudProvider))). - Watches(&v1.NodeClaim{}, nodeutils.NodeClaimEventHandler(c.kubeClient, c.cloudProvider)). - WithOptions( - controller.Options{ - RateLimiter: workqueue.NewTypedMaxOfRateLimiter[reconcile.Request]( - workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](100*time.Millisecond, 10*time.Second), - // 10 qps, 100 bucket size - &workqueue.TypedBucketRateLimiter[reconcile.Request]{Limiter: rate.NewLimiter(rate.Limit(10), 100)}, - ), - MaxConcurrentReconciles: 100, - }, - ). 
- Complete(reconcile.AsReconciler(m.GetClient(), c)) -} - -// nolint:gocyclo -func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node) (reconcile.Result, error) { - ctx = injection.WithControllerName(ctx, "node.termination") - ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef(n.Namespace, n.Name))) - - if n.GetDeletionTimestamp().IsZero() { - return reconcile.Result{}, nil - } - if !controllerutil.ContainsFinalizer(n, v1.TerminationFinalizer) { - return reconcile.Result{}, nil - } - if !nodeutils.IsManaged(n, c.cloudProvider) { - return reconcile.Result{}, nil - } - - nc, err := nodeutils.NodeClaimForNode(ctx, c.kubeClient, n) - if err != nil { - if nodeutils.IsDuplicateNodeClaimError(err) || nodeutils.IsNodeClaimNotFoundError(err) { - log.FromContext(ctx).Error(err, "failed to terminate node") - return reconcile.Result{}, nil - } - return reconcile.Result{}, err - } - ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KRef(nc.Namespace, nc.Name))) - if nc.DeletionTimestamp.IsZero() { - if err := c.kubeClient.Delete(ctx, nc); err != nil { - return reconcile.Result{}, client.IgnoreNotFound(err) - } - } - - if err := c.prepareNode(ctx, n); err != nil { - if errors.IsNotFound(err) { - return reconcile.Result{}, nil - } - if errors.IsConflict(err) { - return reconcile.Result{Requeue: true}, nil - } - return reconcile.Result{}, fmt.Errorf("tainting node with %s, %w", pretty.Taint(v1.DisruptedNoScheduleTaint), err) - } - - for _, r := range c.reconcilers { - res, err := r.Reconcile(ctx, n, nc) - if res.Requeue || res.RequeueAfter != 0 || err != nil { - return res, err - } - } - return reconcile.Result{}, nil -} - -// prepareNode ensures that the node is ready to begin the drain / termination process. This includes ensuring that it -// is tainted appropriately and annotated to be ignored by external load balancers. -func (c *Controller) prepareNode(ctx context.Context, n *corev1.Node) error { - stored := n.DeepCopy() - - // Add the karpenter.sh/disrupted:NoSchedule taint to ensure no additional pods schedule to the Node during the - // drain process. - if !lo.ContainsBy(n.Spec.Taints, func(t corev1.Taint) bool { - return t.MatchTaint(&v1.DisruptedNoScheduleTaint) - }) { - n.Spec.Taints = append(n.Spec.Taints, v1.DisruptedNoScheduleTaint) - } - // Adding this label to the node ensures that the node is removed from the load-balancer target group while it is - // draining and before it is terminated. This prevents 500s coming prior to health check when the load balancer - // controller hasn't yet determined that the node and underlying connections are gone. - // https://github.com/aws/aws-node-termination-handler/issues/316 - // https://github.com/aws/karpenter/pull/2518 - n.Labels = lo.Assign(n.Labels, map[string]string{ - corev1.LabelNodeExcludeBalancers: "karpenter", - }) - - if equality.Semantic.DeepEqual(n, stored) { - return nil - } - // We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch can cause races due - // to the fact that it fully replaces the list on a change. Here, we are updating the taint list. - return c.kubeClient.Patch(ctx, n, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})) -} - -// removeFinalizer removes Karpenter's termination finalizer from the given Node, updating the termination metrics in the process. 
-func removeFinalizer(ctx context.Context, kubeClient client.Client, n *corev1.Node) error { - stored := n.DeepCopy() - controllerutil.RemoveFinalizer(n, v1.TerminationFinalizer) - if !equality.Semantic.DeepEqual(stored, n) { - // We use client.StrategicMergeFrom here since the node object supports it and - // a strategic merge patch represents the finalizer list as a keyed "set" so removing - // an item from the list doesn't replace the full list - // https://github.com/kubernetes/kubernetes/issues/111643#issuecomment-2016489732 - if err := kubeClient.Patch(ctx, n, client.StrategicMergeFrom(stored)); err != nil { - if errors.IsNotFound(err) { - return nil - } - return fmt.Errorf("removing finalizer, %w", err) - } - metrics.NodesTerminatedTotal.Inc(map[string]string{ - metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], - }) - // We use stored.DeletionTimestamp since the api-server may give back a node after the patch without a deletionTimestamp - DurationSeconds.Observe(time.Since(stored.DeletionTimestamp.Time).Seconds(), map[string]string{ - metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], - }) - NodeLifetimeDurationSeconds.Observe(time.Since(n.CreationTimestamp.Time).Seconds(), map[string]string{ - metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], - }) - log.FromContext(ctx).Info("deleted node") - } - return nil -} diff --git a/pkg/controllers/node/termination/drain.go b/pkg/controllers/node/termination/drain/controller.go similarity index 53% rename from pkg/controllers/node/termination/drain.go rename to pkg/controllers/node/termination/drain/controller.go index d95c7a3183..226f430250 100644 --- a/pkg/controllers/node/termination/drain.go +++ b/pkg/controllers/node/termination/drain/controller.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package termination +package drain import ( "context" @@ -23,21 +23,30 @@ import ( "time" "github.com/samber/lo" + "golang.org/x/time/rate" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/client-go/util/workqueue" "k8s.io/utils/clock" + controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider" "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" + terminationreconcile "sigs.k8s.io/karpenter/pkg/controllers/node/termination/reconcile" "sigs.k8s.io/karpenter/pkg/events" "sigs.k8s.io/karpenter/pkg/metrics" nodeutils "sigs.k8s.io/karpenter/pkg/utils/node" "sigs.k8s.io/karpenter/pkg/utils/nodeclaim" podutils "sigs.k8s.io/karpenter/pkg/utils/pod" + "sigs.k8s.io/karpenter/pkg/utils/pretty" ) type nodeDrainError struct { @@ -56,7 +65,7 @@ func isNodeDrainError(err error) bool { return errors.As(err, &nodeDrainErr) } -type DrainReconciler struct { +type Controller struct { clock clock.Clock kubeClient client.Client cloudProvider cloudprovider.CloudProvider @@ -64,10 +73,64 @@ type DrainReconciler struct { evictionQueue *eviction.Queue } +func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider cloudprovider.CloudProvider, recorder events.Recorder, queue *eviction.Queue) *Controller { + return &Controller{ + clock: clk, + kubeClient: kubeClient, + cloudProvider: cloudProvider, + recorder: recorder, + evictionQueue: queue, + } +} + +func (*Controller) Name() string { + return "node.drain" +} + +func (c *Controller) Register(_ context.Context, m manager.Manager) error { + return controllerruntime.NewControllerManagedBy(m). + Named(c.Name()). + For(&corev1.Node{}, builder.WithPredicates(nodeutils.IsManagedPredicateFuncs(c.cloudProvider))). + Watches(&v1.NodeClaim{}, nodeutils.NodeClaimEventHandler(c.kubeClient, c.cloudProvider)). + WithOptions( + controller.Options{ + RateLimiter: workqueue.NewTypedMaxOfRateLimiter[reconcile.Request]( + workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](100*time.Millisecond, 10*time.Second), + // 10 qps, 100 bucket size + &workqueue.TypedBucketRateLimiter[reconcile.Request]{Limiter: rate.NewLimiter(rate.Limit(10), 100)}, + ), + MaxConcurrentReconciles: 100, + }, + ). 
+ Complete(terminationreconcile.AsReconciler(m.GetClient(), c.cloudProvider, c.clock, c)) +} + +func (*Controller) AwaitFinalizers() []string { + return nil +} + +func (*Controller) Finalizer() string { + return v1.DrainFinalizer +} + +func (*Controller) NodeClaimNotFoundPolicy() terminationreconcile.Policy { + return terminationreconcile.PolicyFinalize +} + +func (*Controller) TerminationGracePeriodPolicy() terminationreconcile.Policy { + return terminationreconcile.PolicyFinalize +} + // nolint:gocyclo -func (d *DrainReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { - if nc.StatusConditions().IsTrue(v1.ConditionTypeDrained) { - return reconcile.Result{}, nil +func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { + if err := c.prepareNode(ctx, n); err != nil { + if apierrors.IsNotFound(err) { + return reconcile.Result{}, nil + } + if apierrors.IsConflict(err) { + return reconcile.Result{Requeue: true}, nil + } + return reconcile.Result{}, fmt.Errorf("tainting node with %s, %w", pretty.Taint(v1.DisruptedNoScheduleTaint), err) } tgpExpirationTime, err := nodeclaim.TerminationGracePeriodExpirationTime(nc) @@ -76,29 +139,16 @@ func (d *DrainReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1. return reconcile.Result{}, nil } if tgpExpirationTime != nil { - d.recorder.Publish(NodeTerminationGracePeriodExpiringEvent(n, *tgpExpirationTime)) + c.recorder.Publish(NodeTerminationGracePeriodExpiringEvent(n, *tgpExpirationTime)) } - if err := d.drain(ctx, n, tgpExpirationTime); err != nil { + if err := c.drain(ctx, n, tgpExpirationTime); err != nil { if !isNodeDrainError(err) { return reconcile.Result{}, fmt.Errorf("draining node, %w", err) } - // If the underlying NodeClaim no longer exists, we want to delete to avoid trying to gracefully draining - // on nodes that are no longer alive. We do a check on the Ready condition of the node since, even - // though the CloudProvider says the instance is not around, we know that the kubelet process is still running - // if the Node Ready condition is true - // Similar logic to: https://github.com/kubernetes/kubernetes/blob/3a75a8c8d9e6a1ebd98d8572132e675d4980f184/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go#L144 - if nodeutils.GetCondition(n, corev1.NodeReady).Status != corev1.ConditionTrue { - if _, err = d.cloudProvider.Get(ctx, n.Spec.ProviderID); err != nil { - if cloudprovider.IsNodeClaimNotFoundError(err) { - return reconcile.Result{}, removeFinalizer(ctx, d.kubeClient, n) - } - return reconcile.Result{}, fmt.Errorf("getting nodeclaim, %w", err) - } - } stored := nc.DeepCopy() if nc.StatusConditions().SetFalse(v1.ConditionTypeDrained, "Draining", "Draining") { - if err := d.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { + if err := c.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { if apierrors.IsNotFound(err) { return reconcile.Result{}, nil } @@ -108,13 +158,13 @@ func (d *DrainReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1. 
return reconcile.Result{}, err } } - d.recorder.Publish(NodeDrainFailedEvent(n, err)) + c.recorder.Publish(NodeDrainFailedEvent(n, err)) return reconcile.Result{RequeueAfter: 5 * time.Second}, nil } - stored := nc.DeepCopy() + storedNodeClaim := nc.DeepCopy() _ = nc.StatusConditions().SetTrue(v1.ConditionTypeDrained) - if err := d.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); client.IgnoreNotFound(err) != nil { + if err := c.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(storedNodeClaim, client.MergeFromWithOptimisticLock{})); client.IgnoreNotFound(err) != nil { if apierrors.IsNotFound(err) { return reconcile.Result{}, nil } @@ -123,38 +173,70 @@ func (d *DrainReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1. } return reconcile.Result{}, err } + if err := terminationreconcile.RemoveFinalizer(ctx, c.kubeClient, n, c.Finalizer()); err != nil { + return reconcile.Result{}, err + } NodesDrainedTotal.Inc(map[string]string{ metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], }) return reconcile.Result{}, nil } +// prepareNode ensures that the node is ready to begin the drain / termination process. This includes ensuring that it +// is tainted appropriately and annotated to be ignored by external load balancers. +func (c *Controller) prepareNode(ctx context.Context, n *corev1.Node) error { + stored := n.DeepCopy() + + // Add the karpenter.sh/disrupted:NoSchedule taint to ensure no additional pods schedule to the Node during the + // drain process. + if !lo.ContainsBy(n.Spec.Taints, func(t corev1.Taint) bool { + return t.MatchTaint(&v1.DisruptedNoScheduleTaint) + }) { + n.Spec.Taints = append(n.Spec.Taints, v1.DisruptedNoScheduleTaint) + } + // Adding this label to the node ensures that the node is removed from the load-balancer target group while it is + // draining and before it is terminated. This prevents 500s coming prior to health check when the load balancer + // controller hasn't yet determined that the node and underlying connections are gone. + // https://github.com/aws/aws-node-termination-handler/issues/316 + // https://github.com/aws/karpenter/pull/2518 + n.Labels = lo.Assign(n.Labels, map[string]string{ + corev1.LabelNodeExcludeBalancers: "karpenter", + }) + + if equality.Semantic.DeepEqual(n, stored) { + return nil + } + // We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch can cause races due + // to the fact that it fully replaces the list on a change. Here, we are updating the taint list. 
+ return c.kubeClient.Patch(ctx, n, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})) +} + // Drain evicts pods from the node and returns true when all pods are evicted // https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown -func (d *DrainReconciler) drain(ctx context.Context, node *corev1.Node, nodeGracePeriodExpirationTime *time.Time) error { - pods, err := nodeutils.GetPods(ctx, d.kubeClient, node) +func (c *Controller) drain(ctx context.Context, node *corev1.Node, nodeGracePeriodExpirationTime *time.Time) error { + pods, err := nodeutils.GetPods(ctx, c.kubeClient, node) if err != nil { return fmt.Errorf("listing pods on node, %w", err) } podsToDelete := lo.Filter(pods, func(p *corev1.Pod, _ int) bool { - return podutils.IsWaitingEviction(p, d.clock) && !podutils.IsTerminating(p) + return podutils.IsWaitingEviction(p, c.clock) && !podutils.IsTerminating(p) }) - if err := d.DeleteExpiringPods(ctx, podsToDelete, nodeGracePeriodExpirationTime); err != nil { + if err := c.DeleteExpiringPods(ctx, podsToDelete, nodeGracePeriodExpirationTime); err != nil { return fmt.Errorf("deleting expiring pods, %w", err) } // Monitor pods in pod groups that either haven't been evicted or are actively evicting - podGroups := d.groupPodsByPriority(lo.Filter(pods, func(p *corev1.Pod, _ int) bool { return podutils.IsWaitingEviction(p, d.clock) })) + podGroups := c.groupPodsByPriority(lo.Filter(pods, func(p *corev1.Pod, _ int) bool { return podutils.IsWaitingEviction(p, c.clock) })) for _, group := range podGroups { if len(group) > 0 { // Only add pods to the eviction queue that haven't been evicted yet - d.evictionQueue.Add(lo.Filter(group, func(p *corev1.Pod, _ int) bool { return podutils.IsEvictable(p) })...) + c.evictionQueue.Add(lo.Filter(group, func(p *corev1.Pod, _ int) bool { return podutils.IsEvictable(p) })...) return newNodeDrainError(fmt.Errorf("%d pods are waiting to be evicted", lo.SumBy(podGroups, func(pods []*corev1.Pod) int { return len(pods) }))) } } return nil } -func (d *DrainReconciler) groupPodsByPriority(pods []*corev1.Pod) [][]*corev1.Pod { +func (c *Controller) groupPodsByPriority(pods []*corev1.Pod) [][]*corev1.Pod { // 1. 
Prioritize noncritical pods, non-daemon pods https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown var nonCriticalNonDaemon, nonCriticalDaemon, criticalNonDaemon, criticalDaemon []*corev1.Pod for _, pod := range pods { @@ -175,19 +257,19 @@ func (d *DrainReconciler) groupPodsByPriority(pods []*corev1.Pod) [][]*corev1.Po return [][]*corev1.Pod{nonCriticalNonDaemon, nonCriticalDaemon, criticalNonDaemon, criticalDaemon} } -func (d *DrainReconciler) DeleteExpiringPods(ctx context.Context, pods []*corev1.Pod, nodeGracePeriodTerminationTime *time.Time) error { +func (c *Controller) DeleteExpiringPods(ctx context.Context, pods []*corev1.Pod, nodeGracePeriodTerminationTime *time.Time) error { for _, pod := range pods { // check if the node has an expiration time and the pod needs to be deleted - deleteTime := d.podDeleteTimeWithGracePeriod(nodeGracePeriodTerminationTime, pod) + deleteTime := c.podDeleteTimeWithGracePeriod(nodeGracePeriodTerminationTime, pod) if deleteTime != nil && time.Now().After(*deleteTime) { // delete pod proactively to give as much of its terminationGracePeriodSeconds as possible for deletion // ensure that we clamp the maximum pod terminationGracePeriodSeconds to the node's remaining expiration time in the delete command gracePeriodSeconds := lo.ToPtr(int64(time.Until(*nodeGracePeriodTerminationTime).Seconds())) - d.recorder.Publish(PodDeletedEvent(pod, gracePeriodSeconds, nodeGracePeriodTerminationTime)) + c.recorder.Publish(PodDeletedEvent(pod, gracePeriodSeconds, nodeGracePeriodTerminationTime)) opts := &client.DeleteOptions{ GracePeriodSeconds: gracePeriodSeconds, } - if err := d.kubeClient.Delete(ctx, pod, opts); err != nil && !apierrors.IsNotFound(err) { // ignore 404, not a problem + if err := c.kubeClient.Delete(ctx, pod, opts); err != nil && !apierrors.IsNotFound(err) { // ignore 404, not a problem return fmt.Errorf("deleting pod, %w", err) // otherwise, bubble up the error } log.FromContext(ctx).WithValues( @@ -203,7 +285,7 @@ func (d *DrainReconciler) DeleteExpiringPods(ctx context.Context, pods []*corev1 } // if a pod should be deleted to give it the full terminationGracePeriodSeconds of time before the node will shut down, return the time the pod should be deleted -func (*DrainReconciler) podDeleteTimeWithGracePeriod(nodeGracePeriodExpirationTime *time.Time, pod *corev1.Pod) *time.Time { +func (*Controller) podDeleteTimeWithGracePeriod(nodeGracePeriodExpirationTime *time.Time, pod *corev1.Pod) *time.Time { if nodeGracePeriodExpirationTime == nil || pod.Spec.TerminationGracePeriodSeconds == nil { // k8s defaults to 30s, so we should never see a nil TerminationGracePeriodSeconds return nil } diff --git a/pkg/controllers/node/termination/events.go b/pkg/controllers/node/termination/drain/events.go similarity index 68% rename from pkg/controllers/node/termination/events.go rename to pkg/controllers/node/termination/drain/events.go index 91ffbc26d5..533f228571 100644 --- a/pkg/controllers/node/termination/events.go +++ b/pkg/controllers/node/termination/drain/events.go @@ -14,19 +14,27 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package termination +package drain import ( "fmt" "time" corev1 "k8s.io/api/core/v1" - storagev1 "k8s.io/api/storage/v1" - v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/events" ) +func NodeTerminationGracePeriodExpiringEvent(node *corev1.Node, t time.Time) events.Event { + return events.Event{ + InvolvedObject: node, + Type: corev1.EventTypeWarning, + Reason: "TerminationGracePeriodExpiring", + Message: fmt.Sprintf("All pods will be deleted by %s", t.Format(time.RFC3339)), + DedupeValues: []string{node.Name}, + } +} + func PodDeletedEvent(pod *corev1.Pod, gracePeriodSeconds *int64, nodeGracePeriodTerminationTime *time.Time) events.Event { return events.Event{ InvolvedObject: pod, @@ -46,33 +54,3 @@ func NodeDrainFailedEvent(node *corev1.Node, err error) events.Event { DedupeValues: []string{node.Name}, } } - -func NodeAwaitingVolumeDetachmentEvent(node *corev1.Node, volumeAttachments ...*storagev1.VolumeAttachment) events.Event { - return events.Event{ - InvolvedObject: node, - Type: corev1.EventTypeNormal, - Reason: "AwaitingVolumeDetachment", - Message: fmt.Sprintf("Awaiting deletion of %d VolumeAttachments bound to node", len(volumeAttachments)), - DedupeValues: []string{node.Name}, - } -} - -func NodeTerminationGracePeriodExpiringEvent(node *corev1.Node, t time.Time) events.Event { - return events.Event{ - InvolvedObject: node, - Type: corev1.EventTypeWarning, - Reason: "TerminationGracePeriodExpiring", - Message: fmt.Sprintf("All pods will be deleted by %s", t.Format(time.RFC3339)), - DedupeValues: []string{node.Name}, - } -} - -func NodeClaimTerminationGracePeriodExpiringEvent(nodeClaim *v1.NodeClaim, terminationTime string) events.Event { - return events.Event{ - InvolvedObject: nodeClaim, - Type: corev1.EventTypeWarning, - Reason: "TerminationGracePeriodExpiring", - Message: fmt.Sprintf("All pods will be deleted by %s", terminationTime), - DedupeValues: []string{nodeClaim.Name}, - } -} diff --git a/pkg/controllers/node/termination/drain/metrics.go b/pkg/controllers/node/termination/drain/metrics.go new file mode 100644 index 0000000000..1aa8f1849e --- /dev/null +++ b/pkg/controllers/node/termination/drain/metrics.go @@ -0,0 +1,38 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package drain + +import ( + opmetrics "github.com/awslabs/operatorpkg/metrics" + "github.com/prometheus/client_golang/prometheus" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" + + "sigs.k8s.io/karpenter/pkg/metrics" +) + +var ( + NodesDrainedTotal = opmetrics.NewPrometheusCounter( + crmetrics.Registry, + prometheus.CounterOpts{ + Namespace: metrics.Namespace, + Subsystem: metrics.NodeSubsystem, + Name: "drained_total", + Help: "The total number of nodes drained by Karpenter", + }, + []string{metrics.NodePoolLabel}, + ) +) diff --git a/pkg/controllers/node/termination/drain/suite_test.go b/pkg/controllers/node/termination/drain/suite_test.go new file mode 100644 index 0000000000..a2e4bea43b --- /dev/null +++ b/pkg/controllers/node/termination/drain/suite_test.go @@ -0,0 +1,599 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package drain_test + +import ( + "context" + "testing" + "time" + + clock "k8s.io/utils/clock/testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "sigs.k8s.io/karpenter/pkg/apis" + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/drain" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" + terminationreconcile "sigs.k8s.io/karpenter/pkg/controllers/node/termination/reconcile" + "sigs.k8s.io/karpenter/pkg/test" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/samber/lo" + + . "sigs.k8s.io/karpenter/pkg/test/expectations" + "sigs.k8s.io/karpenter/pkg/test/v1alpha1" + . 
"sigs.k8s.io/karpenter/pkg/utils/testing" +) + +var ctx context.Context +var queue *eviction.Queue +var env *test.Environment +var fakeClock *clock.FakeClock +var cloudProvider *fake.CloudProvider +var recorder *test.EventRecorder + +var reconciler reconcile.Reconciler + +var defaultOwnerRefs = []metav1.OwnerReference{{Kind: "ReplicaSet", APIVersion: "appsv1", Name: "rs", UID: "1234567890"}} + +func TestAPIs(t *testing.T) { + ctx = TestContextWithLogger(t) + RegisterFailHandler(Fail) + RunSpecs(t, "Drain") +} + +var _ = BeforeSuite(func() { + fakeClock = clock.NewFakeClock(time.Now()) + env = test.NewEnvironment( + test.WithCRDs(apis.CRDs...), + test.WithCRDs(v1alpha1.CRDs...), + test.WithFieldIndexers(test.NodeClaimProviderIDFieldIndexer(ctx), test.VolumeAttachmentFieldIndexer(ctx)), + ) + cloudProvider = fake.NewCloudProvider() + recorder = test.NewEventRecorder() + queue = eviction.NewTestingQueue(env.Client, recorder) + reconciler = terminationreconcile.AsReconciler(env.Client, cloudProvider, fakeClock, drain.NewController(fakeClock, env.Client, cloudProvider, recorder, queue)) +}) + +var _ = AfterSuite(func() { + Expect(env.Stop()).To(Succeed(), "Failed to stop environment") +}) + +var _ = Describe("Drain", func() { + var node *corev1.Node + var nodeClaim *v1.NodeClaim + + BeforeEach(func() { + fakeClock.SetTime(time.Now()) + cloudProvider.Reset() + *queue = lo.FromPtr(eviction.NewTestingQueue(env.Client, recorder)) + nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{ObjectMeta: metav1.ObjectMeta{Finalizers: []string{ + v1.TerminationFinalizer, + v1.DrainFinalizer, + }}}) + cloudProvider.CreatedNodeClaims[node.Spec.ProviderID] = nodeClaim + }) + AfterEach(func() { + ExpectCleanedUp(ctx, env.Client) + drain.NodesDrainedTotal.Reset() + }) + + // TODO: Move to reconciler + It("should exclude nodes from load balancers when terminating", func() { + labels := map[string]string{"foo": "bar"} + pod := test.Pod(test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: defaultOwnerRefs, + Labels: labels, + }, + }) + // Create a fully blocking PDB to prevent the node from being deleted before we can observe its labels + pdb := test.PodDisruptionBudget(test.PDBOptions{ + Labels: labels, + MaxUnavailable: lo.ToPtr(intstr.FromInt(0)), + }) + + ExpectApplied(ctx, env.Client, node, nodeClaim, pod, pdb) + ExpectManualBinding(ctx, env.Client, pod, node) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + node = ExpectNodeExists(ctx, env.Client, node.Name) + Expect(node.Labels[corev1.LabelNodeExcludeBalancers]).Should(Equal("karpenter")) + }) + DescribeTable( + "should not evict pods that tolerate the 'karpenter.sh/disrupted' taint", + func(toleration corev1.Toleration) { + podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) + podSkip := test.Pod(test.PodOptions{ + NodeName: node.Name, + Tolerations: []corev1.Toleration{toleration}, + ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podSkip) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, 
env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + Expect(queue.Has(podSkip)).To(BeFalse()) + Expect(queue.Has(podEvict)).To(BeTrue()) + ExpectSingletonReconciled(ctx, queue) + EventuallyExpectTerminating(ctx, env.Client, podEvict) + ExpectDeleted(ctx, env.Client, podEvict) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }, + Entry("with an equals operator", corev1.Toleration{Key: v1.DisruptedTaintKey, Operator: corev1.TolerationOpEqual, Effect: v1.DisruptedNoScheduleTaint.Effect, Value: v1.DisruptedNoScheduleTaint.Value}), + Entry("with an exists operator", corev1.Toleration{Key: v1.DisruptedTaintKey, Operator: corev1.TolerationOpExists, Effect: v1.DisruptedNoScheduleTaint.Effect}), + ) + It("should evict pods that tolerate the node.kubernetes.io/unschedulable taint", func() { + podEvict := test.Pod(test.PodOptions{ + NodeName: node.Name, + Tolerations: []corev1.Toleration{{Key: corev1.TaintNodeUnschedulable, Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule}}, + ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + Expect(queue.Has(podEvict)).To(BeTrue()) + ExpectSingletonReconciled(ctx, queue) + EventuallyExpectTerminating(ctx, env.Client, podEvict) + ExpectDeleted(ctx, env.Client, podEvict) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }) + It("should finalize nodes that have pods without an ownerRef", func() { + pod := test.Pod(test.PodOptions{ + NodeName: node.Name, + ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: nil, + }, + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, pod) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + Expect(queue.Has(pod)).To(BeTrue()) + ExpectSingletonReconciled(ctx, queue) + EventuallyExpectTerminating(ctx, env.Client, pod) + ExpectDeleted(ctx, env.Client, pod) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + 
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }) + It("should finalize nodes with terminal pods", func() { + podEvictPhaseSucceeded := test.Pod(test.PodOptions{ + NodeName: node.Name, + Phase: corev1.PodSucceeded, + }) + podEvictPhaseFailed := test.Pod(test.PodOptions{ + NodeName: node.Name, + Phase: corev1.PodFailed, + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, podEvictPhaseSucceeded, podEvictPhaseFailed) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + + Expect(queue.Has(podEvictPhaseSucceeded)).To(BeFalse()) + Expect(queue.Has(podEvictPhaseFailed)).To(BeFalse()) + + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }) + It("should fail to evict pods that violate a PDB", func() { + minAvailable := intstr.FromInt32(1) + labelSelector := map[string]string{test.RandomName(): test.RandomName()} + pdb := test.PodDisruptionBudget(test.PDBOptions{ + Labels: labelSelector, + // Don't let any pod evict + MinAvailable: &minAvailable, + }) + podNoEvict := test.Pod(test.PodOptions{ + NodeName: node.Name, + ObjectMeta: metav1.ObjectMeta{ + Labels: labelSelector, + OwnerReferences: defaultOwnerRefs, + }, + Phase: corev1.PodRunning, + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, podNoEvict, pdb) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + // Expect podNoEvict to be added to the queue - we won't evict it due to PDBs + Expect(queue.Has(podNoEvict)).To(BeTrue()) + ExpectSingletonReconciled(ctx, queue) + Expect(queue.Has(podNoEvict)).To(BeTrue()) + ExpectDeleted(ctx, env.Client, podNoEvict) + ExpectNotFound(ctx, env.Client, podNoEvict) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + }) + It("should evict pods in order and wait until pods are fully deleted", func() { + daemonEvict := test.DaemonSet() + daemonNodeCritical := test.DaemonSet(test.DaemonSetOptions{PodOptions: test.PodOptions{PriorityClassName: "system-node-critical"}}) + daemonClusterCritical := test.DaemonSet(test.DaemonSetOptions{PodOptions: test.PodOptions{PriorityClassName: "system-cluster-critical"}}) + ExpectApplied(ctx, env.Client, daemonEvict, daemonNodeCritical, daemonClusterCritical) + + podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) 
+ podDaemonEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: []metav1.OwnerReference{{ + APIVersion: "apps/v1", + Kind: "DaemonSet", + Name: daemonEvict.Name, + UID: daemonEvict.UID, + Controller: lo.ToPtr(true), + BlockOwnerDeletion: lo.ToPtr(true), + }}}}) + podNodeCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-node-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) + podClusterCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-cluster-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) + podDaemonNodeCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-node-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: []metav1.OwnerReference{{ + APIVersion: "apps/v1", + Kind: "DaemonSet", + Name: daemonNodeCritical.Name, + UID: daemonNodeCritical.UID, + Controller: lo.ToPtr(true), + BlockOwnerDeletion: lo.ToPtr(true), + }}}}) + podDaemonClusterCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-cluster-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: []metav1.OwnerReference{{ + APIVersion: "apps/v1", + Kind: "DaemonSet", + Name: daemonClusterCritical.Name, + UID: daemonClusterCritical.UID, + Controller: lo.ToPtr(true), + BlockOwnerDeletion: lo.ToPtr(true), + }}}}) + ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podNodeCritical, podClusterCritical, podDaemonEvict, podDaemonNodeCritical, podDaemonClusterCritical) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + + podGroups := [][]*corev1.Pod{{podEvict}, {podDaemonEvict}, {podNodeCritical, podClusterCritical}, {podDaemonNodeCritical, podDaemonClusterCritical}} + for i, podGroup := range podGroups { + node = ExpectNodeExists(ctx, env.Client, node.Name) + for _, p := range podGroup { + ExpectPodExists(ctx, env.Client, p.Name, p.Namespace) + } + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + if i == 0 { + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + } else { + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + } + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + for range podGroup { + ExpectSingletonReconciled(ctx, queue) + } + // Start draining the pod group, but don't complete it yet + EventuallyExpectTerminating(ctx, env.Client, lo.Map(podGroup, func(p *corev1.Pod, _ int) client.Object { return p })...) + + // Look at the next pod group and ensure that none of the pods have started terminating on it + if i != len(podGroups)-1 { + for range podGroups[i+1] { + ExpectSingletonReconciled(ctx, queue) + } + ConsistentlyExpectNotTerminating(ctx, env.Client, lo.Map(podGroups[i+1], func(p *corev1.Pod, _ int) client.Object { return p })...) + } + // Expect that the pods are deleted -- which should unblock the next pod group + ExpectDeleted(ctx, env.Client, lo.Map(podGroup, func(p *corev1.Pod, _ int) client.Object { return p })...) 
+ } + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }) + It("should not evict static pods", func() { + podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) + ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict) + + podNoEvict := test.Pod(test.PodOptions{ + NodeName: node.Name, + ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: []metav1.OwnerReference{{ + APIVersion: "v1", + Kind: "Node", + Name: node.Name, + UID: node.UID, + }}, + }, + }) + ExpectApplied(ctx, env.Client, podNoEvict) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + node = ExpectNodeExists(ctx, env.Client, node.Name) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + // Expect mirror pod to not be queued for eviction + Expect(queue.Has(podNoEvict)).To(BeFalse()) + Expect(queue.Has(podEvict)).To(BeTrue()) + ExpectSingletonReconciled(ctx, queue) + EventuallyExpectTerminating(ctx, env.Client, podEvict) + ExpectDeleted(ctx, env.Client, podEvict) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }) + It("should not finalize nodes until all pods are deleted", func() { + pods := test.Pods(2, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) + ExpectApplied(ctx, env.Client, node, nodeClaim, pods[0], pods[1]) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + node = ExpectNodeExists(ctx, env.Client, node.Name) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + for _, p := range pods { + Expect(queue.Has(p)).To(BeTrue()) + } + ExpectSingletonReconciled(ctx, queue) + ExpectSingletonReconciled(ctx, queue) + EventuallyExpectTerminating(ctx, env.Client, pods[0], pods[1]) + ExpectDeleted(ctx, env.Client, pods[1]) + + // Expect node to exist and be draining, but not deleted + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + + ExpectDeleted(ctx, env.Client, pods[0]) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = 
ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }) + It("should finalize nodes when pods are stuck terminating", func() { + pod := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) + fakeClock.SetTime(time.Now()) // make our fake clock match the pod creation time + ExpectApplied(ctx, env.Client, node, nodeClaim, pod) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + // Before grace period, node should not delete + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + Expect(queue.Has(pod)).To(BeTrue()) + ExpectSingletonReconciled(ctx, queue) + EventuallyExpectTerminating(ctx, env.Client, pod) + + // After grace period, node should delete. The deletion timestamps are from etcd which we can't control, so + // to eliminate test-flakiness we reset the time to current time + 90 seconds instead of just advancing + // the clock by 90 seconds. + fakeClock.SetTime(time.Now().Add(90 * time.Second)) + // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }) + It("should not evict a new pod with the same name using the old pod's eviction queue key", func() { + pod := test.Pod(test.PodOptions{ + NodeName: node.Name, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + OwnerReferences: defaultOwnerRefs, + }, + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, pod) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + // Trigger Termination Controller + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + // Don't trigger a call into the queue to make sure that we effectively aren't triggering eviction + // We'll use this to try to leave pods in the queue + + // Delete the pod directly to act like something else is doing the pod termination + ExpectDeleted(ctx, env.Client, pod) + + // Requeue the termination controller to completely delete the node + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + ExpectFinalizersRemoved(ctx, env.Client, node) + ExpectNotFound(ctx, env.Client, node) + + // Expect that the old pod's key still exists in the 
queue + Expect(queue.Has(pod)).To(BeTrue()) + + // Re-create the pod and node, it should now have the same name, but a different UUID + node = test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Finalizers: []string{v1.TerminationFinalizer}, + }, + }) + pod = test.Pod(test.PodOptions{ + NodeName: node.Name, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + OwnerReferences: defaultOwnerRefs, + }, + }) + ExpectApplied(ctx, env.Client, node, pod) + + // Trigger eviction queue with the pod key still in it + ExpectSingletonReconciled(ctx, queue) + + Consistently(func(g Gomega) { + g.Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(pod), pod)).To(Succeed()) + g.Expect(pod.DeletionTimestamp.IsZero()).To(BeTrue()) + }, ReconcilerPropagationTime, RequestInterval).Should(Succeed()) + }) + It("should preemptively delete pods to satisfy their terminationGracePeriodSeconds", func() { + nodeClaim.Spec.TerminationGracePeriod = &metav1.Duration{Duration: time.Second * 300} + nodeClaim.Annotations = map[string]string{ + v1.NodeClaimTerminationTimestampAnnotationKey: time.Now().Add(nodeClaim.Spec.TerminationGracePeriod.Duration).Format(time.RFC3339), + } + pod := test.Pod(test.PodOptions{ + NodeName: node.Name, + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + v1.DoNotDisruptAnnotationKey: "true", + }, + OwnerReferences: defaultOwnerRefs, + }, + TerminationGracePeriodSeconds: lo.ToPtr(int64(300)), + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, pod) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectSingletonReconciled(ctx, queue) + ExpectNodeExists(ctx, env.Client, node.Name) + ExpectDeleted(ctx, env.Client, pod) + }) + It("should only delete pods when their terminationGracePeriodSeconds is less than the node's remaining terminationGracePeriod", func() { + nodeClaim.Spec.TerminationGracePeriod = &metav1.Duration{Duration: time.Second * 300} + nodeClaim.Annotations = map[string]string{ + v1.NodeClaimTerminationTimestampAnnotationKey: time.Now().Add(nodeClaim.Spec.TerminationGracePeriod.Duration).Format(time.RFC3339), + } + pod := test.Pod(test.PodOptions{ + NodeName: node.Name, + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + v1.DoNotDisruptAnnotationKey: "true", + }, + OwnerReferences: defaultOwnerRefs, + }, + TerminationGracePeriodSeconds: lo.ToPtr(int64(60)), + }) + fakeClock.SetTime(time.Now()) + ExpectApplied(ctx, env.Client, node, nodeClaim, pod) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + + // expect pod still exists + fakeClock.Step(90 * time.Second) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + ExpectNodeExists(ctx, env.Client, node.Name) + ExpectPodExists(ctx, env.Client, pod.Name, pod.Namespace) + + // expect pod is now deleted + fakeClock.Step(175 * time.Second) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectSingletonReconciled(ctx, queue) + ExpectDeleted(ctx, env.Client, pod) + }) + Context("Metrics", func() { + It("should update the eviction queueDepth metric when reconciling pods", func() { + minAvailable := intstr.FromInt32(0) + labelSelector := map[string]string{test.RandomName(): test.RandomName()} + pdb := test.PodDisruptionBudget(test.PDBOptions{ + Labels: labelSelector, + // Don't let any pod evict + MinAvailable: &minAvailable, + }) + ExpectApplied(ctx, env.Client, pdb, node, nodeClaim) +
pods := test.Pods(5, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: defaultOwnerRefs, + Labels: labelSelector, + }}) + ExpectApplied(ctx, env.Client, lo.Map(pods, func(p *corev1.Pod, _ int) client.Object { return p })...) + + wqDepthBefore, _ := FindMetricWithLabelValues("workqueue_adds_total", map[string]string{"name": "eviction.workqueue"}) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + node = ExpectNodeExists(ctx, env.Client, node.Name) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + wqDepthAfter, ok := FindMetricWithLabelValues("workqueue_adds_total", map[string]string{"name": "eviction.workqueue"}) + Expect(ok).To(BeTrue()) + Expect(lo.FromPtr(wqDepthAfter.GetCounter().Value) - lo.FromPtr(wqDepthBefore.GetCounter().Value)).To(BeNumerically("==", 5)) + }) + }) +}) + +func ExpectNodeWithNodeClaimDraining(c client.Client, nodeName string) *corev1.Node { + GinkgoHelper() + node := ExpectNodeExists(ctx, c, nodeName) + Expect(node.Spec.Taints).To(ContainElement(v1.DisruptedNoScheduleTaint)) + Expect(node.Finalizers).To(ContainElements(v1.TerminationFinalizer, v1.DrainFinalizer)) + Expect(node.DeletionTimestamp).ToNot(BeNil()) + return node +} diff --git a/pkg/controllers/node/termination/instancetermination.go b/pkg/controllers/node/termination/instancetermination.go deleted file mode 100644 index 5ea7086e43..0000000000 --- a/pkg/controllers/node/termination/instancetermination.go +++ /dev/null @@ -1,71 +0,0 @@ -/* -Copyright The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package termination - -import ( - "context" - "fmt" - "time" - - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/utils/clock" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/reconcile" - - v1 "sigs.k8s.io/karpenter/pkg/apis/v1" - "sigs.k8s.io/karpenter/pkg/cloudprovider" - nodeclaimutils "sigs.k8s.io/karpenter/pkg/utils/nodeclaim" - "sigs.k8s.io/karpenter/pkg/utils/termination" -) - -type InstanceTerminationReconciler struct { - clk clock.Clock - kubeClient client.Client - cloudProvider cloudprovider.CloudProvider -} - -func (i *InstanceTerminationReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { - elapsed, err := nodeclaimutils.HasTerminationGracePeriodElapsed(i.clk, nc) - if err != nil { - log.FromContext(ctx).Error(err, "failed to terminate node") - return reconcile.Result{}, nil - } - if !nc.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue() && !elapsed { - return reconcile.Result{}, nil - } - isInstanceTerminated, err := termination.EnsureTerminated(ctx, i.kubeClient, nc, i.cloudProvider) - if err != nil { - // 404 = the nodeClaim no longer exists - if errors.IsNotFound(err) { - return reconcile.Result{}, nil - } - // 409 - The nodeClaim exists, but its status has already been modified - if errors.IsConflict(err) { - return reconcile.Result{Requeue: true}, nil - } - return reconcile.Result{}, fmt.Errorf("ensuring instance termination, %w", err) - } - if !isInstanceTerminated { - return reconcile.Result{RequeueAfter: 5 * time.Second}, nil - } - if err := removeFinalizer(ctx, i.kubeClient, n); err != nil { - return reconcile.Result{}, err - } - return reconcile.Result{}, nil -} diff --git a/pkg/controllers/node/termination/instancetermination/controller.go b/pkg/controllers/node/termination/instancetermination/controller.go new file mode 100644 index 0000000000..756cb605f9 --- /dev/null +++ b/pkg/controllers/node/termination/instancetermination/controller.go @@ -0,0 +1,140 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package instancetermination + +import ( + "context" + "fmt" + "time" + + "golang.org/x/time/rate" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/client-go/util/workqueue" + "k8s.io/utils/clock" + controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + terminationreconcile "sigs.k8s.io/karpenter/pkg/controllers/node/termination/reconcile" + "sigs.k8s.io/karpenter/pkg/metrics" + nodeutils "sigs.k8s.io/karpenter/pkg/utils/node" + + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/cloudprovider" + terminationutils "sigs.k8s.io/karpenter/pkg/utils/termination" +) + +type Controller struct { + clk clock.Clock + kubeClient client.Client + cloudProvider cloudprovider.CloudProvider +} + +func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider cloudprovider.CloudProvider) *Controller { + return &Controller{ + clk: clk, + kubeClient: kubeClient, + cloudProvider: cloudProvider, + } +} + +func (*Controller) Name() string { + return "node.instancetermination" +} + +func (c *Controller) Register(_ context.Context, m manager.Manager) error { + return controllerruntime.NewControllerManagedBy(m). + Named(c.Name()). + For(&corev1.Node{}, builder.WithPredicates(nodeutils.IsManagedPredicateFuncs(c.cloudProvider))). + Watches(&v1.NodeClaim{}, nodeutils.NodeClaimEventHandler(c.kubeClient, c.cloudProvider)). + WithOptions( + controller.Options{ + RateLimiter: workqueue.NewTypedMaxOfRateLimiter[reconcile.Request]( + workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](100*time.Millisecond, 10*time.Second), + // 10 qps, 100 bucket size + &workqueue.TypedBucketRateLimiter[reconcile.Request]{Limiter: rate.NewLimiter(rate.Limit(10), 100)}, + ), + MaxConcurrentReconciles: 100, + }, + ). 
+ Complete(terminationreconcile.AsReconciler(m.GetClient(), c.cloudProvider, c.clk, c)) +} + +func (*Controller) AwaitFinalizers() []string { + return []string{v1.DrainFinalizer, v1.VolumeFinalizer} +} + +func (*Controller) Finalizer() string { + return v1.TerminationFinalizer +} + +func (*Controller) NodeClaimNotFoundPolicy() terminationreconcile.Policy { + return terminationreconcile.PolicyContinue +} + +func (*Controller) TerminationGracePeriodPolicy() terminationreconcile.Policy { + return terminationreconcile.PolicyContinue +} + +func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { + isInstanceTerminated, err := terminationutils.EnsureTerminated(ctx, c.kubeClient, nc, c.cloudProvider) + if err != nil { + // 404 = the nodeClaim no longer exists + if errors.IsNotFound(err) { + return reconcile.Result{}, nil + } + // 409 - The nodeClaim exists, but its status has already been modified + if errors.IsConflict(err) { + return reconcile.Result{Requeue: true}, nil + } + return reconcile.Result{}, fmt.Errorf("ensuring instance termination, %w", err) + } + if !isInstanceTerminated { + return reconcile.Result{RequeueAfter: 5 * time.Second}, nil + } + + stored := n.DeepCopy() + controllerutil.RemoveFinalizer(n, c.Finalizer()) + // We use client.StrategicMergeFrom here since the node object supports it and + // a strategic merge patch represents the finalizer list as a keyed "set" so removing + // an item from the list doesn't replace the full list + // https://github.com/kubernetes/kubernetes/issues/111643#issuecomment-2016489732 + if err := c.kubeClient.Patch(ctx, n, client.StrategicMergeFrom(stored)); err != nil { + if errors.IsNotFound(err) { + return reconcile.Result{}, nil + } + return reconcile.Result{}, fmt.Errorf("removing %q finalizer, %w", c.Finalizer(), err) + } + metrics.NodesTerminatedTotal.Inc(map[string]string{ + metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], + }) + // We use stored.DeletionTimestamp since the api-server may give back a node after the patch without a deletionTimestamp + DurationSeconds.Observe(time.Since(stored.DeletionTimestamp.Time).Seconds(), map[string]string{ + metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], + }) + NodeLifetimeDurationSeconds.Observe(time.Since(n.CreationTimestamp.Time).Seconds(), map[string]string{ + metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], + }) + log.FromContext(ctx).Info("deleted node") + return reconcile.Result{}, nil +} diff --git a/pkg/controllers/node/termination/metrics.go b/pkg/controllers/node/termination/instancetermination/metrics.go similarity index 87% rename from pkg/controllers/node/termination/metrics.go rename to pkg/controllers/node/termination/instancetermination/metrics.go index 32d829c6d4..eae7e85ea7 100644 --- a/pkg/controllers/node/termination/metrics.go +++ b/pkg/controllers/node/termination/instancetermination/metrics.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package termination +package instancetermination import ( "time" @@ -40,16 +40,6 @@ var ( }, []string{metrics.NodePoolLabel}, ) - NodesDrainedTotal = opmetrics.NewPrometheusCounter( - crmetrics.Registry, - prometheus.CounterOpts{ - Namespace: metrics.Namespace, - Subsystem: metrics.NodeSubsystem, - Name: "drained_total", - Help: "The total number of nodes drained by Karpenter", - }, - []string{metrics.NodePoolLabel}, - ) NodeLifetimeDurationSeconds = opmetrics.NewPrometheusHistogram( crmetrics.Registry, prometheus.HistogramOpts{ diff --git a/pkg/controllers/node/termination/instancetermination/suite_test.go b/pkg/controllers/node/termination/instancetermination/suite_test.go new file mode 100644 index 0000000000..aa11218aca --- /dev/null +++ b/pkg/controllers/node/termination/instancetermination/suite_test.go @@ -0,0 +1,188 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package instancetermination_test + +import ( + "context" + "testing" + "time" + + clock "k8s.io/utils/clock/testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "sigs.k8s.io/karpenter/pkg/apis" + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/instancetermination" + terminationreconcile "sigs.k8s.io/karpenter/pkg/controllers/node/termination/reconcile" + "sigs.k8s.io/karpenter/pkg/metrics" + "sigs.k8s.io/karpenter/pkg/test" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/samber/lo" + + . "sigs.k8s.io/karpenter/pkg/test/expectations" + "sigs.k8s.io/karpenter/pkg/test/v1alpha1" + . 
"sigs.k8s.io/karpenter/pkg/utils/testing" +) + +var ctx context.Context +var env *test.Environment +var fakeClock *clock.FakeClock +var cloudProvider *fake.CloudProvider +var recorder *test.EventRecorder + +var reconciler reconcile.Reconciler + +func TestAPIs(t *testing.T) { + ctx = TestContextWithLogger(t) + RegisterFailHandler(Fail) + RunSpecs(t, "InstanceTermination") +} + +var _ = BeforeSuite(func() { + fakeClock = clock.NewFakeClock(time.Now()) + env = test.NewEnvironment( + test.WithCRDs(apis.CRDs...), + test.WithCRDs(v1alpha1.CRDs...), + test.WithFieldIndexers(test.NodeClaimProviderIDFieldIndexer(ctx), test.VolumeAttachmentFieldIndexer(ctx)), + ) + cloudProvider = fake.NewCloudProvider() + recorder = test.NewEventRecorder() + reconciler = terminationreconcile.AsReconciler(env.Client, cloudProvider, fakeClock, instancetermination.NewController(fakeClock, env.Client, cloudProvider)) +}) + +var _ = AfterSuite(func() { + Expect(env.Stop()).To(Succeed(), "Failed to stop environment") +}) + +var _ = Describe("InstanceTermination", func() { + var node *corev1.Node + var nodeClaim *v1.NodeClaim + + BeforeEach(func() { + fakeClock.SetTime(time.Now()) + cloudProvider.Reset() + nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{ObjectMeta: metav1.ObjectMeta{Finalizers: []string{ + v1.TerminationFinalizer, + }}}) + cloudProvider.CreatedNodeClaims[node.Spec.ProviderID] = nodeClaim + }) + AfterEach(func() { + metrics.NodesTerminatedTotal.Reset() + instancetermination.DurationSeconds.Reset() + instancetermination.NodeLifetimeDurationSeconds.Reset() + }) + + It("should terminate the instance", func() { + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsUnknown()).To(BeTrue()) + Expect(len(cloudProvider.DeleteCalls)).To(Equal(0)) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue()) + Expect(len(cloudProvider.DeleteCalls)).To(Equal(1)) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + Expect(len(cloudProvider.DeleteCalls)).To(Equal(1)) + ExpectNotFound(ctx, env.Client, node) + }) + DescribeTable( + "should ignore nodes with dependent finalizers", + func(finalizer string) { + node.Finalizers = append(nodeClaim.Finalizers, finalizer) + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsUnknown()).To(BeTrue()) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsUnknown()).To(BeTrue()) + + node = ExpectExists(ctx, env.Client, node) + stored := node.DeepCopy() + node.Finalizers = lo.Reject(node.Finalizers, func(f string, _ int) bool { + return f == finalizer + }) + Expect(env.Client.Patch(ctx, node, client.StrategicMergeFrom(stored))).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + 
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue()) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNotFound(ctx, env.Client, node) + }, + Entry("VolumeFinalizer", v1.VolumeFinalizer), + Entry("DrainFinalizer", v1.DrainFinalizer), + ) + It("should finalize node when the instance has already been terminated", func() { + delete(cloudProvider.CreatedNodeClaims, node.Spec.ProviderID) + ExpectApplied(ctx, env.Client, node, nodeClaim) + ExpectMakeNodesNotReady(ctx, env.Client, node) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectExists(ctx, env.Client, node) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNotFound(ctx, env.Client, node) + }) + Context("Metrics", func() { + It("should fire the terminationSummary metric when deleting nodes", func() { + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + node = ExpectNodeExists(ctx, env.Client, node.Name) + // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + + m, ok := FindMetricWithLabelValues("karpenter_nodes_termination_duration_seconds", map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]}) + Expect(ok).To(BeTrue()) + Expect(m.GetSummary().GetSampleCount()).To(BeNumerically("==", 1)) + }) + It("should fire the nodesTerminated counter metric when deleting nodes", func() { + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + node = ExpectNodeExists(ctx, env.Client, node.Name) + // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + + m, ok := FindMetricWithLabelValues("karpenter_nodes_terminated_total", map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]}) + Expect(ok).To(BeTrue()) + Expect(lo.FromPtr(m.GetCounter().Value)).To(BeNumerically("==", 1)) + }) + It("should fire the lifetime duration histogram metric when deleting nodes", func() { + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + node = ExpectNodeExists(ctx, env.Client, node.Name) + // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + + m, ok := FindMetricWithLabelValues("karpenter_nodes_lifetime_duration_seconds", map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]}) + Expect(ok).To(BeTrue()) + Expect(lo.FromPtr(m.GetHistogram().SampleCount)).To(BeNumerically("==", 1)) + }) + }) +}) diff --git a/pkg/controllers/node/termination/reconcile/reconcile.go b/pkg/controllers/node/termination/reconcile/reconcile.go new file mode 100644 index 0000000000..265e7ab1d4 --- /dev/null +++ b/pkg/controllers/node/termination/reconcile/reconcile.go @@ -0,0 +1,161 @@ +/* +Copyright The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package reconcile + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/klog/v2" + "k8s.io/utils/clock" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/cloudprovider" + "sigs.k8s.io/karpenter/pkg/operator/injection" + nodeutils "sigs.k8s.io/karpenter/pkg/utils/node" + nodeclaimutils "sigs.k8s.io/karpenter/pkg/utils/nodeclaim" +) + +func AsReconciler(c client.Client, cp cloudprovider.CloudProvider, clk clock.Clock, r Reconciler) reconcile.Reconciler { + return reconcile.AsReconciler(c, &reconcilerAdapter{ + Reconciler: r, + kubeClient: c, + cloudProvider: cp, + clk: clk, + }) +} + +type Reconciler interface { + AwaitFinalizers() []string + Finalizer() string + + Name() string + Reconcile(context.Context, *corev1.Node, *v1.NodeClaim) (reconcile.Result, error) + + NodeClaimNotFoundPolicy() Policy + TerminationGracePeriodPolicy() Policy +} + +type Policy int + +const ( + // If the condition associated with this policy is true, the reconciler's associated finalizer will be removed and + // the reconciler will not be run. + PolicyFinalize Policy = iota + // If the condition associated with this policy is true, the reconciler will be run normally. + PolicyContinue +) + +type reconcilerAdapter struct { + Reconciler + + kubeClient client.Client + cloudProvider cloudprovider.CloudProvider + clk clock.Clock +} + +//nolint:gocyclo +func (r *reconcilerAdapter) Reconcile(ctx context.Context, n *corev1.Node) (reconcile.Result, error) { + ctx = injection.WithControllerName(ctx, r.Name()) + ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef(n.Namespace, n.Name))) + + if n.GetDeletionTimestamp().IsZero() { + return reconcile.Result{}, nil + } + if !controllerutil.ContainsFinalizer(n, r.Finalizer()) { + return reconcile.Result{}, nil + } + if !nodeutils.IsManaged(n, r.cloudProvider) { + return reconcile.Result{}, nil + } + + for _, finalizer := range r.AwaitFinalizers() { + if ok, err := nodeutils.ContainsFinalizer(n, finalizer); err != nil { + // If any of the awaited finalizers require hydration, and the resources haven't been hydrated since the + // last Karpenter update, we should short-circuit. The resource should be re-reconciled once hydrated.
+ if nodeutils.IsHydrationError(err) { + return reconcile.Result{}, nil + } + return reconcile.Result{}, err + } else if ok { + return reconcile.Result{}, nil + } + } + + nc, err := nodeutils.NodeClaimForNode(ctx, r.kubeClient, n) + if err != nil { + if nodeutils.IsDuplicateNodeClaimError(err) || nodeutils.IsNodeClaimNotFoundError(err) { + log.FromContext(ctx).Error(err, "failed to terminate node") + return reconcile.Result{}, nil + } + return reconcile.Result{}, err + } + ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KRef(nc.Namespace, nc.Name))) + if nc.DeletionTimestamp.IsZero() { + if err := r.kubeClient.Delete(ctx, nc); err != nil { + if errors.IsNotFound(err) { + log.FromContext(ctx).Error(err, "failed to terminate node") + return reconcile.Result{}, nil + } + return reconcile.Result{}, err + } + } + + // If the underlying NodeClaim no longer exists, we want to finalize the node to avoid trying to gracefully drain + // nodes that are no longer alive. We do a check on the Ready condition of the node since, even + // though the CloudProvider says the instance is not around, we know that the kubelet process is still running + // if the Node Ready condition is true. + // Similar logic to: https://github.com/kubernetes/kubernetes/blob/3a75a8c8d9e6a1ebd98d8572132e675d4980f184/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go#L144 + if r.NodeClaimNotFoundPolicy() == PolicyFinalize { + if nodeutils.GetCondition(n, corev1.NodeReady).Status != corev1.ConditionTrue { + if _, err := r.cloudProvider.Get(ctx, n.Spec.ProviderID); err != nil { + if cloudprovider.IsNodeClaimNotFoundError(err) { + return reconcile.Result{}, client.IgnoreNotFound(RemoveFinalizer(ctx, r.kubeClient, n, r.Finalizer())) + } + return reconcile.Result{}, fmt.Errorf("getting nodeclaim, %w", err) + } + } + } + + if r.TerminationGracePeriodPolicy() == PolicyFinalize { + if elapsed, err := nodeclaimutils.HasTerminationGracePeriodElapsed(r.clk, nc); err != nil { + log.FromContext(ctx).Error(err, "failed to terminate node") + return reconcile.Result{}, nil + } else if elapsed { + return reconcile.Result{}, client.IgnoreNotFound(RemoveFinalizer(ctx, r.kubeClient, n, r.Finalizer())) + } + } + + return r.Reconciler.Reconcile(ctx, n, nc) +} + +func RemoveFinalizer(ctx context.Context, c client.Client, n *corev1.Node, finalizer string) error { + stored := n.DeepCopy() + if !controllerutil.RemoveFinalizer(n, finalizer) { + return nil + } + if err := c.Patch(ctx, n, client.StrategicMergeFrom(stored)); err != nil { + return err + } + return nil +} diff --git a/pkg/controllers/node/termination/reconcile/suite_test.go b/pkg/controllers/node/termination/reconcile/suite_test.go new file mode 100644 index 0000000000..34b99a8d95 --- /dev/null +++ b/pkg/controllers/node/termination/reconcile/suite_test.go @@ -0,0 +1,292 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/ + +package reconcile_test + +import ( + "context" + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" + + corev1 "k8s.io/api/core/v1" + clock "k8s.io/utils/clock/testing" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "sigs.k8s.io/karpenter/pkg/apis" + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" + terminationreconcile "sigs.k8s.io/karpenter/pkg/controllers/node/termination/reconcile" + "sigs.k8s.io/karpenter/pkg/test" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/samber/lo" + + . "sigs.k8s.io/karpenter/pkg/test/expectations" + "sigs.k8s.io/karpenter/pkg/test/v1alpha1" + . "sigs.k8s.io/karpenter/pkg/utils/testing" +) + +var ( + DependentFinalizers = sets.New(v1.DrainFinalizer, v1.VolumeFinalizer) + TestFinalizer = "karpenter.sh/test" +) + +var ctx context.Context +var env *test.Environment +var fakeClock *clock.FakeClock +var cloudProvider *fake.CloudProvider +var recorder *test.EventRecorder + +var testReconciler *TestReconciler +var reconciler reconcile.Reconciler + +func TestAPIs(t *testing.T) { + ctx = TestContextWithLogger(t) + RegisterFailHandler(Fail) + RunSpecs(t, "TerminationReconciler") +} + +var _ = BeforeSuite(func() { + fakeClock = clock.NewFakeClock(time.Now()) + env = test.NewEnvironment( + test.WithCRDs(apis.CRDs...), + test.WithCRDs(v1alpha1.CRDs...), + test.WithFieldIndexers(test.NodeClaimProviderIDFieldIndexer(ctx), test.VolumeAttachmentFieldIndexer(ctx)), + ) + cloudProvider = fake.NewCloudProvider() + recorder = test.NewEventRecorder() + testReconciler = &TestReconciler{} + reconciler = terminationreconcile.AsReconciler(env.Client, cloudProvider, fakeClock, testReconciler) +}) + +var _ = AfterSuite(func() { + Expect(env.Stop()).To(Succeed(), "Failed to stop environment") +}) + +var _ = Describe("TerminationReconciler", func() { + var node *corev1.Node + var nodeClaim *v1.NodeClaim + + BeforeEach(func() { + nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Finalizers: append(DependentFinalizers.UnsortedList(), TestFinalizer), + }, + }) + testReconciler.Reset() + cloudProvider.CreatedNodeClaims[node.Spec.ProviderID] = nodeClaim + }) + + It("shouldn't reconcile against Nodes not managed by this instance of Karpenter", func() { + delete(node.Labels, v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())) + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeFalse()) + }) + It("shouldn't reconcile against Nodes without deletion timestamps set", func() { + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + ExpectApplied(ctx, env.Client, node, nodeClaim) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeTrue()) + node = 
ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeFalse()) + }) + It("shouldn't reconcile against Nodes without the reconciler's finalizer", func() { + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return finalizer == TestFinalizer + }) + ExpectApplied(ctx, env.Client, node, nodeClaim) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(testReconciler.HasReconciled).To(BeFalse()) + }) + It("shouldn't reconcile if dependent finalizers are present", func() { + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeFalse()) + }) + It("shouldn't reconcile if dependent finalizers aren't present, but the Node hasn't been hydrated", func() { + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + node.Annotations[v1.HydrationAnnotationKey] = "not-hydrated" + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeFalse()) + }) + It("shouldn't reconcile against a Node with a duplicate NodeClaim", func() { + nodeClaim2 := nodeClaim.DeepCopy() + nodeClaim.Name = "duplicate-nodeclaim" + + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, nodeClaim2) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + // We should still succeed to reconcile - we won't succeed on a subsequent reconciliation so there's no reason + // to go into error backoff + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeFalse()) + }) + It("shouldn't reconcile against a Node without a NodeClaim", func() { + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + ExpectApplied(ctx, env.Client, node) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeFalse()) + }) + It("should delete the NodeClaim for a Node", func() { + node.Finalizers = 
lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeFalse()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeTrue()) + }) + DescribeTable( + "should respect the reconciler's NodeClaimNotFoundPolicy", + func(policy terminationreconcile.Policy) { + testReconciler.NodeClaimNotFoundPolicyResult = policy + delete(cloudProvider.CreatedNodeClaims, node.Spec.ProviderID) + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + ExpectApplied(ctx, env.Client, nodeClaim, node) + ExpectMakeNodesNotReady(ctx, env.Client, node) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeFalse()) + switch policy { + case terminationreconcile.PolicyContinue: + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeTrue()) + case terminationreconcile.PolicyFinalize: + ExpectNotFound(ctx, env.Client, node) + Expect(testReconciler.HasReconciled).To(BeFalse()) + } + }, + Entry("Finalize", terminationreconcile.PolicyFinalize), + Entry("Continue", terminationreconcile.PolicyContinue), + ) + DescribeTable( + "should respect the reconciler's TerminationGracePeriodPolicy", + func(policy terminationreconcile.Policy) { + testReconciler.TerminationGracePeriodPolicyResult = policy + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + nodeClaim.Annotations[v1.NodeClaimTerminationTimestampAnnotationKey] = fakeClock.Now().Add(-time.Minute).Format(time.RFC3339) + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeFalse()) + switch policy { + case terminationreconcile.PolicyContinue: + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeTrue()) + case terminationreconcile.PolicyFinalize: + ExpectNotFound(ctx, env.Client, node) + Expect(testReconciler.HasReconciled).To(BeFalse()) + } + }, + Entry("Finalize", terminationreconcile.PolicyFinalize), + Entry("Continue", terminationreconcile.PolicyContinue), + ) +}) + +type TestReconciler struct { + NodeClaimNotFoundPolicyResult terminationreconcile.Policy + TerminationGracePeriodPolicyResult terminationreconcile.Policy + HasReconciled bool +} + +func (r *TestReconciler) Reset() { + *r = TestReconciler{ + NodeClaimNotFoundPolicyResult: terminationreconcile.PolicyContinue, + TerminationGracePeriodPolicyResult: terminationreconcile.PolicyContinue, + } +} + +func (r *TestReconciler) Name() string { + return "test.reconciler" +} + +func (r *TestReconciler) AwaitFinalizers() []string { + return 
DependentFinalizers.UnsortedList() +} + +func (r *TestReconciler) Finalizer() string { + return TestFinalizer +} + +func (r *TestReconciler) NodeClaimNotFoundPolicy() terminationreconcile.Policy { + return r.NodeClaimNotFoundPolicyResult +} + +func (r *TestReconciler) TerminationGracePeriodPolicy() terminationreconcile.Policy { + return r.TerminationGracePeriodPolicyResult +} + +func (r *TestReconciler) Reconcile(_ context.Context, _ *corev1.Node, _ *v1.NodeClaim) (reconcile.Result, error) { + r.HasReconciled = true + return reconcile.Result{}, nil +} diff --git a/pkg/controllers/node/termination/suite_test.go b/pkg/controllers/node/termination/suite_test.go deleted file mode 100644 index deb6a5c629..0000000000 --- a/pkg/controllers/node/termination/suite_test.go +++ /dev/null @@ -1,930 +0,0 @@ -/* -Copyright The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package termination_test - -import ( - "context" - "testing" - "time" - - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" - "github.com/samber/lo" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/intstr" - clock "k8s.io/utils/clock/testing" - "sigs.k8s.io/controller-runtime/pkg/client" - - "sigs.k8s.io/karpenter/pkg/apis" - v1 "sigs.k8s.io/karpenter/pkg/apis/v1" - "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" - "sigs.k8s.io/karpenter/pkg/metrics" - "sigs.k8s.io/karpenter/pkg/test" - . "sigs.k8s.io/karpenter/pkg/test/expectations" - "sigs.k8s.io/karpenter/pkg/test/v1alpha1" - . 
"sigs.k8s.io/karpenter/pkg/utils/testing" -) - -var ctx context.Context -var terminationController *termination.Controller -var env *test.Environment -var defaultOwnerRefs = []metav1.OwnerReference{{Kind: "ReplicaSet", APIVersion: "appsv1", Name: "rs", UID: "1234567890"}} -var fakeClock *clock.FakeClock -var cloudProvider *fake.CloudProvider -var recorder *test.EventRecorder -var queue *eviction.Queue - -func TestAPIs(t *testing.T) { - ctx = TestContextWithLogger(t) - RegisterFailHandler(Fail) - RunSpecs(t, "Termination") -} - -var _ = BeforeSuite(func() { - fakeClock = clock.NewFakeClock(time.Now()) - env = test.NewEnvironment( - test.WithCRDs(apis.CRDs...), - test.WithCRDs(v1alpha1.CRDs...), - test.WithFieldIndexers(test.NodeClaimProviderIDFieldIndexer(ctx), test.VolumeAttachmentFieldIndexer(ctx)), - ) - - cloudProvider = fake.NewCloudProvider() - recorder = test.NewEventRecorder() - queue = eviction.NewTestingQueue(env.Client, recorder) - terminationController = termination.NewController(fakeClock, env.Client, cloudProvider, recorder, queue) -}) - -var _ = AfterSuite(func() { - Expect(env.Stop()).To(Succeed(), "Failed to stop environment") -}) - -var _ = Describe("Termination", func() { - var node *corev1.Node - var nodeClaim *v1.NodeClaim - var nodePool *v1.NodePool - - BeforeEach(func() { - fakeClock.SetTime(time.Now()) - cloudProvider.Reset() - *queue = lo.FromPtr(eviction.NewTestingQueue(env.Client, recorder)) - - nodePool = test.NodePool() - nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{ObjectMeta: metav1.ObjectMeta{Finalizers: []string{v1.TerminationFinalizer}}}) - node.Labels[v1.NodePoolLabelKey] = test.NodePool().Name - cloudProvider.CreatedNodeClaims[node.Spec.ProviderID] = nodeClaim - }) - - AfterEach(func() { - ExpectCleanedUp(ctx, env.Client) - - // Reset the metrics collectors - metrics.NodesTerminatedTotal.Reset() - termination.DurationSeconds.Reset() - termination.NodeLifetimeDurationSeconds.Reset() - termination.NodesDrainedTotal.Reset() - }) - - Context("Reconciliation", func() { - It("should delete nodes", func() { - ExpectApplied(ctx, env.Client, node, nodeClaim) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsUnknown()).To(BeTrue()) - - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should ignore nodes not managed by this Karpenter instance", func() { - delete(node.Labels, "karpenter.test.sh/testnodeclass") - node.Labels = lo.Assign(node.Labels, map[string]string{"karpenter.test.sh/unmanagednodeclass": "default"}) - ExpectApplied(ctx, env.Client, node, nodeClaim) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsUnknown()).To(BeTrue()) - - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectExists(ctx, env.Client, node) - - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsUnknown()).To(BeTrue()) - }) - It("should delete nodeclaims associated with nodes", func() { - ExpectApplied(ctx, env.Client, node, nodeClaim) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - - // The first reconciliation should trigger the Delete, and set the terminating status condition - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - nc := ExpectExists(ctx, env.Client, nodeClaim) - Expect(nc.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue()) - ExpectNodeExists(ctx, env.Client, node.Name) - - // The second reconciliation should call get, see the "instance" is terminated, and remove the node. - // We should have deleted the NodeClaim from the node termination controller, so removing it's finalizer should result in it being removed. 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectFinalizersRemoved(ctx, env.Client, nodeClaim) - ExpectNotFound(ctx, env.Client, node, nodeClaim) - }) - It("should exclude nodes from load balancers when terminating", func() { - labels := map[string]string{"foo": "bar"} - pod := test.Pod(test.PodOptions{ - ObjectMeta: metav1.ObjectMeta{ - OwnerReferences: defaultOwnerRefs, - Labels: labels, - }, - }) - // Create a fully blocking PDB to prevent the node from being deleted before we can observe its labels - pdb := test.PodDisruptionBudget(test.PDBOptions{ - Labels: labels, - MaxUnavailable: lo.ToPtr(intstr.FromInt(0)), - }) - - ExpectApplied(ctx, env.Client, node, nodeClaim, pod, pdb) - ExpectManualBinding(ctx, env.Client, pod, node) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - node = ExpectNodeExists(ctx, env.Client, node.Name) - Expect(node.Labels[corev1.LabelNodeExcludeBalancers]).Should(Equal("karpenter")) - }) - It("should not evict pods that tolerate karpenter disruption taint with equal operator", func() { - podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - podSkip := test.Pod(test.PodOptions{ - NodeName: node.Name, - Tolerations: []corev1.Toleration{{Key: v1.DisruptedTaintKey, Operator: corev1.TolerationOpEqual, Effect: v1.DisruptedNoScheduleTaint.Effect, Value: v1.DisruptedNoScheduleTaint.Value}}, - ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, - }) - ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podSkip) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - Expect(queue.Has(podSkip)).To(BeFalse()) - ExpectSingletonReconciled(ctx, queue) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Expect podEvict to be evicting, and delete it - EventuallyExpectTerminating(ctx, env.Client, podEvict) - ExpectDeleted(ctx, env.Client, podEvict) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should not evict pods that tolerate karpenter disruption taint with exists operator", func() { - podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - podSkip := test.Pod(test.PodOptions{ - NodeName: node.Name, - Tolerations: []corev1.Toleration{{Key: v1.DisruptedTaintKey, Operator: corev1.TolerationOpExists, Effect: v1.DisruptedNoScheduleTaint.Effect}}, - ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, - }) - ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podSkip) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - Expect(queue.Has(podSkip)).To(BeFalse()) - ExpectSingletonReconciled(ctx, queue) - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Expect podEvict to be evicting, and delete it - EventuallyExpectTerminating(ctx, env.Client, podEvict) - ExpectDeleted(ctx, env.Client, podEvict) - - Expect(queue.Has(podSkip)).To(BeFalse()) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should evict pods that tolerate the node.kubernetes.io/unschedulable taint", func() { - podEvict := test.Pod(test.PodOptions{ - NodeName: node.Name, - Tolerations: []corev1.Toleration{{Key: corev1.TaintNodeUnschedulable, Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule}}, - ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, - }) - ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Expect podEvict to be evicting, and delete it - EventuallyExpectTerminating(ctx, env.Client, podEvict) - ExpectDeleted(ctx, env.Client, podEvict) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should delete nodes that have pods without an ownerRef", func() { - pod := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - OwnerReferences: nil, - }, - }) - - ExpectApplied(ctx, env.Client, node, nodeClaim, pod) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - - // Expect pod with no owner ref to be enqueued for eviction - EventuallyExpectTerminating(ctx, env.Client, pod) - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Delete no owner refs pod to simulate successful eviction - ExpectDeleted(ctx, env.Client, pod) - - // Reconcile node to evict pod and delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should delete nodes with terminal pods", func() { - podEvictPhaseSucceeded := test.Pod(test.PodOptions{ - NodeName: node.Name, - Phase: corev1.PodSucceeded, - }) - podEvictPhaseFailed := test.Pod(test.PodOptions{ - NodeName: node.Name, - Phase: corev1.PodFailed, - }) - - ExpectApplied(ctx, env.Client, node, nodeClaim, podEvictPhaseSucceeded, podEvictPhaseFailed) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Trigger Termination Controller, which should ignore these pods and delete the node - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should fail to evict pods that violate a PDB", func() { - minAvailable := intstr.FromInt32(1) - labelSelector := map[string]string{test.RandomName(): test.RandomName()} - pdb := test.PodDisruptionBudget(test.PDBOptions{ - Labels: labelSelector, - // Don't let any pod evict - MinAvailable: &minAvailable, - }) - podNoEvict := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - Labels: labelSelector, - OwnerReferences: defaultOwnerRefs, - }, - Phase: corev1.PodRunning, - }) - - ExpectApplied(ctx, env.Client, node, nodeClaim, podNoEvict, pdb) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Expect podNoEvict to be added to the queue - Expect(queue.Has(podNoEvict)).To(BeTrue()) - - // Attempt to evict the pod, but fail to do so - ExpectSingletonReconciled(ctx, queue) - - // Expect podNoEvict to fail eviction due to PDB, and be retried - Expect(queue.Has(podNoEvict)).To(BeTrue()) - - // Delete pod to simulate successful eviction - ExpectDeleted(ctx, env.Client, podNoEvict) - ExpectNotFound(ctx, env.Client, podNoEvict) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should evict pods in order and wait until pods are fully deleted", func() { - daemonEvict := test.DaemonSet() - daemonNodeCritical := test.DaemonSet(test.DaemonSetOptions{PodOptions: test.PodOptions{PriorityClassName: "system-node-critical"}}) - daemonClusterCritical := test.DaemonSet(test.DaemonSetOptions{PodOptions: test.PodOptions{PriorityClassName: "system-cluster-critical"}}) - ExpectApplied(ctx, env.Client, daemonEvict, daemonNodeCritical, daemonClusterCritical) - - podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - podDaemonEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: []metav1.OwnerReference{{ - APIVersion: "apps/v1", - Kind: "DaemonSet", - Name: daemonEvict.Name, - UID: daemonEvict.UID, - Controller: lo.ToPtr(true), - BlockOwnerDeletion: lo.ToPtr(true), - }}}}) - podNodeCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-node-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - podClusterCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-cluster-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - podDaemonNodeCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-node-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: []metav1.OwnerReference{{ - APIVersion: "apps/v1", - Kind: "DaemonSet", - Name: daemonNodeCritical.Name, - UID: daemonNodeCritical.UID, - Controller: lo.ToPtr(true), - 
BlockOwnerDeletion: lo.ToPtr(true), - }}}}) - podDaemonClusterCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-cluster-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: []metav1.OwnerReference{{ - APIVersion: "apps/v1", - Kind: "DaemonSet", - Name: daemonClusterCritical.Name, - UID: daemonClusterCritical.UID, - Controller: lo.ToPtr(true), - BlockOwnerDeletion: lo.ToPtr(true), - }}}}) - - ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podNodeCritical, podClusterCritical, podDaemonEvict, podDaemonNodeCritical, podDaemonClusterCritical) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - - podGroups := [][]*corev1.Pod{{podEvict}, {podDaemonEvict}, {podNodeCritical, podClusterCritical}, {podDaemonNodeCritical, podDaemonClusterCritical}} - for i, podGroup := range podGroups { - node = ExpectNodeExists(ctx, env.Client, node.Name) - for _, p := range podGroup { - ExpectPodExists(ctx, env.Client, p.Name, p.Namespace) - } - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - for range podGroup { - ExpectSingletonReconciled(ctx, queue) - } - // Start draining the pod group, but don't complete it yet - EventuallyExpectTerminating(ctx, env.Client, lo.Map(podGroup, func(p *corev1.Pod, _ int) client.Object { return p })...) - - // Look at the next pod group and ensure that none of the pods have started terminating on it - if i != len(podGroups)-1 { - for range podGroups[i+1] { - ExpectSingletonReconciled(ctx, queue) - } - ConsistentlyExpectNotTerminating(ctx, env.Client, lo.Map(podGroups[i+1], func(p *corev1.Pod, _ int) client.Object { return p })...) - } - // Expect that the pods are deleted -- which should unblock the next pod group - ExpectDeleted(ctx, env.Client, lo.Map(podGroup, func(p *corev1.Pod, _ int) client.Object { return p })...) - } - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should evict non-critical pods first", func() { - podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - podNodeCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-node-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - podClusterCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-cluster-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - - ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podNodeCritical, podClusterCritical) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Expect podEvict to be evicting, and delete it - EventuallyExpectTerminating(ctx, env.Client, podEvict) - ExpectDeleted(ctx, env.Client, podEvict) - - // Expect the critical pods to be evicted and deleted - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - ExpectSingletonReconciled(ctx, queue) - - EventuallyExpectTerminating(ctx, env.Client, podNodeCritical, podClusterCritical) - ExpectDeleted(ctx, env.Client, podNodeCritical) - ExpectDeleted(ctx, env.Client, podClusterCritical) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should not evict static pods", func() { - podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict) - - podNoEvict := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - OwnerReferences: []metav1.OwnerReference{{ - APIVersion: "v1", - Kind: "Node", - Name: node.Name, - UID: node.UID, - }}, - }, - }) - ExpectApplied(ctx, env.Client, podNoEvict) - - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - - // Expect mirror pod to not be queued for eviction - Expect(queue.Has(podNoEvict)).To(BeFalse()) - - // Expect podEvict to be enqueued for eviction then be successful - EventuallyExpectTerminating(ctx, env.Client, podEvict) - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Reconcile node to evict pod - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - - // Delete pod to simulate successful eviction - ExpectDeleted(ctx, env.Client, podEvict) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should not delete nodes until all pods are deleted", func() { - pods := test.Pods(2, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - ExpectApplied(ctx, env.Client, node, nodeClaim, pods[0], pods[1]) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - ExpectSingletonReconciled(ctx, queue) - - // Expect the pods to be evicted - EventuallyExpectTerminating(ctx, env.Client, pods[0], pods[1]) - - // Expect node to exist and be draining, but not deleted - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - ExpectDeleted(ctx, env.Client, pods[1]) - - // Expect node to exist and be draining, but not deleted - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - ExpectDeleted(ctx, env.Client, pods[0]) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should delete nodes with no underlying instance even if not fully drained", func() { - pods := test.Pods(2, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - ExpectApplied(ctx, env.Client, node, nodeClaim, pods[0], pods[1]) - - // Make Node NotReady since it's automatically marked as Ready on first deploy - ExpectMakeNodesNotReady(ctx, env.Client, node) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - ExpectSingletonReconciled(ctx, queue) - - // Expect the pods to be evicted - EventuallyExpectTerminating(ctx, env.Client, pods[0], pods[1]) - - // Expect node to exist and be draining, but not deleted - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // After this, the node still has one pod that is evicting. - ExpectDeleted(ctx, env.Client, pods[1]) - - // Remove the node from created nodeclaims so that the cloud provider returns DNE - cloudProvider.CreatedNodeClaims = map[string]*v1.NodeClaim{} - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should not delete nodes with no underlying instance if the node is still Ready", func() { - pods := test.Pods(2, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - ExpectApplied(ctx, env.Client, node, nodeClaim, pods[0], pods[1]) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - ExpectSingletonReconciled(ctx, queue) - - // Expect the pods to be evicted - EventuallyExpectTerminating(ctx, env.Client, pods[0], pods[1]) - - // Expect node to exist and be draining, but not deleted - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // After this, the node still has one pod that is evicting. - ExpectDeleted(ctx, env.Client, pods[1]) - - // Remove the node from created nodeclaims so that the cloud provider returns DNE - cloudProvider.CreatedNodeClaims = map[string]*v1.NodeClaim{} - - // Reconcile to try to delete the node, but don't succeed because the readiness condition - // of the node still won't let us delete it - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectExists(ctx, env.Client, node) - }) - It("should wait for pods to terminate", func() { - pod := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - fakeClock.SetTime(time.Now()) // make our fake clock match the pod creation time - ExpectApplied(ctx, env.Client, node, nodeClaim, pod) - - // Before grace period, node should not delete - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - ExpectNodeExists(ctx, env.Client, node.Name) - EventuallyExpectTerminating(ctx, env.Client, pod) - - // After grace period, node should delete. The deletion timestamps are from etcd which we can't control, so - // to eliminate test-flakiness we reset the time to current time + 90 seconds instead of just advancing - // the clock by 90 seconds. - fakeClock.SetTime(time.Now().Add(90 * time.Second)) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should not evict a new pod with the same name using the old pod's eviction queue key", func() { - pod := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - OwnerReferences: defaultOwnerRefs, - }, - }) - ExpectApplied(ctx, env.Client, node, nodeClaim, pod) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - - // Don't trigger a call into the queue to make sure that we effectively aren't triggering eviction - // We'll use this to try to leave pods in the queue - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Delete the pod directly to act like something else is doing the pod termination - ExpectDeleted(ctx, env.Client, pod) - - // Requeue the termination controller to completely delete the node - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - - // Expect that the old pod's key still exists in the queue - Expect(queue.Has(pod)).To(BeTrue()) - - // Re-create the pod and node, it should now have the same name, but a different UUID - node = test.Node(test.NodeOptions{ - ObjectMeta: metav1.ObjectMeta{ - Finalizers: []string{v1.TerminationFinalizer}, - }, - }) - pod = test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - OwnerReferences: defaultOwnerRefs, - }, - }) - ExpectApplied(ctx, env.Client, node, pod) - - // Trigger eviction queue with the pod key still in it - ExpectSingletonReconciled(ctx, queue) - - Consistently(func(g Gomega) { - g.Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(pod), pod)).To(Succeed()) - g.Expect(pod.DeletionTimestamp.IsZero()).To(BeTrue()) - }, ReconcilerPropagationTime, RequestInterval).Should(Succeed()) - }) - It("should preemptively delete pods to satisfy their terminationGracePeriodSeconds", func() { - nodeClaim.Spec.TerminationGracePeriod = &metav1.Duration{Duration: time.Second * 300} - 
nodeClaim.Annotations = map[string]string{ - v1.NodeClaimTerminationTimestampAnnotationKey: time.Now().Add(nodeClaim.Spec.TerminationGracePeriod.Duration).Format(time.RFC3339), - } - pod := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - v1.DoNotDisruptAnnotationKey: "true", - }, - OwnerReferences: defaultOwnerRefs, - }, - TerminationGracePeriodSeconds: lo.ToPtr(int64(300)), - }) - ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool, pod) - - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - ExpectNodeExists(ctx, env.Client, node.Name) - ExpectDeleted(ctx, env.Client, pod) - }) - It("should only delete pods when their terminationGracePeriodSeconds is less than the the node's remaining terminationGracePeriod", func() { - nodeClaim.Spec.TerminationGracePeriod = &metav1.Duration{Duration: time.Second * 300} - nodeClaim.Annotations = map[string]string{ - v1.NodeClaimTerminationTimestampAnnotationKey: time.Now().Add(nodeClaim.Spec.TerminationGracePeriod.Duration).Format(time.RFC3339), - } - pod := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - v1.DoNotDisruptAnnotationKey: "true", - }, - OwnerReferences: defaultOwnerRefs, - }, - TerminationGracePeriodSeconds: lo.ToPtr(int64(60)), - }) - fakeClock.SetTime(time.Now()) - ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool, pod) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - - // expect pod still exists - fakeClock.Step(90 * time.Second) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - ExpectNodeExists(ctx, env.Client, node.Name) - ExpectPodExists(ctx, env.Client, pod.Name, pod.Namespace) - - // expect pod is now deleted - fakeClock.Step(175 * time.Second) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - ExpectDeleted(ctx, env.Client, pod) - }) - Context("VolumeAttachments", func() { - It("should wait for volume attachments", func() { - va := test.VolumeAttachment(test.VolumeAttachmentOptions{ - NodeName: node.Name, - VolumeName: "foo", - }) - ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool, va) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectExists(ctx, env.Client, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue()) - - ExpectDeleted(ctx, env.Client, va) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should only wait for volume attachments associated with drainable pods", func() { - vaDrainable := test.VolumeAttachment(test.VolumeAttachmentOptions{ - NodeName: node.Name, - VolumeName: "foo", - }) - vaNonDrainable := 
test.VolumeAttachment(test.VolumeAttachmentOptions{ - NodeName: node.Name, - VolumeName: "bar", - }) - pvc := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{ - VolumeName: "bar", - }) - pod := test.Pod(test.PodOptions{ - ObjectMeta: metav1.ObjectMeta{ - OwnerReferences: defaultOwnerRefs, - }, - Tolerations: []corev1.Toleration{{ - Key: v1.DisruptedTaintKey, - Operator: corev1.TolerationOpExists, - }}, - PersistentVolumeClaims: []string{pvc.Name}, - }) - ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool, vaDrainable, vaNonDrainable, pod, pvc) - ExpectManualBinding(ctx, env.Client, pod, node) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectExists(ctx, env.Client, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue()) - - ExpectDeleted(ctx, env.Client, vaDrainable) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should wait for volume attachments until the nodeclaim's termination grace period expires", func() { - va := test.VolumeAttachment(test.VolumeAttachmentOptions{ - NodeName: node.Name, - VolumeName: "foo", - }) - nodeClaim.Annotations = map[string]string{ - v1.NodeClaimTerminationTimestampAnnotationKey: fakeClock.Now().Add(time.Minute).Format(time.RFC3339), - } - ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool, va) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectExists(ctx, env.Client, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue()) - - fakeClock.Step(5 * time.Minute) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - // Since TGP expired before the VolumeAttachments were deleted, this status condition should have never transitioned - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - }) - }) - Context("Metrics", func() { - It("should fire the terminationSummary metric when deleting nodes", func() { - ExpectApplied(ctx, env.Client, node, nodeClaim) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - - m, ok := FindMetricWithLabelValues("karpenter_nodes_termination_duration_seconds", map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]}) - Expect(ok).To(BeTrue()) - Expect(m.GetSummary().GetSampleCount()).To(BeNumerically("==", 1)) - }) - It("should fire the nodesTerminated counter metric when deleting nodes", func() { - ExpectApplied(ctx, env.Client, node, nodeClaim) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectMetricCounterValue(termination.NodesDrainedTotal, 1, map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]}) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - - m, ok := FindMetricWithLabelValues("karpenter_nodes_terminated_total", map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]}) - Expect(ok).To(BeTrue()) - Expect(lo.FromPtr(m.GetCounter().Value)).To(BeNumerically("==", 1)) - }) - It("should fire the lifetime duration histogram metric when deleting nodes", func() { - ExpectApplied(ctx, env.Client, node, nodeClaim) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - - m, ok := FindMetricWithLabelValues("karpenter_nodes_lifetime_duration_seconds", map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]}) - Expect(ok).To(BeTrue()) - Expect(lo.FromPtr(m.GetHistogram().SampleCount)).To(BeNumerically("==", 1)) - }) - It("should update the eviction queueDepth metric when reconciling pods", func() { - minAvailable := intstr.FromInt32(0) - labelSelector := map[string]string{test.RandomName(): test.RandomName()} - pdb := test.PodDisruptionBudget(test.PDBOptions{ - Labels: labelSelector, - // Don't let any pod evict - MinAvailable: &minAvailable, - }) - ExpectApplied(ctx, env.Client, pdb, node, nodeClaim) - pods := test.Pods(5, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{ - OwnerReferences: defaultOwnerRefs, - Labels: labelSelector, - }}) - ExpectApplied(ctx, env.Client, lo.Map(pods, func(p *corev1.Pod, _ int) client.Object { return p })...) 
- - wqDepthBefore, _ := FindMetricWithLabelValues("workqueue_adds_total", map[string]string{"name": "eviction.workqueue"}) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - wqDepthAfter, ok := FindMetricWithLabelValues("workqueue_adds_total", map[string]string{"name": "eviction.workqueue"}) - Expect(ok).To(BeTrue()) - Expect(lo.FromPtr(wqDepthAfter.GetCounter().Value) - lo.FromPtr(wqDepthBefore.GetCounter().Value)).To(BeNumerically("==", 5)) - }) - }) -}) - -func ExpectNodeWithNodeClaimDraining(c client.Client, nodeName string) *corev1.Node { - GinkgoHelper() - node := ExpectNodeExists(ctx, c, nodeName) - Expect(node.Spec.Taints).To(ContainElement(v1.DisruptedNoScheduleTaint)) - Expect(lo.Contains(node.Finalizers, v1.TerminationFinalizer)).To(BeTrue()) - Expect(node.DeletionTimestamp).ToNot(BeNil()) - return node -} diff --git a/pkg/controllers/node/termination/volumedetachment.go b/pkg/controllers/node/termination/volumedetachment/controller.go similarity index 50% rename from pkg/controllers/node/termination/volumedetachment.go rename to pkg/controllers/node/termination/volumedetachment/controller.go index 9b445ee02a..87754767e8 100644 --- a/pkg/controllers/node/termination/volumedetachment.go +++ b/pkg/controllers/node/termination/volumedetachment/controller.go @@ -14,61 +14,102 @@ See the License for the specific language governing permissions and limitations under the License. */ -package termination +package volumedetachment import ( "context" "time" "github.com/samber/lo" + "golang.org/x/time/rate" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/client-go/util/workqueue" "k8s.io/utils/clock" + controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/cloudprovider" + terminationreconcile "sigs.k8s.io/karpenter/pkg/controllers/node/termination/reconcile" "sigs.k8s.io/karpenter/pkg/events" storagev1 "k8s.io/api/storage/v1" nodeutils "sigs.k8s.io/karpenter/pkg/utils/node" - "sigs.k8s.io/karpenter/pkg/utils/nodeclaim" podutils "sigs.k8s.io/karpenter/pkg/utils/pod" volumeutil "sigs.k8s.io/karpenter/pkg/utils/volume" ) -type VolumeDetachmentReconciler struct { - clk clock.Clock - kubeClient client.Client - recorder events.Recorder +type Controller struct { + clk clock.Clock + kubeClient client.Client + cloudProvider cloudprovider.CloudProvider + recorder events.Recorder } -// nolint:gocyclo -func (v *VolumeDetachmentReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { - if !nc.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue() { - return reconcile.Result{}, nil - } - if nc.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue() { - return reconcile.Result{}, nil - } - if elapsed, err := nodeclaim.HasTerminationGracePeriodElapsed(v.clk, nc); err != nil { - log.FromContext(ctx).Error(err, "failed to terminate node") - return reconcile.Result{}, nil - } else if elapsed { - return reconcile.Result{}, nil +func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider 
cloudprovider.CloudProvider, recorder events.Recorder) *Controller { + return &Controller{ + clk: clk, + kubeClient: kubeClient, + cloudProvider: cloudProvider, + recorder: recorder, } +} + +func (*Controller) Name() string { + return "node.volumedetachment" +} + +func (c *Controller) Register(_ context.Context, m manager.Manager) error { + return controllerruntime.NewControllerManagedBy(m). + Named(c.Name()). + For(&corev1.Node{}, builder.WithPredicates(nodeutils.IsManagedPredicateFuncs(c.cloudProvider))). + Watches(&v1.NodeClaim{}, nodeutils.NodeClaimEventHandler(c.kubeClient, c.cloudProvider)). + WithOptions( + controller.Options{ + RateLimiter: workqueue.NewTypedMaxOfRateLimiter[reconcile.Request]( + workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](100*time.Millisecond, 10*time.Second), + // 10 qps, 100 bucket size + &workqueue.TypedBucketRateLimiter[reconcile.Request]{Limiter: rate.NewLimiter(rate.Limit(10), 100)}, + ), + MaxConcurrentReconciles: 100, + }, + ). + Complete(terminationreconcile.AsReconciler(m.GetClient(), c.cloudProvider, c.clk, c)) +} + +func (*Controller) AwaitFinalizers() []string { + return nil +} - blockingVAs, err := v.blockingVolumeAttachments(ctx, n) +func (*Controller) Finalizer() string { + return v1.VolumeFinalizer +} + +func (*Controller) TerminationGracePeriodPolicy() terminationreconcile.Policy { + return terminationreconcile.PolicyFinalize +} + +func (*Controller) NodeClaimNotFoundPolicy() terminationreconcile.Policy { + return terminationreconcile.PolicyFinalize +} + +// nolint:gocyclo +func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { + blockingVAs, err := c.blockingVolumeAttachments(ctx, n) if err != nil { return reconcile.Result{}, err } if len(blockingVAs) != 0 { stored := nc.DeepCopy() _ = nc.StatusConditions().SetFalse(v1.ConditionTypeVolumesDetached, "AwaitingVolumeDetachment", "AwaitingVolumeDetachment") - if err := v.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { + if err := c.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { if errors.IsNotFound(err) { return reconcile.Result{}, nil } @@ -77,13 +118,13 @@ func (v *VolumeDetachmentReconciler) Reconcile(ctx context.Context, n *corev1.No } return reconcile.Result{}, err } - v.recorder.Publish(NodeAwaitingVolumeDetachmentEvent(n, blockingVAs...)) + c.recorder.Publish(NodeAwaitingVolumeDetachmentEvent(n, blockingVAs...)) return reconcile.Result{RequeueAfter: 5 * time.Second}, nil } stored := nc.DeepCopy() _ = nc.StatusConditions().SetTrue(v1.ConditionTypeVolumesDetached) - if err := v.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { + if err := c.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { if errors.IsNotFound(err) { return reconcile.Result{}, nil } @@ -92,11 +133,14 @@ func (v *VolumeDetachmentReconciler) Reconcile(ctx context.Context, n *corev1.No } return reconcile.Result{}, err } + if err := terminationreconcile.RemoveFinalizer(ctx, c.kubeClient, n, c.Finalizer()); err != nil { + return reconcile.Result{}, err + } return reconcile.Result{}, nil } -func (v *VolumeDetachmentReconciler) blockingVolumeAttachments(ctx context.Context, n *corev1.Node) ([]*storagev1.VolumeAttachment, error) { - vas, err := 
nodeutils.GetVolumeAttachments(ctx, v.kubeClient, n) +func (c *Controller) blockingVolumeAttachments(ctx context.Context, n *corev1.Node) ([]*storagev1.VolumeAttachment, error) { + vas, err := nodeutils.GetVolumeAttachments(ctx, c.kubeClient, n) if err != nil { return nil, err } @@ -104,12 +148,12 @@ func (v *VolumeDetachmentReconciler) blockingVolumeAttachments(ctx context.Conte return nil, nil } - pods, err := nodeutils.GetPods(ctx, v.kubeClient, n) + pods, err := nodeutils.GetPods(ctx, c.kubeClient, n) if err != nil { return nil, err } pods = lo.Reject(pods, func(p *corev1.Pod, _ int) bool { - return podutils.IsDrainable(p, v.clk) + return podutils.IsDrainable(p, c.clk) }) // Determine the VolumeAttachments associated with non-drainable pods. We consider these non-blocking since they @@ -117,7 +161,7 @@ func (v *VolumeDetachmentReconciler) blockingVolumeAttachments(ctx context.Conte nonBlockingVolumes := sets.New[string]() for _, p := range pods { for _, vol := range p.Spec.Volumes { - pvc, err := volumeutil.GetPersistentVolumeClaim(ctx, v.kubeClient, p, vol) + pvc, err := volumeutil.GetPersistentVolumeClaim(ctx, c.kubeClient, p, vol) if errors.IsNotFound(err) { continue } diff --git a/pkg/controllers/node/termination/volumedetachment/events.go b/pkg/controllers/node/termination/volumedetachment/events.go new file mode 100644 index 0000000000..6683622c7c --- /dev/null +++ b/pkg/controllers/node/termination/volumedetachment/events.go @@ -0,0 +1,36 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package volumedetachment + +import ( + "fmt" + + corev1 "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" + + "sigs.k8s.io/karpenter/pkg/events" +) + +func NodeAwaitingVolumeDetachmentEvent(node *corev1.Node, volumeAttachments ...*storagev1.VolumeAttachment) events.Event { + return events.Event{ + InvolvedObject: node, + Type: corev1.EventTypeNormal, + Reason: "AwaitingVolumeDetachment", + Message: fmt.Sprintf("Awaiting deletion of %d VolumeAttachments bound to node", len(volumeAttachments)), + DedupeValues: []string{node.Name}, + } +} diff --git a/pkg/controllers/node/termination/volumedetachment/suite_test.go b/pkg/controllers/node/termination/volumedetachment/suite_test.go new file mode 100644 index 0000000000..33e593ec80 --- /dev/null +++ b/pkg/controllers/node/termination/volumedetachment/suite_test.go @@ -0,0 +1,185 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package volumedetachment_test + +import ( + "context" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + clock "k8s.io/utils/clock/testing" + + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "sigs.k8s.io/karpenter/pkg/apis" + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" + terminationreconcile "sigs.k8s.io/karpenter/pkg/controllers/node/termination/reconcile" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/volumedetachment" + "sigs.k8s.io/karpenter/pkg/test" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + . "sigs.k8s.io/karpenter/pkg/test/expectations" + "sigs.k8s.io/karpenter/pkg/test/v1alpha1" + . "sigs.k8s.io/karpenter/pkg/utils/testing" +) + +var ctx context.Context +var reconciler reconcile.Reconciler +var env *test.Environment +var fakeClock *clock.FakeClock +var cloudProvider *fake.CloudProvider +var recorder *test.EventRecorder + +func TestAPIs(t *testing.T) { + ctx = TestContextWithLogger(t) + RegisterFailHandler(Fail) + RunSpecs(t, "VolumeDetachment") +} + +var _ = BeforeSuite(func() { + fakeClock = clock.NewFakeClock(time.Now()) + env = test.NewEnvironment( + test.WithCRDs(apis.CRDs...), + test.WithCRDs(v1alpha1.CRDs...), + test.WithFieldIndexers(test.NodeClaimProviderIDFieldIndexer(ctx), test.VolumeAttachmentFieldIndexer(ctx)), + ) + cloudProvider = fake.NewCloudProvider() + recorder = test.NewEventRecorder() + reconciler = terminationreconcile.AsReconciler(env.Client, cloudProvider, fakeClock, volumedetachment.NewController(fakeClock, env.Client, cloudProvider, recorder)) +}) + +var _ = AfterSuite(func() { + Expect(env.Stop()).To(Succeed(), "Failed to stop environment") +}) + +var _ = Describe("VolumeDetachment", func() { + var node *corev1.Node + var nodeClaim *v1.NodeClaim + + BeforeEach(func() { + fakeClock.SetTime(time.Now()) + cloudProvider.Reset() + + nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{ObjectMeta: metav1.ObjectMeta{Finalizers: []string{ + v1.DrainFinalizer, + v1.VolumeFinalizer, + v1.TerminationFinalizer, + }}}) + cloudProvider.CreatedNodeClaims[node.Spec.ProviderID] = nodeClaim + }) + It("should wait for volume attachments", func() { + va := test.VolumeAttachment(test.VolumeAttachmentOptions{ + NodeName: node.Name, + VolumeName: "foo", + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, va) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(v1.VolumeFinalizer)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue()) + + ExpectDeleted(ctx, env.Client, va) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.VolumeFinalizer)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue()).To(BeTrue()) + }) + It("should only wait for volume attachments associated with drainable pods", func() { + vaDrainable := test.VolumeAttachment(test.VolumeAttachmentOptions{ + 
NodeName:   node.Name,
+			VolumeName: "foo",
+		})
+		vaNonDrainable := test.VolumeAttachment(test.VolumeAttachmentOptions{
+			NodeName:   node.Name,
+			VolumeName: "bar",
+		})
+		pvc := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{
+			VolumeName: "bar",
+		})
+		pod := test.Pod(test.PodOptions{
+			Tolerations: []corev1.Toleration{{
+				Key:      v1.DisruptedTaintKey,
+				Operator: corev1.TolerationOpExists,
+			}},
+			PersistentVolumeClaims: []string{pvc.Name},
+		})
+		ExpectApplied(ctx, env.Client, node, nodeClaim, vaDrainable, vaNonDrainable, pod, pvc)
+		ExpectManualBinding(ctx, env.Client, pod, node)
+		Expect(env.Client.Delete(ctx, node)).To(Succeed())
+
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+		Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue())
+
+		ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node))
+		node = ExpectExists(ctx, env.Client, node)
+		Expect(node.Finalizers).To(ContainElement(v1.VolumeFinalizer))
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+		Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue())
+
+		ExpectDeleted(ctx, env.Client, vaDrainable)
+		ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node))
+		node = ExpectExists(ctx, env.Client, node)
+		Expect(node.Finalizers).ToNot(ContainElement(v1.VolumeFinalizer))
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+		Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue()).To(BeTrue())
+	})
+	It("should remove volume protection finalizer once TGP has elapsed", func() {
+		va := test.VolumeAttachment(test.VolumeAttachmentOptions{
+			NodeName:   node.Name,
+			VolumeName: "foo",
+		})
+		nodeClaim.Annotations = map[string]string{
+			v1.NodeClaimTerminationTimestampAnnotationKey: fakeClock.Now().Add(time.Minute).Format(time.RFC3339),
+		}
+		ExpectApplied(ctx, env.Client, node, nodeClaim, va)
+		Expect(env.Client.Delete(ctx, node)).To(Succeed())
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+		Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue())
+
+		ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node))
+		node = ExpectExists(ctx, env.Client, node)
+		Expect(node.Finalizers).To(ContainElement(v1.VolumeFinalizer))
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+		Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue())
+
+		fakeClock.Step(5 * time.Minute)
+		ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node))
+
+		// Since TGP expired, the protection finalizer should have been removed. However, the status condition should
+		// remain false since volumes were never detached.
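+		// (The grace period expiry is determined by the NodeClaimTerminationTimestampAnnotationKey value set on the
+		// NodeClaim at the start of this test; stepping the fake clock past it is what allows the finalizer removal asserted below.)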
+ node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.VolumeFinalizer)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue()) + }) +}) diff --git a/pkg/controllers/nodeclaim/hydration/suite_test.go b/pkg/controllers/nodeclaim/hydration/suite_test.go index 86af3235bd..f66422a764 100644 --- a/pkg/controllers/nodeclaim/hydration/suite_test.go +++ b/pkg/controllers/nodeclaim/hydration/suite_test.go @@ -25,6 +25,7 @@ import ( "github.com/samber/lo" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/karpenter/pkg/apis" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" @@ -66,42 +67,40 @@ var _ = AfterEach(func() { }) var _ = Describe("Hydration", func() { - It("should hydrate the NodeClass label", func() { - nc, _ := test.NodeClaimAndNode() - delete(nc.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) - ExpectApplied(ctx, env.Client, nc) - ExpectObjectReconciled(ctx, env.Client, hydrationController, nc) - nc = ExpectExists(ctx, env.Client, nc) - Expect(nc.Labels[v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())]).To(Equal(nc.Spec.NodeClassRef.Name)) - }) - It("shouldn't hydrate nodes which have already been hydrated", func() { - nc, _ := test.NodeClaimAndNode(v1.NodeClaim{ + var nodeClaim *v1.NodeClaim + BeforeEach(func() { + nodeClaim = test.NodeClaim(v1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - v1.HydrationAnnotationKey: operator.Version, - }, + Annotations: map[string]string{v1.HydrationAnnotationKey: "not-hydrated"}, }, }) - delete(nc.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) - ExpectApplied(ctx, env.Client, nc) - ExpectObjectReconciled(ctx, env.Client, hydrationController, nc) - nc = ExpectExists(ctx, env.Client, nc) - Expect(lo.Keys(nc.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()))) + }) + + It("should hydrate the NodeClass label", func() { + delete(nodeClaim.Labels, v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nodeClaim) + ExpectObjectReconciled(ctx, env.Client, hydrationController, nodeClaim) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.Labels[v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())]).To(Equal(nodeClaim.Spec.NodeClassRef.Name)) + }) + It("shouldn't hydrate nodes which have already been hydrated", func() { + nodeClaim.Annotations[v1.HydrationAnnotationKey] = operator.Version + delete(nodeClaim.Labels, v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nodeClaim) + ExpectObjectReconciled(ctx, env.Client, hydrationController, nodeClaim) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(lo.Keys(nodeClaim.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind()))) }) It("shouldn't hydrate NodeClaims which aren't managed by this instance of Karpenter", func() { - nc, _ := test.NodeClaimAndNode(v1.NodeClaim{ - Spec: v1.NodeClaimSpec{ - NodeClassRef: &v1.NodeClassReference{ - Group: "karpenter.test.sh", - Kind: "UnmanagedNodeClass", - Name: "default", - }, - }, - }) - delete(nc.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) - ExpectApplied(ctx, env.Client, nc) - ExpectObjectReconciled(ctx, env.Client, hydrationController, nc) - nc = ExpectExists(ctx, env.Client, nc) - 
Expect(lo.Keys(nc.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()))) + nodeClaim.Spec.NodeClassRef = &v1.NodeClassReference{ + Group: "karpenter.test.sh", + Kind: "UnmanagedNodeClass", + Name: "default", + } + delete(nodeClaim.Labels, v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nodeClaim) + ExpectObjectReconciled(ctx, env.Client, hydrationController, nodeClaim) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(lo.Keys(nodeClaim.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind()))) }) }) diff --git a/pkg/controllers/nodeclaim/lifecycle/termination_test.go b/pkg/controllers/nodeclaim/lifecycle/termination_test.go index 6d2d4d9238..4d1b450937 100644 --- a/pkg/controllers/nodeclaim/lifecycle/termination_test.go +++ b/pkg/controllers/nodeclaim/lifecycle/termination_test.go @@ -300,7 +300,7 @@ var _ = Describe("Termination", func() { ExpectExists(ctx, env.Client, node) nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.ObjectMeta.Annotations).To(BeNil()) + Expect(lo.Keys(nodeClaim.ObjectMeta.Annotations)).ToNot(ContainElement(v1.NodeClaimTerminationTimestampAnnotationKey)) }) It("should annotate the node if the NodeClaim has a terminationGracePeriod", func() { nodeClaim.Spec.TerminationGracePeriod = &metav1.Duration{Duration: time.Second * 300} diff --git a/pkg/events/suite_test.go b/pkg/events/suite_test.go index 2c53df92ef..f9338bb70d 100644 --- a/pkg/events/suite_test.go +++ b/pkg/events/suite_test.go @@ -30,7 +30,7 @@ import ( "k8s.io/client-go/util/flowcontrol" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/drain" "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" "sigs.k8s.io/karpenter/pkg/controllers/provisioning/scheduling" "sigs.k8s.io/karpenter/pkg/events" @@ -97,8 +97,8 @@ var _ = Describe("Event Creation", func() { Expect(internalRecorder.Calls(scheduling.PodFailedToScheduleEvent(PodWithUID(), fmt.Errorf("")).Reason)).To(Equal(1)) }) It("should create a NodeFailedToDrain event", func() { - eventRecorder.Publish(termination.NodeDrainFailedEvent(NodeWithUID(), fmt.Errorf(""))) - Expect(internalRecorder.Calls(termination.NodeDrainFailedEvent(NodeWithUID(), fmt.Errorf("")).Reason)).To(Equal(1)) + eventRecorder.Publish(drain.NodeDrainFailedEvent(NodeWithUID(), fmt.Errorf(""))) + Expect(internalRecorder.Calls(drain.NodeDrainFailedEvent(NodeWithUID(), fmt.Errorf("")).Reason)).To(Equal(1)) }) }) diff --git a/pkg/test/nodeclaim.go b/pkg/test/nodeclaim.go index 467297b66f..e7bee59e43 100644 --- a/pkg/test/nodeclaim.go +++ b/pkg/test/nodeclaim.go @@ -25,6 +25,7 @@ import ( corev1 "k8s.io/api/core/v1" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/operator" ) // NodeClaim creates a test NodeClaim with defaults that can be overridden by overrides. 
@@ -49,6 +50,9 @@ func NodeClaim(overrides ...v1.NodeClaim) *v1.NodeClaim { Name: "default", } } + override.Annotations = lo.Assign(map[string]string{ + v1.HydrationAnnotationKey: operator.Version, + }, override.Annotations) override.Labels = lo.Assign(map[string]string{ v1.NodeClassLabelKey(override.Spec.NodeClassRef.GroupKind()): override.Spec.NodeClassRef.Name, }, override.Labels) diff --git a/pkg/test/nodes.go b/pkg/test/nodes.go index 4e8c9c5bb8..4c594331ae 100644 --- a/pkg/test/nodes.go +++ b/pkg/test/nodes.go @@ -25,6 +25,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/operator" ) type NodeOptions struct { @@ -53,6 +54,10 @@ func Node(overrides ...NodeOptions) *corev1.Node { options.Capacity = options.Allocatable } + options.Annotations = lo.Assign(map[string]string{ + v1.HydrationAnnotationKey: operator.Version, + }, options.Annotations) + return &corev1.Node{ ObjectMeta: ObjectMeta(options.ObjectMeta), Spec: corev1.NodeSpec{ diff --git a/pkg/utils/node/node.go b/pkg/utils/node/node.go index ddcc6156b5..dd42c787ff 100644 --- a/pkg/utils/node/node.go +++ b/pkg/utils/node/node.go @@ -134,8 +134,11 @@ func IsManagedPredicateFuncs(cp cloudprovider.CloudProvider) predicate.Funcs { }) } -func NodeClaimEventHandler(c client.Client) handler.EventHandler { +func NodeClaimEventHandler(c client.Client, cloudProvider cloudprovider.CloudProvider) handler.EventHandler { return handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []reconcile.Request { + if !nodeclaimutils.IsManaged(o.(*v1.NodeClaim), cloudProvider) { + return nil + } providerID := o.(*v1.NodeClaim).Status.ProviderID if providerID == "" { return nil diff --git a/pkg/utils/node/types.go b/pkg/utils/node/types.go index 895f71eb65..f6f35d1fd2 100644 --- a/pkg/utils/node/types.go +++ b/pkg/utils/node/types.go @@ -51,7 +51,7 @@ type DuplicateNodeClaimError struct { } func (e *DuplicateNodeClaimError) Error() string { - return fmt.Sprintf("multiple found for provider id '%s'", e.ProviderID) + return fmt.Sprintf("multiple nodeclaims found for provider id '%s'", e.ProviderID) } func IsDuplicateNodeClaimError(err error) bool { @@ -72,7 +72,7 @@ func IgnoreDuplicateNodeClaimError(err error) error { type HydrationError struct{} func (e *HydrationError) Error() string { - return fmt.Sprintf("resource has not been hydrated") + return "resource has not been hydrated" } func IsHydrationError(err error) bool { diff --git a/pkg/utils/nodeclaim/types.go b/pkg/utils/nodeclaim/types.go index 9266aa2364..56c1397bca 100644 --- a/pkg/utils/nodeclaim/types.go +++ b/pkg/utils/nodeclaim/types.go @@ -68,4 +68,3 @@ func IgnoreDuplicateNodeError(err error) error { } return nil } -
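A minimal sketch, not part of this patch, of how the new Drained and VolumesDetached status conditions exercised in the tests above could be consumed for observability. It assumes only the sigs.k8s.io/karpenter/pkg/apis/v1 package from this series; the terminationPhase helper and its phase strings are illustrative and do not exist in the repository.

package main

import (
	"fmt"

	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
)

// terminationPhase maps the termination conditions to a coarse, human-readable phase.
// A nil or Unknown condition is treated as "that step has not started yet".
func terminationPhase(nc *v1.NodeClaim) string {
	drained := nc.StatusConditions().Get(v1.ConditionTypeDrained)
	detached := nc.StatusConditions().Get(v1.ConditionTypeVolumesDetached)
	switch {
	case drained == nil || drained.IsUnknown():
		return "termination not started"
	case drained.IsFalse():
		return "draining pods"
	case detached == nil || detached.IsUnknown() || detached.IsFalse():
		return "waiting for volume detachment"
	default:
		return "drained and volumes detached; instance termination can proceed"
	}
}

func main() {
	// On a fresh NodeClaim neither condition has been set, so this reports
	// "termination not started"; a controller-side consumer would instead read
	// NodeClaims fetched from the cluster.
	fmt.Println(terminationPhase(&v1.NodeClaim{}))
}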