From 0fdc2a285b9b230ba87db9596d338c4683b73c0c Mon Sep 17 00:00:00 2001 From: Jason Deal Date: Tue, 10 Dec 2024 00:53:20 -0800 Subject: [PATCH] complete --- pkg/controllers/controllers.go | 8 +- pkg/controllers/node/hydration/controller.go | 2 +- pkg/controllers/node/hydration/suite_test.go | 86 +- .../node/termination/controller.go | 202 ---- .../{drain.go => drain/controller.go} | 154 ++- .../node/termination/{ => drain}/events.go | 44 +- .../node/termination/drain/metrics.go | 38 + .../node/termination/drain/suite_test.go | 599 +++++++++++ .../node/termination/instancetermination.go | 71 -- .../instancetermination/controller.go | 140 +++ .../{ => instancetermination}/metrics.go | 12 +- .../instancetermination/suite_test.go | 188 ++++ .../node/termination/reconcile/reconcile.go | 161 +++ .../node/termination/reconcile/suite_test.go | 292 ++++++ .../node/termination/suite_test.go | 930 ------------------ .../controller.go} | 102 +- .../termination/volumedetachment/events.go | 36 + .../volumedetachment/suite_test.go | 185 ++++ .../nodeclaim/hydration/suite_test.go | 63 +- .../nodeclaim/lifecycle/termination_test.go | 2 +- pkg/events/suite_test.go | 6 +- pkg/test/nodeclaim.go | 4 + pkg/test/nodes.go | 5 + pkg/utils/node/node.go | 5 +- pkg/utils/node/types.go | 4 +- pkg/utils/nodeclaim/types.go | 1 - 26 files changed, 1942 insertions(+), 1398 deletions(-) delete mode 100644 pkg/controllers/node/termination/controller.go rename pkg/controllers/node/termination/{drain.go => drain/controller.go} (53%) rename pkg/controllers/node/termination/{ => drain}/events.go (68%) create mode 100644 pkg/controllers/node/termination/drain/metrics.go create mode 100644 pkg/controllers/node/termination/drain/suite_test.go delete mode 100644 pkg/controllers/node/termination/instancetermination.go create mode 100644 pkg/controllers/node/termination/instancetermination/controller.go rename pkg/controllers/node/termination/{ => instancetermination}/metrics.go (87%) create mode 100644 pkg/controllers/node/termination/instancetermination/suite_test.go create mode 100644 pkg/controllers/node/termination/reconcile/reconcile.go create mode 100644 pkg/controllers/node/termination/reconcile/suite_test.go delete mode 100644 pkg/controllers/node/termination/suite_test.go rename pkg/controllers/node/termination/{volumedetachment.go => volumedetachment/controller.go} (50%) create mode 100644 pkg/controllers/node/termination/volumedetachment/events.go create mode 100644 pkg/controllers/node/termination/volumedetachment/suite_test.go diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index f3e8b69b8a..828255b361 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -38,8 +38,10 @@ import ( metricspod "sigs.k8s.io/karpenter/pkg/controllers/metrics/pod" "sigs.k8s.io/karpenter/pkg/controllers/node/health" nodehydration "sigs.k8s.io/karpenter/pkg/controllers/node/hydration" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/drain" "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/instancetermination" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/volumedetachment" nodeclaimconsistency "sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/consistency" nodeclaimdisruption "sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/disruption" "sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/expiration" @@ -83,7 +85,9 @@ func NewControllers( 
informer.NewPodController(kubeClient, cluster), informer.NewNodePoolController(kubeClient, cloudProvider, cluster), informer.NewNodeClaimController(kubeClient, cloudProvider, cluster), - termination.NewController(clock, kubeClient, cloudProvider, recorder, evictionQueue), + drain.NewController(clock, kubeClient, cloudProvider, recorder, evictionQueue), + volumedetachment.NewController(clock, kubeClient, cloudProvider, recorder), + instancetermination.NewController(clock, kubeClient, cloudProvider), metricspod.NewController(kubeClient, cluster), metricsnodepool.NewController(kubeClient, cloudProvider), metricsnode.NewController(cluster), diff --git a/pkg/controllers/node/hydration/controller.go b/pkg/controllers/node/hydration/controller.go index 5a72e3ca37..1140571a75 100644 --- a/pkg/controllers/node/hydration/controller.go +++ b/pkg/controllers/node/hydration/controller.go @@ -64,7 +64,7 @@ func (c *Controller) Register(_ context.Context, m manager.Manager) error { return controllerruntime.NewControllerManagedBy(m). Named(c.Name()). For(&corev1.Node{}). - Watches(&v1.NodeClaim{}, nodeutils.NodeClaimEventHandler(c.kubeClient)). + Watches(&v1.NodeClaim{}, nodeutils.NodeClaimEventHandler(c.kubeClient, c.cloudProvider)). WithOptions(controller.Options{ RateLimiter: reasonable.RateLimiter(), MaxConcurrentReconciles: 1000, diff --git a/pkg/controllers/node/hydration/suite_test.go b/pkg/controllers/node/hydration/suite_test.go index 16b0661d16..9025c1f061 100644 --- a/pkg/controllers/node/hydration/suite_test.go +++ b/pkg/controllers/node/hydration/suite_test.go @@ -24,8 +24,10 @@ import ( . "github.com/onsi/gomega" "github.com/samber/lo" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/karpenter/pkg/apis" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" @@ -67,28 +69,37 @@ var _ = AfterEach(func() { }) var _ = Describe("Hydration", func() { + var nodeClaim *v1.NodeClaim + var node *corev1.Node + + BeforeEach(func() { + nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{v1.HydrationAnnotationKey: "not-hydrated"}, + }, + }) + }) + It("should hydrate the NodeClass label", func() { - nc, n := test.NodeClaimAndNode() - delete(n.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) - ExpectApplied(ctx, env.Client, nc, n) - ExpectObjectReconciled(ctx, env.Client, hydrationController, n) - n = ExpectExists(ctx, env.Client, n) - value := n.Labels[v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())] - Expect(value).To(Equal(nc.Spec.NodeClassRef.Name)) + delete(node.Labels, v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nodeClaim, node) + ExpectObjectReconciled(ctx, env.Client, hydrationController, node) + node = ExpectExists(ctx, env.Client, node) + value := node.Labels[v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())] + Expect(value).To(Equal(nodeClaim.Spec.NodeClassRef.Name)) }) DescribeTable( "Finalizers", func(nodeClaimConditions []string, expectedFinailzers []string) { - nc, n := test.NodeClaimAndNode() for _, cond := range nodeClaimConditions { - nc.StatusConditions().SetTrue(cond) + nodeClaim.StatusConditions().SetTrue(cond) } - ExpectApplied(ctx, env.Client, nc, n) - ExpectObjectReconciled(ctx, env.Client, hydrationController, n) - n = ExpectExists(ctx, env.Client, n) - 
Expect(len(n.Finalizers)).To(Equal(len(expectedFinailzers))) + ExpectApplied(ctx, env.Client, nodeClaim, node) + ExpectObjectReconciled(ctx, env.Client, hydrationController, node) + node = ExpectExists(ctx, env.Client, node) + Expect(len(node.Finalizers)).To(Equal(len(expectedFinailzers))) for _, finalizer := range expectedFinailzers { - Expect(controllerutil.ContainsFinalizer(n, finalizer)) + Expect(controllerutil.ContainsFinalizer(node, finalizer)) } }, Entry("should hydrate all finalizers when none of the requisite status conditions are true", nil, []string{v1.DrainFinalizer, v1.VolumeFinalizer}), @@ -96,37 +107,26 @@ var _ = Describe("Hydration", func() { Entry("should hydrate the drain finalizer when only the volume status condition is true", []string{v1.ConditionTypeVolumesDetached}, []string{v1.VolumeFinalizer}), Entry("shouldn't hydrate finalizers when all requisite conditions are true", []string{v1.ConditionTypeDrained, v1.ConditionTypeVolumesDetached}, nil), ) - It("shouldn't hydrate nodes which have already been hydrated", func() { - nc, n := test.NodeClaimAndNode(v1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - v1.HydrationAnnotationKey: operator.Version, - }, - }, - }) - delete(n.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) - ExpectApplied(ctx, env.Client, nc, n) - ExpectObjectReconciled(ctx, env.Client, hydrationController, n) - n = ExpectExists(ctx, env.Client, n) - Expect(lo.Keys(n.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()))) - Expect(len(n.Finalizers)).To(Equal(0)) + node.Annotations[v1.HydrationAnnotationKey] = operator.Version + delete(node.Labels, v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nodeClaim, node) + ExpectObjectReconciled(ctx, env.Client, hydrationController, node) + node = ExpectExists(ctx, env.Client, node) + Expect(lo.Keys(node.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind()))) + Expect(len(node.Finalizers)).To(Equal(0)) }) It("shouldn't hydrate nodes which are not managed by this instance of Karpenter", func() { - nc, n := test.NodeClaimAndNode(v1.NodeClaim{ - Spec: v1.NodeClaimSpec{ - NodeClassRef: &v1.NodeClassReference{ - Group: "karpenter.test.sh", - Kind: "UnmanagedNodeClass", - Name: "default", - }, - }, - }) - delete(n.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) - ExpectApplied(ctx, env.Client, nc, n) - ExpectObjectReconciled(ctx, env.Client, hydrationController, n) - n = ExpectExists(ctx, env.Client, n) - Expect(lo.Keys(n.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()))) - Expect(len(n.Finalizers)).To(Equal(0)) + nodeClaim.Spec.NodeClassRef = &v1.NodeClassReference{ + Group: "karpenter.test.sh", + Kind: "UnmanagedNodeClass", + Name: "default", + } + delete(node.Labels, v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nodeClaim, node) + ExpectObjectReconciled(ctx, env.Client, hydrationController, node) + node = ExpectExists(ctx, env.Client, node) + Expect(lo.Keys(node.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind()))) + Expect(len(node.Finalizers)).To(Equal(0)) }) }) diff --git a/pkg/controllers/node/termination/controller.go b/pkg/controllers/node/termination/controller.go deleted file mode 100644 index 3880eddd7c..0000000000 --- a/pkg/controllers/node/termination/controller.go +++ /dev/null @@ -1,202 +0,0 @@ -/* 
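// The hydration tests above pin the behavior that a Node whose
// karpenter.sh/hydration annotation already matches the running operator
// version is left untouched. A minimal sketch of that gate, assuming the real
// check lives in the hydration controller (outside the hunks shown here);
// isHydrated and the pkg/operator import path are illustrative assumptions.
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
	"sigs.k8s.io/karpenter/pkg/operator"
)

// isHydrated reports whether this operator version already hydrated the node.
func isHydrated(node *corev1.Node) bool {
	return node.Annotations[v1.HydrationAnnotationKey] == operator.Version
}

func main() {
	node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{
		Annotations: map[string]string{v1.HydrationAnnotationKey: "not-hydrated"},
	}}
	fmt.Println(isHydrated(node)) // false: labels/finalizers would be hydrated
	node.Annotations[v1.HydrationAnnotationKey] = operator.Version
	fmt.Println(isHydrated(node)) // true: reconcile is a no-op
}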
-Copyright The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package termination - -import ( - "context" - "fmt" - "time" - - "github.com/samber/lo" - "golang.org/x/time/rate" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/equality" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/client-go/util/workqueue" - "k8s.io/klog/v2" - "k8s.io/utils/clock" - controllerruntime "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/reconcile" - - "sigs.k8s.io/karpenter/pkg/utils/pretty" - - v1 "sigs.k8s.io/karpenter/pkg/apis/v1" - "sigs.k8s.io/karpenter/pkg/cloudprovider" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" - "sigs.k8s.io/karpenter/pkg/events" - "sigs.k8s.io/karpenter/pkg/metrics" - "sigs.k8s.io/karpenter/pkg/operator/injection" - nodeutils "sigs.k8s.io/karpenter/pkg/utils/node" -) - -// Controller for the resource -type Controller struct { - clock clock.Clock - kubeClient client.Client - cloudProvider cloudprovider.CloudProvider - reconcilers []reconciler -} - -type reconciler interface { - Reconcile(context.Context, *corev1.Node, *v1.NodeClaim) (reconcile.Result, error) -} - -// NewController constructs a controller instance -func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider cloudprovider.CloudProvider, recorder events.Recorder, evictionQueue *eviction.Queue) *Controller { - return &Controller{ - clock: clk, - kubeClient: kubeClient, - cloudProvider: cloudProvider, - reconcilers: []reconciler{ - &DrainReconciler{clk, kubeClient, cloudProvider, recorder, evictionQueue}, - &VolumeDetachmentReconciler{clk, kubeClient, recorder}, - &InstanceTerminationReconciler{clk, kubeClient, cloudProvider}, - }, - } -} - -func (c *Controller) Register(_ context.Context, m manager.Manager) error { - return controllerruntime.NewControllerManagedBy(m). - Named("node.termination"). - For(&corev1.Node{}, builder.WithPredicates(nodeutils.IsManagedPredicateFuncs(c.cloudProvider))). - Watches(&v1.NodeClaim{}, nodeutils.NodeClaimEventHandler(c.kubeClient, c.cloudProvider)). - WithOptions( - controller.Options{ - RateLimiter: workqueue.NewTypedMaxOfRateLimiter[reconcile.Request]( - workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](100*time.Millisecond, 10*time.Second), - // 10 qps, 100 bucket size - &workqueue.TypedBucketRateLimiter[reconcile.Request]{Limiter: rate.NewLimiter(rate.Limit(10), 100)}, - ), - MaxConcurrentReconciles: 100, - }, - ). 
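// The Register above throttles node.termination reconciles with a max-of
// composition: per-item exponential backoff (100ms up to 10s) combined with a
// global 10 qps / burst-100 token bucket. The identical settings reappear in
// the new drain controller further down. A self-contained sketch of that
// composition and its backoff behavior (illustration only, not project code):
package main

import (
	"fmt"
	"time"

	"golang.org/x/time/rate"
	"k8s.io/client-go/util/workqueue"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

func main() {
	limiter := workqueue.NewTypedMaxOfRateLimiter[reconcile.Request](
		workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](100*time.Millisecond, 10*time.Second),
		// 10 qps, 100 bucket size
		&workqueue.TypedBucketRateLimiter[reconcile.Request]{Limiter: rate.NewLimiter(rate.Limit(10), 100)},
	)
	req := reconcile.Request{}
	// Each When call counts as another failure for the per-item limiter, so the
	// returned delay doubles (100ms, 200ms, 400ms, ...) until it caps at 10s.
	for i := 0; i < 4; i++ {
		fmt.Println(limiter.When(req))
	}
	limiter.Forget(req) // a successful reconcile resets the per-item backoff
}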
- Complete(reconcile.AsReconciler(m.GetClient(), c)) -} - -// nolint:gocyclo -func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node) (reconcile.Result, error) { - ctx = injection.WithControllerName(ctx, "node.termination") - ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef(n.Namespace, n.Name))) - - if n.GetDeletionTimestamp().IsZero() { - return reconcile.Result{}, nil - } - if !controllerutil.ContainsFinalizer(n, v1.TerminationFinalizer) { - return reconcile.Result{}, nil - } - if !nodeutils.IsManaged(n, c.cloudProvider) { - return reconcile.Result{}, nil - } - - nc, err := nodeutils.NodeClaimForNode(ctx, c.kubeClient, n) - if err != nil { - if nodeutils.IsDuplicateNodeClaimError(err) || nodeutils.IsNodeClaimNotFoundError(err) { - log.FromContext(ctx).Error(err, "failed to terminate node") - return reconcile.Result{}, nil - } - return reconcile.Result{}, err - } - ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KRef(nc.Namespace, nc.Name))) - if nc.DeletionTimestamp.IsZero() { - if err := c.kubeClient.Delete(ctx, nc); err != nil { - return reconcile.Result{}, client.IgnoreNotFound(err) - } - } - - if err := c.prepareNode(ctx, n); err != nil { - if errors.IsNotFound(err) { - return reconcile.Result{}, nil - } - if errors.IsConflict(err) { - return reconcile.Result{Requeue: true}, nil - } - return reconcile.Result{}, fmt.Errorf("tainting node with %s, %w", pretty.Taint(v1.DisruptedNoScheduleTaint), err) - } - - for _, r := range c.reconcilers { - res, err := r.Reconcile(ctx, n, nc) - if res.Requeue || res.RequeueAfter != 0 || err != nil { - return res, err - } - } - return reconcile.Result{}, nil -} - -// prepareNode ensures that the node is ready to begin the drain / termination process. This includes ensuring that it -// is tainted appropriately and annotated to be ignored by external load balancers. -func (c *Controller) prepareNode(ctx context.Context, n *corev1.Node) error { - stored := n.DeepCopy() - - // Add the karpenter.sh/disrupted:NoSchedule taint to ensure no additional pods schedule to the Node during the - // drain process. - if !lo.ContainsBy(n.Spec.Taints, func(t corev1.Taint) bool { - return t.MatchTaint(&v1.DisruptedNoScheduleTaint) - }) { - n.Spec.Taints = append(n.Spec.Taints, v1.DisruptedNoScheduleTaint) - } - // Adding this label to the node ensures that the node is removed from the load-balancer target group while it is - // draining and before it is terminated. This prevents 500s coming prior to health check when the load balancer - // controller hasn't yet determined that the node and underlying connections are gone. - // https://github.com/aws/aws-node-termination-handler/issues/316 - // https://github.com/aws/karpenter/pull/2518 - n.Labels = lo.Assign(n.Labels, map[string]string{ - corev1.LabelNodeExcludeBalancers: "karpenter", - }) - - if equality.Semantic.DeepEqual(n, stored) { - return nil - } - // We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch can cause races due - // to the fact that it fully replaces the list on a change. Here, we are updating the taint list. - return c.kubeClient.Patch(ctx, n, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})) -} - -// removeFinalizer removes Karpenter's termination finalizer from the given Node, updating the termination metrics in the process. 
-func removeFinalizer(ctx context.Context, kubeClient client.Client, n *corev1.Node) error { - stored := n.DeepCopy() - controllerutil.RemoveFinalizer(n, v1.TerminationFinalizer) - if !equality.Semantic.DeepEqual(stored, n) { - // We use client.StrategicMergeFrom here since the node object supports it and - // a strategic merge patch represents the finalizer list as a keyed "set" so removing - // an item from the list doesn't replace the full list - // https://github.com/kubernetes/kubernetes/issues/111643#issuecomment-2016489732 - if err := kubeClient.Patch(ctx, n, client.StrategicMergeFrom(stored)); err != nil { - if errors.IsNotFound(err) { - return nil - } - return fmt.Errorf("removing finalizer, %w", err) - } - metrics.NodesTerminatedTotal.Inc(map[string]string{ - metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], - }) - // We use stored.DeletionTimestamp since the api-server may give back a node after the patch without a deletionTimestamp - DurationSeconds.Observe(time.Since(stored.DeletionTimestamp.Time).Seconds(), map[string]string{ - metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], - }) - NodeLifetimeDurationSeconds.Observe(time.Since(n.CreationTimestamp.Time).Seconds(), map[string]string{ - metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], - }) - log.FromContext(ctx).Info("deleted node") - } - return nil -} diff --git a/pkg/controllers/node/termination/drain.go b/pkg/controllers/node/termination/drain/controller.go similarity index 53% rename from pkg/controllers/node/termination/drain.go rename to pkg/controllers/node/termination/drain/controller.go index d95c7a3183..226f430250 100644 --- a/pkg/controllers/node/termination/drain.go +++ b/pkg/controllers/node/termination/drain/controller.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
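// The removeFinalizer helper deleted above relied on a strategic merge patch so
// that dropping one finalizer is applied as a set removal rather than replacing
// the whole metadata.finalizers list (see the kubernetes#111643 comment it
// cites). Each split-out controller now calls terminationreconcile.RemoveFinalizer
// for its own finalizer; that helper's body is not part of this excerpt, so the
// function below is a sketch of the same pattern, assuming it mirrors the code
// removed here.
package sketch

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/equality"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
)

// removeNodeFinalizer drops a single finalizer from a Node without rewriting
// the rest of the finalizer list.
func removeNodeFinalizer(ctx context.Context, c client.Client, node *corev1.Node, finalizer string) error {
	stored := node.DeepCopy()
	controllerutil.RemoveFinalizer(node, finalizer)
	if equality.Semantic.DeepEqual(stored, node) {
		return nil // nothing to remove
	}
	// Strategic merge treats metadata.finalizers as a keyed set, so concurrent
	// finalizer additions by other controllers survive this patch.
	return client.IgnoreNotFound(c.Patch(ctx, node, client.StrategicMergeFrom(stored)))
}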
*/ -package termination +package drain import ( "context" @@ -23,21 +23,30 @@ import ( "time" "github.com/samber/lo" + "golang.org/x/time/rate" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/client-go/util/workqueue" "k8s.io/utils/clock" + controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider" "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" + terminationreconcile "sigs.k8s.io/karpenter/pkg/controllers/node/termination/reconcile" "sigs.k8s.io/karpenter/pkg/events" "sigs.k8s.io/karpenter/pkg/metrics" nodeutils "sigs.k8s.io/karpenter/pkg/utils/node" "sigs.k8s.io/karpenter/pkg/utils/nodeclaim" podutils "sigs.k8s.io/karpenter/pkg/utils/pod" + "sigs.k8s.io/karpenter/pkg/utils/pretty" ) type nodeDrainError struct { @@ -56,7 +65,7 @@ func isNodeDrainError(err error) bool { return errors.As(err, &nodeDrainErr) } -type DrainReconciler struct { +type Controller struct { clock clock.Clock kubeClient client.Client cloudProvider cloudprovider.CloudProvider @@ -64,10 +73,64 @@ type DrainReconciler struct { evictionQueue *eviction.Queue } +func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider cloudprovider.CloudProvider, recorder events.Recorder, queue *eviction.Queue) *Controller { + return &Controller{ + clock: clk, + kubeClient: kubeClient, + cloudProvider: cloudProvider, + recorder: recorder, + evictionQueue: queue, + } +} + +func (*Controller) Name() string { + return "node.drain" +} + +func (c *Controller) Register(_ context.Context, m manager.Manager) error { + return controllerruntime.NewControllerManagedBy(m). + Named(c.Name()). + For(&corev1.Node{}, builder.WithPredicates(nodeutils.IsManagedPredicateFuncs(c.cloudProvider))). + Watches(&v1.NodeClaim{}, nodeutils.NodeClaimEventHandler(c.kubeClient, c.cloudProvider)). + WithOptions( + controller.Options{ + RateLimiter: workqueue.NewTypedMaxOfRateLimiter[reconcile.Request]( + workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](100*time.Millisecond, 10*time.Second), + // 10 qps, 100 bucket size + &workqueue.TypedBucketRateLimiter[reconcile.Request]{Limiter: rate.NewLimiter(rate.Limit(10), 100)}, + ), + MaxConcurrentReconciles: 100, + }, + ). 
+ Complete(terminationreconcile.AsReconciler(m.GetClient(), c.cloudProvider, c.clock, c)) +} + +func (*Controller) AwaitFinalizers() []string { + return nil +} + +func (*Controller) Finalizer() string { + return v1.DrainFinalizer +} + +func (*Controller) NodeClaimNotFoundPolicy() terminationreconcile.Policy { + return terminationreconcile.PolicyFinalize +} + +func (*Controller) TerminationGracePeriodPolicy() terminationreconcile.Policy { + return terminationreconcile.PolicyFinalize +} + // nolint:gocyclo -func (d *DrainReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { - if nc.StatusConditions().IsTrue(v1.ConditionTypeDrained) { - return reconcile.Result{}, nil +func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { + if err := c.prepareNode(ctx, n); err != nil { + if apierrors.IsNotFound(err) { + return reconcile.Result{}, nil + } + if apierrors.IsConflict(err) { + return reconcile.Result{Requeue: true}, nil + } + return reconcile.Result{}, fmt.Errorf("tainting node with %s, %w", pretty.Taint(v1.DisruptedNoScheduleTaint), err) } tgpExpirationTime, err := nodeclaim.TerminationGracePeriodExpirationTime(nc) @@ -76,29 +139,16 @@ func (d *DrainReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1. return reconcile.Result{}, nil } if tgpExpirationTime != nil { - d.recorder.Publish(NodeTerminationGracePeriodExpiringEvent(n, *tgpExpirationTime)) + c.recorder.Publish(NodeTerminationGracePeriodExpiringEvent(n, *tgpExpirationTime)) } - if err := d.drain(ctx, n, tgpExpirationTime); err != nil { + if err := c.drain(ctx, n, tgpExpirationTime); err != nil { if !isNodeDrainError(err) { return reconcile.Result{}, fmt.Errorf("draining node, %w", err) } - // If the underlying NodeClaim no longer exists, we want to delete to avoid trying to gracefully draining - // on nodes that are no longer alive. We do a check on the Ready condition of the node since, even - // though the CloudProvider says the instance is not around, we know that the kubelet process is still running - // if the Node Ready condition is true - // Similar logic to: https://github.com/kubernetes/kubernetes/blob/3a75a8c8d9e6a1ebd98d8572132e675d4980f184/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go#L144 - if nodeutils.GetCondition(n, corev1.NodeReady).Status != corev1.ConditionTrue { - if _, err = d.cloudProvider.Get(ctx, n.Spec.ProviderID); err != nil { - if cloudprovider.IsNodeClaimNotFoundError(err) { - return reconcile.Result{}, removeFinalizer(ctx, d.kubeClient, n) - } - return reconcile.Result{}, fmt.Errorf("getting nodeclaim, %w", err) - } - } stored := nc.DeepCopy() if nc.StatusConditions().SetFalse(v1.ConditionTypeDrained, "Draining", "Draining") { - if err := d.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { + if err := c.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { if apierrors.IsNotFound(err) { return reconcile.Result{}, nil } @@ -108,13 +158,13 @@ func (d *DrainReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1. 
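// The drain controller above plugs into terminationreconcile.AsReconciler and
// exposes Finalizer, AwaitFinalizers, NodeClaimNotFoundPolicy,
// TerminationGracePeriodPolicy, and a (Node, NodeClaim) Reconcile. The shared
// adapter is added in the new reconcile/reconcile.go, which is not included in
// this excerpt; the interface below is an inferred sketch of that contract
// (names such as PolicyContinue are assumptions, only PolicyFinalize appears in
// the patch), not the actual definition.
package sketch

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
)

// Policy controls how the adapter reacts when the NodeClaim is missing or the
// terminationGracePeriod has already elapsed.
type Policy int

const (
	PolicyContinue Policy = iota
	PolicyFinalize
)

// Reconciler is what AsReconciler appears to require from each termination
// sub-controller (drain, volumedetachment, instancetermination).
type Reconciler interface {
	Name() string
	Finalizer() string
	AwaitFinalizers() []string
	NodeClaimNotFoundPolicy() Policy
	TerminationGracePeriodPolicy() Policy
	Reconcile(ctx context.Context, node *corev1.Node, nodeClaim *v1.NodeClaim) (reconcile.Result, error)
}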
return reconcile.Result{}, err } } - d.recorder.Publish(NodeDrainFailedEvent(n, err)) + c.recorder.Publish(NodeDrainFailedEvent(n, err)) return reconcile.Result{RequeueAfter: 5 * time.Second}, nil } - stored := nc.DeepCopy() + storedNodeClaim := nc.DeepCopy() _ = nc.StatusConditions().SetTrue(v1.ConditionTypeDrained) - if err := d.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); client.IgnoreNotFound(err) != nil { + if err := c.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(storedNodeClaim, client.MergeFromWithOptimisticLock{})); client.IgnoreNotFound(err) != nil { if apierrors.IsNotFound(err) { return reconcile.Result{}, nil } @@ -123,38 +173,70 @@ func (d *DrainReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1. } return reconcile.Result{}, err } + if err := terminationreconcile.RemoveFinalizer(ctx, c.kubeClient, n, c.Finalizer()); err != nil { + return reconcile.Result{}, err + } NodesDrainedTotal.Inc(map[string]string{ metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], }) return reconcile.Result{}, nil } +// prepareNode ensures that the node is ready to begin the drain / termination process. This includes ensuring that it +// is tainted appropriately and annotated to be ignored by external load balancers. +func (c *Controller) prepareNode(ctx context.Context, n *corev1.Node) error { + stored := n.DeepCopy() + + // Add the karpenter.sh/disrupted:NoSchedule taint to ensure no additional pods schedule to the Node during the + // drain process. + if !lo.ContainsBy(n.Spec.Taints, func(t corev1.Taint) bool { + return t.MatchTaint(&v1.DisruptedNoScheduleTaint) + }) { + n.Spec.Taints = append(n.Spec.Taints, v1.DisruptedNoScheduleTaint) + } + // Adding this label to the node ensures that the node is removed from the load-balancer target group while it is + // draining and before it is terminated. This prevents 500s coming prior to health check when the load balancer + // controller hasn't yet determined that the node and underlying connections are gone. + // https://github.com/aws/aws-node-termination-handler/issues/316 + // https://github.com/aws/karpenter/pull/2518 + n.Labels = lo.Assign(n.Labels, map[string]string{ + corev1.LabelNodeExcludeBalancers: "karpenter", + }) + + if equality.Semantic.DeepEqual(n, stored) { + return nil + } + // We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch can cause races due + // to the fact that it fully replaces the list on a change. Here, we are updating the taint list. 
+ return c.kubeClient.Patch(ctx, n, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})) +} + // Drain evicts pods from the node and returns true when all pods are evicted // https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown -func (d *DrainReconciler) drain(ctx context.Context, node *corev1.Node, nodeGracePeriodExpirationTime *time.Time) error { - pods, err := nodeutils.GetPods(ctx, d.kubeClient, node) +func (c *Controller) drain(ctx context.Context, node *corev1.Node, nodeGracePeriodExpirationTime *time.Time) error { + pods, err := nodeutils.GetPods(ctx, c.kubeClient, node) if err != nil { return fmt.Errorf("listing pods on node, %w", err) } podsToDelete := lo.Filter(pods, func(p *corev1.Pod, _ int) bool { - return podutils.IsWaitingEviction(p, d.clock) && !podutils.IsTerminating(p) + return podutils.IsWaitingEviction(p, c.clock) && !podutils.IsTerminating(p) }) - if err := d.DeleteExpiringPods(ctx, podsToDelete, nodeGracePeriodExpirationTime); err != nil { + if err := c.DeleteExpiringPods(ctx, podsToDelete, nodeGracePeriodExpirationTime); err != nil { return fmt.Errorf("deleting expiring pods, %w", err) } // Monitor pods in pod groups that either haven't been evicted or are actively evicting - podGroups := d.groupPodsByPriority(lo.Filter(pods, func(p *corev1.Pod, _ int) bool { return podutils.IsWaitingEviction(p, d.clock) })) + podGroups := c.groupPodsByPriority(lo.Filter(pods, func(p *corev1.Pod, _ int) bool { return podutils.IsWaitingEviction(p, c.clock) })) for _, group := range podGroups { if len(group) > 0 { // Only add pods to the eviction queue that haven't been evicted yet - d.evictionQueue.Add(lo.Filter(group, func(p *corev1.Pod, _ int) bool { return podutils.IsEvictable(p) })...) + c.evictionQueue.Add(lo.Filter(group, func(p *corev1.Pod, _ int) bool { return podutils.IsEvictable(p) })...) return newNodeDrainError(fmt.Errorf("%d pods are waiting to be evicted", lo.SumBy(podGroups, func(pods []*corev1.Pod) int { return len(pods) }))) } } return nil } -func (d *DrainReconciler) groupPodsByPriority(pods []*corev1.Pod) [][]*corev1.Pod { +func (c *Controller) groupPodsByPriority(pods []*corev1.Pod) [][]*corev1.Pod { // 1. 
Prioritize noncritical pods, non-daemon pods https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown var nonCriticalNonDaemon, nonCriticalDaemon, criticalNonDaemon, criticalDaemon []*corev1.Pod for _, pod := range pods { @@ -175,19 +257,19 @@ func (d *DrainReconciler) groupPodsByPriority(pods []*corev1.Pod) [][]*corev1.Po return [][]*corev1.Pod{nonCriticalNonDaemon, nonCriticalDaemon, criticalNonDaemon, criticalDaemon} } -func (d *DrainReconciler) DeleteExpiringPods(ctx context.Context, pods []*corev1.Pod, nodeGracePeriodTerminationTime *time.Time) error { +func (c *Controller) DeleteExpiringPods(ctx context.Context, pods []*corev1.Pod, nodeGracePeriodTerminationTime *time.Time) error { for _, pod := range pods { // check if the node has an expiration time and the pod needs to be deleted - deleteTime := d.podDeleteTimeWithGracePeriod(nodeGracePeriodTerminationTime, pod) + deleteTime := c.podDeleteTimeWithGracePeriod(nodeGracePeriodTerminationTime, pod) if deleteTime != nil && time.Now().After(*deleteTime) { // delete pod proactively to give as much of its terminationGracePeriodSeconds as possible for deletion // ensure that we clamp the maximum pod terminationGracePeriodSeconds to the node's remaining expiration time in the delete command gracePeriodSeconds := lo.ToPtr(int64(time.Until(*nodeGracePeriodTerminationTime).Seconds())) - d.recorder.Publish(PodDeletedEvent(pod, gracePeriodSeconds, nodeGracePeriodTerminationTime)) + c.recorder.Publish(PodDeletedEvent(pod, gracePeriodSeconds, nodeGracePeriodTerminationTime)) opts := &client.DeleteOptions{ GracePeriodSeconds: gracePeriodSeconds, } - if err := d.kubeClient.Delete(ctx, pod, opts); err != nil && !apierrors.IsNotFound(err) { // ignore 404, not a problem + if err := c.kubeClient.Delete(ctx, pod, opts); err != nil && !apierrors.IsNotFound(err) { // ignore 404, not a problem return fmt.Errorf("deleting pod, %w", err) // otherwise, bubble up the error } log.FromContext(ctx).WithValues( @@ -203,7 +285,7 @@ func (d *DrainReconciler) DeleteExpiringPods(ctx context.Context, pods []*corev1 } // if a pod should be deleted to give it the full terminationGracePeriodSeconds of time before the node will shut down, return the time the pod should be deleted -func (*DrainReconciler) podDeleteTimeWithGracePeriod(nodeGracePeriodExpirationTime *time.Time, pod *corev1.Pod) *time.Time { +func (*Controller) podDeleteTimeWithGracePeriod(nodeGracePeriodExpirationTime *time.Time, pod *corev1.Pod) *time.Time { if nodeGracePeriodExpirationTime == nil || pod.Spec.TerminationGracePeriodSeconds == nil { // k8s defaults to 30s, so we should never see a nil TerminationGracePeriodSeconds return nil } diff --git a/pkg/controllers/node/termination/events.go b/pkg/controllers/node/termination/drain/events.go similarity index 68% rename from pkg/controllers/node/termination/events.go rename to pkg/controllers/node/termination/drain/events.go index 91ffbc26d5..533f228571 100644 --- a/pkg/controllers/node/termination/events.go +++ b/pkg/controllers/node/termination/drain/events.go @@ -14,19 +14,27 @@ See the License for the specific language governing permissions and limitations under the License. 
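// podDeleteTimeWithGracePeriod's nil guard is visible above, but the remainder
// of its body falls outside this hunk. The tests added later in this patch
// (pods with terminationGracePeriodSeconds of 300 vs 60 against a 300s node
// budget) imply the arithmetic sketched below: delete the pod early enough that
// its own grace period can finish before the node's terminationGracePeriod
// expires. Treat this as an inferred sketch, not the function's actual body.
package sketch

import (
	"time"

	corev1 "k8s.io/api/core/v1"
)

// podDeleteTime returns when a pod must be deleted so that its full
// terminationGracePeriodSeconds elapses before the node is terminated.
func podDeleteTime(nodeExpiration *time.Time, pod *corev1.Pod) *time.Time {
	if nodeExpiration == nil || pod.Spec.TerminationGracePeriodSeconds == nil {
		// kubelet defaults terminationGracePeriodSeconds to 30s, so nil is unexpected
		return nil
	}
	t := nodeExpiration.Add(-time.Duration(*pod.Spec.TerminationGracePeriodSeconds) * time.Second)
	return &t
}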
*/ -package termination +package drain import ( "fmt" "time" corev1 "k8s.io/api/core/v1" - storagev1 "k8s.io/api/storage/v1" - v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/events" ) +func NodeTerminationGracePeriodExpiringEvent(node *corev1.Node, t time.Time) events.Event { + return events.Event{ + InvolvedObject: node, + Type: corev1.EventTypeWarning, + Reason: "TerminationGracePeriodExpiring", + Message: fmt.Sprintf("All pods will be deleted by %s", t.Format(time.RFC3339)), + DedupeValues: []string{node.Name}, + } +} + func PodDeletedEvent(pod *corev1.Pod, gracePeriodSeconds *int64, nodeGracePeriodTerminationTime *time.Time) events.Event { return events.Event{ InvolvedObject: pod, @@ -46,33 +54,3 @@ func NodeDrainFailedEvent(node *corev1.Node, err error) events.Event { DedupeValues: []string{node.Name}, } } - -func NodeAwaitingVolumeDetachmentEvent(node *corev1.Node, volumeAttachments ...*storagev1.VolumeAttachment) events.Event { - return events.Event{ - InvolvedObject: node, - Type: corev1.EventTypeNormal, - Reason: "AwaitingVolumeDetachment", - Message: fmt.Sprintf("Awaiting deletion of %d VolumeAttachments bound to node", len(volumeAttachments)), - DedupeValues: []string{node.Name}, - } -} - -func NodeTerminationGracePeriodExpiringEvent(node *corev1.Node, t time.Time) events.Event { - return events.Event{ - InvolvedObject: node, - Type: corev1.EventTypeWarning, - Reason: "TerminationGracePeriodExpiring", - Message: fmt.Sprintf("All pods will be deleted by %s", t.Format(time.RFC3339)), - DedupeValues: []string{node.Name}, - } -} - -func NodeClaimTerminationGracePeriodExpiringEvent(nodeClaim *v1.NodeClaim, terminationTime string) events.Event { - return events.Event{ - InvolvedObject: nodeClaim, - Type: corev1.EventTypeWarning, - Reason: "TerminationGracePeriodExpiring", - Message: fmt.Sprintf("All pods will be deleted by %s", terminationTime), - DedupeValues: []string{nodeClaim.Name}, - } -} diff --git a/pkg/controllers/node/termination/drain/metrics.go b/pkg/controllers/node/termination/drain/metrics.go new file mode 100644 index 0000000000..1aa8f1849e --- /dev/null +++ b/pkg/controllers/node/termination/drain/metrics.go @@ -0,0 +1,38 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
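// events.go now carries only the drain-scoped events; the volume-attachment
// event presumably moves to the new volumedetachment/events.go listed in the
// diffstat, and the NodeClaim grace-period event is no longer defined here.
// Every remaining event keeps DedupeValues of the node name, which is intended
// to dedupe repeated publishes for the same node. A minimal usage sketch
// against the events.Recorder interface:
package sketch

import (
	corev1 "k8s.io/api/core/v1"

	"sigs.k8s.io/karpenter/pkg/controllers/node/termination/drain"
	"sigs.k8s.io/karpenter/pkg/events"
)

// reportDrainFailure publishes the node-name-deduped drain failure event.
func reportDrainFailure(recorder events.Recorder, node *corev1.Node, err error) {
	recorder.Publish(drain.NodeDrainFailedEvent(node, err))
}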
+*/ + +package drain + +import ( + opmetrics "github.com/awslabs/operatorpkg/metrics" + "github.com/prometheus/client_golang/prometheus" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" + + "sigs.k8s.io/karpenter/pkg/metrics" +) + +var ( + NodesDrainedTotal = opmetrics.NewPrometheusCounter( + crmetrics.Registry, + prometheus.CounterOpts{ + Namespace: metrics.Namespace, + Subsystem: metrics.NodeSubsystem, + Name: "drained_total", + Help: "The total number of nodes drained by Karpenter", + }, + []string{metrics.NodePoolLabel}, + ) +) diff --git a/pkg/controllers/node/termination/drain/suite_test.go b/pkg/controllers/node/termination/drain/suite_test.go new file mode 100644 index 0000000000..a2e4bea43b --- /dev/null +++ b/pkg/controllers/node/termination/drain/suite_test.go @@ -0,0 +1,599 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package drain_test + +import ( + "context" + "testing" + "time" + + clock "k8s.io/utils/clock/testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "sigs.k8s.io/karpenter/pkg/apis" + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/drain" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" + terminationreconcile "sigs.k8s.io/karpenter/pkg/controllers/node/termination/reconcile" + "sigs.k8s.io/karpenter/pkg/test" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/samber/lo" + + . "sigs.k8s.io/karpenter/pkg/test/expectations" + "sigs.k8s.io/karpenter/pkg/test/v1alpha1" + . 
"sigs.k8s.io/karpenter/pkg/utils/testing" +) + +var ctx context.Context +var queue *eviction.Queue +var env *test.Environment +var fakeClock *clock.FakeClock +var cloudProvider *fake.CloudProvider +var recorder *test.EventRecorder + +var reconciler reconcile.Reconciler + +var defaultOwnerRefs = []metav1.OwnerReference{{Kind: "ReplicaSet", APIVersion: "appsv1", Name: "rs", UID: "1234567890"}} + +func TestAPIs(t *testing.T) { + ctx = TestContextWithLogger(t) + RegisterFailHandler(Fail) + RunSpecs(t, "Drain") +} + +var _ = BeforeSuite(func() { + fakeClock = clock.NewFakeClock(time.Now()) + env = test.NewEnvironment( + test.WithCRDs(apis.CRDs...), + test.WithCRDs(v1alpha1.CRDs...), + test.WithFieldIndexers(test.NodeClaimProviderIDFieldIndexer(ctx), test.VolumeAttachmentFieldIndexer(ctx)), + ) + cloudProvider = fake.NewCloudProvider() + recorder = test.NewEventRecorder() + queue = eviction.NewTestingQueue(env.Client, recorder) + reconciler = terminationreconcile.AsReconciler(env.Client, cloudProvider, fakeClock, drain.NewController(fakeClock, env.Client, cloudProvider, recorder, queue)) +}) + +var _ = AfterSuite(func() { + Expect(env.Stop()).To(Succeed(), "Failed to stop environment") +}) + +var _ = Describe("Drain", func() { + var node *corev1.Node + var nodeClaim *v1.NodeClaim + + BeforeEach(func() { + fakeClock.SetTime(time.Now()) + cloudProvider.Reset() + *queue = lo.FromPtr(eviction.NewTestingQueue(env.Client, recorder)) + nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{ObjectMeta: metav1.ObjectMeta{Finalizers: []string{ + v1.TerminationFinalizer, + v1.DrainFinalizer, + }}}) + cloudProvider.CreatedNodeClaims[node.Spec.ProviderID] = nodeClaim + }) + AfterEach(func() { + ExpectCleanedUp(ctx, env.Client) + drain.NodesDrainedTotal.Reset() + }) + + // TODO: Move to reconciler + It("should exclude nodes from load balancers when terminating", func() { + labels := map[string]string{"foo": "bar"} + pod := test.Pod(test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: defaultOwnerRefs, + Labels: labels, + }, + }) + // Create a fully blocking PDB to prevent the node from being deleted before we can observe its labels + pdb := test.PodDisruptionBudget(test.PDBOptions{ + Labels: labels, + MaxUnavailable: lo.ToPtr(intstr.FromInt(0)), + }) + + ExpectApplied(ctx, env.Client, node, nodeClaim, pod, pdb) + ExpectManualBinding(ctx, env.Client, pod, node) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + node = ExpectNodeExists(ctx, env.Client, node.Name) + Expect(node.Labels[corev1.LabelNodeExcludeBalancers]).Should(Equal("karpenter")) + }) + DescribeTable( + "should not evict pods that tolerate the 'karpenter.sh/disrupted' taint", + func(toleration corev1.Toleration) { + podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) + podSkip := test.Pod(test.PodOptions{ + NodeName: node.Name, + Tolerations: []corev1.Toleration{toleration}, + ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podSkip) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, 
env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + Expect(queue.Has(podSkip)).To(BeFalse()) + Expect(queue.Has(podEvict)).To(BeTrue()) + ExpectSingletonReconciled(ctx, queue) + EventuallyExpectTerminating(ctx, env.Client, podEvict) + ExpectDeleted(ctx, env.Client, podEvict) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }, + Entry("with an equals operator", corev1.Toleration{Key: v1.DisruptedTaintKey, Operator: corev1.TolerationOpEqual, Effect: v1.DisruptedNoScheduleTaint.Effect, Value: v1.DisruptedNoScheduleTaint.Value}), + Entry("with an exists operator", corev1.Toleration{Key: v1.DisruptedTaintKey, Operator: corev1.TolerationOpExists, Effect: v1.DisruptedNoScheduleTaint.Effect}), + ) + It("should evict pods that tolerate the node.kubernetes.io/unschedulable taint", func() { + podEvict := test.Pod(test.PodOptions{ + NodeName: node.Name, + Tolerations: []corev1.Toleration{{Key: corev1.TaintNodeUnschedulable, Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule}}, + ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + Expect(queue.Has(podEvict)).To(BeTrue()) + ExpectSingletonReconciled(ctx, queue) + EventuallyExpectTerminating(ctx, env.Client, podEvict) + ExpectDeleted(ctx, env.Client, podEvict) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }) + It("should finalize nodes that have pods without an ownerRef", func() { + pod := test.Pod(test.PodOptions{ + NodeName: node.Name, + ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: nil, + }, + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, pod) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + Expect(queue.Has(pod)).To(BeTrue()) + ExpectSingletonReconciled(ctx, queue) + EventuallyExpectTerminating(ctx, env.Client, pod) + ExpectDeleted(ctx, env.Client, pod) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + 
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }) + It("should finalize nodes with terminal pods", func() { + podEvictPhaseSucceeded := test.Pod(test.PodOptions{ + NodeName: node.Name, + Phase: corev1.PodSucceeded, + }) + podEvictPhaseFailed := test.Pod(test.PodOptions{ + NodeName: node.Name, + Phase: corev1.PodFailed, + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, podEvictPhaseSucceeded, podEvictPhaseFailed) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + + Expect(queue.Has(podEvictPhaseSucceeded)).To(BeFalse()) + Expect(queue.Has(podEvictPhaseFailed)).To(BeFalse()) + + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }) + It("should fail to evict pods that violate a PDB", func() { + minAvailable := intstr.FromInt32(1) + labelSelector := map[string]string{test.RandomName(): test.RandomName()} + pdb := test.PodDisruptionBudget(test.PDBOptions{ + Labels: labelSelector, + // Don't let any pod evict + MinAvailable: &minAvailable, + }) + podNoEvict := test.Pod(test.PodOptions{ + NodeName: node.Name, + ObjectMeta: metav1.ObjectMeta{ + Labels: labelSelector, + OwnerReferences: defaultOwnerRefs, + }, + Phase: corev1.PodRunning, + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, podNoEvict, pdb) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + // Expect podNoEvict to be added to the queue - we won't evict it due to PDBs + Expect(queue.Has(podNoEvict)).To(BeTrue()) + ExpectSingletonReconciled(ctx, queue) + Expect(queue.Has(podNoEvict)).To(BeTrue()) + ExpectDeleted(ctx, env.Client, podNoEvict) + ExpectNotFound(ctx, env.Client, podNoEvict) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + }) + It("should evict pods in order and wait until pods are fully deleted", func() { + daemonEvict := test.DaemonSet() + daemonNodeCritical := test.DaemonSet(test.DaemonSetOptions{PodOptions: test.PodOptions{PriorityClassName: "system-node-critical"}}) + daemonClusterCritical := test.DaemonSet(test.DaemonSetOptions{PodOptions: test.PodOptions{PriorityClassName: "system-cluster-critical"}}) + ExpectApplied(ctx, env.Client, daemonEvict, daemonNodeCritical, daemonClusterCritical) + + podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) 
+ podDaemonEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: []metav1.OwnerReference{{ + APIVersion: "apps/v1", + Kind: "DaemonSet", + Name: daemonEvict.Name, + UID: daemonEvict.UID, + Controller: lo.ToPtr(true), + BlockOwnerDeletion: lo.ToPtr(true), + }}}}) + podNodeCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-node-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) + podClusterCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-cluster-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) + podDaemonNodeCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-node-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: []metav1.OwnerReference{{ + APIVersion: "apps/v1", + Kind: "DaemonSet", + Name: daemonNodeCritical.Name, + UID: daemonNodeCritical.UID, + Controller: lo.ToPtr(true), + BlockOwnerDeletion: lo.ToPtr(true), + }}}}) + podDaemonClusterCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-cluster-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: []metav1.OwnerReference{{ + APIVersion: "apps/v1", + Kind: "DaemonSet", + Name: daemonClusterCritical.Name, + UID: daemonClusterCritical.UID, + Controller: lo.ToPtr(true), + BlockOwnerDeletion: lo.ToPtr(true), + }}}}) + ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podNodeCritical, podClusterCritical, podDaemonEvict, podDaemonNodeCritical, podDaemonClusterCritical) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + + podGroups := [][]*corev1.Pod{{podEvict}, {podDaemonEvict}, {podNodeCritical, podClusterCritical}, {podDaemonNodeCritical, podDaemonClusterCritical}} + for i, podGroup := range podGroups { + node = ExpectNodeExists(ctx, env.Client, node.Name) + for _, p := range podGroup { + ExpectPodExists(ctx, env.Client, p.Name, p.Namespace) + } + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + if i == 0 { + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + } else { + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + } + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + for range podGroup { + ExpectSingletonReconciled(ctx, queue) + } + // Start draining the pod group, but don't complete it yet + EventuallyExpectTerminating(ctx, env.Client, lo.Map(podGroup, func(p *corev1.Pod, _ int) client.Object { return p })...) + + // Look at the next pod group and ensure that none of the pods have started terminating on it + if i != len(podGroups)-1 { + for range podGroups[i+1] { + ExpectSingletonReconciled(ctx, queue) + } + ConsistentlyExpectNotTerminating(ctx, env.Client, lo.Map(podGroups[i+1], func(p *corev1.Pod, _ int) client.Object { return p })...) + } + // Expect that the pods are deleted -- which should unblock the next pod group + ExpectDeleted(ctx, env.Client, lo.Map(podGroup, func(p *corev1.Pod, _ int) client.Object { return p })...) 
+ } + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }) + It("should not evict static pods", func() { + podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) + ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict) + + podNoEvict := test.Pod(test.PodOptions{ + NodeName: node.Name, + ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: []metav1.OwnerReference{{ + APIVersion: "v1", + Kind: "Node", + Name: node.Name, + UID: node.UID, + }}, + }, + }) + ExpectApplied(ctx, env.Client, podNoEvict) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + node = ExpectNodeExists(ctx, env.Client, node.Name) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + // Expect mirror pod to not be queued for eviction + Expect(queue.Has(podNoEvict)).To(BeFalse()) + Expect(queue.Has(podEvict)).To(BeTrue()) + ExpectSingletonReconciled(ctx, queue) + EventuallyExpectTerminating(ctx, env.Client, podEvict) + ExpectDeleted(ctx, env.Client, podEvict) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }) + It("should not finalize nodes until all pods are deleted", func() { + pods := test.Pods(2, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) + ExpectApplied(ctx, env.Client, node, nodeClaim, pods[0], pods[1]) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + node = ExpectNodeExists(ctx, env.Client, node.Name) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + for _, p := range pods { + Expect(queue.Has(p)).To(BeTrue()) + } + ExpectSingletonReconciled(ctx, queue) + ExpectSingletonReconciled(ctx, queue) + EventuallyExpectTerminating(ctx, env.Client, pods[0], pods[1]) + ExpectDeleted(ctx, env.Client, pods[1]) + + // Expect node to exist and be draining, but not deleted + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + + ExpectDeleted(ctx, env.Client, pods[0]) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = 
ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }) + It("should finalize nodes when pods are stuck terminating", func() { + pod := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) + fakeClock.SetTime(time.Now()) // make our fake clock match the pod creation time + ExpectApplied(ctx, env.Client, node, nodeClaim, pod) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + // Before grace period, node should not delete + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + Expect(queue.Has(pod)).To(BeTrue()) + ExpectSingletonReconciled(ctx, queue) + EventuallyExpectTerminating(ctx, env.Client, pod) + + // After grace period, node should delete. The deletion timestamps are from etcd which we can't control, so + // to eliminate test-flakiness we reset the time to current time + 90 seconds instead of just advancing + // the clock by 90 seconds. + fakeClock.SetTime(time.Now().Add(90 * time.Second)) + // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + }) + It("should not evict a new pod with the same name using the old pod's eviction queue key", func() { + pod := test.Pod(test.PodOptions{ + NodeName: node.Name, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + OwnerReferences: defaultOwnerRefs, + }, + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, pod) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) + + // Trigger Termination Controller + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) + + // Don't trigger a call into the queue to make sure that we effectively aren't triggering eviction + // We'll use this to try to leave pods in the queue + + // Delete the pod directly to act like something else is doing the pod termination + ExpectDeleted(ctx, env.Client, pod) + + // Requeue the termination controller to completely delete the node + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.DrainFinalizer)) + ExpectFinalizersRemoved(ctx, env.Client, node) + ExpectNotFound(ctx, env.Client, node) + + // Expect that the old pod's key still exists in the 
queue + Expect(queue.Has(pod)).To(BeTrue()) + + // Re-create the pod and node, it should now have the same name, but a different UUID + node = test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Finalizers: []string{v1.TerminationFinalizer}, + }, + }) + pod = test.Pod(test.PodOptions{ + NodeName: node.Name, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + OwnerReferences: defaultOwnerRefs, + }, + }) + ExpectApplied(ctx, env.Client, node, pod) + + // Trigger eviction queue with the pod key still in it + ExpectSingletonReconciled(ctx, queue) + + Consistently(func(g Gomega) { + g.Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(pod), pod)).To(Succeed()) + g.Expect(pod.DeletionTimestamp.IsZero()).To(BeTrue()) + }, ReconcilerPropagationTime, RequestInterval).Should(Succeed()) + }) + It("should preemptively delete pods to satisfy their terminationGracePeriodSeconds", func() { + nodeClaim.Spec.TerminationGracePeriod = &metav1.Duration{Duration: time.Second * 300} + nodeClaim.Annotations = map[string]string{ + v1.NodeClaimTerminationTimestampAnnotationKey: time.Now().Add(nodeClaim.Spec.TerminationGracePeriod.Duration).Format(time.RFC3339), + } + pod := test.Pod(test.PodOptions{ + NodeName: node.Name, + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + v1.DoNotDisruptAnnotationKey: "true", + }, + OwnerReferences: defaultOwnerRefs, + }, + TerminationGracePeriodSeconds: lo.ToPtr(int64(300)), + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, pod) + + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectSingletonReconciled(ctx, queue) + ExpectNodeExists(ctx, env.Client, node.Name) + ExpectDeleted(ctx, env.Client, pod) + }) + It("should only delete pods when their terminationGracePeriodSeconds is less than the the node's remaining terminationGracePeriod", func() { + nodeClaim.Spec.TerminationGracePeriod = &metav1.Duration{Duration: time.Second * 300} + nodeClaim.Annotations = map[string]string{ + v1.NodeClaimTerminationTimestampAnnotationKey: time.Now().Add(nodeClaim.Spec.TerminationGracePeriod.Duration).Format(time.RFC3339), + } + pod := test.Pod(test.PodOptions{ + NodeName: node.Name, + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + v1.DoNotDisruptAnnotationKey: "true", + }, + OwnerReferences: defaultOwnerRefs, + }, + TerminationGracePeriodSeconds: lo.ToPtr(int64(60)), + }) + fakeClock.SetTime(time.Now()) + ExpectApplied(ctx, env.Client, node, nodeClaim, pod) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + + // expect pod still exists + fakeClock.Step(90 * time.Second) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNodeWithNodeClaimDraining(env.Client, node.Name) + ExpectNodeExists(ctx, env.Client, node.Name) + ExpectPodExists(ctx, env.Client, pod.Name, pod.Namespace) + + // expect pod is now deleted + fakeClock.Step(175 * time.Second) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectSingletonReconciled(ctx, queue) + ExpectDeleted(ctx, env.Client, pod) + }) + Context("Metrics", func() { + It("should update the eviction queueDepth metric when reconciling pods", func() { + minAvailable := intstr.FromInt32(0) + labelSelector := map[string]string{test.RandomName(): test.RandomName()} + pdb := test.PodDisruptionBudget(test.PDBOptions{ + Labels: labelSelector, + // Don't let any pod evict + MinAvailable: &minAvailable, + }) + ExpectApplied(ctx, env.Client, pdb, node, nodeClaim) + 
pods := test.Pods(5, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: defaultOwnerRefs, + Labels: labelSelector, + }}) + ExpectApplied(ctx, env.Client, lo.Map(pods, func(p *corev1.Pod, _ int) client.Object { return p })...) + + wqDepthBefore, _ := FindMetricWithLabelValues("workqueue_adds_total", map[string]string{"name": "eviction.workqueue"}) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + node = ExpectNodeExists(ctx, env.Client, node.Name) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + wqDepthAfter, ok := FindMetricWithLabelValues("workqueue_adds_total", map[string]string{"name": "eviction.workqueue"}) + Expect(ok).To(BeTrue()) + Expect(lo.FromPtr(wqDepthAfter.GetCounter().Value) - lo.FromPtr(wqDepthBefore.GetCounter().Value)).To(BeNumerically("==", 5)) + }) + }) +}) + +func ExpectNodeWithNodeClaimDraining(c client.Client, nodeName string) *corev1.Node { + GinkgoHelper() + node := ExpectNodeExists(ctx, c, nodeName) + Expect(node.Spec.Taints).To(ContainElement(v1.DisruptedNoScheduleTaint)) + Expect(node.Finalizers).To(ContainElements(v1.TerminationFinalizer, v1.DrainFinalizer)) + Expect(node.DeletionTimestamp).ToNot(BeNil()) + return node +} diff --git a/pkg/controllers/node/termination/instancetermination.go b/pkg/controllers/node/termination/instancetermination.go deleted file mode 100644 index 5ea7086e43..0000000000 --- a/pkg/controllers/node/termination/instancetermination.go +++ /dev/null @@ -1,71 +0,0 @@ -/* -Copyright The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package termination - -import ( - "context" - "fmt" - "time" - - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/utils/clock" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/reconcile" - - v1 "sigs.k8s.io/karpenter/pkg/apis/v1" - "sigs.k8s.io/karpenter/pkg/cloudprovider" - nodeclaimutils "sigs.k8s.io/karpenter/pkg/utils/nodeclaim" - "sigs.k8s.io/karpenter/pkg/utils/termination" -) - -type InstanceTerminationReconciler struct { - clk clock.Clock - kubeClient client.Client - cloudProvider cloudprovider.CloudProvider -} - -func (i *InstanceTerminationReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { - elapsed, err := nodeclaimutils.HasTerminationGracePeriodElapsed(i.clk, nc) - if err != nil { - log.FromContext(ctx).Error(err, "failed to terminate node") - return reconcile.Result{}, nil - } - if !nc.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue() && !elapsed { - return reconcile.Result{}, nil - } - isInstanceTerminated, err := termination.EnsureTerminated(ctx, i.kubeClient, nc, i.cloudProvider) - if err != nil { - // 404 = the nodeClaim no longer exists - if errors.IsNotFound(err) { - return reconcile.Result{}, nil - } - // 409 - The nodeClaim exists, but its status has already been modified - if errors.IsConflict(err) { - return reconcile.Result{Requeue: true}, nil - } - return reconcile.Result{}, fmt.Errorf("ensuring instance termination, %w", err) - } - if !isInstanceTerminated { - return reconcile.Result{RequeueAfter: 5 * time.Second}, nil - } - if err := removeFinalizer(ctx, i.kubeClient, n); err != nil { - return reconcile.Result{}, err - } - return reconcile.Result{}, nil -} diff --git a/pkg/controllers/node/termination/instancetermination/controller.go b/pkg/controllers/node/termination/instancetermination/controller.go new file mode 100644 index 0000000000..756cb605f9 --- /dev/null +++ b/pkg/controllers/node/termination/instancetermination/controller.go @@ -0,0 +1,140 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package instancetermination + +import ( + "context" + "fmt" + "time" + + "golang.org/x/time/rate" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/client-go/util/workqueue" + "k8s.io/utils/clock" + controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + terminationreconcile "sigs.k8s.io/karpenter/pkg/controllers/node/termination/reconcile" + "sigs.k8s.io/karpenter/pkg/metrics" + nodeutils "sigs.k8s.io/karpenter/pkg/utils/node" + + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/cloudprovider" + terminationutils "sigs.k8s.io/karpenter/pkg/utils/termination" +) + +type Controller struct { + clk clock.Clock + kubeClient client.Client + cloudProvider cloudprovider.CloudProvider +} + +func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider cloudprovider.CloudProvider) *Controller { + return &Controller{ + clk: clk, + kubeClient: kubeClient, + cloudProvider: cloudProvider, + } +} + +func (*Controller) Name() string { + return "node.instancetermination" +} + +func (c *Controller) Register(_ context.Context, m manager.Manager) error { + return controllerruntime.NewControllerManagedBy(m). + Named(c.Name()). + For(&corev1.Node{}, builder.WithPredicates(nodeutils.IsManagedPredicateFuncs(c.cloudProvider))). + Watches(&v1.NodeClaim{}, nodeutils.NodeClaimEventHandler(c.kubeClient, c.cloudProvider)). + WithOptions( + controller.Options{ + RateLimiter: workqueue.NewTypedMaxOfRateLimiter[reconcile.Request]( + workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](100*time.Millisecond, 10*time.Second), + // 10 qps, 100 bucket size + &workqueue.TypedBucketRateLimiter[reconcile.Request]{Limiter: rate.NewLimiter(rate.Limit(10), 100)}, + ), + MaxConcurrentReconciles: 100, + }, + ). 
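+		// The shared termination reconcile adapter only calls this controller's Reconcile once the Node is deleting,
+		// is managed by this Karpenter instance, still carries the termination finalizer, and no longer has any of the
+		// finalizers returned by AwaitFinalizers (drain and volume detachment); it also resolves the Node's NodeClaim first.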
+ Complete(terminationreconcile.AsReconciler(m.GetClient(), c.cloudProvider, c.clk, c)) +} + +func (*Controller) AwaitFinalizers() []string { + return []string{v1.DrainFinalizer, v1.VolumeFinalizer} +} + +func (*Controller) Finalizer() string { + return v1.TerminationFinalizer +} + +func (*Controller) NodeClaimNotFoundPolicy() terminationreconcile.Policy { + return terminationreconcile.PolicyContinue +} + +func (*Controller) TerminationGracePeriodPolicy() terminationreconcile.Policy { + return terminationreconcile.PolicyContinue +} + +func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { + isInstanceTerminated, err := terminationutils.EnsureTerminated(ctx, c.kubeClient, nc, c.cloudProvider) + if err != nil { + // 404 = the nodeClaim no longer exists + if errors.IsNotFound(err) { + return reconcile.Result{}, nil + } + // 409 - The nodeClaim exists, but its status has already been modified + if errors.IsConflict(err) { + return reconcile.Result{Requeue: true}, nil + } + return reconcile.Result{}, fmt.Errorf("ensuring instance termination, %w", err) + } + if !isInstanceTerminated { + return reconcile.Result{RequeueAfter: 5 * time.Second}, nil + } + + stored := n.DeepCopy() + controllerutil.RemoveFinalizer(n, c.Finalizer()) + // We use client.StrategicMergeFrom here since the node object supports it and + // a strategic merge patch represents the finalizer list as a keyed "set" so removing + // an item from the list doesn't replace the full list + // https://github.com/kubernetes/kubernetes/issues/111643#issuecomment-2016489732 + if err := c.kubeClient.Patch(ctx, n, client.StrategicMergeFrom(stored)); err != nil { + if errors.IsNotFound(err) { + return reconcile.Result{}, nil + } + return reconcile.Result{}, fmt.Errorf("removing %q finalizer, %w", c.Finalizer(), err) + } + metrics.NodesTerminatedTotal.Inc(map[string]string{ + metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], + }) + // We use stored.DeletionTimestamp since the api-server may give back a node after the patch without a deletionTimestamp + DurationSeconds.Observe(time.Since(stored.DeletionTimestamp.Time).Seconds(), map[string]string{ + metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], + }) + NodeLifetimeDurationSeconds.Observe(time.Since(n.CreationTimestamp.Time).Seconds(), map[string]string{ + metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], + }) + log.FromContext(ctx).Info("deleted node") + return reconcile.Result{}, nil +} diff --git a/pkg/controllers/node/termination/metrics.go b/pkg/controllers/node/termination/instancetermination/metrics.go similarity index 87% rename from pkg/controllers/node/termination/metrics.go rename to pkg/controllers/node/termination/instancetermination/metrics.go index 32d829c6d4..eae7e85ea7 100644 --- a/pkg/controllers/node/termination/metrics.go +++ b/pkg/controllers/node/termination/instancetermination/metrics.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package termination +package instancetermination import ( "time" @@ -40,16 +40,6 @@ var ( }, []string{metrics.NodePoolLabel}, ) - NodesDrainedTotal = opmetrics.NewPrometheusCounter( - crmetrics.Registry, - prometheus.CounterOpts{ - Namespace: metrics.Namespace, - Subsystem: metrics.NodeSubsystem, - Name: "drained_total", - Help: "The total number of nodes drained by Karpenter", - }, - []string{metrics.NodePoolLabel}, - ) NodeLifetimeDurationSeconds = opmetrics.NewPrometheusHistogram( crmetrics.Registry, prometheus.HistogramOpts{ diff --git a/pkg/controllers/node/termination/instancetermination/suite_test.go b/pkg/controllers/node/termination/instancetermination/suite_test.go new file mode 100644 index 0000000000..aa11218aca --- /dev/null +++ b/pkg/controllers/node/termination/instancetermination/suite_test.go @@ -0,0 +1,188 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package instancetermination_test + +import ( + "context" + "testing" + "time" + + clock "k8s.io/utils/clock/testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "sigs.k8s.io/karpenter/pkg/apis" + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/instancetermination" + terminationreconcile "sigs.k8s.io/karpenter/pkg/controllers/node/termination/reconcile" + "sigs.k8s.io/karpenter/pkg/metrics" + "sigs.k8s.io/karpenter/pkg/test" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/samber/lo" + + . "sigs.k8s.io/karpenter/pkg/test/expectations" + "sigs.k8s.io/karpenter/pkg/test/v1alpha1" + . 
"sigs.k8s.io/karpenter/pkg/utils/testing" +) + +var ctx context.Context +var env *test.Environment +var fakeClock *clock.FakeClock +var cloudProvider *fake.CloudProvider +var recorder *test.EventRecorder + +var reconciler reconcile.Reconciler + +func TestAPIs(t *testing.T) { + ctx = TestContextWithLogger(t) + RegisterFailHandler(Fail) + RunSpecs(t, "InstanceTermination") +} + +var _ = BeforeSuite(func() { + fakeClock = clock.NewFakeClock(time.Now()) + env = test.NewEnvironment( + test.WithCRDs(apis.CRDs...), + test.WithCRDs(v1alpha1.CRDs...), + test.WithFieldIndexers(test.NodeClaimProviderIDFieldIndexer(ctx), test.VolumeAttachmentFieldIndexer(ctx)), + ) + cloudProvider = fake.NewCloudProvider() + recorder = test.NewEventRecorder() + reconciler = terminationreconcile.AsReconciler(env.Client, cloudProvider, fakeClock, instancetermination.NewController(fakeClock, env.Client, cloudProvider)) +}) + +var _ = AfterSuite(func() { + Expect(env.Stop()).To(Succeed(), "Failed to stop environment") +}) + +var _ = Describe("InstanceTermination", func() { + var node *corev1.Node + var nodeClaim *v1.NodeClaim + + BeforeEach(func() { + fakeClock.SetTime(time.Now()) + cloudProvider.Reset() + nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{ObjectMeta: metav1.ObjectMeta{Finalizers: []string{ + v1.TerminationFinalizer, + }}}) + cloudProvider.CreatedNodeClaims[node.Spec.ProviderID] = nodeClaim + }) + AfterEach(func() { + metrics.NodesTerminatedTotal.Reset() + instancetermination.DurationSeconds.Reset() + instancetermination.NodeLifetimeDurationSeconds.Reset() + }) + + It("should terminate the instance", func() { + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsUnknown()).To(BeTrue()) + Expect(len(cloudProvider.DeleteCalls)).To(Equal(0)) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue()) + Expect(len(cloudProvider.DeleteCalls)).To(Equal(1)) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + Expect(len(cloudProvider.DeleteCalls)).To(Equal(1)) + ExpectNotFound(ctx, env.Client, node) + }) + DescribeTable( + "should ignore nodes with dependent finalizers", + func(finalizer string) { + node.Finalizers = append(nodeClaim.Finalizers, finalizer) + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsUnknown()).To(BeTrue()) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsUnknown()).To(BeTrue()) + + node = ExpectExists(ctx, env.Client, node) + stored := node.DeepCopy() + node.Finalizers = lo.Reject(node.Finalizers, func(f string, _ int) bool { + return f == finalizer + }) + Expect(env.Client.Patch(ctx, node, client.StrategicMergeFrom(stored))).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + 
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue()) + + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNotFound(ctx, env.Client, node) + }, + Entry("VolumeFinalizer", v1.VolumeFinalizer), + Entry("DrainFinalizer", v1.DrainFinalizer), + ) + It("should finalize node when the instance has already been terminated", func() { + delete(cloudProvider.CreatedNodeClaims, node.Spec.ProviderID) + ExpectApplied(ctx, env.Client, node, nodeClaim) + ExpectMakeNodesNotReady(ctx, env.Client, node) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectExists(ctx, env.Client, node) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectNotFound(ctx, env.Client, node) + }) + Context("Metrics", func() { + It("should fire the terminationSummary metric when deleting nodes", func() { + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + node = ExpectNodeExists(ctx, env.Client, node.Name) + // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + + m, ok := FindMetricWithLabelValues("karpenter_nodes_termination_duration_seconds", map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]}) + Expect(ok).To(BeTrue()) + Expect(m.GetSummary().GetSampleCount()).To(BeNumerically("==", 1)) + }) + It("should fire the nodesTerminated counter metric when deleting nodes", func() { + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + node = ExpectNodeExists(ctx, env.Client, node.Name) + // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + + m, ok := FindMetricWithLabelValues("karpenter_nodes_terminated_total", map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]}) + Expect(ok).To(BeTrue()) + Expect(lo.FromPtr(m.GetCounter().Value)).To(BeNumerically("==", 1)) + }) + It("should fire the lifetime duration histogram metric when deleting nodes", func() { + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + node = ExpectNodeExists(ctx, env.Client, node.Name) + // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + + m, ok := FindMetricWithLabelValues("karpenter_nodes_lifetime_duration_seconds", map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]}) + Expect(ok).To(BeTrue()) + Expect(lo.FromPtr(m.GetHistogram().SampleCount)).To(BeNumerically("==", 1)) + }) + }) +}) diff --git a/pkg/controllers/node/termination/reconcile/reconcile.go b/pkg/controllers/node/termination/reconcile/reconcile.go new file mode 100644 index 0000000000..265e7ab1d4 --- /dev/null +++ b/pkg/controllers/node/termination/reconcile/reconcile.go @@ -0,0 +1,161 @@ +/* +Copyright The Kubernetes Authors. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package reconcile
+
+import (
+	"context"
+	"fmt"
+
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/klog/v2"
+	"k8s.io/utils/clock"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"
+
+	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
+	"sigs.k8s.io/karpenter/pkg/cloudprovider"
+	"sigs.k8s.io/karpenter/pkg/operator/injection"
+	nodeutils "sigs.k8s.io/karpenter/pkg/utils/node"
+	nodeclaimutils "sigs.k8s.io/karpenter/pkg/utils/nodeclaim"
+)
+
+func AsReconciler(c client.Client, cp cloudprovider.CloudProvider, clk clock.Clock, r Reconciler) reconcile.Reconciler {
+	return reconcile.AsReconciler(c, &reconcilerAdapter{
+		Reconciler: r,
+		kubeClient: c,
+		cloudProvider: cp,
+		clk: clk,
+	})
+}
+
+type Reconciler interface {
+	AwaitFinalizers() []string
+	Finalizer() string
+
+	Name() string
+	Reconcile(context.Context, *corev1.Node, *v1.NodeClaim) (reconcile.Result, error)
+
+	NodeClaimNotFoundPolicy() Policy
+	TerminationGracePeriodPolicy() Policy
+}
+
+type Policy int
+
+const (
+	// If the condition associated with this policy is true, the reconciler's associated finalizer will be removed and
+	// its Reconcile method will not be called.
+	PolicyFinalize Policy = iota
+	// If the condition associated with this policy is true, the node will be reconciled normally.
+	PolicyContinue
+)
+
+type reconcilerAdapter struct {
+	Reconciler
+
+	kubeClient client.Client
+	cloudProvider cloudprovider.CloudProvider
+	clk clock.Clock
+}
+
+//nolint:gocyclo
+func (r *reconcilerAdapter) Reconcile(ctx context.Context, n *corev1.Node) (reconcile.Result, error) {
+	ctx = injection.WithControllerName(ctx, r.Name())
+	ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef(n.Namespace, n.Name)))
+
+	if n.GetDeletionTimestamp().IsZero() {
+		return reconcile.Result{}, nil
+	}
+	if !controllerutil.ContainsFinalizer(n, r.Finalizer()) {
+		return reconcile.Result{}, nil
+	}
+	if !nodeutils.IsManaged(n, r.cloudProvider) {
+		return reconcile.Result{}, nil
+	}
+
+	for _, finalizer := range r.AwaitFinalizers() {
+		if ok, err := nodeutils.ContainsFinalizer(n, finalizer); err != nil {
+			// If any of the awaited finalizers require hydration, and the resources haven't been hydrated since the
+			// last Karpenter update, we should short-circuit. The resource should be re-reconciled once hydrated.
+			if nodeutils.IsHydrationError(err) {
+				return reconcile.Result{}, nil
+			}
+			return reconcile.Result{}, err
+		} else if ok {
+			return reconcile.Result{}, nil
+		}
+	}
+
+	nc, err := nodeutils.NodeClaimForNode(ctx, r.kubeClient, n)
+	if err != nil {
+		if nodeutils.IsDuplicateNodeClaimError(err) || nodeutils.IsNodeClaimNotFoundError(err) {
+			log.FromContext(ctx).Error(err, "failed to terminate node")
+			return reconcile.Result{}, nil
+		}
+		return reconcile.Result{}, err
+	}
+	ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KRef(nc.Namespace, nc.Name)))
+	if nc.DeletionTimestamp.IsZero() {
+		if err := r.kubeClient.Delete(ctx, nc); err != nil {
+			if errors.IsNotFound(err) {
+				log.FromContext(ctx).Error(err, "failed to terminate node")
+				return reconcile.Result{}, nil
+			}
+			return reconcile.Result{}, err
+		}
+	}
+
+	// If the underlying NodeClaim no longer exists, we want to finalize the node to avoid trying to gracefully drain
+	// nodes that are no longer alive. We do a check on the Ready condition of the node since, even
+	// though the CloudProvider says the instance is not around, we know that the kubelet process is still running
+	// if the Node Ready condition is true.
+	// Similar logic to: https://github.com/kubernetes/kubernetes/blob/3a75a8c8d9e6a1ebd98d8572132e675d4980f184/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go#L144
+	if r.NodeClaimNotFoundPolicy() == PolicyFinalize {
+		if nodeutils.GetCondition(n, corev1.NodeReady).Status != corev1.ConditionTrue {
+			if _, err := r.cloudProvider.Get(ctx, n.Spec.ProviderID); err != nil {
+				if cloudprovider.IsNodeClaimNotFoundError(err) {
+					return reconcile.Result{}, client.IgnoreNotFound(RemoveFinalizer(ctx, r.kubeClient, n, r.Finalizer()))
+				}
+				return reconcile.Result{}, fmt.Errorf("getting nodeclaim, %w", err)
+			}
+		}
+	}
+
+	if r.TerminationGracePeriodPolicy() == PolicyFinalize {
+		if elapsed, err := nodeclaimutils.HasTerminationGracePeriodElapsed(r.clk, nc); err != nil {
+			log.FromContext(ctx).Error(err, "failed to terminate node")
+			return reconcile.Result{}, nil
+		} else if elapsed {
+			return reconcile.Result{}, client.IgnoreNotFound(RemoveFinalizer(ctx, r.kubeClient, n, r.Finalizer()))
+		}
+	}
+
+	return r.Reconciler.Reconcile(ctx, n, nc)
+}
+
+func RemoveFinalizer(ctx context.Context, c client.Client, n *corev1.Node, finalizer string) error {
+	stored := n.DeepCopy()
+	if !controllerutil.RemoveFinalizer(n, finalizer) {
+		return nil
+	}
+	if err := c.Patch(ctx, n, client.StrategicMergeFrom(stored)); err != nil {
+		return err
+	}
+	return nil
+}
diff --git a/pkg/controllers/node/termination/reconcile/suite_test.go b/pkg/controllers/node/termination/reconcile/suite_test.go
new file mode 100644
index 0000000000..34b99a8d95
--- /dev/null
+++ b/pkg/controllers/node/termination/reconcile/suite_test.go
@@ -0,0 +1,292 @@
+/*
+Copyright The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package reconcile_test + +import ( + "context" + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" + + corev1 "k8s.io/api/core/v1" + clock "k8s.io/utils/clock/testing" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "sigs.k8s.io/karpenter/pkg/apis" + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" + terminationreconcile "sigs.k8s.io/karpenter/pkg/controllers/node/termination/reconcile" + "sigs.k8s.io/karpenter/pkg/test" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/samber/lo" + + . "sigs.k8s.io/karpenter/pkg/test/expectations" + "sigs.k8s.io/karpenter/pkg/test/v1alpha1" + . "sigs.k8s.io/karpenter/pkg/utils/testing" +) + +var ( + DependentFinalizers = sets.New(v1.DrainFinalizer, v1.VolumeFinalizer) + TestFinalizer = "karpenter.sh/test" +) + +var ctx context.Context +var env *test.Environment +var fakeClock *clock.FakeClock +var cloudProvider *fake.CloudProvider +var recorder *test.EventRecorder + +var testReconciler *TestReconciler +var reconciler reconcile.Reconciler + +func TestAPIs(t *testing.T) { + ctx = TestContextWithLogger(t) + RegisterFailHandler(Fail) + RunSpecs(t, "TerminationReconciler") +} + +var _ = BeforeSuite(func() { + fakeClock = clock.NewFakeClock(time.Now()) + env = test.NewEnvironment( + test.WithCRDs(apis.CRDs...), + test.WithCRDs(v1alpha1.CRDs...), + test.WithFieldIndexers(test.NodeClaimProviderIDFieldIndexer(ctx), test.VolumeAttachmentFieldIndexer(ctx)), + ) + cloudProvider = fake.NewCloudProvider() + recorder = test.NewEventRecorder() + testReconciler = &TestReconciler{} + reconciler = terminationreconcile.AsReconciler(env.Client, cloudProvider, fakeClock, testReconciler) +}) + +var _ = AfterSuite(func() { + Expect(env.Stop()).To(Succeed(), "Failed to stop environment") +}) + +var _ = Describe("TerminationReconciler", func() { + var node *corev1.Node + var nodeClaim *v1.NodeClaim + + BeforeEach(func() { + nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Finalizers: append(DependentFinalizers.UnsortedList(), TestFinalizer), + }, + }) + testReconciler.Reset() + cloudProvider.CreatedNodeClaims[node.Spec.ProviderID] = nodeClaim + }) + + It("shouldn't reconcile against Nodes not managed by this instance of Karpenter", func() { + delete(node.Labels, v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())) + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeFalse()) + }) + It("shouldn't reconcile against Nodes without deletion timestamps set", func() { + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + ExpectApplied(ctx, env.Client, node, nodeClaim) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeTrue()) + node = 
ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeFalse()) + }) + It("shouldn't reconcile against Nodes without the reconciler's finalizer", func() { + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return finalizer == TestFinalizer + }) + ExpectApplied(ctx, env.Client, node, nodeClaim) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(testReconciler.HasReconciled).To(BeFalse()) + }) + It("shouldn't reconcile if dependent finalizers are present", func() { + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeFalse()) + }) + It("shouldn't reconcile if dependent finalizers aren't present, but the Node hasn't been hydrated", func() { + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + node.Annotations[v1.HydrationAnnotationKey] = "not-hydrated" + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeFalse()) + }) + It("shouldn't reconcile against a Node with a duplicate NodeClaim", func() { + nodeClaim2 := nodeClaim.DeepCopy() + nodeClaim.Name = "duplicate-nodeclaim" + + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + ExpectApplied(ctx, env.Client, node, nodeClaim, nodeClaim2) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + // We should still succeed to reconcile - we won't succeed on a subsequent reconciliation so there's no reason + // to go into error backoff + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeTrue()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeFalse()) + }) + It("shouldn't reconcile against a Node without a NodeClaim", func() { + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + ExpectApplied(ctx, env.Client, node) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeFalse()) + }) + It("should delete the NodeClaim for a Node", func() { + node.Finalizers = 
lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeFalse()) + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeTrue()) + }) + DescribeTable( + "should respect the reconciler's NodeClaimNotFoundPolicy", + func(policy terminationreconcile.Policy) { + testReconciler.NodeClaimNotFoundPolicyResult = policy + delete(cloudProvider.CreatedNodeClaims, node.Spec.ProviderID) + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + ExpectApplied(ctx, env.Client, nodeClaim, node) + ExpectMakeNodesNotReady(ctx, env.Client, node) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeFalse()) + switch policy { + case terminationreconcile.PolicyContinue: + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeTrue()) + case terminationreconcile.PolicyFinalize: + ExpectNotFound(ctx, env.Client, node) + Expect(testReconciler.HasReconciled).To(BeFalse()) + } + }, + Entry("Finalize", terminationreconcile.PolicyFinalize), + Entry("Continue", terminationreconcile.PolicyContinue), + ) + DescribeTable( + "should respect the reconciler's TerminationGracePeriodPolicy", + func(policy terminationreconcile.Policy) { + testReconciler.TerminationGracePeriodPolicyResult = policy + node.Finalizers = lo.Reject(node.Finalizers, func(finalizer string, _ int) bool { + return DependentFinalizers.Has(finalizer) + }) + nodeClaim.Annotations[v1.NodeClaimTerminationTimestampAnnotationKey] = fakeClock.Now().Add(-time.Minute).Format(time.RFC3339) + ExpectApplied(ctx, env.Client, node, nodeClaim) + Expect(env.Client.Delete(ctx, node)).To(Succeed()) + ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.DeletionTimestamp.IsZero()).To(BeFalse()) + switch policy { + case terminationreconcile.PolicyContinue: + node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).To(ContainElement(TestFinalizer)) + Expect(testReconciler.HasReconciled).To(BeTrue()) + case terminationreconcile.PolicyFinalize: + ExpectNotFound(ctx, env.Client, node) + Expect(testReconciler.HasReconciled).To(BeFalse()) + } + }, + Entry("Finalize", terminationreconcile.PolicyFinalize), + Entry("Continue", terminationreconcile.PolicyContinue), + ) +}) + +type TestReconciler struct { + NodeClaimNotFoundPolicyResult terminationreconcile.Policy + TerminationGracePeriodPolicyResult terminationreconcile.Policy + HasReconciled bool +} + +func (r *TestReconciler) Reset() { + *r = TestReconciler{ + NodeClaimNotFoundPolicyResult: terminationreconcile.PolicyContinue, + TerminationGracePeriodPolicyResult: terminationreconcile.PolicyContinue, + } +} + +func (r *TestReconciler) Name() string { + return "test.reconciler" +} + +func (r *TestReconciler) AwaitFinalizers() []string { + return 
DependentFinalizers.UnsortedList() +} + +func (r *TestReconciler) Finalizer() string { + return TestFinalizer +} + +func (r *TestReconciler) NodeClaimNotFoundPolicy() terminationreconcile.Policy { + return r.NodeClaimNotFoundPolicyResult +} + +func (r *TestReconciler) TerminationGracePeriodPolicy() terminationreconcile.Policy { + return r.TerminationGracePeriodPolicyResult +} + +func (r *TestReconciler) Reconcile(_ context.Context, _ *corev1.Node, _ *v1.NodeClaim) (reconcile.Result, error) { + r.HasReconciled = true + return reconcile.Result{}, nil +} diff --git a/pkg/controllers/node/termination/suite_test.go b/pkg/controllers/node/termination/suite_test.go deleted file mode 100644 index deb6a5c629..0000000000 --- a/pkg/controllers/node/termination/suite_test.go +++ /dev/null @@ -1,930 +0,0 @@ -/* -Copyright The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package termination_test - -import ( - "context" - "testing" - "time" - - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" - "github.com/samber/lo" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/intstr" - clock "k8s.io/utils/clock/testing" - "sigs.k8s.io/controller-runtime/pkg/client" - - "sigs.k8s.io/karpenter/pkg/apis" - v1 "sigs.k8s.io/karpenter/pkg/apis/v1" - "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" - "sigs.k8s.io/karpenter/pkg/metrics" - "sigs.k8s.io/karpenter/pkg/test" - . "sigs.k8s.io/karpenter/pkg/test/expectations" - "sigs.k8s.io/karpenter/pkg/test/v1alpha1" - . 
"sigs.k8s.io/karpenter/pkg/utils/testing" -) - -var ctx context.Context -var terminationController *termination.Controller -var env *test.Environment -var defaultOwnerRefs = []metav1.OwnerReference{{Kind: "ReplicaSet", APIVersion: "appsv1", Name: "rs", UID: "1234567890"}} -var fakeClock *clock.FakeClock -var cloudProvider *fake.CloudProvider -var recorder *test.EventRecorder -var queue *eviction.Queue - -func TestAPIs(t *testing.T) { - ctx = TestContextWithLogger(t) - RegisterFailHandler(Fail) - RunSpecs(t, "Termination") -} - -var _ = BeforeSuite(func() { - fakeClock = clock.NewFakeClock(time.Now()) - env = test.NewEnvironment( - test.WithCRDs(apis.CRDs...), - test.WithCRDs(v1alpha1.CRDs...), - test.WithFieldIndexers(test.NodeClaimProviderIDFieldIndexer(ctx), test.VolumeAttachmentFieldIndexer(ctx)), - ) - - cloudProvider = fake.NewCloudProvider() - recorder = test.NewEventRecorder() - queue = eviction.NewTestingQueue(env.Client, recorder) - terminationController = termination.NewController(fakeClock, env.Client, cloudProvider, recorder, queue) -}) - -var _ = AfterSuite(func() { - Expect(env.Stop()).To(Succeed(), "Failed to stop environment") -}) - -var _ = Describe("Termination", func() { - var node *corev1.Node - var nodeClaim *v1.NodeClaim - var nodePool *v1.NodePool - - BeforeEach(func() { - fakeClock.SetTime(time.Now()) - cloudProvider.Reset() - *queue = lo.FromPtr(eviction.NewTestingQueue(env.Client, recorder)) - - nodePool = test.NodePool() - nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{ObjectMeta: metav1.ObjectMeta{Finalizers: []string{v1.TerminationFinalizer}}}) - node.Labels[v1.NodePoolLabelKey] = test.NodePool().Name - cloudProvider.CreatedNodeClaims[node.Spec.ProviderID] = nodeClaim - }) - - AfterEach(func() { - ExpectCleanedUp(ctx, env.Client) - - // Reset the metrics collectors - metrics.NodesTerminatedTotal.Reset() - termination.DurationSeconds.Reset() - termination.NodeLifetimeDurationSeconds.Reset() - termination.NodesDrainedTotal.Reset() - }) - - Context("Reconciliation", func() { - It("should delete nodes", func() { - ExpectApplied(ctx, env.Client, node, nodeClaim) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsUnknown()).To(BeTrue()) - - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should ignore nodes not managed by this Karpenter instance", func() { - delete(node.Labels, "karpenter.test.sh/testnodeclass") - node.Labels = lo.Assign(node.Labels, map[string]string{"karpenter.test.sh/unmanagednodeclass": "default"}) - ExpectApplied(ctx, env.Client, node, nodeClaim) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsUnknown()).To(BeTrue()) - - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectExists(ctx, env.Client, node) - - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsUnknown()).To(BeTrue()) - }) - It("should delete nodeclaims associated with nodes", func() { - ExpectApplied(ctx, env.Client, node, nodeClaim) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - - // The first reconciliation should trigger the Delete, and set the terminating status condition - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - nc := ExpectExists(ctx, env.Client, nodeClaim) - Expect(nc.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue()) - ExpectNodeExists(ctx, env.Client, node.Name) - - // The second reconciliation should call get, see the "instance" is terminated, and remove the node. - // We should have deleted the NodeClaim from the node termination controller, so removing it's finalizer should result in it being removed. 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectFinalizersRemoved(ctx, env.Client, nodeClaim) - ExpectNotFound(ctx, env.Client, node, nodeClaim) - }) - It("should exclude nodes from load balancers when terminating", func() { - labels := map[string]string{"foo": "bar"} - pod := test.Pod(test.PodOptions{ - ObjectMeta: metav1.ObjectMeta{ - OwnerReferences: defaultOwnerRefs, - Labels: labels, - }, - }) - // Create a fully blocking PDB to prevent the node from being deleted before we can observe its labels - pdb := test.PodDisruptionBudget(test.PDBOptions{ - Labels: labels, - MaxUnavailable: lo.ToPtr(intstr.FromInt(0)), - }) - - ExpectApplied(ctx, env.Client, node, nodeClaim, pod, pdb) - ExpectManualBinding(ctx, env.Client, pod, node) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - node = ExpectNodeExists(ctx, env.Client, node.Name) - Expect(node.Labels[corev1.LabelNodeExcludeBalancers]).Should(Equal("karpenter")) - }) - It("should not evict pods that tolerate karpenter disruption taint with equal operator", func() { - podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - podSkip := test.Pod(test.PodOptions{ - NodeName: node.Name, - Tolerations: []corev1.Toleration{{Key: v1.DisruptedTaintKey, Operator: corev1.TolerationOpEqual, Effect: v1.DisruptedNoScheduleTaint.Effect, Value: v1.DisruptedNoScheduleTaint.Value}}, - ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, - }) - ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podSkip) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue()) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - Expect(queue.Has(podSkip)).To(BeFalse()) - ExpectSingletonReconciled(ctx, queue) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue()) - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Expect podEvict to be evicting, and delete it - EventuallyExpectTerminating(ctx, env.Client, podEvict) - ExpectDeleted(ctx, env.Client, podEvict) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should not evict pods that tolerate karpenter disruption taint with exists operator", func() { - podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - podSkip := test.Pod(test.PodOptions{ - NodeName: node.Name, - Tolerations: []corev1.Toleration{{Key: v1.DisruptedTaintKey, Operator: corev1.TolerationOpExists, Effect: v1.DisruptedNoScheduleTaint.Effect}}, - ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, - }) - ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podSkip) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - Expect(queue.Has(podSkip)).To(BeFalse()) - ExpectSingletonReconciled(ctx, queue) - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Expect podEvict to be evicting, and delete it - EventuallyExpectTerminating(ctx, env.Client, podEvict) - ExpectDeleted(ctx, env.Client, podEvict) - - Expect(queue.Has(podSkip)).To(BeFalse()) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should evict pods that tolerate the node.kubernetes.io/unschedulable taint", func() { - podEvict := test.Pod(test.PodOptions{ - NodeName: node.Name, - Tolerations: []corev1.Toleration{{Key: corev1.TaintNodeUnschedulable, Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule}}, - ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, - }) - ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Expect podEvict to be evicting, and delete it - EventuallyExpectTerminating(ctx, env.Client, podEvict) - ExpectDeleted(ctx, env.Client, podEvict) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should delete nodes that have pods without an ownerRef", func() { - pod := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - OwnerReferences: nil, - }, - }) - - ExpectApplied(ctx, env.Client, node, nodeClaim, pod) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - - // Expect pod with no owner ref to be enqueued for eviction - EventuallyExpectTerminating(ctx, env.Client, pod) - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Delete no owner refs pod to simulate successful eviction - ExpectDeleted(ctx, env.Client, pod) - - // Reconcile node to evict pod and delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should delete nodes with terminal pods", func() { - podEvictPhaseSucceeded := test.Pod(test.PodOptions{ - NodeName: node.Name, - Phase: corev1.PodSucceeded, - }) - podEvictPhaseFailed := test.Pod(test.PodOptions{ - NodeName: node.Name, - Phase: corev1.PodFailed, - }) - - ExpectApplied(ctx, env.Client, node, nodeClaim, podEvictPhaseSucceeded, podEvictPhaseFailed) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Trigger Termination Controller, which should ignore these pods and delete the node - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should fail to evict pods that violate a PDB", func() { - minAvailable := intstr.FromInt32(1) - labelSelector := map[string]string{test.RandomName(): test.RandomName()} - pdb := test.PodDisruptionBudget(test.PDBOptions{ - Labels: labelSelector, - // Don't let any pod evict - MinAvailable: &minAvailable, - }) - podNoEvict := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - Labels: labelSelector, - OwnerReferences: defaultOwnerRefs, - }, - Phase: corev1.PodRunning, - }) - - ExpectApplied(ctx, env.Client, node, nodeClaim, podNoEvict, pdb) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Expect podNoEvict to be added to the queue - Expect(queue.Has(podNoEvict)).To(BeTrue()) - - // Attempt to evict the pod, but fail to do so - ExpectSingletonReconciled(ctx, queue) - - // Expect podNoEvict to fail eviction due to PDB, and be retried - Expect(queue.Has(podNoEvict)).To(BeTrue()) - - // Delete pod to simulate successful eviction - ExpectDeleted(ctx, env.Client, podNoEvict) - ExpectNotFound(ctx, env.Client, podNoEvict) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should evict pods in order and wait until pods are fully deleted", func() { - daemonEvict := test.DaemonSet() - daemonNodeCritical := test.DaemonSet(test.DaemonSetOptions{PodOptions: test.PodOptions{PriorityClassName: "system-node-critical"}}) - daemonClusterCritical := test.DaemonSet(test.DaemonSetOptions{PodOptions: test.PodOptions{PriorityClassName: "system-cluster-critical"}}) - ExpectApplied(ctx, env.Client, daemonEvict, daemonNodeCritical, daemonClusterCritical) - - podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - podDaemonEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: []metav1.OwnerReference{{ - APIVersion: "apps/v1", - Kind: "DaemonSet", - Name: daemonEvict.Name, - UID: daemonEvict.UID, - Controller: lo.ToPtr(true), - BlockOwnerDeletion: lo.ToPtr(true), - }}}}) - podNodeCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-node-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - podClusterCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-cluster-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - podDaemonNodeCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-node-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: []metav1.OwnerReference{{ - APIVersion: "apps/v1", - Kind: "DaemonSet", - Name: daemonNodeCritical.Name, - UID: daemonNodeCritical.UID, - Controller: lo.ToPtr(true), - 
BlockOwnerDeletion: lo.ToPtr(true), - }}}}) - podDaemonClusterCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-cluster-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: []metav1.OwnerReference{{ - APIVersion: "apps/v1", - Kind: "DaemonSet", - Name: daemonClusterCritical.Name, - UID: daemonClusterCritical.UID, - Controller: lo.ToPtr(true), - BlockOwnerDeletion: lo.ToPtr(true), - }}}}) - - ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podNodeCritical, podClusterCritical, podDaemonEvict, podDaemonNodeCritical, podDaemonClusterCritical) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - - podGroups := [][]*corev1.Pod{{podEvict}, {podDaemonEvict}, {podNodeCritical, podClusterCritical}, {podDaemonNodeCritical, podDaemonClusterCritical}} - for i, podGroup := range podGroups { - node = ExpectNodeExists(ctx, env.Client, node.Name) - for _, p := range podGroup { - ExpectPodExists(ctx, env.Client, p.Name, p.Namespace) - } - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - for range podGroup { - ExpectSingletonReconciled(ctx, queue) - } - // Start draining the pod group, but don't complete it yet - EventuallyExpectTerminating(ctx, env.Client, lo.Map(podGroup, func(p *corev1.Pod, _ int) client.Object { return p })...) - - // Look at the next pod group and ensure that none of the pods have started terminating on it - if i != len(podGroups)-1 { - for range podGroups[i+1] { - ExpectSingletonReconciled(ctx, queue) - } - ConsistentlyExpectNotTerminating(ctx, env.Client, lo.Map(podGroups[i+1], func(p *corev1.Pod, _ int) client.Object { return p })...) - } - // Expect that the pods are deleted -- which should unblock the next pod group - ExpectDeleted(ctx, env.Client, lo.Map(podGroup, func(p *corev1.Pod, _ int) client.Object { return p })...) - } - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should evict non-critical pods first", func() { - podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - podNodeCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-node-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - podClusterCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-cluster-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - - ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict, podNodeCritical, podClusterCritical) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Expect podEvict to be evicting, and delete it - EventuallyExpectTerminating(ctx, env.Client, podEvict) - ExpectDeleted(ctx, env.Client, podEvict) - - // Expect the critical pods to be evicted and deleted - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - ExpectSingletonReconciled(ctx, queue) - - EventuallyExpectTerminating(ctx, env.Client, podNodeCritical, podClusterCritical) - ExpectDeleted(ctx, env.Client, podNodeCritical) - ExpectDeleted(ctx, env.Client, podClusterCritical) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should not evict static pods", func() { - podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - ExpectApplied(ctx, env.Client, node, nodeClaim, podEvict) - - podNoEvict := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - OwnerReferences: []metav1.OwnerReference{{ - APIVersion: "v1", - Kind: "Node", - Name: node.Name, - UID: node.UID, - }}, - }, - }) - ExpectApplied(ctx, env.Client, podNoEvict) - - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - - // Expect mirror pod to not be queued for eviction - Expect(queue.Has(podNoEvict)).To(BeFalse()) - - // Expect podEvict to be enqueued for eviction then be successful - EventuallyExpectTerminating(ctx, env.Client, podEvict) - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Reconcile node to evict pod - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - - // Delete pod to simulate successful eviction - ExpectDeleted(ctx, env.Client, podEvict) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should not delete nodes until all pods are deleted", func() { - pods := test.Pods(2, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - ExpectApplied(ctx, env.Client, node, nodeClaim, pods[0], pods[1]) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - ExpectSingletonReconciled(ctx, queue) - - // Expect the pods to be evicted - EventuallyExpectTerminating(ctx, env.Client, pods[0], pods[1]) - - // Expect node to exist and be draining, but not deleted - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - ExpectDeleted(ctx, env.Client, pods[1]) - - // Expect node to exist and be draining, but not deleted - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - ExpectDeleted(ctx, env.Client, pods[0]) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should delete nodes with no underlying instance even if not fully drained", func() { - pods := test.Pods(2, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - ExpectApplied(ctx, env.Client, node, nodeClaim, pods[0], pods[1]) - - // Make Node NotReady since it's automatically marked as Ready on first deploy - ExpectMakeNodesNotReady(ctx, env.Client, node) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - ExpectSingletonReconciled(ctx, queue) - - // Expect the pods to be evicted - EventuallyExpectTerminating(ctx, env.Client, pods[0], pods[1]) - - // Expect node to exist and be draining, but not deleted - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // After this, the node still has one pod that is evicting. - ExpectDeleted(ctx, env.Client, pods[1]) - - // Remove the node from created nodeclaims so that the cloud provider returns DNE - cloudProvider.CreatedNodeClaims = map[string]*v1.NodeClaim{} - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should not delete nodes with no underlying instance if the node is still Ready", func() { - pods := test.Pods(2, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - ExpectApplied(ctx, env.Client, node, nodeClaim, pods[0], pods[1]) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - ExpectSingletonReconciled(ctx, queue) - - // Expect the pods to be evicted - EventuallyExpectTerminating(ctx, env.Client, pods[0], pods[1]) - - // Expect node to exist and be draining, but not deleted - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // After this, the node still has one pod that is evicting. - ExpectDeleted(ctx, env.Client, pods[1]) - - // Remove the node from created nodeclaims so that the cloud provider returns DNE - cloudProvider.CreatedNodeClaims = map[string]*v1.NodeClaim{} - - // Reconcile to try to delete the node, but don't succeed because the readiness condition - // of the node still won't let us delete it - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectExists(ctx, env.Client, node) - }) - It("should wait for pods to terminate", func() { - pod := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}) - fakeClock.SetTime(time.Now()) // make our fake clock match the pod creation time - ExpectApplied(ctx, env.Client, node, nodeClaim, pod) - - // Before grace period, node should not delete - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - ExpectNodeExists(ctx, env.Client, node.Name) - EventuallyExpectTerminating(ctx, env.Client, pod) - - // After grace period, node should delete. The deletion timestamps are from etcd which we can't control, so - // to eliminate test-flakiness we reset the time to current time + 90 seconds instead of just advancing - // the clock by 90 seconds. - fakeClock.SetTime(time.Now().Add(90 * time.Second)) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should not evict a new pod with the same name using the old pod's eviction queue key", func() { - pod := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - OwnerReferences: defaultOwnerRefs, - }, - }) - ExpectApplied(ctx, env.Client, node, nodeClaim, pod) - - // Trigger Termination Controller - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - - // Don't trigger a call into the queue to make sure that we effectively aren't triggering eviction - // We'll use this to try to leave pods in the queue - - // Expect node to exist and be draining - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - - // Delete the pod directly to act like something else is doing the pod termination - ExpectDeleted(ctx, env.Client, pod) - - // Requeue the termination controller to completely delete the node - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - - // Expect that the old pod's key still exists in the queue - Expect(queue.Has(pod)).To(BeTrue()) - - // Re-create the pod and node, it should now have the same name, but a different UUID - node = test.Node(test.NodeOptions{ - ObjectMeta: metav1.ObjectMeta{ - Finalizers: []string{v1.TerminationFinalizer}, - }, - }) - pod = test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - OwnerReferences: defaultOwnerRefs, - }, - }) - ExpectApplied(ctx, env.Client, node, pod) - - // Trigger eviction queue with the pod key still in it - ExpectSingletonReconciled(ctx, queue) - - Consistently(func(g Gomega) { - g.Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(pod), pod)).To(Succeed()) - g.Expect(pod.DeletionTimestamp.IsZero()).To(BeTrue()) - }, ReconcilerPropagationTime, RequestInterval).Should(Succeed()) - }) - It("should preemptively delete pods to satisfy their terminationGracePeriodSeconds", func() { - nodeClaim.Spec.TerminationGracePeriod = &metav1.Duration{Duration: time.Second * 300} - 
nodeClaim.Annotations = map[string]string{ - v1.NodeClaimTerminationTimestampAnnotationKey: time.Now().Add(nodeClaim.Spec.TerminationGracePeriod.Duration).Format(time.RFC3339), - } - pod := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - v1.DoNotDisruptAnnotationKey: "true", - }, - OwnerReferences: defaultOwnerRefs, - }, - TerminationGracePeriodSeconds: lo.ToPtr(int64(300)), - }) - ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool, pod) - - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - ExpectNodeExists(ctx, env.Client, node.Name) - ExpectDeleted(ctx, env.Client, pod) - }) - It("should only delete pods when their terminationGracePeriodSeconds is less than the the node's remaining terminationGracePeriod", func() { - nodeClaim.Spec.TerminationGracePeriod = &metav1.Duration{Duration: time.Second * 300} - nodeClaim.Annotations = map[string]string{ - v1.NodeClaimTerminationTimestampAnnotationKey: time.Now().Add(nodeClaim.Spec.TerminationGracePeriod.Duration).Format(time.RFC3339), - } - pod := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - v1.DoNotDisruptAnnotationKey: "true", - }, - OwnerReferences: defaultOwnerRefs, - }, - TerminationGracePeriodSeconds: lo.ToPtr(int64(60)), - }) - fakeClock.SetTime(time.Now()) - ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool, pod) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - - // expect pod still exists - fakeClock.Step(90 * time.Second) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNodeWithNodeClaimDraining(env.Client, node.Name) - ExpectNodeExists(ctx, env.Client, node.Name) - ExpectPodExists(ctx, env.Client, pod.Name, pod.Namespace) - - // expect pod is now deleted - fakeClock.Step(175 * time.Second) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectSingletonReconciled(ctx, queue) - ExpectDeleted(ctx, env.Client, pod) - }) - Context("VolumeAttachments", func() { - It("should wait for volume attachments", func() { - va := test.VolumeAttachment(test.VolumeAttachmentOptions{ - NodeName: node.Name, - VolumeName: "foo", - }) - ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool, va) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectExists(ctx, env.Client, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue()) - - ExpectDeleted(ctx, env.Client, va) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should only wait for volume attachments associated with drainable pods", func() { - vaDrainable := test.VolumeAttachment(test.VolumeAttachmentOptions{ - NodeName: node.Name, - VolumeName: "foo", - }) - vaNonDrainable := 
test.VolumeAttachment(test.VolumeAttachmentOptions{ - NodeName: node.Name, - VolumeName: "bar", - }) - pvc := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{ - VolumeName: "bar", - }) - pod := test.Pod(test.PodOptions{ - ObjectMeta: metav1.ObjectMeta{ - OwnerReferences: defaultOwnerRefs, - }, - Tolerations: []corev1.Toleration{{ - Key: v1.DisruptedTaintKey, - Operator: corev1.TolerationOpExists, - }}, - PersistentVolumeClaims: []string{pvc.Name}, - }) - ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool, vaDrainable, vaNonDrainable, pod, pvc) - ExpectManualBinding(ctx, env.Client, pod, node) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectExists(ctx, env.Client, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue()) - - ExpectDeleted(ctx, env.Client, vaDrainable) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - It("should wait for volume attachments until the nodeclaim's termination grace period expires", func() { - va := test.VolumeAttachment(test.VolumeAttachmentOptions{ - NodeName: node.Name, - VolumeName: "foo", - }) - nodeClaim.Annotations = map[string]string{ - v1.NodeClaimTerminationTimestampAnnotationKey: fakeClock.Now().Add(time.Minute).Format(time.RFC3339), - } - ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool, va) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectExists(ctx, env.Client, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue()) - - fakeClock.Step(5 * time.Minute) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - // Since TGP expired before the VolumeAttachments were deleted, this status condition should have never transitioned - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue()) - - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectNotFound(ctx, env.Client, node) - }) - }) - }) - Context("Metrics", func() { - It("should fire the terminationSummary metric when deleting nodes", func() { - ExpectApplied(ctx, env.Client, node, nodeClaim) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). 
- ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - - m, ok := FindMetricWithLabelValues("karpenter_nodes_termination_duration_seconds", map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]}) - Expect(ok).To(BeTrue()) - Expect(m.GetSummary().GetSampleCount()).To(BeNumerically("==", 1)) - }) - It("should fire the nodesTerminated counter metric when deleting nodes", func() { - ExpectApplied(ctx, env.Client, node, nodeClaim) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectMetricCounterValue(termination.NodesDrainedTotal, 1, map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]}) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - - m, ok := FindMetricWithLabelValues("karpenter_nodes_terminated_total", map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]}) - Expect(ok).To(BeTrue()) - Expect(lo.FromPtr(m.GetCounter().Value)).To(BeNumerically("==", 1)) - }) - It("should fire the lifetime duration histogram metric when deleting nodes", func() { - ExpectApplied(ctx, env.Client, node, nodeClaim) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node). - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - - m, ok := FindMetricWithLabelValues("karpenter_nodes_lifetime_duration_seconds", map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]}) - Expect(ok).To(BeTrue()) - Expect(lo.FromPtr(m.GetHistogram().SampleCount)).To(BeNumerically("==", 1)) - }) - It("should update the eviction queueDepth metric when reconciling pods", func() { - minAvailable := intstr.FromInt32(0) - labelSelector := map[string]string{test.RandomName(): test.RandomName()} - pdb := test.PodDisruptionBudget(test.PDBOptions{ - Labels: labelSelector, - // Don't let any pod evict - MinAvailable: &minAvailable, - }) - ExpectApplied(ctx, env.Client, pdb, node, nodeClaim) - pods := test.Pods(5, test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{ - OwnerReferences: defaultOwnerRefs, - Labels: labelSelector, - }}) - ExpectApplied(ctx, env.Client, lo.Map(pods, func(p *corev1.Pod, _ int) client.Object { return p })...) 
- - wqDepthBefore, _ := FindMetricWithLabelValues("workqueue_adds_total", map[string]string{"name": "eviction.workqueue"}) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectObjectReconciled(ctx, env.Client, terminationController, node) - wqDepthAfter, ok := FindMetricWithLabelValues("workqueue_adds_total", map[string]string{"name": "eviction.workqueue"}) - Expect(ok).To(BeTrue()) - Expect(lo.FromPtr(wqDepthAfter.GetCounter().Value) - lo.FromPtr(wqDepthBefore.GetCounter().Value)).To(BeNumerically("==", 5)) - }) - }) -}) - -func ExpectNodeWithNodeClaimDraining(c client.Client, nodeName string) *corev1.Node { - GinkgoHelper() - node := ExpectNodeExists(ctx, c, nodeName) - Expect(node.Spec.Taints).To(ContainElement(v1.DisruptedNoScheduleTaint)) - Expect(lo.Contains(node.Finalizers, v1.TerminationFinalizer)).To(BeTrue()) - Expect(node.DeletionTimestamp).ToNot(BeNil()) - return node -} diff --git a/pkg/controllers/node/termination/volumedetachment.go b/pkg/controllers/node/termination/volumedetachment/controller.go similarity index 50% rename from pkg/controllers/node/termination/volumedetachment.go rename to pkg/controllers/node/termination/volumedetachment/controller.go index 9b445ee02a..87754767e8 100644 --- a/pkg/controllers/node/termination/volumedetachment.go +++ b/pkg/controllers/node/termination/volumedetachment/controller.go @@ -14,61 +14,102 @@ See the License for the specific language governing permissions and limitations under the License. */ -package termination +package volumedetachment import ( "context" "time" "github.com/samber/lo" + "golang.org/x/time/rate" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/client-go/util/workqueue" "k8s.io/utils/clock" + controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/cloudprovider" + terminationreconcile "sigs.k8s.io/karpenter/pkg/controllers/node/termination/reconcile" "sigs.k8s.io/karpenter/pkg/events" storagev1 "k8s.io/api/storage/v1" nodeutils "sigs.k8s.io/karpenter/pkg/utils/node" - "sigs.k8s.io/karpenter/pkg/utils/nodeclaim" podutils "sigs.k8s.io/karpenter/pkg/utils/pod" volumeutil "sigs.k8s.io/karpenter/pkg/utils/volume" ) -type VolumeDetachmentReconciler struct { - clk clock.Clock - kubeClient client.Client - recorder events.Recorder +type Controller struct { + clk clock.Clock + kubeClient client.Client + cloudProvider cloudprovider.CloudProvider + recorder events.Recorder } -// nolint:gocyclo -func (v *VolumeDetachmentReconciler) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { - if !nc.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue() { - return reconcile.Result{}, nil - } - if nc.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue() { - return reconcile.Result{}, nil - } - if elapsed, err := nodeclaim.HasTerminationGracePeriodElapsed(v.clk, nc); err != nil { - log.FromContext(ctx).Error(err, "failed to terminate node") - return reconcile.Result{}, nil - } else if elapsed { - return reconcile.Result{}, nil +func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider 
cloudprovider.CloudProvider, recorder events.Recorder) *Controller { + return &Controller{ + clk: clk, + kubeClient: kubeClient, + cloudProvider: cloudProvider, + recorder: recorder, } +} + +func (*Controller) Name() string { + return "node.volumedetachment" +} + +func (c *Controller) Register(_ context.Context, m manager.Manager) error { + return controllerruntime.NewControllerManagedBy(m). + Named(c.Name()). + For(&corev1.Node{}, builder.WithPredicates(nodeutils.IsManagedPredicateFuncs(c.cloudProvider))). + Watches(&v1.NodeClaim{}, nodeutils.NodeClaimEventHandler(c.kubeClient, c.cloudProvider)). + WithOptions( + controller.Options{ + RateLimiter: workqueue.NewTypedMaxOfRateLimiter[reconcile.Request]( + workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](100*time.Millisecond, 10*time.Second), + // 10 qps, 100 bucket size + &workqueue.TypedBucketRateLimiter[reconcile.Request]{Limiter: rate.NewLimiter(rate.Limit(10), 100)}, + ), + MaxConcurrentReconciles: 100, + }, + ). + Complete(terminationreconcile.AsReconciler(m.GetClient(), c.cloudProvider, c.clk, c)) +} + +func (*Controller) AwaitFinalizers() []string { + return nil +} - blockingVAs, err := v.blockingVolumeAttachments(ctx, n) +func (*Controller) Finalizer() string { + return v1.VolumeFinalizer +} + +func (*Controller) TerminationGracePeriodPolicy() terminationreconcile.Policy { + return terminationreconcile.PolicyFinalize +} + +func (*Controller) NodeClaimNotFoundPolicy() terminationreconcile.Policy { + return terminationreconcile.PolicyFinalize +} + +// nolint:gocyclo +func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node, nc *v1.NodeClaim) (reconcile.Result, error) { + blockingVAs, err := c.blockingVolumeAttachments(ctx, n) if err != nil { return reconcile.Result{}, err } if len(blockingVAs) != 0 { stored := nc.DeepCopy() _ = nc.StatusConditions().SetFalse(v1.ConditionTypeVolumesDetached, "AwaitingVolumeDetachment", "AwaitingVolumeDetachment") - if err := v.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { + if err := c.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { if errors.IsNotFound(err) { return reconcile.Result{}, nil } @@ -77,13 +118,13 @@ func (v *VolumeDetachmentReconciler) Reconcile(ctx context.Context, n *corev1.No } return reconcile.Result{}, err } - v.recorder.Publish(NodeAwaitingVolumeDetachmentEvent(n, blockingVAs...)) + c.recorder.Publish(NodeAwaitingVolumeDetachmentEvent(n, blockingVAs...)) return reconcile.Result{RequeueAfter: 5 * time.Second}, nil } stored := nc.DeepCopy() _ = nc.StatusConditions().SetTrue(v1.ConditionTypeVolumesDetached) - if err := v.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { + if err := c.kubeClient.Status().Patch(ctx, nc, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil { if errors.IsNotFound(err) { return reconcile.Result{}, nil } @@ -92,11 +133,14 @@ func (v *VolumeDetachmentReconciler) Reconcile(ctx context.Context, n *corev1.No } return reconcile.Result{}, err } + if err := terminationreconcile.RemoveFinalizer(ctx, c.kubeClient, n, c.Finalizer()); err != nil { + return reconcile.Result{}, err + } return reconcile.Result{}, nil } -func (v *VolumeDetachmentReconciler) blockingVolumeAttachments(ctx context.Context, n *corev1.Node) ([]*storagev1.VolumeAttachment, error) { - vas, err := 
nodeutils.GetVolumeAttachments(ctx, v.kubeClient, n) +func (c *Controller) blockingVolumeAttachments(ctx context.Context, n *corev1.Node) ([]*storagev1.VolumeAttachment, error) { + vas, err := nodeutils.GetVolumeAttachments(ctx, c.kubeClient, n) if err != nil { return nil, err } @@ -104,12 +148,12 @@ func (v *VolumeDetachmentReconciler) blockingVolumeAttachments(ctx context.Conte return nil, nil } - pods, err := nodeutils.GetPods(ctx, v.kubeClient, n) + pods, err := nodeutils.GetPods(ctx, c.kubeClient, n) if err != nil { return nil, err } pods = lo.Reject(pods, func(p *corev1.Pod, _ int) bool { - return podutils.IsDrainable(p, v.clk) + return podutils.IsDrainable(p, c.clk) }) // Determine the VolumeAttachments associated with non-drainable pods. We consider these non-blocking since they @@ -117,7 +161,7 @@ func (v *VolumeDetachmentReconciler) blockingVolumeAttachments(ctx context.Conte nonBlockingVolumes := sets.New[string]() for _, p := range pods { for _, vol := range p.Spec.Volumes { - pvc, err := volumeutil.GetPersistentVolumeClaim(ctx, v.kubeClient, p, vol) + pvc, err := volumeutil.GetPersistentVolumeClaim(ctx, c.kubeClient, p, vol) if errors.IsNotFound(err) { continue } diff --git a/pkg/controllers/node/termination/volumedetachment/events.go b/pkg/controllers/node/termination/volumedetachment/events.go new file mode 100644 index 0000000000..6683622c7c --- /dev/null +++ b/pkg/controllers/node/termination/volumedetachment/events.go @@ -0,0 +1,36 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package volumedetachment + +import ( + "fmt" + + corev1 "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" + + "sigs.k8s.io/karpenter/pkg/events" +) + +func NodeAwaitingVolumeDetachmentEvent(node *corev1.Node, volumeAttachments ...*storagev1.VolumeAttachment) events.Event { + return events.Event{ + InvolvedObject: node, + Type: corev1.EventTypeNormal, + Reason: "AwaitingVolumeDetachment", + Message: fmt.Sprintf("Awaiting deletion of %d VolumeAttachments bound to node", len(volumeAttachments)), + DedupeValues: []string{node.Name}, + } +} diff --git a/pkg/controllers/node/termination/volumedetachment/suite_test.go b/pkg/controllers/node/termination/volumedetachment/suite_test.go new file mode 100644 index 0000000000..33e593ec80 --- /dev/null +++ b/pkg/controllers/node/termination/volumedetachment/suite_test.go @@ -0,0 +1,185 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package volumedetachment_test
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	clock "k8s.io/utils/clock/testing"
+
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"
+
+	"sigs.k8s.io/karpenter/pkg/apis"
+	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
+	"sigs.k8s.io/karpenter/pkg/cloudprovider/fake"
+	terminationreconcile "sigs.k8s.io/karpenter/pkg/controllers/node/termination/reconcile"
+	"sigs.k8s.io/karpenter/pkg/controllers/node/termination/volumedetachment"
+	"sigs.k8s.io/karpenter/pkg/test"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	. "sigs.k8s.io/karpenter/pkg/test/expectations"
+	"sigs.k8s.io/karpenter/pkg/test/v1alpha1"
+	. "sigs.k8s.io/karpenter/pkg/utils/testing"
+)
+
+var ctx context.Context
+var reconciler reconcile.Reconciler
+var env *test.Environment
+var fakeClock *clock.FakeClock
+var cloudProvider *fake.CloudProvider
+var recorder *test.EventRecorder
+
+func TestAPIs(t *testing.T) {
+	ctx = TestContextWithLogger(t)
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "VolumeDetachment")
+}
+
+var _ = BeforeSuite(func() {
+	fakeClock = clock.NewFakeClock(time.Now())
+	env = test.NewEnvironment(
+		test.WithCRDs(apis.CRDs...),
+		test.WithCRDs(v1alpha1.CRDs...),
+		test.WithFieldIndexers(test.NodeClaimProviderIDFieldIndexer(ctx), test.VolumeAttachmentFieldIndexer(ctx)),
+	)
+	cloudProvider = fake.NewCloudProvider()
+	recorder = test.NewEventRecorder()
+	reconciler = terminationreconcile.AsReconciler(env.Client, cloudProvider, fakeClock, volumedetachment.NewController(fakeClock, env.Client, cloudProvider, recorder))
+})
+
+var _ = AfterSuite(func() {
+	Expect(env.Stop()).To(Succeed(), "Failed to stop environment")
+})
+
+var _ = Describe("VolumeDetachment", func() {
+	var node *corev1.Node
+	var nodeClaim *v1.NodeClaim
+
+	BeforeEach(func() {
+		fakeClock.SetTime(time.Now())
+		cloudProvider.Reset()
+
+		nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{ObjectMeta: metav1.ObjectMeta{Finalizers: []string{
+			v1.DrainFinalizer,
+			v1.VolumeFinalizer,
+			v1.TerminationFinalizer,
+		}}})
+		cloudProvider.CreatedNodeClaims[node.Spec.ProviderID] = nodeClaim
+	})
+	It("should wait for volume attachments", func() {
+		va := test.VolumeAttachment(test.VolumeAttachmentOptions{
+			NodeName:   node.Name,
+			VolumeName: "foo",
+		})
+		ExpectApplied(ctx, env.Client, node, nodeClaim, va)
+		Expect(env.Client.Delete(ctx, node)).To(Succeed())
+
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+		Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue())
+
+		ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node))
+		node = ExpectExists(ctx, env.Client, node)
+		Expect(node.Finalizers).To(ContainElement(v1.VolumeFinalizer))
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+		Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue())
+
+		ExpectDeleted(ctx, env.Client, va)
+		ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node))
+		node = ExpectExists(ctx, env.Client, node)
+		Expect(node.Finalizers).ToNot(ContainElement(v1.VolumeFinalizer))
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+		Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue()).To(BeTrue())
+	})
+	It("should only wait for volume attachments associated with drainable pods", func() {
+		vaDrainable := test.VolumeAttachment(test.VolumeAttachmentOptions{
+			NodeName:   node.Name,
+			VolumeName: "foo",
+		})
+		vaNonDrainable := test.VolumeAttachment(test.VolumeAttachmentOptions{
+			NodeName:   node.Name,
+			VolumeName: "bar",
+		})
+		pvc := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{
+			VolumeName: "bar",
+		})
+		pod := test.Pod(test.PodOptions{
+			Tolerations: []corev1.Toleration{{
+				Key:      v1.DisruptedTaintKey,
+				Operator: corev1.TolerationOpExists,
+			}},
+			PersistentVolumeClaims: []string{pvc.Name},
+		})
+		ExpectApplied(ctx, env.Client, node, nodeClaim, vaDrainable, vaNonDrainable, pod, pvc)
+		ExpectManualBinding(ctx, env.Client, pod, node)
+		Expect(env.Client.Delete(ctx, node)).To(Succeed())
+
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+		Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue())
+
+		ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node))
+		node = ExpectExists(ctx, env.Client, node)
+		Expect(node.Finalizers).To(ContainElement(v1.VolumeFinalizer))
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+		Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue())
+
+		ExpectDeleted(ctx, env.Client, vaDrainable)
+		ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node))
+		node = ExpectExists(ctx, env.Client, node)
+		Expect(node.Finalizers).ToNot(ContainElement(v1.VolumeFinalizer))
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+		Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsTrue()).To(BeTrue())
+	})
+	It("should remove volume protection finalizer once TGP has elapsed", func() {
+		va := test.VolumeAttachment(test.VolumeAttachmentOptions{
+			NodeName:   node.Name,
+			VolumeName: "foo",
+		})
+		nodeClaim.Annotations = map[string]string{
+			v1.NodeClaimTerminationTimestampAnnotationKey: fakeClock.Now().Add(time.Minute).Format(time.RFC3339),
+		}
+		ExpectApplied(ctx, env.Client, node, nodeClaim, va)
+		Expect(env.Client.Delete(ctx, node)).To(Succeed())
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+		Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue())
+
+		ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node))
+		node = ExpectExists(ctx, env.Client, node)
+		Expect(node.Finalizers).To(ContainElement(v1.VolumeFinalizer))
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+		Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue())
+
+		fakeClock.Step(5 * time.Minute)
+		ExpectReconcileSucceeded(ctx, reconciler, client.ObjectKeyFromObject(node))
+
+		// Since TGP expired, the protection finalizer should have been removed. However, the status condition should
+		// remain false since volumes were never detached.
+ node = ExpectExists(ctx, env.Client, node) + Expect(node.Finalizers).ToNot(ContainElement(v1.VolumeFinalizer)) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue()) + }) +}) diff --git a/pkg/controllers/nodeclaim/hydration/suite_test.go b/pkg/controllers/nodeclaim/hydration/suite_test.go index 86af3235bd..f66422a764 100644 --- a/pkg/controllers/nodeclaim/hydration/suite_test.go +++ b/pkg/controllers/nodeclaim/hydration/suite_test.go @@ -25,6 +25,7 @@ import ( "github.com/samber/lo" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/karpenter/pkg/apis" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" @@ -66,42 +67,40 @@ var _ = AfterEach(func() { }) var _ = Describe("Hydration", func() { - It("should hydrate the NodeClass label", func() { - nc, _ := test.NodeClaimAndNode() - delete(nc.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) - ExpectApplied(ctx, env.Client, nc) - ExpectObjectReconciled(ctx, env.Client, hydrationController, nc) - nc = ExpectExists(ctx, env.Client, nc) - Expect(nc.Labels[v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())]).To(Equal(nc.Spec.NodeClassRef.Name)) - }) - It("shouldn't hydrate nodes which have already been hydrated", func() { - nc, _ := test.NodeClaimAndNode(v1.NodeClaim{ + var nodeClaim *v1.NodeClaim + BeforeEach(func() { + nodeClaim = test.NodeClaim(v1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - v1.HydrationAnnotationKey: operator.Version, - }, + Annotations: map[string]string{v1.HydrationAnnotationKey: "not-hydrated"}, }, }) - delete(nc.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) - ExpectApplied(ctx, env.Client, nc) - ExpectObjectReconciled(ctx, env.Client, hydrationController, nc) - nc = ExpectExists(ctx, env.Client, nc) - Expect(lo.Keys(nc.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()))) + }) + + It("should hydrate the NodeClass label", func() { + delete(nodeClaim.Labels, v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nodeClaim) + ExpectObjectReconciled(ctx, env.Client, hydrationController, nodeClaim) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(nodeClaim.Labels[v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())]).To(Equal(nodeClaim.Spec.NodeClassRef.Name)) + }) + It("shouldn't hydrate nodes which have already been hydrated", func() { + nodeClaim.Annotations[v1.HydrationAnnotationKey] = operator.Version + delete(nodeClaim.Labels, v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nodeClaim) + ExpectObjectReconciled(ctx, env.Client, hydrationController, nodeClaim) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(lo.Keys(nodeClaim.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind()))) }) It("shouldn't hydrate NodeClaims which aren't managed by this instance of Karpenter", func() { - nc, _ := test.NodeClaimAndNode(v1.NodeClaim{ - Spec: v1.NodeClaimSpec{ - NodeClassRef: &v1.NodeClassReference{ - Group: "karpenter.test.sh", - Kind: "UnmanagedNodeClass", - Name: "default", - }, - }, - }) - delete(nc.Labels, v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind())) - ExpectApplied(ctx, env.Client, nc) - ExpectObjectReconciled(ctx, env.Client, hydrationController, nc) - nc = ExpectExists(ctx, env.Client, nc) - 
Expect(lo.Keys(nc.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nc.Spec.NodeClassRef.GroupKind()))) + nodeClaim.Spec.NodeClassRef = &v1.NodeClassReference{ + Group: "karpenter.test.sh", + Kind: "UnmanagedNodeClass", + Name: "default", + } + delete(nodeClaim.Labels, v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind())) + ExpectApplied(ctx, env.Client, nodeClaim) + ExpectObjectReconciled(ctx, env.Client, hydrationController, nodeClaim) + nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + Expect(lo.Keys(nodeClaim.Labels)).ToNot(ContainElement(v1.NodeClassLabelKey(nodeClaim.Spec.NodeClassRef.GroupKind()))) }) }) diff --git a/pkg/controllers/nodeclaim/lifecycle/termination_test.go b/pkg/controllers/nodeclaim/lifecycle/termination_test.go index 6d2d4d9238..4d1b450937 100644 --- a/pkg/controllers/nodeclaim/lifecycle/termination_test.go +++ b/pkg/controllers/nodeclaim/lifecycle/termination_test.go @@ -300,7 +300,7 @@ var _ = Describe("Termination", func() { ExpectExists(ctx, env.Client, node) nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.ObjectMeta.Annotations).To(BeNil()) + Expect(lo.Keys(nodeClaim.ObjectMeta.Annotations)).ToNot(ContainElement(v1.NodeClaimTerminationTimestampAnnotationKey)) }) It("should annotate the node if the NodeClaim has a terminationGracePeriod", func() { nodeClaim.Spec.TerminationGracePeriod = &metav1.Duration{Duration: time.Second * 300} diff --git a/pkg/events/suite_test.go b/pkg/events/suite_test.go index 2c53df92ef..f9338bb70d 100644 --- a/pkg/events/suite_test.go +++ b/pkg/events/suite_test.go @@ -30,7 +30,7 @@ import ( "k8s.io/client-go/util/flowcontrol" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" - "sigs.k8s.io/karpenter/pkg/controllers/node/termination" + "sigs.k8s.io/karpenter/pkg/controllers/node/termination/drain" "sigs.k8s.io/karpenter/pkg/controllers/node/termination/eviction" "sigs.k8s.io/karpenter/pkg/controllers/provisioning/scheduling" "sigs.k8s.io/karpenter/pkg/events" @@ -97,8 +97,8 @@ var _ = Describe("Event Creation", func() { Expect(internalRecorder.Calls(scheduling.PodFailedToScheduleEvent(PodWithUID(), fmt.Errorf("")).Reason)).To(Equal(1)) }) It("should create a NodeFailedToDrain event", func() { - eventRecorder.Publish(termination.NodeDrainFailedEvent(NodeWithUID(), fmt.Errorf(""))) - Expect(internalRecorder.Calls(termination.NodeDrainFailedEvent(NodeWithUID(), fmt.Errorf("")).Reason)).To(Equal(1)) + eventRecorder.Publish(drain.NodeDrainFailedEvent(NodeWithUID(), fmt.Errorf(""))) + Expect(internalRecorder.Calls(drain.NodeDrainFailedEvent(NodeWithUID(), fmt.Errorf("")).Reason)).To(Equal(1)) }) }) diff --git a/pkg/test/nodeclaim.go b/pkg/test/nodeclaim.go index 467297b66f..e7bee59e43 100644 --- a/pkg/test/nodeclaim.go +++ b/pkg/test/nodeclaim.go @@ -25,6 +25,7 @@ import ( corev1 "k8s.io/api/core/v1" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/operator" ) // NodeClaim creates a test NodeClaim with defaults that can be overridden by overrides. 
@@ -49,6 +50,9 @@ func NodeClaim(overrides ...v1.NodeClaim) *v1.NodeClaim { Name: "default", } } + override.Annotations = lo.Assign(map[string]string{ + v1.HydrationAnnotationKey: operator.Version, + }, override.Annotations) override.Labels = lo.Assign(map[string]string{ v1.NodeClassLabelKey(override.Spec.NodeClassRef.GroupKind()): override.Spec.NodeClassRef.Name, }, override.Labels) diff --git a/pkg/test/nodes.go b/pkg/test/nodes.go index 4e8c9c5bb8..4c594331ae 100644 --- a/pkg/test/nodes.go +++ b/pkg/test/nodes.go @@ -25,6 +25,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/operator" ) type NodeOptions struct { @@ -53,6 +54,10 @@ func Node(overrides ...NodeOptions) *corev1.Node { options.Capacity = options.Allocatable } + options.Annotations = lo.Assign(map[string]string{ + v1.HydrationAnnotationKey: operator.Version, + }, options.Annotations) + return &corev1.Node{ ObjectMeta: ObjectMeta(options.ObjectMeta), Spec: corev1.NodeSpec{ diff --git a/pkg/utils/node/node.go b/pkg/utils/node/node.go index ddcc6156b5..dd42c787ff 100644 --- a/pkg/utils/node/node.go +++ b/pkg/utils/node/node.go @@ -134,8 +134,11 @@ func IsManagedPredicateFuncs(cp cloudprovider.CloudProvider) predicate.Funcs { }) } -func NodeClaimEventHandler(c client.Client) handler.EventHandler { +func NodeClaimEventHandler(c client.Client, cloudProvider cloudprovider.CloudProvider) handler.EventHandler { return handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []reconcile.Request { + if !nodeclaimutils.IsManaged(o.(*v1.NodeClaim), cloudProvider) { + return nil + } providerID := o.(*v1.NodeClaim).Status.ProviderID if providerID == "" { return nil diff --git a/pkg/utils/node/types.go b/pkg/utils/node/types.go index 895f71eb65..f6f35d1fd2 100644 --- a/pkg/utils/node/types.go +++ b/pkg/utils/node/types.go @@ -51,7 +51,7 @@ type DuplicateNodeClaimError struct { } func (e *DuplicateNodeClaimError) Error() string { - return fmt.Sprintf("multiple found for provider id '%s'", e.ProviderID) + return fmt.Sprintf("multiple nodeclaims found for provider id '%s'", e.ProviderID) } func IsDuplicateNodeClaimError(err error) bool { @@ -72,7 +72,7 @@ func IgnoreDuplicateNodeClaimError(err error) error { type HydrationError struct{} func (e *HydrationError) Error() string { - return fmt.Sprintf("resource has not been hydrated") + return "resource has not been hydrated" } func IsHydrationError(err error) bool { diff --git a/pkg/utils/nodeclaim/types.go b/pkg/utils/nodeclaim/types.go index 9266aa2364..56c1397bca 100644 --- a/pkg/utils/nodeclaim/types.go +++ b/pkg/utils/nodeclaim/types.go @@ -68,4 +68,3 @@ func IgnoreDuplicateNodeError(err error) error { } return nil } -