Skip to content

Commit

Permalink
chore: Ensure that NodeClaim is always logged with Node (#1999)
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathan-innis authored Feb 20, 2025
1 parent 65414aa commit df69a07
Show file tree
Hide file tree
Showing 11 changed files with 40 additions and 17 deletions.
2 changes: 1 addition & 1 deletion pkg/controllers/node/health/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,13 @@ func (c *Controller) Register(_ context.Context, m manager.Manager) error {

func (c *Controller) Reconcile(ctx context.Context, node *corev1.Node) (reconcile.Result, error) {
ctx = injection.WithControllerName(ctx, "node.health")
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef(node.Namespace, node.Name)))

// Validate that the node is owned by us
nodeClaim, err := nodeutils.NodeClaimForNode(ctx, c.kubeClient, node)
if err != nil {
return reconcile.Result{}, nodeutils.IgnoreDuplicateNodeClaimError(nodeutils.IgnoreNodeClaimNotFoundError(err))
}
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KObj(nodeClaim)))

// If a nodeclaim does has a nodepool label, validate the nodeclaims inside the nodepool are healthy (i.e bellow the allowed threshold)
// In the case of standalone nodeclaim, validate the nodes inside the cluster are healthy before proceeding
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/node/hydration/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ func NewController(kubeClient client.Client, cloudProvider cloudprovider.CloudPr

func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node) (reconcile.Result, error) {
ctx = injection.WithControllerName(ctx, c.Name())
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef(n.Namespace, n.Name)))

nc, err := nodeutils.NodeClaimForNode(ctx, c.kubeClient, n)
if err != nil {
Expand All @@ -68,6 +67,7 @@ func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node) (reconcile.R
if !nodeclaimutils.IsManaged(nc, c.cloudProvider) {
return reconcile.Result{}, nil
}
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KObj(nc)))

stored := n.DeepCopy()
n.Labels = lo.Assign(n.Labels, map[string]string{
Expand Down
5 changes: 4 additions & 1 deletion pkg/controllers/node/termination/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
controllerruntime "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
Expand Down Expand Up @@ -96,10 +97,12 @@ func (c *Controller) finalize(ctx context.Context, node *corev1.Node) (reconcile
if err != nil {
return reconcile.Result{}, fmt.Errorf("listing nodeclaims, %w", err)
}

if err = c.deleteAllNodeClaims(ctx, nodeClaims...); err != nil {
return reconcile.Result{}, fmt.Errorf("deleting nodeclaims, %w", err)
}
if len(nodeClaims) != 0 {
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KObj(nodeClaims[0])))
}

nodeTerminationTime, err := c.nodeTerminationTime(node, nodeClaims...)
if err != nil {
Expand Down
5 changes: 5 additions & 0 deletions pkg/controllers/nodeclaim/consistency/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
controllerruntime "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
Expand Down Expand Up @@ -77,6 +78,10 @@ func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider clou

func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) {
ctx = injection.WithControllerName(ctx, "nodeclaim.consistency")
if nodeClaim.Status.NodeName != "" {
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", nodeClaim.Status.NodeName)))
}

if !nodeclaimutils.IsManaged(nodeClaim, c.cloudProvider) {
return reconcile.Result{}, nil
}
Expand Down
5 changes: 5 additions & 0 deletions pkg/controllers/nodeclaim/disruption/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,13 @@ import (
"k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
controllerruntime "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

Expand Down Expand Up @@ -67,6 +69,9 @@ func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider clou
// Reconcile executes a control loop for the resource
func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) {
ctx = injection.WithControllerName(ctx, "nodeclaim.disruption")
if nodeClaim.Status.NodeName != "" {
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", nodeClaim.Status.NodeName)))
}

if !nodeclaimutils.IsManaged(nodeClaim, c.cloudProvider) {
return reconcile.Result{}, nil
Expand Down
12 changes: 12 additions & 0 deletions pkg/controllers/nodeclaim/expiration/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"strings"
"time"

"k8s.io/klog/v2"
"k8s.io/utils/clock"
controllerruntime "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
Expand All @@ -29,6 +30,8 @@ import (
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"sigs.k8s.io/karpenter/pkg/operator/injection"

v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
"sigs.k8s.io/karpenter/pkg/cloudprovider"
"sigs.k8s.io/karpenter/pkg/metrics"
Expand All @@ -52,6 +55,11 @@ func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider clou
}

func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) {
ctx = injection.WithControllerName(ctx, c.Name())
if nodeClaim.Status.NodeName != "" {
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", nodeClaim.Status.NodeName)))
}

if !nodeclaimutils.IsManaged(nodeClaim, c.cloudProvider) {
return reconcile.Result{}, nil
}
Expand Down Expand Up @@ -87,6 +95,10 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (re
return reconcile.Result{}, nil
}

func (c *Controller) Name() string {
return "nodeclaim.expiration"
}

func (c *Controller) Register(_ context.Context, m manager.Manager) error {
return controllerruntime.NewControllerManagedBy(m).
Named("nodeclaim.expiration").
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/nodeclaim/garbagecollection/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error) {
}
log.FromContext(ctx).WithValues(
"NodeClaim", klog.KRef("", nodeClaims[i].Name),
"Node", klog.KRef("", nodeClaims[i].Status.NodeName),
"provider-id", nodeClaims[i].Status.ProviderID,
"nodepool", nodeClaims[i].Labels[v1.NodePoolLabelKey],
).V(1).Info("garbage collecting nodeclaim with no cloudprovider representation")
metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{
metrics.ReasonLabel: "garbage_collected",
Expand Down
3 changes: 2 additions & 1 deletion pkg/controllers/nodeclaim/hydration/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ func NewController(kubeClient client.Client, cloudProvider cloudprovider.CloudPr

func (c *Controller) Reconcile(ctx context.Context, nc *v1.NodeClaim) (reconcile.Result, error) {
ctx = injection.WithControllerName(ctx, c.Name())
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KRef(nc.Namespace, nc.Name)))
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", nc.Status.NodeName)))

if !nodeclaimutils.IsManaged(nc, c.cloudProvider) {
return reconcile.Result{}, nil
}
Expand Down
15 changes: 8 additions & 7 deletions pkg/controllers/nodeclaim/lifecycle/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,6 @@ import (
terminationutil "sigs.k8s.io/karpenter/pkg/utils/termination"
)

type nodeClaimReconciler interface {
Reconcile(context.Context, *v1.NodeClaim) (reconcile.Result, error)
}

// Controller is a NodeClaim Lifecycle controller that manages the lifecycle of the NodeClaim up until its termination
// The controller is responsible for ensuring that new Nodes get launched, that they have properly registered with
// the cluster as nodes and that they are properly initialized, ensuring that nodeclaims that do not have matching nodes
Expand Down Expand Up @@ -108,9 +104,15 @@ func (c *Controller) Name() string {
return "nodeclaim.lifecycle"
}

// nolint:gocyclo
func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) {
ctx = injection.WithControllerName(ctx, c.Name())

if nodeClaim.Status.ProviderID != "" {
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("provider-id", nodeClaim.Status.ProviderID))
}
if nodeClaim.Status.NodeName != "" {
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", nodeClaim.Status.NodeName)))
}
if !nodeclaimutils.IsManaged(nodeClaim, c.cloudProvider) {
return reconcile.Result{}, nil
}
Expand All @@ -137,7 +139,7 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (re
stored = nodeClaim.DeepCopy()
var results []reconcile.Result
var errs error
for _, reconciler := range []nodeClaimReconciler{
for _, reconciler := range []reconcile.TypedReconciler[*v1.NodeClaim]{
c.launch,
c.registration,
c.initialization,
Expand Down Expand Up @@ -169,7 +171,6 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (re

//nolint:gocyclo
func (c *Controller) finalize(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) {
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", nodeClaim.Status.NodeName), "provider-id", nodeClaim.Status.ProviderID))
if !controllerutil.ContainsFinalizer(nodeClaim, v1.TerminationFinalizer) {
return reconcile.Result{}, nil
}
Expand Down
3 changes: 0 additions & 3 deletions pkg/controllers/nodeclaim/lifecycle/initialization.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"github.com/samber/lo"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
"k8s.io/klog/v2"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
Expand Down Expand Up @@ -53,13 +52,11 @@ func (i *Initialization) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim)
if !nodeClaim.StatusConditions().Get(v1.ConditionTypeRegistered).IsTrue() {
return reconcile.Result{}, nil
}
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("provider-id", nodeClaim.Status.ProviderID))
node, err := nodeclaimutils.NodeForNodeClaim(ctx, i.kubeClient, nodeClaim)
if err != nil {
nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "NodeNotFound", "Node not registered with cluster")
return reconcile.Result{}, nil //nolint:nilerr
}
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", node.Name)))
if nodeutils.GetCondition(node, corev1.NodeReady).Status != corev1.ConditionTrue {
nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "NodeNotReady", "Node status is NotReady")
return reconcile.Result{}, nil
Expand Down
3 changes: 1 addition & 2 deletions pkg/controllers/nodeclaim/lifecycle/registration.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ func (r *Registration) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (
nodeClaim.StatusConditions().Set(*cond)
return reconcile.Result{}, nil
}
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("provider-id", nodeClaim.Status.ProviderID))
node, err := nodeclaimutils.NodeForNodeClaim(ctx, r.kubeClient, nodeClaim)
if err != nil {
if nodeclaimutils.IsNodeNotFoundError(err) {
Expand All @@ -68,7 +67,7 @@ func (r *Registration) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (
nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeRegistered, "UnregisteredTaintNotFound", fmt.Sprintf("Invariant violated, %s taint must be present on Karpenter-managed nodes", v1.UnregisteredTaintKey))
return reconcile.Result{}, fmt.Errorf("missing required startup taint, %s", v1.UnregisteredTaintKey)
}
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", node.Name)))
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KObj(node)))
if err = r.syncNode(ctx, nodeClaim, node); err != nil {
if errors.IsConflict(err) {
return reconcile.Result{Requeue: true}, nil
Expand Down

0 comments on commit df69a07

Please sign in to comment.