Skip to content

Commit 376f2f1

Browse files
k8s-ci-robot nekkunti
authored and committed
Merge pull request #454 from meomnzak/feat/add-subnet-drift
feat: add subnet drift detection
Signed-off-by: Anand Nekkunti <anand.nekkunti@ibm.com>
2 parents e6f7317 + d4ec5c0 commit 376f2f1

File tree

6 files changed

+583
-18
lines changed

6 files changed

+583
-18
lines changed

pkg/apis/v1alpha1/annotations.go

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,9 @@ const (
2626
// AnnotationIBMNodeClassHashVersion is the annotation key for the version of the hash function
2727
AnnotationIBMNodeClassHashVersion = Group + "/nodeclass-hash-version"
2828

29+
// AnnotationIBMNodeClaimSubnetID is the annotation key used to record the subnet ID on a NodeClaim; subnet drift detection compares it against the NodeClass's subnets.
30+
AnnotationIBMNodeClaimSubnetID = Group + "/subnet-id"
31+
2932
// AnnotationIBMNodeClaimImageID stores the resolved image ID used to create a NodeClaim.
3033
AnnotationIBMNodeClaimImageID = Group + "/image-id"
3134
)

pkg/cloudprovider/cloudprovider.go

Lines changed: 129 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ import (
2323
"time"
2424

2525
"github.com/awslabs/operatorpkg/status"
26+
"github.com/go-logr/logr"
2627
"github.com/samber/lo"
2728
corev1 "k8s.io/api/core/v1"
2829
"k8s.io/apimachinery/pkg/api/errors"
@@ -47,6 +48,34 @@ import (
4748

4849
const CloudProviderName = "ibmcloud"
4950

51+
const (
52+
NodeClassNotFoundDrift cloudprovider.DriftReason = "NodeClassNotFound"
53+
NodeClassHashVersionChangedDrift cloudprovider.DriftReason = "NodeClassHashVersionChanged"
54+
NodeClassHashChangedDrift cloudprovider.DriftReason = "NodeClassHashChanged"
55+
SubnetDrift cloudprovider.DriftReason = "SubnetDrift"
56+
ImageDrift cloudprovider.DriftReason = "ImageDrift"
57+
)
58+
59+
// Disruption reasons specific to the IBM Cloud provider.
60+
// They are used to surface why nodes were disrupted (drift, quota issues, etc.)
61+
// through Karpenter's disruption events and metrics.
62+
const (
63+
// DisruptionReasonVPCQuotaExceeded indicates the VPC/Network quota was exceeded
64+
// (for example, no more IP addresses available or VPC resource quota reached)
65+
// which prevented provisioning or caused instance failures.
66+
DisruptionReasonVPCQuotaExceeded karpv1.DisruptionReason = "VPCQuotaExceeded"
67+
68+
// DisruptionReasonAPIRateLimited indicates disruption due to IBM Cloud
69+
// API rate limiting (throttling), where provisioning/sync operations
70+
// repeatedly failed with rate-limit errors.
71+
DisruptionReasonAPIRateLimited karpv1.DisruptionReason = "APIRateLimited"
72+
73+
// DisruptionReasonInstanceTerminated indicates the instance was terminated
74+
// (e.g., by user action, by cloud provider lifecycle, or due to an underlying
75+
// platform event).
76+
DisruptionReasonInstanceTerminated karpv1.DisruptionReason = "InstanceTerminated"
77+
)
78+
5079
var _ cloudprovider.CloudProvider = (*CloudProvider)(nil)
5180

5281
type CloudProvider struct {
@@ -452,6 +481,7 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
452481
annotations := map[string]string{
453482
v1alpha1.AnnotationIBMNodeClassHash: nodeClass.Annotations[v1alpha1.AnnotationIBMNodeClassHash],
454483
v1alpha1.AnnotationIBMNodeClassHashVersion: v1alpha1.IBMNodeClassHashVersion,
484+
v1alpha1.AnnotationIBMNodeClaimSubnetID: node.Annotations[v1alpha1.AnnotationIBMNodeClaimSubnetID],
455485
}
456486

457487
// Store resolved image ID only if available
@@ -555,44 +585,118 @@ func (c *CloudProvider) IsDrifted(ctx context.Context, nodeClaim *karpv1.NodeCla
555585
log := log.FromContext(ctx).WithValues("nodeClaim", nodeClaim.Name, "providerID", nodeClaim.Status.ProviderID)
556586
log.Info("Checking if node has drifted")
557587

558-
// Get the current hash from the node's annotations
559-
currentHash := nodeClaim.Annotations[v1alpha1.AnnotationIBMNodeClassHash]
560-
currentVersion := nodeClaim.Annotations[v1alpha1.AnnotationIBMNodeClassHashVersion]
561-
storedImageID := nodeClaim.Annotations[v1alpha1.AnnotationIBMNodeClaimImageID]
562-
563588
// Get the NodeClass
589+
nodeClass, driftReason, err := c.getNodeClassForDrift(ctx, log, nodeClaim)
590+
if err != nil {
591+
return "", err
592+
}
593+
if driftReason != "" {
594+
return driftReason, nil
595+
}
596+
597+
if driftReason = c.isNodeClassHashVersionDrifted(ctx, log, nodeClaim); driftReason != "" {
598+
return driftReason, nil
599+
}
600+
601+
if driftReason = c.isNodeClassHashDrifted(ctx, log, nodeClaim, nodeClass); driftReason != "" {
602+
return driftReason, nil
603+
}
604+
605+
if driftReason = c.isImageDrifted(ctx, log, nodeClaim, nodeClass); driftReason != "" {
606+
return driftReason, nil
607+
}
608+
609+
driftReason, err = c.isSubnetDrifted(ctx, log, nodeClaim, nodeClass)
610+
if err != nil {
611+
return "", err
612+
}
613+
if driftReason != "" {
614+
return driftReason, nil
615+
}
616+
617+
return "", nil
618+
}
619+
620+
func (c *CloudProvider) getNodeClassForDrift(ctx context.Context, log logr.Logger, nodeClaim *karpv1.NodeClaim) (*v1alpha1.IBMNodeClass, cloudprovider.DriftReason, error) {
564621
nodeClass := &v1alpha1.IBMNodeClass{}
565622
if err := c.kubeClient.Get(ctx, types.NamespacedName{Name: nodeClaim.Spec.NodeClassRef.Name}, nodeClass); err != nil {
566623
if errors.IsNotFound(err) {
567624
log.Error(err, "NodeClass not found")
568-
return "NodeClassNotFound", nil
625+
return nil, NodeClassNotFoundDrift, nil
569626
}
570-
return "", fmt.Errorf("getting nodeclass, %w", err)
627+
return nil, "", fmt.Errorf("getting nodeclass, %w", err)
571628
}
629+
return nodeClass, "", nil
630+
}
631+
632+
func (c *CloudProvider) isNodeClassHashVersionDrifted(ctx context.Context, log logr.Logger, nodeClaim *karpv1.NodeClaim) cloudprovider.DriftReason {
633+
// Get the current hash version from the NodeClaim's annotations
634+
currentVersion := nodeClaim.Annotations[v1alpha1.AnnotationIBMNodeClassHashVersion]
572635

573636
// Check if the hash version matches
574637
if currentVersion != v1alpha1.IBMNodeClassHashVersion {
575638
log.Info("NodeClass hash version mismatch", "current", currentVersion, "expected", v1alpha1.IBMNodeClassHashVersion)
576-
return "NodeClassHashVersionChanged", nil
639+
return NodeClassHashVersionChangedDrift
577640
}
641+
return ""
642+
}
578643

579-
// Check if the hash matches
644+
func (c *CloudProvider) isNodeClassHashDrifted(ctx context.Context, log logr.Logger, nodeClaim *karpv1.NodeClaim, nodeClass *v1alpha1.IBMNodeClass) cloudprovider.DriftReason {
645+
// Get the current hash from the NodeClaim's annotations
646+
currentHash := nodeClaim.Annotations[v1alpha1.AnnotationIBMNodeClassHash]
580647
expectedHash := nodeClass.Annotations[v1alpha1.AnnotationIBMNodeClassHash]
648+
649+
// Check if the hash matches
581650
if expectedHash != currentHash {
582651
log.Info("NodeClass hash mismatch", "current", currentHash, "expected", expectedHash)
583-
return "NodeClassHashChanged", nil
652+
return NodeClassHashChangedDrift
584653
}
654+
return ""
655+
}
585656

586-
// Check if the ImageID matches
657+
func (c *CloudProvider) isImageDrifted(ctx context.Context, log logr.Logger, nodeClaim *karpv1.NodeClaim, nodeClass *v1alpha1.IBMNodeClass) cloudprovider.DriftReason {
658+
// Get the stored image ID from the NodeClaim's annotations
659+
storedImageID := nodeClaim.Annotations[v1alpha1.AnnotationIBMNodeClaimImageID]
587660
currentImageID := nodeClass.Status.ResolvedImageID
588-
if storedImageID != "" && currentImageID != "" {
589-
if storedImageID != currentImageID {
590-
log.Info("Node image drift detected", "storedImageID", storedImageID, "currentImageID", currentImageID)
591-
return "ImageDrift", nil
661+
662+
// Check if the ImageID matches
663+
if storedImageID != "" && currentImageID != "" && storedImageID != currentImageID {
664+
log.Info("Node image drift detected", "storedImageID", storedImageID, "currentImageID", currentImageID)
665+
return ImageDrift
666+
}
667+
return ""
668+
}
669+
670+
func (c *CloudProvider) isSubnetDrifted(ctx context.Context, log logr.Logger, nodeClaim *karpv1.NodeClaim, nodeClass *v1alpha1.IBMNodeClass) (cloudprovider.DriftReason, error) {
671+
storedSubnetID := nodeClaim.Annotations[v1alpha1.AnnotationIBMNodeClaimSubnetID]
672+
if storedSubnetID == "" {
673+
return "", nil
674+
}
675+
676+
// Case 1: Explicit subnet in spec - compare directly
677+
if nodeClass.Spec.Subnet != "" {
678+
if storedSubnetID != nodeClass.Spec.Subnet {
679+
log.Info("Subnet drift detected", "storedSubnetID", storedSubnetID, "specSubnet", nodeClass.Spec.Subnet)
680+
return SubnetDrift, nil
592681
}
682+
return "", nil
593683
}
594684

595-
return "", nil
685+
// Case 2: PlacementStrategy - check against Status.SelectedSubnets
686+
validSubnets := nodeClass.Status.SelectedSubnets
687+
if len(validSubnets) == 0 {
688+
log.Info("No subnets in Status.SelectedSubnets, skipping drift check", "nodeClass", nodeClass.Name)
689+
return "", nil
690+
}
691+
692+
for _, curSubnet := range validSubnets {
693+
if curSubnet == storedSubnetID {
694+
return "", nil
695+
}
696+
}
697+
698+
log.Info("Subnet drift detected", "storedSubnetID", storedSubnetID, "validSubnets", validSubnets)
699+
return SubnetDrift, nil
596700
}
597701

598702
func (c *CloudProvider) Name() string {
@@ -651,3 +755,12 @@ func (c *CloudProvider) RepairPolicies() []cloudprovider.RepairPolicy {
651755
},
652756
}
653757
}
758+
759+
// DisruptionReasons returns the IBM Cloud provider disruption reasons.
760+
func (c *CloudProvider) DisruptionReasons() []karpv1.DisruptionReason {
761+
return []karpv1.DisruptionReason{
762+
DisruptionReasonVPCQuotaExceeded,
763+
DisruptionReasonAPIRateLimited,
764+
DisruptionReasonInstanceTerminated,
765+
}
766+
}

0 commit comments

Comments
 (0)