@@ -23,6 +23,7 @@ import (
2323 "time"
2424
2525 "github.com/awslabs/operatorpkg/status"
26+ "github.com/go-logr/logr"
2627 "github.com/samber/lo"
2728 corev1 "k8s.io/api/core/v1"
2829 "k8s.io/apimachinery/pkg/api/errors"
@@ -47,6 +48,34 @@ import (
4748
4849const CloudProviderName = "ibmcloud"
4950
51+ const (
52+ NodeClassNotFoundDrift cloudprovider.DriftReason = "NodeClassNotFound"
53+ NodeClassHashVersionChangedDrift cloudprovider.DriftReason = "NodeClassHashVersionChanged"
54+ NodeClassHashChangedDrift cloudprovider.DriftReason = "NodeClassHashChanged"
55+ SubnetDrift cloudprovider.DriftReason = "SubnetDrift"
56+ ImageDrift cloudprovider.DriftReason = "ImageDrift"
57+ )
58+
59+ // Disruption reasons specific to the IBM Cloud provider.
60+ // They are used to surface why nodes were disrupted (drift, quota issues, etc)
61+ // through Karpenter's disruption events and metrics.
62+ const (
63+ // DisruptionReasonVPCQuotaExceeded indicates the VPC/Network quota was exceeded
64+ // (for example, no more IP addresses available or VPC resource quota reached)
65+ // which prevented provisioning or caused instance failures.
66+ DisruptionReasonVPCQuotaExceeded karpv1.DisruptionReason = "VPCQuotaExceeded"
67+
68+ // DisruptionReasonAPIRateLimited indicates disruption due to IBM Cloud
69+ // API rate limiting (throttling), where provisioning/sync operations
70+ // repeatedly failed with rate-limit errors.
71+ DisruptionReasonAPIRateLimited karpv1.DisruptionReason = "APIRateLimited"
72+
73+ // DisruptionReasonInstanceTerminated indicates the instance was terminated
74+ // (e.g., by user action, by cloud provider lifecycle, or due to an underlying
75+ // platform event).
76+ DisruptionReasonInstanceTerminated karpv1.DisruptionReason = "InstanceTerminated"
77+ )
78+
5079var _ cloudprovider.CloudProvider = (* CloudProvider )(nil )
5180
5281type CloudProvider struct {
@@ -452,6 +481,7 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
452481 annotations := map [string ]string {
453482 v1alpha1 .AnnotationIBMNodeClassHash : nodeClass .Annotations [v1alpha1 .AnnotationIBMNodeClassHash ],
454483 v1alpha1 .AnnotationIBMNodeClassHashVersion : v1alpha1 .IBMNodeClassHashVersion ,
484+ v1alpha1 .AnnotationIBMNodeClaimSubnetID : node .Annotations [v1alpha1 .AnnotationIBMNodeClaimSubnetID ],
455485 }
456486
457487 // Store resolved image ID only if available
@@ -555,44 +585,118 @@ func (c *CloudProvider) IsDrifted(ctx context.Context, nodeClaim *karpv1.NodeCla
555585 log := log .FromContext (ctx ).WithValues ("nodeClaim" , nodeClaim .Name , "providerID" , nodeClaim .Status .ProviderID )
556586 log .Info ("Checking if node has drifted" )
557587
558- // Get the current hash from the node's annotations
559- currentHash := nodeClaim .Annotations [v1alpha1 .AnnotationIBMNodeClassHash ]
560- currentVersion := nodeClaim .Annotations [v1alpha1 .AnnotationIBMNodeClassHashVersion ]
561- storedImageID := nodeClaim .Annotations [v1alpha1 .AnnotationIBMNodeClaimImageID ]
562-
563588 // Get the NodeClass
589+ nodeClass , driftReason , err := c .getNodeClassForDrift (ctx , log , nodeClaim )
590+ if err != nil {
591+ return "" , err
592+ }
593+ if driftReason != "" {
594+ return driftReason , nil
595+ }
596+
597+ if driftReason = c .isNodeClassHashVersionDrifted (ctx , log , nodeClaim ); driftReason != "" {
598+ return driftReason , nil
599+ }
600+
601+ if driftReason = c .isNodeClassHashDrifted (ctx , log , nodeClaim , nodeClass ); driftReason != "" {
602+ return driftReason , nil
603+ }
604+
605+ if driftReason = c .isImageDrifted (ctx , log , nodeClaim , nodeClass ); driftReason != "" {
606+ return driftReason , nil
607+ }
608+
609+ driftReason , err = c .isSubnetDrifted (ctx , log , nodeClaim , nodeClass )
610+ if err != nil {
611+ return "" , err
612+ }
613+ if driftReason != "" {
614+ return driftReason , nil
615+ }
616+
617+ return "" , nil
618+ }
619+
620+ func (c * CloudProvider ) getNodeClassForDrift (ctx context.Context , log logr.Logger , nodeClaim * karpv1.NodeClaim ) (* v1alpha1.IBMNodeClass , cloudprovider.DriftReason , error ) {
564621 nodeClass := & v1alpha1.IBMNodeClass {}
565622 if err := c .kubeClient .Get (ctx , types.NamespacedName {Name : nodeClaim .Spec .NodeClassRef .Name }, nodeClass ); err != nil {
566623 if errors .IsNotFound (err ) {
567624 log .Error (err , "NodeClass not found" )
568- return "NodeClassNotFound" , nil
625+ return nil , NodeClassNotFoundDrift , nil
569626 }
570- return "" , fmt .Errorf ("getting nodeclass, %w" , err )
627+ return nil , "" , fmt .Errorf ("getting nodeclass, %w" , err )
571628 }
629+ return nodeClass , "" , nil
630+ }
631+
632+ func (c * CloudProvider ) isNodeClassHashVersionDrifted (ctx context.Context , log logr.Logger , nodeClaim * karpv1.NodeClaim ) cloudprovider.DriftReason {
633+ // Get the current hash version from the node's annotations
634+ currentVersion := nodeClaim .Annotations [v1alpha1 .AnnotationIBMNodeClassHashVersion ]
572635
573636 // Check if the hash version matches
574637 if currentVersion != v1alpha1 .IBMNodeClassHashVersion {
575638 log .Info ("NodeClass hash version mismatch" , "current" , currentVersion , "expected" , v1alpha1 .IBMNodeClassHashVersion )
576- return "NodeClassHashVersionChanged" , nil
639+ return NodeClassHashVersionChangedDrift
577640 }
641+ return ""
642+ }
578643
579- // Check if the hash matches
644+ func (c * CloudProvider ) isNodeClassHashDrifted (ctx context.Context , log logr.Logger , nodeClaim * karpv1.NodeClaim , nodeClass * v1alpha1.IBMNodeClass ) cloudprovider.DriftReason {
645+ // Get the current hash from the node's annotations
646+ currentHash := nodeClaim .Annotations [v1alpha1 .AnnotationIBMNodeClassHash ]
580647 expectedHash := nodeClass .Annotations [v1alpha1 .AnnotationIBMNodeClassHash ]
648+
649+ // Check if the hash matches
581650 if expectedHash != currentHash {
582651 log .Info ("NodeClass hash mismatch" , "current" , currentHash , "expected" , expectedHash )
583- return "NodeClassHashChanged" , nil
652+ return NodeClassHashChangedDrift
584653 }
654+ return ""
655+ }
585656
586- // Check if the ImageID matches
657+ func (c * CloudProvider ) isImageDrifted (ctx context.Context , log logr.Logger , nodeClaim * karpv1.NodeClaim , nodeClass * v1alpha1.IBMNodeClass ) cloudprovider.DriftReason {
658+ // Get the stored image id from the node's annotations
659+ storedImageID := nodeClaim .Annotations [v1alpha1 .AnnotationIBMNodeClaimImageID ]
587660 currentImageID := nodeClass .Status .ResolvedImageID
588- if storedImageID != "" && currentImageID != "" {
589- if storedImageID != currentImageID {
590- log .Info ("Node image drift detected" , "storedImageID" , storedImageID , "currentImageID" , currentImageID )
591- return "ImageDrift" , nil
661+
662+ // Check if the ImageID matches
663+ if storedImageID != "" && currentImageID != "" && storedImageID != currentImageID {
664+ log .Info ("Node image drift detected" , "storedImageID" , storedImageID , "currentImageID" , currentImageID )
665+ return ImageDrift
666+ }
667+ return ""
668+ }
669+
670+ func (c * CloudProvider ) isSubnetDrifted (ctx context.Context , log logr.Logger , nodeClaim * karpv1.NodeClaim , nodeClass * v1alpha1.IBMNodeClass ) (cloudprovider.DriftReason , error ) {
671+ storedSubnetID := nodeClaim .Annotations [v1alpha1 .AnnotationIBMNodeClaimSubnetID ]
672+ if storedSubnetID == "" {
673+ return "" , nil
674+ }
675+
676+ // Case 1: Explicit subnet in spec - compare directly
677+ if nodeClass .Spec .Subnet != "" {
678+ if storedSubnetID != nodeClass .Spec .Subnet {
679+ log .Info ("Subnet drift detected" , "storedSubnetID" , storedSubnetID , "specSubnet" , nodeClass .Spec .Subnet )
680+ return SubnetDrift , nil
592681 }
682+ return "" , nil
593683 }
594684
595- return "" , nil
685+ // Case 2: PlacementStrategy - check against Status.SelectedSubnets
686+ validSubnets := nodeClass .Status .SelectedSubnets
687+ if len (validSubnets ) == 0 {
688+ log .Info ("No subnets in Status.SelectedSubnets, skipping drift check" , "nodeClass" , nodeClass .Name )
689+ return "" , nil
690+ }
691+
692+ for _ , curSubnet := range validSubnets {
693+ if curSubnet == storedSubnetID {
694+ return "" , nil
695+ }
696+ }
697+
698+ log .Info ("Subnet drift detected" , "storedSubnetID" , storedSubnetID , "validSubnets" , validSubnets )
699+ return SubnetDrift , nil
596700}
597701
598702func (c * CloudProvider ) Name () string {
@@ -651,3 +755,12 @@ func (c *CloudProvider) RepairPolicies() []cloudprovider.RepairPolicy {
651755 },
652756 }
653757}
758+
759+ // DisruptionReasons returns the IBM Cloud provider disruption reasons.
760+ func (c * CloudProvider ) DisruptionReasons () []karpv1.DisruptionReason {
761+ return []karpv1.DisruptionReason {
762+ DisruptionReasonVPCQuotaExceeded ,
763+ DisruptionReasonAPIRateLimited ,
764+ DisruptionReasonInstanceTerminated ,
765+ }
766+ }
0 commit comments