aws · Summonair · Nov 1, 2024 · Dec 21, 2024 · Jan 3, 2025 · Jan 14, 2025
@@ -13,6 +13,7 @@ KARPENTER_IAM_ROLE_ARN ?= arn:aws:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-ka
 HELM_OPTS ?= --set serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn=${KARPENTER_IAM_ROLE_ARN} \
       		--set settings.clusterName=${CLUSTER_NAME} \
 			--set settings.interruptionQueue=${CLUSTER_NAME} \
+			--set settings.avoidEmptySubnets=true \
 			--set controller.resources.requests.cpu=1 \
 			--set controller.resources.requests.memory=1Gi \
 			--set controller.resources.limits.cpu=1 \

@@ -84,7 +84,7 @@ cosign verify public.ecr.aws/karpenter/karpenter:1.2.1 \
 | serviceMonitor.additionalLabels | object | `{}` | Additional labels for the ServiceMonitor. |
 | serviceMonitor.enabled | bool | `false` | Specifies whether a ServiceMonitor should be created. |
 | serviceMonitor.endpointConfig | object | `{}` | Configuration on `http-metrics` endpoint for the ServiceMonitor. Not to be used to add additional endpoints. See the Prometheus operator documentation for configurable fields https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#endpoint |
-| settings | object | `{"batchIdleDuration":"1s","batchMaxDuration":"10s","clusterCABundle":"","clusterEndpoint":"","clusterName":"","eksControlPlane":false,"featureGates":{"nodeRepair":false,"spotToSpotConsolidation":false},"interruptionQueue":"","isolatedVPC":false,"reservedENIs":"0","vmMemoryOverheadPercent":0.075}` | Global Settings to configure Karpenter |
+| settings | object | `{"batchIdleDuration":"1s","batchMaxDuration":"10s","clusterCABundle":"","clusterEndpoint":"","clusterName":"","eksControlPlane":false,"featureGates":{"nodeRepair":false,"spotToSpotConsolidation":false},"interruptionQueue":"","isolatedVPC":false,"avoidEmptySubnets":false,"reservedENIs":"0","vmMemoryOverheadPercent":0.075}` | Global Settings to configure Karpenter |
 | settings.batchIdleDuration | string | `"1s"` | The maximum amount of time with no new pending pods that if exceeded ends the current batching window. If pods arrive faster than this time, the batching window will be extended up to the maxDuration. If they arrive slower, the pods will be batched separately. |
 | settings.batchMaxDuration | string | `"10s"` | The maximum length of a batch window. The longer this is, the more pods we can consider for provisioning at one time which usually results in fewer but larger nodes. |
 | settings.clusterCABundle | string | `""` | Cluster CA bundle for TLS configuration of provisioned nodes. If not set, this is taken from the controller's TLS configuration for the API server. |
@@ -95,6 +95,7 @@ cosign verify public.ecr.aws/karpenter/karpenter:1.2.1 \
 | settings.featureGates.spotToSpotConsolidation | bool | `false` | spotToSpotConsolidation is ALPHA and is disabled by default. Setting this to true will enable spot replacement consolidation for both single and multi-node consolidation. |
 | settings.interruptionQueue | string | `""` | Interruption queue is the name of the SQS queue used for processing interruption events from EC2. Interruption handling is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs. |
 | settings.isolatedVPC | bool | `false` | If true then assume we can't reach AWS services which don't have a VPC endpoint. This also has the effect of disabling look-ups to the AWS pricing endpoint. |
+| settings.avoidEmptySubnets | bool | `false` | Setting this to true will filter out subnets with no available IPs |
 | settings.reservedENIs | string | `"0"` | Reserved ENIs are not included in the calculations for max-pods or kube-reserved. This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html |
 | settings.vmMemoryOverheadPercent | float | `0.075` | The VM memory overhead as a percent that will be subtracted from the total memory for all instance types. The value of `0.075` equals to 7.5%. |
 | strategy | object | `{"rollingUpdate":{"maxUnavailable":1}}` | Strategy for updating the pod. |

@@ -132,6 +132,10 @@ spec:
             - name: ISOLATED_VPC
               value: "{{ . }}"
           {{- end }}
+          {{- with .Values.settings.avoidEmptySubnets }}
+            - name: AVOID_EMPTY_SUBNETS
+              value: "{{ . }}"
+          {{- end }}
           {{- with .Values.settings.eksControlPlane }}
             - name: EKS_CONTROL_PLANE
               value: "{{ . }}"

@@ -170,6 +170,8 @@ settings:
   # -- If true then assume we can't reach AWS services which don't have a VPC endpoint.
   # This also has the effect of disabling look-ups to the AWS pricing endpoint
   isolatedVPC: false
+  # -- Setting this to true will filter out subnets with no available IPs
+  avoidEmptySubnets: false
   # Marking this true means that your cluster is running with an EKS control plane and Karpenter should attempt to discover cluster details from the DescribeCluster API
   eksControlPlane: false
   # -- The VM memory overhead as a percent that will be subtracted from the total memory for all instance types. The value of `0.075` equals to 7.5%.

@@ -42,6 +42,7 @@ type Options struct {
 	VMMemoryOverheadPercent float64
 	InterruptionQueue       string
 	ReservedENIs            int
+	AvoidEmptySubnets       bool
 }
 
 func (o *Options) AddFlags(fs *coreoptions.FlagSet) {
@@ -53,6 +54,7 @@ func (o *Options) AddFlags(fs *coreoptions.FlagSet) {
 	fs.Float64Var(&o.VMMemoryOverheadPercent, "vm-memory-overhead-percent", utils.WithDefaultFloat64("VM_MEMORY_OVERHEAD_PERCENT", 0.075), "The VM memory overhead as a percent that will be subtracted from the total memory for all instance types when cached information is unavailable.")
 	fs.StringVar(&o.InterruptionQueue, "interruption-queue", env.WithDefaultString("INTERRUPTION_QUEUE", ""), "Interruption queue is the name of the SQS queue used for processing interruption events from EC2. Interruption handling is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs.")
 	fs.IntVar(&o.ReservedENIs, "reserved-enis", env.WithDefaultInt("RESERVED_ENIS", 0), "Reserved ENIs are not included in the calculations for max-pods or kube-reserved. This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html.")
+	fs.BoolVarWithEnv(&o.AvoidEmptySubnets, "avoid-empty-subnets", "AVOID_EMPTY_SUBNETS", false, "Setting this to true will filter out subnets with no available IPs ")
 }
 
 func (o *Options) Parse(fs *coreoptions.FlagSet, args ...string) error {

@@ -60,6 +60,7 @@ var _ = Describe("Options", func() {
 			"--cluster-name", "env-cluster",
 			"--cluster-endpoint", "https://env-cluster",
 			"--isolated-vpc",
+			"--avoid-empty-subnets",
 			"--vm-memory-overhead-percent", "0.1",
 			"--interruption-queue", "env-cluster",
 			"--reserved-enis", "10")
@@ -69,6 +70,7 @@ var _ = Describe("Options", func() {
 			ClusterName:             lo.ToPtr("env-cluster"),
 			ClusterEndpoint:         lo.ToPtr("https://env-cluster"),
 			IsolatedVPC:             lo.ToPtr(true),
+			AvoidEmptySubnets:       lo.ToPtr(true),
 			VMMemoryOverheadPercent: lo.ToPtr[float64](0.1),
 			InterruptionQueue:       lo.ToPtr("env-cluster"),
 			ReservedENIs:            lo.ToPtr(10),
@@ -79,6 +81,7 @@ var _ = Describe("Options", func() {
 		os.Setenv("CLUSTER_NAME", "env-cluster")
 		os.Setenv("CLUSTER_ENDPOINT", "https://env-cluster")
 		os.Setenv("ISOLATED_VPC", "true")
+		os.Setenv("AVOID_EMPTY_SUBNETS", "true")
 		os.Setenv("VM_MEMORY_OVERHEAD_PERCENT", "0.1")
 		os.Setenv("INTERRUPTION_QUEUE", "env-cluster")
 		os.Setenv("RESERVED_ENIS", "10")
@@ -93,6 +96,7 @@ var _ = Describe("Options", func() {
 			ClusterName:             lo.ToPtr("env-cluster"),
 			ClusterEndpoint:         lo.ToPtr("https://env-cluster"),
 			IsolatedVPC:             lo.ToPtr(true),
+			AvoidEmptySubnets:       lo.ToPtr(true),
 			VMMemoryOverheadPercent: lo.ToPtr[float64](0.1),
 			InterruptionQueue:       lo.ToPtr("env-cluster"),
 			ReservedENIs:            lo.ToPtr(10),
@@ -128,6 +132,7 @@ func expectOptionsEqual(optsA *options.Options, optsB *options.Options) {
 	Expect(optsA.ClusterName).To(Equal(optsB.ClusterName))
 	Expect(optsA.ClusterEndpoint).To(Equal(optsB.ClusterEndpoint))
 	Expect(optsA.IsolatedVPC).To(Equal(optsB.IsolatedVPC))
+	Expect(optsA.AvoidEmptySubnets).To(Equal(optsB.AvoidEmptySubnets))
 	Expect(optsA.VMMemoryOverheadPercent).To(Equal(optsB.VMMemoryOverheadPercent))
 	Expect(optsA.InterruptionQueue).To(Equal(optsB.InterruptionQueue))
 	Expect(optsA.ReservedENIs).To(Equal(optsB.ReservedENIs))

@@ -23,6 +23,7 @@ import (
 	"sigs.k8s.io/karpenter/pkg/test/v1alpha1"
 
 	"github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/service/ec2"
 	ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
 	"github.com/awslabs/operatorpkg/object"
 	"github.com/samber/lo"
@@ -205,4 +206,38 @@ var _ = Describe("InstanceProvider", func() {
 		retrievedIDs := sets.New[string](lo.Map(instances, func(i *instance.Instance, _ int) string { return i.ID })...)
 		Expect(ids.Equal(retrievedIDs)).To(BeTrue())
 	})
+		It("should not consider subnet with no available IPs for instance creation", func() {
+		// Apply the shared fixtures, then re-read nodeClass via ExpectExists before it is handed to Create.
+		ExpectApplied(ctx, env.Client, nodeClaim, nodePool, nodeClass)
+		nodeClass = ExpectExists(ctx, env.Client, nodeClass)
+		// NOTE(review): nothing visible in this test enables the AvoidEmptySubnets option on ctx;
+		// confirm the suite-level context turns it on, otherwise this test does not exercise the filter.
+
+		// newSubnet builds a fake subnet whose Name tag mirrors its ID.
+		newSubnet := func(id, zone string, availableIPs int32) ec2types.Subnet {
+			return ec2types.Subnet{
+				SubnetId:                aws.String(id),
+				AvailabilityZone:        aws.String(zone),
+				AvailableIpAddressCount: aws.Int32(availableIPs),
+				Tags:                    []ec2types.Tag{{Key: aws.String("Name"), Value: aws.String(id)}},
+			}
+		}
+
+		// Make the DescribeSubnets mock report one exhausted subnet (0 IPs) and one with 5 IPs left.
+		awsEnv.EC2API.DescribeSubnetsOutput.Set(&ec2.DescribeSubnetsOutput{
+			Subnets: []ec2types.Subnet{
+				newSubnet("test-subnet-1", "test-zone-1a", 0),
+				newSubnet("test-subnet-2", "test-zone-1b", 5),
+			},
+		})
+
+		// Narrow the launch candidates down to m5.xlarge only.
+		allTypes, err := cloudProvider.GetInstanceTypes(ctx, nodePool)
+		Expect(err).ToNot(HaveOccurred())
+		m5Only := lo.Filter(allTypes, func(it *corecloudprovider.InstanceType, _ int) bool {
+			return it.Name == "m5.xlarge"
+		})
+
+		// The launch must succeed and land in the subnet that still has IPs.
+		launched, err := awsEnv.InstanceProvider.Create(ctx, nodeClass, nodeClaim, nil, m5Only)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(launched).ToNot(BeNil())
+		Expect(launched.SubnetID).To(Equal("test-subnet-2"))
+	})
 })
@@ -31,7 +31,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/log"
 
 	v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1"
-
+ 	"github.com/aws/karpenter-provider-aws/pkg/operator/options"
 	karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1"
 	"sigs.k8s.io/karpenter/pkg/cloudprovider"
 	"sigs.k8s.io/karpenter/pkg/scheduling"
@@ -168,6 +168,15 @@ func (p *DefaultProvider) ZonalSubnetsForLaunch(ctx context.Context, nodeClass *
 		if trackedIPs, ok := p.inflightIPs[subnet.ID]; ok {
 			prevIPs = trackedIPs
 		}
+
+		// When AvoidEmptySubnets is enabled, skip any subnet whose remaining IP count cannot cover the predicted IP usage:
+		// drop that subnet's zone from zonalSubnets and continue to the next item without updating inflightIPs for it.
+		if options.FromContext(ctx).AvoidEmptySubnets {
+			if prevIPs-predictedIPsUsed < 0 {
+				delete(zonalSubnets, subnet.Zone)
+				continue
+			}
+		}
 		p.inflightIPs[subnet.ID] = prevIPs - predictedIPsUsed
 	}
 	return zonalSubnets, nil

@@ -28,6 +28,7 @@ type OptionsFields struct {
 	ClusterName             *string
 	ClusterEndpoint         *string
 	IsolatedVPC             *bool
+	AvoidEmptySubnets       *bool
 	EKSControlPlane         *bool
 	VMMemoryOverheadPercent *float64
 	InterruptionQueue       *string
@@ -46,6 +47,7 @@ func Options(overrides ...OptionsFields) *options.Options {
 		ClusterName:             lo.FromPtrOr(opts.ClusterName, "test-cluster"),
 		ClusterEndpoint:         lo.FromPtrOr(opts.ClusterEndpoint, "https://test-cluster"),
 		IsolatedVPC:             lo.FromPtrOr(opts.IsolatedVPC, false),
+		AvoidEmptySubnets:       lo.FromPtrOr(opts.AvoidEmptySubnets, false),
 		EKSControlPlane:         lo.FromPtrOr(opts.EKSControlPlane, false),
 		VMMemoryOverheadPercent: lo.FromPtrOr(opts.VMMemoryOverheadPercent, 0.075),
 		InterruptionQueue:       lo.FromPtrOr(opts.InterruptionQueue, ""),

@@ -24,6 +24,7 @@ Karpenter surfaces environment variables and CLI parameters to allow you to conf
 | HEALTH_PROBE_PORT | \-\-health-probe-port | The port the health probe endpoint binds to for reporting controller health (default = 8081)|
 | INTERRUPTION_QUEUE | \-\-interruption-queue | Interruption queue is the name of the SQS queue used for processing interruption events from EC2. Interruption handling is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs.|
 | ISOLATED_VPC | \-\-isolated-vpc | If true, then assume we can't reach AWS services which don't have a VPC endpoint. This also has the effect of disabling look-ups to the AWS on-demand pricing endpoint.|
+| AVOID_EMPTY_SUBNETS | \-\-avoid-empty-subnets | Setting this to true will filter out subnets with no available IPs.|
 | KARPENTER_SERVICE | \-\-karpenter-service | The Karpenter Service name for the dynamic webhook certificate|
 | KUBE_CLIENT_BURST | \-\-kube-client-burst | The maximum allowed burst of queries to the kube-apiserver (default = 300)|
 | KUBE_CLIENT_QPS | \-\-kube-client-qps | The smoothed rate of qps to kube-apiserver (default = 200)|

@@ -24,6 +24,7 @@ Karpenter surfaces environment variables and CLI parameters to allow you to conf
 | HEALTH_PROBE_PORT | \-\-health-probe-port | The port the health probe endpoint binds to for reporting controller health (default = 8081)|
 | INTERRUPTION_QUEUE | \-\-interruption-queue | Interruption queue is the name of the SQS queue used for processing interruption events from EC2. Interruption handling is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs.|
 | ISOLATED_VPC | \-\-isolated-vpc | If true, then assume we can't reach AWS services which don't have a VPC endpoint. This also has the effect of disabling look-ups to the AWS on-demand pricing endpoint.|
+| AVOID_EMPTY_SUBNETS | \-\-avoid-empty-subnets | Setting this to true will filter out subnets with no available IPs.|
 | KARPENTER_SERVICE | \-\-karpenter-service | The Karpenter Service name for the dynamic webhook certificate|
 | KUBE_CLIENT_BURST | \-\-kube-client-burst | The maximum allowed burst of queries to the kube-apiserver (default = 300)|
 | KUBE_CLIENT_QPS | \-\-kube-client-qps | The smoothed rate of qps to kube-apiserver (default = 200)|