diff --git a/Makefile b/Makefile index 0fcce58d5b9b..60d015137828 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,7 @@ KARPENTER_IAM_ROLE_ARN ?= arn:aws:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-ka HELM_OPTS ?= --set serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn=${KARPENTER_IAM_ROLE_ARN} \ --set settings.clusterName=${CLUSTER_NAME} \ --set settings.interruptionQueue=${CLUSTER_NAME} \ + --set settings.avoidEmptySubnets=true \ --set controller.resources.requests.cpu=1 \ --set controller.resources.requests.memory=1Gi \ --set controller.resources.limits.cpu=1 \ diff --git a/charts/karpenter/README.md b/charts/karpenter/README.md index a4dbca075163..5bdcce08577d 100644 --- a/charts/karpenter/README.md +++ b/charts/karpenter/README.md @@ -84,7 +84,7 @@ cosign verify public.ecr.aws/karpenter/karpenter:1.2.1 \ | serviceMonitor.additionalLabels | object | `{}` | Additional labels for the ServiceMonitor. | | serviceMonitor.enabled | bool | `false` | Specifies whether a ServiceMonitor should be created. | | serviceMonitor.endpointConfig | object | `{}` | Configuration on `http-metrics` endpoint for the ServiceMonitor. Not to be used to add additional endpoints. 
See the Prometheus operator documentation for configurable fields https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#endpoint | -| settings | object | `{"batchIdleDuration":"1s","batchMaxDuration":"10s","clusterCABundle":"","clusterEndpoint":"","clusterName":"","eksControlPlane":false,"featureGates":{"nodeRepair":false,"spotToSpotConsolidation":false},"interruptionQueue":"","isolatedVPC":false,"reservedENIs":"0","vmMemoryOverheadPercent":0.075}` | Global Settings to configure Karpenter | +| settings | object | `{"batchIdleDuration":"1s","batchMaxDuration":"10s","clusterCABundle":"","clusterEndpoint":"","clusterName":"","eksControlPlane":false,"featureGates":{"nodeRepair":false,"spotToSpotConsolidation":false},"interruptionQueue":"","isolatedVPC":false,"avoidEmptySubnets":false,"reservedENIs":"0","vmMemoryOverheadPercent":0.075}` | Global Settings to configure Karpenter | | settings.batchIdleDuration | string | `"1s"` | The maximum amount of time with no new ending pods that if exceeded ends the current batching window. If pods arrive faster than this time, the batching window will be extended up to the maxDuration. If they arrive slower, the pods will be batched separately. | | settings.batchMaxDuration | string | `"10s"` | The maximum length of a batch window. The longer this is, the more pods we can consider for provisioning at one time which usually results in fewer but larger nodes. | | settings.clusterCABundle | string | `""` | Cluster CA bundle for TLS configuration of provisioned nodes. If not set, this is taken from the controller's TLS configuration for the API server. | @@ -95,6 +95,7 @@ cosign verify public.ecr.aws/karpenter/karpenter:1.2.1 \ | settings.featureGates.spotToSpotConsolidation | bool | `false` | spotToSpotConsolidation is ALPHA and is disabled by default. Setting this to true will enable spot replacement consolidation for both single and multi-node consolidation. 
| | settings.interruptionQueue | string | `""` | Interruption queue is the name of the SQS queue used for processing interruption events from EC2 Interruption handling is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs. | | settings.isolatedVPC | bool | `false` | If true then assume we can't reach AWS services which don't have a VPC endpoint This also has the effect of disabling look-ups to the AWS pricing endpoint | +| settings.avoidEmptySubnets | bool | `false` | Setting this to true will filter out subnets with no available IPs | | settings.reservedENIs | string | `"0"` | Reserved ENIs are not included in the calculations for max-pods or kube-reserved This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html | | settings.vmMemoryOverheadPercent | float | `0.075` | The VM memory overhead as a percent that will be subtracted from the total memory for all instance types. The value of `0.075` equals to 7.5%. | | strategy | object | `{"rollingUpdate":{"maxUnavailable":1}}` | Strategy for updating the pod. | diff --git a/charts/karpenter/templates/deployment.yaml b/charts/karpenter/templates/deployment.yaml index edd2aea0429a..7b16009cb247 100644 --- a/charts/karpenter/templates/deployment.yaml +++ b/charts/karpenter/templates/deployment.yaml @@ -132,6 +132,10 @@ spec: - name: ISOLATED_VPC value: "{{ . }}" {{- end }} + {{- with .Values.settings.avoidEmptySubnets }} + - name: AVOID_EMPTY_SUBNETS + value: "{{ . }}" + {{- end }} {{- with .Values.settings.eksControlPlane }} - name: EKS_CONTROL_PLANE value: "{{ . 
}}" diff --git a/charts/karpenter/values.yaml b/charts/karpenter/values.yaml index bc24a5852ee9..b1ca73fe3c83 100644 --- a/charts/karpenter/values.yaml +++ b/charts/karpenter/values.yaml @@ -170,6 +170,8 @@ settings: # -- If true then assume we can't reach AWS services which don't have a VPC endpoint # This also has the effect of disabling look-ups to the AWS pricing endpoint isolatedVPC: false + # -- Setting this to true will filter out subnets with no available IPs + avoidEmptySubnets: false # Marking this true means that your cluster is running with an EKS control plane and Karpenter should attempt to discover cluster details from the DescribeCluster API eksControlPlane: false # -- The VM memory overhead as a percent that will be subtracted from the total memory for all instance types. The value of `0.075` equals to 7.5%. diff --git a/pkg/operator/options/options.go b/pkg/operator/options/options.go index ef72316d6be7..40a56f4da205 100644 --- a/pkg/operator/options/options.go +++ b/pkg/operator/options/options.go @@ -42,6 +42,7 @@ type Options struct { VMMemoryOverheadPercent float64 InterruptionQueue string ReservedENIs int + AvoidEmptySubnets bool } func (o *Options) AddFlags(fs *coreoptions.FlagSet) { @@ -53,6 +54,7 @@ func (o *Options) AddFlags(fs *coreoptions.FlagSet) { fs.Float64Var(&o.VMMemoryOverheadPercent, "vm-memory-overhead-percent", utils.WithDefaultFloat64("VM_MEMORY_OVERHEAD_PERCENT", 0.075), "The VM memory overhead as a percent that will be subtracted from the total memory for all instance types when cached information is unavailable.") fs.StringVar(&o.InterruptionQueue, "interruption-queue", env.WithDefaultString("INTERRUPTION_QUEUE", ""), "Interruption queue is the name of the SQS queue used for processing interruption events from EC2. Interruption handling is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. 
Additional permissions are outlined in the docs.") fs.IntVar(&o.ReservedENIs, "reserved-enis", env.WithDefaultInt("RESERVED_ENIS", 0), "Reserved ENIs are not included in the calculations for max-pods or kube-reserved. This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html.") + fs.BoolVarWithEnv(&o.AvoidEmptySubnets, "avoid-empty-subnets", "AVOID_EMPTY_SUBNETS", false, "Setting this to true will filter out subnets with no available IPs ") } func (o *Options) Parse(fs *coreoptions.FlagSet, args ...string) error { diff --git a/pkg/operator/options/suite_test.go b/pkg/operator/options/suite_test.go index 14e60dd0332c..df65a9277609 100644 --- a/pkg/operator/options/suite_test.go +++ b/pkg/operator/options/suite_test.go @@ -60,6 +60,7 @@ var _ = Describe("Options", func() { "--cluster-name", "env-cluster", "--cluster-endpoint", "https://env-cluster", "--isolated-vpc", + "--avoid-empty-subnets", "--vm-memory-overhead-percent", "0.1", "--interruption-queue", "env-cluster", "--reserved-enis", "10") @@ -69,6 +70,7 @@ var _ = Describe("Options", func() { ClusterName: lo.ToPtr("env-cluster"), ClusterEndpoint: lo.ToPtr("https://env-cluster"), IsolatedVPC: lo.ToPtr(true), + AvoidEmptySubnets: lo.ToPtr(true), VMMemoryOverheadPercent: lo.ToPtr[float64](0.1), InterruptionQueue: lo.ToPtr("env-cluster"), ReservedENIs: lo.ToPtr(10), @@ -79,6 +81,7 @@ var _ = Describe("Options", func() { os.Setenv("CLUSTER_NAME", "env-cluster") os.Setenv("CLUSTER_ENDPOINT", "https://env-cluster") os.Setenv("ISOLATED_VPC", "true") + os.Setenv("AVOID_EMPTY_SUBNETS", "true") os.Setenv("VM_MEMORY_OVERHEAD_PERCENT", "0.1") os.Setenv("INTERRUPTION_QUEUE", "env-cluster") os.Setenv("RESERVED_ENIS", "10") @@ -93,6 +96,7 @@ var _ = Describe("Options", func() { ClusterName: lo.ToPtr("env-cluster"), ClusterEndpoint: lo.ToPtr("https://env-cluster"), IsolatedVPC: lo.ToPtr(true), + AvoidEmptySubnets: lo.ToPtr(true), 
VMMemoryOverheadPercent: lo.ToPtr[float64](0.1), InterruptionQueue: lo.ToPtr("env-cluster"), ReservedENIs: lo.ToPtr(10), @@ -128,6 +132,7 @@ func expectOptionsEqual(optsA *options.Options, optsB *options.Options) { Expect(optsA.ClusterName).To(Equal(optsB.ClusterName)) Expect(optsA.ClusterEndpoint).To(Equal(optsB.ClusterEndpoint)) Expect(optsA.IsolatedVPC).To(Equal(optsB.IsolatedVPC)) + Expect(optsA.AvoidEmptySubnets).To(Equal(optsB.AvoidEmptySubnets)) Expect(optsA.VMMemoryOverheadPercent).To(Equal(optsB.VMMemoryOverheadPercent)) Expect(optsA.InterruptionQueue).To(Equal(optsB.InterruptionQueue)) Expect(optsA.ReservedENIs).To(Equal(optsB.ReservedENIs)) diff --git a/pkg/providers/instance/suite_test.go b/pkg/providers/instance/suite_test.go index 58b3ebdecf63..e999eb6fef08 100644 --- a/pkg/providers/instance/suite_test.go +++ b/pkg/providers/instance/suite_test.go @@ -23,6 +23,7 @@ import ( "sigs.k8s.io/karpenter/pkg/test/v1alpha1" "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/ec2" ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" "github.com/awslabs/operatorpkg/object" "github.com/samber/lo" @@ -205,4 +206,38 @@ var _ = Describe("InstanceProvider", func() { retrievedIDs := sets.New[string](lo.Map(instances, func(i *instance.Instance, _ int) string { return i.ID })...) 
Expect(ids.Equal(retrievedIDs)).To(BeTrue()) }) + It("should not consider subnet with no available IPs for instance creation", func() { + // Prepare the context, nodeClass, and nodeClaim as in the other tests + ExpectApplied(ctx, env.Client, nodeClaim, nodePool, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + + // Make the EC2 API mock return two subnets: one with no available IPs and one with available IPs + awsEnv.EC2API.DescribeSubnetsOutput.Set(&ec2.DescribeSubnetsOutput{ + Subnets: []ec2types.Subnet{ + { + SubnetId: aws.String("test-subnet-1"), + AvailabilityZone: aws.String("test-zone-1a"), + AvailableIpAddressCount: aws.Int32(0), // Exhausted + Tags: []ec2types.Tag{{Key: aws.String("Name"), Value: aws.String("test-subnet-1")}}, + }, + { + SubnetId: aws.String("test-subnet-2"), + AvailabilityZone: aws.String("test-zone-1b"), + AvailableIpAddressCount: aws.Int32(5), // Has IPs + Tags: []ec2types.Tag{{Key: aws.String("Name"), Value: aws.String("test-subnet-2")}}, + }, + }, + }) + + instanceTypes, err := cloudProvider.GetInstanceTypes(ctx, nodePool) + Expect(err).ToNot(HaveOccurred()) + + instanceTypes = lo.Filter(instanceTypes, func(i *corecloudprovider.InstanceType, _ int) bool { return i.Name == "m5.xlarge" }) + instance, err := awsEnv.InstanceProvider.Create(ctx, nodeClass, nodeClaim, nil, instanceTypes) + + // Assert that the instance is created using the subnet with available IPs + Expect(err).ToNot(HaveOccurred()) + Expect(instance).ToNot(BeNil()) + Expect(instance.SubnetID).To(Equal("test-subnet-2")) + }) }) diff --git a/pkg/providers/subnet/subnet.go b/pkg/providers/subnet/subnet.go index 959b8bc4ddc8..83b923f8e16b 100644 --- a/pkg/providers/subnet/subnet.go +++ b/pkg/providers/subnet/subnet.go @@ -31,7 +31,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" - + "github.com/aws/karpenter-provider-aws/pkg/operator/options" karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider"
"sigs.k8s.io/karpenter/pkg/scheduling" @@ -168,6 +168,15 @@ func (p *DefaultProvider) ZonalSubnetsForLaunch(ctx context.Context, nodeClass * if trackedIPs, ok := p.inflightIPs[subnet.ID]; ok { prevIPs = trackedIPs } + + // Check if the remaining IP count is insufficient to meet the predicted IP usage; + // if so, remove this subnet's zone from zonalSubnets (leaving inflightIPs untouched) and continue to the next item in the loop. + if options.FromContext(ctx).AvoidEmptySubnets { + if prevIPs-predictedIPsUsed < 0 { + delete(zonalSubnets, subnet.Zone) + continue + } + } p.inflightIPs[subnet.ID] = prevIPs - predictedIPsUsed } return zonalSubnets, nil diff --git a/pkg/test/options.go b/pkg/test/options.go index 9745bad539df..51e69a2d004b 100644 --- a/pkg/test/options.go +++ b/pkg/test/options.go @@ -28,6 +28,7 @@ type OptionsFields struct { ClusterName *string ClusterEndpoint *string IsolatedVPC *bool + AvoidEmptySubnets *bool EKSControlPlane *bool VMMemoryOverheadPercent *float64 InterruptionQueue *string @@ -46,6 +47,7 @@ func Options(overrides ...OptionsFields) *options.Options { ClusterName: lo.FromPtrOr(opts.ClusterName, "test-cluster"), ClusterEndpoint: lo.FromPtrOr(opts.ClusterEndpoint, "https://test-cluster"), IsolatedVPC: lo.FromPtrOr(opts.IsolatedVPC, false), + AvoidEmptySubnets: lo.FromPtrOr(opts.AvoidEmptySubnets, false), EKSControlPlane: lo.FromPtrOr(opts.EKSControlPlane, false), VMMemoryOverheadPercent: lo.FromPtrOr(opts.VMMemoryOverheadPercent, 0.075), InterruptionQueue: lo.FromPtrOr(opts.InterruptionQueue, ""), diff --git a/website/content/en/docs/reference/settings.md b/website/content/en/docs/reference/settings.md index 6d5febc52994..5eb5bca27aa5 100644 --- a/website/content/en/docs/reference/settings.md +++ b/website/content/en/docs/reference/settings.md @@ -24,6 +24,7 @@ Karpenter surfaces environment variables and CLI parameters to allow you to conf | HEALTH_PROBE_PORT | \-\-health-probe-port | The port the health probe endpoint binds to for reporting controller health
(default = 8081)| | INTERRUPTION_QUEUE | \-\-interruption-queue | Interruption queue is the name of the SQS queue used for processing interruption events from EC2. Interruption handling is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs.| | ISOLATED_VPC | \-\-isolated-vpc | If true, then assume we can't reach AWS services which don't have a VPC endpoint. This also has the effect of disabling look-ups to the AWS on-demand pricing endpoint.| +| AVOID_EMPTY_SUBNETS | \-\-avoid-empty-subnets | Setting this to true will filter out subnets with no available IPs.| | KARPENTER_SERVICE | \-\-karpenter-service | The Karpenter Service name for the dynamic webhook certificate| | KUBE_CLIENT_BURST | \-\-kube-client-burst | The maximum allowed burst of queries to the kube-apiserver (default = 300)| | KUBE_CLIENT_QPS | \-\-kube-client-qps | The smoothed rate of qps to kube-apiserver (default = 200)| diff --git a/website/content/en/preview/reference/settings.md b/website/content/en/preview/reference/settings.md index 6d5febc52994..5eb5bca27aa5 100644 --- a/website/content/en/preview/reference/settings.md +++ b/website/content/en/preview/reference/settings.md @@ -24,6 +24,7 @@ Karpenter surfaces environment variables and CLI parameters to allow you to conf | HEALTH_PROBE_PORT | \-\-health-probe-port | The port the health probe endpoint binds to for reporting controller health (default = 8081)| | INTERRUPTION_QUEUE | \-\-interruption-queue | Interruption queue is the name of the SQS queue used for processing interruption events from EC2. Interruption handling is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs.| | ISOLATED_VPC | \-\-isolated-vpc | If true, then assume we can't reach AWS services which don't have a VPC endpoint. 
This also has the effect of disabling look-ups to the AWS on-demand pricing endpoint.| +| AVOID_EMPTY_SUBNETS | \-\-avoid-empty-subnets | Setting this to true will filter out subnets with no available IPs.| | KARPENTER_SERVICE | \-\-karpenter-service | The Karpenter Service name for the dynamic webhook certificate| | KUBE_CLIENT_BURST | \-\-kube-client-burst | The maximum allowed burst of queries to the kube-apiserver (default = 300)| | KUBE_CLIENT_QPS | \-\-kube-client-qps | The smoothed rate of qps to kube-apiserver (default = 200)|