Skip to content

Commit 911cce0

Browse files
Added UnavailableOfferingsTTL exposed as the flag & env variable in the karpenter
1 parent 2cb43bc commit 911cce0

File tree

11 files changed

+31
-11
lines changed

11 files changed

+31
-11
lines changed

hack/docs/instancetypes_gen/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ below are the resources available with some assumptions and after the instance o
142142
ec2api,
143143
cfg.Region,
144144
),
145-
awscache.NewUnavailableOfferings(),
145+
awscache.NewUnavailableOfferings(ctx),
146146
),
147147
)
148148
if err = instanceTypeProvider.UpdateInstanceTypes(ctx); err != nil {

hack/tools/launchtemplate_counter/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ func main() {
6969
ec2api,
7070
cfg.Region,
7171
),
72-
awscache.NewUnavailableOfferings(),
72+
awscache.NewUnavailableOfferings(ctx),
7373
),
7474
)
7575
if err := instanceTypeProvider.UpdateInstanceTypes(ctx); err != nil {

pkg/cache/cache.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,6 @@ const (
2424
// AWS APIs, which can have a serious impact on performance and scalability.
2525
// DO NOT CHANGE THIS VALUE WITHOUT DUE CONSIDERATION
2626
DefaultTTL = time.Minute
27-
// UnavailableOfferingsTTL is the time before offerings that were marked as unavailable
28-
// are removed from the cache and are available for launch again
29-
UnavailableOfferingsTTL = 3 * time.Minute
3027
// InstanceTypesAndZonesTTL is the time before we refresh instance types and zones at EC2
3128
InstanceTypesAndZonesTTL = 5 * time.Minute
3229
// InstanceProfileTTL is the time before we refresh checking instance profile existence at IAM

pkg/cache/unavailableofferings.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@ import (
1818
"context"
1919
"fmt"
2020
"sync/atomic"
21+
"time"
2122

2223
"github.com/aws/aws-sdk-go-v2/aws"
2324
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
25+
"github.com/aws/karpenter-provider-aws/pkg/operator/options"
2426
"github.com/samber/lo"
2527

2628
"github.com/patrickmn/go-cache"
@@ -36,7 +38,8 @@ type UnavailableOfferings struct {
3638
SeqNum uint64
3739
}
3840

39-
func NewUnavailableOfferings() *UnavailableOfferings {
41+
func NewUnavailableOfferings(ctx context.Context) *UnavailableOfferings {
42+
UnavailableOfferingsTTL := time.Duration(int64(options.FromContext(ctx).UnavailableOfferingsTTL)) * time.Second
4043
uo := &UnavailableOfferings{
4144
cache: cache.New(UnavailableOfferingsTTL, UnavailableOfferingsCleanupInterval),
4245
SeqNum: 0,
@@ -55,6 +58,7 @@ func (u *UnavailableOfferings) IsUnavailable(instanceType ec2types.InstanceType,
5558

5659
// MarkUnavailable communicates recently observed temporary capacity shortages in the provided offerings
5760
func (u *UnavailableOfferings) MarkUnavailable(ctx context.Context, unavailableReason string, instanceType ec2types.InstanceType, zone, capacityType string) {
61+
UnavailableOfferingsTTL := time.Duration(int64(options.FromContext(ctx).UnavailableOfferingsTTL)) * time.Second
5862
// even if the key is already in the cache, we still need to call Set to extend the cached entry's TTL
5963
log.FromContext(ctx).WithValues(
6064
"reason", unavailableReason,

pkg/controllers/interruption/interruption_benchmark_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ func benchmarkNotificationController(b *testing.B, messageCount int) {
107107

108108
// Load all the fundamental components before setting up the controllers
109109
recorder := coretest.NewEventRecorder()
110-
unavailableOfferingsCache = awscache.NewUnavailableOfferings()
110+
unavailableOfferingsCache = awscache.NewUnavailableOfferings(ctx)
111111

112112
// Set-up the controllers
113113
interruptionController := interruption.NewController(env.Client, fakeClock, recorder, providers.sqsProvider, unavailableOfferingsCache)

pkg/controllers/interruption/suite_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ var _ = BeforeSuite(func() {
8787
env = coretest.NewEnvironment(coretest.WithCRDs(apis.CRDs...), coretest.WithCRDs(v1alpha1.CRDs...), coretest.WithFieldIndexers(test.NodeInstanceIDFieldIndexer(ctx), test.NodeClaimInstanceIDFieldIndexer(ctx)))
8888
awsEnv = test.NewEnvironment(ctx, env)
8989
fakeClock = &clock.FakeClock{}
90-
unavailableOfferingsCache = awscache.NewUnavailableOfferings()
90+
unavailableOfferingsCache = awscache.NewUnavailableOfferings(ctx)
9191
sqsapi = &fake.SQSAPI{}
9292
sqsProvider = lo.Must(sqs.NewDefaultProvider(sqsapi, fmt.Sprintf("https://sqs.%s.amazonaws.com/%s/test-cluster", fake.DefaultRegion, fake.DefaultAccount)))
9393
cloudProvider := cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}),

pkg/operator/operator.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ func NewOperator(ctx context.Context, operator *operator.Operator) (context.Cont
138138
} else {
139139
log.FromContext(ctx).WithValues("kube-dns-ip", kubeDNSIP).V(1).Info("discovered kube dns")
140140
}
141-
unavailableOfferingsCache := awscache.NewUnavailableOfferings()
141+
unavailableOfferingsCache := awscache.NewUnavailableOfferings(ctx)
142142
ssmCache := cache.New(awscache.SSMCacheTTL, awscache.DefaultCleanupInterval)
143143

144144
subnetProvider := subnet.NewDefaultProvider(ec2api, cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval), cache.New(awscache.AvailableIPAddressTTL, awscache.DefaultCleanupInterval), cache.New(awscache.AssociatePublicIPAddressTTL, awscache.DefaultCleanupInterval))

pkg/operator/options/options.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ type Options struct {
4242
VMMemoryOverheadPercent float64
4343
InterruptionQueue string
4444
ReservedENIs int
45+
UnavailableOfferingsTTL int
4546
}
4647

4748
func (o *Options) AddFlags(fs *coreoptions.FlagSet) {
@@ -53,6 +54,7 @@ func (o *Options) AddFlags(fs *coreoptions.FlagSet) {
5354
fs.Float64Var(&o.VMMemoryOverheadPercent, "vm-memory-overhead-percent", utils.WithDefaultFloat64("VM_MEMORY_OVERHEAD_PERCENT", 0.075), "The VM memory overhead as a percent that will be subtracted from the total memory for all instance types when cached information is unavailable.")
5455
fs.StringVar(&o.InterruptionQueue, "interruption-queue", env.WithDefaultString("INTERRUPTION_QUEUE", ""), "Interruption queue is the name of the SQS queue used for processing interruption events from EC2. Interruption handling is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs.")
5556
fs.IntVar(&o.ReservedENIs, "reserved-enis", env.WithDefaultInt("RESERVED_ENIS", 0), "Reserved ENIs are not included in the calculations for max-pods or kube-reserved. This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html.")
57+
fs.IntVar(&o.UnavailableOfferingsTTL, "unavailable-offerings-ttl", env.WithDefaultInt("UNAVAILABLE_OFFERINGS_TTL", 180), "The Unavailable offerings TTL is the time before offerings that were marked as unavailable are removed from the cache and are available for launch again.")
5658
}
5759

5860
func (o *Options) Parse(fs *coreoptions.FlagSet, args ...string) error {

pkg/operator/options/options_validation.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ func (o Options) Validate() error {
2626
o.validateEndpoint(),
2727
o.validateVMMemoryOverheadPercent(),
2828
o.validateReservedENIs(),
29+
o.validateUnavailableOfferingsTTL(),
2930
o.validateRequiredFields(),
3031
)
3132
}
@@ -57,6 +58,13 @@ func (o Options) validateReservedENIs() error {
5758
return nil
5859
}
5960

61+
func (o Options) validateUnavailableOfferingsTTL() error {
62+
if o.UnavailableOfferingsTTL < 0 {
63+
return fmt.Errorf("unavailable-offerings-ttl cannot be negative")
64+
}
65+
return nil
66+
}
67+
6068
func (o Options) validateRequiredFields() error {
6169
if o.ClusterName == "" {
6270
return fmt.Errorf("missing field, cluster-name")

pkg/operator/options/suite_test.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ var _ = Describe("Options", func() {
6262
"--isolated-vpc",
6363
"--vm-memory-overhead-percent", "0.1",
6464
"--interruption-queue", "env-cluster",
65-
"--reserved-enis", "10")
65+
"--reserved-enis", "10",
66+
"--unavailable-offerings-ttl", "30")
6667
Expect(err).ToNot(HaveOccurred())
6768
expectOptionsEqual(opts, test.Options(test.OptionsFields{
6869
ClusterCABundle: lo.ToPtr("env-bundle"),
@@ -72,6 +73,7 @@ var _ = Describe("Options", func() {
7273
VMMemoryOverheadPercent: lo.ToPtr[float64](0.1),
7374
InterruptionQueue: lo.ToPtr("env-cluster"),
7475
ReservedENIs: lo.ToPtr(10),
76+
UnavailableOfferingsTTL: lo.ToPtr(30),
7577
}))
7678
})
7779
It("should correctly fallback to env vars when CLI flags aren't set", func() {
@@ -82,6 +84,7 @@ var _ = Describe("Options", func() {
8284
os.Setenv("VM_MEMORY_OVERHEAD_PERCENT", "0.1")
8385
os.Setenv("INTERRUPTION_QUEUE", "env-cluster")
8486
os.Setenv("RESERVED_ENIS", "10")
87+
os.Setenv("UNAVAILABLE_OFFERINGS_TTL", "30")
8588

8689
// Add flags after we set the environment variables so that the parsing logic correctly refers
8790
// to the new environment variable values
@@ -96,6 +99,7 @@ var _ = Describe("Options", func() {
9699
VMMemoryOverheadPercent: lo.ToPtr[float64](0.1),
97100
InterruptionQueue: lo.ToPtr("env-cluster"),
98101
ReservedENIs: lo.ToPtr(10),
102+
UnavailableOfferingsTTL: lo.ToPtr(30),
99103
}))
100104
})
101105

@@ -119,6 +123,10 @@ var _ = Describe("Options", func() {
119123
err := opts.Parse(fs, "--cluster-name", "test-cluster", "--reserved-enis", "-1")
120124
Expect(err).To(HaveOccurred())
121125
})
126+
It("should fail when unavailableOfferingsTTL is negative", func() {
127+
err := opts.Parse(fs, "--cluster-name", "test-cluster", "--unavailable-offerings-ttl", "-1")
128+
Expect(err).To(HaveOccurred())
129+
})
122130
})
123131
})
124132

@@ -131,4 +139,5 @@ func expectOptionsEqual(optsA *options.Options, optsB *options.Options) {
131139
Expect(optsA.VMMemoryOverheadPercent).To(Equal(optsB.VMMemoryOverheadPercent))
132140
Expect(optsA.InterruptionQueue).To(Equal(optsB.InterruptionQueue))
133141
Expect(optsA.ReservedENIs).To(Equal(optsB.ReservedENIs))
142+
Expect(optsA.UnavailableOfferingsTTL).To(Equal(optsB.UnavailableOfferingsTTL))
134143
}

pkg/test/environment.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ func NewEnvironment(ctx context.Context, env *coretest.Environment) *Environment
105105
ec2Cache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval)
106106
instanceTypeCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval)
107107
discoveredCapacityCache := cache.New(awscache.DiscoveredCapacityCacheTTL, awscache.DefaultCleanupInterval)
108-
unavailableOfferingsCache := awscache.NewUnavailableOfferings()
108+
unavailableOfferingsCache := awscache.NewUnavailableOfferings(ctx)
109109
launchTemplateCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval)
110110
subnetCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval)
111111
availableIPAdressCache := cache.New(awscache.AvailableIPAddressTTL, awscache.DefaultCleanupInterval)

0 commit comments

Comments
 (0)