Skip to content

Commit cabd483

Browse files
DerekFrankSarthug99
authored andcommitted
fix: forcefully terminate nodes on EC2 instance health failure signals (#9198)
1 parent 15781e3 commit cabd483

11 files changed

Lines changed: 371 additions & 100 deletions

File tree

go.mod

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module github.com/aws/karpenter-provider-aws
22

3-
go 1.26.2
3+
go 1.26.3
44

55
require (
66
github.com/Pallinder/go-randomdata v1.2.0
@@ -47,7 +47,7 @@ require (
4747
k8s.io/klog/v2 v2.130.1
4848
k8s.io/utils v0.0.0-20251222233032-718f0e51e6d2
4949
sigs.k8s.io/controller-runtime v0.22.4
50-
sigs.k8s.io/karpenter v1.12.0
50+
sigs.k8s.io/karpenter v1.12.1
5151
sigs.k8s.io/yaml v1.6.0
5252
)
5353

@@ -120,14 +120,14 @@ require (
120120
github.com/x448/float16 v0.8.4 // indirect
121121
go.yaml.in/yaml/v2 v2.4.3 // indirect
122122
go.yaml.in/yaml/v3 v3.0.4 // indirect
123-
golang.org/x/mod v0.34.0 // indirect
124-
golang.org/x/net v0.52.0 // indirect
123+
golang.org/x/mod v0.35.0 // indirect
124+
golang.org/x/net v0.55.0 // indirect
125125
golang.org/x/oauth2 v0.34.0 // indirect
126-
golang.org/x/sys v0.42.0 // indirect
127-
golang.org/x/term v0.41.0 // indirect
128-
golang.org/x/text v0.36.0 // indirect
126+
golang.org/x/sys v0.45.0 // indirect
127+
golang.org/x/term v0.43.0 // indirect
128+
golang.org/x/text v0.37.0 // indirect
129129
golang.org/x/time v0.15.0 // indirect
130-
golang.org/x/tools v0.43.0 // indirect
130+
golang.org/x/tools v0.44.0 // indirect
131131
gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect
132132
google.golang.org/protobuf v1.36.11 // indirect
133133
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect

go.sum

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -278,8 +278,8 @@ golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
278278
golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
279279
golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
280280
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
281-
golang.org/x/mod v0.34.0 h1:xIHgNUUnW6sYkcM5Jleh05DvLOtwc6RitGHbDk4akRI=
282-
golang.org/x/mod v0.34.0/go.mod h1:ykgH52iCZe79kzLLMhyCUzhMci+nQj+0XkbXpNYtVjY=
281+
golang.org/x/mod v0.35.0 h1:Ww1D637e6Pg+Zb2KrWfHQUnH2dQRLBQyAtpr/haaJeM=
282+
golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU=
283283
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
284284
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
285285
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
@@ -289,8 +289,8 @@ golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
289289
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
290290
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
291291
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
292-
golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0=
293-
golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw=
292+
golang.org/x/net v0.55.0 h1:bcvxaJn3e1U6InsFWt1JUq1aSjnRxLzT2rtD2KfkDF8=
293+
golang.org/x/net v0.55.0/go.mod h1:L5U2KuzuOe1lY7Z+aWVIKK6qEeJXnXV9yzGA+WCHJww=
294294
golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw=
295295
golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
296296
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -315,8 +315,8 @@ golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
315315
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
316316
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
317317
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
318-
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
319-
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
318+
golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY=
319+
golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
320320
golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
321321
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
322322
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
@@ -326,8 +326,8 @@ golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
326326
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
327327
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
328328
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
329-
golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU=
330-
golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A=
329+
golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4=
330+
golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk=
331331
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
332332
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
333333
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
@@ -337,8 +337,8 @@ golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
337337
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
338338
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
339339
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
340-
golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg=
341-
golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164=
340+
golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc=
341+
golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38=
342342
golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
343343
golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
344344
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -347,8 +347,8 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc
347347
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
348348
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
349349
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
350-
golang.org/x/tools v0.43.0 h1:12BdW9CeB3Z+J/I/wj34VMl8X+fEXBxVR90JeMX5E7s=
351-
golang.org/x/tools v0.43.0/go.mod h1:uHkMso649BX2cZK6+RpuIPXS3ho2hZo4FVwfoy1vIk0=
350+
golang.org/x/tools v0.44.0 h1:UP4ajHPIcuMjT1GqzDWRlalUEoY+uzoZKnhOjbIPD2c=
351+
golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI=
352352
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
353353
gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0=
354354
gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
@@ -389,8 +389,8 @@ sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327U
389389
sigs.k8s.io/controller-runtime v0.22.4/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8=
390390
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg=
391391
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg=
392-
sigs.k8s.io/karpenter v1.12.0 h1:SffONGtSqVfTiEBM55OZb5ZVEsBPaEcLuRLL8v98sPo=
393-
sigs.k8s.io/karpenter v1.12.0/go.mod h1:9eMMjkY2Wjx37QkrcPrSr+oiILoOr+DcuEV523Ygg2k=
392+
sigs.k8s.io/karpenter v1.12.1 h1:B73nBcw+va8eRzX3OXLexcw6vaGZuLHlN9iMksK2oa0=
393+
sigs.k8s.io/karpenter v1.12.1/go.mod h1:TAGfq4tY+33AnGzzwiNGRAMRHAxYNMbsqj1nmt/Kz+w=
394394
sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU=
395395
sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
396396
sigs.k8s.io/structured-merge-diff/v6 v6.3.1 h1:JrhdFMqOd/+3ByqlP2I45kTOZmTRLBUm5pvRjeheg7E=

pkg/controllers/controllers.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ func NewControllers(
112112
crexpiration.NewController(clk, kubeClient, cloudProvider, capacityReservationProvider),
113113
metrics.NewController(kubeClient, cloudProvider),
114114
arczonalshiftcontroller.NewController(zonalshiftProvider),
115-
interruption.NewInstanceStatusController(kubeClient, cloudProvider, recorder, instanceStatusProvider),
115+
interruption.NewInstanceStatusController(kubeClient, clk, cloudProvider, recorder, instanceStatusProvider),
116116
}
117117
// Instance profile garbage collection requires IAM API access. Skip registering the controller when running
118118
// in isolated VPC mode to avoid initiating calls to public AWS endpoints that won’t be reachable.
@@ -122,7 +122,7 @@ func NewControllers(
122122
if options.FromContext(ctx).InterruptionQueue != "" {
123123
sqsAPI := servicesqs.NewFromConfig(cfg)
124124
prov, _ := sqs.NewSQSProvider(ctx, sqsAPI)
125-
controllers = append(controllers, interruption.NewController(kubeClient, cloudProvider, recorder, prov, sqsAPI, unavailableOfferings, capacityReservationProvider))
125+
controllers = append(controllers, interruption.NewController(kubeClient, clk, cloudProvider, recorder, prov, sqsAPI, unavailableOfferings, capacityReservationProvider))
126126
}
127127
return controllers
128128
}

pkg/controllers/interruption/controller.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/awslabs/operatorpkg/singleton"
2727
"go.uber.org/multierr"
2828
"k8s.io/client-go/util/workqueue"
29+
"k8s.io/utils/clock"
2930
controllerruntime "sigs.k8s.io/controller-runtime"
3031
"sigs.k8s.io/controller-runtime/pkg/client"
3132
"sigs.k8s.io/controller-runtime/pkg/log"
@@ -54,6 +55,7 @@ type Controller struct {
5455

5556
func NewController(
5657
kubeClient client.Client,
58+
clk clock.Clock,
5759
cloudProvider cloudprovider.CloudProvider,
5860
recorder events.Recorder,
5961
sqsProvider sqs.Provider,
@@ -64,6 +66,7 @@ func NewController(
6466
return &Controller{
6567
InterruptionHandler: InterruptionHandler{
6668
kubeClient: kubeClient,
69+
clk: clk,
6770
cloudProvider: cloudProvider,
6871
recorder: recorder,
6972
unavailableOfferingsCache: unavailableOfferingsCache,
@@ -109,7 +112,7 @@ func (c *Controller) Reconcile(ctx context.Context) (reconciler.Result, error) {
109112
return
110113
}
111114
ReceivedMessages.Inc(map[string]string{messageTypeLabel: string(msg.Kind())})
112-
if e = c.handleMessage(ctx, msg); e != nil {
115+
if _, e = c.handleMessage(ctx, msg, false); e != nil {
113116
errs[i] = fmt.Errorf("handling message, %w", e)
114117
return
115118
}

pkg/controllers/interruption/instancestatus_controller.go

Lines changed: 79 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,14 @@ package interruption
1717
import (
1818
"context"
1919
"fmt"
20+
"sync"
2021
"time"
2122

2223
"github.com/awslabs/operatorpkg/reconciler"
2324
"github.com/awslabs/operatorpkg/singleton"
2425
"go.uber.org/multierr"
2526
"k8s.io/client-go/util/workqueue"
27+
"k8s.io/utils/clock"
2628
controllerruntime "sigs.k8s.io/controller-runtime"
2729
"sigs.k8s.io/controller-runtime/pkg/client"
2830
"sigs.k8s.io/controller-runtime/pkg/log"
@@ -31,36 +33,59 @@ import (
3133
"sigs.k8s.io/karpenter/pkg/events"
3234
"sigs.k8s.io/karpenter/pkg/operator/injection"
3335

34-
"github.com/aws/karpenter-provider-aws/pkg/controllers/interruption/messages/instancestatusfailure"
36+
"github.com/aws/karpenter-provider-aws/pkg/controllers/interruption/messages"
37+
instancestatusmsg "github.com/aws/karpenter-provider-aws/pkg/controllers/interruption/messages/instancestatus"
3538
awserrors "github.com/aws/karpenter-provider-aws/pkg/errors"
3639
"github.com/aws/karpenter-provider-aws/pkg/providers/instancestatus"
3740
)
3841

42+
// unhealthyKey uniquely identifies an unhealthy status check for deduplication.
43+
// The metric is only incremented the first time a given instance+category is observed.
44+
type unhealthyKey struct {
45+
instanceID string
46+
category string
47+
}
48+
3949
var (
4050
// InstanceStatusInterval is the polling interval for the EC2 DescribeInstanceStatus API.
4151
InstanceStatusInterval = 1 * time.Minute
52+
// InstanceStatusDryRun controls whether the instance status controller takes action on
53+
// unhealthy instances. When true, the controller only emits metrics without cordoning
54+
// and draining affected nodes. Default is false (full remediation enabled).
55+
InstanceStatusDryRun = false
56+
// categoryToKind maps EC2 DescribeInstanceStatus categories to message kinds.
57+
categoryToKind = map[instancestatus.Category]messages.Kind{
58+
instancestatus.InstanceStatus: messages.InstanceStatusKind,
59+
instancestatus.SystemStatus: messages.SystemStatusKind,
60+
instancestatus.EventStatus: messages.EventStatusKind,
61+
}
4262
)
4363

4464
// InstanceStatusController polls EC2 DescribeInstanceStatus to detect unhealthy instances
4565
// and scheduled maintenance events, then cordons and drains affected nodes.
4666
type InstanceStatusController struct {
4767
InterruptionHandler
4868
instanceStatusProvider instancestatus.Provider
69+
seen map[unhealthyKey]struct{}
70+
mu sync.Mutex
4971
}
5072

5173
func NewInstanceStatusController(
5274
kubeClient client.Client,
75+
clk clock.Clock,
5376
cloudProvider cloudprovider.CloudProvider,
5477
recorder events.Recorder,
5578
instanceStatusProvider instancestatus.Provider,
5679
) *InstanceStatusController {
5780
return &InstanceStatusController{
5881
InterruptionHandler: InterruptionHandler{
5982
kubeClient: kubeClient,
83+
clk: clk,
6084
cloudProvider: cloudProvider,
6185
recorder: recorder,
6286
},
6387
instanceStatusProvider: instanceStatusProvider,
88+
seen: map[unhealthyKey]struct{}{},
6489
}
6590
}
6691

@@ -76,25 +101,68 @@ func (c *InstanceStatusController) Reconcile(ctx context.Context) (reconciler.Re
76101
return reconciler.Result{}, fmt.Errorf("getting instance statuses, %w", err)
77102
}
78103

104+
// Build the set of keys observed in this poll cycle for pruning stale entries.
105+
currentKeys := make(map[unhealthyKey]struct{})
79106
errs := make([]error, len(instanceStatuses))
80107
workqueue.ParallelizeUntil(ctx, 10, len(instanceStatuses), func(i int) {
81-
categories := map[string]bool{}
82-
for _, d := range instanceStatuses[i].Details {
83-
categories[string(d.Category)] = true
84-
}
85-
for cat := range categories {
86-
InstanceStatusUnhealthy.Inc(map[string]string{categoryLabel: cat})
87-
}
88-
if err := c.handleMessage(ctx, instancestatusfailure.Message(instanceStatuses[i])); err != nil {
89-
errs[i] = fmt.Errorf("handling instance status check message, %w", err)
90-
}
108+
errs[i] = c.handleHealthStatus(ctx, instanceStatuses[i], currentKeys)
91109
})
110+
111+
// Prune entries for instances that are no longer reported as unhealthy,
112+
// so that if the same instance becomes unhealthy again later it gets counted again.
113+
c.mu.Lock()
114+
for key := range c.seen {
115+
if _, ok := currentKeys[key]; !ok {
116+
delete(c.seen, key)
117+
}
118+
}
119+
c.mu.Unlock()
120+
92121
if err = multierr.Combine(errs...); err != nil {
93122
return reconciler.Result{}, err
94123
}
95124
return reconciler.Result{RequeueAfter: InstanceStatusInterval}, nil
96125
}
97126

127+
// handleHealthStatus dispatches a message per EC2 status category and records metrics.
128+
func (c *InstanceStatusController) handleHealthStatus(ctx context.Context, hs instancestatus.HealthStatus, currentKeys map[unhealthyKey]struct{}) error {
129+
categories := make(map[instancestatus.Category]struct{})
130+
for _, d := range hs.Details {
131+
categories[d.Category] = struct{}{}
132+
}
133+
for category := range categories {
134+
kind, ok := categoryToKind[category]
135+
if !ok {
136+
continue
137+
}
138+
f, err := c.handleMessage(ctx, instancestatusmsg.New(hs.InstanceID, kind, hs.ImpairedSince), InstanceStatusDryRun)
139+
if err != nil {
140+
return fmt.Errorf("handling instance status check message, %w", err)
141+
}
142+
if f {
143+
c.recordUnhealthyInstance(ctx, hs.InstanceID, category, currentKeys)
144+
}
145+
}
146+
return nil
147+
}
148+
149+
func (c *InstanceStatusController) recordUnhealthyInstance(ctx context.Context, instanceID string, category instancestatus.Category, currentKeys map[unhealthyKey]struct{}) {
150+
key := unhealthyKey{instanceID: instanceID, category: string(category)}
151+
c.mu.Lock()
152+
currentKeys[key] = struct{}{}
153+
_, already := c.seen[key]
154+
if !already {
155+
c.seen[key] = struct{}{}
156+
}
157+
c.mu.Unlock()
158+
if !already {
159+
log.FromContext(ctx).Info("detected unhealthy instance owned by cluster",
160+
"instanceID", instanceID,
161+
"category", string(category))
162+
InstanceStatusUnhealthy.Inc(map[string]string{categoryLabel: string(category)})
163+
}
164+
}
165+
98166
func (c *InstanceStatusController) Register(_ context.Context, m manager.Manager) error {
99167
return controllerruntime.NewControllerManagedBy(m).
100168
Named("interruption.instancestatus").

pkg/controllers/interruption/interruption_benchmark_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ func benchmarkNotificationController(b *testing.B, messageCount int) {
110110
unavailableOfferingsCache = awscache.NewUnavailableOfferings()
111111

112112
// Set-up the controllers
113-
interruptionController := interruption.NewController(env.Client, fakeClock, recorder, providers.sqsProvider, unavailableOfferingsCache)
113+
interruptionController := interruption.NewController(env.Client, fakeClock, nil, recorder, providers.sqsProvider, nil, unavailableOfferingsCache, nil)
114114

115115
messages, nodes := makeDiverseMessagesAndNodes(messageCount)
116116
log.FromContext(ctx).Info("provisioning nodes")
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
Licensed under the Apache License, Version 2.0 (the "License");
3+
you may not use this file except in compliance with the License.
4+
You may obtain a copy of the License at
5+
6+
http://www.apache.org/licenses/LICENSE-2.0
7+
8+
Unless required by applicable law or agreed to in writing, software
9+
distributed under the License is distributed on an "AS IS" BASIS,
10+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
See the License for the specific language governing permissions and
12+
limitations under the License.
13+
*/
14+
15+
package instancestatus
16+
17+
import (
18+
"time"
19+
20+
"github.com/aws/karpenter-provider-aws/pkg/controllers/interruption/messages"
21+
)
22+
23+
// Message represents a single category from an EC2 DescribeInstanceStatus response.
24+
// The Kind maps directly to the EC2 status category (instance_status, system_status, event_status).
25+
type Message struct {
26+
instanceID string
27+
kind messages.Kind
28+
startTime time.Time
29+
}
30+
31+
func New(instanceID string, kind messages.Kind, startTime time.Time) Message {
32+
return Message{
33+
instanceID: instanceID,
34+
kind: kind,
35+
startTime: startTime,
36+
}
37+
}
38+
39+
func (m Message) EC2InstanceIDs() []string { return []string{m.instanceID} }
40+
func (m Message) Kind() messages.Kind { return m.kind }
41+
func (m Message) StartTime() time.Time { return m.startTime }

0 commit comments

Comments
 (0)