Skip to content

Commit 9f87a47

Browse files
authored
🐛 fixing crashes on State-change events (#313)
NTH crashes often from "EC2 Instance State-change Notification" events. The underlying issue was that an empty parsed NodeName derived from PrivateDnsName was forwarded unverified, creating cascading problems. This solves the root cause, that "EC2 Instance State-change Notification" can arrive at a time where the instance is in shutting-down, terminated or any other not-online situation where the PrivateDnsName metadata is empty! Instead of just ignoring these errors this implementation gets and decides based on the state of the instance message if it is an error to fail or to ignore. Therefor such messages are dropped in above situation because they are useless.
1 parent 363674f commit 9f87a47

File tree

3 files changed

+83
-14
lines changed

3 files changed

+83
-14
lines changed

cmd/node-termination-handler.go

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import (
3737
"github.com/aws/aws-sdk-go/service/sqs"
3838
"github.com/rs/zerolog"
3939
"github.com/rs/zerolog/log"
40+
"k8s.io/apimachinery/pkg/api/errors"
4041
"k8s.io/apimachinery/pkg/util/wait"
4142
)
4243

@@ -267,23 +268,33 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto
267268
if nthConfig.CordonOnly || drainEvent.IsRebalanceRecommendation() {
268269
err := node.Cordon(nodeName)
269270
if err != nil {
270-
log.Log().Err(err).Msg("There was a problem while trying to cordon the node")
271-
os.Exit(1)
272-
}
273-
log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned")
274-
err = node.LogPods(nodeName)
275-
if err != nil {
276-
log.Log().Err(err).Msg("There was a problem while trying to log all pod names on the node")
271+
if errors.IsNotFound(err) {
272+
log.Warn().Err(err).Msgf("node '%s' not found in the cluster", nodeName)
273+
} else {
274+
log.Log().Err(err).Msg("There was a problem while trying to cordon the node")
275+
os.Exit(1)
276+
}
277+
} else {
278+
log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned")
279+
err = node.LogPods(nodeName)
280+
if err != nil {
281+
log.Log().Err(err).Msg("There was a problem while trying to log all pod names on the node")
282+
}
283+
metrics.NodeActionsInc("cordon", nodeName, err)
277284
}
278-
metrics.NodeActionsInc("cordon", nodeName, err)
279285
} else {
280286
err := node.CordonAndDrain(nodeName)
281287
if err != nil {
282-
log.Log().Err(err).Msg("There was a problem while trying to cordon and drain the node")
283-
os.Exit(1)
288+
if errors.IsNotFound(err) {
289+
log.Warn().Err(err).Msgf("node '%s' not found in the cluster", nodeName)
290+
} else {
291+
log.Log().Err(err).Msg("There was a problem while trying to cordon and drain the node")
292+
os.Exit(1)
293+
}
294+
} else {
295+
log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned and drained")
296+
metrics.NodeActionsInc("cordon-and-drain", nodeName, err)
284297
}
285-
log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned and drained")
286-
metrics.NodeActionsInc("cordon-and-drain", nodeName, err)
287298
}
288299

289300
interruptionEventStore.MarkAllAsDrained(nodeName)

pkg/monitor/sqsevent/sqs-monitor.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ package sqsevent
1515

1616
import (
1717
"encoding/json"
18+
"errors"
1819
"fmt"
1920

2021
"github.com/aws/aws-node-termination-handler/pkg/monitor"
@@ -33,6 +34,9 @@ const (
3334
SQSTerminateKind = "SQS_TERMINATE"
3435
)
3536

37+
// ErrNodeStateNotRunning forwards condition that the instance is terminated thus metadata missing
38+
var ErrNodeStateNotRunning = errors.New("node metadata unavailable")
39+
3640
// SQSMonitor is a struct definition that knows how to process events from Amazon EventBridge
3741
type SQSMonitor struct {
3842
InterruptionChan chan<- monitor.InterruptionEvent
@@ -54,6 +58,10 @@ func (m SQSMonitor) Kind() string {
5458
func (m SQSMonitor) Monitor() error {
5559
interruptionEvent, err := m.checkForSQSMessage()
5660
if err != nil {
61+
if errors.Is(err, ErrNodeStateNotRunning) {
62+
log.Warn().Err(err).Msg("dropping event for an already terminated node")
63+
return nil
64+
}
5765
return err
5866
}
5967
if interruptionEvent != nil && interruptionEvent.Kind == SQSTerminateKind {
@@ -175,10 +183,24 @@ func (m SQSMonitor) retrieveNodeName(instanceID string) (string, error) {
175183
}
176184

177185
instance := result.Reservations[0].Instances[0]
178-
log.Debug().Msgf("Got nodename from private ip %s", *instance.PrivateDnsName)
186+
nodeName := *instance.PrivateDnsName
187+
log.Debug().Msgf("Got nodename from private ip %s", nodeName)
179188
instanceJSON, _ := json.MarshalIndent(*instance, " ", " ")
180189
log.Debug().Msgf("Got nodename from ec2 describe call: %s", instanceJSON)
181-
return *instance.PrivateDnsName, nil
190+
191+
if nodeName == "" {
192+
state := "unknown"
193+
// safe access instance.State potentially being nil
194+
if instance.State != nil {
195+
state = *instance.State.Name
196+
}
197+
// anything except running might not contain PrivateDnsName
198+
if state != ec2.InstanceStateNameRunning {
199+
return "", ErrNodeStateNotRunning
200+
}
201+
return "", fmt.Errorf("unable to retrieve PrivateDnsName name for '%s' in state '%s'", instanceID, state)
202+
}
203+
return nodeName, nil
182204
}
183205

184206
// isInstanceManaged returns whether the instance specified should be managed by node termination handler

pkg/monitor/sqsevent/sqs-monitor_test.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,42 @@ func TestMonitor_EC2NoDNSName(t *testing.T) {
410410
h.Ok(t, err)
411411
}
412412

413+
func TestMonitor_EC2NoDNSNameOnTerminatedInstance(t *testing.T) {
414+
msg, err := getSQSMessageFromEvent(asgLifecycleEvent)
415+
h.Ok(t, err)
416+
messages := []*sqs.Message{
417+
&msg,
418+
}
419+
sqsMock := h.MockedSQS{
420+
ReceiveMessageResp: sqs.ReceiveMessageOutput{Messages: messages},
421+
ReceiveMessageErr: nil,
422+
DeleteMessageResp: sqs.DeleteMessageOutput{},
423+
}
424+
ec2Mock := h.MockedEC2{
425+
DescribeInstancesResp: getDescribeInstancesResp(""),
426+
}
427+
ec2Mock.DescribeInstancesResp.Reservations[0].Instances[0].State = &ec2.InstanceState{
428+
Name: aws.String("running"),
429+
}
430+
drainChan := make(chan monitor.InterruptionEvent)
431+
432+
sqsMonitor := sqsevent.SQSMonitor{
433+
SQS: sqsMock,
434+
EC2: ec2Mock,
435+
ASG: mockIsManagedTrue(nil),
436+
CheckIfManaged: true,
437+
QueueURL: "https://test-queue",
438+
InterruptionChan: drainChan,
439+
}
440+
go func() {
441+
result := <-drainChan
442+
h.Equals(t, result.Kind, sqsevent.SQSTerminateKind)
443+
}()
444+
445+
err = sqsMonitor.Monitor()
446+
h.Nok(t, err)
447+
}
448+
413449
func TestMonitor_SQSDeleteFailure(t *testing.T) {
414450
msg, err := getSQSMessageFromEvent(asgLifecycleEvent)
415451
h.Ok(t, err)

0 commit comments

Comments
 (0)