Skip to content

Commit 2261c5e

Browse files
author
John Lam
committed
Add GetInstanceENIs method and improve ENI creation logging
Implements a new GetInstanceENIs method to retrieve all ENIs attached to an EC2 instance, returning a map of device index to ENI ID. This helps prevent race conditions during ENI creation. Enhances logging in the NodeENI controller to better track ENI attachment operations, particularly during the subnet processing and device index determination phases. Updates the NodeENI controller to immediately update status after ENI attachment to prevent race conditions in subsequent reconciliations. Updates container image tags to the latest version with cleanup fixes.
1 parent 3614101 commit 2261c5e

9 files changed

Lines changed: 217 additions & 6 deletions

File tree

charts/aws-multi-eni-controller/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
# Image configuration
66
image:
77
repository: johnlam90/aws-multi-eni-controller
8-
tag: fix-al2023-dpdk-setup-7101ae5
8+
tag: fix-cleanup-case-01-3614101
99
pullPolicy: Always
1010

1111
# Namespace to deploy the controller

deploy/deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ spec:
173173
# - "kubernetes"
174174
containers:
175175
- name: manager
176-
image: johnlam90/aws-multi-eni-controller:fix-al2023-dpdk-setup-7101ae5
176+
image: johnlam90/aws-multi-eni-controller:fix-cleanup-case-01-3614101
177177
env:
178178
- name: COMPONENT
179179
value: "eni-controller"

deploy/eni-manager-daemonset.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ spec:
2020
ng: multi-eni
2121
initContainers:
2222
- name: dpdk-setup
23-
image: johnlam90/aws-multi-eni-controller:fix-al2023-dpdk-setup-7101ae5
23+
image: johnlam90/aws-multi-eni-controller:fix-cleanup-case-01-3614101
2424
command: ["/bin/sh", "-c"]
2525
args:
2626
- |
@@ -180,7 +180,7 @@ spec:
180180
readOnly: true
181181
containers:
182182
- name: eni-manager
183-
image: johnlam90/aws-multi-eni-controller:fix-al2023-dpdk-setup-7101ae5
183+
image: johnlam90/aws-multi-eni-controller:fix-cleanup-case-01-3614101
184184
env:
185185
- name: COMPONENT
186186
value: "eni-manager"

pkg/aws/ec2.go

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,73 @@ func (c *EC2Client) DescribeInstance(ctx context.Context, instanceID string) (*E
577577
return ec2Instance, nil
578578
}
579579

580+
// GetInstanceENIs gets all ENIs attached to an instance
581+
func (c *EC2Client) GetInstanceENIs(ctx context.Context, instanceID string) (map[int]string, error) {
582+
log := c.Logger.WithValues("instanceID", instanceID)
583+
log.V(1).Info("Getting all ENIs attached to instance")
584+
585+
// Use DescribeNetworkInterfaces with a filter to get all ENIs attached to this instance
586+
input := &ec2.DescribeNetworkInterfacesInput{
587+
Filters: []types.Filter{
588+
{
589+
Name: aws.String("attachment.instance-id"),
590+
Values: []string{instanceID},
591+
},
592+
{
593+
Name: aws.String("attachment.status"),
594+
Values: []string{"attached"},
595+
},
596+
},
597+
}
598+
599+
// Use exponential backoff for API rate limiting
600+
backoff := wait.Backoff{
601+
Steps: 5,
602+
Duration: 1 * time.Second,
603+
Factor: 2.0,
604+
Jitter: 0.1,
605+
}
606+
607+
var result *ec2.DescribeNetworkInterfacesOutput
608+
var lastErr error
609+
err := wait.ExponentialBackoff(backoff, func() (bool, error) {
610+
var err error
611+
result, err = c.EC2.DescribeNetworkInterfaces(ctx, input)
612+
if err != nil {
613+
// Check if this is a rate limit error
614+
if strings.Contains(err.Error(), "RequestLimitExceeded") ||
615+
strings.Contains(err.Error(), "Throttling") {
616+
log.Info("AWS API rate limit exceeded, retrying", "error", err.Error())
617+
lastErr = err
618+
return false, nil // Return nil error to continue retrying
619+
}
620+
621+
// For other errors, fail immediately
622+
lastErr = err
623+
return false, err
624+
}
625+
return true, nil
626+
})
627+
628+
if err != nil {
629+
return nil, fmt.Errorf("failed to describe network interfaces after retries: %v", lastErr)
630+
}
631+
632+
// Build a map of device index to ENI ID
633+
eniMap := make(map[int]string)
634+
for _, eni := range result.NetworkInterfaces {
635+
if eni.Attachment != nil && eni.Attachment.DeviceIndex != nil {
636+
deviceIndex := int(*eni.Attachment.DeviceIndex)
637+
eniID := *eni.NetworkInterfaceId
638+
eniMap[deviceIndex] = eniID
639+
log.V(1).Info("Found attached ENI", "eniID", eniID, "deviceIndex", deviceIndex)
640+
}
641+
}
642+
643+
log.Info("Retrieved all ENIs attached to instance", "count", len(eniMap), "eniMap", eniMap)
644+
return eniMap, nil
645+
}
646+
580647
// GetSubnetIDByName looks up a subnet ID by its Name tag
581648
func (c *EC2Client) GetSubnetIDByName(ctx context.Context, subnetName string) (string, error) {
582649
log := c.Logger.WithValues("subnetName", subnetName)

pkg/aws/ec2_client.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,8 @@ func (c *EC2ClientFacade) GetSecurityGroupIDByName(ctx context.Context, security
104104
func (c *EC2ClientFacade) DescribeInstance(ctx context.Context, instanceID string) (*EC2Instance, error) {
105105
return c.instanceDescriber.DescribeInstance(ctx, instanceID)
106106
}
107+
108+
// GetInstanceENIs delegates to InstanceDescriber
109+
func (c *EC2ClientFacade) GetInstanceENIs(ctx context.Context, instanceID string) (map[int]string, error) {
110+
return c.instanceDescriber.GetInstanceENIs(ctx, instanceID)
111+
}

pkg/aws/instance_describer.go

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ import (
66
"strings"
77
"time"
88

9+
"github.com/aws/aws-sdk-go-v2/aws"
910
"github.com/aws/aws-sdk-go-v2/service/ec2"
11+
"github.com/aws/aws-sdk-go-v2/service/ec2/types"
1012
"github.com/go-logr/logr"
1113
"k8s.io/apimachinery/pkg/util/wait"
1214
)
@@ -82,3 +84,70 @@ func (d *EC2InstanceDescriber) DescribeInstance(ctx context.Context, instanceID
8284
log.V(1).Info("Found EC2 instance", "instanceID", ec2Instance.InstanceID, "state", ec2Instance.State)
8385
return ec2Instance, nil
8486
}
87+
88+
// GetInstanceENIs gets all ENIs attached to an instance
89+
func (d *EC2InstanceDescriber) GetInstanceENIs(ctx context.Context, instanceID string) (map[int]string, error) {
90+
log := d.logger.WithValues("instanceID", instanceID)
91+
log.V(1).Info("Getting all ENIs attached to instance")
92+
93+
// Use DescribeNetworkInterfaces with a filter to get all ENIs attached to this instance
94+
input := &ec2.DescribeNetworkInterfacesInput{
95+
Filters: []types.Filter{
96+
{
97+
Name: aws.String("attachment.instance-id"),
98+
Values: []string{instanceID},
99+
},
100+
{
101+
Name: aws.String("attachment.status"),
102+
Values: []string{"attached"},
103+
},
104+
},
105+
}
106+
107+
// Use exponential backoff for API rate limiting
108+
backoff := wait.Backoff{
109+
Steps: 5,
110+
Duration: 1 * time.Second,
111+
Factor: 2.0,
112+
Jitter: 0.1,
113+
}
114+
115+
var result *ec2.DescribeNetworkInterfacesOutput
116+
var lastErr error
117+
err := wait.ExponentialBackoff(backoff, func() (bool, error) {
118+
var err error
119+
result, err = d.ec2Client.DescribeNetworkInterfaces(ctx, input)
120+
if err != nil {
121+
// Check if this is a rate limit error
122+
if strings.Contains(err.Error(), "RequestLimitExceeded") ||
123+
strings.Contains(err.Error(), "Throttling") {
124+
log.Info("AWS API rate limit exceeded, retrying", "error", err.Error())
125+
lastErr = err
126+
return false, nil // Return nil error to continue retrying
127+
}
128+
129+
// For other errors, fail immediately
130+
lastErr = err
131+
return false, err
132+
}
133+
return true, nil
134+
})
135+
136+
if err != nil {
137+
return nil, fmt.Errorf("failed to describe network interfaces after retries: %v", lastErr)
138+
}
139+
140+
// Build a map of device index to ENI ID
141+
eniMap := make(map[int]string)
142+
for _, eni := range result.NetworkInterfaces {
143+
if eni.Attachment != nil && eni.Attachment.DeviceIndex != nil {
144+
deviceIndex := int(*eni.Attachment.DeviceIndex)
145+
eniID := *eni.NetworkInterfaceId
146+
eniMap[deviceIndex] = eniID
147+
log.V(1).Info("Found attached ENI", "eniID", eniID, "deviceIndex", deviceIndex)
148+
}
149+
}
150+
151+
log.Info("Retrieved all ENIs attached to instance", "count", len(eniMap), "eniMap", eniMap)
152+
return eniMap, nil
153+
}

pkg/aws/interface.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ type InstanceDescriber interface {
5050
// DescribeInstance describes an EC2 instance
5151
// Returns nil, nil if the instance doesn't exist
5252
DescribeInstance(ctx context.Context, instanceID string) (*EC2Instance, error)
53+
54+
// GetInstanceENIs gets all ENIs attached to an instance
55+
// Returns a map of device index to ENI ID for all attached ENIs
56+
GetInstanceENIs(ctx context.Context, instanceID string) (map[int]string, error)
5357
}
5458

5559
// EC2Interface defines the combined interface for all EC2 operations using AWS SDK v2

pkg/aws/mock_ec2.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,36 @@ func (m *MockEC2Client) DescribeInstance(ctx context.Context, instanceID string)
345345
return result, nil
346346
}
347347

348+
// GetInstanceENIs gets all ENIs attached to an instance in the mock AWS environment
349+
func (m *MockEC2Client) GetInstanceENIs(ctx context.Context, instanceID string) (map[int]string, error) {
350+
m.mutex.RLock()
351+
defer m.mutex.RUnlock()
352+
353+
// Simulate describe delay
354+
if m.DescribeWaitTime > 0 {
355+
time.Sleep(m.DescribeWaitTime)
356+
}
357+
358+
// Check if we should simulate a failure
359+
if m.FailureScenarios["GetInstanceENIs"] {
360+
return nil, fmt.Errorf("simulated GetInstanceENIs failure")
361+
}
362+
363+
// Build a map of device index to ENI ID for this instance
364+
eniMap := make(map[int]string)
365+
366+
// Check all ENIs to see which ones are attached to this instance
367+
for eniID, eni := range m.ENIs {
368+
if eni.Attachment != nil &&
369+
eni.Attachment.InstanceID == instanceID &&
370+
eni.Attachment.Status == "attached" {
371+
eniMap[int(eni.Attachment.DeviceIndex)] = eniID
372+
}
373+
}
374+
375+
return eniMap, nil
376+
}
377+
348378
// GetSubnetIDByName looks up a subnet ID by its Name tag in the mock AWS environment
349379
func (m *MockEC2Client) GetSubnetIDByName(ctx context.Context, subnetName string) (string, error) {
350380
m.mutex.RLock()

pkg/controller/nodeeni_controller.go

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1412,12 +1412,23 @@ func (r *NodeENIReconciler) buildAttachmentMaps(nodeENI *networkingv1alpha1.Node
14121412
usedDeviceIndices map[int]bool,
14131413
subnetToDeviceIndex map[string]int,
14141414
) {
1415+
log := r.Log.WithValues("nodeeni", nodeENI.Name, "node", nodeName)
1416+
14151417
existingSubnets = make(map[string]bool)
14161418
usedDeviceIndices = make(map[int]bool)
14171419
subnetToDeviceIndex = make(map[string]int)
14181420

1421+
log.Info("Building attachment maps", "totalAttachments", len(nodeENI.Status.Attachments))
1422+
14191423
// First pass: build the subnet to device index mapping from existing attachments
14201424
for _, attachment := range nodeENI.Status.Attachments {
1425+
log.V(1).Info("Processing attachment",
1426+
"nodeID", attachment.NodeID,
1427+
"eniID", attachment.ENIID,
1428+
"subnetID", attachment.SubnetID,
1429+
"deviceIndex", attachment.DeviceIndex,
1430+
"isCurrentNode", attachment.NodeID == nodeName)
1431+
14211432
// We want to build a global mapping across all nodes
14221433
if attachment.SubnetID != "" && attachment.DeviceIndex > 0 {
14231434
// If we haven't seen this subnet before, or if this device index is lower
@@ -1430,14 +1441,21 @@ func (r *NodeENIReconciler) buildAttachmentMaps(nodeENI *networkingv1alpha1.Node
14301441
// For this specific node, track which subnets already have ENIs
14311442
if attachment.NodeID == nodeName && attachment.SubnetID != "" {
14321443
existingSubnets[attachment.SubnetID] = true
1444+
log.Info("Found existing ENI for node in subnet", "subnetID", attachment.SubnetID, "eniID", attachment.ENIID)
14331445
}
14341446

14351447
// For this specific node, track which device indices are already in use
14361448
if attachment.NodeID == nodeName && attachment.DeviceIndex > 0 {
14371449
usedDeviceIndices[attachment.DeviceIndex] = true
1450+
log.Info("Found used device index for node", "deviceIndex", attachment.DeviceIndex, "eniID", attachment.ENIID)
14381451
}
14391452
}
14401453

1454+
log.Info("Built attachment maps",
1455+
"existingSubnets", existingSubnets,
1456+
"usedDeviceIndices", usedDeviceIndices,
1457+
"subnetToDeviceIndex", subnetToDeviceIndex)
1458+
14411459
return existingSubnets, usedDeviceIndices, subnetToDeviceIndex
14421460
}
14431461

@@ -1454,22 +1472,32 @@ func (r *NodeENIReconciler) createMissingENIs(
14541472
) {
14551473
log := r.Log.WithValues("nodeeni", nodeENI.Name, "node", node.Name)
14561474

1475+
log.Info("Processing subnets for ENI creation",
1476+
"totalSubnets", len(subnetIDs),
1477+
"subnetIDs", subnetIDs,
1478+
"existingSubnets", existingSubnets,
1479+
"usedDeviceIndices", usedDeviceIndices)
1480+
14571481
for i, subnetID := range subnetIDs {
14581482
// Skip if we already have an ENI in this subnet
14591483
if existingSubnets[subnetID] {
1460-
log.Info("Node already has an ENI in this subnet", "subnetID", subnetID)
1484+
log.Info("Node already has an ENI in this subnet, skipping", "subnetID", subnetID)
14611485
continue
14621486
}
14631487

1488+
log.Info("Creating ENI for subnet", "subnetID", subnetID, "subnetIndex", i)
1489+
14641490
// Determine the device index to use for this subnet
14651491
deviceIndex := r.determineDeviceIndex(nodeENI, subnetID, i, subnetToDeviceIndex, usedDeviceIndices, log)
14661492

14671493
// Create and attach a new ENI for this subnet
14681494
if err := r.createAndAttachENIForSubnet(ctx, nodeENI, node, instanceID, subnetID, deviceIndex); err != nil {
1469-
log.Error(err, "Failed to create and attach ENI for subnet", "subnetID", subnetID)
1495+
log.Error(err, "Failed to create and attach ENI for subnet", "subnetID", subnetID, "deviceIndex", deviceIndex)
14701496
// Continue with other subnets even if one fails
14711497
continue
14721498
}
1499+
1500+
log.Info("Successfully created and attached ENI for subnet", "subnetID", subnetID, "deviceIndex", deviceIndex)
14731501
}
14741502
}
14751503

@@ -1602,6 +1630,14 @@ func (r *NodeENIReconciler) createAndAttachENIForSubnet(ctx context.Context, nod
16021630
LastUpdated: metav1.Now(),
16031631
})
16041632

1633+
// Update the NodeENI status immediately to prevent race conditions
1634+
// This ensures that subsequent reconciliations see the attachment
1635+
if err := r.updateNodeENIStatusWithRetry(ctx, nodeENI); err != nil {
1636+
log.Error(err, "Failed to update NodeENI status after ENI attachment", "eniID", eniID)
1637+
// Don't return error here as the ENI is already attached successfully
1638+
// The status will be updated in the next reconciliation
1639+
}
1640+
16051641
r.Recorder.Eventf(nodeENI, corev1.EventTypeNormal, "ENIAttached",
16061642
"Successfully attached ENI %s to node %s in subnet %s", eniID, node.Name, subnetID)
16071643

0 commit comments

Comments
 (0)