Skip to content

Commit 77789df

Browse files
author
John Lam
committed
Update ENI controller image tag and enhance device index handling with AWS state integration
1 parent 2261c5e commit 77789df

7 files changed

Lines changed: 558 additions & 10 deletions

File tree

charts/aws-multi-eni-controller/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
# Image configuration
66
image:
77
repository: johnlam90/aws-multi-eni-controller
8-
tag: fix-cleanup-case-01-3614101
8+
tag: fix-cleanup-case-01-2261c5e
99
pullPolicy: Always
1010

1111
# Namespace to deploy the controller

deploy/deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ spec:
173173
# - "kubernetes"
174174
containers:
175175
- name: manager
176-
image: johnlam90/aws-multi-eni-controller:fix-cleanup-case-01-3614101
176+
image: johnlam90/aws-multi-eni-controller:fix-cleanup-case-01-2261c5e
177177
env:
178178
- name: COMPONENT
179179
value: "eni-controller"

deploy/eni-manager-daemonset.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ spec:
2020
ng: multi-eni
2121
initContainers:
2222
- name: dpdk-setup
23-
image: johnlam90/aws-multi-eni-controller:fix-cleanup-case-01-3614101
23+
image: johnlam90/aws-multi-eni-controller:fix-cleanup-case-01-2261c5e
2424
command: ["/bin/sh", "-c"]
2525
args:
2626
- |
@@ -180,7 +180,7 @@ spec:
180180
readOnly: true
181181
containers:
182182
- name: eni-manager
183-
image: johnlam90/aws-multi-eni-controller:fix-cleanup-case-01-3614101
183+
image: johnlam90/aws-multi-eni-controller:fix-cleanup-case-01-2261c5e
184184
env:
185185
- name: COMPONENT
186186
value: "eni-manager"

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ require (
6666
github.com/prometheus/procfs v0.9.0 // indirect
6767
github.com/rogpeppe/go-internal v1.14.1 // indirect
6868
github.com/spf13/pflag v1.0.5 // indirect
69+
github.com/stretchr/objx v0.5.0 // indirect
6970
github.com/vishvananda/netns v0.0.4 // indirect
7071
go.uber.org/atomic v1.7.0 // indirect
7172
go.uber.org/multierr v1.6.0 // indirect

go.sum

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An
184184
github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8=
185185
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
186186
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
187+
github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c=
187188
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
188189
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
189190
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=

pkg/controller/nodeeni_controller.go

Lines changed: 133 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1400,6 +1400,16 @@ func (r *NodeENIReconciler) processNode(ctx context.Context, nodeENI *networking
14001400
// Build maps for tracking existing attachments and device indices
14011401
existingSubnets, usedDeviceIndices, subnetToDeviceIndex := r.buildAttachmentMaps(nodeENI, node.Name)
14021402

1403+
// Query AWS for actual current ENI attachments to ensure state consistency
1404+
awsUsedDeviceIndices, err := r.getAWSUsedDeviceIndices(ctx, instanceID)
1405+
if err != nil {
1406+
log.Error(err, "Failed to query AWS for current ENI attachments, proceeding with internal state only")
1407+
// Continue with internal state only as fallback
1408+
} else {
1409+
// Merge AWS state with internal state for more accurate device index tracking
1410+
usedDeviceIndices = r.mergeDeviceIndices(usedDeviceIndices, awsUsedDeviceIndices, log)
1411+
}
1412+
14031413
// Create ENIs for any subnets that don't already have one
14041414
r.createMissingENIs(ctx, nodeENI, node, instanceID, subnetIDs, existingSubnets, usedDeviceIndices, subnetToDeviceIndex)
14051415

@@ -1556,6 +1566,62 @@ func (r *NodeENIReconciler) findAvailableDeviceIndex(
15561566
return deviceIndex
15571567
}
15581568

1569+
// getAWSUsedDeviceIndices queries AWS for actual current ENI attachments on an instance
1570+
func (r *NodeENIReconciler) getAWSUsedDeviceIndices(ctx context.Context, instanceID string) (map[int]bool, error) {
1571+
log := r.Log.WithValues("instanceID", instanceID)
1572+
log.V(1).Info("Querying AWS for current ENI attachments")
1573+
1574+
// Query AWS for all ENIs currently attached to this instance
1575+
eniMap, err := r.AWS.GetInstanceENIs(ctx, instanceID)
1576+
if err != nil {
1577+
return nil, fmt.Errorf("failed to get instance ENIs from AWS: %v", err)
1578+
}
1579+
1580+
// Convert to device index map
1581+
awsUsedDeviceIndices := make(map[int]bool)
1582+
for deviceIndex := range eniMap {
1583+
awsUsedDeviceIndices[deviceIndex] = true
1584+
}
1585+
1586+
log.Info("Retrieved AWS ENI attachments", "deviceIndices", awsUsedDeviceIndices)
1587+
return awsUsedDeviceIndices, nil
1588+
}
1589+
1590+
// mergeDeviceIndices merges internal state with AWS state for more accurate device index tracking
1591+
func (r *NodeENIReconciler) mergeDeviceIndices(internalIndices, awsIndices map[int]bool, log logr.Logger) map[int]bool {
1592+
merged := make(map[int]bool)
1593+
1594+
// Start with AWS state as the source of truth
1595+
for deviceIndex := range awsIndices {
1596+
merged[deviceIndex] = true
1597+
}
1598+
1599+
// Add any internal indices that might not be reflected in AWS yet
1600+
// (e.g., during attachment process)
1601+
for deviceIndex := range internalIndices {
1602+
if !merged[deviceIndex] {
1603+
log.Info("Adding internal device index not found in AWS", "deviceIndex", deviceIndex)
1604+
merged[deviceIndex] = true
1605+
}
1606+
}
1607+
1608+
// Log any discrepancies for debugging
1609+
for deviceIndex := range awsIndices {
1610+
if !internalIndices[deviceIndex] {
1611+
log.Info("AWS shows device index not in internal state", "deviceIndex", deviceIndex)
1612+
}
1613+
}
1614+
1615+
for deviceIndex := range internalIndices {
1616+
if !awsIndices[deviceIndex] {
1617+
log.Info("Internal state shows device index not in AWS", "deviceIndex", deviceIndex)
1618+
}
1619+
}
1620+
1621+
log.Info("Merged device indices", "internal", internalIndices, "aws", awsIndices, "merged", merged)
1622+
return merged
1623+
}
1624+
15591625
// createAndAttachENI creates and attaches a new ENI to a node
15601626
// This is kept for backward compatibility
15611627
func (r *NodeENIReconciler) createAndAttachENI(ctx context.Context, nodeENI *networkingv1alpha1.NodeENI, node corev1.Node, instanceID string) error {
@@ -1587,7 +1653,7 @@ func (r *NodeENIReconciler) createAndAttachENIForSubnet(ctx context.Context, nod
15871653
}
15881654

15891655
// Attach the ENI
1590-
attachmentID, err := r.attachENI(ctx, eniID, instanceID, deviceIndex)
1656+
attachmentID, actualDeviceIndex, err := r.attachENI(ctx, eniID, instanceID, deviceIndex)
15911657
if err != nil {
15921658
log.Error(err, "Failed to attach ENI", "eniID", eniID)
15931659
r.Recorder.Eventf(nodeENI, corev1.EventTypeWarning, "ENIAttachmentFailed",
@@ -1608,6 +1674,9 @@ func (r *NodeENIReconciler) createAndAttachENIForSubnet(ctx context.Context, nod
16081674
return err
16091675
}
16101676

1677+
// Use the actual device index that was used (may be different if corrected during retry)
1678+
deviceIndex = actualDeviceIndex
1679+
16111680
// Get the subnet CIDR
16121681
subnetCIDR, err := r.AWS.GetSubnetCIDRByID(ctx, subnetID)
16131682
if err != nil {
@@ -2109,20 +2178,78 @@ func (r *NodeENIReconciler) getAllSubnetIDs(ctx context.Context, nodeENI *networ
21092178
return subnetIDs, nil
21102179
}
21112180

2112-
// attachENI attaches an ENI to an EC2 instance
2113-
func (r *NodeENIReconciler) attachENI(ctx context.Context, eniID, instanceID string, deviceIndex int) (string, error) {
2181+
// attachENI attaches an ENI to an EC2 instance with retry logic for device index conflicts
2182+
// Returns the attachment ID and the actual device index used (which may differ from requested if corrected during retry)
2183+
func (r *NodeENIReconciler) attachENI(ctx context.Context, eniID, instanceID string, deviceIndex int) (string, int, error) {
2184+
log := r.Log.WithValues("eniID", eniID, "instanceID", instanceID, "deviceIndex", deviceIndex)
2185+
21142186
// Use default device index if not specified
21152187
if deviceIndex <= 0 {
21162188
deviceIndex = r.Config.DefaultDeviceIndex
21172189
}
21182190

2119-
// Attach the ENI with delete on termination set to the configured default
2191+
// First attempt: try to attach with the requested device index
21202192
attachmentID, err := r.AWS.AttachENI(ctx, eniID, instanceID, deviceIndex, r.Config.DefaultDeleteOnTermination)
21212193
if err != nil {
2122-
return "", fmt.Errorf("failed to attach ENI: %v", err)
2194+
// Check if this is a device index conflict error
2195+
if r.isDeviceIndexConflictError(err) {
2196+
log.Info("Device index conflict detected, querying AWS for current state and retrying", "error", err.Error())
2197+
2198+
// Query AWS for actual current ENI attachments
2199+
awsUsedDeviceIndices, awsErr := r.getAWSUsedDeviceIndices(ctx, instanceID)
2200+
if awsErr != nil {
2201+
log.Error(awsErr, "Failed to query AWS for current ENI attachments during retry")
2202+
return "", deviceIndex, fmt.Errorf("failed to attach ENI: %v (retry query failed: %v)", err, awsErr)
2203+
}
2204+
2205+
// Find the next available device index based on AWS reality
2206+
retryDeviceIndex := r.findNextAvailableDeviceIndex(deviceIndex, awsUsedDeviceIndices, log)
2207+
if retryDeviceIndex == deviceIndex {
2208+
// No conflict found in AWS state, this might be a transient error
2209+
log.Info("No device index conflict found in AWS state, original error might be transient")
2210+
return "", deviceIndex, fmt.Errorf("failed to attach ENI: %v", err)
2211+
}
2212+
2213+
log.Info("Retrying attachment with corrected device index", "originalIndex", deviceIndex, "retryIndex", retryDeviceIndex)
2214+
2215+
// Retry with the corrected device index
2216+
attachmentID, retryErr := r.AWS.AttachENI(ctx, eniID, instanceID, retryDeviceIndex, r.Config.DefaultDeleteOnTermination)
2217+
if retryErr != nil {
2218+
return "", retryDeviceIndex, fmt.Errorf("failed to attach ENI after retry: original error: %v, retry error: %v", err, retryErr)
2219+
}
2220+
2221+
log.Info("Successfully attached ENI after device index correction", "finalDeviceIndex", retryDeviceIndex)
2222+
return attachmentID, retryDeviceIndex, nil
2223+
}
2224+
2225+
// For non-device-index errors, return the original error
2226+
return "", deviceIndex, fmt.Errorf("failed to attach ENI: %v", err)
21232227
}
21242228

2125-
return attachmentID, nil
2229+
return attachmentID, deviceIndex, nil
2230+
}
2231+
2232+
// isDeviceIndexConflictError checks if the error indicates a device index conflict
2233+
func (r *NodeENIReconciler) isDeviceIndexConflictError(err error) bool {
2234+
if err == nil {
2235+
return false
2236+
}
2237+
errorStr := err.Error()
2238+
return strings.Contains(errorStr, "already has an interface attached at device index") ||
2239+
strings.Contains(errorStr, "InvalidParameterValue") && strings.Contains(errorStr, "device index")
2240+
}
2241+
2242+
// findNextAvailableDeviceIndex finds the next available device index starting from the given index
2243+
func (r *NodeENIReconciler) findNextAvailableDeviceIndex(startIndex int, usedDeviceIndices map[int]bool, log logr.Logger) int {
2244+
deviceIndex := startIndex
2245+
2246+
// Find the next available device index
2247+
for usedDeviceIndices[deviceIndex] {
2248+
deviceIndex++
2249+
log.Info("Device index in use according to AWS, incrementing", "usedIndex", deviceIndex-1, "nextIndex", deviceIndex)
2250+
}
2251+
2252+
return deviceIndex
21262253
}
21272254

21282255
// verifyENIAttachments verifies the actual state of ENIs in AWS and updates the NodeENI resource status accordingly

0 commit comments

Comments
 (0)