@@ -1400,6 +1400,16 @@ func (r *NodeENIReconciler) processNode(ctx context.Context, nodeENI *networking
14001400 // Build maps for tracking existing attachments and device indices
14011401 existingSubnets , usedDeviceIndices , subnetToDeviceIndex := r .buildAttachmentMaps (nodeENI , node .Name )
14021402
1403+ // Query AWS for actual current ENI attachments to ensure state consistency
1404+ awsUsedDeviceIndices , err := r .getAWSUsedDeviceIndices (ctx , instanceID )
1405+ if err != nil {
1406+ log .Error (err , "Failed to query AWS for current ENI attachments, proceeding with internal state only" )
1407+ // Continue with internal state only as fallback
1408+ } else {
1409+ // Merge AWS state with internal state for more accurate device index tracking
1410+ usedDeviceIndices = r .mergeDeviceIndices (usedDeviceIndices , awsUsedDeviceIndices , log )
1411+ }
1412+
14031413 // Create ENIs for any subnets that don't already have one
14041414 r .createMissingENIs (ctx , nodeENI , node , instanceID , subnetIDs , existingSubnets , usedDeviceIndices , subnetToDeviceIndex )
14051415
@@ -1556,6 +1566,62 @@ func (r *NodeENIReconciler) findAvailableDeviceIndex(
15561566 return deviceIndex
15571567}
15581568
1569+ // getAWSUsedDeviceIndices queries AWS for actual current ENI attachments on an instance
1570+ func (r * NodeENIReconciler ) getAWSUsedDeviceIndices (ctx context.Context , instanceID string ) (map [int ]bool , error ) {
1571+ log := r .Log .WithValues ("instanceID" , instanceID )
1572+ log .V (1 ).Info ("Querying AWS for current ENI attachments" )
1573+
1574+ // Query AWS for all ENIs currently attached to this instance
1575+ eniMap , err := r .AWS .GetInstanceENIs (ctx , instanceID )
1576+ if err != nil {
1577+ return nil , fmt .Errorf ("failed to get instance ENIs from AWS: %v" , err )
1578+ }
1579+
1580+ // Convert to device index map
1581+ awsUsedDeviceIndices := make (map [int ]bool )
1582+ for deviceIndex := range eniMap {
1583+ awsUsedDeviceIndices [deviceIndex ] = true
1584+ }
1585+
1586+ log .Info ("Retrieved AWS ENI attachments" , "deviceIndices" , awsUsedDeviceIndices )
1587+ return awsUsedDeviceIndices , nil
1588+ }
1589+
1590+ // mergeDeviceIndices merges internal state with AWS state for more accurate device index tracking
1591+ func (r * NodeENIReconciler ) mergeDeviceIndices (internalIndices , awsIndices map [int ]bool , log logr.Logger ) map [int ]bool {
1592+ merged := make (map [int ]bool )
1593+
1594+ // Start with AWS state as the source of truth
1595+ for deviceIndex := range awsIndices {
1596+ merged [deviceIndex ] = true
1597+ }
1598+
1599+ // Add any internal indices that might not be reflected in AWS yet
1600+ // (e.g., during attachment process)
1601+ for deviceIndex := range internalIndices {
1602+ if ! merged [deviceIndex ] {
1603+ log .Info ("Adding internal device index not found in AWS" , "deviceIndex" , deviceIndex )
1604+ merged [deviceIndex ] = true
1605+ }
1606+ }
1607+
1608+ // Log any discrepancies for debugging
1609+ for deviceIndex := range awsIndices {
1610+ if ! internalIndices [deviceIndex ] {
1611+ log .Info ("AWS shows device index not in internal state" , "deviceIndex" , deviceIndex )
1612+ }
1613+ }
1614+
1615+ for deviceIndex := range internalIndices {
1616+ if ! awsIndices [deviceIndex ] {
1617+ log .Info ("Internal state shows device index not in AWS" , "deviceIndex" , deviceIndex )
1618+ }
1619+ }
1620+
1621+ log .Info ("Merged device indices" , "internal" , internalIndices , "aws" , awsIndices , "merged" , merged )
1622+ return merged
1623+ }
1624+
15591625// createAndAttachENI creates and attaches a new ENI to a node
15601626// This is kept for backward compatibility
15611627func (r * NodeENIReconciler ) createAndAttachENI (ctx context.Context , nodeENI * networkingv1alpha1.NodeENI , node corev1.Node , instanceID string ) error {
@@ -1587,7 +1653,7 @@ func (r *NodeENIReconciler) createAndAttachENIForSubnet(ctx context.Context, nod
15871653 }
15881654
15891655 // Attach the ENI
1590- attachmentID , err := r .attachENI (ctx , eniID , instanceID , deviceIndex )
1656+ attachmentID , actualDeviceIndex , err := r .attachENI (ctx , eniID , instanceID , deviceIndex )
15911657 if err != nil {
15921658 log .Error (err , "Failed to attach ENI" , "eniID" , eniID )
15931659 r .Recorder .Eventf (nodeENI , corev1 .EventTypeWarning , "ENIAttachmentFailed" ,
@@ -1608,6 +1674,9 @@ func (r *NodeENIReconciler) createAndAttachENIForSubnet(ctx context.Context, nod
16081674 return err
16091675 }
16101676
1677+ // Use the actual device index that was used (may be different if corrected during retry)
1678+ deviceIndex = actualDeviceIndex
1679+
16111680 // Get the subnet CIDR
16121681 subnetCIDR , err := r .AWS .GetSubnetCIDRByID (ctx , subnetID )
16131682 if err != nil {
@@ -2109,20 +2178,78 @@ func (r *NodeENIReconciler) getAllSubnetIDs(ctx context.Context, nodeENI *networ
21092178 return subnetIDs , nil
21102179}
21112180
2112- // attachENI attaches an ENI to an EC2 instance
2113- func (r * NodeENIReconciler ) attachENI (ctx context.Context , eniID , instanceID string , deviceIndex int ) (string , error ) {
2181+ // attachENI attaches an ENI to an EC2 instance with retry logic for device index conflicts
2182+ // Returns the attachment ID and the actual device index used (which may differ from requested if corrected during retry)
2183+ func (r * NodeENIReconciler ) attachENI (ctx context.Context , eniID , instanceID string , deviceIndex int ) (string , int , error ) {
2184+ log := r .Log .WithValues ("eniID" , eniID , "instanceID" , instanceID , "deviceIndex" , deviceIndex )
2185+
21142186 // Use default device index if not specified
21152187 if deviceIndex <= 0 {
21162188 deviceIndex = r .Config .DefaultDeviceIndex
21172189 }
21182190
2119- // Attach the ENI with delete on termination set to the configured default
2191+ // First attempt: try to attach with the requested device index
21202192 attachmentID , err := r .AWS .AttachENI (ctx , eniID , instanceID , deviceIndex , r .Config .DefaultDeleteOnTermination )
21212193 if err != nil {
2122- return "" , fmt .Errorf ("failed to attach ENI: %v" , err )
2194+ // Check if this is a device index conflict error
2195+ if r .isDeviceIndexConflictError (err ) {
2196+ log .Info ("Device index conflict detected, querying AWS for current state and retrying" , "error" , err .Error ())
2197+
2198+ // Query AWS for actual current ENI attachments
2199+ awsUsedDeviceIndices , awsErr := r .getAWSUsedDeviceIndices (ctx , instanceID )
2200+ if awsErr != nil {
2201+ log .Error (awsErr , "Failed to query AWS for current ENI attachments during retry" )
2202+ return "" , deviceIndex , fmt .Errorf ("failed to attach ENI: %v (retry query failed: %v)" , err , awsErr )
2203+ }
2204+
2205+ // Find the next available device index based on AWS reality
2206+ retryDeviceIndex := r .findNextAvailableDeviceIndex (deviceIndex , awsUsedDeviceIndices , log )
2207+ if retryDeviceIndex == deviceIndex {
2208+ // No conflict found in AWS state, this might be a transient error
2209+ log .Info ("No device index conflict found in AWS state, original error might be transient" )
2210+ return "" , deviceIndex , fmt .Errorf ("failed to attach ENI: %v" , err )
2211+ }
2212+
2213+ log .Info ("Retrying attachment with corrected device index" , "originalIndex" , deviceIndex , "retryIndex" , retryDeviceIndex )
2214+
2215+ // Retry with the corrected device index
2216+ attachmentID , retryErr := r .AWS .AttachENI (ctx , eniID , instanceID , retryDeviceIndex , r .Config .DefaultDeleteOnTermination )
2217+ if retryErr != nil {
2218+ return "" , retryDeviceIndex , fmt .Errorf ("failed to attach ENI after retry: original error: %v, retry error: %v" , err , retryErr )
2219+ }
2220+
2221+ log .Info ("Successfully attached ENI after device index correction" , "finalDeviceIndex" , retryDeviceIndex )
2222+ return attachmentID , retryDeviceIndex , nil
2223+ }
2224+
2225+ // For non-device-index errors, return the original error
2226+ return "" , deviceIndex , fmt .Errorf ("failed to attach ENI: %v" , err )
21232227 }
21242228
2125- return attachmentID , nil
2229+ return attachmentID , deviceIndex , nil
2230+ }
2231+
2232+ // isDeviceIndexConflictError checks if the error indicates a device index conflict
2233+ func (r * NodeENIReconciler ) isDeviceIndexConflictError (err error ) bool {
2234+ if err == nil {
2235+ return false
2236+ }
2237+ errorStr := err .Error ()
2238+ return strings .Contains (errorStr , "already has an interface attached at device index" ) ||
2239+ strings .Contains (errorStr , "InvalidParameterValue" ) && strings .Contains (errorStr , "device index" )
2240+ }
2241+
2242+ // findNextAvailableDeviceIndex finds the next available device index starting from the given index
2243+ func (r * NodeENIReconciler ) findNextAvailableDeviceIndex (startIndex int , usedDeviceIndices map [int ]bool , log logr.Logger ) int {
2244+ deviceIndex := startIndex
2245+
2246+ // Find the next available device index
2247+ for usedDeviceIndices [deviceIndex ] {
2248+ deviceIndex ++
2249+ log .Info ("Device index in use according to AWS, incrementing" , "usedIndex" , deviceIndex - 1 , "nextIndex" , deviceIndex )
2250+ }
2251+
2252+ return deviceIndex
21262253}
21272254
21282255// verifyENIAttachments verifies the actual state of ENIs in AWS and updates the NodeENI resource status accordingly
0 commit comments