@@ -496,13 +496,29 @@ func (r *NodeENIReconciler) cleanupENIAttachmentCoordinated(ctx context.Context,
496496 // Create operation ID for coordination
497497 operationID := fmt .Sprintf ("cleanup-%s-%s-%s" , nodeENI .Name , attachment .NodeID , attachment .ENIID )
498498
499- // Define resource IDs that this operation will use
499+ // Define resource IDs with granular locking strategy
500+ // Only lock the specific ENI resource to allow parallel cleanup of different ENIs
501+ // on the same node/instance
500502 resourceIDs := []string {
501- attachment .ENIID , // ENI resource
502- attachment .InstanceID , // Instance resource
503- fmt .Sprintf ("node-%s" , attachment .NodeID ), // Node resource
503+ attachment .ENIID , // Only lock the specific ENI being cleaned up
504504 }
505505
506+ // For operations that might conflict at the instance level (like instance termination),
507+ // we use a more specific resource ID that includes the ENI ID
508+ if attachment .InstanceID != "" {
509+ // Create instance-specific resource ID that includes ENI to avoid conflicts
510+ // between different ENI cleanup operations on the same instance
511+ instanceResourceID := fmt .Sprintf ("instance-%s-eni-%s" , attachment .InstanceID , attachment .ENIID )
512+ resourceIDs = append (resourceIDs , instanceResourceID )
513+ }
514+
515+ log .V (1 ).Info ("Acquiring locks for ENI cleanup" ,
516+ "operationID" , operationID ,
517+ "resourceIDs" , resourceIDs ,
518+ "eniID" , attachment .ENIID ,
519+ "instanceID" , attachment .InstanceID ,
520+ "nodeID" , attachment .NodeID )
521+
506522 // Create coordinated operation
507523 operation := & CoordinatedOperation {
508524 ID : operationID ,
@@ -524,14 +540,87 @@ func (r *NodeENIReconciler) cleanupENIAttachmentCoordinated(ctx context.Context,
524540 // Execute with coordination
525541 err := r .Coordinator .ExecuteCoordinated (ctx , operation )
526542 if err != nil {
527- log .Error (err , "Coordinated cleanup failed" )
543+ log .Error (err , "Coordinated cleanup failed" ,
544+ "operationID" , operationID ,
545+ "resourceIDs" , resourceIDs )
528546 return false
529547 }
530548
531- log .Info ("Coordinated cleanup succeeded" )
549+ log .Info ("Coordinated cleanup succeeded" , "operationID" , operationID )
532550 return true
533551}
534552
553+ // cleanupENIAttachmentWithNodeCoordination performs ENI cleanup with node-level coordination
554+ // This is used when operations need to be serialized at the node level (e.g., for DPDK operations)
555+ func (r * NodeENIReconciler ) cleanupENIAttachmentWithNodeCoordination (ctx context.Context , nodeENI * networkingv1alpha1.NodeENI , attachment networkingv1alpha1.ENIAttachment ) bool {
556+ log := r .Log .WithValues ("nodeeni" , nodeENI .Name , "node" , attachment .NodeID , "eniID" , attachment .ENIID )
557+
558+ // Create operation ID for coordination
559+ operationID := fmt .Sprintf ("node-cleanup-%s-%s-%s" , nodeENI .Name , attachment .NodeID , attachment .ENIID )
560+
561+ // Define resource IDs with node-level locking for operations that require serialization
562+ resourceIDs := []string {
563+ attachment .ENIID , // ENI resource
564+ fmt .Sprintf ("node-%s" , attachment .NodeID ), // Node resource for serialization
565+ }
566+
567+ // Add instance-level coordination if needed
568+ if attachment .InstanceID != "" {
569+ resourceIDs = append (resourceIDs , attachment .InstanceID )
570+ }
571+
572+ log .V (1 ).Info ("Acquiring node-level locks for ENI cleanup" ,
573+ "operationID" , operationID ,
574+ "resourceIDs" , resourceIDs ,
575+ "reason" , "node-level coordination required" )
576+
577+ // Create coordinated operation
578+ operation := & CoordinatedOperation {
579+ ID : operationID ,
580+ Type : "eni-node-cleanup" ,
581+ ResourceIDs : resourceIDs ,
582+ Priority : 2 , // Higher priority for node-level operations
583+ DependsOn : []string {},
584+ Timeout : r .Config .DetachmentTimeout ,
585+ Execute : func (ctx context.Context ) error {
586+ // Execute the actual cleanup
587+ success := r .cleanupENIAttachment (ctx , nodeENI , attachment )
588+ if ! success {
589+ return fmt .Errorf ("node-coordinated cleanup failed for ENI %s" , attachment .ENIID )
590+ }
591+ return nil
592+ },
593+ }
594+
595+ // Execute with coordination
596+ err := r .Coordinator .ExecuteCoordinated (ctx , operation )
597+ if err != nil {
598+ log .Error (err , "Node-coordinated cleanup failed" ,
599+ "operationID" , operationID ,
600+ "resourceIDs" , resourceIDs )
601+ return false
602+ }
603+
604+ log .Info ("Node-coordinated cleanup succeeded" , "operationID" , operationID )
605+ return true
606+ }
607+
608+ // shouldUseNodeLevelCoordination determines if node-level coordination is needed
609+ func (r * NodeENIReconciler ) shouldUseNodeLevelCoordination (nodeENI * networkingv1alpha1.NodeENI , attachment networkingv1alpha1.ENIAttachment ) bool {
610+ // Use node-level coordination for DPDK-enabled ENIs
611+ if nodeENI .Spec .EnableDPDK {
612+ return true
613+ }
614+
615+ // Use node-level coordination for SR-IOV enabled ENIs (indicated by PCI address or resource name)
616+ if nodeENI .Spec .DPDKPCIAddress != "" || nodeENI .Spec .DPDKResourceName != "" {
617+ return true
618+ }
619+
620+ // For standard ENIs (Case 1), use granular coordination (no node-level locking)
621+ return false
622+ }
623+
535624// InterfaceState represents the current state of a network interface
536625type InterfaceState struct {
537626 PCIAddress string
@@ -2297,3 +2386,8 @@ func (r *NodeENIReconciler) startIMDSConfigurationRetry(ctx context.Context) {
22972386 }()
22982387 }
22992388}
2389+
2390+ // ShouldUseNodeLevelCoordinationTest exposes shouldUseNodeLevelCoordination for testing purposes
2391+ func (r * NodeENIReconciler ) ShouldUseNodeLevelCoordinationTest (nodeENI * networkingv1alpha1.NodeENI , attachment networkingv1alpha1.ENIAttachment ) bool {
2392+ return r .shouldUseNodeLevelCoordination (nodeENI , attachment )
2393+ }
0 commit comments