Skip to content

Commit 3614101

Browse files
author
John Lam
committed
Improve ENI cleanup coordination and interface detection
Implements a more granular locking strategy for ENI cleanup operations: - Adds node-level coordination for DPDK/SR-IOV ENIs while using granular locking for standard ENIs - Enhances interface detection with better logging and retry mechanisms - Improves wait logic for expected interfaces to appear based on NodeENI attachments - Updates image references to use the fixed AL2023 DPDK setup version - Adds more detailed logging for troubleshooting coordination issues These changes help prevent race conditions during cleanup operations while allowing higher parallelism for standard ENI operations.
1 parent 7101ae5 commit 3614101

13 files changed

Lines changed: 1440 additions & 39 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,6 @@ recommended-sample-improvements.yaml
5858
# Test artifacts that shouldn't be committed
5959
sriov-device-plugin-hostpath.yaml
6060

61+
charts/aws-multi-eni-controller/values.yaml.backup
62+
deploy/deployment.yaml.backup
63+
deploy/eni-manager-daemonset.yaml.backup

charts/aws-multi-eni-controller/values.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44

55
# Image configuration
66
image:
7-
repository: ghcr.io/johnlam90/aws-multi-eni-controller
8-
tag: beta-a86ce7b
7+
repository: johnlam90/aws-multi-eni-controller
8+
tag: fix-al2023-dpdk-setup-7101ae5
99
pullPolicy: Always
1010

1111
# Namespace to deploy the controller

deploy/deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ spec:
173173
# - "kubernetes"
174174
containers:
175175
- name: manager
176-
image: johnlam90/aws-multi-eni-controller:v1.3.6-cloud-native-auth
176+
image: johnlam90/aws-multi-eni-controller:fix-al2023-dpdk-setup-7101ae5
177177
env:
178178
- name: COMPONENT
179179
value: "eni-controller"

deploy/eni-manager-daemonset.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ spec:
2020
ng: multi-eni
2121
initContainers:
2222
- name: dpdk-setup
23-
image: johnlam90/aws-multi-eni-controller:v1.3.6-ens8-device-index-fix
23+
image: johnlam90/aws-multi-eni-controller:fix-al2023-dpdk-setup-7101ae5
2424
command: ["/bin/sh", "-c"]
2525
args:
2626
- |
@@ -180,7 +180,7 @@ spec:
180180
readOnly: true
181181
containers:
182182
- name: eni-manager
183-
image: johnlam90/aws-multi-eni-controller:v1.3.6-ens8-device-index-fix
183+
image: johnlam90/aws-multi-eni-controller:fix-al2023-dpdk-setup-7101ae5
184184
env:
185185
- name: COMPONENT
186186
value: "eni-manager"

pkg/controller/nodeeni_controller.go

Lines changed: 100 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -496,13 +496,29 @@ func (r *NodeENIReconciler) cleanupENIAttachmentCoordinated(ctx context.Context,
496496
// Create operation ID for coordination
497497
operationID := fmt.Sprintf("cleanup-%s-%s-%s", nodeENI.Name, attachment.NodeID, attachment.ENIID)
498498

499-
// Define resource IDs that this operation will use
499+
// Define resource IDs with granular locking strategy
500+
// Only lock the specific ENI resource to allow parallel cleanup of different ENIs
501+
// on the same node/instance
500502
resourceIDs := []string{
501-
attachment.ENIID, // ENI resource
502-
attachment.InstanceID, // Instance resource
503-
fmt.Sprintf("node-%s", attachment.NodeID), // Node resource
503+
attachment.ENIID, // Only lock the specific ENI being cleaned up
504504
}
505505

506+
// For operations that might conflict at the instance level (like instance termination),
507+
// we use a more specific resource ID that includes the ENI ID
508+
if attachment.InstanceID != "" {
509+
// Create instance-specific resource ID that includes ENI to avoid conflicts
510+
// between different ENI cleanup operations on the same instance
511+
instanceResourceID := fmt.Sprintf("instance-%s-eni-%s", attachment.InstanceID, attachment.ENIID)
512+
resourceIDs = append(resourceIDs, instanceResourceID)
513+
}
514+
515+
log.V(1).Info("Acquiring locks for ENI cleanup",
516+
"operationID", operationID,
517+
"resourceIDs", resourceIDs,
518+
"eniID", attachment.ENIID,
519+
"instanceID", attachment.InstanceID,
520+
"nodeID", attachment.NodeID)
521+
506522
// Create coordinated operation
507523
operation := &CoordinatedOperation{
508524
ID: operationID,
@@ -524,14 +540,87 @@ func (r *NodeENIReconciler) cleanupENIAttachmentCoordinated(ctx context.Context,
524540
// Execute with coordination
525541
err := r.Coordinator.ExecuteCoordinated(ctx, operation)
526542
if err != nil {
527-
log.Error(err, "Coordinated cleanup failed")
543+
log.Error(err, "Coordinated cleanup failed",
544+
"operationID", operationID,
545+
"resourceIDs", resourceIDs)
528546
return false
529547
}
530548

531-
log.Info("Coordinated cleanup succeeded")
549+
log.Info("Coordinated cleanup succeeded", "operationID", operationID)
532550
return true
533551
}
534552

553+
// cleanupENIAttachmentWithNodeCoordination performs ENI cleanup with node-level coordination
554+
// This is used when operations need to be serialized at the node level (e.g., for DPDK operations)
555+
func (r *NodeENIReconciler) cleanupENIAttachmentWithNodeCoordination(ctx context.Context, nodeENI *networkingv1alpha1.NodeENI, attachment networkingv1alpha1.ENIAttachment) bool {
556+
log := r.Log.WithValues("nodeeni", nodeENI.Name, "node", attachment.NodeID, "eniID", attachment.ENIID)
557+
558+
// Create operation ID for coordination
559+
operationID := fmt.Sprintf("node-cleanup-%s-%s-%s", nodeENI.Name, attachment.NodeID, attachment.ENIID)
560+
561+
// Define resource IDs with node-level locking for operations that require serialization
562+
resourceIDs := []string{
563+
attachment.ENIID, // ENI resource
564+
fmt.Sprintf("node-%s", attachment.NodeID), // Node resource for serialization
565+
}
566+
567+
// Add instance-level coordination if needed
568+
if attachment.InstanceID != "" {
569+
resourceIDs = append(resourceIDs, attachment.InstanceID)
570+
}
571+
572+
log.V(1).Info("Acquiring node-level locks for ENI cleanup",
573+
"operationID", operationID,
574+
"resourceIDs", resourceIDs,
575+
"reason", "node-level coordination required")
576+
577+
// Create coordinated operation
578+
operation := &CoordinatedOperation{
579+
ID: operationID,
580+
Type: "eni-node-cleanup",
581+
ResourceIDs: resourceIDs,
582+
Priority: 2, // Higher priority for node-level operations
583+
DependsOn: []string{},
584+
Timeout: r.Config.DetachmentTimeout,
585+
Execute: func(ctx context.Context) error {
586+
// Execute the actual cleanup
587+
success := r.cleanupENIAttachment(ctx, nodeENI, attachment)
588+
if !success {
589+
return fmt.Errorf("node-coordinated cleanup failed for ENI %s", attachment.ENIID)
590+
}
591+
return nil
592+
},
593+
}
594+
595+
// Execute with coordination
596+
err := r.Coordinator.ExecuteCoordinated(ctx, operation)
597+
if err != nil {
598+
log.Error(err, "Node-coordinated cleanup failed",
599+
"operationID", operationID,
600+
"resourceIDs", resourceIDs)
601+
return false
602+
}
603+
604+
log.Info("Node-coordinated cleanup succeeded", "operationID", operationID)
605+
return true
606+
}
607+
608+
// shouldUseNodeLevelCoordination determines if node-level coordination is needed
609+
func (r *NodeENIReconciler) shouldUseNodeLevelCoordination(nodeENI *networkingv1alpha1.NodeENI, attachment networkingv1alpha1.ENIAttachment) bool {
610+
// Use node-level coordination for DPDK-enabled ENIs
611+
if nodeENI.Spec.EnableDPDK {
612+
return true
613+
}
614+
615+
// Use node-level coordination for SR-IOV enabled ENIs (indicated by PCI address or resource name)
616+
if nodeENI.Spec.DPDKPCIAddress != "" || nodeENI.Spec.DPDKResourceName != "" {
617+
return true
618+
}
619+
620+
// For standard ENIs (Case 1), use granular coordination (no node-level locking)
621+
return false
622+
}
623+
535624
// InterfaceState represents the current state of a network interface
536625
type InterfaceState struct {
537626
PCIAddress string
@@ -2297,3 +2386,8 @@ func (r *NodeENIReconciler) startIMDSConfigurationRetry(ctx context.Context) {
22972386
}()
22982387
}
22992388
}
2389+
2390+
// ShouldUseNodeLevelCoordinationTest exposes shouldUseNodeLevelCoordination for testing purposes
2391+
func (r *NodeENIReconciler) ShouldUseNodeLevelCoordinationTest(nodeENI *networkingv1alpha1.NodeENI, attachment networkingv1alpha1.ENIAttachment) bool {
2392+
return r.shouldUseNodeLevelCoordination(nodeENI, attachment)
2393+
}

pkg/controller/parallel_cleanup.go

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,15 +64,32 @@ func (r *NodeENIReconciler) workerFunc(
6464
workChan <-chan networkingv1alpha1.ENIAttachment,
6565
resultChan chan<- bool,
6666
) {
67+
log := r.Log.WithValues("nodeeni", nodeENI.Name, "worker", "cleanup")
68+
6769
for {
6870
select {
6971
case att, ok := <-workChan:
7072
if !ok {
7173
// Channel closed, exit worker
7274
return
7375
}
74-
// Clean up the attachment with coordinated execution
75-
success := r.cleanupENIAttachmentCoordinated(ctx, nodeENI, att)
76+
77+
// Determine the appropriate coordination strategy
78+
var success bool
79+
if r.shouldUseNodeLevelCoordination(nodeENI, att) {
80+
log.V(1).Info("Using node-level coordination for ENI cleanup",
81+
"eniID", att.ENIID,
82+
"nodeID", att.NodeID,
83+
"reason", "DPDK or SR-IOV enabled")
84+
success = r.cleanupENIAttachmentWithNodeCoordination(ctx, nodeENI, att)
85+
} else {
86+
log.V(1).Info("Using granular coordination for ENI cleanup",
87+
"eniID", att.ENIID,
88+
"nodeID", att.NodeID,
89+
"reason", "standard ENI")
90+
success = r.cleanupENIAttachmentCoordinated(ctx, nodeENI, att)
91+
}
92+
7693
select {
7794
case resultChan <- success:
7895
case <-ctx.Done():
@@ -139,6 +156,9 @@ func (r *NodeENIReconciler) parallelCleanupENIs(ctx context.Context, nodeENI *ne
139156
log := r.Log.WithValues("nodeeni", nodeENI.Name)
140157
log.Info(logPrefix+" ENI attachments in parallel", "count", len(attachments))
141158

159+
// Analyze coordination requirements
160+
r.analyzeCoordinationRequirements(nodeENI, attachments)
161+
142162
// Handle the case of no attachments or a single attachment
143163
singleResult := r.handleSingleAttachment(ctx, nodeENI, attachments)
144164
if len(attachments) <= 1 {
@@ -169,6 +189,51 @@ func (r *NodeENIReconciler) parallelCleanupENIs(ctx context.Context, nodeENI *ne
169189
return r.collectResults(resultChan, nodeENI, logPrefix)
170190
}
171191

192+
// analyzeCoordinationRequirements analyzes and logs the coordination strategy for each attachment
193+
func (r *NodeENIReconciler) analyzeCoordinationRequirements(nodeENI *networkingv1alpha1.NodeENI, attachments []networkingv1alpha1.ENIAttachment) {
194+
log := r.Log.WithValues("nodeeni", nodeENI.Name)
195+
196+
granularCount := 0
197+
nodeLevelCount := 0
198+
nodeGroups := make(map[string][]string) // nodeID -> list of ENI IDs
199+
200+
for _, att := range attachments {
201+
if r.shouldUseNodeLevelCoordination(nodeENI, att) {
202+
nodeLevelCount++
203+
log.V(1).Info("ENI requires node-level coordination",
204+
"eniID", att.ENIID,
205+
"nodeID", att.NodeID,
206+
"enableDPDK", nodeENI.Spec.EnableDPDK,
207+
"dpdkPCIAddress", nodeENI.Spec.DPDKPCIAddress,
208+
"dpdkResourceName", nodeENI.Spec.DPDKResourceName)
209+
} else {
210+
granularCount++
211+
log.V(1).Info("ENI uses granular coordination",
212+
"eniID", att.ENIID,
213+
"nodeID", att.NodeID)
214+
}
215+
216+
// Group by node for analysis
217+
nodeGroups[att.NodeID] = append(nodeGroups[att.NodeID], att.ENIID)
218+
}
219+
220+
log.Info("Coordination strategy analysis",
221+
"totalAttachments", len(attachments),
222+
"granularCoordination", granularCount,
223+
"nodeLevelCoordination", nodeLevelCount,
224+
"affectedNodes", len(nodeGroups))
225+
226+
// Log node-level grouping for multi-subnet scenarios
227+
for nodeID, eniIDs := range nodeGroups {
228+
if len(eniIDs) > 1 {
229+
log.Info("Multi-ENI node detected",
230+
"nodeID", nodeID,
231+
"eniCount", len(eniIDs),
232+
"eniIDs", eniIDs)
233+
}
234+
}
235+
}
236+
172237
// min returns the minimum of two integers
173238
func min(a, b int) int {
174239
if a < b {

pkg/controller/stale_detection_test.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,12 @@ func TestComprehensiveStaleDetection(t *testing.T) {
9595
}
9696

9797
// Test 4: Test comprehensive stale detection for running instance
98+
// Note: isAttachmentComprehensivelyStale is designed to be called only when the node
99+
// doesn't match the NodeENI selector. In this case, since we're testing a running instance
100+
// that would normally match the selector, we should test isAttachmentStaleInAWS instead.
98101
runningAttachment := nodeENI.Status.Attachments[0]
99-
if reconciler.isAttachmentComprehensivelyStale(ctx, nodeENI, runningAttachment) {
100-
t.Error("Expected attachment to running instance to not be stale")
102+
if reconciler.isAttachmentStaleInAWS(ctx, nodeENI, runningAttachment) {
103+
t.Error("Expected attachment to running instance to not be stale in AWS")
101104
}
102105

103106
// Test 5: Test AWS stale detection for properly attached ENI
@@ -231,8 +234,10 @@ func TestStaleAttachmentRemoval(t *testing.T) {
231234
t.Error("Expected terminated instance attachment to be comprehensively stale")
232235
}
233236

234-
// Test 4: Comprehensive stale detection for running instance (should not be stale)
235-
if reconciler.isAttachmentComprehensivelyStale(ctx, nodeENI, runningAttachment) {
236-
t.Error("Expected running instance attachment to not be comprehensively stale")
237+
// Test 4: Comprehensive stale detection for running instance
238+
// Note: isAttachmentComprehensivelyStale assumes the node doesn't match the selector
239+
// For a running instance that would normally match, we should test AWS-level staleness
240+
if reconciler.isAttachmentStaleInAWS(ctx, nodeENI, runningAttachment) {
241+
t.Error("Expected running instance attachment to not be stale in AWS")
237242
}
238243
}

0 commit comments

Comments
 (0)