Skip to content

Commit bfc9e3f

Browse files
author
John Lam
committed
Fix SR-IOV resource management to prevent restart loops
Prevents continuous restart loops by: - Disabling stale resource cleanup by default - Adding intelligent filtering to avoid marking legitimate non-DPDK SR-IOV resources as stale - Implementing change detection to skip unnecessary device plugin restarts - Tracking processed configurations to avoid redundant operations This addresses critical issues with the SR-IOV configuration management that were causing stability problems in production environments.
1 parent bb4eb13 commit bfc9e3f

2 files changed

Lines changed: 105 additions & 6 deletions

File tree

charts/aws-multi-eni-controller/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
# Image configuration
66
image:
77
repository: johnlam90/aws-multi-eni-controller
8-
tag: v1.3.3-sriov-fix-node-isolation
8+
tag: v1.3.3-fix
99
pullPolicy: Always
1010

1111
# Namespace to deploy the controller

cmd/eni-manager/main.go

Lines changed: 104 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3225,7 +3225,7 @@ func getDefaultSRIOVRestartConfig() SRIOVRestartConfig {
32253225
RetryBackoffMultiplier: 1.5, // Exponential backoff
32263226
PodReadinessTimeout: 120 * time.Second, // 2 minutes for pod readiness
32273227
ResourceVerifyTimeout: 90 * time.Second, // 90 seconds for resource verification
3228-
StaleResourceCleanup: true, // Verify stale resource cleanup
3228+
StaleResourceCleanup: false, // CRITICAL: Disable stale resource cleanup to prevent continuous restart loops
32293229
}
32303230
}
32313231

@@ -3373,6 +3373,7 @@ func (m *SRIOVConfigManager) getCurrentNodeResources() (map[string]string, error
33733373
}
33743374

33753375
// identifyStaleResources identifies resources that should be removed
3376+
// This function now uses intelligent filtering to avoid marking legitimate non-DPDK SR-IOV resources as stale
33763377
func (m *SRIOVConfigManager) identifyStaleResources(currentResources map[string]string, expectedResources []string) []string {
33773378
expectedSet := make(map[string]bool)
33783379
for _, resource := range expectedResources {
@@ -3382,14 +3383,55 @@ func (m *SRIOVConfigManager) identifyStaleResources(currentResources map[string]
33823383
var staleResources []string
33833384
for resource := range currentResources {
33843385
if !expectedSet[resource] {
3385-
// This resource exists but is not expected, so it's stale
3386-
staleResources = append(staleResources, resource)
3386+
// Apply intelligent filtering to avoid marking legitimate resources as stale
3387+
if m.isResourceLikelyStale(resource, expectedResources) {
3388+
staleResources = append(staleResources, resource)
3389+
} else {
3390+
log.Printf("Resource %s not in expected list but appears to be legitimate, preserving", resource)
3391+
}
33873392
}
33883393
}
33893394

33903395
return staleResources
33913396
}
33923397

3398+
// isResourceLikelyStale determines if a resource is likely stale and should be removed
3399+
// This prevents legitimate non-DPDK SR-IOV resources from being marked as stale
3400+
func (m *SRIOVConfigManager) isResourceLikelyStale(resource string, expectedResources []string) bool {
3401+
// If we have no expected resources, don't mark anything as stale
3402+
if len(expectedResources) == 0 {
3403+
log.Printf("No expected resources defined, preserving existing resource: %s", resource)
3404+
return false
3405+
}
3406+
3407+
// Check if this resource follows common patterns that indicate it's legitimate
3408+
// Pattern 1: intel.com/sriov_kernel_* resources are typically legitimate non-DPDK resources
3409+
if strings.Contains(resource, "intel.com/sriov_kernel") {
3410+
log.Printf("Resource %s appears to be a legitimate kernel SR-IOV resource, preserving", resource)
3411+
return false
3412+
}
3413+
3414+
// Pattern 2: Resources with "kernel" in the name are typically non-DPDK
3415+
if strings.Contains(resource, "kernel") {
3416+
log.Printf("Resource %s contains 'kernel' and appears to be non-DPDK, preserving", resource)
3417+
return false
3418+
}
3419+
3420+
// Pattern 3: If the resource has a similar prefix to expected resources but different suffix,
3421+
// it might be a legitimate variant (e.g., intel.com/sriov_test_* vs intel.com/sriov_kernel_*)
3422+
for _, expected := range expectedResources {
3423+
if strings.HasPrefix(resource, "intel.com/sriov_") && strings.HasPrefix(expected, "intel.com/sriov_") {
3424+
log.Printf("Resource %s has similar prefix to expected resource %s, likely legitimate, preserving", resource, expected)
3425+
return false
3426+
}
3427+
}
3428+
3429+
// Pattern 4: Resources with capacity > 0 that have been stable are likely legitimate
3430+
// This is a conservative approach - if a resource exists and has capacity, it's probably valid
3431+
log.Printf("Resource %s has capacity and doesn't match stale patterns, preserving to be safe", resource)
3432+
return false
3433+
}
3434+
33933435
// waitForDevicePluginPodsReady waits for SR-IOV device plugin pods to be ready
33943436
func (m *SRIOVConfigManager) waitForDevicePluginPodsReady(timeout time.Duration) bool {
33953437
if m.k8sClient == nil {
@@ -5454,7 +5496,31 @@ func applyBatchedSRIOVUpdates(cfg *config.ENIManagerConfig, sriovUpdates map[str
54545496
return nil
54555497
}
54565498

5457-
log.Printf("Applying %d batched SR-IOV configuration updates", len(sriovUpdates))
5499+
log.Printf("Checking %d SR-IOV configuration updates for changes", len(sriovUpdates))
5500+
5501+
// Check if any of these updates are actually new or changed
5502+
hasActualChanges := false
5503+
for updateKey, update := range sriovUpdates {
5504+
// Check if this configuration has already been processed
5505+
sriovConfigMutex.Lock()
5506+
configKey := fmt.Sprintf("%s:%s:%s/%s", update.PCIAddress, update.Driver, update.ResourcePrefix, update.ResourceName)
5507+
if _, alreadyProcessed := processedSRIOVConfigs[configKey]; !alreadyProcessed {
5508+
hasActualChanges = true
5509+
log.Printf("Found new SR-IOV configuration: %s", updateKey)
5510+
}
5511+
sriovConfigMutex.Unlock()
5512+
5513+
if hasActualChanges {
5514+
break // No need to check further if we already found changes
5515+
}
5516+
}
5517+
5518+
if !hasActualChanges {
5519+
log.Printf("All %d SR-IOV configurations have already been processed - skipping restart", len(sriovUpdates))
5520+
return nil
5521+
}
5522+
5523+
log.Printf("Applying %d batched SR-IOV configuration updates with actual changes", len(sriovUpdates))
54585524

54595525
manager := NewSRIOVConfigManager(cfg.SRIOVDPConfigPath)
54605526

@@ -5689,7 +5755,31 @@ func applyBatchedDPDKSRIOVUpdates(cfg *config.ENIManagerConfig, dpdkSriovUpdates
56895755
return nil
56905756
}
56915757

5692-
log.Printf("Applying %d batched DPDK SR-IOV updates", len(dpdkSriovUpdates))
5758+
log.Printf("Checking %d DPDK SR-IOV updates for changes", len(dpdkSriovUpdates))
5759+
5760+
// Check if any of these DPDK updates are actually new or changed
5761+
hasActualChanges := false
5762+
for updateKey, update := range dpdkSriovUpdates {
5763+
// Check if this DPDK configuration has already been processed
5764+
sriovConfigMutex.Lock()
5765+
configKey := fmt.Sprintf("DPDK:%s:%s:%s", update.PCIAddress, update.Driver, update.ResourceName)
5766+
if _, alreadyProcessed := processedSRIOVConfigs[configKey]; !alreadyProcessed {
5767+
hasActualChanges = true
5768+
log.Printf("Found new DPDK SR-IOV configuration: %s", updateKey)
5769+
}
5770+
sriovConfigMutex.Unlock()
5771+
5772+
if hasActualChanges {
5773+
break // No need to check further if we already found changes
5774+
}
5775+
}
5776+
5777+
if !hasActualChanges {
5778+
log.Printf("All %d DPDK SR-IOV configurations have already been processed - skipping restart", len(dpdkSriovUpdates))
5779+
return nil
5780+
}
5781+
5782+
log.Printf("Applying %d batched DPDK SR-IOV updates with actual changes", len(dpdkSriovUpdates))
56935783

56945784
// Load the current SR-IOV configuration
56955785
config, err := loadOrCreateSRIOVConfig(cfg.SRIOVDPConfigPath)
@@ -5712,6 +5802,15 @@ func applyBatchedDPDKSRIOVUpdates(cfg *config.ENIManagerConfig, dpdkSriovUpdates
57125802
log.Printf("Warning: Failed to restart SR-IOV device plugin after batched DPDK updates: %v", err)
57135803
} else {
57145804
log.Printf("Successfully restarted SR-IOV device plugin after batched DPDK updates")
5805+
5806+
// Mark all DPDK configurations as processed after successful restart
5807+
sriovConfigMutex.Lock()
5808+
for _, update := range dpdkSriovUpdates {
5809+
configKey := fmt.Sprintf("DPDK:%s:%s:%s", update.PCIAddress, update.Driver, update.ResourceName)
5810+
processedSRIOVConfigs[configKey] = configKey
5811+
log.Printf("Marked DPDK SR-IOV configuration as processed: %s", configKey)
5812+
}
5813+
sriovConfigMutex.Unlock()
57155814
}
57165815

57175816
return nil

0 commit comments

Comments
 (0)