@@ -3225,7 +3225,7 @@ func getDefaultSRIOVRestartConfig() SRIOVRestartConfig {
32253225 RetryBackoffMultiplier : 1.5 , // Exponential backoff
32263226 PodReadinessTimeout : 120 * time .Second , // 2 minutes for pod readiness
32273227 ResourceVerifyTimeout : 90 * time .Second , // 90 seconds for resource verification
3228- StaleResourceCleanup : true , // Verify stale resource cleanup
3228+ StaleResourceCleanup : false , // CRITICAL: Disable stale resource cleanup to prevent continuous restart loops
32293229 }
32303230}
32313231
@@ -3373,6 +3373,7 @@ func (m *SRIOVConfigManager) getCurrentNodeResources() (map[string]string, error
33733373}
33743374
33753375// identifyStaleResources identifies resources that should be removed
3376+ // This function now uses intelligent filtering to avoid marking legitimate non-DPDK SR-IOV resources as stale
33763377func (m * SRIOVConfigManager ) identifyStaleResources (currentResources map [string ]string , expectedResources []string ) []string {
33773378 expectedSet := make (map [string ]bool )
33783379 for _ , resource := range expectedResources {
@@ -3382,14 +3383,55 @@ func (m *SRIOVConfigManager) identifyStaleResources(currentResources map[string]
33823383 var staleResources []string
33833384 for resource := range currentResources {
33843385 if ! expectedSet [resource ] {
3385- // This resource exists but is not expected, so it's stale
3386- staleResources = append (staleResources , resource )
3386+ // Apply intelligent filtering to avoid marking legitimate resources as stale
3387+ if m .isResourceLikelyStale (resource , expectedResources ) {
3388+ staleResources = append (staleResources , resource )
3389+ } else {
3390+ log .Printf ("Resource %s not in expected list but appears to be legitimate, preserving" , resource )
3391+ }
33873392 }
33883393 }
33893394
33903395 return staleResources
33913396}
33923397
3398+ // isResourceLikelyStale determines if a resource is likely stale and should be removed
3399+ // This prevents legitimate non-DPDK SR-IOV resources from being marked as stale
3400+ func (m * SRIOVConfigManager ) isResourceLikelyStale (resource string , expectedResources []string ) bool {
3401+ // If we have no expected resources, don't mark anything as stale
3402+ if len (expectedResources ) == 0 {
3403+ log .Printf ("No expected resources defined, preserving existing resource: %s" , resource )
3404+ return false
3405+ }
3406+
3407+ // Check if this resource follows common patterns that indicate it's legitimate
3408+ // Pattern 1: intel.com/sriov_kernel_* resources are typically legitimate non-DPDK resources
3409+ if strings .Contains (resource , "intel.com/sriov_kernel" ) {
3410+ log .Printf ("Resource %s appears to be a legitimate kernel SR-IOV resource, preserving" , resource )
3411+ return false
3412+ }
3413+
3414+ // Pattern 2: Resources with "kernel" in the name are typically non-DPDK
3415+ if strings .Contains (resource , "kernel" ) {
3416+ log .Printf ("Resource %s contains 'kernel' and appears to be non-DPDK, preserving" , resource )
3417+ return false
3418+ }
3419+
3420+ // Pattern 3: If the resource has a similar prefix to expected resources but different suffix,
3421+ // it might be a legitimate variant (e.g., intel.com/sriov_test_* vs intel.com/sriov_kernel_*)
3422+ for _ , expected := range expectedResources {
3423+ if strings .HasPrefix (resource , "intel.com/sriov_" ) && strings .HasPrefix (expected , "intel.com/sriov_" ) {
3424+ log .Printf ("Resource %s has similar prefix to expected resource %s, likely legitimate, preserving" , resource , expected )
3425+ return false
3426+ }
3427+ }
3428+
3429+ // Pattern 4: Resources with capacity > 0 that have been stable are likely legitimate
3430+ // This is a conservative approach - if a resource exists and has capacity, it's probably valid
3431+ log .Printf ("Resource %s has capacity and doesn't match stale patterns, preserving to be safe" , resource )
3432+ return false
3433+ }
3434+
33933435// waitForDevicePluginPodsReady waits for SR-IOV device plugin pods to be ready
33943436func (m * SRIOVConfigManager ) waitForDevicePluginPodsReady (timeout time.Duration ) bool {
33953437 if m .k8sClient == nil {
@@ -5454,7 +5496,31 @@ func applyBatchedSRIOVUpdates(cfg *config.ENIManagerConfig, sriovUpdates map[str
54545496 return nil
54555497 }
54565498
5457- log .Printf ("Applying %d batched SR-IOV configuration updates" , len (sriovUpdates ))
5499+ log .Printf ("Checking %d SR-IOV configuration updates for changes" , len (sriovUpdates ))
5500+
5501+ // Check if any of these updates are actually new or changed
5502+ hasActualChanges := false
5503+ for updateKey , update := range sriovUpdates {
5504+ // Check if this configuration has already been processed
5505+ sriovConfigMutex .Lock ()
5506+ configKey := fmt .Sprintf ("%s:%s:%s/%s" , update .PCIAddress , update .Driver , update .ResourcePrefix , update .ResourceName )
5507+ if _ , alreadyProcessed := processedSRIOVConfigs [configKey ]; ! alreadyProcessed {
5508+ hasActualChanges = true
5509+ log .Printf ("Found new SR-IOV configuration: %s" , updateKey )
5510+ }
5511+ sriovConfigMutex .Unlock ()
5512+
5513+ if hasActualChanges {
5514+ break // No need to check further if we already found changes
5515+ }
5516+ }
5517+
5518+ if ! hasActualChanges {
5519+ log .Printf ("All %d SR-IOV configurations have already been processed - skipping restart" , len (sriovUpdates ))
5520+ return nil
5521+ }
5522+
5523+ log .Printf ("Applying %d batched SR-IOV configuration updates with actual changes" , len (sriovUpdates ))
54585524
54595525 manager := NewSRIOVConfigManager (cfg .SRIOVDPConfigPath )
54605526
@@ -5689,7 +5755,31 @@ func applyBatchedDPDKSRIOVUpdates(cfg *config.ENIManagerConfig, dpdkSriovUpdates
56895755 return nil
56905756 }
56915757
5692- log .Printf ("Applying %d batched DPDK SR-IOV updates" , len (dpdkSriovUpdates ))
5758+ log .Printf ("Checking %d DPDK SR-IOV updates for changes" , len (dpdkSriovUpdates ))
5759+
5760+ // Check if any of these DPDK updates are actually new or changed
5761+ hasActualChanges := false
5762+ for updateKey , update := range dpdkSriovUpdates {
5763+ // Check if this DPDK configuration has already been processed
5764+ sriovConfigMutex .Lock ()
5765+ configKey := fmt .Sprintf ("DPDK:%s:%s:%s" , update .PCIAddress , update .Driver , update .ResourceName )
5766+ if _ , alreadyProcessed := processedSRIOVConfigs [configKey ]; ! alreadyProcessed {
5767+ hasActualChanges = true
5768+ log .Printf ("Found new DPDK SR-IOV configuration: %s" , updateKey )
5769+ }
5770+ sriovConfigMutex .Unlock ()
5771+
5772+ if hasActualChanges {
5773+ break // No need to check further if we already found changes
5774+ }
5775+ }
5776+
5777+ if ! hasActualChanges {
5778+ log .Printf ("All %d DPDK SR-IOV configurations have already been processed - skipping restart" , len (dpdkSriovUpdates ))
5779+ return nil
5780+ }
5781+
5782+ log .Printf ("Applying %d batched DPDK SR-IOV updates with actual changes" , len (dpdkSriovUpdates ))
56935783
56945784 // Load the current SR-IOV configuration
56955785 config , err := loadOrCreateSRIOVConfig (cfg .SRIOVDPConfigPath )
@@ -5712,6 +5802,15 @@ func applyBatchedDPDKSRIOVUpdates(cfg *config.ENIManagerConfig, dpdkSriovUpdates
57125802 log .Printf ("Warning: Failed to restart SR-IOV device plugin after batched DPDK updates: %v" , err )
57135803 } else {
57145804 log .Printf ("Successfully restarted SR-IOV device plugin after batched DPDK updates" )
5805+
5806+ // Mark all DPDK configurations as processed after successful restart
5807+ sriovConfigMutex .Lock ()
5808+ for _ , update := range dpdkSriovUpdates {
5809+ configKey := fmt .Sprintf ("DPDK:%s:%s:%s" , update .PCIAddress , update .Driver , update .ResourceName )
5810+ processedSRIOVConfigs [configKey ] = configKey
5811+ log .Printf ("Marked DPDK SR-IOV configuration as processed: %s" , configKey )
5812+ }
5813+ sriovConfigMutex .Unlock ()
57155814 }
57165815
57175816 return nil
0 commit comments