Skip to content

Commit 900f54b

Browse files
Fix early return skipping critical vfio-pci, MOFED, and nouveau operations when driver reinstall is skipped
Signed-off-by: Karthik Vetrivel <[email protected]>
1 parent: 1991b8c · commit: 900f54b

File tree

1 file changed

+18
-7
lines changed

1 file changed

+18
-7
lines changed

cmd/driver-manager/main.go

Lines changed: 18 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -289,7 +289,7 @@ func (dm *DriverManager) uninstallDriver() error {
289289

290290
if skip, reason := dm.shouldSkipUninstall(); skip {
291291
dm.log.Infof("Skipping driver uninstall: %s", reason)
292-
return nil
292+
return dm.performPostDriverOperations(false)
293293
}
294294

295295
// Fetch current component states
@@ -363,6 +363,14 @@ func (dm *DriverManager) uninstallDriver() error {
363363
dm.log.Info("Successfully uninstalled nvidia driver components")
364364
}
365365

366+
return dm.performPostDriverOperations(true)
367+
}
368+
369+
// performPostDriverOperations executes operations that run after driver operations.
370+
// The componentsWereEvicted parameter indicates whether GPU operator components were
371+
// actually evicted during this run, which determines whether uncordoning and rescheduling
372+
// are necessary.
373+
func (dm *DriverManager) performPostDriverOperations(componentsWereEvicted bool) error {
366374
// Handle vfio-pci driver unbinding
367375
if err := dm.unbindVfioPCI(); err != nil {
368376
dm.log.Error("Unable to unbind vfio-pci driver from all devices")
@@ -380,14 +388,17 @@ func (dm *DriverManager) uninstallDriver() error {
380388
}
381389

382390
// Cleanup and reschedule components
383-
if dm.isGPUPodEvictionEnabled() || dm.isAutoDrainEnabled() {
384-
if err := dm.kubeClient.UncordonNode(dm.config.nodeName); err != nil {
385-
dm.log.Warn("Failed to uncordon node")
391+
// Only needed if we actually evicted components during this run
392+
if componentsWereEvicted {
393+
if dm.isGPUPodEvictionEnabled() || dm.isAutoDrainEnabled() {
394+
if err := dm.kubeClient.UncordonNode(dm.config.nodeName); err != nil {
395+
dm.log.Warn("Failed to uncordon node")
396+
}
386397
}
387-
}
388398

389-
if err := dm.rescheduleGPUOperatorComponents(); err != nil {
390-
dm.log.Warnf("Failed to reschedule GPU operator components: %v", err)
399+
if err := dm.rescheduleGPUOperatorComponents(); err != nil {
400+
dm.log.Warnf("Failed to reschedule GPU operator components: %v", err)
401+
}
391402
}
392403

393404
// Handle nouveau driver

0 commit comments

Comments
 (0)