Skip to content

Commit b8a7a79

Browse files
Add fast path optimization to skip driver reinstall when configuration digest matches
Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>
1 parent 69eec27 commit b8a7a79

File tree

1 file changed

+99
-6
lines changed

1 file changed

+99
-6
lines changed

cmd/driver-manager/main.go

Lines changed: 99 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@ import (
4242
const (
4343
driverRoot = "/run/nvidia/driver"
4444
driverPIDFile = "/run/nvidia/nvidia-driver.pid"
45+
driverConfigStateFile = "/run/nvidia/nvidia-driver.state"
4546
operatorNamespace = "gpu-operator"
4647
pausedStr = "paused-for-driver-upgrade"
4748
defaultDrainTimeout = time.Second * 0
@@ -77,6 +78,8 @@ type config struct {
7778
gpuDirectRDMAEnabled bool
7879
useHostMofed bool
7980
kubeconfig string
81+
driverVersion string
82+
forceReinstall bool
8083
}
8184

8285
// ComponentState tracks the deployment state of GPU operator components
@@ -208,6 +211,20 @@ func main() {
208211
EnvVars: []string{"KUBECONFIG"},
209212
Value: "",
210213
},
214+
&cli.StringFlag{
215+
Name: "driver-version",
216+
Usage: "Desired NVIDIA driver version",
217+
Destination: &cfg.driverVersion,
218+
EnvVars: []string{"DRIVER_VERSION"},
219+
Value: "",
220+
},
221+
&cli.BoolFlag{
222+
Name: "force-reinstall",
223+
Usage: "Force driver reinstall regardless of current state",
224+
Destination: &cfg.forceReinstall,
225+
EnvVars: []string{"FORCE_REINSTALL"},
226+
Value: false,
227+
},
211228
}
212229

213230
app.Commands = []*cli.Command{
@@ -288,6 +305,26 @@ func (dm *DriverManager) uninstallDriver() error {
288305
return fmt.Errorf("failed to evict GPU operator components: %w", err)
289306
}
290307

308+
if dm.shouldSkipUninstall() {
309+
dm.log.Info("Fast path activated: desired driver version and configuration already present")
310+
311+
// Clean up stale artifacts from previous container before rescheduling operands
312+
dm.log.Info("Cleaning up stale mounts and state files...")
313+
314+
// Unmount stale rootfs from previous container
315+
if err := dm.unmountRootfs(); err != nil {
316+
return fmt.Errorf("failed to unmount stale rootfs: %w", err)
317+
}
318+
319+
// Remove stale PID file from previous container
320+
dm.removePIDFile()
321+
322+
if err := dm.rescheduleGPUOperatorComponents(); err != nil {
323+
dm.log.Warnf("Failed to reschedule GPU operator components: %v", err)
324+
}
325+
return nil
326+
}
327+
291328
drainOpts := kube.DrainOptions{
292329
Force: dm.config.drainUseForce,
293330
DeleteEmptyDirData: dm.config.drainDeleteEmptyDirData,
@@ -629,6 +666,61 @@ func (dm *DriverManager) isDriverLoaded() bool {
629666
return err == nil
630667
}
631668

669+
// readStoredDigest reads the driver configuration digest from the state file
670+
func readStoredDigest() (string, error) {
671+
data, err := os.ReadFile(driverConfigStateFile)
672+
if err != nil {
673+
return "", err
674+
}
675+
return strings.TrimSpace(string(data)), nil
676+
}
677+
678+
// getCurrentDigest returns the desired driver configuration digest taken from
// the DRIVER_CONFIG_DIGEST environment variable.
//
// Leading and trailing whitespace is removed so the value is normalized the
// same way as the digest returned by readStoredDigest; otherwise a stray
// newline or space in the environment would make two identical digests
// compare as different. An unset, empty, or whitespace-only variable yields
// the empty string.
func getCurrentDigest() string {
	return strings.TrimSpace(os.Getenv("DRIVER_CONFIG_DIGEST"))
}
682+
683+
// shouldUpdateDriverConfig checks if the driver configuration needs to be updated
684+
func (dm *DriverManager) shouldUpdateDriverConfig() bool {
685+
if !dm.isDriverLoaded() {
686+
return true
687+
}
688+
689+
currentDigest := getCurrentDigest()
690+
if currentDigest == "" {
691+
dm.log.Warn("DRIVER_CONFIG_DIGEST env var not set, assuming config changed")
692+
return true
693+
}
694+
695+
storedDigest, err := readStoredDigest()
696+
if err != nil {
697+
if os.IsNotExist(err) {
698+
dm.log.Info("No previous driver configuration found")
699+
} else {
700+
dm.log.Warnf("Failed to read driver config state file: %v", err)
701+
}
702+
return true
703+
}
704+
705+
return currentDigest != storedDigest
706+
}
707+
708+
func (dm *DriverManager) shouldSkipUninstall() bool {
709+
if dm.config.forceReinstall {
710+
dm.log.Info("Force reinstall is enabled, proceeding with driver uninstall")
711+
return false
712+
}
713+
714+
if !dm.shouldUpdateDriverConfig() {
715+
dm.log.Info("Driver is loaded with matching config, enabling fast path")
716+
return true
717+
}
718+
719+
// Driver not loaded or config changed - proceed with cleanup
720+
dm.log.Info("Proceeding with cleanup operations")
721+
return false
722+
}
723+
632724
func (dm *DriverManager) isNouveauLoaded() bool {
633725
_, err := os.Stat("/sys/module/nouveau/refcnt")
634726
return err == nil
@@ -639,6 +731,12 @@ func (dm *DriverManager) unloadNouveau() error {
639731
return unix.DeleteModule("nouveau", 0)
640732
}
641733

734+
func (dm *DriverManager) removePIDFile() {
735+
if err := os.Remove(driverPIDFile); err != nil && !os.IsNotExist(err) {
736+
dm.log.Warnf("Failed to remove PID file %s: %v", driverPIDFile, err)
737+
}
738+
}
739+
642740
func (dm *DriverManager) cleanupDriver() error {
643741
dm.log.Info("Cleaning up NVIDIA driver")
644742

@@ -652,12 +750,7 @@ func (dm *DriverManager) cleanupDriver() error {
652750
return fmt.Errorf("failed to unmount rootfs: %w", err)
653751
}
654752

655-
// Remove PID file
656-
if _, err := os.Stat(driverPIDFile); err == nil {
657-
if err := os.Remove(driverPIDFile); err != nil {
658-
dm.log.Warnf("Failed to remove PID file %s: %v", driverPIDFile, err)
659-
}
660-
}
753+
dm.removePIDFile()
661754

662755
return nil
663756
}

0 commit comments

Comments
 (0)