@@ -42,6 +42,7 @@ import (
4242const (
4343 driverRoot = "/run/nvidia/driver"
4444 driverPIDFile = "/run/nvidia/nvidia-driver.pid"
45+ driverConfigStateFile = "/run/nvidia/nvidia-driver.state"
4546 operatorNamespace = "gpu-operator"
4647 pausedStr = "paused-for-driver-upgrade"
4748 defaultDrainTimeout = time .Second * 0
@@ -77,6 +78,8 @@ type config struct {
7778 gpuDirectRDMAEnabled bool
7879 useHostMofed bool
7980 kubeconfig string
81+ driverVersion string
82+ forceReinstall bool
8083}
8184
8285// ComponentState tracks the deployment state of GPU operator components
@@ -208,6 +211,20 @@ func main() {
208211 EnvVars : []string {"KUBECONFIG" },
209212 Value : "" ,
210213 },
214+ & cli.StringFlag {
215+ Name : "driver-version" ,
216+ Usage : "Desired NVIDIA driver version" ,
217+ Destination : & cfg .driverVersion ,
218+ EnvVars : []string {"DRIVER_VERSION" },
219+ Value : "" ,
220+ },
221+ & cli.BoolFlag {
222+ Name : "force-reinstall" ,
223+ Usage : "Force driver reinstall regardless of current state" ,
224+ Destination : & cfg .forceReinstall ,
225+ EnvVars : []string {"FORCE_REINSTALL" },
226+ Value : false ,
227+ },
211228 }
212229
213230 app .Commands = []* cli.Command {
@@ -288,6 +305,26 @@ func (dm *DriverManager) uninstallDriver() error {
288305 return fmt .Errorf ("failed to evict GPU operator components: %w" , err )
289306 }
290307
308+ if dm .shouldSkipUninstall () {
309+ dm .log .Info ("Fast path activated: desired driver version and configuration already present" )
310+
311+ // Clean up stale artifacts from previous container before rescheduling operands
312+ dm .log .Info ("Cleaning up stale mounts and state files..." )
313+
314+ // Unmount stale rootfs from previous container
315+ if err := dm .unmountRootfs (); err != nil {
316+ return fmt .Errorf ("failed to unmount stale rootfs: %w" , err )
317+ }
318+
319+ // Remove stale PID file from previous container
320+ dm .removePIDFile ()
321+
322+ if err := dm .rescheduleGPUOperatorComponents (); err != nil {
323+ dm .log .Warnf ("Failed to reschedule GPU operator components: %v" , err )
324+ }
325+ return nil
326+ }
327+
291328 drainOpts := kube.DrainOptions {
292329 Force : dm .config .drainUseForce ,
293330 DeleteEmptyDirData : dm .config .drainDeleteEmptyDirData ,
@@ -629,6 +666,61 @@ func (dm *DriverManager) isDriverLoaded() bool {
629666 return err == nil
630667}
631668
669+ // readStoredDigest reads the driver configuration digest from the state file
670+ func readStoredDigest () (string , error ) {
671+ data , err := os .ReadFile (driverConfigStateFile )
672+ if err != nil {
673+ return "" , err
674+ }
675+ return strings .TrimSpace (string (data )), nil
676+ }
677+
// getCurrentDigest reports the driver configuration digest supplied through
// the DRIVER_CONFIG_DIGEST environment variable. The result is the empty
// string when the variable is unset or set to an empty value.
func getCurrentDigest() string {
	const digestEnvVar = "DRIVER_CONFIG_DIGEST"

	digest := os.Getenv(digestEnvVar)
	return digest
}
682+
683+ // shouldUpdateDriverConfig checks if the driver configuration needs to be updated
684+ func (dm * DriverManager ) shouldUpdateDriverConfig () bool {
685+ if ! dm .isDriverLoaded () {
686+ return true
687+ }
688+
689+ currentDigest := getCurrentDigest ()
690+ if currentDigest == "" {
691+ dm .log .Warn ("DRIVER_CONFIG_DIGEST env var not set, assuming config changed" )
692+ return true
693+ }
694+
695+ storedDigest , err := readStoredDigest ()
696+ if err != nil {
697+ if os .IsNotExist (err ) {
698+ dm .log .Info ("No previous driver configuration found" )
699+ } else {
700+ dm .log .Warnf ("Failed to read driver config state file: %v" , err )
701+ }
702+ return true
703+ }
704+
705+ return currentDigest != storedDigest
706+ }
707+
708+ func (dm * DriverManager ) shouldSkipUninstall () bool {
709+ if dm .config .forceReinstall {
710+ dm .log .Info ("Force reinstall is enabled, proceeding with driver uninstall" )
711+ return false
712+ }
713+
714+ if ! dm .shouldUpdateDriverConfig () {
715+ dm .log .Info ("Driver is loaded with matching config, enabling fast path" )
716+ return true
717+ }
718+
719+ // Driver not loaded or config changed - proceed with cleanup
720+ dm .log .Info ("Proceeding with cleanup operations" )
721+ return false
722+ }
723+
632724func (dm * DriverManager ) isNouveauLoaded () bool {
633725 _ , err := os .Stat ("/sys/module/nouveau/refcnt" )
634726 return err == nil
@@ -639,6 +731,12 @@ func (dm *DriverManager) unloadNouveau() error {
639731 return unix .DeleteModule ("nouveau" , 0 )
640732}
641733
734+ func (dm * DriverManager ) removePIDFile () {
735+ if err := os .Remove (driverPIDFile ); err != nil && ! os .IsNotExist (err ) {
736+ dm .log .Warnf ("Failed to remove PID file %s: %v" , driverPIDFile , err )
737+ }
738+ }
739+
642740func (dm * DriverManager ) cleanupDriver () error {
643741 dm .log .Info ("Cleaning up NVIDIA driver" )
644742
@@ -652,12 +750,7 @@ func (dm *DriverManager) cleanupDriver() error {
652750 return fmt .Errorf ("failed to unmount rootfs: %w" , err )
653751 }
654752
655- // Remove PID file
656- if _ , err := os .Stat (driverPIDFile ); err == nil {
657- if err := os .Remove (driverPIDFile ); err != nil {
658- dm .log .Warnf ("Failed to remove PID file %s: %v" , driverPIDFile , err )
659- }
660- }
753+ dm .removePIDFile ()
661754
662755 return nil
663756}
0 commit comments