@@ -29,11 +29,9 @@ import (
2929
3030// Main intent: help users to self-troubleshoot when the GPU driver is not set up
3131// properly before installing this DRA driver. In that case, the log of the init
32- // container running this script is meant to yield an actionable error message.
32+ // container running the prestart code is meant to yield an actionable error message.
3333// For now, rely on k8s to implement a high-level retry with back-off.
34- func runPrestartInit (ctx context.Context ) error {
35- // Design goal: long-running init container that retries at constant frequency,
36- // and leaves only upon success (with code 0).
34+ func runPrestart (ctx context.Context ) error {
3735 waitS := 10 * time .Second
3836 attempt := 0
3937
@@ -45,7 +43,7 @@ func runPrestartInit(ctx context.Context) error {
4543 }
4644
4745 driverRootParent := "/driver-root-parent"
48- // Remove trailing slash (if existing) and get last path element.
46+ // filepath.Base removes a trailing slash (if present) and returns the last path element.
4947 driverRootPath := filepath .Join (driverRootParent , filepath .Base (nvidiaDriverRoot ))
5048
5149 // Ensure the /driver-root-parent directory exists
@@ -58,7 +56,7 @@ func runPrestartInit(ctx context.Context) error {
5856 // once the driver becomes mounted (e.g., once GPU operator provides the driver
5957 // on the host at /run/nvidia/driver).
6058 fmt .Printf ("create symlink: /driver-root -> %s\n " , driverRootPath )
61- _ = os . Remove ( "/driver-root" )
59+
6260 if err := os .Symlink (driverRootPath , "/driver-root" ); err != nil {
6361 klog .Warningf ("Failed to create symlink: %v" , err )
6462 }
@@ -96,6 +94,10 @@ func validateAndExitOnSuccess(ctx context.Context, nvidiaDriverRoot string, atte
9694 // Search specific set of directories (not recursively: not required, and
9795 // /driver-root may be a big tree). Limit to first result (multiple results
9896 // are a bit of a pathological state, but continue with validation logic).
97+
98+ // The original script does not follow symlinks for nvPath, but since a
99+ // symlink can also be executed, reuse findFirstFile to avoid a new func
100+ // that's largely duplicative.
99101 nvPath := findFirstFile (
100102 "nvidia-smi" ,
101103 "/driver-root/opt/bin" ,
@@ -105,7 +107,6 @@ func validateAndExitOnSuccess(ctx context.Context, nvidiaDriverRoot string, atte
105107 "/driver-root/sbin" ,
106108 )
107109
108- // Follow symlinks (-L), because `libnvidia-ml.so.1` is typically a link.
109110 nvLibPath := findFirstFile (
110111 "libnvidia-ml.so.1" ,
111112 "/driver-root/usr/lib64" ,
@@ -145,6 +146,7 @@ func validateAndExitOnSuccess(ctx context.Context, nvidiaDriverRoot string, atte
145146 // hang).
146147 fmt .Printf ("invoke: env -i LD_PRELOAD=%s %s\n " , nvLibPath , nvPath )
147148
149+ // override default env to just LD_PRELOAD
148150 cmd := exec .CommandContext (ctx , nvPath )
149151 cmd .Env = []string {"LD_PRELOAD=" + nvLibPath }
150152 cmd .Stdout = os .Stdout
@@ -162,6 +164,7 @@ func validateAndExitOnSuccess(ctx context.Context, nvidiaDriverRoot string, atte
162164 if exitErr , ok := err .(* exec.ExitError ); ok {
163165 fmt .Printf ("exit code: %d\n " , exitErr .ExitCode ())
164166 } else {
167+ // nvidia-smi failed to start (e.g., permission denied).
165168 fmt .Printf ("execution failed: %v, exit code: -1\n " , err )
166169 }
167170 }
@@ -209,6 +212,9 @@ func validateAndExitOnSuccess(ctx context.Context, nvidiaDriverRoot string, atte
209212 return false
210213}
211214
215+ // findFirstFile finds the first occurrence of filename in the provided
216+ // directories (non-recursively).
217+ // It follows symlinks (similar to find -L).
212218func findFirstFile (filename string , dirs ... string ) string {
213219 for _ , dir := range dirs {
214220 path := filepath .Join (dir , filename )
0 commit comments