Skip to content

Commit 6fd5c53

Browse files
committed
fix 1. todo: squash later
Signed-off-by: Léiyì Zhang <leiyiz@google.com>
1 parent bcd7bb5 commit 6fd5c53

File tree

4 files changed

+22
-17
lines changed

4 files changed

+22
-17
lines changed

cmd/gpu-kubelet-plugin/main.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,11 @@ func main() {
7373
fmt.Fprintf(os.Stderr, "Error masking NVIDIA driver params: %v\n", err)
7474
}
7575

76-
if len(os.Args) > 1 && os.Args[1] == "prestart-init" {
76+
if len(os.Args) > 1 && os.Args[1] == "prestart" {
7777
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGTERM)
7878
defer cancel()
7979

80-
if err := runPrestartInit(ctx); err != nil {
80+
if err := runPrestart(ctx); err != nil {
8181
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
8282
os.Exit(1)
8383
}

cmd/gpu-kubelet-plugin/prestart.go

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,9 @@ import (
2929

3030
// Main intent: help users to self-troubleshoot when the GPU driver is not set up
3131
// properly before installing this DRA driver. In that case, the log of the init
32-
// container running this script is meant to yield an actionable error message.
32+
// container running the prestart code is meant to yield an actionable error message.
3333
// For now, rely on k8s to implement a high-level retry with back-off.
34-
func runPrestartInit(ctx context.Context) error {
35-
// Design goal: long-running init container that retries at constant frequency,
36-
// and leaves only upon success (with code 0).
34+
func runPrestart(ctx context.Context) error {
3735
waitS := 10 * time.Second
3836
attempt := 0
3937

@@ -45,7 +43,7 @@ func runPrestartInit(ctx context.Context) error {
4543
}
4644

4745
driverRootParent := "/driver-root-parent"
48-
// Remove trailing slash (if existing) and get last path element.
46+
// filepath.Base drops any trailing slash and returns the last path element.
4947
driverRootPath := filepath.Join(driverRootParent, filepath.Base(nvidiaDriverRoot))
5048

5149
// Ensure the /driver-root-parent directory exists
@@ -58,7 +56,7 @@ func runPrestartInit(ctx context.Context) error {
5856
// once the driver becomes mounted (e.g., once GPU operator provides the driver
5957
// on the host at /run/nvidia/driver).
6058
fmt.Printf("create symlink: /driver-root -> %s\n", driverRootPath)
61-
_ = os.Remove("/driver-root")
59+
6260
if err := os.Symlink(driverRootPath, "/driver-root"); err != nil {
6361
klog.Warningf("Failed to create symlink: %v", err)
6462
}
@@ -96,6 +94,10 @@ func validateAndExitOnSuccess(ctx context.Context, nvidiaDriverRoot string, atte
9694
// Search specific set of directories (not recursively: not required, and
9795
// /driver-root may be a big tree). Limit to first result (multiple results
9896
// are a bit of a pathological state, but continue with validation logic).
97+
98+
// The original script does not follow symlinks for nvPath, but since a symlink
99+
// can also be executed, reuse findFirstFile to avoid a new func that's largely
100+
// duplicative.
99101
nvPath := findFirstFile(
100102
"nvidia-smi",
101103
"/driver-root/opt/bin",
@@ -105,7 +107,6 @@ func validateAndExitOnSuccess(ctx context.Context, nvidiaDriverRoot string, atte
105107
"/driver-root/sbin",
106108
)
107109

108-
// Follow symlinks (-L), because `libnvidia-ml.so.1` is typically a link.
109110
nvLibPath := findFirstFile(
110111
"libnvidia-ml.so.1",
111112
"/driver-root/usr/lib64",
@@ -145,6 +146,7 @@ func validateAndExitOnSuccess(ctx context.Context, nvidiaDriverRoot string, atte
145146
// hang).
146147
fmt.Printf("invoke: env -i LD_PRELOAD=%s %s\n", nvLibPath, nvPath)
147148

149+
// override default env to just LD_PRELOAD
148150
cmd := exec.CommandContext(ctx, nvPath)
149151
cmd.Env = []string{"LD_PRELOAD=" + nvLibPath}
150152
cmd.Stdout = os.Stdout
@@ -162,6 +164,7 @@ func validateAndExitOnSuccess(ctx context.Context, nvidiaDriverRoot string, atte
162164
if exitErr, ok := err.(*exec.ExitError); ok {
163165
fmt.Printf("exit code: %d\n", exitErr.ExitCode())
164166
} else {
167+
// nvidia-smi failed to start (e.g., permission denied).
165168
fmt.Printf("execution failed: %v, exit code: -1\n", err)
166169
}
167170
}
@@ -209,6 +212,9 @@ func validateAndExitOnSuccess(ctx context.Context, nvidiaDriverRoot string, atte
209212
return false
210213
}
211214

215+
// findFirstFile finds the first occurrence of filename in the provided
216+
// directories, without recursing into subdirectories.
217+
// It follows symlinks (similar to find -L).
212218
func findFirstFile(filename string, dirs ...string) string {
213219
for _, dir := range dirs {
214220
path := filepath.Join(dir, filename)

cmd/gpu-kubelet-plugin/vfio-device.go

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ func (vm *VfioPciManager) acquireUnbindLock(gpu string) error {
260260
klog.Infof("[retry %d/%d] Attempting to acquire unbindLock for %s", attempt, lockRetries, gpu)
261261

262262
// Try to write 1 to acquire the lock
263-
err := os.WriteFile(unbindLockFile, []byte("1\n"), 0200)
263+
err := os.WriteFile(unbindLockFile, []byte("1\n"), 0644)
264264
if err != nil {
265265
klog.Warningf("failed to write to unbindLock file %s: %v", unbindLockFile, err)
266266
}
@@ -300,8 +300,7 @@ func (vm *VfioPciManager) unbindFromDriver(pciAddress string) error {
300300
}
301301
}
302302

303-
unbindFile := filepath.Join(existingDriver, "unbind")
304-
if err := os.WriteFile(unbindFile, []byte(pciAddress+"\n"), 0200); err != nil {
303+
if err := os.WriteFile(filepath.Join(existingDriver, "unbind"), []byte(pciAddress+"\n"), 0644); err != nil {
305304
klog.Errorf("Attempting to unbind %s from its driver failed; err: %v", pciAddress, err)
306305
return err
307306
}
@@ -318,7 +317,7 @@ func (vm *VfioPciManager) bindToDriver(pciAddress, driver string) error {
318317
return fmt.Errorf("driver_override file not found: %v", err)
319318
}
320319

321-
if err := os.WriteFile(driverOverrideFile, []byte(driver+"\n"), 0200); err != nil {
320+
if err := os.WriteFile(driverOverrideFile, []byte(driver+"\n"), 0644); err != nil {
322321
klog.Errorf("failed to write '%s' to %s", driver, driverOverrideFile)
323322
return fmt.Errorf("failed to write to driver_override: %v", err)
324323
}
@@ -328,10 +327,10 @@ func (vm *VfioPciManager) bindToDriver(pciAddress, driver string) error {
328327
return fmt.Errorf("bind file not found: %v", err)
329328
}
330329

331-
if err := os.WriteFile(bindFile, []byte(pciAddress+"\n"), 0200); err != nil {
332-
klog.Errorf("Attempting to bind %s to %s driver failed; err: %v", pciAddress, driver, err)
330+
if err := os.WriteFile(bindFile, []byte(pciAddress+"\n"), 0644); err != nil {
331+
klog.Errorf("failed to write %s to %s; err: %v", pciAddress, bindFile, err)
333332
// attempt to revert driver_override
334-
_ = os.WriteFile(driverOverrideFile, []byte("\n"), 0200)
333+
_ = os.WriteFile(driverOverrideFile, []byte("\n"), 0644)
335334
return fmt.Errorf("failed to write to bind file: %v", err)
336335
}
337336
return nil

deployments/helm/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ spec:
5454
image: {{ include "nvidia-dra-driver-gpu.fullimage" . }}
5555
securityContext:
5656
privileged: true
57-
command: [/usr/bin/gpu-kubelet-plugin, prestart-init]
57+
command: [/usr/bin/gpu-kubelet-plugin, prestart]
5858
env:
5959
- name: NVIDIA_DRIVER_ROOT
6060
value: "{{ .Values.nvidiaDriverRoot }}"

0 commit comments

Comments
 (0)