Skip to content

Commit 0e03ca5

Browse files
committed
use fuser to check if gpu is free
Signed-off-by: Varun Ramachandra Sekar <[email protected]>
1 parent d669d06 commit 0e03ca5

File tree

3 files changed: 17 additions and 25 deletions.

cmd/gpu-kubelet-plugin/device_state.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -98,7 +98,7 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
9898

9999
var vfioPciManager *VfioPciManager
100100
if featuregates.Enabled(featuregates.PassthroughSupport) {
101-
manager := NewVfioPciManager(string(containerDriverRoot), nvdevlib, true /* nvidiaEnabled */)
101+
manager := NewVfioPciManager(string(containerDriverRoot), string(hostDriverRoot), nvdevlib, true /* nvidiaEnabled */)
102102
if err := manager.Prechecks(); err == nil {
103103
vfioPciManager = manager
104104
} else {

cmd/gpu-kubelet-plugin/vfio-device.go

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -47,18 +47,20 @@ const (
4747
)
4848

4949
type VfioPciManager struct {
50-
driverRoot string
51-
driver string
52-
nvlib *deviceLib
53-
nvidiaEnabled bool
50+
containerDriverRoot string
51+
hostDriverRoot string
52+
driver string
53+
nvlib *deviceLib
54+
nvidiaEnabled bool
5455
}
5556

56-
func NewVfioPciManager(driverRoot string, nvlib *deviceLib, nvidiaEnabled bool) *VfioPciManager {
57+
func NewVfioPciManager(containerDriverRoot string, hostDriverRoot string, nvlib *deviceLib, nvidiaEnabled bool) *VfioPciManager {
5758
vm := &VfioPciManager{
58-
driverRoot: driverRoot,
59-
driver: vfioPciDriver,
60-
nvlib: nvlib,
61-
nvidiaEnabled: nvidiaEnabled,
59+
containerDriverRoot: containerDriverRoot,
60+
hostDriverRoot: hostDriverRoot,
61+
driver: vfioPciDriver,
62+
nvlib: nvlib,
63+
nvidiaEnabled: nvidiaEnabled,
6264
}
6365
if !vm.isVfioPCIModuleLoaded() {
6466
err := vm.loadVfioPciModule()
@@ -135,20 +137,21 @@ func (vm *VfioPciManager) WaitForGPUFree(ctx context.Context, info *VfioDeviceIn
135137
ticker := time.NewTicker(gpuFreeCheckInterval)
136138
defer ticker.Stop()
137139

138-
gpuDeviceNode := filepath.Join(vm.driverRoot, "dev", fmt.Sprintf("nvidia%d", info.parent.minor))
140+
gpuDeviceNode := filepath.Join(vm.hostDriverRoot, "dev", fmt.Sprintf("nvidia%d", info.parent.minor))
139141
for {
140142
select {
141143
case <-timeout:
142144
return fmt.Errorf("timed out waiting for gpu to be free")
143145
case <-ticker.C:
144-
out, err := execCommand("lsof", []string{gpuDeviceNode}) //nolint:gosec
145-
klog.Infof("lsof output: %s, err: %v", string(out), err)
146+
out, err := execCommandWithChroot(hostRoot, "fuser", []string{gpuDeviceNode}) //nolint:gosec
146147
if err != nil {
147148
if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 1 {
148149
return nil
149150
}
151+
klog.Errorf("Unexpected error checking if gpu device %q is free: %v", info.pcieBusID, err)
150152
continue
151153
}
154+
klog.Infof("gpu device %q has open fds by process(es): %s", info.pcieBusID, string(out))
152155
}
153156
}
154157
}

deployments/container/Dockerfile

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,7 @@ RUN apt-get update && \
4343
make \
4444
git \
4545
gcc-aarch64-linux-gnu \
46-
gcc \
47-
lsof
46+
gcc
4847

4948
# Install dependencies for `bash-static` build.
5049
RUN apt-get install -y gpg curl autoconf file
@@ -148,16 +147,6 @@ COPY --from=build /artifacts/webhook /usr/bin/webhook
148147
COPY --from=build /bashbuild/bash /bin/bash
149148
COPY --from=build /build/scripts/bind_to_driver.sh /usr/bin/bind_to_driver.sh
150149
COPY --from=build /build/scripts/unbind_from_driver.sh /usr/bin/unbind_from_driver.sh
151-
COPY --from=build /lib/x86_64-linux-gnu/libselinux.so.1 /lib/x86_64-linux-gnu/libselinux.so.1
152-
COPY --from=build /lib/x86_64-linux-gnu/libtirpc.so.3 /lib/x86_64-linux-gnu/libtirpc.so.3
153-
COPY --from=build /lib/x86_64-linux-gnu/libpcre2-8.so.0 /lib/x86_64-linux-gnu/libpcre2-8.so.0
154-
COPY --from=build /lib/x86_64-linux-gnu/libgssapi_krb5.so.2 /lib/x86_64-linux-gnu/libgssapi_krb5.so.2
155-
COPY --from=build /lib/x86_64-linux-gnu/libkrb5.so.3 /lib/x86_64-linux-gnu/libkrb5.so.3
156-
COPY --from=build /lib/x86_64-linux-gnu/libk5crypto.so.3 /lib/x86_64-linux-gnu/libk5crypto.so.3
157-
COPY --from=build /lib/x86_64-linux-gnu/libcom_err.so.2 /lib/x86_64-linux-gnu/libcom_err.so.2
158-
COPY --from=build /lib/x86_64-linux-gnu/libkrb5support.so.0 /lib/x86_64-linux-gnu/libkrb5support.so.0
159-
COPY --from=build /lib/x86_64-linux-gnu/libkeyutils.so.1 /lib/x86_64-linux-gnu/libkeyutils.so.1
160-
COPY --from=build /usr/bin/lsof /usr/bin/lsof
161150
COPY /hack/kubelet-plugin-prestart.sh /usr/bin/kubelet-plugin-prestart.sh
162151
COPY /templates /templates
163152

0 commit comments

Comments
 (0)