@@ -47,18 +47,20 @@ const (
4747)
4848
4949type VfioPciManager struct {
50- driverRoot string
51- driver string
52- nvlib * deviceLib
53- nvidiaEnabled bool
50+ containerDriverRoot string
51+ hostDriverRoot string
52+ driver string
53+ nvlib * deviceLib
54+ nvidiaEnabled bool
5455}
5556
56- func NewVfioPciManager (driverRoot string , nvlib * deviceLib , nvidiaEnabled bool ) * VfioPciManager {
57+ func NewVfioPciManager (containerDriverRoot string , hostDriverRoot string , nvlib * deviceLib , nvidiaEnabled bool ) * VfioPciManager {
5758 vm := & VfioPciManager {
58- driverRoot : driverRoot ,
59- driver : vfioPciDriver ,
60- nvlib : nvlib ,
61- nvidiaEnabled : nvidiaEnabled ,
59+ containerDriverRoot : containerDriverRoot ,
60+ hostDriverRoot : hostDriverRoot ,
61+ driver : vfioPciDriver ,
62+ nvlib : nvlib ,
63+ nvidiaEnabled : nvidiaEnabled ,
6264 }
6365 if ! vm .isVfioPCIModuleLoaded () {
6466 err := vm .loadVfioPciModule ()
@@ -135,20 +137,21 @@ func (vm *VfioPciManager) WaitForGPUFree(ctx context.Context, info *VfioDeviceIn
135137 ticker := time .NewTicker (gpuFreeCheckInterval )
136138 defer ticker .Stop ()
137139
138- gpuDeviceNode := filepath .Join (vm .driverRoot , "dev" , fmt .Sprintf ("nvidia%d" , info .parent .minor ))
140+ gpuDeviceNode := filepath .Join (vm .hostDriverRoot , "dev" , fmt .Sprintf ("nvidia%d" , info .parent .minor ))
139141 for {
140142 select {
141143 case <- timeout :
142144 return fmt .Errorf ("timed out waiting for gpu to be free" )
143145 case <- ticker .C :
144- out , err := execCommand ("lsof" , []string {gpuDeviceNode }) //nolint:gosec
145- klog .Infof ("lsof output: %s, err: %v" , string (out ), err )
146+ out , err := execCommandWithChroot (hostRoot , "fuser" , []string {gpuDeviceNode }) //nolint:gosec
146147 if err != nil {
147148 if exitErr , ok := err .(* exec.ExitError ); ok && exitErr .ExitCode () == 1 {
148149 return nil
149150 }
151+ klog .Errorf ("Unexpected error checking if gpu device %q is free: %v" , info .pcieBusID , err )
150152 continue
151153 }
154+ klog .Infof ("gpu device %q has open fds by process(es): %s" , info .pcieBusID , string (out ))
152155 }
153156 }
154157}
0 commit comments