@@ -188,7 +188,17 @@ func (m *NvidiaDevicePlugin) DevicesNum() int {
188
188
func (m * NvidiaDevicePlugin ) Serve () error {
189
189
sock , err := net .Listen ("unix" , m .socket )
190
190
if err != nil {
191
- return err
191
+ log .Printf ("Listen sock fail and retry for '%s': %s" , m .resourceName , err )
192
+ err = os .Remove (m .socket )
193
+ if err != nil {
194
+ log .Printf ("Error deleting file: %s, %v\n " , m .socket , err )
195
+ return err
196
+ }
197
+ sock , err = net .Listen ("unix" , m .socket )
198
+ if err != nil {
199
+ log .Printf ("Retry Listen sock fail '%s': %s" , m .resourceName , err )
200
+ return err
201
+ }
192
202
}
193
203
194
204
pluginapi .RegisterDevicePluginServer (m .server , m )
@@ -343,6 +353,7 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.Alloc
343
353
}
344
354
345
355
sort .Sort (availablePods )
356
+ util .UseClient (m .kubeInteractor .clientset )
346
357
347
358
var candidatePod * v1.Pod
348
359
for _ , pod := range availablePods {
@@ -406,7 +417,6 @@ Allocate:
406
417
return nil , fmt .Errorf ("failed to update pod annotation %v" , err )
407
418
}
408
419
409
- util .UseClient (m .kubeInteractor .clientset )
410
420
klog .V (3 ).Infoln ("Releasing lock: nodeName=" , m .kubeInteractor .nodeName )
411
421
err = util .ReleaseNodeLock (m .kubeInteractor .nodeName , "gpu" )
412
422
if err != nil {
0 commit comments