@@ -88,8 +88,8 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
88
88
}()
89
89
90
90
parentToDeviceMap := make (map [string ]* Device )
91
- deviceIDToGiMap := make (map [string ]int )
92
- deviceIDToCiMap := make (map [string ]int )
91
+ deviceIDToGiMap := make (map [string ]uint32 )
92
+ deviceIDToCiMap := make (map [string ]uint32 )
93
93
94
94
eventMask := uint64 (nvml .EventTypeXidCriticalError | nvml .EventTypeDoubleBitEccError | nvml .EventTypeSingleBitEccError )
95
95
for _ , d := range devices {
@@ -112,7 +112,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
112
112
113
113
supportedEvents , ret := gpu .GetSupportedEventTypes ()
114
114
if ret != nvml .SUCCESS {
115
- klog .Infof ("Unable to determine the supported events for %v: %v; marking it as unhealthy" , d .ID , ret )
115
+ klog .Infof ("unable to determine the supported events for %v: %v; marking it as unhealthy" , d .ID , ret )
116
116
unhealthy <- d
117
117
continue
118
118
}
@@ -176,7 +176,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
176
176
if d .IsMigDevice () && e .GpuInstanceId != 0xFFFFFFFF && e .ComputeInstanceId != 0xFFFFFFFF {
177
177
gi := deviceIDToGiMap [d .ID ]
178
178
ci := deviceIDToCiMap [d .ID ]
179
- if ! (uint32 ( gi ) == e .GpuInstanceId && uint32 ( ci ) == e .ComputeInstanceId ) {
179
+ if ! (gi == e .GpuInstanceId && ci == e .ComputeInstanceId ) {
180
180
continue
181
181
}
182
182
klog .Infof ("Event for mig device %v (gi=%v, ci=%v)" , d .ID , gi , ci )
@@ -215,15 +215,15 @@ func getAdditionalXids(input string) []uint64 {
215
215
// getDevicePlacement returns the placement of the specified device.
216
216
// For a MIG device the placement is defined by the 3-tuple <parent UUID, GI, CI>
217
217
// For a full device the returned 3-tuple is the device's uuid and 0xFFFFFFFF for the other two elements.
218
- func (r * nvmlResourceManager ) getDevicePlacement (d * Device ) (string , int , int , error ) {
218
+ func (r * nvmlResourceManager ) getDevicePlacement (d * Device ) (string , uint32 , uint32 , error ) {
219
219
if ! d .IsMigDevice () {
220
220
return d .GetUUID (), 0xFFFFFFFF , 0xFFFFFFFF , nil
221
221
}
222
222
return r .getMigDeviceParts (d )
223
223
}
224
224
225
225
// getMigDeviceParts returns the parent UUID and the GI and CI ids of the MIG device.
226
- func (r * nvmlResourceManager ) getMigDeviceParts (d * Device ) (string , int , int , error ) {
226
+ func (r * nvmlResourceManager ) getMigDeviceParts (d * Device ) (string , uint32 , uint32 , error ) {
227
227
if ! d .IsMigDevice () {
228
228
return "" , 0 , 0 , fmt .Errorf ("cannot get GI and CI of full device" )
229
229
}
@@ -250,32 +250,42 @@ func (r *nvmlResourceManager) getMigDeviceParts(d *Device) (string, int, int, er
250
250
if ret != nvml .SUCCESS {
251
251
return "" , 0 , 0 , fmt .Errorf ("failed to get Compute Instance ID: %v" , ret )
252
252
}
253
- return parentUUID , gi , ci , nil
253
+ //nolint:gosec // We know that the values returned from Get*InstanceId are within the valid uint32 range.
254
+ return parentUUID , uint32 (gi ), uint32 (ci ), nil
254
255
}
255
256
return parseMigDeviceUUID (uuid )
256
257
}
257
258
258
259
// parseMigDeviceUUID splits the MIG device UUID into the parent device UUID and ci and gi
259
- func parseMigDeviceUUID (mig string ) (string , int , int , error ) {
260
+ func parseMigDeviceUUID (mig string ) (string , uint32 , uint32 , error ) {
260
261
tokens := strings .SplitN (mig , "-" , 2 )
261
262
if len (tokens ) != 2 || tokens [0 ] != "MIG" {
262
- return "" , 0 , 0 , fmt .Errorf ("Unable to parse UUID as MIG device" )
263
+ return "" , 0 , 0 , fmt .Errorf ("unable to parse UUID as MIG device" )
263
264
}
264
265
265
266
tokens = strings .SplitN (tokens [1 ], "/" , 3 )
266
267
if len (tokens ) != 3 || ! strings .HasPrefix (tokens [0 ], "GPU-" ) {
267
- return "" , 0 , 0 , fmt .Errorf ("Unable to parse UUID as MIG device" )
268
+ return "" , 0 , 0 , fmt .Errorf ("unable to parse UUID as MIG device" )
268
269
}
269
270
270
- gi , err := strconv . Atoi (tokens [1 ])
271
+ gi , err := toUint32 (tokens [1 ])
271
272
if err != nil {
272
- return "" , 0 , 0 , fmt .Errorf ("Unable to parse UUID as MIG device" )
273
+ return "" , 0 , 0 , fmt .Errorf ("unable to parse UUID as MIG device" )
273
274
}
274
275
275
- ci , err := strconv . Atoi (tokens [2 ])
276
+ ci , err := toUint32 (tokens [2 ])
276
277
if err != nil {
277
- return "" , 0 , 0 , fmt .Errorf ("Unable to parse UUID as MIG device" )
278
+ return "" , 0 , 0 , fmt .Errorf ("unable to parse UUID as MIG device" )
278
279
}
279
280
280
281
return tokens [0 ], gi , ci , nil
281
282
}
283
+
284
// toUint32 converts the base-10 string s to a uint32.
// It returns 0 together with the strconv.ParseUint error when s is not a
// valid unsigned decimal number that fits in 32 bits.
func toUint32(s string) (uint32, error) {
	parsed, parseErr := strconv.ParseUint(s, 10, 32)
	if parseErr == nil {
		//nolint:gosec // Since we parse s with a 32-bit size this will not overflow.
		return uint32(parsed), nil
	}
	return 0, parseErr
}
0 commit comments