Skip to content

Commit 741f361

Browse files
authored
Merge pull request #71 from Monokaix/fix-ci
Fix ci and add OWNER info
2 parents cb6e45e + efeab00 commit 741f361

File tree

7 files changed

+47
-25
lines changed

7 files changed

+47
-25
lines changed

.golangci.yml

+3
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ linters-settings:
2222
local-prefixes: github.com/NVIDIA/k8s-device-plugin
2323

2424
issues:
25+
exclude:
26+
# A conversion of a uint8 to an int cannot overflow.
27+
- "G115: integer overflow conversion uint8 -> int"
2528
exclude-rules:
2629
# We use math/rand instead of crypto/rand for unique names in e2e tests.
2730
- path: tests/e2e/

OWNERS

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
reviewers:
2+
- william-wang
3+
- archlitchi
4+
- wangyang0616
5+
- Monokaix
6+
approvers:
7+
- william-wang
8+
- Monokaix

api/config/v1/replicas.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -299,9 +299,9 @@ func (s *ReplicatedDevices) UnmarshalJSON(b []byte) error {
299299
result := make([]ReplicatedDeviceRef, len(slice))
300300
for i, s := range slice {
301301
// Match a uint as a GPU index and convert it to a string
302-
var index uint
302+
var index uint64
303303
if err = json.Unmarshal(s, &index); err == nil {
304-
result[i] = ReplicatedDeviceRef(strconv.Itoa(int(index)))
304+
result[i] = ReplicatedDeviceRef(strconv.FormatUint(index, 10))
305305
continue
306306
}
307307
// Match strings as valid entries if they are GPU indices, MIG indices, or UUIDs

internal/cuda/api.go

+1
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ func DriverGetVersion() (int, Result) {
6565
// DeviceGet returns the device with the specified index.
6666
func DeviceGet(index int) (Device, Result) {
6767
var device Device
68+
//nolint:gosec // Since index is internal-only, we ignore possible overflow errors here.
6869
r := cuDeviceGet(&device, int32(index))
6970

7071
return device, r

internal/rm/health.go

+24-14
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,8 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
8888
}()
8989

9090
parentToDeviceMap := make(map[string]*Device)
91-
deviceIDToGiMap := make(map[string]int)
92-
deviceIDToCiMap := make(map[string]int)
91+
deviceIDToGiMap := make(map[string]uint32)
92+
deviceIDToCiMap := make(map[string]uint32)
9393

9494
eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError)
9595
for _, d := range devices {
@@ -112,7 +112,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
112112

113113
supportedEvents, ret := gpu.GetSupportedEventTypes()
114114
if ret != nvml.SUCCESS {
115-
klog.Infof("Unable to determine the supported events for %v: %v; marking it as unhealthy", d.ID, ret)
115+
klog.Infof("unable to determine the supported events for %v: %v; marking it as unhealthy", d.ID, ret)
116116
unhealthy <- d
117117
continue
118118
}
@@ -176,7 +176,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
176176
if d.IsMigDevice() && e.GpuInstanceId != 0xFFFFFFFF && e.ComputeInstanceId != 0xFFFFFFFF {
177177
gi := deviceIDToGiMap[d.ID]
178178
ci := deviceIDToCiMap[d.ID]
179-
if !(uint32(gi) == e.GpuInstanceId && uint32(ci) == e.ComputeInstanceId) {
179+
if !(gi == e.GpuInstanceId && ci == e.ComputeInstanceId) {
180180
continue
181181
}
182182
klog.Infof("Event for mig device %v (gi=%v, ci=%v)", d.ID, gi, ci)
@@ -215,15 +215,15 @@ func getAdditionalXids(input string) []uint64 {
215215
// getDevicePlacement returns the placement of the specified device.
216216
// For a MIG device the placement is defined by the 3-tuple <parent UUID, GI, CI>
217217
// For a full device the returned 3-tuple is the device's uuid and 0xFFFFFFFF for the other two elements.
218-
func (r *nvmlResourceManager) getDevicePlacement(d *Device) (string, int, int, error) {
218+
func (r *nvmlResourceManager) getDevicePlacement(d *Device) (string, uint32, uint32, error) {
219219
if !d.IsMigDevice() {
220220
// Full device: report the device's own UUID and 0xFFFFFFFF for both GI and CI.
return d.GetUUID(), 0xFFFFFFFF, 0xFFFFFFFF, nil
221221
}
222222
// MIG device: delegate to getMigDeviceParts for the <parent UUID, GI, CI> tuple.
return r.getMigDeviceParts(d)
223223
}
224224

225225
// getMigDeviceParts returns the parent GI and CI ids of the MIG device.
226-
func (r *nvmlResourceManager) getMigDeviceParts(d *Device) (string, int, int, error) {
226+
func (r *nvmlResourceManager) getMigDeviceParts(d *Device) (string, uint32, uint32, error) {
227227
if !d.IsMigDevice() {
228228
return "", 0, 0, fmt.Errorf("cannot get GI and CI of full device")
229229
}
@@ -250,32 +250,42 @@ func (r *nvmlResourceManager) getMigDeviceParts(d *Device) (string, int, int, er
250250
if ret != nvml.SUCCESS {
251251
return "", 0, 0, fmt.Errorf("failed to get Compute Instance ID: %v", ret)
252252
}
253-
return parentUUID, gi, ci, nil
253+
//nolint:gosec // We know that the values returned from Get*InstanceId are within the valid uint32 range.
254+
return parentUUID, uint32(gi), uint32(ci), nil
254255
}
255256
return parseMigDeviceUUID(uuid)
256257
}
257258

258259
// parseMigDeviceUUID splits a MIG device UUID of the form
// "MIG-GPU-<parent>/<gi>/<ci>" into the parent device UUID (the leading
// "GPU-..." field) and the GI and CI ids taken from the second and third
// "/"-separated fields.
// An error is returned when the string does not start with "MIG-", when the
// remainder does not split into three "/"-separated fields whose first field
// begins with "GPU-", or when the gi or ci field cannot be parsed.
259-
func parseMigDeviceUUID(mig string) (string, int, int, error) {
260+
func parseMigDeviceUUID(mig string) (string, uint32, uint32, error) {
260261
tokens := strings.SplitN(mig, "-", 2)
261262
if len(tokens) != 2 || tokens[0] != "MIG" {
262-
return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device")
263+
return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device")
263264
}
264265

265266
tokens = strings.SplitN(tokens[1], "/", 3)
266267
if len(tokens) != 3 || !strings.HasPrefix(tokens[0], "GPU-") {
267-
return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device")
268+
return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device")
268269
}
269270

270-
gi, err := strconv.Atoi(tokens[1])
271+
gi, err := toUint32(tokens[1])
271272
if err != nil {
272-
return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device")
273+
return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device")
273274
}
274275

275-
ci, err := strconv.Atoi(tokens[2])
276+
ci, err := toUint32(tokens[2])
276277
if err != nil {
277-
return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device")
278+
return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device")
278279
}
279280

280281
return tokens[0], gi, ci, nil
281282
}
283+
284+
// toUint32 parses s as a base-10 unsigned integer with a 32-bit size and
// returns it as a uint32. Any error from strconv.ParseUint is returned
// unchanged together with a zero value.
func toUint32(s string) (uint32, error) {
285+
u, err := strconv.ParseUint(s, 10, 32)
286+
if err != nil {
287+
return 0, err
288+
}
289+
//nolint:gosec // Since we parse s with a 32-bit size this will not overflow.
290+
return uint32(u), nil
291+
}

internal/vgpu/pciutil.go

+5-5
Original file line numberDiff line numberDiff line change
@@ -122,11 +122,11 @@ func (d *PCIDevice) GetVendorSpecificCapability() ([]byte, error) {
122122
}
123123

124124
var visited [256]byte
125-
pos := int(GetByte(d.Config, PciCapabilityList))
125+
pos := GetByte(d.Config, PciCapabilityList)
126126
for pos != 0 {
127-
id := int(GetByte(d.Config, pos+PciCapabilityListID))
128-
next := int(GetByte(d.Config, pos+PciCapabilityListNext))
129-
length := int(GetByte(d.Config, pos+PciCapabilityLength))
127+
id := GetByte(d.Config, pos+PciCapabilityListID)
128+
next := GetByte(d.Config, pos+PciCapabilityListNext)
129+
length := GetByte(d.Config, pos+PciCapabilityLength)
130130

131131
if visited[pos] != 0 {
132132
// chain looped
@@ -149,7 +149,7 @@ func (d *PCIDevice) GetVendorSpecificCapability() ([]byte, error) {
149149
}
150150

151151
// GetByte returns the single byte of buffer at the specified position pos.
// It indexes buffer directly, so it panics if pos is not a valid index.
152-
func GetByte(buffer []byte, pos int) uint8 {
152+
func GetByte(buffer []byte, pos uint8) uint8 {
153153
return buffer[pos]
154154
}
155155

internal/vgpu/vgpu.go

+4-4
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ type Info struct {
4040

4141
const (
4242
// VGPUCapabilityRecordStart indicates the offset of the beginning of the vGPU capability record
43-
VGPUCapabilityRecordStart = 5
43+
VGPUCapabilityRecordStart uint8 = 5
4444
// HostDriverVersionLength indicates max length of driver version
4545
HostDriverVersionLength = 10
4646
// HostDriverBranchLength indicates max length of driver branch
@@ -116,14 +116,14 @@ func (d *Device) GetInfo() (*Info, error) {
116116
foundDriverVersionRecord := false
117117
pos := VGPUCapabilityRecordStart
118118
record := GetByte(d.vGPUCapability, VGPUCapabilityRecordStart)
119-
for record != 0 && pos < len(d.vGPUCapability) {
119+
for record != 0 && int(pos) < len(d.vGPUCapability) {
120120
// find next record
121121
recordLength := GetByte(d.vGPUCapability, pos+1)
122-
pos += int(recordLength)
122+
pos += recordLength
123123
record = GetByte(d.vGPUCapability, pos)
124124
}
125125

126-
if record == 0 && pos+2+HostDriverVersionLength+HostDriverBranchLength <= len(d.vGPUCapability) {
126+
if record == 0 && int(pos+2+HostDriverVersionLength+HostDriverBranchLength) <= len(d.vGPUCapability) {
127127
foundDriverVersionRecord = true
128128
// found vGPU host driver version record type
129129
// initialized at record data byte, i.e. pos + 1 (record id byte) + 1 (record length byte)

0 commit comments

Comments
 (0)