Skip to content

Commit 43fcbc8

Browse files
committed
address review comment of eventMap and health string
Signed-off-by: Swati Gupta <[email protected]>
1 parent 599fb15 commit 43fcbc8

File tree

4 files changed

+83
-60
lines changed

4 files changed

+83
-60
lines changed

cmd/gpu-kubelet-plugin/allocatable.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,13 @@ import (
2222
resourceapi "k8s.io/api/resource/v1"
2323
)
2424

25+
type HealthStatus string
26+
2527
const (
2628
// Healthy means that the device is healthy.
27-
Healthy = "Healthy"
29+
Healthy HealthStatus = "Healthy"
2830
// Unhealthy means that the device is unhealthy.
29-
Unhealthy = "Unhealthy"
31+
Unhealthy HealthStatus = "Unhealthy"
3032
)
3133

3234
type AllocatableDevices map[string]*AllocatableDevice
@@ -35,7 +37,7 @@ type AllocatableDevice struct {
3537
Gpu *GpuInfo
3638
Mig *MigDeviceInfo
3739
// Defined similarly to https://pkg.go.dev/k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1#Healthy
38-
Health string
40+
Health HealthStatus
3941
}
4042

4143
func (d AllocatableDevice) Type() string {

cmd/gpu-kubelet-plugin/device_health.go

Lines changed: 57 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -31,23 +31,24 @@ const (
3131
FullGPUInstanceID uint32 = 0xFFFFFFFF
3232
)
3333

34-
type deviceHealthMonitor struct {
35-
nvmllib nvml.Interface
36-
eventSet nvml.EventSet
37-
unhealthy chan *AllocatableDevice
38-
cancelContext context.CancelFunc
39-
uuidToDeviceMap map[string]*AllocatableDevice
40-
wg sync.WaitGroup
34+
type nvmlDeviceHealthMonitor struct {
35+
nvmllib nvml.Interface
36+
eventSet nvml.EventSet
37+
unhealthy chan *AllocatableDevice
38+
cancelContext context.CancelFunc
39+
uuidToDeviceMap map[string]*AllocatableDevice
40+
getDeviceByParentGiCiMap map[string]map[uint32]map[uint32]*AllocatableDevice
41+
wg sync.WaitGroup
4142
}
4243

43-
func newDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*deviceHealthMonitor, error) {
44+
func newNvmlDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error) {
4445
if nvdevlib.nvmllib == nil {
4546
return nil, fmt.Errorf("nvml library is nil")
4647
}
4748

4849
ctx, cancel := context.WithCancel(ctx)
4950

50-
m := &deviceHealthMonitor{
51+
m := &nvmlDeviceHealthMonitor{
5152
nvmllib: nvdevlib.nvmllib,
5253
unhealthy: make(chan *AllocatableDevice, len(allocatable)),
5354
cancelContext: cancel,
@@ -69,8 +70,10 @@ func newDeviceHealthMonitor(ctx context.Context, config *Config, allocatable All
6970

7071
m.uuidToDeviceMap = getUUIDToDeviceMap(allocatable)
7172

73+
m.getDeviceByParentGiCiMap = getDeviceByParentGiCiMap(allocatable)
74+
7275
klog.V(6).Info("registering NVML events for device health monitor")
73-
m.registerDevicesForEvents()
76+
m.registerEventsForDevices()
7477

7578
skippedXids := m.xidsToSkip(config.flags.additionalXidsToIgnore)
7679
klog.V(6).Info("started device health monitoring")
@@ -80,7 +83,7 @@ func newDeviceHealthMonitor(ctx context.Context, config *Config, allocatable All
8083
return m, nil
8184
}
8285

83-
func (m *deviceHealthMonitor) registerDevicesForEvents() {
86+
func (m *nvmlDeviceHealthMonitor) registerEventsForDevices() {
8487
eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError)
8588

8689
processedUUIDs := make(map[string]bool)
@@ -122,7 +125,7 @@ func (m *deviceHealthMonitor) registerDevicesForEvents() {
122125
}
123126
}
124127

125-
func (m *deviceHealthMonitor) Stop() {
128+
func (m *nvmlDeviceHealthMonitor) Stop() {
126129
if m == nil {
127130
return
128131
}
@@ -153,7 +156,7 @@ func getUUIDToDeviceMap(allocatable AllocatableDevices) map[string]*AllocatableD
153156
return uuidToDeviceMap
154157
}
155158

156-
func (m *deviceHealthMonitor) run(ctx context.Context, skippedXids map[uint64]bool) {
159+
func (m *nvmlDeviceHealthMonitor) run(ctx context.Context, skippedXids map[uint64]bool) {
157160
defer m.wg.Done()
158161
for {
159162
select {
@@ -194,10 +197,12 @@ func (m *deviceHealthMonitor) run(ctx context.Context, skippedXids map[uint64]bo
194197
}
195198

196199
var affectedDevice *AllocatableDevice
197-
if event.GpuInstanceId != FullGPUInstanceID && event.ComputeInstanceId != FullGPUInstanceID {
198-
affectedDevice = m.findMigDevice(eventUUID, event.GpuInstanceId, event.ComputeInstanceId)
199-
} else {
200-
affectedDevice = m.findGpuDevice(eventUUID)
200+
pMap, ok1 := m.getDeviceByParentGiCiMap[eventUUID]
201+
if ok1 {
202+
giMap, ok2 := pMap[event.GpuInstanceId]
203+
if ok2 {
204+
affectedDevice, _ = giMap[event.ComputeInstanceId]
205+
}
201206
}
202207

203208
if affectedDevice == nil {
@@ -211,31 +216,46 @@ func (m *deviceHealthMonitor) run(ctx context.Context, skippedXids map[uint64]bo
211216
}
212217
}
213218

214-
func (m *deviceHealthMonitor) Unhealthy() <-chan *AllocatableDevice {
219+
func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *AllocatableDevice {
215220
return m.unhealthy
216221
}
217222

218-
func (m *deviceHealthMonitor) findMigDevice(parentUUID string, giID uint32, ciID uint32) *AllocatableDevice {
219-
for _, device := range m.uuidToDeviceMap {
220-
if device.Type() != MigDeviceType {
223+
func getDeviceByParentGiCiMap(allocatable AllocatableDevices) map[string]map[uint32]map[uint32]*AllocatableDevice {
224+
deviceByParentGiCiMap := make(map[string]map[uint32]map[uint32]*AllocatableDevice)
225+
226+
for _, d := range allocatable {
227+
var parentUUID string
228+
var giID, ciID uint32
229+
230+
switch d.Type() {
231+
case GpuDeviceType:
232+
parentUUID = d.UUID()
233+
if parentUUID == "" {
234+
continue
235+
}
236+
giID = FullGPUInstanceID
237+
ciID = FullGPUInstanceID
238+
case MigDeviceType:
239+
parentUUID = d.Mig.parent.UUID
240+
if parentUUID == "" {
241+
continue
242+
}
243+
giID = d.Mig.giInfo.Id
244+
ciID = d.Mig.ciInfo.Id
245+
default:
246+
klog.Errorf("Skipping device with unknown type: %s", d.UUID())
221247
continue
222248
}
223249

224-
if device.Mig.parent.UUID == parentUUID &&
225-
device.Mig.giInfo.Id == giID &&
226-
device.Mig.ciInfo.Id == ciID {
227-
return device
250+
if _, ok := deviceByParentGiCiMap[parentUUID]; !ok {
251+
deviceByParentGiCiMap[parentUUID] = make(map[uint32]map[uint32]*AllocatableDevice)
228252
}
253+
if _, ok := deviceByParentGiCiMap[parentUUID][giID]; !ok {
254+
deviceByParentGiCiMap[parentUUID][giID] = make(map[uint32]*AllocatableDevice)
255+
}
256+
deviceByParentGiCiMap[parentUUID][giID][ciID] = d
229257
}
230-
return nil
231-
}
232-
233-
func (m *deviceHealthMonitor) findGpuDevice(uuid string) *AllocatableDevice {
234-
device, exists := m.uuidToDeviceMap[uuid]
235-
if exists && device.Type() == GpuDeviceType {
236-
return device
237-
}
238-
return nil
258+
return deviceByParentGiCiMap
239259
}
240260

241261
// getAdditionalXids returns a list of additional Xids to skip from the specified string.
@@ -264,7 +284,8 @@ func getAdditionalXids(input string) []uint64 {
264284
return additionalXids
265285
}
266286

267-
func (m *deviceHealthMonitor) xidsToSkip(additionalXids string) map[uint64]bool {
287+
// Refer to https://docs.nvidia.com/deploy/xid-errors/analyzing-xid-catalog.html for information on Xids.
288+
func (m *nvmlDeviceHealthMonitor) xidsToSkip(additionalXids string) map[uint64]bool {
268289
ignoredXids := []uint64{
269290
13, // Graphics Engine Exception
270291
31, // GPU memory page fault

cmd/gpu-kubelet-plugin/device_state.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -550,12 +550,12 @@ func GetOpaqueDeviceConfigs(
550550
return resultConfigs, nil
551551
}
552552

553-
func (s *DeviceState) UpdateDeviceHealthStatus(device *AllocatableDevice, healthstatus string) {
553+
func (s *DeviceState) UpdateDeviceHealthStatus(device *AllocatableDevice, hs HealthStatus) {
554554
s.Lock()
555555
defer s.Unlock()
556556

557-
device.Health = healthstatus
558-
klog.Infof("Update device sattus:%s healthstatus", device.UUID())
557+
device.Health = hs
558+
klog.Infof("Updated device: %s health status to %s", device.UUID(), hs)
559559
}
560560

561561
// TODO: Dynamic MIG is not yet supported with structured parameters.

cmd/gpu-kubelet-plugin/driver.go

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,13 @@ import (
4141
const DriverPrepUprepFlockFileName = "pu.lock"
4242

4343
type driver struct {
44-
client coreclientset.Interface
45-
pluginhelper *kubeletplugin.Helper
46-
state *DeviceState
47-
pulock *flock.Flock
48-
healthcheck *healthcheck
49-
deviceHealthMonitor *deviceHealthMonitor
50-
wg sync.WaitGroup
44+
client coreclientset.Interface
45+
pluginhelper *kubeletplugin.Helper
46+
state *DeviceState
47+
pulock *flock.Flock
48+
healthcheck *healthcheck
49+
nvmlDeviceHealthMonitor *nvmlDeviceHealthMonitor
50+
wg sync.WaitGroup
5151
}
5252

5353
func NewDriver(ctx context.Context, config *Config) (*driver, error) {
@@ -98,12 +98,12 @@ func NewDriver(ctx context.Context, config *Config) (*driver, error) {
9898
driver.healthcheck = healthcheck
9999

100100
if featuregates.Enabled(featuregates.DeviceHealthCheck) {
101-
deviceHealthMonitor, err := newDeviceHealthMonitor(ctx, config, state.allocatable, state.nvdevlib)
101+
nvmlDeviceHealthMonitor, err := newNvmlDeviceHealthMonitor(ctx, config, state.allocatable, state.nvdevlib)
102102
if err != nil {
103-
return nil, fmt.Errorf("start deviceHealthMonitor: %w", err)
103+
return nil, fmt.Errorf("start nvmlDeviceHealthMonitor: %w", err)
104104
}
105105

106-
driver.deviceHealthMonitor = deviceHealthMonitor
106+
driver.nvmlDeviceHealthMonitor = nvmlDeviceHealthMonitor
107107

108108
driver.wg.Add(1)
109109
go func() {
@@ -128,8 +128,8 @@ func (d *driver) Shutdown() error {
128128
d.healthcheck.Stop()
129129
}
130130

131-
if d.deviceHealthMonitor != nil {
132-
d.deviceHealthMonitor.Stop()
131+
if d.nvmlDeviceHealthMonitor != nil {
132+
d.nvmlDeviceHealthMonitor.Stop()
133133
}
134134

135135
d.wg.Wait()
@@ -203,13 +203,13 @@ func (d *driver) nodeUnprepareResource(ctx context.Context, claimNs kubeletplugi
203203
}
204204

205205
func (d *driver) deviceHealthEvents(ctx context.Context, nodeName string) {
206-
klog.Info("Processing device health notifications")
206+
klog.Info("Starting to watch for device health notifications")
207207
for {
208208
select {
209209
case <-ctx.Done():
210210
klog.V(6).Info("Stop processing device health notifications")
211211
return
212-
case device, ok := <-d.deviceHealthMonitor.Unhealthy():
212+
case device, ok := <-d.nvmlDeviceHealthMonitor.Unhealthy():
213213
if !ok {
214214
klog.V(6).Info("Health monitor channel closed")
215215
return
@@ -219,7 +219,7 @@ func (d *driver) deviceHealthEvents(ctx context.Context, nodeName string) {
219219
klog.Warningf("Received unhealthy notification for device: %s", uuid)
220220

221221
if !device.IsHealthy() {
222-
klog.V(6).Infof("Device: %s is aleady marked unhealthy. Skip republishing resourceslice", uuid)
222+
klog.V(6).Infof("Device: %s is aleady marked unhealthy. Skip republishing ResourceSlice", uuid)
223223
continue
224224
}
225225

@@ -234,15 +234,15 @@ func (d *driver) deviceHealthEvents(ctx context.Context, nodeName string) {
234234

235235
// Republish resource slice with only healthy devices
236236
// There is no remediation loop right now meaning if the unhealthy device is fixed,
237-
// driver needs to be restarted to publish the resourceslice with all devices
237+
// driver needs to be restarted to publish the ResourceSlice with all devices
238238
var resourceSlice resourceslice.Slice
239239
for _, dev := range d.state.allocatable {
240240
uuid := dev.UUID()
241241
if dev.IsHealthy() {
242-
klog.V(6).Infof("Device: %s is healthy, added to resoureslice", uuid)
242+
klog.V(6).Infof("Device: %s is healthy, added to ResoureSlice", uuid)
243243
resourceSlice.Devices = append(resourceSlice.Devices, dev.GetDevice())
244244
} else {
245-
klog.Warningf("Device: %s is unhealthy, will be removed from resoureslice", uuid)
245+
klog.Warningf("Device: %s is unhealthy, will be removed from ResoureSlice", uuid)
246246
}
247247
}
248248

0 commit comments

Comments
 (0)