11/*
2- * Copyright (c) 2022 , NVIDIA CORPORATION. All rights reserved.
2+ * Copyright (c) 2025 , NVIDIA CORPORATION. All rights reserved.
33 *
44 * Licensed under the Apache License, Version 2.0 (the "License");
55 * you may not use this file except in compliance with the License.
@@ -31,23 +31,24 @@ const (
3131 FullGPUInstanceID uint32 = 0xFFFFFFFF
3232)
3333
34- type deviceHealthMonitor struct {
35- nvmllib nvml.Interface
36- eventSet nvml.EventSet
37- unhealthy chan * AllocatableDevice
38- cancelContext context.CancelFunc
39- uuidToDeviceMap map [string ]* AllocatableDevice
40- wg sync.WaitGroup
34+ type nvmlDeviceHealthMonitor struct {
35+ nvmllib nvml.Interface
36+ eventSet nvml.EventSet
37+ unhealthy chan * AllocatableDevice
38+ cancelContext context.CancelFunc
39+ uuidToDeviceMap map [string ]* AllocatableDevice
40+ getDeviceByParentGiCiMap map [string ]map [uint32 ]map [uint32 ]* AllocatableDevice
41+ wg sync.WaitGroup
4142}
4243
43- func newDeviceHealthMonitor (ctx context.Context , config * Config , allocatable AllocatableDevices , nvdevlib * deviceLib ) (* deviceHealthMonitor , error ) {
44+ func newNvmlDeviceHealthMonitor (ctx context.Context , config * Config , allocatable AllocatableDevices , nvdevlib * deviceLib ) (* nvmlDeviceHealthMonitor , error ) {
4445 if nvdevlib .nvmllib == nil {
4546 return nil , fmt .Errorf ("nvml library is nil" )
4647 }
4748
4849 ctx , cancel := context .WithCancel (ctx )
4950
50- m := & deviceHealthMonitor {
51+ m := & nvmlDeviceHealthMonitor {
5152 nvmllib : nvdevlib .nvmllib ,
5253 unhealthy : make (chan * AllocatableDevice , len (allocatable )),
5354 cancelContext : cancel ,
@@ -69,8 +70,10 @@ func newDeviceHealthMonitor(ctx context.Context, config *Config, allocatable All
6970
7071 m .uuidToDeviceMap = getUUIDToDeviceMap (allocatable )
7172
73+ m .getDeviceByParentGiCiMap = getDeviceByParentGiCiMap (allocatable )
74+
7275 klog .V (6 ).Info ("registering NVML events for device health monitor" )
73- m .registerDevicesForEvents ()
76+ m .registerEventsForDevices ()
7477
7578 skippedXids := m .xidsToSkip (config .flags .additionalXidsToIgnore )
7679 klog .V (6 ).Info ("started device health monitoring" )
@@ -80,7 +83,7 @@ func newDeviceHealthMonitor(ctx context.Context, config *Config, allocatable All
8083 return m , nil
8184}
8285
83- func (m * deviceHealthMonitor ) registerDevicesForEvents () {
86+ func (m * nvmlDeviceHealthMonitor ) registerEventsForDevices () {
8487 eventMask := uint64 (nvml .EventTypeXidCriticalError | nvml .EventTypeDoubleBitEccError | nvml .EventTypeSingleBitEccError )
8588
8689 processedUUIDs := make (map [string ]bool )
@@ -122,7 +125,7 @@ func (m *deviceHealthMonitor) registerDevicesForEvents() {
122125 }
123126}
124127
125- func (m * deviceHealthMonitor ) Stop () {
128+ func (m * nvmlDeviceHealthMonitor ) Stop () {
126129 if m == nil {
127130 return
128131 }
@@ -153,7 +156,7 @@ func getUUIDToDeviceMap(allocatable AllocatableDevices) map[string]*AllocatableD
153156 return uuidToDeviceMap
154157}
155158
156- func (m * deviceHealthMonitor ) run (ctx context.Context , skippedXids map [uint64 ]bool ) {
159+ func (m * nvmlDeviceHealthMonitor ) run (ctx context.Context , skippedXids map [uint64 ]bool ) {
157160 defer m .wg .Done ()
158161 for {
159162 select {
@@ -194,10 +197,12 @@ func (m *deviceHealthMonitor) run(ctx context.Context, skippedXids map[uint64]bo
194197 }
195198
196199 var affectedDevice * AllocatableDevice
197- if event .GpuInstanceId != FullGPUInstanceID && event .ComputeInstanceId != FullGPUInstanceID {
198- affectedDevice = m .findMigDevice (eventUUID , event .GpuInstanceId , event .ComputeInstanceId )
199- } else {
200- affectedDevice = m .findGpuDevice (eventUUID )
200+ pMap , ok1 := m .getDeviceByParentGiCiMap [eventUUID ]
201+ if ok1 {
202+ giMap , ok2 := pMap [event .GpuInstanceId ]
203+ if ok2 {
204+ affectedDevice , _ = giMap [event .ComputeInstanceId ]
205+ }
201206 }
202207
203208 if affectedDevice == nil {
@@ -211,31 +216,46 @@ func (m *deviceHealthMonitor) run(ctx context.Context, skippedXids map[uint64]bo
211216 }
212217}
213218
214- func (m * deviceHealthMonitor ) Unhealthy () <- chan * AllocatableDevice {
219+ func (m * nvmlDeviceHealthMonitor ) Unhealthy () <- chan * AllocatableDevice {
215220 return m .unhealthy
216221}
217222
218- func (m * deviceHealthMonitor ) findMigDevice (parentUUID string , giID uint32 , ciID uint32 ) * AllocatableDevice {
219- for _ , device := range m .uuidToDeviceMap {
220- if device .Type () != MigDeviceType {
223+ func getDeviceByParentGiCiMap (allocatable AllocatableDevices ) map [string ]map [uint32 ]map [uint32 ]* AllocatableDevice {
224+ deviceByParentGiCiMap := make (map [string ]map [uint32 ]map [uint32 ]* AllocatableDevice )
225+
226+ for _ , d := range allocatable {
227+ var parentUUID string
228+ var giID , ciID uint32
229+
230+ switch d .Type () {
231+ case GpuDeviceType :
232+ parentUUID = d .UUID ()
233+ if parentUUID == "" {
234+ continue
235+ }
236+ giID = FullGPUInstanceID
237+ ciID = FullGPUInstanceID
238+ case MigDeviceType :
239+ parentUUID = d .Mig .parent .UUID
240+ if parentUUID == "" {
241+ continue
242+ }
243+ giID = d .Mig .giInfo .Id
244+ ciID = d .Mig .ciInfo .Id
245+ default :
246+ klog .Errorf ("Skipping device with unknown type: %s" , d .UUID ())
221247 continue
222248 }
223249
224- if device .Mig .parent .UUID == parentUUID &&
225- device .Mig .giInfo .Id == giID &&
226- device .Mig .ciInfo .Id == ciID {
227- return device
250+ if _ , ok := deviceByParentGiCiMap [parentUUID ]; ! ok {
251+ deviceByParentGiCiMap [parentUUID ] = make (map [uint32 ]map [uint32 ]* AllocatableDevice )
228252 }
253+ if _ , ok := deviceByParentGiCiMap [parentUUID ][giID ]; ! ok {
254+ deviceByParentGiCiMap [parentUUID ][giID ] = make (map [uint32 ]* AllocatableDevice )
255+ }
256+ deviceByParentGiCiMap [parentUUID ][giID ][ciID ] = d
229257 }
230- return nil
231- }
232-
233- func (m * deviceHealthMonitor ) findGpuDevice (uuid string ) * AllocatableDevice {
234- device , exists := m .uuidToDeviceMap [uuid ]
235- if exists && device .Type () == GpuDeviceType {
236- return device
237- }
238- return nil
258+ return deviceByParentGiCiMap
239259}
240260
241261// getAdditionalXids returns a list of additional Xids to skip from the specified string.
@@ -264,7 +284,8 @@ func getAdditionalXids(input string) []uint64 {
264284 return additionalXids
265285}
266286
267- func (m * deviceHealthMonitor ) xidsToSkip (additionalXids string ) map [uint64 ]bool {
287+ // Refer to https://docs.nvidia.com/deploy/xid-errors/analyzing-xid-catalog.html for information on Xids.
288+ func (m * nvmlDeviceHealthMonitor ) xidsToSkip (additionalXids string ) map [uint64 ]bool {
268289 ignoredXids := []uint64 {
269290 13 , // Graphics Engine Exception
270291 31 , // GPU memory page fault
0 commit comments