@@ -33,10 +33,14 @@ const (
3333 // this is in addition to the Application errors that are already ignored.
3434 envDisableHealthChecks = "DP_DISABLE_HEALTHCHECKS"
3535 allHealthChecks = "xids"
36+
37+ nvmlEventTypeGpuRecoveryAction = 0x0000000000008000 // from https://docs.nvidia.com/deploy/nvml-api/group__nvmlEventType.html?
38+
39+ nvmlEventTypeGpuUnavailableError = 0x0000000000004000
3640)
3741
3842// checkHealth performs health checks on a set of devices, writing to the 'unhealthy' channel an event for each unhealthy device (and a healthy event when a GPU recovery action is reported).
39- func (r * nvmlResourceManager ) checkHealth (stop <- chan interface {}, devices Devices , unhealthy chan <- * Device ) error {
43+ func (r * nvmlResourceManager ) checkHealth (stop <- chan interface {}, devices Devices , unhealthy chan <- * DeviceEvent ) error {
4044 disableHealthChecks := strings .ToLower (os .Getenv (envDisableHealthChecks ))
4145 if disableHealthChecks == "all" {
4246 disableHealthChecks = allHealthChecks
@@ -92,12 +96,15 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
9296 deviceIDToGiMap := make (map [string ]uint32 )
9397 deviceIDToCiMap := make (map [string ]uint32 )
9498
95- eventMask := uint64 (nvml .EventTypeXidCriticalError | nvml .EventTypeDoubleBitEccError | nvml .EventTypeSingleBitEccError )
99+ eventMask := uint64 (nvml .EventTypeXidCriticalError | nvml .EventTypeDoubleBitEccError | nvml .EventTypeSingleBitEccError | nvmlEventTypeGpuUnavailableError | nvmlEventTypeGpuRecoveryAction )
96100 for _ , d := range devices {
97101 uuid , gi , ci , err := r .getDevicePlacement (d )
98102 if err != nil {
99103 klog .Warningf ("Could not determine device placement for %v: %v; Marking it unhealthy." , d .ID , err )
100- unhealthy <- d
104+ unhealthy <- & DeviceEvent {
105+ Device : d ,
106+ Event : DeviceUnHalthy ,
107+ }
101108 continue
102109 }
103110 deviceIDToGiMap [d .ID ] = gi
@@ -107,14 +114,20 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
107114 gpu , ret := r .nvml .DeviceGetHandleByUUID (uuid )
108115 if ret != nvml .SUCCESS {
109116 klog .Infof ("unable to get device handle from UUID: %v; marking it as unhealthy" , ret )
110- unhealthy <- d
117+ unhealthy <- & DeviceEvent {
118+ Device : d ,
119+ Event : DeviceUnHalthy ,
120+ }
111121 continue
112122 }
113123
114124 supportedEvents , ret := gpu .GetSupportedEventTypes ()
115125 if ret != nvml .SUCCESS {
116126 klog .Infof ("unable to determine the supported events for %v: %v; marking it as unhealthy" , d .ID , ret )
117- unhealthy <- d
127+ unhealthy <- & DeviceEvent {
128+ Device : d ,
129+ Event : DeviceUnHalthy ,
130+ }
118131 continue
119132 }
120133
@@ -124,7 +137,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
124137 }
125138 if ret != nvml .SUCCESS {
126139 klog .Infof ("Marking device %v as unhealthy: %v" , d .ID , ret )
127- unhealthy <- d
140+ unhealthy <- & DeviceEvent {
141+ Device : d ,
142+ Event : DeviceUnHalthy ,
143+ }
128144 }
129145 }
130146
@@ -142,7 +158,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
142158 if ret != nvml .SUCCESS {
143159 klog .Infof ("Error waiting for event: %v; Marking all devices as unhealthy" , ret )
144160 for _ , d := range devices {
145- unhealthy <- d
161+ unhealthy <- & DeviceEvent {
162+ Device : d ,
163+ Event : DeviceUnHalthy ,
164+ }
146165 }
147166 continue
148167 }
@@ -163,7 +182,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
163182 // If we cannot reliably determine the device UUID, we mark all devices as unhealthy.
164183 klog .Infof ("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy." , e , ret )
165184 for _ , d := range devices {
166- unhealthy <- d
185+ unhealthy <- & DeviceEvent {
186+ Device : d ,
187+ Event : DeviceUnHalthy ,
188+ }
167189 }
168190 continue
169191 }
@@ -173,6 +195,15 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
173195 klog .Infof ("Ignoring event for unexpected device: %v" , eventUUID )
174196 continue
175197 }
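198+ // nvmlEventTypeGpuRecoveryAction is a special case, where we mark the device as healthy. NOTE(review): no continue follows this block, so the same event also reaches the unhealthy handling below; confirm this is intended.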
199+ if e .EventType == nvmlEventTypeGpuRecoveryAction {
200+ klog .Infof ("Gpu recovery event: %+v" , e )
201+ unhealthy <- & DeviceEvent {
202+ Device : d ,
203+ Event : DeviceHealthy ,
204+ }
205+
206+ }
176207
177208 if d .IsMigDevice () && e .GpuInstanceId != 0xFFFFFFFF && e .ComputeInstanceId != 0xFFFFFFFF {
178209 gi := deviceIDToGiMap [d .ID ]
@@ -184,7 +215,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
184215 }
185216
186217 klog .Infof ("XidCriticalError: Xid=%d on Device=%s; marking device as unhealthy." , e .EventData , d .ID )
187- unhealthy <- d
218+ unhealthy <- & DeviceEvent {
219+ Device : d ,
220+ Event : DeviceUnHalthy ,
221+ }
188222 }
189223}
190224
0 commit comments