Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions cmd/gpu-kubelet-plugin/allocatable.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,22 @@ import (
resourceapi "k8s.io/api/resource/v1"
)

type HealthStatus string

const (
// Healthy means that the device is healthy.
Healthy HealthStatus = "Healthy"
// Unhealthy means that the device is unhealthy.
Unhealthy HealthStatus = "Unhealthy"
)

type AllocatableDevices map[string]*AllocatableDevice

type AllocatableDevice struct {
Gpu *GpuInfo
Mig *MigDeviceInfo
// Defined similarly as https://pkg.go.dev/k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1#Healthy
Health HealthStatus
}

func (d AllocatableDevice) Type() string {
Expand Down Expand Up @@ -96,3 +107,7 @@ func (d AllocatableDevices) UUIDs() []string {
slices.Sort(uuids)
return uuids
}

func (d *AllocatableDevice) IsHealthy() bool {
return d.Health == Healthy
}
316 changes: 316 additions & 0 deletions cmd/gpu-kubelet-plugin/device_health.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,316 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package main

import (
"context"
"fmt"
"strconv"
"strings"
"sync"

"github.com/NVIDIA/go-nvml/pkg/nvml"
"k8s.io/klog/v2"
)

const (
FullGPUInstanceID uint32 = 0xFFFFFFFF
)

type nvmlDeviceHealthMonitor struct {
nvmllib nvml.Interface
eventSet nvml.EventSet
unhealthy chan *AllocatableDevice
cancelContext context.CancelFunc
uuidToDeviceMap map[string]*AllocatableDevice
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need two maps? Are the entries in the more complete map below not a subset of this?

getDeviceByParentGiCiMap map[string]map[uint32]map[uint32]*AllocatableDevice
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's rename this member since it's not a function. I would even go so far as to add a type:

type placementToAllocatableDeviceMap map[string]map[uint32]map[uint32]*AllocatableDevice

that we can attach functions to (get(string,uint32,uint32), update(string,uint32,uint32,*AllocatableDevice)) to simplify implementations bellow.

This would mean that we update the member definition to something like:

Suggested change
getDeviceByParentGiCiMap map[string]map[uint32]map[uint32]*AllocatableDevice
deviceByParentGiCiMap placementToAllocatableDeviceMap

wg sync.WaitGroup
}

func newNvmlDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Related to my other comment(s), what about introducing a top-level factory method where we can add additional constrution logic. Something like:

Suggested change
func newNvmlDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error) {
func NewDeviceHealthMonitor(config *Config, allocatable AllocatableDevices, nvdevlib *devicelib) (DeviceHealthMonitor, error) {
return newNvmlDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error)
}

This may not look too important, but we could even add logic to instantiate a mock monitor based on an envvar (or move the feature flag logic from driver.go here and return a NULL (no-op) handler in the case where the feature is not enabled. This has the advantage of simplifying the callsite.

if nvdevlib.nvmllib == nil {
return nil, fmt.Errorf("nvml library is nil")
}

ctx, cancel := context.WithCancel(ctx)

m := &nvmlDeviceHealthMonitor{
nvmllib: nvdevlib.nvmllib,
unhealthy: make(chan *AllocatableDevice, len(allocatable)),
cancelContext: cancel,
}

if ret := m.nvmllib.Init(); ret != nvml.SUCCESS {
cancel()
return nil, fmt.Errorf("failed to initialize NVML: %v", ret)
}
Comment on lines +57 to +60
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we have a deferred nvmllib.Shutdown() here instead of handling each return separately?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

defer wont work here, as nvmllib.Shutdown() should not be called untill healthMonitor is closed.


klog.V(6).Info("creating NVML events for device health monitor")
eventSet, ret := m.nvmllib.EventSetCreate()
if ret != nvml.SUCCESS {
_ = m.nvmllib.Shutdown()
cancel()
return nil, fmt.Errorf("failed to create event set: %w", ret)
}
m.eventSet = eventSet

m.uuidToDeviceMap = getUUIDToDeviceMap(allocatable)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: Does allocatable change at all, or is it constant for the lifetime of the plugin?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we do update the health status of an allocatable when we get a unhealthy notification.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, that wasn't clear. Does the content of allocatable change in any way that would invalidate the map that we construct here meaning that it would need to be reconstructed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, the only changing content of an allocatable device is its health status and that wont impact it. This map ([uuid] = device) is constructed in the very beginning when the plugin is started.

and we iterate on it to register event on each device (currently, we dont check any status here. we may do it in future on remediation) and send unhealthy notification.


m.getDeviceByParentGiCiMap = getDeviceByParentGiCiMap(allocatable)

klog.V(6).Info("registering NVML events for device health monitor")
m.registerEventsForDevices()

skippedXids := m.xidsToSkip(config.flags.additionalXidsToIgnore)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the device plugin implementation setting skippedIDs to all disables health checking. Why is this logic different here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

health checking is feature-gated here, so it can be skipped without this flag. I should add a comment to make it explicit

klog.V(6).Info("started device health monitoring")
m.wg.Add(1)
go m.run(ctx, skippedXids)

return m, nil
}
Comment on lines +44 to +84
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As mentioned at the call site, I think it simplifies the implmentation if we split the construction of a monitor from actually starting it. What about updating this to:

Suggested change
func newNvmlDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error) {
if nvdevlib.nvmllib == nil {
return nil, fmt.Errorf("nvml library is nil")
}
ctx, cancel := context.WithCancel(ctx)
m := &nvmlDeviceHealthMonitor{
nvmllib: nvdevlib.nvmllib,
unhealthy: make(chan *AllocatableDevice, len(allocatable)),
cancelContext: cancel,
}
if ret := m.nvmllib.Init(); ret != nvml.SUCCESS {
cancel()
return nil, fmt.Errorf("failed to initialize NVML: %v", ret)
}
klog.V(6).Info("creating NVML events for device health monitor")
eventSet, ret := m.nvmllib.EventSetCreate()
if ret != nvml.SUCCESS {
_ = m.nvmllib.Shutdown()
cancel()
return nil, fmt.Errorf("failed to create event set: %w", ret)
}
m.eventSet = eventSet
m.uuidToDeviceMap = getUUIDToDeviceMap(allocatable)
m.getDeviceByParentGiCiMap = getDeviceByParentGiCiMap(allocatable)
klog.V(6).Info("registering NVML events for device health monitor")
m.registerEventsForDevices()
skippedXids := m.xidsToSkip(config.flags.additionalXidsToIgnore)
klog.V(6).Info("started device health monitoring")
m.wg.Add(1)
go m.run(ctx, skippedXids)
return m, nil
}
func newNvmlDeviceHealthMonitor(config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error) {
if nvdevlib.nvmllib == nil {
return nil, fmt.Errorf("nvml library is nil")
}
if ret := nvdevlib.nvmllib.Init(); ret != nvml.SUCCESS {
return nil, fmt.Errorf("failed to initialize NVML: %v", ret)
}
defer func() {
_ = nvdevlib.nvmllib.Shutdown()
}()
m := &nvmlDeviceHealthMonitor{
nvmllib: nvdevlib.nvmllib,
unhealthy: make(chan *AllocatableDevice, len(allocatable)),
uuidToDeviceMap: getUUIDToDeviceMap(allocatable),
getDeviceByParentGiCiMap: getDeviceByParentGiCiMap(allocatable),
skippedXids: xidsToSkip(config.flags.additionalXidsToIgnore),
}
return m, nil
}
func (m *nvmlDeviceHealthMonitor) Start(ctx context.Context) (rerr error) {
if ret := m.nvmllib.Init(); ret != nvml.SUCCESS {
return fmt.Errorf("failed to initialize NVML: %v", ret)
}
// We shutdown nvml if this function returns with an error.
defer func() {
if rerr != nil {
_ = m.nvmllib.Shutdown()
}
}()
klog.V(6).Info("creating NVML events for device health monitor")
eventSet, ret := m.nvmllib.EventSetCreate()
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to create event set: %w", ret)
}
ctx, cancel := context.WithCancel(ctx)
m.cancelContext = cancel
m.eventSet = eventSet
klog.V(6).Info("registering NVML events for device health monitor")
m.registerEventsForDevices()
klog.V(6).Info("started device health monitoring")
m.wg.Add(1)
go m.run(ctx, m.skippedXids)
return nil
}

Note that we now cleanly separate nvml errors that occur during setup and those that we have to handle while waiting for events.


func (m *nvmlDeviceHealthMonitor) registerEventsForDevices() {
eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError)

processedUUIDs := make(map[string]bool)

for uuid, dev := range m.uuidToDeviceMap {
var u string
if dev.Type() == MigDeviceType {
u = dev.Mig.parent.UUID
} else {
u = uuid
}

if processedUUIDs[u] {
continue
}
gpu, ret := m.nvmllib.DeviceGetHandleByUUID(u)
if ret != nvml.SUCCESS {
klog.Infof("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy", u, ret)
m.unhealthy <- dev
continue
}

supportedEvents, ret := gpu.GetSupportedEventTypes()
if ret != nvml.SUCCESS {
klog.Infof("unable to determine the supported events for %s: %v; marking it as unhealthy", u, ret)
m.unhealthy <- dev
continue
}

ret = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet)
if ret == nvml.ERROR_NOT_SUPPORTED {
klog.Warningf("Device %v is too old to support healthchecking.", u)
}
if ret != nvml.SUCCESS {
klog.Infof("unable to register events for %s: %v; marking it as unhealthy", u, ret)
m.unhealthy <- dev
}
processedUUIDs[u] = true
}
}

func (m *nvmlDeviceHealthMonitor) Stop() {
if m == nil {
return
}
klog.V(6).Info("stopping health monitor")

if m.cancelContext != nil {
m.cancelContext()
}

m.wg.Wait()

_ = m.eventSet.Free()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know this is probably like this in the device plugin too, but should we at least log an error here?


if ret := m.nvmllib.Shutdown(); ret != nvml.SUCCESS {
klog.Warningf("failed to shutdown NVML: %v", ret)
}
close(m.unhealthy)
}

func getUUIDToDeviceMap(allocatable AllocatableDevices) map[string]*AllocatableDevice {
uuidToDeviceMap := make(map[string]*AllocatableDevice)

for _, d := range allocatable {
if u := d.UUID(); u != "" {
uuidToDeviceMap[u] = d
}
}
return uuidToDeviceMap
}

func (m *nvmlDeviceHealthMonitor) run(ctx context.Context, skippedXids map[uint64]bool) {
defer m.wg.Done()
for {
select {
case <-ctx.Done():
klog.V(6).Info("Stopping event-driven GPU health monitor...")
return
default:
event, ret := m.eventSet.Wait(5000)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I needed to look this up to understand what it's doing.

Ref docs are here: https://docs.nvidia.com/deploy/nvml-api/group__nvmlEvents.html#group__nvmlEvents_1g9714b0ca9a34c7a7780f87fee16b205c

argument is timeout in ms.

errors to be handled:

Image

if ret == nvml.ERROR_TIMEOUT {
continue
}
if ret != nvml.SUCCESS {
klog.Infof("Error waiting for event: %v; Marking all devices as unhealthy", ret)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does not seem right.

Let us specifically handle the NVML_ERROR_GPU_IS_LOST case, and perform this 'nuke option' only then. Then we can also have a more precise log message (emitted on error level).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why only NVML_ERROR_GPU_IS_LOST If we are not just checking !NVML_SUCCESS, TIMEOUT and UNKNOWN also seems imp.

If you see the returns of all event methods, these all seem common error types. I can do a helper of errortype.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

something like https://github.com/NVIDIA/go-nvml/blob/main/pkg/nvml/return.go#L42

but then we mark the device unhealthy in each case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

while this will allow for proper error handling, this will also deviate from how its done in device-plugin. i can take it up as a follow-up to fix in both.

@jgehrcke thoughts?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pinging all the reviewers to help resolve this. Most other seems to be non-blocking.

@jgehrcke @elezar @shivamerla @ArangoGutierrez

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this will also deviate from how its done in device-plugin.

Let's get this PR into the state we want it for the DRA driver without trying to match the device plugin implementation exactly. That is to say, consider this the next iteration of the device plugin implementation. The learnings that we take from this should be applied to the device plugin and iterated on from there.

One motivation is that this is a new feature and we don't have users currently expecting a certain behaviour. This gives us a lot more flexibility to change behaviour than if we had an existing implementation in use.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is implementation detail of how we want to handle NVML return errors so it wont impact the end user anyway whether we do it similar to device-plugin or improve it here. For user, all these error means unhealthy device and wont be published as part of ResourceSlice. And IMO, ret != nvml.SUCCESS is a valid check and covers a wide range of errors as all subsequent calls are dependent on this check.

For me, these make sense when we have proper remediation in place where based on the given error, the right action can be recommended.

m.markAllDevicesUnhealthy()
continue
}

if event.EventType != nvml.EventTypeXidCriticalError {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(also for the device plugin) Can we track the follow-up action of checking why we don't check other supported types? Do whe have any indication of whether we ever see the log message below?

klog.Infof("Skipping non-nvmlEventTypeXidCriticalError event: %+v", event)
continue
}

if skippedXids[event.EventData] {
klog.Infof("Skipping event %+v", event)
continue
}

klog.Infof("Processing event %+v", event)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have an example for this log message, how it would look like in practice?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I1027 18:03:39.337167       1 device_health.go:179] Processing event {Device:{Handle:0xe151b6b2fef0} EventType:8 EventData:43 GpuInstanceId:4294967295 ComputeInstanceId:4294967295}

eventUUID, ret := event.Device.GetUUID()
if ret != nvml.SUCCESS {
klog.Infof("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", event, ret)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems bit aggressive to mark all devices as unhealthy on one invalid event. Should we log this as error and continue watch? cc @klueska

Copy link
Contributor Author

@guptaNswati guptaNswati Oct 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Collaborator

@jgehrcke jgehrcke Oct 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd also say we should log an error and otherwise proceed. Even if what you've shown here is currently done in the device plugin.

By the way, this would have been a perfect opportunity for a better code comment in the legacy code:

Image

No blame, no emotions -- but this code comment does not add information in addition to the code. The interesting bit would be if there is a specific, non-obvious reason / relevance for this style of treatment.

For example, I wonder if this code was introduced to fix a bug. I wonder if it is even ever exercised.

The way it's written and with the git blame history, it seems like it was potentially added initially (defensively) and may never have been exercised in production.

m.markAllDevicesUnhealthy()
continue
}

var affectedDevice *AllocatableDevice
pMap, ok1 := m.getDeviceByParentGiCiMap[eventUUID]
if ok1 {
giMap, ok2 := pMap[event.GpuInstanceId]
if ok2 {
affectedDevice = giMap[event.ComputeInstanceId]
}
}
Comment on lines +195 to +202
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming we define a type for this map, we could simplify this as:

Suggested change
var affectedDevice *AllocatableDevice
pMap, ok1 := m.getDeviceByParentGiCiMap[eventUUID]
if ok1 {
giMap, ok2 := pMap[event.GpuInstanceId]
if ok2 {
affectedDevice = giMap[event.ComputeInstanceId]
}
}
affectedDevice := m.getDeviceByParentGiCiMap.get(
eventUUID,
event.GpuInstanceId,
event.ComputeInstanceId,
)

alternatively getting an element from an initialized map is "safe", so the following could also work:

            affectedDevice := m.getDeviceByParentGiCiMap[eventUUID][event.GpuInstanceId][event.ComputeInstanceId]


if affectedDevice == nil {
klog.Infof("Ignoring event for unexpected device (UUID: %s, GI: %d, CI: %d)", eventUUID, event.GpuInstanceId, event.ComputeInstanceId)
continue
}

klog.Infof("Sending unhealthy notification for device %s due to event type: %v and event data: %d", affectedDevice.UUID(), event.EventType, event.EventData)
m.unhealthy <- affectedDevice
}
}
}

func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *AllocatableDevice {
return m.unhealthy
}

func (m *nvmlDeviceHealthMonitor) markAllDevicesUnhealthy() {
for _, d := range m.uuidToDeviceMap {
// non-blocking send.
select {
case m.unhealthy <- d:
default:
klog.Errorf("Unhealthy channel buffer full. Dropping unhealthy notification for device %s in markAllDevicesUnhealthy.", d.UUID())
}
}
}

func getDeviceByParentGiCiMap(allocatable AllocatableDevices) map[string]map[uint32]map[uint32]*AllocatableDevice {
deviceByParentGiCiMap := make(map[string]map[uint32]map[uint32]*AllocatableDevice)

for _, d := range allocatable {
var parentUUID string
var giID, ciID uint32

switch d.Type() {
case GpuDeviceType:
parentUUID = d.UUID()
if parentUUID == "" {
continue
}
giID = FullGPUInstanceID
ciID = FullGPUInstanceID
case MigDeviceType:
parentUUID = d.Mig.parent.UUID
if parentUUID == "" {
continue
}
giID = d.Mig.giInfo.Id
ciID = d.Mig.ciInfo.Id
Comment on lines +233 to +251
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could also move the put function below to the individual case statements:

Suggested change
for _, d := range allocatable {
var parentUUID string
var giID, ciID uint32
switch d.Type() {
case GpuDeviceType:
parentUUID = d.UUID()
if parentUUID == "" {
continue
}
giID = FullGPUInstanceID
ciID = FullGPUInstanceID
case MigDeviceType:
parentUUID = d.Mig.parent.UUID
if parentUUID == "" {
continue
}
giID = d.Mig.giInfo.Id
ciID = d.Mig.ciInfo.Id
for _, d := range allocatable {
switch d.Type() {
case GpuDeviceType:
uuid := d.UUID()
if uuid == "" {
continue
}
deviceByParentGiCiMap.put(uuid, FullGPUInstanceID, FullGPUInstanceID)
case MigDeviceType:
uuid := d.Mig.parent.UUID
if uuid == "" {
continue
}
deviceByParentGiCiMap.put(uuid, d.Mig.giInfo.Id, d.Mig.ciInfo.Id)

(we could even rename the put to something more meaningful and add the uuid == "" check there)

default:
klog.Errorf("Skipping device with unknown type: %s", d.Type())
continue
}

if _, ok := deviceByParentGiCiMap[parentUUID]; !ok {
deviceByParentGiCiMap[parentUUID] = make(map[uint32]map[uint32]*AllocatableDevice)
}
if _, ok := deviceByParentGiCiMap[parentUUID][giID]; !ok {
deviceByParentGiCiMap[parentUUID][giID] = make(map[uint32]*AllocatableDevice)
}
deviceByParentGiCiMap[parentUUID][giID][ciID] = d
Comment on lines +257 to +263
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming we define a type for this map we could factor this into:

func (p placementToAllocatableDeviceMap) put(uuid string, gi uint32, ci uint32, d *AllocatableDevice) {
	if _, ok := p[uuid]; !ok {
		p[uuid] = make(map[uint32]map[uint32]*AllocatableDevice)
	}
	if _, ok := p[uuid][gi]; !ok {
		p[uuid][gi] = make(map[uint32]*AllocatableDevice)
	}
	p[uuid][gi][ci] = d
}

and then replace the implementation here with:

Suggested change
if _, ok := deviceByParentGiCiMap[parentUUID]; !ok {
deviceByParentGiCiMap[parentUUID] = make(map[uint32]map[uint32]*AllocatableDevice)
}
if _, ok := deviceByParentGiCiMap[parentUUID][giID]; !ok {
deviceByParentGiCiMap[parentUUID][giID] = make(map[uint32]*AllocatableDevice)
}
deviceByParentGiCiMap[parentUUID][giID][ciID] = d
deviceByParentGiCiMap.put(parentUUID, giID, ciID, d)

}
return deviceByParentGiCiMap
}

// getAdditionalXids returns a list of additional Xids to skip from the specified string.
// The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values.
// Invalid values nare ignored.
func getAdditionalXids(input string) []uint64 {
if input == "" {
return nil
}

var additionalXids []uint64
klog.V(6).Infof("Creating a list of additional xids to ignore: [%s]", input)
for _, additionalXid := range strings.Split(input, ",") {
trimmed := strings.TrimSpace(additionalXid)
if trimmed == "" {
continue
}
xid, err := strconv.ParseUint(trimmed, 10, 64)
if err != nil {
klog.Infof("Ignoring malformed Xid value %v: %v", trimmed, err)
continue
}
additionalXids = append(additionalXids, xid)
}

return additionalXids
}

func (m *nvmlDeviceHealthMonitor) xidsToSkip(additionalXids string) map[uint64]bool {
// Add the list of hardcoded disabled (ignored) XIDs:
// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
// Application errors: the GPU should still be healthy.
ignoredXids := []uint64{
13, // Graphics Engine Exception
31, // GPU memory page fault
43, // GPU stopped processing
45, // Preemptive cleanup, due to previous errors
68, // Video processor exception
109, // Context Switch Timeout Error
}
Comment on lines +295 to +305
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know that this list is taken from the device plugin, but handling them explicity at such a low-level is quite difficult to customize. Does it make sense to not port this logic over by instead define these as the default value for the envvar where we expose this to the user?

(I'm happy to do this as a follow-up though).

Just as a note for completeness. The GKE device plugin doesn't use this strategy for XIDs. They only element in their list is XID48 (see https://github.com/GoogleCloudPlatform/container-engine-accelerators/blob/0509b1f9f4b9a357b44ba65e7b508ded8bd5ecf0/pkg/gpu/nvidia/health_check/health_checker.go#L59).


skippedXids := make(map[uint64]bool)
for _, id := range ignoredXids {
skippedXids[id] = true
}

for _, additionalXid := range getAdditionalXids(additionalXids) {
skippedXids[additionalXid] = true
}
return skippedXids
}
14 changes: 14 additions & 0 deletions cmd/gpu-kubelet-plugin/device_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,12 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res
if !exists {
return nil, fmt.Errorf("requested device is not allocatable: %v", result.Device)
}
// only proceed with config mapping if device is healthy.
if featuregates.Enabled(featuregates.DeviceHealthCheck) {
if device.Health == Unhealthy {
return nil, fmt.Errorf("requested device is not healthy: %v", result.Device)
}
}
for _, c := range slices.Backward(configs) {
if slices.Contains(c.Requests, result.Request) {
if _, ok := c.Config.(*configapi.GpuConfig); ok && device.Type() != GpuDeviceType {
Expand Down Expand Up @@ -550,6 +556,14 @@ func GetOpaqueDeviceConfigs(
return resultConfigs, nil
}

func (s *DeviceState) UpdateDeviceHealthStatus(device *AllocatableDevice, hs HealthStatus) {
s.Lock()
defer s.Unlock()

device.Health = hs
klog.Infof("Updated device: %s health status to %s", device.UUID(), hs)
}

// TODO: Dynamic MIG is not yet supported with structured parameters.
// Refactor this to allow for the allocation of statically partitioned MIG
// devices.
Expand Down
Loading