NVIDIA · guptaNswati · Oct 17, 2025 · Oct 20, 2025 · Oct 21, 2025 · Oct 21, 2025
diff --git a/cmd/gpu-kubelet-plugin/allocatable.go b/cmd/gpu-kubelet-plugin/allocatable.go
@@ -22,11 +22,22 @@ import (
 	resourceapi "k8s.io/api/resource/v1"
 )
 
+// HealthStatus is the health state recorded for an allocatable device.
+type HealthStatus string
+
+const (
+	// Healthy marks a device that is healthy.
+	Healthy = HealthStatus("Healthy")
+	// Unhealthy marks a device that is unhealthy.
+	Unhealthy = HealthStatus("Unhealthy")
+)
+
 type AllocatableDevices map[string]*AllocatableDevice
 
 type AllocatableDevice struct {
 	Gpu *GpuInfo
 	Mig *MigDeviceInfo
+	// Defined similarly as https://pkg.go.dev/k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1#Healthy
+	Health HealthStatus
 }
 
 func (d AllocatableDevice) Type() string {
@@ -96,3 +107,7 @@ func (d AllocatableDevices) UUIDs() []string {
 	slices.Sort(uuids)
 	return uuids
 }
+
+// IsHealthy reports whether the device's recorded health status is Healthy.
+func (d *AllocatableDevice) IsHealthy() bool {
+	return d.Health == Healthy
+}
diff --git a/cmd/gpu-kubelet-plugin/device_health.go b/cmd/gpu-kubelet-plugin/device_health.go
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package main
+
+import (
+	"context"
+	"fmt"
+	"strconv"
+	"strings"
+	"sync"
+
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	"k8s.io/klog/v2"
+)
+
+const (
+	FullGPUInstanceID uint32 = 0xFFFFFFFF
+)
+
+type nvmlDeviceHealthMonitor struct {
+	nvmllib                  nvml.Interface
+	eventSet                 nvml.EventSet
+	unhealthy                chan *AllocatableDevice
+	cancelContext            context.CancelFunc
+	uuidToDeviceMap          map[string]*AllocatableDevice
+	getDeviceByParentGiCiMap map[string]map[uint32]map[uint32]*AllocatableDevice
-	getDeviceByParentGiCiMap map[string]map[uint32]map[uint32]*AllocatableDevice
+	deviceByParentGiCiMap placementToAllocatableDeviceMap
-	getDeviceByParentGiCiMap map[string]map[uint32]map[uint32]*AllocatableDevice
+	deviceByParentGiCiMap placementToAllocatableDeviceMap
+	wg                       sync.WaitGroup
+}
+
+func newNvmlDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error) {
-func newNvmlDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error) {
+func NewDeviceHealthMonitor(config *Config, allocatable AllocatableDevices, nvdevlib *devicelib) (DeviceHealthMonitor, error) {
+    return newNvmlDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error)
+}
-func newNvmlDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error) {
+func NewDeviceHealthMonitor(config *Config, allocatable AllocatableDevices, nvdevlib *devicelib) (DeviceHealthMonitor, error) {
+    return newNvmlDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error)
+}
+	if nvdevlib.nvmllib == nil {
+		return nil, fmt.Errorf("nvml library is nil")
+	}
+
+	ctx, cancel := context.WithCancel(ctx)
+
+	m := &nvmlDeviceHealthMonitor{
+		nvmllib:       nvdevlib.nvmllib,
+		unhealthy:     make(chan *AllocatableDevice, len(allocatable)),
+		cancelContext: cancel,
+	}
+
+	if ret := m.nvmllib.Init(); ret != nvml.SUCCESS {
+		cancel()
+		return nil, fmt.Errorf("failed to initialize NVML: %v", ret)
+	}
+
+	klog.V(6).Info("creating NVML events for device health monitor")
+	eventSet, ret := m.nvmllib.EventSetCreate()
+	if ret != nvml.SUCCESS {
+		_ = m.nvmllib.Shutdown()
+		cancel()
+		return nil, fmt.Errorf("failed to create event set: %w", ret)
+	}
+	m.eventSet = eventSet
+
+	m.uuidToDeviceMap = getUUIDToDeviceMap(allocatable)
+
+	m.getDeviceByParentGiCiMap = getDeviceByParentGiCiMap(allocatable)
+
+	klog.V(6).Info("registering NVML events for device health monitor")
+	m.registerEventsForDevices()
+
+	skippedXids := m.xidsToSkip(config.flags.additionalXidsToIgnore)
+	klog.V(6).Info("started device health monitoring")
+	m.wg.Add(1)
+	go m.run(ctx, skippedXids)
+
+	return m, nil
+}
-func newNvmlDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error) {
-	if nvdevlib.nvmllib == nil {
-		return nil, fmt.Errorf("nvml library is nil")
-	}
-
-	ctx, cancel := context.WithCancel(ctx)
-
-	m := &nvmlDeviceHealthMonitor{
-		nvmllib:       nvdevlib.nvmllib,
-		unhealthy:     make(chan *AllocatableDevice, len(allocatable)),
-		cancelContext: cancel,
-	}
-
-	if ret := m.nvmllib.Init(); ret != nvml.SUCCESS {
-		cancel()
-		return nil, fmt.Errorf("failed to initialize NVML: %v", ret)
-	}
-
-	klog.V(6).Info("creating NVML events for device health monitor")
-	eventSet, ret := m.nvmllib.EventSetCreate()
-	if ret != nvml.SUCCESS {
-		_ = m.nvmllib.Shutdown()
-		cancel()
-		return nil, fmt.Errorf("failed to create event set: %w", ret)
-	}
-	m.eventSet = eventSet
-
-	m.uuidToDeviceMap = getUUIDToDeviceMap(allocatable)
-
-	m.getDeviceByParentGiCiMap = getDeviceByParentGiCiMap(allocatable)
-
-	klog.V(6).Info("registering NVML events for device health monitor")
-	m.registerEventsForDevices()
-
-	skippedXids := m.xidsToSkip(config.flags.additionalXidsToIgnore)
-	klog.V(6).Info("started device health monitoring")
-	m.wg.Add(1)
-	go m.run(ctx, skippedXids)
-
-	return m, nil
-}
+func newNvmlDeviceHealthMonitor(config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error) {
+	if nvdevlib.nvmllib == nil {
+		return nil, fmt.Errorf("nvml library is nil")
+	}
+
+	if ret := nvdevlib.nvmllib.Init(); ret != nvml.SUCCESS {
+		return nil, fmt.Errorf("failed to initialize NVML: %v", ret)
+	}
+	defer func() {
+		_ = nvdevlib.nvmllib.Shutdown()
+	}()
+
+	m := &nvmlDeviceHealthMonitor{
+		nvmllib:                  nvdevlib.nvmllib,
+		unhealthy:                make(chan *AllocatableDevice, len(allocatable)),
+		uuidToDeviceMap:          getUUIDToDeviceMap(allocatable),
+		getDeviceByParentGiCiMap: getDeviceByParentGiCiMap(allocatable),
+		skippedXids:              xidsToSkip(config.flags.additionalXidsToIgnore),
+	}
+
+	return m, nil
+}
+
+func (m *nvmlDeviceHealthMonitor) Start(ctx context.Context) (rerr error) {
+	if ret := m.nvmllib.Init(); ret != nvml.SUCCESS {
+		return fmt.Errorf("failed to initialize NVML: %v", ret)
+	}
+    // We shutdown nvml if this function returns with an error.
+    defer func() {
+        if rerr != nil {
+            _ = m.nvmllib.Shutdown()
+        }
+    }()
+
+	klog.V(6).Info("creating NVML events for device health monitor")
+	eventSet, ret := m.nvmllib.EventSetCreate()
+	if ret != nvml.SUCCESS {
+		return fmt.Errorf("failed to create event set: %w", ret)
+	}
+
+	ctx, cancel := context.WithCancel(ctx)
+
+	m.cancelContext = cancel
+	m.eventSet = eventSet
+
+	klog.V(6).Info("registering NVML events for device health monitor")
+	m.registerEventsForDevices()
+
+	klog.V(6).Info("started device health monitoring")
+	m.wg.Add(1)
+	go m.run(ctx, m.skippedXids)
+
+	return nil
+}
-func newNvmlDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error) {
-	if nvdevlib.nvmllib == nil {
-		return nil, fmt.Errorf("nvml library is nil")
-	}
-
-	ctx, cancel := context.WithCancel(ctx)
-
-	m := &nvmlDeviceHealthMonitor{
-		nvmllib:       nvdevlib.nvmllib,
-		unhealthy:     make(chan *AllocatableDevice, len(allocatable)),
-		cancelContext: cancel,
-	}
-
-	if ret := m.nvmllib.Init(); ret != nvml.SUCCESS {
-		cancel()
-		return nil, fmt.Errorf("failed to initialize NVML: %v", ret)
-	}
-
-	klog.V(6).Info("creating NVML events for device health monitor")
-	eventSet, ret := m.nvmllib.EventSetCreate()
-	if ret != nvml.SUCCESS {
-		_ = m.nvmllib.Shutdown()
-		cancel()
-		return nil, fmt.Errorf("failed to create event set: %w", ret)
-	}
-	m.eventSet = eventSet
-
-	m.uuidToDeviceMap = getUUIDToDeviceMap(allocatable)
-
-	m.getDeviceByParentGiCiMap = getDeviceByParentGiCiMap(allocatable)
-
-	klog.V(6).Info("registering NVML events for device health monitor")
-	m.registerEventsForDevices()
-
-	skippedXids := m.xidsToSkip(config.flags.additionalXidsToIgnore)
-	klog.V(6).Info("started device health monitoring")
-	m.wg.Add(1)
-	go m.run(ctx, skippedXids)
-
-	return m, nil
-}
+func newNvmlDeviceHealthMonitor(config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*nvmlDeviceHealthMonitor, error) {
+	if nvdevlib.nvmllib == nil {
+		return nil, fmt.Errorf("nvml library is nil")
+	}
+
+	if ret := nvdevlib.nvmllib.Init(); ret != nvml.SUCCESS {
+		return nil, fmt.Errorf("failed to initialize NVML: %v", ret)
+	}
+	defer func() {
+		_ = nvdevlib.nvmllib.Shutdown()
+	}()
+
+	m := &nvmlDeviceHealthMonitor{
+		nvmllib:                  nvdevlib.nvmllib,
+		unhealthy:                make(chan *AllocatableDevice, len(allocatable)),
+		uuidToDeviceMap:          getUUIDToDeviceMap(allocatable),
+		getDeviceByParentGiCiMap: getDeviceByParentGiCiMap(allocatable),
+		skippedXids:              xidsToSkip(config.flags.additionalXidsToIgnore),
+	}
+
+	return m, nil
+}
+
+func (m *nvmlDeviceHealthMonitor) Start(ctx context.Context) (rerr error) {
+	if ret := m.nvmllib.Init(); ret != nvml.SUCCESS {
+		return fmt.Errorf("failed to initialize NVML: %v", ret)
+	}
+    // We shutdown nvml if this function returns with an error.
+    defer func() {
+        if rerr != nil {
+            _ = m.nvmllib.Shutdown()
+        }
+    }()
+
+	klog.V(6).Info("creating NVML events for device health monitor")
+	eventSet, ret := m.nvmllib.EventSetCreate()
+	if ret != nvml.SUCCESS {
+		return fmt.Errorf("failed to create event set: %w", ret)
+	}
+
+	ctx, cancel := context.WithCancel(ctx)
+
+	m.cancelContext = cancel
+	m.eventSet = eventSet
+
+	klog.V(6).Info("registering NVML events for device health monitor")
+	m.registerEventsForDevices()
+
+	klog.V(6).Info("started device health monitoring")
+	m.wg.Add(1)
+	go m.run(ctx, m.skippedXids)
+
+	return nil
+}
+
+func (m *nvmlDeviceHealthMonitor) registerEventsForDevices() {
+	eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError)
+
+	processedUUIDs := make(map[string]bool)
+
+	for uuid, dev := range m.uuidToDeviceMap {
+		var u string
+		if dev.Type() == MigDeviceType {
+			u = dev.Mig.parent.UUID
+		} else {
+			u = uuid
+		}
+
+		if processedUUIDs[u] {
+			continue
+		}
+		gpu, ret := m.nvmllib.DeviceGetHandleByUUID(u)
+		if ret != nvml.SUCCESS {
+			klog.Infof("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy", u, ret)
+			m.unhealthy <- dev
+			continue
+		}
+
+		supportedEvents, ret := gpu.GetSupportedEventTypes()
+		if ret != nvml.SUCCESS {
+			klog.Infof("unable to determine the supported events for %s: %v; marking it as unhealthy", u, ret)
+			m.unhealthy <- dev
+			continue
+		}
+
+		ret = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet)
+		if ret == nvml.ERROR_NOT_SUPPORTED {
+			klog.Warningf("Device %v is too old to support healthchecking.", u)
+		}
+		if ret != nvml.SUCCESS {
+			klog.Infof("unable to register events for %s: %v; marking it as unhealthy", u, ret)
+			m.unhealthy <- dev
+		}
+		processedUUIDs[u] = true
+	}
+}
+
+// Stop cancels the monitoring goroutine, waits for it to exit, frees the NVML
+// event set (if one was created), shuts NVML down and finally closes the
+// unhealthy channel. Calling Stop on a nil monitor is a no-op.
+//
+// Stop must be called at most once: a second call would close the already
+// closed unhealthy channel.
+func (m *nvmlDeviceHealthMonitor) Stop() {
+	if m == nil {
+		return
+	}
+	klog.V(6).Info("stopping health monitor")
+
+	if m.cancelContext != nil {
+		m.cancelContext()
+	}
+
+	// The run goroutine waits on the event set, so it has to be gone before
+	// the event set is freed.
+	m.wg.Wait()
+
+	// eventSet stays nil when no event set was ever created for this
+	// monitor; calling Free through a nil interface would panic.
+	if m.eventSet != nil {
+		if ret := m.eventSet.Free(); ret != nvml.SUCCESS {
+			klog.Warningf("failed to free NVML event set: %v", ret)
+		}
+	}
+
+	if ret := m.nvmllib.Shutdown(); ret != nvml.SUCCESS {
+		klog.Warningf("failed to shutdown NVML: %v", ret)
+	}
+	close(m.unhealthy)
+}
+
+// getUUIDToDeviceMap indexes the allocatable devices by their UUID. Devices
+// that report an empty UUID are left out of the result.
+func getUUIDToDeviceMap(allocatable AllocatableDevices) map[string]*AllocatableDevice {
+	byUUID := make(map[string]*AllocatableDevice)
+
+	for _, dev := range allocatable {
+		uuid := dev.UUID()
+		if uuid == "" {
+			continue
+		}
+		byUUID[uuid] = dev
+	}
+	return byUUID
+}
+
+func (m *nvmlDeviceHealthMonitor) run(ctx context.Context, skippedXids map[uint64]bool) {
+	defer m.wg.Done()
+	for {
+		select {
+		case <-ctx.Done():
+			klog.V(6).Info("Stopping event-driven GPU health monitor...")
+			return
+		default:
+			event, ret := m.eventSet.Wait(5000)
+			if ret == nvml.ERROR_TIMEOUT {
+				continue
+			}
+			if ret != nvml.SUCCESS {
+				klog.Infof("Error waiting for event: %v; Marking all devices as unhealthy", ret)
+				m.markAllDevicesUnhealthy()
+				continue
+			}
+
+			if event.EventType != nvml.EventTypeXidCriticalError {
+				klog.Infof("Skipping non-nvmlEventTypeXidCriticalError event: %+v", event)
+				continue
+			}
+
+			if skippedXids[event.EventData] {
+				klog.Infof("Skipping event %+v", event)
+				continue
+			}
+
+			klog.Infof("Processing event %+v", event)
+			eventUUID, ret := event.Device.GetUUID()
+			if ret != nvml.SUCCESS {
+				klog.Infof("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", event, ret)
+				m.markAllDevicesUnhealthy()
+				continue
+			}
+
+			var affectedDevice *AllocatableDevice
+			pMap, ok1 := m.getDeviceByParentGiCiMap[eventUUID]
+			if ok1 {
+				giMap, ok2 := pMap[event.GpuInstanceId]
+				if ok2 {
+					affectedDevice = giMap[event.ComputeInstanceId]
+				}
+			}
-			var affectedDevice *AllocatableDevice
-			pMap, ok1 := m.getDeviceByParentGiCiMap[eventUUID]
-			if ok1 {
-				giMap, ok2 := pMap[event.GpuInstanceId]
-				if ok2 {
-					affectedDevice = giMap[event.ComputeInstanceId]
-				}
-			}
+            affectedDevice := m.getDeviceByParentGiCiMap.get(
+				eventUUID,
+				event.GpuInstanceId,
+				event.ComputeInstanceId,
+			)
-			var affectedDevice *AllocatableDevice
-			pMap, ok1 := m.getDeviceByParentGiCiMap[eventUUID]
-			if ok1 {
-				giMap, ok2 := pMap[event.GpuInstanceId]
-				if ok2 {
-					affectedDevice = giMap[event.ComputeInstanceId]
-				}
-			}
+            affectedDevice := m.getDeviceByParentGiCiMap.get(
+				eventUUID,
+				event.GpuInstanceId,
+				event.ComputeInstanceId,
+			)
+
+			if affectedDevice == nil {
+				klog.Infof("Ignoring event for unexpected device (UUID: %s, GI: %d, CI: %d)", eventUUID, event.GpuInstanceId, event.ComputeInstanceId)
+				continue
+			}
+
+			klog.Infof("Sending unhealthy notification for device %s due to event type: %v and event data: %d", affectedDevice.UUID(), event.EventType, event.EventData)
+			m.unhealthy <- affectedDevice
+		}
+	}
+}
+
+// Unhealthy returns the receive-only view of the channel on which the monitor
+// publishes devices it considers unhealthy.
+func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *AllocatableDevice {
+	return m.unhealthy
+}
+
+// markAllDevicesUnhealthy offers every device in uuidToDeviceMap to the
+// unhealthy channel. It never blocks: when the channel cannot take another
+// device, that notification is dropped and an error is logged.
+func (m *nvmlDeviceHealthMonitor) markAllDevicesUnhealthy() {
+	for _, dev := range m.uuidToDeviceMap {
+		select {
+		case m.unhealthy <- dev:
+			// Notification queued.
+		default:
+			// Channel is full; drop rather than stall the caller.
+			klog.Errorf("Unhealthy channel buffer full. Dropping unhealthy notification for device %s in markAllDevicesUnhealthy.", dev.UUID())
+		}
+	}
+}
+
+func getDeviceByParentGiCiMap(allocatable AllocatableDevices) map[string]map[uint32]map[uint32]*AllocatableDevice {
+	deviceByParentGiCiMap := make(map[string]map[uint32]map[uint32]*AllocatableDevice)
+
+	for _, d := range allocatable {
+		var parentUUID string
+		var giID, ciID uint32
+
+		switch d.Type() {
+		case GpuDeviceType:
+			parentUUID = d.UUID()
+			if parentUUID == "" {
+				continue
+			}
+			giID = FullGPUInstanceID
+			ciID = FullGPUInstanceID
+		case MigDeviceType:
+			parentUUID = d.Mig.parent.UUID
+			if parentUUID == "" {
+				continue
+			}
+			giID = d.Mig.giInfo.Id
+			ciID = d.Mig.ciInfo.Id
-	for _, d := range allocatable {
-		var parentUUID string
-		var giID, ciID uint32
-
-		switch d.Type() {
-		case GpuDeviceType:
-			parentUUID = d.UUID()
-			if parentUUID == "" {
-				continue
-			}
-			giID = FullGPUInstanceID
-			ciID = FullGPUInstanceID
-		case MigDeviceType:
-			parentUUID = d.Mig.parent.UUID
-			if parentUUID == "" {
-				continue
-			}
-			giID = d.Mig.giInfo.Id
-			ciID = d.Mig.ciInfo.Id
+	for _, d := range allocatable {
+		switch d.Type() {
+		case GpuDeviceType:
+            uuid := d.UUID()
+            if uuid == "" {
+                continue
+            }
+            deviceByParentGiCiMap.put(uuid, FullGPUInstanceID, FullGPUInstanceID)
+		case MigDeviceType:
+			uuid := d.Mig.parent.UUID
+			if uuid == "" {
+				continue
+			}
+            deviceByParentGiCiMap.put(uuid, d.Mig.giInfo.Id, d.Mig.ciInfo.Id)
-	for _, d := range allocatable {
-		var parentUUID string
-		var giID, ciID uint32
-
-		switch d.Type() {
-		case GpuDeviceType:
-			parentUUID = d.UUID()
-			if parentUUID == "" {
-				continue
-			}
-			giID = FullGPUInstanceID
-			ciID = FullGPUInstanceID
-		case MigDeviceType:
-			parentUUID = d.Mig.parent.UUID
-			if parentUUID == "" {
-				continue
-			}
-			giID = d.Mig.giInfo.Id
-			ciID = d.Mig.ciInfo.Id
+	for _, d := range allocatable {
+		switch d.Type() {
+		case GpuDeviceType:
+            uuid := d.UUID()
+            if uuid == "" {
+                continue
+            }
+            deviceByParentGiCiMap.put(uuid, FullGPUInstanceID, FullGPUInstanceID)
+		case MigDeviceType:
+			uuid := d.Mig.parent.UUID
+			if uuid == "" {
+				continue
+			}
+            deviceByParentGiCiMap.put(uuid, d.Mig.giInfo.Id, d.Mig.ciInfo.Id)
+		default:
+			klog.Errorf("Skipping device with unknown type: %s", d.Type())
+			continue
+		}
+
+		if _, ok := deviceByParentGiCiMap[parentUUID]; !ok {
+			deviceByParentGiCiMap[parentUUID] = make(map[uint32]map[uint32]*AllocatableDevice)
+		}
+		if _, ok := deviceByParentGiCiMap[parentUUID][giID]; !ok {
+			deviceByParentGiCiMap[parentUUID][giID] = make(map[uint32]*AllocatableDevice)
+		}
+		deviceByParentGiCiMap[parentUUID][giID][ciID] = d
-		if _, ok := deviceByParentGiCiMap[parentUUID]; !ok {
-			deviceByParentGiCiMap[parentUUID] = make(map[uint32]map[uint32]*AllocatableDevice)
-		}
-		if _, ok := deviceByParentGiCiMap[parentUUID][giID]; !ok {
-			deviceByParentGiCiMap[parentUUID][giID] = make(map[uint32]*AllocatableDevice)
-		}
-		deviceByParentGiCiMap[parentUUID][giID][ciID] = d
+		deviceByParentGiCiMap.put(parentUUID, giID, ciID, d)
-		if _, ok := deviceByParentGiCiMap[parentUUID]; !ok {
-			deviceByParentGiCiMap[parentUUID] = make(map[uint32]map[uint32]*AllocatableDevice)
-		}
-		if _, ok := deviceByParentGiCiMap[parentUUID][giID]; !ok {
-			deviceByParentGiCiMap[parentUUID][giID] = make(map[uint32]*AllocatableDevice)
-		}
-		deviceByParentGiCiMap[parentUUID][giID][ciID] = d
+		deviceByParentGiCiMap.put(parentUUID, giID, ciID, d)
+	}
+	return deviceByParentGiCiMap
+}
+
+// getAdditionalXids returns the additional Xids to skip, parsed from the specified string.
+// The input is treated as a comma-separated string and every entry that parses as a
+// base-10 uint64 is considered an Xid value. Surrounding whitespace is trimmed,
+// empty entries are skipped, and invalid values are logged and ignored.
+// An empty input yields nil.
+func getAdditionalXids(input string) []uint64 {
+	if input == "" {
+		return nil
+	}
+
+	klog.V(6).Infof("Creating a list of additional xids to ignore: [%s]", input)
+
+	var xids []uint64
+	for _, field := range strings.Split(input, ",") {
+		candidate := strings.TrimSpace(field)
+		if candidate == "" {
+			continue
+		}
+		xid, err := strconv.ParseUint(candidate, 10, 64)
+		if err == nil {
+			xids = append(xids, xid)
+			continue
+		}
+		klog.Infof("Ignoring malformed Xid value %v: %v", candidate, err)
+	}
+
+	return xids
+}
+
+// xidsToSkip builds the set of Xid values that the health monitor ignores: a
+// hardcoded list of application-level Xids plus any valid values parsed from
+// the comma-separated additionalXids string.
+func (m *nvmlDeviceHealthMonitor) xidsToSkip(additionalXids string) map[uint64]bool {
+	// Hardcoded disabled (ignored) XIDs:
+	// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
+	// Application errors: the GPU should still be healthy.
+	skipped := map[uint64]bool{
+		13:  true, // Graphics Engine Exception
+		31:  true, // GPU memory page fault
+		43:  true, // GPU stopped processing
+		45:  true, // Preemptive cleanup, due to previous errors
+		68:  true, // Video processor exception
+		109: true, // Context Switch Timeout Error
+	}
+
+	for _, xid := range getAdditionalXids(additionalXids) {
+		skipped[xid] = true
+	}
+	return skipped
+}
diff --git a/cmd/gpu-kubelet-plugin/device_state.go b/cmd/gpu-kubelet-plugin/device_state.go
@@ -296,6 +296,12 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res
 		if !exists {
 			return nil, fmt.Errorf("requested device is not allocatable: %v", result.Device)
 		}
+		// only proceed with config mapping if device is healthy.
+		if featuregates.Enabled(featuregates.DeviceHealthCheck) {
+			if device.Health == Unhealthy {
+				return nil, fmt.Errorf("requested device is not healthy: %v", result.Device)
+			}
+		}
 		for _, c := range slices.Backward(configs) {
 			if slices.Contains(c.Requests, result.Request) {
 				if _, ok := c.Config.(*configapi.GpuConfig); ok && device.Type() != GpuDeviceType {
@@ -550,6 +556,14 @@ func GetOpaqueDeviceConfigs(
 	return resultConfigs, nil
 }
 
+// UpdateDeviceHealthStatus sets the health of device to hs while holding the
+// DeviceState lock, then logs the device UUID and its new status.
+func (s *DeviceState) UpdateDeviceHealthStatus(device *AllocatableDevice, hs HealthStatus) {
+	s.Lock()
+	defer s.Unlock()
+
+	device.Health = hs
+	klog.Infof("Updated device: %s health status to %s", device.UUID(), hs)
+}
+
 // TODO: Dynamic MIG is not yet supported with structured parameters.
 // Refactor this to allow for the allocation of statically partitioned MIG
 // devices.