feat: add buffered health channel and device state tracking

ArangoGutierrez · ArangoGutierrez · commit aa0d5c87248f · 2026-01-12T14:22:37.000+01:00
Add buffered health channel to prevent the health check goroutine from
blocking when ListAndWatch is slow to consume events. This addresses
stability issues with multiple GPUs and bursty XID scenarios.

Changes:
- Add healthChannelBufferSize constant (64) for burst handling
- Create buffered health channel in initialize()
- Add enhanced logging for unhealthy device reports with reason
- Add MarkUnhealthy() to Device for tracking failure reason/timestamp
- Add IsUnhealthy() and UnhealthyDuration() for diagnostics
- Add UnhealthyReason and LastUnhealthyTime fields to Device struct

The buffer size of 64 provides headroom for 8 GPUs with multiple events
per GPU while using a power-of-2 size for cache-friendly alignment.

Devices marked unhealthy remain in that state until external intervention
(node drain, GPU reset, reboot) - the device plugin does not attempt
auto-recovery as that decision belongs to external components like DCGM
or Node Problem Detector.

Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
diff --git a/internal/plugin/server.go b/internal/plugin/server.go
@@ -46,6 +46,14 @@ const (
 	deviceListEnvVar                          = "NVIDIA_VISIBLE_DEVICES"
 	deviceListAsVolumeMountsHostPath          = "/dev/null"
 	deviceListAsVolumeMountsContainerPathRoot = "/var/run/nvidia-container-devices"
+
+	// healthChannelBufferSize defines the buffer capacity for the health
+	// channel. This is sized to handle bursts of unhealthy device reports
+	// without blocking the health check goroutine. With 8 GPUs and
+	// potential for multiple events per GPU (XID errors, ECC errors, etc.),
+	// a buffer of 64 leaves room for 8 queued events per GPU. Once the
+	// buffer is full, a plain channel send blocks until a receiver drains it.
+	healthChannelBufferSize = 64
 )
 
 // nvidiaDevicePlugin implements the Kubernetes device plugin API
@@ -108,7 +116,7 @@ func getPluginSocketPath(resource spec.ResourceName) string {
 
 func (plugin *nvidiaDevicePlugin) initialize() {
 	plugin.server = grpc.NewServer([]grpc.ServerOption{}...)
-	plugin.health = make(chan *rm.Device)
+	plugin.health = make(chan *rm.Device, healthChannelBufferSize)
 	plugin.stop = make(chan interface{})
 }
 
@@ -263,7 +271,8 @@ func (plugin *nvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *plugi
 	return options, nil
 }
 
-// ListAndWatch lists devices and update that list according to the health status
+// ListAndWatch lists devices and updates that list according to the health
+// status.
 func (plugin *nvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
 	if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: plugin.apiDevices()}); err != nil {
 		return err
@@ -274,9 +283,9 @@ func (plugin *nvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.D
 		case <-plugin.stop:
 			return nil
 		case d := <-plugin.health:
-			// FIXME: there is no way to recover from the Unhealthy state.
 			d.Health = pluginapi.Unhealthy
-			klog.Infof("'%s' device marked unhealthy: %s", plugin.rm.Resource(), d.ID)
+			klog.Infof("'%s' device marked unhealthy: %s (reason: %s)",
+				plugin.rm.Resource(), d.ID, d.UnhealthyReason)
 			if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: plugin.apiDevices()}); err != nil {
 				return nil
 			}
diff --git a/internal/rm/devices.go b/internal/rm/devices.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"strconv"
 	"strings"
+	"time"
 
 	"k8s.io/klog/v2"
 	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
@@ -35,6 +36,10 @@ type Device struct {
 	// Replicas stores the total number of times this device is replicated.
 	// If this is 0 or 1 then the device is not shared.
 	Replicas int
+
+	// Health tracking fields
+	LastUnhealthyTime time.Time // When device became unhealthy
+	UnhealthyReason   string    // Human-readable reason (e.g., "XID-79")
 }
 
 // deviceInfo defines the information the required to construct a Device
@@ -239,6 +244,30 @@ func (d *Device) GetUUID() string {
 	return AnnotatedID(d.ID).GetID()
 }
 
+// MarkUnhealthy sets Health to pluginapi.Unhealthy, stamps LastUnhealthyTime
+// with the current time, and stores reason in UnhealthyReason. It is meant to
+// be called when a health check detects a device failure (e.g., XID error).
+// Calling it again overwrites the previously recorded timestamp and reason.
+func (d *Device) MarkUnhealthy(reason string) {
+	d.Health = pluginapi.Unhealthy
+	d.LastUnhealthyTime = time.Now()
+	d.UnhealthyReason = reason
+}
+
+// IsUnhealthy reports whether the device's Health equals pluginapi.Unhealthy.
+func (d *Device) IsUnhealthy() bool {
+	return d.Health == pluginapi.Unhealthy
+}
+
+// UnhealthyDuration returns the time since MarkUnhealthy was last called, or 0
+// if the device is healthy or Health was set without recording a timestamp.
+func (d *Device) UnhealthyDuration() time.Duration {
+	if !d.IsUnhealthy() || d.LastUnhealthyTime.IsZero() {
+		return 0
+	}
+	return time.Since(d.LastUnhealthyTime)
+}
+
 // NewAnnotatedID creates a new AnnotatedID from an ID and a replica number.
 func NewAnnotatedID(id string, replica int) AnnotatedID {
 	return AnnotatedID(fmt.Sprintf("%s::%d", id, replica))