Skip to content

Commit c8f5009

Browse files
fix: address Copilot review comments
- Fix type assertion in TestCheckHealth to properly check assertion result
- Improve healthChannelBufferSize documentation for operators with >8 GPUs

Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
1 parent e8e4b2b commit c8f5009

2 files changed

Lines changed: 13 additions & 11 deletions

File tree

internal/plugin/server.go

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -49,10 +49,12 @@ const (
4949

5050
// healthChannelBufferSize defines the buffer capacity for the health
5151
// channel. This is sized to handle bursts of unhealthy device reports
52-
// without blocking the health check goroutine. With 8 GPUs and
53-
// potential for multiple events per GPU (XID errors, ECC errors, etc.),
54-
// a buffer of 64 provides ample headroom while using a power-of-2 size
55-
// for cache-friendly alignment.
52+
// without blocking the health check goroutine. The value of 64 is
53+
// chosen assuming a single device plugin instance runs per node with
54+
// up to 8 GPUs per node and multiple in-flight events per GPU (XID
55+
// errors, ECC errors, etc.), while keeping a power-of-2 size for
56+
// cache-friendly alignment. Operators running nodes with significantly
57+
// more GPUs should review this assumption.
5658
healthChannelBufferSize = 64
5759
)
5860

internal/rm/health_test.go

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -231,13 +231,13 @@ func TestCheckHealth(t *testing.T) {
231231

232232
server := dgxa100.New()
233233

234-
if deviceMock := server.Devices[0].(*dgxa100.Device); true {
235-
deviceMock.GetSupportedEventTypesFunc = func() (uint64, nvml.Return) {
236-
return nvml.EventTypeXidCriticalError, nvml.SUCCESS
237-
}
238-
deviceMock.RegisterEventsFunc = func(v uint64, eventSet nvml.EventSet) nvml.Return {
239-
return nvml.SUCCESS
240-
}
234+
deviceMock, ok := server.Devices[0].(*dgxa100.Device)
235+
require.True(t, ok, "expected first device to be *dgxa100.Device")
236+
deviceMock.GetSupportedEventTypesFunc = func() (uint64, nvml.Return) {
237+
return nvml.EventTypeXidCriticalError, nvml.SUCCESS
238+
}
239+
deviceMock.RegisterEventsFunc = func(v uint64, eventSet nvml.EventSet) nvml.Return {
240+
return nvml.SUCCESS
241241
}
242242

243243
var count int

0 commit comments

Comments
 (0)