File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -49,10 +49,12 @@ const (
4949
5050 // healthChannelBufferSize defines the buffer capacity for the health
5151 // channel. This is sized to handle bursts of unhealthy device reports
52- // without blocking the health check goroutine. With 8 GPUs and
53- // potential for multiple events per GPU (XID errors, ECC errors, etc.),
54- // a buffer of 64 provides ample headroom while using a power-of-2 size
55- // for cache-friendly alignment.
52+ // without blocking the health check goroutine. The value of 64 is
53+ // chosen assuming a single device plugin instance runs per node with
54+ // up to 8 GPUs per node and multiple in-flight events per GPU (XID
55+ // errors, ECC errors, etc.), while keeping a power-of-2 size for
56+ // cache-friendly alignment. Operators running nodes with significantly
57+ // more GPUs should review this assumption.
5658 healthChannelBufferSize = 64
5759)
5860
Original file line number Diff line number Diff line change @@ -231,13 +231,13 @@ func TestCheckHealth(t *testing.T) {
231231
232232 server := dgxa100 .New ()
233233
234- if deviceMock := server .Devices [0 ].(* dgxa100.Device ); true {
235- deviceMock . GetSupportedEventTypesFunc = func () ( uint64 , nvml. Return ) {
236- return nvml . EventTypeXidCriticalError , nvml .SUCCESS
237- }
238- deviceMock . RegisterEventsFunc = func ( v uint64 , eventSet nvml. EventSet ) nvml. Return {
239- return nvml .SUCCESS
240- }
234+ deviceMock , ok := server .Devices [0 ].(* dgxa100.Device )
235+ require . True ( t , ok , "expected first device to be *dgxa100.Device" )
236+ deviceMock . GetSupportedEventTypesFunc = func () ( uint64 , nvml.Return ) {
237+ return nvml . EventTypeXidCriticalError , nvml . SUCCESS
238+ }
239+ deviceMock . RegisterEventsFunc = func ( v uint64 , eventSet nvml.EventSet ) nvml. Return {
240+ return nvml . SUCCESS
241241 }
242242
243243 var count int
You can’t perform that action at this time.
0 commit comments