diff --git a/pkg/spdk/client/basic.go b/pkg/spdk/client/basic.go index 022617bc..3a4a3320 100644 --- a/pkg/spdk/client/basic.go +++ b/pkg/spdk/client/basic.go @@ -1015,6 +1015,35 @@ func (c *Client) BdevNvmeGetControllers(name string) (controllerInfoList []spdkt return controllerInfoList, json.Unmarshal(cmdOutput, &controllerInfoList) } +// BdevNvmeGetControllerHealthInfo retrieves health information for a specified +// NVMe bdev controller. +// +// "name": Name of the NVMe controller +func (c *Client) BdevNvmeGetControllerHealthInfo(name string) (healthInfo spdktypes.BdevNvmeControllerHealthInfo, err error) { + req := spdktypes.BdevNvmeGetControllerHealthInfoRequest{ + Name: name, + } + + cmdOutput, err := c.jsonCli.SendCommand("bdev_nvme_get_controller_health_info", req) + if err != nil { + return healthInfo, err + } + + if err := json.Unmarshal(cmdOutput, &healthInfo); err != nil { + return healthInfo, err + } + + // Normalize temperature: SPDK writes temperature as unsigned (Kelvin-273). + // When controller reports invalid/0 temperature in Kelvin, subtracting 273 + // on uint64 underflows and produces a huge number (~2^64 - 273), which is + // meaningless in Celsius. Clamp such outliers to -1 to indicate unknown. + if healthInfo.TemperatureCelsius > 255 { // Values >255°C are invalid for a uint8 S.M.A.R.T. temperature. + healthInfo.TemperatureCelsius = spdktypes.UnknownTemperature + } + + return healthInfo, nil +} + // BdevNvmeSetOptions sets global parameters for all bdev NVMe. // This RPC may only be called before SPDK subsystems have been initialized or any bdev NVMe // has been created. 
// UnknownTemperature is the sentinel used for an NVMe temperature reading (in
// Celsius) that is unknown or invalid. SPDK may emit an underflowed unsigned
// value when converting Kelvin to Celsius; the client layer maps such outliers
// to this value.
const UnknownTemperature float64 = -1

type (
	// BdevNvmeControllerHealthInfo is the decoded response of the
	// bdev_nvme_get_controller_health_info RPC.
	BdevNvmeControllerHealthInfo struct {
		ModelNumber      string `json:"model_number"`
		SerialNumber     string `json:"serial_number"`
		FirmwareRevision string `json:"firmware_revision"`
		Traddr           string `json:"traddr"`
		CriticalWarning  uint32 `json:"critical_warning"`

		// TemperatureCelsius is a float64 rather than an integer: SPDK can
		// report a wrapped 64-bit value (e.g., 2^64 - 273) when the temperature
		// is invalid, and a float64 decodes such an oversized integer without an
		// unmarshal error so the outlier can be detected after decoding.
		TemperatureCelsius float64 `json:"temperature_celsius"`

		AvailableSparePercentage                uint32 `json:"available_spare_percentage"`
		AvailableSpareThresholdPercentage       uint32 `json:"available_spare_threshold_percentage"`
		PercentageUsed                          uint32 `json:"percentage_used"`
		DataUnitsRead                           uint64 `json:"data_units_read"`
		DataUnitsWritten                        uint64 `json:"data_units_written"`
		HostReadCommands                        uint64 `json:"host_read_commands"`
		HostWriteCommands                       uint64 `json:"host_write_commands"`
		ControllerBusyTime                      uint64 `json:"controller_busy_time"`
		PowerCycles                             uint64 `json:"power_cycles"`
		PowerOnHours                            uint64 `json:"power_on_hours"`
		UnsafeShutdowns                         uint64 `json:"unsafe_shutdowns"`
		MediaErrors                             uint64 `json:"media_errors"`
		NumErrLogEntries                        uint64 `json:"num_err_log_entries"`
		WarningTemperatureTimeMinutes           uint64 `json:"warning_temperature_time_minutes"`
		CriticalCompositeTemperatureTimeMinutes uint64 `json:"critical_composite_temperature_time_minutes"`
	}

	// BdevNvmeGetControllerHealthInfoRequest carries the parameters of the
	// bdev_nvme_get_controller_health_info RPC.
	BdevNvmeGetControllerHealthInfoRequest struct {
		// Name is sent as the "name" parameter and is always serialized, even
		// when empty (the tag has no omitempty).
		Name string `json:"name"`
	}
)