Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions pkg/spdk/client/basic.go
Original file line number Diff line number Diff line change
Expand Up @@ -1015,6 +1015,35 @@ func (c *Client) BdevNvmeGetControllers(name string) (controllerInfoList []spdkt
return controllerInfoList, json.Unmarshal(cmdOutput, &controllerInfoList)
}

// BdevNvmeGetControllerHealthInfo issues the "bdev_nvme_get_controller_health_info"
// RPC for a single NVMe bdev controller and decodes the JSON reply.
//
// "name": Name of the NVMe controller
//
// If sending the command or decoding the reply fails, the error is returned
// unchanged together with whatever healthInfo holds at that point. On success,
// a TemperatureCelsius above 255 is replaced with spdktypes.UnknownTemperature.
func (c *Client) BdevNvmeGetControllerHealthInfo(name string) (healthInfo spdktypes.BdevNvmeControllerHealthInfo, err error) {
	// Highest Celsius reading that is passed through to the caller untouched.
	const maxPlausibleCelsius = 255

	rawReply, err := c.jsonCli.SendCommand(
		"bdev_nvme_get_controller_health_info",
		spdktypes.BdevNvmeGetControllerHealthInfoRequest{Name: name},
	)
	if err != nil {
		return healthInfo, err
	}

	if err = json.Unmarshal(rawReply, &healthInfo); err != nil {
		return healthInfo, err
	}

	// SPDK reports the temperature as an unsigned "Kelvin - 273" value. When
	// the controller reports an invalid/0 Kelvin reading, that subtraction
	// underflows and yields a number close to 2^64, which has no meaning in
	// Celsius. Anything above the cutoff is therefore mapped to the
	// "unknown" sentinel instead of being handed to the caller.
	if healthInfo.TemperatureCelsius > maxPlausibleCelsius {
		healthInfo.TemperatureCelsius = spdktypes.UnknownTemperature
	}

	return healthInfo, nil
}

// BdevNvmeSetOptions sets global parameters for all bdev NVMe.
// This RPC may only be called before SPDK subsystems have been initialized or any bdev NVMe
// has been created.
Expand Down
65 changes: 39 additions & 26 deletions pkg/spdk/types/nvme.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,29 +132,42 @@ type BdevNvmeGetControllersRequest struct {
Name string `json:"name,omitempty"`
}

// type BdevNvmeControllerHealthInfo struct {
// ModelNumber string `json:"model_number"`
// SerialNumber string `json:"serial_number"`
// FirmwareRevision string `json:"firmware_revision"`
// Traddr string `json:"traddr"`
// TemperatureCelsius uint64 `json:"temperature_celsius"`
// AvailableSparePercentage uint64 `json:"available_spare_percentage"`
// AvailableSpareThresholdPercentage uint64 `json:"available_spare_threshold_percentage"`
// PercentageUsed uint64 `json:"percentage_used"`
// DataUnitsRead uint128 `json:"data_units_read"`
// DataUnitsWritten uint128 `json:"data_units_written"`
// HostReadCommands uint128 `json:"host_read_commands"`
// HostWriteCommands uint128 `json:"host_write_commands"`
// ControllerBusyTime uint128 `json:"controller_busy_time"`
// PowerCycles uint128 `json:"power_cycles"`
// PowerOnHours uint128 `json:"power_on_hours"`
// UnsafeShutdowns uint128 `json:"unsafe_shutdowns"`
// MediaErrors uint128 `json:"media_errors"`
// NumErrLogEntries uint128 `json:"num_err_log_entries"`
// WarningTemperatureTimeMinutes uint64 `json:"warning_temperature_time_minutes"`
// CriticalCompositeTemperatureTimeMinutes uint64 `json:"critical_composite_temperature_time_minutes"`
// }
//
// type BdevNvmeGetControllerHealthInfoRequest struct {
// Name string `json:"name"`
// }
// UnknownTemperature is the sentinel Celsius value used for an NVMe temperature
// reading that is unknown or invalid. SPDK may emit an underflowed unsigned
// number when it converts Kelvin to Celsius; the client layer is expected to
// replace such outliers with this value.
const UnknownTemperature = float64(-1)

// BdevNvmeControllerHealthInfo is the decoded reply of the
// bdev_nvme_get_controller_health_info RPC. Every field maps one-to-one to the
// JSON key named in its struct tag.
type BdevNvmeControllerHealthInfo struct {
	// Controller identification.
	ModelNumber      string `json:"model_number"`
	SerialNumber     string `json:"serial_number"`
	FirmwareRevision string `json:"firmware_revision"`
	Traddr           string `json:"traddr"`

	// CriticalWarning is decoded from the "critical_warning" key.
	CriticalWarning uint32 `json:"critical_warning"`

	// TemperatureCelsius is kept as float64 on purpose: when the temperature
	// is invalid, SPDK can report a wrapped 64-bit sentinel (for example
	// 2^64 - 273). A float64 target avoids unmarshal errors on such oversized
	// integers and leaves the interpretation of outliers to the caller.
	TemperatureCelsius float64 `json:"temperature_celsius"`

	// Spare capacity and wear indicators.
	AvailableSparePercentage          uint32 `json:"available_spare_percentage"`
	AvailableSpareThresholdPercentage uint32 `json:"available_spare_threshold_percentage"`
	PercentageUsed                    uint32 `json:"percentage_used"`

	// Cumulative counters.
	DataUnitsRead      uint64 `json:"data_units_read"`
	DataUnitsWritten   uint64 `json:"data_units_written"`
	HostReadCommands   uint64 `json:"host_read_commands"`
	HostWriteCommands  uint64 `json:"host_write_commands"`
	ControllerBusyTime uint64 `json:"controller_busy_time"`
	PowerCycles        uint64 `json:"power_cycles"`
	PowerOnHours       uint64 `json:"power_on_hours"`
	UnsafeShutdowns    uint64 `json:"unsafe_shutdowns"`
	MediaErrors        uint64 `json:"media_errors"`
	NumErrLogEntries   uint64 `json:"num_err_log_entries"`

	// Time accumulated in the temperature conditions named by the JSON keys.
	WarningTemperatureTimeMinutes           uint64 `json:"warning_temperature_time_minutes"`
	CriticalCompositeTemperatureTimeMinutes uint64 `json:"critical_composite_temperature_time_minutes"`
}

// BdevNvmeGetControllerHealthInfoRequest holds the parameters sent with a
// controller health query.
type BdevNvmeGetControllerHealthInfoRequest struct {
	// Name is always serialized under the "name" key, even when empty
	// (the tag carries no omitempty).
	Name string `json:"name"`
}
Loading