Skip to content

Commit 74fdf9a

Browse files
committed
feat(client): get NVMe bdev controller health info
longhorn/longhorn-12016 Signed-off-by: Chin-Ya Huang <chin-ya.huang@suse.com>
1 parent 2b59bbd commit 74fdf9a

2 files changed

Lines changed: 68 additions & 26 deletions

File tree

pkg/spdk/client/basic.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,6 +1015,35 @@ func (c *Client) BdevNvmeGetControllers(name string) (controllerInfoList []spdkt
10151015
return controllerInfoList, json.Unmarshal(cmdOutput, &controllerInfoList)
10161016
}
10171017

1018+
// BdevNvmeGetControllerHealthInfo retrieves health information for a specified
1019+
// NVMe bdev controller by sending the "bdev_nvme_get_controller_health_info" RPC.
1020+
//
1021+
// "name": Name of the NVMe controller
//
// It returns the decoded health info with the temperature normalized (readings
// above 255 are replaced by spdktypes.UnknownTemperature). A non-nil error comes
// either from sending the command or from decoding its JSON output; in the
// decoding case the returned value may be partially populated and should not be
// relied on.
1022+
func (c *Client) BdevNvmeGetControllerHealthInfo(name string) (healthInfo spdktypes.BdevNvmeControllerHealthInfo, err error) {
1023+
req := spdktypes.BdevNvmeGetControllerHealthInfoRequest{
1024+
Name: name,
1025+
}
1026+
1027+
cmdOutput, err := c.jsonCli.SendCommand("bdev_nvme_get_controller_health_info", req)
1028+
if err != nil {
1029+
return healthInfo, err
1030+
}
1031+
1032+
// Decode the raw RPC output straight into the named result.
if err := json.Unmarshal(cmdOutput, &healthInfo); err != nil {
1033+
return healthInfo, err
1034+
}
1035+
1036+
// Normalize temperature: SPDK writes temperature as unsigned (Kelvin-273).
1037+
// When the controller reports an invalid/0 temperature in Kelvin, subtracting
1038+
// 273 on uint64 underflows and produces a huge number (~2^64 - 273), which is
1039+
// meaningless in Celsius. Replace such outliers with the UnknownTemperature
// sentinel (-1) to indicate that the reading is unknown.
// NOTE(review): the 255 cutoff is a heuristic upper bound; confirm it against
// the width of the temperature field the controller actually reports.
1040+
if healthInfo.TemperatureCelsius > 255 { // Readings above 255 are treated as invalid outliers.
1041+
healthInfo.TemperatureCelsius = spdktypes.UnknownTemperature
1042+
}
1043+
1044+
return healthInfo, nil
1045+
}
1046+
10181047
// BdevNvmeSetOptions sets global parameters for all bdev NVMe.
10191048
// This RPC may only be called before SPDK subsystems have been initialized or any bdev NVMe
10201049
// has been created.

pkg/spdk/types/nvme.go

Lines changed: 39 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -132,29 +132,42 @@ type BdevNvmeGetControllersRequest struct {
132132
Name string `json:"name,omitempty"`
133133
}
134134

135-
// type BdevNvmeControllerHealthInfo struct {
136-
// ModelNumber string `json:"model_number"`
137-
// SerialNumber string `json:"serial_number"`
138-
// FirmwareRevision string `json:"firmware_revision"`
139-
// Traddr string `json:"traddr"`
140-
// TemperatureCelsius uint64 `json:"temperature_celsius"`
141-
// AvailableSparePercentage uint64 `json:"available_spare_percentage"`
142-
// AvailableSpareThresholdPercentage uint64 `json:"available_spare_threshold_percentage"`
143-
// PercentageUsed uint64 `json:"percentage_used"`
144-
// DataUnitsRead uint128 `json:"data_units_read"`
145-
// DataUnitsWritten uint128 `json:"data_units_written"`
146-
// HostReadCommands uint128 `json:"host_read_commands"`
147-
// HostWriteCommands uint128 `json:"host_write_commands"`
148-
// ControllerBusyTime uint128 `json:"controller_busy_time"`
149-
// PowerCycles uint128 `json:"power_cycles"`
150-
// PowerOnHours uint128 `json:"power_on_hours"`
151-
// UnsafeShutdowns uint128 `json:"unsafe_shutdowns"`
152-
// MediaErrors uint128 `json:"media_errors"`
153-
// NumErrLogEntries uint128 `json:"num_err_log_entries"`
154-
// WarningTemperatureTimeMinutes uint64 `json:"warning_temperature_time_minutes"`
155-
// CriticalCompositeTemperatureTimeMinutes uint64 `json:"critical_composite_temperature_time_minutes"`
156-
// }
157-
//
158-
// type BdevNvmeGetControllerHealthInfoRequest struct {
159-
// Name string `json:"name"`
160-
// }
135+
// UnknownTemperature represents an unknown/invalid NVMe temperature reading (in Celsius).
136+
// SPDK may emit an underflowed unsigned value when converting Kelvin to Celsius; such
137+
// outliers are mapped to this sentinel at the client layer. It is a float64 so it can be
// assigned directly to BdevNvmeControllerHealthInfo.TemperatureCelsius.
138+
const UnknownTemperature float64 = -1
139+
140+
// BdevNvmeControllerHealthInfo represents the response of bdev_nvme_get_controller_health_info.
// Each field is decoded from the JSON key named in its struct tag.
141+
type BdevNvmeControllerHealthInfo struct {
142+
ModelNumber string `json:"model_number"`
143+
SerialNumber string `json:"serial_number"`
144+
FirmwareRevision string `json:"firmware_revision"`
145+
Traddr string `json:"traddr"`
146+
CriticalWarning uint32 `json:"critical_warning"`
147+
148+
// TemperatureCelsius can sometimes be reported by SPDK as a wrapped 64-bit sentinel
149+
// value (e.g., 2^64 - 273) when the temperature is invalid. It is a float64 so that
150+
// such oversized integers do not cause unmarshal errors. Client.BdevNvmeGetControllerHealthInfo
// replaces readings above 255 with UnknownTemperature; code that decodes this struct
// directly must handle outliers itself.
151+
TemperatureCelsius float64 `json:"temperature_celsius"`
152+
153+
AvailableSparePercentage uint32 `json:"available_spare_percentage"`
154+
AvailableSpareThresholdPercentage uint32 `json:"available_spare_threshold_percentage"`
155+
PercentageUsed uint32 `json:"percentage_used"`
// NOTE(review): the counters below assume the reported values fit in 64 bits; a JSON
// number larger than the uint64 range would make json.Unmarshal fail — confirm against
// the values SPDK emits for these fields.
156+
DataUnitsRead uint64 `json:"data_units_read"`
157+
DataUnitsWritten uint64 `json:"data_units_written"`
158+
HostReadCommands uint64 `json:"host_read_commands"`
159+
HostWriteCommands uint64 `json:"host_write_commands"`
160+
ControllerBusyTime uint64 `json:"controller_busy_time"`
161+
PowerCycles uint64 `json:"power_cycles"`
162+
PowerOnHours uint64 `json:"power_on_hours"`
163+
UnsafeShutdowns uint64 `json:"unsafe_shutdowns"`
164+
MediaErrors uint64 `json:"media_errors"`
165+
NumErrLogEntries uint64 `json:"num_err_log_entries"`
166+
WarningTemperatureTimeMinutes uint64 `json:"warning_temperature_time_minutes"`
167+
CriticalCompositeTemperatureTimeMinutes uint64 `json:"critical_composite_temperature_time_minutes"`
168+
}
169+
170+
// BdevNvmeGetControllerHealthInfoRequest is the request payload for the
// bdev_nvme_get_controller_health_info RPC, which fetches controller health.
171+
type BdevNvmeGetControllerHealthInfoRequest struct {
172+
// Name is the name of the NVMe controller to query. The tag has no omitempty,
// so an empty name is still serialized as "name": "".
Name string `json:"name"`
173+
}

0 commit comments

Comments
 (0)