Skip to content

Commit 99b33e6

Browse files
committed
nvme_metrics: refactor Python collector to support nvme-cli v2.11+
nvme-cli v2.11 introduced a breaking change by forcing JSON output to always be verbose, resulting in a significantly different output structure. Fixes: #226 Signed-off-by: Daniel Swarbrick <[email protected]>
1 parent 39b36d0 commit 99b33e6

File tree

1 file changed

+72
-44
lines changed

1 file changed

+72
-44
lines changed

Diff for: nvme_metrics.py

+72-44
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@
7474
"Device error log entry count",
7575
["device"], namespace=namespace, registry=registry,
7676
),
77+
# FIXME: The "nvmecli" metric ought to be an Info type, not a Gauge. However, making this change
78+
# will result in the metric having a "_info" suffix automatically appended, which is arguably
79+
# a breaking change.
7780
"nvmecli": Gauge(
7881
"nvmecli",
7982
"nvme-cli tool information",
@@ -142,7 +145,11 @@ def exec_nvme_json(*args):
142145
"""
143146
Execute nvme CLI tool with specified arguments and return parsed JSON output.
144147
"""
145-
output = exec_nvme(*args, "--output-format", "json")
148+
# Note: nvme-cli v2.11 effectively introduced a breaking change by forcing JSON output to always
149+
# be verbose. Older versions of nvme-cli optionally produced verbose output if the --verbose
150+
# flag was specified. In order to avoid having to handle two different JSON schemas, always
151+
# add the --verbose flag.
152+
output = exec_nvme(*args, "--output-format", "json", "--verbose")
146153
return json.loads(output)
147154

148155

@@ -157,49 +164,70 @@ def main():
157164
device_list = exec_nvme_json("list")
158165

159166
for device in device_list["Devices"]:
160-
device_path = device["DevicePath"]
161-
device_name = os.path.basename(device_path)
162-
163-
metrics["device_info"].labels(
164-
device_name,
165-
device["ModelNumber"],
166-
device["Firmware"],
167-
device["SerialNumber"].strip(),
168-
)
169-
170-
metrics["sector_size"].labels(device_name).set(device["SectorSize"])
171-
metrics["physical_size"].labels(device_name).set(device["PhysicalSize"])
172-
metrics["used_bytes"].labels(device_name).set(device["UsedBytes"])
173-
174-
smart_log = exec_nvme_json("smart-log", device_path)
175-
176-
# Various counters in the NVMe specification are 128-bit, which would have to discard
177-
# resolution if converted to a JSON number (i.e., float64_t). Instead, nvme-cli marshals
178-
# them as strings. As such, they need to be explicitly cast to int or float when using them
179-
# in Counter metrics.
180-
metrics["data_units_read"].labels(device_name).inc(int(smart_log["data_units_read"]))
181-
metrics["data_units_written"].labels(device_name).inc(int(smart_log["data_units_written"]))
182-
metrics["host_read_commands"].labels(device_name).inc(int(smart_log["host_read_commands"]))
183-
metrics["host_write_commands"].labels(device_name).inc(
184-
int(smart_log["host_write_commands"])
185-
)
186-
metrics["avail_spare"].labels(device_name).set(smart_log["avail_spare"] / 100)
187-
metrics["spare_thresh"].labels(device_name).set(smart_log["spare_thresh"] / 100)
188-
metrics["percent_used"].labels(device_name).set(smart_log["percent_used"] / 100)
189-
metrics["critical_warning"].labels(device_name).set(smart_log["critical_warning"])
190-
metrics["media_errors"].labels(device_name).inc(int(smart_log["media_errors"]))
191-
metrics["num_err_log_entries"].labels(device_name).inc(
192-
int(smart_log["num_err_log_entries"])
193-
)
194-
metrics["power_cycles"].labels(device_name).inc(int(smart_log["power_cycles"]))
195-
metrics["power_on_hours"].labels(device_name).inc(int(smart_log["power_on_hours"]))
196-
metrics["controller_busy_time"].labels(device_name).inc(
197-
int(smart_log["controller_busy_time"])
198-
)
199-
metrics["unsafe_shutdowns"].labels(device_name).inc(int(smart_log["unsafe_shutdowns"]))
200-
201-
# NVMe reports temperature in kelvins; convert it to degrees Celsius.
202-
metrics["temperature"].labels(device_name).set(smart_log["temperature"] - 273)
167+
for subsys in device["Subsystems"]:
168+
for ctrl in subsys["Controllers"]:
169+
for ns in ctrl["Namespaces"]:
170+
device_name = ns["NameSpace"]
171+
172+
# FIXME: This metric ought to be refactored into a "controller_info" metric,
173+
# since it contains information that is not unique to the namespace. However,
174+
# previous versions of this collector erroneously referred to namespaces, e.g.
175+
# "nvme0n1", as devices, so preserve the former behaviour for now.
176+
metrics["device_info"].labels(
177+
device_name,
178+
ctrl["ModelNumber"],
179+
ctrl["Firmware"],
180+
ctrl["SerialNumber"].strip(),
181+
)
182+
183+
metrics["sector_size"].labels(device_name).set(ns["SectorSize"])
184+
metrics["physical_size"].labels(device_name).set(ns["PhysicalSize"])
185+
metrics["used_bytes"].labels(device_name).set(ns["UsedBytes"])
186+
187+
# FIXME: The smart-log should only need to be fetched once per controller, not
188+
# per namespace. However, in order to preserve legacy metric labels, fetch it
189+
# per namespace anyway. Most consumer-grade SSDs will only have one namespace.
190+
smart_log = exec_nvme_json("smart-log", os.path.join("/dev", device_name))
191+
192+
# Various counters in the NVMe specification are 128-bit, which would have to
193+
# discard resolution if converted to a JSON number (i.e., float64_t). Instead,
194+
# nvme-cli marshals them as strings. As such, they need to be explicitly cast
195+
# to int or float when using them in Counter metrics.
196+
metrics["data_units_read"].labels(device_name).inc(
197+
int(smart_log["data_units_read"])
198+
)
199+
metrics["data_units_written"].labels(device_name).inc(
200+
int(smart_log["data_units_written"])
201+
)
202+
metrics["host_read_commands"].labels(device_name).inc(
203+
int(smart_log["host_read_commands"])
204+
)
205+
metrics["host_write_commands"].labels(device_name).inc(
206+
int(smart_log["host_write_commands"])
207+
)
208+
metrics["avail_spare"].labels(device_name).set(smart_log["avail_spare"] / 100)
209+
metrics["spare_thresh"].labels(device_name).set(smart_log["spare_thresh"] / 100)
210+
metrics["percent_used"].labels(device_name).set(smart_log["percent_used"] / 100)
211+
metrics["critical_warning"].labels(device_name).set(
212+
smart_log["critical_warning"]["value"]
213+
)
214+
metrics["media_errors"].labels(device_name).inc(int(smart_log["media_errors"]))
215+
metrics["num_err_log_entries"].labels(device_name).inc(
216+
int(smart_log["num_err_log_entries"])
217+
)
218+
metrics["power_cycles"].labels(device_name).inc(int(smart_log["power_cycles"]))
219+
metrics["power_on_hours"].labels(device_name).inc(
220+
int(smart_log["power_on_hours"])
221+
)
222+
metrics["controller_busy_time"].labels(device_name).inc(
223+
int(smart_log["controller_busy_time"])
224+
)
225+
metrics["unsafe_shutdowns"].labels(device_name).inc(
226+
int(smart_log["unsafe_shutdowns"])
227+
)
228+
229+
# NVMe reports temperature in kelvins; convert it to degrees Celsius.
230+
metrics["temperature"].labels(device_name).set(smart_log["temperature"] - 273)
203231

204232

205233
if __name__ == "__main__":

0 commit comments

Comments
 (0)