Skip to content

Commit da956a5

Browse files
committed
nvme_metrics: fetch device global SMART log once per controller
Change the "device" label in controller-specific metrics to "controller". As a result, the label value is now the NVMe character device name, e.g. "nvme0", instead of the previously used namespace block device name, e.g. "nvme0n1". Separate the metric declaration dict into controller-specific and namespace-specific groups for easier maintenance. Signed-off-by: Daniel Swarbrick <[email protected]>
1 parent 4e2c7e1 commit da956a5

File tree

1 file changed

+73
-74
lines changed

1 file changed

+73
-74
lines changed

nvme_metrics.py

+73-74
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,22 @@
2424

2525
metrics = {
2626
# fmt: off
27+
"nvmecli": Info(
28+
"nvmecli",
29+
"nvme-cli tool information",
30+
["version"], namespace=namespace, registry=registry,
31+
),
32+
33+
# Controller-specific (e.g. "nvme0") metrics
2734
"avail_spare": Gauge(
2835
"available_spare_ratio",
2936
"Device available spare ratio",
30-
["device"], namespace=namespace, registry=registry,
37+
["controller"], namespace=namespace, registry=registry,
3138
),
3239
"controller_busy_time": Counter(
3340
"controller_busy_time_seconds",
3441
"Device controller busy time in seconds",
35-
["device"], namespace=namespace, registry=registry,
42+
["controller"], namespace=namespace, registry=registry,
3643
),
3744
"controller_info": Info(
3845
"controller",
@@ -43,81 +50,78 @@
4350
"critical_warning": Gauge(
4451
"critical_warning",
4552
"Device critical warning bitmap field",
46-
["device"], namespace=namespace, registry=registry,
53+
["controller"], namespace=namespace, registry=registry,
4754
),
4855
"data_units_read": Counter(
4956
"data_units_read_total",
5057
"Number of 512-byte data units read by host, reported in thousands",
51-
["device"], namespace=namespace, registry=registry,
58+
["controller"], namespace=namespace, registry=registry,
5259
),
5360
"data_units_written": Counter(
5461
"data_units_written_total",
5562
"Number of 512-byte data units written by host, reported in thousands",
56-
["device"], namespace=namespace, registry=registry,
63+
["controller"], namespace=namespace, registry=registry,
5764
),
5865
"host_read_commands": Counter(
5966
"host_read_commands_total",
6067
"Device read commands from host",
61-
["device"], namespace=namespace, registry=registry,
68+
["controller"], namespace=namespace, registry=registry,
6269
),
6370
"host_write_commands": Counter(
6471
"host_write_commands_total",
6572
"Device write commands from host",
66-
["device"], namespace=namespace, registry=registry,
73+
["controller"], namespace=namespace, registry=registry,
6774
),
6875
"media_errors": Counter(
6976
"media_errors_total",
7077
"Device media errors total",
71-
["device"], namespace=namespace, registry=registry,
78+
["controller"], namespace=namespace, registry=registry,
7279
),
7380
"num_err_log_entries": Counter(
7481
"num_err_log_entries_total",
7582
"Device error log entry count",
76-
["device"], namespace=namespace, registry=registry,
77-
),
78-
"nvmecli": Info(
79-
"nvmecli",
80-
"nvme-cli tool information",
81-
["version"], namespace=namespace, registry=registry,
83+
["controller"], namespace=namespace, registry=registry,
8284
),
8385
"percent_used": Gauge(
8486
"percentage_used_ratio",
8587
"Device percentage used ratio",
86-
["device"], namespace=namespace, registry=registry,
87-
),
88-
"physical_size": Gauge(
89-
"physical_size_bytes",
90-
"Device size in bytes",
91-
["device"], namespace=namespace, registry=registry,
88+
["controller"], namespace=namespace, registry=registry,
9289
),
9390
"power_cycles": Counter(
9491
"power_cycles_total",
9592
"Device number of power cycles",
96-
["device"], namespace=namespace, registry=registry,
93+
["controller"], namespace=namespace, registry=registry,
9794
),
9895
"power_on_hours": Counter(
9996
"power_on_hours_total",
10097
"Device power-on hours",
101-
["device"], namespace=namespace, registry=registry,
102-
),
103-
"sector_size": Gauge(
104-
"sector_size_bytes",
105-
"Device sector size in bytes",
106-
["device"], namespace=namespace, registry=registry,
98+
["controller"], namespace=namespace, registry=registry,
10799
),
108100
"spare_thresh": Gauge(
109101
"available_spare_threshold_ratio",
110102
"Device available spare threshold ratio",
111-
["device"], namespace=namespace, registry=registry,
103+
["controller"], namespace=namespace, registry=registry,
112104
),
113105
"temperature": Gauge(
114106
"temperature_celsius",
115107
"Device temperature in degrees Celsius",
116-
["device"], namespace=namespace, registry=registry,
108+
["controller"], namespace=namespace, registry=registry,
117109
),
118110
"unsafe_shutdowns": Counter(
119111
"unsafe_shutdowns_total",
120112
"Device number of unsafe shutdowns",
113+
["controller"], namespace=namespace, registry=registry,
114+
),
115+
116+
# Namespace-specific (e.g. "nvme0n1") metrics
117+
"physical_size": Gauge(
118+
"physical_size_bytes",
119+
"Device size in bytes",
120+
["device"], namespace=namespace, registry=registry,
121+
),
122+
"sector_size": Gauge(
123+
"sector_size_bytes",
124+
"Device sector size in bytes",
121125
["device"], namespace=namespace, registry=registry,
122126
),
123127
"used_bytes": Gauge(
@@ -164,8 +168,10 @@ def main():
164168
for device in device_list["Devices"]:
165169
for subsys in device["Subsystems"]:
166170
for ctrl in subsys["Controllers"]:
171+
ctrl_dev = ctrl["Controller"]
172+
167173
metrics["controller_info"].labels(
168-
ctrl["Controller"],
174+
ctrl_dev,
169175
ctrl["ModelNumber"],
170176
ctrl["Firmware"],
171177
ctrl["SerialNumber"].strip(),
@@ -179,50 +185,43 @@ def main():
179185
metrics["physical_size"].labels(device_name).set(ns["PhysicalSize"])
180186
metrics["used_bytes"].labels(device_name).set(ns["UsedBytes"])
181187

182-
# FIXME: The smart-log should only need to be fetched once per controller, not
183-
# per namespace. However, in order to preserve legacy metric labels, fetch it
184-
# per namespace anyway. Most consumer grade SSDs will only have one namespace.
185-
smart_log = exec_nvme_json("smart-log", os.path.join("/dev", device_name))
186-
187-
# Various counters in the NVMe specification are 128-bit, which would have to
188-
# discard resolution if converted to a JSON number (i.e., float64_t). Instead,
189-
# nvme-cli marshals them as strings. As such, they need to be explicitly cast
190-
# to int or float when using them in Counter metrics.
191-
metrics["data_units_read"].labels(device_name).inc(
192-
int(smart_log["data_units_read"])
193-
)
194-
metrics["data_units_written"].labels(device_name).inc(
195-
int(smart_log["data_units_written"])
196-
)
197-
metrics["host_read_commands"].labels(device_name).inc(
198-
int(smart_log["host_read_commands"])
199-
)
200-
metrics["host_write_commands"].labels(device_name).inc(
201-
int(smart_log["host_write_commands"])
202-
)
203-
metrics["avail_spare"].labels(device_name).set(smart_log["avail_spare"] / 100)
204-
metrics["spare_thresh"].labels(device_name).set(smart_log["spare_thresh"] / 100)
205-
metrics["percent_used"].labels(device_name).set(smart_log["percent_used"] / 100)
206-
metrics["critical_warning"].labels(device_name).set(
207-
smart_log["critical_warning"]["value"]
208-
)
209-
metrics["media_errors"].labels(device_name).inc(int(smart_log["media_errors"]))
210-
metrics["num_err_log_entries"].labels(device_name).inc(
211-
int(smart_log["num_err_log_entries"])
212-
)
213-
metrics["power_cycles"].labels(device_name).inc(int(smart_log["power_cycles"]))
214-
metrics["power_on_hours"].labels(device_name).inc(
215-
int(smart_log["power_on_hours"])
216-
)
217-
metrics["controller_busy_time"].labels(device_name).inc(
218-
int(smart_log["controller_busy_time"])
219-
)
220-
metrics["unsafe_shutdowns"].labels(device_name).inc(
221-
int(smart_log["unsafe_shutdowns"])
222-
)
223-
224-
# NVMe reports temperature in kelvins; convert it to degrees Celsius.
225-
metrics["temperature"].labels(device_name).set(smart_log["temperature"] - 273)
188+
# Most SSDs (perhaps _all_ consumer grade SSDs) only contain a single namespace.
189+
# Fetch the device global SMART log by omitting any --namespace-id flag.
190+
smart_log = exec_nvme_json("smart-log", os.path.join("/dev", ctrl["Controller"]))
191+
192+
# Various counters in the NVMe specification are 128-bit, which would have to
193+
# discard resolution if converted to a JSON number (i.e., float64_t). Instead,
194+
# nvme-cli marshals them as strings. As such, they need to be explicitly cast to int
195+
# or float when using them in Counter metrics.
196+
metrics["data_units_read"].labels(ctrl_dev).inc(int(smart_log["data_units_read"]))
197+
metrics["data_units_written"].labels(ctrl_dev).inc(
198+
int(smart_log["data_units_written"])
199+
)
200+
metrics["host_read_commands"].labels(ctrl_dev).inc(
201+
int(smart_log["host_read_commands"])
202+
)
203+
metrics["host_write_commands"].labels(ctrl_dev).inc(
204+
int(smart_log["host_write_commands"])
205+
)
206+
metrics["avail_spare"].labels(ctrl_dev).set(smart_log["avail_spare"] / 100)
207+
metrics["spare_thresh"].labels(ctrl_dev).set(smart_log["spare_thresh"] / 100)
208+
metrics["percent_used"].labels(ctrl_dev).set(smart_log["percent_used"] / 100)
209+
metrics["critical_warning"].labels(ctrl_dev).set(
210+
smart_log["critical_warning"]["value"]
211+
)
212+
metrics["media_errors"].labels(ctrl_dev).inc(int(smart_log["media_errors"]))
213+
metrics["num_err_log_entries"].labels(ctrl_dev).inc(
214+
int(smart_log["num_err_log_entries"])
215+
)
216+
metrics["power_cycles"].labels(ctrl_dev).inc(int(smart_log["power_cycles"]))
217+
metrics["power_on_hours"].labels(ctrl_dev).inc(int(smart_log["power_on_hours"]))
218+
metrics["controller_busy_time"].labels(ctrl_dev).inc(
219+
int(smart_log["controller_busy_time"])
220+
)
221+
metrics["unsafe_shutdowns"].labels(ctrl_dev).inc(int(smart_log["unsafe_shutdowns"]))
222+
223+
# NVMe reports temperature in kelvins; convert it to degrees Celsius.
224+
metrics["temperature"].labels(ctrl_dev).set(smart_log["temperature"] - 273)
226225

227226

228227
if __name__ == "__main__":

0 commit comments

Comments
 (0)