24
24
25
25
metrics = {
26
26
# fmt: off
27
+ "nvmecli" : Info (
28
+ "nvmecli" ,
29
+ "nvme-cli tool information" ,
30
+ ["version" ], namespace = namespace , registry = registry ,
31
+ ),
32
+
33
+ # Controller-specific (e.g. "nvme0") metrics
27
34
"avail_spare" : Gauge (
28
35
"available_spare_ratio" ,
29
36
"Device available spare ratio" ,
30
- ["device " ], namespace = namespace , registry = registry ,
37
+ ["controller " ], namespace = namespace , registry = registry ,
31
38
),
32
39
"controller_busy_time" : Counter (
33
40
"controller_busy_time_seconds" ,
34
41
"Device controller busy time in seconds" ,
35
- ["device " ], namespace = namespace , registry = registry ,
42
+ ["controller " ], namespace = namespace , registry = registry ,
36
43
),
37
44
"controller_info" : Info (
38
45
"controller" ,
43
50
"critical_warning" : Gauge (
44
51
"critical_warning" ,
45
52
"Device critical warning bitmap field" ,
46
- ["device " ], namespace = namespace , registry = registry ,
53
+ ["controller " ], namespace = namespace , registry = registry ,
47
54
),
48
55
"data_units_read" : Counter (
49
56
"data_units_read_total" ,
50
57
"Number of 512-byte data units read by host, reported in thousands" ,
51
- ["device " ], namespace = namespace , registry = registry ,
58
+ ["controller " ], namespace = namespace , registry = registry ,
52
59
),
53
60
"data_units_written" : Counter (
54
61
"data_units_written_total" ,
55
62
"Number of 512-byte data units written by host, reported in thousands" ,
56
- ["device " ], namespace = namespace , registry = registry ,
63
+ ["controller " ], namespace = namespace , registry = registry ,
57
64
),
58
65
"host_read_commands" : Counter (
59
66
"host_read_commands_total" ,
60
67
"Device read commands from host" ,
61
- ["device " ], namespace = namespace , registry = registry ,
68
+ ["controller " ], namespace = namespace , registry = registry ,
62
69
),
63
70
"host_write_commands" : Counter (
64
71
"host_write_commands_total" ,
65
72
"Device write commands from host" ,
66
- ["device " ], namespace = namespace , registry = registry ,
73
+ ["controller " ], namespace = namespace , registry = registry ,
67
74
),
68
75
"media_errors" : Counter (
69
76
"media_errors_total" ,
70
77
"Device media errors total" ,
71
- ["device " ], namespace = namespace , registry = registry ,
78
+ ["controller " ], namespace = namespace , registry = registry ,
72
79
),
73
80
"num_err_log_entries" : Counter (
74
81
"num_err_log_entries_total" ,
75
82
"Device error log entry count" ,
76
- ["device" ], namespace = namespace , registry = registry ,
77
- ),
78
- "nvmecli" : Info (
79
- "nvmecli" ,
80
- "nvme-cli tool information" ,
81
- ["version" ], namespace = namespace , registry = registry ,
83
+ ["controller" ], namespace = namespace , registry = registry ,
82
84
),
83
85
"percent_used" : Gauge (
84
86
"percentage_used_ratio" ,
85
87
"Device percentage used ratio" ,
86
- ["device" ], namespace = namespace , registry = registry ,
87
- ),
88
- "physical_size" : Gauge (
89
- "physical_size_bytes" ,
90
- "Device size in bytes" ,
91
- ["device" ], namespace = namespace , registry = registry ,
88
+ ["controller" ], namespace = namespace , registry = registry ,
92
89
),
93
90
"power_cycles" : Counter (
94
91
"power_cycles_total" ,
95
92
"Device number of power cycles" ,
96
- ["device " ], namespace = namespace , registry = registry ,
93
+ ["controller " ], namespace = namespace , registry = registry ,
97
94
),
98
95
"power_on_hours" : Counter (
99
96
"power_on_hours_total" ,
100
97
"Device power-on hours" ,
101
- ["device" ], namespace = namespace , registry = registry ,
102
- ),
103
- "sector_size" : Gauge (
104
- "sector_size_bytes" ,
105
- "Device sector size in bytes" ,
106
- ["device" ], namespace = namespace , registry = registry ,
98
+ ["controller" ], namespace = namespace , registry = registry ,
107
99
),
108
100
"spare_thresh" : Gauge (
109
101
"available_spare_threshold_ratio" ,
110
102
"Device available spare threshold ratio" ,
111
- ["device " ], namespace = namespace , registry = registry ,
103
+ ["controller " ], namespace = namespace , registry = registry ,
112
104
),
113
105
"temperature" : Gauge (
114
106
"temperature_celsius" ,
115
107
"Device temperature in degrees Celsius" ,
116
- ["device " ], namespace = namespace , registry = registry ,
108
+ ["controller " ], namespace = namespace , registry = registry ,
117
109
),
118
110
"unsafe_shutdowns" : Counter (
119
111
"unsafe_shutdowns_total" ,
120
112
"Device number of unsafe shutdowns" ,
113
+ ["controller" ], namespace = namespace , registry = registry ,
114
+ ),
115
+
116
+ # Namespace-specific (e.g. "nvme0n1") metrics
117
+ "physical_size" : Gauge (
118
+ "physical_size_bytes" ,
119
+ "Device size in bytes" ,
120
+ ["device" ], namespace = namespace , registry = registry ,
121
+ ),
122
+ "sector_size" : Gauge (
123
+ "sector_size_bytes" ,
124
+ "Device sector size in bytes" ,
121
125
["device" ], namespace = namespace , registry = registry ,
122
126
),
123
127
"used_bytes" : Gauge (
@@ -164,8 +168,10 @@ def main():
164
168
for device in device_list ["Devices" ]:
165
169
for subsys in device ["Subsystems" ]:
166
170
for ctrl in subsys ["Controllers" ]:
171
+ ctrl_dev = ctrl ["Controller" ]
172
+
167
173
metrics ["controller_info" ].labels (
168
- ctrl [ "Controller" ] ,
174
+ ctrl_dev ,
169
175
ctrl ["ModelNumber" ],
170
176
ctrl ["Firmware" ],
171
177
ctrl ["SerialNumber" ].strip (),
@@ -179,50 +185,43 @@ def main():
179
185
metrics ["physical_size" ].labels (device_name ).set (ns ["PhysicalSize" ])
180
186
metrics ["used_bytes" ].labels (device_name ).set (ns ["UsedBytes" ])
181
187
182
- # FIXME: The smart-log should only need to be fetched once per controller, not
183
- # per namespace. However, in order to preserve legacy metric labels, fetch it
184
- # per namespace anyway. Most consumer grade SSDs will only have one namespace.
185
- smart_log = exec_nvme_json ("smart-log" , os .path .join ("/dev" , device_name ))
186
-
187
- # Various counters in the NVMe specification are 128-bit, which would have to
188
- # discard resolution if converted to a JSON number (i.e., float64_t). Instead,
189
- # nvme-cli marshals them as strings. As such, they need to be explicitly cast
190
- # to int or float when using them in Counter metrics.
191
- metrics ["data_units_read" ].labels (device_name ).inc (
192
- int (smart_log ["data_units_read" ])
193
- )
194
- metrics ["data_units_written" ].labels (device_name ).inc (
195
- int (smart_log ["data_units_written" ])
196
- )
197
- metrics ["host_read_commands" ].labels (device_name ).inc (
198
- int (smart_log ["host_read_commands" ])
199
- )
200
- metrics ["host_write_commands" ].labels (device_name ).inc (
201
- int (smart_log ["host_write_commands" ])
202
- )
203
- metrics ["avail_spare" ].labels (device_name ).set (smart_log ["avail_spare" ] / 100 )
204
- metrics ["spare_thresh" ].labels (device_name ).set (smart_log ["spare_thresh" ] / 100 )
205
- metrics ["percent_used" ].labels (device_name ).set (smart_log ["percent_used" ] / 100 )
206
- metrics ["critical_warning" ].labels (device_name ).set (
207
- smart_log ["critical_warning" ]["value" ]
208
- )
209
- metrics ["media_errors" ].labels (device_name ).inc (int (smart_log ["media_errors" ]))
210
- metrics ["num_err_log_entries" ].labels (device_name ).inc (
211
- int (smart_log ["num_err_log_entries" ])
212
- )
213
- metrics ["power_cycles" ].labels (device_name ).inc (int (smart_log ["power_cycles" ]))
214
- metrics ["power_on_hours" ].labels (device_name ).inc (
215
- int (smart_log ["power_on_hours" ])
216
- )
217
- metrics ["controller_busy_time" ].labels (device_name ).inc (
218
- int (smart_log ["controller_busy_time" ])
219
- )
220
- metrics ["unsafe_shutdowns" ].labels (device_name ).inc (
221
- int (smart_log ["unsafe_shutdowns" ])
222
- )
223
-
224
- # NVMe reports temperature in kelvins; convert it to degrees Celsius.
225
- metrics ["temperature" ].labels (device_name ).set (smart_log ["temperature" ] - 273 )
188
+ # Most SSDs (perhaps _all_ consumer grade SSDs) only contain a single namespace.
189
+ # Fetch the device global SMART log by omitting any --namespace-id flag.
190
+ smart_log = exec_nvme_json ("smart-log" , os .path .join ("/dev" , ctrl ["Controller" ]))
191
+
192
+ # Various counters in the NVMe specification are 128-bit, which would have to
193
+ # discard resolution if converted to a JSON number (i.e., float64_t). Instead,
194
+ # nvme-cli marshals them as strings. As such, they need to be explicitly cast to int
195
+ # or float when using them in Counter metrics.
196
+ metrics ["data_units_read" ].labels (ctrl_dev ).inc (int (smart_log ["data_units_read" ]))
197
+ metrics ["data_units_written" ].labels (ctrl_dev ).inc (
198
+ int (smart_log ["data_units_written" ])
199
+ )
200
+ metrics ["host_read_commands" ].labels (ctrl_dev ).inc (
201
+ int (smart_log ["host_read_commands" ])
202
+ )
203
+ metrics ["host_write_commands" ].labels (ctrl_dev ).inc (
204
+ int (smart_log ["host_write_commands" ])
205
+ )
206
+ metrics ["avail_spare" ].labels (ctrl_dev ).set (smart_log ["avail_spare" ] / 100 )
207
+ metrics ["spare_thresh" ].labels (ctrl_dev ).set (smart_log ["spare_thresh" ] / 100 )
208
+ metrics ["percent_used" ].labels (ctrl_dev ).set (smart_log ["percent_used" ] / 100 )
209
+ metrics ["critical_warning" ].labels (ctrl_dev ).set (
210
+ smart_log ["critical_warning" ]["value" ]
211
+ )
212
+ metrics ["media_errors" ].labels (ctrl_dev ).inc (int (smart_log ["media_errors" ]))
213
+ metrics ["num_err_log_entries" ].labels (ctrl_dev ).inc (
214
+ int (smart_log ["num_err_log_entries" ])
215
+ )
216
+ metrics ["power_cycles" ].labels (ctrl_dev ).inc (int (smart_log ["power_cycles" ]))
217
+ metrics ["power_on_hours" ].labels (ctrl_dev ).inc (int (smart_log ["power_on_hours" ]))
218
+ metrics ["controller_busy_time" ].labels (ctrl_dev ).inc (
219
+ int (smart_log ["controller_busy_time" ])
220
+ )
221
+ metrics ["unsafe_shutdowns" ].labels (ctrl_dev ).inc (int (smart_log ["unsafe_shutdowns" ]))
222
+
223
+ # NVMe reports temperature in kelvins; convert it to degrees Celsius.
224
+ metrics ["temperature" ].labels (ctrl_dev ).set (smart_log ["temperature" ] - 273 )
226
225
227
226
228
227
if __name__ == "__main__" :
0 commit comments