74
74
"Device error log entry count" ,
75
75
["device" ], namespace = namespace , registry = registry ,
76
76
),
77
+ # FIXME: The "nvmecli" metric ought to be an Info type, not a Gauge. However, making this change
78
+ # will result in the metric having a "_info" suffix automatically appended, which is arguably
79
+ # a breaking change.
77
80
"nvmecli" : Gauge (
78
81
"nvmecli" ,
79
82
"nvme-cli tool information" ,
@@ -142,7 +145,11 @@ def exec_nvme_json(*args):
142
145
"""
143
146
Execute nvme CLI tool with specified arguments and return parsed JSON output.
144
147
"""
145
- output = exec_nvme (* args , "--output-format" , "json" )
148
+ # Note: nvme-cli v2.11 effectively introduced a breaking change by forcing JSON output to always
149
+ # be verbose. Older versions of nvme-cli optionally produced verbose output if the --verbose
150
+ # flag was specified. In order to avoid having to handle two different JSON schemas, always
151
+ # add the --verbose flag.
152
+ output = exec_nvme (* args , "--output-format" , "json" , "--verbose" )
146
153
return json .loads (output )
147
154
148
155
@@ -157,49 +164,70 @@ def main():
157
164
device_list = exec_nvme_json ("list" )
158
165
159
166
for device in device_list ["Devices" ]:
160
- device_path = device ["DevicePath" ]
161
- device_name = os .path .basename (device_path )
162
-
163
- metrics ["device_info" ].labels (
164
- device_name ,
165
- device ["ModelNumber" ],
166
- device ["Firmware" ],
167
- device ["SerialNumber" ].strip (),
168
- )
169
-
170
- metrics ["sector_size" ].labels (device_name ).set (device ["SectorSize" ])
171
- metrics ["physical_size" ].labels (device_name ).set (device ["PhysicalSize" ])
172
- metrics ["used_bytes" ].labels (device_name ).set (device ["UsedBytes" ])
173
-
174
- smart_log = exec_nvme_json ("smart-log" , device_path )
175
-
176
- # Various counters in the NVMe specification are 128-bit, which would have to discard
177
- # resolution if converted to a JSON number (i.e., float64_t). Instead, nvme-cli marshals
178
- # them as strings. As such, they need to be explicitly cast to int or float when using them
179
- # in Counter metrics.
180
- metrics ["data_units_read" ].labels (device_name ).inc (int (smart_log ["data_units_read" ]))
181
- metrics ["data_units_written" ].labels (device_name ).inc (int (smart_log ["data_units_written" ]))
182
- metrics ["host_read_commands" ].labels (device_name ).inc (int (smart_log ["host_read_commands" ]))
183
- metrics ["host_write_commands" ].labels (device_name ).inc (
184
- int (smart_log ["host_write_commands" ])
185
- )
186
- metrics ["avail_spare" ].labels (device_name ).set (smart_log ["avail_spare" ] / 100 )
187
- metrics ["spare_thresh" ].labels (device_name ).set (smart_log ["spare_thresh" ] / 100 )
188
- metrics ["percent_used" ].labels (device_name ).set (smart_log ["percent_used" ] / 100 )
189
- metrics ["critical_warning" ].labels (device_name ).set (smart_log ["critical_warning" ])
190
- metrics ["media_errors" ].labels (device_name ).inc (int (smart_log ["media_errors" ]))
191
- metrics ["num_err_log_entries" ].labels (device_name ).inc (
192
- int (smart_log ["num_err_log_entries" ])
193
- )
194
- metrics ["power_cycles" ].labels (device_name ).inc (int (smart_log ["power_cycles" ]))
195
- metrics ["power_on_hours" ].labels (device_name ).inc (int (smart_log ["power_on_hours" ]))
196
- metrics ["controller_busy_time" ].labels (device_name ).inc (
197
- int (smart_log ["controller_busy_time" ])
198
- )
199
- metrics ["unsafe_shutdowns" ].labels (device_name ).inc (int (smart_log ["unsafe_shutdowns" ]))
200
-
201
- # NVMe reports temperature in kelvins; convert it to degrees Celsius.
202
- metrics ["temperature" ].labels (device_name ).set (smart_log ["temperature" ] - 273 )
167
+ for subsys in device ["Subsystems" ]:
168
+ for ctrl in subsys ["Controllers" ]:
169
+ for ns in ctrl ["Namespaces" ]:
170
+ device_name = ns ["NameSpace" ]
171
+
172
+ # FIXME: This metric ought to be refactored into a "controller_info" metric,
173
+ # since it contains information that is not unique to the namespace. However,
174
+ # previous versions of this collector erroneously referred to namespaces, e.g.
175
+ # "nvme0n1", as devices, so preserve the former behaviour for now.
176
+ metrics ["device_info" ].labels (
177
+ device_name ,
178
+ ctrl ["ModelNumber" ],
179
+ ctrl ["Firmware" ],
180
+ ctrl ["SerialNumber" ].strip (),
181
+ )
182
+
183
+ metrics ["sector_size" ].labels (device_name ).set (ns ["SectorSize" ])
184
+ metrics ["physical_size" ].labels (device_name ).set (ns ["PhysicalSize" ])
185
+ metrics ["used_bytes" ].labels (device_name ).set (ns ["UsedBytes" ])
186
+
187
+ # FIXME: The smart-log should only need to be fetched once per controller, not
188
+ # per namespace. However, in order to preserve legacy metric labels, fetch it
189
+ # per namespace anyway. Most consumer-grade SSDs will only have one namespace.
190
+ smart_log = exec_nvme_json ("smart-log" , os .path .join ("/dev" , device_name ))
191
+
192
+ # Various counters in the NVMe specification are 128-bit, which would have to
193
+ # discard resolution if converted to a JSON number (i.e., float64_t). Instead,
194
+ # nvme-cli marshals them as strings. As such, they need to be explicitly cast
195
+ # to int or float when using them in Counter metrics.
196
+ metrics ["data_units_read" ].labels (device_name ).inc (
197
+ int (smart_log ["data_units_read" ])
198
+ )
199
+ metrics ["data_units_written" ].labels (device_name ).inc (
200
+ int (smart_log ["data_units_written" ])
201
+ )
202
+ metrics ["host_read_commands" ].labels (device_name ).inc (
203
+ int (smart_log ["host_read_commands" ])
204
+ )
205
+ metrics ["host_write_commands" ].labels (device_name ).inc (
206
+ int (smart_log ["host_write_commands" ])
207
+ )
208
+ metrics ["avail_spare" ].labels (device_name ).set (smart_log ["avail_spare" ] / 100 )
209
+ metrics ["spare_thresh" ].labels (device_name ).set (smart_log ["spare_thresh" ] / 100 )
210
+ metrics ["percent_used" ].labels (device_name ).set (smart_log ["percent_used" ] / 100 )
211
+ metrics ["critical_warning" ].labels (device_name ).set (
212
+ smart_log ["critical_warning" ]["value" ]
213
+ )
214
+ metrics ["media_errors" ].labels (device_name ).inc (int (smart_log ["media_errors" ]))
215
+ metrics ["num_err_log_entries" ].labels (device_name ).inc (
216
+ int (smart_log ["num_err_log_entries" ])
217
+ )
218
+ metrics ["power_cycles" ].labels (device_name ).inc (int (smart_log ["power_cycles" ]))
219
+ metrics ["power_on_hours" ].labels (device_name ).inc (
220
+ int (smart_log ["power_on_hours" ])
221
+ )
222
+ metrics ["controller_busy_time" ].labels (device_name ).inc (
223
+ int (smart_log ["controller_busy_time" ])
224
+ )
225
+ metrics ["unsafe_shutdowns" ].labels (device_name ).inc (
226
+ int (smart_log ["unsafe_shutdowns" ])
227
+ )
228
+
229
+ # NVMe reports temperature in kelvins; convert it to degrees Celsius.
230
+ metrics ["temperature" ].labels (device_name ).set (smart_log ["temperature" ] - 273 )
203
231
204
232
205
233
if __name__ == "__main__" :
0 commit comments