|
1 | 1 | { |
2 | | - "annotations": { "list": [] }, |
| 2 | + "annotations": { |
| 3 | + "list": [] |
| 4 | + }, |
3 | 5 | "description": "Node health metrics from beast::insight StatsD. Requires [insight] server=statsd in rippled config.", |
4 | 6 | "editable": true, |
5 | 7 | "fiscalYearStartMonth": 0, |
|
11 | 13 | "title": "Validated Ledger Age", |
12 | 14 | "description": "Age of the most recently validated ledger in seconds. Sourced from the LedgerMaster.Validated_Ledger_Age gauge (LedgerMaster.h:373) which is updated every collection interval via the insight hook. Values above 20s indicate the node is falling behind the network.", |
13 | 15 | "type": "stat", |
14 | | - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, |
| 16 | + "gridPos": { |
| 17 | + "h": 8, |
| 18 | + "w": 12, |
| 19 | + "x": 0, |
| 20 | + "y": 0 |
| 21 | + }, |
15 | 22 | "options": { |
16 | | - "tooltip": { "mode": "multi", "sort": "desc" } |
| 23 | + "tooltip": { |
| 24 | + "mode": "multi", |
| 25 | + "sort": "desc" |
| 26 | + } |
17 | 27 | }, |
18 | 28 | "targets": [ |
19 | 29 | { |
20 | | - "datasource": { "type": "prometheus" }, |
| 30 | + "datasource": { |
| 31 | + "type": "prometheus" |
| 32 | + }, |
21 | 33 | "expr": "rippled_LedgerMaster_Validated_Ledger_Age", |
22 | 34 | "legendFormat": "Validated Age" |
23 | 35 | } |
|
27 | 39 | "unit": "s", |
28 | 40 | "thresholds": { |
29 | 41 | "steps": [ |
30 | | - { "color": "green", "value": null }, |
31 | | - { "color": "yellow", "value": 10 }, |
32 | | - { "color": "red", "value": 20 } |
| 42 | + { |
| 43 | + "color": "green", |
| 44 | + "value": null |
| 45 | + }, |
| 46 | + { |
| 47 | + "color": "yellow", |
| 48 | + "value": 10 |
| 49 | + }, |
| 50 | + { |
| 51 | + "color": "red", |
| 52 | + "value": 20 |
| 53 | + } |
33 | 54 | ] |
34 | 55 | } |
35 | 56 | }, |
|
40 | 61 | "title": "Published Ledger Age", |
41 | 62 | "description": "Age of the most recently published ledger in seconds. Sourced from the LedgerMaster.Published_Ledger_Age gauge (LedgerMaster.h:374). Published ledger age should track close to validated ledger age. A growing gap indicates publish pipeline backlog.", |
42 | 63 | "type": "stat", |
43 | | - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, |
| 64 | + "gridPos": { |
| 65 | + "h": 8, |
| 66 | + "w": 12, |
| 67 | + "x": 12, |
| 68 | + "y": 0 |
| 69 | + }, |
44 | 70 | "options": { |
45 | | - "tooltip": { "mode": "multi", "sort": "desc" } |
| 71 | + "tooltip": { |
| 72 | + "mode": "multi", |
| 73 | + "sort": "desc" |
| 74 | + } |
46 | 75 | }, |
47 | 76 | "targets": [ |
48 | 77 | { |
49 | | - "datasource": { "type": "prometheus" }, |
| 78 | + "datasource": { |
| 79 | + "type": "prometheus" |
| 80 | + }, |
50 | 81 | "expr": "rippled_LedgerMaster_Published_Ledger_Age", |
51 | 82 | "legendFormat": "Published Age" |
52 | 83 | } |
|
56 | 87 | "unit": "s", |
57 | 88 | "thresholds": { |
58 | 89 | "steps": [ |
59 | | - { "color": "green", "value": null }, |
60 | | - { "color": "yellow", "value": 10 }, |
61 | | - { "color": "red", "value": 20 } |
| 90 | + { |
| 91 | + "color": "green", |
| 92 | + "value": null |
| 93 | + }, |
| 94 | + { |
| 95 | + "color": "yellow", |
| 96 | + "value": 10 |
| 97 | + }, |
| 98 | + { |
| 99 | + "color": "red", |
| 100 | + "value": 20 |
| 101 | + } |
62 | 102 | ] |
63 | 103 | } |
64 | 104 | }, |
|
69 | 109 | "title": "Operating Mode Duration", |
70 | 110 | "description": "Cumulative time spent in each operating mode (Disconnected, Connected, Syncing, Tracking, Full). Sourced from State_Accounting.*_duration gauges (NetworkOPs.cpp:774-778). A healthy node should spend the vast majority of time in Full mode.", |
71 | 111 | "type": "timeseries", |
72 | | - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, |
| 112 | + "gridPos": { |
| 113 | + "h": 8, |
| 114 | + "w": 12, |
| 115 | + "x": 0, |
| 116 | + "y": 8 |
| 117 | + }, |
73 | 118 | "options": { |
74 | | - "tooltip": { "mode": "multi", "sort": "desc" } |
| 119 | + "tooltip": { |
| 120 | + "mode": "multi", |
| 121 | + "sort": "desc" |
| 122 | + } |
75 | 123 | }, |
76 | 124 | "targets": [ |
77 | 125 | { |
78 | | - "datasource": { "type": "prometheus" }, |
| 126 | + "datasource": { |
| 127 | + "type": "prometheus" |
| 128 | + }, |
79 | 129 | "expr": "rippled_State_Accounting_Full_duration", |
80 | 130 | "legendFormat": "Full" |
81 | 131 | }, |
82 | 132 | { |
83 | | - "datasource": { "type": "prometheus" }, |
| 133 | + "datasource": { |
| 134 | + "type": "prometheus" |
| 135 | + }, |
84 | 136 | "expr": "rippled_State_Accounting_Tracking_duration", |
85 | 137 | "legendFormat": "Tracking" |
86 | 138 | }, |
87 | 139 | { |
88 | | - "datasource": { "type": "prometheus" }, |
| 140 | + "datasource": { |
| 141 | + "type": "prometheus" |
| 142 | + }, |
89 | 143 | "expr": "rippled_State_Accounting_Syncing_duration", |
90 | 144 | "legendFormat": "Syncing" |
91 | 145 | }, |
92 | 146 | { |
93 | | - "datasource": { "type": "prometheus" }, |
| 147 | + "datasource": { |
| 148 | + "type": "prometheus" |
| 149 | + }, |
94 | 150 | "expr": "rippled_State_Accounting_Connected_duration", |
95 | 151 | "legendFormat": "Connected" |
96 | 152 | }, |
97 | 153 | { |
98 | | - "datasource": { "type": "prometheus" }, |
| 154 | + "datasource": { |
| 155 | + "type": "prometheus" |
| 156 | + }, |
99 | 157 | "expr": "rippled_State_Accounting_Disconnected_duration", |
100 | 158 | "legendFormat": "Disconnected" |
101 | 159 | } |
|
118 | 176 | "title": "Operating Mode Transitions", |
119 | 177 | "description": "Count of transitions into each operating mode. Sourced from State_Accounting.*_transitions gauges (NetworkOPs.cpp:780-786). Frequent transitions out of Full mode indicate instability. Transitions to Disconnected or Syncing warrant investigation.", |
120 | 178 | "type": "timeseries", |
121 | | - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, |
| 179 | + "gridPos": { |
| 180 | + "h": 8, |
| 181 | + "w": 12, |
| 182 | + "x": 12, |
| 183 | + "y": 8 |
| 184 | + }, |
122 | 185 | "options": { |
123 | | - "tooltip": { "mode": "multi", "sort": "desc" } |
| 186 | + "tooltip": { |
| 187 | + "mode": "multi", |
| 188 | + "sort": "desc" |
| 189 | + } |
124 | 190 | }, |
125 | 191 | "targets": [ |
126 | 192 | { |
127 | | - "datasource": { "type": "prometheus" }, |
| 193 | + "datasource": { |
| 194 | + "type": "prometheus" |
| 195 | + }, |
128 | 196 | "expr": "rippled_State_Accounting_Full_transitions", |
129 | 197 | "legendFormat": "Full" |
130 | 198 | }, |
131 | 199 | { |
132 | | - "datasource": { "type": "prometheus" }, |
| 200 | + "datasource": { |
| 201 | + "type": "prometheus" |
| 202 | + }, |
133 | 203 | "expr": "rippled_State_Accounting_Tracking_transitions", |
134 | 204 | "legendFormat": "Tracking" |
135 | 205 | }, |
136 | 206 | { |
137 | | - "datasource": { "type": "prometheus" }, |
| 207 | + "datasource": { |
| 208 | + "type": "prometheus" |
| 209 | + }, |
138 | 210 | "expr": "rippled_State_Accounting_Syncing_transitions", |
139 | 211 | "legendFormat": "Syncing" |
140 | 212 | }, |
141 | 213 | { |
142 | | - "datasource": { "type": "prometheus" }, |
| 214 | + "datasource": { |
| 215 | + "type": "prometheus" |
| 216 | + }, |
143 | 217 | "expr": "rippled_State_Accounting_Connected_transitions", |
144 | 218 | "legendFormat": "Connected" |
145 | 219 | }, |
146 | 220 | { |
147 | | - "datasource": { "type": "prometheus" }, |
| 221 | + "datasource": { |
| 222 | + "type": "prometheus" |
| 223 | + }, |
148 | 224 | "expr": "rippled_State_Accounting_Disconnected_transitions", |
149 | 225 | "legendFormat": "Disconnected" |
150 | 226 | } |
|
167 | 243 | "title": "I/O Latency", |
168 | 244 | "description": "P95 and P50 of the I/O service loop latency in milliseconds. Sourced from the ios_latency event (Application.cpp:438) which measures how long it takes for the io_context to process a timer callback. Values above 10ms are logged; above 500ms trigger warnings. High values indicate thread pool saturation or blocking operations.", |
169 | 245 | "type": "timeseries", |
170 | | - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, |
| 246 | + "gridPos": { |
| 247 | + "h": 8, |
| 248 | + "w": 12, |
| 249 | + "x": 0, |
| 250 | + "y": 16 |
| 251 | + }, |
171 | 252 | "options": { |
172 | | - "tooltip": { "mode": "multi", "sort": "desc" } |
| 253 | + "tooltip": { |
| 254 | + "mode": "multi", |
| 255 | + "sort": "desc" |
| 256 | + } |
173 | 257 | }, |
174 | 258 | "targets": [ |
175 | 259 | { |
176 | | - "datasource": { "type": "prometheus" }, |
177 | | - "expr": "histogram_quantile(0.95, sum by (le) (rippled_ios_latency_bucket))", |
| 260 | + "datasource": { |
| 261 | + "type": "prometheus" |
| 262 | + }, |
| 263 | + "expr": "rippled_ios_latency{quantile=\"0.95\"}", |
178 | 264 | "legendFormat": "P95 I/O Latency" |
179 | 265 | }, |
180 | 266 | { |
181 | | - "datasource": { "type": "prometheus" }, |
182 | | - "expr": "histogram_quantile(0.50, sum by (le) (rippled_ios_latency_bucket))", |
| 267 | + "datasource": { |
| 268 | + "type": "prometheus" |
| 269 | + }, |
| 270 | + "expr": "rippled_ios_latency{quantile=\"0.5\"}", |
183 | 271 | "legendFormat": "P50 I/O Latency" |
184 | 272 | } |
185 | 273 | ], |
|
199 | 287 | }, |
200 | 288 | { |
201 | 289 | "title": "Job Queue Depth", |
202 | | - "description": "Current number of jobs waiting in the job queue. Sourced from the job_count gauge (JobQueue.cpp:26). A sustained high value indicates the node cannot process work fast enough — common during ledger replay or heavy RPC load.", |
| 290 | + "description": "Current number of jobs waiting in the job queue. Sourced from the job_count gauge (JobQueue.cpp:26). A sustained high value indicates the node cannot process work fast enough \u2014 common during ledger replay or heavy RPC load.", |
203 | 291 | "type": "timeseries", |
204 | | - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, |
| 292 | + "gridPos": { |
| 293 | + "h": 8, |
| 294 | + "w": 12, |
| 295 | + "x": 12, |
| 296 | + "y": 16 |
| 297 | + }, |
205 | 298 | "options": { |
206 | | - "tooltip": { "mode": "multi", "sort": "desc" } |
| 299 | + "tooltip": { |
| 300 | + "mode": "multi", |
| 301 | + "sort": "desc" |
| 302 | + } |
207 | 303 | }, |
208 | 304 | "targets": [ |
209 | 305 | { |
210 | | - "datasource": { "type": "prometheus" }, |
| 306 | + "datasource": { |
| 307 | + "type": "prometheus" |
| 308 | + }, |
211 | 309 | "expr": "rippled_job_count", |
212 | 310 | "legendFormat": "Job Queue Depth" |
213 | 311 | } |
|
230 | 328 | "title": "Ledger Fetch Rate", |
231 | 329 | "description": "Rate of ledger fetch requests initiated by the node. Sourced from the ledger_fetches counter (InboundLedgers.cpp:44) which increments each time the node requests a ledger from a peer. High rates indicate the node is catching up or missing ledgers.", |
232 | 330 | "type": "stat", |
233 | | - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, |
| 331 | + "gridPos": { |
| 332 | + "h": 8, |
| 333 | + "w": 12, |
| 334 | + "x": 0, |
| 335 | + "y": 24 |
| 336 | + }, |
234 | 337 | "options": { |
235 | | - "tooltip": { "mode": "multi", "sort": "desc" } |
| 338 | + "tooltip": { |
| 339 | + "mode": "multi", |
| 340 | + "sort": "desc" |
| 341 | + } |
236 | 342 | }, |
237 | 343 | "targets": [ |
238 | 344 | { |
239 | | - "datasource": { "type": "prometheus" }, |
240 | | - "expr": "rate(rippled_ledger_fetches[5m])", |
| 345 | + "datasource": { |
| 346 | + "type": "prometheus" |
| 347 | + }, |
| 348 | + "expr": "rate(rippled_ledger_fetches_total[5m])", |
241 | 349 | "legendFormat": "Fetches / Sec" |
242 | 350 | } |
243 | 351 | ], |
|
252 | 360 | "title": "Ledger History Mismatches", |
253 | 361 | "description": "Rate of ledger history hash mismatches. Sourced from the ledger.history.mismatch counter (LedgerHistory.cpp:16) which increments when a built ledger hash does not match the expected validated hash. Non-zero values indicate consensus divergence or database corruption.", |
254 | 362 | "type": "stat", |
255 | | - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, |
| 363 | + "gridPos": { |
| 364 | + "h": 8, |
| 365 | + "w": 12, |
| 366 | + "x": 12, |
| 367 | + "y": 24 |
| 368 | + }, |
256 | 369 | "options": { |
257 | | - "tooltip": { "mode": "multi", "sort": "desc" } |
| 370 | + "tooltip": { |
| 371 | + "mode": "multi", |
| 372 | + "sort": "desc" |
| 373 | + } |
258 | 374 | }, |
259 | 375 | "targets": [ |
260 | 376 | { |
261 | | - "datasource": { "type": "prometheus" }, |
262 | | - "expr": "rate(rippled_ledger_history_mismatch[5m])", |
| 377 | + "datasource": { |
| 378 | + "type": "prometheus" |
| 379 | + }, |
| 380 | + "expr": "rate(rippled_ledger_history_mismatch_total[5m])", |
263 | 381 | "legendFormat": "Mismatches / Sec" |
264 | 382 | } |
265 | 383 | ], |
|
268 | 386 | "unit": "ops", |
269 | 387 | "thresholds": { |
270 | 388 | "steps": [ |
271 | | - { "color": "green", "value": null }, |
272 | | - { "color": "red", "value": 0.01 } |
| 389 | + { |
| 390 | + "color": "green", |
| 391 | + "value": null |
| 392 | + }, |
| 393 | + { |
| 394 | + "color": "red", |
| 395 | + "value": 0.01 |
| 396 | + } |
273 | 397 | ] |
274 | 398 | } |
275 | 399 | }, |
|
279 | 403 | ], |
280 | 404 | "schemaVersion": 39, |
281 | 405 | "tags": ["rippled", "statsd", "node-health", "telemetry"], |
282 | | - "templating": { "list": [] }, |
283 | | - "time": { "from": "now-1h", "to": "now" }, |
| 406 | + "templating": { |
| 407 | + "list": [] |
| 408 | + }, |
| 409 | + "time": { |
| 410 | + "from": "now-1h", |
| 411 | + "to": "now" |
| 412 | + }, |
284 | 413 | "title": "rippled Node Health (StatsD)", |
285 | 414 | "uid": "rippled-statsd-node-health" |
286 | 415 | } |
0 commit comments