|
8 | 8 | "panels": [ |
9 | 9 | { |
10 | 10 | "title": "Consensus Round Duration", |
| 11 | + "description": "p95 and p50 duration of consensus accept rounds. The consensus.accept span (RCLConsensus.cpp:395) measures the time to process an accepted ledger including transaction application and state finalization. The span carries xrpl.consensus.proposers and xrpl.consensus.round_time_ms attributes. Normal range on mainnet is 3-6 seconds (3000-6000 ms on this panel's axis).", |
11 | 12 | "type": "timeseries", |
12 | 13 | "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, |
| 14 | + "options": { |
| 15 | + "tooltip": { "mode": "multi", "sort": "desc" } |
| 16 | + }, |
13 | 17 | "targets": [ |
14 | 18 | { |
15 | 19 | "datasource": { "type": "prometheus" }, |
16 | 20 | "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])))", |
17 | | - "legendFormat": "p95 round duration" |
| 21 | + "legendFormat": "P95 Round Duration" |
18 | 22 | }, |
19 | 23 | { |
20 | 24 | "datasource": { "type": "prometheus" }, |
21 | 25 | "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])))", |
22 | | - "legendFormat": "p50 round duration" |
| 26 | + "legendFormat": "P50 Round Duration" |
23 | 27 | } |
24 | 28 | ], |
25 | 29 | "fieldConfig": { |
26 | 30 | "defaults": { |
27 | | - "unit": "ms" |
| 31 | + "unit": "ms", |
| 32 | + "custom": { |
| 33 | + "axisLabel": "Duration (ms)", |
| 34 | + "spanNulls": true, |
| 35 | + "insertNulls": false, |
| 36 | + "showPoints": "auto", |
| 37 | + "pointSize": 3 |
| 38 | + } |
28 | 39 | }, |
29 | 40 | "overrides": [] |
30 | 41 | } |
31 | 42 | }, |
32 | 43 | { |
33 | 44 | "title": "Consensus Proposals Sent Rate", |
| 45 | + "description": "Rate at which this node sends consensus proposals to the network. Sourced from the consensus.proposal.send span (RCLConsensus.cpp:177) which fires each time the node proposes a transaction set. The span carries xrpl.consensus.round identifying the consensus round number. A healthy proposing node should show steady proposal output.", |
34 | 46 | "type": "timeseries", |
35 | 47 | "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, |
| 48 | + "options": { |
| 49 | + "tooltip": { "mode": "multi", "sort": "desc" } |
| 50 | + }, |
36 | 51 | "targets": [ |
37 | 52 | { |
38 | 53 | "datasource": { "type": "prometheus" }, |
39 | 54 | "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.proposal.send\"}[5m]))", |
40 | | - "legendFormat": "proposals/sec" |
| 55 | + "legendFormat": "Proposals / Sec" |
41 | 56 | } |
42 | 57 | ], |
43 | 58 | "fieldConfig": { |
44 | 59 | "defaults": { |
45 | | - "unit": "ops" |
| 60 | + "unit": "ops", |
| 61 | + "custom": { |
| 62 | + "axisLabel": "Proposals / Sec", |
| 63 | + "spanNulls": true, |
| 64 | + "insertNulls": false, |
| 65 | + "showPoints": "auto", |
| 66 | + "pointSize": 3 |
| 67 | + } |
46 | 68 | }, |
47 | 69 | "overrides": [] |
48 | 70 | } |
49 | 71 | }, |
50 | 72 | { |
51 | 73 | "title": "Ledger Close Duration", |
| 74 | + "description": "p95 duration of the ledger close event. The consensus.ledger_close span (RCLConsensus.cpp:282) measures the time from when consensus triggers a ledger close to completion. The span carries the xrpl.consensus.ledger.seq and xrpl.consensus.mode attributes. Compare with Consensus Round Duration to understand how close timing relates to overall round time.", |
52 | 75 | "type": "timeseries", |
53 | 76 | "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, |
| 77 | + "options": { |
| 78 | + "tooltip": { "mode": "multi", "sort": "desc" } |
| 79 | + }, |
54 | 80 | "targets": [ |
55 | 81 | { |
56 | 82 | "datasource": { "type": "prometheus" }, |
57 | 83 | "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.ledger_close\"}[5m])))", |
58 | | - "legendFormat": "p95 close duration" |
| 84 | + "legendFormat": "P95 Close Duration" |
59 | 85 | } |
60 | 86 | ], |
61 | 87 | "fieldConfig": { |
62 | 88 | "defaults": { |
63 | | - "unit": "ms" |
| 89 | + "unit": "ms", |
| 90 | + "custom": { |
| 91 | + "axisLabel": "Duration (ms)", |
| 92 | + "spanNulls": true, |
| 93 | + "insertNulls": false, |
| 94 | + "showPoints": "auto", |
| 95 | + "pointSize": 3 |
| 96 | + } |
64 | 97 | }, |
65 | 98 | "overrides": [] |
66 | 99 | } |
67 | 100 | }, |
68 | 101 | { |
69 | 102 | "title": "Validation Send Rate", |
| 103 | + "description": "Rate at which this node sends ledger validations to the network. Sourced from the consensus.validation.send span (RCLConsensus.cpp:753). Each validation confirms the node has fully validated a ledger. The span carries the xrpl.consensus.ledger.seq and xrpl.consensus.proposing attributes. This rate should closely track the ledger close rate when the node is healthy.", |
70 | 104 | "type": "stat", |
71 | 105 | "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, |
| 106 | + "options": { |
| 107 | + "tooltip": { "mode": "multi", "sort": "desc" } |
| 108 | + }, |
72 | 109 | "targets": [ |
73 | 110 | { |
74 | 111 | "datasource": { "type": "prometheus" }, |
75 | 112 | "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.validation.send\"}[5m]))", |
76 | | - "legendFormat": "validations/sec" |
| 113 | + "legendFormat": "Validations / Sec" |
77 | 114 | } |
78 | 115 | ], |
79 | 116 | "fieldConfig": { |
|
82 | 119 | }, |
83 | 120 | "overrides": [] |
84 | 121 | } |
| 122 | + }, |
| 123 | + { |
| 124 | + "title": "Consensus Mode Over Time", |
| 125 | + "description": "Breakdown of consensus ledger close events by the node's consensus mode (proposing, observing, wrongLedger, switchedLedger). Grouped by the xrpl.consensus.mode span attribute from consensus.ledger_close. A healthy validator should be predominantly in 'proposing' mode. Frequent 'wrongLedger' or 'switchedLedger' events indicate sync issues.", |
| 126 | + "type": "timeseries", |
| 127 | + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, |
| 128 | + "options": { |
| 129 | + "tooltip": { "mode": "multi", "sort": "desc" } |
| 130 | + }, |
| 131 | + "targets": [ |
| 132 | + { |
| 133 | + "datasource": { "type": "prometheus" }, |
| 134 | + "expr": "sum by (xrpl_consensus_mode) (rate(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}[5m]))", |
| 135 | + "legendFormat": "{{xrpl_consensus_mode}}" |
| 136 | + } |
| 137 | + ], |
| 138 | + "fieldConfig": { |
| 139 | + "defaults": { |
| 140 | + "unit": "ops", |
| 141 | + "custom": { |
| 142 | + "axisLabel": "Events / Sec", |
| 143 | + "spanNulls": true, |
| 144 | + "insertNulls": false, |
| 145 | + "showPoints": "auto", |
| 146 | + "pointSize": 3 |
| 147 | + } |
| 148 | + }, |
| 149 | + "overrides": [] |
| 150 | + } |
| 151 | + }, |
| 152 | + { |
| 153 | + "title": "Accept vs Close Rate", |
| 154 | + "description": "Compares the rate of consensus.accept (ledger accepted after consensus) vs consensus.ledger_close (ledger close initiated). These should track closely in a healthy network. A divergence means some close events are not completing the accept phase, potentially indicating consensus failures or timeouts.", |
| 155 | + "type": "timeseries", |
| 156 | + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, |
| 157 | + "options": { |
| 158 | + "tooltip": { "mode": "multi", "sort": "desc" } |
| 159 | + }, |
| 160 | + "targets": [ |
| 161 | + { |
| 162 | + "datasource": { "type": "prometheus" }, |
| 163 | + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.accept\"}[5m]))", |
| 164 | + "legendFormat": "Accepts / Sec" |
| 165 | + }, |
| 166 | + { |
| 167 | + "datasource": { "type": "prometheus" }, |
| 168 | + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}[5m]))", |
| 169 | + "legendFormat": "Closes / Sec" |
| 170 | + } |
| 171 | + ], |
| 172 | + "fieldConfig": { |
| 173 | + "defaults": { |
| 174 | + "unit": "ops", |
| 175 | + "custom": { |
| 176 | + "axisLabel": "Events / Sec", |
| 177 | + "spanNulls": true, |
| 178 | + "insertNulls": false, |
| 179 | + "showPoints": "auto", |
| 180 | + "pointSize": 3 |
| 181 | + } |
| 182 | + }, |
| 183 | + "overrides": [] |
| 184 | + } |
| 185 | + }, |
| 186 | + { |
| 187 | + "title": "Validation vs Close Rate", |
| 188 | + "description": "Compares the rate of consensus.validation.send vs consensus.ledger_close. Each validated ledger should produce one validation message. If validations lag behind closes, the node may be falling behind on validation or experiencing issues with the validation pipeline.", |
| 189 | + "type": "timeseries", |
| 190 | + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, |
| 191 | + "options": { |
| 192 | + "tooltip": { "mode": "multi", "sort": "desc" } |
| 193 | + }, |
| 194 | + "targets": [ |
| 195 | + { |
| 196 | + "datasource": { "type": "prometheus" }, |
| 197 | + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.validation.send\"}[5m]))", |
| 198 | + "legendFormat": "Validations / Sec" |
| 199 | + }, |
| 200 | + { |
| 201 | + "datasource": { "type": "prometheus" }, |
| 202 | + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}[5m]))", |
| 203 | + "legendFormat": "Closes / Sec" |
| 204 | + } |
| 205 | + ], |
| 206 | + "fieldConfig": { |
| 207 | + "defaults": { |
| 208 | + "unit": "ops", |
| 209 | + "custom": { |
| 210 | + "axisLabel": "Events / Sec", |
| 211 | + "spanNulls": true, |
| 212 | + "insertNulls": false, |
| 213 | + "showPoints": "auto", |
| 214 | + "pointSize": 3 |
| 215 | + } |
| 216 | + }, |
| 217 | + "overrides": [] |
| 218 | + } |
| 219 | + }, |
| 220 | + { |
| 221 | + "title": "Consensus Accept Duration Heatmap", |
| 222 | + "description": "Heatmap showing the distribution of consensus.accept span durations across histogram buckets over time. Each cell represents how many accept events fell into that duration bucket in a 5m window. Useful for detecting outlier consensus rounds that take abnormally long.", |
| 223 | + "type": "heatmap", |
| 224 | + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, |
| 225 | + "options": { |
| 226 | + "tooltip": { "mode": "multi", "sort": "desc" }, |
| 227 | + "yAxis": { "axisLabel": "Duration (ms)" } |
| 228 | + }, |
| 229 | + "targets": [ |
| 230 | + { |
| 231 | + "datasource": { "type": "prometheus" }, |
| 232 | + "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])) by (le)", |
| 233 | + "legendFormat": "{{le}}", |
| 234 | + "format": "heatmap" |
| 235 | + } |
| 236 | + ] |
85 | 237 | } |
86 | 238 | ], |
87 | 239 | "schemaVersion": 39, |
|
0 commit comments