Skip to content

Commit a4e86b4

Browse files
Fix StatsD metrics pipeline: summary observer, counter suffix, monotonic
Three issues found during live testing with the telemetry Docker stack:

1. Timer histograms produced only the +Inf bucket — switched from observer_type: histogram to observer_type: summary with explicit percentiles [0, 50, 90, 95, 99, 100]. Updated dashboard PromQL from histogram_quantile() to quantile label selectors.
2. Counter metrics (rpc_requests, ledger_fetches, ledger_history_mismatch) were invisible — Prometheus exports monotonic counters with a _total suffix. Updated dashboard queries to use the _total suffix.
3. is_monotonic_counter was false, causing counters to be dropped by the Prometheus exporter. Changed to true.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent aa1bbd3 commit a4e86b4

File tree

3 files changed

+386
-112
lines changed

3 files changed

+386
-112
lines changed

docker/telemetry/grafana/dashboards/statsd-node-health.json

Lines changed: 178 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
{
2-
"annotations": { "list": [] },
2+
"annotations": {
3+
"list": []
4+
},
35
"description": "Node health metrics from beast::insight StatsD. Requires [insight] server=statsd in rippled config.",
46
"editable": true,
57
"fiscalYearStartMonth": 0,
@@ -11,13 +13,23 @@
1113
"title": "Validated Ledger Age",
1214
"description": "Age of the most recently validated ledger in seconds. Sourced from the LedgerMaster.Validated_Ledger_Age gauge (LedgerMaster.h:373) which is updated every collection interval via the insight hook. Values above 20s indicate the node is falling behind the network.",
1315
"type": "stat",
14-
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
16+
"gridPos": {
17+
"h": 8,
18+
"w": 12,
19+
"x": 0,
20+
"y": 0
21+
},
1522
"options": {
16-
"tooltip": { "mode": "multi", "sort": "desc" }
23+
"tooltip": {
24+
"mode": "multi",
25+
"sort": "desc"
26+
}
1727
},
1828
"targets": [
1929
{
20-
"datasource": { "type": "prometheus" },
30+
"datasource": {
31+
"type": "prometheus"
32+
},
2133
"expr": "rippled_LedgerMaster_Validated_Ledger_Age",
2234
"legendFormat": "Validated Age"
2335
}
@@ -27,9 +39,18 @@
2739
"unit": "s",
2840
"thresholds": {
2941
"steps": [
30-
{ "color": "green", "value": null },
31-
{ "color": "yellow", "value": 10 },
32-
{ "color": "red", "value": 20 }
42+
{
43+
"color": "green",
44+
"value": null
45+
},
46+
{
47+
"color": "yellow",
48+
"value": 10
49+
},
50+
{
51+
"color": "red",
52+
"value": 20
53+
}
3354
]
3455
}
3556
},
@@ -40,13 +61,23 @@
4061
"title": "Published Ledger Age",
4162
"description": "Age of the most recently published ledger in seconds. Sourced from the LedgerMaster.Published_Ledger_Age gauge (LedgerMaster.h:374). Published ledger age should track close to validated ledger age. A growing gap indicates publish pipeline backlog.",
4263
"type": "stat",
43-
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
64+
"gridPos": {
65+
"h": 8,
66+
"w": 12,
67+
"x": 12,
68+
"y": 0
69+
},
4470
"options": {
45-
"tooltip": { "mode": "multi", "sort": "desc" }
71+
"tooltip": {
72+
"mode": "multi",
73+
"sort": "desc"
74+
}
4675
},
4776
"targets": [
4877
{
49-
"datasource": { "type": "prometheus" },
78+
"datasource": {
79+
"type": "prometheus"
80+
},
5081
"expr": "rippled_LedgerMaster_Published_Ledger_Age",
5182
"legendFormat": "Published Age"
5283
}
@@ -56,9 +87,18 @@
5687
"unit": "s",
5788
"thresholds": {
5889
"steps": [
59-
{ "color": "green", "value": null },
60-
{ "color": "yellow", "value": 10 },
61-
{ "color": "red", "value": 20 }
90+
{
91+
"color": "green",
92+
"value": null
93+
},
94+
{
95+
"color": "yellow",
96+
"value": 10
97+
},
98+
{
99+
"color": "red",
100+
"value": 20
101+
}
62102
]
63103
}
64104
},
@@ -69,33 +109,51 @@
69109
"title": "Operating Mode Duration",
70110
"description": "Cumulative time spent in each operating mode (Disconnected, Connected, Syncing, Tracking, Full). Sourced from State_Accounting.*_duration gauges (NetworkOPs.cpp:774-778). A healthy node should spend the vast majority of time in Full mode.",
71111
"type": "timeseries",
72-
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
112+
"gridPos": {
113+
"h": 8,
114+
"w": 12,
115+
"x": 0,
116+
"y": 8
117+
},
73118
"options": {
74-
"tooltip": { "mode": "multi", "sort": "desc" }
119+
"tooltip": {
120+
"mode": "multi",
121+
"sort": "desc"
122+
}
75123
},
76124
"targets": [
77125
{
78-
"datasource": { "type": "prometheus" },
126+
"datasource": {
127+
"type": "prometheus"
128+
},
79129
"expr": "rippled_State_Accounting_Full_duration",
80130
"legendFormat": "Full"
81131
},
82132
{
83-
"datasource": { "type": "prometheus" },
133+
"datasource": {
134+
"type": "prometheus"
135+
},
84136
"expr": "rippled_State_Accounting_Tracking_duration",
85137
"legendFormat": "Tracking"
86138
},
87139
{
88-
"datasource": { "type": "prometheus" },
140+
"datasource": {
141+
"type": "prometheus"
142+
},
89143
"expr": "rippled_State_Accounting_Syncing_duration",
90144
"legendFormat": "Syncing"
91145
},
92146
{
93-
"datasource": { "type": "prometheus" },
147+
"datasource": {
148+
"type": "prometheus"
149+
},
94150
"expr": "rippled_State_Accounting_Connected_duration",
95151
"legendFormat": "Connected"
96152
},
97153
{
98-
"datasource": { "type": "prometheus" },
154+
"datasource": {
155+
"type": "prometheus"
156+
},
99157
"expr": "rippled_State_Accounting_Disconnected_duration",
100158
"legendFormat": "Disconnected"
101159
}
@@ -118,33 +176,51 @@
118176
"title": "Operating Mode Transitions",
119177
"description": "Count of transitions into each operating mode. Sourced from State_Accounting.*_transitions gauges (NetworkOPs.cpp:780-786). Frequent transitions out of Full mode indicate instability. Transitions to Disconnected or Syncing warrant investigation.",
120178
"type": "timeseries",
121-
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
179+
"gridPos": {
180+
"h": 8,
181+
"w": 12,
182+
"x": 12,
183+
"y": 8
184+
},
122185
"options": {
123-
"tooltip": { "mode": "multi", "sort": "desc" }
186+
"tooltip": {
187+
"mode": "multi",
188+
"sort": "desc"
189+
}
124190
},
125191
"targets": [
126192
{
127-
"datasource": { "type": "prometheus" },
193+
"datasource": {
194+
"type": "prometheus"
195+
},
128196
"expr": "rippled_State_Accounting_Full_transitions",
129197
"legendFormat": "Full"
130198
},
131199
{
132-
"datasource": { "type": "prometheus" },
200+
"datasource": {
201+
"type": "prometheus"
202+
},
133203
"expr": "rippled_State_Accounting_Tracking_transitions",
134204
"legendFormat": "Tracking"
135205
},
136206
{
137-
"datasource": { "type": "prometheus" },
207+
"datasource": {
208+
"type": "prometheus"
209+
},
138210
"expr": "rippled_State_Accounting_Syncing_transitions",
139211
"legendFormat": "Syncing"
140212
},
141213
{
142-
"datasource": { "type": "prometheus" },
214+
"datasource": {
215+
"type": "prometheus"
216+
},
143217
"expr": "rippled_State_Accounting_Connected_transitions",
144218
"legendFormat": "Connected"
145219
},
146220
{
147-
"datasource": { "type": "prometheus" },
221+
"datasource": {
222+
"type": "prometheus"
223+
},
148224
"expr": "rippled_State_Accounting_Disconnected_transitions",
149225
"legendFormat": "Disconnected"
150226
}
@@ -167,19 +243,31 @@
167243
"title": "I/O Latency",
168244
"description": "P95 and P50 of the I/O service loop latency in milliseconds. Sourced from the ios_latency event (Application.cpp:438) which measures how long it takes for the io_context to process a timer callback. Values above 10ms are logged; above 500ms trigger warnings. High values indicate thread pool saturation or blocking operations.",
169245
"type": "timeseries",
170-
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
246+
"gridPos": {
247+
"h": 8,
248+
"w": 12,
249+
"x": 0,
250+
"y": 16
251+
},
171252
"options": {
172-
"tooltip": { "mode": "multi", "sort": "desc" }
253+
"tooltip": {
254+
"mode": "multi",
255+
"sort": "desc"
256+
}
173257
},
174258
"targets": [
175259
{
176-
"datasource": { "type": "prometheus" },
177-
"expr": "histogram_quantile(0.95, sum by (le) (rippled_ios_latency_bucket))",
260+
"datasource": {
261+
"type": "prometheus"
262+
},
263+
"expr": "rippled_ios_latency{quantile=\"0.95\"}",
178264
"legendFormat": "P95 I/O Latency"
179265
},
180266
{
181-
"datasource": { "type": "prometheus" },
182-
"expr": "histogram_quantile(0.50, sum by (le) (rippled_ios_latency_bucket))",
267+
"datasource": {
268+
"type": "prometheus"
269+
},
270+
"expr": "rippled_ios_latency{quantile=\"0.5\"}",
183271
"legendFormat": "P50 I/O Latency"
184272
}
185273
],
@@ -199,15 +287,25 @@
199287
},
200288
{
201289
"title": "Job Queue Depth",
202-
"description": "Current number of jobs waiting in the job queue. Sourced from the job_count gauge (JobQueue.cpp:26). A sustained high value indicates the node cannot process work fast enough common during ledger replay or heavy RPC load.",
290+
"description": "Current number of jobs waiting in the job queue. Sourced from the job_count gauge (JobQueue.cpp:26). A sustained high value indicates the node cannot process work fast enough \u2014 common during ledger replay or heavy RPC load.",
203291
"type": "timeseries",
204-
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
292+
"gridPos": {
293+
"h": 8,
294+
"w": 12,
295+
"x": 12,
296+
"y": 16
297+
},
205298
"options": {
206-
"tooltip": { "mode": "multi", "sort": "desc" }
299+
"tooltip": {
300+
"mode": "multi",
301+
"sort": "desc"
302+
}
207303
},
208304
"targets": [
209305
{
210-
"datasource": { "type": "prometheus" },
306+
"datasource": {
307+
"type": "prometheus"
308+
},
211309
"expr": "rippled_job_count",
212310
"legendFormat": "Job Queue Depth"
213311
}
@@ -230,14 +328,24 @@
230328
"title": "Ledger Fetch Rate",
231329
"description": "Rate of ledger fetch requests initiated by the node. Sourced from the ledger_fetches counter (InboundLedgers.cpp:44) which increments each time the node requests a ledger from a peer. High rates indicate the node is catching up or missing ledgers.",
232330
"type": "stat",
233-
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
331+
"gridPos": {
332+
"h": 8,
333+
"w": 12,
334+
"x": 0,
335+
"y": 24
336+
},
234337
"options": {
235-
"tooltip": { "mode": "multi", "sort": "desc" }
338+
"tooltip": {
339+
"mode": "multi",
340+
"sort": "desc"
341+
}
236342
},
237343
"targets": [
238344
{
239-
"datasource": { "type": "prometheus" },
240-
"expr": "rate(rippled_ledger_fetches[5m])",
345+
"datasource": {
346+
"type": "prometheus"
347+
},
348+
"expr": "rate(rippled_ledger_fetches_total[5m])",
241349
"legendFormat": "Fetches / Sec"
242350
}
243351
],
@@ -252,14 +360,24 @@
252360
"title": "Ledger History Mismatches",
253361
"description": "Rate of ledger history hash mismatches. Sourced from the ledger.history.mismatch counter (LedgerHistory.cpp:16) which increments when a built ledger hash does not match the expected validated hash. Non-zero values indicate consensus divergence or database corruption.",
254362
"type": "stat",
255-
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
363+
"gridPos": {
364+
"h": 8,
365+
"w": 12,
366+
"x": 12,
367+
"y": 24
368+
},
256369
"options": {
257-
"tooltip": { "mode": "multi", "sort": "desc" }
370+
"tooltip": {
371+
"mode": "multi",
372+
"sort": "desc"
373+
}
258374
},
259375
"targets": [
260376
{
261-
"datasource": { "type": "prometheus" },
262-
"expr": "rate(rippled_ledger_history_mismatch[5m])",
377+
"datasource": {
378+
"type": "prometheus"
379+
},
380+
"expr": "rate(rippled_ledger_history_mismatch_total[5m])",
263381
"legendFormat": "Mismatches / Sec"
264382
}
265383
],
@@ -268,8 +386,14 @@
268386
"unit": "ops",
269387
"thresholds": {
270388
"steps": [
271-
{ "color": "green", "value": null },
272-
{ "color": "red", "value": 0.01 }
389+
{
390+
"color": "green",
391+
"value": null
392+
},
393+
{
394+
"color": "red",
395+
"value": 0.01
396+
}
273397
]
274398
}
275399
},
@@ -279,8 +403,13 @@
279403
],
280404
"schemaVersion": 39,
281405
"tags": ["rippled", "statsd", "node-health", "telemetry"],
282-
"templating": { "list": [] },
283-
"time": { "from": "now-1h", "to": "now" },
406+
"templating": {
407+
"list": []
408+
},
409+
"time": {
410+
"from": "now-1h",
411+
"to": "now"
412+
},
284413
"title": "rippled Node Health (StatsD)",
285414
"uid": "rippled-statsd-node-health"
286415
}

0 commit comments

Comments
 (0)