Skip to content

Commit fe6d54c

Browse files
Phase 5: Observability stack — spanmetrics, dashboards, runbook
- Add spanmetrics connector to OTel Collector config for deriving RED metrics (Rate, Errors, Duration) from trace spans
- Add Prometheus service to Docker Compose for scraping spanmetrics
- Add Prometheus datasource provisioning for Grafana
- Create three Grafana dashboards:
  - RPC Performance: request rate, p95 latency, error rate, heatmap
  - Transaction Overview: processing rate, latency, path distribution
  - Consensus Health: round duration, proposal rate, validation rate
- Add operator runbook with setup, configuration, troubleshooting, and performance tuning guidance

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 0ad5ca4 commit fe6d54c

File tree

9 files changed

+446
-3
lines changed

9 files changed

+446
-3
lines changed

docker/telemetry/docker-compose.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ services:
2323
ports:
2424
- "4317:4317" # OTLP gRPC
2525
- "4318:4318" # OTLP HTTP
26+
- "8889:8889" # Prometheus metrics (spanmetrics)
2627
- "13133:13133" # Health check
2728
volumes:
2829
- ./otel-collector-config.yaml:/etc/otel-collector-config.yaml:ro
@@ -41,6 +42,17 @@ services:
4142
networks:
4243
- rippled-telemetry
4344

45+
prometheus:
  # Prometheus server for the telemetry stack. Its scrape targets come
  # from the prometheus.yml mounted below.
  image: prom/prometheus:latest
  ports:
    # Host:container mapping for the Prometheus HTTP port.
    - "9090:9090"
  volumes:
    # Scrape configuration, mounted read-only.
    - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
  # Start after the collector service that this stack scrapes.
  depends_on: [otel-collector]
  networks:
    - rippled-telemetry
55+
4456
grafana:
4557
image: grafana/grafana:latest
4658
environment:
@@ -50,8 +62,10 @@ services:
5062
- "3000:3000"
5163
volumes:
5264
- ./grafana/provisioning:/etc/grafana/provisioning:ro
65+
- ./grafana/dashboards:/var/lib/grafana/dashboards:ro
5366
depends_on:
5467
- jaeger
68+
- prometheus
5569
networks:
5670
- rippled-telemetry
5771

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
{
2+
"annotations": { "list": [] },
3+
"editable": true,
4+
"fiscalYearStartMonth": 0,
5+
"graphTooltip": 1,
6+
"id": null,
7+
"links": [],
8+
"panels": [
9+
{
10+
"title": "Consensus Round Duration",
11+
"type": "timeseries",
12+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
13+
"targets": [
14+
{
15+
"datasource": { "type": "prometheus" },
16+
"expr": "histogram_quantile(0.95, sum by (le) (rate(traces_spanmetrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])))",
17+
"legendFormat": "p95 round duration"
18+
},
19+
{
20+
"datasource": { "type": "prometheus" },
21+
"expr": "histogram_quantile(0.50, sum by (le) (rate(traces_spanmetrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])))",
22+
"legendFormat": "p50 round duration"
23+
}
24+
],
25+
"fieldConfig": {
26+
"defaults": {
27+
"unit": "ms"
28+
},
29+
"overrides": []
30+
}
31+
},
32+
{
33+
"title": "Consensus Proposals Sent Rate",
34+
"type": "timeseries",
35+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
36+
"targets": [
37+
{
38+
"datasource": { "type": "prometheus" },
39+
"expr": "sum(rate(traces_spanmetrics_calls_total{span_name=\"consensus.proposal.send\"}[5m]))",
40+
"legendFormat": "proposals/sec"
41+
}
42+
],
43+
"fieldConfig": {
44+
"defaults": {
45+
"unit": "ops"
46+
},
47+
"overrides": []
48+
}
49+
},
50+
{
51+
"title": "Ledger Close Duration",
52+
"type": "timeseries",
53+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
54+
"targets": [
55+
{
56+
"datasource": { "type": "prometheus" },
57+
"expr": "histogram_quantile(0.95, sum by (le) (rate(traces_spanmetrics_duration_milliseconds_bucket{span_name=\"consensus.ledger_close\"}[5m])))",
58+
"legendFormat": "p95 close duration"
59+
}
60+
],
61+
"fieldConfig": {
62+
"defaults": {
63+
"unit": "ms"
64+
},
65+
"overrides": []
66+
}
67+
},
68+
{
69+
"title": "Validation Send Rate",
70+
"type": "stat",
71+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
72+
"targets": [
73+
{
74+
"datasource": { "type": "prometheus" },
75+
"expr": "sum(rate(traces_spanmetrics_calls_total{span_name=\"consensus.validation.send\"}[5m]))",
76+
"legendFormat": "validations/sec"
77+
}
78+
],
79+
"fieldConfig": {
80+
"defaults": {
81+
"unit": "ops"
82+
},
83+
"overrides": []
84+
}
85+
}
86+
],
87+
"schemaVersion": 39,
88+
"tags": ["rippled", "consensus", "telemetry"],
89+
"templating": { "list": [] },
90+
"time": { "from": "now-1h", "to": "now" },
91+
"title": "rippled Consensus Health",
92+
"uid": "rippled-consensus"
93+
}
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
{
2+
"annotations": { "list": [] },
3+
"editable": true,
4+
"fiscalYearStartMonth": 0,
5+
"graphTooltip": 1,
6+
"id": null,
7+
"links": [],
8+
"panels": [
9+
{
10+
"title": "RPC Request Rate by Command",
11+
"type": "timeseries",
12+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
13+
"targets": [
14+
{
15+
"datasource": { "type": "prometheus" },
16+
"expr": "sum by (xrpl_rpc_command) (rate(traces_spanmetrics_calls_total{span_name=~\"rpc.command.*\"}[5m]))",
17+
"legendFormat": "{{xrpl_rpc_command}}"
18+
}
19+
],
20+
"fieldConfig": {
21+
"defaults": {
22+
"unit": "reqps"
23+
},
24+
"overrides": []
25+
}
26+
},
27+
{
28+
"title": "RPC Latency p95 by Command",
29+
"type": "timeseries",
30+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
31+
"targets": [
32+
{
33+
"datasource": { "type": "prometheus" },
34+
"expr": "histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_spanmetrics_duration_milliseconds_bucket{span_name=~\"rpc.command.*\"}[5m])))",
35+
"legendFormat": "p95 {{xrpl_rpc_command}}"
36+
}
37+
],
38+
"fieldConfig": {
39+
"defaults": {
40+
"unit": "ms"
41+
},
42+
"overrides": []
43+
}
44+
},
45+
{
46+
"title": "RPC Error Rate",
47+
"type": "bargauge",
48+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
49+
"targets": [
50+
{
51+
"datasource": { "type": "prometheus" },
52+
"expr": "sum by (xrpl_rpc_command) (rate(traces_spanmetrics_calls_total{span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (xrpl_rpc_command) (rate(traces_spanmetrics_calls_total{span_name=~\"rpc.command.*\"}[5m])) * 100",
53+
"legendFormat": "{{xrpl_rpc_command}}"
54+
}
55+
],
56+
"fieldConfig": {
57+
"defaults": {
58+
"unit": "percent",
59+
"thresholds": {
60+
"steps": [
61+
{ "color": "green", "value": null },
62+
{ "color": "yellow", "value": 1 },
63+
{ "color": "red", "value": 5 }
64+
]
65+
}
66+
},
67+
"overrides": []
68+
}
69+
},
70+
{
71+
"title": "RPC Latency Heatmap",
72+
"type": "heatmap",
73+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
74+
"targets": [
75+
{
76+
"datasource": { "type": "prometheus" },
77+
"expr": "sum(increase(traces_spanmetrics_duration_milliseconds_bucket{span_name=~\"rpc.command.*\"}[5m])) by (le)",
78+
"legendFormat": "{{le}}",
79+
"format": "heatmap"
80+
}
81+
]
82+
}
83+
],
84+
"schemaVersion": 39,
85+
"tags": ["rippled", "rpc", "telemetry"],
86+
"templating": { "list": [] },
87+
"time": { "from": "now-1h", "to": "now" },
88+
"title": "rippled RPC Performance",
89+
"uid": "rippled-rpc-perf"
90+
}
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
{
2+
"annotations": { "list": [] },
3+
"editable": true,
4+
"fiscalYearStartMonth": 0,
5+
"graphTooltip": 1,
6+
"id": null,
7+
"links": [],
8+
"panels": [
9+
{
10+
"title": "Transaction Processing Rate",
11+
"type": "timeseries",
12+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
13+
"targets": [
14+
{
15+
"datasource": { "type": "prometheus" },
16+
"expr": "sum(rate(traces_spanmetrics_calls_total{span_name=\"tx.process\"}[5m]))",
17+
"legendFormat": "tx.process/sec"
18+
},
19+
{
20+
"datasource": { "type": "prometheus" },
21+
"expr": "sum(rate(traces_spanmetrics_calls_total{span_name=\"tx.receive\"}[5m]))",
22+
"legendFormat": "tx.receive/sec"
23+
}
24+
],
25+
"fieldConfig": {
26+
"defaults": {
27+
"unit": "ops"
28+
},
29+
"overrides": []
30+
}
31+
},
32+
{
33+
"title": "Transaction Processing Latency",
34+
"type": "timeseries",
35+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
36+
"targets": [
37+
{
38+
"datasource": { "type": "prometheus" },
39+
"expr": "histogram_quantile(0.95, sum by (le) (rate(traces_spanmetrics_duration_milliseconds_bucket{span_name=\"tx.process\"}[5m])))",
40+
"legendFormat": "p95"
41+
},
42+
{
43+
"datasource": { "type": "prometheus" },
44+
"expr": "histogram_quantile(0.50, sum by (le) (rate(traces_spanmetrics_duration_milliseconds_bucket{span_name=\"tx.process\"}[5m])))",
45+
"legendFormat": "p50"
46+
}
47+
],
48+
"fieldConfig": {
49+
"defaults": {
50+
"unit": "ms"
51+
},
52+
"overrides": []
53+
}
54+
},
55+
{
56+
"title": "Transaction Path Distribution",
57+
"type": "piechart",
58+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
59+
"targets": [
60+
{
61+
"datasource": { "type": "prometheus" },
62+
"expr": "sum by (xrpl_tx_local) (rate(traces_spanmetrics_calls_total{span_name=\"tx.process\"}[5m]))",
63+
"legendFormat": "local={{xrpl_tx_local}}"
64+
}
65+
]
66+
},
67+
{
68+
"title": "Transaction Receive vs Suppressed",
69+
"type": "timeseries",
70+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
71+
"targets": [
72+
{
73+
"datasource": { "type": "prometheus" },
74+
"expr": "sum(rate(traces_spanmetrics_calls_total{span_name=\"tx.receive\"}[5m]))",
75+
"legendFormat": "total received"
76+
}
77+
],
78+
"fieldConfig": {
79+
"defaults": {
80+
"unit": "ops"
81+
},
82+
"overrides": []
83+
}
84+
}
85+
],
86+
"schemaVersion": 39,
87+
"tags": ["rippled", "transactions", "telemetry"],
88+
"templating": { "list": [] },
89+
"time": { "from": "now-1h", "to": "now" },
90+
"title": "rippled Transaction Overview",
91+
"uid": "rippled-transactions"
92+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Grafana dashboard provisioning: file-based provider for the rippled
# telemetry dashboards.
apiVersion: 1

providers:
  - name: rippled-telemetry
    orgId: 1
    # Grafana folder name under which the dashboards are listed.
    folder: rippled
    type: file
    disableDeletion: false
    editable: true
    options:
      # Directory scanned for dashboard JSON files; the Compose file
      # mounts ./grafana/dashboards at this path.
      path: /var/lib/grafana/dashboards
      foldersFromFilesStructure: false
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Grafana datasource provisioning: Prometheus instance from the
# telemetry Compose stack.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    # "proxy": queries are sent through the Grafana backend.
    access: proxy
    # Compose service name and port of the Prometheus container.
    url: http://prometheus:9090
    # Not the default datasource.
    isDefault: false
    editable: true

docker/telemetry/otel-collector-config.yaml

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
# OpenTelemetry Collector configuration for rippled development.
22
#
3-
# Pipeline: OTLP receiver -> batch processor -> debug exporter + Jaeger.
3+
# Pipelines:
4+
# traces: OTLP receiver -> batch processor -> debug + Jaeger + spanmetrics
5+
# metrics: spanmetrics connector -> Prometheus exporter
6+
#
47
# rippled sends traces via OTLP/HTTP to port 4318. The collector batches
5-
# them and forwards to Jaeger via OTLP/gRPC on the Docker network.
8+
# them, forwards to Jaeger, and derives RED metrics via the spanmetrics
9+
# connector, which Prometheus scrapes on port 8889.
610

711
receivers:
812
otlp:
@@ -17,17 +21,33 @@ processors:
1721
timeout: 1s
1822
send_batch_size: 100
1923

24+
connectors:
  # Derives RED metrics (rate, errors, duration) from trace spans and
  # feeds them into the metrics pipeline.
  spanmetrics:
    # Explicit metric namespace. Through the Prometheus exporter this
    # yields traces_spanmetrics_calls_total and
    # traces_spanmetrics_duration_milliseconds_*, the names the Grafana
    # dashboards query. Without it the exported names depend on the
    # collector version's default and do not match the dashboards.
    namespace: traces.spanmetrics
    histogram:
      explicit:
        # Latency bucket boundaries for the duration histogram.
        buckets: [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s]
    # Span attributes promoted to metric labels. Dots become
    # underscores in Prometheus, e.g. xrpl.rpc.command ->
    # xrpl_rpc_command.
    dimensions:
      - name: xrpl.rpc.command
      - name: xrpl.rpc.status
      - name: xrpl.consensus.mode
      - name: xrpl.tx.local
34+
2035
exporters:
2136
debug:
2237
verbosity: detailed
2338
otlp/jaeger:
2439
endpoint: jaeger:4317
2540
tls:
2641
insecure: true
42+
prometheus:
43+
endpoint: 0.0.0.0:8889
2744

2845
service:
2946
pipelines:
3047
traces:
3148
receivers: [otlp]
3249
processors: [batch]
33-
exporters: [debug, otlp/jaeger]
50+
exporters: [debug, otlp/jaeger, spanmetrics]
51+
metrics:
52+
receivers: [spanmetrics]
53+
exporters: [prometheus]

docker/telemetry/prometheus.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Prometheus configuration for scraping spanmetrics from OTel Collector.
global:
  # Scrape targets and evaluate rules every 15 seconds.
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: otel-collector
    static_configs:
      # Collector's Prometheus exporter endpoint on the Compose network.
      - targets:
          - "otel-collector:8889"

0 commit comments

Comments
 (0)