diff --git a/docker/telemetry/TESTING.md b/docker/telemetry/TESTING.md index d93f2256da8..edc0ea53927 100644 --- a/docker/telemetry/TESTING.md +++ b/docker/telemetry/TESTING.md @@ -370,7 +370,7 @@ See the "Verification Queries" section below. ## Expected Span Catalog -All 12 production span names instrumented across Phases 2-4: +All 16 production span names instrumented across Phases 2-5: | Span Name | Source File | Phase | Key Attributes | How to Trigger | | --------------------------- | --------------------- | ----- | ---------------------------------------------------------- | ------------------------- | @@ -380,10 +380,16 @@ All 12 production span names instrumented across Phases 2-4: | `rpc.command.` | RPCHandler.cpp:161 | 2 | `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role` | Any RPC command | | `tx.process` | NetworkOPs.cpp:1227 | 3 | `xrpl.tx.hash`, `xrpl.tx.local`, `xrpl.tx.path` | Submit transaction | | `tx.receive` | PeerImp.cpp:1273 | 3 | `xrpl.peer.id` | Peer relays transaction | +| `tx.apply` | BuildLedger.cpp:88 | 5 | `xrpl.ledger.tx_count`, `xrpl.ledger.tx_failed` | Ledger close (tx set) | | `consensus.proposal.send` | RCLConsensus.cpp:177 | 4 | `xrpl.consensus.round` | Consensus proposing phase | | `consensus.ledger_close` | RCLConsensus.cpp:282 | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | | `consensus.accept` | RCLConsensus.cpp:395 | 4 | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` | Ledger accepted | | `consensus.validation.send` | RCLConsensus.cpp:753 | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.proposing` | Validation sent | +| `ledger.build` | BuildLedger.cpp:31 | 5 | `xrpl.ledger.seq` | Ledger build | +| `ledger.validate` | LedgerMaster.cpp:915 | 5 | `xrpl.ledger.seq`, `xrpl.ledger.validations` | Ledger validated | +| `ledger.store` | LedgerMaster.cpp:409 | 5 | `xrpl.ledger.seq` | Ledger stored | +| `peer.proposal.receive` | PeerImp.cpp:1667 | 5 | `xrpl.peer.id`, 
`xrpl.peer.proposal.trusted` | Peer sends proposal | +| `peer.validation.receive` | PeerImp.cpp:2264 | 5 | `xrpl.peer.id`, `xrpl.peer.validation.trusted` | Peer sends validation | --- @@ -405,9 +411,11 @@ curl -s "$JAEGER/api/services/rippled/operations" | jq '.data' # Query traces by operation for op in "rpc.request" "rpc.process" \ "rpc.command.server_info" "rpc.command.server_state" "rpc.command.ledger" \ - "tx.process" "tx.receive" \ + "tx.process" "tx.receive" "tx.apply" \ "consensus.proposal.send" "consensus.ledger_close" \ - "consensus.accept" "consensus.validation.send"; do + "consensus.accept" "consensus.validation.send" \ + "ledger.build" "ledger.validate" "ledger.store" \ + "peer.proposal.receive" "peer.validation.receive"; do count=$(curl -s "$JAEGER/api/traces?service=rippled&operation=$op&limit=5&lookback=1h" \ | jq '.data | length') printf "%-35s %s traces\n" "$op" "$count" @@ -440,9 +448,11 @@ Open http://localhost:3000 (anonymous admin access enabled). Pre-configured dashboards: -- **RPC Performance**: Request rates, latency percentiles by command -- **Transaction Overview**: Transaction processing rates and paths -- **Consensus Health**: Consensus round duration and proposer counts +- **RPC Performance**: Request rates, latency percentiles by command, top commands, WebSocket rate +- **Transaction Overview**: Transaction processing rates, apply duration, peer relay, failed tx rate +- **Consensus Health**: Consensus round duration, proposer counts, mode tracking, accept heatmap +- **Ledger Operations**: Build/validate/store rates and durations, TX apply metrics +- **Peer Network**: Proposal/validation receive rates, trusted vs untrusted breakdown (requires `trace_peer=1`) Pre-configured datasources: diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 5a03238e15e..f44ff66dd21 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ 
b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -8,72 +8,109 @@ "panels": [ { "title": "Consensus Round Duration", + "description": "p95 and p50 duration of consensus accept rounds. The consensus.accept span (RCLConsensus.cpp:395) measures the time to process an accepted ledger including transaction application and state finalization. The span carries xrpl.consensus.proposers and xrpl.consensus.round_time_ms attributes. Normal range is 3-6 seconds on mainnet.", "type": "timeseries", "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, "targets": [ { "datasource": { "type": "prometheus" }, "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])))", - "legendFormat": "p95 round duration" + "legendFormat": "P95 Round Duration" }, { "datasource": { "type": "prometheus" }, "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])))", - "legendFormat": "p50 round duration" + "legendFormat": "P50 Round Duration" } ], "fieldConfig": { "defaults": { - "unit": "ms" + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } }, "overrides": [] } }, { "title": "Consensus Proposals Sent Rate", + "description": "Rate at which this node sends consensus proposals to the network. Sourced from the consensus.proposal.send span (RCLConsensus.cpp:177) which fires each time the node proposes a transaction set. The span carries xrpl.consensus.round identifying the consensus round number. 
A healthy proposing node should show steady proposal output.", "type": "timeseries", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, "targets": [ { "datasource": { "type": "prometheus" }, "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.proposal.send\"}[5m]))", - "legendFormat": "proposals/sec" + "legendFormat": "Proposals / Sec" } ], "fieldConfig": { "defaults": { - "unit": "ops" + "unit": "ops", + "custom": { + "axisLabel": "Proposals / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } }, "overrides": [] } }, { "title": "Ledger Close Duration", + "description": "p95 duration of the ledger close event. The consensus.ledger_close span (RCLConsensus.cpp:282) measures the time from when consensus triggers a ledger close to completion. Carries xrpl.consensus.ledger.seq and xrpl.consensus.mode attributes. Compare with Consensus Round Duration to understand how close timing relates to overall round time.", "type": "timeseries", "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, "targets": [ { "datasource": { "type": "prometheus" }, "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.ledger_close\"}[5m])))", - "legendFormat": "p95 close duration" + "legendFormat": "P95 Close Duration" } ], "fieldConfig": { "defaults": { - "unit": "ms" + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } }, "overrides": [] } }, { "title": "Validation Send Rate", + "description": "Rate at which this node sends ledger validations to the network. Sourced from the consensus.validation.send span (RCLConsensus.cpp:753). Each validation confirms the node has fully validated a ledger. 
The span carries xrpl.consensus.ledger.seq and xrpl.consensus.proposing. Should closely track the ledger close rate when the node is healthy.", "type": "stat", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, "targets": [ { "datasource": { "type": "prometheus" }, "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.validation.send\"}[5m]))", - "legendFormat": "validations/sec" + "legendFormat": "Validations / Sec" } ], "fieldConfig": { @@ -82,6 +119,121 @@ }, "overrides": [] } + }, + { + "title": "Consensus Mode Over Time", + "description": "Breakdown of consensus ledger close events by the node's consensus mode (proposing, observing, wrongLedger, switchedLedger). Grouped by the xrpl.consensus.mode span attribute from consensus.ledger_close. A healthy validator should be predominantly in 'proposing' mode. Frequent 'wrongLedger' or 'switchedLedger' indicates sync issues.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum by (xrpl_consensus_mode) (rate(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}[5m]))", + "legendFormat": "{{xrpl_consensus_mode}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Events / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Accept vs Close Rate", + "description": "Compares the rate of consensus.accept (ledger accepted after consensus) vs consensus.ledger_close (ledger close initiated). These should track closely in a healthy network. 
A divergence means some close events are not completing the accept phase, potentially indicating consensus failures or timeouts.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.accept\"}[5m]))", + "legendFormat": "Accepts / Sec" + }, + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}[5m]))", + "legendFormat": "Closes / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Events / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Validation vs Close Rate", + "description": "Compares the rate of consensus.validation.send vs consensus.ledger_close. Each validated ledger should produce one validation message. 
If validations lag behind closes, the node may be falling behind on validation or experiencing issues with the validation pipeline.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.validation.send\"}[5m]))", + "legendFormat": "Validations / Sec" + }, + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}[5m]))", + "legendFormat": "Closes / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Events / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Consensus Accept Duration Heatmap", + "description": "Heatmap showing the distribution of consensus.accept span durations across histogram buckets over time. Each cell represents how many accept events fell into that duration bucket in a 5m window. 
Useful for detecting outlier consensus rounds that take abnormally long.", + "type": "heatmap", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "yAxis": { "axisLabel": "Duration (ms)" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])) by (le)", + "legendFormat": "{{le}}", + "format": "heatmap" + } + ] } ], "schemaVersion": 39, diff --git a/docker/telemetry/grafana/dashboards/ledger-operations.json b/docker/telemetry/grafana/dashboards/ledger-operations.json new file mode 100644 index 00000000000..ec83d97d68a --- /dev/null +++ b/docker/telemetry/grafana/dashboards/ledger-operations.json @@ -0,0 +1,231 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Ledger Build Rate", + "description": "Rate at which new ledgers are being built. The ledger.build span (BuildLedger.cpp:31) wraps the entire buildLedgerImpl() function which creates a new ledger from a parent, applies transactions, flushes SHAMap nodes, and sets the accepted state. Should match the consensus close rate (~0.25/sec on mainnet with ~4s rounds).", + "type": "stat", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"ledger.build\"}[5m]))", + "legendFormat": "Builds / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } + }, + { + "title": "Ledger Build Duration", + "description": "p95 and p50 duration of ledger builds. Measures the full buildLedgerImpl() call including transaction application, SHAMap flushing, and ledger acceptance. 
The span records xrpl.ledger.seq as an attribute. Long build times indicate expensive transaction sets or I/O pressure from SHAMap flushes.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"ledger.build\"}[5m])))", + "legendFormat": "P95 Build Duration" + }, + { + "datasource": { "type": "prometheus" }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"ledger.build\"}[5m])))", + "legendFormat": "P50 Build Duration" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Ledger Validation Rate", + "description": "Rate at which ledgers pass the validation threshold and are accepted as fully validated. The ledger.validate span (LedgerMaster.cpp:915) fires in checkAccept() only after the ledger receives sufficient trusted validations (>= quorum). Records xrpl.ledger.seq and xrpl.ledger.validations (the number of validations received).", + "type": "stat", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"ledger.validate\"}[5m]))", + "legendFormat": "Validations / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } + }, + { + "title": "Ledger Build Duration Heatmap", + "description": "Heatmap showing the distribution of ledger.build durations across histogram buckets over time. 
Each cell represents the count of ledger builds that fell into that duration bucket in a 5m window. Useful for spotting occasional slow ledger builds that may not appear in percentile charts.", + "type": "heatmap", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "yAxis": { "axisLabel": "Duration (ms)" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=\"ledger.build\"}[5m])) by (le)", + "legendFormat": "{{le}}", + "format": "heatmap" + } + ] + }, + { + "title": "Transaction Apply Duration", + "description": "p95 and p50 duration of applying the consensus transaction set during ledger building. The tx.apply span (BuildLedger.cpp:88) wraps applyTransactions() which iterates through the CanonicalTXSet with multiple retry passes. Records xrpl.ledger.tx_count (successful) and xrpl.ledger.tx_failed (failed) as attributes.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"tx.apply\"}[5m])))", + "legendFormat": "P95 tx.apply" + }, + { + "datasource": { "type": "prometheus" }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"tx.apply\"}[5m])))", + "legendFormat": "P50 tx.apply" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Transaction Apply Rate", + "description": "Rate of tx.apply span invocations, reflecting how frequently the transaction application phase runs during ledger 
building. Each ledger build triggers one tx.apply call. Should closely match the ledger build rate.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.apply\"}[5m]))", + "legendFormat": "tx.apply / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Operations / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Ledger Store Rate", + "description": "Rate at which ledgers are stored into the ledger history. The ledger.store span (LedgerMaster.cpp:409) wraps storeLedger() which inserts the ledger into the LedgerHistory cache. Records xrpl.ledger.seq. Should match the ledger build rate under normal operation.", + "type": "stat", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"ledger.store\"}[5m]))", + "legendFormat": "Stores / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } + }, + { + "title": "Build vs Close Duration", + "description": "Compares p95 durations of ledger.build (the actual ledger construction in BuildLedger.cpp) vs consensus.ledger_close (the consensus close event in RCLConsensus.cpp). Build time is a subset of close time. 
A large gap between them indicates overhead in the consensus pipeline outside of ledger construction itself.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"ledger.build\"}[5m])))", + "legendFormat": "P95 ledger.build" + }, + { + "datasource": { "type": "prometheus" }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.ledger_close\"}[5m])))", + "legendFormat": "P95 consensus.ledger_close" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "ledger", "telemetry"], + "templating": { "list": [] }, + "time": { "from": "now-1h", "to": "now" }, + "title": "rippled Ledger Operations", + "uid": "rippled-ledger-ops" +} diff --git a/docker/telemetry/grafana/dashboards/peer-network.json b/docker/telemetry/grafana/dashboards/peer-network.json new file mode 100644 index 00000000000..a6676915854 --- /dev/null +++ b/docker/telemetry/grafana/dashboards/peer-network.json @@ -0,0 +1,107 @@ +{ + "annotations": { "list": [] }, + "description": "Requires trace_peer=1 in the [telemetry] config section.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Peer Proposal Receive Rate", + "description": "Rate of consensus proposals received from network peers. The peer.proposal.receive span (PeerImp.cpp:1667) fires in onMessage(TMProposeSet) for each incoming proposal. 
Records xrpl.peer.id (sending peer) and xrpl.peer.proposal.trusted (whether the proposer is in our UNL). Requires trace_peer=1 in the telemetry config.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"peer.proposal.receive\"}[5m]))", + "legendFormat": "Proposals Received / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Proposals / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Peer Validation Receive Rate", + "description": "Rate of ledger validations received from network peers. The peer.validation.receive span (PeerImp.cpp:2264) fires in onMessage(TMValidation) for each incoming validation message. Records xrpl.peer.id (sending peer) and xrpl.peer.validation.trusted (whether the validator is trusted). Requires trace_peer=1 in the telemetry config.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"peer.validation.receive\"}[5m]))", + "legendFormat": "Validations Received / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Validations / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Proposals Trusted vs Untrusted", + "description": "Pie chart showing the ratio of proposals received from trusted validators (in our UNL) vs untrusted validators. Grouped by the xrpl.peer.proposal.trusted span attribute (true/false). 
A healthy node connected to a well-configured UNL should see a significant portion of trusted proposals. Note: proposals that fail early validation may not have the trusted attribute set.", + "type": "piechart", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum by (xrpl_peer_proposal_trusted) (rate(traces_span_metrics_calls_total{span_name=\"peer.proposal.receive\"}[5m]))", + "legendFormat": "Trusted = {{xrpl_peer_proposal_trusted}}" + } + ] + }, + { + "title": "Validations Trusted vs Untrusted", + "description": "Pie chart showing the ratio of validations received from trusted validators (in our UNL) vs untrusted validators. Grouped by the xrpl.peer.validation.trusted span attribute (true/false). Monitoring this helps detect if the node is receiving validations from the expected set of trusted validators. Note: validations that fail early checks may not have the trusted attribute set.", + "type": "piechart", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum by (xrpl_peer_validation_trusted) (rate(traces_span_metrics_calls_total{span_name=\"peer.validation.receive\"}[5m]))", + "legendFormat": "Trusted = {{xrpl_peer_validation_trusted}}" + } + ] + } + ], + "schemaVersion": 39, + "tags": ["rippled", "peer", "telemetry"], + "templating": { "list": [] }, + "time": { "from": "now-1h", "to": "now" }, + "title": "rippled Peer Network", + "uid": "rippled-peer-net" +} diff --git a/docker/telemetry/grafana/dashboards/rpc-performance.json b/docker/telemetry/grafana/dashboards/rpc-performance.json index 535a03c8706..54620bf1af6 100644 --- a/docker/telemetry/grafana/dashboards/rpc-performance.json +++ b/docker/telemetry/grafana/dashboards/rpc-performance.json @@ -8,8 +8,12 @@ "panels": [ { 
"title": "RPC Request Rate by Command", + "description": "Per-second rate of RPC command executions, broken down by command name (e.g. server_info, submit). Calculated as rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}) over a 5m window, grouped by the xrpl.rpc.command span attribute.", "type": "timeseries", "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, "targets": [ { "datasource": { "type": "prometheus" }, @@ -19,33 +23,55 @@ ], "fieldConfig": { "defaults": { - "unit": "reqps" + "unit": "reqps", + "custom": { + "axisLabel": "Requests / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } }, "overrides": [] } }, { "title": "RPC Latency p95 by Command", + "description": "95th percentile response time for each RPC command. Computed from the spanmetrics duration histogram using histogram_quantile(0.95) over rpc.command.* spans, grouped by xrpl.rpc.command. High values indicate slow commands that may need optimization.", "type": "timeseries", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, "targets": [ { "datasource": { "type": "prometheus" }, "expr": "histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~\"rpc.command.*\"}[5m])))", - "legendFormat": "p95 {{xrpl_rpc_command}}" + "legendFormat": "P95 {{xrpl_rpc_command}}" } ], "fieldConfig": { "defaults": { - "unit": "ms" + "unit": "ms", + "custom": { + "axisLabel": "Latency (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } }, "overrides": [] } }, { "title": "RPC Error Rate", + "description": "Percentage of RPC commands that completed with an error status, per command. Calculated as (error calls / total calls) * 100, where errors have status_code=STATUS_CODE_ERROR. 
Thresholds: green < 1%, yellow 1-5%, red > 5%.", "type": "bargauge", "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, "targets": [ { "datasource": { "type": "prometheus" }, @@ -69,8 +95,13 @@ }, { "title": "RPC Latency Heatmap", + "description": "Distribution of RPC command response times across histogram buckets. Shows the density of requests at each latency level over time. Each cell represents the count of requests that fell into that duration bucket in a 5m window. Useful for spotting bimodal latency patterns.", "type": "heatmap", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "yAxis": { "axisLabel": "Duration (ms)" } + }, "targets": [ { "datasource": { "type": "prometheus" }, @@ -79,6 +110,118 @@ "format": "heatmap" } ] + }, + { + "title": "Overall RPC Throughput", + "description": "Aggregate RPC throughput showing two layers of the request pipeline. rpc.request is the outer HTTP handler (ServerHandler.cpp:271) that accepts incoming connections. rpc.process is the inner processing layer (ServerHandler.cpp:573) that parses and dispatches. 
A gap between the two indicates requests being queued or rejected before processing.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"rpc.request\"}[5m]))", + "legendFormat": "rpc.request / Sec" + }, + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"rpc.process\"}[5m]))", + "legendFormat": "rpc.process / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "axisLabel": "Requests / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "RPC Success vs Error", + "description": "Aggregate rate of successful vs failed RPC commands across all command types. Success = status_code UNSET (OpenTelemetry default for OK spans). Error = status_code STATUS_CODE_ERROR. 
A sustained error rate warrants investigation via per-command breakdown above.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_UNSET\"}[5m]))", + "legendFormat": "Success" + }, + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m]))", + "legendFormat": "Error" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Commands / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Top Commands by Volume", + "description": "Top 10 most frequently called RPC commands by total invocation count over the last 5 minutes. Uses topk(10, increase(calls_total)) to rank commands. Helps identify the hottest API endpoints driving load on the node.", + "type": "bargauge", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "topk(10, sum by (xrpl_rpc_command) (increase(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}[5m])))", + "legendFormat": "{{xrpl_rpc_command}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + } + }, + { + "title": "WebSocket Message Rate", + "description": "Rate of incoming WebSocket RPC messages processed by the server. Sourced from the rpc.ws_message span (ServerHandler.cpp:384). Only active when clients connect via WebSocket instead of HTTP. 
Zero is normal if only HTTP RPC is in use.", + "type": "stat", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"rpc.ws_message\"}[5m]))", + "legendFormat": "WS Messages / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } } ], "schemaVersion": 39, diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json index d5a60a6fa15..99c8ca75aa4 100644 --- a/docker/telemetry/grafana/dashboards/transaction-overview.json +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ -8,76 +8,223 @@ "panels": [ { "title": "Transaction Processing Rate", + "description": "Rate of transactions entering the processing pipeline. tx.process (NetworkOPs.cpp:1227) fires when a transaction is submitted locally or received from a peer and enters processTransaction(). 
tx.receive (PeerImp.cpp:1273) fires when a raw transaction message arrives from a peer before deduplication.", "type": "timeseries", "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, "targets": [ { "datasource": { "type": "prometheus" }, "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.process\"}[5m]))", - "legendFormat": "tx.process/sec" + "legendFormat": "tx.process / Sec" }, { "datasource": { "type": "prometheus" }, "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.receive\"}[5m]))", - "legendFormat": "tx.receive/sec" + "legendFormat": "tx.receive / Sec" } ], "fieldConfig": { "defaults": { - "unit": "ops" + "unit": "ops", + "custom": { + "axisLabel": "Transactions / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } }, "overrides": [] } }, { "title": "Transaction Processing Latency", + "description": "p95 and p50 latency of transaction processing (tx.process span). Measures the time from when a transaction enters processTransaction() to completion. 
Computed via histogram_quantile() over the spanmetrics duration histogram with a 5m rate window.", "type": "timeseries", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, "targets": [ { "datasource": { "type": "prometheus" }, "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"tx.process\"}[5m])))", - "legendFormat": "p95" + "legendFormat": "P95" }, { "datasource": { "type": "prometheus" }, "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"tx.process\"}[5m])))", - "legendFormat": "p50" + "legendFormat": "P50" } ], "fieldConfig": { "defaults": { - "unit": "ms" + "unit": "ms", + "custom": { + "axisLabel": "Latency (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } }, "overrides": [] } }, { "title": "Transaction Path Distribution", + "description": "Breakdown of transactions by origin path. The xrpl.tx.local attribute indicates whether the transaction was submitted locally (true) or received from a peer (false). Helps understand the ratio of locally-originated vs relayed transactions.", "type": "piechart", "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, "targets": [ { "datasource": { "type": "prometheus" }, "expr": "sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{span_name=\"tx.process\"}[5m]))", - "legendFormat": "local={{xrpl_tx_local}}" + "legendFormat": "Local = {{xrpl_tx_local}}" } ] }, { "title": "Transaction Receive vs Suppressed", + "description": "Total rate of raw transaction messages received from peers (tx.receive span from PeerImp.cpp:1273). 
This fires before deduplication via the HashRouter, so the difference between tx.receive and tx.process reflects suppressed duplicate transactions.", "type": "timeseries", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, "targets": [ { "datasource": { "type": "prometheus" }, "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.receive\"}[5m]))", - "legendFormat": "total received" + "legendFormat": "Total Received" } ], "fieldConfig": { "defaults": { - "unit": "ops" + "unit": "ops", + "custom": { + "axisLabel": "Transactions / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Transaction Processing Duration Heatmap", + "description": "Heatmap showing the distribution of tx.process span durations across histogram buckets over time. Each cell represents the count of transactions that completed within that latency bucket in a 5m window. Reveals whether processing times are consistent or exhibit multi-modal patterns.", + "type": "heatmap", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "yAxis": { "axisLabel": "Duration (ms)" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=\"tx.process\"}[5m])) by (le)", + "legendFormat": "{{le}}", + "format": "heatmap" + } + ] + }, + { + "title": "Transaction Apply Duration per Ledger", + "description": "p95 and p50 latency of applying the consensus transaction set to a new ledger. The tx.apply span (BuildLedger.cpp:88) wraps the applyTransactions() function that iterates through the CanonicalTXSet and applies each transaction to the OpenView. 
Long durations indicate heavy transaction sets or expensive transaction processing.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"tx.apply\"}[5m])))", + "legendFormat": "P95 tx.apply" + }, + { + "datasource": { "type": "prometheus" }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"tx.apply\"}[5m])))", + "legendFormat": "P50 tx.apply" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Latency (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Peer Transaction Receive Rate", + "description": "Rate of transaction messages received from network peers. Sourced from the tx.receive span (PeerImp.cpp:1273) which fires in the onMessage(TMTransaction) handler. High rates may indicate network-wide transaction volume spikes or peer flooding.", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.receive\"}[5m]))", + "legendFormat": "tx.receive / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Transactions / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Transaction Apply Failed Rate", + "description": "Rate of tx.apply spans completing with error status, indicating transaction application failures during ledger building. 
The span records xrpl.ledger.tx_failed as an attribute. Thresholds: green < 0.1/sec, yellow 0.1-1/sec, red > 1/sec. Some failures are normal (e.g. conflicting offers) but sustained high rates may indicate issues.", + "type": "stat", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.apply\", status_code=\"STATUS_CODE_ERROR\"}[5m]))", + "legendFormat": "Failed / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "thresholds": { + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.1 }, + { "color": "red", "value": 1 } + ] + } }, "overrides": [] } diff --git a/docker/telemetry/integration-test.sh b/docker/telemetry/integration-test.sh index da2c17fb2b3..77e80ee2be9 100755 --- a/docker/telemetry/integration-test.sh +++ b/docker/telemetry/integration-test.sh @@ -307,7 +307,7 @@ max_queue_size=2048 trace_rpc=1 trace_transactions=1 trace_consensus=1 -trace_peer=0 +trace_peer=1 trace_ledger=1 [rpc_startup] @@ -481,6 +481,7 @@ log "" log "--- Phase 3: Transaction Spans ---" check_span "tx.process" check_span "tx.receive" +check_span "tx.apply" log "" log "--- Phase 4: Consensus Spans ---" @@ -489,6 +490,17 @@ check_span "consensus.ledger_close" check_span "consensus.accept" check_span "consensus.validation.send" +log "" +log "--- Phase 5: Ledger Spans ---" +check_span "ledger.build" +check_span "ledger.validate" +check_span "ledger.store" + +log "" +log "--- Phase 5: Peer Spans (trace_peer=1) ---" +check_span "peer.proposal.receive" +check_span "peer.validation.receive" + # --------------------------------------------------------------------------- # Step 10: Verify Prometheus spanmetrics # --------------------------------------------------------------------------- diff --git a/docker/telemetry/otel-collector-config.yaml 
b/docker/telemetry/otel-collector-config.yaml index f811dfa9857..18feb2d8c7a 100644 --- a/docker/telemetry/otel-collector-config.yaml +++ b/docker/telemetry/otel-collector-config.yaml @@ -31,6 +31,8 @@ connectors: - name: xrpl.rpc.status - name: xrpl.consensus.mode - name: xrpl.tx.local + - name: xrpl.peer.proposal.trusted + - name: xrpl.peer.validation.trusted exporters: debug: diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index ebb403e1843..5b852a6fbd3 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -75,6 +75,7 @@ All spans instrumented in rippled, grouped by subsystem: | ------------ | ------------------- | ----------------------------------------------- | ------------------------------------- | | `tx.process` | NetworkOPs.cpp:1227 | `xrpl.tx.hash`, `xrpl.tx.local`, `xrpl.tx.path` | Transaction submission and processing | | `tx.receive` | PeerImp.cpp:1273 | `xrpl.peer.id` | Transaction received from peer relay | +| `tx.apply` | BuildLedger.cpp:88 | `xrpl.ledger.tx_count`, `xrpl.ledger.tx_failed` | Transaction set applied per ledger | ### Consensus Spans (Phase 4) @@ -85,6 +86,21 @@ All spans instrumented in rippled, grouped by subsystem: | `consensus.accept` | RCLConsensus.cpp:395 | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` | Ledger accepted by consensus | | `consensus.validation.send` | RCLConsensus.cpp:753 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.proposing` | Validation sent after accept | +### Ledger Spans (Phase 5) + +| Span Name | Source File | Attributes | Description | +| ----------------- | -------------------- | -------------------------------------------- | ----------------------------- | +| `ledger.build` | BuildLedger.cpp:31 | `xrpl.ledger.seq` | Ledger build during consensus | +| `ledger.validate` | LedgerMaster.cpp:915 | `xrpl.ledger.seq`, `xrpl.ledger.validations` | Ledger promoted to validated | +| `ledger.store` | LedgerMaster.cpp:409 | `xrpl.ledger.seq` | Ledger stored in 
history | + +### Peer Spans (Phase 5) + +| Span Name | Source File | Attributes | Description | +| ------------------------- | ---------------- | ---------------------------------------------- | ----------------------------- | +| `peer.proposal.receive` | PeerImp.cpp:1667 | `xrpl.peer.id`, `xrpl.peer.proposal.trusted` | Proposal received from peer | +| `peer.validation.receive` | PeerImp.cpp:2264 | `xrpl.peer.id`, `xrpl.peer.validation.trusted` | Validation received from peer | + ## Prometheus Metrics (Spanmetrics) The OTel Collector's spanmetrics connector automatically derives RED (Rate, Errors, Duration) metrics from every span. No custom metrics code is needed in rippled. @@ -111,12 +127,14 @@ Every metric carries these standard labels: Additionally, span attributes configured as dimensions in the collector become metric labels (dots → underscores): -| Span Attribute | Metric Label | Applies To | -| --------------------- | --------------------- | ------------------------------ | -| `xrpl.rpc.command` | `xrpl_rpc_command` | `rpc.command.*` spans | -| `xrpl.rpc.status` | `xrpl_rpc_status` | `rpc.command.*` spans | -| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` spans | -| `xrpl.tx.local` | `xrpl_tx_local` | `tx.process` spans | +| Span Attribute | Metric Label | Applies To | +| ------------------------------ | ------------------------------ | ------------------------------- | +| `xrpl.rpc.command` | `xrpl_rpc_command` | `rpc.command.*` spans | +| `xrpl.rpc.status` | `xrpl_rpc_status` | `rpc.command.*` spans | +| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` spans | +| `xrpl.tx.local` | `xrpl_tx_local` | `tx.process` spans | +| `xrpl.peer.proposal.trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` spans | +| `xrpl.peer.validation.trusted` | `xrpl_peer_validation_trusted` | `peer.validation.receive` spans | ### Histogram Buckets @@ -128,7 +146,7 @@ Configured in `otel-collector-config.yaml`: ## 
Grafana Dashboards -Three dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: +Five dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: ### RPC Performance (`rippled-rpc-perf`) @@ -138,6 +156,10 @@ Three dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: | RPC Latency p95 by Command | timeseries | `histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])))` | `xrpl_rpc_command` | | RPC Error Rate | bargauge | Error spans / total spans × 100, grouped by `xrpl_rpc_command` | `xrpl_rpc_command`, `status_code` | | RPC Latency Heatmap | heatmap | `sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])) by (le)` | `le` (bucket boundaries) | +| Overall RPC Throughput | timeseries | `rpc.request` + `rpc.process` rate | — | +| RPC Success vs Error | timeseries | by `status_code` (UNSET vs ERROR) | `status_code` | +| Top Commands by Volume | bargauge | `topk(10, ...)` by `xrpl_rpc_command` | `xrpl_rpc_command` | +| WebSocket Message Rate | stat | `rpc.ws_message` rate | — | ### Transaction Overview (`rippled-transactions`) @@ -147,29 +169,68 @@ Three dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: | Transaction Processing Latency | timeseries | `histogram_quantile(0.95 / 0.50, ... 
{span_name="tx.process"})` | — | | Transaction Path Distribution | piechart | `sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m]))` | `xrpl_tx_local` | | Transaction Receive vs Suppressed | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.receive"}[5m])` | — | +| TX Processing Duration Heatmap | heatmap | `tx.process` histogram buckets | `le` | +| TX Apply Duration per Ledger | timeseries | p95/p50 of `tx.apply` | — | +| Peer TX Receive Rate | timeseries | `tx.receive` rate | — | +| TX Apply Failed Rate | stat | `tx.apply` with `STATUS_CODE_ERROR` | `status_code` | ### Consensus Health (`rippled-consensus`) -| Panel | Type | PromQL | Labels Used | -| ----------------------------- | ---------- | ---------------------------------------------------------------------------------- | ----------- | -| Consensus Round Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="consensus.accept"})` | — | -| Consensus Proposals Sent Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.proposal.send"}[5m])` | — | -| Ledger Close Duration | timeseries | `histogram_quantile(0.95, ... {span_name="consensus.ledger_close"})` | — | -| Validation Send Rate | stat | `rate(traces_span_metrics_calls_total{span_name="consensus.validation.send"}[5m])` | — | +| Panel | Type | PromQL | Labels Used | +| ----------------------------- | ---------- | ---------------------------------------------------------------------------------- | --------------------- | +| Consensus Round Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="consensus.accept"})` | — | +| Consensus Proposals Sent Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.proposal.send"}[5m])` | — | +| Ledger Close Duration | timeseries | `histogram_quantile(0.95, ... 
{span_name="consensus.ledger_close"})` | — | +| Validation Send Rate | stat | `rate(traces_span_metrics_calls_total{span_name="consensus.validation.send"}[5m])` | — | +| Consensus Mode Over Time | timeseries | `consensus.ledger_close` by `xrpl_consensus_mode` | `xrpl_consensus_mode` | +| Accept vs Close Rate | timeseries | `consensus.accept` vs `consensus.ledger_close` rate | — | +| Validation vs Close Rate | timeseries | `consensus.validation.send` vs `consensus.ledger_close` | — | +| Accept Duration Heatmap | heatmap | `consensus.accept` histogram buckets | `le` | + +### Ledger Operations (`rippled-ledger-ops`) + +| Panel | Type | PromQL | Labels Used | +| ----------------------- | ---------- | ---------------------------------------------- | ----------- | +| Ledger Build Rate | stat | `ledger.build` call rate | — | +| Ledger Build Duration | timeseries | p95/p50 of `ledger.build` | — | +| Ledger Validation Rate | stat | `ledger.validate` call rate | — | +| Build Duration Heatmap | heatmap | `ledger.build` histogram buckets | `le` | +| TX Apply Duration | timeseries | p95/p50 of `tx.apply` | — | +| TX Apply Rate | timeseries | `tx.apply` call rate | — | +| Ledger Store Rate | stat | `ledger.store` call rate | — | +| Build vs Close Duration | timeseries | p95 `ledger.build` vs `consensus.ledger_close` | — | + +### Peer Network (`rippled-peer-net`) + +Requires `trace_peer=1` in the `[telemetry]` config section. 
+ +| Panel | Type | PromQL | Labels Used | +| -------------------------------- | ---------- | --------------------------------- | ------------------------------ | +| Proposal Receive Rate | timeseries | `peer.proposal.receive` rate | — | +| Validation Receive Rate | timeseries | `peer.validation.receive` rate | — | +| Proposals Trusted vs Untrusted | piechart | by `xrpl_peer_proposal_trusted` | `xrpl_peer_proposal_trusted` | +| Validations Trusted vs Untrusted | piechart | by `xrpl_peer_validation_trusted` | `xrpl_peer_validation_trusted` | ### Span → Metric → Dashboard Summary -| Span Name | Prometheus Metric Filter | Grafana Dashboard | -| --------------------------- | ----------------------------------------- | ---------------------------------- | -| `rpc.request` | `{span_name="rpc.request"}` | — (available but not paneled) | -| `rpc.process` | `{span_name="rpc.process"}` | — (available but not paneled) | -| `rpc.command.*` | `{span_name=~"rpc.command.*"}` | RPC Performance (all 4 panels) | -| `tx.process` | `{span_name="tx.process"}` | Transaction Overview (3 panels) | -| `tx.receive` | `{span_name="tx.receive"}` | Transaction Overview (2 panels) | -| `consensus.accept` | `{span_name="consensus.accept"}` | Consensus Health (Round Duration) | -| `consensus.proposal.send` | `{span_name="consensus.proposal.send"}` | Consensus Health (Proposals Rate) | -| `consensus.ledger_close` | `{span_name="consensus.ledger_close"}` | Consensus Health (Close Duration) | -| `consensus.validation.send` | `{span_name="consensus.validation.send"}` | Consensus Health (Validation Rate) | +| Span Name | Prometheus Metric Filter | Grafana Dashboard | +| --------------------------- | ----------------------------------------- | --------------------------------------------- | +| `rpc.request` | `{span_name="rpc.request"}` | RPC Performance (Overall Throughput) | +| `rpc.process` | `{span_name="rpc.process"}` | RPC Performance (Overall Throughput) | +| `rpc.ws_message` | 
`{span_name="rpc.ws_message"}` | RPC Performance (WebSocket Rate) | +| `rpc.command.*` | `{span_name=~"rpc.command.*"}` | RPC Performance (Rate, Latency, Error, Top) | +| `tx.process` | `{span_name="tx.process"}` | Transaction Overview (Rate, Latency, Heatmap) | +| `tx.receive` | `{span_name="tx.receive"}` | Transaction Overview (Rate, Receive) | +| `tx.apply` | `{span_name="tx.apply"}` | Transaction Overview + Ledger Ops (Apply) | +| `consensus.accept` | `{span_name="consensus.accept"}` | Consensus Health (Duration, Rate, Heatmap) | +| `consensus.proposal.send` | `{span_name="consensus.proposal.send"}` | Consensus Health (Proposals Rate) | +| `consensus.ledger_close` | `{span_name="consensus.ledger_close"}` | Consensus Health (Close, Mode) | +| `consensus.validation.send` | `{span_name="consensus.validation.send"}` | Consensus Health (Validation Rate) | +| `ledger.build` | `{span_name="ledger.build"}` | Ledger Ops (Build Rate, Duration, Heatmap) | +| `ledger.validate` | `{span_name="ledger.validate"}` | Ledger Ops (Validation Rate) | +| `ledger.store` | `{span_name="ledger.store"}` | Ledger Ops (Store Rate) | +| `peer.proposal.receive` | `{span_name="peer.proposal.receive"}` | Peer Network (Rate, Trusted/Untrusted) | +| `peer.validation.receive` | `{span_name="peer.validation.receive"}` | Peer Network (Rate, Trusted/Untrusted) | ## Troubleshooting diff --git a/src/xrpld/app/ledger/detail/BuildLedger.cpp b/src/xrpld/app/ledger/detail/BuildLedger.cpp index c4f8f9e60a0..a566ca16290 100644 --- a/src/xrpld/app/ledger/detail/BuildLedger.cpp +++ b/src/xrpld/app/ledger/detail/BuildLedger.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,8 @@ buildLedgerImpl( beast::Journal j, ApplyTxs&& applyTxs) { + XRPL_TRACE_LEDGER(app.getTelemetry(), "ledger.build"); + auto built = std::make_shared(*parent, closeTime); if (built->isFlagLedger()) @@ -60,6 +63,7 @@ buildLedgerImpl( built->header().seq < XRP_LEDGER_EARLIEST_FEES || 
         built->read(keylet::fees()),
         "xrpl::buildLedgerImpl : valid ledger fees");
     built->setAccepted(closeTime, closeResolution, closeTimeCorrect);
+    XRPL_TRACE_SET_ATTR("xrpl.ledger.seq", static_cast<std::int64_t>(built->header().seq));
     return built;
 }
@@ -83,6 +87,8 @@ applyTransactions(
     OpenView& view,
     beast::Journal j)
 {
+    XRPL_TRACE_TX(app.getTelemetry(), "tx.apply");
+
     bool certainRetry = true;
     std::size_t count = 0;
@@ -149,6 +155,8 @@ applyTransactions(
     // If there are any transactions left, we must have
     // tried them in at least one final pass
     XRPL_ASSERT(txns.empty() || !certainRetry, "xrpl::applyTransactions : retry transactions");
+    XRPL_TRACE_SET_ATTR("xrpl.ledger.tx_count", static_cast<std::int64_t>(count));
+    XRPL_TRACE_SET_ATTR("xrpl.ledger.tx_failed", static_cast<std::int64_t>(failed.size()));
     return count;
 }
diff --git a/src/xrpld/app/ledger/detail/LedgerMaster.cpp b/src/xrpld/app/ledger/detail/LedgerMaster.cpp
index 8072b619e17..12b2cac19eb 100644
--- a/src/xrpld/app/ledger/detail/LedgerMaster.cpp
+++ b/src/xrpld/app/ledger/detail/LedgerMaster.cpp
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -404,6 +405,9 @@ LedgerMaster::fixIndex(LedgerIndex ledgerIndex, LedgerHash const& ledgerHash)
 bool
 LedgerMaster::storeLedger(std::shared_ptr<Ledger> ledger)
 {
+    XRPL_TRACE_LEDGER(app_.getTelemetry(), "ledger.store");
+    XRPL_TRACE_SET_ATTR("xrpl.ledger.seq", static_cast<std::int64_t>(ledger->header().seq));
+
     bool validated = ledger->header().validated;
     // Returns true if we already had the ledger
     return mLedgerHistory.insert(std::move(ledger), validated);
@@ -907,6 +911,10 @@ LedgerMaster::checkAccept(std::shared_ptr<Ledger> const& ledger)
         return;
     }
+    XRPL_TRACE_LEDGER(app_.getTelemetry(), "ledger.validate");
+    XRPL_TRACE_SET_ATTR("xrpl.ledger.seq", static_cast<std::int64_t>(ledger->header().seq));
+    XRPL_TRACE_SET_ATTR("xrpl.ledger.validations", static_cast<std::int64_t>(tvc));
+
     JLOG(m_journal.info()) << "Advancing accepted ledger to " << ledger->header().seq << " with >= " << minVal << " validations";
diff --git a/src/xrpld/overlay/detail/PeerImp.cpp b/src/xrpld/overlay/detail/PeerImp.cpp
index a3166ac6288..d4f9b372ad0 100644
--- a/src/xrpld/overlay/detail/PeerImp.cpp
+++ b/src/xrpld/overlay/detail/PeerImp.cpp
@@ -1664,6 +1664,9 @@ PeerImp::onMessage(std::shared_ptr const& m)
 void
 PeerImp::onMessage(std::shared_ptr<protocol::TMProposeSet> const& m)
 {
+    XRPL_TRACE_PEER(app_.getTelemetry(), "peer.proposal.receive");
+    XRPL_TRACE_SET_ATTR("xrpl.peer.id", static_cast<std::int64_t>(id_));
+
     protocol::TMProposeSet& set = *m;

     auto const sig = makeSlice(set.signature());
@@ -1690,6 +1693,7 @@ PeerImp::onMessage(std::shared_ptr<protocol::TMProposeSet> const& m)
     // every time a spam packet is received
     PublicKey const publicKey{makeSlice(set.nodepubkey())};
     auto const isTrusted = app_.validators().trusted(publicKey);
+    XRPL_TRACE_SET_ATTR("xrpl.peer.proposal.trusted", isTrusted);

     // If the operator has specified that untrusted proposals be dropped then
     // this happens here I.e. before further wasting CPU verifying the signature
@@ -2257,6 +2261,9 @@ PeerImp::onMessage(std::shared_ptr const& m)
 void
 PeerImp::onMessage(std::shared_ptr<protocol::TMValidation> const& m)
 {
+    XRPL_TRACE_PEER(app_.getTelemetry(), "peer.validation.receive");
+    XRPL_TRACE_SET_ATTR("xrpl.peer.id", static_cast<std::int64_t>(id_));
+
     if (m->validation().size() < 50)
     {
         JLOG(p_journal_.warn()) << "Validation: Too small";
@@ -2295,6 +2302,7 @@ PeerImp::onMessage(std::shared_ptr<protocol::TMValidation> const& m)
     // suppression for 30 seconds to avoid doing a relatively expensive
     // lookup every time a spam packet is received
     auto const isTrusted = app_.validators().trusted(val->getSignerPublic());
+    XRPL_TRACE_SET_ATTR("xrpl.peer.validation.trusted", isTrusted);

     // If the operator has specified that untrusted validations be
     // dropped then this happens here I.e. before further wasting CPU