telemetry_;
+
+public:
+ ApplicationImp(...)
+ {
+ // Initialize telemetry early (before other components)
+ auto telemetrySection = config_->section("telemetry");
+ auto telemetrySetup = telemetry::setup_Telemetry(
+ telemetrySection,
+ toBase58(TokenType::NodePublic, nodeIdentity_.publicKey()),
+ BuildInfo::getVersionString());
+
+ // Set network attributes
+ telemetrySetup.networkId = config_->NETWORK_ID;
+ telemetrySetup.networkType = [&]() {
+ if (config_->NETWORK_ID == 0) return "mainnet";
+ if (config_->NETWORK_ID == 1) return "testnet";
+ if (config_->NETWORK_ID == 2) return "devnet";
+ return "custom";
+ }();
+
+ telemetry_ = telemetry::make_Telemetry(
+ telemetrySetup,
+ logs_->journal("Telemetry"));
+
+ // ... rest of initialization ...
+ }
+
+ void start() override
+ {
+ // Start telemetry first
+ if (telemetry_)
+ telemetry_->start();
+
+ // ... existing start code ...
+ }
+
+ void stop() override
+ {
+ // ... existing stop code ...
+
+ // Stop telemetry last (to capture shutdown spans)
+ if (telemetry_)
+ telemetry_->stop();
+ }
+
+ telemetry::Telemetry& getTelemetry() override
+ {
+ assert(telemetry_);
+ return *telemetry_;
+ }
+};
+```
+
+### 5.3.2 Application Interface Addition
+
+```cpp
+// include/xrpl/app/main/Application.h (modified)
+
+namespace telemetry { class Telemetry; }
+
+class Application
+{
+public:
+ // ... existing virtual methods ...
+
+ /** Get the telemetry system for distributed tracing */
+ virtual telemetry::Telemetry& getTelemetry() = 0;
+};
+```
+
+---
+
+## 5.4 CMake Integration
+
+### 5.4.1 Find OpenTelemetry Module
+
+```cmake
+# cmake/FindOpenTelemetry.cmake
+
+# Find OpenTelemetry C++ SDK
+#
+# This module defines:
+# OpenTelemetry_FOUND - System has OpenTelemetry
+# OpenTelemetry::api - API library target
+# OpenTelemetry::sdk - SDK library target
+# OpenTelemetry::otlp_grpc_exporter - OTLP gRPC exporter target
+# OpenTelemetry::otlp_http_exporter - OTLP HTTP exporter target
+
+find_package(opentelemetry-cpp CONFIG QUIET)
+
+if(opentelemetry-cpp_FOUND)
+ set(OpenTelemetry_FOUND TRUE)
+
+ # Create imported targets if not already created by config
+ if(NOT TARGET OpenTelemetry::api)
+ add_library(OpenTelemetry::api ALIAS opentelemetry-cpp::api)
+ endif()
+ if(NOT TARGET OpenTelemetry::sdk)
+ add_library(OpenTelemetry::sdk ALIAS opentelemetry-cpp::sdk)
+ endif()
+ if(NOT TARGET OpenTelemetry::otlp_grpc_exporter)
+ add_library(OpenTelemetry::otlp_grpc_exporter ALIAS
+ opentelemetry-cpp::otlp_grpc_exporter)
+ endif()
+else()
+ # Try pkg-config fallback
+ find_package(PkgConfig QUIET)
+ if(PKG_CONFIG_FOUND)
+ pkg_check_modules(OTEL opentelemetry-cpp QUIET)
+ if(OTEL_FOUND)
+ set(OpenTelemetry_FOUND TRUE)
+ # Create imported targets from pkg-config
+ add_library(OpenTelemetry::api INTERFACE IMPORTED)
+ target_include_directories(OpenTelemetry::api INTERFACE
+ ${OTEL_INCLUDE_DIRS})
+ endif()
+ endif()
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(OpenTelemetry
+ REQUIRED_VARS OpenTelemetry_FOUND)
+```
+
+### 5.4.2 CMakeLists.txt Changes
+
+```cmake
+# CMakeLists.txt (additions)
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# TELEMETRY OPTIONS
+# ═══════════════════════════════════════════════════════════════════════════════
+
+option(XRPL_ENABLE_TELEMETRY
+ "Enable OpenTelemetry distributed tracing support" OFF)
+
+if(XRPL_ENABLE_TELEMETRY)
+ find_package(OpenTelemetry REQUIRED)
+
+ # Define compile-time flag
+ add_compile_definitions(XRPL_ENABLE_TELEMETRY)
+
+ message(STATUS "OpenTelemetry tracing: ENABLED")
+else()
+ message(STATUS "OpenTelemetry tracing: DISABLED")
+endif()
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# TELEMETRY LIBRARY
+# ═══════════════════════════════════════════════════════════════════════════════
+
+if(XRPL_ENABLE_TELEMETRY)
+ add_library(xrpl_telemetry
+ src/libxrpl/telemetry/Telemetry.cpp
+ src/libxrpl/telemetry/TelemetryConfig.cpp
+ src/libxrpl/telemetry/TraceContext.cpp
+ )
+
+ target_include_directories(xrpl_telemetry
+ PUBLIC
+ ${CMAKE_CURRENT_SOURCE_DIR}/include
+ )
+
+ target_link_libraries(xrpl_telemetry
+ PUBLIC
+ OpenTelemetry::api
+ OpenTelemetry::sdk
+ OpenTelemetry::otlp_grpc_exporter
+ PRIVATE
+ xrpl_basics
+ )
+
+ # Add to main library dependencies
+ target_link_libraries(xrpld PRIVATE xrpl_telemetry)
+else()
+  # Create null implementation library
+  add_library(xrpl_telemetry
+    src/libxrpl/telemetry/NullTelemetry.cpp
+  )
+  target_include_directories(xrpl_telemetry
+    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
+  )
+  # Link the no-op implementation as well, so that getTelemetry() still
+  # resolves when tracing is compiled out.
+  target_link_libraries(xrpld PRIVATE xrpl_telemetry)
+endif()
+```
+
+---
+
+## 5.5 OpenTelemetry Collector Configuration
+
+### 5.5.1 Development Configuration
+
+```yaml
+# otel-collector-dev.yaml
+# Minimal configuration for local development
+
+receivers:
+ otlp:
+ protocols:
+ grpc:
+ endpoint: 0.0.0.0:4317
+ http:
+ endpoint: 0.0.0.0:4318
+
+processors:
+ batch:
+ timeout: 1s
+ send_batch_size: 100
+
+exporters:
+  # Console output for debugging
+  # (the former `logging` exporter is deprecated; contrib >= 0.86 uses `debug`)
+  debug:
+    verbosity: detailed
+    sampling_initial: 5
+    sampling_thereafter: 200
+
+  # Jaeger for trace visualization. The dedicated `jaeger` exporter was
+  # removed from collector-contrib in v0.86; send OTLP instead — Jaeger
+  # ingests it natively (COLLECTOR_OTLP_ENABLED=true in the compose file).
+  otlp/jaeger:
+    endpoint: jaeger:4317
+    tls:
+      insecure: true
+
+service:
+  pipelines:
+    traces:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [debug, otlp/jaeger]
+```
+
+### 5.5.2 Production Configuration
+
+```yaml
+# otel-collector-prod.yaml
+# Production configuration with filtering, sampling, and multiple backends
+
+receivers:
+ otlp:
+ protocols:
+ grpc:
+ endpoint: 0.0.0.0:4317
+ tls:
+ cert_file: /etc/otel/server.crt
+ key_file: /etc/otel/server.key
+ ca_file: /etc/otel/ca.crt
+
+processors:
+ # Memory limiter to prevent OOM
+ memory_limiter:
+ check_interval: 1s
+ limit_mib: 1000
+ spike_limit_mib: 200
+
+ # Batch processing for efficiency
+ batch:
+ timeout: 5s
+ send_batch_size: 512
+ send_batch_max_size: 1024
+
+ # Tail-based sampling (keep errors and slow traces)
+ tail_sampling:
+ decision_wait: 10s
+ num_traces: 100000
+ expected_new_traces_per_sec: 1000
+ policies:
+ # Always keep error traces
+ - name: errors
+ type: status_code
+ status_code:
+ status_codes: [ERROR]
+ # Keep slow consensus rounds (>5s)
+ - name: slow-consensus
+ type: latency
+ latency:
+ threshold_ms: 5000
+ # Keep slow RPC requests (>1s)
+ - name: slow-rpc
+ type: and
+ and:
+ and_sub_policy:
+ - name: rpc-spans
+ type: string_attribute
+ string_attribute:
+ key: xrpl.rpc.command
+ values: [".*"]
+ enabled_regex_matching: true
+ - name: latency
+ type: latency
+ latency:
+ threshold_ms: 1000
+ # Probabilistic sampling for the rest
+ - name: probabilistic
+ type: probabilistic
+ probabilistic:
+ sampling_percentage: 10
+
+ # Attribute processing
+ attributes:
+ actions:
+ # Hash sensitive data
+ - key: xrpl.tx.account
+ action: hash
+ # Add deployment info
+ - key: deployment.environment
+ value: production
+ action: upsert
+
+exporters:
+ # Grafana Tempo for long-term storage
+ otlp/tempo:
+ endpoint: tempo.monitoring:4317
+ tls:
+ insecure: false
+ ca_file: /etc/otel/tempo-ca.crt
+
+  # Elastic APM for correlation with logs
+  otlp/elastic:
+    endpoint: apm.elastic:8200
+    headers:
+      # Use the ${env:VAR} form; bare ${VAR} expansion is deprecated in
+      # recent collector releases.
+      Authorization: "Bearer ${env:ELASTIC_APM_TOKEN}"
+
+extensions:
+ health_check:
+ endpoint: 0.0.0.0:13133
+ zpages:
+ endpoint: 0.0.0.0:55679
+
+service:
+ extensions: [health_check, zpages]
+ pipelines:
+ traces:
+ receivers: [otlp]
+ processors: [memory_limiter, tail_sampling, attributes, batch]
+ exporters: [otlp/tempo, otlp/elastic]
+```
+
+---
+
+## 5.6 Docker Compose Development Environment
+
+```yaml
+# docker-compose-telemetry.yaml
+version: "3.8"
+
+services:
+ # OpenTelemetry Collector
+ otel-collector:
+ image: otel/opentelemetry-collector-contrib:0.92.0
+ container_name: otel-collector
+ command: ["--config=/etc/otel-collector-config.yaml"]
+ volumes:
+ - ./otel-collector-dev.yaml:/etc/otel-collector-config.yaml:ro
+ ports:
+ - "4317:4317" # OTLP gRPC
+ - "4318:4318" # OTLP HTTP
+ - "13133:13133" # Health check
+ depends_on:
+ - jaeger
+
+ # Jaeger for trace visualization
+ jaeger:
+ image: jaegertracing/all-in-one:1.53
+ container_name: jaeger
+ environment:
+ - COLLECTOR_OTLP_ENABLED=true
+ ports:
+ - "16686:16686" # UI
+ - "14250:14250" # gRPC
+
+ # Grafana for dashboards
+ grafana:
+ image: grafana/grafana:10.2.3
+ container_name: grafana
+ environment:
+ - GF_AUTH_ANONYMOUS_ENABLED=true
+ - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
+ volumes:
+ - ./grafana/provisioning:/etc/grafana/provisioning:ro
+ - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
+ ports:
+ - "3000:3000"
+ depends_on:
+ - jaeger
+
+ # Prometheus for metrics (optional, for correlation)
+ prometheus:
+ image: prom/prometheus:v2.48.1
+ container_name: prometheus
+ volumes:
+ - ./prometheus.yaml:/etc/prometheus/prometheus.yml:ro
+ ports:
+ - "9090:9090"
+
+networks:
+ default:
+ name: rippled-telemetry
+```
+
+---
+
+## 5.7 Configuration Architecture
+
+```mermaid
+flowchart TB
+ subgraph config["Configuration Sources"]
+ cfgFile["xrpld.cfg
[telemetry] section"]
+ cmake["CMake
XRPL_ENABLE_TELEMETRY"]
+ end
+
+ subgraph init["Initialization"]
+ parse["setup_Telemetry()"]
+ factory["make_Telemetry()"]
+ end
+
+ subgraph runtime["Runtime Components"]
+ tracer["TracerProvider"]
+ exporter["OTLP Exporter"]
+ processor["BatchProcessor"]
+ end
+
+ subgraph collector["Collector Pipeline"]
+ recv["Receivers"]
+ proc["Processors"]
+ exp["Exporters"]
+ end
+
+ cfgFile --> parse
+ cmake -->|"compile flag"| parse
+ parse --> factory
+ factory --> tracer
+ tracer --> processor
+ processor --> exporter
+ exporter -->|"OTLP"| recv
+ recv --> proc
+ proc --> exp
+
+ style config fill:#e3f2fd,stroke:#1976d2
+ style runtime fill:#e8f5e9,stroke:#388e3c
+ style collector fill:#fff3e0,stroke:#ff9800
+```
+
+---
+
+## 5.8 Grafana Integration
+
+Step-by-step instructions for integrating rippled traces with Grafana.
+
+### 5.8.1 Data Source Configuration
+
+#### Tempo (Recommended)
+
+```yaml
+# grafana/provisioning/datasources/tempo.yaml
+apiVersion: 1
+
+datasources:
+ - name: Tempo
+ type: tempo
+ access: proxy
+ url: http://tempo:3200
+ jsonData:
+ httpMethod: GET
+ tracesToLogs:
+ datasourceUid: loki
+ tags: ["service.name", "xrpl.tx.hash"]
+ mappedTags: [{ key: "trace_id", value: "traceID" }]
+ mapTagNamesEnabled: true
+ filterByTraceID: true
+ serviceMap:
+ datasourceUid: prometheus
+ nodeGraph:
+ enabled: true
+ search:
+ hide: false
+ lokiSearch:
+ datasourceUid: loki
+```
+
+#### Jaeger
+
+```yaml
+# grafana/provisioning/datasources/jaeger.yaml
+apiVersion: 1
+
+datasources:
+ - name: Jaeger
+ type: jaeger
+ access: proxy
+ url: http://jaeger:16686
+ jsonData:
+ tracesToLogs:
+ datasourceUid: loki
+ tags: ["service.name"]
+```
+
+#### Elastic APM
+
+```yaml
+# grafana/provisioning/datasources/elastic-apm.yaml
+apiVersion: 1
+
+datasources:
+ - name: Elasticsearch-APM
+ type: elasticsearch
+ access: proxy
+ url: http://elasticsearch:9200
+ database: "apm-*"
+ jsonData:
+ esVersion: "8.0.0"
+ timeField: "@timestamp"
+ logMessageField: message
+ logLevelField: log.level
+```
+
+### 5.8.2 Dashboard Provisioning
+
+```yaml
+# grafana/provisioning/dashboards/dashboards.yaml
+apiVersion: 1
+
+providers:
+ - name: "rippled-dashboards"
+ orgId: 1
+ folder: "rippled"
+ folderUid: "rippled"
+ type: file
+ disableDeletion: false
+ updateIntervalSeconds: 30
+ options:
+ path: /var/lib/grafana/dashboards/rippled
+```
+
+### 5.8.3 Example Dashboard: RPC Performance
+
+```json
+{
+ "title": "rippled RPC Performance",
+ "uid": "rippled-rpc-performance",
+ "panels": [
+ {
+ "title": "RPC Latency by Command",
+ "type": "heatmap",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\" && span.xrpl.rpc.command != \"\"} | histogram_over_time(duration) by (span.xrpl.rpc.command)"
+ }
+ ],
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }
+ },
+ {
+ "title": "RPC Error Rate",
+ "type": "timeseries",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\" && status.code=error} | rate() by (span.xrpl.rpc.command)"
+ }
+ ],
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }
+ },
+ {
+ "title": "Top 10 Slowest RPC Commands",
+ "type": "table",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\" && span.xrpl.rpc.command != \"\"} | avg(duration) by (span.xrpl.rpc.command) | topk(10)"
+ }
+ ],
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 }
+ },
+ {
+ "title": "Recent Traces",
+ "type": "table",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\"}"
+ }
+ ],
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 }
+ }
+ ]
+}
+```
+
+### 5.8.4 Example Dashboard: Transaction Tracing
+
+```json
+{
+ "title": "rippled Transaction Tracing",
+ "uid": "rippled-tx-tracing",
+ "panels": [
+ {
+ "title": "Transaction Throughput",
+ "type": "stat",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\" && name=\"tx.receive\"} | rate()"
+ }
+ ],
+ "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }
+ },
+ {
+ "title": "Cross-Node Relay Count",
+ "type": "timeseries",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\" && name=\"tx.relay\"} | avg(span.xrpl.tx.relay_count)"
+ }
+ ],
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }
+ },
+ {
+ "title": "Transaction Validation Errors",
+ "type": "table",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\" && name=\"tx.validate\" && status.code=error}"
+ }
+ ],
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }
+ }
+ ]
+}
+```
+
+### 5.8.5 TraceQL Query Examples
+
+Common queries for rippled traces:
+
+```
+# Find all traces for a specific transaction hash
+{resource.service.name="rippled" && span.xrpl.tx.hash="ABC123..."}
+
+# Find slow RPC commands (>100ms)
+{resource.service.name="rippled" && name=~"rpc.command.*"} | duration > 100ms
+
+# Find consensus rounds taking >5 seconds
+{resource.service.name="rippled" && name="consensus.round"} | duration > 5s
+
+# Find failed transactions with error details
+{resource.service.name="rippled" && name="tx.validate" && status.code=error}
+
+# Find transactions relayed to many peers
+{resource.service.name="rippled" && name="tx.relay"} | span.xrpl.tx.relay_count > 10
+
+# Compare latency across nodes
+{resource.service.name="rippled" && name="rpc.command.account_info"} | avg(duration) by (resource.service.instance.id)
+```
+
+### 5.8.6 Correlation with PerfLog
+
+To correlate OpenTelemetry traces with existing PerfLog data:
+
+**Step 1: Configure Loki to ingest PerfLog**
+
+```yaml
+# promtail-config.yaml
+scrape_configs:
+ - job_name: rippled-perflog
+ static_configs:
+ - targets:
+ - localhost
+ labels:
+ job: rippled
+ __path__: /var/log/rippled/perf*.log
+ pipeline_stages:
+ - json:
+ expressions:
+ trace_id: trace_id
+ ledger_seq: ledger_seq
+ tx_hash: tx_hash
+ - labels:
+ trace_id:
+ ledger_seq:
+ tx_hash:
+```
+
+**Step 2: Add trace_id to PerfLog entries**
+
+Modify PerfLog to include trace_id when available:
+
+```cpp
+// In PerfLog output, add trace_id from current span context
+void logPerf(Json::Value& entry) {
+    auto span = opentelemetry::trace::GetSpan(
+        opentelemetry::context::RuntimeContext::GetCurrent());
+    if (span && span->GetContext().IsValid()) {
+        // ToLowerBase16 takes a nostd::span<char, 32>: the buffer must be
+        // exactly 32 bytes, and the output is not null-terminated.
+        char traceIdHex[32];
+        span->GetContext().trace_id().ToLowerBase16(traceIdHex);
+        entry["trace_id"] = std::string(traceIdHex, 32);
+    }
+    // ... existing logging
+}
+```
+
+**Step 3: Configure Grafana trace-to-logs link**
+
+In Tempo data source configuration, set up the derived field:
+
+```yaml
+jsonData:
+ tracesToLogs:
+ datasourceUid: loki
+ tags: ["trace_id", "xrpl.tx.hash"]
+ filterByTraceID: true
+ filterBySpanID: false
+```
+
+### 5.8.7 Correlation with Insight/StatsD Metrics
+
+To correlate traces with existing Beast Insight metrics:
+
+**Step 1: Export Insight metrics to Prometheus**
+
+```yaml
+# prometheus.yaml
+scrape_configs:
+ - job_name: "rippled-statsd"
+ static_configs:
+ - targets: ["statsd-exporter:9102"]
+```
+
+**Step 2: Add exemplars to metrics**
+
+OpenTelemetry SDK automatically adds exemplars (trace IDs) to metrics when using the Prometheus exporter. This links metrics spikes to specific traces.
+
+**Step 3: Configure Grafana metric-to-trace link**
+
+```yaml
+# In Prometheus data source
+jsonData:
+ exemplarTraceIdDestinations:
+ - name: trace_id
+ datasourceUid: tempo
+```
+
+**Step 4: Dashboard panel with exemplars**
+
+```json
+{
+ "title": "RPC Latency with Trace Links",
+ "type": "timeseries",
+ "datasource": "Prometheus",
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, rate(rippled_rpc_duration_seconds_bucket[5m]))",
+ "exemplar": true
+ }
+ ]
+}
+```
+
+This allows clicking on metric data points to jump directly to the related trace.
+
+---
+
+_Previous: [Code Samples](./04-code-samples.md)_ | _Next: [Implementation Phases](./06-implementation-phases.md)_ | _Back to: [Overview](./OpenTelemetryPlan.md)_
diff --git a/OpenTelemetryPlan/06-implementation-phases.md b/OpenTelemetryPlan/06-implementation-phases.md
new file mode 100644
index 00000000000..10b97333ee1
--- /dev/null
+++ b/OpenTelemetryPlan/06-implementation-phases.md
@@ -0,0 +1,543 @@
+# Implementation Phases
+
+> **Parent Document**: [OpenTelemetryPlan.md](./OpenTelemetryPlan.md)
+> **Related**: [Configuration Reference](./05-configuration-reference.md) | [Observability Backends](./07-observability-backends.md)
+
+---
+
+## 6.1 Phase Overview
+
+```mermaid
+gantt
+ title OpenTelemetry Implementation Timeline
+ dateFormat YYYY-MM-DD
+ axisFormat Week %W
+
+ section Phase 1
+ Core Infrastructure :p1, 2024-01-01, 2w
+ SDK Integration :p1a, 2024-01-01, 4d
+ Telemetry Interface :p1b, after p1a, 3d
+ Configuration & CMake :p1c, after p1b, 3d
+ Unit Tests :p1d, after p1c, 2d
+
+ section Phase 2
+ RPC Tracing :p2, after p1, 2w
+ HTTP Context Extraction :p2a, after p1, 2d
+ RPC Handler Instrumentation :p2b, after p2a, 4d
+ WebSocket Support :p2c, after p2b, 2d
+ Integration Tests :p2d, after p2c, 2d
+
+ section Phase 3
+ Transaction Tracing :p3, after p2, 2w
+ Protocol Buffer Extension :p3a, after p2, 2d
+ PeerImp Instrumentation :p3b, after p3a, 3d
+ Relay Context Propagation :p3c, after p3b, 3d
+ Multi-node Tests :p3d, after p3c, 2d
+
+ section Phase 4
+ Consensus Tracing :p4, after p3, 2w
+ Consensus Round Spans :p4a, after p3, 3d
+ Proposal Handling :p4b, after p4a, 3d
+ Validation Tests :p4c, after p4b, 4d
+
+ section Phase 5
+ Documentation & Deploy :p5, after p4, 1w
+```
+
+---
+
+## 6.2 Phase 1: Core Infrastructure (Weeks 1-2)
+
+**Objective**: Establish foundational telemetry infrastructure
+
+### Tasks
+
+| Task | Description | Effort | Risk |
+| ---- | ----------------------------------------------------- | ------ | ------ |
+| 1.1 | Add OpenTelemetry C++ SDK to Conan/CMake | 2d | Low |
+| 1.2 | Implement `Telemetry` interface and factory | 2d | Low |
+| 1.3 | Implement `SpanGuard` RAII wrapper | 1d | Low |
+| 1.4 | Implement configuration parser | 1d | Low |
+| 1.5 | Integrate into `ApplicationImp` | 1d | Medium |
+| 1.6 | Add conditional compilation (`XRPL_ENABLE_TELEMETRY`) | 1d | Low |
+| 1.7 | Create `NullTelemetry` no-op implementation | 0.5d | Low |
+| 1.8 | Unit tests for core infrastructure | 1.5d | Low |
+
+**Total Effort**: 10 days (2 developers)
+
+### Exit Criteria
+
+- [ ] OpenTelemetry SDK compiles and links
+- [ ] Telemetry can be enabled/disabled via config
+- [ ] Basic span creation works
+- [ ] No performance regression when disabled
+- [ ] Unit tests passing
+
+---
+
+## 6.3 Phase 2: RPC Tracing (Weeks 3-4)
+
+**Objective**: Complete tracing for all RPC operations
+
+### Tasks
+
+| Task | Description | Effort | Risk |
+| ---- | -------------------------------------------------- | ------ | ------ |
+| 2.1 | Implement W3C Trace Context HTTP header extraction | 1d | Low |
+| 2.2 | Instrument `ServerHandler::onRequest()` | 1d | Low |
+| 2.3 | Instrument `RPCHandler::doCommand()` | 2d | Medium |
+| 2.4 | Add RPC-specific attributes | 1d | Low |
+| 2.5 | Instrument WebSocket handler | 1d | Medium |
+| 2.6 | Integration tests for RPC tracing | 2d | Low |
+| 2.7 | Performance benchmarks | 1d | Low |
+| 2.8 | Documentation | 1d | Low |
+
+**Total Effort**: 10 days
+
+### Exit Criteria
+
+- [ ] All RPC commands traced
+- [ ] Trace context propagates from HTTP headers
+- [ ] WebSocket and HTTP both instrumented
+- [ ] <1ms overhead per RPC call
+- [ ] Integration tests passing
+
+---
+
+## 6.4 Phase 3: Transaction Tracing (Weeks 5-6)
+
+**Objective**: Trace transaction lifecycle across network
+
+### Tasks
+
+| Task | Description | Effort | Risk |
+| ---- | --------------------------------------------- | ------ | ------ |
+| 3.1 | Define `TraceContext` Protocol Buffer message | 1d | Low |
+| 3.2 | Implement protobuf context serialization | 1d | Low |
+| 3.3 | Instrument `PeerImp::handleTransaction()` | 2d | Medium |
+| 3.4 | Instrument `NetworkOPs::submitTransaction()` | 1d | Medium |
+| 3.5 | Instrument HashRouter integration | 1d | Medium |
+| 3.6 | Implement relay context propagation | 2d | High |
+| 3.7 | Integration tests (multi-node) | 2d | Medium |
+| 3.8 | Performance benchmarks | 1d | Low |
+
+**Total Effort**: 11 days
+
+### Exit Criteria
+
+- [ ] Transaction traces span across nodes
+- [ ] Trace context in Protocol Buffer messages
+- [ ] HashRouter deduplication visible in traces
+- [ ] Multi-node integration tests passing
+- [ ] <5% overhead on transaction throughput
+
+---
+
+## 6.5 Phase 4: Consensus Tracing (Weeks 7-8)
+
+**Objective**: Full observability into consensus rounds
+
+### Tasks
+
+| Task | Description | Effort | Risk |
+| ---- | ---------------------------------------------- | ------ | ------ |
+| 4.1 | Instrument `RCLConsensusAdaptor::startRound()` | 1d | Medium |
+| 4.2 | Instrument phase transitions | 2d | Medium |
+| 4.3 | Instrument proposal handling | 2d | High |
+| 4.4 | Instrument validation handling | 1d | Medium |
+| 4.5 | Add consensus-specific attributes | 1d | Low |
+| 4.6 | Correlate with transaction traces | 1d | Medium |
+| 4.7 | Multi-validator integration tests | 2d | High |
+| 4.8 | Performance validation | 1d | Medium |
+
+**Total Effort**: 11 days
+
+### Exit Criteria
+
+- [ ] Complete consensus round traces
+- [ ] Phase transitions visible
+- [ ] Proposals and validations traced
+- [ ] No impact on consensus timing
+- [ ] Multi-validator test network validated
+
+---
+
+## 6.6 Phase 5: Documentation & Deployment (Week 9)
+
+**Objective**: Production readiness
+
+### Tasks
+
+| Task | Description | Effort | Risk |
+| ---- | ----------------------------- | ------ | ---- |
+| 5.1 | Operator runbook | 1d | Low |
+| 5.2 | Grafana dashboards | 1d | Low |
+| 5.3 | Alert definitions | 0.5d | Low |
+| 5.4 | Collector deployment examples | 0.5d | Low |
+| 5.5 | Developer documentation | 1d | Low |
+| 5.6 | Training materials | 0.5d | Low |
+| 5.7 | Final integration testing | 0.5d | Low |
+
+**Total Effort**: 5 days
+
+---
+
+## 6.7 Risk Assessment
+
+```mermaid
+quadrantChart
+ title Risk Assessment Matrix
+ x-axis Low Impact --> High Impact
+ y-axis Low Likelihood --> High Likelihood
+ quadrant-1 Monitor Closely
+ quadrant-2 Mitigate Immediately
+ quadrant-3 Accept Risk
+ quadrant-4 Plan Mitigation
+
+ SDK Compatibility: [0.25, 0.2]
+ Protocol Changes: [0.75, 0.65]
+ Performance Overhead: [0.65, 0.45]
+ Context Propagation: [0.5, 0.5]
+ Memory Leaks: [0.8, 0.2]
+```
+
+### Risk Details
+
+| Risk | Likelihood | Impact | Mitigation |
+| ------------------------------------ | ---------- | ------ | --------------------------------------- |
+| Protocol changes break compatibility | Medium | High | Use high field numbers, optional fields |
+| Performance overhead unacceptable | Medium | Medium | Sampling, conditional compilation |
+| Context propagation complexity | Medium | Medium | Phased rollout, extensive testing |
+| SDK compatibility issues | Low | Medium | Pin SDK version, fallback to no-op |
+| Memory leaks in long-running nodes | Low | High | Memory profiling, bounded queues |
+
+---
+
+## 6.8 Success Metrics
+
+| Metric | Target | Measurement |
+| ------------------------ | ------------------------------ | --------------------- |
+| Trace coverage | >95% of transactions | Sampling verification |
+| CPU overhead | <3% | Benchmark tests |
+| Memory overhead | <5 MB | Memory profiling |
+| Latency impact (p99) | <2% | Performance tests |
+| Trace completeness | >99% spans with required attrs | Validation script |
+| Cross-node trace linkage | >90% of multi-hop transactions | Integration tests |
+
+---
+
+## 6.9 Effort Summary
+
+
+
+```mermaid
+%%{init: {'pie': {'textPosition': 0.75}}}%%
+pie showData
+ "Phase 1: Core Infrastructure" : 10
+ "Phase 2: RPC Tracing" : 10
+ "Phase 3: Transaction Tracing" : 11
+ "Phase 4: Consensus Tracing" : 11
+ "Phase 5: Documentation" : 5
+```
+
+**Total Effort Distribution (47 developer-days)**
+
+
+
+### Resource Requirements
+
+| Phase | Developers | Duration | Total Effort |
+| --------- | ---------- | ----------- | ------------ |
+| 1 | 2 | 2 weeks | 10 days |
+| 2 | 1-2 | 2 weeks | 10 days |
+| 3 | 2 | 2 weeks | 11 days |
+| 4 | 2 | 2 weeks | 11 days |
+| 5 | 1 | 1 week | 5 days |
+| **Total** | **2** | **9 weeks** | **47 days** |
+
+---
+
+## 6.10 Quick Wins and Crawl-Walk-Run Strategy
+
+This section outlines a prioritized approach to maximize ROI with minimal initial investment.
+
+### 6.10.1 Crawl-Walk-Run Overview
+
+
+
+```mermaid
+flowchart TB
+ subgraph crawl["🐢 CRAWL (Week 1-2)"]
+ direction LR
+ c1[Core SDK Setup] ~~~ c2[RPC Tracing Only] ~~~ c3[Single Node]
+ end
+
+ subgraph walk["🚶 WALK (Week 3-5)"]
+ direction LR
+ w1[Transaction Tracing] ~~~ w2[Cross-Node Context] ~~~ w3[Basic Dashboards]
+ end
+
+ subgraph run["🏃 RUN (Week 6-9)"]
+ direction LR
+ r1[Consensus Tracing] ~~~ r2[Full Correlation] ~~~ r3[Production Deploy]
+ end
+
+ crawl --> walk --> run
+
+ style crawl fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style walk fill:#bf360c,stroke:#8c2809,color:#fff
+ style run fill:#0d47a1,stroke:#082f6a,color:#fff
+ style c1 fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style c2 fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style c3 fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style w1 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
+ style w2 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
+ style w3 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
+ style r1 fill:#0d47a1,stroke:#082f6a,color:#fff
+ style r2 fill:#0d47a1,stroke:#082f6a,color:#fff
+ style r3 fill:#0d47a1,stroke:#082f6a,color:#fff
+```
+
+
+
+### 6.10.2 Quick Wins (Immediate Value)
+
+| Quick Win | Effort | Value | When to Deploy |
+| ------------------------------ | -------- | ------ | -------------- |
+| **RPC Command Tracing** | 2 days | High | Week 2 |
+| **RPC Latency Histograms** | 0.5 days | High | Week 2 |
+| **Error Rate Dashboard** | 0.5 days | Medium | Week 2 |
+| **Transaction Submit Tracing** | 1 day | High | Week 3 |
+| **Consensus Round Duration** | 1 day | Medium | Week 6 |
+
+### 6.10.3 CRAWL Phase (Weeks 1-2)
+
+**Goal**: Get basic tracing working with minimal code changes.
+
+**What You Get**:
+
+- RPC request/response traces for all commands
+- Latency breakdown per RPC command
+- Error visibility with stack traces
+- Basic Grafana dashboard
+
+**Code Changes**: ~15 lines in `ServerHandler.cpp`, ~40 lines in new telemetry module
+
+**Why Start Here**:
+
+- RPC is the lowest-risk, highest-visibility component
+- Immediate value for debugging client issues
+- No cross-node complexity
+- Single file modification to existing code
+
+### 6.10.4 WALK Phase (Weeks 3-5)
+
+**Goal**: Add transaction lifecycle tracing across nodes.
+
+**What You Get**:
+
+- End-to-end transaction traces from submit to relay
+- Cross-node correlation (see transaction path)
+- HashRouter deduplication visibility
+- Relay latency metrics
+
+**Code Changes**: ~120 lines across 4 files, plus protobuf extension
+
+**Why Do This Second**:
+
+- Builds on RPC tracing (transactions submitted via RPC)
+- Moderate complexity (requires context propagation)
+- High value for debugging transaction issues
+
+### 6.10.5 RUN Phase (Weeks 6-9)
+
+**Goal**: Full observability including consensus.
+
+**What You Get**:
+
+- Complete consensus round visibility
+- Phase transition timing
+- Validator proposal tracking
+- Full end-to-end traces (client → RPC → TX → consensus → ledger)
+
+**Code Changes**: ~100 lines across 3 consensus files
+
+**Why Do This Last**:
+
+- Highest complexity (consensus is critical path)
+- Requires thorough testing
+- Lower relative value (consensus issues are rarer)
+
+### 6.10.6 ROI Prioritization Matrix
+
+```mermaid
+quadrantChart
+ title Implementation ROI Matrix
+ x-axis Low Effort --> High Effort
+ y-axis Low Value --> High Value
+ quadrant-1 Quick Wins - Do First
+ quadrant-2 Major Projects - Plan Carefully
+ quadrant-3 Nice to Have - Optional
+ quadrant-4 Time Sinks - Avoid
+
+ RPC Tracing: [0.15, 0.9]
+ TX Submit Trace: [0.25, 0.85]
+ TX Relay Trace: [0.5, 0.8]
+ Consensus Trace: [0.7, 0.75]
+ Peer Message Trace: [0.85, 0.3]
+ Ledger Acquire: [0.55, 0.5]
+```
+
+---
+
+## 6.11 Definition of Done
+
+Clear, measurable criteria for each phase.
+
+### 6.11.1 Phase 1: Core Infrastructure
+
+| Criterion | Measurement | Target |
+| --------------- | ---------------------------------------------------------- | ---------------------------- |
+| SDK Integration | `cmake --build` succeeds with `-DXRPL_ENABLE_TELEMETRY=ON` | ✅ Compiles |
+| Runtime Toggle | `enabled=0` produces zero overhead | <0.1% CPU difference |
+| Span Creation | Unit test creates and exports span | Span appears in Jaeger |
+| Configuration | All config options parsed correctly | Config validation tests pass |
+| Documentation | Developer guide exists | PR approved |
+
+**Definition of Done**: All criteria met, PR merged, no regressions in CI.
+
+### 6.11.2 Phase 2: RPC Tracing
+
+| Criterion | Measurement | Target |
+| ------------------ | ---------------------------------- | -------------------------- |
+| Coverage | All RPC commands instrumented | 100% of commands |
+| Context Extraction | traceparent header propagates | Integration test passes |
+| Attributes | Command, status, duration recorded | Validation script confirms |
+| Performance | RPC latency overhead | <1ms p99 |
+| Dashboard | Grafana dashboard deployed | Screenshot in docs |
+
+**Definition of Done**: RPC traces visible in Jaeger/Tempo for all commands, dashboard shows latency distribution.
+
+### 6.11.3 Phase 3: Transaction Tracing
+
+| Criterion | Measurement | Target |
+| ---------------- | ------------------------------- | ---------------------------------- |
+| Local Trace | Submit → validate → TxQ traced | Single-node test passes |
+| Cross-Node | Context propagates via protobuf | Multi-node test passes |
+| Relay Visibility | relay_count attribute correct | Spot check 100 txs |
+| HashRouter | Deduplication visible in trace | Duplicate txs show suppressed=true |
+| Performance | TX throughput overhead | <5% degradation |
+
+**Definition of Done**: Transaction traces span 3+ nodes in test network, performance within bounds.
+
+### 6.11.4 Phase 4: Consensus Tracing
+
+| Criterion | Measurement | Target |
+| -------------------- | ----------------------------- | ------------------------- |
+| Round Tracing | startRound creates root span | Unit test passes |
+| Phase Visibility | All phases have child spans | Integration test confirms |
+| Proposer Attribution | Proposer ID in attributes | Spot check 50 rounds |
+| Timing Accuracy | Phase durations match PerfLog | <5% variance |
+| No Consensus Impact | Round timing unchanged | Performance test passes |
+
+**Definition of Done**: Consensus rounds fully traceable, no impact on consensus timing.
+
+### 6.11.5 Phase 5: Production Deployment
+
+| Criterion | Measurement | Target |
+| ------------ | ---------------------------- | -------------------------- |
+| Collector HA | Multiple collectors deployed | No single point of failure |
+| Sampling | Tail sampling configured | 10% base + errors + slow |
+| Retention | Data retained per policy | 7 days hot, 30 days warm |
+| Alerting | Alerts configured | Error spike, high latency |
+| Runbook | Operator documentation | Approved by ops team |
+| Training | Team trained | Session completed |
+
+**Definition of Done**: Telemetry running in production, operators trained, alerts active.
+
+### 6.11.6 Success Metrics Summary
+
+| Phase | Primary Metric | Secondary Metric | Deadline |
+| ------- | ---------------------- | --------------------------- | ------------- |
+| Phase 1 | SDK compiles and runs | Zero overhead when disabled | End of Week 2 |
+| Phase 2 | 100% RPC coverage | <1ms latency overhead | End of Week 4 |
+| Phase 3 | Cross-node traces work | <5% throughput impact | End of Week 6 |
+| Phase 4 | Consensus fully traced | No consensus timing impact | End of Week 8 |
+| Phase 5 | Production deployment | Operators trained | End of Week 9 |
+
+---
+
+## 6.12 Recommended Implementation Order
+
+Based on ROI analysis, implement in this exact order:
+
+```mermaid
+flowchart TB
+ subgraph week1["Week 1"]
+ t1[1. OpenTelemetry SDK
Conan/CMake integration]
+ t2[2. Telemetry interface
SpanGuard, config]
+ end
+
+ subgraph week2["Week 2"]
+ t3[3. RPC ServerHandler
instrumentation]
+ t4[4. Basic Jaeger setup
for testing]
+ end
+
+ subgraph week3["Week 3"]
+ t5[5. Transaction submit
tracing]
+ t6[6. Grafana dashboard
v1]
+ end
+
+ subgraph week4["Week 4"]
+ t7[7. Protobuf context
extension]
+ t8[8. PeerImp tx.relay
instrumentation]
+ end
+
+ subgraph week5["Week 5"]
+ t9[9. Multi-node
integration tests]
+ t10[10. Performance
benchmarks]
+ end
+
+ subgraph week6_8["Weeks 6-8"]
+ t11[11. Consensus
instrumentation]
+ t12[12. Full integration
testing]
+ end
+
+ subgraph week9["Week 9"]
+ t13[13. Production
deployment]
+ t14[14. Documentation
& training]
+ end
+
+ t1 --> t2 --> t3 --> t4
+ t4 --> t5 --> t6
+ t6 --> t7 --> t8
+ t8 --> t9 --> t10
+ t10 --> t11 --> t12
+ t12 --> t13 --> t14
+
+ style week1 fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style week2 fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style week3 fill:#bf360c,stroke:#8c2809,color:#fff
+ style week4 fill:#bf360c,stroke:#8c2809,color:#fff
+ style week5 fill:#bf360c,stroke:#8c2809,color:#fff
+ style week6_8 fill:#0d47a1,stroke:#082f6a,color:#fff
+ style week9 fill:#4a148c,stroke:#2e0d57,color:#fff
+ style t1 fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style t2 fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style t3 fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style t4 fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style t5 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
+ style t6 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
+ style t7 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
+ style t8 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
+ style t9 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
+ style t10 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
+ style t11 fill:#0d47a1,stroke:#082f6a,color:#fff
+ style t12 fill:#0d47a1,stroke:#082f6a,color:#fff
+ style t13 fill:#4a148c,stroke:#2e0d57,color:#fff
+ style t14 fill:#4a148c,stroke:#2e0d57,color:#fff
+```
+
+---
+
+_Previous: [Configuration Reference](./05-configuration-reference.md)_ | _Next: [Observability Backends](./07-observability-backends.md)_ | _Back to: [Overview](./OpenTelemetryPlan.md)_
diff --git a/OpenTelemetryPlan/07-observability-backends.md b/OpenTelemetryPlan/07-observability-backends.md
new file mode 100644
index 00000000000..a90f41ae43f
--- /dev/null
+++ b/OpenTelemetryPlan/07-observability-backends.md
@@ -0,0 +1,595 @@
+# Observability Backend Recommendations
+
+> **Parent Document**: [OpenTelemetryPlan.md](./OpenTelemetryPlan.md)
+> **Related**: [Implementation Phases](./06-implementation-phases.md) | [Appendix](./08-appendix.md)
+
+---
+
+## 7.1 Development/Testing Backends
+
+| Backend | Pros | Cons | Use Case |
+| ---------- | ------------------- | ----------------- | ----------------- |
+| **Jaeger** | Easy setup, good UI | Limited retention | Local dev, CI |
+| **Zipkin** | Simple, lightweight | Basic features | Quick prototyping |
+
+### Quick Start with Jaeger
+
+```bash
+# Start Jaeger with OTLP support
+docker run -d --name jaeger \
+ -e COLLECTOR_OTLP_ENABLED=true \
+ -p 16686:16686 \
+ -p 4317:4317 \
+ -p 4318:4318 \
+ jaegertracing/all-in-one:latest
+```
+
+---
+
+## 7.2 Production Backends
+
+| Backend | Pros | Cons | Use Case |
+| ----------------- | ----------------------------------------- | ------------------ | --------------------------- |
+| **Grafana Tempo** | Cost-effective, Grafana integration | Newer project | Most production deployments |
+| **Elastic APM** | Full observability stack, log correlation | Resource intensive | Existing Elastic users |
+| **Honeycomb** | Excellent query, high cardinality | SaaS cost | Deep debugging needs |
+| **Datadog APM** | Full platform, easy setup | SaaS cost | Enterprise with budget |
+
+### Backend Selection Flowchart
+
+```mermaid
+flowchart TD
+ start[Select Backend] --> budget{Budget
Constraints?}
+
+ budget -->|Yes| oss[Open Source]
+ budget -->|No| saas{Prefer
SaaS?}
+
+ oss --> existing{Existing
Stack?}
+ existing -->|Grafana| tempo[Grafana Tempo]
+ existing -->|Elastic| elastic[Elastic APM]
+ existing -->|None| tempo
+
+ saas -->|Yes| enterprise{Enterprise
Support?}
+ saas -->|No| oss
+
+ enterprise -->|Yes| datadog[Datadog APM]
+ enterprise -->|No| honeycomb[Honeycomb]
+
+ tempo --> final[Configure Collector]
+ elastic --> final
+ honeycomb --> final
+ datadog --> final
+
+ style start fill:#0f172a,stroke:#020617,color:#fff
+ style budget fill:#334155,stroke:#1e293b,color:#fff
+ style oss fill:#1e293b,stroke:#0f172a,color:#fff
+ style existing fill:#334155,stroke:#1e293b,color:#fff
+ style saas fill:#334155,stroke:#1e293b,color:#fff
+ style enterprise fill:#334155,stroke:#1e293b,color:#fff
+ style final fill:#0f172a,stroke:#020617,color:#fff
+ style tempo fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style elastic fill:#bf360c,stroke:#8c2809,color:#fff
+ style honeycomb fill:#0d47a1,stroke:#082f6a,color:#fff
+ style datadog fill:#4a148c,stroke:#2e0d57,color:#fff
+```
+
+---
+
+## 7.3 Recommended Production Architecture
+
+```mermaid
+flowchart TB
+ subgraph validators["Validator Nodes"]
+ v1[rippled
Validator 1]
+ v2[rippled
Validator 2]
+ end
+
+ subgraph stock["Stock Nodes"]
+ s1[rippled
Stock 1]
+ s2[rippled
Stock 2]
+ end
+
+ subgraph collector["OTel Collector Cluster"]
+ c1[Collector
DC1]
+ c2[Collector
DC2]
+ end
+
+ subgraph backends["Storage Backends"]
+ tempo[(Grafana
Tempo)]
+ elastic[(Elastic
APM)]
+ archive[(S3/GCS
Archive)]
+ end
+
+ subgraph ui["Visualization"]
+ grafana[Grafana
Dashboards]
+ end
+
+ v1 -->|OTLP| c1
+ v2 -->|OTLP| c1
+ s1 -->|OTLP| c2
+ s2 -->|OTLP| c2
+
+ c1 --> tempo
+ c1 --> elastic
+ c2 --> tempo
+ c2 --> archive
+
+ tempo --> grafana
+ elastic --> grafana
+
+ style validators fill:#b71c1c,stroke:#7f1d1d,color:#ffffff
+ style stock fill:#0d47a1,stroke:#082f6a,color:#ffffff
+ style collector fill:#bf360c,stroke:#8c2809,color:#ffffff
+ style backends fill:#1b5e20,stroke:#0d3d14,color:#ffffff
+ style ui fill:#4a148c,stroke:#2e0d57,color:#ffffff
+```
+
+---
+
+## 7.4 Architecture Considerations
+
+### 7.4.1 Collector Placement
+
+| Strategy | Description | Pros | Cons |
+| ------------- | -------------------- | ------------------------ | ----------------------- |
+| **Sidecar** | Collector per node | Isolation, simple config | Resource overhead |
+| **DaemonSet** | Collector per host | Shared resources | Complexity |
+| **Gateway** | Central collector(s) | Centralized processing | Single point of failure |
+
+**Recommendation**: Use **Gateway** pattern with regional collectors for rippled networks:
+
+- One collector cluster per datacenter/region
+- Tail-based sampling at collector level
+- Multiple export destinations for redundancy
+
+### 7.4.2 Sampling Strategy
+
+```mermaid
+flowchart LR
+ subgraph head["Head Sampling (Node)"]
+ hs[10% probabilistic]
+ end
+
+ subgraph tail["Tail Sampling (Collector)"]
+ ts1[Keep all errors]
+ ts2[Keep slow >5s]
+ ts3[Keep 10% rest]
+ end
+
+ head --> tail
+
+ ts1 --> final[Final Traces]
+ ts2 --> final
+ ts3 --> final
+
+ style head fill:#0d47a1,stroke:#082f6a,color:#fff
+ style tail fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style hs fill:#0d47a1,stroke:#082f6a,color:#fff
+ style ts1 fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style ts2 fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style ts3 fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style final fill:#bf360c,stroke:#8c2809,color:#fff
+```
+
+### 7.4.3 Data Retention
+
+| Environment | Hot Storage | Warm Storage | Cold Archive |
+| ----------- | ----------- | ------------ | ------------ |
+| Development | 24 hours | N/A | N/A |
+| Staging | 7 days | N/A | N/A |
+| Production | 7 days | 30 days | many years |
+
+---
+
+## 7.5 Integration Checklist
+
+- [ ] Choose primary backend (Tempo recommended for cost/features)
+- [ ] Deploy collector cluster with high availability
+- [ ] Configure tail-based sampling for error/latency traces
+- [ ] Set up Grafana dashboards for trace visualization
+- [ ] Configure alerts for trace anomalies
+- [ ] Establish data retention policies
+- [ ] Test trace correlation with logs and metrics
+
+---
+
+## 7.6 Grafana Dashboard Examples
+
+Pre-built dashboards for rippled observability.
+
+### 7.6.1 Consensus Health Dashboard
+
+```json
+{
+ "title": "rippled Consensus Health",
+ "uid": "rippled-consensus-health",
+ "tags": ["rippled", "consensus", "tracing"],
+ "panels": [
+ {
+ "title": "Consensus Round Duration",
+ "type": "timeseries",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\" && name=\"consensus.round\"} | avg(duration) by (resource.service.instance.id)"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ms",
+ "thresholds": {
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 4000 },
+ { "color": "red", "value": 5000 }
+ ]
+ }
+ }
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }
+ },
+ {
+ "title": "Phase Duration Breakdown",
+ "type": "barchart",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\" && name=~\"consensus.phase.*\"} | avg(duration) by (name)"
+ }
+ ],
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }
+ },
+ {
+ "title": "Proposers per Round",
+ "type": "stat",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\" && name=\"consensus.round\"} | avg(span.xrpl.consensus.proposers)"
+ }
+ ],
+ "gridPos": { "h": 4, "w": 6, "x": 0, "y": 8 }
+ },
+ {
+ "title": "Recent Slow Rounds (>5s)",
+ "type": "table",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\" && name=\"consensus.round\"} | duration > 5s"
+ }
+ ],
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 }
+ }
+ ]
+}
+```
+
+### 7.6.2 Node Overview Dashboard
+
+```json
+{
+ "title": "rippled Node Overview",
+ "uid": "rippled-node-overview",
+ "panels": [
+ {
+ "title": "Active Nodes",
+ "type": "stat",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\"} | count_over_time() by (resource.service.instance.id) | count()"
+ }
+ ],
+ "gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 }
+ },
+ {
+ "title": "Total Transactions (1h)",
+ "type": "stat",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\" && name=\"tx.receive\"} | count()"
+ }
+ ],
+ "gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 }
+ },
+ {
+ "title": "Error Rate",
+ "type": "gauge",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\" && status.code=error} | rate() / {resource.service.name=\"rippled\"} | rate() * 100"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "percent",
+ "max": 10,
+ "thresholds": {
+ "steps": [
+ { "color": "green", "value": null },
+ { "color": "yellow", "value": 1 },
+ { "color": "red", "value": 5 }
+ ]
+ }
+ }
+ },
+ "gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 }
+ },
+ {
+ "title": "Service Map",
+ "type": "nodeGraph",
+ "datasource": "Tempo",
+ "gridPos": { "h": 12, "w": 12, "x": 12, "y": 0 }
+ }
+ ]
+}
+```
+
+### 7.6.3 Alert Rules
+
+```yaml
+# grafana/provisioning/alerting/rippled-alerts.yaml
+apiVersion: 1
+
+groups:
+ - name: rippled-tracing-alerts
+ folder: rippled
+ interval: 1m
+ rules:
+ - uid: consensus-slow
+ title: Consensus Round Slow
+ condition: A
+ data:
+ - refId: A
+ datasourceUid: tempo
+ model:
+ queryType: traceql
+ query: '{resource.service.name="rippled" && name="consensus.round"} | avg(duration) > 5s'
+ for: 5m
+ annotations:
+ summary: Consensus rounds taking >5 seconds
+ description: "Consensus duration: {{ $value }}ms"
+ labels:
+ severity: warning
+
+ - uid: rpc-error-spike
+ title: RPC Error Rate Spike
+ condition: B
+ data:
+ - refId: B
+ datasourceUid: tempo
+ model:
+ queryType: traceql
+ query: '{resource.service.name="rippled" && name=~"rpc.command.*" && status.code=error} | rate() > 0.05'
+ for: 2m
+ annotations:
+ summary: RPC error rate >5%
+ labels:
+ severity: critical
+
+ - uid: tx-throughput-drop
+ title: Transaction Throughput Drop
+ condition: C
+ data:
+ - refId: C
+ datasourceUid: tempo
+ model:
+ queryType: traceql
+ query: '{resource.service.name="rippled" && name="tx.receive"} | rate() < 10'
+ for: 10m
+ annotations:
+ summary: Transaction throughput below threshold
+ labels:
+ severity: warning
+```
+
+---
+
+## 7.7 PerfLog and Insight Correlation
+
+How to correlate OpenTelemetry traces with existing rippled observability.
+
+### 7.7.1 Correlation Architecture
+
+```mermaid
+flowchart TB
+ subgraph rippled["rippled Node"]
+ otel[OpenTelemetry
Spans]
+ perflog[PerfLog
JSON Logs]
+ insight[Beast Insight
StatsD Metrics]
+ end
+
+ subgraph collectors["Data Collection"]
+ otelc[OTel Collector]
+ promtail[Promtail/Fluentd]
+ statsd[StatsD Exporter]
+ end
+
+ subgraph storage["Storage"]
+ tempo[(Tempo)]
+ loki[(Loki)]
+ prom[(Prometheus)]
+ end
+
+ subgraph grafana["Grafana"]
+ traces[Trace View]
+ logs[Log View]
+ metrics[Metrics View]
+ corr[Correlation
Panel]
+ end
+
+ otel -->|OTLP| otelc --> tempo
+ perflog -->|JSON| promtail --> loki
+ insight -->|StatsD| statsd --> prom
+
+ tempo --> traces
+ loki --> logs
+ prom --> metrics
+
+ traces --> corr
+ logs --> corr
+ metrics --> corr
+
+ style rippled fill:#0d47a1,stroke:#082f6a,color:#fff
+ style collectors fill:#bf360c,stroke:#8c2809,color:#fff
+ style storage fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style grafana fill:#4a148c,stroke:#2e0d57,color:#fff
+ style otel fill:#0d47a1,stroke:#082f6a,color:#fff
+ style perflog fill:#0d47a1,stroke:#082f6a,color:#fff
+ style insight fill:#0d47a1,stroke:#082f6a,color:#fff
+ style otelc fill:#bf360c,stroke:#8c2809,color:#fff
+ style promtail fill:#bf360c,stroke:#8c2809,color:#fff
+ style statsd fill:#bf360c,stroke:#8c2809,color:#fff
+ style tempo fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style loki fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style prom fill:#1b5e20,stroke:#0d3d14,color:#fff
+ style traces fill:#4a148c,stroke:#2e0d57,color:#fff
+ style logs fill:#4a148c,stroke:#2e0d57,color:#fff
+ style metrics fill:#4a148c,stroke:#2e0d57,color:#fff
+ style corr fill:#4a148c,stroke:#2e0d57,color:#fff
+```
+
+### 7.7.2 Correlation Fields
+
+| Source | Field | Link To | Purpose |
+| ----------- | --------------------------- | ------------- | -------------------------- |
+| **Trace** | `trace_id` | Logs | Find log entries for trace |
+| **Trace** | `xrpl.tx.hash` | Logs, Metrics | Find TX-related data |
+| **Trace** | `xrpl.consensus.ledger.seq` | Logs | Find ledger-related logs |
+| **PerfLog** | `trace_id` (new) | Traces | Jump to trace from log |
+| **PerfLog** | `ledger_seq` | Traces | Find consensus trace |
+| **Insight** | `exemplar.trace_id` | Traces | Jump from metric spike |
+
+### 7.7.3 Example: Debugging a Slow Transaction
+
+**Step 1: Find the trace**
+
+```
+# In Grafana Explore with Tempo
+{resource.service.name="rippled" && span.xrpl.tx.hash="ABC123..."}
+```
+
+**Step 2: Get the trace_id from the trace view**
+
+```
+Trace ID: 4bf92f3577b34da6a3ce929d0e0e4736
+```
+
+**Step 3: Find related PerfLog entries**
+
+```
+# In Grafana Explore with Loki
+{job="rippled"} |= "4bf92f3577b34da6a3ce929d0e0e4736"
+```
+
+**Step 4: Check Insight metrics for the time window**
+
+```
+# In Grafana with Prometheus
+rate(rippled_tx_applied_total[1m])
+ @ timestamp_from_trace
+```
+
+### 7.7.4 Unified Dashboard Example
+
+```json
+{
+ "title": "rippled Unified Observability",
+ "uid": "rippled-unified",
+ "panels": [
+ {
+ "title": "Transaction Latency (Traces)",
+ "type": "timeseries",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\" && name=\"tx.receive\"} | histogram_over_time(duration)"
+ }
+ ],
+ "gridPos": { "h": 6, "w": 8, "x": 0, "y": 0 }
+ },
+ {
+ "title": "Transaction Rate (Metrics)",
+ "type": "timeseries",
+ "datasource": "Prometheus",
+ "targets": [
+ {
+ "expr": "rate(rippled_tx_received_total[5m])",
+ "legendFormat": "{{ instance }}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "links": [
+ {
+ "title": "View traces",
+ "url": "/explore?left={\"datasource\":\"Tempo\",\"query\":\"{resource.service.name=\\\"rippled\\\" && name=\\\"tx.receive\\\"}\"}"
+ }
+ ]
+ }
+ },
+ "gridPos": { "h": 6, "w": 8, "x": 8, "y": 0 }
+ },
+ {
+ "title": "Recent Logs",
+ "type": "logs",
+ "datasource": "Loki",
+ "targets": [
+ {
+ "expr": "{job=\"rippled\"} | json"
+ }
+ ],
+ "gridPos": { "h": 6, "w": 8, "x": 16, "y": 0 }
+ },
+ {
+ "title": "Trace Search",
+ "type": "table",
+ "datasource": "Tempo",
+ "targets": [
+ {
+ "queryType": "traceql",
+ "query": "{resource.service.name=\"rippled\"}"
+ }
+ ],
+ "fieldConfig": {
+ "overrides": [
+ {
+ "matcher": { "id": "byName", "options": "traceID" },
+ "properties": [
+ {
+ "id": "links",
+ "value": [
+ {
+ "title": "View trace",
+ "url": "/explore?left={\"datasource\":\"Tempo\",\"query\":\"${__value.raw}\"}"
+ },
+ {
+ "title": "View logs",
+ "url": "/explore?left={\"datasource\":\"Loki\",\"query\":\"{job=\\\"rippled\\\"} |= \\\"${__value.raw}\\\"\"}"
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ "gridPos": { "h": 12, "w": 24, "x": 0, "y": 6 }
+ }
+ ]
+}
+```
+
+---
+
+_Previous: [Implementation Phases](./06-implementation-phases.md)_ | _Next: [Appendix](./08-appendix.md)_ | _Back to: [Overview](./OpenTelemetryPlan.md)_
diff --git a/OpenTelemetryPlan/08-appendix.md b/OpenTelemetryPlan/08-appendix.md
new file mode 100644
index 00000000000..98470dd13cb
--- /dev/null
+++ b/OpenTelemetryPlan/08-appendix.md
@@ -0,0 +1,133 @@
+# Appendix
+
+> **Parent Document**: [OpenTelemetryPlan.md](./OpenTelemetryPlan.md)
+> **Related**: [Observability Backends](./07-observability-backends.md)
+
+---
+
+## 8.1 Glossary
+
+| Term | Definition |
+| --------------------- | ---------------------------------------------------------- |
+| **Span** | A unit of work with start/end time, name, and attributes |
+| **Trace** | A collection of spans representing a complete request flow |
+| **Trace ID** | 128-bit unique identifier for a trace |
+| **Span ID** | 64-bit unique identifier for a span within a trace |
+| **Context** | Carrier for trace/span IDs across boundaries |
+| **Propagator** | Component that injects/extracts context |
+| **Sampler** | Decides which traces to record |
+| **Exporter** | Sends spans to backend |
+| **Collector** | Receives, processes, and forwards telemetry |
+| **OTLP** | OpenTelemetry Protocol (wire format) |
+| **W3C Trace Context** | Standard HTTP headers for trace propagation |
+| **Baggage** | Key-value pairs propagated across service boundaries |
+| **Resource** | Entity producing telemetry (service, host, etc.) |
+| **Instrumentation** | Code that creates telemetry data |
+
+### rippled-Specific Terms
+
+| Term | Definition |
+| ----------------- | -------------------------------------------------- |
+| **Overlay** | P2P network layer managing peer connections |
+| **Consensus** | XRP Ledger consensus algorithm (RCL) |
+| **Proposal** | Validator's suggested transaction set for a ledger |
+| **Validation** | Validator's signature on a closed ledger |
+| **HashRouter** | Component for transaction deduplication |
+| **JobQueue** | Thread pool for asynchronous task execution |
+| **PerfLog** | Existing performance logging system in rippled |
+| **Beast Insight** | Existing metrics framework in rippled |
+
+---
+
+## 8.2 Span Hierarchy Visualization
+
+```mermaid
+flowchart TB
+ subgraph trace["Trace: Transaction Lifecycle"]
+ rpc["rpc.submit
(entry point)"]
+ validate["tx.validate"]
+ relay["tx.relay
(parent span)"]
+
+ subgraph peers["Peer Spans"]
+ p1["peer.send
Peer A"]
+ p2["peer.send
Peer B"]
+ p3["peer.send
Peer C"]
+ end
+
+ consensus["consensus.round"]
+ apply["tx.apply"]
+ end
+
+ rpc --> validate
+ validate --> relay
+ relay --> p1
+ relay --> p2
+ relay --> p3
+ p1 -.->|"context propagation"| consensus
+ consensus --> apply
+
+ style trace fill:#0f172a,stroke:#020617,color:#fff
+ style peers fill:#1e3a8a,stroke:#172554,color:#fff
+ style rpc fill:#1d4ed8,stroke:#1e40af,color:#fff
+ style validate fill:#047857,stroke:#064e3b,color:#fff
+ style relay fill:#047857,stroke:#064e3b,color:#fff
+ style p1 fill:#0e7490,stroke:#155e75,color:#fff
+ style p2 fill:#0e7490,stroke:#155e75,color:#fff
+ style p3 fill:#0e7490,stroke:#155e75,color:#fff
+ style consensus fill:#fef3c7,stroke:#fde68a,color:#1e293b
+ style apply fill:#047857,stroke:#064e3b,color:#fff
+```
+
+---
+
+## 8.3 References
+
+### OpenTelemetry Resources
+
+1. [OpenTelemetry C++ SDK](https://github.com/open-telemetry/opentelemetry-cpp)
+2. [OpenTelemetry Specification](https://opentelemetry.io/docs/specs/otel/)
+3. [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/)
+4. [OTLP Protocol Specification](https://opentelemetry.io/docs/specs/otlp/)
+
+### Standards
+
+5. [W3C Trace Context](https://www.w3.org/TR/trace-context/)
+6. [W3C Baggage](https://www.w3.org/TR/baggage/)
+7. [Protocol Buffers](https://protobuf.dev/)
+
+### rippled Resources
+
+8. [rippled Source Code](https://github.com/XRPLF/rippled)
+9. [XRP Ledger Documentation](https://xrpl.org/docs/)
+10. [rippled Overlay README](https://github.com/XRPLF/rippled/blob/develop/src/xrpld/overlay/README.md)
+11. [rippled RPC README](https://github.com/XRPLF/rippled/blob/develop/src/xrpld/rpc/README.md)
+12. [rippled Consensus README](https://github.com/XRPLF/rippled/blob/develop/src/xrpld/app/consensus/README.md)
+
+---
+
+## 8.4 Version History
+
+| Version | Date | Author | Changes |
+| ------- | ---------- | ------ | --------------------------------- |
+| 1.0 | 2026-02-12 | - | Initial implementation plan |
+| 1.1 | 2026-02-13 | - | Refactored into modular documents |
+
+---
+
+## 8.5 Document Index
+
+| Document | Description |
+| ---------------------------------------------------------------- | ------------------------------------------ |
+| [OpenTelemetryPlan.md](./OpenTelemetryPlan.md) | Master overview and executive summary |
+| [01-architecture-analysis.md](./01-architecture-analysis.md) | rippled architecture and trace points |
+| [02-design-decisions.md](./02-design-decisions.md) | SDK selection, exporters, span conventions |
+| [03-implementation-strategy.md](./03-implementation-strategy.md) | Directory structure, performance analysis |
+| [04-code-samples.md](./04-code-samples.md) | C++ code examples for all components |
+| [05-configuration-reference.md](./05-configuration-reference.md) | rippled config, CMake, Collector configs |
+| [06-implementation-phases.md](./06-implementation-phases.md) | Timeline, tasks, risks, success metrics |
+| [07-observability-backends.md](./07-observability-backends.md) | Backend selection and architecture |
+| [08-appendix.md](./08-appendix.md) | Glossary, references, version history |
+
+---
+
+_Previous: [Observability Backends](./07-observability-backends.md)_ | _Back to: [Overview](./OpenTelemetryPlan.md)_
diff --git a/OpenTelemetryPlan/OpenTelemetryPlan.md b/OpenTelemetryPlan/OpenTelemetryPlan.md
new file mode 100644
index 00000000000..96a1b697dea
--- /dev/null
+++ b/OpenTelemetryPlan/OpenTelemetryPlan.md
@@ -0,0 +1,190 @@
+# [OpenTelemetry](00-tracing-fundamentals.md) Distributed Tracing Implementation Plan for rippled (xrpld)
+
+## Executive Summary
+
+This document provides a comprehensive implementation plan for integrating OpenTelemetry distributed tracing into the rippled XRP Ledger node software. The plan addresses the unique challenges of a decentralized peer-to-peer system where trace context must propagate across network boundaries between independent nodes.
+
+### Key Benefits
+
+- **End-to-end transaction visibility**: Track transactions from submission through consensus to ledger inclusion
+- **Consensus round analysis**: Understand timing and behavior of consensus phases across validators
+- **RPC performance insights**: Identify slow handlers and optimize response times
+- **Network topology understanding**: Visualize message propagation patterns between peers
+- **Incident debugging**: Correlate events across distributed nodes during issues
+
+### Estimated Performance Overhead
+
+| Metric | Overhead | Notes |
+| ------------- | ---------- | ----------------------------------- |
+| CPU | 1-3% | Span creation and attribute setting |
+| Memory | 2-5 MB | Batch buffer for pending spans |
+| Network | 10-50 KB/s | Compressed OTLP export to collector |
+| Latency (p99) | <2% | With proper sampling configuration |
+
+---
+
+## Document Structure
+
+This implementation plan is organized into modular documents for easier navigation:
+
+
+
+```mermaid
+flowchart TB
+ overview["📋 OpenTelemetryPlan.md
(This Document)"]
+
+ subgraph analysis["Analysis & Design"]
+ arch["01-architecture-analysis.md"]
+ design["02-design-decisions.md"]
+ end
+
+ subgraph impl["Implementation"]
+ strategy["03-implementation-strategy.md"]
+ code["04-code-samples.md"]
+ config["05-configuration-reference.md"]
+ end
+
+ subgraph deploy["Deployment & Planning"]
+ phases["06-implementation-phases.md"]
+ backends["07-observability-backends.md"]
+ appendix["08-appendix.md"]
+ end
+
+ overview --> analysis
+ overview --> impl
+ overview --> deploy
+
+ arch --> design
+ design --> strategy
+ strategy --> code
+ code --> config
+ config --> phases
+ phases --> backends
+ backends --> appendix
+
+ style overview fill:#1b5e20,stroke:#0d3d14,color:#fff,stroke-width:2px
+ style analysis fill:#0d47a1,stroke:#082f6a,color:#fff
+ style impl fill:#bf360c,stroke:#8c2809,color:#fff
+ style deploy fill:#4a148c,stroke:#2e0d57,color:#fff
+ style arch fill:#0d47a1,stroke:#082f6a,color:#fff
+ style design fill:#0d47a1,stroke:#082f6a,color:#fff
+ style strategy fill:#bf360c,stroke:#8c2809,color:#fff
+ style code fill:#bf360c,stroke:#8c2809,color:#fff
+ style config fill:#bf360c,stroke:#8c2809,color:#fff
+ style phases fill:#4a148c,stroke:#2e0d57,color:#fff
+ style backends fill:#4a148c,stroke:#2e0d57,color:#fff
+ style appendix fill:#4a148c,stroke:#2e0d57,color:#fff
+```
+
+
+
+---
+
+## Table of Contents
+
+| Section | Document | Description |
+| ------- | ---------------------------------------------------------- | ---------------------------------------------------------------------- |
+| **1** | [Architecture Analysis](./01-architecture-analysis.md) | rippled component analysis, trace points, instrumentation priorities |
+| **2** | [Design Decisions](./02-design-decisions.md) | SDK selection, exporters, span naming, attributes, context propagation |
+| **3** | [Implementation Strategy](./03-implementation-strategy.md) | Directory structure, key principles, performance optimization |
+| **4** | [Code Samples](./04-code-samples.md) | Complete C++ implementation examples for all components |
+| **5** | [Configuration Reference](./05-configuration-reference.md) | rippled config, CMake integration, Collector configurations |
+| **6** | [Implementation Phases](./06-implementation-phases.md) | 5-phase timeline, tasks, risks, success metrics |
+| **7** | [Observability Backends](./07-observability-backends.md) | Backend selection guide and production architecture |
+| **8** | [Appendix](./08-appendix.md) | Glossary, references, version history |
+
+---
+
+## 1. Architecture Analysis
+
+The rippled node consists of several key components that require instrumentation for comprehensive distributed tracing. The main areas include the RPC server (HTTP/WebSocket), Overlay P2P network, Consensus mechanism (RCLConsensus), JobQueue for async task execution, and existing observability infrastructure (PerfLog, Insight/StatsD, Journal logging).
+
+Key trace points span across transaction submission via RPC, peer-to-peer message propagation, consensus round execution, and ledger building. The implementation prioritizes high-value, low-risk components first: RPC handlers provide immediate value with minimal risk, while consensus tracing requires careful implementation to avoid timing impacts.
+
+➡️ **[Read full Architecture Analysis](./01-architecture-analysis.md)**
+
+---
+
+## 2. Design Decisions
+
+The OpenTelemetry C++ SDK is selected for its CNCF backing, active development, and native performance characteristics. Traces are exported via OTLP/gRPC (primary) or OTLP/HTTP (fallback) to an OpenTelemetry Collector, which provides flexible routing and sampling.
+
+Span naming follows a hierarchical, dot-separated `component.operation` convention (e.g., `rpc.submit`, `tx.relay`, `consensus.round`). Context propagation uses W3C Trace Context headers for HTTP and embedded Protocol Buffer fields for P2P messages. The implementation coexists with existing PerfLog and Insight observability systems through correlation IDs.
+
+**Data Collection & Privacy**: Telemetry collects only operational metadata (timing, counts, hashes) — never sensitive content (private keys, balances, amounts, raw payloads). Privacy protection includes account hashing, configurable redaction, sampling, and collector-level filtering. Node operators retain full control over what data is exported (the specific operator-facing controls are not yet specified in this document).
+
+➡️ **[Read full Design Decisions](./02-design-decisions.md)**
+
+---
+
+## 3. Implementation Strategy
+
+The telemetry code is organized under `include/xrpl/telemetry/` for headers and `src/libxrpl/telemetry/` for implementation. Key principles include RAII-based span management via `SpanGuard`, conditional compilation with `XRPL_ENABLE_TELEMETRY`, and minimal runtime overhead through batch processing and efficient sampling.
+
+Performance optimization strategies include probabilistic head sampling (10% default), tail-based sampling at the collector for errors and slow traces, batch export to reduce network overhead, and conditional instrumentation that compiles to no-ops when disabled.
+
+➡️ **[Read full Implementation Strategy](./03-implementation-strategy.md)**
+
+---
+
+## 4. Code Samples
+
+Complete C++ implementation examples are provided for all telemetry components:
+
+- `Telemetry.h` - Core interface for tracer access and span creation
+- `SpanGuard.h` - RAII wrapper for automatic span lifecycle management
+- `TracingInstrumentation.h` - Macros for conditional instrumentation
+- Protocol Buffer extensions for trace context propagation
+- Module-specific instrumentation (RPC, Consensus, P2P, JobQueue)
+
+➡️ **[View all Code Samples](./04-code-samples.md)**
+
+---
+
+## 5. Configuration Reference
+
+Configuration is handled through the `[telemetry]` section in `xrpld.cfg` with options for enabling/disabling, exporter selection, endpoint configuration, sampling ratios, and component-level filtering. CMake integration includes a `XRPL_ENABLE_TELEMETRY` option for compile-time control.
+
+OpenTelemetry Collector configurations are provided for development (with Jaeger) and production (with tail-based sampling, Tempo, and Elastic APM). Docker Compose examples enable quick local development environment setup.
+
+➡️ **[View full Configuration Reference](./05-configuration-reference.md)**
+
+---
+
+## 6. Implementation Phases
+
+The implementation spans 9 weeks across 5 phases:
+
+| Phase | Duration | Focus | Key Deliverables |
+| ----- | --------- | ------------------- | --------------------------------------------------- |
+| 1 | Weeks 1-2 | Core Infrastructure | SDK integration, Telemetry interface, Configuration |
+| 2 | Weeks 3-4 | RPC Tracing | HTTP context extraction, Handler instrumentation |
+| 3 | Weeks 5-6 | Transaction Tracing | Protocol Buffer context, Relay propagation |
+| 4 | Weeks 7-8 | Consensus Tracing | Round spans, Proposal/validation tracing |
+| 5 | Week 9 | Documentation | Runbook, Dashboards, Training |
+
+**Total Effort**: 47 developer-days with 2 developers
+
+➡️ **[View full Implementation Phases](./06-implementation-phases.md)**
+
+---
+
+## 7. Observability Backends
+
+For development and testing, Jaeger provides easy setup with a good UI. For production deployments, Grafana Tempo is recommended for its cost-effectiveness and Grafana integration, while Elastic APM is ideal for organizations with existing Elastic infrastructure.
+
+The recommended production architecture uses a gateway collector pattern with regional collectors performing tail-based sampling, routing traces to multiple backends (Tempo for primary storage, Elastic for log correlation, S3/GCS for long-term archive).
+
+➡️ **[View Observability Backend Recommendations](./07-observability-backends.md)**
+
+---
+
+## 8. Appendix
+
+The appendix contains a glossary of OpenTelemetry and rippled-specific terms, references to external documentation and specifications, version history for this implementation plan, and a complete document index.
+
+➡️ **[View Appendix](./08-appendix.md)**
+
+---
+
+_This document provides a comprehensive implementation plan for integrating OpenTelemetry distributed tracing into the rippled XRP Ledger node software. For detailed information on any section, follow the links to the corresponding sub-documents._
diff --git a/OpenTelemetryPlan/POC_taskList.md b/OpenTelemetryPlan/POC_taskList.md
new file mode 100644
index 00000000000..8d3a24279ee
--- /dev/null
+++ b/OpenTelemetryPlan/POC_taskList.md
@@ -0,0 +1,610 @@
+# OpenTelemetry POC Task List
+
+> **Goal**: Build a minimal end-to-end proof of concept that demonstrates distributed tracing in rippled. A successful POC will show RPC request traces flowing from rippled through an OTel Collector into Jaeger, viewable in a browser UI.
+>
+> **Scope**: RPC tracing only (highest value, lowest risk per the [CRAWL phase](./06-implementation-phases.md#6102-quick-wins-immediate-value) in the implementation phases). No cross-node P2P context propagation or consensus tracing in the POC.
+
+### Related Plan Documents
+
+| Document | Relevance to POC |
+| ---------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [00-tracing-fundamentals.md](./00-tracing-fundamentals.md) | Core concepts: traces, spans, context propagation, sampling |
+| [01-architecture-analysis.md](./01-architecture-analysis.md) | RPC request flow (§1.5), key trace points (§1.6), instrumentation priority (§1.7) |
+| [02-design-decisions.md](./02-design-decisions.md) | SDK selection (§2.1), exporter config (§2.2), span naming (§2.3), attribute schema (§2.4), coexistence with PerfLog/Insight (§2.6) |
+| [03-implementation-strategy.md](./03-implementation-strategy.md) | Directory structure (§3.1), key principles (§3.2), performance overhead (§3.3-3.6), conditional compilation (§3.7.3), code intrusiveness (§3.9) |
+| [04-code-samples.md](./04-code-samples.md) | Telemetry interface (§4.1), SpanGuard (§4.2), macros (§4.3), RPC instrumentation (§4.5.3) |
+| [05-configuration-reference.md](./05-configuration-reference.md) | rippled config (§5.1), config parser (§5.2), Application integration (§5.3), CMake (§5.4), Collector config (§5.5), Docker Compose (§5.6), Grafana (§5.8) |
+| [06-implementation-phases.md](./06-implementation-phases.md) | Phase 1 core tasks (§6.2), Phase 2 RPC tasks (§6.3), quick wins (§6.10), definition of done (§6.11) |
+| [07-observability-backends.md](./07-observability-backends.md) | Jaeger dev setup (§7.1), Grafana dashboards (§7.6), alert rules (§7.6.3) |
+
+---
+
+## Task 0: Docker Observability Stack Setup
+
+**Objective**: Stand up the backend infrastructure to receive, store, and display traces.
+
+**What to do**:
+
+- Create `docker/telemetry/docker-compose.yml` in the repo with three services:
+ 1. **OpenTelemetry Collector** (`otel/opentelemetry-collector-contrib:latest`)
+ - Expose ports `4317` (OTLP gRPC) and `4318` (OTLP HTTP)
+ - Expose port `13133` (health check)
+ - Mount a config file `docker/telemetry/otel-collector-config.yaml`
+ 2. **Jaeger** (`jaegertracing/all-in-one:latest`)
+ - Expose port `16686` (UI) and `14250` (gRPC collector)
+ - Set env `COLLECTOR_OTLP_ENABLED=true`
+ 3. **Grafana** (`grafana/grafana:latest`) — optional but useful
+ - Expose port `3000`
+ - Enable anonymous admin access for local dev (`GF_AUTH_ANONYMOUS_ENABLED=true`, `GF_AUTH_ANONYMOUS_ORG_ROLE=Admin`)
+ - Provision Jaeger as a data source via `docker/telemetry/grafana/provisioning/datasources/jaeger.yaml`
+
+- Create `docker/telemetry/otel-collector-config.yaml`:
+
+ ```yaml
+ receivers:
+ otlp:
+ protocols:
+ grpc:
+ endpoint: 0.0.0.0:4317
+ http:
+ endpoint: 0.0.0.0:4318
+
+ processors:
+ batch:
+ timeout: 1s
+ send_batch_size: 100
+
+ exporters:
+ logging:
+ verbosity: detailed
+ otlp/jaeger:
+ endpoint: jaeger:4317
+ tls:
+ insecure: true
+
+ service:
+ pipelines:
+ traces:
+ receivers: [otlp]
+ processors: [batch]
+ exporters: [logging, otlp/jaeger]
+ ```
+
+- Create Grafana Jaeger datasource provisioning file at `docker/telemetry/grafana/provisioning/datasources/jaeger.yaml`:
+ ```yaml
+ apiVersion: 1
+ datasources:
+ - name: Jaeger
+ type: jaeger
+ access: proxy
+ url: http://jaeger:16686
+ ```
+
+**Verification**: Run `docker compose -f docker/telemetry/docker-compose.yml up -d`, then:
+
+- `curl http://localhost:13133` returns healthy (Collector)
+- `http://localhost:16686` opens Jaeger UI (no traces yet)
+- `http://localhost:3000` opens Grafana (optional)
+
+**Reference**:
+
+- [05-configuration-reference.md §5.5](./05-configuration-reference.md) — Collector config (dev YAML with Jaeger exporter)
+- [05-configuration-reference.md §5.6](./05-configuration-reference.md) — Docker Compose development environment
+- [07-observability-backends.md §7.1](./07-observability-backends.md) — Jaeger quick start and backend selection
+- [05-configuration-reference.md §5.8](./05-configuration-reference.md) — Grafana datasource provisioning and dashboards
+
+---
+
+## Task 1: Add OpenTelemetry C++ SDK Dependency
+
+**Objective**: Make `opentelemetry-cpp` available to the build system.
+
+**What to do**:
+
+- Edit `conanfile.py` to add `opentelemetry-cpp` as an **optional** dependency. The gRPC otel plugin flag (`"grpc/*:otel_plugin": False`) in the existing conanfile may need to remain false — we pull the OTel SDK separately.
+ - Add a Conan option: `with_telemetry = [True, False]` defaulting to `False`
+ - When `with_telemetry` is `True`, add `opentelemetry-cpp` to `self.requires()`
+ - Required OTel Conan components: `opentelemetry-cpp` (which bundles api, sdk, and exporters). If the package isn't in Conan Center, consider using `FetchContent` in CMake or building from source as a fallback.
+- Edit `CMakeLists.txt`:
+ - Add option: `option(XRPL_ENABLE_TELEMETRY "Enable OpenTelemetry tracing" OFF)`
+ - When ON, `find_package(opentelemetry-cpp CONFIG REQUIRED)` and add compile definition `XRPL_ENABLE_TELEMETRY`
+ - When OFF, do nothing (zero build impact)
+- Verify the build succeeds with `-DXRPL_ENABLE_TELEMETRY=OFF` (no regressions) and with `-DXRPL_ENABLE_TELEMETRY=ON` (SDK links successfully).
+
+**Key files**:
+
+- `conanfile.py`
+- `CMakeLists.txt`
+
+**Reference**:
+
+- [05-configuration-reference.md §5.4](./05-configuration-reference.md) — CMake integration, `FindOpenTelemetry.cmake`, `XRPL_ENABLE_TELEMETRY` option
+- [03-implementation-strategy.md §3.2](./03-implementation-strategy.md) — Key principle: zero-cost when disabled via compile-time flags
+- [02-design-decisions.md §2.1](./02-design-decisions.md) — SDK selection rationale and required OTel components
+
+---
+
+## Task 2: Create Core Telemetry Interface and NullTelemetry
+
+**Objective**: Define the `Telemetry` abstract interface and a no-op implementation so the rest of the codebase can reference telemetry without hard-depending on the OTel SDK.
+
+**What to do**:
+
+- Create `include/xrpl/telemetry/Telemetry.h`:
+ - Define `namespace xrpl::telemetry`
+ - Define `struct Telemetry::Setup` holding: `enabled`, `exporterEndpoint`, `samplingRatio`, `serviceName`, `serviceVersion`, `serviceInstanceId`, `traceRpc`, `traceTransactions`, `traceConsensus`, `tracePeer`
+ - Define abstract `class Telemetry` with:
+ - `virtual void start() = 0;`
+ - `virtual void stop() = 0;`
+ - `virtual bool isEnabled() const = 0;`
+    - `virtual nostd::shared_ptr<Tracer> getTracer(string_view name = "rippled") = 0;`
+    - `virtual nostd::shared_ptr<Span> startSpan(string_view name, SpanKind kind = kInternal) = 0;`
+    - `virtual nostd::shared_ptr<Span> startSpan(string_view name, Context const& parentContext, SpanKind kind = kInternal) = 0;`
+ - `virtual bool shouldTraceRpc() const = 0;`
+ - `virtual bool shouldTraceTransactions() const = 0;`
+ - `virtual bool shouldTraceConsensus() const = 0;`
+  - Factory: `std::unique_ptr<Telemetry> make_Telemetry(Setup const&, beast::Journal);`
+ - Config parser: `Telemetry::Setup setup_Telemetry(Section const&, std::string const& nodePublicKey, std::string const& version);`
+
+- Create `include/xrpl/telemetry/SpanGuard.h`:
+  - RAII guard that takes a `nostd::shared_ptr<Span>`, creates a `Scope`, and calls `span->End()` in its destructor.
+ - Convenience: `setAttribute()`, `setOk()`, `setStatus()`, `addEvent()`, `recordException()`, `context()`
+ - See [04-code-samples.md](./04-code-samples.md) §4.2 for the full implementation.
+
+- Create `src/libxrpl/telemetry/NullTelemetry.cpp`:
+ - Implements `Telemetry` with all no-ops.
+ - `isEnabled()` returns `false`, `startSpan()` returns a noop span.
+ - This is used when `XRPL_ENABLE_TELEMETRY` is OFF or `enabled=0` in config.
+
+- Guard all OTel SDK headers behind `#ifdef XRPL_ENABLE_TELEMETRY`. The `NullTelemetry` implementation should compile without the OTel SDK present.
+
+**Key new files**:
+
+- `include/xrpl/telemetry/Telemetry.h`
+- `include/xrpl/telemetry/SpanGuard.h`
+- `src/libxrpl/telemetry/NullTelemetry.cpp`
+
+**Reference**:
+
+- [04-code-samples.md §4.1](./04-code-samples.md) — Full `Telemetry` interface with `Setup` struct, lifecycle, tracer access, span creation, and component filtering methods
+- [04-code-samples.md §4.2](./04-code-samples.md) — Full `SpanGuard` RAII implementation and `NullSpanGuard` no-op class
+- [03-implementation-strategy.md §3.1](./03-implementation-strategy.md) — Directory structure: `include/xrpl/telemetry/` for headers, `src/libxrpl/telemetry/` for implementation
+- [03-implementation-strategy.md §3.7.3](./03-implementation-strategy.md) — Conditional instrumentation and zero-cost compile-time disabled pattern
+
+---
+
+## Task 3: Implement OTel-Backed Telemetry
+
+**Objective**: Implement the real `Telemetry` class that initializes the OTel SDK, configures the OTLP exporter and batch processor, and creates tracers/spans.
+
+**What to do**:
+
+- Create `src/libxrpl/telemetry/Telemetry.cpp` (compiled only when `XRPL_ENABLE_TELEMETRY=ON`):
+ - `class TelemetryImpl : public Telemetry` that:
+ - In `start()`: creates a `TracerProvider` with:
+ - Resource attributes: `service.name`, `service.version`, `service.instance.id`
+ - An `OtlpGrpcExporter` pointed at `setup.exporterEndpoint` (default `localhost:4317`)
+ - A `BatchSpanProcessor` with configurable batch size and delay
+ - A `TraceIdRatioBasedSampler` using `setup.samplingRatio`
+ - Sets the global `TracerProvider`
+ - In `stop()`: calls `ForceFlush()` then shuts down the provider
+ - In `startSpan()`: delegates to `getTracer()->StartSpan(name, ...)`
+ - `shouldTraceRpc()` etc. read from `Setup` fields
+
+- Create `src/libxrpl/telemetry/TelemetryConfig.cpp`:
+ - `setup_Telemetry()` parses the `[telemetry]` config section from `xrpld.cfg`
+ - Maps config keys: `enabled`, `exporter`, `endpoint`, `sampling_ratio`, `trace_rpc`, `trace_transactions`, `trace_consensus`, `trace_peer`
+
+- Wire `make_Telemetry()` factory:
+ - If `setup.enabled` is true AND `XRPL_ENABLE_TELEMETRY` is defined: return `TelemetryImpl`
+ - Otherwise: return `NullTelemetry`
+
+- Add telemetry source files to CMake. When `XRPL_ENABLE_TELEMETRY=ON`, compile `Telemetry.cpp` and `TelemetryConfig.cpp` and link against `opentelemetry-cpp::api`, `opentelemetry-cpp::sdk`, `opentelemetry-cpp::otlp_grpc_exporter`. When OFF, compile only `NullTelemetry.cpp`.
+
+**Key new files**:
+
+- `src/libxrpl/telemetry/Telemetry.cpp`
+- `src/libxrpl/telemetry/TelemetryConfig.cpp`
+
+**Key modified files**:
+
+- `CMakeLists.txt` (add telemetry library target)
+
+**Reference**:
+
+- [04-code-samples.md §4.1](./04-code-samples.md) — `Telemetry` interface that `TelemetryImpl` must implement
+- [05-configuration-reference.md §5.2](./05-configuration-reference.md) — `setup_Telemetry()` config parser implementation
+- [02-design-decisions.md §2.2](./02-design-decisions.md) — OTLP/gRPC exporter config (endpoint, TLS options)
+- [02-design-decisions.md §2.4.1](./02-design-decisions.md) — Resource attributes: `service.name`, `service.version`, `service.instance.id`, `xrpl.network.id`
+- [03-implementation-strategy.md §3.4](./03-implementation-strategy.md) — Per-operation CPU costs and overhead budget for span creation
+- [03-implementation-strategy.md §3.5](./03-implementation-strategy.md) — Memory overhead: static (~456 KB) and dynamic (~1.2 MB) budgets
+
+---
+
+## Task 4: Integrate Telemetry into Application Lifecycle
+
+**Objective**: Wire the `Telemetry` object into `Application` so all components can access it.
+
+**What to do**:
+
+- Edit `src/xrpld/app/main/Application.h`:
+ - Forward-declare `namespace xrpl::telemetry { class Telemetry; }`
+ - Add pure virtual method: `virtual telemetry::Telemetry& getTelemetry() = 0;`
+
+- Edit `src/xrpld/app/main/Application.cpp` (the `ApplicationImp` class):
+  - Add member: `std::unique_ptr<telemetry::Telemetry> telemetry_;`
+ - In the constructor, after config is loaded and node identity is known:
+ ```cpp
+ auto const telemetrySection = config_->section("telemetry");
+ auto telemetrySetup = telemetry::setup_Telemetry(
+ telemetrySection,
+ toBase58(TokenType::NodePublic, nodeIdentity_.publicKey()),
+ BuildInfo::getVersionString());
+ telemetry_ = telemetry::make_Telemetry(telemetrySetup, logs_->journal("Telemetry"));
+ ```
+ - In `start()`: call `telemetry_->start()` early
+ - In `stop()` or destructor: call `telemetry_->stop()` late (to flush pending spans)
+ - Implement `getTelemetry()` override: return `*telemetry_`
+
+- Add `[telemetry]` section to the example config `cfg/rippled-example.cfg`:
+ ```ini
+ # [telemetry]
+ # enabled=1
+ # endpoint=localhost:4317
+ # sampling_ratio=1.0
+ # trace_rpc=1
+ ```
+
+**Key modified files**:
+
+- `src/xrpld/app/main/Application.h`
+- `src/xrpld/app/main/Application.cpp`
+- `cfg/rippled-example.cfg` (or equivalent example config)
+
+**Reference**:
+
+- [05-configuration-reference.md §5.3](./05-configuration-reference.md) — `ApplicationImp` changes: member declaration, constructor init, `start()`/`stop()` wiring, `getTelemetry()` override
+- [05-configuration-reference.md §5.1](./05-configuration-reference.md) — `[telemetry]` config section format and all option defaults
+- [03-implementation-strategy.md §3.9.2](./03-implementation-strategy.md) — File impact assessment: `Application.cpp` ~15 lines added, ~3 changed (Low risk)
+
+---
+
+## Task 5: Create Instrumentation Macros
+
+**Objective**: Define convenience macros that make instrumenting code one-liners, and that compile to zero-cost no-ops when telemetry is disabled.
+
+**What to do**:
+
+- Create `src/xrpld/telemetry/TracingInstrumentation.h`:
+ - When `XRPL_ENABLE_TELEMETRY` is defined:
+
+ ```cpp
+ #define XRPL_TRACE_SPAN(telemetry, name) \
+ auto _xrpl_span_ = (telemetry).startSpan(name); \
+ ::xrpl::telemetry::SpanGuard _xrpl_guard_(_xrpl_span_)
+
+ #define XRPL_TRACE_RPC(telemetry, name) \
+ std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \
+ if ((telemetry).shouldTraceRpc()) { \
+ _xrpl_guard_.emplace((telemetry).startSpan(name)); \
+ }
+
+ #define XRPL_TRACE_SET_ATTR(key, value) \
+ if (_xrpl_guard_.has_value()) { \
+ _xrpl_guard_->setAttribute(key, value); \
+ }
+
+ #define XRPL_TRACE_EXCEPTION(e) \
+ if (_xrpl_guard_.has_value()) { \
+ _xrpl_guard_->recordException(e); \
+ }
+ ```
+
+ - When `XRPL_ENABLE_TELEMETRY` is NOT defined, all macros expand to `((void)0)`
+
+**Key new file**:
+
+- `src/xrpld/telemetry/TracingInstrumentation.h`
+
+**Reference**:
+
+- [04-code-samples.md §4.3](./04-code-samples.md) — Full macro definitions for `XRPL_TRACE_SPAN`, `XRPL_TRACE_RPC`, `XRPL_TRACE_CONSENSUS`, `XRPL_TRACE_SET_ATTR`, `XRPL_TRACE_EXCEPTION` with both enabled and disabled branches
+- [03-implementation-strategy.md §3.7.3](./03-implementation-strategy.md) — Conditional instrumentation pattern: compile-time `#ifndef` and runtime `shouldTrace*()` checks
+- [03-implementation-strategy.md §3.9.7](./03-implementation-strategy.md) — Before/after code examples showing minimal intrusiveness (~1-3 lines per instrumentation point)
+
+---
+
+## Task 6: Instrument RPC ServerHandler
+
+**Objective**: Add tracing to the HTTP RPC entry point so every incoming RPC request creates a span.
+
+**What to do**:
+
+- Edit `src/xrpld/rpc/detail/ServerHandler.cpp`:
+ - `#include` the `TracingInstrumentation.h` header
+ - In `ServerHandler::onRequest(Session& session)`:
+ - At the top of the method, add: `XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.request");`
+ - After the RPC command name is extracted, set attribute: `XRPL_TRACE_SET_ATTR("xrpl.rpc.command", command);`
+    - After the response status is known, set: `XRPL_TRACE_SET_ATTR("http.status_code", static_cast<int>(statusCode));`
+ - Wrap error paths with: `XRPL_TRACE_EXCEPTION(e);`
+ - In `ServerHandler::processRequest(...)`:
+ - Add a child span: `XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.process");`
+ - Set method attribute: `XRPL_TRACE_SET_ATTR("xrpl.rpc.method", request_method);`
+ - In `ServerHandler::onWSMessage(...)` (WebSocket path):
+ - Add: `XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.ws.message");`
+
+- The goal is to see spans like:
+ ```
+ rpc.request
+ └── rpc.process
+ ```
+ in Jaeger for every HTTP RPC call.
+
+**Key modified file**:
+
+- `src/xrpld/rpc/detail/ServerHandler.cpp` (~15-25 lines added)
+
+**Reference**:
+
+- [04-code-samples.md §4.5.3](./04-code-samples.md) — Complete `ServerHandler::onRequest()` instrumented code sample with W3C header extraction, span creation, attribute setting, and error handling
+- [01-architecture-analysis.md §1.5](./01-architecture-analysis.md) — RPC request flow diagram: HTTP request -> attributes -> jobqueue.enqueue -> rpc.command -> response
+- [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.request` in `ServerHandler.cpp::onRequest()` (Priority: High)
+- [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming convention: `rpc.request`, `rpc.command.*`
+- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC span attributes: `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.params`
+- [03-implementation-strategy.md §3.9.2](./03-implementation-strategy.md) — File impact: `ServerHandler.cpp` ~40 lines added, ~10 changed (Low risk)
+
+---
+
+## Task 7: Instrument RPC Command Execution
+
+**Objective**: Add per-command tracing inside the RPC handler so each command (e.g., `submit`, `account_info`, `server_info`) gets its own child span.
+
+**What to do**:
+
+- Edit `src/xrpld/rpc/detail/RPCHandler.cpp`:
+ - `#include` the `TracingInstrumentation.h` header
+ - In `doCommand(RPC::JsonContext& context, Json::Value& result)`:
+ - At the top: `XRPL_TRACE_RPC(context.app.getTelemetry(), "rpc.command." + context.method);`
+ - Set attributes:
+ - `XRPL_TRACE_SET_ATTR("xrpl.rpc.command", context.method);`
+      - `XRPL_TRACE_SET_ATTR("xrpl.rpc.version", static_cast<unsigned>(context.apiVersion));`
+ - `XRPL_TRACE_SET_ATTR("xrpl.rpc.role", (context.role == Role::ADMIN) ? "admin" : "user");`
+ - On success: `XRPL_TRACE_SET_ATTR("xrpl.rpc.status", "success");`
+ - On error: `XRPL_TRACE_SET_ATTR("xrpl.rpc.status", "error");` and set the error message
+
+- After this, traces in Jaeger should look like:
+ ```
+ rpc.request (xrpl.rpc.command=account_info)
+ └── rpc.process
+ └── rpc.command.account_info (xrpl.rpc.version=2, xrpl.rpc.role=user, xrpl.rpc.status=success)
+ ```
+
+**Key modified file**:
+
+- `src/xrpld/rpc/detail/RPCHandler.cpp` (~15-20 lines added)
+
+**Reference**:
+
+- [04-code-samples.md §4.5.3](./04-code-samples.md) — `ServerHandler::onRequest()` code sample (includes child span pattern for `rpc.command.*`)
+- [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming: `rpc.command.*` pattern with dynamic command name (e.g., `rpc.command.server_info`)
+- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC attribute schema: `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.status`
+- [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.command.*` in `RPCHandler.cpp::doCommand()` (Priority: High)
+- [02-design-decisions.md §2.6.5](./02-design-decisions.md) — Correlation with PerfLog: how `doCommand()` can link trace_id with existing PerfLog entries
+- [03-implementation-strategy.md §3.4.4](./03-implementation-strategy.md) — RPC request overhead budget: ~1.75 μs total per request
+
+---
+
+## Task 8: Build, Run, and Verify End-to-End
+
+**Objective**: Prove the full pipeline works: rippled emits traces -> OTel Collector receives them -> Jaeger displays them.
+
+**What to do**:
+
+1. **Start the Docker stack**:
+
+ ```bash
+ docker compose -f docker/telemetry/docker-compose.yml up -d
+ ```
+
+ Verify Collector health: `curl http://localhost:13133`
+
+2. **Build rippled with telemetry**:
+
+ ```bash
+ # Adjust for your actual build workflow
+ conan install . --build=missing -o with_telemetry=True
+ cmake --preset default -DXRPL_ENABLE_TELEMETRY=ON
+ cmake --build --preset default
+ ```
+
+3. **Configure rippled**:
+ Add to `rippled.cfg` (or your local test config):
+
+ ```ini
+ [telemetry]
+ enabled=1
+ endpoint=localhost:4317
+ sampling_ratio=1.0
+ trace_rpc=1
+ ```
+
+4. **Start rippled** in standalone mode:
+
+ ```bash
+ ./rippled --conf rippled.cfg -a --start
+ ```
+
+5. **Generate RPC traffic**:
+
+ ```bash
+ # server_info
+ curl -s -X POST http://localhost:5005 \
+ -H "Content-Type: application/json" \
+ -d '{"method":"server_info","params":[{}]}'
+
+ # ledger
+ curl -s -X POST http://localhost:5005 \
+ -H "Content-Type: application/json" \
+ -d '{"method":"ledger","params":[{"ledger_index":"current"}]}'
+
+ # account_info (will error in standalone, that's fine — we trace errors too)
+ curl -s -X POST http://localhost:5005 \
+ -H "Content-Type: application/json" \
+ -d '{"method":"account_info","params":[{"account":"rHb9CJAWyB4rj91VRWn96DkukG4bwdtyTh"}]}'
+ ```
+
+6. **Verify in Jaeger**:
+ - Open `http://localhost:16686`
+ - Select service `rippled` from the dropdown
+ - Click "Find Traces"
+ - Confirm you see traces with spans: `rpc.request` -> `rpc.process` -> `rpc.command.server_info`
+ - Click into a trace and verify attributes: `xrpl.rpc.command`, `xrpl.rpc.status`, `xrpl.rpc.version`
+
+7. **Verify zero-overhead when disabled**:
+ - Rebuild with `XRPL_ENABLE_TELEMETRY=OFF`, or set `enabled=0` in config
+ - Run the same RPC calls
+ - Confirm no new traces appear and no errors in rippled logs
+
+**Verification Checklist**:
+
+- [ ] Docker stack starts without errors
+- [ ] rippled builds with `-DXRPL_ENABLE_TELEMETRY=ON`
+- [ ] rippled starts and connects to OTel Collector (check rippled logs for telemetry messages)
+- [ ] Traces appear in Jaeger UI under service "rippled"
+- [ ] Span hierarchy is correct (parent-child relationships)
+- [ ] Span attributes are populated (`xrpl.rpc.command`, `xrpl.rpc.status`, etc.)
+- [ ] Error spans show error status and message
+- [ ] Building with `XRPL_ENABLE_TELEMETRY=OFF` produces no regressions
+- [ ] Setting `enabled=0` at runtime produces no traces and no errors
+
+**Reference**:
+
+- [06-implementation-phases.md §6.11.1](./06-implementation-phases.md) — Phase 1 definition of done: SDK compiles, runtime toggle works, span creation verified in Jaeger, config validation passes
+- [06-implementation-phases.md §6.11.2](./06-implementation-phases.md) — Phase 2 definition of done: 100% RPC coverage, traceparent propagation, <1ms p99 overhead, dashboard deployed
+- [06-implementation-phases.md §6.8](./06-implementation-phases.md) — Success metrics: trace coverage >95%, CPU overhead <3%, memory <5 MB, latency impact <2%
+- [03-implementation-strategy.md §3.9.5](./03-implementation-strategy.md) — Backward compatibility: config optional, protocol unchanged, `XRPL_ENABLE_TELEMETRY=OFF` produces identical binary
+- [01-architecture-analysis.md §1.8](./01-architecture-analysis.md) — Observable outcomes: what traces, metrics, and dashboards to expect
+
+---
+
+## Task 9: Document POC Results and Next Steps
+
+**Objective**: Capture findings, screenshots, and remaining work for the team.
+
+**What to do**:
+
+- Take screenshots of Jaeger showing:
+ - The service list with "rippled"
+ - A trace with the full span tree
+ - Span detail view showing attributes
+- Document any issues encountered (build issues, SDK quirks, missing attributes)
+- Note performance observations (build time impact, any noticeable runtime overhead)
+- Write a short summary of what the POC proves and what it doesn't cover yet:
+ - **Proves**: OTel SDK integrates with rippled, OTLP export works, RPC traces visible
+ - **Doesn't cover**: Cross-node P2P context propagation, consensus tracing, protobuf trace context, W3C traceparent header extraction, tail-based sampling, production deployment
+- Outline next steps (mapping to the full plan phases):
+ - [Phase 2](./06-implementation-phases.md) completion: [W3C header extraction](./02-design-decisions.md) (§2.5), WebSocket tracing, all [RPC handlers](./01-architecture-analysis.md) (§1.6)
+ - [Phase 3](./06-implementation-phases.md): [Protobuf `TraceContext` message](./04-code-samples.md) (§4.4), [transaction relay tracing](./04-code-samples.md) (§4.5.1) across nodes
+ - [Phase 4](./06-implementation-phases.md): [Consensus round and phase tracing](./04-code-samples.md) (§4.5.2)
+ - [Phase 5](./06-implementation-phases.md): [Production collector config](./05-configuration-reference.md) (§5.5.2), [Grafana dashboards](./07-observability-backends.md) (§7.6), [alerting](./07-observability-backends.md) (§7.6.3)
+
+**Reference**:
+
+- [06-implementation-phases.md §6.1](./06-implementation-phases.md) — Full 5-phase timeline overview and Gantt chart
+- [06-implementation-phases.md §6.10](./06-implementation-phases.md) — Crawl-Walk-Run strategy: POC is the CRAWL phase, next steps are WALK and RUN
+- [06-implementation-phases.md §6.12](./06-implementation-phases.md) — Recommended implementation order (14 steps across 9 weeks)
+- [03-implementation-strategy.md §3.9](./03-implementation-strategy.md) — Code intrusiveness assessment and risk matrix for each remaining component
+- [07-observability-backends.md §7.2](./07-observability-backends.md) — Production backend selection (Tempo, Elastic APM, Honeycomb, Datadog)
+- [02-design-decisions.md §2.5](./02-design-decisions.md) — Context propagation design: W3C HTTP headers, protobuf P2P, JobQueue internal
+- [00-tracing-fundamentals.md](./00-tracing-fundamentals.md) — Reference for team onboarding on distributed tracing concepts
+
+---
+
+## Summary
+
+| Task | Description | New Files | Modified Files | Depends On |
+| ---- | ------------------------------------ | --------- | -------------- | ---------- |
+| 0 | Docker observability stack | 4 | 0 | — |
+| 1 | OTel C++ SDK dependency | 0 | 2 | — |
+| 2 | Core Telemetry interface + NullImpl | 3 | 0 | 1 |
+| 3 | OTel-backed Telemetry implementation | 2 | 1 | 1, 2 |
+| 4 | Application lifecycle integration | 0 | 3 | 2, 3 |
+| 5 | Instrumentation macros | 1 | 0 | 2 |
+| 6 | Instrument RPC ServerHandler | 0 | 1 | 4, 5 |
+| 7 | Instrument RPC command execution | 0 | 1 | 4, 5 |
+| 8 | End-to-end verification | 0 | 0 | 0-7 |
+| 9 | Document results and next steps | 1 | 0 | 8 |
+
+**Parallel work**: Tasks 0 and 1 can run in parallel. Tasks 2 and 5 have no dependency on each other. Tasks 6 and 7 can be done in parallel once Tasks 4 and 5 are complete.
+
+---
+
+## Next Steps (Post-POC)
+
+### Metrics Pipeline for Grafana Dashboards
+
+The current POC exports **traces only**. Grafana's Explore view can query Jaeger for individual traces, but time-series charts (latency histograms, request throughput, error rates) require a **metrics pipeline**. To enable this:
+
+1. **Add a `spanmetrics` connector** to the OTel Collector config that derives RED metrics (Rate, Errors, Duration) from trace spans automatically:
+
+ ```yaml
+ connectors:
+ spanmetrics:
+ histogram:
+ explicit:
+ buckets: [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s]
+ dimensions:
+ - name: xrpl.rpc.command
+ - name: xrpl.rpc.status
+
+ exporters:
+ prometheus:
+ endpoint: 0.0.0.0:8889
+
+ service:
+ pipelines:
+ traces:
+ receivers: [otlp]
+ processors: [batch]
+ exporters: [debug, otlp/jaeger, spanmetrics]
+ metrics:
+ receivers: [spanmetrics]
+ exporters: [prometheus]
+ ```
+
+2. **Add Prometheus** to the Docker Compose stack to scrape the collector's metrics endpoint.
+
+3. **Add Prometheus as a Grafana datasource** and build dashboards for:
+ - RPC request latency (p50/p95/p99) by command
+ - RPC throughput (requests/sec) by command
+ - Error rate by command
+ - Span duration distribution
+
+### Additional Instrumentation
+
+- **W3C `traceparent` header extraction** in `ServerHandler` to support cross-service context propagation from external callers
+- **WebSocket RPC tracing** in `ServerHandler::onWSMessage()`
+- **Transaction relay tracing** across nodes using protobuf `TraceContext` messages
+- **Consensus round and phase tracing** for validator coordination visibility
+- **Ledger close tracing** to measure close-to-validated latency
+
+### Production Hardening
+
+- **Tail-based sampling** in the OTel Collector to reduce volume while retaining error/slow traces
+- **TLS configuration** for the OTLP exporter in production deployments
+- **Resource limits** on the batch processor queue to prevent unbounded memory growth
+- **Health monitoring** for the telemetry pipeline itself (collector lag, export failures)
+
+### POC Lessons Learned
+
+Issues encountered during POC implementation that inform future work:
+
+| Issue | Resolution | Impact on Future Work |
+| -------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------- | ---------------------------------------------------------------- |
+| Conan lockfile rejected `opentelemetry-cpp/1.18.0` | Used `--lockfile=""` to bypass | Lockfile must be regenerated when adding new dependencies |
+| Conan package only builds OTLP HTTP exporter, not gRPC | Switched from gRPC to HTTP exporter (`localhost:4318/v1/traces`) | HTTP exporter is the default; gRPC requires custom Conan profile |
+| CMake target `opentelemetry-cpp::api` etc. don't exist in Conan package | Use umbrella target `opentelemetry-cpp::opentelemetry-cpp` | Conan targets differ from upstream CMake targets |
+| OTel Collector `logging` exporter deprecated | Renamed to `debug` exporter | Use `debug` in all collector configs going forward |
+| Macro parameter `telemetry` collided with `::xrpl::telemetry::` namespace | Renamed macro params to `_tel_obj_`, `_span_name_` | Avoid common words as macro parameter names |
+| `opentelemetry::trace::Scope` creates new context on move | Store scope as member, create once in constructor | SpanGuard move semantics need care with Scope lifecycle |
+| `TracerProviderFactory::Create` returns `unique_ptr`, not `nostd::shared_ptr` | Use `std::shared_ptr` member, wrap in `nostd::shared_ptr` for global provider | OTel SDK factory return types don't match API provider types |
diff --git a/cspell.config.yaml b/cspell.config.yaml
index e2b20ac0985..c6f69bfc290 100644
--- a/cspell.config.yaml
+++ b/cspell.config.yaml
@@ -178,6 +178,7 @@ words:
- nixpkgs
- nonxrp
- noripple
+ - nostd
- nudb
- nullptr
- nunl
@@ -307,3 +308,9 @@ words:
- xrplf
- xxhash
- xxhasher
+ - xychart
+ - otelc
+ - zpages
+ - traceql
+ - Gantt
+ - gantt
diff --git a/presentation.md b/presentation.md
new file mode 100644
index 00000000000..7a443a635c5
--- /dev/null
+++ b/presentation.md
@@ -0,0 +1,280 @@
+# OpenTelemetry Distributed Tracing for rippled
+
+---
+
+## Slide 1: Introduction
+
+### What is OpenTelemetry?
+
+OpenTelemetry is an open-source, CNCF-backed observability framework for distributed tracing, metrics, and logs.
+
+### Why OpenTelemetry for rippled?
+
+- **End-to-End Transaction Visibility**: Track transactions from submission → consensus → ledger inclusion
+- **Cross-Node Correlation**: Follow requests across multiple independent nodes using a unique `trace_id`
+- **Consensus Round Analysis**: Understand timing and behavior across validators
+- **Incident Debugging**: Correlate events across distributed nodes during issues
+
+```mermaid
+flowchart LR
+ A["Node A
+tx.receive
+trace_id: abc123"] --> B["Node B
+tx.relay
+trace_id: abc123"] --> C["Node C
+tx.validate
+trace_id: abc123"] --> D["Node D
+ledger.apply
+trace_id: abc123"]
+
+ style A fill:#1565c0,stroke:#0d47a1,color:#fff
+ style B fill:#2e7d32,stroke:#1b5e20,color:#fff
+ style C fill:#2e7d32,stroke:#1b5e20,color:#fff
+ style D fill:#e65100,stroke:#bf360c,color:#fff
+```
+
+> **Trace ID: abc123** — All nodes share the same trace, enabling cross-node correlation.
+
+---
+
+## Slide 2: OpenTelemetry vs Open Source Alternatives
+
+| Feature | OpenTelemetry | Jaeger | Zipkin | SkyWalking | Pinpoint | Prometheus |
+| ------------------- | ---------------- | ---------------- | ------------------ | ---------- | ---------- | ---------- |
+| **Tracing** | YES | YES | YES | YES | YES | NO |
+| **Metrics** | YES | NO | NO | YES | YES | YES |
+| **Logs** | YES | NO | NO | YES | NO | NO |
+| **C++ SDK** | YES Official | YES (Deprecated) | YES (Unmaintained) | NO | NO | YES |
+| **Vendor Neutral** | YES Primary goal | NO | NO | NO | NO | NO |
+| **Instrumentation** | Manual + Auto | Manual | Manual | Auto-first | Auto-first | Manual |
+| **Backend** | Any (exporters) | Self | Self | Self | Self | Self |
+| **CNCF Status** | Incubating | Graduated | NO | Incubating | NO | Graduated |
+
+> **Why OpenTelemetry?** It's the only actively maintained, full-featured C++ option with vendor neutrality — allowing export to Jaeger, Prometheus, Grafana, or any commercial backend without changing instrumentation.
+
+---
+
+## Slide 3: Comparison with rippled's Existing Solutions
+
+### Current Observability Stack
+
+| Aspect | PerfLog (JSON) | StatsD (Metrics) | OpenTelemetry (NEW) |
+| --------------------- | --------------------- | --------------------- | --------------------------- |
+| **Type** | Logging | Metrics | Distributed Tracing |
+| **Scope** | Single node | Single node | **Cross-node** |
+| **Data** | JSON log entries | Counters, gauges | Spans with context |
+| **Correlation** | By timestamp | By metric name | By `trace_id` |
+| **Overhead** | Low (file I/O) | Low (UDP) | Low-Medium (configurable) |
+| **Question Answered** | "What happened here?" | "How many? How fast?" | **"What was the journey?"** |
+
+### Use Case Matrix
+
+| Scenario | PerfLog | StatsD | OpenTelemetry |
+| -------------------------------- | ------- | ------ | ------------- |
+| "How many TXs per second?" | ❌ | ✅ | ❌ |
+| "Why was this specific TX slow?" | ⚠️ | ❌ | ✅ |
+| "Which node delayed consensus?" | ❌ | ❌ | ✅ |
+| "Show TX journey across 5 nodes" | ❌ | ❌ | ✅ |
+
+> **Key Insight**: OpenTelemetry **complements** (not replaces) existing systems.
+
+---
+
+## Slide 4: Architecture
+
+### High-Level Integration Architecture
+
+```mermaid
+flowchart TB
+ subgraph rippled["rippled Node"]
+ subgraph services["Core Services"]
+ direction LR
+ RPC["RPC Server
+(HTTP/WS)"] ~~~ Overlay["Overlay
+(P2P Network)"] ~~~ Consensus["Consensus
+(RCLConsensus)"]
+ end
+
+ Telemetry["Telemetry Module
+(OpenTelemetry SDK)"]
+
+ services --> Telemetry
+ end
+
+ Telemetry -->|OTLP/gRPC| Collector["OTel Collector"]
+
+ Collector --> Tempo["Grafana Tempo"]
+ Collector --> Jaeger["Jaeger"]
+ Collector --> Elastic["Elastic APM"]
+
+ style rippled fill:#424242,stroke:#212121,color:#fff
+ style services fill:#1565c0,stroke:#0d47a1,color:#fff
+ style Telemetry fill:#2e7d32,stroke:#1b5e20,color:#fff
+ style Collector fill:#e65100,stroke:#bf360c,color:#fff
+```
+
+### Context Propagation
+
+```mermaid
+sequenceDiagram
+ participant Client
+ participant NodeA as Node A
+ participant NodeB as Node B
+
+ Client->>NodeA: Submit TX (no context)
+ Note over NodeA: Creates trace_id: abc123
+span: tx.receive
+ NodeA->>NodeB: Relay TX
+(traceparent: abc123)
+ Note over NodeB: Links to trace_id: abc123
+span: tx.relay
+```
+
+- **HTTP/RPC**: W3C Trace Context headers (`traceparent`)
+- **P2P Messages**: Protocol Buffer extension fields
+
+---
+
+## Slide 5: Implementation Plan
+
+### 5-Phase Rollout (9 Weeks)
+
+```mermaid
+gantt
+ title Implementation Timeline
+ dateFormat YYYY-MM-DD
+ axisFormat Week %W
+
+ section Phase 1
+ Core Infrastructure :p1, 2024-01-01, 2w
+
+ section Phase 2
+ RPC Tracing :p2, after p1, 2w
+
+ section Phase 3
+ Transaction Tracing :p3, after p2, 2w
+
+ section Phase 4
+ Consensus Tracing :p4, after p3, 2w
+
+ section Phase 5
+ Documentation :p5, after p4, 1w
+```
+
+### Phase Details
+
+| Phase | Focus | Key Deliverables | Effort |
+| ----- | ------------------- | -------------------------------------------- | ------- |
+| 1 | Core Infrastructure | SDK integration, Telemetry interface, Config | 10 days |
+| 2 | RPC Tracing | HTTP context extraction, Handler spans | 10 days |
+| 3 | Transaction Tracing | Protobuf context, P2P relay propagation | 10 days |
+| 4 | Consensus Tracing | Round spans, Proposal/validation tracing | 10 days |
+| 5 | Documentation | Runbook, Dashboards, Training | 7 days |
+
+**Total Effort**: ~47 developer-days (2 developers)
+
+---
+
+## Slide 6: Performance Overhead
+
+### Estimated System Impact
+
+| Metric | Overhead | Notes |
+| ----------------- | ---------- | ----------------------------------- |
+| **CPU** | 1-3% | Span creation and attribute setting |
+| **Memory** | 2-5 MB | Batch buffer for pending spans |
+| **Network** | 10-50 KB/s | Compressed OTLP export to collector |
+| **Latency (p99)** | <2% | With proper sampling configuration |
+
+### Per-Message Overhead (Context Propagation)
+
+Each P2P message carries trace context with the following overhead:
+
+| Field | Size | Description |
+| ------------- | ------------- | ----------------------------------------- |
+| `trace_id` | 16 bytes | Unique identifier for the entire trace |
+| `span_id` | 8 bytes | Current span (becomes parent on receiver) |
+| `trace_flags` | 4 bytes | Sampling decision flags |
+| `trace_state` | 0-4 bytes | Optional vendor-specific data |
+| **Total** | **~32 bytes** | **Added per traced P2P message** |
+
+```mermaid
+flowchart LR
+ subgraph msg["P2P Message with Trace Context"]
+ A["Original Message
+(variable size)"] --> B["+ TraceContext
+(~32 bytes)"]
+ end
+
+ subgraph breakdown["Context Breakdown"]
+ C["trace_id
+16 bytes"]
+ D["span_id
+8 bytes"]
+ E["flags
+4 bytes"]
+ F["state
+0-4 bytes"]
+ end
+
+ B --> breakdown
+
+ style A fill:#424242,stroke:#212121,color:#fff
+ style B fill:#2e7d32,stroke:#1b5e20,color:#fff
+ style C fill:#1565c0,stroke:#0d47a1,color:#fff
+ style D fill:#1565c0,stroke:#0d47a1,color:#fff
+ style E fill:#e65100,stroke:#bf360c,color:#fff
+ style F fill:#4a148c,stroke:#2e0d57,color:#fff
+```
+
+> **Note**: 32 bytes is negligible compared to typical transaction messages (hundreds to thousands of bytes)
+
+### Mitigation Strategies
+
+```mermaid
+flowchart LR
+ A["Head Sampling
+10% default"] --> B["Tail Sampling
+Keep errors/slow"] --> C["Batch Export
+Reduce I/O"] --> D["Conditional Compile
+XRPL_ENABLE_TELEMETRY"]
+
+ style A fill:#1565c0,stroke:#0d47a1,color:#fff
+ style B fill:#2e7d32,stroke:#1b5e20,color:#fff
+ style C fill:#e65100,stroke:#bf360c,color:#fff
+ style D fill:#4a148c,stroke:#2e0d57,color:#fff
+```
+
+### Kill Switches (Rollback Options)
+
+1. **Config Disable**: Set `enabled=0` in config → instant disable, with no restart needed to adjust sampling
+2. **Rebuild**: Compile with `XRPL_ENABLE_TELEMETRY=OFF` → zero overhead (no-op)
+3. **Full Revert**: Clean separation allows easy commit reversion
+
+---
+
+## Slide 7: Data Collection & Privacy
+
+### What Data is Collected
+
+| Category | Attributes Collected | Purpose |
+| --------------- | ---------------------------------------------------------------------------------- | --------------------------- |
+| **Transaction** | `tx.hash`, `tx.type`, `tx.result`, `tx.fee`, `ledger_index` | Trace transaction lifecycle |
+| **Consensus** | `round`, `phase`, `mode`, `proposers` (public key or public node id), `duration_ms` | Analyze consensus timing |
+| **RPC** | `command`, `version`, `status`, `duration_ms` | Monitor RPC performance |
+| **Peer** | `peer.id` (public key), `latency_ms`, `message.type`, `message.size` | Network topology analysis |
+| **Ledger** | `ledger.hash`, `ledger.index`, `close_time`, `tx_count` | Ledger progression tracking |
+| **Job** | `job.type`, `queue_ms`, `worker` | JobQueue performance |
+
+### What is NOT Collected (Privacy Guarantees)
+
+```mermaid
+flowchart LR
+ subgraph notCollected["❌ NOT Collected"]
+ direction LR
+ A["Private Keys"] ~~~ B["Account Balances"] ~~~ C["Transaction Amounts"]
+ end
+
+ subgraph alsoNot["❌ Also Excluded"]
+ direction LR
+ D["IP Addresses
+(configurable)"] ~~~ E["Personal Data"] ~~~ F["Raw TX Payloads"]
+ end
+
+ style A fill:#c62828,stroke:#8c2809,color:#fff
+ style B fill:#c62828,stroke:#8c2809,color:#fff
+ style C fill:#c62828,stroke:#8c2809,color:#fff
+ style D fill:#c62828,stroke:#8c2809,color:#fff
+ style E fill:#c62828,stroke:#8c2809,color:#fff
+ style F fill:#c62828,stroke:#8c2809,color:#fff
+```
+
+### Privacy Protection Mechanisms
+
+| Mechanism | Description |
+| -------------------------- | ------------------------------------------------------------- |
+| **Account Hashing** | `xrpl.tx.account` is hashed at collector level before storage |
+| **Configurable Redaction** | Sensitive fields can be excluded via config |
+| **Sampling** | Only 10% of traces recorded by default (reduces exposure) |
+| **Local Control** | Node operators control what gets exported |
+| **No Raw Payloads** | Transaction content is never recorded, only metadata |
+
+> **Key Principle**: Telemetry collects **operational metadata** (timing, counts, hashes) — never **sensitive content** (keys, balances, amounts).
+
+---
+
+_End of Presentation_