Skip to content

Commit 762d58e

Browse files
committed
feat(infra): add OTEL Collector with spanmetrics connector
Configure OTEL Collector to derive RED metrics from trace spans via the spanmetrics connector, exported to Prometheus. Adds collector config, updates docker-compose, and wires Prometheus scrape target.
1 parent 8c859b3 commit 762d58e

File tree

4 files changed

+184
-4
lines changed

4 files changed

+184
-4
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ logs/
3737
# If you have .env files specific to subprojects, e.g. subproject/.env,
3838
# the .env rule above will catch them.
3939

40+
# Local validation scripts (not part of CI)
41+
test-suite/fhevm/scripts/validate-spanmetrics.sh
42+
4043
# Build output & temporary directories (generic)
4144
.cache/
4245
.buildx-cache/
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# =============================================================================
2+
# OpenTelemetry Collector Configuration for FHEVM Coprocessor
3+
# =============================================================================
4+
# Architecture:
5+
# Services --OTLP gRPC--> [otlp receiver] --traces--> [spanmetrics connector] --> [prometheus exporter]
6+
# --traces--> [otlp/jaeger exporter] --> Jaeger
7+
#
8+
# Spanmetrics dimensions (explicit allowlist):
9+
# - service.name (default) — 6 known services
10+
# - span.name (default) — ~45 distinct span names
11+
# - status.code (default) — OK / ERROR
12+
# - operation — FHE operation enum name (~30 values)
13+
# - ct_type — ciphertext type (~15 values)
14+
# - operation_pattern_id — per-cone DFG hash (bounded by distinct contract patterns)
15+
# - transaction_pattern_id — whole-tx DFG hash (bounded by distinct contract patterns)
16+
#
17+
# Excluded from defaults:
18+
# - span.kind — every coprocessor span is INTERNAL, so this label would always hold the same value and add no information
19+
#
20+
# High-cardinality attributes NOT listed as dimensions are automatically
21+
# excluded from spanmetrics output (txn_id, count, compressed_size, etc.)
22+
# =============================================================================
23+
24+
# Ingress for application telemetry: services push spans to the collector over OTLP.
receivers:
  otlp:
    protocols:
      # OTLP/gRPC listener, bound on all interfaces so other containers can reach it.
      grpc:
        endpoint: "0.0.0.0:4317"
      # OTLP/HTTP listener. The tracing docker-compose file publishes host port
      # 4318 for OTLP HTTP; without this protocol entry nothing would be
      # listening behind that port mapping.
      http:
        endpoint: "0.0.0.0:4318"
29+
30+
# The spanmetrics connector sits between the traces pipeline and the metrics
# pipeline: it consumes spans and emits call-count and duration metrics.
connectors:
  spanmetrics:
    # Prefix applied to every generated metric name.
    namespace: coprocessor.span

    # Span duration histogram, reported in milliseconds with fixed bucket bounds
    # ranging from 1ms up to 60s.
    histogram:
      unit: ms
      explicit:
        buckets:
          [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms,
           500ms, 1s, 2.5s, 5s, 10s, 30s, 60s]

    # Span attributes promoted to metric labels, in addition to the built-in
    # dimensions (service.name, span.name, status.code).
    dimensions:
      # FHE op name on fhe_operation / compress_ciphertext spans
      - name: operation
      # Ciphertext type on compress/squash/upload spans
      - name: ct_type
      # per-cone DFG fingerprint on execute_transaction / fhe_operation spans
      - name: operation_pattern_id
      # whole-tx DFG fingerprint on execute_transaction spans
      - name: transaction_pattern_id

    # Built-in dimensions to drop: all coprocessor spans are INTERNAL, so
    # span.kind would only ever carry a single value.
    exclude_dimensions: [span.kind]

    # Attach exemplars so Grafana can pivot from a metric sample to a trace.
    exemplars:
      enabled: true
      max_per_data_point: 5

    # Flush and expiration settings for the generated metrics.
    metrics_flush_interval: 15s
    metrics_expiration: 5m
72+
73+
# Destinations for data leaving the collector.
exporters:
  # Traces are forwarded to Jaeger over OTLP gRPC. `tls.insecure` is set, so
  # this hop is not TLS-protected.
  otlp/jaeger:
    endpoint: 'jaeger:4317'
    tls:
      insecure: true

  # Scrape endpoint that serves the span-derived metrics to Prometheus.
  prometheus:
    endpoint: '0.0.0.0:8889'
    # OpenMetrics output is required for exemplars.
    enable_open_metrics: true
82+
83+
# Shared batching stage, referenced by both the traces and the metrics pipeline.
processors:
  batch:
    # Upper bound on how long data waits before a batch is sent.
    timeout: 5s
    # Batch size that triggers a send.
    send_batch_size: 1024
87+
88+
# Wires receivers, processors, connectors and exporters together and configures
# the collector's own telemetry.
service:
  telemetry:
    logs:
      level: info
    metrics:
      # Collector's own health metrics.
      # NOTE(review): newer collector releases appear to configure this through
      # `metrics.readers` instead of `address` — confirm before bumping the image.
      address: "0.0.0.0:8888"

  pipelines:
    # Spans arrive over OTLP, are batched, then fan out to the spanmetrics
    # connector and to Jaeger.
    traces:
      receivers:
        - otlp
      processors:
        - batch
      exporters:
        - spanmetrics
        - otlp/jaeger

    # Metrics emitted by the spanmetrics connector are batched and exposed
    # through the Prometheus exporter.
    metrics/spanmetrics:
      receivers:
        - spanmetrics
      processors:
        - batch
      exporters:
        - prometheus
105+
106+
# =============================================================================
107+
# Sample PromQL Queries for RED Dashboards
108+
# =============================================================================
109+
# Metric names generated (namespace "coprocessor.span" → underscores in Prometheus):
110+
# coprocessor_span_calls_total — span call counter
111+
# coprocessor_span_duration_milliseconds_bucket — span duration histogram
112+
#
113+
# --- Rate (Request Rate) ---
114+
#
115+
# Total span call rate by service and span name:
116+
# sum(rate(coprocessor_span_calls_total[5m])) by (service_name, span_name)
117+
#
118+
# FHE operation rate by operation type:
119+
# sum(rate(coprocessor_span_calls_total{span_name="fhe_operation"}[5m])) by (operation)
120+
#
121+
# --- Error Rate ---
122+
#
123+
# Error rate by service:
124+
# sum(rate(coprocessor_span_calls_total{status_code="STATUS_CODE_ERROR"}[5m])) by (service_name)
125+
# / sum(rate(coprocessor_span_calls_total[5m])) by (service_name)
126+
#
127+
# Error rate by span name:
128+
# sum(rate(coprocessor_span_calls_total{status_code="STATUS_CODE_ERROR"}[5m])) by (span_name)
129+
# / sum(rate(coprocessor_span_calls_total[5m])) by (span_name)
130+
#
131+
# --- Duration (Latency Percentiles) ---
132+
#
133+
# p50 latency by service:
134+
# histogram_quantile(0.5, sum(rate(coprocessor_span_duration_milliseconds_bucket[5m])) by (le, service_name))
135+
#
136+
# p95 latency by span name:
137+
# histogram_quantile(0.95, sum(rate(coprocessor_span_duration_milliseconds_bucket[5m])) by (le, span_name))
138+
#
139+
# p99 latency for FHE operations by operation type:
140+
# histogram_quantile(0.99, sum(rate(coprocessor_span_duration_milliseconds_bucket{span_name="fhe_operation"}[5m])) by (le, operation))
141+
#
142+
# p95 latency by DFG operation pattern:
143+
# histogram_quantile(0.95, sum(rate(coprocessor_span_duration_milliseconds_bucket{span_name="execute_transaction"}[5m])) by (le, operation_pattern_id))
144+
#
145+
# p95 latency by DFG transaction pattern:
146+
# histogram_quantile(0.95, sum(rate(coprocessor_span_duration_milliseconds_bucket{span_name="execute_transaction"}[5m])) by (le, transaction_pattern_id))
147+
# =============================================================================

test-suite/fhevm/config/prometheus/prometheus.yml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,20 @@ scrape_configs:
77

88
static_configs:
99
- targets: ['kms-connector-gw-listener:9100', 'kms-connector-kms-worker:9100', 'kms-connector-tx-sender:9100']
10-
10+
1111
# Coprocessor job configuration
1212
- job_name: 'coprocessor'
1313
static_configs:
1414
- targets: ['coprocessor-transaction-sender:9100', 'coprocessor-gw-listener:9100', 'coprocessor-tfhe-worker:9100', 'coprocessor-sns-worker:9100', 'coprocessor-zkproof-worker:9100']
15+
16+
# Spanmetrics from OTEL Collector
17+
# Scrapes spanmetrics-derived metrics from the OTEL Collector's Prometheus exporter.
18+
# OpenMetricsText1.0.0 must be first to enable exemplar ingestion (metric→trace links).
19+
- job_name: 'otel-spanmetrics'
  scrape_interval: 15s
  # Protocols in order of preference. OpenMetricsText1.0.0 is required first:
  # it enables exemplar ingestion.
  scrape_protocols: [OpenMetricsText1.0.0, PrometheusProto, PrometheusText0.0.4]
  static_configs:
    # Prometheus exporter port of the OTEL Collector.
    - targets:
        - 'otel-collector:8889'

test-suite/fhevm/docker-compose/tracing-docker-compose.yml

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,23 @@ services:
1212
- prom_data:/prometheus
1313
command:
1414
- '--config.file=/etc/prometheus/prometheus.yml'
15+
- '--enable-feature=exemplar-storage' # Enable exemplar support for spanmetrics trace links
16+
depends_on:
17+
- otel-collector
18+
19+
# OTEL Collector: ingress point for OTLP traces; forwards to Jaeger and exposes
# span-derived metrics for Prometheus.
otel-collector:
  container_name: otel-collector
  # spanmetrics connector available since 0.86.0
  image: otel/opentelemetry-collector-contrib:0.120.0
  restart: unless-stopped
  depends_on:
    - jaeger
  volumes:
    # Collector configuration, mounted at the path the contrib image reads.
    - ../config/otel-collector/otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml
  ports:
    # OTLP gRPC (services send traces here instead of directly to Jaeger)
    - "4317:4317"
    # OTLP HTTP (optional) — only reachable if the collector config enables
    # the OTLP HTTP receiver on 4318
    - "4318:4318"
    # Prometheus exporter (spanmetrics output)
    - "8889:8889"
    # Collector health/self-monitoring metrics
    - "8888:8888"
1532

1633
jaeger:
1734
container_name: jaeger
@@ -24,9 +41,10 @@ services:
2441
- "6831:6831/udp"
2542
- "6832:6832/udp"
2643
- "5778:5778"
27-
- "16686:16686"
28-
- "4317:4317"
29-
- "4318:4318"
44+
- "16686:16686" # Jaeger UI (keep exposed to host)
45+
# NOTE: OTLP ports 4317 and 4318 are intentionally NOT exposed to the host.
46+
# The OTEL Collector is the new ingress point for OTLP traces (owns host port 4317).
47+
# Jaeger receives traces from the collector via Docker DNS (jaeger:4317).
3048
- "14250:14250"
3149
- "14268:14268"
3250
- "14269:14269"

0 commit comments

Comments
 (0)