Skip to content

Commit d4912ad

Browse files
authored
feat: kafka otel + internal collector metrics (#49)
fix: doc updates fix: working kafka piece fix: update azure prod to include the new image
1 parent d3bd78f commit d4912ad

File tree

11 files changed

+1038
-7
lines changed

11 files changed

+1038
-7
lines changed

README.md

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,15 @@ Relibank simulates a banking system with separate services for accounts, transac
2020
- **chatbot-service** - Relibank's AI chatbot (FastAPI)
2121
- **notifications-service** - Sends notifications via Kafka
2222
- **scheduler-service** - Schedules events via Kafka
23-
- **Infrastructure** - Kafka, Zookeeper, databases
23+
- **Infrastructure:**
24+
- Kafka & Zookeeper - Message streaming
25+
- PostgreSQL - Accounts database
26+
- MSSQL - Transactions database
27+
- **otel-collector-kafka** - OpenTelemetry collector for Kafka monitoring
28+
- JMX metrics (Kafka broker + JVM telemetry)
29+
- Kafka protocol metrics
30+
- Internal collector telemetry
31+
- Exports to New Relic via OTLP
2432

2533
## Getting Started
2634

@@ -164,6 +172,15 @@ This isn't meant to be a real banking application. It's a learning tool for:
164172
- **Date Conversion**: Automatic MSSQL date-to-string conversion for proper JSON serialization
165173
- **Active Schedule Filtering**: Frontend filters cancelled vs active recurring payments
166174

175+
### Infrastructure Monitoring
176+
- **Kafka OpenTelemetry Collector**: Comprehensive monitoring of Kafka infrastructure
177+
- **JMX Metrics**: Kafka broker metrics via JMX (topics, partitions, replication, leader elections)
178+
- **JVM Telemetry**: Full JVM observability (GC, memory, threads, CPU, file descriptors)
179+
- **Kafka Protocol Metrics**: Native Kafka metrics (broker count, consumer lag, partition health)
180+
- **Internal Telemetry**: Collector self-monitoring with detailed metrics
181+
- **Export to New Relic**: All metrics sent to New Relic via OTLP
182+
- See [`otel_collector_kafka/README.md`](otel_collector_kafka/README.md) for details
183+
167184
Try breaking things with Chaos Mesh and see how the system responds!
168185

169186
---
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: internal-telemetry-config
5+
namespace: relibank
6+
data:
7+
internal-telemetry-config.yaml: |
8+
##### Example configuration for internal telemetry
9+
# This configuration is intended to be used in conjunction with a configuration of components and pipelines. The
10+
# collector supports config merging on startup.
11+
##### Requirements
12+
# - nrdot-collector (any distro) >= 1.3.0 or collector core version >= v1.35.0 / v0.129.0
13+
##### Configuration via environment variables
14+
# For official documentation, see: https://opentelemetry.io/docs/collector/internal-telemetry/
15+
## Required
16+
# - NEW_RELIC_LICENSE_KEY (baked into image via Dockerfile)
17+
## Optional
18+
# - INTERNAL_TELEMETRY_SERVICE_NAME: defaults to 'otel-collector'; determines entity name in New Relic *** this important for testing ***
19+
# - INTERNAL_TELEMETRY_OTLP_ENDPOINT: defaults to 'https://otlp.nr-data.net'; see https://docs.newrelic.com/docs/opentelemetry/best-practices/opentelemetry-otlp/ and https://docs.newrelic.com/docs/opentelemetry/best-practices/opentelemetry-otlp-troubleshooting/
20+
# - INTERNAL_TELEMETRY_METRICS_LEVEL: defaults to 'detailed'; other values are 'normal', 'basic', 'none'
21+
# - INTERNAL_TELEMETRY_LOG_LEVEL: defaults to INFO; other values are DEBUG, WARN, ERROR
22+
# - INTERNAL_TELEMETRY_TRACE_LEVEL: defaults to 'none' (traces disabled); other value is 'basic'
23+
# - INTERNAL_TELEMETRY_TRACE_SAMPLE_RATIO: defaults to 0.01, i.e. 1% sampling; has no effect if TRACE_LEVEL is 'none'
24+
service:
25+
telemetry:
26+
metrics:
27+
level: "${env:INTERNAL_TELEMETRY_METRICS_LEVEL:-detailed}"
28+
readers:
29+
- periodic:
30+
exporter:
31+
otlp:
32+
protocol: http/protobuf
33+
endpoint: "${env:INTERNAL_TELEMETRY_OTLP_ENDPOINT:-https://otlp.nr-data.net}"
34+
headers:
35+
- name: api-key
36+
value: "${env:NEW_RELIC_LICENSE_KEY}"
37+
logs:
38+
level: "${env:INTERNAL_TELEMETRY_LOG_LEVEL:-INFO}"
39+
# default sampling config for reference to simplify overwrites even if not exposed via env var, e.g. --config=yaml:service::telemetry::logs::sampling::enabled::false
40+
sampling:
41+
enabled: true
42+
# The interval in seconds that the logger applies to each sampling.
43+
tick: 10s
44+
# The number of messages logged at the start of each sampling::tick
45+
initial: 10
46+
# Sets the sampling policy for subsequent messages after sampling::initial messages are logged. When sampling::thereafter is set to N, every Nth message is logged and all others are dropped. If N is zero, the logger drops all messages after sampling::initial messages are logged.
47+
thereafter: 100
48+
processors:
49+
- batch:
50+
exporter:
51+
otlp:
52+
protocol: http/protobuf
53+
endpoint: "${env:INTERNAL_TELEMETRY_OTLP_ENDPOINT:-https://otlp.nr-data.net}"
54+
headers:
55+
- name: api-key
56+
value: "${env:NEW_RELIC_LICENSE_KEY}"
57+
traces:
58+
# traces are disabled by default due to experimental status and lack of default sampling rate that works across use cases
59+
level: "${env:INTERNAL_TELEMETRY_TRACE_LEVEL:-none}"
60+
sampler:
61+
parent_based:
62+
root:
63+
trace_id_ratio_based:
64+
ratio: ${env:INTERNAL_TELEMETRY_TRACE_SAMPLE_RATIO:-0.01}
65+
processors:
66+
- batch:
67+
exporter:
68+
otlp:
69+
protocol: http/protobuf
70+
endpoint: "${env:INTERNAL_TELEMETRY_OTLP_ENDPOINT:-https://otlp.nr-data.net}"
71+
headers:
72+
- name: api-key
73+
value: "${env:NEW_RELIC_LICENSE_KEY}"
74+
resource:
75+
newrelic.collector_telemetry.version: 0.4.0
76+
newrelic.service.type: otel_collector
77+
service.name: "${env:INTERNAL_TELEMETRY_SERVICE_NAME:-otel-collector}"
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: kafka-jmx-config
5+
namespace: relibank
6+
data:
7+
kafka-jmx-config.yaml: |
8+
---
9+
rules:
10+
# Per-topic custom metrics using custom MBean commands
11+
- bean: kafka.server:type=BrokerTopicMetrics,name=MessagesInPerSec,topic=*
12+
metricAttribute:
13+
topic: param(topic)
14+
mapping:
15+
Count:
16+
metric: kafka.prod.msg.count
17+
type: counter
18+
desc: The number of messages in per topic
19+
unit: "{message}"
20+
21+
- bean: kafka.server:type=BrokerTopicMetrics,name=BytesInPerSec,topic=*
22+
metricAttribute:
23+
topic: param(topic)
24+
direction: const(in)
25+
mapping:
26+
Count:
27+
metric: kafka.topic.io
28+
type: counter
29+
desc: The bytes received or sent per topic
30+
unit: By
31+
32+
- bean: kafka.server:type=BrokerTopicMetrics,name=BytesOutPerSec,topic=*
33+
metricAttribute:
34+
topic: param(topic)
35+
direction: const(out)
36+
mapping:
37+
Count:
38+
metric: kafka.topic.io
39+
type: counter
40+
desc: The bytes received or sent per topic
41+
unit: By
42+
43+
# Cluster-level metrics using controller-based MBeans
44+
- bean: kafka.controller:type=KafkaController,name=GlobalTopicCount
45+
mapping:
46+
Value:
47+
metric: kafka.cluster.topic.count
48+
type: gauge
49+
desc: The total number of global topics in the cluster
50+
unit: "{topic}"
51+
52+
- bean: kafka.controller:type=KafkaController,name=GlobalPartitionCount
53+
mapping:
54+
Value:
55+
metric: kafka.cluster.partition.count
56+
type: gauge
57+
desc: The total number of global partitions in the cluster
58+
unit: "{partition}"
59+
60+
- bean: kafka.controller:type=KafkaController,name=FencedBrokerCount
61+
mapping:
62+
Value:
63+
metric: kafka.broker.fenced.count
64+
type: gauge
65+
desc: The number of fenced brokers in the cluster
66+
unit: "{broker}"
67+
68+
- bean: kafka.controller:type=KafkaController,name=PreferredReplicaImbalanceCount
69+
mapping:
70+
Value:
71+
metric: kafka.partition.non_preferred_leader
72+
type: gauge
73+
desc: The count of topic partitions for which the leader is not the preferred leader
74+
unit: "{partition}"
75+
76+
# Broker-level metrics using ReplicaManager MBeans
77+
- bean: kafka.server:type=ReplicaManager,name=UnderMinIsrPartitionCount
78+
mapping:
79+
Value:
80+
metric: kafka.partition.under_min_isr
81+
type: gauge
82+
desc: The number of partitions where the number of in-sync replicas is less than the minimum
83+
unit: "{partition}"
84+
85+
# Broker uptime metric using JVM Runtime
86+
- bean: java.lang:type=Runtime
87+
mapping:
88+
Uptime:
89+
metric: kafka.broker.uptime
90+
type: gauge
91+
desc: Broker uptime in milliseconds
92+
unit: ms
93+
94+
# Leader count per broker
95+
- bean: kafka.server:type=ReplicaManager,name=LeaderCount
96+
mapping:
97+
Value:
98+
metric: kafka.broker.leader.count
99+
type: gauge
100+
desc: Number of partitions for which this broker is the leader
101+
unit: "{partition}"
102+
103+
# JVM metrics
104+
- bean: java.lang:type=GarbageCollector,name=*
105+
mapping:
106+
CollectionCount:
107+
metric: jvm.gc.collections.count
108+
type: counter
109+
unit: "{collection}"
110+
desc: total number of collections that have occurred
111+
metricAttribute:
112+
name: param(name)
113+
CollectionTime:
114+
metric: jvm.gc.collections.elapsed
115+
type: counter
116+
unit: ms
117+
desc: the approximate accumulated collection elapsed time in milliseconds
118+
metricAttribute:
119+
name: param(name)
120+
121+
- bean: java.lang:type=Memory
122+
unit: By
123+
prefix: jvm.memory.
124+
dropNegativeValues: true
125+
mapping:
126+
HeapMemoryUsage.committed:
127+
metric: heap.committed
128+
desc: current heap usage
129+
type: gauge
130+
HeapMemoryUsage.max:
131+
metric: heap.max
132+
desc: current heap usage
133+
type: gauge
134+
HeapMemoryUsage.used:
135+
metric: heap.used
136+
desc: current heap usage
137+
type: gauge
138+
139+
- bean: java.lang:type=Threading
140+
mapping:
141+
ThreadCount:
142+
metric: jvm.thread.count
143+
type: gauge
144+
unit: "{thread}"
145+
desc: Total thread count (Kafka typical range 100-300 threads)
146+
147+
- bean: java.lang:type=OperatingSystem
148+
prefix: jvm.
149+
dropNegativeValues: true
150+
mapping:
151+
SystemLoadAverage:
152+
metric: system.cpu.load_1m
153+
type: gauge
154+
unit: "{run_queue_item}"
155+
desc: System load average (1 minute) - alert if > CPU count
156+
AvailableProcessors:
157+
metric: cpu.count
158+
type: gauge
159+
unit: "{cpu}"
160+
desc: Number of processors available
161+
ProcessCpuLoad:
162+
metric: cpu.recent_utilization
163+
type: gauge
164+
unit: '1'
165+
desc: Recent CPU utilization for JVM process (0.0 to 1.0)
166+
SystemCpuLoad:
167+
metric: system.cpu.utilization
168+
type: gauge
169+
unit: '1'
170+
desc: Recent CPU utilization for whole system (0.0 to 1.0)
171+
OpenFileDescriptorCount:
172+
metric: file_descriptor.count
173+
type: gauge
174+
unit: "{file_descriptor}"
175+
desc: Number of open file descriptors - alert if > 80% of ulimit
176+
177+
- bean: java.lang:type=ClassLoading
178+
mapping:
179+
LoadedClassCount:
180+
metric: jvm.class.count
181+
type: gauge
182+
unit: "{class}"
183+
desc: Currently loaded class count
184+
185+
- bean: java.lang:type=MemoryPool,name=*
186+
type: gauge
187+
unit: By
188+
metricAttribute:
189+
name: param(name)
190+
mapping:
191+
Usage.used:
192+
metric: jvm.memory.pool.used
193+
desc: Memory pool usage by generation (G1 Old Gen, Eden, Survivor)
194+
Usage.max:
195+
metric: jvm.memory.pool.max
196+
desc: Maximum memory pool size
197+
CollectionUsage.used:
198+
metric: jvm.memory.pool.used_after_last_gc
199+
desc: Memory used after last GC (shows retained memory baseline)

0 commit comments

Comments
 (0)