newrelic
diff --git a/‎README.md‎
Lines changed: 18 additions & 1 deletion b/‎README.md‎
Lines changed: 18 additions & 1 deletion
diff --git a/‎k8s/base/configs/internal-telemetry-config.yaml‎
Lines changed: 77 additions & 0 deletions b/‎k8s/base/configs/internal-telemetry-config.yaml‎
Lines changed: 77 additions & 0 deletions
diff --git a/‎k8s/base/configs/kafka-jmx-config.yaml‎
Lines changed: 199 additions & 0 deletions b/‎k8s/base/configs/kafka-jmx-config.yaml‎
Lines changed: 199 additions & 0 deletions
@@ -20,7 +20,15 @@ Relibank simulates a banking system with separate services for accounts, transac
 - **chatbot-service** - Relibank's AI chatbot (FastAPI)
 - **notifications-service** - Sends notifications via Kafka
 - **scheduler-service** - Schedules events via Kafka
-- **Infrastructure** - Kafka, Zookeeper, databases
+- **Infrastructure:**
+  - Kafka & Zookeeper - Message streaming
+  - PostgreSQL - Accounts database
+  - MSSQL - Transactions database
+  - **otel-collector-kafka** - OpenTelemetry collector for Kafka monitoring
+    - JMX metrics (Kafka broker + JVM telemetry)
+    - Kafka protocol metrics
+    - Internal collector telemetry
+    - Exports to New Relic via OTLP
 
 ## Getting Started
 
@@ -164,6 +172,15 @@ This isn't meant to be a real banking application. It's a learning tool for:
 - **Date Conversion**: Automatic MSSQL date-to-string conversion for proper JSON serialization
 - **Active Schedule Filtering**: Frontend filters cancelled vs active recurring payments
 
+### Infrastructure Monitoring
+- **Kafka OpenTelemetry Collector**: Comprehensive monitoring of Kafka infrastructure
+  - **JMX Metrics**: Kafka broker metrics via JMX (topics, partitions, replication, leader elections)
+  - **JVM Telemetry**: Full JVM observability (GC, memory, threads, CPU, file descriptors)
+  - **Kafka Protocol Metrics**: Native Kafka metrics (broker count, consumer lag, partition health)
+  - **Internal Telemetry**: Collector self-monitoring with detailed metrics
+  - **Export to New Relic**: All metrics sent to New Relic via OTLP
+  - See [`otel_collector_kafka/README.md`](otel_collector_kafka/README.md) for details
+
 Try breaking things with Chaos Mesh and see how the system responds!
 
 ---
 
@@ -0,0 +1,77 @@
+apiVersion: v1
+kind: ConfigMap  # Collector self-telemetry (metrics/logs/traces) settings; meant to be merged with the pipeline config at collector startup.
+metadata:
+  name: internal-telemetry-config
+  namespace: relibank
+data:
+  internal-telemetry-config.yaml: |
+    ##### Example configuration for internal telemetry
+    # This configuration is intended to be used in conjunction with a configuration of components and pipelines. The
+    # collector supports config merging on startup.
+    ##### Requirements
+    # - nrdot-collector (any distro) >= 1.3.0 or collector core version >= v1.35.0 / v0.129.0
+    ##### Configuration via environment variables
+    # For official documentation, see: https://opentelemetry.io/docs/collector/internal-telemetry/
+    ## Required
+    # - NEW_RELIC_LICENSE_KEY (baked into image via Dockerfile) - NOTE(review): a key baked into an image is readable by anyone who can pull that image; confirm this is intended or source it from a Kubernetes Secret
+    ## Optional
+    # - INTERNAL_TELEMETRY_SERVICE_NAME: defaults to 'otel-collector'; determines entity name in New Relic *** this is important for testing ***
+    # - INTERNAL_TELEMETRY_OTLP_ENDPOINT: defaults to 'https://otlp.nr-data.net'; see https://docs.newrelic.com/docs/opentelemetry/best-practices/opentelemetry-otlp/ and https://docs.newrelic.com/docs/opentelemetry/best-practices/opentelemetry-otlp-troubleshooting/
+    # - INTERNAL_TELEMETRY_METRICS_LEVEL: defaults to 'detailed'; other values are 'normal', 'basic', 'none'
+    # - INTERNAL_TELEMETRY_LOG_LEVEL: defaults to INFO; other values are DEBUG, WARN, ERROR
+    # - INTERNAL_TELEMETRY_TRACE_LEVEL: defaults to 'none' (traces disabled); other value is 'basic'
+    # - INTERNAL_TELEMETRY_TRACE_SAMPLE_RATIO: defaults to 0.01, i.e. 1% sampling; has no effect if TRACE_LEVEL is 'none'
+    service:
+      telemetry:
+        metrics:
+          level: "${env:INTERNAL_TELEMETRY_METRICS_LEVEL:-detailed}"
+          readers:
+            - periodic:
+                exporter:
+                  otlp:
+                    protocol: http/protobuf
+                    endpoint: "${env:INTERNAL_TELEMETRY_OTLP_ENDPOINT:-https://otlp.nr-data.net}"
+                    headers:
+                      - name: api-key
+                        value: "${env:NEW_RELIC_LICENSE_KEY}"
+        logs:
+          level: "${env:INTERNAL_TELEMETRY_LOG_LEVEL:-INFO}"
+          # default sampling config for reference to simplify overwrites even if not exposed via env var, e.g. --config=yaml:service::telemetry::logs::sampling::enabled::false
+          sampling:
+            enabled: true
+            # The interval in seconds that the logger applies to each sampling.
+            tick: 10s
+            # The number of messages logged at the start of each sampling::tick
+            initial: 10
+            # Sets the sampling policy for subsequent messages after sampling::initial messages are logged. When sampling::thereafter is set to N, every Nth message is logged and all others are dropped. If N is zero, the logger drops all messages after sampling::initial messages are logged.
+            thereafter: 100
+          processors:
+            - batch:
+                exporter:
+                  otlp:
+                    protocol: http/protobuf
+                    endpoint: "${env:INTERNAL_TELEMETRY_OTLP_ENDPOINT:-https://otlp.nr-data.net}"
+                    headers:
+                      - name: api-key
+                        value: "${env:NEW_RELIC_LICENSE_KEY}"
+        traces:
+          # traces are disabled by default due to experimental status and lack of default sampling rate that works across use cases
+          level: "${env:INTERNAL_TELEMETRY_TRACE_LEVEL:-none}"
+          sampler:
+            parent_based:
+              root:
+                trace_id_ratio_based:
+                  ratio: ${env:INTERNAL_TELEMETRY_TRACE_SAMPLE_RATIO:-0.01}
+          processors:
+            - batch:
+                exporter:
+                  otlp:
+                    protocol: http/protobuf
+                    endpoint: "${env:INTERNAL_TELEMETRY_OTLP_ENDPOINT:-https://otlp.nr-data.net}"
+                    headers:
+                      - name: api-key
+                        value: "${env:NEW_RELIC_LICENSE_KEY}"
+        resource:
+          newrelic.collector_telemetry.version: 0.4.0
+          newrelic.service.type: otel_collector
+          service.name: "${env:INTERNAL_TELEMETRY_SERVICE_NAME:-otel-collector}"
@@ -0,0 +1,199 @@
+# JMX metric rule definitions (MBean attribute -> metric mappings) for the
+# Kafka broker and the JVM it runs in. Each rule selects MBeans via `bean`
+# and maps attributes to named metrics under `mapping`.
+# NOTE(review): presumably loaded by the JMX receiver of otel-collector-kafka;
+# the consumer is not visible in this manifest - confirm.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: kafka-jmx-config
+  namespace: relibank
+data:
+  kafka-jmx-config.yaml: |
+    ---
+    rules:
+      # Per-topic custom metrics using custom MBean commands
+      - bean: kafka.server:type=BrokerTopicMetrics,name=MessagesInPerSec,topic=*
+        metricAttribute:
+          topic: param(topic)
+        mapping:
+          Count:
+            metric: kafka.prod.msg.count
+            type: counter
+            desc: The number of messages in per topic
+            unit: "{message}"
+
+      # BytesIn/BytesOut share one metric name (kafka.topic.io) and are told
+      # apart by the constant `direction` attribute.
+      - bean: kafka.server:type=BrokerTopicMetrics,name=BytesInPerSec,topic=*
+        metricAttribute:
+          topic: param(topic)
+          direction: const(in)
+        mapping:
+          Count:
+            metric: kafka.topic.io
+            type: counter
+            desc: The bytes received or sent per topic
+            unit: By
+
+      - bean: kafka.server:type=BrokerTopicMetrics,name=BytesOutPerSec,topic=*
+        metricAttribute:
+          topic: param(topic)
+          direction: const(out)
+        mapping:
+          Count:
+            metric: kafka.topic.io
+            type: counter
+            desc: The bytes received or sent per topic
+            unit: By
+
+      # Cluster-level metrics using controller-based MBeans
+      - bean: kafka.controller:type=KafkaController,name=GlobalTopicCount
+        mapping:
+          Value:
+            metric: kafka.cluster.topic.count
+            type: gauge
+            desc: The total number of global topics in the cluster
+            unit: "{topic}"
+
+      - bean: kafka.controller:type=KafkaController,name=GlobalPartitionCount
+        mapping:
+          Value:
+            metric: kafka.cluster.partition.count
+            type: gauge
+            desc: The total number of global partitions in the cluster
+            unit: "{partition}"
+
+      - bean: kafka.controller:type=KafkaController,name=FencedBrokerCount
+        mapping:
+          Value:
+            metric: kafka.broker.fenced.count
+            type: gauge
+            desc: The number of fenced brokers in the cluster
+            unit: "{broker}"
+
+      - bean: kafka.controller:type=KafkaController,name=PreferredReplicaImbalanceCount
+        mapping:
+          Value:
+            metric: kafka.partition.non_preferred_leader
+            type: gauge
+            desc: The count of topic partitions for which the leader is not the preferred leader
+            unit: "{partition}"
+
+      # Broker-level metrics using ReplicaManager MBeans
+      - bean: kafka.server:type=ReplicaManager,name=UnderMinIsrPartitionCount
+        mapping:
+          Value:
+            metric: kafka.partition.under_min_isr
+            type: gauge
+            desc: The number of partitions where the number of in-sync replicas is less than the minimum
+            unit: "{partition}"
+
+      # Broker uptime metric using JVM Runtime
+      - bean: java.lang:type=Runtime
+        mapping:
+          Uptime:
+            metric: kafka.broker.uptime
+            type: gauge
+            desc: Broker uptime in milliseconds
+            unit: ms
+
+      # Leader count per broker
+      - bean: kafka.server:type=ReplicaManager,name=LeaderCount
+        mapping:
+          Value:
+            metric: kafka.broker.leader.count
+            type: gauge
+            desc: Number of partitions for which this broker is the leader
+            unit: "{partition}"
+
+      # JVM metrics
+      - bean: java.lang:type=GarbageCollector,name=*
+        mapping:
+          CollectionCount:
+            metric: jvm.gc.collections.count
+            type: counter
+            unit: "{collection}"
+            desc: total number of collections that have occurred
+            metricAttribute:
+              name: param(name)
+          CollectionTime:
+            metric: jvm.gc.collections.elapsed
+            type: counter
+            unit: ms
+            desc: the approximate accumulated collection elapsed time in milliseconds
+            metricAttribute:
+              name: param(name)
+
+      # Heap figures; `prefix` is prepended, giving jvm.memory.heap.*.
+      # dropNegativeValues discards negative readings instead of exporting them.
+      - bean: java.lang:type=Memory
+        unit: By
+        prefix: jvm.memory.
+        dropNegativeValues: true
+        mapping:
+          HeapMemoryUsage.committed:
+            metric: heap.committed
+            desc: heap memory committed for the JVM to use
+            type: gauge
+          HeapMemoryUsage.max:
+            metric: heap.max
+            desc: maximum heap memory available to the JVM
+            type: gauge
+          HeapMemoryUsage.used:
+            metric: heap.used
+            desc: current heap usage
+            type: gauge
+
+      - bean: java.lang:type=Threading
+        mapping:
+          ThreadCount:
+            metric: jvm.thread.count
+            type: gauge
+            unit: "{thread}"
+            desc: Total thread count (Kafka typical range 100-300 threads)
+
+      - bean: java.lang:type=OperatingSystem
+        prefix: jvm.
+        dropNegativeValues: true
+        mapping:
+          SystemLoadAverage:
+            metric: system.cpu.load_1m
+            type: gauge
+            unit: "{run_queue_item}"
+            desc: System load average (1 minute) - alert if > CPU count
+          AvailableProcessors:
+            metric: cpu.count
+            type: gauge
+            unit: "{cpu}"
+            desc: Number of processors available
+          ProcessCpuLoad:
+            metric: cpu.recent_utilization
+            type: gauge
+            unit: '1'
+            desc: Recent CPU utilization for JVM process (0.0 to 1.0)
+          SystemCpuLoad:
+            metric: system.cpu.utilization
+            type: gauge
+            unit: '1'
+            desc: Recent CPU utilization for whole system (0.0 to 1.0)
+          OpenFileDescriptorCount:
+            metric: file_descriptor.count
+            type: gauge
+            unit: "{file_descriptor}"
+            desc: Number of open file descriptors - alert if > 80% of ulimit
+
+      - bean: java.lang:type=ClassLoading
+        mapping:
+          LoadedClassCount:
+            metric: jvm.class.count
+            type: gauge
+            unit: "{class}"
+            desc: Currently loaded class count
+
+      # Per-pool memory figures, one series per pool via the `name` attribute.
+      # Usage.max is -1 for pools without a defined maximum; drop negatives so
+      # no "-1 bytes" gauge is exported (same handling as the Memory rule).
+      - bean: java.lang:type=MemoryPool,name=*
+        type: gauge
+        unit: By
+        dropNegativeValues: true
+        metricAttribute:
+          name: param(name)
+        mapping:
+          Usage.used:
+            metric: jvm.memory.pool.used
+            desc: Memory pool usage by generation (G1 Old Gen, Eden, Survivor)
+          Usage.max:
+            metric: jvm.memory.pool.max
+            desc: Maximum memory pool size
+          CollectionUsage.used:
+            metric: jvm.memory.pool.used_after_last_gc
+            desc: Memory used after last GC (shows retained memory baseline)