# Source: softwaremill/klag-exporter, config.example.toml (main branch)
# Kafka Lag Exporter Configuration
# Copy this file to config.toml and adjust as needed

[exporter]
# Interval between Kafka offset polls. Default: 30s.
poll_interval = "30s"

# HTTP server host and port for the Prometheus endpoint.
http_host = "0.0.0.0"
http_port = 8000

# Level of detail in the exported metrics:
#   "topic"     - (default) aggregate per topic
#   "partition" - additionally include partition-level metrics
granularity = "topic"

# Maximum age of a cluster's last successful collection before its metrics
# are filtered out of /metrics output, so that Prometheus sees a gap rather
# than a frozen snapshot. When unset, this defaults to poll_interval * 3.
# Raise it on large clusters where a single collection cycle can take
# several minutes (timestamp sampling in "message" mode, many consumer
# groups, etc.).
# staleness_threshold = "3m"

[exporter.timestamp_sampling]
# Turns time lag calculation on or off.
enabled = true

# How time lag is estimated:
#   "rate"    - (default) Estimate from the observed high-watermark
#               production rate. No consumer pool, no FFI, scales to any
#               cluster size.
#   "message" - Read the actual Kafka message at the committed offset and
#               use its produce timestamp. Exact, but requires a pooled
#               BaseConsumer per concurrent fetch (~5-15 MB each).
# mode = "rate"

# Used only in "message" mode; ignored when mode = "rate".
max_concurrent_fetches = 5
cache_ttl = "60s"

# Used only in "rate" mode; ignored when mode = "message".
# rate_history_samples = 5       # watermark observations per partition
# rate_history_max_age = "10m"   # evict samples older than this
# rate_min_msgs_per_sec = 0.01   # below this rate, time lag is missing

[exporter.performance]
# Every setting in this table is shown commented out; uncomment a line to
# set it explicitly.
# NOTE(review): the values shown are presumably the built-in defaults (only
# client_recycle_interval is explicitly marked as such below) — confirm.

# Timeout for Kafka API operations (metadata, watermarks, etc.)
# kafka_timeout = "30s"

# Timeout for fetching committed offsets per consumer group
# offset_fetch_timeout = "10s"

# Maximum consumer groups to fetch offsets for in parallel
# max_concurrent_groups = 10

# Maximum partitions to fetch watermarks for in parallel
# max_concurrent_watermarks = 50

# Client recycling interval, measured in collection cycles (not in time).
#
# librdkafka caches internal topic handles that are never freed until the
# client is destroyed. On large clusters with many topics (especially with
# topic churn — topics being created and deleted), this cache grows
# unboundedly and can consume gigabytes of memory.
#
# Recycling periodically destroys and recreates the internal Kafka clients,
# releasing all accumulated metadata. The trade-off is a brief allocation
# spike during the swap (~2-10 MB depending on cluster size).
#
# Guidelines:
#   0   = disabled (recommended for small/stable clusters)
#   50  = default (50 cycles x 30s poll = ~25 min; good for large clusters)
#   100 = less frequent (lower overhead, more metadata accumulation)
#
# client_recycle_interval = 50

[exporter.otel]
# Set to true to turn on OpenTelemetry export. Default: false.
enabled = false

# Interval between metric pushes to OTLP.
export_interval = "60s"

# OTLP endpoint to push to (gRPC).
endpoint = "http://localhost:4317"

# Kafka cluster configuration
# You can define multiple [[clusters]] sections for multi-cluster monitoring

[[clusters]]
# Cluster name; must be unique. It is used in metric labels.
name = "production"

# Comma-separated list of Kafka bootstrap servers.
bootstrap_servers = "kafka1:9092,kafka2:9092,kafka3:9092"

# Consumer group filters (regex patterns). A group is monitored when it
# matches the whitelist AND does not match the blacklist.
group_whitelist = ['.*']
group_blacklist = []

# Topic filters (regex patterns).
topic_whitelist = ['.*']
topic_blacklist = ['^__.*']  # Exclude internal topics like __consumer_offsets

# Extra Kafka consumer properties.
# Environment variables can be substituted with ${ENV_VAR} syntax.
[clusters.consumer_properties]
# For SASL authentication, uncomment and configure:
# "security.protocol" = "SASL_SSL"
# "sasl.mechanism" = "PLAIN"
# "sasl.username" = "${KAFKA_USER}"
# "sasl.password" = "${KAFKA_PASSWORD}"

# For SSL/TLS, uncomment:
# "ssl.ca.location" = "/path/to/ca.pem"
# "ssl.certificate.location" = "/path/to/client.pem"
# "ssl.key.location" = "/path/to/client.key"

# Custom labels attached to every metric of this cluster.
[clusters.labels]
environment = "production"
# datacenter = "us-east-1"

# Example: Second cluster configuration
# [[clusters]]
# name = "staging"
# bootstrap_servers = "kafka-staging:9092"
# group_whitelist = ["^staging-.*"]
# group_blacklist = []
# topic_whitelist = [".*"]
# topic_blacklist = ["^__.*"]
#
# [clusters.consumer_properties]
#
# [clusters.labels]
# environment = "staging"