-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.yaml
More file actions
141 lines (130 loc) · 6.19 KB
/
Copy pathconfig.yaml
File metadata and controls
141 lines (130 loc) · 6.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# metrics-governor configuration
# https://github.com/szibis/metrics-governor
#
# All settings are optional - defaults are applied when not specified.
# CLI flags can override any setting in this file.
# Receiver configuration - how metrics are received
# NOTE(review): nesting was reconstructed from a copy that had lost its
# indentation. Confirm that `server` belongs under `http` and that `tls` and
# `auth` sit directly under `receiver`.
receiver:
  grpc:
    address: ":4317"
  http:
    address: ":4318"
    server:
      max_request_body_size: 0  # bytes, 0 = no limit
      read_timeout: 0s  # 0 = no timeout
      read_header_timeout: 1m
      write_timeout: 30s
      idle_timeout: 1m
      keep_alives_enabled: true
  # TLS for receiver (server-side)
  tls:
    enabled: false
    cert_file: "/etc/tls/server.crt"
    key_file: "/etc/tls/server.key"
    ca_file: "/etc/tls/ca.crt"  # for mTLS client verification
    client_auth: false  # require client certificates
  # Authentication for incoming requests
  auth:
    enabled: false
    bearer_token: ""  # expected token from clients
    basic_username: ""
    basic_password: ""
# Exporter configuration - how metrics are forwarded
# NOTE(review): nesting was reconstructed from a copy that had lost its
# indentation. Confirm that `headers` belongs under `auth`.
exporter:
  endpoint: "localhost:4317"
  protocol: "grpc"  # grpc or http
  insecure: true  # true = connect without TLS; set to false to use TLS
  timeout: 30s
  # TLS for exporter (client-side)
  tls:
    enabled: false
    cert_file: ""  # client cert for mTLS
    key_file: ""  # client key for mTLS
    ca_file: ""  # custom CA for server verification
    skip_verify: false  # skip server certificate verification
    server_name: ""  # override SNI
  # Authentication when forwarding
  auth:
    bearer_token: ""
    basic_username: ""
    basic_password: ""
    headers: {}  # custom headers, e.g. X-Custom-Header: value
  # Compression for HTTP protocol
  compression:
    type: "none"  # none, gzip, zstd, snappy, zlib, deflate
    level: 0  # 0 = default, gzip: 1-9, zstd: 1,3,6,11
  # HTTP client connection pool settings
  http_client:
    max_idle_conns: 100
    max_idle_conns_per_host: 100
    max_conns_per_host: 0  # 0 = no limit
    idle_conn_timeout: 90s
    disable_keep_alives: false
    force_http2: false
    http2_read_idle_timeout: 0s
    http2_ping_timeout: 0s
#############################################################################
# Queue Configuration - Persistent Retry Queue with Resilience Features
#############################################################################
# The queue provides durability for export failures, ensuring metrics are
# not lost during backend outages. It includes a circuit breaker and
# exponential backoff to prevent resource exhaustion during prolonged failures.
queue:
  # Core queue settings
  enabled: false  # enable persistent queue for retries
  path: "./queue"  # queue storage directory
  max_size: 10000  # max batches in queue
  max_bytes: 1073741824  # max total size (1GB)
  retry_interval: 5s  # initial retry interval
  max_retry_delay: 5m  # max backoff delay
  full_behavior: drop_oldest  # drop_oldest, drop_newest, or block
  target_utilization: 0.85  # target disk utilization (0.0-1.0)
  adaptive_enabled: true  # enable adaptive sizing based on disk space

  # Storage settings (advanced tuning)
  inmemory_blocks: 2048  # in-memory channel size before spilling to disk
  chunk_size: 536870912  # chunk file size (512MB)
  meta_sync_interval: 1s  # metadata sync interval (max data loss window)
  stale_flush_interval: 30s  # flush stale in-memory data to disk
  compression: "snappy"  # queue block compression: none, snappy
  write_buffer_size: 262144  # buffered writer size (256KB)

  # Exponential backoff - increases delay between retries on repeated failures.
  # Prevents overwhelming a struggling backend with rapid retry attempts.
  backoff:
    enabled: true  # enable exponential backoff for retries
    # Multiply the delay by this factor on each failure,
    # e.g. 5s -> 10s -> 20s -> 40s (capped at max_retry_delay).
    multiplier: 2.0

  # Circuit breaker - stops retries after consecutive failures.
  # Prevents wasting resources on a completely unavailable backend.
  # States: closed (normal) -> open (blocking) -> half-open (testing recovery)
  circuit_breaker:
    enabled: true  # enable circuit breaker pattern
    threshold: 10  # consecutive failures before opening circuit
    reset_timeout: 30s  # time before testing recovery (half-open state)
# Buffer configuration - internal metrics buffering
buffer:
  size: 10000  # max metrics to buffer
  batch_size: 5000  # max metrics per export batch
  flush_interval: 5s  # how often to flush
# Stats endpoint configuration
stats:
  address: ":9090"
  # Labels for per-label-value stats
  labels:
    - service
    - env
# Limits configuration (dry_run only - rules go in separate limits.yaml)
limits:
  dry_run: true  # log violations but don't drop
# Performance tuning
# Techniques inspired by VictoriaMetrics blog articles (https://valyala.medium.com/)
# Original implementations using standard Go patterns
performance:
  export_concurrency: 0  # max concurrent export goroutines (0 = NumCPU * 4)
  string_interning: true  # deduplicate label strings to reduce allocations
  intern_max_value_length: 64  # max length for interned label values
# Memory limit configuration
# Auto-detects container memory limits (Docker/K8s via cgroups) and sets GOMEMLIMIT.
# This helps prevent OOM kills by making Go GC more aggressive as memory approaches the limit.
memory:
  # Ratio of container memory used for GOMEMLIMIT (0.0-1.0).
  # 0.9 = 90% of container limit, leaves 10% headroom.
  # Set to 0 to disable auto-detection.
  limit_ratio: 0.9