# Source: metrics-governor/examples/config.yaml (szibis/metrics-governor, main branch)
# metrics-governor configuration
# https://github.com/szibis/metrics-governor
#
# All settings are optional - defaults are applied when not specified.
# CLI flags can override any setting in this file.

# Receiver: how incoming metrics are accepted
receiver:
  grpc:
    address: ":4317"
  http:
    address: ":4318"
    server:
      # Request body cap in bytes; 0 removes the limit
      max_request_body_size: 0
      # 0 disables the read timeout
      read_timeout: 0s
      read_header_timeout: 1m
      write_timeout: 30s
      idle_timeout: 1m
      keep_alives_enabled: true
  # Server-side TLS for the receiver
  tls:
    enabled: false
    cert_file: "/etc/tls/server.crt"
    key_file: "/etc/tls/server.key"
    # CA used to verify clients when mTLS is in use
    ca_file: "/etc/tls/ca.crt"
    # Set to true to require client certificates
    client_auth: false
  # Authentication applied to incoming requests
  auth:
    enabled: false
    # Token that clients are expected to present
    bearer_token: ""
    basic_username: ""
    basic_password: ""

# Exporter configuration - where and how received metrics are forwarded.
# Covers the target endpoint, client-side TLS, outbound authentication,
# compression, the HTTP client pool, and the persistent retry queue.
exporter:
  endpoint: "localhost:4317"
  protocol: "grpc"                     # grpc or http
  insecure: true                       # true = connect without TLS (plaintext)
                                       # NOTE(review): interaction with tls.enabled
                                       # below is not shown here - confirm
  timeout: 30s
  # TLS for exporter (client-side)
  tls:
    enabled: false
    cert_file: ""                      # client cert for mTLS
    key_file: ""                       # client key for mTLS
    ca_file: ""                        # custom CA for server verification
    skip_verify: false                 # skip server certificate verification
    server_name: ""                    # override SNI
  # Authentication sent with forwarded requests
  auth:
    bearer_token: ""
    basic_username: ""
    basic_password: ""
    headers: {}                        # custom headers, e.g. X-Custom-Header: value
  # Compression for HTTP protocol
  compression:
    type: "none"                       # none, gzip, zstd, snappy, zlib, deflate
    level: 0                           # 0 = default; gzip: 1-9; zstd: 1, 3, 6, 11
  # HTTP client connection pool settings
  http_client:
    max_idle_conns: 100
    max_idle_conns_per_host: 100
    max_conns_per_host: 0              # 0 = no limit
    idle_conn_timeout: 90s
    disable_keep_alives: false
    force_http2: false
    http2_read_idle_timeout: 0s
    http2_ping_timeout: 0s

  #############################################################################
  # Queue Configuration - Persistent Retry Queue with Resilience Features
  #############################################################################
  # The queue provides durability for export failures, ensuring metrics are
  # not lost during backend outages. It includes a circuit breaker and
  # exponential backoff to prevent resource exhaustion during prolonged failures.
  queue:
    # Core queue settings
    enabled: false                     # enable persistent queue for retries
    path: "./queue"                    # queue storage directory
    max_size: 10000                    # max batches in queue
    max_bytes: 1073741824              # max total size (1 GiB)
    retry_interval: 5s                 # initial retry interval
    max_retry_delay: 5m                # max backoff delay
    full_behavior: drop_oldest         # drop_oldest, drop_newest, or block
    target_utilization: 0.85           # target disk utilization (0.0-1.0)
    adaptive_enabled: true             # enable adaptive sizing based on disk space

    # Storage settings (advanced tuning)
    inmemory_blocks: 2048              # in-memory channel size before spilling to disk
    chunk_size: 536870912              # chunk file size (512 MiB)
    meta_sync_interval: 1s             # metadata sync interval (max data loss window)
    stale_flush_interval: 30s          # flush stale in-memory data to disk
    compression: "snappy"              # queue block compression: none, snappy
    write_buffer_size: 262144          # buffered writer size (256 KiB)

    # Exponential backoff - increases delay between retries on repeated failures.
    # Prevents overwhelming a struggling backend with rapid retry attempts.
    backoff:
      enabled: true                    # enable exponential backoff for retries
      multiplier: 2.0                  # multiply delay by this on each failure
                                       # e.g., with retry_interval 5s:
                                       # 5s -> 10s -> 20s -> 40s (capped at max_retry_delay)

    # Circuit breaker - stops retries after consecutive failures.
    # Prevents wasting resources on a completely unavailable backend.
    # States: closed (normal) -> open (blocking) -> half-open (testing recovery)
    circuit_breaker:
      enabled: true                    # enable circuit breaker pattern
      threshold: 10                    # consecutive failures before opening circuit
      reset_timeout: 30s               # time before testing recovery (half-open state)

# Buffer: internal buffering of metrics
buffer:
  # Upper bound on the number of buffered metrics
  size: 10000
  # Upper bound on metrics per export batch
  batch_size: 5000
  # How often the buffer is flushed
  flush_interval: 5s

# Stats endpoint settings
stats:
  address: ":9090"
  # Labels used for per-label-value stats
  labels:
    - service
    - env

# Limits: only dry_run is set here; the rules go in a separate limits.yaml
limits:
  # When true, violations are logged but nothing is dropped
  dry_run: true

# Performance tuning.
# Techniques are inspired by VictoriaMetrics blog articles
# (https://valyala.medium.com/); the implementations are original and use
# standard Go patterns.
performance:
  # Cap on concurrent export goroutines; 0 selects NumCPU * 4
  export_concurrency: 0
  # Deduplicate label strings to reduce allocations
  string_interning: true
  # Maximum length for interned label values
  intern_max_value_length: 64

# Memory limit.
# Container memory limits (Docker/K8s, via cgroups) are auto-detected and used
# to set GOMEMLIMIT. This makes the Go GC more aggressive as memory approaches
# the limit, which helps prevent OOM kills.
memory:
  # Ratio of container memory used for GOMEMLIMIT (0.0-1.0).
  # 0.9 means 90% of the container limit, leaving 10% headroom.
  # Set to 0 to disable auto-detection.
  limit_ratio: 0.9