We have deployed Grafana Mimir using the Helm chart. When scaling the Mimir ingesters down from 15 to 12 replicas, the read path fails during the last step of the scale-down (from 13 to 12 replicas). The scale-down is handled by the rollout-operator; we deploy the Helm chart with Argo CD and are currently on chart version 6.0.5. The errors we see are of the form:

```
partition x: too many unhealthy instances in the ring
```

Is there a procedure we need to follow to scale down the ingesters?
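For context, this is roughly how we inspect the ring state while the scale-down is in progress. This is only a sketch: the service name, namespace, and port assume the defaults of our mimir-distributed deployment, and the partition ring page path is an assumption on our part for the ingest-storage setup.

```bash
# Rough sketch: service/namespace/port assume our mimir-distributed defaults.
kubectl -n mimir-ingest port-forward svc/mimir-distributor 8080:8080 &
sleep 2

# Classic ingester ring page: lists each ingester with its state
# (ACTIVE / LEAVING / UNHEALTHY) and last heartbeat.
curl -s http://localhost:8080/ingester/ring | grep -iE 'unhealthy|leaving' || true

# With ingest_storage enabled there is also a partition ring; the exact path
# below is an assumption, check the admin page index at / if it differs.
curl -s http://localhost:8080/ingester/partition-ring | grep -iE 'inactive|unhealthy' || true
```

The full Mimir configuration we deploy is below.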
```yaml
config:
  activity_tracker:
    filepath: /active-query-tracker/activity.log
  alertmanager:
    data_dir: /data
    enable_api: true
    external_url: /alertmanager
    fallback_config_file: /configs/alertmanager_fallback_config.yaml
  alertmanager_storage:
    s3:
      bucket_name: <bucket-name>
  blocks_storage:
    backend: s3
    bucket_store:
      chunks_cache:
        backend: memcached
        memcached:
          addresses: dnssrvnoa+mimir-chunks-cache.mimir-ingest.svc.cluster.local.:11211
          max_idle_connections: 150
          max_item_size: 1048576
          timeout: 750ms
      index_cache:
        backend: memcached
        memcached:
          addresses: dnssrvnoa+mimir-index-cache.mimir-ingest.svc.cluster.local.:11211
          max_idle_connections: 150
          max_item_size: 5242880
          timeout: 750ms
      metadata_cache:
        backend: memcached
        memcached:
          addresses: dnssrvnoa+mimir-metadata-cache.mimir-ingest.svc.cluster.local.:11211
          max_idle_connections: 150
          max_item_size: 1048576
      sync_dir: /data/tsdb-sync
    s3:
      bucket_name: <bucket-name>
    tsdb:
      dir: /data/tsdb
      flush_blocks_on_shutdown: true
      head_compaction_interval: 15m
      wal_replay_concurrency: 3
  common:
    storage:
      backend: s3
      s3:
        access_key_id: ${S3_USER}
        endpoint: <s3-endpoint>
        http:
          insecure_skip_verify: true
        secret_access_key: ${S3_PASSWORD}
  compactor:
    compaction_interval: 30m
    data_dir: /data
    deletion_delay: 2h
    first_level_compaction_wait_period: 25m
    max_closing_blocks_concurrency: 2
    max_opening_blocks_concurrency: 4
    sharding_ring:
      heartbeat_period: 1m
      heartbeat_timeout: 4m
      wait_stability_min_duration: 1m
    symbols_flushers_concurrency: 4
  distributor:
    remote_timeout: 5s
    ring:
      heartbeat_period: 1m
      heartbeat_timeout: 4m
  frontend:
    cache_results: true
    parallelize_shardable_queries: true
    query_sharding_target_series_per_shard: 2500
    results_cache:
      backend: memcached
      memcached:
        addresses: dnssrvnoa+mimir-results-cache.mimir-ingest.svc.cluster.local.:11211
        max_item_size: 5242880
        timeout: 500ms
    scheduler_address: mimir-query-scheduler-headless.mimir-ingest.svc:9095
  frontend_worker:
    grpc_client_config:
      max_send_msg_size: 419430400
    scheduler_address: mimir-query-scheduler-headless.mimir-ingest.svc:9095
  ingest_storage:
    enabled: true
    kafka:
      address: _tcp-clients._tcp.lgtm-kafka-cluster-kafka-bootstrap.kafka.svc.cluster.local
      auto_create_topic_default_partitions: 48
      auto_create_topic_enabled: true
      client_id: mimir
      consumer_group: observability-stack-mimir-ingest
      topic: observability-stack-mimir-ingest
  ingester:
    push_grpc_method_enabled: false
    ring:
      final_sleep: 0s
      heartbeat_period: 2m
      heartbeat_timeout: 10m
      num_tokens: 512
      tokens_file_path: /data/tokens
      unregister_on_shutdown: false
      zone_awareness_enabled: true
  ingester_client:
    grpc_client_config:
      max_recv_msg_size: 104857600
      max_send_msg_size: 104857600
  limits:
    compactor_blocks_retention_period: 1y
    ingestion_burst_size: 10000000
    ingestion_rate: 500000
    max_cache_freshness: 10m
    max_global_exemplars_per_user: 10000000
    max_global_series_per_user: 10000000
    max_query_parallelism: 240
    max_total_query_length: 12000h
    out_of_order_time_window: 5m
    ruler_max_rules_per_rule_group: 50
  memberlist:
    abort_if_cluster_join_fails: false
    bind_addr:
      - ${MY_POD_IP}
    compression_enabled: false
    join_members:
      - dns+mimir-gossip-ring.mimir-ingest.svc.cluster.local.:7946
  querier:
    max_concurrent: 16
  query_scheduler:
    max_outstanding_requests_per_tenant: 800
  ruler:
    alertmanager_url: dnssrvnoa+http://_http-metrics._tcp.mimir-alertmanager-headless.mimir-ingest.svc.cluster.local./alertmanager
    enable_api: true
    query_frontend:
      address: mimir-query-frontend.mimir-ingest:9095
    rule_path: /data
  ruler_storage:
    cache:
      backend: memcached
      memcached:
        addresses: dnssrvnoa+mimir-metadata-cache.mimir-ingest.svc.cluster.local.:11211
        max_item_size: 1048576
        timeout: 500ms
    s3:
      bucket_name: <bucket-name>
  runtime_config:
    file: /var/mimir/runtime.yaml
  server:
    log_format: json
    log_level: warn
  store_gateway:
    sharding_ring:
      heartbeat_period: 1m
      heartbeat_timeout: 10m
      kvstore:
        prefix: multi-zone/
      tokens_file_path: /data/tokens
      unregister_on_shutdown: false
      wait_stability_min_duration: 1m
      zone_awareness_enabled: true
  tenant_federation:
    enabled: true
  usage_stats:
    enabled: false
    installation_mode: helm
```
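For completeness, the scale-down itself is only a replica change in the Helm values that Argo CD syncs; with zone awareness enabled the chart spreads the total across the three zones. A minimal sketch, assuming the mimir-distributed values layout:

```yaml
# values.yaml fragment synced by Argo CD (assumes the mimir-distributed chart).
# 15 -> 12 total ingesters, i.e. 5 -> 4 per zone with three zones.
ingester:
  replicas: 12
```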