
Read path fails during ingester scale down using rollout operator #13802

@jensloe-nhn

Description

We have deployed Grafana Mimir using the Helm chart (currently chart version 6.0.5), which we deploy via Argo CD. When scaling the Mimir ingesters down from 15 to 12 replicas, the read path fails during the last step of the scale down (from 13 to 12). The scale down is handled by the rollout operator. The errors we see are of the form:

partition x: too many unhealthy instances in the ring

Is there some procedure we need to follow to scale down the ingesters?
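For reference, our understanding is that the rollout operator is supposed to drive the downscale via the prepare-downscale annotations on the ingester zone StatefulSets, roughly along these lines. This is only a sketch based on our reading of the rollout-operator docs; the exact prepare path for ingest-storage mode is our assumption, and none of it is copied from our cluster:

# metadata.annotations on an ingester zone StatefulSet, e.g. mimir-ingester-zone-a
# (sketch, assumed; not taken from our running deployment)
grafana.com/prepare-downscale: "true"
# Endpoint the operator should call on each pod before removing it; with
# ingest_storage enabled we assume it is the partition-downscale variant.
grafana.com/prepare-downscale-http-path: ingester/prepare-partition-downscale
grafana.com/prepare-downscale-http-port: "8080"
# Minimum wait between downscaling different zones.
grafana.com/min-time-between-zones-downscale: 12h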

Our Mimir config:

activity_tracker:
  filepath: /active-query-tracker/activity.log
alertmanager:
  data_dir: /data
  enable_api: true
  external_url: /alertmanager
  fallback_config_file: /configs/alertmanager_fallback_config.yaml
alertmanager_storage:
  s3:
    bucket_name: <bucket-name>
blocks_storage:
  backend: s3
  bucket_store:
    chunks_cache:
      backend: memcached
      memcached:
        addresses: dnssrvnoa+mimir-chunks-cache.mimir-ingest.svc.cluster.local.:11211
        max_idle_connections: 150
        max_item_size: 1048576
        timeout: 750ms
    index_cache:
      backend: memcached
      memcached:
        addresses: dnssrvnoa+mimir-index-cache.mimir-ingest.svc.cluster.local.:11211
        max_idle_connections: 150
        max_item_size: 5242880
        timeout: 750ms
    metadata_cache:
      backend: memcached
      memcached:
        addresses: dnssrvnoa+mimir-metadata-cache.mimir-ingest.svc.cluster.local.:11211
        max_idle_connections: 150
        max_item_size: 1048576
    sync_dir: /data/tsdb-sync
  s3:
    bucket_name: <bucket-name>
  tsdb:
    dir: /data/tsdb
    flush_blocks_on_shutdown: true
    head_compaction_interval: 15m
    wal_replay_concurrency: 3
common:
  storage:
    backend: s3
    s3:
      access_key_id: ${S3_USER}
      endpoint: <s3-endpoint>
      http:
        insecure_skip_verify: true
      secret_access_key: ${S3_PASSWORD}
compactor:
  compaction_interval: 30m
  data_dir: /data
  deletion_delay: 2h
  first_level_compaction_wait_period: 25m
  max_closing_blocks_concurrency: 2
  max_opening_blocks_concurrency: 4
  sharding_ring:
    heartbeat_period: 1m
    heartbeat_timeout: 4m
    wait_stability_min_duration: 1m
  symbols_flushers_concurrency: 4
distributor:
  remote_timeout: 5s
  ring:
    heartbeat_period: 1m
    heartbeat_timeout: 4m
frontend:
  cache_results: true
  parallelize_shardable_queries: true
  query_sharding_target_series_per_shard: 2500
  results_cache:
    backend: memcached
    memcached:
      addresses: dnssrvnoa+mimir-results-cache.mimir-ingest.svc.cluster.local.:11211
      max_item_size: 5242880
      timeout: 500ms
  scheduler_address: mimir-query-scheduler-headless.mimir-ingest.svc:9095
frontend_worker:
  grpc_client_config:
    max_send_msg_size: 419430400
  scheduler_address: mimir-query-scheduler-headless.mimir-ingest.svc:9095
ingest_storage:
  enabled: true
  kafka:
    address: _tcp-clients._tcp.lgtm-kafka-cluster-kafka-bootstrap.kafka.svc.cluster.local
    auto_create_topic_default_partitions: 48
    auto_create_topic_enabled: true
    client_id: mimir
    consumer_group: observability-stack-mimir-ingest
    topic: observability-stack-mimir-ingest
ingester:
  push_grpc_method_enabled: false
  ring:
    final_sleep: 0s
    heartbeat_period: 2m
    heartbeat_timeout: 10m
    num_tokens: 512
    tokens_file_path: /data/tokens
    unregister_on_shutdown: false
    zone_awareness_enabled: true
ingester_client:
  grpc_client_config:
    max_recv_msg_size: 104857600
    max_send_msg_size: 104857600
limits:
  compactor_blocks_retention_period: 1y
  ingestion_burst_size: 10000000
  ingestion_rate: 500000
  max_cache_freshness: 10m
  max_global_exemplars_per_user: 10000000
  max_global_series_per_user: 10000000
  max_query_parallelism: 240
  max_total_query_length: 12000h
  out_of_order_time_window: 5m
  ruler_max_rules_per_rule_group: 50
memberlist:
  abort_if_cluster_join_fails: false
  bind_addr:
    - ${MY_POD_IP}
  compression_enabled: false
  join_members:
    - dns+mimir-gossip-ring.mimir-ingest.svc.cluster.local.:7946
querier:
  max_concurrent: 16
query_scheduler:
  max_outstanding_requests_per_tenant: 800
ruler:
  alertmanager_url: dnssrvnoa+http://_http-metrics._tcp.mimir-alertmanager-headless.mimir-ingest.svc.cluster.local./alertmanager
  enable_api: true
  query_frontend:
    address: mimir-query-frontend.mimir-ingest:9095
  rule_path: /data
ruler_storage:
  cache:
    backend: memcached
    memcached:
      addresses: dnssrvnoa+mimir-metadata-cache.mimir-ingest.svc.cluster.local.:11211
      max_item_size: 1048576
      timeout: 500ms
  s3:
    bucket_name: <bucket-name>
runtime_config:
  file: /var/mimir/runtime.yaml
server:
  log_format: json
  log_level: warn
store_gateway:
  sharding_ring:
    heartbeat_period: 1m
    heartbeat_timeout: 10m
    kvstore:
      prefix: multi-zone/
    tokens_file_path: /data/tokens
    unregister_on_shutdown: false
    wait_stability_min_duration: 1m
    zone_awareness_enabled: true
tenant_federation:
  enabled: true
usage_stats:
  enabled: false
  installation_mode: helm
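
For completeness, the scale-down itself was triggered by a replica-count change in our chart values, roughly along these lines (key names written from memory of the mimir-distributed chart, not copied from our actual values file):

ingester:
  # 3 zones with zone-aware replication, so 15 -> 12 means 5 -> 4 pods per zone
  replicas: 12
  zoneAwareReplication:
    enabled: true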
