workload-variant-autoscaler/config/samples/model-scale-to-zero-config.yaml at main · ev-shindin/workload-variant-autoscaler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# ConfigMap for per-model scale-to-zero configuration
#
# This ConfigMap follows the same format as saturation-scaling-config:
# - "default" entry: Global default settings applied to all models
# - Override entries: Per-model custom configuration (must include model_id)
#
# Configuration fields:
#   - model_id (string): Model identifier (required for override entries)
#   - namespace (string): Namespace for this override (optional)
#   - enable_scale_to_zero (boolean): Enables scale-to-zero for this model.
#                                     Optional in override entries; when omitted,
#                                     the value is inherited from "default".
#   - retention_period (string): Duration after last request before scaling to zero
#                                 (e.g., "5m", "1h", "30s"). Optional, defaults to 10 minutes.
#
# Configuration priority (highest to lowest):
#   1. Per-model configuration (specific model_id in override entry)
#   2. Global defaults in this ConfigMap (key: "default")
#   3. WVA_SCALE_TO_ZERO environment variable
#   4. System default (disabled, 10-minute retention)

# Sample ConfigMap holding scale-to-zero settings: one active "default" entry
# plus three per-model override examples that are shipped commented out.
apiVersion: v1
kind: ConfigMap
metadata:
  name: wva-model-scale-to-zero-config
  namespace: workload-variant-autoscaler-system
data:
  # Each value under `data` is a literal block scalar (`|`): a multi-line
  # string that itself contains a small YAML snippet. Anything indented under
  # an entry key becomes part of that string, so keep comments at this level.

  # Global defaults applied to all models unless overridden
  # In this sample: scale-to-zero ENABLED with a 15-minute retention period.
  default: |
    enable_scale_to_zero: true
    retention_period: "15m"

  # The examples below are inactive. To use one, uncomment its lines so that
  # the entry key sits at the same indentation as `default`.
  # NOTE(review): the entry key (e.g. llama-8b-override) looks like a free-form
  # label, with the model selected by model_id -- confirm against the controller.

  # Example per-model override for llama model
  # Only override retention_period, inherit enable_scale_to_zero from defaults
  # Result: scale-to-zero ENABLED (from defaults) with 5-minute retention
  # llama-8b-override: |
  #   model_id: meta/llama-3.1-8b
  #   retention_period: "5m"

  # Example per-model override to DISABLE scale-to-zero
  # Result: minimum 1 replica maintained at all times
  # llama-70b-override: |
  #   model_id: meta/llama-3.1-70b
  #   enable_scale_to_zero: false

  # Example per-model override with namespace
  # Sets both fields explicitly, so nothing is inherited from "default".
  # NOTE(review): presumably applies only to this model in the "production"
  # namespace -- confirm how namespace-scoped overrides are matched.
  # llama-production: |
  #   model_id: meta/llama-3.1-8b
  #   namespace: production
  #   enable_scale_to_zero: true
  #   retention_period: "30m"

# Note: Models not explicitly listed inherit their settings from "default".
# If "default" is not specified, models fall back to the WVA_SCALE_TO_ZERO
# environment variable, and then to the system default (disabled, 10-minute
# retention).