forked from llm-d/llm-d-workload-variant-autoscaler
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel-scale-to-zero-config.yaml
More file actions
53 lines (48 loc) · 2.08 KB
/
model-scale-to-zero-config.yaml
File metadata and controls
53 lines (48 loc) · 2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# ConfigMap for per-model scale-to-zero configuration
#
# This ConfigMap follows the same format as saturation-scaling-config:
# - 'default' entry: Global default thresholds applied to all models
# - Override entries: Per-model custom configuration (must include model_id)
#
# Configuration fields:
# - model_id (string): Model identifier (required for override entries)
# - namespace (string): Namespace for this override (optional)
# - enable_scale_to_zero (boolean): Enables scale-to-zero for this model
# - retention_period (string): Duration after last request before scaling to zero
# (e.g., "5m", "1h", "30s"). Optional, defaults to 10 minutes.
#
# Configuration priority (highest to lowest):
# 1. Per-model configuration (specific model_id in override entry)
# 2. Global defaults in this ConfigMap (key: "default")
# 3. WVA_SCALE_TO_ZERO environment variable
# 4. System default (disabled, 10-minute retention)
apiVersion: v1
kind: ConfigMap
metadata:
  name: wva-model-scale-to-zero-config
  namespace: workload-variant-autoscaler-system
data:
  # Each key under 'data' holds one configuration entry. The value is written
  # as a literal block scalar (|), so the nested YAML is stored as one string;
  # its lines must stay indented deeper than the key that owns them.

  # Global defaults applied to all models unless overridden
  default: |
    enable_scale_to_zero: true
    retention_period: "15m"

  # The entries below are commented-out examples. To use one, remove the
  # leading "# " from its key line and from each indented line beneath it,
  # keeping the relative indentation intact.

  # Example per-model override for llama model
  # Only override retention_period, inherit enable_scale_to_zero from defaults
  # Result: scale-to-zero ENABLED (from defaults) with 5-minute retention
  # llama-8b-override: |
  #   model_id: meta/llama-3.1-8b
  #   retention_period: "5m"

  # Example per-model override to DISABLE scale-to-zero
  # Result: minimum 1 replica maintained at all times
  # llama-70b-override: |
  #   model_id: meta/llama-3.1-70b
  #   enable_scale_to_zero: false

  # Example per-model override with namespace
  # llama-production: |
  #   model_id: meta/llama-3.1-8b
  #   namespace: production
  #   enable_scale_to_zero: true
  #   retention_period: "30m"

  # Note: Models not explicitly listed will inherit settings from "default".
  # If "default" is not specified, models fall back to the WVA_SCALE_TO_ZERO
  # environment variable, then system default (disabled).