Description
Here are the current values from all three files, so we can compare them and recommend alignment where needed:
llm-d/guides/workload-autoscaling/helmfile.yaml.gotmpl
- name: workload-variant-autoscaler
  namespace: {{ if $isWvaOnly }}{{ env "WVA_NAMESPACE" | default "llm-d-autoscaler" }}{{ else }}{{ $ns }}{{ end }}
  chart: oci://ghcr.io/llm-d-incubation/workload-variant-autoscaler/workload-variant-autoscaler
  version: "~0.5.0"
  installed: true
  {{- if not $isWvaOnly }}
  needs:
    - {{ printf "ms-%s" $rn | quote }}
  {{- end }}
  labels:
    kind: autoscaling
  values:
    - workload-autoscaling/values.yaml
    - wva:
        prometheus:
          tls:
            insecureSkipVerify: true
            caCertPath: ""
            caCert: |
{{ readFile (printf "%s/prometheus-ca.crt" $certdir) | indent 14 }}
  set:
    - name: va.accelerator
      value: "L40S"
    {{- if not $isWvaOnly }}
    - name: llmd.modelID
      value: "Qwen/Qwen3-0.6B"
    {{- end }}
    - name: vllmService.enabled
      value: true
    - name: vllmService.nodePort
      value: 30000
    - name: llmd.namespace
      value: {{ if $isWvaOnly }}{{ $llmdNamespaceEnv | quote }}{{ else }}{{ $ns | quote }}{{ end }}
    - name: llmd.modelName
      value: {{ if $isWvaOnly }}{{ printf "ms-%s-llm-d-modelservice" $llmdReleaseName | quote }}{{ else }}{{ printf "ms-%s-llm-d-modelservice" $rn | quote }}{{ end }}
    - name: modelProfile
      value: "default"
    - name: sloClassRef.name
      value: "service-classes-config"
llm-d/guides/workload-autoscaling/values.yaml
wva:
  enabled: true
  imagePullPolicy: Always
  metrics:
    enabled: true
    port: 8443
    secure: true
  reconcileInterval: 60s
  prometheus:
    monitoringNamespace: llm-d-monitoring # Namespace for Prometheus monitoring
    serviceAccountName: "kube-prometheus-stack-prometheus"
    baseURL: "https://llmd-kube-prometheus-stack-prometheus.llm-d-monitoring.svc.cluster.local:9090"
    # Development security configuration (relaxed for easier development)
    tls:
      insecureSkipVerify: true # Development: true, Production: false
      caCertPath: "" # Empty string to disable CA cert when using insecureSkipVerify
      # caCert: | # Uncomment and provide your CA certificate
      #   -----BEGIN CERTIFICATE-----
      #   YOUR_CA_CERTIFICATE_HERE
      #   -----END CERTIFICATE-----
  # Environment variable to enable experimental hybrid-based optimization
  # When "on", runs both capacity analyzer and model-based optimizer with arbitration
  # When "model-only", runs model-based optimizer only
  # When "off" or unset, runs capacity analyzer only (default, reactive mode)
  experimentalHybridOptimization: false # Enable experimental hybrid optimization (default: false)
  scaleToZero: false # Enable scaling variants to zero replicas (default: false)
llmd:
  # Namespace where the llm-d inference-scheduling stack is deployed
  # For wva-only mode: set this to your existing inference-scheduling namespace (default auto-detected from LLMD_NAMESPACE env var)
  # For full installation: set automatically to match the deployment namespace
  namespace: llm-d-autoscaler
  # Model service name (Service name of the vLLM inference pods)
  # For wva-only mode: auto-detected as ms-{LLMD_RELEASE_NAME_POSTFIX}-llm-d-modelservice, but can be explicitly set here
  # For full installation: auto-generated from the release name postfix, but can be overridden here
  modelName: ms-workload-autoscaler-llm-d-modelservice
  # Model ID must match the model configured in your inference-scheduling deployment
  modelID: "Qwen/Qwen3-0.6B"
va:
  enabled: true
  accelerator: H100
  sloTpot: 10
  sloTtft: 1000
hpa:
  enabled: true
  maxReplicas: 10
  targetAverageValue: "1"
vllmService:
  enabled: false
  nodePort: 30000
  interval: 15s
  scheme: http # vLLM emulator runs on HTTP
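For production, the TLS comments above suggest flipping insecureSkipVerify and supplying a CA. A minimal sketch of what that might look like (the certificate body is a placeholder, not real data):

wva:
  prometheus:
    tls:
      insecureSkipVerify: false # Production: verify the Prometheus endpoint
      caCertPath: ""            # keep empty when the CA is supplied inline
      caCert: |                 # placeholder; replace with your cluster's CA
        -----BEGIN CERTIFICATE-----
        YOUR_CA_CERTIFICATE_HERE
        -----END CERTIFICATE-----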
workload-variant-autoscaler/charts/workload-variant-autoscaler/values.yaml
# Controller deployment settings
# Set controller.enabled=false to deploy only VA/HPA/ServiceMonitor resources
# without deploying another controller instance (use existing cluster-wide controller)
controller:
  enabled: true
wva:
  enabled: true
  image:
    repository: ghcr.io/llm-d/llm-d-workload-variant-autoscaler
    tag: v0.5.0
  imagePullPolicy: Always
  metrics:
    enabled: true
    port: 8443
    secure: true
  # If true, the controller will only watch the namespace it is deployed in.
  # If false, the controller will watch all namespaces (cluster-scoped).
  namespaceScoped: true
  reconcileInterval: 60s
  # ConfigMap settings
  configMap:
    # If true, makes the ConfigMap immutable (cannot be updated after creation).
    # This provides security benefits by preventing accidental or malicious changes
    # to configuration, but disables runtime config updates (dynamic config changes
    # will require recreating the ConfigMap and restarting the controller).
    # Default: false (allows runtime updates for dynamic configuration)
    immutable: false
  prometheus:
    monitoringNamespace: openshift-user-workload-monitoring
    serviceAccountName: "kube-prometheus-stack-prometheus"
    baseURL: "https://thanos-querier.openshift-monitoring.svc.cluster.local:9091"
    # Development security configuration (relaxed for easier development)
    tls:
      insecureSkipVerify: true # Development: true, Production: false
      caCertPath: "/etc/ssl/certs/prometheus-ca.crt"
      # caCert: | # Uncomment and provide your CA certificate
      #   -----BEGIN CERTIFICATE-----
      #   YOUR_CA_CERTIFICATE_HERE
      #   -----END CERTIFICATE-----
  limitedMode: false # Enable limited mode (default: false)
  # Node selector for sharding WVA instances
  # Example: "wva.llmd.ai/shard=instance-a"
  nodeSelector: ""
  scaleToZero: false # Enable scaling variants to zero replicas (default: false)
  # Controller instance identifier for multi-controller isolation
  # When set, adds controller_instance label to all emitted metrics
  # Used with HPA selector to filter metrics from specific controller instances
  # Useful for parallel e2e tests where multiple WVA controllers run simultaneously
  controllerInstance: ""
  # Saturation-based scaling configuration
  # These thresholds determine when replicas are saturated and when to scale up
  capacityScaling:
    # Global defaults applied to all variants unless overridden
    default:
      kvCacheThreshold: 0.80 # Replica saturated if KV cache utilization >= threshold (0.0-1.0)
      queueLengthThreshold: 5 # Replica saturated if queue length >= threshold
      kvSpareTrigger: 0.1 # Scale-up if avg spare KV capacity < trigger (0.0-1.0)
      queueSpareTrigger: 3 # Scale-up if avg spare queue capacity < trigger
    # Per-model/namespace overrides (optional)
    # Example:
    # overrides:
    #   llm-d:
    #     modelID: "Qwen/Qwen3-0.6B"
    #     namespace: "llm-d-autoscaler"
    #     kvCacheThreshold: 0.70
    #     kvSpareTrigger: 0.35
    overrides: {}
llmd:
  namespace: llm-d-autoscaler
  modelName: ms-workload-autoscaler-llm-d-modelservice
  modelID: "Qwen/Qwen3-0.6B"
va:
  enabled: true
  accelerator: H100
  # Cost per replica in arbitrary units (higher = more expensive to scale)
  # Used by saturation analysis to weight scaling decisions across variants
  # Example: H100=10.0, A100=8.0, L40S=5.0 (relative GPU costs)
  variantCost: "10.0"
  sloTpot: 10
  sloTtft: 1000
hpa:
  enabled: true
  # minReplicas: 0 enables scale-to-zero (requires HPAScaleToZero feature gate in k8s)
  # minReplicas: 1 is the safe default that prevents scale-to-zero
  # Set to 0 when wva.scaleToZero is enabled
  minReplicas: 1
  maxReplicas: 10
  targetAverageValue: "1"
  # HPA scaling behavior configuration
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 240
      selectPolicy: Max
      policies:
        - type: Pods
          value: 10
          periodSeconds: 150
    scaleDown:
      stabilizationWindowSeconds: 240
      selectPolicy: Max
      policies:
        - type: Pods
          value: 10
          periodSeconds: 150
vllmService:
  enabled: true
  port: 8200
  targetPort: 8200
  nodePort: 30000
  interval: 15s
  scheme: http # vLLM emulator runs on HTTP
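To make the comparison concrete, the defaults that currently diverge between the guide and the chart appear to be:

- va.accelerator: H100 in both values.yaml files, but the guide helmfile overrides it to "L40S".
- wva.prometheus: llm-d-monitoring / kube-prometheus-stack on :9090 in the guide vs openshift-user-workload-monitoring / thanos-querier on :9091 in the chart.
- wva.prometheus.tls.caCertPath: "" in the guide vs "/etc/ssl/certs/prometheus-ca.crt" in the chart.
- vllmService.enabled: false in the guide values.yaml (flipped to true by the helmfile) vs true in the chart, which also sets port/targetPort 8200.
- Keys only in the chart: controller.enabled, wva.image, wva.namespaceScoped, wva.configMap, wva.limitedMode, wva.nodeSelector, wva.controllerInstance, wva.capacityScaling, va.variantCost, hpa.minReplicas, and hpa.behavior. The guide alone carries wva.experimentalHybridOptimization, and the helmfile alone sets modelProfile and sloClassRef.name.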