Skip to content

Align wva values.yaml in llm-d and wva #757

@shuynh2017

Description

@shuynh2017

Here are the values. We can compare them and recommend alignment changes as needed:

llm-d/guides/workload-autoscaling/helmfile.yaml.gotmpl

- name: workload-variant-autoscaler
    namespace: {{ if $isWvaOnly }}{{ env "WVA_NAMESPACE" | default "llm-d-autoscaler" }}{{ else }}{{ $ns }}{{ end }}
    chart: oci://ghcr.io/llm-d-incubation/workload-variant-autoscaler/workload-variant-autoscaler
    version: "~0.5.0"
    installed: true
    {{- if not $isWvaOnly }}
    needs:
      - {{ printf "ms-%s" $rn | quote }}
    {{- end }}
    labels:
      kind: autoscaling
    values:
      - workload-autoscaling/values.yaml
      - wva:
          prometheus:
            tls:
              insecureSkipVerify: true
              caCertPath: ""
            caCert: |
{{ readFile (printf "%s/prometheus-ca.crt" $certdir) | indent 14 }}
    set:
      - name: va.accelerator
        value: "L40S"
      {{- if not $isWvaOnly }}
      - name: llmd.modelID
        value: "Qwen/Qwen3-0.6B"
      {{- end }}
      - name: vllmService.enabled
        value: true
      - name: vllmService.nodePort
        value: 30000
      - name: llmd.namespace
        value: {{ if $isWvaOnly }}{{ $llmdNamespaceEnv | quote }}{{ else }}{{ $ns | quote }}{{ end }}
      - name: llmd.modelName
        value: {{ if $isWvaOnly }}{{ printf "ms-%s-llm-d-modelservice" $llmdReleaseName | quote }}{{ else }}{{ printf "ms-%s-llm-d-modelservice" $rn | quote }}{{ end }}
      - name: modelProfile
        value: "default"
      - name: sloClassRef.name
        value: "service-classes-config"

llm-d/guides/workload-autoscaling/values.yaml

cat workload-autoscaling/values.yaml
wva:
  enabled: true
  imagePullPolicy: Always
  metrics:
    enabled: true
    port: 8443
    secure: true

  reconcileInterval: 60s
  prometheus:
    monitoringNamespace: llm-d-monitoring  # Namespace for Prometheus monitoring
    serviceAccountName: "kube-prometheus-stack-prometheus"
    baseURL: "https://llmd-kube-prometheus-stack-prometheus.llm-d-monitoring.svc.cluster.local:9090"
    # Development security configuration (relaxed for easier development)
    tls:
      insecureSkipVerify: true   # Development: true, Production: false
      caCertPath: ""  # Empty string to disable CA cert when using insecureSkipVerify
      # caCert: |  # Uncomment and provide your CA certificate
      #   -----BEGIN CERTIFICATE-----
      #   YOUR_CA_CERTIFICATE_HERE
      #   -----END CERTIFICATE-----

  # Environment variable to enable experimental hybrid-based optimization
  #  When "on", runs both capacity analyzer and model-based optimizer with arbitration
  #  When "model-only" runs model-based optimizer only
  #  When "off" or unset, runs capacity analyzer only (default, reactive mode)
  experimentalHybridOptimization: false  # Enable experimental hybrid optimization (default: false)
  scaleToZero: false  # Enable scaling variants to zero replicas (default: false)

llmd:
  # Namespace where the llm-d inference-scheduling stack is deployed
  # For wva-only mode: Set this to your existing inference-scheduling namespace (default auto-detected from LLMD_NAMESPACE env var)
  # For full installation: This will be set automatically to match the deployment namespace
  namespace: llm-d-autoscaler
  # Model service name (Service name of the vLLM inference pods)
  # For wva-only mode: Auto-detected as ms-{LLMD_RELEASE_NAME_POSTFIX}-llm-d-modelservice, but can be explicitly set here
  # For full installation: Auto-generated from release name postfix, but can be overridden here
  modelName: ms-workload-autoscaler-llm-d-modelservice
  # Model ID must match the model configured in your inference-scheduling deployment
  modelID: "Qwen/Qwen3-0.6B"

va:
  enabled: true
  accelerator: H100
  sloTpot: 10
  sloTtft: 1000

hpa:
  enabled: true
  maxReplicas: 10
  targetAverageValue: "1"

vllmService:
  enabled: false
  nodePort: 30000
  interval: 15s
  scheme: http  # vLLM emulator runs on HTTP

workload-variant-autoscaler/charts/workload-variant-autoscaler/values.yaml

# Controller deployment settings
# Set controller.enabled=false to deploy only VA/HPA/ServiceMonitor resources
# without deploying another controller instance (use existing cluster-wide controller)
controller:
  enabled: true

wva:
  enabled: true

  image:
    repository: ghcr.io/llm-d/llm-d-workload-variant-autoscaler
    tag: v0.5.0
  imagePullPolicy: Always

  metrics:
    enabled: true
    port: 8443
    secure: true

  # If true, the controller will only watch the namespace it is deployed in.
  # If false, the controller will watch all namespaces (cluster-scoped).
  namespaceScoped: true

  reconcileInterval: 60s

  # ConfigMap settings
  configMap:
    # If true, makes the ConfigMap immutable (cannot be updated after creation).
    # This provides security benefits by preventing accidental or malicious changes
    # to configuration, but disables runtime config updates (dynamic config changes
    # will require recreating the ConfigMap and restarting the controller).
    # Default: false (allows runtime updates for dynamic configuration)
    immutable: false

  prometheus:
    monitoringNamespace: openshift-user-workload-monitoring
    serviceAccountName: "kube-prometheus-stack-prometheus"
    baseURL: "https://thanos-querier.openshift-monitoring.svc.cluster.local:9091"
    # Development security configuration (relaxed for easier development)
    tls:
      insecureSkipVerify: true   # Development: true, Production: false
      caCertPath: "/etc/ssl/certs/prometheus-ca.crt"
    # caCert: |  # Uncomment and provide your CA certificate
    #   -----BEGIN CERTIFICATE-----
    #   YOUR_CA_CERTIFICATE_HERE
    #   -----END CERTIFICATE-----

  limitedMode: false  # Enable limited mode (default: false)
  # Node selector for sharding WVA instances
  # Example: "wva.llmd.ai/shard=instance-a"
  nodeSelector: ""
  scaleToZero: false  # Enable scaling variants to zero replicas (default: false)
  # Controller instance identifier for multi-controller isolation
  # When set, adds controller_instance label to all emitted metrics
  # Used with HPA selector to filter metrics from specific controller instances
  # Useful for parallel e2e tests where multiple WVA controllers run simultaneously
  controllerInstance: ""

  # Saturation-based scaling configuration
  # These thresholds determine when replicas are saturated and when to scale up
  capacityScaling:
    # Global defaults applied to all variants unless overridden
    default:
      kvCacheThreshold: 0.80      # Replica saturated if KV cache utilization >= threshold (0.0-1.0)
      queueLengthThreshold: 5     # Replica saturated if queue length >= threshold
      kvSpareTrigger: 0.1         # Scale-up if avg spare KV capacity < trigger (0.0-1.0)
      queueSpareTrigger: 3        # Scale-up if avg spare queue capacity < trigger

    # Per-model/namespace overrides (optional)
    # Example:
    # overrides:
    #   llm-d:
    #     modelID: "Qwen/Qwen3-0.6B"
    #     namespace: "llm-d-autoscaler"
    #     kvCacheThreshold: 0.70
    #     kvSpareTrigger: 0.35
    overrides: {}

llmd:
  namespace: llm-d-autoscaler
  modelName: ms-workload-autoscaler-llm-d-modelservice
  modelID: "Qwen/Qwen3-0.6B"

va:
  enabled: true
  accelerator: H100
  # Cost per replica in arbitrary units (higher = more expensive to scale)
  # Used by saturation analysis to weight scaling decisions across variants
  # Example: H100=10.0, A100=8.0, L40S=5.0 (relative GPU costs)
  variantCost: "10.0"
  sloTpot: 10
  sloTtft: 1000

hpa:
  enabled: true
  # minReplicas: 0 enables scale-to-zero (requires HPAScaleToZero feature gate in k8s)
  # minReplicas: 1 is the safe default that prevents scale-to-zero
  # Set to 0 when wva.scaleToZero is enabled
  minReplicas: 1
  maxReplicas: 10
  targetAverageValue: "1"
  # HPA scaling behavior configuration
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 240
      selectPolicy: Max
      policies:
        - type: Pods
          value: 10
          periodSeconds: 150
    scaleDown:
      stabilizationWindowSeconds: 240
      selectPolicy: Max
      policies:
        - type: Pods
          value: 10
          periodSeconds: 150

vllmService:
  enabled: true
  port: 8200
  targetPort: 8200
  nodePort: 30000
  interval: 15s
  scheme: http  # vLLM emulator runs on HTTP

Metadata

Metadata

Assignees

No one assigned

    Labels

    needs-triage — Indicates an issue or PR lacks a triage label and requires one.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions