forked from llm-d/llm-d-benchmark
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path
More file actions
63 lines (56 loc) · 3.13 KB
/
27_wva-variantautoscaling.yaml.j2
File metadata and controls
63 lines (56 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
{# ============================================================================
27_wva-variantautoscaling.yaml.j2
Per-stack VariantAutoscaling resource. Targets the decode Deployment
created by the llm-d-modelservice chart (name = {model_id_label}-decode
per chart helper llm-d-modelservice.decodeName + fullnameOverride).
Labels follow the upstream well-lit-path example in
llm-d/guides/workload-autoscaling/inference-scheduling-autoscaling/va.yaml
with values parameterized so each stack emits its own VA regardless of
how many models share a namespace.
IMPORTANT: `wva.llmd.ai/controller-instance` label is required when the
chart is installed with a non-empty wva.controllerInstance. Per the
v0.6.0 controller's predicate (internal/controller/predicates.go):
- When CONTROLLER_INSTANCE env is set on the controller, it ONLY
reconciles VAs whose `wva.llmd.ai/controller-instance` label
matches that value. Without this label, the predicate returns
false and the controller silently skips the VA — it'll log
"No active VariantAutoscalings found" forever.
The value here MUST equal `wva.controllerInstance` from chart values
(which we set to wva.namespace | default(namespace.name)) and the
`controller_instance` label in the HPA selector — all three derive
from the same expression so they stay aligned.
The controller (installed once per namespace by step_02) watches this
namespace and reconciles every VariantAutoscaling resource in it
that bears the matching controller-instance label.
Only rendered when wva.enabled is true.
============================================================================ #}
{% if wva is defined and wva.enabled | default(false) %}
{# Guard against a stack with no `decode` section: substitute an empty dict so
   the attribute lookups below resolve to Undefined and fall through to the
   `default` filters instead of raising. #}
{% set decode_cfg = decode if decode is defined else {} %}
{% set accel_type = decode_cfg.acceleratorType | default({}) %}
{# Accelerator label resolution: prefer the first entry of the plural
   `labelValues` list (when defined and non-empty); otherwise the singular
   `labelValue`; otherwise the empty string. #}
{% set accel_label = (accel_type.labelValues[0] if accel_type.labelValues is defined and accel_type.labelValues else accel_type.labelValue | default('')) %}
{# Map the node-selector label to a short accelerator name via substring
   match (e.g. "nvidia.com/gpu.product: NVIDIA-H100-80GB" -> "H100").
   Checks are ordered; an unrecognized label yields '' (empty label value).
   NOTE(review): the rendered `inference.optimization/acceleratorName` label
   may therefore be "" — presumably tolerated downstream; verify against the
   WVA controller's expectations. #}
{% set accel_name = 'H100' if 'H100' in accel_label
else ('A100' if 'A100' in accel_label
else ('L40S' if 'L40S' in accel_label
else ('MI300X' if 'MI300X' in accel_label
else ('G2' if 'G2' in accel_label
else '')))) %}
{# Rendered manifest. `wva.namespace | default(namespace.name, true)` (second
   arg `true` = treat empty string as missing) is used for BOTH the namespace
   and the controller-instance label so the two can never diverge — see the
   header comment for why the label must match the controller's
   CONTROLLER_INSTANCE value. #}
apiVersion: llmd.ai/v1alpha1
kind: VariantAutoscaling
metadata:
name: {{ model_id_label }}-decode
namespace: {{ wva.namespace | default(namespace.name, true) }}
labels:
inference.optimization/acceleratorName: "{{ accel_name }}"
llm-d.ai/model: "{{ model.shortName }}"
llm-d.ai/inference-serving: "true"
llm-d.ai/guide: "{{ wva.wellLitPath | default('inference-scheduling') }}"
wva.llmd.ai/controller-instance: "{{ wva.namespace | default(namespace.name, true) }}"
spec:
scaleTargetRef:
kind: Deployment
name: {{ model_id_label }}-decode
modelID: "{{ model.name }}"
variantCost: "{{ wva.variantAutoscaling.variantCost | default('10.0') }}"
minReplicas: {{ wva.variantAutoscaling.minReplicas | default(1) }}
maxReplicas: {{ wva.variantAutoscaling.maxReplicas | default(10) }}
{% endif %}