# values-heterogeneous-pd.yaml
---
# This values.yaml file demonstrates heterogeneous accelerator support for P/D disaggregation
# Uses NVIDIA GPUs for decode (high performance for token generation)
# Uses Intel Gaudi for prefill (cost-effective for prompt processing)
# Based on values-pd.yaml with added accelerator type overrides
# Workload kind switch: when true, a LeaderWorkerSet is used instead of a
# Deployment. This example leaves it disabled.
multinode: false
# Global accelerator default. Any component that does not declare its own
# accelerator type falls back to the type given here.
accelerator:
  type: nvidia
# Model to serve and where its artifacts come from.
modelArtifacts:
  # This is the model name used to start vLLM.
  name: facebook/opt-125m
  labels:
    llm-d.ai/inference-serving: "true"
    llm-d.ai/model: facebook-opt-125m
  # The whole value is quoted so the template expression is protected from
  # YAML parsing and no literal quote characters end up inside the URI.
  uri: "hf://{{ .Values.modelArtifacts.name }}"
  size: 20Gi
# Routing requirements. Covers service-level routing (OpenAI model name,
# service port) as well as the elements needed for the Gateway API Inference
# Extension configuration.
routing:
  servicePort: 8000
  # Fields not listed here are inherited from the chart's values.yaml.
  proxy:
    secure: false
# Decode pod configuration - uses NVIDIA GPUs with DRA for high-performance
# token generation.
decode:
  create: true
  replicas: 1
  # Per-role accelerator override for decode pods. `type` is left unset so the
  # global accelerator.type applies; only DRA (Dynamic Resource Allocation) is
  # switched on here.
  accelerator:
    # type: nvidia
    dra: true
  parallelism:
    tensor: 2
    data: 1
    dataLocal: 1
    workers: 1
  containers:
    - name: "vllm"
      image: "ghcr.io/llm-d/llm-d:v0.2.0"
      modelCommand: vllmServe
      args:
        - "--enforce-eager"
        - "--kv-transfer-config"
        - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}'
      env:
        # Two devices are exposed to match parallelism.tensor above; exposing
        # only device 0 would leave the second tensor-parallel rank without a
        # GPU. Keep this list in sync with parallelism.tensor.
        - name: CUDA_VISIBLE_DEVICES
          value: "0,1"
        - name: UCX_TLS
          value: "cuda_ipc,cuda_copy,tcp"
        # The NIXL side channel is bound to this pod's own IP.
        - name: VLLM_NIXL_SIDE_CHANNEL_HOST
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: VLLM_NIXL_SIDE_CHANNEL_PORT
          value: "5600"
        - name: VLLM_LOGGING_LEVEL
          value: DEBUG
      ports:
        - containerPort: 8200  # from routing.proxy.targetPort
          protocol: TCP
        - containerPort: 5600  # NIXL side channel
          protocol: TCP
      resources:
        limits:
          memory: 16Gi
          cpu: "16"
        requests:
          cpu: "16"
          memory: 16Gi
      mountModelVolume: true
# Prefill pod configuration - uses Intel Gaudi for cost-effective prompt
# processing.
prefill:
  create: true
  replicas: 1
  # Per-role accelerator override for prefill pods. Without it, prefill would
  # fall back to the global accelerator.type (nvidia), which does not match the
  # Gaudi image and HABANA_* settings below. `dra` is left unset, so device
  # plugin mode (the global default) is used rather than DRA.
  accelerator:
    type: intel-gaudi
    # dra: false
  parallelism:
    tensor: 1
    data: 1
    dataLocal: 1
    workers: 1
  containers:
    - name: "vllm"
      # Using Intel-optimized vLLM image for Gaudi.
      # NOTE(review): the image name reads like a PyTorch installer base image;
      # confirm it actually ships vLLM. Consider pinning a fixed tag or digest
      # instead of `latest` so deployments are reproducible.
      image: "vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest"
      modelCommand: vllmServe
      args:
        - "--enforce-eager"
        - "--kv-transfer-config"
        - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}'
      env:
        # A single device is exposed, matching parallelism.tensor: 1 above.
        - name: HABANA_VISIBLE_DEVICES
          value: "0"
        - name: UCX_TLS
          value: "tcp"
        - name: VLLM_NIXL_SIDE_CHANNEL_PORT
          value: "5600"
        # The NIXL side channel is bound to this pod's own IP.
        - name: VLLM_NIXL_SIDE_CHANNEL_HOST
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: VLLM_LOGGING_LEVEL
          value: DEBUG
      ports:
        - containerPort: 8000  # from routing.servicePort
          protocol: TCP
        - containerPort: 5600  # NIXL side channel
          protocol: TCP
      resources:
        limits:
          memory: 16Gi
          cpu: "16"
        requests:
          cpu: "16"
          memory: 16Gi
      mountModelVolume: true