-
Notifications
You must be signed in to change notification settings - Fork 63
Expand file tree
/
Copy pathvalues-xpu-pd.yaml
More file actions
158 lines (150 loc) · 4.21 KB
/
Copy pathvalues-xpu-pd.yaml
File metadata and controls
158 lines (150 loc) · 4.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# Example values for Intel XPU with Prefill/Decode (P/D) disaggregation.
# WARNING: Intel XPU does not currently support P/D disaggregation.
# This configuration is for reference only. Testing shows:
#   - SimpleConnector: not supported
#   - NixlConnector: requires the NIXL library, which is not available
# For a working Intel XPU deployment, use values-xpu.yaml instead.
# Model to serve. The same model ID is passed to vLLM through the "--model"
# argument of the decode and prefill containers below.
modelArtifacts:
  name: microsoft/DialoGPT-large
  # Labels are quoted where the value would otherwise be parsed as a boolean.
  labels:
    llm-d.ai/inference-serving: "true"
    llm-d.ai/model: microsoft-dialogpt-large
  # NOTE(review): the hf:// scheme presumably selects a Hugging Face download;
  # confirm against the chart's supported URI schemes.
  uri: "hf://microsoft/DialoGPT-large"
  # NOTE(review): presumably the size of the volume that holds the model;
  # confirm against the chart.
  size: 10Gi
# Accelerator family for this deployment. "intel-i915" lines up with the
# gpu.intel.com/i915 extended resource requested by the decode container.
accelerator:
  type: "intel-i915"
# Request routing: traffic enters on servicePort and the routing sidecar
# forwards it to targetPort, which is the port the decode vLLM server
# listens on ("--port 8200" in the decode args).
routing:
  servicePort: 8000
  proxy:
    image: ghcr.io/llm-d/llm-d-routing-sidecar:latest
    targetPort: 8200
    # Pairs with the NixlConnector kv-transfer-config used by the decode and
    # prefill containers. NOTE(review): the file header states that the NIXL
    # library is not available on XPU, so this connector is not expected to
    # work today; it is kept for reference only.
    connector: nixlv2
# Decode pod configuration for Intel XPU.
# The vLLM server listens on 8200 (the routing proxy's targetPort) and is the
# KV-cache consumer side of the prefill/decode pair.
decode:
  create: true
  replicas: 1  # Just 1 decode pod
  containers:
    - name: "vllm"
      image: "ghcr.io/llm-d/llm-d-xpu:latest"
      # IfNotPresent still uses an image preloaded on the node, but falls back
      # to pulling it instead of failing with ErrImageNeverPull when the node
      # does not have it (which is what "Never" would do).
      imagePullPolicy: IfNotPresent
      modelCommand: custom
      command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
      args:
        - "--model"
        - "microsoft/DialoGPT-large"
        - "--enforce-eager"
        - "--tensor-parallel-size"
        - "1"  # TP=1 for each decode pod (using 1 XPU each)
        - "--port"
        - "8200"
        - "--host"
        - "0.0.0.0"
        - "--kv-transfer-config"
        - '{"kv_connector":"NixlConnector", "kv_role":"kv_consumer"}'
      env:
        - name: ZE_AFFINITY_MASK
          value: "0"  # Decode pod uses XPU 0
        - name: ZE_ENABLE_PCI_ID_DEVICE_ORDER
          value: "1"
        # The NIXL side channel is advertised on the pod's own IP, injected
        # through the Kubernetes downward API.
        - name: VLLM_NIXL_SIDE_CHANNEL_HOST
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        # Must match the 5600 containerPort declared under "ports" below.
        - name: VLLM_NIXL_SIDE_CHANNEL_PORT
          value: "5600"
        - name: VLLM_LOGGING_LEVEL
          value: DEBUG
        # Intel XPU specific environment variables
        - name: TORCH_LLM_ALLREDUCE
          value: "1"
        - name: VLLM_USE_V1
          value: "1"
        - name: CCL_ZE_IPC_EXCHANGE
          value: "pidfd"
        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
          value: "1"
        - name: VLLM_WORKER_MULTIPROC_METHOD
          value: "spawn"
      ports:
        - containerPort: 8200  # vLLM OpenAI-compatible API ("--port" above)
          protocol: TCP
        - containerPort: 5600  # NIXL side channel
          protocol: TCP
      resources:
        limits:
          memory: 24Gi  # Reduced memory since each pod uses 1 XPU
          cpu: "8"  # Reduced CPU
          gpu.intel.com/i915: "1"  # Each decode pod uses 1 XPU
        requests:
          cpu: "4"
          memory: 12Gi
          gpu.intel.com/i915: "1"
      mountModelVolume: true
  # NOTE(review): presumably used to steer the pod onto nodes labelled
  # accelerator=intel-xpu; confirm how the chart consumes acceleratorTypes.
  acceleratorTypes:
    labelKey: "accelerator"
    labelValues:
      - "intel-xpu"
# Prefill pod configuration for Intel XPU.
# The vLLM server listens on 8000 and is the KV-cache producer side of the
# prefill/decode pair.
prefill:
  create: true
  replicas: 1  # Just 1 prefill pod
  containers:
    - name: "vllm"
      image: "ghcr.io/llm-d/llm-d-xpu:latest"
      # Use an image already present on the node when there is one; otherwise
      # pull it. Without this, the ":latest" tag defaults to always pulling.
      imagePullPolicy: IfNotPresent
      modelCommand: custom
      command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
      args:
        - "--model"
        - "microsoft/DialoGPT-large"
        - "--enforce-eager"
        - "--tensor-parallel-size"
        - "1"  # TP=1 for prefill
        - "--port"
        - "8000"
        - "--host"
        - "0.0.0.0"
        - "--kv-transfer-config"
        - '{"kv_connector":"NixlConnector", "kv_role":"kv_producer"}'
      env:
        # NOTE(review): if only the single allocated XPU is visible inside the
        # container, device index 1 may not exist; confirm this mask against
        # the device enumeration seen by the pod.
        - name: ZE_AFFINITY_MASK
          value: "1"  # Prefill pod uses XPU 1
        - name: ZE_ENABLE_PCI_ID_DEVICE_ORDER
          value: "1"
        # Must match the 5600 containerPort declared under "ports" below.
        - name: VLLM_NIXL_SIDE_CHANNEL_PORT
          value: "5600"
        # The NIXL side channel is advertised on the pod's own IP, injected
        # through the Kubernetes downward API.
        - name: VLLM_NIXL_SIDE_CHANNEL_HOST
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: VLLM_LOGGING_LEVEL
          value: DEBUG
        # Intel XPU specific environment variables
        - name: TORCH_LLM_ALLREDUCE
          value: "1"
        - name: VLLM_USE_V1
          value: "1"
        - name: CCL_ZE_IPC_EXCHANGE
          value: "pidfd"
        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
          value: "1"
        - name: VLLM_WORKER_MULTIPROC_METHOD
          value: "spawn"
      ports:
        - containerPort: 8000  # vLLM OpenAI-compatible API ("--port" above)
          protocol: TCP
        - containerPort: 5600  # NIXL side channel
          protocol: TCP
      resources:
        limits:
          memory: 32Gi
          cpu: "16"
          # Request one XPU explicitly, mirroring the decode container, so the
          # pod is scheduled onto a node with a free device.
          gpu.intel.com/i915: "1"
        requests:
          cpu: "8"
          memory: 16Gi
          gpu.intel.com/i915: "1"
      mountModelVolume: true
  # NOTE(review): presumably used to steer the pod onto nodes labelled
  # accelerator=intel-xpu; confirm how the chart consumes acceleratorTypes.
  acceleratorTypes:
    labelKey: "accelerator"
    labelValues:
      - "intel-xpu"
# Multi-node serving is switched off for this example.
# NOTE(review): the exact effect depends on the consuming chart; confirm there.
multinode: false