# values-xpu.yaml
# Simplified Intel XPU configuration using imageDefault mode
# This configuration lets the chart handle most vLLM parameters automatically
# Model served by this deployment: name, labels, source URI and size.
modelArtifacts:
  name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
  labels:
    # Quoted so the value stays the string "true" rather than a YAML boolean.
    llm-d.ai/inference-serving: "true"
    llm-d.ai/model: deepseek-ai-deepSeek-r1-distill-qwen-1-5B
  # NOTE(review): "hf://" presumably selects a Hugging Face download - confirm against the chart.
  uri: "hf://deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
  size: 10Gi
# Accelerator family targeted by this values file.
accelerator:
  type: "intel-xe"
# Routing configuration: the routing proxy is turned off in this example.
routing:
  proxy:
    enabled: false
# Decode pod configuration for Intel XPU - simplified with imageDefault
decode:
  create: true
  replicas: 1

  containers:
    - name: "vllm"
      # Use custom vLLM image for XPU
      # NOTE(review): ":latest" is a mutable tag - consider pinning a specific version.
      image: "ghcr.io/llm-d/llm-d-xpu:latest"

      # Use imageDefault mode - chart will generate basic vLLM command automatically
      modelCommand: "imageDefault"

      # Only specify XPU-specific arguments that differ from defaults.
      # Each flag and its value are separate list items; values are quoted
      # so they reach the command line as strings.
      args:
        - "--enforce-eager"
        - "--dtype"
        - "float16"
        - "--disable-sliding-window"
        # Full flag name; do not rely on argparse prefix abbreviation.
        - "--gpu-memory-utilization"
        - "0.9"
        - "--no-enable-prefix-caching"
        - "--max-num-batched-tokens"
        - "4096"
        - "--disable-log-requests"
        - "--max-model-len"
        - "4096"
        - "--block-size"
        - "64"

      ports:
        - containerPort: 8200
          protocol: TCP

      resources:
        limits:
          memory: 24Gi
          cpu: "8"
        requests:
          cpu: "4"
          memory: 12Gi

      mountModelVolume: true

  # XPU-specific node affinity
  acceleratorTypes:
    labelKey: "accelerator"
    labelValues:
      - "intel-xpu"
      - "intel-gpu-max"
# Disable prefill for simple XPU example (only the decode section is created)
prefill:
  create: false
# When true, use LeaderWorkerSet for multi-node XPU setups.
# Left at false here: this example is a single-node deployment.
multinode: false