-
Notifications
You must be signed in to change notification settings - Fork 63
Expand file tree
/
Copy pathvalues-gaudi.yaml
More file actions
65 lines (56 loc) · 1.74 KB
/
Copy pathvalues-gaudi.yaml
File metadata and controls
65 lines (56 loc) · 1.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Intel Gaudi configuration using imageDefault mode
# This configuration is for single-node Gaudi setup without prefill
# The model is loaded from the prepopulated model-pvc PVC, addressed by its Hugging Face repo ID (pvc+hf:// URI)
# Gaudi warmup is skipped for startup speed
# Custom vLLM image for Gaudi is used for now
# Which model is served and where its artifacts are found.
modelArtifacts:
  # Model identifier, written as a Hugging Face repo ID.
  name: 'meta-llama/Llama-3.1-8B-Instruct'
  # pvc+hf scheme: PVC name (model-pvc) followed by the repo ID.
  uri: 'pvc+hf://model-pvc/meta-llama/Llama-3.1-8B-Instruct'
  # presumably the storage size for the model volume — confirm against the chart
  size: '50Gi'
  # presumably the Secret that carries the Hugging Face token — confirm
  authSecretName: 'llm-d-hf-token'
  labels:
    # Quoted so the value stays a string, not a boolean.
    llm-d.ai/inference-serving: 'true'
    # NOTE(review): this value does not mirror the model name above;
    # confirm it matches whatever selector consumes the label.
    llm-d.ai/model: 'random-model'
# Accelerator type this configuration targets.
accelerator:
  type: 'intel-gaudi'
# Routing configuration: the proxy is turned off for this setup.
routing:
  proxy:
    enabled: false
# Decode pod configuration for Intel Gaudi - simplified with imageDefault.
# A single replica runs one vLLM container.
decode:
  create: true
  replicas: 1
  containers:
    - name: "vllm"
      # Use custom vLLM image for Gaudi for now
      image: "opea/vllm-gaudi:1.22.0"
      # Use imageDefault mode - chart will generate basic vLLM command automatically
      modelCommand: "imageDefault"
      # Only specify Gaudi specific arguments that differ from defaults
      args:
        - --block-size=128
        - --max-num-seqs=256
        - --max-seq-len-to-capture=2048
        - --max-model-len=2048
        - --max-num-batched-tokens=16000
      # Environment values are quoted so they reach the container as strings.
      env:
        - name: OMPI_MCA_btl_vader_single_copy_mechanism
          value: "none"
        - name: HABANA_LOGS  # For OpenShift compatibility, set log path to writable location
          value: "/tmp/habana_logs"
        # Gaudi warmup is skipped for startup speed.
        - name: VLLM_SKIP_WARMUP
          value: "true"
        - name: DO_NOT_TRACK
          value: "1"
        - name: VLLM_USE_V1
          value: "1"
      ports:
        - containerPort: 8200
          protocol: TCP
      # Mount the model volume into this container.
      mountModelVolume: true
# Prefill is not created in this simple Intel Gaudi example.
prefill:
  create: false
# Single-node setup, so this stays false.
# When true, use LeaderWorkerSet for multi-node Intel Gaudi setups.
multinode: false