-
Notifications
You must be signed in to change notification settings - Fork 63
Expand file tree
/
Copy pathvalues-requester.yaml
More file actions
125 lines (118 loc) · 3.28 KB
/
Copy pathvalues-requester.yaml
File metadata and controls
125 lines (118 loc) · 3.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# This values.yaml file creates the resources for a P/D disaggregation scenario
# Uses a small model: facebook/opt-125m
# See also defaults in chart values.yaml
# Workload controller selection: when true, LeaderWorkerSet is used instead
# of Deployment.
multinode: false
# Model to serve and where to fetch it from.
modelArtifacts:
  # This is the model name used to start vLLM.
  name: facebook/opt-125m
  labels:
    # Quoted so the value stays a string rather than a YAML boolean.
    llm-d.ai/inference-serving: "true"
    llm-d.ai/model: facebook-opt-125m
  # The whole templated value is quoted: quotes placed in the middle of a
  # plain scalar are literal characters and would end up inside the URI.
  uri: "hf://{{ .Values.modelArtifacts.name }}"
  # NOTE(review): presumably the storage size for the model volume; confirm
  # against the chart.
  size: 20Gi
# Routing requirements. Covers service-level routing (OpenAI model name,
# service port) as well as elements of the Gateway API Inference Extension
# configuration.
routing:
  servicePort: 8000
  # Remaining fields are inherited from the chart values.yaml.
  proxy:
    secure: false
# @schema
# additionalProperties: true
# @schema
# -- Requester configuration part of the dual-pod solution for FMA
requester:
  enable: true
  # NOTE(review): the `latest` tag is mutable; consider pinning a version or
  # digest for reproducible deployments.
  image: "ghcr.io/llm-d-incubation/llm-d-fast-model-actuation/requester:latest"
  # Two ports are configured: one named for probes and one named `spi`.
  port:
    probes: 8080
    spi: 8081
  readinessProbe:
    initialDelaySeconds: 2
    periodSeconds: 5
  resources:
    limits:
      # NOTE(review): `gpus` / `cpus` differ from the `nvidia.com/gpu` / `cpu`
      # keys used by the decode and prefill containers in this file; confirm
      # these are the names the chart expects.
      gpus: 1
      cpus: 1
      memory: 250Mi
# Decode pod configuration
decode:
  create: true
  replicas: 1
  containers:
    - name: "vllm"
      image: "ghcr.io/llm-d/llm-d-cuda:latest"
      modelCommand: vllmServe
      # The JSON argument selects NixlConnector with role kv_both for KV
      # transfer.
      args:
        - "--enforce-eager"
        - "--kv-transfer-config"
        - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}'
      env:
        - name: CUDA_VISIBLE_DEVICES
          value: "0"
        - name: UCX_TLS
          value: "cuda_ipc,cuda_copy,tcp"
        # Side-channel host is the pod's own IP, read from pod status.
        - name: VLLM_NIXL_SIDE_CHANNEL_HOST
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        # Same value as the NIXL side channel containerPort below.
        - name: VLLM_NIXL_SIDE_CHANNEL_PORT
          value: "5600"
        - name: VLLM_LOGGING_LEVEL
          value: DEBUG
      ports:
        - containerPort: 8200  # from routing.proxy.targetPort
          protocol: TCP
        - containerPort: 5600  # NIXL side channel
          protocol: TCP
      # Requests are set equal to limits.
      resources:
        limits:
          memory: 16Gi
          cpu: "16"
          nvidia.com/gpu: "1"
        requests:
          cpu: "16"
          memory: 16Gi
          nvidia.com/gpu: "1"
      mountModelVolume: true
# Prefill pod configuration
prefill:
  create: true
  replicas: 1
  containers:
    - name: "vllm"
      image: "ghcr.io/llm-d/llm-d-cuda:latest"
      modelCommand: vllmServe
      # The JSON argument selects NixlConnector with role kv_both for KV
      # transfer.
      args:
        - "--enforce-eager"
        - "--kv-transfer-config"
        - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}'
      env:
        - name: CUDA_VISIBLE_DEVICES
          value: "0"
        - name: UCX_TLS
          value: "cuda_ipc,cuda_copy,tcp"
        # Same value as the NIXL side channel containerPort below.
        - name: VLLM_NIXL_SIDE_CHANNEL_PORT
          value: "5600"
        # Side-channel host is the pod's own IP, read from pod status.
        - name: VLLM_NIXL_SIDE_CHANNEL_HOST
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: VLLM_LOGGING_LEVEL
          value: DEBUG
      ports:
        - containerPort: 8000  # from routing.servicePort
          protocol: TCP
        - containerPort: 5600  # NIXL side channel
          protocol: TCP
      # Requests are set equal to limits.
      resources:
        limits:
          memory: 16Gi
          cpu: "16"
          nvidia.com/gpu: "1"
        requests:
          cpu: "16"
          memory: 16Gi
          nvidia.com/gpu: "1"
      mountModelVolume: true