Skip to content

Commit 7e2e42a

Browse files
Xunzhuo and mathetake authored
feat: support inferencepool v1 (#1033)
**Description** This PR is to add v1 inferencepool support **Related Issues/PRs (if applicable)** Fixes #1032 --------- Signed-off-by: bitliu <[email protected]> Co-authored-by: Takeshi Yoneda <[email protected]>
1 parent e3bf5fd commit 7e2e42a

39 files changed

+577
-655
lines changed

.golangci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ linters:
4444
alias: egv1a1
4545
- pkg: github.com/envoyproxy/ai-gateway/api/v1alpha1
4646
alias: aigv1a1
47-
- pkg: sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2
48-
alias: gwaiev1a2
47+
- pkg: sigs.k8s.io/gateway-api-inference-extension/api/v1
48+
alias: gwaiev1
4949
- pkg: k8s.io/apimachinery/pkg/apis/meta/v1
5050
alias: metav1
5151
- pkg: k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1

api/v1alpha1/ai_gateway_route.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ type AIGatewayRouteRule struct {
266266
// It can reference either an AIServiceBackend or an InferencePool resource.
267267
//
268268
// +kubebuilder:validation:XValidation:rule="!has(self.group) && !has(self.kind) || (has(self.group) && has(self.kind))", message="group and kind must be specified together"
269-
// +kubebuilder:validation:XValidation:rule="!has(self.group) || (self.group == 'inference.networking.x-k8s.io' && self.kind == 'InferencePool')", message="only InferencePool from inference.networking.x-k8s.io group is supported"
269+
// +kubebuilder:validation:XValidation:rule="!has(self.group) || (self.group == 'inference.networking.k8s.io' && self.kind == 'InferencePool')", message="only InferencePool from inference.networking.k8s.io group is supported"
270270
type AIGatewayRouteRuleBackendRef struct {
271271
// Name is the name of the backend resource.
272272
// When Group and Kind are not specified, this refers to an AIServiceBackend.
@@ -278,7 +278,7 @@ type AIGatewayRouteRuleBackendRef struct {
278278

279279
// Group is the group of the backend resource.
280280
// When not specified, defaults to aigateway.envoyproxy.io (AIServiceBackend).
281-
// Currently, only "inference.networking.x-k8s.io" is supported for InferencePool resources.
281+
// Currently, only "inference.networking.k8s.io" is supported for InferencePool resources.
282282
//
283283
// +optional
284284
// +kubebuilder:validation:MaxLength=253

api/v1alpha1/ai_gateway_route_helper.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ const (
1515
defaultRequestTimeout gwapiv1.Duration = "60s"
1616

1717
// inferencePoolGroup is the API group for InferencePool resources.
18-
inferencePoolGroup = "inference.networking.x-k8s.io"
18+
inferencePoolGroup = "inference.networking.k8s.io"
1919
// inferencePoolKind is the kind for InferencePool resources.
2020
inferencePoolKind = "InferencePool"
2121
)

cmd/aigw/envoy-gateway-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ extensionApis:
2424
enableBackend: true
2525
extensionManager:
2626
backendResources:
27-
- group: inference.networking.x-k8s.io
27+
- group: inference.networking.k8s.io
2828
kind: InferencePool
2929
version: v1alpha2
3030
hooks:

docs/proposals/003-epp-integration-proposal/proposal.md

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ When request goes to envoyproxy, it goes to the http filter chain, the ext-proc
5151
The gRPC service info is pre-defined in [InferencePool](https://gateway-api-inference-extension.sigs.k8s.io/api-types/inferencepool/) extensionRef, giving an example below:
5252

5353
```
54-
apiVersion: inference.networking.x-k8s.io/v1alpha2
54+
apiVersion: inference.networking.k8s.io/v1
5555
kind: InferencePool
5656
metadata:
5757
name: vllm-llama3-8b-instruct
@@ -81,7 +81,7 @@ spec:
8181
name: inference-gateway
8282
rules:
8383
- backendRefs:
84-
- group: inference.networking.x-k8s.io
84+
- group: inference.networking.k8s.io
8585
kind: InferencePool
8686
name: vllm-llama3-8b-instruct
8787
matches:
@@ -209,7 +209,7 @@ This requires to expand the `AIGatewayRouteRuleBackendRef` with `BackendObjectRe
209209
- When it matches vllm-llama3-8b-instruct goes to InferencePool `vllm-llama3-8b-instruct`
210210

211211
```
212-
apiVersion: inference.networking.x-k8s.io/v1alpha2
212+
apiVersion: inference.networking.k8s.io/v1
213213
kind: InferencePool
214214
metadata:
215215
name: vllm-llama3-8b-instruct
@@ -249,7 +249,7 @@ spec:
249249
value: vllm-llama3-8b-instruct
250250
backendRefs:
251251
- name: vllm-llama3-8b-instruct
252-
group: inference.networking.x-k8s.io
252+
group: inference.networking.k8s.io
253253
kind: InferencePool
254254
```
255255

@@ -269,7 +269,7 @@ This approach is preferred because InferencePool resources do not require Backen
269269
- When it matches vllm-llama3-8b-instruct goes to AIServiceBackend `vllm-llama3-8b-instruct`
270270

271271
```yaml
272-
apiVersion: inference.networking.x-k8s.io/v1alpha2
272+
apiVersion: inference.networking.k8s.io/v1
273273
kind: InferencePool
274274
metadata:
275275
name: vllm-llama3-8b-instruct
@@ -319,7 +319,7 @@ spec:
319319
name: OpenAI
320320
backendRef:
321321
name: vllm-llama3-8b-instruct
322-
group: inference.networking.x-k8s.io
322+
group: inference.networking.k8s.io
323323
kind: InferencePool
324324
```
325325
@@ -384,7 +384,7 @@ It adds the cluster with override_host loadBalancingPolicy, we can add the h
384384
Take the configuration below as an example:
385385

386386
```yaml
387-
apiVersion: inference.networking.x-k8s.io/v1alpha2
387+
apiVersion: inference.networking.k8s.io/v1
388388
kind: InferencePool
389389
metadata:
390390
name: vllm-llama3-8b-instruct
@@ -417,7 +417,7 @@ spec:
417417
value: vllm-llama3-8b-instruct
418418
backendRefs:
419419
- name: vllm-llama3-8b-instruct
420-
group: inference.networking.x-k8s.io
420+
group: inference.networking.k8s.io
421421
kind: InferencePool
422422
```
423423

@@ -582,7 +582,7 @@ spec:
582582
name: x-ai-eg-model
583583
value: meta-llama/Llama-3.1-8B-Instruct
584584
backendRefs:
585-
- group: inference.networking.x-k8s.io
585+
- group: inference.networking.k8s.io
586586
kind: InferencePool
587587
name: vllm-llama3-8b-instruct
588588
- matches:
@@ -591,7 +591,7 @@ spec:
591591
name: x-ai-eg-model
592592
value: mistral:latest
593593
backendRefs:
594-
- group: inference.networking.x-k8s.io
594+
- group: inference.networking.k8s.io
595595
kind: InferencePool
596596
name: mistral
597597
- matches:
@@ -619,7 +619,7 @@ spec:
619619
namespace: default
620620
rules:
621621
- backendRefs:
622-
- group: inference.networking.x-k8s.io
622+
- group: inference.networking.k8s.io
623623
kind: InferencePool
624624
name: vllm-llama3-8b-instruct
625625
namespace: default

examples/inference-pool/aigwroute.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ spec:
4949
name: Authorization
5050
value: sk-zyxwvutsrqponmlkjihgfedcba
5151
backendRefs:
52-
- group: inference.networking.x-k8s.io
52+
- group: inference.networking.k8s.io
5353
kind: InferencePool
5454
name: vllm-llama3-8b-instruct
5555
- matches:
@@ -58,7 +58,7 @@ spec:
5858
name: x-ai-eg-model
5959
value: mistral:latest
6060
backendRefs:
61-
- group: inference.networking.x-k8s.io
61+
- group: inference.networking.k8s.io
6262
kind: InferencePool
6363
name: mistral
6464
- matches:

examples/inference-pool/base.yaml

Lines changed: 55 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -49,31 +49,40 @@ spec:
4949
initialDelaySeconds: 1
5050
periodSeconds: 1
5151
---
52-
apiVersion: inference.networking.x-k8s.io/v1alpha2
52+
apiVersion: inference.networking.k8s.io/v1
5353
kind: InferencePool
5454
metadata:
5555
name: mistral
5656
namespace: default
5757
spec:
58-
targetPortNumber: 8080
58+
targetPorts:
59+
- number: 8080
5960
selector:
60-
app: mistral-upstream
61-
extensionRef:
61+
matchLabels:
62+
app: mistral-upstream
63+
endpointPickerRef:
6264
name: mistral-epp
65+
port:
66+
number: 9002
6367
---
6468
apiVersion: inference.networking.x-k8s.io/v1alpha2
65-
kind: InferenceModel
69+
kind: InferenceObjective
6670
metadata:
6771
name: mistral
6872
namespace: default
6973
spec:
70-
modelName: mistral:latest
71-
criticality: Critical
74+
priority: 10
7275
poolRef:
73-
# Bind the InferenceModel to the InferencePool.
76+
# Bind the InferenceObjective to the InferencePool.
7477
name: mistral
7578
---
7679
apiVersion: v1
80+
kind: ServiceAccount
81+
metadata:
82+
name: mistral-epp
83+
namespace: default
84+
---
85+
apiVersion: v1
7786
kind: Service
7887
metadata:
7988
name: mistral-epp
@@ -105,26 +114,27 @@ spec:
105114
labels:
106115
app: mistral-epp
107116
spec:
117+
serviceAccountName: mistral-epp
108118
# Conservatively, this timeout should mirror the longest grace period of the pods within the pool
109119
terminationGracePeriodSeconds: 130
110120
containers:
111121
- name: epp
112-
image: registry.k8s.io/gateway-api-inference-extension/epp:v0.5.1
122+
image: registry.k8s.io/gateway-api-inference-extension/epp:v1.0.1
113123
imagePullPolicy: IfNotPresent
114124
args:
115-
- -poolName
125+
- --pool-name
116126
- "mistral"
117-
- "-poolNamespace"
127+
- "--pool-namespace"
118128
- "default"
119-
- -v
129+
- --v
120130
- "4"
121131
- --zap-encoder
122132
- "json"
123-
- -grpcPort
133+
- --grpc-port
124134
- "9002"
125-
- -grpcHealthPort
135+
- --grpc-health-port
126136
- "9003"
127-
- "-configFile"
137+
- "--config-file"
128138
- "/config/default-plugins.yaml"
129139
ports:
130140
- containerPort: 9002
@@ -158,95 +168,54 @@ metadata:
158168
namespace: default
159169
data:
160170
default-plugins.yaml: |
161-
apiVersion: inference.networking.x-k8s.io/v1alpha1
162-
kind: EndpointPickerConfig
163-
plugins:
164-
- type: low-queue-filter
165-
parameters:
166-
threshold: 128
167-
- type: lora-affinity-filter
168-
parameters:
169-
threshold: 0.999
170-
- type: least-queue-filter
171-
- type: least-kv-cache-filter
172-
- type: decision-tree-filter
173-
name: low-latency-filter
174-
parameters:
175-
current:
176-
pluginRef: low-queue-filter
177-
nextOnSuccess:
178-
decisionTree:
179-
current:
180-
pluginRef: lora-affinity-filter
181-
nextOnSuccessOrFailure:
182-
decisionTree:
183-
current:
184-
pluginRef: least-queue-filter
185-
nextOnSuccessOrFailure:
186-
decisionTree:
187-
current:
188-
pluginRef: least-kv-cache-filter
189-
nextOnFailure:
190-
decisionTree:
191-
current:
192-
pluginRef: least-queue-filter
193-
nextOnSuccessOrFailure:
194-
decisionTree:
195-
current:
196-
pluginRef: lora-affinity-filter
197-
nextOnSuccessOrFailure:
198-
decisionTree:
199-
current:
200-
pluginRef: least-kv-cache-filter
201-
- type: random-picker
202-
parameters:
203-
maxNumOfEndpoints: 1
204-
- type: single-profile-handler
205-
schedulingProfiles:
206-
- name: default
207-
plugins:
208-
- pluginRef: low-latency-filter
209-
- pluginRef: random-picker
210-
plugins-v2.yaml: |
211171
apiVersion: inference.networking.x-k8s.io/v1alpha1
212172
kind: EndpointPickerConfig
213173
plugins:
214174
- type: queue-scorer
215-
- type: kv-cache-scorer
175+
- type: kv-cache-utilization-scorer
216176
- type: prefix-cache-scorer
217-
parameters:
218-
hashBlockSize: 64
219-
maxPrefixBlocksToMatch: 256
220-
lruCapacityPerServer: 31250
221-
- type: max-score-picker
222-
parameters:
223-
maxNumOfEndpoints: 1
224-
- type: single-profile-handler
225177
schedulingProfiles:
226178
- name: default
227179
plugins:
228180
- pluginRef: queue-scorer
229-
weight: 1
230-
- pluginRef: kv-cache-scorer
231-
weight: 1
181+
- pluginRef: kv-cache-utilization-scorer
232182
- pluginRef: prefix-cache-scorer
233-
weight: 1
234-
- pluginRef: max-score-picker
235183
---
236-
kind: ClusterRole
184+
kind: Role
237185
apiVersion: rbac.authorization.k8s.io/v1
238186
metadata:
239187
name: pod-read
188+
namespace: default
240189
rules:
241190
- apiGroups: ["inference.networking.x-k8s.io"]
242-
resources: ["inferencepools"]
191+
resources: ["inferenceobjectives", "inferencepools"]
243192
verbs: ["get", "watch", "list"]
244-
- apiGroups: ["inference.networking.x-k8s.io"]
245-
resources: ["inferencemodels"]
193+
- apiGroups: ["inference.networking.k8s.io"]
194+
resources: ["inferencepools"]
246195
verbs: ["get", "watch", "list"]
247196
- apiGroups: [""]
248197
resources: ["pods"]
249198
verbs: ["get", "watch", "list"]
199+
---
200+
kind: RoleBinding
201+
apiVersion: rbac.authorization.k8s.io/v1
202+
metadata:
203+
name: pod-read-binding
204+
namespace: default
205+
subjects:
206+
- kind: ServiceAccount
207+
name: mistral-epp
208+
namespace: default
209+
roleRef:
210+
apiGroup: rbac.authorization.k8s.io
211+
kind: Role
212+
name: pod-read
213+
---
214+
kind: ClusterRole
215+
apiVersion: rbac.authorization.k8s.io/v1
216+
metadata:
217+
name: auth-reviewer
218+
rules:
250219
- apiGroups:
251220
- authentication.k8s.io
252221
resources:
@@ -263,15 +232,15 @@ rules:
263232
kind: ClusterRoleBinding
264233
apiVersion: rbac.authorization.k8s.io/v1
265234
metadata:
266-
name: pod-read-binding
235+
name: auth-reviewer-binding
267236
subjects:
268237
- kind: ServiceAccount
269-
name: default
238+
name: mistral-epp
270239
namespace: default
271240
roleRef:
272241
apiGroup: rbac.authorization.k8s.io
273242
kind: ClusterRole
274-
name: pod-read
243+
name: auth-reviewer
275244
---
276245
apiVersion: aigateway.envoyproxy.io/v1alpha1
277246
kind: AIServiceBackend

0 commit comments

Comments (0)