4 changes: 2 additions & 2 deletions .golangci.yml
@@ -44,8 +44,8 @@ linters:
alias: egv1a1
- pkg: github.com/envoyproxy/ai-gateway/api/v1alpha1
alias: aigv1a1
- pkg: sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2
alias: gwaiev1a2
- pkg: sigs.k8s.io/gateway-api-inference-extension/api/v1
alias: gwaiev1
- pkg: k8s.io/apimachinery/pkg/apis/meta/v1
alias: metav1
- pkg: k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1
4 changes: 2 additions & 2 deletions api/v1alpha1/ai_gateway_route.go
@@ -266,7 +266,7 @@ type AIGatewayRouteRule struct {
// It can reference either an AIServiceBackend or an InferencePool resource.
//
// +kubebuilder:validation:XValidation:rule="!has(self.group) && !has(self.kind) || (has(self.group) && has(self.kind))", message="group and kind must be specified together"
// +kubebuilder:validation:XValidation:rule="!has(self.group) || (self.group == 'inference.networking.x-k8s.io' && self.kind == 'InferencePool')", message="only InferencePool from inference.networking.x-k8s.io group is supported"
// +kubebuilder:validation:XValidation:rule="!has(self.group) || (self.group == 'inference.networking.k8s.io' && self.kind == 'InferencePool')", message="only InferencePool from inference.networking.k8s.io group is supported"
type AIGatewayRouteRuleBackendRef struct {
// Name is the name of the backend resource.
// When Group and Kind are not specified, this refers to an AIServiceBackend.
@@ -278,7 +278,7 @@ type AIGatewayRouteRuleBackendRef struct {

// Group is the group of the backend resource.
// When not specified, defaults to aigateway.envoyproxy.io (AIServiceBackend).
// Currently, only "inference.networking.x-k8s.io" is supported for InferencePool resources.
// Currently, only "inference.networking.k8s.io" is supported for InferencePool resources.
//
// +optional
// +kubebuilder:validation:MaxLength=253
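Taken together, the two CEL rules above accept either a bare AIServiceBackend reference or a fully qualified InferencePool reference. A minimal sketch of rules that satisfy both validations — the backend names are hypothetical:

```yaml
# Hypothetical AIGatewayRoute rule fragments illustrating the updated CEL rules.
rules:
  - backendRefs:
      # group/kind omitted together: resolves to an AIServiceBackend.
      - name: openai-backend
  - backendRefs:
      # group and kind must be set together, and only this pair is accepted.
      - name: vllm-llama3-8b-instruct
        group: inference.networking.k8s.io
        kind: InferencePool
```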
2 changes: 1 addition & 1 deletion api/v1alpha1/ai_gateway_route_helper.go
@@ -15,7 +15,7 @@ const (
defaultRequestTimeout gwapiv1.Duration = "60s"

// inferencePoolGroup is the API group for InferencePool resources.
inferencePoolGroup = "inference.networking.x-k8s.io"
inferencePoolGroup = "inference.networking.k8s.io"
// inferencePoolKind is the kind for InferencePool resources.
inferencePoolKind = "InferencePool"
)
2 changes: 1 addition & 1 deletion cmd/aigw/envoy-gateway-config.yaml
@@ -24,7 +24,7 @@ extensionApis:
enableBackend: true
extensionManager:
backendResources:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
version: v1alpha2
hooks:
22 changes: 11 additions & 11 deletions docs/proposals/003-epp-integration-proposal/proposal.md
@@ -51,7 +51,7 @@ When request goes to envoyproxy, it goes to the http filter chain, the ext-proc
The gRPC service info is pre-defined in [InferencePool](https://gateway-api-inference-extension.sigs.k8s.io/api-types/inferencepool/) extensionRef, giving an example below:

```
apiVersion: inference.networking.x-k8s.io/v1alpha2
apiVersion: inference.networking.k8s.io/v1
kind: InferencePool
metadata:
name: vllm-llama3-8b-instruct
@@ -81,7 +81,7 @@ spec:
name: inference-gateway
rules:
- backendRefs:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
name: vllm-llama3-8b-instruct
matches:
@@ -209,7 +209,7 @@ This requires to expand the `AIGatewayRouteRuleBackendRef` with `BackendObjectRe
- When it matches vllm-llama3-8b-instruct goes to InferencePool `vllm-llama3-8b-instruct`

```
apiVersion: inference.networking.x-k8s.io/v1alpha2
apiVersion: inference.networking.k8s.io/v1
kind: InferencePool
metadata:
name: vllm-llama3-8b-instruct
@@ -249,7 +249,7 @@ spec:
value: vllm-llama3-8b-instruct
backendRefs:
- name: vllm-llama3-8b-instruct
group: inference.networking.x-k8s.io
group: inference.networking.k8s.io
kind: InferencePool
```

@@ -269,7 +269,7 @@ This approach is preferred because InferencePool resources do not require Backen
- When it matches vllm-llama3-8b-instruct goes to AIServiceBackend `vllm-llama3-8b-instruct`

```yaml
apiVersion: inference.networking.x-k8s.io/v1alpha2
apiVersion: inference.networking.k8s.io/v1
kind: InferencePool
metadata:
name: vllm-llama3-8b-instruct
@@ -319,7 +319,7 @@ spec:
name: OpenAI
backendRef:
name: vllm-llama3-8b-instruct
group: inference.networking.x-k8s.io
group: inference.networking.k8s.io
kind: InferencePool
```

@@ -384,7 +384,7 @@ It adds the the cluster with override_host loadBalancingPolicy, we can add the h
Take the configuration below as an example:

```yaml
apiVersion: inference.networking.x-k8s.io/v1alpha2
apiVersion: inference.networking.k8s.io/v1
kind: InferencePool
metadata:
name: vllm-llama3-8b-instruct
@@ -417,7 +417,7 @@ spec:
value: vllm-llama3-8b-instruct
backendRefs:
- name: vllm-llama3-8b-instruct
group: inference.networking.x-k8s.io
group: inference.networking.k8s.io
kind: InferencePool
```

@@ -582,7 +582,7 @@ spec:
name: x-ai-eg-model
value: meta-llama/Llama-3.1-8B-Instruct
backendRefs:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
name: vllm-llama3-8b-instruct
- matches:
@@ -591,7 +591,7 @@
name: x-ai-eg-model
value: mistral:latest
backendRefs:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
name: mistral
- matches:
@@ -619,7 +619,7 @@ spec:
namespace: default
rules:
- backendRefs:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
name: vllm-llama3-8b-instruct
namespace: default
4 changes: 2 additions & 2 deletions examples/inference-pool/aigwroute.yaml
@@ -49,7 +49,7 @@ spec:
name: Authorization
value: sk-zyxwvutsrqponmlkjihgfedcba
backendRefs:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
name: vllm-llama3-8b-instruct
- matches:
@@ -58,7 +58,7 @@ spec:
name: x-ai-eg-model
value: mistral:latest
backendRefs:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
name: mistral
- matches:
141 changes: 55 additions & 86 deletions examples/inference-pool/base.yaml
@@ -49,31 +49,40 @@ spec:
initialDelaySeconds: 1
periodSeconds: 1
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
apiVersion: inference.networking.k8s.io/v1
kind: InferencePool
metadata:
name: mistral
namespace: default
spec:
targetPortNumber: 8080
targetPorts:
- number: 8080
selector:
app: mistral-upstream
extensionRef:
matchLabels:
app: mistral-upstream
endpointPickerRef:
name: mistral-epp
port:
number: 9002
---
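The interleaved lines above are the v1alpha2 spec being replaced by its v1 equivalent. Assembled for readability (indentation is assumed), the new InferencePool reads as follows; note the scalar targetPortNumber becoming a targetPorts list, the selector moving under matchLabels, and extensionRef becoming endpointPickerRef with an explicit port:

```yaml
apiVersion: inference.networking.k8s.io/v1
kind: InferencePool
metadata:
  name: mistral
  namespace: default
spec:
  targetPorts:            # replaces the scalar targetPortNumber
    - number: 8080
  selector:
    matchLabels:          # plain label map is now nested under matchLabels
      app: mistral-upstream
  endpointPickerRef:      # replaces extensionRef; the EPP port is explicit
    name: mistral-epp
    port:
      number: 9002
```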
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
kind: InferenceObjective
metadata:
name: mistral
namespace: default
spec:
modelName: mistral:latest
criticality: Critical
priority: 10
poolRef:
# Bind the InferenceModel to the InferencePool.
# Bind the InferenceObjective to the InferencePool.
name: mistral
---
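InferenceModel is likewise replaced by InferenceObjective, which stays in the alpha inference.networking.x-k8s.io/v1alpha2 group. The enum criticality: Critical gives way to a numeric priority, and modelName disappears from the spec — in this example, model selection is driven by the x-ai-eg-model header matches in the AIGatewayRoute instead. Assembled from the hunk above:

```yaml
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceObjective
metadata:
  name: mistral
  namespace: default
spec:
  priority: 10      # numeric priority replaces criticality: Critical
  poolRef:
    # Bind the InferenceObjective to the InferencePool.
    name: mistral
```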
apiVersion: v1
kind: ServiceAccount
metadata:
name: mistral-epp
namespace: default
---
apiVersion: v1
kind: Service
metadata:
name: mistral-epp
@@ -105,26 +114,27 @@ spec:
labels:
app: mistral-epp
spec:
serviceAccountName: mistral-epp
# Conservatively, this timeout should mirror the longest grace period of the pods within the pool
terminationGracePeriodSeconds: 130
containers:
- name: epp
image: registry.k8s.io/gateway-api-inference-extension/epp:v0.5.1
image: registry.k8s.io/gateway-api-inference-extension/epp:v1.0.1
imagePullPolicy: IfNotPresent
args:
- -poolName
- --pool-name
- "mistral"
- "-poolNamespace"
- "--pool-namespace"
- "default"
- -v
- --v
- "4"
- --zap-encoder
- "json"
- -grpcPort
- --grpc-port
- "9002"
- -grpcHealthPort
- --grpc-health-port
- "9003"
- "-configFile"
- "--config-file"
- "/config/default-plugins.yaml"
ports:
- containerPort: 9002
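Alongside the image bump from epp:v0.5.1 to epp:v1.0.1, every flag moves from single-dash camelCase to double-dash kebab-case. The mapping, collected from the pairs above (args list abbreviated for illustration):

```yaml
# Old (v0.5.1)         New (v1.0.1)
#   -poolName          --pool-name
#   -poolNamespace     --pool-namespace
#   -v                 --v
#   -grpcPort          --grpc-port
#   -grpcHealthPort    --grpc-health-port
#   -configFile        --config-file
# --zap-encoder is unchanged.
args: ["--pool-name", "mistral", "--pool-namespace", "default"]
```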
@@ -158,95 +168,54 @@ metadata:
namespace: default
data:
default-plugins.yaml: |
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
- type: low-queue-filter
parameters:
threshold: 128
- type: lora-affinity-filter
parameters:
threshold: 0.999
- type: least-queue-filter
- type: least-kv-cache-filter
- type: decision-tree-filter
name: low-latency-filter
parameters:
current:
pluginRef: low-queue-filter
nextOnSuccess:
decisionTree:
current:
pluginRef: lora-affinity-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: least-queue-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: least-kv-cache-filter
nextOnFailure:
decisionTree:
current:
pluginRef: least-queue-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: lora-affinity-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: least-kv-cache-filter
- type: random-picker
parameters:
maxNumOfEndpoints: 1
- type: single-profile-handler
schedulingProfiles:
- name: default
plugins:
- pluginRef: low-latency-filter
- pluginRef: random-picker
plugins-v2.yaml: |
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
- type: queue-scorer
- type: kv-cache-scorer
- type: kv-cache-utilization-scorer
- type: prefix-cache-scorer
parameters:
hashBlockSize: 64
maxPrefixBlocksToMatch: 256
lruCapacityPerServer: 31250
- type: max-score-picker
parameters:
maxNumOfEndpoints: 1
- type: single-profile-handler
schedulingProfiles:
- name: default
plugins:
- pluginRef: queue-scorer
weight: 1
- pluginRef: kv-cache-scorer
weight: 1
- pluginRef: kv-cache-utilization-scorer
- pluginRef: prefix-cache-scorer
weight: 1
- pluginRef: max-score-picker
---
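The v0.5.x decision-tree filter chain (low-queue-filter → lora-affinity-filter → least-queue-filter → least-kv-cache-filter, finished by a random-picker) is dropped in favor of the flat scorer configuration that previously appeared to live in plugins-v2.yaml. Roughly: queue-scorer favors endpoints with short request queues, kv-cache-utilization-scorer favors endpoints with free KV-cache, prefix-cache-scorer favors endpoints likely to already hold the prompt prefix, and max-score-picker selects the top-scoring endpoint. A sketch of the resulting config, assembled from the hunk above (explicit weight entries omitted, assuming the default equal weighting):

```yaml
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
  - type: queue-scorer
  - type: kv-cache-utilization-scorer
  - type: prefix-cache-scorer
    parameters:
      hashBlockSize: 64
      maxPrefixBlocksToMatch: 256
      lruCapacityPerServer: 31250
  - type: max-score-picker
    parameters:
      maxNumOfEndpoints: 1   # return a single endpoint per pick
  - type: single-profile-handler
schedulingProfiles:
  - name: default
    plugins:
      - pluginRef: queue-scorer
      - pluginRef: kv-cache-utilization-scorer
      - pluginRef: prefix-cache-scorer
      - pluginRef: max-score-picker
```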
kind: ClusterRole
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: pod-read
namespace: default
rules:
- apiGroups: ["inference.networking.x-k8s.io"]
resources: ["inferencepools"]
resources: ["inferenceobjectives", "inferencepools"]
verbs: ["get", "watch", "list"]
- apiGroups: ["inference.networking.x-k8s.io"]
resources: ["inferencemodels"]
- apiGroups: ["inference.networking.k8s.io"]
resources: ["inferencepools"]
verbs: ["get", "watch", "list"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "watch", "list"]
---
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: pod-read-binding
namespace: default
subjects:
- kind: ServiceAccount
name: mistral-epp
namespace: default
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: pod-read
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: auth-reviewer
rules:
- apiGroups:
- authentication.k8s.io
resources:
@@ -263,15 +232,15 @@
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: pod-read-binding
name: auth-reviewer-binding
subjects:
- kind: ServiceAccount
name: default
name: mistral-epp
namespace: default
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: pod-read
name: auth-reviewer
---
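The RBAC rework trims privileges: the old cluster-wide pod-read ClusterRole, bound to the default ServiceAccount, becomes a namespaced Role covering inferenceobjectives and inferencepools in the legacy x-k8s.io group, inferencepools in the new group, and pods — bound to the new dedicated mistral-epp ServiceAccount that the Deployment now runs under. Only auth-reviewer stays cluster-scoped, presumably so the EPP can validate callers via TokenReview/SubjectAccessReview; its full rule list is collapsed above. The wiring, summarized:

```yaml
# ServiceAccount mistral-epp (default namespace) is referenced by:
#   - Deployment mistral-epp                     via spec.template.spec.serviceAccountName
#   - RoleBinding pod-read-binding               -> Role pod-read (namespaced reads)
#   - ClusterRoleBinding auth-reviewer-binding   -> ClusterRole auth-reviewer
```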
apiVersion: aigateway.envoyproxy.io/v1alpha1
kind: AIServiceBackend