4 changes: 2 additions & 2 deletions .golangci.yml
@@ -44,8 +44,8 @@ linters:
alias: egv1a1
- pkg: github.com/envoyproxy/ai-gateway/api/v1alpha1
alias: aigv1a1
- pkg: sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2
alias: gwaiev1a2
- pkg: sigs.k8s.io/gateway-api-inference-extension/api/v1
alias: gwaiev1
- pkg: k8s.io/apimachinery/pkg/apis/meta/v1
alias: metav1
- pkg: k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1
4 changes: 2 additions & 2 deletions api/v1alpha1/ai_gateway_route.go
@@ -266,7 +266,7 @@ type AIGatewayRouteRule struct {
// It can reference either an AIServiceBackend or an InferencePool resource.
//
// +kubebuilder:validation:XValidation:rule="!has(self.group) && !has(self.kind) || (has(self.group) && has(self.kind))", message="group and kind must be specified together"
// +kubebuilder:validation:XValidation:rule="!has(self.group) || (self.group == 'inference.networking.x-k8s.io' && self.kind == 'InferencePool')", message="only InferencePool from inference.networking.x-k8s.io group is supported"
// +kubebuilder:validation:XValidation:rule="!has(self.group) || (self.group == 'inference.networking.k8s.io' && self.kind == 'InferencePool')", message="only InferencePool from inference.networking.k8s.io group is supported"
type AIGatewayRouteRuleBackendRef struct {
// Name is the name of the backend resource.
// When Group and Kind are not specified, this refers to an AIServiceBackend.
@@ -278,7 +278,7 @@ type AIGatewayRouteRuleBackendRef struct {

// Group is the group of the backend resource.
// When not specified, defaults to aigateway.envoyproxy.io (AIServiceBackend).
// Currently, only "inference.networking.x-k8s.io" is supported for InferencePool resources.
// Currently, only "inference.networking.k8s.io" is supported for InferencePool resources.
//
// +optional
// +kubebuilder:validation:MaxLength=253
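Taken together, the two CEL rules above accept either a bare AIServiceBackend reference or a fully qualified InferencePool reference. A minimal sketch of rules that satisfy both validations — the backend names are hypothetical:

```yaml
# Hypothetical AIGatewayRoute rule fragments illustrating the updated CEL rules.
rules:
  - backendRefs:
      # group/kind omitted together: resolves to an AIServiceBackend.
      - name: openai-backend
  - backendRefs:
      # group and kind must be set together, and only this pair is accepted.
      - name: vllm-llama3-8b-instruct
        group: inference.networking.k8s.io
        kind: InferencePool
```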
2 changes: 1 addition & 1 deletion api/v1alpha1/ai_gateway_route_helper.go
@@ -15,7 +15,7 @@ const (
defaultRequestTimeout gwapiv1.Duration = "60s"

// inferencePoolGroup is the API group for InferencePool resources.
inferencePoolGroup = "inference.networking.x-k8s.io"
inferencePoolGroup = "inference.networking.k8s.io"
// inferencePoolKind is the kind for InferencePool resources.
inferencePoolKind = "InferencePool"
)
2 changes: 1 addition & 1 deletion cmd/aigw/envoy-gateway-config.yaml
@@ -24,7 +24,7 @@ extensionApis:
enableBackend: true
extensionManager:
backendResources:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
version: v1alpha2
hooks:
22 changes: 11 additions & 11 deletions docs/proposals/003-epp-integration-proposal/proposal.md
@@ -51,7 +51,7 @@ When request goes to envoyproxy, it goes to the http filter chain, the ext-proc
The gRPC service info is pre-defined in [InferencePool](https://gateway-api-inference-extension.sigs.k8s.io/api-types/inferencepool/) extensionRef, giving an example below:

```
apiVersion: inference.networking.x-k8s.io/v1alpha2
apiVersion: inference.networking.k8s.io/v1
kind: InferencePool
metadata:
name: vllm-llama3-8b-instruct
@@ -81,7 +81,7 @@ spec:
name: inference-gateway
rules:
- backendRefs:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
name: vllm-llama3-8b-instruct
matches:
@@ -209,7 +209,7 @@ This requires to expand the `AIGatewayRouteRuleBackendRef` with `BackendObjectRe
- When it matches vllm-llama3-8b-instruct goes to InferencePool `vllm-llama3-8b-instruct`

```
apiVersion: inference.networking.x-k8s.io/v1alpha2
apiVersion: inference.networking.k8s.io/v1
kind: InferencePool
metadata:
name: vllm-llama3-8b-instruct
@@ -249,7 +249,7 @@ spec:
value: vllm-llama3-8b-instruct
backendRefs:
- name: vllm-llama3-8b-instruct
group: inference.networking.x-k8s.io
group: inference.networking.k8s.io
kind: InferencePool
```

@@ -269,7 +269,7 @@ This approach is preferred because InferencePool resources do not require Backen
- When it matches vllm-llama3-8b-instruct goes to AIServiceBackend `vllm-llama3-8b-instruct`

```yaml
apiVersion: inference.networking.x-k8s.io/v1alpha2
apiVersion: inference.networking.k8s.io/v1
kind: InferencePool
metadata:
name: vllm-llama3-8b-instruct
@@ -319,7 +319,7 @@ spec:
name: OpenAI
backendRef:
name: vllm-llama3-8b-instruct
group: inference.networking.x-k8s.io
group: inference.networking.k8s.io
kind: InferencePool
```

@@ -384,7 +384,7 @@ It adds the the cluster with override_host loadBalancingPolicy, we can add the h
Take the configuration below as an example:

```yaml
apiVersion: inference.networking.x-k8s.io/v1alpha2
apiVersion: inference.networking.k8s.io/v1
kind: InferencePool
metadata:
name: vllm-llama3-8b-instruct
@@ -417,7 +417,7 @@ spec:
value: vllm-llama3-8b-instruct
backendRefs:
- name: vllm-llama3-8b-instruct
group: inference.networking.x-k8s.io
group: inference.networking.k8s.io
kind: InferencePool
```

@@ -582,7 +582,7 @@ spec:
name: x-ai-eg-model
value: meta-llama/Llama-3.1-8B-Instruct
backendRefs:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
name: vllm-llama3-8b-instruct
- matches:
@@ -591,7 +591,7 @@
name: x-ai-eg-model
value: mistral:latest
backendRefs:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
name: mistral
- matches:
@@ -619,7 +619,7 @@ spec:
namespace: default
rules:
- backendRefs:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
name: vllm-llama3-8b-instruct
namespace: default
4 changes: 2 additions & 2 deletions examples/inference-pool/aigwroute.yaml
@@ -49,7 +49,7 @@ spec:
name: Authorization
value: sk-zyxwvutsrqponmlkjihgfedcba
backendRefs:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
name: vllm-llama3-8b-instruct
- matches:
@@ -58,7 +58,7 @@ spec:
name: x-ai-eg-model
value: mistral:latest
backendRefs:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
name: mistral
- matches:
141 changes: 55 additions & 86 deletions examples/inference-pool/base.yaml
@@ -49,31 +49,40 @@ spec:
initialDelaySeconds: 1
periodSeconds: 1
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
apiVersion: inference.networking.k8s.io/v1
kind: InferencePool
metadata:
name: mistral
namespace: default
spec:
targetPortNumber: 8080
targetPorts:
- number: 8080
selector:
app: mistral-upstream
extensionRef:
matchLabels:
app: mistral-upstream
endpointPickerRef:
name: mistral-epp
port:
number: 9002
---
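The interleaved lines above are the v1alpha2 spec being replaced by its v1 equivalent. Assembled for readability (indentation is assumed), the new InferencePool reads as follows; note the scalar targetPortNumber becoming a targetPorts list, the selector moving under matchLabels, and extensionRef becoming endpointPickerRef with an explicit port:

```yaml
apiVersion: inference.networking.k8s.io/v1
kind: InferencePool
metadata:
  name: mistral
  namespace: default
spec:
  targetPorts:            # replaces the scalar targetPortNumber
    - number: 8080
  selector:
    matchLabels:          # plain label map is now nested under matchLabels
      app: mistral-upstream
  endpointPickerRef:      # replaces extensionRef; the EPP port is explicit
    name: mistral-epp
    port:
      number: 9002
```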
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
kind: InferenceObjective
metadata:
name: mistral
namespace: default
spec:
modelName: mistral:latest
criticality: Critical
priority: 10
poolRef:
# Bind the InferenceModel to the InferencePool.
# Bind the InferenceObjective to the InferencePool.
name: mistral
---
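InferenceModel is likewise replaced by InferenceObjective, which stays in the alpha inference.networking.x-k8s.io/v1alpha2 group. The enum criticality: Critical gives way to a numeric priority, and modelName disappears from the spec — in this example, model selection is driven by the x-ai-eg-model header matches in the AIGatewayRoute instead. Assembled from the hunk above:

```yaml
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceObjective
metadata:
  name: mistral
  namespace: default
spec:
  priority: 10      # numeric priority replaces criticality: Critical
  poolRef:
    # Bind the InferenceObjective to the InferencePool.
    name: mistral
```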
apiVersion: v1
kind: ServiceAccount
metadata:
name: mistral-epp
namespace: default
---
apiVersion: v1
kind: Service
metadata:
name: mistral-epp
@@ -105,26 +114,27 @@ spec:
labels:
app: mistral-epp
spec:
serviceAccountName: mistral-epp
# Conservatively, this timeout should mirror the longest grace period of the pods within the pool
terminationGracePeriodSeconds: 130
containers:
- name: epp
image: registry.k8s.io/gateway-api-inference-extension/epp:v0.5.1
image: registry.k8s.io/gateway-api-inference-extension/epp:v1.0.1
imagePullPolicy: IfNotPresent
args:
- -poolName
- --pool-name
- "mistral"
- "-poolNamespace"
- "--pool-namespace"
- "default"
- -v
- --v
- "4"
- --zap-encoder
- "json"
- -grpcPort
- --grpc-port
- "9002"
- -grpcHealthPort
- --grpc-health-port
- "9003"
- "-configFile"
- "--config-file"
- "/config/default-plugins.yaml"
ports:
- containerPort: 9002
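Alongside the image bump from epp:v0.5.1 to epp:v1.0.1, every flag moves from single-dash camelCase to double-dash kebab-case. The mapping, collected from the pairs above (args list abbreviated for illustration):

```yaml
# Old (v0.5.1)         New (v1.0.1)
#   -poolName          --pool-name
#   -poolNamespace     --pool-namespace
#   -v                 --v
#   -grpcPort          --grpc-port
#   -grpcHealthPort    --grpc-health-port
#   -configFile        --config-file
# --zap-encoder is unchanged.
args: ["--pool-name", "mistral", "--pool-namespace", "default"]
```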
@@ -158,95 +168,54 @@ metadata:
namespace: default
data:
default-plugins.yaml: |
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
- type: low-queue-filter
parameters:
threshold: 128
- type: lora-affinity-filter
parameters:
threshold: 0.999
- type: least-queue-filter
- type: least-kv-cache-filter
- type: decision-tree-filter
name: low-latency-filter
parameters:
current:
pluginRef: low-queue-filter
nextOnSuccess:
decisionTree:
current:
pluginRef: lora-affinity-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: least-queue-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: least-kv-cache-filter
nextOnFailure:
decisionTree:
current:
pluginRef: least-queue-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: lora-affinity-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: least-kv-cache-filter
- type: random-picker
parameters:
maxNumOfEndpoints: 1
- type: single-profile-handler
schedulingProfiles:
- name: default
plugins:
- pluginRef: low-latency-filter
- pluginRef: random-picker
plugins-v2.yaml: |
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
- type: queue-scorer
- type: kv-cache-scorer
- type: kv-cache-utilization-scorer
- type: prefix-cache-scorer
parameters:
hashBlockSize: 64
maxPrefixBlocksToMatch: 256
lruCapacityPerServer: 31250
- type: max-score-picker
parameters:
maxNumOfEndpoints: 1
- type: single-profile-handler
schedulingProfiles:
- name: default
plugins:
- pluginRef: queue-scorer
weight: 1
- pluginRef: kv-cache-scorer
weight: 1
- pluginRef: kv-cache-utilization-scorer
- pluginRef: prefix-cache-scorer
weight: 1
- pluginRef: max-score-picker
---
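The v0.5.x decision-tree filter chain (low-queue-filter → lora-affinity-filter → least-queue-filter → least-kv-cache-filter, finished by a random-picker) is dropped in favor of the flat scorer configuration that previously appeared to live in plugins-v2.yaml. Roughly: queue-scorer favors endpoints with short request queues, kv-cache-utilization-scorer favors endpoints with free KV-cache, prefix-cache-scorer favors endpoints likely to already hold the prompt prefix, and max-score-picker selects the top-scoring endpoint. A sketch of the resulting config, assembled from the hunk above (explicit weight entries omitted, assuming the default equal weighting):

```yaml
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
  - type: queue-scorer
  - type: kv-cache-utilization-scorer
  - type: prefix-cache-scorer
    parameters:
      hashBlockSize: 64
      maxPrefixBlocksToMatch: 256
      lruCapacityPerServer: 31250
  - type: max-score-picker
    parameters:
      maxNumOfEndpoints: 1   # return a single endpoint per pick
  - type: single-profile-handler
schedulingProfiles:
  - name: default
    plugins:
      - pluginRef: queue-scorer
      - pluginRef: kv-cache-utilization-scorer
      - pluginRef: prefix-cache-scorer
      - pluginRef: max-score-picker
```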
kind: ClusterRole
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: pod-read
namespace: default
rules:
- apiGroups: ["inference.networking.x-k8s.io"]
resources: ["inferencepools"]
resources: ["inferenceobjectives", "inferencepools"]
verbs: ["get", "watch", "list"]
- apiGroups: ["inference.networking.x-k8s.io"]
resources: ["inferencemodels"]
- apiGroups: ["inference.networking.k8s.io"]
resources: ["inferencepools"]
verbs: ["get", "watch", "list"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "watch", "list"]
---
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: pod-read-binding
namespace: default
subjects:
- kind: ServiceAccount
name: mistral-epp
namespace: default
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: pod-read
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: auth-reviewer
rules:
- apiGroups:
- authentication.k8s.io
resources:
@@ -263,15 +232,15 @@
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: pod-read-binding
name: auth-reviewer-binding
subjects:
- kind: ServiceAccount
name: default
name: mistral-epp
namespace: default
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: pod-read
name: auth-reviewer
---
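The RBAC rework trims privileges: the old cluster-wide pod-read ClusterRole, bound to the default ServiceAccount, becomes a namespaced Role covering inferenceobjectives and inferencepools in the legacy x-k8s.io group, inferencepools in the new group, and pods — bound to the new dedicated mistral-epp ServiceAccount that the Deployment now runs under. Only auth-reviewer stays cluster-scoped, presumably so the EPP can validate callers via TokenReview/SubjectAccessReview; its full rule list is collapsed above. The wiring, summarized:

```yaml
# ServiceAccount mistral-epp (default namespace) is referenced by:
#   - Deployment mistral-epp                     via spec.template.spec.serviceAccountName
#   - RoleBinding pod-read-binding               -> Role pod-read (namespaced reads)
#   - ClusterRoleBinding auth-reviewer-binding   -> ClusterRole auth-reviewer
```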
apiVersion: aigateway.envoyproxy.io/v1alpha1
kind: AIServiceBackend