diff --git a/.github/workflows/kal.yml b/.github/workflows/kal.yml index 618256e88..17b6281f5 100644 --- a/.github/workflows/kal.yml +++ b/.github/workflows/kal.yml @@ -20,9 +20,5 @@ jobs: persist-credentials: false - name: Set up Go uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # tag=v5.5.0 - - name: Install Golang CI Lint - run: go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.2.1 - - name: Build KAL - run: golangci-lint custom - - name: run api linter - run: ./bin/golangci-kube-api-linter run -c ./.golangci-kal.yml ./... + - name: Run API Linter + run: make api-lint \ No newline at end of file diff --git a/.github/workflows/non-main-gatekeeper.yml b/.github/workflows/non-main-gatekeeper.yml new file mode 100644 index 000000000..af91ea966 --- /dev/null +++ b/.github/workflows/non-main-gatekeeper.yml @@ -0,0 +1,18 @@ +name: Label non-main PRs + +on: + pull_request: + types: [opened, edited, synchronize, reopened] + +jobs: + add-label: + runs-on: ubuntu-latest + steps: + - name: Add labels when base branch is not main + if: github.event.pull_request.base.ref != 'main' + uses: actions-ecosystem/action-add-labels@v1 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + labels: | + do-not-merge/hold + do-not-merge/cherry-pick-not-approved diff --git a/Makefile b/Makefile index 8113c33b2..e61866beb 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,8 @@ SHELL = /usr/bin/env bash -o pipefail GIT_COMMIT_SHA ?= "$(shell git rev-parse HEAD 2>/dev/null)" GIT_TAG ?= $(shell git describe --tags --dirty --always) -PLATFORMS ?= linux/amd64 +TARGETARCH ?= $(shell go env GOARCH) +PLATFORMS ?= linux/$(TARGETARCH) DOCKER_BUILDX_CMD ?= docker buildx IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build IMAGE_BUILD_EXTRA_OPTS ?= @@ -99,6 +100,7 @@ help: ## Display this help. .PHONY: generate generate: controller-gen code-generator ## Generate WebhookConfiguration, ClusterRole, CustomResourceDefinition objects, code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. $(CONTROLLER_GEN) object:headerFile="hack/boilerplate/boilerplate.generatego.txt" paths="./..." + $(CONTROLLER_GEN) crd output:dir="./config/crd/bases" paths="./..." ./hack/update-codegen.sh # Use same code-generator version as k8s.io/api @@ -133,12 +135,18 @@ vet: ## Run go vet against code. go vet ./... .PHONY: test -test: generate fmt vet envtest image-build verify-crds ## Run tests. +test: generate fmt vet envtest image-build verify-crds verify-helm-charts ## Run tests. CGO_ENABLED=1 KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e | grep -v /conformance) -race -coverprofile cover.out .PHONY: test-unit test-unit: ## Run unit tests. - CGO_ENABLED=1 KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./pkg/... -race -coverprofile cover.out + CGO_ENABLED=1 KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./pkg/... -race -coverprofile cover.out; \ + go tool cover -func=cover.out; \ + rm cover.out + +.PHONY: test-benchmark +test-benchmark: ## Run benchmarks. + CGO_ENABLED=1 KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./pkg/... -bench=. -benchmem; .PHONY: test-integration test-integration: envtest ## Run integration tests. 
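The new `test-benchmark` target picks up any standard Go benchmark under `./pkg/...` via `go test -bench=. -benchmem`. A minimal sketch of the kind of benchmark it would discover follows; the package, function, and endpoint-picking logic here are hypothetical illustrations, not part of this change:

```go
package scheduling_test

import "testing"

// BenchmarkPickEndpoint is an illustrative benchmark of the shape that
// `make test-benchmark` discovers and runs with -bench=. -benchmem.
// The endpoint slice and selection logic are stand-ins, not real code.
func BenchmarkPickEndpoint(b *testing.B) {
	endpoints := []string{"10.0.0.1:8000", "10.0.0.2:8000", "10.0.0.3:8000"}
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_ = endpoints[i%len(endpoints)] // stand-in for real endpoint-picking logic
	}
}
```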
@@ -160,14 +168,22 @@ lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes ci-lint: golangci-lint $(GOLANGCI_LINT) run --timeout 15m0s +.PHONY: api-lint +api-lint: golangci-api-lint + $(GOLANGCI_API_LINT) run -c .golangci-kal.yml --timeout 15m0s ./... + .PHONY: verify -verify: vet fmt-verify generate ci-lint verify-all +verify: vet fmt-verify generate ci-lint api-lint verify-all git --no-pager diff --exit-code config api client-go .PHONY: verify-crds verify-crds: kubectl-validate hack/verify-manifests.sh +.PHONY: verify-helm-charts +verify-helm-charts: helm-install + hack/verify-helm.sh + # Run static analysis. .PHONY: verify-all verify-all: @@ -297,13 +313,24 @@ live-docs: docker build -t gaie/mkdocs hack/mkdocs/image docker run --rm -it -p 3000:3000 -v ${PWD}:/docs gaie/mkdocs -.PHONY: apix-ref-docs -apix-ref-docs: crd-ref-docs +.PHONY: api-ref-docs-all +api-ref-docs-all: apix-v1a1-ref-docs apix-v1a2-ref-docs api-ref-docs + +.PHONY: apix-v1a1-ref-docs +apix-v1a1-ref-docs: crd-ref-docs + ${CRD_REF_DOCS} \ + --source-path=${PWD}/apix/v1alpha1 \ + --config=crd-ref-docs.yaml \ + --renderer=markdown \ + --output-path=${PWD}/site-src/reference/x-v1a1-spec.md + +.PHONY: apix-v1a2-ref-docs +apix-v1a2-ref-docs: crd-ref-docs ${CRD_REF_DOCS} \ --source-path=${PWD}/apix/v1alpha2 \ --config=crd-ref-docs.yaml \ --renderer=markdown \ - --output-path=${PWD}/site-src/reference/x-spec.md + --output-path=${PWD}/site-src/reference/x-v1a2-spec.md .PHONY: api-ref-docs api-ref-docs: crd-ref-docs @@ -329,11 +356,11 @@ uninstall: generate kustomize ## Uninstall CRDs from the K8s cluster specified i ##@ Helm .PHONY: inferencepool-helm-chart-push -inferencepool-helm-chart-push: yq helm +inferencepool-helm-chart-push: yq helm-install CHART=inferencepool EXTRA_TAG="$(EXTRA_TAG)" IMAGE_REGISTRY="$(IMAGE_REGISTRY)" YQ="$(YQ)" HELM="$(HELM)" ./hack/push-chart.sh .PHONY: bbr-helm-chart-push -bbr-helm-chart-push: yq helm +bbr-helm-chart-push: yq helm-install CHART=body-based-routing EXTRA_TAG="$(EXTRA_TAG)" IMAGE_REGISTRY="$(IMAGE_REGISTRY)" YQ="$(YQ)" HELM="$(HELM)" ./hack/push-chart.sh ##@ Release @@ -343,10 +370,12 @@ release-quickstart: ## Update the quickstart guide for a release. ./hack/release-quickstart.sh .PHONY: artifacts -artifacts: kustomize +artifacts: kustomize yq if [ -d artifacts ]; then rm -rf artifacts; fi mkdir -p artifacts $(KUSTOMIZE) build config/crd -o artifacts/manifests.yaml + $(YQ) -P 'select(.spec.versions[].name == "v1")' artifacts/manifests.yaml > artifacts/v1-manifests.yaml + $(YQ) -P 'select(.spec.versions[].name != "v1")' artifacts/manifests.yaml > artifacts/experimental-manifests.yaml @$(call clean-manifests) .PHONY: release @@ -366,6 +395,7 @@ CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen ENVTEST ?= $(LOCALBIN)/setup-envtest CRD_REF_DOCS ?= $(LOCALBIN)/crd-ref-docs GOLANGCI_LINT = $(LOCALBIN)/golangci-lint +GOLANGCI_API_LINT = $(LOCALBIN)/golangci-kube-api-linter HELM = $(PROJECT_DIR)/bin/helm YQ = $(PROJECT_DIR)/bin/yq KUBECTL_VALIDATE = $(PROJECT_DIR)/bin/kubectl-validate @@ -373,7 +403,7 @@ GCI = $(LOCALBIN)/gci ## Tool Versions KUSTOMIZE_VERSION ?= v5.4.3 -CONTROLLER_TOOLS_VERSION ?= v0.17.0 +CONTROLLER_TOOLS_VERSION ?= v0.19.0 ENVTEST_VERSION ?= release-0.19 CRD_REF_DOCS_VERSION ?= v0.2.0 GOLANGCI_LINT_VERSION ?= v2.3.0 @@ -407,12 +437,18 @@ golangci-lint: $(GOLANGCI_LINT) ## Download golangci-lint locally if necessary. 
$(GOLANGCI_LINT): $(LOCALBIN) $(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/v2/cmd/golangci-lint,$(GOLANGCI_LINT_VERSION)) +.PHONY: golangci-api-lint +golangci-api-lint: golangci-lint $(GOLANGCI_API_LINT) ## Download golangci-lint locally if necessary, then build KAL (kube-api-linter) +$(GOLANGCI_API_LINT): + $(GOLANGCI_LINT) custom + .PHONY: yq yq: ## Download yq locally if necessary. GOBIN=$(PROJECT_DIR)/bin GO111MODULE=on go install github.com/mikefarah/yq/v4@$(YQ_VERSION) -.PHONY: helm -helm: ## Download helm locally if necessary. +.PHONY: helm-install +helm-install: $(HELM) ## Download helm locally if necessary. +$(HELM): $(LOCALBIN) GOBIN=$(PROJECT_DIR)/bin GO111MODULE=on go install helm.sh/helm/v3/cmd/helm@$(HELM_VERSION) .PHONY: kubectl-validate diff --git a/README.md b/README.md index ff193dfb1..01d750285 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,6 @@ This is achieved by leveraging Envoy's [External Processing] (ext-proc) to exten [Inference Gateway]:#concepts-and-definitions - -> ***NOTE*** : As we prep for our `v1` release, some of our docs may fall out of scope, we are working hard to get these up to date and they will be ready by the time we launch `v1`. Thanks! - ## New! Inference Gateway has partnered with vLLM to accelerate LLM serving optimizations with [llm-d](https://llm-d.ai/blog/llm-d-announce)! @@ -32,7 +29,8 @@ The following specific terms to this project: performance, availability and capabilities to optimize routing. Includes things like [Prefix Cache] status or [LoRA Adapters] availability. - **Endpoint Picker(EPP)**: An implementation of an `Inference Scheduler` with additional Routing, Flow, and Request Control layers to allow for sophisticated routing strategies. Additional info on the architecture of the EPP [here](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/0683-epp-architecture-proposal). - +- **Body-Based Router (BBR)**: An optional, additional [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter) server that parses the HTTP body of an inference request and extracts information (currently the model name for OpenAI API-style messages) into a format the gateway can then use for routing; see the conceptual sketch under Contributing below. Additional info [here](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/bbr/README.md) and in the documentation [user guides](https://gateway-api-inference-extension.sigs.k8s.io/guides/). + The following are key industry terms that are important to understand for this project: @@ -114,7 +112,7 @@ Follow this [README](./test/e2e/epp/README.md) to learn more about running the i ## Contributing -Our community meeting is weekly at Thursday 10AM PDT ([Zoom](https://zoom.us/j/9955436256?pwd=Z2FQWU1jeDZkVC9RRTN4TlZyZTBHZz09), [Meeting Notes](https://www.google.com/url?q=https://docs.google.com/document/d/1frfPE5L1sI3737rdQV04IcDGeOcGJj2ItjMg6z2SRH0/edit?usp%3Dsharing&sa=D&source=calendar&usd=2&usg=AOvVaw1pUVy7UN_2PMj8qJJcFm1U)). +Our community meeting is held weekly on Thursdays at 10AM PDT ([Zoom](https://zoom.us/j/96271651417?pwd=NViXawg6lMsRjgXbu2YmW8DxWqbjta.1), [Meeting Notes](https://www.google.com/url?q=https://docs.google.com/document/d/1frfPE5L1sI3737rdQV04IcDGeOcGJj2ItjMg6z2SRH0/edit?usp%3Dsharing&sa=D&source=calendar&usd=2&usg=AOvVaw1pUVy7UN_2PMj8qJJcFm1U)).
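To make the Body-Based Router definition above concrete, here is a minimal conceptual sketch of its extraction step: pulling the model name out of an OpenAI API-style request body so the gateway can route on it. The function and field handling below are illustrative assumptions, not BBR's actual implementation; see the BBR README linked above for the real thing.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// extractModel pulls the "model" field from an OpenAI API-style request body.
// BBR exposes the extracted value to the gateway for routing; the exact
// mechanism (e.g. a header mutation via ext-proc) is described in its README.
func extractModel(body []byte) (string, error) {
	var req struct {
		Model string `json:"model"`
	}
	if err := json.Unmarshal(body, &req); err != nil {
		return "", err
	}
	return req.Model, nil
}

func main() {
	model, _ := extractModel([]byte(`{"model": "llama-3-8b", "messages": []}`))
	fmt.Println(model) // Output: llama-3-8b
}
```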
We currently utilize the [#gateway-api-inference-extension](https://kubernetes.slack.com/?redir=%2Fmessages%2Fgateway-api-inference-extension) channel in Kubernetes Slack workspace for communications. diff --git a/api/v1/inferencepool_types.go b/api/v1/inferencepool_types.go index 2c7b52705..92e7aff14 100644 --- a/api/v1/inferencepool_types.go +++ b/api/v1/inferencepool_types.go @@ -70,10 +70,12 @@ type InferencePoolSpec struct { Selector LabelSelector `json:"selector,omitzero"` // TargetPorts defines a list of ports that are exposed by this InferencePool. - // Currently, the list may only include a single port definition. + // Every port will be treated as a distinct endpoint by EPP, + // addressable as a 'podIP:portNumber' combination. // // +kubebuilder:validation:MinItems=1 - // +kubebuilder:validation:MaxItems=1 + // +kubebuilder:validation:MaxItems=8 + // +kubebuilder:validation:XValidation:message="port number must be unique",rule="self.all(p1, self.exists_one(p2, p1.number==p2.number))" // +listType=atomic // +required TargetPorts []Port `json:"targetPorts,omitempty"` @@ -200,8 +202,42 @@ type ParentStatus struct { // // +required ParentRef ParentReference `json:"parentRef,omitzero"` + + // ControllerName is a domain/path string that indicates the name of the controller that + // wrote this status. This corresponds with the GatewayClass controllerName field when the + // parentRef references a Gateway kind. + // + // Example: "example.net/gateway-controller". + // + // The format of this field is DOMAIN "/" PATH, where DOMAIN and PATH are valid Kubernetes names: + // + // https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + // + // Controllers MAY populate this field when writing status. When populating this field, controllers + // should ensure that entries in status populated with their ControllerName are cleaned up when they + // are no longer necessary. + // + // +optional + ControllerName ControllerName `json:"controllerName,omitempty"` } +// ControllerName is the name of a controller that manages ParentStatus. It must be a domain-prefixed +// path. +// +// Valid values include: +// +// * "example.com/bar" +// +// Invalid values include: +// +// * "example.com" - must include path +// * "foo.example.com" - must include path +// +// +kubebuilder:validation:MinLength=1 +// +kubebuilder:validation:MaxLength=253 +// +kubebuilder:validation:Pattern=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$` +type ControllerName string + // InferencePoolConditionType is a type of status condition for the InferencePool. type InferencePoolConditionType string @@ -272,6 +308,38 @@ const ( InferencePoolReasonInvalidExtensionRef InferencePoolReason = "InvalidExtensionRef" ) +const ( + // InferencePoolConditionExported is a type of condition that indicates whether the + // controller was able to export the InferencePool to the specified clusters. + // + // Possible reasons for this condition to be True are: + // + // * "Exported" + // + // Possible reasons for this condition to be False are: + // + // * "NotRequested" + // * "NotSupported" + // + // Controllers MAY raise this condition with other reasons, but should + // prefer to use the reasons listed above to improve interoperability. + InferencePoolConditionExported InferencePoolConditionType = "Exported" + + // InferencePoolReasonExported is a reason used with the "Exported" condition when the + // condition is true.
+ InferencePoolReasonExported InferencePoolReason = "Exported" + + // InferencePoolReasonNotRequested is a reason used with the "Exported" condition when the + // condition is false and no export was requested by the InferencePool. This indicates a + // deliberate non-action rather than an error. + InferencePoolReasonNotRequested InferencePoolReason = "NotRequested" + + // InferencePoolReasonNotSupported is a reason used with the "Exported" condition when the + // condition is false and the export was requested but is not supported by the implementation. + // Controllers should include details in the condition message. + InferencePoolReasonNotSupported InferencePoolReason = "NotSupported" +) + // ParentReference identifies an API object. It is used to associate the InferencePool with a // parent resource, such as a Gateway. type ParentReference struct { diff --git a/api/v1/shared_types.go b/api/v1/shared_types.go index bc315fd4f..5d5d3e763 100644 --- a/api/v1/shared_types.go +++ b/api/v1/shared_types.go @@ -137,7 +137,7 @@ type LabelSelector struct { // The matching logic is an AND operation on all entries. // // +required - // +kubebuilder:validation:MinItems=1 - // +kubebuilder:validation:MaxItems=64 + // +kubebuilder:validation:MinProperties=1 + // +kubebuilder:validation:MaxProperties=64 MatchLabels map[LabelKey]LabelValue `json:"matchLabels,omitempty"` } diff --git a/apix/v1alpha1/doc.go b/apix/v1alpha1/doc.go new file mode 100644 index 000000000..122c3b952 --- /dev/null +++ b/apix/v1alpha1/doc.go @@ -0,0 +1,22 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1alpha1 contains API Schema definitions for the +// inference.networking.x-k8s.io API group. +// +// +kubebuilder:object:generate=true +// +groupName=inference.networking.x-k8s.io +package v1alpha1 diff --git a/apix/v1alpha1/inferencepoolimport_types.go b/apix/v1alpha1/inferencepoolimport_types.go new file mode 100644 index 000000000..2239aa3d0 --- /dev/null +++ b/apix/v1alpha1/inferencepoolimport_types.go @@ -0,0 +1,143 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + v1 "sigs.k8s.io/gateway-api-inference-extension/api/v1" +) + +// InferencePoolImport is the Schema for the InferencePoolImports API. 
+// +// +kubebuilder:object:root=true +// +kubebuilder:resource:shortName=infpimp +// +kubebuilder:subresource:status +// +kubebuilder:storageversion +// +genclient +type InferencePoolImport struct { + metav1.TypeMeta `json:",inline"` + + // +optional + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Status defines the observed state of the InferencePoolImport. + // + // +optional + //nolint:kubeapilinter // status should not be a pointer. + Status InferencePoolImportStatus `json:"status,omitempty"` +} + +// InferencePoolImportList contains a list of InferencePoolImports. +// +// +kubebuilder:object:root=true +type InferencePoolImportList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []InferencePoolImport `json:"items"` +} + +// InferencePoolImportStatus defines the observed state of the InferencePoolImport. +type InferencePoolImportStatus struct { + // Controllers is a list of controllers that are responsible for managing the InferencePoolImport. + // + // +listType=map + // +listMapKey=name + // +kubebuilder:validation:MaxItems=8 + // +kubebuilder:validation:Required + Controllers []ImportController `json:"controllers"` +} + +// ImportController defines a controller that is responsible for managing the InferencePoolImport. +type ImportController struct { + // Name is a domain/path string that indicates the name of the controller that manages the + // InferencePoolImport. Name corresponds to the GatewayClass controllerName field when the + // controller will manage parents of type "Gateway". Otherwise, the name is implementation-specific. + // + // Example: "example.net/import-controller". + // + // The format of this field is DOMAIN "/" PATH, where DOMAIN and PATH are valid Kubernetes + // names (https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names). + // + // A controller MUST populate this field when writing status and ensure that entries in status + // populated with their controller name are removed when they are no longer necessary. + // + // +required + Name ControllerName `json:"name"` + + // ExportingClusters is a list of clusters that exported the InferencePool(s) that back the + // InferencePoolImport. Required when the controller is responsible for creating, updating, and + // deleting the InferencePoolImport based on the exported InferencePool(s). + // + // +optional + ExportingClusters []ExportingCluster `json:"exportingClusters,omitempty"` + + // Parents is a list of parent resources, typically Gateways, that are associated with the + // InferencePoolImport, and the status of the InferencePoolImport with respect to each parent. + // + // Ancestor would be a more accurate name, but Parent is consistent with InferencePool terminology. + // + // Required when the controller manages the InferencePoolImport as an HTTPRoute backendRef. The controller + // must add an entry for each parent it manages and remove the parent entry when the controller no longer + // considers the InferencePoolImport to be associated with that parent. + // + // +optional + // +listType=atomic + Parents []v1.ParentStatus `json:"parents,omitempty"` + + // Conditions track the state of the InferencePoolImport. + // + // Known condition types are: + // + // * "Accepted" + // + // +optional + // +listType=map + // +listMapKey=type + // +kubebuilder:validation:MaxItems=8 + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// ControllerName is the name of a controller that manages a resource. It must be a domain-prefixed path.
+// +// Valid values include: +// +// - "example.com/bar" +// +// Invalid values include: +// +// - "example.com" - must include path +// - "foo.example.com" - must include path +// +// +kubebuilder:validation:MinLength=1 +// +kubebuilder:validation:MaxLength=253 +// +kubebuilder:validation:Pattern=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$` +type ControllerName string + +// ClusterName is the name of a cluster that exported the InferencePool. +// +// +kubebuilder:validation:MinLength=1 +// +kubebuilder:validation:MaxLength=253 +type ClusterName string + +// ExportingCluster defines a cluster that exported the InferencePool that backs this InferencePoolImport. +type ExportingCluster struct { + // Name of the exporting cluster (must be unique within the list). + // + // +kubebuilder:validation:Required + Name ClusterName `json:"name"` +} diff --git a/apix/v1alpha1/shared_types.go b/apix/v1alpha1/shared_types.go new file mode 100644 index 000000000..56fd71d33 --- /dev/null +++ b/apix/v1alpha1/shared_types.go @@ -0,0 +1,24 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +// ExportAnnotationKey is the annotation key used to export an InferencePool. +var ExportAnnotationKey = "inference.networking.x-k8s.io/export" + +// ExportAnnotationVal is the annotation value used to export an InferencePool +// to all clusters. +var ExportAnnotationVal = "ClusterSet" diff --git a/apix/v1alpha1/zz_generated.deepcopy.go b/apix/v1alpha1/zz_generated.deepcopy.go new file mode 100644 index 000000000..f6f182616 --- /dev/null +++ b/apix/v1alpha1/zz_generated.deepcopy.go @@ -0,0 +1,156 @@ +//go:build !ignore_autogenerated + +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/gateway-api-inference-extension/api/v1" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ExportingCluster) DeepCopyInto(out *ExportingCluster) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExportingCluster. 
+func (in *ExportingCluster) DeepCopy() *ExportingCluster { + if in == nil { + return nil + } + out := new(ExportingCluster) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ImportController) DeepCopyInto(out *ImportController) { + *out = *in + if in.ExportingClusters != nil { + in, out := &in.ExportingClusters, &out.ExportingClusters + *out = make([]ExportingCluster, len(*in)) + copy(*out, *in) + } + if in.Parents != nil { + in, out := &in.Parents, &out.Parents + *out = make([]v1.ParentStatus, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ImportController. +func (in *ImportController) DeepCopy() *ImportController { + if in == nil { + return nil + } + out := new(ImportController) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InferencePoolImport) DeepCopyInto(out *InferencePoolImport) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferencePoolImport. +func (in *InferencePoolImport) DeepCopy() *InferencePoolImport { + if in == nil { + return nil + } + out := new(InferencePoolImport) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *InferencePoolImport) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InferencePoolImportList) DeepCopyInto(out *InferencePoolImportList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]InferencePoolImport, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferencePoolImportList. +func (in *InferencePoolImportList) DeepCopy() *InferencePoolImportList { + if in == nil { + return nil + } + out := new(InferencePoolImportList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *InferencePoolImportList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InferencePoolImportStatus) DeepCopyInto(out *InferencePoolImportStatus) { + *out = *in + if in.Controllers != nil { + in, out := &in.Controllers, &out.Controllers + *out = make([]ImportController, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferencePoolImportStatus. 
+func (in *InferencePoolImportStatus) DeepCopy() *InferencePoolImportStatus { + if in == nil { + return nil + } + out := new(InferencePoolImportStatus) + in.DeepCopyInto(out) + return out +} diff --git a/apix/v1alpha1/zz_generated.register.go b/apix/v1alpha1/zz_generated.register.go new file mode 100644 index 000000000..4894b76aa --- /dev/null +++ b/apix/v1alpha1/zz_generated.register.go @@ -0,0 +1,70 @@ +//go:build !ignore_autogenerated +// +build !ignore_autogenerated + +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by register-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + schema "k8s.io/apimachinery/pkg/runtime/schema" +) + +// GroupName specifies the group name used to register the objects. +const GroupName = "inference.networking.x-k8s.io" + +// GroupVersion specifies the group and the version used to register the objects. +var GroupVersion = v1.GroupVersion{Group: GroupName, Version: "v1alpha1"} + +// SchemeGroupVersion is group version used to register these objects +// Deprecated: use GroupVersion instead. +var SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: "v1alpha1"} + +// Resource takes an unqualified resource and returns a Group qualified GroupResource +func Resource(resource string) schema.GroupResource { + return SchemeGroupVersion.WithResource(resource).GroupResource() +} + +var ( + // localSchemeBuilder and AddToScheme will stay in k8s.io/kubernetes. + SchemeBuilder runtime.SchemeBuilder + localSchemeBuilder = &SchemeBuilder + // Deprecated: use Install instead + AddToScheme = localSchemeBuilder.AddToScheme + Install = localSchemeBuilder.AddToScheme +) + +func init() { + // We only register manually written functions here. The registration of the + // generated functions takes place in the generated files. The separation + // makes the code compile even when the generated files are missing. + localSchemeBuilder.Register(addKnownTypes) +} + +// Adds the list of known types to Scheme. +func addKnownTypes(scheme *runtime.Scheme) error { + scheme.AddKnownTypes(SchemeGroupVersion, + &InferencePoolImport{}, + &InferencePoolImportList{}, + ) + // AddToGroupVersion allows the serialization of client types like ListOptions. 
+ v1.AddToGroupVersion(scheme, SchemeGroupVersion) + return nil +} diff --git a/bbr.Dockerfile b/bbr.Dockerfile index 36ae378cc..1c294c4c2 100644 --- a/bbr.Dockerfile +++ b/bbr.Dockerfile @@ -18,6 +18,7 @@ RUN go mod download COPY cmd/bbr ./cmd COPY pkg ./pkg COPY internal ./internal +COPY api ./api WORKDIR /src/cmd RUN go build -o /bbr diff --git a/client-go/applyconfiguration/api/v1/parentstatus.go b/client-go/applyconfiguration/api/v1/parentstatus.go index 5accb0f45..929df5815 100644 --- a/client-go/applyconfiguration/api/v1/parentstatus.go +++ b/client-go/applyconfiguration/api/v1/parentstatus.go @@ -20,13 +20,15 @@ package v1 import ( metav1 "k8s.io/client-go/applyconfigurations/meta/v1" + apiv1 "sigs.k8s.io/gateway-api-inference-extension/api/v1" ) // ParentStatusApplyConfiguration represents a declarative configuration of the ParentStatus type for use // with apply. type ParentStatusApplyConfiguration struct { - Conditions []metav1.ConditionApplyConfiguration `json:"conditions,omitempty"` - ParentRef *ParentReferenceApplyConfiguration `json:"parentRef,omitempty"` + Conditions []metav1.ConditionApplyConfiguration `json:"conditions,omitempty"` + ParentRef *ParentReferenceApplyConfiguration `json:"parentRef,omitempty"` + ControllerName *apiv1.ControllerName `json:"controllerName,omitempty"` } // ParentStatusApplyConfiguration constructs a declarative configuration of the ParentStatus type for use with @@ -55,3 +57,11 @@ func (b *ParentStatusApplyConfiguration) WithParentRef(value *ParentReferenceApp b.ParentRef = value return b } + +// WithControllerName sets the ControllerName field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the ControllerName field is set to the value of the last call. +func (b *ParentStatusApplyConfiguration) WithControllerName(value apiv1.ControllerName) *ParentStatusApplyConfiguration { + b.ControllerName = &value + return b +} diff --git a/client-go/applyconfiguration/apix/v1alpha1/exportingcluster.go b/client-go/applyconfiguration/apix/v1alpha1/exportingcluster.go new file mode 100644 index 000000000..d1b2db058 --- /dev/null +++ b/client-go/applyconfiguration/apix/v1alpha1/exportingcluster.go @@ -0,0 +1,43 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + apixv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha1" +) + +// ExportingClusterApplyConfiguration represents a declarative configuration of the ExportingCluster type for use +// with apply. +type ExportingClusterApplyConfiguration struct { + Name *apixv1alpha1.ClusterName `json:"name,omitempty"` +} + +// ExportingClusterApplyConfiguration constructs a declarative configuration of the ExportingCluster type for use with +// apply. 
+func ExportingCluster() *ExportingClusterApplyConfiguration { + return &ExportingClusterApplyConfiguration{} +} + +// WithName sets the Name field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Name field is set to the value of the last call. +func (b *ExportingClusterApplyConfiguration) WithName(value apixv1alpha1.ClusterName) *ExportingClusterApplyConfiguration { + b.Name = &value + return b +} diff --git a/client-go/applyconfiguration/apix/v1alpha1/importcontroller.go b/client-go/applyconfiguration/apix/v1alpha1/importcontroller.go new file mode 100644 index 000000000..1c30787ed --- /dev/null +++ b/client-go/applyconfiguration/apix/v1alpha1/importcontroller.go @@ -0,0 +1,87 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + metav1 "k8s.io/client-go/applyconfigurations/meta/v1" + apixv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha1" + v1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1" +) + +// ImportControllerApplyConfiguration represents a declarative configuration of the ImportController type for use +// with apply. +type ImportControllerApplyConfiguration struct { + Name *apixv1alpha1.ControllerName `json:"name,omitempty"` + ExportingClusters []ExportingClusterApplyConfiguration `json:"exportingClusters,omitempty"` + Parents []v1.ParentStatusApplyConfiguration `json:"parents,omitempty"` + Conditions []metav1.ConditionApplyConfiguration `json:"conditions,omitempty"` +} + +// ImportControllerApplyConfiguration constructs a declarative configuration of the ImportController type for use with +// apply. +func ImportController() *ImportControllerApplyConfiguration { + return &ImportControllerApplyConfiguration{} +} + +// WithName sets the Name field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Name field is set to the value of the last call. +func (b *ImportControllerApplyConfiguration) WithName(value apixv1alpha1.ControllerName) *ImportControllerApplyConfiguration { + b.Name = &value + return b +} + +// WithExportingClusters adds the given value to the ExportingClusters field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the ExportingClusters field. 
+func (b *ImportControllerApplyConfiguration) WithExportingClusters(values ...*ExportingClusterApplyConfiguration) *ImportControllerApplyConfiguration { + for i := range values { + if values[i] == nil { + panic("nil value passed to WithExportingClusters") + } + b.ExportingClusters = append(b.ExportingClusters, *values[i]) + } + return b +} + +// WithParents adds the given value to the Parents field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the Parents field. +func (b *ImportControllerApplyConfiguration) WithParents(values ...*v1.ParentStatusApplyConfiguration) *ImportControllerApplyConfiguration { + for i := range values { + if values[i] == nil { + panic("nil value passed to WithParents") + } + b.Parents = append(b.Parents, *values[i]) + } + return b +} + +// WithConditions adds the given value to the Conditions field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the Conditions field. +func (b *ImportControllerApplyConfiguration) WithConditions(values ...*metav1.ConditionApplyConfiguration) *ImportControllerApplyConfiguration { + for i := range values { + if values[i] == nil { + panic("nil value passed to WithConditions") + } + b.Conditions = append(b.Conditions, *values[i]) + } + return b +} diff --git a/client-go/applyconfiguration/apix/v1alpha1/inferencepoolimport.go b/client-go/applyconfiguration/apix/v1alpha1/inferencepoolimport.go new file mode 100644 index 000000000..f29c03845 --- /dev/null +++ b/client-go/applyconfiguration/apix/v1alpha1/inferencepoolimport.go @@ -0,0 +1,233 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + v1 "k8s.io/client-go/applyconfigurations/meta/v1" +) + +// InferencePoolImportApplyConfiguration represents a declarative configuration of the InferencePoolImport type for use +// with apply. +type InferencePoolImportApplyConfiguration struct { + v1.TypeMetaApplyConfiguration `json:",inline"` + *v1.ObjectMetaApplyConfiguration `json:"metadata,omitempty"` + Status *InferencePoolImportStatusApplyConfiguration `json:"status,omitempty"` +} + +// InferencePoolImport constructs a declarative configuration of the InferencePoolImport type for use with +// apply. 
+func InferencePoolImport(name, namespace string) *InferencePoolImportApplyConfiguration { + b := &InferencePoolImportApplyConfiguration{} + b.WithName(name) + b.WithNamespace(namespace) + b.WithKind("InferencePoolImport") + b.WithAPIVersion("inference.networking.x-k8s.io/v1alpha1") + return b +} +func (b InferencePoolImportApplyConfiguration) IsApplyConfiguration() {} + +// WithKind sets the Kind field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Kind field is set to the value of the last call. +func (b *InferencePoolImportApplyConfiguration) WithKind(value string) *InferencePoolImportApplyConfiguration { + b.TypeMetaApplyConfiguration.Kind = &value + return b +} + +// WithAPIVersion sets the APIVersion field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the APIVersion field is set to the value of the last call. +func (b *InferencePoolImportApplyConfiguration) WithAPIVersion(value string) *InferencePoolImportApplyConfiguration { + b.TypeMetaApplyConfiguration.APIVersion = &value + return b +} + +// WithName sets the Name field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Name field is set to the value of the last call. +func (b *InferencePoolImportApplyConfiguration) WithName(value string) *InferencePoolImportApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.Name = &value + return b +} + +// WithGenerateName sets the GenerateName field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the GenerateName field is set to the value of the last call. +func (b *InferencePoolImportApplyConfiguration) WithGenerateName(value string) *InferencePoolImportApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.GenerateName = &value + return b +} + +// WithNamespace sets the Namespace field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Namespace field is set to the value of the last call. +func (b *InferencePoolImportApplyConfiguration) WithNamespace(value string) *InferencePoolImportApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.Namespace = &value + return b +} + +// WithUID sets the UID field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the UID field is set to the value of the last call. +func (b *InferencePoolImportApplyConfiguration) WithUID(value types.UID) *InferencePoolImportApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.UID = &value + return b +} + +// WithResourceVersion sets the ResourceVersion field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. 
+// If called multiple times, the ResourceVersion field is set to the value of the last call. +func (b *InferencePoolImportApplyConfiguration) WithResourceVersion(value string) *InferencePoolImportApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.ResourceVersion = &value + return b +} + +// WithGeneration sets the Generation field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Generation field is set to the value of the last call. +func (b *InferencePoolImportApplyConfiguration) WithGeneration(value int64) *InferencePoolImportApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.Generation = &value + return b +} + +// WithCreationTimestamp sets the CreationTimestamp field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the CreationTimestamp field is set to the value of the last call. +func (b *InferencePoolImportApplyConfiguration) WithCreationTimestamp(value metav1.Time) *InferencePoolImportApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.CreationTimestamp = &value + return b +} + +// WithDeletionTimestamp sets the DeletionTimestamp field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the DeletionTimestamp field is set to the value of the last call. +func (b *InferencePoolImportApplyConfiguration) WithDeletionTimestamp(value metav1.Time) *InferencePoolImportApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.DeletionTimestamp = &value + return b +} + +// WithDeletionGracePeriodSeconds sets the DeletionGracePeriodSeconds field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the DeletionGracePeriodSeconds field is set to the value of the last call. +func (b *InferencePoolImportApplyConfiguration) WithDeletionGracePeriodSeconds(value int64) *InferencePoolImportApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.DeletionGracePeriodSeconds = &value + return b +} + +// WithLabels puts the entries into the Labels field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, the entries provided by each call will be put on the Labels field, +// overwriting an existing map entries in Labels field with the same key. 
+func (b *InferencePoolImportApplyConfiguration) WithLabels(entries map[string]string) *InferencePoolImportApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + if b.ObjectMetaApplyConfiguration.Labels == nil && len(entries) > 0 { + b.ObjectMetaApplyConfiguration.Labels = make(map[string]string, len(entries)) + } + for k, v := range entries { + b.ObjectMetaApplyConfiguration.Labels[k] = v + } + return b +} + +// WithAnnotations puts the entries into the Annotations field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, the entries provided by each call will be put on the Annotations field, +// overwriting an existing map entries in Annotations field with the same key. +func (b *InferencePoolImportApplyConfiguration) WithAnnotations(entries map[string]string) *InferencePoolImportApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + if b.ObjectMetaApplyConfiguration.Annotations == nil && len(entries) > 0 { + b.ObjectMetaApplyConfiguration.Annotations = make(map[string]string, len(entries)) + } + for k, v := range entries { + b.ObjectMetaApplyConfiguration.Annotations[k] = v + } + return b +} + +// WithOwnerReferences adds the given value to the OwnerReferences field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the OwnerReferences field. +func (b *InferencePoolImportApplyConfiguration) WithOwnerReferences(values ...*v1.OwnerReferenceApplyConfiguration) *InferencePoolImportApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + for i := range values { + if values[i] == nil { + panic("nil value passed to WithOwnerReferences") + } + b.ObjectMetaApplyConfiguration.OwnerReferences = append(b.ObjectMetaApplyConfiguration.OwnerReferences, *values[i]) + } + return b +} + +// WithFinalizers adds the given value to the Finalizers field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the Finalizers field. +func (b *InferencePoolImportApplyConfiguration) WithFinalizers(values ...string) *InferencePoolImportApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + for i := range values { + b.ObjectMetaApplyConfiguration.Finalizers = append(b.ObjectMetaApplyConfiguration.Finalizers, values[i]) + } + return b +} + +func (b *InferencePoolImportApplyConfiguration) ensureObjectMetaApplyConfigurationExists() { + if b.ObjectMetaApplyConfiguration == nil { + b.ObjectMetaApplyConfiguration = &v1.ObjectMetaApplyConfiguration{} + } +} + +// WithStatus sets the Status field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Status field is set to the value of the last call. +func (b *InferencePoolImportApplyConfiguration) WithStatus(value *InferencePoolImportStatusApplyConfiguration) *InferencePoolImportApplyConfiguration { + b.Status = value + return b +} + +// GetKind retrieves the value of the Kind field in the declarative configuration. 
+func (b *InferencePoolImportApplyConfiguration) GetKind() *string { + return b.TypeMetaApplyConfiguration.Kind +} + +// GetAPIVersion retrieves the value of the APIVersion field in the declarative configuration. +func (b *InferencePoolImportApplyConfiguration) GetAPIVersion() *string { + return b.TypeMetaApplyConfiguration.APIVersion +} + +// GetName retrieves the value of the Name field in the declarative configuration. +func (b *InferencePoolImportApplyConfiguration) GetName() *string { + b.ensureObjectMetaApplyConfigurationExists() + return b.ObjectMetaApplyConfiguration.Name +} + +// GetNamespace retrieves the value of the Namespace field in the declarative configuration. +func (b *InferencePoolImportApplyConfiguration) GetNamespace() *string { + b.ensureObjectMetaApplyConfigurationExists() + return b.ObjectMetaApplyConfiguration.Namespace +} diff --git a/client-go/applyconfiguration/apix/v1alpha1/inferencepoolimportstatus.go b/client-go/applyconfiguration/apix/v1alpha1/inferencepoolimportstatus.go new file mode 100644 index 000000000..9c2141481 --- /dev/null +++ b/client-go/applyconfiguration/apix/v1alpha1/inferencepoolimportstatus.go @@ -0,0 +1,44 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha1 + +// InferencePoolImportStatusApplyConfiguration represents a declarative configuration of the InferencePoolImportStatus type for use +// with apply. +type InferencePoolImportStatusApplyConfiguration struct { + Controllers []ImportControllerApplyConfiguration `json:"controllers,omitempty"` +} + +// InferencePoolImportStatusApplyConfiguration constructs a declarative configuration of the InferencePoolImportStatus type for use with +// apply. +func InferencePoolImportStatus() *InferencePoolImportStatusApplyConfiguration { + return &InferencePoolImportStatusApplyConfiguration{} +} + +// WithControllers adds the given value to the Controllers field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the Controllers field. 
+func (b *InferencePoolImportStatusApplyConfiguration) WithControllers(values ...*ImportControllerApplyConfiguration) *InferencePoolImportStatusApplyConfiguration { + for i := range values { + if values[i] == nil { + panic("nil value passed to WithControllers") + } + b.Controllers = append(b.Controllers, *values[i]) + } + return b +} diff --git a/client-go/applyconfiguration/utils.go b/client-go/applyconfiguration/utils.go index 8e11c0082..7e4ea0915 100644 --- a/client-go/applyconfiguration/utils.go +++ b/client-go/applyconfiguration/utils.go @@ -23,8 +23,10 @@ import ( schema "k8s.io/apimachinery/pkg/runtime/schema" managedfields "k8s.io/apimachinery/pkg/util/managedfields" v1 "sigs.k8s.io/gateway-api-inference-extension/api/v1" + v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha1" v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2" apiv1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1" + apixv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/apix/v1alpha1" apixv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/apix/v1alpha2" internal "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/internal" ) @@ -51,6 +53,16 @@ func ForKind(kind schema.GroupVersionKind) interface{} { case v1.SchemeGroupVersion.WithKind("Port"): return &apiv1.PortApplyConfiguration{} + // Group=inference.networking.x-k8s.io, Version=v1alpha1 + case v1alpha1.SchemeGroupVersion.WithKind("ExportingCluster"): + return &apixv1alpha1.ExportingClusterApplyConfiguration{} + case v1alpha1.SchemeGroupVersion.WithKind("ImportController"): + return &apixv1alpha1.ImportControllerApplyConfiguration{} + case v1alpha1.SchemeGroupVersion.WithKind("InferencePoolImport"): + return &apixv1alpha1.InferencePoolImportApplyConfiguration{} + case v1alpha1.SchemeGroupVersion.WithKind("InferencePoolImportStatus"): + return &apixv1alpha1.InferencePoolImportStatusApplyConfiguration{} + // Group=inference.networking.x-k8s.io, Version=v1alpha2 case v1alpha2.SchemeGroupVersion.WithKind("Extension"): return &apixv1alpha2.ExtensionApplyConfiguration{} diff --git a/client-go/clientset/versioned/clientset.go b/client-go/clientset/versioned/clientset.go index 928ab89f1..0ea4a00a2 100644 --- a/client-go/clientset/versioned/clientset.go +++ b/client-go/clientset/versioned/clientset.go @@ -26,12 +26,14 @@ import ( rest "k8s.io/client-go/rest" flowcontrol "k8s.io/client-go/util/flowcontrol" inferencev1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1" + inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/apix/v1alpha1" xinferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/apix/v1alpha2" ) type Interface interface { Discovery() discovery.DiscoveryInterface InferenceV1() inferencev1.InferenceV1Interface + InferenceV1alpha1() inferencev1alpha1.InferenceV1alpha1Interface XInferenceV1alpha2() xinferencev1alpha2.XInferenceV1alpha2Interface } @@ -39,6 +41,7 @@ type Interface interface { type Clientset struct { *discovery.DiscoveryClient inferenceV1 *inferencev1.InferenceV1Client + inferenceV1alpha1 *inferencev1alpha1.InferenceV1alpha1Client xInferenceV1alpha2 *xinferencev1alpha2.XInferenceV1alpha2Client } @@ -47,6 +50,11 @@ func (c *Clientset) InferenceV1() inferencev1.InferenceV1Interface { return c.inferenceV1 } +// InferenceV1alpha1 retrieves the InferenceV1alpha1Client +func (c 
*Clientset) InferenceV1alpha1() inferencev1alpha1.InferenceV1alpha1Interface { + return c.inferenceV1alpha1 +} + // XInferenceV1alpha2 retrieves the XInferenceV1alpha2Client func (c *Clientset) XInferenceV1alpha2() xinferencev1alpha2.XInferenceV1alpha2Interface { return c.xInferenceV1alpha2 @@ -100,6 +108,10 @@ func NewForConfigAndClient(c *rest.Config, httpClient *http.Client) (*Clientset, if err != nil { return nil, err } + cs.inferenceV1alpha1, err = inferencev1alpha1.NewForConfigAndClient(&configShallowCopy, httpClient) + if err != nil { + return nil, err + } cs.xInferenceV1alpha2, err = xinferencev1alpha2.NewForConfigAndClient(&configShallowCopy, httpClient) if err != nil { return nil, err @@ -126,6 +138,7 @@ func NewForConfigOrDie(c *rest.Config) *Clientset { func New(c rest.Interface) *Clientset { var cs Clientset cs.inferenceV1 = inferencev1.New(c) + cs.inferenceV1alpha1 = inferencev1alpha1.New(c) cs.xInferenceV1alpha2 = xinferencev1alpha2.New(c) cs.DiscoveryClient = discovery.NewDiscoveryClient(c) diff --git a/client-go/clientset/versioned/fake/clientset_generated.go b/client-go/clientset/versioned/fake/clientset_generated.go index a0709d723..baa42d884 100644 --- a/client-go/clientset/versioned/fake/clientset_generated.go +++ b/client-go/clientset/versioned/fake/clientset_generated.go @@ -29,6 +29,8 @@ import ( clientset "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" inferencev1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1" fakeinferencev1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1/fake" + inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/apix/v1alpha1" + fakeinferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/apix/v1alpha1/fake" xinferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/apix/v1alpha2" fakexinferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/apix/v1alpha2/fake" ) @@ -132,6 +134,11 @@ func (c *Clientset) InferenceV1() inferencev1.InferenceV1Interface { return &fakeinferencev1.FakeInferenceV1{Fake: &c.Fake} } +// InferenceV1alpha1 retrieves the InferenceV1alpha1Client +func (c *Clientset) InferenceV1alpha1() inferencev1alpha1.InferenceV1alpha1Interface { + return &fakeinferencev1alpha1.FakeInferenceV1alpha1{Fake: &c.Fake} +} + // XInferenceV1alpha2 retrieves the XInferenceV1alpha2Client func (c *Clientset) XInferenceV1alpha2() xinferencev1alpha2.XInferenceV1alpha2Interface { return &fakexinferencev1alpha2.FakeXInferenceV1alpha2{Fake: &c.Fake} diff --git a/client-go/clientset/versioned/fake/register.go b/client-go/clientset/versioned/fake/register.go index 5c6d338ce..412308d9e 100644 --- a/client-go/clientset/versioned/fake/register.go +++ b/client-go/clientset/versioned/fake/register.go @@ -25,6 +25,7 @@ import ( serializer "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" inferencev1 "sigs.k8s.io/gateway-api-inference-extension/api/v1" + inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha1" xinferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2" ) @@ -33,6 +34,7 @@ var codecs = serializer.NewCodecFactory(scheme) var localSchemeBuilder = runtime.SchemeBuilder{ inferencev1.AddToScheme, + inferencev1alpha1.AddToScheme, xinferencev1alpha2.AddToScheme, } diff --git 
a/client-go/clientset/versioned/scheme/register.go b/client-go/clientset/versioned/scheme/register.go index 7836df4f5..47bdc7c33 100644 --- a/client-go/clientset/versioned/scheme/register.go +++ b/client-go/clientset/versioned/scheme/register.go @@ -25,6 +25,7 @@ import ( serializer "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" inferencev1 "sigs.k8s.io/gateway-api-inference-extension/api/v1" + inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha1" xinferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2" ) @@ -33,6 +34,7 @@ var Codecs = serializer.NewCodecFactory(Scheme) var ParameterCodec = runtime.NewParameterCodec(Scheme) var localSchemeBuilder = runtime.SchemeBuilder{ inferencev1.AddToScheme, + inferencev1alpha1.AddToScheme, xinferencev1alpha2.AddToScheme, } diff --git a/client-go/clientset/versioned/typed/apix/v1alpha1/apix_client.go b/client-go/clientset/versioned/typed/apix/v1alpha1/apix_client.go new file mode 100644 index 000000000..726c4103b --- /dev/null +++ b/client-go/clientset/versioned/typed/apix/v1alpha1/apix_client.go @@ -0,0 +1,101 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by client-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + http "net/http" + + rest "k8s.io/client-go/rest" + apixv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha1" + scheme "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" +) + +type InferenceV1alpha1Interface interface { + RESTClient() rest.Interface + InferencePoolImportsGetter +} + +// InferenceV1alpha1Client is used to interact with features provided by the inference.networking.x-k8s.io group. +type InferenceV1alpha1Client struct { + restClient rest.Interface +} + +func (c *InferenceV1alpha1Client) InferencePoolImports(namespace string) InferencePoolImportInterface { + return newInferencePoolImports(c, namespace) +} + +// NewForConfig creates a new InferenceV1alpha1Client for the given config. +// NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), +// where httpClient was generated with rest.HTTPClientFor(c). +func NewForConfig(c *rest.Config) (*InferenceV1alpha1Client, error) { + config := *c + setConfigDefaults(&config) + httpClient, err := rest.HTTPClientFor(&config) + if err != nil { + return nil, err + } + return NewForConfigAndClient(&config, httpClient) +} + +// NewForConfigAndClient creates a new InferenceV1alpha1Client for the given config and http client. +// Note the http client provided takes precedence over the configured transport values. 
+func NewForConfigAndClient(c *rest.Config, h *http.Client) (*InferenceV1alpha1Client, error) { + config := *c + setConfigDefaults(&config) + client, err := rest.RESTClientForConfigAndClient(&config, h) + if err != nil { + return nil, err + } + return &InferenceV1alpha1Client{client}, nil +} + +// NewForConfigOrDie creates a new InferenceV1alpha1Client for the given config and +// panics if there is an error in the config. +func NewForConfigOrDie(c *rest.Config) *InferenceV1alpha1Client { + client, err := NewForConfig(c) + if err != nil { + panic(err) + } + return client +} + +// New creates a new InferenceV1alpha1Client for the given RESTClient. +func New(c rest.Interface) *InferenceV1alpha1Client { + return &InferenceV1alpha1Client{c} +} + +func setConfigDefaults(config *rest.Config) { + gv := apixv1alpha1.SchemeGroupVersion + config.GroupVersion = &gv + config.APIPath = "/apis" + config.NegotiatedSerializer = rest.CodecFactoryForGeneratedClient(scheme.Scheme, scheme.Codecs).WithoutConversion() + + if config.UserAgent == "" { + config.UserAgent = rest.DefaultKubernetesUserAgent() + } +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. +func (c *InferenceV1alpha1Client) RESTClient() rest.Interface { + if c == nil { + return nil + } + return c.restClient +} diff --git a/client-go/clientset/versioned/typed/apix/v1alpha1/doc.go b/client-go/clientset/versioned/typed/apix/v1alpha1/doc.go new file mode 100644 index 000000000..df51baa4d --- /dev/null +++ b/client-go/clientset/versioned/typed/apix/v1alpha1/doc.go @@ -0,0 +1,20 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by client-gen. DO NOT EDIT. + +// This package has the automatically generated typed clients. +package v1alpha1 diff --git a/client-go/clientset/versioned/typed/apix/v1alpha1/fake/doc.go b/client-go/clientset/versioned/typed/apix/v1alpha1/fake/doc.go new file mode 100644 index 000000000..16f443990 --- /dev/null +++ b/client-go/clientset/versioned/typed/apix/v1alpha1/fake/doc.go @@ -0,0 +1,20 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by client-gen. DO NOT EDIT. + +// Package fake has the automatically generated clients. 
+package fake diff --git a/client-go/clientset/versioned/typed/apix/v1alpha1/fake/fake_apix_client.go b/client-go/clientset/versioned/typed/apix/v1alpha1/fake/fake_apix_client.go new file mode 100644 index 000000000..1de7688eb --- /dev/null +++ b/client-go/clientset/versioned/typed/apix/v1alpha1/fake/fake_apix_client.go @@ -0,0 +1,40 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by client-gen. DO NOT EDIT. + +package fake + +import ( + rest "k8s.io/client-go/rest" + testing "k8s.io/client-go/testing" + v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/apix/v1alpha1" +) + +type FakeInferenceV1alpha1 struct { + *testing.Fake +} + +func (c *FakeInferenceV1alpha1) InferencePoolImports(namespace string) v1alpha1.InferencePoolImportInterface { + return newFakeInferencePoolImports(c, namespace) +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. +func (c *FakeInferenceV1alpha1) RESTClient() rest.Interface { + var ret *rest.RESTClient + return ret +} diff --git a/client-go/clientset/versioned/typed/apix/v1alpha1/fake/fake_inferencepoolimport.go b/client-go/clientset/versioned/typed/apix/v1alpha1/fake/fake_inferencepoolimport.go new file mode 100644 index 000000000..7a6938f9d --- /dev/null +++ b/client-go/clientset/versioned/typed/apix/v1alpha1/fake/fake_inferencepoolimport.go @@ -0,0 +1,53 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by client-gen. DO NOT EDIT. 
+ +package fake + +import ( + gentype "k8s.io/client-go/gentype" + v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha1" + apixv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/apix/v1alpha1" + typedapixv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/apix/v1alpha1" +) + +// fakeInferencePoolImports implements InferencePoolImportInterface +type fakeInferencePoolImports struct { + *gentype.FakeClientWithListAndApply[*v1alpha1.InferencePoolImport, *v1alpha1.InferencePoolImportList, *apixv1alpha1.InferencePoolImportApplyConfiguration] + Fake *FakeInferenceV1alpha1 +} + +func newFakeInferencePoolImports(fake *FakeInferenceV1alpha1, namespace string) typedapixv1alpha1.InferencePoolImportInterface { + return &fakeInferencePoolImports{ + gentype.NewFakeClientWithListAndApply[*v1alpha1.InferencePoolImport, *v1alpha1.InferencePoolImportList, *apixv1alpha1.InferencePoolImportApplyConfiguration]( + fake.Fake, + namespace, + v1alpha1.SchemeGroupVersion.WithResource("inferencepoolimports"), + v1alpha1.SchemeGroupVersion.WithKind("InferencePoolImport"), + func() *v1alpha1.InferencePoolImport { return &v1alpha1.InferencePoolImport{} }, + func() *v1alpha1.InferencePoolImportList { return &v1alpha1.InferencePoolImportList{} }, + func(dst, src *v1alpha1.InferencePoolImportList) { dst.ListMeta = src.ListMeta }, + func(list *v1alpha1.InferencePoolImportList) []*v1alpha1.InferencePoolImport { + return gentype.ToPointerSlice(list.Items) + }, + func(list *v1alpha1.InferencePoolImportList, items []*v1alpha1.InferencePoolImport) { + list.Items = gentype.FromPointerSlice(items) + }, + ), + fake, + } +} diff --git a/client-go/clientset/versioned/typed/apix/v1alpha1/generated_expansion.go b/client-go/clientset/versioned/typed/apix/v1alpha1/generated_expansion.go new file mode 100644 index 000000000..0969f5b0c --- /dev/null +++ b/client-go/clientset/versioned/typed/apix/v1alpha1/generated_expansion.go @@ -0,0 +1,21 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by client-gen. DO NOT EDIT. + +package v1alpha1 + +type InferencePoolImportExpansion interface{} diff --git a/client-go/clientset/versioned/typed/apix/v1alpha1/inferencepoolimport.go b/client-go/clientset/versioned/typed/apix/v1alpha1/inferencepoolimport.go new file mode 100644 index 000000000..f68766d9a --- /dev/null +++ b/client-go/clientset/versioned/typed/apix/v1alpha1/inferencepoolimport.go @@ -0,0 +1,74 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by client-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + context "context" + + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + gentype "k8s.io/client-go/gentype" + apixv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha1" + applyconfigurationapixv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/apix/v1alpha1" + scheme "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" +) + +// InferencePoolImportsGetter has a method to return a InferencePoolImportInterface. +// A group's client should implement this interface. +type InferencePoolImportsGetter interface { + InferencePoolImports(namespace string) InferencePoolImportInterface +} + +// InferencePoolImportInterface has methods to work with InferencePoolImport resources. +type InferencePoolImportInterface interface { + Create(ctx context.Context, inferencePoolImport *apixv1alpha1.InferencePoolImport, opts v1.CreateOptions) (*apixv1alpha1.InferencePoolImport, error) + Update(ctx context.Context, inferencePoolImport *apixv1alpha1.InferencePoolImport, opts v1.UpdateOptions) (*apixv1alpha1.InferencePoolImport, error) + // Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). + UpdateStatus(ctx context.Context, inferencePoolImport *apixv1alpha1.InferencePoolImport, opts v1.UpdateOptions) (*apixv1alpha1.InferencePoolImport, error) + Delete(ctx context.Context, name string, opts v1.DeleteOptions) error + DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error + Get(ctx context.Context, name string, opts v1.GetOptions) (*apixv1alpha1.InferencePoolImport, error) + List(ctx context.Context, opts v1.ListOptions) (*apixv1alpha1.InferencePoolImportList, error) + Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) + Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *apixv1alpha1.InferencePoolImport, err error) + Apply(ctx context.Context, inferencePoolImport *applyconfigurationapixv1alpha1.InferencePoolImportApplyConfiguration, opts v1.ApplyOptions) (result *apixv1alpha1.InferencePoolImport, err error) + // Add a +genclient:noStatus comment above the type to avoid generating ApplyStatus(). 
+ ApplyStatus(ctx context.Context, inferencePoolImport *applyconfigurationapixv1alpha1.InferencePoolImportApplyConfiguration, opts v1.ApplyOptions) (result *apixv1alpha1.InferencePoolImport, err error) + InferencePoolImportExpansion +} + +// inferencePoolImports implements InferencePoolImportInterface +type inferencePoolImports struct { + *gentype.ClientWithListAndApply[*apixv1alpha1.InferencePoolImport, *apixv1alpha1.InferencePoolImportList, *applyconfigurationapixv1alpha1.InferencePoolImportApplyConfiguration] +} + +// newInferencePoolImports returns a InferencePoolImports +func newInferencePoolImports(c *InferenceV1alpha1Client, namespace string) *inferencePoolImports { + return &inferencePoolImports{ + gentype.NewClientWithListAndApply[*apixv1alpha1.InferencePoolImport, *apixv1alpha1.InferencePoolImportList, *applyconfigurationapixv1alpha1.InferencePoolImportApplyConfiguration]( + "inferencepoolimports", + c.RESTClient(), + scheme.ParameterCodec, + namespace, + func() *apixv1alpha1.InferencePoolImport { return &apixv1alpha1.InferencePoolImport{} }, + func() *apixv1alpha1.InferencePoolImportList { return &apixv1alpha1.InferencePoolImportList{} }, + ), + } +} diff --git a/client-go/informers/externalversions/apix/interface.go b/client-go/informers/externalversions/apix/interface.go index 26e18173f..426cbcdd9 100644 --- a/client-go/informers/externalversions/apix/interface.go +++ b/client-go/informers/externalversions/apix/interface.go @@ -19,12 +19,15 @@ limitations under the License. package apix import ( + v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/apix/v1alpha1" v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/apix/v1alpha2" internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" ) // Interface provides access to each of this group's versions. type Interface interface { + // V1alpha1 provides access to shared informers for resources in V1alpha1. + V1alpha1() v1alpha1.Interface // V1alpha2 provides access to shared informers for resources in V1alpha2. V1alpha2() v1alpha2.Interface } @@ -40,6 +43,11 @@ func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakList return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} } +// V1alpha1 returns a new v1alpha1.Interface. +func (g *group) V1alpha1() v1alpha1.Interface { + return v1alpha1.New(g.factory, g.namespace, g.tweakListOptions) +} + // V1alpha2 returns a new v1alpha2.Interface. func (g *group) V1alpha2() v1alpha2.Interface { return v1alpha2.New(g.factory, g.namespace, g.tweakListOptions) diff --git a/client-go/informers/externalversions/apix/v1alpha1/inferencepoolimport.go b/client-go/informers/externalversions/apix/v1alpha1/inferencepoolimport.go new file mode 100644 index 000000000..9013e02a5 --- /dev/null +++ b/client-go/informers/externalversions/apix/v1alpha1/inferencepoolimport.go @@ -0,0 +1,102 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by informer-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + context "context" + time "time" + + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + watch "k8s.io/apimachinery/pkg/watch" + cache "k8s.io/client-go/tools/cache" + gatewayapiinferenceextensionapixv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha1" + versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" + internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" + apixv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/listers/apix/v1alpha1" +) + +// InferencePoolImportInformer provides access to a shared informer and lister for +// InferencePoolImports. +type InferencePoolImportInformer interface { + Informer() cache.SharedIndexInformer + Lister() apixv1alpha1.InferencePoolImportLister +} + +type inferencePoolImportInformer struct { + factory internalinterfaces.SharedInformerFactory + tweakListOptions internalinterfaces.TweakListOptionsFunc + namespace string +} + +// NewInferencePoolImportInformer constructs a new informer for InferencePoolImport type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. +func NewInferencePoolImportInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { + return NewFilteredInferencePoolImportInformer(client, namespace, resyncPeriod, indexers, nil) +} + +// NewFilteredInferencePoolImportInformer constructs a new informer for InferencePoolImport type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. 
+func NewFilteredInferencePoolImportInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { + return cache.NewSharedIndexInformer( + &cache.ListWatch{ + ListFunc: func(options v1.ListOptions) (runtime.Object, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.InferenceV1alpha1().InferencePoolImports(namespace).List(context.Background(), options) + }, + WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.InferenceV1alpha1().InferencePoolImports(namespace).Watch(context.Background(), options) + }, + ListWithContextFunc: func(ctx context.Context, options v1.ListOptions) (runtime.Object, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.InferenceV1alpha1().InferencePoolImports(namespace).List(ctx, options) + }, + WatchFuncWithContext: func(ctx context.Context, options v1.ListOptions) (watch.Interface, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.InferenceV1alpha1().InferencePoolImports(namespace).Watch(ctx, options) + }, + }, + &gatewayapiinferenceextensionapixv1alpha1.InferencePoolImport{}, + resyncPeriod, + indexers, + ) +} + +func (f *inferencePoolImportInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { + return NewFilteredInferencePoolImportInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) +} + +func (f *inferencePoolImportInformer) Informer() cache.SharedIndexInformer { + return f.factory.InformerFor(&gatewayapiinferenceextensionapixv1alpha1.InferencePoolImport{}, f.defaultInformer) +} + +func (f *inferencePoolImportInformer) Lister() apixv1alpha1.InferencePoolImportLister { + return apixv1alpha1.NewInferencePoolImportLister(f.Informer().GetIndexer()) +} diff --git a/client-go/informers/externalversions/apix/v1alpha1/interface.go b/client-go/informers/externalversions/apix/v1alpha1/interface.go new file mode 100644 index 000000000..50e84d5b1 --- /dev/null +++ b/client-go/informers/externalversions/apix/v1alpha1/interface.go @@ -0,0 +1,45 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by informer-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" +) + +// Interface provides access to all the informers in this group version. +type Interface interface { + // InferencePoolImports returns a InferencePoolImportInformer. 
+ InferencePoolImports() InferencePoolImportInformer +} + +type version struct { + factory internalinterfaces.SharedInformerFactory + namespace string + tweakListOptions internalinterfaces.TweakListOptionsFunc +} + +// New returns a new Interface. +func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { + return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} +} + +// InferencePoolImports returns a InferencePoolImportInformer. +func (v *version) InferencePoolImports() InferencePoolImportInformer { + return &inferencePoolImportInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} +} diff --git a/client-go/informers/externalversions/generic.go b/client-go/informers/externalversions/generic.go index 03c95ee05..2fe29156b 100644 --- a/client-go/informers/externalversions/generic.go +++ b/client-go/informers/externalversions/generic.go @@ -24,6 +24,7 @@ import ( schema "k8s.io/apimachinery/pkg/runtime/schema" cache "k8s.io/client-go/tools/cache" v1 "sigs.k8s.io/gateway-api-inference-extension/api/v1" + v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha1" v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2" ) @@ -57,6 +58,10 @@ func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource case v1.SchemeGroupVersion.WithResource("inferencepools"): return &genericInformer{resource: resource.GroupResource(), informer: f.Inference().V1().InferencePools().Informer()}, nil + // Group=inference.networking.x-k8s.io, Version=v1alpha1 + case v1alpha1.SchemeGroupVersion.WithResource("inferencepoolimports"): + return &genericInformer{resource: resource.GroupResource(), informer: f.XInference().V1alpha1().InferencePoolImports().Informer()}, nil + // Group=inference.networking.x-k8s.io, Version=v1alpha2 case v1alpha2.SchemeGroupVersion.WithResource("inferenceobjectives"): return &genericInformer{resource: resource.GroupResource(), informer: f.XInference().V1alpha2().InferenceObjectives().Informer()}, nil diff --git a/client-go/listers/apix/v1alpha1/expansion_generated.go b/client-go/listers/apix/v1alpha1/expansion_generated.go new file mode 100644 index 000000000..7c0d32cbb --- /dev/null +++ b/client-go/listers/apix/v1alpha1/expansion_generated.go @@ -0,0 +1,27 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by lister-gen. DO NOT EDIT. + +package v1alpha1 + +// InferencePoolImportListerExpansion allows custom methods to be added to +// InferencePoolImportLister. +type InferencePoolImportListerExpansion interface{} + +// InferencePoolImportNamespaceListerExpansion allows custom methods to be added to +// InferencePoolImportNamespaceLister. 
+type InferencePoolImportNamespaceListerExpansion interface{} diff --git a/client-go/listers/apix/v1alpha1/inferencepoolimport.go b/client-go/listers/apix/v1alpha1/inferencepoolimport.go new file mode 100644 index 000000000..653bbfa45 --- /dev/null +++ b/client-go/listers/apix/v1alpha1/inferencepoolimport.go @@ -0,0 +1,70 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by lister-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + labels "k8s.io/apimachinery/pkg/labels" + listers "k8s.io/client-go/listers" + cache "k8s.io/client-go/tools/cache" + apixv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha1" +) + +// InferencePoolImportLister helps list InferencePoolImports. +// All objects returned here must be treated as read-only. +type InferencePoolImportLister interface { + // List lists all InferencePoolImports in the indexer. + // Objects returned here must be treated as read-only. + List(selector labels.Selector) (ret []*apixv1alpha1.InferencePoolImport, err error) + // InferencePoolImports returns an object that can list and get InferencePoolImports. + InferencePoolImports(namespace string) InferencePoolImportNamespaceLister + InferencePoolImportListerExpansion +} + +// inferencePoolImportLister implements the InferencePoolImportLister interface. +type inferencePoolImportLister struct { + listers.ResourceIndexer[*apixv1alpha1.InferencePoolImport] +} + +// NewInferencePoolImportLister returns a new InferencePoolImportLister. +func NewInferencePoolImportLister(indexer cache.Indexer) InferencePoolImportLister { + return &inferencePoolImportLister{listers.New[*apixv1alpha1.InferencePoolImport](indexer, apixv1alpha1.Resource("inferencepoolimport"))} +} + +// InferencePoolImports returns an object that can list and get InferencePoolImports. +func (s *inferencePoolImportLister) InferencePoolImports(namespace string) InferencePoolImportNamespaceLister { + return inferencePoolImportNamespaceLister{listers.NewNamespaced[*apixv1alpha1.InferencePoolImport](s.ResourceIndexer, namespace)} +} + +// InferencePoolImportNamespaceLister helps list and get InferencePoolImports. +// All objects returned here must be treated as read-only. +type InferencePoolImportNamespaceLister interface { + // List lists all InferencePoolImports in the indexer for a given namespace. + // Objects returned here must be treated as read-only. + List(selector labels.Selector) (ret []*apixv1alpha1.InferencePoolImport, err error) + // Get retrieves the InferencePoolImport from the indexer for a given namespace and name. + // Objects returned here must be treated as read-only. + Get(name string) (*apixv1alpha1.InferencePoolImport, error) + InferencePoolImportNamespaceListerExpansion +} + +// inferencePoolImportNamespaceLister implements the InferencePoolImportNamespaceLister +// interface. 
+type inferencePoolImportNamespaceLister struct {
+	listers.ResourceIndexer[*apixv1alpha1.InferencePoolImport]
+}
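Taken together, the generated clientset, informers, and listers above wire InferencePoolImport into the standard client-go machinery. A minimal consumer sketch (the kubeconfig path and namespace below are placeholders, not part of this change):

```go
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/clientcmd"

	versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned"
)

func main() {
	// Build a rest.Config from a kubeconfig (placeholder path).
	cfg, err := clientcmd.BuildConfigFromFlags("", "/path/to/kubeconfig")
	if err != nil {
		panic(err)
	}

	// The versioned clientset now exposes the v1alpha1 group via InferenceV1alpha1().
	cs, err := versioned.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}

	// List InferencePoolImports in a namespace using the new typed client.
	imports, err := cs.InferenceV1alpha1().InferencePoolImports("default").List(context.Background(), metav1.ListOptions{})
	if err != nil {
		panic(err)
	}
	for i := range imports.Items {
		fmt.Println(imports.Items[i].Name)
	}
}
```

Cache-backed reads follow the same pattern through the informer factory (`f.XInference().V1alpha1().InferencePoolImports()`), as registered in generic.go above.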
diff --git a/cmd/epp/runner/runner.go b/cmd/epp/runner/runner.go
index 7d25fc7c7..e854d7cdd 100644
--- a/cmd/epp/runner/runner.go
+++ b/cmd/epp/runner/runner.go
@@ -37,6 +37,7 @@ import (
 	healthPb "google.golang.org/grpc/health/grpc_health_v1"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/rest"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/controller-runtime/pkg/log/zap"
@@ -51,15 +52,18 @@ import (
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer"
 	dlmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer/metrics"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol"
+	fccontroller "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/controller"
+	fcregistry "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/registry"
 	latencypredictor "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/latencypredictorasync"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics/collectors"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol/plugins/slorequest"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/scorer"
@@ -71,23 +75,38 @@ import (
 )
 
 const (
-	// enableExperimentalDatalayerV2 defines the environment variable
-	// used as feature flag for the pluggable data layer.
+	// enableExperimentalDatalayerV2 defines the environment variable used as feature flag for the pluggable data layer.
 	enableExperimentalDatalayerV2 = "ENABLE_EXPERIMENTAL_DATALAYER_V2"
+	// enableExperimentalFlowControlLayer defines the environment variable used as a feature flag for the pluggable flow
+	// control layer.
+	enableExperimentalFlowControlLayer = "ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER"
 )
 
+// TODO: this is hardcoded for POC only. This needs to be hooked up to our text-based config story.
+var flowControlConfig = flowcontrol.Config{
+	Controller: fccontroller.Config{}, // Use all defaults.
+	Registry: fcregistry.Config{
+		// Define the domain of accepted priority levels, as this field is required. Use defaults for all optional fields.
+		// TODO: this should not be hardcoded.
+		PriorityBands: []fcregistry.PriorityBandConfig{
+			{Priority: 0, PriorityName: "Default"},
+		},
+	},
+}
+
 var (
-	grpcPort       = flag.Int("grpc-port", runserver.DefaultGrpcPort, "The gRPC port used for communicating with Envoy proxy")
-	grpcHealthPort = flag.Int("grpc-health-port", runserver.DefaultGrpcHealthPort, "The port used for gRPC liveness and readiness probes")
-	metricsPort    = flag.Int("metrics-port", runserver.DefaultMetricsPort, "The metrics port")
-	enablePprof    = flag.Bool("enable-pprof", runserver.DefaultEnablePprof, "Enables pprof handlers. Defaults to true. Set to false to disable pprof handlers.")
-	poolName       = flag.String("pool-name", runserver.DefaultPoolName, "Name of the InferencePool this Endpoint Picker is associated with.")
-	poolGroup      = flag.String("pool-group", runserver.DefaultPoolGroup, "group of the InferencePool this Endpoint Picker is associated with.")
-	poolNamespace  = flag.String("pool-namespace", runserver.DefaultPoolNamespace, "Namespace of the InferencePool this Endpoint Picker is associated with.")
-	logVerbosity   = flag.Int("v", logging.DEFAULT, "number for the log level verbosity")
-	secureServing  = flag.Bool("secure-serving", runserver.DefaultSecureServing, "Enables secure serving. Defaults to true.")
-	healthChecking = flag.Bool("health-checking", runserver.DefaultHealthChecking, "Enables health checking")
-	certPath       = flag.String("cert-path", runserver.DefaultCertPath, "The path to the certificate for secure serving. The certificate and private key files "+
+	grpcPort            = flag.Int("grpc-port", runserver.DefaultGrpcPort, "The gRPC port used for communicating with Envoy proxy")
+	grpcHealthPort      = flag.Int("grpc-health-port", runserver.DefaultGrpcHealthPort, "The port used for gRPC liveness and readiness probes")
+	metricsPort         = flag.Int("metrics-port", runserver.DefaultMetricsPort, "The metrics port")
+	metricsEndpointAuth = flag.Bool("metrics-endpoint-auth", true, "Enables authentication and authorization of the metrics endpoint")
+	enablePprof         = flag.Bool("enable-pprof", runserver.DefaultEnablePprof, "Enables pprof handlers. Defaults to true. Set to false to disable pprof handlers.")
+	poolName            = flag.String("pool-name", runserver.DefaultPoolName, "Name of the InferencePool this Endpoint Picker is associated with.")
+	poolGroup           = flag.String("pool-group", runserver.DefaultPoolGroup, "Group of the InferencePool this Endpoint Picker is associated with.")
+	poolNamespace       = flag.String("pool-namespace", "", "Namespace of the InferencePool this Endpoint Picker is associated with. If unset, the NAMESPACE environment variable is used, then the default namespace.")
+	logVerbosity        = flag.Int("v", logging.DEFAULT, "number for the log level verbosity")
+	secureServing       = flag.Bool("secure-serving", runserver.DefaultSecureServing, "Enables secure serving. Defaults to true.")
+	healthChecking      = flag.Bool("health-checking", runserver.DefaultHealthChecking, "Enables health checking")
+	certPath            = flag.String("cert-path", runserver.DefaultCertPath, "The path to the certificate for secure serving. The certificate and private key files "+
If not set, and secureServing is enabled, "+ "then a self-signed certificate is used.") // metric flags @@ -96,6 +115,8 @@ var ( kvCacheUsagePercentageMetric = flag.String("kv-cache-usage-percentage-metric", runserver.DefaultKvCacheUsagePercentageMetric, "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).") // LoRA metrics loraInfoMetric = flag.String("lora-info-metric", runserver.DefaultLoraInfoMetric, "Prometheus metric for the LoRA info metrics (must be in vLLM label format).") + // Cache info metrics + cacheInfoMetric = flag.String("cache-info-metric", runserver.DefaultCacheInfoMetric, "Prometheus metric for the cache info metrics.") // metrics related flags refreshMetricsInterval = flag.Duration("refresh-metrics-interval", runserver.DefaultRefreshMetricsInterval, "interval to refresh metrics") refreshPrometheusMetricsInterval = flag.Duration("refresh-prometheus-metrics-interval", runserver.DefaultRefreshPrometheusMetricsInterval, "interval to flush prometheus metrics") @@ -113,6 +134,7 @@ var ( // Latency Predictor Flag enableLatencyPredictor = flag.Bool("enable-latency-predictor", false, "Enable the regression-based latency predictor and scheduler scorer.") + tracing = flag.Bool("tracing", true, "Enables emitting traces") setupLog = ctrl.Log.WithName("setup") ) @@ -148,6 +170,13 @@ func (r *Runner) Run(ctx context.Context) error { flag.Parse() initLogging(&opts) + if *tracing { + err := common.InitTracing(ctx, setupLog) + if err != nil { + return err + } + } + setupLog.Info("GIE build", "commit-sha", version.CommitSHA, "build-ref", version.BuildRef) // Validate flags @@ -179,7 +208,7 @@ func (r *Runner) Run(ctx context.Context) error { if err != nil { return err } - datastore := datastore.NewDatastore(ctx, epf) + datastore := datastore.NewDatastore(ctx, epf, int32(*modelServerMetricsPort)) // --- Setup Metrics Server --- customCollectors := []prometheus.Collector{collectors.NewInferencePoolMetricsCollector(datastore)} @@ -191,13 +220,30 @@ func (r *Runner) Run(ctx context.Context) error { // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/metrics/server // - https://book.kubebuilder.io/reference/metrics.html metricsServerOptions := metricsserver.Options{ - BindAddress: fmt.Sprintf(":%d", *metricsPort), - FilterProvider: filters.WithAuthenticationAndAuthorization, + BindAddress: fmt.Sprintf(":%d", *metricsPort), + FilterProvider: func() func(c *rest.Config, httpClient *http.Client) (metricsserver.Filter, error) { + if *metricsEndpointAuth { + return filters.WithAuthenticationAndAuthorization + } + + return nil + }(), } + // Determine pool namespace: if --pool-namespace is non-empty, use it; else NAMESPACE env var; else default + resolvePoolNamespace := func() string { + if *poolNamespace != "" { + return *poolNamespace + } + if nsEnv := os.Getenv("NAMESPACE"); nsEnv != "" { + return nsEnv + } + return runserver.DefaultPoolNamespace + } + resolvedPoolNamespace := resolvePoolNamespace() poolNamespacedName := types.NamespacedName{ Name: *poolName, - Namespace: *poolNamespace, + Namespace: resolvedPoolNamespace, } poolGroupKind := schema.GroupKind{ Group: *poolGroup, @@ -279,7 +325,43 @@ func (r *Runner) Run(ctx context.Context) error { saturationDetector := saturationdetector.NewDetector(sdConfig, setupLog) - director := requestcontrol.NewDirectorWithConfig(datastore, scheduler, saturationDetector, r.requestControlConfig) + // --- Admission Control Initialization --- + enableFlowControl := 
env.GetEnvBool(enableExperimentalFlowControlLayer, false, setupLog) + var admissionController requestcontrol.AdmissionController + if enableFlowControl { + setupLog.Info("Initializing experimental Flow Control layer") + fcCfg, err := flowControlConfig.ValidateAndApplyDefaults() + if err != nil { + setupLog.Error(err, "failed to initialize Flow Control layer") + return fmt.Errorf("invalid Flow Control config: %w", err) + } + + registry, err := fcregistry.NewFlowRegistry(fcCfg.Registry, setupLog) + if err != nil { + return fmt.Errorf("failed to initialize Flow Registry: %w", err) + } + fc, err := fccontroller.NewFlowController( + ctx, + fcCfg.Controller, + registry, + saturationDetector, + setupLog, + ) + if err != nil { + return fmt.Errorf("failed to initialize Flow Controller: %w", err) + } + go registry.Run(ctx) + admissionController = requestcontrol.NewFlowControlAdmissionController(saturationDetector, fc) + } else { + setupLog.Info("Experimental Flow Control layer is disabled, using legacy admission control") + admissionController = requestcontrol.NewLegacyAdmissionController(saturationDetector) + } + + director := requestcontrol.NewDirectorWithConfig( + datastore, + scheduler, + admissionController, + r.requestControlConfig) // --- Setup ExtProc Server Runner --- serverRunner := &runserver.ExtProcServerRunner{ @@ -338,20 +420,14 @@ func (r *Runner) registerInTreePlugins() { plugins.Register(testfilter.HeaderBasedTestingFilterType, testfilter.HeaderBasedTestingFilterFactory) } -func (r *Runner) registerLatencyPredictorPlugins(predictor latencypredictor.PredictorInterface, datastore datastore.Datastore) { - // Register the SLO request tracker and scorer plugin, these plugins need access to the predictor and datastore. - // We have to specify a custom factory function to create the plugins with the correct dependencies. - plugins.Register(slorequest.SLORequestTrackerPluginType, func(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) { - return slorequest.New(predictor, datastore).WithName(name), nil - }) - plugins.Register(scorer.SLOScorerPluginType, func(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) { - return scorer.NewSLOScorer(predictor, datastore, scorer.HeadroomSelectionStrategy).WithName(name), nil +func (r *Runner) registerLatencyPredictorPlugins(predictor latencypredictor.PredictorInterface) { + plugins.Register(slo_aware_router.SLOAwareRouterPluginType, func(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) { + return slo_aware_router.NewSLOAwareRouter(predictor, slo_aware_router.HeadroomSelectionStrategy).WithName(name), nil }) plugins.Register(profile.SLOAwareProfileHandlerType, profile.SLOAwareProfileHandlerFactory) - plugins.Register(picker.WeightedRandomPickerType, picker.WeightedRandomPickerFactory) } -func (r *Runner) parsePluginsConfiguration(ctx context.Context, predictor latencypredictor.PredictorInterface, datastore datastore.Datastore) error { +func (r *Runner) parsePluginsConfiguration(ctx context.Context, predictor latencypredictor.PredictorInterface, ds datastore.Datastore) error { if *configText == "" && *configFile == "" { return nil // configuring through code, not through file } @@ -372,12 +448,13 @@ func (r *Runner) parsePluginsConfiguration(ctx context.Context, predictor latenc r.registerInTreePlugins() // If we have a latency predictor enabled and predictor and datastore are not nil, // register the latency predictor plugins (currently just the SLO scorer). 
- if *enableLatencyPredictor && predictor != nil && datastore != nil { + if *enableLatencyPredictor && predictor != nil { setupLog.Info("Registering latency predictor plugins") - r.registerLatencyPredictorPlugins(predictor, datastore) + r.registerLatencyPredictorPlugins(predictor) } - handle := plugins.NewEppHandle(ctx) + handle := plugins.NewEppHandle(ctx, ds.PodList) config, err := loader.LoadConfig(configBytes, handle, logger) + if err != nil { return fmt.Errorf("failed to load the configuration - %w", err) } @@ -408,6 +485,7 @@ func setupMetricsV1(setupLog logr.Logger) (datalayer.EndpointFactory, error) { *totalRunningRequestsMetric, *kvCacheUsagePercentageMetric, *loraInfoMetric, + *cacheInfoMetric, ) if err != nil { setupLog.Error(err, "Failed to create metric mapping from flags.") @@ -430,7 +508,6 @@ func setupMetricsV1(setupLog logr.Logger) (datalayer.EndpointFactory, error) { pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{ MetricMapping: mapping, - ModelServerMetricsPort: int32(*modelServerMetricsPort), ModelServerMetricsPath: *modelServerMetricsPath, ModelServerMetricsScheme: *modelServerMetricsScheme, Client: metricsHttpClient, @@ -445,14 +522,13 @@ func setupDatalayer() (datalayer.EndpointFactory, error) { // this (and registering the sources with the endpoint factory) should // be moved accordingly. source := dlmetrics.NewDataSource(*modelServerMetricsScheme, - int32(*modelServerMetricsPort), // start with (optional) command line port value *modelServerMetricsPath, *modelServerMetricsHttpsInsecureSkipVerify, nil) extractor, err := dlmetrics.NewExtractor(*totalQueuedRequestsMetric, *totalRunningRequestsMetric, *kvCacheUsagePercentageMetric, - *loraInfoMetric) + *loraInfoMetric, *cacheInfoMetric) if err != nil { return nil, err @@ -537,6 +613,9 @@ func verifyMetricMapping(mapping backendmetrics.MetricMapping, logger logr.Logge if mapping.LoraRequestInfo == nil { logger.Info("Not scraping metric: LoraRequestInfo") } + if mapping.CacheConfigInfo == nil { + logger.Info("Not scraping metric: CacheConfigInfo") + } } // setupPprofHandlers only implements the pre-defined profiles: diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md index 82be6b85c..9fbbfb9bf 100644 --- a/config/charts/inferencepool/README.md +++ b/config/charts/inferencepool/README.md @@ -16,7 +16,7 @@ To install via the latest published chart in staging (--version v0 indicates la ```txt $ helm install vllm-llama3-8b-instruct \ --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ - --set provider.name=[none|gke] \ + --set provider.name=[none|gke|istio] \ oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0 ``` @@ -95,22 +95,61 @@ Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install for Tri $ helm install triton-llama3-8b-instruct \ --set inferencePool.modelServers.matchLabels.app=triton-llama3-8b-instruct \ --set inferencePool.modelServerType=triton-tensorrt-llm \ - --set provider.name=[none|gke] \ + --set provider.name=[none|gke|istio] \ oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0 ``` ### Install with High Availability (HA) -To deploy the EndpointPicker in a high-availability (HA) active-passive configuration, you can enable leader election. 
When enabled, the EPP deployment will have multiple replicas, but only one "leader" replica will be active and ready to process traffic at any given time. If the leader pod fails, another pod will be elected as the new leader, ensuring service continuity.
+To deploy the EndpointPicker in a high-availability (HA) active-passive configuration, set replicas to be greater than one. In such a setup, only one "leader" replica will be active and ready to process traffic at any given time. If the leader pod fails, another pod will be elected as the new leader, ensuring service continuity.
 
-To enable HA, set `inferenceExtension.flags.has-enable-leader-election` to `true` and increase the number of replicas in your `values.yaml` file:
+To enable HA, set `inferenceExtension.replicas` to a number greater than 1.
+
+* Via `--set` flag:
+
+  ```txt
+  helm install vllm-llama3-8b-instruct \
+    --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+    --set inferenceExtension.replicas=3 \
+    --set provider.name=[none|gke|istio] \
+    oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
+  ```
+
+* Via `values.yaml`:
+
+  ```yaml
+  inferenceExtension:
+    replicas: 3
+  ```
+
+  Then apply it with:
+
+  ```txt
+  helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
+  ```
+
+### Install with Monitoring
+
+To enable metrics collection and monitoring for the EndpointPicker, you can configure Prometheus ServiceMonitor creation:
 
 ```yaml
 inferenceExtension:
-  replicas: 3
-  has-enable-leader-election: true
+  monitoring:
+    interval: "10s"
+    prometheus:
+      enabled: false
+      auth:
+        enabled: true
+        secretName: inference-gateway-sa-metrics-reader-secret
+      extraLabels: {}
 ```
 
+**Note:** Prometheus monitoring requires the Prometheus Operator and ServiceMonitor CRD to be installed in the cluster.
+
+For GKE environments, you first need to set `provider.name` to `gke`. This will create the necessary `PodMonitoring` and RBAC resources for metrics collection.
+
+If you are using a GKE Autopilot cluster, you also need to set `provider.gke.autopilot` to `true`.
+
 Then apply it with:
 
 ```txt
@@ -129,25 +168,74 @@
 $ helm uninstall pool-1
 ```
 
 The following table list the configurable parameters of the chart.
 
+| **Parameter Name** | **Description** |
+|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `inferencePool.apiVersion` | The API version of the InferencePool resource. Defaults to `inference.networking.k8s.io/v1`. This can be changed to `inference.networking.x-k8s.io/v1alpha2` to support older API versions. |
+| `inferencePool.targetPortNumber` | Target port number for the vllm backends, will be used to scrape metrics by the inference extension. Defaults to 8000. |
+| `inferencePool.modelServerType` | Type of the model servers in the pool, valid options are [vllm, triton-tensorrt-llm], default is vllm. |
+| `inferencePool.modelServers.matchLabels` | Label selector to match vllm backends managed by the inference pool. |
+| `inferenceExtension.replicas` | Number of replicas for the endpoint picker extension service. If more than one replica is used, EPP will run in HA active-passive mode. Defaults to `1`. |
+| `inferenceExtension.image.name` | Name of the container image used for the endpoint picker. |
+| `inferenceExtension.image.hub` | Registry URL where the endpoint picker image is hosted. |
+| `inferenceExtension.image.tag` | Image tag of the endpoint picker. |
+| `inferenceExtension.image.pullPolicy` | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`. |
+| `inferenceExtension.env` | List of environment variables to set in the endpoint picker container as free-form YAML. Defaults to `[]`. |
+| `inferenceExtension.extraContainerPorts` | List of additional container ports to expose. Defaults to `[]`. |
+| `inferenceExtension.extraServicePorts` | List of additional service ports to expose. Defaults to `[]`. |
+| `inferenceExtension.flags` | List of flags which are passed through to the endpoint picker. Example flags: enable-pprof, grpc-port, etc. Refer to [runner.go](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/cmd/epp/runner/runner.go) for the complete list. |
+| `inferenceExtension.affinity` | Affinity for the endpoint picker. Defaults to `{}`. |
+| `inferenceExtension.tolerations` | Tolerations for the endpoint picker. Defaults to `[]`. |
+| `inferenceExtension.monitoring.interval` | Metrics scraping interval for monitoring. Defaults to `10s`. |
+| `inferenceExtension.monitoring.prometheus.enabled` | Enable Prometheus ServiceMonitor creation for EPP metrics collection. Defaults to `false`. |
+| `inferenceExtension.monitoring.gke.enabled` | **DEPRECATED**: This field is deprecated and will be removed in the next release. Enable GKE monitoring resources (`PodMonitoring` and RBAC). Defaults to `false`. |
+| `inferenceExtension.monitoring.prometheus.auth.enabled` | Enable auth for the Prometheus metrics endpoint. Defaults to `true`. |
+| `inferenceExtension.monitoring.prometheus.auth.secretName` | Name of the service account token secret for metrics authentication. Defaults to `inference-gateway-sa-metrics-reader-secret`. |
+| `inferenceExtension.monitoring.prometheus.extraLabels` | Extra labels added to the ServiceMonitor. |
+| `inferenceExtension.pluginsCustomConfig` | Custom config that is passed to EPP as inline yaml. |
+| `inferenceExtension.tracing.enabled` | Enables or disables OpenTelemetry tracing globally for the EndpointPicker. |
+| `inferenceExtension.tracing.otelExporterEndpoint` | OpenTelemetry collector endpoint. |
+| `inferenceExtension.tracing.sampling.sampler` | The trace sampler to use. Currently, only `parentbased_traceidratio` is supported. This sampler respects the parent span's sampling decision when present, and applies the configured ratio for root spans. |
+| `inferenceExtension.tracing.sampling.samplerArg` | Sampler-specific argument. For `parentbased_traceidratio`, this defines the base sampling rate for new traces (root spans), as a float string in the range [0.0, 1.0]. For example, "0.1" enables 10% sampling. |
+| `provider.name` | Name of the Inference Gateway implementation being used. Possible values: [`none`, `gke`, or `istio`]. Defaults to `none`. |
+| `provider.gke.autopilot` | Set to `true` if the cluster is a GKE Autopilot cluster. This is only used if `provider.name` is `gke`. Defaults to `false`. |
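+
+For example, a minimal sketch of selecting the older alpha API through `inferencePool.apiVersion` (the release name and label are reused from the examples above):
+
+```txt
+helm install vllm-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+  --set inferencePool.apiVersion=inference.networking.x-k8s.io/v1alpha2 \
+  oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
+```
+
+When a non-default API version is set, the deployment template also passes the corresponding `--pool-group` flag to the EPP (see the epp-deployment.yaml change below).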
+
+### Provider Specific Configuration
+
+This section documents any Gateway provider specific configuration values.
+
+#### GKE
+
+These are the options available to you with `provider.name` set to `gke`:
+
 | **Parameter Name** | **Description** |
 |---------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
-| `inferencePool.targetPortNumber` | Target port number for the vllm backends, will be used to scrape metrics by the inference extension. Defaults to 8000. |
-| `inferencePool.modelServerType` | Type of the model servers in the pool, valid options are [vllm, triton-tensorrt-llm], default is vllm. |
-| `inferencePool.modelServers.matchLabels` | Label selector to match vllm backends managed by the inference pool. |
-| `inferenceExtension.replicas` | Number of replicas for the endpoint picker extension service. Defaults to `1`. |
-| `inferenceExtension.image.name` | Name of the container image used for the endpoint picker. |
-| `inferenceExtension.image.hub` | Registry URL where the endpoint picker image is hosted. |
-| `inferenceExtension.image.tag` | Image tag of the endpoint picker. |
-| `inferenceExtension.image.pullPolicy` | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`. |
-| `inferenceExtension.env` | List of environment variables to set in the endpoint picker container as free-form YAML. Defaults to `[]`. |
-| `inferenceExtension.extraContainerPorts` | List of additional container ports to expose. Defaults to `[]`. |
-| `inferenceExtension.extraServicePorts` | List of additional service ports to expose. Defaults to `[]`. |
-| `inferenceExtension.flags` | List of flags which are passed through to endpoint picker. Example flags, enable-pprof, grpc-port etc. Refer [runner.go](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/cmd/epp/runner/runner.go) for complete list. |
-| `inferenceExtension.affinity` | Affinity for the endpoint picker. Defaults to `{}`. |
-| `inferenceExtension.tolerations` | Tolerations for the endpoint picker. Defaults to `[]`. |
-| `inferenceExtension.flags.has-enable-leader-election` | Enable leader election for high availability. When enabled, only one EPP pod (the leader) will be ready to serve traffic. |
-| `inferenceExtension.pluginsCustomConfig` | Custom config that is passed to EPP as inline yaml. |
-| `provider.name` | Name of the Inference Gateway implementation being used. Possible values: `gke`. Defaults to `none`. |
+| `gke.monitoringSecret.name` | The name of the monitoring secret to be used. Defaults to `inference-gateway-sa-metrics-reader-secret`. |
+| `gke.monitoringSecret.namespace` | The namespace that the monitoring secret lives in. Defaults to `default`. |
+
+
+#### Istio
+
+These are the options available to you with `provider.name` set to `istio`:
+
+| **Parameter Name** | **Description** |
+|---------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
+| `istio.destinationRule.host` | Custom host value for the destination rule. If not set, this will use the default value, which is derived from the EPP service name and release namespace to generate a valid service address. |
+| `istio.destinationRule.trafficPolicy.connectionPool` | Configure the connectionPool level settings of the traffic policy. |
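+
+As a sketch, these values might be overridden like so (the host and connection pool settings below are illustrative assumptions, not chart defaults):
+
+```yaml
+provider:
+  name: istio
+
+istio:
+  destinationRule:
+    host: my-epp-service.my-namespace.svc.cluster.local
+    trafficPolicy:
+      connectionPool:
+        http:
+          http2MaxRequests: 1000
+```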
To enable trace collection, use the following configuration:
+```yaml
+inferenceExtension:
+  tracing:
+    enabled: true
+    otelExporterEndpoint: "http://localhost:4317"
+    sampling:
+      sampler: "parentbased_traceidratio"
+      samplerArg: "0.1"
+```
+Make sure that `otelExporterEndpoint` points to your OpenTelemetry collector endpoint.
+Currently, only the `parentbased_traceidratio` sampler is supported. You can adjust the base sampling ratio using `samplerArg` (e.g., 0.1 means 10% of traces will be sampled).
 
 ## Notes
 
diff --git a/config/charts/inferencepool/templates/_helpers.tpl b/config/charts/inferencepool/templates/_helpers.tpl
index e011bb7c1..fdc9b1a2b 100644
--- a/config/charts/inferencepool/templates/_helpers.tpl
+++ b/config/charts/inferencepool/templates/_helpers.tpl
@@ -16,6 +16,15 @@ Inference extension name
 {{ $base }}-epp
 {{- end -}}
 
+{{/*
+Cluster RBAC unique name
+*/}}
+{{- define "gateway-api-inference-extension.cluster-rbac-name" -}}
+{{- $base := .Release.Name | default "default-pool" | lower | trim | trunc 40 }}
+{{- $ns := .Release.Namespace | default "default" | lower | trim | trunc 40 }}
+{{- printf "%s-%s-epp" $base $ns | quote | trunc 84 }}
+{{- end -}}
+
 {{/*
 Selector labels
 */}}
diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml
index f01699a96..10eb2907a 100644
--- a/config/charts/inferencepool/templates/epp-deployment.yaml
+++ b/config/charts/inferencepool/templates/epp-deployment.yaml
@@ -7,6 +7,14 @@ metadata:
     {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
 spec:
   replicas: {{ .Values.inferenceExtension.replicas | default 1 }}
+  strategy:
+    # The currently recommended EPP deployment pattern is a single active replica. This ensures
+    # optimal performance of stateful operations such as the prefix-cache-aware scorer.
+    # With the Recreate strategy, the old replica is killed immediately, allowing the new replica(s)
+    # to take over quickly. This is particularly important in the high-availability setup with
+    # leader election, where a rolling update would prevent the old leader from being killed,
+    # since maxUnavailable would otherwise be 100%.
+    type: Recreate
   selector:
     matchLabels:
       {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 6 }}
@@ -25,16 +33,19 @@ spec:
         args:
         - --pool-name
         - {{ .Release.Name }}
+        # The pool namespace is optional because EPP can default to the NAMESPACE env var.
+        # We still keep this here so that the template works with older versions of EPP, or with
+        # other distros of EPP that may not implement the NAMESPACE env var defaulting behavior.
         - --pool-namespace
        - {{ .Release.Namespace }}
+        {{- if ne .Values.inferencePool.apiVersion "inference.networking.k8s.io" }}
+        - --pool-group
+        - "{{ (split "/" .Values.inferencePool.apiVersion)._0 }}"
+        {{- end }}
        - --zap-encoder
        - "json"
        - --config-file
        - "/config/{{ .Values.inferenceExtension.pluginsConfigFile }}"
-        {{- range .Values.inferenceExtension.flags }}
-        - "--{{ .name }}"
-        - "{{ .value }}"
-        {{- end }}
        {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
        - --total-queued-requests-metric
        - "nv_trt_llm_request_metrics{request_type=waiting}"
@@ -43,6 +54,23 @@ spec:
        - --lora-info-metric
        - "" # Set an empty metric to disable LoRA metric scraping as they are not supported by Triton yet.
{{- end }} + {{- if gt (.Values.inferenceExtension.replicas | int) 1 }} + - --ha-enable-leader-election + {{- end }} + # Pass additional flags via the inferenceExtension.flags field in values.yaml. + {{- range .Values.inferenceExtension.flags }} + - "--{{ .name }}" + - "{{ .value }}" + {{- end }} + - "--tracing" + {{- if .Values.inferenceExtension.tracing.enabled }} + - "true" + {{- else }} + - "false" + {{- end }} + {{- if not .Values.inferenceExtension.monitoring.prometheus.enabled }} + - --metrics-endpoint-auth=false + {{- end }} ports: - name: grpc containerPort: 9002 @@ -54,7 +82,7 @@ spec: {{- toYaml .Values.inferenceExtension.extraContainerPorts | nindent 8 }} {{- end }} livenessProbe: - {{- if .Values.inferenceExtension.enableLeaderElection }} + {{- if gt (.Values.inferenceExtension.replicas | int) 1 }} grpc: port: 9003 service: liveness @@ -66,7 +94,7 @@ spec: initialDelaySeconds: 5 periodSeconds: 10 readinessProbe: - {{- if .Values.inferenceExtension.enableLeaderElection }} + {{- if gt (.Values.inferenceExtension.replicas | int) 1 }} grpc: port: 9003 service: readiness @@ -75,10 +103,38 @@ spec: port: 9003 service: inference-extension {{- end }} - initialDelaySeconds: 5 - periodSeconds: 10 - {{- if .Values.inferenceExtension.env }} + periodSeconds: 2 + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + {{- if .Values.inferenceExtension.tracing.enabled }} + - name: OTEL_SERVICE_NAME + value: "gateway-api-inference-extension" + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: {{ .Values.inferenceExtension.tracing.otelExporterEndpoint | quote }} + - name: OTEL_TRACES_EXPORTER + value: "otlp" + - name: OTEL_RESOURCE_ATTRIBUTES_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + - name: OTEL_RESOURCE_ATTRIBUTES_POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: OTEL_RESOURCE_ATTRIBUTES + value: 'k8s.namespace.name=$(NAMESPACE),k8s.node.name=$(OTEL_RESOURCE_ATTRIBUTES_NODE_NAME),k8s.pod.name=$(OTEL_RESOURCE_ATTRIBUTES_POD_NAME)' + - name: OTEL_TRACES_SAMPLER + value: {{ .Values.inferenceExtension.tracing.sampling.sampler | quote }} + - name: OTEL_TRACES_SAMPLER_ARG + value: {{ .Values.inferenceExtension.tracing.sampling.samplerArg | quote }} + {{- end }} + {{- if .Values.inferenceExtension.env }} {{- toYaml .Values.inferenceExtension.env | nindent 8 }} {{- end }} volumeMounts: diff --git a/config/charts/inferencepool/templates/epp-sa-token-secret.yaml b/config/charts/inferencepool/templates/epp-sa-token-secret.yaml new file mode 100644 index 000000000..16d935f96 --- /dev/null +++ b/config/charts/inferencepool/templates/epp-sa-token-secret.yaml @@ -0,0 +1,12 @@ +{{- if and .Values.inferenceExtension.monitoring.prometheus.enabled .Values.inferenceExtension.monitoring.prometheus.auth.enabled (ne (lower .Values.provider.name) "gke") }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Values.inferenceExtension.monitoring.prometheus.auth.secretName }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} + annotations: + kubernetes.io/service-account.name: {{ include "gateway-api-inference-extension.name" . 
}} +type: kubernetes.io/service-account-token +{{- end }} \ No newline at end of file diff --git a/config/charts/inferencepool/templates/epp-servicemonitor.yaml b/config/charts/inferencepool/templates/epp-servicemonitor.yaml new file mode 100644 index 000000000..220be76dc --- /dev/null +++ b/config/charts/inferencepool/templates/epp-servicemonitor.yaml @@ -0,0 +1,30 @@ +{{- if and .Values.inferenceExtension.monitoring.prometheus.enabled (ne (lower .Values.provider.name) "gke") }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "gateway-api-inference-extension.name" . }}-monitor + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} + {{- with .Values.inferenceExtension.monitoring.prometheus.extraLabels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + endpoints: + - interval: {{ .Values.inferenceExtension.monitoring.interval }} + port: "http-metrics" + path: "/metrics" + {{- if .Values.inferenceExtension.monitoring.prometheus.auth.enabled }} + authorization: + credentials: + key: token + name: {{ .Values.inferenceExtension.monitoring.prometheus.auth.secretName }} + {{- end }} + jobLabel: {{ include "gateway-api-inference-extension.name" . }} + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + selector: + matchLabels: + {{- include "gateway-api-inference-extension.labels" . | nindent 6 }} +{{- end }} diff --git a/config/charts/inferencepool/templates/gke.yaml b/config/charts/inferencepool/templates/gke.yaml index 92010c0d0..a2d8bbc87 100644 --- a/config/charts/inferencepool/templates/gke.yaml +++ b/config/charts/inferencepool/templates/gke.yaml @@ -9,10 +9,14 @@ metadata: {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} spec: targetRef: - group: "inference.networking.k8s.io" + group: "{{ (split "/" .Values.inferencePool.apiVersion)._0 }}" kind: InferencePool name: {{ .Release.Name }} default: + # Set a more aggressive health check than the default 5s for faster switch + # over during EPP rollout. + timeoutSec: 2 + checkIntervalSec: 2 config: type: HTTP httpHealthCheck: @@ -28,7 +32,7 @@ metadata: {{- include "gateway-api-inference-extension.labels" . 
| nindent 4 }} spec: targetRef: - group: "inference.networking.k8s.io" + group: "{{ (split "/" .Values.inferencePool.apiVersion)._0 }}" kind: InferencePool name: {{ .Release.Name }} default: @@ -36,26 +40,107 @@ spec: logging: enabled: true # log all requests by default --- +{{- if or .Values.inferenceExtension.monitoring.gke.enabled (and .Values.inferenceExtension.monitoring.prometheus.enabled .Values.inferenceExtension.monitoring.prometheus.auth.enabled) }} +{{- $metricsReadSA := printf "%s-metrics-reader-sa" .Release.Name -}} +{{- $metricsReadSecretName := printf "%s-metrics-reader-secret" .Release.Name -}} +{{- $metricsReadRoleName := printf "%s-%s-metrics-reader" .Release.Namespace .Release.Name -}} +{{- $metricsReadRoleBindingName := printf "%s-%s-metrics-reader-role-binding" .Release.Namespace .Release.Name -}} +{{- $secretReadRoleName := printf "%s-metrics-reader-secret-read" .Release.Name -}} +{{- $gmpNamespace := "gmp-system" -}} +{{- $isAutopilot := false -}} +{{- with .Values.provider.gke }} + {{- $isAutopilot = .autopilot | default false -}} +{{- end }} +{{- if $isAutopilot -}} +{{- $gmpNamespace = "gke-gmp-system" -}} +{{- end -}} +{{- $gmpCollectorRoleBindingName := printf "%s:collector:%s-%s-metrics-reader-secret-read" $gmpNamespace .Release.Namespace .Release.Name -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ $metricsReadSA }} + namespace: {{ .Release.Namespace }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ $metricsReadSecretName }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} + annotations: + kubernetes.io/service-account.name: {{ $metricsReadSA }} +type: kubernetes.io/service-account-token +--- apiVersion: monitoring.googleapis.com/v1 -kind: ClusterPodMonitoring +kind: PodMonitoring metadata: - name: {{ .Release.Namespace }}-{{ .Release.Name }} + name: {{ .Release.Name }} + namespace: {{ .Release.Namespace }} labels: {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} spec: endpoints: - port: metrics scheme: http - interval: 5s + interval: {{ .Values.inferenceExtension.monitoring.interval }} path: /metrics authorization: type: Bearer credentials: secret: - name: {{ .Values.gke.monitoringSecret.name }} + name: {{ $metricsReadSecretName }} key: token - namespace: {{ .Values.gke.monitoringSecret.namespace }} selector: matchLabels: {{- include "gateway-api-inference-extension.selectorLabels" . 
| nindent 8 }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ $metricsReadRoleName }} +rules: +- nonResourceURLs: + - /metrics + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ $metricsReadRoleBindingName }} +subjects: +- kind: ServiceAccount + name: {{ $metricsReadSA }} + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: {{ $metricsReadRoleName }} + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ $secretReadRoleName }} +rules: +- resources: + - secrets + apiGroups: [""] + verbs: ["get", "list", "watch"] + resourceNames: [{{ $metricsReadSecretName | quote }}] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ $gmpCollectorRoleBindingName }} + namespace: {{ .Release.Namespace }} +roleRef: + name: {{ $secretReadRoleName }} + kind: Role + apiGroup: rbac.authorization.k8s.io +subjects: +- name: collector + namespace: {{ $gmpNamespace }} + kind: ServiceAccount +{{- end }} {{- end }} diff --git a/config/charts/inferencepool/templates/istio.yaml b/config/charts/inferencepool/templates/istio.yaml new file mode 100644 index 000000000..b50c0b021 --- /dev/null +++ b/config/charts/inferencepool/templates/istio.yaml @@ -0,0 +1,16 @@ +{{- if eq .Values.provider.name "istio" }} +apiVersion: networking.istio.io/v1beta1 +kind: DestinationRule +metadata: + name: {{ include "gateway-api-inference-extension.name" . }} +spec: + host: {{ .Values.istio.destinationRule.host | default (printf "%s.%s.svc.cluster.local" (include "gateway-api-inference-extension.name" .) .Release.Namespace) }} + trafficPolicy: + tls: + mode: SIMPLE + insecureSkipVerify: true + {{- if .Values.istio.destinationRule.trafficPolicy.connectionPool }} + connectionPool: + {{- .Values.istio.destinationRule.trafficPolicy.connectionPool | toYaml | nindent 6 }} + {{- end }} +{{- end }} diff --git a/config/charts/inferencepool/templates/leader-election-rbac.yaml b/config/charts/inferencepool/templates/leader-election-rbac.yaml index 923bdd6f4..11b3dd516 100644 --- a/config/charts/inferencepool/templates/leader-election-rbac.yaml +++ b/config/charts/inferencepool/templates/leader-election-rbac.yaml @@ -1,4 +1,4 @@ -{{- if .Values.inferenceExtension.enableLeaderElection }} +{{- if gt (.Values.inferenceExtension.replicas | int) 1 }} --- kind: Role apiVersion: rbac.authorization.k8s.io/v1 diff --git a/config/charts/inferencepool/templates/rbac.yaml b/config/charts/inferencepool/templates/rbac.yaml index a8d891c32..ebe68c3ea 100644 --- a/config/charts/inferencepool/templates/rbac.yaml +++ b/config/charts/inferencepool/templates/rbac.yaml @@ -1,7 +1,7 @@ kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: {{ include "gateway-api-inference-extension.name" . }} + name: {{ include "gateway-api-inference-extension.cluster-rbac-name" . }} labels: {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} rules: @@ -17,11 +17,17 @@ rules: - subjectaccessreviews verbs: - create +{{- if .Values.inferenceExtension.monitoring.prometheus.enabled }} +- nonResourceURLs: + - "/metrics" + verbs: + - get +{{- end }} --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: {{ include "gateway-api-inference-extension.name" . }} + name: {{ include "gateway-api-inference-extension.cluster-rbac-name" . 
}} subjects: - kind: ServiceAccount name: {{ include "gateway-api-inference-extension.name" . }} @@ -29,7 +35,7 @@ subjects: roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: {{ include "gateway-api-inference-extension.name" . }} + name: {{ include "gateway-api-inference-extension.cluster-rbac-name" . }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: Role @@ -40,9 +46,9 @@ metadata: {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} rules: - apiGroups: ["inference.networking.x-k8s.io"] - resources: ["inferenceobjectives", "inferencepools"] + resources: ["inferenceobjectives"] verbs: ["get", "watch", "list"] -- apiGroups: ["inference.networking.k8s.io"] +- apiGroups: ["{{ (split "/" .Values.inferencePool.apiVersion)._0 }}"] resources: ["inferencepools"] verbs: ["get", "watch", "list"] - apiGroups: [""] diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index d45e6ed39..8b3385ab1 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -40,6 +40,29 @@ inferenceExtension: tolerations: [] + # Monitoring configuration for EPP + monitoring: + interval: "10s" + # Prometheus ServiceMonitor will be created when enabled for EPP metrics collection + prometheus: + enabled: false + auth: + enabled: true + # Service account token secret for authentication + secretName: inference-gateway-sa-metrics-reader-secret + # additional labels for the ServiceMonitor + extraLabels: {} + + # DEPRECATED: The 'gke' configuration will be removed in the next release. + gke: + enabled: false + tracing: + enabled: false + otelExporterEndpoint: "http://localhost:4317" + sampling: + sampler: "parentbased_traceidratio" + samplerArg: "0.1" + inferencePool: targetPorts: - number: 8000 @@ -53,10 +76,22 @@ inferencePool: # This will soon be deprecated when upstream GW providers support v1, just doing something simple for now. targetPortNumber: 8000 +# Options: ["gke", "istio", "none"] provider: name: none -gke: - monitoringSecret: - name: inference-gateway-sa-metrics-reader-secret - namespace: default + # GKE-specific configuration. + # This block is only used if name is "gke". + gke: + # Set to true if the cluster is an Autopilot cluster. + autopilot: false + +istio: + destinationRule: + # Provide a way to override the default calculated host + host: "" + # Optional: Enables customization of the traffic policy + trafficPolicy: {} + # connectionPool: + # http: + # maxRequestsPerConnection: 256000 \ No newline at end of file diff --git a/config/crd/bases/inference.networking.k8s.io_inferencepools.yaml b/config/crd/bases/inference.networking.k8s.io_inferencepools.yaml index ed325bea6..a3f769633 100644 --- a/config/crd/bases/inference.networking.k8s.io_inferencepools.yaml +++ b/config/crd/bases/inference.networking.k8s.io_inferencepools.yaml @@ -147,6 +147,8 @@ spec: MatchLabels contains a set of required {key,value} pairs. An object must match every label in this map to be selected. The matching logic is an AND operation on all entries. + maxProperties: 64 + minProperties: 1 type: object required: - matchLabels @@ -154,7 +156,8 @@ spec: targetPorts: description: |- TargetPorts defines a list of ports that are exposed by this InferencePool. - Currently, the list may only include a single port definition. + Every port will be treated as a distinctive endpoint by EPP, + addressable as a 'podIP:portNumber' combination. 
items: description: Port defines the network port that will be exposed by this InferencePool. @@ -170,10 +173,13 @@ spec: required: - number type: object - maxItems: 1 + maxItems: 8 minItems: 1 type: array x-kubernetes-list-type: atomic + x-kubernetes-validations: + - message: port number must be unique + rule: self.all(p1, self.exists_one(p2, p1.number==p2.number)) required: - endpointPickerRef - selector @@ -267,6 +273,25 @@ spec: x-kubernetes-list-map-keys: - type x-kubernetes-list-type: map + controllerName: + description: |- + ControllerName is a domain/path string that indicates the name of the controller that + wrote this status. This corresponds with the GatewayClass controllerName field when the + parentRef references a Gateway kind. + + Example: "example.net/gateway-controller". + + The format of this field is DOMAIN "/" PATH, where DOMAIN and PATH are valid Kubernetes names: + + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + + Controllers MAY populate this field when writing status. When populating this field, controllers + should ensure that entries to status populated with their ControllerName are cleaned up when they + are no longer necessary. + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$ + type: string parentRef: description: |- ParentRef is used to identify the parent resource that this status diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencepoolimports.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencepoolimports.yaml new file mode 100644 index 000000000..318e6d46f --- /dev/null +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencepoolimports.yaml @@ -0,0 +1,330 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + inference.networking.k8s.io/bundle-version: main-dev + name: inferencepoolimports.inference.networking.x-k8s.io +spec: + group: inference.networking.x-k8s.io + names: + kind: InferencePoolImport + listKind: InferencePoolImportList + plural: inferencepoolimports + shortNames: + - infpimp + singular: inferencepoolimport + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: InferencePoolImport is the Schema for the InferencePoolImports + API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + status: + description: Status defines the observed state of the InferencePoolImport. + properties: + controllers: + description: Controllers is a list of controllers that are responsible + for managing the InferencePoolImport. + items: + description: ImportController defines a controller that is responsible + for managing the InferencePoolImport. + properties: + conditions: + description: |- + Conditions track the state of the InferencePoolImport. 
+ + Known condition types are: + + * "Accepted" + items: + description: Condition contains details for one aspect of + the current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, + Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + exportingClusters: + description: |- + ExportingClusters is a list of clusters that exported the InferencePool(s) that back the + InferencePoolImport. Required when the controller is responsible for CRUD'ing the InferencePoolImport + from the exported InferencePool(s). + items: + description: ExportingCluster defines a cluster that exported + the InferencePool that backs this InferencePoolImport. + properties: + name: + description: Name of the exporting cluster (must be unique + within the list). + maxLength: 253 + minLength: 1 + type: string + required: + - name + type: object + type: array + name: + description: |- + Name is a domain/path string that indicates the name of the controller that manages the + InferencePoolImport. Name corresponds to the GatewayClass controllerName field when the + controller will manage parents of type "Gateway". Otherwise, the name is implementation-specific. + + Example: "example.net/import-controller". + + The format of this field is DOMAIN "/" PATH, where DOMAIN and PATH are valid Kubernetes + names (https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names). + + A controller MUST populate this field when writing status and ensure that entries to status + populated with their controller name are removed when they are no longer necessary. 
+ maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$ + type: string + parents: + description: |- + Parents is a list of parent resources, typically Gateways, that are associated with the + InferencePoolImport, and the status of the InferencePoolImport with respect to each parent. + + Ancestor would be a more accurate name, but Parent is consistent with InferencePool terminology. + + Required when the controller manages the InferencePoolImport as an HTTPRoute backendRef. The controller + must add an entry for each parent it manages and remove the parent entry when the controller no longer + considers the InferencePoolImport to be associated with that parent. + items: + description: ParentStatus defines the observed state of InferencePool + from a Parent, i.e. Gateway. + properties: + conditions: + description: |- + Conditions is a list of status conditions that provide information about the observed + state of the InferencePool. This field is required to be set by the controller that + manages the InferencePool. + + Supported condition types are: + + * "Accepted" + * "ResolvedRefs" + items: + description: Condition contains details for one aspect + of the current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, + False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in + foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + controllerName: + description: |- + ControllerName is a domain/path string that indicates the name of the controller that + wrote this status. This corresponds with the GatewayClass controllerName field when the + parentRef references a Gateway kind. + + Example: "example.net/gateway-controller". 
+ + The format of this field is DOMAIN "/" PATH, where DOMAIN and PATH are valid Kubernetes names: + + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + + Controllers MAY populate this field when writing status. When populating this field, controllers + should ensure that entries to status populated with their ControllerName are cleaned up when they + are no longer necessary. + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$ + type: string + parentRef: + description: |- + ParentRef is used to identify the parent resource that this status + is associated with. It is used to match the InferencePool with the parent + resource, such as a Gateway. + properties: + group: + default: gateway.networking.k8s.io + description: |- + Group is the group of the referent API object. When unspecified, the referent is assumed + to be in the "gateway.networking.k8s.io" API group. + maxLength: 253 + minLength: 0 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: Gateway + description: |- + Kind is the kind of the referent API object. When unspecified, the referent is assumed + to be a "Gateway" kind. + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + description: Name is the name of the referent API + object. + maxLength: 253 + minLength: 1 + type: string + namespace: + description: |- + Namespace is the namespace of the referenced object. When unspecified, the local + namespace is inferred. + + Note that when a namespace different than the local namespace is specified, + a ReferenceGrant object is required in the referent namespace to allow that + namespace's owner to accept the reference. 
See the ReferenceGrant + documentation for details: https://gateway-api.sigs.k8s.io/api-types/referencegrant/ + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + required: + - name + type: object + required: + - parentRef + type: object + type: array + x-kubernetes-list-type: atomic + required: + - name + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + required: + - controllers + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: null + storedVersions: null diff --git a/config/manifests/bbr-example/httproute_bbr.yaml b/config/manifests/bbr-example/httproute_bbr.yaml new file mode 100644 index 000000000..8702546dc --- /dev/null +++ b/config/manifests/bbr-example/httproute_bbr.yaml @@ -0,0 +1,51 @@ +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-llama-route +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: vllm-llama3-8b-instruct + matches: + - path: + type: PathPrefix + value: / + headers: + - type: Exact + name: X-Gateway-Model-Name + value: 'meta-llama/Llama-3.1-8B-Instruct' + timeouts: + request: 300s +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-phi4-route +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: vllm-phi4-mini-instruct + matches: + - path: + type: PathPrefix + value: / + headers: + - type: Exact + name: X-Gateway-Model-Name + value: 'microsoft/Phi-4-mini-instruct' + timeouts: + request: 300s +--- diff --git a/config/manifests/bbr-example/vllm-phi4-mini.yaml b/config/manifests/bbr-example/vllm-phi4-mini.yaml new file mode 100644 index 000000000..7f7827cb9 --- /dev/null +++ b/config/manifests/bbr-example/vllm-phi4-mini.yaml @@ -0,0 +1,88 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: phi4-mini + namespace: default +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi + volumeMode: Filesystem +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: phi4-mini + namespace: default + labels: + app: phi4-mini +spec: + replicas: 1 + selector: + matchLabels: + app: phi4-mini + template: + metadata: + labels: + app: phi4-mini + spec: + volumes: + - name: cache-volume + persistentVolumeClaim: + claimName: phi4-mini + containers: + - name: phi4-mini + image: vllm/vllm-openai:latest + command: ["/bin/sh", "-c"] + args: [ + "vllm serve microsoft/Phi-4-mini-instruct --trust-remote-code --enable-chunked-prefill" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: token + ports: + - containerPort: 8000 + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + volumeMounts: + - mountPath: /root/.cache/huggingface + name: cache-volume + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 600 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 600 + periodSeconds: 5 +--- +apiVersion: v1 +kind: Service +metadata: + name: phi4-mini + namespace: default +spec: + ports: + - name: http-phi4-mini + port: 80 + protocol: TCP + targetPort: 8000 
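+  # Port 80 is the Service port; traffic is forwarded to targetPort 8000, where the vLLM server listens.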
+  # The label selector should match the Deployment labels; it is also used by the prefix-caching feature.
+  selector:
+    app: phi4-mini
+  sessionAffinity: None
+  type: ClusterIP
+
diff --git a/config/manifests/gateway/agentgateway/httproute.yaml b/config/manifests/gateway/agentgateway/httproute.yaml
index 18e90ced6..18c450708 100644
--- a/config/manifests/gateway/agentgateway/httproute.yaml
+++ b/config/manifests/gateway/agentgateway/httproute.yaml
@@ -9,7 +9,7 @@ spec:
       name: inference-gateway
   rules:
   - backendRefs:
-    - group: inference.networking.x-k8s.io
+    - group: inference.networking.k8s.io
       kind: InferencePool
       name: vllm-llama3-8b-instruct
     matches:
diff --git a/config/manifests/gateway/envoyaigateway/gateway.yaml b/config/manifests/gateway/envoyaigateway/gateway.yaml
new file mode 100644
index 000000000..1a536411b
--- /dev/null
+++ b/config/manifests/gateway/envoyaigateway/gateway.yaml
@@ -0,0 +1,17 @@
+apiVersion: gateway.networking.k8s.io/v1
+kind: GatewayClass
+metadata:
+  name: envoy-ai-gateway
+spec:
+  controllerName: gateway.envoyproxy.io/gatewayclass-controller
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: Gateway
+metadata:
+  name: inference-gateway
+spec:
+  gatewayClassName: envoy-ai-gateway
+  listeners:
+    - name: http
+      protocol: HTTP
+      port: 80
diff --git a/config/manifests/gateway/envoyaigateway/httproute.yaml b/config/manifests/gateway/envoyaigateway/httproute.yaml
new file mode 100644
index 000000000..e685940fe
--- /dev/null
+++ b/config/manifests/gateway/envoyaigateway/httproute.yaml
@@ -0,0 +1,20 @@
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-route
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-llama3-8b-instruct
+    matches:
+    - path:
+        type: PathPrefix
+        value: /
+    timeouts:
+      request: 300s
diff --git a/config/manifests/gateway/istio/httproute.yaml b/config/manifests/gateway/istio/httproute.yaml
index 18e90ced6..18c450708 100644
--- a/config/manifests/gateway/istio/httproute.yaml
+++ b/config/manifests/gateway/istio/httproute.yaml
@@ -9,7 +9,7 @@ spec:
       name: inference-gateway
   rules:
   - backendRefs:
-    - group: inference.networking.x-k8s.io
+    - group: inference.networking.k8s.io
       kind: InferencePool
       name: vllm-llama3-8b-instruct
     matches:
diff --git a/config/manifests/gateway/kgateway/httproute.yaml b/config/manifests/gateway/kgateway/httproute.yaml
index 18e90ced6..18c450708 100644
--- a/config/manifests/gateway/kgateway/httproute.yaml
+++ b/config/manifests/gateway/kgateway/httproute.yaml
@@ -9,7 +9,7 @@ spec:
       name: inference-gateway
   rules:
   - backendRefs:
-    - group: inference.networking.x-k8s.io
+    - group: inference.networking.k8s.io
      kind: InferencePool
      name: vllm-llama3-8b-instruct
    matches:
diff --git a/config/manifests/inferencepool-resources-lp.yaml b/config/manifests/inferencepool-resources-lp.yaml
index e7c58afb5..31db1813e 100644
--- a/config/manifests/inferencepool-resources-lp.yaml
+++ b/config/manifests/inferencepool-resources-lp.yaml
@@ -367,21 +367,20 @@ data:
       plugins:
       - type: queue-scorer
       - type: kv-cache-utilization-scorer
-      - type: slo-request-tracker
-      - type: slo-scorer
+      - type: slo-aware-routing
       - type: slo-aware-profile-handler
       - type: max-score-picker
       schedulingProfiles:
       - name: default
         plugins:
-        - pluginRef: slo-request-tracker
+        - pluginRef: slo-aware-routing
+          weight: 0
        - pluginRef: queue-scorer
        - pluginRef: kv-cache-utilization-scorer
        - pluginRef:
max-score-picker
      - name: slo
        plugins:
-        - pluginRef: slo-request-tracker
-        - pluginRef: slo-scorer
+        - pluginRef: slo-aware-routing
        - pluginRef: max-score-picker
 ---
 # --- RBAC ---
diff --git a/config/manifests/vllm/sim-deployment.yaml b/config/manifests/vllm/sim-deployment.yaml
index 17b689112..2415c1066 100644
--- a/config/manifests/vllm/sim-deployment.yaml
+++ b/config/manifests/vllm/sim-deployment.yaml
@@ -14,7 +14,7 @@ spec:
     spec:
       containers:
       - name: vllm-sim
-        image: ghcr.io/llm-d/llm-d-inference-sim:v0.3.0
+        image: ghcr.io/llm-d/llm-d-inference-sim:v0.5.0
         imagePullPolicy: Always
         args:
         - --model
diff --git a/conformance/conformance.go b/conformance/conformance.go
index 5d94197a8..7b3c32cfd 100644
--- a/conformance/conformance.go
+++ b/conformance/conformance.go
@@ -208,7 +208,7 @@ func RunConformanceWithOptions(t *testing.T, opts confsuite.ConformanceOptions)
 	installedCRDs := &apiextensionsv1.CustomResourceDefinitionList{}
 	err = opts.Client.List(ctx, installedCRDs)
 	require.NoError(t, err, "error getting installedCRDs")
-	apiVersion, err := getGatewayInferenceExtentionVersion(installedCRDs.Items)
+	apiVersion, err := getGatewayInferenceExtensionVersion(installedCRDs.Items)
 	if err != nil {
 		if opts.AllowCRDsMismatch {
 			apiVersion = "UNDEFINED"
@@ -266,7 +266,7 @@ func SetupConformanceTestSuite(ctx context.Context, t *testing.T, suite *confsui
 	ensureGatewayAvailableAndReady(ctx, t, suite.Client, opts, resources.SecondaryGatewayNN)
 }
 
-func getGatewayInferenceExtentionVersion(crds []apiextensionsv1.CustomResourceDefinition) (string, error) {
+func getGatewayInferenceExtensionVersion(crds []apiextensionsv1.CustomResourceDefinition) (string, error) {
 	var inferenceVersion string
 	for _, crd := range crds {
 		v, okv := crd.Annotations[version.BundleVersionAnnotation]
diff --git a/conformance/reports/v1.0.0/gateway/kgateway/README.md b/conformance/reports/v1.0.0/gateway/kgateway/README.md
new file mode 100644
index 000000000..c64981f35
--- /dev/null
+++ b/conformance/reports/v1.0.0/gateway/kgateway/README.md
@@ -0,0 +1,78 @@
+# Kgateway
+
+This guide provides the steps for running Gateway conformance tests against [kgateway](https://kgateway.dev/) with the default
+([Envoy](https://www.envoyproxy.io/)) data plane.
+
+## Table of Contents
+
+| Extension Version Tested | Profile Tested | Implementation Version | Mode | Report |
+|--------------------------|----------------|------------------------|---------|----------------------------------------------------------------------------|
+| v1.0.0 | Gateway | v2.1.0-main | default | [v2.1.0-main report](./inference-v2.1.0-main-report.yaml) |
+
+## Reproduce
+
+This is a mirror of the kgateway [inference conformance GHA workflow](https://github.com/kgateway-dev/kgateway/blob/v2.0.x/.github/actions/kube-inference-extension-conformance-tests/action.yaml).
+
+### Prerequisites
+
+In order to run the conformance tests, the following prerequisites must be met:
+
+- The [kubectl](https://kubernetes.io/docs/tasks/tools/) command-line tool installed and configured for the active cluster context.
+- The [helm](https://github.com/helm/helm), [git](https://git-scm.com/downloads), and [make](https://www.gnu.org/software/make/) command-line tools installed.
+
+### Steps
+
+1. Set the environment variables used by the following steps:
+
+   ```sh
+   # The kgateway version
+   export VERSION=v2.1.0-main
+   # Skip building and loading the kgateway images
+   export SKIP_DOCKER=true
+   # Install Gateway API and Inference Extension CRDs
+   export CONFORMANCE=true
+   ```
+
+2.
Clone the kgateway repository and check out the release:
+
+   ```sh
+   git clone -b $VERSION https://github.com/kgateway-dev/kgateway.git && cd kgateway
+   ```
+
+3. Create a KinD cluster:
+
+   ```sh
+   make kind-setup
+   ```
+
+4. Install the kgateway CRDs:
+
+   ```sh
+   helm upgrade -i --create-namespace --namespace kgateway-system \
+   --version $VERSION kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds
+   ```
+
+5. Install kgateway with Inference Extension enabled:
+
+   ```sh
+   helm upgrade -i --namespace kgateway-system --version $VERSION \
+   kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway --set inferenceExtension.enabled=true
+   ```
+
+6. Wait for the kgateway rollout to complete:
+
+   ```sh
+   kubectl rollout status deploy/kgateway -n kgateway-system
+   ```
+
+7. Run the conformance tests:
+
+   ```sh
+   make gie-conformance
+   ```
+
+8. View and verify the conformance report:
+
+   ```sh
+   cat _test/conformance/inference-$VERSION-report.yaml
+   ```
diff --git a/conformance/reports/v1.0.0/gateway/kgateway/agentgateway/README.md b/conformance/reports/v1.0.0/gateway/kgateway/agentgateway/README.md
new file mode 100644
index 000000000..7f35d2b2a
--- /dev/null
+++ b/conformance/reports/v1.0.0/gateway/kgateway/agentgateway/README.md
@@ -0,0 +1,79 @@
+# Kgateway with agentgateway
+
+This guide provides the steps for running Gateway conformance tests against [kgateway](https://kgateway.dev/) with the
+[agentgateway](https://agentgateway.dev/) data plane.
+
+## Table of Contents
+
+| Extension Version Tested | Profile Tested | Implementation Version | Mode | Report |
+|--------------------------|----------------|------------------------|---------|----------------------------------------------------------------------------|
+| v1.0.0 | Gateway | v2.1.0-main | default | [v2.1.0-main report](./inference-v2.1.0-main-report.yaml) |
+
+## Reproduce
+
+This is a mirror of the kgateway [conformance test](../README.md) with the default (Envoy) data plane.
+
+### Prerequisites
+
+In order to run the conformance tests, the following prerequisites must be met:
+
+- The [kubectl](https://kubernetes.io/docs/tasks/tools/) command-line tool installed and configured for the active cluster context.
+- The [helm](https://github.com/helm/helm), [git](https://git-scm.com/downloads), and [make](https://www.gnu.org/software/make/) command-line tools installed.
+
+### Steps
+
+1. Set the environment variables used by the following steps:
+
+   ```sh
+   # The kgateway version
+   export VERSION=v2.1.0-main
+   # Skip building and loading the kgateway images
+   export SKIP_DOCKER=true
+   # Install Gateway API and Inference Extension CRDs
+   export CONFORMANCE=true
+   ```
+
+2. Clone the kgateway repository and check out the release:
+
+   ```sh
+   git clone -b $VERSION https://github.com/kgateway-dev/kgateway.git && cd kgateway
+   ```
+
+3. Create a KinD cluster:
+
+   ```sh
+   make kind-setup
+   ```
+
+4. Install the kgateway CRDs:
+
+   ```sh
+   helm upgrade -i --create-namespace --namespace kgateway-system \
+   --version $VERSION kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds
+   ```
+
+5. Install kgateway with Inference Extension and agentgateway enabled:
+
+   ```sh
+   helm upgrade -i --namespace kgateway-system --version $VERSION \
+   kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway \
+   --set inferenceExtension.enabled=true --set agentGateway.enabled=true
+   ```
+
+6.
Wait for the kgateway rollout to complete: + + ```sh + kubectl rollout status deploy/kgateway -n kgateway-system + ``` + +7. Run the conformance tests: + + ```sh + CONFORMANCE_GATEWAY_CLASS=agentgateway make gie-conformance + ``` + +8. View and verify the conformance report: + + ```sh + cat _test/conformance/inference-$VERSION-report.yaml + ``` diff --git a/conformance/reports/v1.0.0/gateway/kgateway/agentgateway/inference-v2.1.0-main-report.yaml b/conformance/reports/v1.0.0/gateway/kgateway/agentgateway/inference-v2.1.0-main-report.yaml new file mode 100644 index 000000000..9e4063b5c --- /dev/null +++ b/conformance/reports/v1.0.0/gateway/kgateway/agentgateway/inference-v2.1.0-main-report.yaml @@ -0,0 +1,23 @@ +GatewayAPIInferenceExtensionVersion: v1.0.0 +apiVersion: gateway.networking.k8s.io/v1 +date: "2025-09-14T10:03:01-07:01" +gatewayAPIChannel: experimental +gatewayAPIVersion: v1.3.0 +implementation: + contact: + - github.com/kgateway-dev/kgateway/issues/new/choose + organization: kgateway-dev + project: kgateway + url: github.com/kgateway-dev/kgateway + version: v2.1.0-main +kind: ConformanceReport +mode: default +profiles: +- core: + result: success + statistics: + Failed: 0 + Passed: 9 + Skipped: 0 + name: Gateway + summary: Core tests succeeded. diff --git a/conformance/reports/v1.0.0/gateway/kgateway/inference-v2.1.0-main-report.yaml b/conformance/reports/v1.0.0/gateway/kgateway/inference-v2.1.0-main-report.yaml new file mode 100644 index 000000000..3bee5ac54 --- /dev/null +++ b/conformance/reports/v1.0.0/gateway/kgateway/inference-v2.1.0-main-report.yaml @@ -0,0 +1,23 @@ +GatewayAPIInferenceExtensionVersion: v1.0.0 +apiVersion: gateway.networking.k8s.io/v1 +date: "2025-09-12T11:05:01-07:00" +gatewayAPIChannel: experimental +gatewayAPIVersion: v1.3.0 +implementation: + contact: + - github.com/kgateway-dev/kgateway/issues/new/choose + organization: kgateway-dev + project: kgateway + url: github.com/kgateway-dev/kgateway + version: v2.1.0-main +kind: ConformanceReport +mode: default +profiles: +- core: + result: success + statistics: + Failed: 0 + Passed: 9 + Skipped: 0 + name: Gateway + summary: Core tests succeeded. diff --git a/conformance/resources/base.yaml b/conformance/resources/base.yaml index 7f4a7fa20..2e1b378c3 100644 --- a/conformance/resources/base.yaml +++ b/conformance/resources/base.yaml @@ -7,16 +7,16 @@ apiVersion: v1 kind: Namespace metadata: - name: gateway-conformance-infra + name: inference-conformance-infra labels: - gateway-conformance: infra + inference-conformance: infra --- apiVersion: v1 kind: Namespace metadata: - name: gateway-conformance-app-backend + name: inference-conformance-app-backend labels: - gateway-conformance: backend + inference-conformance: backend --- # A basic Gateway resource that allows HTTPRoutes from the same namespace. # Tests can use this as a parent reference for routes that target InferencePools. @@ -24,7 +24,7 @@ apiVersion: gateway.networking.k8s.io/v1 kind: Gateway metadata: name: conformance-primary - namespace: gateway-conformance-infra + namespace: inference-conformance-infra spec: gatewayClassName: "{GATEWAY_CLASS_NAME}" listeners: @@ -42,7 +42,7 @@ apiVersion: gateway.networking.k8s.io/v1 kind: Gateway metadata: name: conformance-secondary - namespace: gateway-conformance-infra + namespace: inference-conformance-infra spec: gatewayClassName: "{GATEWAY_CLASS_NAME}" listeners: @@ -55,14 +55,14 @@ spec: from: All ### The following defines the essential resources for the gateway conformance test. 
-### All resources are created in the 'gateway-conformance-app-backend' namespace. +### All resources are created in the 'inference-conformance-app-backend' namespace. --- # Deploys a mock backend service to act as a model server. apiVersion: apps/v1 kind: Deployment metadata: name: primary-inference-model-server-deployment - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend labels: app: primary-inference-model-server spec: @@ -106,7 +106,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: secondary-inference-model-server-deployment - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend labels: app: secondary-inference-model-server spec: @@ -150,7 +150,7 @@ apiVersion: inference.networking.k8s.io/v1 kind: InferencePool metadata: name: primary-inference-pool - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: selector: matchLabels: @@ -167,7 +167,7 @@ apiVersion: v1 kind: Service metadata: name: primary-endpoint-picker-svc - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: selector: app: primary-app-backend-epp @@ -183,7 +183,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: primary-app-endpoint-picker - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend labels: app: primary-app-backend-epp spec: @@ -200,13 +200,13 @@ spec: terminationGracePeriodSeconds: 130 containers: - name: epp - image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main + image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v1.0.0 imagePullPolicy: Always args: - --pool-name - "primary-inference-pool" - --pool-namespace - - "gateway-conformance-app-backend" + - "inference-conformance-app-backend" - --v - "4" - --zap-encoder @@ -247,7 +247,7 @@ apiVersion: inference.networking.k8s.io/v1 kind: InferencePool metadata: name: secondary-inference-pool - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: selector: matchLabels: @@ -265,7 +265,7 @@ apiVersion: v1 kind: Service metadata: name: secondary-endpoint-picker-svc - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: selector: app: secondary-app-backend-epp @@ -281,7 +281,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: secondary-app-endpoint-picker - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend labels: app: secondary-app-backend-epp spec: @@ -298,13 +298,13 @@ spec: terminationGracePeriodSeconds: 130 containers: - name: epp - image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main + image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v1.0.0 imagePullPolicy: Always args: - --pool-name - "secondary-inference-pool" - --pool-namespace - - "gateway-conformance-app-backend" + - "inference-conformance-app-backend" - --v - "4" - --zap-encoder @@ -344,7 +344,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: plugins-config - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend data: conformance-plugins.yaml: | apiVersion: inference.networking.x-k8s.io/v1alpha1 @@ -361,7 +361,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: name: inference-model-reader - namespace: gateway-conformance-app-backend + namespace: 
inference-conformance-app-backend rules: - apiGroups: ["inference.networking.x-k8s.io"] resources: ["inferenceobjectives", "inferencepools"] @@ -377,11 +377,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: epp-to-inference-model-reader - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend subjects: - kind: ServiceAccount name: default - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend roleRef: kind: Role name: inference-model-reader diff --git a/conformance/resources/resourcename.go b/conformance/resources/resourcename.go index 4c0c7b4c2..4ac6437fe 100644 --- a/conformance/resources/resourcename.go +++ b/conformance/resources/resourcename.go @@ -19,8 +19,8 @@ package resources import "k8s.io/apimachinery/pkg/types" const ( - AppBackendNamespace = "gateway-conformance-app-backend" - InfraNamespace = "gateway-conformance-infra" + AppBackendNamespace = "inference-conformance-app-backend" + InfraNamespace = "inference-conformance-infra" PrimaryGatewayName = "conformance-primary" SecondaryGatewayName = "conformance-secondary" diff --git a/conformance/scripts/istio/Makefile b/conformance/scripts/istio/Makefile index 309d8c459..cfce3cc9c 100644 --- a/conformance/scripts/istio/Makefile +++ b/conformance/scripts/istio/Makefile @@ -3,7 +3,7 @@ GATEWAY_API_VERSION ?= v1.3.0 INFERENCE_EXTENSION_VERSION ?= v0.4.0 ISTIO_VERSION ?= 1.27-alpha.0551127f00634403cddd4634567e65a8ecc499a7 -ISTIO_HUB ?= +ISTIO_HUB ?= ISTIO_PROFILE ?= minimal # Conformance test variables @@ -72,7 +72,7 @@ apiVersion: networking.istio.io/v1 kind: DestinationRule metadata: name: primary-endpoint-picker-tls - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: host: primary-endpoint-picker-svc trafficPolicy: @@ -84,7 +84,7 @@ apiVersion: networking.istio.io/v1 kind: DestinationRule metadata: name: secondary-endpoint-picker-tls - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: host: secondary-endpoint-picker-svc trafficPolicy: @@ -432,7 +432,7 @@ setup-crds: setup-gateway-api-crds setup-inference-extension-crds # Setup TLS for EPP setup-tls: @echo "Setting up TLS for EPP..." - -kubectl create namespace gateway-conformance-app-backend || true + -kubectl create namespace inference-conformance-app-backend || true $(file >/tmp/tls-destination-rules.yaml,$(TLS_DESTINATION_RULES)) kubectl apply -f /tmp/tls-destination-rules.yaml @rm -f /tmp/tls-destination-rules.yaml @@ -518,7 +518,7 @@ readme-update: $(REPORT_BASE_DIR)/README.md # Clean up resources clean: @echo "Cleaning up..." - kubectl delete namespace gateway-conformance-app-backend --ignore-not-found=true + kubectl delete namespace inference-conformance-app-backend --ignore-not-found=true @echo "Cleaning up downloaded istioctl binaries..." 
@rm -f $(ISTIOCTL_DIR)/istioctl-* @echo "Note: If using minikube, run 'minikube delete' to completely clean up" diff --git a/conformance/tests/epp_unavailable_fail_open.go b/conformance/tests/epp_unavailable_fail_open.go index 9a831b11b..7f9089abf 100644 --- a/conformance/tests/epp_unavailable_fail_open.go +++ b/conformance/tests/epp_unavailable_fail_open.go @@ -22,13 +22,13 @@ import ( "github.com/stretchr/testify/require" "k8s.io/apimachinery/pkg/types" + gwhttp "sigs.k8s.io/gateway-api/conformance/utils/http" "sigs.k8s.io/gateway-api/conformance/utils/suite" "sigs.k8s.io/gateway-api/pkg/features" "sigs.k8s.io/gateway-api-inference-extension/conformance/resources" "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/config" k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes" - trafficutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/traffic" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/test" ) @@ -69,19 +69,24 @@ var EppUnAvailableFailOpen = suite.ConformanceTest{ targetPodIP := pods[0].Status.PodIP t.Run("Phase 1: Verify baseline connectivity with EPP available", func(t *testing.T) { t.Log("Sending request to ensure the Gateway and EPP are working correctly...") - trafficutils.MakeRequestAndExpectSuccess( + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse( t, s.RoundTripper, s.TimeoutConfig, gwAddr, - trafficutils.Request{ - Host: hostname, - Path: path, - Headers: map[string]string{ - test.HeaderTestEppEndPointSelectionKey: targetPodIP, + gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: hostname, + Path: path, + Headers: map[string]string{ + test.HeaderTestEppEndPointSelectionKey: targetPodIP, + }, + Method: http.MethodPost, + Body: requestBody, + }, + Response: gwhttp.Response{ + StatusCodes: []int{http.StatusOK}, }, - Method: http.MethodPost, - Body: requestBody, Backend: pods[0].Name, // Make sure the request is from the targetPod when the EPP is alive. Namespace: resources.AppBackendNamespace, }, @@ -96,19 +101,24 @@ var EppUnAvailableFailOpen = suite.ConformanceTest{ require.NoError(t, err, "Failed to make the EPP service %v unavailable", resources.PrimaryEppServiceNN) t.Log("Sending request again, expecting success to verify fail-open...") - trafficutils.MakeRequestAndExpectSuccess( + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse( t, s.RoundTripper, s.TimeoutConfig, gwAddr, - trafficutils.Request{ - Host: hostname, - Path: path, - Headers: map[string]string{ - test.HeaderTestEppEndPointSelectionKey: targetPodIP, + gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: hostname, + Path: path, + Headers: map[string]string{ + test.HeaderTestEppEndPointSelectionKey: targetPodIP, + }, + Method: http.MethodPost, + Body: requestBody, + }, + Response: gwhttp.Response{ + StatusCodes: []int{http.StatusOK}, }, - Method: http.MethodPost, - Body: requestBody, Backend: appPodBackendPrefix, // Only checks the prefix since the EPP is not alive and the response can return from any Pod. 
Namespace: resources.AppBackendNamespace, }, diff --git a/conformance/tests/epp_unavailable_fail_open.yaml b/conformance/tests/epp_unavailable_fail_open.yaml index 9388a017d..df3a6f0f5 100644 --- a/conformance/tests/epp_unavailable_fail_open.yaml +++ b/conformance/tests/epp_unavailable_fail_open.yaml @@ -2,13 +2,13 @@ apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute metadata: name: httproute-for-failopen-pool-gw - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: parentRefs: - group: gateway.networking.k8s.io kind: Gateway name: conformance-secondary - namespace: gateway-conformance-infra + namespace: inference-conformance-infra sectionName: http hostnames: - "secondary.example.com" diff --git a/conformance/tests/gateway_following_epp_routing.go b/conformance/tests/gateway_following_epp_routing.go index c320ad35e..df8af8ca5 100644 --- a/conformance/tests/gateway_following_epp_routing.go +++ b/conformance/tests/gateway_following_epp_routing.go @@ -32,7 +32,6 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/conformance/resources" k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes" - "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/traffic" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/test" ) @@ -86,19 +85,24 @@ var GatewayFollowingEPPRouting = suite.ConformanceTest{ for i := 0; i < len(pods); i++ { // Send an initial request targeting a single pod and wait for it to be successful to ensure the Gateway and EPP // are functioning correctly before running the main test cases. - traffic.MakeRequestAndExpectSuccess( + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse( t, s.RoundTripper, s.TimeoutConfig, gwAddr, - traffic.Request{ - Host: hostname, - Path: path, - Headers: map[string]string{ - test.HeaderTestEppEndPointSelectionKey: podIPs[i], + gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: hostname, + Path: path, + Method: http.MethodPost, + Body: requestBody, + Headers: map[string]string{ + test.HeaderTestEppEndPointSelectionKey: podIPs[i], + }, + }, + Response: gwhttp.Response{ + StatusCodes: []int{http.StatusOK}, }, - Method: http.MethodPost, - Body: requestBody, Backend: podNames[i], Namespace: resources.AppBackendNamespace, }, @@ -142,21 +146,21 @@ var GatewayFollowingEPPRouting = suite.ConformanceTest{ Host: hostname, Path: path, Method: http.MethodPost, + Body: requestBody, Headers: headers, }, Response: gwhttp.Response{ StatusCode: http.StatusOK, }, - // DO NOT SUBMIT Backend: appPodBackendPrefix, Namespace: resources.AppBackendNamespace, - }, requestBody, tc.expectAllRequestsRoutedWithinPodNames) + }, tc.expectAllRequestsRoutedWithinPodNames) }) } }, } -func assertTrafficOnlyReachesToExpectedPods(t *testing.T, suite *suite.ConformanceTestSuite, gwAddr string, expected gwhttp.ExpectedResponse, requestBody string, expectedPodNames []string) { +func assertTrafficOnlyReachesToExpectedPods(t *testing.T, suite *suite.ConformanceTestSuite, gwAddr string, expected gwhttp.ExpectedResponse, expectedPodNames []string) { t.Helper() const ( concurrentRequests = 10 @@ -170,11 +174,11 @@ func assertTrafficOnlyReachesToExpectedPods(t *testing.T, suite *suite.Conforman g.SetLimit(concurrentRequests) for i := 0; i < totalRequests; i++ { g.Go(func() error { - cReq, cRes, err := traffic.MakeCallRoundTripper(t, roundTripper, &traffic.RequestWithBody{Request: req, Body: strings.NewReader(requestBody)}) + cReq, cRes, err := 
roundTripper.CaptureRoundTrip(req) if err != nil { return fmt.Errorf("failed to roundtrip request: %w", err) } - if err := gwhttp.CompareRequest(t, &req, cReq, cRes, expected); err != nil { + if err := gwhttp.CompareRoundTrip(t, &req, cReq, cRes, expected); err != nil { return fmt.Errorf("response expectation failed for request: %w", err) } diff --git a/conformance/tests/gateway_following_epp_routing.yaml b/conformance/tests/gateway_following_epp_routing.yaml index c4db6386e..d0f441af7 100644 --- a/conformance/tests/gateway_following_epp_routing.yaml +++ b/conformance/tests/gateway_following_epp_routing.yaml @@ -2,13 +2,13 @@ apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute metadata: name: httproute-for-primary-gw - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: parentRefs: - group: gateway.networking.k8s.io kind: Gateway name: conformance-primary - namespace: gateway-conformance-infra + namespace: inference-conformance-infra sectionName: http hostnames: - "primary.example.com" @@ -21,4 +21,3 @@ spec: - path: type: PathPrefix value: /primary-gateway-test - \ No newline at end of file diff --git a/conformance/tests/gateway_weighted_two_pools.go b/conformance/tests/gateway_weighted_two_pools.go new file mode 100644 index 000000000..2414522a3 --- /dev/null +++ b/conformance/tests/gateway_weighted_two_pools.go @@ -0,0 +1,223 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package tests + +import ( + "fmt" + "math" + "net/http" + "strings" + "sync/atomic" + "testing" + + "github.com/stretchr/testify/require" + "golang.org/x/sync/errgroup" + "k8s.io/apimachinery/pkg/types" + gwhttp "sigs.k8s.io/gateway-api/conformance/utils/http" + "sigs.k8s.io/gateway-api/conformance/utils/suite" + "sigs.k8s.io/gateway-api/pkg/features" + + "sigs.k8s.io/gateway-api-inference-extension/conformance/resources" + k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/test" +) + +func init() { + ConformanceTests = append(ConformanceTests, GatewayWeightedAcrossTwoInferencePools) +} + +// GatewayWeightedAcrossTwoInferencePools verifies that Gateway splits traffic across two +// InferencePools according to backendRef weights, and that each request is routed to an +// endpoint of the selected InferencePool. 
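+// The split is verified statistically: the primary pool's observed share of the 200 sampled
+// requests must fall within max(10 percentage points, 3 sigma of the binomial proportion) of
+// the configured 70/30 weighting.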
+var GatewayWeightedAcrossTwoInferencePools = suite.ConformanceTest{ + ShortName: "GatewayWeightedAcrossTwoInferencePools", + Description: "Gateway should split traffic across two InferencePools based on backendRef weights and route only to endpoints of the selected InferencePool", + Manifests: []string{"tests/gateway_weighted_two_pools.yaml"}, + Features: []features.FeatureName{ + features.SupportGateway, + features.FeatureName("SupportInferencePool"), + }, + Test: func(t *testing.T, s *suite.ConformanceTestSuite) { + const ( + hostname = "primary.example.com" + path = "/weighted-two-pools-test" + + // Sample size so the weight signal dominates random noise. + totalRequests = 200 + concurrentRequests = 5 + + // These route weights must match the test manifest. + primaryWeight = 70 + secondaryWeight = 30 + ) + + // Objects under test. + httpRouteNN := types.NamespacedName{Name: "httproute-weighted-two-pools", Namespace: resources.AppBackendNamespace} + gatewayNN := resources.PrimaryGatewayNN + primaryPoolNN := resources.PrimaryInferencePoolNN + secondaryPoolNN := types.NamespacedName{Name: "secondary-inference-pool", Namespace: resources.AppBackendNamespace} + + // Labels for the two deployments defined in base.yaml. + primaryLabels := map[string]string{"app": "primary-inference-model-server"} + secondaryLabels := map[string]string{"app": "secondary-inference-model-server"} + + t.Log("Verifying HTTPRoute and both InferencePools are accepted and the Gateway has an address.") + k8sutils.HTTPRouteMustBeAcceptedAndResolved(t, s.Client, s.TimeoutConfig, httpRouteNN, gatewayNN) + k8sutils.InferencePoolMustBeAcceptedByParent(t, s.Client, primaryPoolNN, gatewayNN) + k8sutils.InferencePoolMustBeAcceptedByParent(t, s.Client, secondaryPoolNN, gatewayNN) + gwAddr := k8sutils.GetGatewayEndpoint(t, s.Client, s.TimeoutConfig, gatewayNN) + + // Discover pods for each pool and build quick lookup sets. + t.Logf("Fetching primary backend pods with labels: %v", primaryLabels) + primaryPods, err := k8sutils.GetPodsWithLabel(t, s.Client, resources.AppBackendNamespace, primaryLabels, s.TimeoutConfig) + require.NoError(t, err) + require.Len(t, primaryPods, 3) // base.yaml uses 3 replicas + + t.Logf("Fetching secondary backend pods with labels: %v", secondaryLabels) + secondaryPods, err := k8sutils.GetPodsWithLabel(t, s.Client, resources.AppBackendNamespace, secondaryLabels, s.TimeoutConfig) + require.NoError(t, err) + require.Len(t, secondaryPods, 3) // base.yaml uses 3 replicas + + primaryPodNames := make([]string, 0, len(primaryPods)) + primaryPodIPs := make([]string, 0, len(primaryPods)) + for _, p := range primaryPods { + require.NotEmpty(t, p.Status.PodIP, "primary pod %s has no IP yet", p.Name) + primaryPodNames = append(primaryPodNames, p.Name) + primaryPodIPs = append(primaryPodIPs, p.Status.PodIP) + } + + secondaryPodNames := make([]string, 0, len(secondaryPods)) + secondaryPodIPs := make([]string, 0, len(secondaryPods)) + for _, p := range secondaryPods { + require.NotEmpty(t, p.Status.PodIP, "secondary pod %s has no IP yet", p.Name) + secondaryPodNames = append(secondaryPodNames, p.Name) + secondaryPodIPs = append(secondaryPodIPs, p.Status.PodIP) + } + + // Send one targeted request per backend Pod to ensure EPP readiness. + allIPs := append(append([]string{}, primaryPodIPs...), secondaryPodIPs...) + allNames := append(append([]string{}, primaryPodNames...), secondaryPodNames...) 
+ for i := 0; i < len(allIPs); i++ { + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse( + t, + s.RoundTripper, + s.TimeoutConfig, + gwAddr, + gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: hostname, + Path: path, + Method: http.MethodPost, + Body: `{"model":"conformance-fake-model","prompt":"Warmup"}`, + Headers: map[string]string{ + test.HeaderTestEppEndPointSelectionKey: allIPs[i], + }, + }, + Response: gwhttp.Response{ + StatusCodes: []int{http.StatusOK}, + }, + Backend: allNames[i], + Namespace: resources.AppBackendNamespace, + }, + ) + } + + // Provide a union list of eligible endpoints for the test. Each pool's EPP + // should filter to endpoints that actually belong to its pool. + eppHeaderValue := strings.Join(allIPs, ",") + + requestBody := `{ + "model": "conformance-fake-model", + "prompt": "Write as if you were a critic: San Francisco" + }` + + // Build quick lookup sets for attributing each hit to a pool by backend pod name. + primarySet := make(map[string]struct{}, len(primaryPodNames)) + for _, n := range primaryPodNames { + primarySet[n] = struct{}{} + } + secondarySet := make(map[string]struct{}, len(secondaryPodNames)) + for _, n := range secondaryPodNames { + secondarySet[n] = struct{}{} + } + + headers := map[string]string{ + test.HeaderTestEppEndPointSelectionKey: eppHeaderValue, + } + expected := gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: hostname, + Path: path, + Method: http.MethodPost, + Headers: headers, + Body: requestBody, + }, + Response: gwhttp.Response{ + StatusCode: http.StatusOK, + }, + Namespace: resources.AppBackendNamespace, + } + req := gwhttp.MakeRequest(t, &expected, gwAddr, "HTTP", "http") + + var primaryHits, secondaryHits atomic.Int64 + var g errgroup.Group + g.SetLimit(concurrentRequests) + + for i := 0; i < totalRequests; i++ { + g.Go(func() error { + cReq, cRes, err := s.RoundTripper.CaptureRoundTrip(req) + if err != nil { + return fmt.Errorf("failed to roundtrip request: %w", err) + } + if err := gwhttp.CompareRoundTrip(t, &req, cReq, cRes, expected); err != nil { + return fmt.Errorf("response expectation failed: %w", err) + } + + // Attribute response to pool by backend pod name. + if _, ok := primarySet[cReq.Pod]; ok { + primaryHits.Add(1) + } else if _, ok := secondarySet[cReq.Pod]; ok { + secondaryHits.Add(1) + } else { + return fmt.Errorf("request was handled by unexpected pod %q (not in either pool)", cReq.Pod) + } + return nil + }) + } + require.NoError(t, g.Wait(), "requests failed") + + ph := float64(primaryHits.Load()) + sh := float64(secondaryHits.Load()) + total := ph + sh + require.Equal(t, int64(totalRequests), int64(total), "sum of hits must equal number of attempts") + require.Greater(t, total, 0.0) + + observedPrimary := ph / total + expectedPrimary := float64(primaryWeight) / float64(primaryWeight+secondaryWeight) + + // Allow either a 10 percentage-point absolute error, or a 3-sigma binomial CI. 
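+ // sigma is the standard deviation of the observed primary share under the configured split;
+ // with n=200 and p=0.7, sigma = sqrt(0.7*0.3/200) ≈ 0.032, so 3*sigma ≈ 0.097 and the
+ // 10-percentage-point floor applies.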
+ sigma := math.Sqrt(expectedPrimary * (1.0 - expectedPrimary) / total) + absTolerance := math.Max(0.10, 3.0*sigma) + + diff := math.Abs(observedPrimary - expectedPrimary) + require.LessOrEqualf(t, diff, absTolerance, + "weighted split out of bounds: observed primary=%.3f (hits=%d/%d), expected=%.3f, tolerance=±%.3f", + observedPrimary, int64(ph), int64(total), expectedPrimary, absTolerance) + t.Logf("Weighted split OK: primary=%.3f (hits=%d/%d), expected=%.3f, tolerance=±%.3f; secondary hits=%d", + observedPrimary, int64(ph), int64(total), expectedPrimary, absTolerance, int64(sh)) + }, +} diff --git a/conformance/tests/gateway_weighted_two_pools.yaml b/conformance/tests/gateway_weighted_two_pools.yaml new file mode 100644 index 000000000..2142514ea --- /dev/null +++ b/conformance/tests/gateway_weighted_two_pools.yaml @@ -0,0 +1,30 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: httproute-weighted-two-pools + namespace: inference-conformance-app-backend +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: conformance-primary + namespace: inference-conformance-infra + sectionName: http + hostnames: + - "primary.example.com" + rules: + - matches: + - path: + type: PathPrefix + value: /weighted-two-pools-test + backendRefs: + # 70% of traffic goes to the primary pool + - group: inference.networking.k8s.io + kind: InferencePool + name: primary-inference-pool + weight: 70 + # 30% of traffic goes to the secondary pool + - group: inference.networking.k8s.io + kind: InferencePool + name: secondary-inference-pool + weight: 30 diff --git a/conformance/tests/httproute_invalid_inferencepool_ref.yaml b/conformance/tests/httproute_invalid_inferencepool_ref.yaml index 15e7ad597..a0954cd4a 100644 --- a/conformance/tests/httproute_invalid_inferencepool_ref.yaml +++ b/conformance/tests/httproute_invalid_inferencepool_ref.yaml @@ -2,13 +2,13 @@ apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute metadata: name: httproute-to-non-existent-pool - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: parentRefs: - group: gateway.networking.k8s.io kind: Gateway name: conformance-primary - namespace: gateway-conformance-infra + namespace: inference-conformance-infra sectionName: http rules: - backendRefs: diff --git a/conformance/tests/httproute_multiple_gateways_different_pools.go b/conformance/tests/httproute_multiple_gateways_different_pools.go index 19a3cbb23..04137ef92 100644 --- a/conformance/tests/httproute_multiple_gateways_different_pools.go +++ b/conformance/tests/httproute_multiple_gateways_different_pools.go @@ -21,11 +21,11 @@ import ( "testing" "k8s.io/apimachinery/pkg/types" + gwhttp "sigs.k8s.io/gateway-api/conformance/utils/http" "sigs.k8s.io/gateway-api/conformance/utils/suite" "sigs.k8s.io/gateway-api-inference-extension/conformance/resources" k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes" - "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/traffic" ) func init() { @@ -65,17 +65,21 @@ var HTTPRouteMultipleGatewaysDifferentPools = suite.ConformanceTest{ primaryGwAddr := k8sutils.GetGatewayEndpoint(t, s.Client, s.TimeoutConfig, primaryGatewayNN) - traffic.MakeRequestAndExpectEventuallyConsistentResponse( + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse( t, s.RoundTripper, s.TimeoutConfig, primaryGwAddr, - traffic.Request{ - Host: primaryRouteHostname, - Path: primaryRoutePath, - ExpectedStatusCode: http.StatusOK, - Backend: 
primaryBackendPodName, - Namespace: resources.AppBackendNamespace, + gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: primaryRouteHostname, + Path: primaryRoutePath, + }, + Response: gwhttp.Response{ + StatusCodes: []int{http.StatusOK}, + }, + Backend: primaryBackendPodName, + Namespace: resources.AppBackendNamespace, }, ) }) @@ -91,17 +95,21 @@ var HTTPRouteMultipleGatewaysDifferentPools = suite.ConformanceTest{ secondaryGwAddr := k8sutils.GetGatewayEndpoint(t, s.Client, s.TimeoutConfig, secondaryGatewayNN) - traffic.MakeRequestAndExpectEventuallyConsistentResponse( + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse( t, s.RoundTripper, s.TimeoutConfig, secondaryGwAddr, - traffic.Request{ - Host: secondaryRouteHostname, - Path: secondaryRoutePath, - ExpectedStatusCode: http.StatusOK, - Backend: secondaryBackendPodName, - Namespace: resources.AppBackendNamespace, + gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: secondaryRouteHostname, + Path: secondaryRoutePath, + }, + Response: gwhttp.Response{ + StatusCodes: []int{http.StatusOK}, + }, + Backend: secondaryBackendPodName, + Namespace: resources.AppBackendNamespace, }, ) }) diff --git a/conformance/tests/httproute_multiple_gateways_different_pools.yaml b/conformance/tests/httproute_multiple_gateways_different_pools.yaml index caded16d8..17bf494f8 100644 --- a/conformance/tests/httproute_multiple_gateways_different_pools.yaml +++ b/conformance/tests/httproute_multiple_gateways_different_pools.yaml @@ -3,12 +3,12 @@ apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute metadata: name: route-for-primary-gateway - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: parentRefs: - kind: Gateway name: conformance-primary - namespace: gateway-conformance-infra + namespace: inference-conformance-infra hostnames: - "primary.example.com" rules: @@ -25,12 +25,12 @@ apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute metadata: name: route-for-secondary-gateway - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: parentRefs: - kind: Gateway name: conformance-secondary - namespace: gateway-conformance-infra + namespace: inference-conformance-infra hostnames: - "secondary.example.com" rules: diff --git a/conformance/tests/inferencepool_accepted.yaml b/conformance/tests/inferencepool_accepted.yaml index 59710bae4..b155ae2c9 100644 --- a/conformance/tests/inferencepool_accepted.yaml +++ b/conformance/tests/inferencepool_accepted.yaml @@ -3,20 +3,20 @@ apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute metadata: name: httproute-for-inferencepool-accepted - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: parentRefs: - group: gateway.networking.k8s.io kind: Gateway name: conformance-primary - namespace: gateway-conformance-infra + namespace: inference-conformance-infra sectionName: http rules: - backendRefs: - group: inference.networking.k8s.io kind: InferencePool name: primary-inference-pool - # namespace: gateway-conformance-app-backend - is omitted since it is in the same namespace as HTTPRoute + # namespace: inference-conformance-app-backend - is omitted since it is in the same namespace as HTTPRoute matches: - path: type: PathPrefix diff --git a/conformance/tests/inferencepool_httproute_port_validation.go b/conformance/tests/inferencepool_httproute_port_validation.go index dd3a938d4..35504c2a4 100644 --- a/conformance/tests/inferencepool_httproute_port_validation.go +++ 
b/conformance/tests/inferencepool_httproute_port_validation.go @@ -17,15 +17,16 @@ limitations under the License. package tests import ( + "net/http" "testing" "k8s.io/apimachinery/pkg/types" + gwhttp "sigs.k8s.io/gateway-api/conformance/utils/http" "sigs.k8s.io/gateway-api/conformance/utils/suite" "sigs.k8s.io/gateway-api/pkg/features" "sigs.k8s.io/gateway-api-inference-extension/conformance/resources" k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes" - trafficutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/traffic" ) func init() { @@ -54,14 +55,19 @@ var InferencePoolHTTPRoutePortValidation = suite.ConformanceTest{ k8sutils.HTTPRouteMustBeAcceptedAndResolved(t, s.Client, s.TimeoutConfig, routeNN, gatewayNN) k8sutils.InferencePoolMustBeAcceptedByParent(t, s.Client, poolNN, gatewayNN) - trafficutils.MakeRequestAndExpectSuccess( + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse( t, s.RoundTripper, s.TimeoutConfig, gatewayAddr, - trafficutils.Request{ - Host: hostname, - Path: path, + gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: hostname, + Path: path, + }, + Response: gwhttp.Response{ + StatusCodes: []int{http.StatusOK}, + }, Backend: resources.PrimaryModelServerDeploymentName, Namespace: resources.AppBackendNamespace, }, @@ -76,14 +82,19 @@ var InferencePoolHTTPRoutePortValidation = suite.ConformanceTest{ k8sutils.HTTPRouteMustBeAcceptedAndResolved(t, s.Client, s.TimeoutConfig, routeNN, gatewayNN) k8sutils.InferencePoolMustBeAcceptedByParent(t, s.Client, poolNN, gatewayNN) - trafficutils.MakeRequestAndExpectSuccess( + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse( t, s.RoundTripper, s.TimeoutConfig, gatewayAddr, - trafficutils.Request{ - Host: hostname, - Path: path, + gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: hostname, + Path: path, + }, + Response: gwhttp.Response{ + StatusCodes: []int{http.StatusOK}, + }, Backend: resources.PrimaryModelServerDeploymentName, Namespace: resources.AppBackendNamespace, }, @@ -99,14 +110,19 @@ var InferencePoolHTTPRoutePortValidation = suite.ConformanceTest{ k8sutils.HTTPRouteMustBeAcceptedAndResolved(t, s.Client, s.TimeoutConfig, routeNN, gatewayNN) k8sutils.InferencePoolMustBeAcceptedByParent(t, s.Client, poolNN, gatewayNN) - trafficutils.MakeRequestAndExpectSuccess( + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse( t, s.RoundTripper, s.TimeoutConfig, gatewayAddr, - trafficutils.Request{ - Host: hostname, - Path: path, + gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: hostname, + Path: path, + }, + Response: gwhttp.Response{ + StatusCodes: []int{http.StatusOK}, + }, Backend: resources.PrimaryModelServerDeploymentName, Namespace: resources.AppBackendNamespace, }, diff --git a/conformance/tests/inferencepool_httproute_port_validation.yaml b/conformance/tests/inferencepool_httproute_port_validation.yaml index 9c78117d9..53d8455c3 100644 --- a/conformance/tests/inferencepool_httproute_port_validation.yaml +++ b/conformance/tests/inferencepool_httproute_port_validation.yaml @@ -4,13 +4,13 @@ apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute metadata: name: httproute-pool-port-unspecified - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: parentRefs: - group: gateway.networking.k8s.io kind: Gateway name: conformance-primary - namespace: gateway-conformance-infra + namespace: inference-conformance-infra sectionName: http hostnames: - "port-unspecified.example.com" @@ -30,13 +30,13 @@ 
apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute metadata: name: httproute-pool-port-matching - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: parentRefs: - group: gateway.networking.k8s.io kind: Gateway name: conformance-primary - namespace: gateway-conformance-infra + namespace: inference-conformance-infra sectionName: http hostnames: - "port-matching.example.com" @@ -56,13 +56,13 @@ apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute metadata: name: httproute-pool-port-non-matching - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: parentRefs: - group: gateway.networking.k8s.io kind: Gateway name: conformance-primary - namespace: gateway-conformance-infra + namespace: inference-conformance-infra sectionName: http hostnames: - "port-non-matching.example.com" diff --git a/conformance/tests/inferencepool_invalid_epp_service.go b/conformance/tests/inferencepool_invalid_epp_service.go index ed282b1a7..ff5ce092c 100644 --- a/conformance/tests/inferencepool_invalid_epp_service.go +++ b/conformance/tests/inferencepool_invalid_epp_service.go @@ -17,11 +17,13 @@ limitations under the License. package tests import ( + "net/http" "testing" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" + gwhttp "sigs.k8s.io/gateway-api/conformance/utils/http" "sigs.k8s.io/gateway-api/conformance/utils/kubernetes" "sigs.k8s.io/gateway-api/conformance/utils/suite" "sigs.k8s.io/gateway-api/pkg/features" @@ -29,7 +31,6 @@ import ( inferenceapi "sigs.k8s.io/gateway-api-inference-extension/api/v1" "sigs.k8s.io/gateway-api-inference-extension/conformance/resources" k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes" - trafficutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/traffic" ) func init() { @@ -59,7 +60,7 @@ var InferencePoolInvalidEPPService = suite.ConformanceTest{ Reason: string(gatewayv1.RouteReasonAccepted), } kubernetes.HTTPRouteMustHaveCondition(t, s.Client, s.TimeoutConfig, routeNN, gwNN, acceptedCondition) - t.Run("InferecePool has a ResolvedRefs Condition with status False", func(t *testing.T) { + t.Run("InferencePool has a ResolvedRefs Condition with status False", func(t *testing.T) { acceptedCondition := metav1.Condition{ Type: string(inferenceapi.InferencePoolConditionResolvedRefs), // Standard condition type Status: metav1.ConditionFalse, @@ -69,10 +70,21 @@ var InferencePoolInvalidEPPService = suite.ConformanceTest{ }) t.Run("Request to a route with an invalid backend reference receives a 500 response", func(t *testing.T) { - trafficutils.MakeRequestAndExpectEventuallyConsistentResponse(t, s.RoundTripper, s.TimeoutConfig, gwAddr, trafficutils.Request{ - Path: routePath, - ExpectedStatusCode: 5, // Expecting response status code 5XX. - }) + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse( + t, + s.RoundTripper, + s.TimeoutConfig, + gwAddr, + gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Path: routePath, + }, + Response: gwhttp.Response{ + StatusCodes: []int{http.StatusInternalServerError, http.StatusNotImplemented, http.StatusBadGateway, http.StatusServiceUnavailable, http.StatusGatewayTimeout}, // Expecting response status code 5XX. 
+ }, + Namespace: resources.AppBackendNamespace, + }, + ) }) }, } diff --git a/conformance/tests/inferencepool_invalid_epp_service.yaml b/conformance/tests/inferencepool_invalid_epp_service.yaml index b3dc70e19..3f3b8c49e 100644 --- a/conformance/tests/inferencepool_invalid_epp_service.yaml +++ b/conformance/tests/inferencepool_invalid_epp_service.yaml @@ -2,7 +2,7 @@ apiVersion: inference.networking.k8s.io/v1 kind: InferencePool metadata: name: pool-with-invalid-epp - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: selector: matchLabels: @@ -19,11 +19,11 @@ apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute metadata: name: httproute-for-invalid-epp-pool - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: parentRefs: - name: conformance-primary - namespace: gateway-conformance-infra + namespace: inference-conformance-infra rules: - backendRefs: - name: pool-with-invalid-epp diff --git a/conformance/tests/inferencepool_multiple_rules_different_pools.yaml b/conformance/tests/inferencepool_multiple_rules_different_pools.yaml index 2dd8f4a6e..0539a112e 100644 --- a/conformance/tests/inferencepool_multiple_rules_different_pools.yaml +++ b/conformance/tests/inferencepool_multiple_rules_different_pools.yaml @@ -3,11 +3,11 @@ apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute metadata: name: httproute-multiple-rules-different-pools - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: parentRefs: - name: conformance-primary - namespace: gateway-conformance-infra + namespace: inference-conformance-infra rules: - matches: - path: diff --git a/conformance/tests/inferencepool_resolvedrefs_condition.go b/conformance/tests/inferencepool_resolvedrefs_condition.go index 6ee182448..b8e02b2c6 100644 --- a/conformance/tests/inferencepool_resolvedrefs_condition.go +++ b/conformance/tests/inferencepool_resolvedrefs_condition.go @@ -26,13 +26,13 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" + gwhttp "sigs.k8s.io/gateway-api/conformance/utils/http" "sigs.k8s.io/gateway-api/conformance/utils/suite" "sigs.k8s.io/gateway-api/pkg/features" "sigs.k8s.io/gateway-api-inference-extension/conformance/resources" "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/config" k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes" - trafficutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/traffic" ) func init() { @@ -76,27 +76,37 @@ var InferencePoolParentStatus = suite.ConformanceTest{ k8sutils.InferencePoolMustBeAcceptedByParent(t, s.Client, poolNN, gatewaySecondaryNN) t.Logf("InferencePool %s has parent status Accepted:True as expected with two references.", poolNN.String()) - trafficutils.MakeRequestAndExpectSuccess( + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse( t, s.RoundTripper, s.TimeoutConfig, gwPrimaryAddr, - trafficutils.Request{ - Host: hostnamePrimaryGw, - Path: pathPrimaryGw, + gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: hostnamePrimaryGw, + Path: pathPrimaryGw, + }, + Response: gwhttp.Response{ + StatusCodes: []int{http.StatusOK}, + }, Backend: resources.PrimaryModelServerDeploymentName, Namespace: resources.AppBackendNamespace, }, ) - trafficutils.MakeRequestAndExpectSuccess( + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse( t, s.RoundTripper, s.TimeoutConfig, - gwSecondaryAddr, - 
trafficutils.Request{ - Host: hostnameSecondaryGw, - Path: pathSecondaryGw, + gwPrimaryAddr, + gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: hostnameSecondaryGw, + Path: pathSecondaryGw, + }, + Response: gwhttp.Response{ + StatusCodes: []int{http.StatusNotFound}, + }, Backend: resources.PrimaryModelServerDeploymentName, Namespace: resources.AppBackendNamespace, }, @@ -116,28 +126,37 @@ var InferencePoolParentStatus = suite.ConformanceTest{ k8sutils.InferencePoolMustBeAcceptedByParent(t, s.Client, poolNN, gatewaySecondaryNN) t.Logf("InferencePool %s still has parent status Accepted:True as expected with one reference remaining.", poolNN.String()) - trafficutils.MakeRequestAndExpectSuccess( + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse( t, s.RoundTripper, s.TimeoutConfig, gwSecondaryAddr, - trafficutils.Request{ - Host: hostnameSecondaryGw, - Path: pathSecondaryGw, + gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: hostnameSecondaryGw, + Path: pathSecondaryGw, + }, + Response: gwhttp.Response{ + StatusCodes: []int{http.StatusOK}, + }, Backend: resources.PrimaryModelServerDeploymentName, Namespace: resources.AppBackendNamespace, }, ) - trafficutils.MakeRequestAndExpectEventuallyConsistentResponse( + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse( t, s.RoundTripper, s.TimeoutConfig, gwPrimaryAddr, - trafficutils.Request{ - Host: hostnamePrimaryGw, - Path: pathPrimaryGw, - ExpectedStatusCode: http.StatusNotFound, + gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: hostnamePrimaryGw, + Path: pathPrimaryGw, + }, + Response: gwhttp.Response{ + StatusCodes: []int{http.StatusNotFound}, + }, }, ) }) @@ -155,15 +174,19 @@ var InferencePoolParentStatus = suite.ConformanceTest{ k8sutils.InferencePoolMustHaveNoParents(t, s.Client, poolNN) t.Logf("InferencePool %s correctly shows no parent statuses, indicating it's no longer referenced.", poolNN.String()) - trafficutils.MakeRequestAndExpectEventuallyConsistentResponse( + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse( t, s.RoundTripper, s.TimeoutConfig, - gwSecondaryAddr, - trafficutils.Request{ - Host: hostnameSecondaryGw, - Path: pathSecondaryGw, - ExpectedStatusCode: http.StatusNotFound, + gwPrimaryAddr, + gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: hostnameSecondaryGw, + Path: pathSecondaryGw, + }, + Response: gwhttp.Response{ + StatusCodes: []int{http.StatusNotFound}, + }, }, ) }) diff --git a/conformance/tests/inferencepool_resolvedrefs_condition.yaml b/conformance/tests/inferencepool_resolvedrefs_condition.yaml index 8947168a8..05a2318e9 100644 --- a/conformance/tests/inferencepool_resolvedrefs_condition.yaml +++ b/conformance/tests/inferencepool_resolvedrefs_condition.yaml @@ -8,13 +8,13 @@ apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute metadata: name: httproute-for-primary-gw - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: parentRefs: - group: gateway.networking.k8s.io kind: Gateway name: conformance-primary - namespace: gateway-conformance-infra + namespace: inference-conformance-infra sectionName: http hostnames: - "primary.example.com" @@ -33,13 +33,13 @@ apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute metadata: name: httproute-for-secondary-gw - namespace: gateway-conformance-app-backend + namespace: inference-conformance-app-backend spec: parentRefs: - group: gateway.networking.k8s.io kind: Gateway name: conformance-secondary - namespace: gateway-conformance-infra + namespace: 
inference-conformance-infra sectionName: http hostnames: - "secondary.example.com" diff --git a/conformance/utils/traffic/traffic.go b/conformance/utils/traffic/traffic.go deleted file mode 100644 index f53cc3236..000000000 --- a/conformance/utils/traffic/traffic.go +++ /dev/null @@ -1,325 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package traffic - -import ( - "context" - "encoding/json" - "fmt" - "io" - "net/http" - "net/http/httputil" - "regexp" - "strings" - "testing" - "time" - - gwconfig "sigs.k8s.io/gateway-api/conformance/utils/config" - gwhttp "sigs.k8s.io/gateway-api/conformance/utils/http" - "sigs.k8s.io/gateway-api/conformance/utils/roundtripper" - "sigs.k8s.io/gateway-api/conformance/utils/tlog" -) - -// Request defines the parameters for a single HTTP test request and its expected outcome. -type Request struct { - // Host is the hostname to use in the HTTP request. - Host string - // Path is the path to request. - Path string - // Method is the HTTP method to use. Defaults to "GET" if empty. - Method string - // Headers are the HTTP headers to include in the request. - Headers map[string]string - // Body is the request body. - Body string - - // ExpectedStatusCode is the HTTP status code expected in the response. - ExpectedStatusCode int - // Backend is the name of the backend service expected to handle the request. - // This is not checked for non-200 responses. - Backend string - // Namespace is the namespace of the backend service. - Namespace string -} - -// MakeRequestAndExpectSuccess is a convenience wrapper for requests that are -// expected to succeed with a 200 OK status. -func MakeRequestAndExpectSuccess( - t *testing.T, - r roundtripper.RoundTripper, - timeoutConfig gwconfig.TimeoutConfig, - gatewayAddress string, - req Request, -) { - t.Helper() - req.ExpectedStatusCode = http.StatusOK - MakeRequestAndExpectEventuallyConsistentResponse(t, r, timeoutConfig, gatewayAddress, req) -} - -// MakeRequestAndExpectEventuallyConsistentResponse makes a request using the parameters -// from the Request struct and waits for the response to consistently match the expectations. 
-func MakeRequestAndExpectEventuallyConsistentResponse( - t *testing.T, - r roundtripper.RoundTripper, - timeoutConfig gwconfig.TimeoutConfig, - gatewayAddress string, - req Request, -) { - t.Helper() - - expectedResponse := makeExpectedResponse(t, req) - waitForConvergeToExpected(t, r, timeoutConfig, gatewayAddress, req.Body, expectedResponse) -} - -// MakeRequestAndExpectResponseFromPod sends a request to the specified path -func MakeRequestAndExpectResponseFromPod(t *testing.T, r roundtripper.RoundTripper, timeoutConfig gwconfig.TimeoutConfig, gwAddr, path, podPrefix, nameSpace string) { - t.Helper() - expectedResponse := gwhttp.ExpectedResponse{ - Request: gwhttp.Request{ - Path: path, - }, - Backend: podPrefix, - Namespace: nameSpace, - } - - gwhttp.MakeRequestAndExpectEventuallyConsistentResponse(t, r, timeoutConfig, gwAddr, expectedResponse) -} - -func makeExpectedResponse(t *testing.T, req Request) gwhttp.ExpectedResponse { - t.Helper() - - method := http.MethodGet - if req.Method != "" { - method = req.Method - } - - expectedResponse := gwhttp.ExpectedResponse{ - Request: gwhttp.Request{ - Host: req.Host, - Path: req.Path, - Method: method, - Headers: req.Headers, - }, - Response: gwhttp.Response{ - StatusCode: req.ExpectedStatusCode, - }, - Backend: req.Backend, - Namespace: req.Namespace, - } - - // For successful responses (200 OK), we also verify that the backend - // received the request with the correct details (Host, Path, etc.). - // For other statuses (e.g., 404), this check is skipped. - if req.ExpectedStatusCode == http.StatusOK { - expectedResponse.ExpectedRequest = &gwhttp.ExpectedRequest{ - Request: gwhttp.Request{ - Host: req.Host, - Path: req.Path, - Headers: req.Headers, - Method: method, - }, - } - } - return expectedResponse -} - -// TODO: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/1031 -// replace the following method when sigs.k8s.io/gateway-api/conformance/utils/roundtripper is able to send request with body. -func waitForConvergeToExpected( - t *testing.T, - r roundtripper.RoundTripper, - timeoutConfig gwconfig.TimeoutConfig, - gatewayAddress string, - requestBody string, - expectedResponse gwhttp.ExpectedResponse, -) { - gwhttp.AwaitConvergence(t, timeoutConfig.RequiredConsecutiveSuccesses, timeoutConfig.MaxTimeToConsistency, func(elapsed time.Duration) bool { - req := gwhttp.MakeRequest(t, &expectedResponse, gatewayAddress, "HTTP", "http") - request := &RequestWithBody{Request: req} - if requestBody != "" { - request = &RequestWithBody{Request: req, Body: strings.NewReader(requestBody)} - } - cReq, cRes, err := MakeCallRoundTripper(t, r, request) - if err != nil { - tlog.Logf(t, "Request failed, not ready yet: %v (after %v)", err.Error(), elapsed) - return false - } - - if err := CompareRequestWithWildcardStatus(t, &request.Request, cReq, cRes, expectedResponse); err != nil { - tlog.Logf(t, "Response expectation failed for request: %+v not ready yet: %v (after %v)", request.Request, err, elapsed) - return false - } - - return true - }) - tlog.Logf(t, "Request passed") -} - -// CompareRequestWithWildcardStatus compares requests with wildcard status code support. -// It treats a single-digit expected code (e.g., 4) as a class wildcard (4xx), -// while standard 3-digit codes are matched exactly. 
-func CompareRequestWithWildcardStatus(t *testing.T, req *roundtripper.Request, cReq *roundtripper.CapturedRequest, cRes *roundtripper.CapturedResponse, expected gwhttp.ExpectedResponse) error { - if expected.Response.StatusCode < 1 || expected.Response.StatusCode >= 100 { - return gwhttp.CompareRequest(t, req, cReq, cRes, expected) - } - - expectedClass := expected.Response.StatusCode - actualClass := cRes.StatusCode / 100 - if expectedClass != actualClass { - return fmt.Errorf("expected status code class %dxx, but got %d", expectedClass, cRes.StatusCode) - } - - // StatusCode Class matches; update status code on a copy to allow the standard comparator to pass. - modifiedExpected := expected - modifiedExpected.Response.StatusCode = cRes.StatusCode - return gwhttp.CompareRequest(t, req, cReq, cRes, modifiedExpected) -} - -// TODO: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/1031 -// remove this when sigs.k8s.io/gateway-api/conformance/utils/roundtripper is able to send request with body. -// RequestWithBody extends roundtripper.Request to include a request body. -type RequestWithBody struct { - roundtripper.Request - Body io.Reader -} - -// TODO: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/1031 -// remove this when sigs.k8s.io/gateway-api/conformance/utils/roundtripper is able to send request with body. -// MakeCallRoundTripper executes an HTTP request using the provided RoundTripper and captures the request and response. -func MakeCallRoundTripper(t *testing.T, r roundtripper.RoundTripper, request *RequestWithBody) (*roundtripper.CapturedRequest, *roundtripper.CapturedResponse, error) { - client := &http.Client{} - - defaultRoundTripper, ok := r.(*roundtripper.DefaultRoundTripper) - if !ok { - t.Fatalf("Unsupported RoundTripper type: %T", r) - } - rt := defaultRoundTripper - if request.UnfollowRedirect { - client.CheckRedirect = func(_ *http.Request, _ []*http.Request) error { - return http.ErrUseLastResponse - } - } - - client.Transport = &http.Transport{ - DialContext: rt.CustomDialContext, - // We disable keep-alives so that we don't leak established TCP connections. - // Leaking TCP connections is bad because we could eventually hit the - // threshold of maximum number of open TCP connections to a specific - // destination. Keep-alives are not presently utilized so disabling this has - // no adverse affect. - // - // Ref. 
https://github.com/kubernetes-sigs/gateway-api/issues/2357 - DisableKeepAlives: true, - } - - method := "GET" - if request.Method != "" { - method = request.Method - } - ctx, cancel := context.WithTimeout(context.Background(), rt.TimeoutConfig.RequestTimeout) - defer cancel() - req, err := http.NewRequestWithContext(ctx, method, request.URL.String(), request.Body) - if err != nil { - return nil, nil, err - } - - if request.Host != "" { - req.Host = request.Host - } - - if request.Headers != nil { - for name, value := range request.Headers { - req.Header.Set(name, value[0]) - } - } - - if rt.Debug { - var dump []byte - dump, err = httputil.DumpRequestOut(req, true) - if err != nil { - return nil, nil, err - } - - tlog.Logf(request.T, "Sending Request:\n%s\n\n", formatDump(dump, "< ")) - } - - resp, err := client.Do(req) - if err != nil { - return nil, nil, err - } - defer resp.Body.Close() - - if rt.Debug { - var dump []byte - dump, err = httputil.DumpResponse(resp, true) - if err != nil { - return nil, nil, err - } - - tlog.Logf(request.T, "Received Response:\n%s\n\n", formatDump(dump, "< ")) - } - - cReq := &roundtripper.CapturedRequest{} - - body, err := io.ReadAll(resp.Body) - if err != nil { - return nil, nil, err - } - - // we cannot assume the response is JSON - if resp.Header.Get("Content-type") == "application/json" { - err = json.Unmarshal(body, cReq) - if err != nil { - return nil, nil, fmt.Errorf("unexpected error reading response: %w", err) - } - } else { - cReq.Method = method // assume it made the right request if the service being called isn't echoing - } - - cRes := &roundtripper.CapturedResponse{ - StatusCode: resp.StatusCode, - ContentLength: resp.ContentLength, - Protocol: resp.Proto, - Headers: resp.Header, - } - - if resp.TLS != nil { - cRes.PeerCertificates = resp.TLS.PeerCertificates - } - - if roundtripper.IsRedirect(resp.StatusCode) { - redirectURL, err := resp.Location() - if err != nil { - return nil, nil, err - } - cRes.RedirectRequest = &roundtripper.RedirectRequest{ - Scheme: redirectURL.Scheme, - Host: redirectURL.Hostname(), - Port: redirectURL.Port(), - Path: redirectURL.Path, - } - } - - return cReq, cRes, nil -} - -var startLineRegex = regexp.MustCompile(`(?m)^`) - -func formatDump(data []byte, prefix string) string { - data = startLineRegex.ReplaceAllLiteral(data, []byte(prefix)) - return string(data) -} diff --git a/crd-ref-docs.yaml b/crd-ref-docs.yaml index d00e6d09b..5fefbffd6 100644 --- a/crd-ref-docs.yaml +++ b/crd-ref-docs.yaml @@ -4,7 +4,7 @@ processor: ignoreTypes: - - "(InferencePool|InferenceObjective)List$" + - "(InferencePool|InferenceObjective|InferencePoolImport)List$" # RE2 regular expressions describing type fields that should be excluded from the generated documentation. 
ignoreFields:
- "TypeMeta$"
diff --git a/docs/endpoint-picker.svg b/docs/endpoint-picker.svg
index 3ec8eed4e..6f4b9cf81 100644
--- a/docs/endpoint-picker.svg
+++ b/docs/endpoint-picker.svg
@@ -1,3 +1,5 @@
[Flattened SVG diagram text; only the label changes are recoverable: the "InferenceModel API" box ("Defines - the model/adapter to serve - the serving objectives for the model") becomes an "InferenceObjective API" box ("Defines - serving objectives for matching requests"), "Metrics Scraping" becomes "Protocol", and the "Standard Gateway Elements / Inference Extension Elements / Inference Gateway" legend is removed.]
diff --git a/docs/proposals/004-endpoint-picker-protocol/README.md b/docs/proposals/004-endpoint-picker-protocol/README.md
index 03b96bb71..919bb6311 100644
--- a/docs/proposals/004-endpoint-picker-protocol/README.md
+++ b/docs/proposals/004-endpoint-picker-protocol/README.md
@@ -14,6 +14,15 @@ This doc defines the protocol between the EPP and the proxy (e.g, Envoy).
The EPP MUST implement the Envoy [external processing service](https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/filters/http/ext_proc/v3/ext_proc.proto) protocol.
+## Version History
+
+| Version | Date | Changes |
+|----------|------------|--------------------------------------------------|
+| v1.0.0 | 2025-07-29 | Added status metadata field for picked endpoints |
+| v0.4.0 | 2025-06-03 | Added support for multiple fallback endpoints |
+| v0.3.0 | 2025-03-14 | Added subsetting and fallback support |
+| v0.2.0 | 2025-02-22 | Initial protocol definition |
+
## Endpoint Subset
[REQUEST: Data Plane -> EPP]
diff --git a/docs/proposals/1374-multi-cluster-inference/README.md b/docs/proposals/1374-multi-cluster-inference/README.md
new file mode 100644
index 000000000..321b0d2a4
--- /dev/null
+++ b/docs/proposals/1374-multi-cluster-inference/README.md
@@ -0,0 +1,493 @@
+# Multi-Cluster InferencePools
+
+Author(s): @danehans, @bexxmodd, @robscott
+
+## Proposal Status
+
+ ***Draft***
+
+## Summary
+
+An Inference Gateway (IG) provides efficient routing to LLM workloads in Kubernetes by sending requests to an Endpoint Picker (EPP) associated with
+an [InferencePool](https://gateway-api-inference-extension.sigs.k8s.io/api-types/inferencepool/) and routing the request to a backend model server
+based on the EPP-provided endpoint. Although other multi-cluster inference approaches may exist, this proposal extends the current model to support
+multi-cluster routing so that capacity in one cluster can serve traffic originating in another cluster or outside the clusters entirely.
+
+### Why Multi-Cluster?
+
+GPU capacity is scarce and fragmented. Many users operate multiple clusters across regions and providers, and a single cluster rarely satisfies peak or
+sustained demand, so a well-defined mechanism is needed to share GPU capacity across clusters by:
+
+- Exporting an InferencePool from a source ("exporting") cluster.
+- Importing the exported InferencePool into one or more destination ("importing") clusters with enough detail for IGs to route requests to the associated
+  remote model server Pods.
+
+### Goals
+
+- Enable IGs to route to a common group of model server Pods, i.e. InferencePools, that exist in different clusters.
+- Align the UX with familiar [Multi-Cluster Services (MCS)](https://multicluster.sigs.k8s.io/concepts/multicluster-services-api/) concepts (export/import).
+- Keep the API simple and implementation-agnostic.
+
+### Non-Goals
+
+- Managing DNS or automatic naming.
+- Over-specifying implementation details to satisfy a single approach to Multi-Cluster InferencePools.
+
+## Design Proposal
+
+The Multi-Cluster InferencePools (MCIP) model will largely follow the Multi-Cluster Services (MCS) model, with a few key differences:
+
+- DNS and ClusterIP resolution, e.g. ClusterSetIP, will be omitted.
+- A separate export resource, e.g. ServiceExport, will be avoided by inlining the concept within InferencePool.
+
+An InferencePoolImport resource is introduced that is meant to be fully managed by a controller. This resource provides the information
+required for IGs to route LLM requests to model server endpoints of an InferencePool in remote clusters. How the IG routes the request to the remote
+cluster is implementation-specific.
+
+### Routing Modes
+
+An implementation must support at least one of the following routing modes:
+
+- Endpoint Mode: An IG of an importing cluster routes to endpoints selected by the EPP of the exported InferencePool. Pod and Service network connectivity
+  MUST exist between cluster members.
+- Parent Mode: An IG of an importing cluster routes to parents, e.g. Gateways, of the exported InferencePool. Parent connectivity MUST exist between cluster
+  members.
+
+### Sync Topology (Implementation-Specific)
+
+An implementation must support at least one of the following distribution topologies. The API does not change between them (same export annotation and InferencePoolImport).
+
+1. **Hub/Spoke**
+   - A hub controller has visibility into member clusters.
+   - It watches exported InferencePools and creates/updates the corresponding InferencePoolImport (same namespace/name) in each member cluster.
+   - Typical when a central control plane has K8s API server access for each member cluster.
+   - Consider [KEP-5339-style](https://github.com/kubernetes/enhancements/tree/master/keps/sig-multicluster/5339-clusterprofile-plugin-credentials) pluggable credential issuance to avoid hub-stored long-lived secrets.
+
+2. **Push/Pull**
+   - A cluster-local controller watches exported InferencePools and publishes export state to a central hub.
+   - A cluster-local controller watches the central hub and CRUDs the local InferencePoolImport.
+   - Typical when you want no hub-stored member credentials, looser coupling, and fleet-scale fan-out.
+
+### Workflow
+
+1. **Export an InferencePool:** An [Inference Platform Owner](https://gateway-api-inference-extension.sigs.k8s.io/concepts/roles-and-personas/)
+   exports an InferencePool by annotating it.
+2. **Distribution (topology-dependent, API-agnostic):**
+   - **Hub/Spoke:** A central hub controller watches exported InferencePools and mirrors a same-name/namespace InferencePoolImport into each member cluster, updating `status.controllers[]` to reflect the managing controller, exporting clusters, and so on.
+   - **Push/Pull:** A cluster-local controller watches exported InferencePools and publishes export records to a central hub.
+     In each member cluster, a controller watches the hub and CRUDs the local InferencePoolImport (same name/namespace) and maintains `status.controllers[]`.
+3. **Importing Controller (common):**
+   - Watches local InferencePoolImport and:
+     - Programs the IG data plane based on the supported routing mode.
+     - If this controller differs from the Exporting Controller, populate `status.controllers[]`.
+     - Manages `status.controllers[].parentRefs` with the Group, Kind, and Name of the local parent resource, e.g. Gateway, of the InferencePoolImport.
+4. **Data Path:**
+   The data path depends on the routing mode supported by the implementation.
+   - Endpoint Mode: Client → local IG → (make scheduling decision) → local/remote EPP → selected model server endpoint → response.
+   - Parent Mode: Client → local IG → (make scheduling decision) → local EPP/remote parent → remote EPP → selected model server endpoint → response.
+
+### InferencePoolImport Naming
+
+The exporting controller will create an InferencePoolImport resource using the exported InferencePool's namespace and name. A cluster name entry in
+`status.controllers[]` is added for each cluster that exports an InferencePool with the same ns/name.
+
+**Note:** EPP ns/name sameness is not required.
+
+### InferencePool Selection
+
+InferencePool selection is implementation-specific. The following are examples of how an IG may select one exported InferencePool over another:
+
+- **Metrics-based:** Scrape EPP-exposed metrics (e.g., ready pods) to bias InferencePool choice.
+- **Active-Passive:** Basic EPP readiness checks (gRPC health).
+
+**Note:** When an exported InferencePool is selected by an IG, standard EPP semantics are used to select endpoints of that pool.
+
+### API Changes
+
+#### InferencePool Annotation
+
+The `inference.networking.x-k8s.io/export` annotation key is proposed to indicate the desire to export an InferencePool to member clusters of a ClusterSet:
+
+```yaml
+inference.networking.x-k8s.io/export: ""
+```
+
+Supported Values:
+
+- `ClusterSet` – export to all members of the current [ClusterSet](https://multicluster.sigs.k8s.io/api-types/cluster-set/).
+
+**Note:** Additional annotations, e.g. region/domain scoping, filtering clusters within the ClusterSet, or routing-mode configuration, as well as a
+potential InferencePoolExport resource, may be considered in the future.
+
+#### InferencePool Status
+
+An implementation MUST set a parent status entry with a parentRef of kind `InferencePoolImport` and the ns/name of the exported InferencePool.
+This informs the user that the request to export the InferencePool has been recognized by the implementation, along with the implementation's
+unique `ControllerName`. `ControllerName` is a domain/path string that indicates the name of the controller that wrote the status entry.
+
+An `Exported` parent condition type is being added to surface the status of the exported InferencePool. An implementation MUST set
+this status condition to `True` when the annotated InferencePool has been exported to all member clusters of the ClusterSet and to `False`
+in all other cases. When the export annotation is removed from the InferencePool, an implementation MUST remove this condition type.
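+
+For illustration, a status stanza on the exported InferencePool might look like the following sketch (the controller
+name is hypothetical, and the exact field placement is subject to the final API):
+
+```yaml
+status:
+  parents:
+  - parentRef:
+      group: inference.networking.x-k8s.io
+      kind: InferencePoolImport
+      name: llm-pool                              # ns/name of the exported InferencePool
+    controllerName: example.com/mcip-controller   # implementation's unique ControllerName
+    conditions:
+    - type: Exported
+      status: "True"
+```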
+
+#### InferencePoolImport
+
+A cluster-local, controller-managed resource that represents an imported InferencePool. It primarily communicates a relationship between an exported
+InferencePool and the exporting cluster name. It is not user-authored; status carries the effective import. Inference Platform Owners can reference
+the InferencePoolImport, even if the local cluster does not have an InferencePool. In the context of Gateway API, this means that an HTTPRoute can be
+configured to reference an InferencePoolImport to route matching requests to remote InferencePool endpoints. This API will be used almost exclusively
+for tracking endpoints, but unlike MCS, there are two distinct sets of endpoints to track:
+
+1. Endpoint Picker Extensions (EPPs)
+2. InferencePool parents, e.g. Gateways
+
+Key ideas:
+
+- Map exported InferencePool to exporting controller and cluster.
+- Name/namespace sameness with the exported InferencePool (avoids extra indirection).
+- Conditions: Surface a controller-level status condition to indicate that the InferencePoolImport is ready to be used.
+- Conditions: Surface parent-level status conditions to indicate that the InferencePoolImport is referenced by a parent, e.g. Gateway.
+
+See the full Go type below for additional details.
+
+## Controller Responsibilities
+
+**Export Controller:**
+
+- Discover exported InferencePools.
+- For each ClusterSet member cluster, CRUD InferencePoolImport (mirrored namespace/name).
+- Populate the exported InferencePool's status with the managing controller's unique name and an `Exported` condition that
+  reflects the state of the export.
+- Populate an InferencePoolImport `status.controllers[]` entry with the managing controller name, cluster, and status conditions associated
+  with the exported InferencePool.
+
+**Import Controller:**
+
+- Watch InferencePoolImports.
+- Program the IG data plane to route matching requests based on the supported routing mode.
+- Manage InferencePoolImport `status.controllers[].parentRefs` with a cluster-local parentRef, e.g. Gateway, when the InferencePoolImport
+  is referenced by a managed HTTPRoute.
+
+## Examples
+
+### Exporting Cluster (Cluster A) Manifests
+
+In this example, Cluster A exports the InferencePool to all clusters in the ClusterSet. This will
+cause the exporting controller to create an InferencePoolImport resource in all member clusters.
+
+```yaml
+apiVersion: inference.networking.k8s.io/v1
+kind: InferencePool
+metadata:
+  name: llm-pool
+  namespace: example
+  annotations:
+    inference.networking.x-k8s.io/export: "ClusterSet" # Export the pool to all clusters in the ClusterSet
+spec:
+  endpointPickerRef:
+    name: epp
+    portNumber: 9002
+  selector:
+    matchLabels:
+      app: my-model
+  targetPorts:
+  - number: 8080
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: epp
+  namespace: example
+spec:
+  selector:
+    app: epp
+  ports:
+  - name: ext-proc
+    port: 9002
+    targetPort: 9002
+    appProtocol: http2
+  type: LoadBalancer # EPP exposed via LoadBalancer
+```
+
+### Importing Cluster (Cluster B) Manifests
+
+In this example, the Inference Platform Owner has configured an HTTPRoute to route to endpoints of the Cluster A InferencePool
+by referencing the InferencePoolImport as a `backendRef`. The parent IG(s) of the HTTPRoute are responsible for routing to the
+endpoints selected by the exported InferencePool's EPP.
+
+The InferencePoolImport is controller-managed; it is shown here only to illustrate the expected status shape.
+
+```yaml
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: InferencePoolImport
+metadata:
+  name: llm-pool # mirrors exporting InferencePool name
+  namespace: example # mirrors exporting InferencePool namespace
+status:
+  controllers:
+  - controllerName: example.com/mcip-controller
+    type: MultiCluster
+    exportingClusters:
+    - name: cluster-a
+  - controllerName: example.com/ig-controller
+    type: GatewayClass
+    parents:
+    - parentRef:
+        group: gateway.networking.k8s.io
+        kind: Gateway
+        name: inf-gw # Cluster-local parent, e.g. gateway
+      conditions:
+      - type: Accepted
+        status: "True"
+---
+# Route in the importing cluster that targets the imported pool
+apiVersion: gateway.networking.k8s.io/v1beta1
+kind: HTTPRoute
+metadata:
+  name: llm-route
+  namespace: example
+spec:
+  parentRefs:
+  - name: inf-gw
+  hostnames:
+  - my.model.com
+  rules:
+  - matches:
+    - path:
+        type: PathPrefix
+        value: /completions
+    backendRefs:
+    - group: inference.networking.x-k8s.io
+      kind: InferencePoolImport
+      name: llm-pool
+```
+
+An implementation MUST conform to Gateway API specifications, including when the HTTPRoute contains InferencePool and InferencePoolImport `backendRefs`,
+e.g. `weight`-based load balancing. In the following example, traffic MUST be split equally between the Cluster A and Cluster B InferencePool endpoints:
+
+```yaml
+  backendRefs:
+  - group: inference.networking.k8s.io
+    kind: InferencePool
+    name: llm-pool
+    weight: 50
+  - group: inference.networking.x-k8s.io
+    kind: InferencePoolImport
+    name: llm-pool
+    weight: 50
+```
+
+**Note:** The above example does not export the local "llm-pool" InferencePool. If this InferencePool were exported, it would be included in
+the example InferencePoolImport and the implementation would be responsible for balancing the traffic between the two pools.
+
+### Go Types
+
+The following Go types define the InferencePoolImport API being introduced by this proposal. Note that a separate Pull Request will be used
+to finalize and merge all Go types into the repository.
+
+```go
+package v1alpha1
+
+import (
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	v1 "sigs.k8s.io/gateway-api-inference-extension/api/v1"
+)
+
+// +kubebuilder:object:root=true
+// +kubebuilder:resource:scope=Namespaced,shortName=ipimp
+// +kubebuilder:subresource:status
+//
+// InferencePoolImport represents an imported InferencePool from another cluster.
+// This resource is controller-managed; users typically do not author it directly.
+type InferencePoolImport struct {
+	metav1.TypeMeta   `json:",inline"`
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	// Spec defines the desired state of the InferencePoolImport.
+	Spec InferencePoolImportSpec `json:"spec,omitempty"`
+
+	// Status defines the current state of the InferencePoolImport.
+	Status InferencePoolImportStatus `json:"status,omitempty"`
+}
+
+// InferencePoolImportSpec is unused but defined for potential future use.
+type InferencePoolImportSpec struct{}
+
+type InferencePoolImportStatus struct {
+	// Controllers is a list of controllers that are responsible for managing this InferencePoolImport.
+	//
+	// +kubebuilder:validation:Required
+	Controllers []ImportController `json:"controllers"`
+}
+
+// ImportController defines a controller that is responsible for managing this InferencePoolImport.
+type ImportController struct {
+	// Name is a domain/path string that indicates the name of the controller that manages this
+	// InferencePoolImport.
+	// This corresponds to the GatewayClass controllerName field when the controller manages
+	// parents of type "Gateway". Otherwise, the name is implementation-specific.
+	//
+	// Example: "example.net/import-controller".
+	//
+	// The format of this field is DOMAIN "/" PATH, where DOMAIN and PATH are valid Kubernetes
+	// names (https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names).
+	//
+	// A controller MUST populate this field when writing status and MUST ensure that status
+	// entries populated with its controller name are removed when they are no longer necessary.
+	//
+	// +required
+	Name ControllerName `json:"name"`
+
+	// ExportingClusters is a list of clusters that exported the InferencePool associated with this
+	// InferencePoolImport. Required when the controller is responsible for CRUD'ing the InferencePoolImport
+	// from the exported InferencePool(s).
+	//
+	// +optional
+	ExportingClusters []ExportingCluster `json:"exportingClusters"`
+
+	// Parents is a list of parent resources, typically Gateways, that are associated with the
+	// InferencePoolImport, and the status of the InferencePoolImport with respect to each parent.
+	//
+	// Required when the controller manages the InferencePoolImport as an HTTPRoute backendRef. The controller
+	// must add an entry for each parent it manages and remove the parent entry when the controller no longer
+	// considers the InferencePoolImport to be associated with that parent.
+	//
+	// +optional
+	// +listType=atomic
+	Parents []v1.ParentStatus `json:"parents,omitempty"`
+
+	// Conditions track the state of the InferencePoolImport.
+	//
+	// Known condition types are:
+	//
+	// * "Accepted"
+	//
+	// +optional
+	// +listType=map
+	// +listMapKey=type
+	// +kubebuilder:validation:MaxItems=8
+	Conditions []metav1.Condition `json:"conditions,omitempty"`
+}
+
+// ControllerName is the name of a controller that manages a resource. It must be a domain-prefixed path.
+//
+// Valid values include:
+//
+// * "example.com/bar"
+//
+// Invalid values include:
+//
+// * "example.com" - must include path
+// * "foo.example.com" - must include path
+//
+// +kubebuilder:validation:MinLength=1
+// +kubebuilder:validation:MaxLength=253
+// +kubebuilder:validation:Pattern=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$`
+type ControllerName string
+
+// ExportingCluster defines a cluster that exported the InferencePool associated with this InferencePoolImport.
+type ExportingCluster struct {
+	// Name of the exporting cluster (must be unique within the list).
+	//
+	// +kubebuilder:validation:Required
+	Name string `json:"name"`
+}
+
+// +kubebuilder:object:root=true
+type InferencePoolImportList struct {
+	metav1.TypeMeta `json:",inline"`
+	metav1.ListMeta `json:"metadata,omitempty"`
+	Items           []InferencePoolImport `json:"items"`
+}
+```
+
+### Failure Mode
+
+EPP failure modes continue to work as-is and are independent of MCIP.
+
+#### EPP Selection
+
+Since an IG decides which EPP to use for endpoint selection when multiple InferencePool/InferencePoolImport `backendRefs` exist,
+an implementation MAY use EPP metrics and/or health data to make a load-balancing decision, as in the sketch below.
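+
+The selection logic itself is not specified by this proposal. The following minimal Go sketch is illustrative only; the types, field names,
+and selection policy are hypothetical and simply combine the metrics-based and active-passive examples from the InferencePool Selection section:
+
+```go
+package main
+
+import "fmt"
+
+// exportedPool is a hypothetical, implementation-internal view of one exported
+// InferencePool, populated from EPP health checks and EPP-exposed metrics.
+type exportedPool struct {
+	cluster    string // exporting cluster name from status.controllers[].exportingClusters
+	eppHealthy bool   // result of a gRPC health check against the pool's EPP
+	readyPods  int    // e.g., scraped from an EPP "ready pods" metric
+}
+
+// pickPool skips pools whose EPP fails health checks (active-passive) and then
+// biases the choice toward the pool reporting the most ready endpoints (metrics-based).
+func pickPool(pools []exportedPool) (exportedPool, bool) {
+	var best exportedPool
+	found := false
+	for _, p := range pools {
+		if !p.eppHealthy {
+			continue
+		}
+		if !found || p.readyPods > best.readyPods {
+			best, found = p, true
+		}
+	}
+	return best, found
+}
+
+func main() {
+	pools := []exportedPool{
+		{cluster: "cluster-a", eppHealthy: true, readyPods: 3},
+		{cluster: "cluster-b", eppHealthy: false, readyPods: 8},
+	}
+	// cluster-b is skipped despite more ready pods because its EPP is unhealthy.
+	if p, ok := pickPool(pools); ok {
+		fmt.Printf("routing to the pool exported by %s (%d ready pods)\n", p.cluster, p.readyPods)
+	}
+}
+```
+
+Once a pool is chosen this way, standard EPP semantics still select the concrete endpoint within that pool.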
+
+## Alternatives
+
+### Option 1: Reuse MCS API for EPP
+
+Reuse MCS to export EPP Services. This approach provides simple infra, but it may be confusing to users (you "export EPPs", not pools) and
+requires a separate MCS parent export for parent-based inter-cluster routing.
+
+**Pros**:
+
+- Reuses existing MCS infrastructure.
+- Relatively simple to implement.
+
+**Cons**:
+
+- Referencing InferencePools in other clusters requires you to create an InferencePool locally.
+- In this model, you don't actually choose to export an InferencePool; you export the EPP or InferencePool parent(s) Service, which could lead to confusion.
+- InferencePool is meant to be a replacement for a Service, so it may seem counterintuitive for a user to create a Service to achieve multi-cluster inference.
+
+### Option 2: New MCS API
+
+One of the key pain points we're seeing here is that the current iteration of the MCS API requires a tight coupling between name/namespace and kind,
+with Service being the only kind of backend supported right now. This goes against the broader SIG-Network direction of introducing more focused kinds
+of backends (like InferencePool). To address this, we could create a resource that has an `exportRef` that allows for exporting different types of resources.
+
+While we're at it, we could combine the separate `export` and `import` resources that exist today, with `export` acting as the (optional) spec of this new
+resource and `import` acting as the `status` of the resource. Instead of `import` resources being automatically created, users would create them wherever
+they wanted to reference or export something as a MultiClusterService.
+
+Here's a very rough example:
+
+```yaml
+apiVersion: networking.k8s.io/v1
+kind: MultiClusterService
+metadata:
+  name: epp
+  namespace: example
+spec:
+  exportRef:
+    group: "" # core API group for Service
+    kind: Service
+    name: epp
+  scope: ClusterSet
+status:
+  conditions:
+  - type: Accepted
+    status: "True"
+    message: "MultiClusterService has been accepted"
+    lastTransitionTime: "2025-03-30T01:33:51Z"
+  targetCount: 1
+  ports:
+  - protocol: TCP
+    appProtocol: HTTP
+    port: 8080
+```
+
+### Open Questions
+
+#### EPP Discovery
+
+- Should EPP Deployment/Pod discovery be standardized (labels/port names) for health/metrics auto-discovery?
+
+#### Security
+
+- Provide a standard way to bootstrap mTLS between the importing IG and exported EPP/parents, e.g. use BackendTLSPolicy?
+
+#### Ownership and Lifecycle
+
+- Garbage collection when an export is withdrawn (delete the import?) and how to drain traffic safely. See [this comment](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/1374#discussion_r2357523781) for additional context.
+
+### Prior Art
+
+- [GEP-1748: Gateway API Interaction with Multi-Cluster Services](https://gateway-api.sigs.k8s.io/geps/gep-1748/)
+- [Envoy Gateway with Multi-Cluster Services](https://gateway.envoyproxy.io/latest/tasks/traffic/multicluster-service/)
+- [Multi-Cluster Service API](https://multicluster.sigs.k8s.io/concepts/multicluster-services-api/)
+
+### References
+
+- [Initial Multi-Cluster Inference Design Doc](https://docs.google.com/document/d/1QGvG9ToaJ72vlCBdJe--hmrmLtgOV_ptJi9D58QMD2w/edit?tab=t.0#heading=h.q6xiq2fzcaia)
+
+### Notes for reviewers
+
+- The InferencePoolImport CRD is intentionally status-only to keep the UX simple and controller-driven.
+- The InferencePool namespace sameness simplifies identity and lets HTTPRoute authors reference imports without new indirection.
diff --git a/docs/scheduler-flowchart.png b/docs/scheduler-flowchart.png deleted file mode 100644 index 4459ef1ca..000000000 Binary files a/docs/scheduler-flowchart.png and /dev/null differ diff --git a/go.mod b/go.mod index 28dbb0837..81a3dc818 100644 --- a/go.mod +++ b/go.mod @@ -5,34 +5,39 @@ go 1.24.0 require ( github.com/cespare/xxhash/v2 v2.3.0 github.com/elastic/crd-ref-docs v0.2.0 - github.com/envoyproxy/go-control-plane/envoy v1.32.4 + github.com/envoyproxy/go-control-plane/envoy v1.35.0 github.com/go-logr/logr v1.4.3 github.com/go-logr/zapr v1.3.0 github.com/google/go-cmp v0.7.0 github.com/google/uuid v1.6.0 github.com/hashicorp/golang-lru/v2 v2.0.7 - github.com/onsi/ginkgo/v2 v2.25.1 + github.com/onsi/ginkgo/v2 v2.26.0 github.com/onsi/gomega v1.38.2 - github.com/prometheus/client_golang v1.23.0 + github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_model v0.6.2 - github.com/prometheus/common v0.65.0 - github.com/prometheus/prometheus v0.305.0 - github.com/stretchr/testify v1.11.0 + github.com/prometheus/common v0.67.1 + github.com/prometheus/prometheus v0.307.1 + github.com/stretchr/testify v1.11.1 + go.opentelemetry.io/otel v1.38.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 + go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.38.0 + go.opentelemetry.io/otel/sdk v1.38.0 go.uber.org/multierr v1.11.0 go.uber.org/zap v1.27.0 - golang.org/x/sync v0.16.0 - google.golang.org/grpc v1.75.0 - google.golang.org/protobuf v1.36.8 - k8s.io/api v0.34.0 - k8s.io/apiextensions-apiserver v0.33.4 - k8s.io/apimachinery v0.34.0 - k8s.io/client-go v0.34.0 - k8s.io/code-generator v0.34.0 - k8s.io/component-base v0.34.0 - k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 - sigs.k8s.io/controller-runtime v0.21.0 - sigs.k8s.io/controller-tools v0.18.0 - sigs.k8s.io/gateway-api v1.3.0 + golang.org/x/sync v0.17.0 + google.golang.org/grpc v1.76.0 + google.golang.org/protobuf v1.36.10 + k8s.io/api v0.34.1 + k8s.io/apiextensions-apiserver v0.34.1 + k8s.io/apimachinery v0.34.1 + k8s.io/client-go v0.34.1 + k8s.io/code-generator v0.34.1 + k8s.io/component-base v0.34.1 + k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d + sigs.k8s.io/controller-runtime v0.22.3 + // Update the CONTROLLER_TOOLS_VERSION in Makefile when bumping controller-tools. 
+ sigs.k8s.io/controller-tools v0.19.0 + sigs.k8s.io/gateway-api v1.4.0 sigs.k8s.io/structured-merge-diff/v6 v6.3.0 sigs.k8s.io/yaml v1.6.0 ) @@ -46,38 +51,39 @@ require ( github.com/antlr4-go/antlr/v4 v4.13.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect - github.com/cenkalti/backoff/v5 v5.0.2 // indirect + github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/dennwc/varint v1.0.0 // indirect - github.com/emicklei/go-restful/v3 v3.12.2 // indirect + github.com/emicklei/go-restful/v3 v3.13.0 // indirect github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect - github.com/fsnotify/fsnotify v1.8.0 // indirect + github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-logr/stdr v1.2.2 // indirect - github.com/go-openapi/jsonpointer v0.21.0 // indirect + github.com/go-logr/zapr v1.3.0 // indirect + github.com/go-openapi/jsonpointer v0.21.2 // indirect github.com/go-openapi/jsonreference v0.21.0 // indirect - github.com/go-openapi/swag v0.23.0 // indirect + github.com/go-openapi/swag v0.23.1 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/gobuffalo/flect v1.0.3 // indirect github.com/goccy/go-yaml v1.18.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/google/btree v1.1.3 // indirect - github.com/google/cel-go v0.23.2 // indirect + github.com/google/cel-go v0.26.0 // indirect github.com/google/gnostic-models v0.7.0 // indirect - github.com/google/pprof v0.0.0-20250607225305-033d6d78b36a // indirect + github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8 // indirect github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect - github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 // indirect + github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect github.com/huandu/xstrings v1.3.3 // indirect github.com/imdario/mergo v0.3.16 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/kylelemons/godebug v1.1.0 // indirect - github.com/mailru/easyjson v0.7.7 // indirect + github.com/mailru/easyjson v0.9.0 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect github.com/moby/spdystream v0.5.0 // indirect @@ -85,50 +91,45 @@ require ( github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/procfs v0.16.1 // indirect + github.com/prometheus/procfs v0.17.0 // indirect github.com/spf13/cobra v1.9.1 // indirect - github.com/spf13/pflag v1.0.6 // indirect + github.com/spf13/pflag v1.0.7 // indirect github.com/stoewer/go-strcase v1.3.0 // indirect 
github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect - go.opentelemetry.io/otel v1.37.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.36.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.36.0 // indirect - go.opentelemetry.io/otel/metric v1.37.0 // indirect - go.opentelemetry.io/otel/sdk v1.37.0 // indirect - go.opentelemetry.io/otel/trace v1.37.0 // indirect - go.opentelemetry.io/proto/otlp v1.6.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect + go.opentelemetry.io/otel/metric v1.38.0 // indirect + go.opentelemetry.io/otel/trace v1.38.0 // indirect + go.opentelemetry.io/proto/otlp v1.7.1 // indirect go.uber.org/atomic v1.11.0 // indirect go.uber.org/automaxprocs v1.6.0 // indirect - go.yaml.in/yaml/v2 v2.4.2 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/crypto v0.41.0 // indirect - golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 // indirect - golang.org/x/mod v0.27.0 // indirect - golang.org/x/net v0.43.0 // indirect - golang.org/x/oauth2 v0.30.0 // indirect - golang.org/x/sys v0.35.0 // indirect - golang.org/x/term v0.34.0 // indirect - golang.org/x/text v0.28.0 // indirect - golang.org/x/time v0.12.0 // indirect - golang.org/x/tools v0.36.0 // indirect - golang.org/x/tools/go/expect v0.1.1-deprecated // indirect + golang.org/x/crypto v0.42.0 // indirect + golang.org/x/exp v0.0.0-20250808145144-a408d31f581a // indirect + golang.org/x/mod v0.28.0 // indirect + golang.org/x/net v0.44.0 // indirect + golang.org/x/oauth2 v0.31.0 // indirect + golang.org/x/sys v0.36.0 // indirect + golang.org/x/term v0.35.0 // indirect + golang.org/x/text v0.29.0 // indirect + golang.org/x/time v0.13.0 // indirect + golang.org/x/tools v0.37.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20250707201910-8d1bb00bc6a7 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7 // indirect - gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250929231259-57b25ae835d4 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250922171735-9219d122eba9 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiserver v0.33.4 // indirect - k8s.io/gengo/v2 v2.0.0-20250604051438-85fd79dbfd9f // indirect + k8s.io/apiserver v0.34.1 // indirect + k8s.io/gengo/v2 v2.0.0-20250820003526-c297c0c1eb9d // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect + k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect - sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect ) diff --git a/go.sum b/go.sum index fca5d7209..92589c4a3 100644 --- a/go.sum +++ b/go.sum @@ -1,19 +1,19 @@ cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY= cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= 
-cloud.google.com/go/auth v0.16.2 h1:QvBAGFPLrDeoiNjyfVunhQ10HKNYuOwZ5noee0M5df4= -cloud.google.com/go/auth v0.16.2/go.mod h1:sRBas2Y1fB1vZTdurouM0AzuYQBMZinrUYL8EufhtEA= +cloud.google.com/go/auth v0.16.5 h1:mFWNQ2FEVWAliEQWpAdH80omXFokmrnbDhUS9cBywsI= +cloud.google.com/go/auth v0.16.5/go.mod h1:utzRfHMP+Vv0mpOkTRQoWD2q3BatTOoWbA7gCc2dUhQ= cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc= cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c= -cloud.google.com/go/compute/metadata v0.7.0 h1:PBWF+iiAerVNe8UCHxdOt6eHLVc3ydFeOCw78U8ytSU= -cloud.google.com/go/compute/metadata v0.7.0/go.mod h1:j5MvL9PprKL39t166CoB1uVHfQMs4tFQZZcKwksXUjo= -github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0 h1:Gt0j3wceWMwPmiazCa8MzMA0MfhmPIz0Qp0FJ6qcM0U= -github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0/go.mod h1:Ot/6aikWnKWi4l9QB7qVSwa8iMphQNqkWALMoNT3rzM= -github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.10.1 h1:B+blDbyVIG3WaikNxPnhPiJ1MThR03b3vKGtER95TP4= -github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.10.1/go.mod h1:JdM5psgjfBf5fo2uWOZhflPWyDBZ/O/CNAH9CtsuZE4= -github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.1 h1:FPKJS1T+clwv+OLGt13a8UjqeRuh0O4SJ3lUriThc+4= -github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.1/go.mod h1:j2chePtV91HrC22tGoRX3sGY42uF13WzmmV80/OdVAA= -github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2 h1:oygO0locgZJe7PpYPXT5A29ZkwJaPqcva7BVeemZOZs= -github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2/go.mod h1:wP83P5OoQ5p6ip3ScPr0BAq0BvuPAvacpEuSzyouqAI= +cloud.google.com/go/compute/metadata v0.8.4 h1:oXMa1VMQBVCyewMIOm3WQsnVd9FbKBtm8reqWRaXnHQ= +cloud.google.com/go/compute/metadata v0.8.4/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10= +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.19.1 h1:5YTBM8QDVIBN3sxBil89WfdAAqDZbyJTgh688DSxX5w= +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.19.1/go.mod h1:YD5h/ldMsG0XiIw7PdyNhLxaM317eFh5yNLccNfGdyw= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.12.0 h1:wL5IEG5zb7BVv1Kv0Xm92orq+5hB5Nipn3B5tn4Rqfk= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.12.0/go.mod h1:J7MUC/wtRpfGVbQ5sIItY5/FuVWmvzlY21WAOfQnq/I= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI= +github.com/AzureAD/microsoft-authentication-library-for-go v1.5.0 h1:XkkQbfMyuH2jTSjQjSoihryI8GINRcs4xp8lNawg0FI= +github.com/AzureAD/microsoft-authentication-library-for-go v1.5.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk= github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI= github.com/Masterminds/goutils v1.1.1/go.mod h1:8cTjp+g8YejhMuvIA5y2vz3BpJxksy863GQaJW2MFNU= github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= @@ -28,40 +28,40 @@ github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8 github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= -github.com/aws/aws-sdk-go-v2 v1.36.3 h1:mJoei2CxPutQVxaATCzDUjcZEjVRdpsiiXi2o38yqWM= -github.com/aws/aws-sdk-go-v2 v1.36.3/go.mod h1:LLXuLpgzEbD766Z5ECcRmi8AzSwfZItDtmABVkRLGzg= 
-github.com/aws/aws-sdk-go-v2/config v1.29.14 h1:f+eEi/2cKCg9pqKBoAIwRGzVb70MRKqWX4dg1BDcSJM= -github.com/aws/aws-sdk-go-v2/config v1.29.14/go.mod h1:wVPHWcIFv3WO89w0rE10gzf17ZYy+UVS1Geq8Iei34g= -github.com/aws/aws-sdk-go-v2/credentials v1.17.67 h1:9KxtdcIA/5xPNQyZRgUSpYOE6j9Bc4+D7nZua0KGYOM= -github.com/aws/aws-sdk-go-v2/credentials v1.17.67/go.mod h1:p3C44m+cfnbv763s52gCqrjaqyPikj9Sg47kUVaNZQQ= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30 h1:x793wxmUWVDhshP8WW2mlnXuFrO4cOd3HLBroh1paFw= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30/go.mod h1:Jpne2tDnYiFascUEs2AWHJL9Yp7A5ZVy3TNyxaAjD6M= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34 h1:ZK5jHhnrioRkUNOc+hOgQKlUL5JeC3S6JgLxtQ+Rm0Q= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34/go.mod h1:p4VfIceZokChbA9FzMbRGz5OV+lekcVtHlPKEO0gSZY= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34 h1:SZwFm17ZUNNg5Np0ioo/gq8Mn6u9w19Mri8DnJ15Jf0= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34/go.mod h1:dFZsC0BLo346mvKQLWmoJxT+Sjp+qcVR1tRVHQGOH9Q= +github.com/aws/aws-sdk-go-v2 v1.39.2 h1:EJLg8IdbzgeD7xgvZ+I8M1e0fL0ptn/M47lianzth0I= +github.com/aws/aws-sdk-go-v2 v1.39.2/go.mod h1:sDioUELIUO9Znk23YVmIk86/9DOpkbyyVb1i/gUNFXY= +github.com/aws/aws-sdk-go-v2/config v1.31.12 h1:pYM1Qgy0dKZLHX2cXslNacbcEFMkDMl+Bcj5ROuS6p8= +github.com/aws/aws-sdk-go-v2/config v1.31.12/go.mod h1:/MM0dyD7KSDPR+39p9ZNVKaHDLb9qnfDurvVS2KAhN8= +github.com/aws/aws-sdk-go-v2/credentials v1.18.16 h1:4JHirI4zp958zC026Sm+V4pSDwW4pwLefKrc0bF2lwI= +github.com/aws/aws-sdk-go-v2/credentials v1.18.16/go.mod h1:qQMtGx9OSw7ty1yLclzLxXCRbrkjWAM7JnObZjmCB7I= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.9 h1:Mv4Bc0mWmv6oDuSWTKnk+wgeqPL5DRFu5bQL9BGPQ8Y= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.9/go.mod h1:IKlKfRppK2a1y0gy1yH6zD+yX5uplJ6UuPlgd48dJiQ= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.9 h1:se2vOWGD3dWQUtfn4wEjRQJb1HK1XsNIt825gskZ970= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.9/go.mod h1:hijCGH2VfbZQxqCDN7bwz/4dzxV+hkyhjawAtdPWKZA= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.9 h1:6RBnKZLkJM4hQ+kN6E7yWFveOTg8NLPHAkqrs4ZPlTU= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.9/go.mod h1:V9rQKRmK7AWuEsOMnHzKj8WyrIir1yUJbZxDuZLFvXI= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 h1:bIqFDwgGXXN1Kpp99pDOdKMTTb5d2KyU5X/BZxjOkRo= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3/go.mod h1:H5O/EsxDWyU+LP/V8i5sm8cxoZgc2fdNR9bxlOFrQTo= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 h1:eAh2A4b5IzM/lum78bZ590jy36+d/aFLgKF/4Vd1xPE= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3/go.mod h1:0yKJC/kb8sAnmlYa6Zs3QVYqaC8ug2AbnNChv5Ox3uA= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 h1:dM9/92u2F1JbDaGooxTq18wmmFzbJRfXfVfy96/1CXM= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15/go.mod h1:SwFBy2vjtA0vZbjjaFtfN045boopadnoVPhu4Fv66vY= -github.com/aws/aws-sdk-go-v2/service/sso v1.25.3 h1:1Gw+9ajCV1jogloEv1RRnvfRFia2cL6c9cuKV2Ps+G8= -github.com/aws/aws-sdk-go-v2/service/sso v1.25.3/go.mod h1:qs4a9T5EMLl/Cajiw2TcbNt2UNo/Hqlyp+GiuG4CFDI= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.1 h1:hXmVKytPfTy5axZ+fYbR5d0cFmC3JvwLm5kM83luako= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.1/go.mod h1:MlYRNmYu/fGPoxBQVvBYr9nyr948aY/WLUvwBMBJubs= -github.com/aws/aws-sdk-go-v2/service/sts v1.33.19 h1:1XuUZ8mYJw9B6lzAkXhqHlJd/XvaX32evhproijJEZY= -github.com/aws/aws-sdk-go-v2/service/sts 
v1.33.19/go.mod h1:cQnB8CUnxbMU82JvlqjKR2HBOm3fe9pWorWBza6MBJ4= -github.com/aws/smithy-go v1.22.2 h1:6D9hW43xKFrRx/tXXfAlIZc4JI+yQe6snnWcQyxSyLQ= -github.com/aws/smithy-go v1.22.2/go.mod h1:irrKGvNn1InZwb2d7fkIRNucdfwR8R+Ts3wxYa/cJHg= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1 h1:oegbebPEMA/1Jny7kvwejowCaHz1FWZAQ94WXFNCyTM= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1/go.mod h1:kemo5Myr9ac0U9JfSjMo9yHLtw+pECEHsFtJ9tqCEI8= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.9 h1:5r34CgVOD4WZudeEKZ9/iKpiT6cM1JyEROpXjOcdWv8= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.9/go.mod h1:dB12CEbNWPbzO2uC6QSWHteqOg4JfBVJOojbAoAUb5I= +github.com/aws/aws-sdk-go-v2/service/sso v1.29.6 h1:A1oRkiSQOWstGh61y4Wc/yQ04sqrQZr1Si/oAXj20/s= +github.com/aws/aws-sdk-go-v2/service/sso v1.29.6/go.mod h1:5PfYspyCU5Vw1wNPsxi15LZovOnULudOQuVxphSflQA= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.1 h1:5fm5RTONng73/QA73LhCNR7UT9RpFH3hR6HWL6bIgVY= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.1/go.mod h1:xBEjWD13h+6nq+z4AkqSfSvqRKFgDIQeaMguAJndOWo= +github.com/aws/aws-sdk-go-v2/service/sts v1.38.6 h1:p3jIvqYwUZgu/XYeI48bJxOhvm47hZb5HUQ0tn6Q9kA= +github.com/aws/aws-sdk-go-v2/service/sts v1.38.6/go.mod h1:WtKK+ppze5yKPkZ0XwqIVWD4beCwv056ZbPQNoeHqM8= +github.com/aws/smithy-go v1.23.0 h1:8n6I3gXzWJB2DxBDnfxgBaSX6oe0d/t10qGz7OKqMCE= +github.com/aws/smithy-go v1.23.0/go.mod h1:t1ufH5HMublsJYulve2RKmHDC15xu1f26kHCp/HgceI= github.com/bboreham/go-loser v0.0.0-20230920113527-fcc2c21820a3 h1:6df1vn4bBlDDo4tARvBm7l6KA9iVMnE3NWizDeWSrps= github.com/bboreham/go-loser v0.0.0-20230920113527-fcc2c21820a3/go.mod h1:CIWtjkly68+yqLPbvwwR/fjNJA/idrtULjZWh2v1ys0= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= -github.com/cenkalti/backoff/v5 v5.0.2 h1:rIfFVxEf1QsI7E1ZHfp/B4DF/6QBAUhmgkxc0H7Zss8= -github.com/cenkalti/backoff/v5 v5.0.2/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443 h1:aQ3y1lwWyqYPiWZThqv1aFbZMiM9vblcSArJRf2Irls= @@ -75,10 +75,10 @@ github.com/dennwc/varint v1.0.0 h1:kGNFFSSw8ToIy3obO/kKr8U9GZYUAxQEVuix4zfDWzE= github.com/dennwc/varint v1.0.0/go.mod h1:hnItb35rvZvJrbTALZtY/iQfDs48JKRG1RPpgziApxA= github.com/elastic/crd-ref-docs v0.2.0 h1:U17MyGX71j4qfKTvYxbR4qZGoA1hc2thy7kseGYmP+o= github.com/elastic/crd-ref-docs v0.2.0/go.mod h1:0bklkJhTG7nC6AVsdDi0wt5bGoqvzdZSzMMQkilZ6XM= -github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= -github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/envoyproxy/go-control-plane/envoy v1.32.4 h1:jb83lalDRZSpPWW2Z7Mck/8kXZ5CQAFYVjQcdVIr83A= -github.com/envoyproxy/go-control-plane/envoy v1.32.4/go.mod h1:Gzjc5k8JcJswLjAx1Zm+wSYE20UrLtt7JZMWiWQXQEw= +github.com/emicklei/go-restful/v3 v3.13.0 
h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= +github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/envoyproxy/go-control-plane/envoy v1.35.0 h1:ixjkELDE+ru6idPxcHLj8LBVc2bFP7iBytj353BoHUo= +github.com/envoyproxy/go-control-plane/envoy v1.35.0/go.mod h1:09qwbGVuSWWAyN5t/b3iyVfz5+z8QWGrzkoqm/8SbEs= github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8= github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU= github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= @@ -87,10 +87,16 @@ github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjT github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= -github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M= -github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= +github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= +github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M= +github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk= +github.com/gkampitakis/go-snaps v0.5.14 h1:3fAqdB6BCPKHDMHAKRwtPUwYexKtGrNuw8HX/T/4neo= +github.com/gkampitakis/go-snaps v0.5.14/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= @@ -98,12 +104,12 @@ github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= -github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= -github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= +github.com/go-openapi/jsonpointer v0.21.2 h1:AqQaNADVwq/VnkCmQg6ogE+M3FOsKTytwges0JdwVuA= +github.com/go-openapi/jsonpointer v0.21.2/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk= github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= -github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= -github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZU= +github.com/go-openapi/swag 
v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/gobuffalo/flect v1.0.3 h1:xeWBM2nui+qnVvNM4S3foBhCAL2XgPU+a7FdpelbTq4= @@ -112,16 +118,16 @@ github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/golang-jwt/jwt/v5 v5.2.2 h1:Rl4B7itRWVtYIHFrSNd7vhTiz9UpLdi6gZhZ3wEeDy8= -github.com/golang-jwt/jwt/v5 v5.2.2/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk= +github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo= +github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= -github.com/google/cel-go v0.23.2 h1:UdEe3CvQh3Nv+E/j9r1Y//WO0K0cSyD7/y0bzyLIMI4= -github.com/google/cel-go v0.23.2/go.mod h1:52Pb6QsDbC5kvgxvZhiL9QX1oZEkcUF/ZqaPx1J5Wwo= +github.com/google/cel-go v0.26.0 h1:DPGjXackMpJWH680oGY4lZhYjIameYmR+/6RBdDGmaI= +github.com/google/cel-go v0.26.0/go.mod h1:A9O8OU9rdvrK5MQyrqfIxo1a0u4g3sF8KB6PUIaryMM= github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= @@ -129,22 +135,22 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20250607225305-033d6d78b36a h1://KbezygeMJZCSHH+HgUZiTeSoiuFspbMg1ge+eFj18= -github.com/google/pprof v0.0.0-20250607225305-033d6d78b36a/go.mod h1:5hDyRhoBCxViHszMt12TnOpEI4VVi+U8Gm9iphldiMA= +github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8 h1:ZI8gCoCjGzPsum4L21jHdQs8shFBIQih1TM9Rd/c+EQ= +github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0= github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/enterprise-certificate-proxy v0.3.6 h1:GW/XbdyBFQ8Qe+YAmFU9uHLo7OnF5tL52HFAgMmyrf4= github.com/googleapis/enterprise-certificate-proxy v0.3.6/go.mod h1:MkHOF77EYAE7qfSuSS9PU6g4Nt4e11cnsDUowfwewLA= -github.com/googleapis/gax-go/v2 v2.14.2 
h1:eBLnkZ9635krYIPD+ag1USrOAI0Nr0QYF3+/3GqO0k0= -github.com/googleapis/gax-go/v2 v2.14.2/go.mod h1:ON64QhlJkhVtSqp4v1uaK92VyZ2gmvDQsweuyLV+8+w= +github.com/googleapis/gax-go/v2 v2.15.0 h1:SyjDc1mGgZU5LncH8gimWo9lW1DtIfPibOG81vgd/bo= +github.com/googleapis/gax-go/v2 v2.15.0/go.mod h1:zVVkkxAQHa1RQpg9z2AUCMnKhi0Qld9rcmyfL1OZhoc= github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= -github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc h1:GN2Lv3MGO7AS6PrRoT6yV5+wkrOpcszoIsO4+4ds248= -github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc/go.mod h1:+JKpmjMGhpgPL+rXZ5nsZieVzvarn86asRlBg4uNGnk= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= +github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853 h1:cLN4IBkmkYZNnk7EAJ0BHIethd+J6LqxFNw5mSiI2bM= +github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853/go.mod h1:+JKpmjMGhpgPL+rXZ5nsZieVzvarn86asRlBg4uNGnk= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/huandu/xstrings v1.3.3 h1:/Gcsuc1x8JVbJ9/rlye4xZnVAbEkGauT8lbebqcQws4= @@ -155,6 +161,8 @@ github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/joshdk/go-junit v1.0.0 h1:S86cUKIdwBHWwA6xCmFlf3RTLfVXYQfvanM5Uh+K6GE= +github.com/joshdk/go-junit v1.0.0/go.mod h1:TiiV0PqkaNfFXjEiyjWM3XXrhVyCa1K4Zfga6W52ung= github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= @@ -169,8 +177,12 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= -github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= -github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= +github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= +github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= +github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= +github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE= +github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= 
github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw= github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HKCj9FbZEVFJRxO9s= github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= @@ -196,8 +208,8 @@ github.com/oklog/ulid/v2 v2.1.1 h1:suPZ4ARWLOJLegGFiZZ1dFAkqzhMjL3J1TzI+5wHz8s= github.com/oklog/ulid/v2 v2.1.1/go.mod h1:rcEKHmBBKfef9DhnvX7y1HZBYxjXb0cP5ExxNsTT1QQ= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= -github.com/onsi/ginkgo/v2 v2.25.1 h1:Fwp6crTREKM+oA6Cz4MsO8RhKQzs2/gOIVOUscMAfZY= -github.com/onsi/ginkgo/v2 v2.25.1/go.mod h1:ppTWQ1dh9KM/F1XgpeRqelR+zHVwV81DGRSDnFxK7Sk= +github.com/onsi/ginkgo/v2 v2.26.0 h1:1J4Wut1IlYZNEAWIV3ALrT9NfiaGW2cDCJQSFQMs/gE= +github.com/onsi/ginkgo/v2 v2.26.0/go.mod h1:qhEywmzWTBUY88kfO0BRvX4py7scov9yR+Az2oavUzw= github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ= @@ -211,25 +223,28 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= -github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc= -github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE= -github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= -github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= -github.com/prometheus/prometheus v0.305.0 h1:UO/LsM32/E9yBDtvQj8tN+WwhbyWKR10lO35vmFLx0U= -github.com/prometheus/prometheus v0.305.0/go.mod h1:JG+jKIDUJ9Bn97anZiCjwCxRyAx+lpcEQ0QnZlUlbwY= -github.com/prometheus/sigv4 v0.2.0 h1:qDFKnHYFswJxdzGeRP63c4HlH3Vbn1Yf/Ao2zabtVXk= -github.com/prometheus/sigv4 v0.2.0/go.mod h1:D04rqmAaPPEUkjRQxGqjoxdyJuyCh6E0M18fZr0zBiE= +github.com/prometheus/common v0.67.1 h1:OTSON1P4DNxzTg4hmKCc37o4ZAZDv0cfXLkOt0oEowI= +github.com/prometheus/common v0.67.1/go.mod h1:RpmT9v35q2Y+lsieQsdOh5sXZ6ajUGC8NjZAmr8vb0Q= +github.com/prometheus/otlptranslator v1.0.0 h1:s0LJW/iN9dkIH+EnhiD3BlkkP5QVIUVEoIwkU+A6qos= +github.com/prometheus/otlptranslator v1.0.0/go.mod h1:vRYWnXvI6aWGpsdY/mOT/cbeVRBlPWtBNDb7kGR3uKM= +github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= +github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= +github.com/prometheus/prometheus v0.307.1 
h1:Hh3kRMFn+xpQGLe/bR6qpUfW4GXQO0spuYeY7f2JZs4= +github.com/prometheus/prometheus v0.307.1/go.mod h1:/7YQG/jOLg7ktxGritmdkZvezE1fa6aWDj0MGDIZvcY= +github.com/prometheus/sigv4 v0.2.1 h1:hl8D3+QEzU9rRmbKIRwMKRwaFGyLkbPdH5ZerglRHY0= +github.com/prometheus/sigv4 v0.2.1/go.mod h1:ySk6TahIlsR2sxADuHy4IBFhwEjRGGsfbbLGhFYFj6Q= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= -github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.7 h1:vN6T9TfwStFPFM5XzjsvmzZkLuaLX+HS+0SeFLRgU6M= +github.com/spf13/pflag v1.0.7/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs= github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -241,32 +256,42 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.11.0 h1:ib4sjIrwZKxE5u/Japgo/7SJV3PvgjGiRNAvTVGqQl8= -github.com/stretchr/testify v1.11.0/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q= -go.opentelemetry.io/otel 
v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= -go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.36.0 h1:dNzwXjZKpMpE2JhmO+9HsPl42NIXFIFSUSSs0fiqra0= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.36.0/go.mod h1:90PoxvaEB5n6AOdZvi+yWJQoE95U8Dhhw2bSyRqnTD0= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.36.0 h1:JgtbA0xkWHnTmYk7YusopJFX6uleBmAuZ8n05NEh8nQ= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.36.0/go.mod h1:179AK5aar5R3eS9FucPy6rggvU0g52cvKId8pv4+v0c= -go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= -go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= -go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= -go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= -go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= -go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= -go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= -go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= -go.opentelemetry.io/proto/otlp v1.6.0 h1:jQjP+AQyTf+Fe7OKj/MfkDrmK4MNVtw2NpXsf9fefDI= -go.opentelemetry.io/proto/otlp v1.6.0/go.mod h1:cicgGehlFuNdgZkcALOCh3VE6K/u2tAjzlRhDwmVpZc= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= +go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.38.0 h1:kJxSDN4SgWWTjG/hPp3O7LCGLcHXFlvS2/FFOrwL+SE= +go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.38.0/go.mod h1:mgIOzS7iZeKJdeB8/NYHrJ48fdGc71Llo5bJ1J4DWUE= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= +go.opentelemetry.io/proto/otlp v1.7.1/go.mod 
h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= @@ -277,53 +302,53 @@ go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= -go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= -go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= -golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= -golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 h1:yqrTHse8TCMW1M1ZCP+VAR/l0kKxwaAIqN/il7x4voA= -golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8/go.mod h1:tujkw807nyEEAamNbDrEGzRav+ilXA7PCRAd6xsmwiU= +golang.org/x/crypto v0.42.0 h1:chiH31gIWm57EkTXpwnqf8qeuMUi0yekh6mT2AvFlqI= +golang.org/x/crypto v0.42.0/go.mod h1:4+rDnOTJhQCx2q7/j6rAN5XDw8kPjeaXEUR2eL94ix8= +golang.org/x/exp v0.0.0-20250808145144-a408d31f581a h1:Y+7uR/b1Mw2iSXZ3G//1haIiSElDQZ8KWh0h+sZPG90= +golang.org/x/exp v0.0.0-20250808145144-a408d31f581a/go.mod h1:rT6SFzZ7oxADUDx58pcaKFTcZ+inxAa9fTrYx/uVYwg= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.27.0 h1:kb+q2PyFnEADO2IEF935ehFUXlWiNjJWtRNgBLSfbxQ= -golang.org/x/mod v0.27.0/go.mod h1:rWI627Fq0DEoudcK+MBkNkCe0EetEaDSwJJkCcjpazc= +golang.org/x/mod v0.28.0 h1:gQBtGhjxykdjY9YhZpSlZIsbnaE2+PgjfLWUQTnoZ1U= +golang.org/x/mod v0.28.0/go.mod h1:yfB/L0NOf/kmEbXjzCPOx1iK1fRutOydrCMsqRhEBxI= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= -golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= -golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= -golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= +golang.org/x/net v0.44.0 h1:evd8IRDyfNBMBTTY5XRF1vaZlD+EmWx6x8PkhR04H/I= +golang.org/x/net v0.44.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY= +golang.org/x/oauth2 v0.31.0 
h1:8Fq0yVZLh4j4YA47vHKFTa9Ew5XIrCP8LC6UeNZnLxo= +golang.org/x/oauth2 v0.31.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= -golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= +golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= -golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.34.0 h1:O/2T7POpk0ZZ7MAzMeWFSg6S5IpWd/RXDlM9hgM3DR4= -golang.org/x/term v0.34.0/go.mod h1:5jC53AEywhIVebHgPVeg0mj8OD3VO9OzclacVrqpaAw= +golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k= +golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.35.0 h1:bZBVKBudEyhRcajGcNc3jIfWPqV4y/Kt2XcoigOWtDQ= +golang.org/x/term v0.35.0/go.mod h1:TPGtkTLesOwf2DE8CgVYiZinHAOuy5AYUYT1lENIZnA= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= -golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= -golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= -golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= +golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk= +golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4= +golang.org/x/time v0.13.0 h1:eUlYslOIt32DgYD6utsuUeHs4d7AsEYLuIAdg7FlYgI= +golang.org/x/time v0.13.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.36.0 h1:kWS0uv/zsvHEle1LbV5LE8QujrxB3wfQyxHfhOk0Qkg= -golang.org/x/tools v0.36.0/go.mod h1:WBDiHKJK8YgLHlcQPYQzNCkUxUypCaa5ZegCVutKm+s= +golang.org/x/tools v0.37.0 h1:DVSRzp7FwePZW356yEAChSdNcQo6Nsp+fex1SUW09lE= +golang.org/x/tools v0.37.0/go.mod h1:MBN5QPQtLMHVdvsbtarmTNukZDdgwdwlO5qGacAzF0w= golang.org/x/tools/go/expect v0.1.1-deprecated h1:jpBZDwmgPhXsKZC6WhL20P4b/wmnpsEAGHaNy0n/rJM= golang.org/x/tools/go/expect v0.1.1-deprecated/go.mod h1:eihoPOH+FgIqa3FpoTwguz/bVUSGBlGQU67vpBeOrBY= golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated 
h1:1h2MnaIAIXISqTFKdENegdpAgUXz6NrPEsbIeWaBRvM= @@ -336,21 +361,21 @@ gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= -google.golang.org/api v0.238.0 h1:+EldkglWIg/pWjkq97sd+XxH7PxakNYoe/rkSTbnvOs= -google.golang.org/api v0.238.0/go.mod h1:cOVEm2TpdAGHL2z+UwyS+kmlGr3bVWQQ6sYEqkKje50= -google.golang.org/genproto/googleapis/api v0.0.0-20250707201910-8d1bb00bc6a7 h1:FiusG7LWj+4byqhbvmB+Q93B/mOxJLN2DTozDuZm4EU= -google.golang.org/genproto/googleapis/api v0.0.0-20250707201910-8d1bb00bc6a7/go.mod h1:kXqgZtrWaf6qS3jZOCnCH7WYfrvFjkC51bM8fz3RsCA= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7 h1:pFyd6EwwL2TqFf8emdthzeX+gZE1ElRq3iM8pui4KBY= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= -google.golang.org/grpc v1.75.0 h1:+TW+dqTd2Biwe6KKfhE5JpiYIBWq865PhKGSXiivqt4= -google.golang.org/grpc v1.75.0/go.mod h1:JtPAzKiq4v1xcAB2hydNlWI2RnF85XXcV0mhKXr2ecQ= -google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= -google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= +google.golang.org/api v0.250.0 h1:qvkwrf/raASj82UegU2RSDGWi/89WkLckn4LuO4lVXM= +google.golang.org/api v0.250.0/go.mod h1:Y9Uup8bDLJJtMzJyQnu+rLRJLA0wn+wTtc6vTlOvfXo= +google.golang.org/genproto/googleapis/api v0.0.0-20250929231259-57b25ae835d4 h1:8XJ4pajGwOlasW+L13MnEGA8W4115jJySQtVfS2/IBU= +google.golang.org/genproto/googleapis/api v0.0.0-20250929231259-57b25ae835d4/go.mod h1:NnuHhy+bxcg30o7FnVAZbXsPHUDQ9qKWAQKCD7VxFtk= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250922171735-9219d122eba9 h1:V1jCN2HBa8sySkR5vLcCSqJSTMv093Rw9EJefhQGP7M= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250922171735-9219d122eba9/go.mod h1:HSkG/KdJWusxU1F6CNrwNDjBMgisKxGnc5dAZfT0mjQ= +google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A= +google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c= +google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= +google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= -gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= @@ -360,38 +385,38 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod 
h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.34.0 h1:L+JtP2wDbEYPUeNGbeSa/5GwFtIA662EmT2YSLOkAVE= -k8s.io/api v0.34.0/go.mod h1:YzgkIzOOlhl9uwWCZNqpw6RJy9L2FK4dlJeayUoydug= -k8s.io/apiextensions-apiserver v0.33.4 h1:rtq5SeXiDbXmSwxsF0MLe2Mtv3SwprA6wp+5qh/CrOU= -k8s.io/apiextensions-apiserver v0.33.4/go.mod h1:mWXcZQkQV1GQyxeIjYApuqsn/081hhXPZwZ2URuJeSs= -k8s.io/apimachinery v0.34.0 h1:eR1WO5fo0HyoQZt1wdISpFDffnWOvFLOOeJ7MgIv4z0= -k8s.io/apimachinery v0.34.0/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= -k8s.io/apiserver v0.33.4 h1:6N0TEVA6kASUS3owYDIFJjUH6lgN8ogQmzZvaFFj1/Y= -k8s.io/apiserver v0.33.4/go.mod h1:8ODgXMnOoSPLMUg1aAzMFx+7wTJM+URil+INjbTZCok= -k8s.io/client-go v0.34.0 h1:YoWv5r7bsBfb0Hs2jh8SOvFbKzzxyNo0nSb0zC19KZo= -k8s.io/client-go v0.34.0/go.mod h1:ozgMnEKXkRjeMvBZdV1AijMHLTh3pbACPvK7zFR+QQY= -k8s.io/code-generator v0.34.0 h1:Ze2i1QsvUprIlX3oHiGv09BFQRLCz+StA8qKwwFzees= -k8s.io/code-generator v0.34.0/go.mod h1:Py2+4w2HXItL8CGhks8uI/wS3Y93wPKO/9mBQUYNua0= -k8s.io/component-base v0.34.0 h1:bS8Ua3zlJzapklsB1dZgjEJuJEeHjj8yTu1gxE2zQX8= -k8s.io/component-base v0.34.0/go.mod h1:RSCqUdvIjjrEm81epPcjQ/DS+49fADvGSCkIP3IC6vg= -k8s.io/gengo/v2 v2.0.0-20250604051438-85fd79dbfd9f h1:SLb+kxmzfA87x4E4brQzB33VBbT2+x7Zq9ROIHmGn9Q= -k8s.io/gengo/v2 v2.0.0-20250604051438-85fd79dbfd9f/go.mod h1:EJykeLsmFC60UQbYJezXkEsG2FLrt0GPNkU5iK5GWxU= +k8s.io/api v0.34.1 h1:jC+153630BMdlFukegoEL8E/yT7aLyQkIVuwhmwDgJM= +k8s.io/api v0.34.1/go.mod h1:SB80FxFtXn5/gwzCoN6QCtPD7Vbu5w2n1S0J5gFfTYk= +k8s.io/apiextensions-apiserver v0.34.1 h1:NNPBva8FNAPt1iSVwIE0FsdrVriRXMsaWFMqJbII2CI= +k8s.io/apiextensions-apiserver v0.34.1/go.mod h1:hP9Rld3zF5Ay2Of3BeEpLAToP+l4s5UlxiHfqRaRcMc= +k8s.io/apimachinery v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4= +k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= +k8s.io/apiserver v0.34.1 h1:U3JBGdgANK3dfFcyknWde1G6X1F4bg7PXuvlqt8lITA= +k8s.io/apiserver v0.34.1/go.mod h1:eOOc9nrVqlBI1AFCvVzsob0OxtPZUCPiUJL45JOTBG0= +k8s.io/client-go v0.34.1 h1:ZUPJKgXsnKwVwmKKdPfw4tB58+7/Ik3CrjOEhsiZ7mY= +k8s.io/client-go v0.34.1/go.mod h1:kA8v0FP+tk6sZA0yKLRG67LWjqufAoSHA2xVGKw9Of8= +k8s.io/code-generator v0.34.1 h1:WpphT26E+j7tEgIUfFr5WfbJrktCGzB3JoJH9149xYc= +k8s.io/code-generator v0.34.1/go.mod h1:DeWjekbDnJWRwpw3s0Jat87c+e0TgkxoR4ar608yqvg= +k8s.io/component-base v0.34.1 h1:v7xFgG+ONhytZNFpIz5/kecwD+sUhVE6HU7qQUiRM4A= +k8s.io/component-base v0.34.1/go.mod h1:mknCpLlTSKHzAQJJnnHVKqjxR7gBeHRv0rPXA7gdtQ0= +k8s.io/gengo/v2 v2.0.0-20250820003526-c297c0c1eb9d h1:qUrYOinhdAUL0xxhA4gPqogPBaS9nIq2l2kTb6pmeB0= +k8s.io/gengo/v2 v2.0.0-20250820003526-c297c0c1eb9d/go.mod h1:EJykeLsmFC60UQbYJezXkEsG2FLrt0GPNkU5iK5GWxU= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOPolHyvO06MXG5TUIj2mNAA= -k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 h1:liMHz39T5dJO1aOKHLvwaCjDbf07wVh6yaUlTpunnkE= 
+k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= +k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d h1:wAhiDyZ4Tdtt7e46e9M5ZSAJ/MnPGPs+Ki1gHw4w1R0= +k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= -sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8= -sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM= -sigs.k8s.io/controller-tools v0.18.0 h1:rGxGZCZTV2wJreeRgqVoWab/mfcumTMmSwKzoM9xrsE= -sigs.k8s.io/controller-tools v0.18.0/go.mod h1:gLKoiGBriyNh+x1rWtUQnakUYEujErjXs9pf+x/8n1U= -sigs.k8s.io/gateway-api v1.3.0 h1:q6okN+/UKDATola4JY7zXzx40WO4VISk7i9DIfOvr9M= -sigs.k8s.io/gateway-api v1.3.0/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/controller-runtime v0.22.3 h1:I7mfqz/a/WdmDCEnXmSPm8/b/yRTy6JsKKENTijTq8Y= +sigs.k8s.io/controller-runtime v0.22.3/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8= +sigs.k8s.io/controller-tools v0.19.0 h1:OU7jrPPiZusryu6YK0jYSjPqg8Vhf8cAzluP9XGI5uk= +sigs.k8s.io/controller-tools v0.19.0/go.mod h1:y5HY/iNDFkmFla2CfQoVb2AQXMsBk4ad84iR1PLANB0= +sigs.k8s.io/gateway-api v1.4.0 h1:ZwlNM6zOHq0h3WUX2gfByPs2yAEsy/EenYJB78jpQfQ= +sigs.k8s.io/gateway-api v1.4.0/go.mod h1:AR5RSqciWP98OPckEjOjh2XJhAe2Na4LHyXD2FUY7Qk= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= diff --git a/hack/release-quickstart.sh b/hack/release-quickstart.sh index 22c705184..d9581ef3e 100755 --- a/hack/release-quickstart.sh +++ b/hack/release-quickstart.sh @@ -18,13 +18,14 @@ set -euo pipefail # ----------------------------------------------------------------------------- # Environment variables (defaults) # ----------------------------------------------------------------------------- -# MAJOR and MINOR are required (defaults provided here if not already set) +# MAJOR, MINOR, and PATCH are required (defaults provided here if not already set) MAJOR="${MAJOR:-0}" MINOR="${MINOR:-1}" +PATCH="${PATCH:-0}" # If RC is defined (non-empty) then include the rc suffix; otherwise omit it. 
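+# Worked example (illustrative values, not project defaults): with MAJOR=1,
+# MINOR=2 and PATCH=3, the branch below yields RELEASE_TAG=v1.2.3 when RC is
+# unset, and v1.2.3-rc.1 when RC=1 is also exported:
+#   $ MAJOR=1 MINOR=2 PATCH=3 RC=1; echo "v${MAJOR}.${MINOR}.${PATCH}-rc.${RC}"
+#   v1.2.3-rc.1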
if [[ -z "${RC-}" ]]; then - RELEASE_TAG="v${MAJOR}.${MINOR}.0" + RELEASE_TAG="v${MAJOR}.${MINOR}.${PATCH}" else RELEASE_TAG="v${MAJOR}.${MINOR}.0-rc.${RC}" fi @@ -74,25 +75,21 @@ sed -i.bak "s|kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-in # ----------------------------------------------------------------------------- # Update image references # ----------------------------------------------------------------------------- -EPP="config/manifests/inferencepool-resources.yaml" #TODO: Put all helm values files into an array to loop over EPP_HELM="config/charts/inferencepool/values.yaml" BBR_HELM="config/charts/body-based-routing/values.yaml" CONFORMANCE_MANIFESTS="conformance/resources/base.yaml" -echo "Updating ${EPP}, ${EPP_HELM}, ${BBR_HELM}, and ${CONFORMANCE_MANIFESTS} ..." +echo "Updating ${EPP_HELM}, ${BBR_HELM}, and ${CONFORMANCE_MANIFESTS} ..." # Update the container tag. -sed -i.bak -E "s|(us-central1-docker\.pkg\.dev/k8s-staging-images/gateway-api-inference-extension/epp:)[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$EPP" sed -i.bak -E "s|(tag: )[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$EPP_HELM" sed -i.bak -E "s|(tag: )[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$BBR_HELM" sed -i.bak -E "s|(us-central1-docker\.pkg\.dev/k8s-staging-images/gateway-api-inference-extension/epp:)[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$CONFORMANCE_MANIFESTS" # Update the container image pull policy. -sed -i.bak '/us-central1-docker.pkg.dev\/k8s-staging-images\/gateway-api-inference-extension\/epp/{n;s/Always/IfNotPresent/;}' "$EPP" sed -i.bak '/us-central1-docker.pkg.dev\/k8s-staging-images\/gateway-api-inference-extension\/epp/{n;s/Always/IfNotPresent/;}' "$CONFORMANCE_MANIFESTS" # Update the container registry. -sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$EPP" sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$EPP_HELM" sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$BBR_HELM" sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$CONFORMANCE_MANIFESTS" @@ -136,11 +133,15 @@ sed -i.bak '/us-central1-docker.pkg.dev\/k8s-staging-images\/gateway-api-inferen # Update the container registry for lora-syncer in vLLM CPU and GPU deployment manifests. sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$VLLM_GPU_DEPLOY" "$VLLM_CPU_DEPLOY" +# Update IGW_CHART_VERSION in quickstart guide to match the current release tag +GUIDES_INDEX="site-src/guides/index.md" +sed -i.bak -E "s/export IGW_CHART_VERSION=v[0-9]+\.[0-9]+\.[0-9]+(-rc\.[0-9]+)?/export IGW_CHART_VERSION=${RELEASE_TAG}/g" "$GUIDES_INDEX" + # ----------------------------------------------------------------------------- # Stage the changes # ----------------------------------------------------------------------------- -echo "Staging $VERSION_FILE $UPDATED_CRD $README $EPP $EPP_HELM $BBR_HELM $CONFORMANCE_MANIFESTS $VLLM_GPU_DEPLOY $VLLM_CPU_DEPLOY $VLLM_SIM_DEPLOY files..." -git add $VERSION_FILE $UPDATED_CRD $README $EPP $EPP_HELM $BBR_HELM $CONFORMANCE_MANIFESTS $VLLM_GPU_DEPLOY $VLLM_CPU_DEPLOY $VLLM_SIM_DEPLOY +echo "Staging $VERSION_FILE $UPDATED_CRD $README $EPP_HELM $BBR_HELM $CONFORMANCE_MANIFESTS $VLLM_GPU_DEPLOY $VLLM_CPU_DEPLOY $VLLM_SIM_DEPLOY $GUIDES_INDEX files..." 
+git add $VERSION_FILE $UPDATED_CRD $README $EPP_HELM $BBR_HELM $CONFORMANCE_MANIFESTS $VLLM_GPU_DEPLOY $VLLM_CPU_DEPLOY $VLLM_SIM_DEPLOY $GUIDES_INDEX
 # -----------------------------------------------------------------------------
 # Cleanup backup files and finish
diff --git a/hack/verify-helm.sh b/hack/verify-helm.sh
new file mode 100755
index 000000000..0388b6e24
--- /dev/null
+++ b/hack/verify-helm.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SCRIPT_ROOT=$(dirname "${BASH_SOURCE[0]}")/..
+
+declare -A test_cases_inference_pool
+
+# InferencePool Helm Chart test cases
+test_cases_inference_pool["basic"]="--set inferencePool.modelServers.matchLabels.app=llm-instance-gateway"
+test_cases_inference_pool["gke-provider"]="--set provider.name=gke --set inferencePool.modelServers.matchLabels.app=llm-instance-gateway"
+test_cases_inference_pool["multiple-replicas"]="--set inferencePool.replicas=3 --set inferencePool.modelServers.matchLabels.app=llm-instance-gateway"
+
+# Run the install command in case this script is invoked outside of make
+# (such as from the verify-all script)
+make helm-install
+
+# Run test cases
+echo "Running helm template command for inferencePool chart..."
+# Loop through the keys of the associative array
+for key in "${!test_cases_inference_pool[@]}"; do
+  echo "Running test: $key"
+  if ! ${SCRIPT_ROOT}/bin/helm template ${SCRIPT_ROOT}/config/charts/inferencepool ${test_cases_inference_pool[$key]} --output-dir="${SCRIPT_ROOT}/bin"; then
+    echo "Helm template command failed for test: $key"
+    exit 1
+  fi
+done
diff --git a/hack/verify-manifests.sh b/hack/verify-manifests.sh
index 70d819bc8..d74c27701 100755
--- a/hack/verify-manifests.sh
+++ b/hack/verify-manifests.sh
@@ -38,12 +38,15 @@ main() {
     cp ${SCRIPT_ROOT}/config/crd/bases/* "${TEMP_DIR}/"
     # Download external CRDs for validation
+    fetch_crds "https://raw.githubusercontent.com/kubernetes-sigs/gateway-api/refs/tags/${GATEWAY_API_VERSION}/config/crd/standard/gateway.networking.k8s.io_gatewayclasses.yaml"
     fetch_crds "https://raw.githubusercontent.com/kubernetes-sigs/gateway-api/refs/tags/${GATEWAY_API_VERSION}/config/crd/standard/gateway.networking.k8s.io_gateways.yaml"
     fetch_crds "https://raw.githubusercontent.com/kubernetes-sigs/gateway-api/refs/tags/${GATEWAY_API_VERSION}/config/crd/standard/gateway.networking.k8s.io_httproutes.yaml"
     fetch_crds "https://raw.githubusercontent.com/GoogleCloudPlatform/gke-gateway-api/refs/tags/${GKE_GATEWAY_API_VERSION}/config/crd/networking.gke.io_gcpbackendpolicies.yaml"
     fetch_crds "https://raw.githubusercontent.com/GoogleCloudPlatform/gke-gateway-api/refs/tags/${GKE_GATEWAY_API_VERSION}/config/crd/networking.gke.io_healthcheckpolicies.yaml"
     fetch_crds "https://raw.githubusercontent.com/istio/istio/refs/tags/${ISTIO_VERSION}/manifests/charts/base/files/crd-all.gen.yaml"
+    # Run the install command in case this script is invoked outside of make
+    # (such as from the verify-all script)
     make kubectl-validate
     ${SCRIPT_ROOT}/bin/kubectl-validate "${TEMP_DIR}"
diff --git a/internal/runnable/grpc.go b/internal/runnable/grpc.go
index 82b7b85e2..5db5f9ed5 100644
--- a/internal/runnable/grpc.go
+++ b/internal/runnable/grpc.go
@@ -37,8 +37,7 @@ func GRPCServer(name string, srv *grpc.Server, port int) manager.Runnable {
 	// Start listening.
 	lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port))
 	if err != nil {
-		log.Error(err, "gRPC server failed to listen")
-		return err
+		return fmt.Errorf("gRPC server failed to listen: %w", err)
 	}
 	log.Info("gRPC server listening", "port", port)
@@ -59,8 +58,7 @@ func GRPCServer(name string, srv *grpc.Server, port int) manager.Runnable {
 	// Keep serving until terminated.
 	if err := srv.Serve(lis); err != nil && err != grpc.ErrServerStopped {
-		log.Error(err, "gRPC server failed")
-		return err
+		return fmt.Errorf("gRPC server failed: %w", err)
 	}
 	log.Info("gRPC server terminated")
 	return nil
diff --git a/latencypredictor-v1/build-deploy.sh b/latencypredictor-v1/build-deploy.sh
index 94a3f98f7..1531dbb1a 100755
--- a/latencypredictor-v1/build-deploy.sh
+++ b/latencypredictor-v1/build-deploy.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Build and deploy script for training, prediction, and test servers
+# Build and deploy script for the training and prediction servers
 set -e
@@ -7,9 +7,8 @@ set -e
 PROJECT_ID="kaushikmitra-gke-dev"
 REGION="asia-southeast1-c"
 REPOSITORY="kaushikmitra-docker-repo"
-TRAINING_IMAGE="latencypredictor-v3-training-server"
-PREDICTION_IMAGE="latencypredictor-v3-prediction-server"
-TEST_IMAGE="latencypredictor-v3-test"
+TRAINING_IMAGE="latencypredictor-v1-training-server"
+PREDICTION_IMAGE="latencypredictor-v1-prediction-server"
 TAG="latest"
 # Colors for output
@@ -42,18 +41,7 @@ check_files() {
         fi
     done
-    # Check for test-specific files
-    local test_files=("Dockerfile-test")
-    for file in "${test_files[@]}"; do
-        if [[ !
-f "$file" ]]; then - echo_warning "Test file $file not found - test image will not be built" - TEST_BUILD_ENABLED=false - return - fi - done - - TEST_BUILD_ENABLED=true - echo_status "All required files found (including test files)." + echo_status "All required files found." } # Build Docker images @@ -62,7 +50,7 @@ build_images() { # Build training server image echo_status "Building training server image..." - docker build -f Dockerfile-training -t ${TRAINING_IMAGE}:${TAG} . + docker build -f Dockerfile-training -t ${TRAINING_IMAGE}:${TAG} . # Tag for training server docker tag ${TRAINING_IMAGE}:${TAG} \ @@ -76,19 +64,7 @@ build_images() { docker tag ${PREDICTION_IMAGE}:${TAG} \ us-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${PREDICTION_IMAGE}:${TAG} - # Build test image if enabled - if [[ "$TEST_BUILD_ENABLED" == "true" ]]; then - echo_status "Building test image..." - docker build -f Dockerfile-test -t ${TEST_IMAGE}:${TAG} . - - # Tag for test image - docker tag ${TEST_IMAGE}:${TAG} \ - us-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${TEST_IMAGE}:${TAG} - - echo_status "All images (including test) built successfully." - else - echo_status "Images built successfully (test image skipped)." - fi + echo_status "Images built successfully." } # Push images to Artifact Registry @@ -106,14 +82,7 @@ push_images() { echo_status "Pushing prediction server image..." docker push us-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${PREDICTION_IMAGE}:${TAG} - # Push test image if enabled - if [[ "$TEST_BUILD_ENABLED" == "true" ]]; then - echo_status "Pushing test image..." - docker push us-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${TEST_IMAGE}:${TAG} - echo_status "All images (including test) pushed successfully." - else - echo_status "Images pushed successfully (test image skipped)." - fi + echo_status "Images pushed successfully." } # Deploy to GKE @@ -133,112 +102,6 @@ deploy_to_gke() { echo_status "Deployment completed successfully." } -# Deploy test job -deploy_test() { - echo_status "Deploying test job..." - - if [[ "$TEST_BUILD_ENABLED" != "true" ]]; then - echo_warning "Test image not available. Skipping test deployment." - return - fi - - # Check if test manifest exists - if [[ ! -f "test-job.yaml" ]]; then - echo_warning "test-job.yaml not found. Creating a basic test job..." - create_test_manifest - fi - - # Delete existing test job if it exists - kubectl delete job latency-predictor-test --ignore-not-found=true - - # Apply test job - kubectl apply -f test-job.yaml - - echo_status "Test job deployed. 
Monitor with: kubectl logs -f job/latency-predictor-test" -} - -# Create a basic test manifest -create_test_manifest() { - cat > test-job.yaml << EOF -apiVersion: batch/v1 -kind: Job -metadata: - name: latency-predictor-test - namespace: default - labels: - app: latency-predictor-test - component: test -spec: - template: - metadata: - labels: - app: latency-predictor-test - component: test - spec: - nodeSelector: - cloud.google.com/gke-nodepool: "pool-2" - restartPolicy: Never - containers: - - name: test-runner - image: us-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${TEST_IMAGE}:${TAG} - imagePullPolicy: Always - command: ["pytest"] - args: ["-v", "-s", "test_dual_server_client.py"] - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - env: - - name: TRAINING_SERVER_URL - value: "http://training-service:8000" - - name: PREDICTION_SERVER_URL - value: "http://prediction-service:80" - - name: TEST_TIMEOUT - value: "300" - volumeMounts: - - name: test-results - mountPath: /test-results - volumes: - - name: test-results - emptyDir: {} - backoffLimit: 3 -EOF - echo_status "Created basic test-job.yaml manifest." -} - -# Run tests -run_tests() { - echo_status "Running tests..." - - if [[ "$TEST_BUILD_ENABLED" != "true" ]]; then - echo_warning "Test image not available. Running basic connectivity tests instead..." - test_deployment - return - fi - - # Deploy and run test job - deploy_test - - # Wait for job completion and show logs - echo_status "Waiting for test job to complete..." - kubectl wait --for=condition=complete job/latency-predictor-test --timeout=600s || { - echo_error "Test job did not complete successfully" - kubectl describe job latency-predictor-test - kubectl logs job/latency-predictor-test - return 1 - } - - echo_status "Test job completed. Showing logs:" - kubectl logs job/latency-predictor-test - - # Clean up test job - echo_status "Cleaning up test job..." - kubectl delete job latency-predictor-test -} - # Get service information get_service_info() { echo_status "Getting service information..." @@ -268,7 +131,7 @@ get_service_info() { kubectl get services } -# Test the deployment (basic connectivity tests) +# Test the deployment test_deployment() { echo_status "Testing deployment..." @@ -302,18 +165,6 @@ test_deployment() { fi } -# List built images -list_images() { - echo_status "Listing built images..." - - echo_status "Local images:" - docker images | grep -E "${TRAINING_IMAGE}|${PREDICTION_IMAGE}|${TEST_IMAGE}" || echo "No local images found" - - echo_status "Remote images in Artifact Registry:" - gcloud artifacts docker images list us-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY} \ - --include-tags --filter="package~(${TRAINING_IMAGE}|${PREDICTION_IMAGE}|${TEST_IMAGE})" || echo "No remote images found" -} - # Cleanup function cleanup() { echo_status "Cleaning up..." 
@@ -333,27 +184,15 @@ main() { build_images ;; "push") - check_files push_images ;; "deploy") deploy_to_gke ;; - "test-deploy") - check_files - deploy_test - ;; - "test") - check_files - run_tests - ;; "info") get_service_info ;; - "images") - list_images - ;; - "basic-test") + "test") test_deployment ;; "all") @@ -365,30 +204,17 @@ main() { test_deployment cleanup ;; - "full") - check_files - build_images - push_images - deploy_to_gke - get_service_info - run_tests - cleanup - ;; *) - echo "Usage: $0 {check|build|push|deploy|test-deploy|test|info|images|basic-test|all|full}" + echo "Usage: $0 {check|build|push|deploy|info|test|all}" echo "" echo "Commands:" - echo " check - Check if required files exist" - echo " build - Build Docker images (including test if Dockerfile-test exists)" - echo " push - Push images to Artifact Registry" - echo " deploy - Deploy to GKE" - echo " test-deploy- Deploy test job only" - echo " test - Run comprehensive tests using test image" - echo " info - Get service information" - echo " images - List built images (local and remote)" - echo " basic-test - Run basic connectivity tests" - echo " all - Run complete build and deployment process (no tests)" - echo " full - Run complete process including comprehensive tests" + echo " check - Check if required files exist" + echo " build - Build Docker images" + echo " push - Push images to Artifact Registry" + echo " deploy - Deploy to GKE" + echo " info - Get service information" + echo " test - Test the deployment" + echo " all - Run complete build and deployment process" exit 1 ;; esac diff --git a/latencypredictor-v1/test_latency_predictor_client.py b/latencypredictor-v1/test_latency_predictor_client.py new file mode 100644 index 000000000..402f14fb7 --- /dev/null +++ b/latencypredictor-v1/test_latency_predictor_client.py @@ -0,0 +1,1244 @@ +import os +import time +import asyncio +import aiohttp +import threading +from concurrent.futures import ThreadPoolExecutor, as_completed +from collections import defaultdict +import random + +import pytest +import requests + +import joblib +import numpy as np +import tempfile +import xgboost + +# Base URL of your running FastAPI server +BASE_URL = os.getenv("TRAINING_SERVER_URL", "http://34.143.221.122:80") + +# Helper to wait until the server is ready +def wait_for_ready(timeout: float = 30.0, interval: float = 1.0): + start = time.time() + while True: + try: + r = requests.get(f"{BASE_URL}/readyz", timeout=2.0) + if r.status_code == 200: + return + except requests.RequestException: + pass + if time.time() - start > timeout: + pytest.skip("Server did not become ready in time") + time.sleep(interval) + +@pytest.fixture(scope="module", autouse=True) +def ensure_server_ready(): + """Wait for the /readyz endpoint before running tests.""" + wait_for_ready() + + +def test_healthz(): + r = requests.get(f"{BASE_URL}/healthz") + assert r.status_code == 200 + assert r.json().get("status") == "ok" + + +def test_readyz(): + r = requests.get(f"{BASE_URL}/readyz") + assert r.status_code == 200 + assert r.json().get("status") == "ready" + + +def test_model_info(): + """Test the simplified /model/download/info endpoint.""" + r = requests.get(f"{BASE_URL}/model/download/info") + assert r.status_code == 200 + + data = r.json() + assert "model_type" in data + assert "model_status" in data + assert "available_endpoints" in data + assert data["model_type"] in ["bayesian_ridge", "xgboost"] + assert isinstance(data["model_status"], dict) + + print(f"Server using model type: {data['model_type']}") 
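+    # Illustrative /model/download/info payload (shape inferred from the
+    # assertions in this test; values beyond the documented endpoint mapping
+    # are hypothetical):
+    #   {"model_type": "bayesian_ridge",
+    #    "model_status": {...},
+    #    "coefficients_info": {...},
+    #    "available_endpoints": {"coefficients": "/metrics"}}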
+ + if data["model_type"] == "bayesian_ridge": + assert "coefficients_info" in data + assert data["available_endpoints"]["coefficients"] == "/metrics" + else: # XGBoost + assert "trees" in data["available_endpoints"] + + +def test_root_endpoint_enhanced(): + """Test the enhanced root endpoint that now includes model info.""" + r = requests.get(f"{BASE_URL}/") + assert r.status_code == 200 + + data = r.json() + assert "message" in data + assert "model_type" in data + assert data["model_type"] in ["bayesian_ridge", "xgboost"] + + +def test_add_training_data_bulk(): + """ + Send 120 training samples in one bulk request so the server can retrain: + Updated equations with prefix cache score: + actual_ttft_ms = 2*input_token_length + 3*num_request_waiting + + 4*num_request_running + 50*kv_cache_percentage + + 30*prefix_cache_score + 95 + actual_tpot_ms = 100*kv_cache_percentage + 0.5*input_token_length + 1*num_tokens_generated + + 5*num_request_running + 9 + """ + entries = [] + common = { + "kv_cache_percentage": 0.5, + "num_request_running": 1, + } + + for i in range(1, 121): + waiting = i % 10 + 1 + tokens = waiting + inp_len = 10 * i + kv = common["kv_cache_percentage"] + running = common["num_request_running"] + prefix_cache = random.uniform(0.1, 0.9) # Added prefix cache score + + entries.append({ + "kv_cache_percentage": kv, + "input_token_length": inp_len, + "num_request_waiting": waiting, + "num_request_running": running, + # Updated TTFT formula to include prefix_cache_score + "actual_ttft_ms": (inp_len*2.0 + waiting*3.0 + running*4.0 + kv*50.0 + prefix_cache*30.0) + 95, + # TPOT formula remains unchanged + "actual_tpot_ms": (kv*100.0 + inp_len*0.5 + tokens*1.0 + running*5.0) + 9, + "num_tokens_generated": tokens, + "prefix_cache_score": prefix_cache, # Added prefix cache score + "timestamp": time.time() # FastAPI will coerce to datetime + }) + + payload = {"entries": entries} + r = requests.post(f"{BASE_URL}/add_training_data_bulk", json=payload) + assert r.status_code == 202, f"Expected 202, got {r.status_code}" + assert r.json().get("message") == "Accepted 120 training samples." + + +def test_model_learns_equation(): + """ + After sending bulk data, poll /predict until the model's predictions + match our linear equations within tolerance, or fail after 60s. + Updated to include prefix_cache_score in the test equation. 
+ """ + # First check what model type we're using + model_info_r = requests.get(f"{BASE_URL}/model/download/info") + model_type = model_info_r.json().get("model_type", "unknown") + + features = { + "kv_cache_percentage": 0.5, + "input_token_length": 200, + "num_request_waiting": 4, + "num_request_running": 1, + "num_tokens_generated": 4, + "prefix_cache_score": 0.7, # Added prefix cache score + } + + # Updated expected TTFT to include prefix cache score + expected_ttft = ( + features["input_token_length"] * 2.0 + + features["num_request_waiting"] * 3.0 + + features["num_request_running"] * 4.0 + + features["kv_cache_percentage"] * 50.0 + + features["prefix_cache_score"] * 30.0 # New term + + 95 + ) + # TPOT formula remains unchanged + expected_tpot = ( + features["kv_cache_percentage"] * 100.0 + + features["input_token_length"] * 0.5 + + features["num_tokens_generated"] * 1.0 + + features["num_request_running"] * 5.0 + 9 + ) + + # Adjust tolerance based on model type + # XGBoost might need more tolerance for tree-based predictions + tolerance = 0.15 if model_type == "xgboost" else 0.1 + + deadline = time.time() + 60.0 + last_ttft, last_tpot = None, None + + while time.time() < deadline: + r = requests.post(f"{BASE_URL}/predict", json=features) + if r.status_code != 200: + time.sleep(1) + continue + + body = r.json() + last_ttft = body["ttft_ms"] + last_tpot = body["tpot_ms"] + + # Verify the response includes model_type + assert "model_type" in body, "Response should include model_type" + assert body["model_type"] == model_type + + ttft_ok = abs(last_ttft - expected_ttft) <= tolerance * expected_ttft + tpot_ok = abs(last_tpot - expected_tpot) <= tolerance * expected_tpot + if ttft_ok and tpot_ok: + print(f"Model converged with {model_type} in {60.0 - (deadline - time.time()):.1f}s") + print(f" Expected TTFT: {expected_ttft:.1f}, Got: {last_ttft:.1f}") + print(f" Expected TPOT: {expected_tpot:.1f}, Got: {last_tpot:.1f}") + break + + time.sleep(1) + + assert last_ttft is not None, "Never got a successful prediction." + assert abs(last_ttft - expected_ttft) <= tolerance * expected_ttft, ( + f"TTFT={last_ttft:.1f} not within ±{tolerance*100}% of {expected_ttft:.1f} (model: {model_type})" + ) + assert abs(last_tpot - expected_tpot) <= tolerance * expected_tpot, ( + f"TPOT={last_tpot:.1f} not within ±{tolerance*100}% of {expected_tpot:.1f} (model: {model_type})" + ) + + +def test_prediction_missing_prefix_cache_score(): + """Test that predictions fail when prefix_cache_score is missing.""" + features = { + "kv_cache_percentage": 0.5, + "input_token_length": 200, + "num_request_waiting": 4, + "num_request_running": 1, + "num_tokens_generated": 4, + # Missing prefix_cache_score + } + + r = requests.post(f"{BASE_URL}/predict", json=features) + assert r.status_code == 422 # Should fail validation + + print("✓ Prediction correctly failed when prefix_cache_score was missing") + + +def test_prefix_cache_score_impact_on_ttft(): + """ + Test that prefix_cache_score has the expected impact on TTFT predictions. + Since our test equation has +30*prefix_cache_score, higher scores should increase TTFT. 
+    """
+    print("Testing prefix cache score impact on TTFT predictions...")
+
+    base_features = {
+        "kv_cache_percentage": 0.5,
+        "input_token_length": 300,
+        "num_request_waiting": 4,
+        "num_request_running": 2,
+        "num_tokens_generated": 15,
+    }
+
+    prefix_cache_scores = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
+    predictions = []
+
+    for prefix_score in prefix_cache_scores:
+        test_features = {**base_features, "prefix_cache_score": prefix_score}
+
+        pred_r = requests.post(f"{BASE_URL}/predict", json=test_features, timeout=10)
+        assert pred_r.status_code == 200
+
+        pred_data = pred_r.json()
+        predictions.append({
+            "prefix_cache_score": prefix_score,
+            "ttft_ms": pred_data["ttft_ms"],
+            "tpot_ms": pred_data["tpot_ms"]
+        })
+
+        print(f" Prefix cache {prefix_score:.1f}: TTFT={pred_data['ttft_ms']:.1f}ms, TPOT={pred_data['tpot_ms']:.1f}ms")
+
+    # Check that TTFT increases as prefix cache score increases
+    # (since our test equation has +30*prefix_cache_score)
+    ttft_values = [p["ttft_ms"] for p in predictions]
+
+    # Compare mean TTFT between the lower and upper halves of the prefix cache range
+    first_half_avg = sum(ttft_values[:3]) / 3  # Low prefix cache scores
+    second_half_avg = sum(ttft_values[3:]) / 3  # High prefix cache scores
+
+    print(f"Low prefix cache avg TTFT: {first_half_avg:.1f}ms")
+    print(f"High prefix cache avg TTFT: {second_half_avg:.1f}ms")
+
+    # Since our training equation has +30*prefix_cache_score, higher prefix cache should increase TTFT
+    ttft_difference = second_half_avg - first_half_avg
+    print(f"TTFT difference (high - low prefix cache): {ttft_difference:.1f}ms")
+
+    # Should be positive difference (higher prefix cache = higher TTFT in our test equation)
+    assert ttft_difference > 10, f"Expected TTFT to increase with prefix cache score, got difference: {ttft_difference:.1f}ms"
+
+    # TPOT should not be significantly affected by prefix cache score
+    tpot_values = [p["tpot_ms"] for p in predictions]
+    tpot_first_half = sum(tpot_values[:3]) / 3
+    tpot_second_half = sum(tpot_values[3:]) / 3
+    tpot_difference = abs(tpot_second_half - tpot_first_half)
+
+    print(f"TPOT difference (should be small): {tpot_difference:.1f}ms")
+    assert tpot_difference < 5, f"TPOT should not be significantly affected by prefix cache, got difference: {tpot_difference:.1f}ms"
+
+    print("✓ Prefix cache score impact test passed")
+
+
+def test_prediction_response_format():
+    """Test that prediction responses include all expected fields including new model_type."""
+    features = generate_random_prediction_payload()
+
+    r = requests.post(f"{BASE_URL}/predict", json=features)
+    assert r.status_code == 200
+
+    data = r.json()
+    required_fields = [
+        "ttft_ms", "tpot_ms", "ttft_uncertainty", "tpot_uncertainty",
+        "ttft_prediction_bounds", "tpot_prediction_bounds",
+        "predicted_at", "model_type"
+    ]
+
+    for field in required_fields:
+        assert field in data, f"Missing required field: {field}"
+
+    # Verify model_type is valid
+    assert data["model_type"] in ["bayesian_ridge", "xgboost"]
+
+    # Verify numeric fields are reasonable
+    assert data["ttft_ms"] >= 0
+    assert data["tpot_ms"] >= 0
+    assert data["ttft_uncertainty"] >= 0
+    assert data["tpot_uncertainty"] >= 0
+
+    # Verify bounds are tuples
+    assert len(data["ttft_prediction_bounds"]) == 2
+    assert len(data["tpot_prediction_bounds"]) == 2
+
+
+def test_metrics_endpoint_enhanced():
+    """Test that the metrics endpoint includes model-specific information with proper coefficients."""
+    r = requests.get(f"{BASE_URL}/metrics")
+    assert r.status_code == 200
+
+    content = r.text
+
+    # Should contain
model type metric + assert "model_type{" in content + + # Should contain either coefficients (Bayesian Ridge) or importance (XGBoost) + has_coef = "ttft_coef{" in content or "tpot_coef{" in content + has_importance = "ttft_importance{" in content or "tpot_importance{" in content + + assert has_coef or has_importance, "Should have either coefficients or feature importance metrics" + + # Should have standard metrics + assert "ttft_r2_score{" in content + assert "tpot_r2_score{" in content + assert "training_samples_count" in content + + # Check for prefix_cache_score in TTFT metrics + if has_coef: + assert 'feature="prefix_cache_score"' in content, "Should have prefix_cache_score coefficient for TTFT model" + if has_importance: + assert 'feature="prefix_cache_score"' in content, "Should have prefix_cache_score importance for TTFT model" + + # Parse and validate coefficient values for Bayesian Ridge + model_info_r = requests.get(f"{BASE_URL}/model/download/info") + model_type = model_info_r.json().get("model_type") + + if model_type == "bayesian_ridge": + # Check that coefficients are present and reasonable + lines = content.split('\n') + ttft_intercept = None + ttft_coefs = {} + tpot_intercept = None + tpot_coefs = {} + + for line in lines: + if line.startswith('ttft_intercept{'): + ttft_intercept = float(line.split('}')[1].strip()) + elif line.startswith('ttft_coef{'): + feature = line.split('feature="')[1].split('"')[0] + value = float(line.split('}')[1].strip()) + ttft_coefs[feature] = value + elif line.startswith('tpot_intercept{'): + tpot_intercept = float(line.split('}')[1].strip()) + elif line.startswith('tpot_coef{'): + feature = line.split('feature="')[1].split('"')[0] + value = float(line.split('}')[1].strip()) + tpot_coefs[feature] = value + + # Validate coefficients are present + assert ttft_intercept is not None, "TTFT intercept should be present" + assert tpot_intercept is not None, "TPOT intercept should be present" + + # Updated expected features to include prefix_cache_score for TTFT + expected_ttft_features = ["kv_cache_percentage", "input_token_length", "num_request_waiting", "num_request_running", "prefix_cache_score"] + expected_tpot_features = ["kv_cache_percentage", "input_token_length", "num_request_waiting", "num_request_running", "num_tokens_generated"] + + for feature in expected_ttft_features: + assert feature in ttft_coefs, f"TTFT coefficient for {feature} should be present" + + for feature in expected_tpot_features: + assert feature in tpot_coefs, f"TPOT coefficient for {feature} should be present" + + print(f"✓ Bayesian Ridge coefficients validated:") + print(f" TTFT intercept: {ttft_intercept:.4f}") + print(f" TTFT coefficients: {ttft_coefs}") + print(f" TPOT intercept: {tpot_intercept:.4f}") + print(f" TPOT coefficients: {tpot_coefs}") + + # Validate prefix_cache_score coefficient is reasonable + if "prefix_cache_score" in ttft_coefs: + prefix_coef = ttft_coefs["prefix_cache_score"] + print(f" Prefix cache coefficient: {prefix_coef:.4f}") + # Should be positive and reasonably close to our training value of 30 + assert 10 < prefix_coef < 50, f"Prefix cache coefficient should be reasonable: {prefix_coef}" + + print("✓ Training server metrics endpoint working correctly with prefix cache support") + + +def test_xgboost_tree_endpoints(): + """Test XGBoost tree endpoints if XGBoost is being used.""" + model_info_r = requests.get(f"{BASE_URL}/model/download/info") + model_type = model_info_r.json().get("model_type") + + if model_type != "xgboost": + print("Skipping 
XGBoost tree tests - not using XGBoost model") + return + + print("Testing XGBoost tree endpoints...") + + # Test TTFT trees + ttft_response = requests.get(f"{BASE_URL}/model/ttft/xgb/json") + assert ttft_response.status_code == 200, "TTFT XGBoost trees should be available" + ttft_trees = ttft_response.json() + assert isinstance(ttft_trees, list), "TTFT trees should be a list" + assert len(ttft_trees) > 0, "Should have TTFT trees" + assert isinstance(ttft_trees[0], dict), "Each tree should be a dict" + + # Test TPOT trees + tpot_response = requests.get(f"{BASE_URL}/model/tpot/xgb/json") + assert tpot_response.status_code == 200, "TPOT XGBoost trees should be available" + tpot_trees = tpot_response.json() + assert isinstance(tpot_trees, list), "TPOT trees should be a list" + assert len(tpot_trees) > 0, "Should have TPOT trees" + assert isinstance(tpot_trees[0], dict), "Each tree should be a dict" + + print(f"✓ XGBoost trees available: {len(ttft_trees)} TTFT trees, {len(tpot_trees)} TPOT trees") + + +def test_bayesian_ridge_coefficients(): + """Test that Bayesian Ridge coefficients are properly descaled and stored.""" + model_info_r = requests.get(f"{BASE_URL}/model/download/info") + model_type = model_info_r.json().get("model_type") + + if model_type != "bayesian_ridge": + print("Skipping Bayesian Ridge coefficient tests - not using Bayesian Ridge model") + return + + print("Testing Bayesian Ridge coefficient storage and retrieval...") + + # Get coefficients from metrics + r = requests.get(f"{BASE_URL}/metrics") + assert r.status_code == 200 + content = r.text + + # Parse coefficients from metrics + lines = content.split('\n') + ttft_coefs = {} + tpot_coefs = {} + + for line in lines: + if line.startswith('ttft_coef{'): + feature = line.split('feature="')[1].split('"')[0] + value = float(line.split('}')[1].strip()) + ttft_coefs[feature] = value + elif line.startswith('tpot_coef{'): + feature = line.split('feature="')[1].split('"')[0] + value = float(line.split('}')[1].strip()) + tpot_coefs[feature] = value + + # Test a prediction to see if coefficients make sense + test_features = { + "kv_cache_percentage": 0.5, + "input_token_length": 100, + "num_request_waiting": 2, + "num_request_running": 1, + "num_tokens_generated": 5, + "prefix_cache_score": 0.8, # Added prefix cache score + } + + # Make prediction via API + pred_response = requests.post(f"{BASE_URL}/predict", json=test_features) + assert pred_response.status_code == 200 + api_prediction = pred_response.json() + + print(f"✓ Coefficients extracted from metrics:") + print(f" TTFT coefficients: {ttft_coefs}") + print(f" TPOT coefficients: {tpot_coefs}") + print(f" API TTFT prediction: {api_prediction['ttft_ms']:.2f}") + print(f" API TPOT prediction: {api_prediction['tpot_ms']:.2f}") + + # Verify prefix_cache_score coefficient exists for TTFT + assert "prefix_cache_score" in ttft_coefs, "prefix_cache_score should be in TTFT coefficients" + assert "prefix_cache_score" not in tpot_coefs, "prefix_cache_score should NOT be in TPOT coefficients" + + +def test_model_endpoints_by_type(): + """Test the appropriate endpoints based on model type.""" + model_info_r = requests.get(f"{BASE_URL}/model/download/info") + model_info = model_info_r.json() + model_type = model_info["model_type"] + + print(f"Testing endpoints for model type: {model_type}") + + if model_type == "bayesian_ridge": + # For Bayesian Ridge, we should have coefficients in metrics + test_bayesian_ridge_coefficients() + + # XGBoost endpoints should return 404 + ttft_xgb_response = 
requests.get(f"{BASE_URL}/model/ttft/xgb/json") + assert ttft_xgb_response.status_code == 404, "XGBoost endpoints should not be available for Bayesian Ridge" + + print("✓ Bayesian Ridge: coefficients available in metrics, XGBoost endpoints properly blocked") + + else: # XGBoost + # For XGBoost, we should have tree endpoints + test_xgboost_tree_endpoints() + + print("✓ XGBoost: tree endpoints available") + + +def generate_random_prediction_payload(): + """Generate a random prediction payload for stress testing including prefix_cache_score.""" + return { + "kv_cache_percentage": random.uniform(0.1, 0.9), + "input_token_length": random.randint(10, 1000), + "num_request_waiting": random.randint(1, 20), + "num_request_running": random.randint(1, 10), + "num_tokens_generated": random.randint(1, 20), + "prefix_cache_score": random.uniform(0.0, 1.0), # Added prefix cache score + } + + +def generate_random_training_payload(): + """Generate a random training data payload for stress testing with updated TTFT formula.""" + input_tokens = random.randint(10, 1000) + waiting_requests = random.randint(1, 20) + running_requests = random.randint(1, 10) + kv = random.uniform(0.01, 0.99) + tokens_generated = random.randint(1, 20) + prefix_cache = random.uniform(0.0, 1.0) # Added prefix cache score + + return { + "kv_cache_percentage": kv, + "input_token_length": input_tokens, + "num_request_waiting": waiting_requests, + "num_request_running": running_requests, + # Updated linear TTFT with noise - now includes prefix_cache_score + "actual_ttft_ms": ( + input_tokens * 2.0 + + waiting_requests * 3.0 + + running_requests * 4.0 + + kv * 50.0 + + prefix_cache * 30.0 # New term for prefix cache + + 95 + random.uniform(-10, 10) + ), + # TPOT formula remains unchanged + "actual_tpot_ms": ( + kv * 100.0 + + input_tokens * 0.5 + + tokens_generated * 1.0 + + running_requests * 5.0 + + 9 + random.uniform(-5, 5) + ), + "num_tokens_generated": tokens_generated, + "prefix_cache_score": prefix_cache, # Added prefix cache score + } + + +def generate_bulk_training_payload(size=1000): + """Generate a bulk training payload with specified number of entries.""" + entries = [] + for _ in range(size): + entries.append(generate_random_training_payload()) + return {"entries": entries} + + +async def async_post_request(session, url, payload, request_id): + """Make an async POST request and return result with metadata.""" + start_time = time.time() + try: + async with session.post(url, json=payload, timeout=aiohttp.ClientTimeout(total=5)) as response: + end_time = time.time() + response_data = await response.json() + return { + 'request_id': request_id, + 'status_code': response.status, + 'response_time': end_time - start_time, + 'success': response.status in [200, 202], + 'response_data': response_data, + 'request_type': 'predict' if '/predict' in url else 'training', + 'model_type': response_data.get('model_type') if response.status == 200 else None + } + except Exception as e: + end_time = time.time() + return { + 'request_id': request_id, + 'status_code': 0, + 'response_time': end_time - start_time, + 'success': False, + 'error': str(e), + 'request_type': 'predict' if '/predict' in url else 'training', + 'model_type': None + } + +async def run_stress_test_async(duration_seconds=10, target_qps=300): + interval = 1.0/target_qps + start = time.time() + connector = aiohttp.TCPConnector(limit=10000, limit_per_host=10000, ttl_dns_cache=300, use_dns_cache=True) + async with aiohttp.ClientSession(connector=connector, 
timeout=aiohttp.ClientTimeout(total=2)) as sess:
+        tasks = []
+        req_id = 0
+        next_time = start
+        while time.time() - start < duration_seconds:
+            now = time.time()
+            while next_time <= now:
+                req_id += 1
+                if random.random() < 0.5:
+                    url = f"{BASE_URL}/predict"
+                    payload = generate_random_prediction_payload()
+                else:
+                    url = f"{BASE_URL}/add_training_data_bulk"
+                    payload = {"entries": [generate_random_training_payload()]}
+                tasks.append(asyncio.create_task(async_post_request(sess, url, payload, req_id)))
+                next_time += interval
+            await asyncio.sleep(0.0001)
+
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        valid_results = [r for r in results if isinstance(r, dict)]
+
+        # Approximate achieved QPS, assuming the test ran for the full duration
+        if valid_results:
+            actual_duration = duration_seconds
+            actual_qps = len(valid_results) / actual_duration
+            print(f"Target QPS: {target_qps}, Actual QPS: {actual_qps:.0f}")
+
+        return valid_results
+
+
+def fetch_and_parse_xgb_json(path_suffix):
+    """
+    Download the XGBoost JSON dump for `path_suffix` (ttft or tpot),
+    parse into a Python list of dicts, and return it.
+    """
+    url = f"{BASE_URL}/model/{path_suffix}/xgb/json"
+    r = requests.get(url, timeout=10)
+    assert r.status_code == 200, f"Failed to fetch JSON for {path_suffix}"
+    trees = r.json()
+    assert isinstance(trees, list), "Expected a JSON array of trees"
+    assert len(trees) > 0, "Tree list should not be empty"
+    assert isinstance(trees[0], dict), "Each tree must be a JSON object"
+    return trees
+
+
+async def async_fetch_and_parse_xgb_json(session, suffix, request_id):
+    """
+    Async GET /model/{suffix}/xgb/json and return timing + status.
+    """
+    url = f"{BASE_URL}/model/{suffix}/xgb/json"
+    start = time.time()
+    try:
+        async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as resp:
+            data = await resp.json()
+            elapsed = time.time() - start
+            return {
+                'request_id': request_id,
+                'request_type': f'download_{suffix}',
+                'status_code': resp.status,
+                'response_time': elapsed,
+                'success': resp.status == 200,
+                'tree_count': len(data) if isinstance(data, list) else None
+            }
+    except Exception as e:
+        elapsed = time.time() - start
+        return {
+            'request_id': request_id,
+            'request_type': f'download_{suffix}',
+            'status_code': 0,
+            'response_time': elapsed,
+            'success': False,
+            'error': str(e)
+        }
+
+
+async def run_simplified_stress_test(duration_seconds=10, target_qps=2):
+    """
+    Simplified stress test: bulk training vs predictions and tree downloads (XGBoost only).
+ """ + info_r = requests.get(f"{BASE_URL}/model/download/info", timeout=5.0) + model_type = info_r.json().get("model_type", "bayesian_ridge") + + interval = 1.0 / target_qps + start = time.time() + connector = aiohttp.TCPConnector(limit=1000, limit_per_host=1000) + async with aiohttp.ClientSession(connector=connector) as sess: + tasks = [] + req_id = 0 + next_time = start + + while time.time() - start < duration_seconds: + now = time.time() + while next_time <= now: + req_id += 1 + + if random.random() < 0.5: + # Either predictions or tree downloads (XGBoost only) + if random.random() < 0.7: # 70% predictions + url = f"{BASE_URL}/predict" + payload = generate_random_prediction_payload() + task = asyncio.create_task( + async_post_request_with_timeout( + sess, url, payload, req_id, + aiohttp.ClientTimeout(total=5), "predict" + ) + ) + else: # 30% tree downloads (only for XGBoost) + if model_type == "xgboost": + suffix = random.choice(["ttft", "tpot"]) + task = asyncio.create_task( + async_fetch_and_parse_xgb_json(sess, suffix, req_id) + ) + else: + # For Bayesian Ridge, just do another prediction + url = f"{BASE_URL}/predict" + payload = generate_random_prediction_payload() + task = asyncio.create_task( + async_post_request_with_timeout( + sess, url, payload, req_id, + aiohttp.ClientTimeout(total=5), "predict" + ) + ) + else: + # bulk training + url = f"{BASE_URL}/add_training_data_bulk" + payload = generate_bulk_training_payload(1000) + task = asyncio.create_task( + async_post_request_with_timeout( + sess, url, payload, req_id, + aiohttp.ClientTimeout(total=30), "bulk_training" + ) + ) + + tasks.append(task) + next_time += interval + + await asyncio.sleep(0.001) + + print(f"Waiting for {len(tasks)} requests to complete…") + results = await asyncio.gather(*tasks, return_exceptions=True) + valid = [r for r in results if isinstance(r, dict)] + + if valid: + actual_qps = len(valid) / duration_seconds + print(f"Target QPS: {target_qps}, Actual QPS: {actual_qps:.2f}") + + return valid + + +async def async_post_request_with_timeout(session, url, payload, request_id, timeout, request_type): + """Make an async POST request with custom timeout and return result with metadata.""" + start_time = time.time() + try: + async with session.post(url, json=payload, timeout=timeout) as response: + end_time = time.time() + response_data = await response.json() + + # Count training entries for bulk requests + training_entries = len(payload.get("entries", [])) if request_type == "bulk_training" else 1 + + return { + 'request_id': request_id, + 'status_code': response.status, + 'response_time': end_time - start_time, + 'success': response.status in [200, 202], + 'response_data': response_data, + 'request_type': request_type, + 'training_entries': training_entries if request_type == "bulk_training" else 0, + 'model_type': response_data.get('model_type') if response.status == 200 and request_type == 'predict' else None + } + except Exception as e: + end_time = time.time() + training_entries = len(payload.get("entries", [])) if request_type == "bulk_training" else 1 + return { + 'request_id': request_id, + 'status_code': 0, + 'response_time': end_time - start_time, + 'success': False, + 'error': str(e), + 'request_type': request_type, + 'training_entries': training_entries if request_type == "bulk_training" else 0, + 'model_type': None + } + + +def analyze_stress_test_results(results): + """Analyze and print stress test results with model type information.""" + if not results: + print("No results to analyze") + 
return
+
+    total_requests = len(results)
+    successful_requests = sum(1 for r in results if r.get('success', False))
+    failed_requests = total_requests - successful_requests
+
+    response_times = [r['response_time'] for r in results if r.get('response_time')]
+    avg_response_time = sum(response_times) / len(response_times) if response_times else 0
+
+    status_codes = defaultdict(int)
+    for r in results:
+        status_codes[r.get('status_code', 0)] += 1
+
+    request_types = defaultdict(int)
+    for r in results:
+        request_types[r.get('request_type', 'unknown')] += 1
+
+    # Analyze model types in prediction responses
+    model_types = defaultdict(int)
+    for r in results:
+        if r.get('model_type'):
+            model_types[r['model_type']] += 1
+
+    # NOTE: the slowest observed response time is only a rough proxy for the
+    # true wall-clock test duration, so the QPS reported here is indicative.
+    test_duration = max(response_times) if response_times else 0
+    actual_qps = total_requests / test_duration if test_duration > 0 else 0
+
+    print(f"\n{'='*50}")
+    print("STRESS TEST RESULTS")
+    print(f"{'='*50}")
+    print(f"Total Requests: {total_requests}")
+    print(f"Successful: 
{successful_requests} ({successful_requests/total_requests*100:.1f}%)") + print(f"Failed: {failed_requests} ({failed_requests/total_requests*100:.1f}%)") + print(f"Average Response Time: {avg_response_time*1000:.2f}ms") + + print(f"\nRequest Type Breakdown:") + print(f" Prediction requests: {len(prediction_results)}") + print(f" Bulk training requests: {len(bulk_training_results)}") + print(f" Model download requests: {len(download_results)}") + print(f" Total training entries processed: {total_training_entries}") + + if model_types: + print(f"\nModel Types in Predictions:") + for model_type, count in model_types.items(): + print(f" {model_type}: {count}") + + print(f"\nStatus Code Distribution:") + for status, count in status_codes.items(): + print(f" {status}: {count}") + + # Response time analysis by request type + if prediction_results: + pred_times = [r['response_time'] for r in prediction_results if r.get('response_time')] + if pred_times: + avg_pred_time = sum(pred_times) / len(pred_times) + print(f"\nPrediction Request Response Times:") + print(f" Average: {avg_pred_time*1000:.2f}ms") + print(f" Min: {min(pred_times)*1000:.2f}ms") + print(f" Max: {max(pred_times)*1000:.2f}ms") + + if bulk_training_results: + bulk_times = [r['response_time'] for r in bulk_training_results if r.get('response_time')] + if bulk_times: + avg_bulk_time = sum(bulk_times) / len(bulk_times) + print(f"\nBulk Training Request Response Times:") + print(f" Average: {avg_bulk_time*1000:.2f}ms") + print(f" Min: {min(bulk_times)*1000:.2f}ms") + print(f" Max: {max(bulk_times)*1000:.2f}ms") + + if download_results: + download_times = [r['response_time'] for r in download_results if r.get('response_time')] + if download_times: + avg_download_time = sum(download_times) / len(download_times) + print(f"\nModel Download Request Response Times:") + print(f" Average: {avg_download_time*1000:.2f}ms") + print(f" Min: {min(download_times)*1000:.2f}ms") + print(f" Max: {max(download_times)*1000:.2f}ms") + + if response_times: + sorted_times = sorted(response_times) + p50 = sorted_times[int(len(sorted_times) * 0.5)] * 1000 + p95 = sorted_times[int(len(sorted_times) * 0.95)] * 1000 + p99 = sorted_times[int(len(sorted_times) * 0.99)] * 1000 + print(f"\nOverall Response Time Percentiles:") + print(f" P50: {p50:.2f}ms") + print(f" P95: {p95:.2f}ms") + print(f" P99: {p99:.2f}ms") + + +def test_stress_test_high_qps(): + """ + Stress test with 300 QPS for 10 seconds. + Sends predictions and training data in parallel. + """ + results = asyncio.run(run_stress_test_async(duration_seconds=10, target_qps=300)) + + analyze_stress_test_results(results) + + assert len(results) > 0, "No requests were made" + + successful_requests = sum(1 for r in results if r.get('success', False)) + success_rate = successful_requests / len(results) + + assert success_rate > 0.8, f"Success rate too low: {success_rate*100:.1f}%" + + print(f"Stress test completed successfully with {success_rate*100:.1f}% success rate") + + +def test_stress_test_mixed_load(): + """ + Alternative stress test with mixed load patterns. + Tests server stability under varying load conditions. 
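+
+    Load profile (matching the three phases driven below):
+
+        Phase 1:  5s @ 100 QPS  (ramp up)
+        Phase 2: 10s @ 300 QPS  (sustained peak)
+        Phase 3:  5s @  50 QPS  (cool down)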
+ """ + print("Running mixed load stress test...") + + print("Phase 1: Ramping up load...") + results_phase1 = asyncio.run(run_stress_test_async(duration_seconds=5, target_qps=100)) + + print("Phase 2: High sustained load...") + results_phase2 = asyncio.run(run_stress_test_async(duration_seconds=10, target_qps=300)) + + print("Phase 3: Cooling down...") + results_phase3 = asyncio.run(run_stress_test_async(duration_seconds=5, target_qps=50)) + + all_results = results_phase1 + results_phase2 + results_phase3 + + print("\nCOMBINED RESULTS FOR ALL PHASES:") + analyze_stress_test_results(all_results) + + assert len(all_results) > 0, "No requests were made" + + successful_requests = sum(1 for r in all_results if r.get('success', False)) + success_rate = successful_requests / len(all_results) + + assert success_rate > 0.75, f"Overall success rate too low: {success_rate*100:.1f}%" + + print(f"Mixed load stress test completed with {success_rate*100:.1f}% success rate") + + +def test_simplified_stress_test(): + """Simplified stress test focusing on predictions, training, and tree downloads with prefix cache.""" + print("Running simplified stress test with prefix cache score support...") + print("Configuration: 2 QPS, 50% bulk training, 35% predictions, 15% tree downloads (XGBoost only)") + + results = asyncio.run(run_simplified_stress_test(duration_seconds=60, target_qps=2)) + + analyze_bulk_training_results(results) + + assert len(results) > 0, "No requests were made" + + successful_requests = sum(1 for r in results if r.get('success', False)) + success_rate = successful_requests / len(results) + + # Count request types + prediction_count = sum(1 for r in results if r.get('request_type') == 'predict') + bulk_training_count = sum(1 for r in results if r.get('request_type') == 'bulk_training') + download_count = sum(1 for r in results if r.get('request_type', '').startswith('download_')) + + assert success_rate > 0.8, f"Success rate too low: {success_rate*100:.1f}%" + assert prediction_count > 0, "No prediction requests were made" + assert bulk_training_count > 0, "No bulk training requests were made" + + print(f"✓ Simplified stress test with prefix cache completed:") + print(f" Success rate: {success_rate*100:.1f}%") + print(f" Prediction requests: {prediction_count}") + print(f" Tree download requests: {download_count}") + print(f" Bulk training requests: {bulk_training_count}") + + +def test_model_type_consistency(): + """ + Test that the model type is consistent across all API endpoints. 
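+
+    Illustrative shape of the three responses being compared (the concrete
+    model_type depends on the deployment; only equality is asserted):
+
+        GET  /                     -> {"model_type": "xgboost", ...}
+        GET  /model/download/info  -> {"model_type": "xgboost", ...}
+        POST /predict              -> {"model_type": "xgboost", ...}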
+    """
+    print("Testing model type consistency across endpoints...")
+
+    # Get model type from different endpoints
+    root_response = requests.get(f"{BASE_URL}/")
+    model_info_response = requests.get(f"{BASE_URL}/model/download/info")
+
+    # Make a prediction to get model type from prediction response
+    prediction_request = generate_random_prediction_payload()
+    prediction_response = requests.post(f"{BASE_URL}/predict", json=prediction_request)
+
+    # Extract model types
+    root_model_type = root_response.json().get("model_type")
+    model_info_model_type = model_info_response.json().get("model_type")
+    prediction_model_type = prediction_response.json().get("model_type")
+
+    # Check consistency
+    assert root_model_type == model_info_model_type == prediction_model_type, (
+        f"Model type inconsistency: root={root_model_type}, "
+        f"model_info={model_info_model_type}, prediction={prediction_model_type}"
+    )
+
+    print(f"Model type consistent across all endpoints: {root_model_type}")
+
+
+def test_xgboost_vs_bayesian_ridge_performance():
+    """
+    Performance smoke test for whichever model is currently loaded
+    (XGBoost or Bayesian Ridge): reports prediction latency and output
+    statistics over ten random payloads.
+    """
+    model_info_r = requests.get(f"{BASE_URL}/model/download/info")
+    model_info = model_info_r.json()
+
+    print(f"Current model: {model_info['model_type']}")
+
+    # Generate test predictions with prefix cache scores
+    test_cases = [generate_random_prediction_payload() for _ in range(10)]
+
+    predictions = []
+    response_times = []
+
+    for test_case in test_cases:
+        start_time = time.time()
+        response = requests.post(f"{BASE_URL}/predict", json=test_case)
+        end_time = time.time()
+
+        assert response.status_code == 200
+        predictions.append(response.json())
+        response_times.append((end_time - start_time) * 1000)  # Convert to ms
+
+    avg_response_time = sum(response_times) / len(response_times)
+    avg_prefix_cache = sum(tc['prefix_cache_score'] for tc in test_cases) / len(test_cases)
+
+    print(f"Model: {predictions[0]['model_type']}")
+    print(f"Average response time: {avg_response_time:.2f}ms")
+    print(f"Average prefix cache score: {avg_prefix_cache:.2f}")
+    print(f"Average TTFT prediction: {sum(p['ttft_ms'] for p in predictions)/len(predictions):.2f}ms")
+    print(f"Average TPOT prediction: {sum(p['tpot_ms'] for p in predictions)/len(predictions):.2f}ms")
+    print(f"Average TTFT uncertainty: {sum(p['ttft_uncertainty'] for p in predictions)/len(predictions):.2f}")
+    print(f"Average TPOT uncertainty: {sum(p['tpot_uncertainty'] for p in predictions)/len(predictions):.2f}")
+
+    # Basic sanity checks
+    assert avg_response_time < 1000, f"Response time too slow: {avg_response_time:.2f}ms"
+    assert all(p['ttft_ms'] > 0 for p in predictions), "All TTFT predictions should be positive"
+    assert all(p['tpot_ms'] > 0 for p in predictions), "All TPOT predictions should be positive"
+
+
+def test_uncertainty_estimation_quality():
+    """
+    Test the quality of uncertainty estimation for both model types.
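+
+    Checks performed (sketch):
+
+        * repeated identical inputs yield identical predictions
+          (asserted only for the deterministic Bayesian Ridge model)
+        * uncertainty stays between 1% and 50% of the predicted value
+          for both TTFT and TPOT
+        * each prediction lies inside its reported prediction bounds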
+    """
+    model_info_r = requests.get(f"{BASE_URL}/model/download/info")
+    model_type = model_info_r.json().get("model_type")
+
+    # Generate multiple predictions for the same input
+    test_payload = {
+        "kv_cache_percentage": 0.5,
+        "input_token_length": 100,
+        "num_request_waiting": 2,
+        "num_request_running": 1,
+        "num_tokens_generated": 5,
+        "prefix_cache_score": 0.8,  # Added prefix cache score
+    }
+
+    predictions = []
+    for _ in range(5):  # Make multiple identical requests
+        response = requests.post(f"{BASE_URL}/predict", json=test_payload)
+        assert response.status_code == 200
+        predictions.append(response.json())
+
+    # Check that predictions are consistent (should be identical for same input)
+    ttft_values = [p['ttft_ms'] for p in predictions]
+    tpot_values = [p['tpot_ms'] for p in predictions]
+
+    # Spread of each series relative to its first value (not a true standard
+    # deviation); it collapses to ~0 when the model is deterministic.
+    ttft_std = sum((x - ttft_values[0])**2 for x in ttft_values)**0.5 / len(ttft_values)
+    tpot_std = sum((x - tpot_values[0])**2 for x in tpot_values)**0.5 / len(tpot_values)
+
+    # For deterministic models, predictions should be identical
+    if model_type == "bayesian_ridge":
+        assert ttft_std < 0.01, f"TTFT predictions should be consistent, got std: {ttft_std}"
+        assert tpot_std < 0.01, f"TPOT predictions should be consistent, got std: {tpot_std}"
+
+    # Check uncertainty values are reasonable
+    pred = predictions[0]
+    ttft_uncertainty_ratio = pred['ttft_uncertainty'] / pred['ttft_ms']
+    tpot_uncertainty_ratio = pred['tpot_uncertainty'] / pred['tpot_ms']
+
+    print(f"Model: {model_type}")
+    print(f"Prefix cache score: {test_payload['prefix_cache_score']}")
+    print(f"TTFT: {pred['ttft_ms']:.2f} ± {pred['ttft_uncertainty']:.2f} ({ttft_uncertainty_ratio*100:.1f}%)")
+    print(f"TPOT: {pred['tpot_ms']:.2f} ± {pred['tpot_uncertainty']:.2f} ({tpot_uncertainty_ratio*100:.1f}%)")
+
+    # Uncertainty should be reasonable (not too high or too low)
+    assert 0.01 < ttft_uncertainty_ratio < 0.5, f"TTFT uncertainty ratio should be reasonable: {ttft_uncertainty_ratio}"
+    assert 0.01 < tpot_uncertainty_ratio < 0.5, f"TPOT uncertainty ratio should be reasonable: {tpot_uncertainty_ratio}"
+
+    # Check prediction bounds contain the prediction
+    ttft_bounds = pred['ttft_prediction_bounds']
+    tpot_bounds = pred['tpot_prediction_bounds']
+
+    assert ttft_bounds[0] <= pred['ttft_ms'] <= ttft_bounds[1], "TTFT should be within prediction bounds"
+    assert tpot_bounds[0] <= pred['tpot_ms'] <= tpot_bounds[1], "TPOT should be within prediction bounds"
+
+
+def test_edge_cases():
+    """
+    Test edge cases and boundary conditions with prefix cache score.
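+
+    Covers, in order:
+
+        * the smallest valid payload (zeros and ones)
+        * a large-but-valid payload (e.g. input_token_length=10000)
+        * out-of-range payloads, each of which must be rejected with
+          HTTP 422 (including prefix_cache_score outside [0, 1])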
+ """ + # Test minimum values + min_payload = { + "kv_cache_percentage": 0.0, + "input_token_length": 1, + "num_request_waiting": 0, + "num_request_running": 0, + "num_tokens_generated": 1, + "prefix_cache_score": 0.0, # Added prefix cache score + } + + response = requests.post(f"{BASE_URL}/predict", json=min_payload) + assert response.status_code == 200 + data = response.json() + assert data['ttft_ms'] > 0 + assert data['tpot_ms'] > 0 + + # Test maximum reasonable values + max_payload = { + "kv_cache_percentage": 1.0, + "input_token_length": 10000, + "num_request_waiting": 100, + "num_request_running": 50, + "num_tokens_generated": 1000, + "prefix_cache_score": 1.0, # Added prefix cache score + } + + response = requests.post(f"{BASE_URL}/predict", json=max_payload) + assert response.status_code == 200 + data = response.json() + assert data['ttft_ms'] > 0 + assert data['tpot_ms'] > 0 + + # Test invalid values (should fail validation) + invalid_payloads = [ + {"kv_cache_percentage": -0.1, "input_token_length": 100, "num_request_waiting": 1, "num_request_running": 1, "num_tokens_generated": 10, "prefix_cache_score": 0.5}, + {"kv_cache_percentage": 1.1, "input_token_length": 100, "num_request_waiting": 1, "num_request_running": 1, "num_tokens_generated": 10, "prefix_cache_score": 0.5}, + {"kv_cache_percentage": 0.5, "input_token_length": -1, "num_request_waiting": 1, "num_request_running": 1, "num_tokens_generated": 10, "prefix_cache_score": 0.5}, + {"kv_cache_percentage": 0.5, "input_token_length": 100, "num_request_waiting": -1, "num_request_running": 1, "num_tokens_generated": 10, "prefix_cache_score": 0.5}, + {"kv_cache_percentage": 0.5, "input_token_length": 100, "num_request_waiting": 1, "num_request_running": -1, "num_tokens_generated": 10, "prefix_cache_score": 0.5}, + {"kv_cache_percentage": 0.5, "input_token_length": 100, "num_request_waiting": 1, "num_request_running": 1, "num_tokens_generated": -1, "prefix_cache_score": 0.5}, + {"kv_cache_percentage": 0.5, "input_token_length": 100, "num_request_waiting": 1, "num_request_running": 1, "num_tokens_generated": 10, "prefix_cache_score": -0.1}, # Invalid prefix cache + {"kv_cache_percentage": 0.5, "input_token_length": 100, "num_request_waiting": 1, "num_request_running": 1, "num_tokens_generated": 10, "prefix_cache_score": 1.1}, # Invalid prefix cache + ] + + for invalid_payload in invalid_payloads: + response = requests.post(f"{BASE_URL}/predict", json=invalid_payload) + assert response.status_code == 422, f"Should reject invalid payload: {invalid_payload}" + + +def test_concurrent_training_and_prediction(): + """ + Test that training and prediction can happen concurrently without issues. 
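+
+    Layout (see the ThreadPoolExecutor below): one worker issues 20
+    predictions at roughly 10 req/s while a second worker submits 5 bulk
+    training batches of 100 entries each; predictions expect HTTP 200 and
+    bulk training expects HTTP 202.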
+ """ + print("Testing concurrent training and prediction with prefix cache...") + + def make_predictions(): + results = [] + for _ in range(20): + payload = generate_random_prediction_payload() + try: + response = requests.post(f"{BASE_URL}/predict", json=payload, timeout=5) + results.append(response.status_code == 200) + except: + results.append(False) + time.sleep(0.1) + return results + + def send_training_data(): + results = [] + for _ in range(5): + payload = generate_bulk_training_payload(100) # Smaller batches for faster processing + try: + response = requests.post(f"{BASE_URL}/add_training_data_bulk", json=payload, timeout=10) + results.append(response.status_code == 202) + except: + results.append(False) + time.sleep(0.5) + return results + + # Run both functions concurrently + with ThreadPoolExecutor(max_workers=2) as executor: + prediction_future = executor.submit(make_predictions) + training_future = executor.submit(send_training_data) + + prediction_results = prediction_future.result() + training_results = training_future.result() + + prediction_success_rate = sum(prediction_results) / len(prediction_results) + training_success_rate = sum(training_results) / len(training_results) + + print(f"Prediction success rate: {prediction_success_rate*100:.1f}%") \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 78f1cb81c..5b565e71d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -13,6 +13,7 @@ theme: favicon: images/favicon-64.png features: - content.code.annotate + - content.code.copy - search.highlight - navigation.tabs - navigation.top @@ -56,23 +57,25 @@ nav: Design Principles: concepts/design-principles.md Conformance: concepts/conformance.md Roles and Personas: concepts/roles-and-personas.md + Priority and Capacity: concepts/priority-and-capacity.md - Implementations: - Gateways: implementations/gateways.md - Model Servers: implementations/model-servers.md - FAQ: faq.md - Guides: - User Guides: - - Getting started: guides/index.md + - Getting started (Released): guides/index.md + - Getting started (Latest/Main): guides/getting-started-latest.md - Use Cases: - Serve Multiple GenAI models: guides/serve-multiple-genai-models.md - - Serve Multiple LoRA adapters: guides/serve-multiple-lora-adapters.md - Rollout: - Adapter Rollout: guides/adapter-rollout.md - InferencePool Rollout: guides/inferencepool-rollout.md - Metrics and Observability: guides/metrics-and-observability.md - Configuration Guide: - - Configuring the plugins via configuration files or text: guides/epp-configuration/config-text.md + - Configuring the plugins via configuration YAML file: guides/epp-configuration/config-text.md - Prefix Cache Aware Plugin: guides/epp-configuration/prefix-aware.md + - Migration Guide: guides/ga-migration.md - Troubleshooting Guide: guides/troubleshooting.md - Implementer Guides: - Getting started: guides/implementers.md @@ -81,11 +84,16 @@ nav: - Benchmark: performance/benchmark/index.md - Regression Testing: performance/regression-testing/index.md - Reference: - - API Reference: reference/spec.md + - v1 API Reference: reference/spec.md + - v1alpha1 API Reference: + - reference/x-v1a1-spec.md + - v1alpha2 API Reference: + - reference/x-v1a2-spec.md - API Types: - InferencePool: api-types/inferencepool.md - - InferenceModel: api-types/inferencemodel.md + - InferenceObjective: api-types/inferenceobjective.md + - InferencePoolImport: api-types/inferencepoolimport.md - Enhancements: - - Overview: gieps/overview.md + - Overview: enhancements/overview.md - Contributing: - How 
to Get Involved: contributing/index.md diff --git a/pkg/bbr/README.md b/pkg/bbr/README.md index b5b6f770d..80ab38354 100644 --- a/pkg/bbr/README.md +++ b/pkg/bbr/README.md @@ -8,7 +8,3 @@ body of the HTTP request. However, most implementations do not support routing based on the request body. This extension helps bridge that gap for clients. This extension works by parsing the request body. If it finds a `model` parameter in the request body, it will copy the value of that parameter into a request header. - -This extension is intended to be paired with an `ext_proc` capable Gateway. There is not -a standard way to represent this kind of extension in Gateway API yet, so we recommend -referring to implementation-specific documentation for how to deploy this extension. diff --git a/pkg/bbr/handlers/request_test.go b/pkg/bbr/handlers/request_test.go index e59795353..9e408fdef 100644 --- a/pkg/bbr/handlers/request_test.go +++ b/pkg/bbr/handlers/request_test.go @@ -206,7 +206,7 @@ func TestHandleRequestBody(t *testing.T) { bbr_success_total{} 1 ` - if err := metricsutils.GatherAndCompare(crmetrics.Registry, strings.NewReader(wantMetrics), "inference_model_request_total"); err != nil { + if err := metricsutils.GatherAndCompare(crmetrics.Registry, strings.NewReader(wantMetrics), "inference_objective_request_total"); err != nil { t.Error(err) } } diff --git a/pkg/bbr/handlers/server.go b/pkg/bbr/handlers/server.go index 499d6af28..6488453be 100644 --- a/pkg/bbr/handlers/server.go +++ b/pkg/bbr/handlers/server.go @@ -61,9 +61,6 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { return nil } if recvErr != nil { - // This error occurs very frequently, though it doesn't seem to have any impact. - // TODO Figure out if we can remove this noise. - loggerVerbose.Error(recvErr, "Cannot receive stream request") return status.Errorf(codes.Unknown, "cannot receive stream request: %v", recvErr) } diff --git a/pkg/bbr/server/runserver.go b/pkg/bbr/server/runserver.go index ec2880b24..ac6ac414e 100644 --- a/pkg/bbr/server/runserver.go +++ b/pkg/bbr/server/runserver.go @@ -19,6 +19,7 @@ package server import ( "context" "crypto/tls" + "fmt" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "github.com/go-logr/logr" @@ -54,8 +55,7 @@ func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { if r.SecureServing { cert, err := tlsutil.CreateSelfSignedTLSCertificate(logger) if err != nil { - logger.Error(err, "Failed to create self signed certificate") - return err + return fmt.Errorf("failed to create self signed certificate - %w", err) } creds := credentials.NewTLS(&tls.Config{Certificates: []tls.Certificate{cert}}) srv = grpc.NewServer(grpc.Creds(creds)) diff --git a/pkg/common/telemetry.go b/pkg/common/telemetry.go new file mode 100644 index 000000000..3723e0f7d --- /dev/null +++ b/pkg/common/telemetry.go @@ -0,0 +1,140 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package common + +import ( + "context" + "fmt" + "os" + "strconv" + + "github.com/go-logr/logr" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/exporters/stdout/stdouttrace" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.37.0" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/version" +) + +type errorHandler struct { + logger logr.Logger +} + +func (h *errorHandler) Handle(err error) { + h.logger.V(logging.DEFAULT).Error(err, "trace error occurred") +} + +func InitTracing(ctx context.Context, logger logr.Logger) error { + logger = logger.WithName("trace") + loggerWrap := &errorHandler{logger: logger} + + _, ok := os.LookupEnv("OTEL_SERVICE_NAME") + if !ok { + os.Setenv("OTEL_SERVICE_NAME", "gateway-api-inference-extension") + } + + _, ok = os.LookupEnv("OTEL_EXPORTER_OTLP_ENDPOINT") + if !ok { + os.Setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317") + } + + traceExporter, err := initTraceExporter(ctx, logger) + if err != nil { + loggerWrap.Handle(fmt.Errorf("%s: %v", "init trace exporter failed", err)) + return err + } + + // Go SDK doesn't have an automatic sampler, handle manually + samplerType, ok := os.LookupEnv("OTEL_TRACES_SAMPLER") + if !ok { + samplerType = "parentbased_traceidratio" + } + samplerARG, ok := os.LookupEnv("OTEL_TRACES_SAMPLER_ARG") + if !ok { + samplerARG = "0.1" + } + + sampler := sdktrace.ParentBased(sdktrace.TraceIDRatioBased(0.1)) + if samplerType == "parentbased_traceidratio" { + fraction, err := strconv.ParseFloat(samplerARG, 64) + if err != nil { + fraction = 0.1 + } + + sampler = sdktrace.ParentBased(sdktrace.TraceIDRatioBased(fraction)) + } else { + loggerWrap.Handle(fmt.Errorf("unsupported sampler type: %s, fallback to parentbased_traceidratio with 0.1 Ratio", samplerType)) + } + + opt := []sdktrace.TracerProviderOption{ + sdktrace.WithBatcher(traceExporter), + sdktrace.WithSampler(sampler), + sdktrace.WithResource(resource.NewWithAttributes( + semconv.SchemaURL, + semconv.ServiceVersionKey.String(version.BuildRef), + )), + } + + tracerProvider := sdktrace.NewTracerProvider(opt...) 
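+
+	// Publish the provider, the W3C trace-context/baggage propagators, and
+	// the logging error handler as the process-wide OTel globals.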
+	otel.SetTracerProvider(tracerProvider)
+	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(propagation.TraceContext{}, propagation.Baggage{}))
+	otel.SetErrorHandler(loggerWrap)
+
+	go func() {
+		<-ctx.Done()
+		err := tracerProvider.Shutdown(context.Background())
+		if err != nil {
+			loggerWrap.Handle(fmt.Errorf("failed to shutdown TracerProvider: %w", err))
+		}
+
+		logger.V(logging.DEFAULT).Info("trace provider shut down")
+	}()
+
+	return nil
+}
+
+// initTraceExporter creates a SpanExporter.
+// Supported exporter types:
+// - console: export spans to stdout for development use cases
+// - otlp: export spans through gRPC to an OpenTelemetry collector
+func initTraceExporter(ctx context.Context, logger logr.Logger) (sdktrace.SpanExporter, error) {
+	exporterType, ok := os.LookupEnv("OTEL_TRACES_EXPORTER")
+	if !ok {
+		exporterType = "console"
+	}
+	logger.Info("init OTel trace exporter", "type", exporterType)
+
+	if exporterType == "otlp" {
+		traceExporter, err := otlptracegrpc.New(ctx, otlptracegrpc.WithInsecure())
+		if err != nil {
+			return nil, fmt.Errorf("failed to create otlp-grpc exporter: %w", err)
+		}
+		return traceExporter, nil
+	}
+
+	// Anything else (including the default) falls back to the console exporter.
+	traceExporter, err := stdouttrace.New(stdouttrace.WithPrettyPrint())
+	if err != nil {
+		return nil, fmt.Errorf("failed to create stdouttrace exporter: %w", err)
+	}
+	return traceExporter, nil
+}
diff --git a/pkg/epp/README.md b/pkg/epp/README.md
index df5c21375..966aed5f2 100644
--- a/pkg/epp/README.md
+++ b/pkg/epp/README.md
@@ -20,9 +20,4 @@ An EPP instance handles a single `InferencePool` (and so for each `InferencePool
 - The EPP generates metrics to enhance observability.
 - It reports InferenceObjective-level metrics, further broken down by target model.
 - Detailed information regarding metrics can be found on the [website](https://gateway-api-inference-extension.sigs.k8s.io/guides/metrics/).
-
-
-## Scheduling Algorithm
-The scheduling package implements request scheduling algorithms for load balancing requests across backend pods in an inference gateway. The scheduler ensures efficient resource utilization while maintaining low latency and prioritizing critical requests. It applies a series of filters based on metrics and heuristics to select the best pod for a given request. The following flow chart summarizes the current scheduling algorithm
-
-Scheduling Algorithm
+
\ No newline at end of file
diff --git a/pkg/epp/backend/metrics/fake.go b/pkg/epp/backend/metrics/fake.go
index 83ce9a7fc..613ebf5ec 100644
--- a/pkg/epp/backend/metrics/fake.go
+++ b/pkg/epp/backend/metrics/fake.go
@@ -22,7 +22,6 @@ import (
 	"sync"
 	"time"
-	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"sigs.k8s.io/controller-runtime/pkg/log"
@@ -33,11 +32,8 @@ import (
 // FakePodMetrics is an implementation of PodMetrics that doesn't run the async refresh loop.
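+// It simply stores the Pod and MetricsState values it is given; the
+// running-request queue helpers were removed along with the corev1 dependency.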
type FakePodMetrics struct { - Pod *backend.Pod - Metrics *MetricsState - runningRequests *datalayer.RequestPriorityQueue - stopped bool - mu sync.RWMutex // Protect the stopped field and operations + Pod *backend.Pod + Metrics *MetricsState } func (fpm *FakePodMetrics) String() string { @@ -52,100 +48,8 @@ func (fpm *FakePodMetrics) GetMetrics() *MetricsState { return fpm.Metrics } -func (fpm *FakePodMetrics) UpdatePod(pod *corev1.Pod) { - fpm.Pod = toInternalPod(pod, nil) -} - -func (f *FakePodMetrics) StopRefreshLoop() { - f.mu.Lock() - defer f.mu.Unlock() - f.stopped = true -} - -func (f *FakePodMetrics) GetRunningRequests() *datalayer.RequestPriorityQueue { - f.mu.RLock() - defer f.mu.RUnlock() - if f.stopped { - return nil // Return nil for stopped pod metrics - } - return f.runningRequests -} - -func (f *FakePodMetrics) AddRequest(requestID string, tpot float64) bool { - f.mu.RLock() - defer f.mu.RUnlock() - if f.stopped { - return false // Reject operations after stopped - } - return f.runningRequests.Add(requestID, tpot) -} - -func (f *FakePodMetrics) RemoveRequest(requestID string) bool { - f.mu.RLock() - defer f.mu.RUnlock() - if f.stopped { - return false // Reject operations after stopped - } - _, success := f.runningRequests.Remove(requestID) - return success -} - -func (f *FakePodMetrics) UpdateRequest(requestID string, tpot float64) bool { - f.mu.RLock() - defer f.mu.RUnlock() - if f.stopped { - return false // Reject operations after stopped - } - return f.runningRequests.Update(requestID, tpot) -} - -func (f *FakePodMetrics) GetRequestCount() int { - f.mu.RLock() - defer f.mu.RUnlock() - if f.stopped { - return 0 // Return 0 after stopped - } - return f.runningRequests.GetSize() -} - -func (f *FakePodMetrics) ContainsRequest(requestID string) bool { - pod := f.GetPod() - if pod == nil || pod.RunningRequests == nil { - return false - } - return pod.RunningRequests.Contains(requestID) -} - -func (srv *FakePodMetrics) PeekRequestPriorityQueue() *datalayer.Request { - pod := srv.GetPod() - if pod == nil || pod.RunningRequests == nil { - return nil - } - return pod.RunningRequests.Peek() -} - -func NewFakePodMetrics(k8sPod *corev1.Pod) *FakePodMetrics { - labels := make(map[string]string) - for k, v := range k8sPod.Labels { - labels[k] = v - } - - pod := &backend.Pod{ - NamespacedName: types.NamespacedName{ - Name: k8sPod.Name, - Namespace: k8sPod.Namespace, - }, - Address: k8sPod.Status.PodIP, - Labels: labels, - RunningRequests: datalayer.NewRequestPriorityQueue(), - } - - return &FakePodMetrics{ - Pod: pod, - Metrics: &MetricsState{UpdateTime: time.Now()}, - runningRequests: datalayer.NewRequestPriorityQueue(), - stopped: false, - } +func (fpm *FakePodMetrics) UpdatePod(pod *datalayer.PodInfo) { + fpm.Pod = pod } func (*FakePodMetrics) Put(string, datalayer.Cloneable) {} @@ -164,7 +68,7 @@ type FakePodMetricsClient struct { Res map[types.NamespacedName]*MetricsState } -func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, pod *backend.Pod, existing *MetricsState, _ int32) (*MetricsState, error) { +func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, pod *backend.Pod, existing *MetricsState) (*MetricsState, error) { f.errMu.RLock() err, ok := f.Err[pod.NamespacedName] f.errMu.RUnlock() diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go index 8927b1b12..6f6be6820 100644 --- a/pkg/epp/backend/metrics/metrics.go +++ b/pkg/epp/backend/metrics/metrics.go @@ -25,6 +25,7 @@ import ( dto "github.com/prometheus/client_model/go" 
"github.com/prometheus/common/expfmt" + "github.com/prometheus/common/model" "go.uber.org/multierr" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" @@ -35,11 +36,12 @@ const ( LoraInfoRunningAdaptersMetricName = "running_lora_adapters" LoraInfoWaitingAdaptersMetricName = "waiting_lora_adapters" LoraInfoMaxAdaptersMetricName = "max_lora" + + CacheConfigBlockSizeInfoMetricName = "block_size" ) type PodMetricsClientImpl struct { MetricMapping *MetricMapping - ModelServerMetricsPort int32 ModelServerMetricsPath string ModelServerMetricsScheme string @@ -47,8 +49,8 @@ type PodMetricsClientImpl struct { } // FetchMetrics fetches metrics from a given pod, clones the existing metrics object and returns an updated one. -func (p *PodMetricsClientImpl) FetchMetrics(ctx context.Context, pod *backend.Pod, existing *MetricsState, port int32) (*MetricsState, error) { - url := p.getMetricEndpoint(pod, port) +func (p *PodMetricsClientImpl) FetchMetrics(ctx context.Context, pod *backend.Pod, existing *MetricsState) (*MetricsState, error) { + url := p.getMetricEndpoint(pod) req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { return nil, fmt.Errorf("failed to create request: %v", err) @@ -65,7 +67,7 @@ func (p *PodMetricsClientImpl) FetchMetrics(ctx context.Context, pod *backend.Po return nil, fmt.Errorf("unexpected status code from %s: %v", pod.NamespacedName, resp.StatusCode) } - parser := expfmt.TextParser{} + parser := expfmt.NewTextParser(model.LegacyValidation) metricFamilies, err := parser.TextToMetricFamilies(resp.Body) if err != nil { return nil, err @@ -73,11 +75,8 @@ func (p *PodMetricsClientImpl) FetchMetrics(ctx context.Context, pod *backend.Po return p.promToPodMetrics(metricFamilies, existing) } -func (p *PodMetricsClientImpl) getMetricEndpoint(pod *backend.Pod, targetPortNumber int32) string { - if p.ModelServerMetricsPort == 0 { - p.ModelServerMetricsPort = targetPortNumber - } - return fmt.Sprintf("%s://%s:%d%s", p.ModelServerMetricsScheme, pod.Address, p.ModelServerMetricsPort, p.ModelServerMetricsPath) +func (p *PodMetricsClientImpl) getMetricEndpoint(pod *backend.Pod) string { + return p.ModelServerMetricsScheme + "://" + pod.GetMetricsHost() + p.ModelServerMetricsPath } // promToPodMetrics updates internal pod metrics with scraped Prometheus metrics. @@ -152,6 +151,24 @@ func (p *PodMetricsClientImpl) promToPodMetrics( } } + if p.MetricMapping.CacheConfigInfo != nil { + cacheMetrics, err := p.getMetric(metricFamilies, *p.MetricMapping.CacheConfigInfo) + if err != nil { + errs = multierr.Append(errs, err) + } else { + for _, v := range cacheMetrics.GetLabel() { + if v.GetName() == CacheConfigBlockSizeInfoMetricName { + updated.CacheBlockSize, err = strconv.Atoi(v.GetValue()) + if err != nil { + errs = multierr.Append(errs, err) + } else { + break + } + } + } + } + } + return updated, errs } diff --git a/pkg/epp/backend/metrics/metrics_spec.go b/pkg/epp/backend/metrics/metrics_spec.go index 782f7427e..b3c26db2c 100644 --- a/pkg/epp/backend/metrics/metrics_spec.go +++ b/pkg/epp/backend/metrics/metrics_spec.go @@ -33,6 +33,7 @@ type MetricMapping struct { TotalRunningRequests *MetricSpec KVCacheUtilization *MetricSpec LoraRequestInfo *MetricSpec + CacheConfigInfo *MetricSpec } // stringToMetricSpec converts a string to a MetricSpec. @@ -94,7 +95,7 @@ func stringToMetricSpec(specStr string) (*MetricSpec, error) { } // NewMetricMapping creates a MetricMapping from string values. 
-func NewMetricMapping(queuedStr, runningStr, kvUsageStr, loraReqInfoStr string) (*MetricMapping, error) { +func NewMetricMapping(queuedStr, runningStr, kvUsageStr, loraReqInfoStr, cacheInfoMetric string) (*MetricMapping, error) { queuedSpec, err := stringToMetricSpec(queuedStr) if err != nil { return nil, fmt.Errorf("error parsing WaitingRequests: %w", err) @@ -111,11 +112,18 @@ func NewMetricMapping(queuedStr, runningStr, kvUsageStr, loraReqInfoStr string) if err != nil { return nil, fmt.Errorf("error parsing loraReqInfoStr: %w", err) } + + cacheInfoSpec, err := stringToMetricSpec(cacheInfoMetric) + if err != nil { + return nil, fmt.Errorf("error parsing cacheInfoMetric: %w", err) + } + mapping := &MetricMapping{ TotalQueuedRequests: queuedSpec, TotalRunningRequests: runningSpec, KVCacheUtilization: kvUsageSpec, LoraRequestInfo: loraReqInfoSpec, + CacheConfigInfo: cacheInfoSpec, } return mapping, nil diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go index 2dd8ca5dd..502ad6f09 100644 --- a/pkg/epp/backend/metrics/metrics_test.go +++ b/pkg/epp/backend/metrics/metrics_test.go @@ -489,7 +489,9 @@ func TestPromToPodMetrics(t *testing.T) { func TestFetchMetrics(t *testing.T) { ctx := logutil.NewTestLoggerIntoContext(context.Background()) pod := &backend.Pod{ - Address: "127.0.0.1", + Address: "127.0.0.1", + Port: "9999", + MetricsHost: "127.0.0.1:9999", NamespacedName: types.NamespacedName{ Namespace: "test", Name: "pod", @@ -499,12 +501,11 @@ func TestFetchMetrics(t *testing.T) { // No MetricMapping needed for this basic test p := &PodMetricsClientImpl{ ModelServerMetricsScheme: "http", - ModelServerMetricsPort: 9999, ModelServerMetricsPath: "/metrics", Client: http.DefaultClient, } - _, err := p.FetchMetrics(ctx, pod, existing, 9999) // Use a port that's unlikely to be in use + _, err := p.FetchMetrics(ctx, pod, existing) // Use a port that's unlikely to be in use if err == nil { t.Errorf("FetchMetrics() expected error, got nil") } diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go index 9ee142610..a1114aecf 100644 --- a/pkg/epp/backend/metrics/pod_metrics.go +++ b/pkg/epp/backend/metrics/pod_metrics.go @@ -24,8 +24,6 @@ import ( "time" "github.com/go-logr/logr" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer" @@ -51,7 +49,7 @@ type podMetrics struct { } type PodMetricsClient interface { - FetchMetrics(ctx context.Context, pod *backend.Pod, existing *MetricsState, port int32) (*MetricsState, error) + FetchMetrics(ctx context.Context, pod *backend.Pod, existing *MetricsState) (*MetricsState, error) } func (pm *podMetrics) String() string { @@ -66,98 +64,8 @@ func (pm *podMetrics) GetMetrics() *MetricsState { return pm.metrics.Load() } -// New methods for priority queue integration -func (pm *podMetrics) GetRunningRequests() *datalayer.RequestPriorityQueue { - pod := pm.GetPod() - if pod == nil { - return nil - } - return pod.RunningRequests -} - -func (pm *podMetrics) AddRequest(requestID string, tpot float64) bool { - pod := pm.GetPod() - if pod == nil || pod.RunningRequests == nil { - return false - } - success := pod.RunningRequests.Add(requestID, tpot) - // No need to update metrics since we removed ActualRunningRequests - return success -} - -func (pm *podMetrics) RemoveRequest(requestID string) bool { - pod := pm.GetPod() - if pod == nil || pod.RunningRequests == 
nil { - return false - } - _, success := pod.RunningRequests.Remove(requestID) - // No need to update metrics since we removed ActualRunningRequests - return success -} - -func (pm *podMetrics) UpdateRequest(requestID string, tpot float64) bool { - pod := pm.GetPod() - if pod == nil || pod.RunningRequests == nil { - return false - } - return pod.RunningRequests.Update(requestID, tpot) -} - -func (pm *podMetrics) GetRequestCount() int { - pod := pm.GetPod() - if pod == nil || pod.RunningRequests == nil { - return 0 - } - return pod.RunningRequests.GetSize() -} - -func (pm *podMetrics) ContainsRequest(requestID string) bool { - pod := pm.GetPod() - if pod == nil || pod.RunningRequests == nil { - return false - } - return pod.RunningRequests.Contains(requestID) -} - -func (pm *podMetrics) PeekRequestPriorityQueue() *datalayer.Request { - pod := pm.GetPod() - if pod == nil || pod.RunningRequests == nil { - return nil - } - return pod.RunningRequests.Peek() -} - -func (pm *podMetrics) UpdatePod(k8sPod *corev1.Pod) { - currentPod := pm.GetPod() - updatedPod := toInternalPod(k8sPod, currentPod.GetRunningRequests()) - - // Preserve the existing running requests queue if it exists - if currentPod != nil && currentPod.GetRunningRequests() != nil { - updatedPod.RunningRequests = currentPod.GetRunningRequests() - } - - pm.pod.Store(updatedPod) -} -func toInternalPod(pod *corev1.Pod, existingQueue *datalayer.RequestPriorityQueue) *backend.Pod { - labels := make(map[string]string, len(pod.GetLabels())) - for key, value := range pod.GetLabels() { - labels[key] = value - } - - queue := existingQueue - if queue == nil { - queue = datalayer.NewRequestPriorityQueue() - } - - return &backend.Pod{ - NamespacedName: types.NamespacedName{ - Name: pod.Name, - Namespace: pod.Namespace, - }, - Address: pod.Status.PodIP, - Labels: labels, - RunningRequests: queue, - } +func (pm *podMetrics) UpdatePod(pod *datalayer.PodInfo) { + pm.pod.Store(pod) } // start starts a goroutine exactly once to periodically update metrics. The goroutine will be @@ -185,17 +93,9 @@ func (pm *podMetrics) startRefreshLoop(ctx context.Context) { } func (pm *podMetrics) refreshMetrics() error { - pool, err := pm.ds.PoolGet() - if err != nil { - // No inference pool or not initialize. 
- return err - } ctx, cancel := context.WithTimeout(context.Background(), fetchMetricsTimeout) defer cancel() - if len(pool.Spec.TargetPorts) != 1 { - return fmt.Errorf("expected 1 target port, got %d", len(pool.Spec.TargetPorts)) - } - updated, err := pm.pmc.FetchMetrics(ctx, pm.GetPod(), pm.GetMetrics(), int32(pool.Spec.TargetPorts[0].Number)) + updated, err := pm.pmc.FetchMetrics(ctx, pm.GetPod(), pm.GetMetrics()) if err != nil { pm.logger.V(logutil.TRACE).Info("Failed to refreshed metrics:", "err", err) } diff --git a/pkg/epp/backend/metrics/pod_metrics_test.go b/pkg/epp/backend/metrics/pod_metrics_test.go index 49a1b3d2d..b0297cd1e 100644 --- a/pkg/epp/backend/metrics/pod_metrics_test.go +++ b/pkg/epp/backend/metrics/pod_metrics_test.go @@ -17,31 +17,25 @@ package metrics import ( "context" - "fmt" - "sync" "testing" "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "github.com/stretchr/testify/assert" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" v1 "sigs.k8s.io/gateway-api-inference-extension/api/v1" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer" ) var ( - pod1 = &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod1", + pod1Info = &datalayer.PodInfo{ + NamespacedName: types.NamespacedName{ + Name: "pod1-rank-0", Namespace: "default", - Labels: map[string]string{"app": "test"}, - }, - Status: corev1.PodStatus{ - PodIP: "192.168.1.1", }, + PodName: "pod1", } initial = &MetricsState{ WaitingQueueSize: 0, @@ -71,12 +65,11 @@ func TestMetricsRefresh(t *testing.T) { pmf := NewPodMetricsFactory(pmc, time.Millisecond) // The refresher is initialized with empty metrics. - pm := pmf.NewEndpoint(ctx, pod1, &fakeDataStore{}) + pm := pmf.NewEndpoint(ctx, pod1Info, &fakeDataStore{}) - namespacedName := types.NamespacedName{Name: pod1.Name, Namespace: pod1.Namespace} // Use SetRes to simulate an update of metrics from the pod. // Verify that the metrics are updated. - pmc.SetRes(map[types.NamespacedName]*MetricsState{namespacedName: initial}) + pmc.SetRes(map[types.NamespacedName]*MetricsState{pod1Info.NamespacedName: initial}) condition := func(collect *assert.CollectT) { assert.True(collect, cmp.Equal(pm.GetMetrics(), initial, cmpopts.IgnoreFields(MetricsState{}, "UpdateTime"))) } @@ -86,182 +79,11 @@ func TestMetricsRefresh(t *testing.T) { // new update. pmf.ReleaseEndpoint(pm) time.Sleep(pmf.refreshMetricsInterval * 2 /* small buffer for robustness */) - pmc.SetRes(map[types.NamespacedName]*MetricsState{namespacedName: updated}) + pmc.SetRes(map[types.NamespacedName]*MetricsState{pod1Info.NamespacedName: updated}) // Still expect the same condition (no metrics update). 
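+	// (The endpoint was released above, so its refresh loop has stopped and
+	// the later metrics update must never be observed.)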
assert.EventuallyWithT(t, condition, time.Second, time.Millisecond) } -// Test priority queue functionality -func TestPodMetricsRequestManagement(t *testing.T) { - ctx := context.Background() - pmc := &FakePodMetricsClient{} - pmf := NewPodMetricsFactory(pmc, time.Minute) // Long interval to avoid interference - - pme := pmf.NewEndpoint(ctx, pod1, &fakeDataStore{}) - pm := pme.(*podMetrics) // Type assertion to access podMetrics methods - - defer pmf.ReleaseEndpoint(pm) - - // Test adding requests - assert.True(t, pm.AddRequest("req1", 1.5)) - assert.True(t, pm.AddRequest("req2", 2.0)) - assert.False(t, pm.AddRequest("req1", 1.0)) // Duplicate should fail - - // Test request count - assert.Equal(t, 2, pm.GetRequestCount()) - - // Test contains request - assert.True(t, pm.ContainsRequest("req1")) - assert.False(t, pm.ContainsRequest("req3")) - - // Test update request - assert.True(t, pm.UpdateRequest("req1", 0.5)) - assert.False(t, pm.UpdateRequest("req3", 1.0)) // Non-existent - - // Test remove request - assert.True(t, pm.RemoveRequest("req1")) - assert.False(t, pm.RemoveRequest("req1")) // Already removed - assert.Equal(t, 1, pm.GetRequestCount()) - - // Test getting running requests queue - queue := pm.GetRunningRequests() - assert.NotNil(t, queue) - assert.Equal(t, 1, queue.GetSize()) -} - -// Test pod updates preserve request queue -func TestPodUpdatePreservesQueue(t *testing.T) { - ctx := context.Background() - pmc := &FakePodMetricsClient{} - pmf := NewPodMetricsFactory(pmc, time.Minute) - - pme := pmf.NewEndpoint(ctx, pod1, &fakeDataStore{}) - pm := pme.(*podMetrics) // Type assertion to access podMetrics methods - - defer pmf.ReleaseEndpoint(pm) - - // Add some requests - assert.True(t, pm.AddRequest("req1", 1.5)) - assert.True(t, pm.AddRequest("req2", 2.0)) - assert.Equal(t, 2, pm.GetRequestCount()) - - // Update pod with new IP - updatedPod := pod1.DeepCopy() - updatedPod.Status.PodIP = "192.168.1.2" - updatedPod.Labels["new"] = "label" - - pm.UpdatePod(updatedPod) - - // Queue should be preserved - assert.Equal(t, 2, pm.GetRequestCount()) - assert.True(t, pm.ContainsRequest("req1")) - assert.True(t, pm.ContainsRequest("req2")) - - // Pod properties should be updated - pod := pm.GetPod() - assert.Equal(t, "192.168.1.2", pod.Address) - assert.Equal(t, "label", pod.Labels["new"]) -} - -// Test error handling in metrics refresh -func TestMetricsRefreshWithErrors(t *testing.T) { - ctx := context.Background() - pmc := &FakePodMetricsClient{} - pmf := NewPodMetricsFactory(pmc, time.Millisecond) - - pme := pmf.NewEndpoint(ctx, pod1, &fakeDataStore{}) - pm := pme.(*podMetrics) // Type assertion to access podMetrics methods - - defer pmf.ReleaseEndpoint(pm) - - namespacedName := types.NamespacedName{Name: pod1.Name, Namespace: pod1.Namespace} - - // Set an error for this pod - pmc.SetErr(map[types.NamespacedName]error{ - namespacedName: fmt.Errorf("connection failed"), - }) - - // Metrics should still be accessible (error is logged but not fatal) - // The pod metrics should continue to work - assert.NotNil(t, pm.GetMetrics()) - assert.NotNil(t, pm.GetPod()) - - // Request operations should still work - assert.True(t, pm.AddRequest("req1", 1.5)) - assert.Equal(t, 1, pm.GetRequestCount()) -} - -// Test string representation -func TestPodMetricsString(t *testing.T) { - ctx := context.Background() - pmc := &FakePodMetricsClient{} - pmf := NewPodMetricsFactory(pmc, time.Minute) - - pme := pmf.NewEndpoint(ctx, pod1, &fakeDataStore{}) - pm := pme.(*podMetrics) // Type assertion to access 
podMetrics methods - - defer pmf.ReleaseEndpoint(pm) - - // Add some requests - pm.AddRequest("req1", 1.5) - pm.AddRequest("req2", 2.0) - - str := pm.String() - assert.Contains(t, str, "pod1") - assert.Contains(t, str, "default") - assert.Contains(t, str, "[req1(1.50), req2(2.00)]") - assert.Contains(t, str, "192.168.1.1") -} - -// Test concurrent access to request operations -func TestConcurrentRequestOperations(t *testing.T) { - ctx := context.Background() - pmc := &FakePodMetricsClient{} - pmf := NewPodMetricsFactory(pmc, time.Minute) - - pme := pmf.NewEndpoint(ctx, pod1, &fakeDataStore{}) - pm := pme.(*podMetrics) // Type assertion to access podMetrics methods - - defer pmf.ReleaseEndpoint(pm) - - const numGoroutines = 10 - const requestsPerGoroutine = 100 - - var wg sync.WaitGroup - - // Launch goroutines that add requests - for i := 0; i < numGoroutines; i++ { - wg.Add(1) - go func(id int) { - defer wg.Done() - for j := 0; j < requestsPerGoroutine; j++ { - requestID := fmt.Sprintf("req-%d-%d", id, j) - pm.AddRequest(requestID, float64(j)) - } - }(i) - } - - // Launch goroutines that check and remove requests - for i := 0; i < numGoroutines/2; i++ { - wg.Add(1) - go func(id int) { - defer wg.Done() - for j := 0; j < requestsPerGoroutine/2; j++ { - requestID := fmt.Sprintf("req-%d-%d", id, j) - if pm.ContainsRequest(requestID) { - pm.RemoveRequest(requestID) - } - } - }(i) - } - - wg.Wait() - - // Should not crash and should have some requests remaining - count := pm.GetRequestCount() - assert.True(t, count >= 0) // Basic sanity check -} - type fakeDataStore struct{} func (f *fakeDataStore) PoolGet() (*v1.InferencePool, error) { diff --git a/pkg/epp/backend/metrics/types.go b/pkg/epp/backend/metrics/types.go index cbb4dc7df..99f15a20f 100644 --- a/pkg/epp/backend/metrics/types.go +++ b/pkg/epp/backend/metrics/types.go @@ -22,7 +22,6 @@ import ( "sync" "time" - corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer" @@ -53,8 +52,7 @@ type PodMetricsFactory struct { refreshMetricsInterval time.Duration } -func (f *PodMetricsFactory) NewEndpoint(parentCtx context.Context, in *corev1.Pod, ds datalayer.PoolInfo) PodMetrics { - pod := toInternalPod(in, nil) // Pass nil for new pod - will create new queue +func (f *PodMetricsFactory) NewEndpoint(parentCtx context.Context, pod *datalayer.PodInfo, ds datalayer.PoolInfo) PodMetrics { pm := &podMetrics{ pmc: f.pmc, ds: ds, diff --git a/pkg/epp/config/loader/configloader.go b/pkg/epp/config/loader/configloader.go index 8e80b037d..865eae28b 100644 --- a/pkg/epp/config/loader/configloader.go +++ b/pkg/epp/config/loader/configloader.go @@ -31,6 +31,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile" ) var scheme = runtime.NewScheme() @@ -113,6 +114,10 @@ func loadSchedulerConfig(configProfiles []configapi.SchedulingProfile, handle pl return nil, errors.New("no profile handler was specified") } + if profileHandler.TypedName().Type == profile.SingleProfileHandlerType && len(profiles) > 1 { + return nil, errors.New("single profile handler is intended to be used with a single profile, but multiple profiles were specified") + } + return scheduling.NewSchedulerConfig(profileHandler, profiles), nil } diff --git 
a/pkg/epp/config/loader/configloader_test.go b/pkg/epp/config/loader/configloader_test.go index 5bf5a6608..c00563ad3 100644 --- a/pkg/epp/config/loader/configloader_test.go +++ b/pkg/epp/config/loader/configloader_test.go @@ -73,7 +73,7 @@ func TestLoadRawConfiguration(t *testing.T) { }, { Type: test2Type, - Parameters: json.RawMessage("{\"hashBlockSize\":32}"), + Parameters: json.RawMessage("{\"blockSize\":32}"), }, { Name: "testPicker", @@ -175,7 +175,7 @@ func TestLoadRawConfigurationWithDefaults(t *testing.T) { { Name: test2Type, Type: test2Type, - Parameters: json.RawMessage("{\"hashBlockSize\":32}"), + Parameters: json.RawMessage("{\"blockSize\":32}"), }, { Name: "testPicker", @@ -420,6 +420,11 @@ func TestLoadConfig(t *testing.T) { configText: errorNoProfileHandlersText, wantErr: true, }, + { + name: "errorMultiProfilesUseSingleProfileHandler", + configText: errorMultiProfilesUseSingleProfileHandlerText, + wantErr: true, + }, } registerNeededPlgugins() @@ -464,7 +469,7 @@ plugins: type: test-profile-handler - type: test-two parameters: - hashBlockSize: 32 + blockSize: 32 - name: testPicker type: test-picker schedulingProfiles: @@ -772,7 +777,7 @@ plugins: - name: prefixCacheScorer type: prefix-cache-scorer parameters: - hashBlockSize: 32 + blockSize: 32 - name: maxScorePicker type: max-score-picker - name: profileHandler @@ -797,7 +802,7 @@ plugins: - name: prefixCacheScorer type: prefix-cache-scorer parameters: - hashBlockSize: 32 + blockSize: 32 schedulingProfiles: - name: default plugins: @@ -831,7 +836,7 @@ plugins: - name: prefixCacheScorer type: prefix-cache-scorer parameters: - hashBlockSize: asdf + blockSize: asdf schedulingProfiles: - name: default plugins: @@ -895,3 +900,23 @@ schedulingProfiles: plugins: - pluginRef: maxScore ` + +// multiple profiles using SingleProfileHandler +// +//nolint:dupword +const errorMultiProfilesUseSingleProfileHandlerText = ` +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: +- name: profileHandler + type: single-profile-handler +- name: maxScore + type: max-score-picker +schedulingProfiles: +- name: default + plugins: + - pluginRef: maxScore +- name: prof2 + plugins: + - pluginRef: maxScore +` diff --git a/pkg/epp/controller/inferenceobjective_reconciler.go b/pkg/epp/controller/inferenceobjective_reconciler.go index 53bce8646..c8ac5a6c3 100644 --- a/pkg/epp/controller/inferenceobjective_reconciler.go +++ b/pkg/epp/controller/inferenceobjective_reconciler.go @@ -18,6 +18,7 @@ package controller import ( "context" + "fmt" "k8s.io/apimachinery/pkg/api/errors" ctrl "sigs.k8s.io/controller-runtime" @@ -48,8 +49,7 @@ func (c *InferenceObjectiveReconciler) Reconcile(ctx context.Context, req ctrl.R notFound := false if err := c.Get(ctx, req.NamespacedName, infObjective); err != nil { if !errors.IsNotFound(err) { - logger.Error(err, "Unable to get InferenceObjective") - return ctrl.Result{}, err + return ctrl.Result{}, fmt.Errorf("unable to get InferenceObjective - %w", err) } notFound = true } diff --git a/pkg/epp/controller/inferenceobjective_reconciler_test.go b/pkg/epp/controller/inferenceobjective_reconciler_test.go index de43d6e63..4ceff5d07 100644 --- a/pkg/epp/controller/inferenceobjective_reconciler_test.go +++ b/pkg/epp/controller/inferenceobjective_reconciler_test.go @@ -160,7 +160,7 @@ func TestInferenceObjectiveReconciler(t *testing.T) { WithObjects(initObjs...). 
Build() pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) - ds := datastore.NewDatastore(t.Context(), pmf) + ds := datastore.NewDatastore(t.Context(), pmf, 0) for _, m := range test.objectivessInStore { ds.ObjectiveSet(m) } diff --git a/pkg/epp/controller/inferencepool_reconciler.go b/pkg/epp/controller/inferencepool_reconciler.go index d8b7668e2..3b52de0ae 100644 --- a/pkg/epp/controller/inferencepool_reconciler.go +++ b/pkg/epp/controller/inferencepool_reconciler.go @@ -56,9 +56,7 @@ func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Reques obj = &v1alpha2.InferencePool{} default: // Handle unsupported groups gracefully. - err := fmt.Errorf("unsupported API group: %s", c.PoolGKNN.Group) - logger.Error(err, "Cannot reconcile InferencePool") - return ctrl.Result{}, err + return ctrl.Result{}, fmt.Errorf("cannot reconcile InferencePool - unsupported API group: %s", c.PoolGKNN.Group) } // 2. Perform a single, generic fetch for the object. @@ -68,8 +66,7 @@ func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Reques c.Datastore.Clear() return ctrl.Result{}, nil } - logger.Error(err, "Unable to get InferencePool") - return ctrl.Result{}, err + return ctrl.Result{}, fmt.Errorf("unable to get InferencePool - %w", err) } // 3. Perform common checks using the client.Object interface. @@ -90,16 +87,14 @@ func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Reques var err error err = pool.ConvertTo(v1infPool) if err != nil { - logger.Error(err, "Failed to convert XInferencePool to InferencePool") - return ctrl.Result{}, err + return ctrl.Result{}, fmt.Errorf("failed to convert XInferencePool to InferencePool - %w", err) } default: return ctrl.Result{}, fmt.Errorf("unsupported API group: %s", c.PoolGKNN.Group) } if err := c.Datastore.PoolSet(ctx, c.Reader, v1infPool); err != nil { - logger.Error(err, "Failed to update datastore") - return ctrl.Result{}, err + return ctrl.Result{}, fmt.Errorf("failed to update datastore - %w", err) } return ctrl.Result{}, nil diff --git a/pkg/epp/controller/inferencepool_reconciler_test.go b/pkg/epp/controller/inferencepool_reconciler_test.go index 7f6938533..a2bce1256 100644 --- a/pkg/epp/controller/inferencepool_reconciler_test.go +++ b/pkg/epp/controller/inferencepool_reconciler_test.go @@ -24,6 +24,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" @@ -113,14 +114,14 @@ func TestInferencePoolReconciler(t *testing.T) { ctx := context.Background() pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) - datastore := datastore.NewDatastore(ctx, pmf) - inferencePoolReconciler := &InferencePoolReconciler{Reader: fakeClient, Datastore: datastore, PoolGKNN: gknn} + ds := datastore.NewDatastore(ctx, pmf, 0) + inferencePoolReconciler := &InferencePoolReconciler{Reader: fakeClient, Datastore: ds, PoolGKNN: gknn} // Step 1: Inception, only ready pods matching pool1 are added to the store. 
if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { t.Errorf("Unexpected InferencePool reconcile error: %v", err) } - if diff := diffStore(datastore, diffStoreParams{wantPool: pool1, wantPods: []string{"pod1", "pod2"}}); diff != "" { + if diff := diffStore(ds, diffStoreParams{wantPool: pool1, wantPods: []string{"pod1-rank-0", "pod2-rank-0"}}); diff != "" { t.Errorf("Unexpected diff (+got/-want): %s", diff) } @@ -138,7 +139,7 @@ func TestInferencePoolReconciler(t *testing.T) { if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { t.Errorf("Unexpected InferencePool reconcile error: %v", err) } - if diff := diffStore(datastore, diffStoreParams{wantPool: newPool1, wantPods: []string{"pod5"}}); diff != "" { + if diff := diffStore(ds, diffStoreParams{wantPool: newPool1, wantPods: []string{"pod5-rank-0"}}); diff != "" { t.Errorf("Unexpected diff (+got/-want): %s", diff) } @@ -153,7 +154,7 @@ func TestInferencePoolReconciler(t *testing.T) { if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { t.Errorf("Unexpected InferencePool reconcile error: %v", err) } - if diff := diffStore(datastore, diffStoreParams{wantPool: newPool1, wantPods: []string{"pod5"}}); diff != "" { + if diff := diffStore(ds, diffStoreParams{wantPool: newPool1, wantPods: []string{"pod5-rank-0"}}); diff != "" { t.Errorf("Unexpected diff (+got/-want): %s", diff) } @@ -167,7 +168,7 @@ func TestInferencePoolReconciler(t *testing.T) { if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { t.Errorf("Unexpected InferencePool reconcile error: %v", err) } - if diff := diffStore(datastore, diffStoreParams{wantPods: []string{}}); diff != "" { + if diff := diffStore(ds, diffStoreParams{wantPods: []string{}}); diff != "" { t.Errorf("Unexpected diff (+got/-want): %s", diff) } } @@ -180,7 +181,9 @@ type diffStoreParams struct { func diffStore(datastore datastore.Datastore, params diffStoreParams) string { gotPool, _ := datastore.PoolGet() - if diff := cmp.Diff(params.wantPool, gotPool); diff != "" { + // controller-runtime fake client may not populate TypeMeta (APIVersion/Kind). + // Ignore it when comparing pools. + if diff := cmp.Diff(params.wantPool, gotPool, cmpopts.IgnoreTypes(metav1.TypeMeta{})); diff != "" { return "pool:" + diff } @@ -258,14 +261,14 @@ func TestXInferencePoolReconciler(t *testing.T) { ctx := context.Background() pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) - datastore := datastore.NewDatastore(ctx, pmf) - inferencePoolReconciler := &InferencePoolReconciler{Reader: fakeClient, Datastore: datastore, PoolGKNN: gknn} + ds := datastore.NewDatastore(ctx, pmf, 0) + inferencePoolReconciler := &InferencePoolReconciler{Reader: fakeClient, Datastore: ds, PoolGKNN: gknn} // Step 1: Inception, only ready pods matching pool1 are added to the store. 
if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { t.Errorf("Unexpected InferencePool reconcile error: %v", err) } - if diff := xDiffStore(t, datastore, xDiffStoreParams{wantPool: pool1, wantPods: []string{"pod1", "pod2"}}); diff != "" { + if diff := xDiffStore(t, ds, xDiffStoreParams{wantPool: pool1, wantPods: []string{"pod1-rank-0", "pod2-rank-0"}}); diff != "" { t.Errorf("Unexpected diff (+got/-want): %s", diff) } @@ -281,7 +284,7 @@ func TestXInferencePoolReconciler(t *testing.T) { if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { t.Errorf("Unexpected InferencePool reconcile error: %v", err) } - if diff := xDiffStore(t, datastore, xDiffStoreParams{wantPool: newPool1, wantPods: []string{"pod5"}}); diff != "" { + if diff := xDiffStore(t, ds, xDiffStoreParams{wantPool: newPool1, wantPods: []string{"pod5-rank-0"}}); diff != "" { t.Errorf("Unexpected diff (+got/-want): %s", diff) } @@ -296,7 +299,7 @@ func TestXInferencePoolReconciler(t *testing.T) { if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { t.Errorf("Unexpected InferencePool reconcile error: %v", err) } - if diff := xDiffStore(t, datastore, xDiffStoreParams{wantPool: newPool1, wantPods: []string{"pod5"}}); diff != "" { + if diff := xDiffStore(t, ds, xDiffStoreParams{wantPool: newPool1, wantPods: []string{"pod5-rank-0"}}); diff != "" { t.Errorf("Unexpected diff (+got/-want): %s", diff) } @@ -310,7 +313,7 @@ func TestXInferencePoolReconciler(t *testing.T) { if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { t.Errorf("Unexpected InferencePool reconcile error: %v", err) } - if diff := xDiffStore(t, datastore, xDiffStoreParams{wantPods: []string{}}); diff != "" { + if diff := xDiffStore(t, ds, xDiffStoreParams{wantPods: []string{}}); diff != "" { t.Errorf("Unexpected diff (+got/-want): %s", diff) } } @@ -333,7 +336,10 @@ func xDiffStore(t *testing.T, datastore datastore.Datastore, params xDiffStorePa if err != nil { t.Fatalf("failed to convert InferencePool to XInferencePool: %v", err) } - if diff := cmp.Diff(params.wantPool, gotXPool); diff != "" { + + // controller-runtime fake client may not populate TypeMeta (APIVersion/Kind). + // Ignore it when comparing pools. 
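As a sketch of what the cmp option used on the next line does: cmpopts.IgnoreTypes drops every value of the named type from the comparison, so a fake client that leaves APIVersion/Kind empty no longer yields a spurious diff. The toy struct and values below are illustrative only:

package main

import (
	"fmt"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

type pool struct {
	metav1.TypeMeta
	Name string
}

func main() {
	want := pool{Name: "pool1"} // TypeMeta left empty, as a fake client may return it
	got := pool{
		TypeMeta: metav1.TypeMeta{Kind: "InferencePool", APIVersion: "inference.networking.k8s.io/v1"},
		Name:     "pool1",
	}
	diff := cmp.Diff(want, got, cmpopts.IgnoreTypes(metav1.TypeMeta{}))
	fmt.Println(diff == "") // true: only TypeMeta differed, and it is ignored
}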
+ if diff := cmp.Diff(params.wantPool, gotXPool, cmpopts.IgnoreTypes(metav1.TypeMeta{})); diff != "" { return "pool:" + diff } diff --git a/pkg/epp/controller/pod_reconciler.go b/pkg/epp/controller/pod_reconciler.go index 3cd7c2574..b3a78ef92 100644 --- a/pkg/epp/controller/pod_reconciler.go +++ b/pkg/epp/controller/pod_reconciler.go @@ -18,11 +18,11 @@ package controller import ( "context" + "fmt" "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/event" @@ -52,11 +52,10 @@ func (c *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R pod := &corev1.Pod{} if err := c.Get(ctx, req.NamespacedName, pod); err != nil { if apierrors.IsNotFound(err) { - c.Datastore.PodDelete(req.NamespacedName) + c.Datastore.PodDelete(req.Name) return ctrl.Result{}, nil } - logger.V(logutil.DEFAULT).Error(err, "Unable to get pod") - return ctrl.Result{}, err + return ctrl.Result{}, fmt.Errorf("unable to get pod - %w", err) } c.updateDatastore(logger, pod) @@ -90,10 +89,9 @@ func (c *PodReconciler) SetupWithManager(mgr ctrl.Manager) error { } func (c *PodReconciler) updateDatastore(logger logr.Logger, pod *corev1.Pod) { - namespacedName := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} if !podutil.IsPodReady(pod) || !c.Datastore.PoolLabelsMatch(pod.Labels) { logger.V(logutil.DEBUG).Info("Pod removed or not added") - c.Datastore.PodDelete(namespacedName) + c.Datastore.PodDelete(pod.Name) } else { if c.Datastore.PodUpdateOrAddIfNotExist(pod) { logger.V(logutil.DEFAULT).Info("Pod added") diff --git a/pkg/epp/controller/pod_reconciler_test.go b/pkg/epp/controller/pod_reconciler_test.go index 5ceb3efdb..28f817310 100644 --- a/pkg/epp/controller/pod_reconciler_test.go +++ b/pkg/epp/controller/pod_reconciler_test.go @@ -196,7 +196,7 @@ func TestPodReconciler(t *testing.T) { Build() // Configure the initial state of the datastore. 
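The setup below threads a new third argument through NewDatastore: the model-server metrics port taken from the EPP command line, replacing the SetPort plumbing removed from the metrics data source later in this diff. A standalone sketch of the port-resolution rule the datastore applies (the helper name and values are made up):

package main

import "fmt"

// resolveMetricsPort mirrors, as a sketch only, how the reworked datastore
// picks a scrape port: a non-zero command-line port wins only when the pool
// declares a single target port; otherwise each target port scrapes itself.
func resolveMetricsPort(cmdLinePort int, targetPorts []int, idx int) int {
	if cmdLinePort != 0 && len(targetPorts) == 1 {
		return cmdLinePort
	}
	return targetPorts[idx]
}

func main() {
	fmt.Println(resolveMetricsPort(9090, []int{8000}, 0))    // 9090: override honored
	fmt.Println(resolveMetricsPort(0, []int{8000, 8001}, 1)) // 8001: falls back to the target port
}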
- store := datastore.NewDatastore(t.Context(), pmf) + store := datastore.NewDatastore(t.Context(), pmf, 0) _ = store.PoolSet(t.Context(), fakeClient, test.pool) for _, pod := range test.existingPods { store.PodUpdateOrAddIfNotExist(pod) @@ -213,7 +213,7 @@ func TestPodReconciler(t *testing.T) { var gotPods []*corev1.Pod for _, pm := range store.PodList(backendmetrics.AllPodsPredicate) { - pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: pm.GetPod().NamespacedName.Name, Namespace: pm.GetPod().NamespacedName.Namespace}, Status: corev1.PodStatus{PodIP: pm.GetPod().Address}} + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: pm.GetPod().PodName, Namespace: pm.GetPod().NamespacedName.Namespace}, Status: corev1.PodStatus{PodIP: pm.GetPod().GetIPAddress()}} gotPods = append(gotPods, pod) } if !cmp.Equal(gotPods, test.wantPods, cmpopts.SortSlices(func(a, b *corev1.Pod) bool { return a.Name < b.Name })) { diff --git a/pkg/epp/datalayer/collector_test.go b/pkg/epp/datalayer/collector_test.go index 2d47de30a..0e3b9151b 100644 --- a/pkg/epp/datalayer/collector_test.go +++ b/pkg/epp/datalayer/collector_test.go @@ -24,8 +24,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer/mocks" ) @@ -45,14 +44,12 @@ func (d *DummySource) Collect(ctx context.Context, ep Endpoint) error { func defaultEndpoint() Endpoint { ms := NewEndpoint() - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ + pod := &PodInfo{ + NamespacedName: types.NamespacedName{ Name: "pod-name", Namespace: "default", }, - Status: corev1.PodStatus{ - PodIP: "1.2.3.4", - }, + Address: "1.2.3.4:5678", } ms.UpdatePod(pod) return ms diff --git a/pkg/epp/datalayer/endpoint.go b/pkg/epp/datalayer/endpoint.go index 7898a7a41..74c11905e 100644 --- a/pkg/epp/datalayer/endpoint.go +++ b/pkg/epp/datalayer/endpoint.go @@ -19,14 +19,12 @@ package datalayer import ( "fmt" "sync/atomic" - - corev1 "k8s.io/api/core/v1" ) // EndpointPodState allows management of the Pod related attributes. type EndpointPodState interface { GetPod() *PodInfo - UpdatePod(*corev1.Pod) + UpdatePod(*PodInfo) } // EndpointMetricsState allows management of the Metrics related attributes. @@ -35,23 +33,11 @@ type EndpointMetricsState interface { UpdateMetrics(*Metrics) } -// EndpointRunningRequestsState allows management of the Pod related attributes. -type EndpointRunningRequestsState interface { - GetRunningRequests() *RequestPriorityQueue - AddRequest(requestID string, tpot float64) bool - RemoveRequest(requestID string) bool - UpdateRequest(requestID string, tpot float64) bool - GetRequestCount() int - ContainsRequest(requestID string) bool - PeekRequestPriorityQueue() *Request -} - // Endpoint represents an inference serving endpoint and its related attributes. 
type Endpoint interface { fmt.Stringer EndpointPodState EndpointMetricsState - EndpointRunningRequestsState AttributeMap } @@ -79,16 +65,8 @@ func (srv *ModelServer) GetPod() *PodInfo { return srv.pod.Load() } -func (srv *ModelServer) UpdatePod(k8sPod *corev1.Pod) { - currentPod := srv.GetPod() - updatedPod := ToPodInfo(k8sPod) - - // Preserve the existing running requests queue if it exists - if currentPod != nil && currentPod.GetRunningRequests() != nil { - updatedPod.RunningRequests = currentPod.GetRunningRequests() - } - - srv.pod.Store(updatedPod) +func (srv *ModelServer) UpdatePod(pod *PodInfo) { + srv.pod.Store(pod) } func (srv *ModelServer) GetMetrics() *Metrics { @@ -99,67 +77,6 @@ func (srv *ModelServer) UpdateMetrics(metrics *Metrics) { srv.metrics.Store(metrics) } -// New methods for priority queue integration -func (srv *ModelServer) GetRunningRequests() *RequestPriorityQueue { - pod := srv.GetPod() - if pod == nil { - return nil - } - return pod.RunningRequests -} - -func (srv *ModelServer) AddRequest(requestID string, tpot float64) bool { - pod := srv.GetPod() - if pod == nil || pod.RunningRequests == nil { - return false - } - success := pod.RunningRequests.Add(requestID, tpot) - // No need to update metrics since we removed ActualRunningRequests - return success -} - -func (srv *ModelServer) RemoveRequest(requestID string) bool { - pod := srv.GetPod() - if pod == nil || pod.RunningRequests == nil { - return false - } - _, success := pod.RunningRequests.Remove(requestID) - // No need to update metrics since we removed ActualRunningRequests - return success -} - -func (srv *ModelServer) UpdateRequest(requestID string, tpot float64) bool { - pod := srv.GetPod() - if pod == nil || pod.RunningRequests == nil { - return false - } - return pod.RunningRequests.Update(requestID, tpot) -} - -func (srv *ModelServer) GetRequestCount() int { - pod := srv.GetPod() - if pod == nil || pod.RunningRequests == nil { - return 0 - } - return pod.RunningRequests.GetSize() -} - -func (srv *ModelServer) ContainsRequest(requestID string) bool { - pod := srv.GetPod() - if pod == nil || pod.RunningRequests == nil { - return false - } - return pod.RunningRequests.Contains(requestID) -} - -func (srv *ModelServer) PeekRequestPriorityQueue() *Request { - pod := srv.GetPod() - if pod == nil || pod.RunningRequests == nil { - return nil - } - return pod.RunningRequests.Peek() -} - func (srv *ModelServer) Put(key string, value Cloneable) { srv.attributes.Put(key, value) } diff --git a/pkg/epp/datalayer/factory.go b/pkg/epp/datalayer/factory.go index eca7697e5..989527c6c 100644 --- a/pkg/epp/datalayer/factory.go +++ b/pkg/epp/datalayer/factory.go @@ -21,7 +21,6 @@ import ( "sync" "time" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/log" @@ -45,7 +44,7 @@ type PoolInfo interface { // providing methods to allocate and retire endpoints. This can potentially be used for // pooled memory or other management chores in the implementation. type EndpointFactory interface { - NewEndpoint(parent context.Context, inpod *corev1.Pod, poolinfo PoolInfo) Endpoint + NewEndpoint(parent context.Context, inpod *PodInfo, poolinfo PoolInfo) Endpoint ReleaseEndpoint(ep Endpoint) } @@ -70,8 +69,8 @@ func NewEndpointFactory(sources []DataSource, refreshMetricsInterval time.Durati // NewEndpoint implements EndpointFactory.NewEndpoint. // Creates a new endpoint and starts its associated collector with its own ticker. // Guards against multiple concurrent calls for the same endpoint. 
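Before the factory change that follows, note that the endpoint.go hunk above reduces UpdatePod to a plain atomic store of a caller-built *PodInfo. A self-contained sketch of that store/load pattern with toy types (not the package's real ones):

package main

import (
	"fmt"
	"sync/atomic"
)

type podInfo struct{ address string }

type server struct{ pod atomic.Pointer[podInfo] }

func (s *server) UpdatePod(p *podInfo) { s.pod.Store(p) } // swap in the new snapshot
func (s *server) GetPod() *podInfo     { return s.pod.Load() }

func main() {
	s := &server{}
	s.UpdatePod(&podInfo{address: "10.0.0.7"})
	fmt.Println(s.GetPod().address) // lock-free read of the latest snapshot
}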
-func (lc *EndpointLifecycle) NewEndpoint(parent context.Context, inpod *corev1.Pod, _ PoolInfo) Endpoint { - key := types.NamespacedName{Namespace: inpod.Namespace, Name: inpod.Name} +func (lc *EndpointLifecycle) NewEndpoint(parent context.Context, inpod *PodInfo, _ PoolInfo) Endpoint { + key := types.NamespacedName{Namespace: inpod.GetNamespacedName().Namespace, Name: inpod.GetNamespacedName().Name} logger := log.FromContext(parent).WithValues("pod", key) if _, ok := lc.collectors.Load(key); ok { diff --git a/pkg/epp/datalayer/metrics.go b/pkg/epp/datalayer/metrics.go index 5869165c9..2febcb4d0 100644 --- a/pkg/epp/datalayer/metrics.go +++ b/pkg/epp/datalayer/metrics.go @@ -32,6 +32,7 @@ type Metrics struct { WaitingQueueSize int KVCacheUsagePercent float64 KvCacheMaxTokenCapacity int + CacheBlockSize int // UpdateTime records the last time when the metrics were updated. UpdateTime time.Time @@ -75,6 +76,7 @@ func (m *Metrics) Clone() *Metrics { WaitingQueueSize: m.WaitingQueueSize, KVCacheUsagePercent: m.KVCacheUsagePercent, KvCacheMaxTokenCapacity: m.KvCacheMaxTokenCapacity, + CacheBlockSize: m.CacheBlockSize, UpdateTime: m.UpdateTime, } } diff --git a/pkg/epp/datalayer/metrics/client.go b/pkg/epp/datalayer/metrics/client.go index 962a2a584..c59850ac5 100644 --- a/pkg/epp/datalayer/metrics/client.go +++ b/pkg/epp/datalayer/metrics/client.go @@ -24,6 +24,7 @@ import ( "time" "github.com/prometheus/common/expfmt" + "github.com/prometheus/common/model" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer" ) @@ -83,7 +84,7 @@ func (cl *client) Get(ctx context.Context, target *url.URL, ep datalayer.Address return nil, fmt.Errorf("unexpected status code from %s: %v", ep.GetNamespacedName(), resp.StatusCode) } - parser := expfmt.TextParser{} + parser := expfmt.NewTextParser(model.LegacyValidation) metricFamilies, err := parser.TextToMetricFamilies(resp.Body) if err != nil { return nil, err diff --git a/pkg/epp/datalayer/metrics/datasource.go b/pkg/epp/datalayer/metrics/datasource.go index 7dcdc97ba..1e14d1b1a 100644 --- a/pkg/epp/datalayer/metrics/datasource.go +++ b/pkg/epp/datalayer/metrics/datasource.go @@ -21,11 +21,8 @@ import ( "crypto/tls" "errors" "fmt" - "net" "net/url" - "strconv" "sync" - "sync/atomic" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer" ) @@ -37,9 +34,8 @@ const ( // DataSource is a Model Server Protocol (MSP) compliant metrics data source, // returning Prometheus formatted metrics for an endpoint. type DataSource struct { - metricsScheme string // scheme to use in metrics URL - metricsPort atomic.Pointer[string] // target port to use in metrics URL - metricsPath string // path to use in metrics URL + metricsScheme string // scheme to use in metrics URL + metricsPath string // path to use in metrics URL client Client // client (e.g. a wrapped http.Client) used to get metrics extractors sync.Map // key: name, value: extractor @@ -49,7 +45,7 @@ type DataSource struct { // the provided client factory. If ClientFactory is nil, a default factory is used. // The Scheme, port and path are command line options. It should be noted that // a port value of zero is set if the command line is unspecified. 
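One note on the client.go hunk above before the data source changes continue: the zero-value expfmt.TextParser is replaced by the constructor, which takes an explicit metric-name validation scheme. A self-contained sketch of the same parse path over a made-up exposition payload:

package main

import (
	"fmt"
	"strings"

	"github.com/prometheus/common/expfmt"
	"github.com/prometheus/common/model"
)

func main() {
	payload := "# TYPE vllm:num_requests_waiting gauge\nvllm:num_requests_waiting 3\n"
	parser := expfmt.NewTextParser(model.LegacyValidation) // legacy scheme keeps colon-style names valid
	families, err := parser.TextToMetricFamilies(strings.NewReader(payload))
	if err != nil {
		panic(err)
	}
	fmt.Println(families["vllm:num_requests_waiting"].GetMetric()[0].GetGauge().GetValue()) // 3
}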
-func NewDataSource(metricsScheme string, metricsPort int32, metricsPath string, skipCertVerification bool, cl Client) *DataSource { +func NewDataSource(metricsScheme string, metricsPath string, skipCertVerification bool, cl Client) *DataSource { if metricsScheme == "https" { httpsTransport := baseTransport.Clone() httpsTransport.TLSClientConfig = &tls.Config{ @@ -67,25 +63,9 @@ func NewDataSource(metricsScheme string, metricsPort int32, metricsPath string, metricsPath: metricsPath, client: cl, } - dataSrc.SetPort(metricsPort) return dataSrc } -// SetPort updates the port used for metrics scraping. -// The port value can only be set once (i.e., if set by command line, -// do not overwrite from Pool.Spec). A port value of 0 (i.e., unspecified -// command line value) is ignored. -// TODO: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/1398 -func (dataSrc *DataSource) SetPort(metricsPort int32) { - if dataSrc.metricsPort.Load() != nil { // do not overwrite - return - } - if metricsPort != 0 { // ignore zero value for port - port := strconv.Itoa(int(metricsPort)) - dataSrc.metricsPort.Store(&port) - } -} - // Name returns the metrics data source name. func (dataSrc *DataSource) Name() string { return DataSourceName @@ -132,7 +112,7 @@ func (dataSrc *DataSource) Collect(ctx context.Context, ep datalayer.Endpoint) e func (dataSrc *DataSource) getMetricsEndpoint(ep datalayer.Addressable) *url.URL { return &url.URL{ Scheme: dataSrc.metricsScheme, - Host: net.JoinHostPort(ep.GetIPAddress(), *dataSrc.metricsPort.Load()), + Host: ep.GetMetricsHost(), Path: dataSrc.metricsPath, } } diff --git a/pkg/epp/datalayer/metrics/extractor.go b/pkg/epp/datalayer/metrics/extractor.go index 08105196d..6c6978c87 100644 --- a/pkg/epp/datalayer/metrics/extractor.go +++ b/pkg/epp/datalayer/metrics/extractor.go @@ -37,6 +37,8 @@ const ( LoraInfoRunningAdaptersMetricName = "running_lora_adapters" LoraInfoWaitingAdaptersMetricName = "waiting_lora_adapters" LoraInfoMaxAdaptersMetricName = "max_lora" + + CacheConfigBlockSizeInfoMetricName = "block_size" ) // Extractor implements the metrics extraction based on the model @@ -49,8 +51,8 @@ type Extractor struct { // configured with the given metrics' specifications. // These are mandatory metrics per the MSP specification, and are used // as the basis for the built-in scheduling plugins. -func NewExtractor(queueSpec, runningSpec, kvusageSpec, loraSpec string) (*Extractor, error) { - mapping, err := NewMapping(queueSpec, runningSpec, kvusageSpec, loraSpec) +func NewExtractor(queueSpec, runningSpec, kvusageSpec, loraSpec, cacheInfoSpec string) (*Extractor, error) { + mapping, err := NewMapping(queueSpec, runningSpec, kvusageSpec, loraSpec, cacheInfoSpec) if err != nil { return nil, fmt.Errorf("failed to create extractor metrics Mapping - %w", err) } @@ -120,6 +122,16 @@ func (ext *Extractor) Extract(ctx context.Context, data any, ep datalayer.Endpoi } } + if spec := ext.mapping.CacheInfo; spec != nil { // extract CacheInfo-specific metrics + metric, err := spec.getLatestMetric(families) + if err != nil { + errs = append(errs, err) + } else if metric != nil { + populateCacheInfoMetrics(clone, metric, &errs) + updated = true + } + } + if updated { clone.UpdateTime = time.Now() ep.UpdateMetrics(clone) @@ -154,6 +166,23 @@ func populateLoRAMetrics(clone *datalayer.Metrics, metric *dto.Metric, errs *[]e } } +// populateCacheInfoMetrics updates the metrics with cache info from the metric labels. 
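A standalone sketch of the label-to-integer step the function below performs: info-style metrics carry their payload as labels, so the block size arrives as a string label value. The label map here is made up, modeled on the constants above:

package main

import (
	"fmt"
	"strconv"
)

func main() {
	labels := map[string]string{"block_size": "16", "num_gpu_blocks": "8192"}
	blockSize := 0
	if v := labels["block_size"]; v != "" {
		if n, err := strconv.Atoi(v); err == nil {
			blockSize = n // only a clean integer parse updates the value
		}
	}
	fmt.Println(blockSize) // 16
}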
+func populateCacheInfoMetrics(clone *datalayer.Metrics, metric *dto.Metric, errs *[]error) { + clone.CacheBlockSize = 0 + for _, label := range metric.GetLabel() { + if label.GetName() == CacheConfigBlockSizeInfoMetricName { + if label.GetValue() != "" { + if val, err := strconv.Atoi(label.GetValue()); err == nil { + clone.CacheBlockSize = val + break + } else { + *errs = append(*errs, err) + } + } + } + } +} + // addAdapters splits a comma-separated adapter list and stores keys with default value 0. func addAdapters(m map[string]int, csv string) { for _, name := range strings.Split(csv, ",") { diff --git a/pkg/epp/datalayer/metrics/mapping.go b/pkg/epp/datalayer/metrics/mapping.go index e92f1f102..7b1fed9c1 100644 --- a/pkg/epp/datalayer/metrics/mapping.go +++ b/pkg/epp/datalayer/metrics/mapping.go @@ -27,10 +27,11 @@ type Mapping struct { TotalRunningRequests *Spec KVCacheUtilization *Spec LoraRequestInfo *LoRASpec + CacheInfo *Spec } // NewMapping creates a metrics.Mapping from the input specification strings. -func NewMapping(queue, running, kvusage, lora string) (*Mapping, error) { +func NewMapping(queue, running, kvusage, lora, cacheInfo string) (*Mapping, error) { var errs []error queueSpec, err := parseStringToSpec(queue) @@ -49,6 +50,12 @@ func NewMapping(queue, running, kvusage, lora string) (*Mapping, error) { if err != nil { errs = append(errs, err) } + + cacheInfoSpec, err := parseStringToSpec(cacheInfo) + if err != nil { + errs = append(errs, err) + } + if len(errs) != 0 { return nil, errors.Join(errs...) } @@ -57,5 +64,6 @@ func NewMapping(queue, running, kvusage, lora string) (*Mapping, error) { TotalRunningRequests: runningSpec, KVCacheUtilization: kvusageSpec, LoraRequestInfo: loraSpec, + CacheInfo: cacheInfoSpec, }, nil } diff --git a/pkg/epp/datalayer/podinfo.go b/pkg/epp/datalayer/podinfo.go index 5f2d417c6..7cbd6d886 100644 --- a/pkg/epp/datalayer/podinfo.go +++ b/pkg/epp/datalayer/podinfo.go @@ -19,40 +19,25 @@ package datalayer import ( "fmt" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" ) // Addressable supports getting an IP address and a namespaced name. type Addressable interface { GetIPAddress() string + GetPort() string + GetMetricsHost() string GetNamespacedName() types.NamespacedName - GetRunningRequests() *RequestPriorityQueue } // PodInfo represents the relevant Kubernetes Pod state of an inference server. type PodInfo struct { - NamespacedName types.NamespacedName - Address string - Labels map[string]string - RunningRequests *RequestPriorityQueue -} - -// ToPodInfo converts a Kubernetes API Pod to its internal representation. -func ToPodInfo(pod *corev1.Pod) *PodInfo { - labels := make(map[string]string, len(pod.GetLabels())) - for key, value := range pod.GetLabels() { - labels[key] = value - } - return &PodInfo{ - NamespacedName: types.NamespacedName{ - Name: pod.Name, - Namespace: pod.Namespace, - }, - Address: pod.Status.PodIP, - Labels: labels, - RunningRequests: NewRequestPriorityQueue(), - } + NamespacedName types.NamespacedName + PodName string + Address string + Port string + MetricsHost string + Labels map[string]string } // String returns a string representation of the pod. 
@@ -73,18 +58,16 @@ func (p *PodInfo) Clone() *PodInfo { for key, value := range p.Labels { clonedLabels[key] = value } - var clonedRequests *RequestPriorityQueue - if p.RunningRequests != nil { - clonedRequests = p.RunningRequests.Clone() - } return &PodInfo{ NamespacedName: types.NamespacedName{ Name: p.NamespacedName.Name, Namespace: p.NamespacedName.Namespace, }, - Address: p.Address, - Labels: clonedLabels, - RunningRequests: clonedRequests, + PodName: p.PodName, + Address: p.Address, + Port: p.Port, + MetricsHost: p.MetricsHost, + Labels: clonedLabels, } } @@ -98,7 +81,12 @@ func (p *PodInfo) GetIPAddress() string { return p.Address } -// GetRunningRequests returns the running request queue for the Pod. -func (p *PodInfo) GetRunningRequests() *RequestPriorityQueue { - return p.RunningRequests +// GetPort returns the Pod's inference port. +func (p *PodInfo) GetPort() string { + return p.Port +} + +// GetMetricsHost returns the pod's metrics host (ip:port) +func (p *PodInfo) GetMetricsHost() string { + return p.MetricsHost } diff --git a/pkg/epp/datalayer/podinfo_test.go b/pkg/epp/datalayer/podinfo_test.go index 91256cae7..baf804a22 100644 --- a/pkg/epp/datalayer/podinfo_test.go +++ b/pkg/epp/datalayer/podinfo_test.go @@ -55,17 +55,6 @@ var ( } ) -func TestToPodInfo(t *testing.T) { - podinfo := ToPodInfo(pod) - if podinfo.RunningRequests == nil { - t.Fatal("Expected RunningRequests to be initialized") - } - podinfo.RunningRequests = nil // Reset to nil for comparison, this is necessary because the podinfo is created with a new map each time - if diff := cmp.Diff(expected, podinfo); diff != "" { - t.Errorf("Unexpected output (-want +got): %v", diff) - } -} - func TestPodInfoClone(t *testing.T) { clone := expected.Clone() assert.NotSame(t, expected, clone) @@ -78,7 +67,17 @@ func TestPodInfoClone(t *testing.T) { } func TestPodInfoString(t *testing.T) { - podinfo := ToPodInfo(pod) + podinfo := PodInfo{ + NamespacedName: types.NamespacedName{ + Name: pod.Name, + Namespace: pod.Namespace, + }, + PodName: pod.Name, + Address: pod.Status.PodIP, + Port: "8000", + MetricsHost: "127.0.0.1:8000", + Labels: labels, + } s := podinfo.String() assert.Contains(t, s, name) diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index e2e9bebbc..5dcd0f4a0 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -20,7 +20,9 @@ import ( "context" "errors" "fmt" + "net" "reflect" + "strconv" "sync" corev1 "k8s.io/api/core/v1" @@ -33,7 +35,6 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer" - dlmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" podutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/pod" ) @@ -62,31 +63,20 @@ type Datastore interface { // PodList lists pods matching the given predicate. 
PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool - PodDelete(namespacedName types.NamespacedName) - - // Request management operations - // PodAddRequest adds a request to a specific pod's running requests queue - PodAddRequest(podName types.NamespacedName, requestID string, tpot float64) error - // PodRemoveRequest removes a request from a specific pod's running requests queue - PodRemoveRequest(podName types.NamespacedName, requestID string) error - // PodUpdateRequest updates the TPOT value for a request in a specific pod's queue - PodUpdateRequest(podName types.NamespacedName, requestID string, tpot float64) error - // PodGetRunningRequests returns the priority queue for a specific pod - PodGetRunningRequests(podName types.NamespacedName) (*datalayer.RequestPriorityQueue, error) - // PodGetRequestCount returns the number of running requests for a specific pod - PodGetRequestCount(podName types.NamespacedName) (int, error) + PodDelete(podName string) // Clears the store state, happens when the pool gets deleted. Clear() } -func NewDatastore(parentCtx context.Context, epFactory datalayer.EndpointFactory) Datastore { +func NewDatastore(parentCtx context.Context, epFactory datalayer.EndpointFactory, modelServerMetricsPort int32) Datastore { store := &datastore{ - parentCtx: parentCtx, - poolAndObjectivesMu: sync.RWMutex{}, - objectives: make(map[string]*v1alpha2.InferenceObjective), - pods: &sync.Map{}, - epf: epFactory, + parentCtx: parentCtx, + poolAndObjectivesMu: sync.RWMutex{}, + objectives: make(map[string]*v1alpha2.InferenceObjective), + pods: &sync.Map{}, + modelServerMetricsPort: modelServerMetricsPort, + epf: epFactory, } return store } @@ -101,7 +91,10 @@ type datastore struct { objectives map[string]*v1alpha2.InferenceObjective // key: types.NamespacedName, value: backendmetrics.PodMetrics pods *sync.Map - epf datalayer.EndpointFactory + // modelServerMetricsPort metrics port from EPP command line argument + // used only if there is only one inference engine per pod + modelServerMetricsPort int32 + epf datalayer.EndpointFactory } func (ds *datastore) Clear() { @@ -129,11 +122,6 @@ func (ds *datastore) PoolSet(ctx context.Context, reader client.Reader, pool *v1 oldPool := ds.pool ds.pool = pool - if oldPool == nil || pool.Spec.TargetPorts[0] != oldPool.Spec.TargetPorts[0] { - if source, found := datalayer.GetNamedSource[*dlmetrics.DataSource](dlmetrics.DataSourceName); found { - source.SetPort(int32(pool.Spec.TargetPorts[0].Number)) - } - } if oldPool == nil || !reflect.DeepEqual(pool.Spec.Selector, oldPool.Spec.Selector) { logger.V(logutil.DEFAULT).Info("Updating inference pool endpoints", "selector", pool.Spec.Selector) // A full resync is required to address two cases: @@ -227,126 +215,65 @@ func (ds *datastore) PodList(predicate func(backendmetrics.PodMetrics) bool) []b } func (ds *datastore) PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool { - namespacedName := types.NamespacedName{ - Name: pod.Name, - Namespace: pod.Namespace, - } - var pm backendmetrics.PodMetrics - existing, ok := ds.pods.Load(namespacedName) - if !ok { - pm = ds.epf.NewEndpoint(ds.parentCtx, pod, ds) - ds.pods.Store(namespacedName, pm) - } else { - pm = existing.(backendmetrics.PodMetrics) - } - // Update pod properties if anything changed. 
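The rewritten PodUpdateOrAddIfNotExist in the hunk below fans one Kubernetes pod out into one endpoint per pool target port, keyed by a -rank-<index> suffix. A standalone sketch of the naming and address math (pod values are made up):

package main

import (
	"fmt"
	"net"
	"strconv"
)

func main() {
	podName, podIP := "pod1", "10.0.0.7"
	targetPorts := []int{8000, 8001} // stand-in for the pool's spec.targetPorts
	for idx, port := range targetPorts {
		rankName := podName + "-rank-" + strconv.Itoa(idx)
		metricsHost := net.JoinHostPort(podIP, strconv.Itoa(port))
		fmt.Println(rankName, metricsHost)
	}
	// Output:
	// pod1-rank-0 10.0.0.7:8000
	// pod1-rank-1 10.0.0.7:8001
}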
- pm.UpdatePod(pod) - return ok -} - -func (ds *datastore) PodDelete(namespacedName types.NamespacedName) { - v, ok := ds.pods.LoadAndDelete(namespacedName) - if ok { - ds.epf.ReleaseEndpoint(v.(backendmetrics.PodMetrics)) - } -} - -// /// Request Management APIs /// - -func (ds *datastore) PodAddRequest(podName types.NamespacedName, requestID string, tpot float64) error { - pm, ok := ds.pods.Load(podName) - if !ok { - return fmt.Errorf("pod %s not found in datastore", podName) - } - - // TODO add to universal request map if needed for global tracking - - podMetrics := pm.(backendmetrics.PodMetrics) - runningRequests := podMetrics.GetRunningRequests() - if runningRequests == nil { - return fmt.Errorf("pod %s does not have running requests queue initialized", podName) - } - - // Request flow in datalayer - // - // Add request - - if !runningRequests.Add(requestID, tpot) { - return fmt.Errorf("request %s already exists in pod %s", requestID, podName) - } - - return nil -} - -func (ds *datastore) PodRemoveRequest(podName types.NamespacedName, requestID string) error { - pm, ok := ds.pods.Load(podName) - if !ok { - return fmt.Errorf("pod %s not found in datastore", podName) - } - - // Request removal from universal request map if needed for global tracking - - podMetrics := pm.(backendmetrics.PodMetrics) - runningRequests := podMetrics.GetRunningRequests() - if runningRequests == nil { - return fmt.Errorf("pod %s does not have running requests queue initialized", podName) - } - - _, removed := runningRequests.Remove(requestID) - if !removed { - return fmt.Errorf("request %s not found in pod %s", requestID, podName) - } - - return nil -} - -func (ds *datastore) PodUpdateRequest(podName types.NamespacedName, requestID string, tpot float64) error { - pm, ok := ds.pods.Load(podName) - if !ok { - return fmt.Errorf("pod %s not found in datastore", podName) + if ds.pool == nil { + return true } - podMetrics := pm.(backendmetrics.PodMetrics) - runningRequests := podMetrics.GetRunningRequests() - if runningRequests == nil { - return fmt.Errorf("pod %s does not have running requests queue initialized", podName) + labels := make(map[string]string, len(pod.GetLabels())) + for key, value := range pod.GetLabels() { + labels[key] = value } - if !runningRequests.Update(requestID, tpot) { - return fmt.Errorf("request %s not found in pod %s", requestID, podName) + modelServerMetricsPort := 0 + if len(ds.pool.Spec.TargetPorts) == 1 { + modelServerMetricsPort = int(ds.modelServerMetricsPort) } - - return nil -} - -func (ds *datastore) PodGetRunningRequests(podName types.NamespacedName) (*datalayer.RequestPriorityQueue, error) { - pm, ok := ds.pods.Load(podName) - if !ok { - return nil, fmt.Errorf("pod %s not found in datastore", podName) + pods := []*datalayer.PodInfo{} + for idx, port := range ds.pool.Spec.TargetPorts { + metricsPort := modelServerMetricsPort + if metricsPort == 0 { + metricsPort = int(port.Number) + } + pods = append(pods, + &datalayer.PodInfo{ + NamespacedName: types.NamespacedName{ + Name: pod.Name + "-rank-" + strconv.Itoa(idx), + Namespace: pod.Namespace, + }, + PodName: pod.Name, + Address: pod.Status.PodIP, + Port: strconv.Itoa(int(port.Number)), + MetricsHost: net.JoinHostPort(pod.Status.PodIP, strconv.Itoa(metricsPort)), + Labels: labels, + }) } - podMetrics := pm.(backendmetrics.PodMetrics) - runningRequests := podMetrics.GetRunningRequests() - if runningRequests == nil { - return nil, fmt.Errorf("pod %s does not have running requests queue initialized", podName) + result := true + for 
_, podInfo := range pods { + var pm backendmetrics.PodMetrics + existing, ok := ds.pods.Load(podInfo.NamespacedName) + if !ok { + pm = ds.epf.NewEndpoint(ds.parentCtx, podInfo, ds) + ds.pods.Store(podInfo.NamespacedName, pm) + result = false + } else { + pm = existing.(backendmetrics.PodMetrics) + } + // Update pod properties if anything changed. + pm.UpdatePod(podInfo) } - - return runningRequests, nil + return result } -func (ds *datastore) PodGetRequestCount(podName types.NamespacedName) (int, error) { - pm, ok := ds.pods.Load(podName) - if !ok { - return 0, fmt.Errorf("pod %s not found in datastore", podName) - } - - podMetrics := pm.(backendmetrics.PodMetrics) - runningRequests := podMetrics.GetRunningRequests() - if runningRequests == nil { - return 0, fmt.Errorf("pod %s does not have running requests queue initialized", podName) - } - - return runningRequests.GetSize(), nil +func (ds *datastore) PodDelete(podName string) { + ds.pods.Range(func(k, v any) bool { + pm := v.(backendmetrics.PodMetrics) + if pm.GetPod().PodName == podName { + ds.pods.Delete(k) + ds.epf.ReleaseEndpoint(pm) + } + return true + }) } func (ds *datastore) podResyncAll(ctx context.Context, reader client.Reader) error { @@ -376,9 +303,9 @@ func (ds *datastore) podResyncAll(ctx context.Context, reader client.Reader) err // Remove pods that don't belong to the pool or not ready any more. ds.pods.Range(func(k, v any) bool { pm := v.(backendmetrics.PodMetrics) - if exist := activePods[pm.GetPod().NamespacedName.Name]; !exist { + if exist := activePods[pm.GetPod().PodName]; !exist { logger.V(logutil.VERBOSE).Info("Removing pod", "pod", pm.GetPod()) - ds.PodDelete(pm.GetPod().NamespacedName) + ds.PodDelete(pm.GetPod().PodName) } return true }) diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go index 271c31ee7..ee59071e6 100644 --- a/pkg/epp/datastore/datastore_test.go +++ b/pkg/epp/datastore/datastore_test.go @@ -19,6 +19,8 @@ package datastore import ( "context" "errors" + "net" + "strconv" "testing" "time" @@ -35,6 +37,7 @@ import ( v1 "sigs.k8s.io/gateway-api-inference-extension/api/v1" "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer" testutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" ) @@ -83,21 +86,21 @@ func TestPool(t *testing.T) { WithScheme(scheme). 
Build() pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) - datastore := NewDatastore(context.Background(), pmf) - _ = datastore.PoolSet(context.Background(), fakeClient, tt.inferencePool) - gotPool, gotErr := datastore.PoolGet() + ds := NewDatastore(context.Background(), pmf, 0) + _ = ds.PoolSet(context.Background(), fakeClient, tt.inferencePool) + gotPool, gotErr := ds.PoolGet() if diff := cmp.Diff(tt.wantErr, gotErr, cmpopts.EquateErrors()); diff != "" { t.Errorf("Unexpected error diff (+got/-want): %s", diff) } if diff := cmp.Diff(tt.wantPool, gotPool); diff != "" { t.Errorf("Unexpected pool diff (+got/-want): %s", diff) } - gotSynced := datastore.PoolHasSynced() + gotSynced := ds.PoolHasSynced() if diff := cmp.Diff(tt.wantSynced, gotSynced); diff != "" { t.Errorf("Unexpected synced diff (+got/-want): %s", diff) } if tt.labels != nil { - gotLabelsMatch := datastore.PoolLabelsMatch(tt.labels) + gotLabelsMatch := ds.PoolLabelsMatch(tt.labels) if diff := cmp.Diff(tt.wantLabelsMatch, gotLabelsMatch); diff != "" { t.Errorf("Unexpected labels match diff (+got/-want): %s", diff) } @@ -190,7 +193,7 @@ func TestObjective(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) - ds := NewDatastore(t.Context(), pmf) + ds := NewDatastore(t.Context(), pmf, 0) for _, m := range test.existingModels { ds.ObjectiveSet(m) } @@ -241,13 +244,22 @@ var ( WaitingModels: map[string]int{}, } - pod1NamespacedName = types.NamespacedName{Name: pod1.Name, Namespace: pod1.Namespace} - pod2NamespacedName = types.NamespacedName{Name: pod2.Name, Namespace: pod2.Namespace} + pod1NamespacedName = types.NamespacedName{Name: pod1.Name + "-rank-0", Namespace: pod1.Namespace} + pod2NamespacedName = types.NamespacedName{Name: pod2.Name + "-rank-0", Namespace: pod2.Namespace} inferencePool = &v1.InferencePool{ Spec: v1.InferencePoolSpec{ TargetPorts: []v1.Port{{Number: v1.PortNumber(int32(8000))}}, }, } + inferencePoolMultiTarget = &v1.InferencePool{ + Spec: v1.InferencePoolSpec{ + TargetPorts: []v1.Port{{Number: v1.PortNumber(int32(8000))}, {Number: v1.PortNumber(int32(8001))}}, + }, + } + + inferencePoolTargetPort = strconv.Itoa(int(inferencePool.Spec.TargetPorts[0].Number)) + inferencePoolMultiTargetPort0 = strconv.Itoa(int(inferencePoolMultiTarget.Spec.TargetPorts[0].Number)) + inferencePoolMultiTargetPort1 = strconv.Itoa(int(inferencePoolMultiTarget.Spec.TargetPorts[1].Number)) ) func TestMetrics(t *testing.T) { @@ -315,7 +327,7 @@ func TestMetrics(t *testing.T) { WithScheme(scheme). 
Build() pmf := backendmetrics.NewPodMetricsFactory(test.pmc, time.Millisecond) - ds := NewDatastore(ctx, pmf) + ds := NewDatastore(ctx, pmf, 0) _ = ds.PoolSet(ctx, fakeClient, inferencePool) for _, pod := range test.storePods { ds.PodUpdateOrAddIfNotExist(pod) @@ -340,14 +352,6 @@ func TestMetrics(t *testing.T) { } func TestPods(t *testing.T) { - updatedPod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod1", - }, - Spec: corev1.PodSpec{ - NodeName: "node-1", - }, - } tests := []struct { name string op func(ctx context.Context, ds Datastore) @@ -371,60 +375,226 @@ func TestPods(t *testing.T) { }, }, { - name: "Update existing pod, new field, should update", - existingPods: []*corev1.Pod{pod1}, - wantPods: []*corev1.Pod{updatedPod}, + name: "Delete the pod", + existingPods: []*corev1.Pod{pod1, pod2}, + wantPods: []*corev1.Pod{pod1}, op: func(ctx context.Context, ds Datastore) { - ds.PodUpdateOrAddIfNotExist(updatedPod) + ds.PodDelete(pod2.Name) }, }, { - name: "Update existing pod, no new fields, should not update", + name: "Delete the pod that doesn't exist", existingPods: []*corev1.Pod{pod1}, wantPods: []*corev1.Pod{pod1}, op: func(ctx context.Context, ds Datastore) { - incoming := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod1", - Namespace: "default", + ds.PodDelete(pod2.Name) + }, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ctx := context.Background() + pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) + ds := NewDatastore(t.Context(), pmf, 0) + fakeClient := fake.NewFakeClient() + if err := ds.PoolSet(ctx, fakeClient, inferencePool); err != nil { + t.Error(err) + } + for _, pod := range test.existingPods { + ds.PodUpdateOrAddIfNotExist(pod) + } + + test.op(ctx, ds) + var gotPods []*corev1.Pod + for _, pm := range ds.PodList(backendmetrics.AllPodsPredicate) { + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: pm.GetPod().PodName, Namespace: pm.GetPod().NamespacedName.Namespace}, Status: corev1.PodStatus{PodIP: pm.GetPod().GetIPAddress()}} + gotPods = append(gotPods, pod) + } + if !cmp.Equal(gotPods, test.wantPods, cmpopts.SortSlices(func(a, b *corev1.Pod) bool { return a.Name < b.Name })) { + t.Errorf("got (%v) != want (%v);", gotPods, test.wantPods) + } + }) + } +} + +func TestPodInfo(t *testing.T) { + tests := []struct { + name string + op func(ctx context.Context, ds Datastore) + pool *v1.InferencePool + existingPods []*corev1.Pod + wantPodInfos []*datalayer.PodInfo + }{ + { + name: "Add new pod, no existing pods, should add", + existingPods: []*corev1.Pod{}, + wantPodInfos: []*datalayer.PodInfo{ + { + NamespacedName: types.NamespacedName{ + Name: pod1.Name + "-rank-0", + Namespace: pod1.Namespace, }, - } - ds.PodUpdateOrAddIfNotExist(incoming) + + PodName: pod1.Name, + Address: pod1.Status.PodIP, + Port: inferencePoolTargetPort, + MetricsHost: net.JoinHostPort(pod1.Status.PodIP, inferencePoolTargetPort), + Labels: map[string]string{}, + }, + }, + op: func(ctx context.Context, ds Datastore) { + ds.PodUpdateOrAddIfNotExist(pod1) }, + pool: inferencePool, }, { - name: "Delete the pod", - wantPods: []*corev1.Pod{pod1}, + name: "Add new pod, no existing pods, should add, multiple target ports", + existingPods: []*corev1.Pod{}, + wantPodInfos: []*datalayer.PodInfo{ + { + NamespacedName: types.NamespacedName{ + Name: pod1.Name + "-rank-0", + Namespace: pod1.Namespace, + }, + + PodName: pod1.Name, + Address: pod1.Status.PodIP, + Port: inferencePoolMultiTargetPort0, + 
MetricsHost: net.JoinHostPort(pod1.Status.PodIP, inferencePoolMultiTargetPort0),
+				Labels:      map[string]string{},
+			},
+			{
+				NamespacedName: types.NamespacedName{
+					Name:      pod1.Name + "-rank-1",
+					Namespace: pod1.Namespace,
+				},
+
+				PodName:     pod1.Name,
+				Address:     pod1.Status.PodIP,
+				Port:        inferencePoolMultiTargetPort1,
+				MetricsHost: net.JoinHostPort(pod1.Status.PodIP, inferencePoolMultiTargetPort1),
+				Labels:      map[string]string{},
+			},
+		},
 		op: func(ctx context.Context, ds Datastore) {
-			ds.PodDelete(pod2NamespacedName)
+			ds.PodUpdateOrAddIfNotExist(pod1)
 		},
+		pool: inferencePoolMultiTarget,
 	},
 	{
-		name: "Delete the pod that doesn't exist",
+		name:         "Add new pod, with existing pods, should add, multiple target ports",
 		existingPods: []*corev1.Pod{pod1},
-		wantPods:     []*corev1.Pod{pod1},
+		wantPodInfos: []*datalayer.PodInfo{
+			{
+				NamespacedName: types.NamespacedName{
+					Name:      pod1.Name + "-rank-0",
+					Namespace: pod1.Namespace,
+				},
+
+				PodName:     pod1.Name,
+				Address:     pod1.Status.PodIP,
+				Port:        inferencePoolMultiTargetPort0,
+				MetricsHost: net.JoinHostPort(pod1.Status.PodIP, inferencePoolMultiTargetPort0),
+				Labels:      map[string]string{},
+			},
+			{
+				NamespacedName: types.NamespacedName{
+					Name:      pod1.Name + "-rank-1",
+					Namespace: pod1.Namespace,
+				},
+
+				PodName:     pod1.Name,
+				Address:     pod1.Status.PodIP,
+				Port:        inferencePoolMultiTargetPort1,
+				MetricsHost: net.JoinHostPort(pod1.Status.PodIP, inferencePoolMultiTargetPort1),
+				Labels:      map[string]string{},
+			},
+			{
+				NamespacedName: types.NamespacedName{
+					Name:      pod2.Name + "-rank-0",
+					Namespace: pod2.Namespace,
+				},
+
+				PodName:     pod2.Name,
+				Address:     pod2.Status.PodIP,
+				Port:        inferencePoolMultiTargetPort0,
+				MetricsHost: net.JoinHostPort(pod2.Status.PodIP, inferencePoolMultiTargetPort0),
+				Labels:      map[string]string{},
+			},
+			{
+				NamespacedName: types.NamespacedName{
+					Name:      pod2.Name + "-rank-1",
+					Namespace: pod2.Namespace,
+				},
+
+				PodName:     pod2.Name,
+				Address:     pod2.Status.PodIP,
+				Port:        inferencePoolMultiTargetPort1,
+				MetricsHost: net.JoinHostPort(pod2.Status.PodIP, inferencePoolMultiTargetPort1),
+				Labels:      map[string]string{},
+			},
+		},
+		op: func(ctx context.Context, ds Datastore) {
+			ds.PodUpdateOrAddIfNotExist(pod2)
+		},
+		pool: inferencePoolMultiTarget,
+	},
+	{
+		name:         "Delete the pod, multiple target ports",
+		existingPods: []*corev1.Pod{pod1, pod2},
+		wantPodInfos: []*datalayer.PodInfo{
+			{
+				NamespacedName: types.NamespacedName{
+					Name:      pod1.Name + "-rank-0",
+					Namespace: pod1.Namespace,
+				},
+
+				PodName:     pod1.Name,
+				Address:     pod1.Status.PodIP,
+				Port:        inferencePoolMultiTargetPort0,
+				MetricsHost: net.JoinHostPort(pod1.Status.PodIP, inferencePoolMultiTargetPort0),
+				Labels:      map[string]string{},
+			},
+			{
+				NamespacedName: types.NamespacedName{
+					Name:      pod1.Name + "-rank-1",
+					Namespace: pod1.Namespace,
+				},
+
+				PodName:     pod1.Name,
+				Address:     pod1.Status.PodIP,
+				Port:        inferencePoolMultiTargetPort1,
+				MetricsHost: net.JoinHostPort(pod1.Status.PodIP, inferencePoolMultiTargetPort1),
+				Labels:      map[string]string{},
+			},
+		},
 		op: func(ctx context.Context, ds Datastore) {
-			ds.PodDelete(pod2NamespacedName)
+			ds.PodDelete(pod2.Name)
 		},
+		pool: inferencePoolMultiTarget,
 	},
 }
+
 for _, test := range tests {
 	t.Run(test.name, func(t *testing.T) {
 		ctx := context.Background()
 		pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second)
-		ds := NewDatastore(t.Context(), pmf)
+		ds := NewDatastore(t.Context(), pmf, 0)
+		fakeClient := fake.NewFakeClient()
+		if err := ds.PoolSet(ctx, fakeClient, test.pool); err != nil {
+			t.Error(err)
+		}
 		for _, pod := range test.existingPods {
 			ds.PodUpdateOrAddIfNotExist(pod)
 		}
 
 		test.op(ctx, ds)
-		var gotPods []*corev1.Pod
+		var gotPodInfos []*datalayer.PodInfo
 		for _, pm := range ds.PodList(backendmetrics.AllPodsPredicate) {
-			pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: pm.GetPod().NamespacedName.Name, Namespace: pm.GetPod().NamespacedName.Namespace}, Status: corev1.PodStatus{PodIP: pm.GetPod().Address}}
-			gotPods = append(gotPods, pod)
+			gotPodInfos = append(gotPodInfos, pm.GetPod())
 		}
-		if !cmp.Equal(gotPods, test.wantPods, cmpopts.SortSlices(func(a, b *corev1.Pod) bool { return a.Name < b.Name })) {
-			t.Logf("got (%v) != want (%v);", gotPods, test.wantPods)
+		if diff := cmp.Diff(test.wantPodInfos, gotPodInfos, cmpopts.SortSlices(func(a, b *datalayer.PodInfo) bool { return a.NamespacedName.Name < b.NamespacedName.Name })); diff != "" {
+			t.Errorf("Unexpected pod infos (-want +got):\n%s", diff)
 		}
 	})
 }
diff --git a/pkg/epp/flowcontrol/config.go b/pkg/epp/flowcontrol/config.go
new file mode 100644
index 000000000..edc23abad
--- /dev/null
+++ b/pkg/epp/flowcontrol/config.go
@@ -0,0 +1,50 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package flowcontrol
+
+import (
+	"fmt"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/controller"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/registry"
+)
+
+// Config is the top-level configuration for the entire flow control module.
+// It embeds the configurations for the controller and the registry, providing a single point of entry for validation
+// and initialization.
+type Config struct {
+	Controller controller.Config
+	Registry   registry.Config
+}
+
+// ValidateAndApplyDefaults checks the configuration for validity and populates any empty fields with system defaults.
+// It delegates validation to the underlying controller and registry configurations.
+// It returns a new, validated `Config` object and does not mutate the receiver.
+func (c *Config) ValidateAndApplyDefaults() (*Config, error) {
+	validatedControllerCfg, err := c.Controller.ValidateAndApplyDefaults()
+	if err != nil {
+		return nil, fmt.Errorf("controller config validation failed: %w", err)
+	}
+	validatedRegistryCfg, err := c.Registry.ValidateAndApplyDefaults()
+	if err != nil {
+		return nil, fmt.Errorf("registry config validation failed: %w", err)
+	}
+	return &Config{
+		Controller: *validatedControllerCfg,
+		Registry:   *validatedRegistryCfg,
+	}, nil
+}
diff --git a/pkg/epp/flowcontrol/config_test.go b/pkg/epp/flowcontrol/config_test.go
new file mode 100644
index 000000000..713abee77
--- /dev/null
+++ b/pkg/epp/flowcontrol/config_test.go
@@ -0,0 +1,91 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package flowcontrol + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/controller" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/registry" +) + +func TestConfig_ValidateAndApplyDefaults(t *testing.T) { + t.Parallel() + + // A minimal valid registry config, which is required for the success case. + validRegistryConfig := registry.Config{ + PriorityBands: []registry.PriorityBandConfig{ + {Priority: 1, PriorityName: "TestBand"}, + }, + } + + testCases := []struct { + name string + input Config + expectErr bool + expectedErrIs error + }{ + { + name: "ShouldSucceed_WhenSubConfigsAreValid", + input: Config{ + Controller: controller.Config{}, + Registry: validRegistryConfig, + }, + expectErr: false, + }, + { + name: "ShouldFail_WhenControllerConfigIsInvalid", + input: Config{ + Controller: controller.Config{ + DefaultRequestTTL: -1 * time.Second, + }, + Registry: validRegistryConfig, + }, + expectErr: true, + }, + { + name: "ShouldFail_WhenRegistryConfigIsInvalid", + input: Config{ + Controller: controller.Config{}, + Registry: registry.Config{ + PriorityBands: []registry.PriorityBandConfig{}, + }, + }, + expectErr: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + originalInput := tc.input + validatedCfg, err := tc.input.ValidateAndApplyDefaults() + + if tc.expectErr { + require.Error(t, err, "expected an error but got nil") + } else { + require.NoError(t, err, "expected no error but got: %v", err) + require.NotNil(t, validatedCfg, "validatedCfg should not be nil on success") + } + + assert.Equal(t, originalInput, tc.input, "input config should not be mutated") + }) + } +} diff --git a/pkg/epp/flowcontrol/contracts/mocks/mocks.go b/pkg/epp/flowcontrol/contracts/mocks/mocks.go index c5c8d2e3b..10f093b11 100644 --- a/pkg/epp/flowcontrol/contracts/mocks/mocks.go +++ b/pkg/epp/flowcontrol/contracts/mocks/mocks.go @@ -34,6 +34,7 @@ import ( "fmt" "sync" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/framework" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types" @@ -48,9 +49,9 @@ type MockRegistryShard struct { IsActiveFunc func() bool ManagedQueueFunc func(key types.FlowKey) (contracts.ManagedQueue, error) IntraFlowDispatchPolicyFunc func(key types.FlowKey) (framework.IntraFlowDispatchPolicy, error) - InterFlowDispatchPolicyFunc func(priority uint) (framework.InterFlowDispatchPolicy, error) - PriorityBandAccessorFunc func(priority uint) (framework.PriorityBandAccessor, error) - AllOrderedPriorityLevelsFunc func() []uint + InterFlowDispatchPolicyFunc func(priority int) (framework.InterFlowDispatchPolicy, error) + PriorityBandAccessorFunc func(priority int) (framework.PriorityBandAccessor, error) + AllOrderedPriorityLevelsFunc func() []int StatsFunc func() contracts.ShardStats } @@ -82,21 +83,21 @@ func (m 
*MockRegistryShard) IntraFlowDispatchPolicy(key types.FlowKey) (framewor
 	return nil, nil
 }
 
-func (m *MockRegistryShard) InterFlowDispatchPolicy(priority uint) (framework.InterFlowDispatchPolicy, error) {
+func (m *MockRegistryShard) InterFlowDispatchPolicy(priority int) (framework.InterFlowDispatchPolicy, error) {
 	if m.InterFlowDispatchPolicyFunc != nil {
 		return m.InterFlowDispatchPolicyFunc(priority)
 	}
 	return nil, nil
 }
 
-func (m *MockRegistryShard) PriorityBandAccessor(priority uint) (framework.PriorityBandAccessor, error) {
+func (m *MockRegistryShard) PriorityBandAccessor(priority int) (framework.PriorityBandAccessor, error) {
 	if m.PriorityBandAccessorFunc != nil {
 		return m.PriorityBandAccessorFunc(priority)
 	}
 	return nil, nil
 }
 
-func (m *MockRegistryShard) AllOrderedPriorityLevels() []uint {
+func (m *MockRegistryShard) AllOrderedPriorityLevels() []int {
 	if m.AllOrderedPriorityLevelsFunc != nil {
 		return m.AllOrderedPriorityLevelsFunc()
 	}
@@ -112,12 +113,12 @@
 
 // MockSaturationDetector is a simple "stub-style" mock for testing.
 type MockSaturationDetector struct {
-	IsSaturatedFunc func(ctx context.Context) bool
+	IsSaturatedFunc func(ctx context.Context, candidatePods []metrics.PodMetrics) bool
 }
 
-func (m *MockSaturationDetector) IsSaturated(ctx context.Context) bool {
+func (m *MockSaturationDetector) IsSaturated(ctx context.Context, candidatePods []metrics.PodMetrics) bool {
 	if m.IsSaturatedFunc != nil {
-		return m.IsSaturatedFunc(ctx)
+		return m.IsSaturatedFunc(ctx, candidatePods)
 	}
 	return false
 }
diff --git a/pkg/epp/flowcontrol/contracts/registry.go b/pkg/epp/flowcontrol/contracts/registry.go
index de1b89ae6..fe0b790b9 100644
--- a/pkg/epp/flowcontrol/contracts/registry.go
+++ b/pkg/epp/flowcontrol/contracts/registry.go
@@ -22,8 +22,8 @@ import (
 )
 
 // FlowRegistry is the complete interface for the global flow control plane.
-// It composes the client-facing data path interface and the administrative interface. A concrete implementation of this
-// interface is the single source of truth for all flow control state.
+// It composes all role-based interfaces. A concrete implementation of this interface is the single source of truth for
+// all flow control state.
 //
 // # Conformance: Implementations MUST be goroutine-safe.
 //
@@ -48,22 +48,21 @@ import (
 // 2. Capacity Partitioning: Global and per-band capacity limits must be uniformly partitioned across all Active
 //    shards.
 type FlowRegistry interface {
-	FlowRegistryClient
-	FlowRegistryAdmin
+	FlowRegistryObserver
+	FlowRegistryDataPlane
 }
 
-// FlowRegistryAdmin defines the administrative interface for the global control plane.
-type FlowRegistryAdmin interface {
-	// Stats returns globally aggregated statistics for the entire `FlowRegistry`.
+// FlowRegistryObserver defines the read-only, observation interface for the registry.
+type FlowRegistryObserver interface {
+	// Stats returns a near-consistent snapshot of globally aggregated statistics for the entire `FlowRegistry`.
 	Stats() AggregateStats
 
-	// ShardStats returns a slice of statistics, one for each internal shard.
+	// ShardStats returns a near-consistent slice of statistics snapshots, one for each `RegistryShard`.
 	ShardStats() []ShardStats
 }
 
-// FlowRegistryClient defines the primary, client-facing interface for the registry.
-// This is the interface that the `controller.FlowController`'s data path depends upon.
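Before the data-plane interface replacement continues below, a note on the priority re-typing in these contracts: levels are now signed integers, and iteration flips to descending order, so the highest numeric value is the highest priority. A minimal sketch of that ordering:

package main

import (
	"fmt"
	"sort"
)

func main() {
	levels := []int{0, 10, -5, 100} // negative levels are now representable
	sort.Sort(sort.Reverse(sort.IntSlice(levels)))
	fmt.Println(levels) // [100 10 0 -5]: dispatch visits the most important band first
}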
-type FlowRegistryClient interface { +// FlowRegistryDataPlane defines the high-throughput, request-path interface for the registry. +type FlowRegistryDataPlane interface { // WithConnection manages a scoped, leased session for a given flow. // It is the primary and sole entry point for interacting with the data path. // @@ -90,9 +89,8 @@ type FlowRegistryClient interface { // Its purpose is to ensure that any interaction with the flow's state (e.g., accessing its shards and queues) occurs // safely while the flow is guaranteed to be protected from garbage collection. type ActiveFlowConnection interface { - // Shards returns a stable snapshot of accessors for all internal state shards (both Active and Draining). - // Consumers MUST check `RegistryShard.IsActive()` before routing new work to a shard from this slice. - Shards() []RegistryShard + // ActiveShards returns a stable snapshot of accessors for all Active internal state shards. + ActiveShards() []RegistryShard } // RegistryShard defines the interface for a single slice (shard) of the `FlowRegistry`'s state. @@ -124,22 +122,22 @@ type RegistryShard interface { // InterFlowDispatchPolicy retrieves a priority band's configured `framework.InterFlowDispatchPolicy` for this shard. // The registry guarantees that a non-nil default policy is returned if none is configured for the band. // Returns an error wrapping `ErrPriorityBandNotFound` if the priority level is not configured. - InterFlowDispatchPolicy(priority uint) (framework.InterFlowDispatchPolicy, error) + InterFlowDispatchPolicy(priority int) (framework.InterFlowDispatchPolicy, error) // PriorityBandAccessor retrieves a read-only accessor for a given priority level, providing a view of the band's // state as seen by this specific shard. This is the primary entry point for inter-flow dispatch policies that need to // inspect and compare multiple flow queues within the same priority band. // Returns an error wrapping `ErrPriorityBandNotFound` if the priority level is not configured. - PriorityBandAccessor(priority uint) (framework.PriorityBandAccessor, error) + PriorityBandAccessor(priority int) (framework.PriorityBandAccessor, error) - // AllOrderedPriorityLevels returns all configured priority levels that this shard is aware of, sorted in ascending - // numerical order. This order corresponds to highest priority (lowest numeric value) to lowest priority (highest + // AllOrderedPriorityLevels returns all configured priority levels that this shard is aware of, sorted in descending + // numerical order. This order corresponds to highest priority (highest numeric value) to lowest priority (lowest // numeric value). // The returned slice provides a definitive, ordered list of priority levels for iteration, for example, by a // `controller.FlowController` worker's dispatch loop. - AllOrderedPriorityLevels() []uint + AllOrderedPriorityLevels() []int - // Stats returns a snapshot of the statistics for this specific shard. + // Stats returns a near consistent snapshot of the shard's state. Stats() ShardStats } @@ -162,6 +160,7 @@ type ManagedQueue interface { } // AggregateStats holds globally aggregated statistics for the entire `FlowRegistry`. +// It is a read-only data object representing a near-consistent snapshot of the registry's state. type AggregateStats struct { // TotalCapacityBytes is the globally configured maximum total byte size limit across all priority bands and shards. 
TotalCapacityBytes uint64 @@ -170,11 +169,18 @@ type AggregateStats struct { // TotalLen is the total number of items currently queued across the entire system. TotalLen uint64 // PerPriorityBandStats maps each configured priority level to its globally aggregated statistics. - PerPriorityBandStats map[uint]PriorityBandStats + PerPriorityBandStats map[int]PriorityBandStats } -// ShardStats holds statistics for a single internal shard within the `FlowRegistry`. +// ShardStats holds statistics and identifying information for a `RegistryShard` within the `FlowRegistry`. +// It is a read-only data object representing a near-consistent snapshot of the shard's state. type ShardStats struct { + // ID is the unique, stable identifier for this shard. + ID string + // IsActive indicates if the shard was accepting new work at the time this stats snapshot was generated. + // A value of `false` means the shard is in the process of being gracefully drained. + // Due to the concurrent nature of the system, this state could change immediately after the snapshot is taken. + IsActive bool // TotalCapacityBytes is the optional, maximum total byte size limit aggregated across all priority bands within this // shard. Its value represents the globally configured limit for the `FlowRegistry` partitioned for this shard. // The `controller.FlowController` enforces this limit in addition to any per-band capacity limits. @@ -188,13 +194,14 @@ type ShardStats struct { // The capacity values within represent this shard's partition of the global band capacity. // The key is the numerical priority level. // All configured priority levels are guaranteed to be represented. - PerPriorityBandStats map[uint]PriorityBandStats + PerPriorityBandStats map[int]PriorityBandStats } // PriorityBandStats holds aggregated statistics for a single priority band. +// It is a read-only data object representing a near-consistent snapshot of the priority band's state. type PriorityBandStats struct { // Priority is the numerical priority level this struct describes. - Priority uint + Priority int // PriorityName is a human-readable name for the priority band (e.g., "Critical", "Sheddable"). // The registry configuration requires this field, so it is guaranteed to be non-empty. PriorityName string diff --git a/pkg/epp/flowcontrol/contracts/saturationdetector.go b/pkg/epp/flowcontrol/contracts/saturationdetector.go index 91d2406c5..15037d50a 100644 --- a/pkg/epp/flowcontrol/contracts/saturationdetector.go +++ b/pkg/epp/flowcontrol/contracts/saturationdetector.go @@ -16,7 +16,11 @@ limitations under the License. package contracts -import "context" +import ( + "context" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" +) // SaturationDetector defines the contract for a component that provides real-time load signals to the // `controller.FlowController`. @@ -32,8 +36,8 @@ import "context" // // Implementations MUST be goroutine-safe. type SaturationDetector interface { - // IsSaturated returns true if the system's backend resources are considered saturated. + // IsSaturated returns true if the system's backend resources are considered saturated for a set of candidate pods. // `controller.FlowController`'s dispatch workers call this method to decide whether to pause or throttle dispatch // operations to prevent overwhelming the backends. 
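The signature swap below threads the candidate pods into the saturation check. A self-contained sketch, with toy types mirroring the MockSaturationDetector updated earlier in this diff, of how a stub can key off the candidates:

package main

import (
	"context"
	"fmt"
)

type podMetrics interface{ GetPod() any } // toy stand-in for the real backend metrics type

type mockDetector struct {
	isSaturatedFunc func(ctx context.Context, candidatePods []podMetrics) bool
}

func (m *mockDetector) IsSaturated(ctx context.Context, candidatePods []podMetrics) bool {
	if m.isSaturatedFunc != nil {
		return m.isSaturatedFunc(ctx, candidatePods)
	}
	return false // stub default: never saturated
}

func main() {
	det := &mockDetector{isSaturatedFunc: func(_ context.Context, pods []podMetrics) bool {
		return len(pods) == 0 // toy policy: saturated when no viable candidates remain
	}}
	fmt.Println(det.IsSaturated(context.Background(), nil)) // true
}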
- IsSaturated(ctx context.Context) bool + IsSaturated(ctx context.Context, candidatePods []metrics.PodMetrics) bool } diff --git a/pkg/epp/flowcontrol/controller/config.go b/pkg/epp/flowcontrol/controller/config.go new file mode 100644 index 000000000..e542c4d6b --- /dev/null +++ b/pkg/epp/flowcontrol/controller/config.go @@ -0,0 +1,102 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "fmt" + "time" +) + +const ( + // defaultExpiryCleanupInterval is the default frequency for scanning for expired items. + defaultExpiryCleanupInterval = 1 * time.Second + // defaultProcessorReconciliationInterval is the default frequency for the supervisor loop. + defaultProcessorReconciliationInterval = 5 * time.Second + // defaultEnqueueChannelBufferSize is the default size of a worker's incoming request buffer. + defaultEnqueueChannelBufferSize = 100 +) + +// Config holds the configuration for the `FlowController`. +type Config struct { + // DefaultRequestTTL is the default Time-To-Live applied to requests that do not + // specify their own TTL hint. + // Optional: If zero, no TTL is applied by default and we rely solely on request context cancellation. + DefaultRequestTTL time.Duration + + // ExpiryCleanupInterval is the interval at which each shard processor scans its queues for expired items. + // Optional: Defaults to `defaultExpiryCleanupInterval` (1 second). + ExpiryCleanupInterval time.Duration + + // ProcessorReconciliationInterval is the frequency at which the `FlowController`'s supervisor loop garbage collects + // stale workers. + // Optional: Defaults to `defaultProcessorReconciliationInterval` (5 seconds). + ProcessorReconciliationInterval time.Duration + + // EnqueueChannelBufferSize is the size of the buffered channel that accepts incoming requests for each shard + // processor. This buffer acts as a shock absorber, decoupling the high-frequency distributor from the processor's + // serial execution loop and allowing the system to handle short bursts of traffic without blocking. + // Optional: Defaults to `defaultEnqueueChannelBufferSize` (100). + EnqueueChannelBufferSize int +} + +// ValidateAndApplyDefaults checks the global configuration for validity and then creates a new `Config` object, +// populating any empty fields with system defaults. +// It does not mutate the receiver. 
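+//
+// A minimal usage sketch (illustrative only; the caller and values are hypothetical):
+//
+//	cfg, err := (&Config{DefaultRequestTTL: 30 * time.Second}).ValidateAndApplyDefaults()
+//	if err != nil {
+//		return err // A negative duration or buffer size was supplied.
+//	}
+//	// cfg.ExpiryCleanupInterval, cfg.ProcessorReconciliationInterval, and cfg.EnqueueChannelBufferSize
+//	// now carry the system defaults; the original receiver is unchanged.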
+func (c *Config) ValidateAndApplyDefaults() (*Config, error) { + cfg := c.deepCopy() + + // --- Validation --- + if cfg.DefaultRequestTTL < 0 { + return nil, fmt.Errorf("DefaultRequestTTL cannot be negative, but got %v", cfg.DefaultRequestTTL) + } + if cfg.ExpiryCleanupInterval < 0 { + return nil, fmt.Errorf("ExpiryCleanupInterval cannot be negative, but got %v", cfg.ExpiryCleanupInterval) + } + if cfg.ProcessorReconciliationInterval < 0 { + return nil, fmt.Errorf("ProcessorReconciliationInterval cannot be negative, but got %v", + cfg.ProcessorReconciliationInterval) + } + if cfg.EnqueueChannelBufferSize < 0 { + return nil, fmt.Errorf("EnqueueChannelBufferSize cannot be negative, but got %d", cfg.EnqueueChannelBufferSize) + } + + // --- Defaulting --- + if cfg.ExpiryCleanupInterval == 0 { + cfg.ExpiryCleanupInterval = defaultExpiryCleanupInterval + } + if cfg.ProcessorReconciliationInterval == 0 { + cfg.ProcessorReconciliationInterval = defaultProcessorReconciliationInterval + } + if cfg.EnqueueChannelBufferSize == 0 { + cfg.EnqueueChannelBufferSize = defaultEnqueueChannelBufferSize + } + return cfg, nil +} + +// deepCopy creates a deep copy of the `Config` object. +func (c *Config) deepCopy() *Config { + if c == nil { + return nil + } + newCfg := &Config{ + DefaultRequestTTL: c.DefaultRequestTTL, + ExpiryCleanupInterval: c.ExpiryCleanupInterval, + ProcessorReconciliationInterval: c.ProcessorReconciliationInterval, + EnqueueChannelBufferSize: c.EnqueueChannelBufferSize, + } + return newCfg +} diff --git a/pkg/epp/flowcontrol/controller/config_test.go b/pkg/epp/flowcontrol/controller/config_test.go new file mode 100644 index 000000000..710df9fa7 --- /dev/null +++ b/pkg/epp/flowcontrol/controller/config_test.go @@ -0,0 +1,135 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controller + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestConfig_ValidateAndApplyDefaults(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + input Config + expectErr bool + expectedCfg Config + shouldDefault bool + }{ + { + name: "ValidConfig_NoChanges", + input: Config{ + DefaultRequestTTL: 10 * time.Second, + ExpiryCleanupInterval: 2 * time.Second, + ProcessorReconciliationInterval: 10 * time.Second, + EnqueueChannelBufferSize: 200, + }, + expectErr: false, + expectedCfg: Config{ + DefaultRequestTTL: 10 * time.Second, + ExpiryCleanupInterval: 2 * time.Second, + ProcessorReconciliationInterval: 10 * time.Second, + EnqueueChannelBufferSize: 200, + }, + }, + { + name: "EmptyConfig_ShouldApplyDefaults", + input: Config{}, + expectErr: false, + expectedCfg: Config{ + DefaultRequestTTL: 0, + ExpiryCleanupInterval: defaultExpiryCleanupInterval, + ProcessorReconciliationInterval: defaultProcessorReconciliationInterval, + EnqueueChannelBufferSize: defaultEnqueueChannelBufferSize, + }, + shouldDefault: true, + }, + { + name: "NegativeDefaultRequestTTL_Invalid", + input: Config{DefaultRequestTTL: -1}, + expectErr: true, + }, + { + name: "NegativeExpiryCleanupInterval_Invalid", + input: Config{ExpiryCleanupInterval: -1}, + expectErr: true, + }, + { + name: "NegativeProcessorReconciliationInterval_Invalid", + input: Config{ProcessorReconciliationInterval: -1}, + expectErr: true, + }, + { + name: "NegativeEnqueueChannelBufferSize_Invalid", + input: Config{EnqueueChannelBufferSize: -1}, + expectErr: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + originalInput := tc.input.deepCopy() + validatedCfg, err := tc.input.ValidateAndApplyDefaults() + + if tc.expectErr { + require.Error(t, err, "expected an error but got nil") + assert.Nil(t, validatedCfg, "validatedCfg should be nil on error") + } else { + require.NoError(t, err, "expected no error but got: %v", err) + require.NotNil(t, validatedCfg, "validatedCfg should not be nil on success") + assert.Equal(t, tc.expectedCfg, *validatedCfg, "validatedCfg should match expected config") + + // Ensure the original config is not mutated. + assert.Equal(t, *originalInput, tc.input, "input config should not be mutated") + } + }) + } +} + +func TestConfig_DeepCopy(t *testing.T) { + t.Parallel() + + t.Run("ShouldReturnNil_ForNilReceiver", func(t *testing.T) { + t.Parallel() + var nilConfig *Config + assert.Nil(t, nilConfig.deepCopy(), "Deep copy of a nil config should be nil") + }) + + t.Run("ShouldCreateIdenticalButSeparateObject", func(t *testing.T) { + t.Parallel() + original := &Config{ + DefaultRequestTTL: 1 * time.Second, + ExpiryCleanupInterval: 2 * time.Second, + ProcessorReconciliationInterval: 3 * time.Second, + EnqueueChannelBufferSize: 4, + } + clone := original.deepCopy() + + require.NotSame(t, original, clone, "Clone should be a new object in memory") + assert.Equal(t, *original, *clone, "Cloned object should have identical values") + + // Modify the clone and ensure the original is unchanged. 
+ clone.DefaultRequestTTL = 99 * time.Second + assert.NotEqual(t, original.DefaultRequestTTL, clone.DefaultRequestTTL, + "Original should not be mutated after clone is changed") + }) +} diff --git a/pkg/epp/flowcontrol/controller/controller.go b/pkg/epp/flowcontrol/controller/controller.go new file mode 100644 index 000000000..a11ef26a5 --- /dev/null +++ b/pkg/epp/flowcontrol/controller/controller.go @@ -0,0 +1,516 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package controller contains the implementation of the FlowController engine. +// +// The FlowController is the central processing engine of the Flow Control layer. It is a sharded, high-throughput +// component responsible for managing the lifecycle of all incoming requests. It achieves this by acting as a stateless +// supervisor that orchestrates a pool of stateful workers (ShardProcessors), distributing incoming requests among them. +package controller + +import ( + "cmp" + "context" + "errors" + "fmt" + "slices" + "strconv" + "sync" + "time" + + "github.com/go-logr/logr" + k8srand "k8s.io/apimachinery/pkg/util/rand" + "k8s.io/utils/clock" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/controller/internal" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +// registryClient defines the minimal interface that the FlowController needs to interact with the FlowRegistry. +type registryClient interface { + contracts.FlowRegistryObserver + contracts.FlowRegistryDataPlane +} + +// shardProcessor is the minimal internal interface that the FlowController requires from its workers. +type shardProcessor interface { + Run(ctx context.Context) + Submit(item *internal.FlowItem) error + SubmitOrBlock(ctx context.Context, item *internal.FlowItem) error +} + +// shardProcessorFactory defines the signature for creating a shardProcessor. +type shardProcessorFactory func( + ctx context.Context, + shard contracts.RegistryShard, + saturationDetector contracts.SaturationDetector, + clock clock.WithTicker, + cleanupSweepInterval time.Duration, + enqueueChannelBufferSize int, + logger logr.Logger, +) shardProcessor + +var _ shardProcessor = &internal.ShardProcessor{} + +// managedWorker holds the state for a single supervised worker. +type managedWorker struct { + processor shardProcessor + // cancel function for the worker-specific context. Used during shutdown and GC. + cancel context.CancelFunc +} + +// FlowController is the central, high-throughput engine of the Flow Control layer. +// It is designed as a stateless distributor that orchestrates a pool of stateful workers (ShardProcessor), following a +// supervisor-worker pattern. 
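+//
+// A minimal usage sketch (illustrative; registry, detector, and request wiring are elided):
+//
+//	fc, err := NewFlowController(ctx, Config{}, registry, detector, logger)
+//	if err != nil {
+//		return err
+//	}
+//	outcome, err := fc.EnqueueAndWait(reqCtx, req) // Blocks until the request reaches a terminal outcome.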
+// +// The controller's run loop executes periodically, acting as a garbage collector that keeps the pool of running +// workers synchronized with the dynamic shard topology of the FlowRegistry. +// +// Request Lifecycle Management: +// +// 1. Asynchronous Finalization (Controller-Owned): The Controller actively monitors the request Context +// (TTL/Cancellation) in EnqueueAndWait. If the Context expires, the Controller immediately Finalizes the item and +// unblocks the caller. +// 2. Synchronous Finalization (Processor-Owned): The Processor handles Dispatch, Capacity Rejection, and Shutdown. +// 3. Cleanup (Processor-Owned): The Processor periodically sweeps externally finalized items to reclaim capacity. +type FlowController struct { + // --- Immutable dependencies (set at construction) --- + + config Config + registry registryClient + saturationDetector contracts.SaturationDetector + clock clock.WithTicker + logger logr.Logger + shardProcessorFactory shardProcessorFactory + + // --- Lifecycle state --- + + // parentCtx is the root context for the controller's lifecycle, established when NewFlowController is called. + // It is the parent for all long-lived worker goroutines. + parentCtx context.Context + + // --- Concurrent state --- + + // workers is a highly concurrent map storing the managedWorker for each shard. + // It is the controller's source of truth for the worker pool. + workers sync.Map // key: shard ID (string); value: *managedWorker + + // wg waits for all worker goroutines to terminate during shutdown. + wg sync.WaitGroup +} + +// flowControllerOption is a function that applies a configuration change. +// test-only +type flowControllerOption func(*FlowController) + +// NewFlowController creates and starts a new FlowController instance. +// The provided context governs the lifecycle of the controller and all its workers. +func NewFlowController( + ctx context.Context, + config Config, + registry contracts.FlowRegistry, + sd contracts.SaturationDetector, + logger logr.Logger, + opts ...flowControllerOption, +) (*FlowController, error) { + fc := &FlowController{ + config: config, + registry: registry, + saturationDetector: sd, + clock: clock.RealClock{}, + logger: logger.WithName("flow-controller"), + parentCtx: ctx, + } + + fc.shardProcessorFactory = func( + ctx context.Context, + shard contracts.RegistryShard, + saturationDetector contracts.SaturationDetector, + clock clock.WithTicker, + cleanupSweepInterval time.Duration, + enqueueChannelBufferSize int, + logger logr.Logger, + ) shardProcessor { + return internal.NewShardProcessor( + ctx, + shard, + saturationDetector, + clock, + cleanupSweepInterval, + enqueueChannelBufferSize, + logger) + } + + for _, opt := range opts { + opt(fc) + } + + go fc.run(ctx) + return fc, nil +} + +// run starts the FlowController's main reconciliation loop (supervisor loop). +// This loop is responsible for garbage collecting workers whose shards no longer exist in the registry. +// This method blocks until the provided context is cancelled and all worker goroutines have fully terminated. 
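+// run is started in its own goroutine by NewFlowController and is not intended to be called directly.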
+func (fc *FlowController) run(ctx context.Context) { + fc.logger.Info("Starting FlowController reconciliation loop.") + defer fc.logger.Info("FlowController reconciliation loop stopped.") + + ticker := fc.clock.NewTicker(fc.config.ProcessorReconciliationInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + fc.shutdown() + return + case <-ticker.C(): + fc.reconcileProcessors() + } + } +} + +// EnqueueAndWait is the primary, synchronous entry point to the Flow Control system. It submits a request and blocks +// until the request reaches a terminal outcome (dispatched, rejected, or evicted). +// +// # Design Rationale: The Synchronous Model +// +// This blocking model is deliberately chosen for its simplicity and robustness, especially in the context of Envoy +// External Processing (ext_proc), which operates on a stream-based protocol. +// +// - ext_proc Alignment: A single goroutine typically manages the stream for a given HTTP request. +// EnqueueAndWait fits this perfectly: the request-handling goroutine calls it, blocks, and upon return, has a +// definitive outcome to act upon. +// - Simplified State Management: The state of a "waiting" request is implicitly managed by the blocked goroutine's +// stack and its Context. The system only needs to signal this specific goroutine to unblock it. +// - Direct Backpressure: If queues are full, EnqueueAndWait returns an error immediately, providing direct +// backpressure to the caller. +func (fc *FlowController) EnqueueAndWait( + ctx context.Context, + req types.FlowControlRequest, +) (types.QueueOutcome, error) { + flowKey := req.FlowKey() + fairnessID := flowKey.ID + priority := strconv.Itoa(flowKey.Priority) + metrics.IncFlowControlQueueSize(fairnessID, priority) + defer metrics.DecFlowControlQueueSize(fairnessID, priority) + + // 1. Create the derived context that governs this request's lifecycle (Parent Cancellation + TTL). + reqCtx, cancel, enqueueTime := fc.createRequestContext(ctx, req) + defer cancel() + + // 2. Enter the distribution loop to find a home for the request. + // This loop is responsible for retrying on ErrShardDraining. + for { + select { // Non-blocking check on controller lifecycle. + case <-fc.parentCtx.Done(): + return types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, types.ErrFlowControllerNotRunning) + default: + } + + // Attempt to distribute the request once. + item, err := fc.tryDistribution(reqCtx, req, enqueueTime) + if err != nil { + // Distribution failed terminally (e.g., no shards, context cancelled during blocking submit). + // The item has already been finalized by tryDistribution. + finalState := item.FinalState() + return finalState.Outcome, finalState.Err + } + + // Distribution was successful; ownership of the item has been transferred to a processor. + // Now, we block here in awaitFinalization until the request is finalized by either the processor (e.g., dispatched, + // rejected) or the controller itself (e.g., caller's context cancelled/TTL expired). + outcome, err := fc.awaitFinalization(reqCtx, item) + if errors.Is(err, contracts.ErrShardDraining) { + // This is a benign race condition where the chosen shard started draining after acceptance. 
+			fc.logger.V(logutil.DEBUG).Info("Selected shard is Draining, retrying request distribution",
+				"flowKey", req.FlowKey(), "requestID", req.ID())
+			// Introduce a small, randomized delay (1-10ms) to prevent tight spinning loops and thundering herds during retry
+			// scenarios (e.g., shard draining).
+			// TODO: Replace this with a more sophisticated backoff strategy when our data parallelism story matures.
+			// For now, this is more than sufficient.
+			jitterMs := k8srand.Intn(10) + 1
+			fc.clock.Sleep(time.Duration(jitterMs) * time.Millisecond)
+			continue
+		}
+
+		// The outcome is terminal (Dispatched, Evicted, or a non-retriable rejection).
+		return outcome, err
+	}
+}
+
+var errNoShards = errors.New("no viable active shards available")
+
+// tryDistribution handles a single attempt to select a shard and submit a request.
+// If this function returns an error, it guarantees that the provided `item` has been finalized.
+func (fc *FlowController) tryDistribution(
+	reqCtx context.Context,
+	req types.FlowControlRequest,
+	enqueueTime time.Time,
+) (*internal.FlowItem, error) {
+	// Calculate effective TTL for item initialization (reqCtx is the enforcement mechanism).
+	effectiveTTL := fc.config.DefaultRequestTTL
+	if deadline, ok := reqCtx.Deadline(); ok {
+		if ttl := deadline.Sub(enqueueTime); ttl > 0 {
+			effectiveTTL = ttl
+		}
+	}
+
+	// We must create a fresh FlowItem on each attempt as finalization is per-lifecycle.
+	item := internal.NewItem(req, effectiveTTL, enqueueTime)
+
+	candidates, err := fc.selectDistributionCandidates(item.OriginalRequest().FlowKey())
+	if err != nil {
+		outcome := types.QueueOutcomeRejectedOther
+		if errors.Is(err, errNoShards) {
+			outcome = types.QueueOutcomeRejectedCapacity
+		}
+		finalErr := fmt.Errorf("%w: request not accepted: %w", types.ErrRejected, err)
+		item.FinalizeWithOutcome(outcome, finalErr)
+		return item, finalErr
+	}
+
+	outcome, err := fc.distributeRequest(reqCtx, item, candidates)
+	if err == nil {
+		// Success: Ownership of the item has been transferred to the processor.
+		return item, nil
+	}
+
+	// For any distribution error, the controller retains ownership and must finalize the item.
+	var finalErr error
+	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+		// We propagate the original context error here; EnqueueAndWait will rely on item.FinalState().Err.
+		finalErr = err
+		item.Finalize(context.Cause(reqCtx))
+	} else { // e.g., a terminal rejection such as processor shutdown.
+		finalErr = fmt.Errorf("%w: request not accepted: %w", types.ErrRejected, err)
+		item.FinalizeWithOutcome(outcome, finalErr)
+	}
+	return item, finalErr
+}
+
+// awaitFinalization blocks until an item is finalized, either by the processor (synchronously) or by the controller
+// itself due to context expiry (asynchronously).
+func (fc *FlowController) awaitFinalization(
+	reqCtx context.Context,
+	item *internal.FlowItem,
+) (types.QueueOutcome, error) {
+	select {
+	case <-reqCtx.Done():
+		// Asynchronous Finalization (Controller-initiated):
+		// The request Context expired (Cancellation/TTL) while the item was being processed.
+		cause := context.Cause(reqCtx)
+		item.Finalize(cause)
+
+		// The processor will eventually discard this "zombie" item during its cleanup sweep.
+		finalState := item.FinalState()
+		return finalState.Outcome, finalState.Err
+
+	case finalState := <-item.Done():
+		// Synchronous Finalization (Processor-initiated):
+		// The processor finalized the item (Dispatch, Reject, Shutdown).
+		return finalState.Outcome, finalState.Err
+	}
+}
+
+// createRequestContext derives the context that governs a request's lifecycle, enforcing the TTL deadline.
+func (fc *FlowController) createRequestContext(
+	ctx context.Context,
+	req types.FlowControlRequest,
+) (context.Context, context.CancelFunc, time.Time) {
+	enqueueTime := fc.clock.Now()
+	effectiveTTL := req.InitialEffectiveTTL()
+	if effectiveTTL <= 0 {
+		effectiveTTL = fc.config.DefaultRequestTTL
+	}
+
+	if effectiveTTL > 0 {
+		reqCtx, cancel := context.WithDeadlineCause(ctx, enqueueTime.Add(effectiveTTL), types.ErrTTLExpired)
+		return reqCtx, cancel, enqueueTime
+	}
+	reqCtx, cancel := context.WithCancel(ctx)
+	return reqCtx, cancel, enqueueTime
+}
+
+// candidate holds the information needed to evaluate a shard as a potential target for a request.
+type candidate struct {
+	processor shardProcessor
+	shardID   string
+	byteSize  uint64
+}
+
+// selectDistributionCandidates identifies all Active shards for the item's flow and ranks them by the current byte size
+// of that flow's queue, from least to most loaded.
+func (fc *FlowController) selectDistributionCandidates(key types.FlowKey) ([]candidate, error) {
+	var candidates []candidate
+
+	// Acquire a connection to the registry for the flow key. This ensures a consistent view of the ActiveShards for the
+	// duration of the shard selection process, preventing races with concurrent shard topology changes.
+	err := fc.registry.WithConnection(key, func(conn contracts.ActiveFlowConnection) error {
+		shards := conn.ActiveShards()
+		candidates = make([]candidate, 0, len(shards))
+		for _, shard := range shards {
+			worker := fc.getOrStartWorker(shard)
+			mq, err := shard.ManagedQueue(key)
+			if err != nil {
+				fc.logger.Error(err,
+					"Invariant violation. Failed to get ManagedQueue for a leased flow on an Active shard. Skipping shard.",
+					"flowKey", key, "shardID", shard.ID())
+				continue
+			}
+			candidates = append(candidates, candidate{worker.processor, shard.ID(), mq.ByteSize()})
+		}
+		return nil
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to acquire lease for flow %s: %w", key, err)
+	}
+
+	if len(candidates) == 0 {
+		return nil, fmt.Errorf("%w for flow %s", errNoShards, key)
+	}
+
+	slices.SortFunc(candidates, func(a, b candidate) int {
+		return cmp.Compare(a.byteSize, b.byteSize)
+	})
+
+	return candidates, nil
+}
+
+// distributeRequest implements a flow-aware, two-phase "Join-Shortest-Queue-by-Bytes" (JSQ-Bytes) distribution strategy
+// with graceful backpressure. It attempts to submit an item to the best-ranked candidate from the provided list.
+//
+// The algorithm operates as follows:
+//  1. Phase 1 (Non-blocking Fast Failover): It iterates through the ranked candidates and attempts a non-blocking
+//     submission. The first successful submission wins.
+//  2. Phase 2 (Blocking Fallback): If all non-blocking attempts fail, it performs a single blocking submission to the
+//     least-loaded candidate, providing backpressure.
+//
+// The provided context (ctx) is used for the blocking submission phase (SubmitOrBlock).
+//
+// Ownership Contract:
+//   - Returns nil: Success. Ownership transferred to Processor.
+//   - Returns error: Failure (Context expiry, shutdown, etc.).
+//     Ownership retained by Controller. The Controller MUST finalize the item.
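+//
+// Worked example (illustrative): given candidates ranked [shard-B (100 B), shard-A (1000 B)], Phase 1 tries a
+// non-blocking Submit on shard-B and then shard-A; if both are busy, Phase 2 makes a single blocking SubmitOrBlock
+// call on shard-B, the least-loaded candidate.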
+func (fc *FlowController) distributeRequest(
+	ctx context.Context,
+	item *internal.FlowItem,
+	candidates []candidate,
+) (types.QueueOutcome, error) {
+	reqID := item.OriginalRequest().ID()
+	for _, c := range candidates {
+		if err := c.processor.Submit(item); err == nil {
+			return types.QueueOutcomeNotYetFinalized, nil
+		}
+		fc.logger.V(logutil.TRACE).Info("Processor busy during fast failover, trying next candidate",
+			"shardID", c.shardID, "requestID", reqID)
+	}
+
+	// All processors are busy. Attempt a single blocking submission to the least-loaded candidate.
+	bestCandidate := candidates[0]
+	fc.logger.V(logutil.TRACE).Info("All processors busy, attempting blocking submit to best candidate",
+		"shardID", bestCandidate.shardID, "requestID", reqID)
+	err := bestCandidate.processor.SubmitOrBlock(ctx, item)
+	if err != nil {
+		return types.QueueOutcomeRejectedOther, fmt.Errorf("%w: request not accepted: %w", types.ErrRejected, err)
+	}
+	return types.QueueOutcomeNotYetFinalized, nil // Success, ownership transferred.
+}
+
+// getOrStartWorker implements the lazy-loading and startup of shard processors.
+// It ensures that exactly one worker goroutine is started for each shard, using atomic operations
+// (sync.Map.LoadOrStore). The worker's processor goroutine is only started after it has successfully been registered,
+// preventing race conditions where multiple goroutines create and start the same worker.
+func (fc *FlowController) getOrStartWorker(shard contracts.RegistryShard) *managedWorker {
+	if w, ok := fc.workers.Load(shard.ID()); ok {
+		return w.(*managedWorker)
+	}
+
+	// Construct a new worker, but do not start its goroutine yet.
+	processorCtx, cancel := context.WithCancel(fc.parentCtx)
+	processor := fc.shardProcessorFactory(
+		processorCtx,
+		shard,
+		fc.saturationDetector,
+		fc.clock,
+		fc.config.ExpiryCleanupInterval,
+		fc.config.EnqueueChannelBufferSize,
+		fc.logger.WithValues("shardID", shard.ID()),
+	)
+	newWorker := &managedWorker{
+		processor: processor,
+		cancel:    cancel,
+	}
+
+	// Atomically load or store. This is the critical synchronization step.
+	actual, loaded := fc.workers.LoadOrStore(shard.ID(), newWorker)
+	if loaded {
+		// Another goroutine beat us to it. The `newWorker` we created was not stored.
+		// We must cancel the context we created to prevent a leak.
+		cancel()
+		return actual.(*managedWorker)
+	}
+
+	// We won the race. The newWorker was stored. Now, start the processor's long-running goroutine.
+	fc.logger.V(logutil.DEFAULT).Info("Starting new ShardProcessor worker.", "shardID", shard.ID())
+	fc.wg.Add(1)
+	go func() {
+		defer fc.wg.Done()
+		processor.Run(processorCtx)
+	}()
+
+	return newWorker
+}
+
+// reconcileProcessors is the supervisor's core garbage collection loop.
+// It identifies and stops workers whose corresponding shards have been removed from the registry.
+func (fc *FlowController) reconcileProcessors() {
+	stats := fc.registry.ShardStats()
+	shards := make(map[string]struct{}, len(stats)) // The set of shard IDs currently known to the registry.
+	for _, s := range stats {
+		shards[s.ID] = struct{}{}
+	}
+
+	fc.workers.Range(func(key, value any) bool {
+		shardID := key.(string)
+		worker := value.(*managedWorker)
+		if _, exists := shards[shardID]; !exists {
+			fc.logger.V(logutil.DEFAULT).Info("Stale worker detected for GC'd shard, initiating shutdown.",
+				"shardID", shardID)
+			worker.cancel() // Cancel the worker's context, initiating the Processor's graceful shutdown sequence.
+ fc.workers.Delete(shardID) // Delete from the map so no new requests are routed to it. + } + return true + }) +} + +// shutdown gracefully terminates all running `shardProcessor` goroutines. +// It signals all workers to stop and waits for them to complete their shutdown procedures. +func (fc *FlowController) shutdown() { + fc.logger.Info("Shutting down FlowController and all shard processors.") + fc.workers.Range(func(key, value any) bool { + shardID := key.(string) + worker := value.(*managedWorker) + fc.logger.V(logutil.VERBOSE).Info("Sending shutdown signal to processor", "shardID", shardID) + worker.cancel() + return true + }) + fc.wg.Wait() + fc.logger.Info("All shard processors have shut down.") +} diff --git a/pkg/epp/flowcontrol/controller/controller_test.go b/pkg/epp/flowcontrol/controller/controller_test.go new file mode 100644 index 000000000..917a5a1e5 --- /dev/null +++ b/pkg/epp/flowcontrol/controller/controller_test.go @@ -0,0 +1,1303 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Note on Time-Based Lifecycle Tests: +// Tests validating the controller's handling of request TTLs (e.g., OnReqCtxTimeout*) rely on real-time timers +// (context.WithDeadline). The injected testclock.FakeClock is used to control the timing of internal loops (like +// reconciliation), but it cannot manipulate the timers used by the standard context package. Therefore, these specific +// tests use time.Sleep or assertions on real-time durations. + +package controller + +import ( + "context" + "errors" + "fmt" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/go-logr/logr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "k8s.io/utils/clock" + testclock "k8s.io/utils/clock/testing" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts/mocks" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/controller/internal" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/framework" + frameworkmocks "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/framework/mocks" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types" + typesmocks "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types/mocks" +) + +// --- Test Harness & Fixtures --- + +// withClock returns a test-only option to inject a clock. +// test-only +func withClock(c clock.WithTicker) flowControllerOption { + return func(fc *FlowController) { + fc.clock = c + } +} + +// withRegistryClient returns a test-only option to inject a mock or fake registry client. +// test-only +func withRegistryClient(client registryClient) flowControllerOption { + return func(fc *FlowController) { + fc.registry = client + } +} + +// withShardProcessorFactory returns a test-only option to inject a processor factory. 
+// test-only
+func withShardProcessorFactory(factory shardProcessorFactory) flowControllerOption {
+	return func(fc *FlowController) {
+		fc.shardProcessorFactory = factory
+	}
+}
+
+// testHarness holds the `FlowController` and its dependencies under test.
+type testHarness struct {
+	fc  *FlowController
+	cfg Config
+	// clock is the clock interface used by the controller.
+	clock        clock.WithTicker
+	mockRegistry *mockRegistryClient
+	mockDetector *mocks.MockSaturationDetector
+	// mockClock provides access to FakeClock methods (Step, HasWaiters) if and only if the underlying clock is a
+	// FakeClock.
+	mockClock            *testclock.FakeClock
+	mockProcessorFactory *mockShardProcessorFactory
+}
+
+// newUnitHarness creates a test environment with a mock processor factory, suitable for focused unit tests of the
+// controller's logic. It starts the controller's run loop using the provided context for lifecycle management.
+func newUnitHarness(t *testing.T, ctx context.Context, cfg Config, registry *mockRegistryClient) *testHarness {
+	t.Helper()
+	mockDetector := &mocks.MockSaturationDetector{}
+
+	// Initialize the FakeClock with the current system time.
+	// The controller implementation uses the injected clock to calculate the deadline timestamp, but uses the standard
+	// context.WithDeadline (which relies on the system clock) to enforce it.
+	// If the FakeClock's time is far from the system time, deadlines calculated based on the FakeClock might already be
+	// expired according to the system clock, causing immediate TTL failures.
+	mockClock := testclock.NewFakeClock(time.Now())
+
+	mockProcessorFactory := &mockShardProcessorFactory{
+		processors: make(map[string]*mockShardProcessor),
+	}
+
+	// Default the registry if nil, simplifying tests that don't focus on registry interaction.
+	if registry == nil {
+		registry = &mockRegistryClient{}
+	}
+
+	opts := []flowControllerOption{
+		withRegistryClient(registry),
+		withClock(mockClock),
+		withShardProcessorFactory(mockProcessorFactory.new),
+	}
+	fc, err := NewFlowController(ctx, cfg, registry, mockDetector, logr.Discard(), opts...)
+	require.NoError(t, err, "failed to create FlowController for unit test harness")
+
+	h := &testHarness{
+		fc:                   fc,
+		cfg:                  cfg,
+		clock:                mockClock,
+		mockRegistry:         registry,
+		mockDetector:         mockDetector,
+		mockClock:            mockClock,
+		mockProcessorFactory: mockProcessorFactory,
+	}
+	return h
+}
+
+// newIntegrationHarness creates a test environment that uses real `ShardProcessor`s, suitable for integration tests
+// validating the controller-processor interaction.
+func newIntegrationHarness(t *testing.T, ctx context.Context, cfg Config, registry *mockRegistryClient) *testHarness {
+	t.Helper()
+	mockDetector := &mocks.MockSaturationDetector{}
+	// Align FakeClock with system time. See explanation in newUnitHarness.
+	mockClock := testclock.NewFakeClock(time.Now())
+	if registry == nil {
+		registry = &mockRegistryClient{}
+	}
+
+	opts := []flowControllerOption{
+		withRegistryClient(registry),
+		withClock(mockClock),
+	}
+	fc, err := NewFlowController(ctx, cfg, registry, mockDetector, logr.Discard(), opts...)
+	require.NoError(t, err, "failed to create FlowController for integration test harness")
+
+	h := &testHarness{
+		fc:           fc,
+		cfg:          cfg,
+		clock:        mockClock,
+		mockRegistry: registry,
+		mockDetector: mockDetector,
+		mockClock:    mockClock,
+	}
+	return h
+}
+
+// mockActiveFlowConnection is a local mock for the `contracts.ActiveFlowConnection` interface.
+type mockActiveFlowConnection struct { + contracts.ActiveFlowConnection + ActiveShardsV []contracts.RegistryShard +} + +func (m *mockActiveFlowConnection) ActiveShards() []contracts.RegistryShard { + return m.ActiveShardsV +} + +// mockRegistryClient is a mock for the private `registryClient` interface. +type mockRegistryClient struct { + contracts.FlowRegistryObserver + contracts.FlowRegistryDataPlane + WithConnectionFunc func(key types.FlowKey, fn func(conn contracts.ActiveFlowConnection) error) error + ShardStatsFunc func() []contracts.ShardStats +} + +func (m *mockRegistryClient) WithConnection( + key types.FlowKey, + fn func(conn contracts.ActiveFlowConnection) error, +) error { + if m.WithConnectionFunc != nil { + return m.WithConnectionFunc(key, fn) + } + return fn(&mockActiveFlowConnection{}) +} + +func (m *mockRegistryClient) ShardStats() []contracts.ShardStats { + if m.ShardStatsFunc != nil { + return m.ShardStatsFunc() + } + return nil +} + +// mockShardProcessor is a mock for the internal `shardProcessor` interface. +type mockShardProcessor struct { + SubmitFunc func(item *internal.FlowItem) error + SubmitOrBlockFunc func(ctx context.Context, item *internal.FlowItem) error + // runCtx captures the context provided to the Run method for lifecycle assertions. + runCtx context.Context + runCtxMu sync.RWMutex + // runStarted is closed when the Run method is called, allowing tests to synchronize with worker startup. + runStarted chan struct{} +} + +func (m *mockShardProcessor) Submit(item *internal.FlowItem) error { + if m.SubmitFunc != nil { + return m.SubmitFunc(item) + } + return nil +} + +func (m *mockShardProcessor) SubmitOrBlock(ctx context.Context, item *internal.FlowItem) error { + if m.SubmitOrBlockFunc != nil { + return m.SubmitOrBlockFunc(ctx, item) + } + return nil +} + +func (m *mockShardProcessor) Run(ctx context.Context) { + m.runCtxMu.Lock() + m.runCtx = ctx + m.runCtxMu.Unlock() + if m.runStarted != nil { + close(m.runStarted) + } + // Block until the context is cancelled, simulating a running worker. + <-ctx.Done() +} + +// Context returns the context captured during the Run method call. +func (m *mockShardProcessor) Context() context.Context { + m.runCtxMu.RLock() + defer m.runCtxMu.RUnlock() + return m.runCtx +} + +// mockShardProcessorFactory allows tests to inject specific `mockShardProcessor` instances. +type mockShardProcessorFactory struct { + mu sync.Mutex + processors map[string]*mockShardProcessor +} + +// new is the factory function conforming to the `shardProcessorFactory` signature. +func (f *mockShardProcessorFactory) new( + _ context.Context, // The factory does not use the lifecycle context; it's passed to the processor's Run method later. + shard contracts.RegistryShard, + _ contracts.SaturationDetector, + _ clock.WithTicker, + _ time.Duration, + _ int, + _ logr.Logger, +) shardProcessor { + f.mu.Lock() + defer f.mu.Unlock() + if proc, ok := f.processors[shard.ID()]; ok { + return proc + } + // Return a default mock processor if one is not explicitly registered by the test. + return &mockShardProcessor{} +} + +// stubManagedQueue is a simple stub for the `contracts.ManagedQueue` interface. 
+type stubManagedQueue struct { + contracts.ManagedQueue + byteSizeV uint64 +} + +func (s *stubManagedQueue) ByteSize() uint64 { return s.byteSizeV } + +func (s *stubManagedQueue) FlowQueueAccessor() framework.FlowQueueAccessor { + return &frameworkmocks.MockFlowQueueAccessor{ByteSizeV: s.byteSizeV} +} + +// mockShardBuilder is a fixture to declaratively build mock `contracts.RegistryShard` for tests. +type mockShardBuilder struct { + id string + byteSize uint64 +} + +func newMockShard(id string) *mockShardBuilder { + return &mockShardBuilder{id: id} +} + +func (b *mockShardBuilder) withByteSize(size uint64) *mockShardBuilder { + b.byteSize = size + return b +} + +func (b *mockShardBuilder) build() contracts.RegistryShard { + return &mocks.MockRegistryShard{ + IDFunc: func() string { return b.id }, + ManagedQueueFunc: func(_ types.FlowKey) (contracts.ManagedQueue, error) { + return &stubManagedQueue{byteSizeV: b.byteSize}, nil + }, + } +} + +var defaultFlowKey = types.FlowKey{ID: "test-flow", Priority: 100} + +func newTestRequest(key types.FlowKey) *typesmocks.MockFlowControlRequest { + return &typesmocks.MockFlowControlRequest{ + FlowKeyV: key, + ByteSizeV: 100, + IDV: "req-" + key.ID, + } +} + +// --- Test Cases --- + +// TestFlowController_EnqueueAndWait covers the primary API entry point, focusing on validation, distribution logic, +// retries, and the request lifecycle (including post-distribution cancellation/timeout). +func TestFlowController_EnqueueAndWait(t *testing.T) { + t.Parallel() + + t.Run("Rejections", func(t *testing.T) { + t.Parallel() + + t.Run("OnReqCtxExpiredBeforeDistribution", func(t *testing.T) { + t.Parallel() + // Test that if the request context provided to EnqueueAndWait is already expired, it returns immediately. + h := newUnitHarness(t, t.Context(), Config{DefaultRequestTTL: 1 * time.Minute}, nil) + + // Configure registry to return a shard. + shardA := newMockShard("shard-A").build() + h.mockRegistry.WithConnectionFunc = func(_ types.FlowKey, fn func(_ contracts.ActiveFlowConnection) error) error { + return fn(&mockActiveFlowConnection{ActiveShardsV: []contracts.RegistryShard{shardA}}) + } + // Configure processor to block until context expiry. + h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{ + SubmitFunc: func(_ *internal.FlowItem) error { return internal.ErrProcessorBusy }, + SubmitOrBlockFunc: func(ctx context.Context, _ *internal.FlowItem) error { + <-ctx.Done() // Wait for the context to be done. + return context.Cause(ctx) // Return the cause. + }, + } + + req := newTestRequest(defaultFlowKey) + // Use a context with a deadline in the past. + reqCtx, cancel := context.WithDeadlineCause( + context.Background(), + h.clock.Now().Add(-1*time.Second), + types.ErrTTLExpired) + defer cancel() + + outcome, err := h.fc.EnqueueAndWait(reqCtx, req) + require.Error(t, err, "EnqueueAndWait must fail if request context deadline is exceeded") + assert.ErrorIs(t, err, types.ErrRejected, "error should wrap ErrRejected") + assert.ErrorIs(t, err, types.ErrTTLExpired, "error should wrap types.ErrTTLExpired from the context cause") + assert.Equal(t, types.QueueOutcomeRejectedOther, outcome, "outcome should be QueueOutcomeRejectedOther") + }) + t.Run("OnControllerShutdown", func(t *testing.T) { + t.Parallel() + // Create a context specifically for the controller's lifecycle. + ctx, cancel := context.WithCancel(t.Context()) + h := newUnitHarness(t, ctx, Config{}, nil) + cancel() // Immediately stop the controller. 
+
+		// Wait for the controller's run loop and all workers (none in this case) to exit.
+		// We need to wait because the shutdown process is asynchronous.
+		h.fc.wg.Wait()
+
+		req := newTestRequest(defaultFlowKey)
+		// The request context is valid, but the controller itself is stopped.
+		outcome, err := h.fc.EnqueueAndWait(context.Background(), req)
+		require.Error(t, err, "EnqueueAndWait must reject requests if controller is not running")
+		assert.ErrorIs(t, err, types.ErrRejected, "error should wrap ErrRejected")
+		assert.ErrorIs(t, err, types.ErrFlowControllerNotRunning, "error should wrap ErrFlowControllerNotRunning")
+		assert.Equal(t, types.QueueOutcomeRejectedOther, outcome,
+			"outcome should be QueueOutcomeRejectedOther on shutdown")
+	})
+
+	t.Run("OnNoShardsAvailable", func(t *testing.T) {
+		t.Parallel()
+		// The default mockRegistryClient returns an empty list of ActiveShards.
+		h := newUnitHarness(t, t.Context(), Config{}, nil)
+
+		req := newTestRequest(defaultFlowKey)
+		outcome, err := h.fc.EnqueueAndWait(context.Background(), req)
+		require.Error(t, err, "EnqueueAndWait must reject requests if no shards are available")
+		assert.ErrorIs(t, err, types.ErrRejected, "error should wrap ErrRejected")
+		assert.Equal(t, types.QueueOutcomeRejectedCapacity, outcome,
+			"outcome should be QueueOutcomeRejectedCapacity when no shards exist for the flow")
+	})
+
+	t.Run("OnRegistryConnectionError", func(t *testing.T) {
+		t.Parallel()
+		mockRegistry := &mockRegistryClient{}
+		h := newUnitHarness(t, t.Context(), Config{}, mockRegistry)
+
+		expectedErr := errors.New("simulated connection failure")
+		// Configure the registry to fail when attempting to retrieve ActiveFlowConnection.
+		mockRegistry.WithConnectionFunc = func(
+			_ types.FlowKey,
+			_ func(conn contracts.ActiveFlowConnection) error,
+		) error {
+			return expectedErr
+		}
+
+		req := newTestRequest(defaultFlowKey)
+		outcome, err := h.fc.EnqueueAndWait(context.Background(), req)
+		require.Error(t, err, "EnqueueAndWait must reject requests if registry connection fails")
+		assert.ErrorIs(t, err, types.ErrRejected, "error should wrap ErrRejected")
+		assert.ErrorIs(t, err, expectedErr, "error should wrap the underlying connection error")
+		assert.Equal(t, types.QueueOutcomeRejectedOther, outcome,
+			"outcome should be QueueOutcomeRejectedOther for transient registry errors")
+	})
+
+	t.Run("OnManagedQueueError", func(t *testing.T) {
+		t.Parallel()
+		mockRegistry := &mockRegistryClient{}
+		h := newUnitHarness(t, t.Context(), Config{}, mockRegistry)
+
+		// Create a faulty shard that successfully leases the flow but fails to return the
+		// ManagedQueue. This shard should be considered unavailable.
+ faultyShard := &mocks.MockRegistryShard{ + IDFunc: func() string { return "faulty-shard" }, + ManagedQueueFunc: func(_ types.FlowKey) (contracts.ManagedQueue, error) { + return nil, errors.New("invariant violation: queue retrieval failed") + }, + } + mockRegistry.WithConnectionFunc = func( + _ types.FlowKey, + fn func(conn contracts.ActiveFlowConnection) error, + ) error { + return fn(&mockActiveFlowConnection{ActiveShardsV: []contracts.RegistryShard{faultyShard}}) + } + + req := newTestRequest(defaultFlowKey) + outcome, err := h.fc.EnqueueAndWait(context.Background(), req) + require.Error(t, err, "EnqueueAndWait must reject requests if no shards are available") + assert.ErrorIs(t, err, types.ErrRejected, "error should wrap ErrRejected") + assert.Equal(t, types.QueueOutcomeRejectedCapacity, outcome, + "outcome should be QueueOutcomeRejectedCapacity when no shards exist for the flow") + }) + }) + + // Distribution tests validate the JSQ-Bytes algorithm, the two-phase submission strategy, and error handling during + // the handoff, including time-based failures during blocking fallback. + t.Run("Distribution", func(t *testing.T) { + t.Parallel() + + // Define a long default TTL to prevent unexpected timeouts unless a test case explicitly sets a shorter one. + const defaultTestTTL = 5 * time.Second + + testCases := []struct { + name string + shards []contracts.RegistryShard + setupProcessors func(t *testing.T, h *testHarness) + // requestTTL overrides the default TTL for time-sensitive tests. + requestTTL time.Duration + expectedOutcome types.QueueOutcome + expectErr bool + expectErrIs error + }{ + { + name: "SubmitSucceeds_NonBlocking_WithSingleActiveShard", + shards: []contracts.RegistryShard{newMockShard("shard-A").build()}, + setupProcessors: func(t *testing.T, h *testHarness) { + h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{ + SubmitFunc: func(item *internal.FlowItem) error { + // Simulate asynchronous processing and successful dispatch. + go item.FinalizeWithOutcome(types.QueueOutcomeDispatched, nil) + return nil + }, + } + }, + expectedOutcome: types.QueueOutcomeDispatched, + }, + { + name: "DistributesToLeastLoadedShard_WithMultipleActiveShards", + shards: []contracts.RegistryShard{ + newMockShard("shard-A").withByteSize(1000).build(), // More loaded + newMockShard("shard-B").withByteSize(100).build(), // Least loaded + }, + setupProcessors: func(t *testing.T, h *testHarness) { + h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{ + SubmitFunc: func(_ *internal.FlowItem) error { + t.Error("Submit was called on the more loaded shard (shard-A); JSQ-Bytes algorithm failed") + return internal.ErrProcessorBusy + }, + } + h.mockProcessorFactory.processors["shard-B"] = &mockShardProcessor{ + SubmitFunc: func(item *internal.FlowItem) error { + item.SetHandle(&typesmocks.MockQueueItemHandle{}) + go item.FinalizeWithOutcome(types.QueueOutcomeDispatched, nil) + return nil + }, + } + }, + expectedOutcome: types.QueueOutcomeDispatched, + }, + { + name: "SubmitSucceeds_AfterBlocking_WithAllProcessorsBusy", + shards: []contracts.RegistryShard{ + newMockShard("shard-A").withByteSize(1000).build(), + newMockShard("shard-B").withByteSize(100).build(), + }, + setupProcessors: func(t *testing.T, h *testHarness) { + // Both processors reject the initial non-blocking Submit. 
+				h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{
+					SubmitFunc: func(_ *internal.FlowItem) error { return internal.ErrProcessorBusy },
+				}
+				// Shard-B is the least loaded, so it should receive the blocking fallback (SubmitOrBlock).
+				h.mockProcessorFactory.processors["shard-B"] = &mockShardProcessor{
+					SubmitFunc: func(_ *internal.FlowItem) error { return internal.ErrProcessorBusy },
+					SubmitOrBlockFunc: func(_ context.Context, item *internal.FlowItem) error {
+						// The blocking call succeeds.
+						go item.FinalizeWithOutcome(types.QueueOutcomeDispatched, nil)
+						return nil
+					},
+				}
+			},
+			expectedOutcome: types.QueueOutcomeDispatched,
+		},
+		{
+			// Validates the scenario where the request's TTL expires while the controller is blocked waiting for capacity.
+			// NOTE: This relies on real time passing, as context.WithDeadline timers cannot be controlled by FakeClock.
+			name:       "Rejects_AfterBlocking_WhenTTL_Expires",
+			shards:     []contracts.RegistryShard{newMockShard("shard-A").build()},
+			requestTTL: 50 * time.Millisecond, // Short TTL to keep the test fast.
+			setupProcessors: func(t *testing.T, h *testHarness) {
+				h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{
+					// Reject the non-blocking attempt.
+					SubmitFunc: func(_ *internal.FlowItem) error { return internal.ErrProcessorBusy },
+					// Block the fallback attempt until the context (carrying the TTL deadline) expires.
+					SubmitOrBlockFunc: func(ctx context.Context, _ *internal.FlowItem) error {
+						<-ctx.Done()
+						return ctx.Err()
+					},
+				}
+			},
+			// No extra test actions are needed; we rely on the real-time timer to expire.
+			// When the blocking call fails due to context expiry, the outcome is RejectedOther.
+			expectedOutcome: types.QueueOutcomeRejectedOther,
+			expectErr:       true,
+			// The error must reflect the specific cause of the context cancellation (ErrTTLExpired).
+			expectErrIs: types.ErrTTLExpired,
+		},
+		{
+			name:   "Rejects_OnProcessorShutdownDuringSubmit",
+			shards: []contracts.RegistryShard{newMockShard("shard-A").build()},
+			setupProcessors: func(t *testing.T, h *testHarness) {
+				h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{
+					// Simulate the processor shutting down during the non-blocking handoff.
+					SubmitFunc: func(_ *internal.FlowItem) error { return types.ErrFlowControllerNotRunning },
+					SubmitOrBlockFunc: func(_ context.Context, _ *internal.FlowItem) error {
+						return types.ErrFlowControllerNotRunning
+					},
+				}
+			},
+			expectedOutcome: types.QueueOutcomeRejectedOther,
+			expectErr:       true,
+			expectErrIs:     types.ErrFlowControllerNotRunning,
+		},
+		{
+			name:   "Rejects_OnProcessorShutdownDuringSubmitOrBlock",
+			shards: []contracts.RegistryShard{newMockShard("shard-A").build()},
+			setupProcessors: func(t *testing.T, h *testHarness) {
+				h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{
+					SubmitFunc: func(_ *internal.FlowItem) error { return internal.ErrProcessorBusy },
+					// Simulate the processor shutting down during the blocking handoff.
+					SubmitOrBlockFunc: func(_ context.Context, _ *internal.FlowItem) error {
+						return types.ErrFlowControllerNotRunning
+					},
+				}
+			},
+			expectedOutcome: types.QueueOutcomeRejectedOther,
+			expectErr:       true,
+			expectErrIs:     types.ErrFlowControllerNotRunning,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			// Arrange
+			mockRegistry := &mockRegistryClient{}
+
+			// Configure the harness with the appropriate TTL.
+ harnessConfig := Config{DefaultRequestTTL: defaultTestTTL} + if tc.requestTTL > 0 { + harnessConfig.DefaultRequestTTL = tc.requestTTL + } + h := newUnitHarness(t, t.Context(), harnessConfig, mockRegistry) + + // Configure the registry to return the specified shards. + mockRegistry.WithConnectionFunc = func( + _ types.FlowKey, + fn func(conn contracts.ActiveFlowConnection) error, + ) error { + return fn(&mockActiveFlowConnection{ActiveShardsV: tc.shards}) + } + tc.setupProcessors(t, h) + + // Act + var outcome types.QueueOutcome + var err error + + startTime := time.Now() // Capture real start time for duration checks. + // Use a background context for the parent; the request lifecycle is governed by the config/derived context. + outcome, err = h.fc.EnqueueAndWait(context.Background(), newTestRequest(defaultFlowKey)) + + // Assert + if tc.expectErr { + require.Error(t, err, "expected an error during EnqueueAndWait but got nil") + assert.ErrorIs(t, err, tc.expectErrIs, "error should wrap the expected underlying cause") + // All failures during the distribution phase (capacity, timeout, shutdown) should result in a rejection. + assert.ErrorIs(t, err, types.ErrRejected, "rejection errors must wrap types.ErrRejected") + + // Specific assertion for real-time TTL tests. + if errors.Is(tc.expectErrIs, types.ErrTTLExpired) { + duration := time.Since(startTime) + // Ensure the test didn't return instantly. Use a tolerance for CI environments. + // This validates that the real-time wait actually occurred. + assert.GreaterOrEqual(t, duration, tc.requestTTL-30*time.Millisecond, + "EnqueueAndWait returned faster than the TTL allows, indicating the timer did not function correctly") + } + + } else { + require.NoError(t, err, "expected no error during EnqueueAndWait but got: %v", err) + } + assert.Equal(t, tc.expectedOutcome, outcome, "outcome did not match expected value") + }) + } + }) + + t.Run("Retry", func(t *testing.T) { + t.Parallel() + + // This test specifically validates the behavior when the request context is cancelled externally while the + // controller is blocked in the SubmitOrBlock phase. + t.Run("Rejects_OnRequestContextCancelledWhileBlocking", func(t *testing.T) { + t.Parallel() + mockRegistry := &mockRegistryClient{ + WithConnectionFunc: func( + _ types.FlowKey, + fn func(conn contracts.ActiveFlowConnection, + ) error) error { + return fn(&mockActiveFlowConnection{ + ActiveShardsV: []contracts.RegistryShard{newMockShard("shard-A").build()}, + }) + }, + } + // Use a long TTL to ensure the failure is due to cancellation, not timeout. + h := newUnitHarness(t, t.Context(), Config{DefaultRequestTTL: 10 * time.Second}, mockRegistry) + h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{ + // Reject non-blocking attempt. + SubmitFunc: func(_ *internal.FlowItem) error { return internal.ErrProcessorBusy }, + // Block the fallback attempt until the context is cancelled. + SubmitOrBlockFunc: func(ctx context.Context, _ *internal.FlowItem) error { + <-ctx.Done() + return ctx.Err() + }, + } + + // Create a cancellable context for the request. + reqCtx, cancelReq := context.WithCancel(context.Background()) + // Cancel the request shortly after starting the operation. + // We use real time sleep here as we are testing external cancellation signals interacting with the context. 
+ go func() { time.Sleep(10 * time.Millisecond); cancelReq() }() + + outcome, err := h.fc.EnqueueAndWait(reqCtx, newTestRequest(defaultFlowKey)) + + require.Error(t, err, "EnqueueAndWait must fail when context is cancelled during a blocking submit") + assert.ErrorIs(t, err, types.ErrRejected, "error should wrap ErrRejected") + assert.ErrorIs(t, err, context.Canceled, "error should wrap the underlying ctx.Err() (context.Canceled)") + assert.Equal(t, types.QueueOutcomeRejectedOther, outcome, + "outcome should be QueueOutcomeRejectedOther when cancelled during distribution") + }) + + // This test validates the retry mechanism when a processor reports that its shard is draining. + t.Run("RetriesAndSucceeds_OnProcessorReportsShardDraining", func(t *testing.T) { + t.Parallel() + var callCount atomic.Int32 + mockRegistry := &mockRegistryClient{ + WithConnectionFunc: func( + _ types.FlowKey, + fn func(conn contracts.ActiveFlowConnection) error, + ) error { + attempt := callCount.Add(1) + shardA := newMockShard("shard-A").withByteSize(100).build() + shardB := newMockShard("shard-B").withByteSize(1000).build() + + if attempt == 1 { + // Attempt 1: Shard A is the least loaded and is selected. + return fn(&mockActiveFlowConnection{ActiveShardsV: []contracts.RegistryShard{shardA, shardB}}) + } + // Attempt 2 (Retry): Assume Shard A is now draining and removed from the active set by the registry. + return fn(&mockActiveFlowConnection{ActiveShardsV: []contracts.RegistryShard{shardB}}) + }, + } + // Use a long TTL to ensure retries don't time out. + h := newUnitHarness(t, t.Context(), Config{DefaultRequestTTL: 10 * time.Second}, mockRegistry) + + // Configure Shard A's processor to reject the request due to draining. + h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{ + SubmitFunc: func(item *internal.FlowItem) error { + // The processor accepts the item but then asynchronously finalizes it with ErrShardDraining. + item.SetHandle(&typesmocks.MockQueueItemHandle{}) + go item.FinalizeWithOutcome(types.QueueOutcomeRejectedOther, contracts.ErrShardDraining) + return nil + }, + } + // Configure Shard B's processor to successfully dispatch the request on the retry. + h.mockProcessorFactory.processors["shard-B"] = &mockShardProcessor{ + SubmitFunc: func(item *internal.FlowItem) error { + go item.FinalizeWithOutcome(types.QueueOutcomeDispatched, nil) + return nil + }, + } + + // Act + outcome, err := h.fc.EnqueueAndWait(context.Background(), newTestRequest(defaultFlowKey)) + + // Assert + require.NoError(t, err, "EnqueueAndWait must succeed after retrying on a healthy shard") + assert.Equal(t, types.QueueOutcomeDispatched, outcome, "outcome should be QueueOutcomeDispatched") + assert.Equal(t, int32(2), callCount.Load(), "registry must be consulted for Active shards on each retry attempt") + }) + }) + + // Lifecycle covers the post-distribution phase, focusing on how the controller handles context cancellation and TTL + // expiry while the request is buffered or queued by the processor (Asynchronous Finalization). + t.Run("Lifecycle", func(t *testing.T) { + t.Parallel() + + // Validates that the controller correctly initiates asynchronous finalization when the request context is cancelled + // after ownership has been transferred to the processor. + t.Run("OnReqCtxCancelledAfterDistribution", func(t *testing.T) { + t.Parallel() + // Use a long TTL to ensure the failure is due to cancellation. 
+ h := newUnitHarness(t, t.Context(), Config{DefaultRequestTTL: 10 * time.Second}, nil) + + shardA := newMockShard("shard-A").build() + h.mockRegistry.WithConnectionFunc = func(_ types.FlowKey, fn func(_ contracts.ActiveFlowConnection) error) error { + return fn(&mockActiveFlowConnection{ActiveShardsV: []contracts.RegistryShard{shardA}}) + } + + // Channel for synchronization. + itemSubmitted := make(chan *internal.FlowItem, 1) + + // Configure the processor to accept the item but never finalize it, simulating a queued request. + h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{ + SubmitFunc: func(item *internal.FlowItem) error { + item.SetHandle(&typesmocks.MockQueueItemHandle{}) + itemSubmitted <- item + return nil + }, + } + + reqCtx, cancelReq := context.WithCancel(context.Background()) + req := newTestRequest(defaultFlowKey) + + var outcome types.QueueOutcome + var err error + done := make(chan struct{}) + go func() { + outcome, err = h.fc.EnqueueAndWait(reqCtx, req) + close(done) + }() + + // 1. Wait for the item to be successfully distributed. + var item *internal.FlowItem + select { + case item = <-itemSubmitted: + // Success. Ownership has transferred. EnqueueAndWait is now in the select loop. + case <-time.After(1 * time.Second): + t.Fatal("timed out waiting for item to be submitted to the processor") + } + + // 2. Cancel the request context. + cancelReq() + + // 3. Wait for EnqueueAndWait to return. + select { + case <-done: + // Success. The controller detected the cancellation and unblocked the caller. + case <-time.After(1 * time.Second): + t.Fatal("timed out waiting for EnqueueAndWait to return after cancellation") + } + + // 4. Assertions for EnqueueAndWait's return values. + require.Error(t, err, "EnqueueAndWait should return an error when the request is cancelled post-distribution") + // The outcome should be Evicted (as the handle was set). + assert.ErrorIs(t, err, types.ErrEvicted, "error should wrap ErrEvicted") + // The underlying cause must be propagated. + assert.ErrorIs(t, err, types.ErrContextCancelled, "error should wrap ErrContextCancelled") + assert.Equal(t, types.QueueOutcomeEvictedContextCancelled, outcome, "outcome should be EvictedContextCancelled") + + // 5. Assert that the FlowItem itself was indeed finalized by the controller. + finalState := item.FinalState() + require.NotNil(t, finalState, "Item should have been finalized asynchronously by the controller") + assert.Equal(t, types.QueueOutcomeEvictedContextCancelled, finalState.Outcome, + "Item's internal outcome must match the returned outcome") + }) + + // Validates the asynchronous finalization path due to TTL expiry. + // Note: This relies on real time passing, as context.WithDeadline timers cannot be controlled by FakeClock. + t.Run("OnReqCtxTimeoutAfterDistribution", func(t *testing.T) { + t.Parallel() + // Configure a short TTL to keep the test reasonably fast. + const requestTTL = 50 * time.Millisecond + h := newUnitHarness(t, t.Context(), Config{DefaultRequestTTL: requestTTL}, nil) + + shardA := newMockShard("shard-A").build() + h.mockRegistry.WithConnectionFunc = func(_ types.FlowKey, fn func(_ contracts.ActiveFlowConnection) error) error { + return fn(&mockActiveFlowConnection{ActiveShardsV: []contracts.RegistryShard{shardA}}) + } + + itemSubmitted := make(chan *internal.FlowItem, 1) + + // Configure the processor to accept the item but never finalize it. 
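+ // SetHandle marks the item as queued; per inferOutcome, the asynchronous TTL finalization is then classified as an eviction rather than a rejection.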
+ h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{ + SubmitFunc: func(item *internal.FlowItem) error { + item.SetHandle(&typesmocks.MockQueueItemHandle{}) + itemSubmitted <- item + return nil + }, + } + + req := newTestRequest(defaultFlowKey) + // Use a context for the call itself that won't time out independently. + enqueueCtx, enqueueCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer enqueueCancel() + + var outcome types.QueueOutcome + var err error + done := make(chan struct{}) + + startTime := time.Now() // Capture start time to validate duration. + go func() { + outcome, err = h.fc.EnqueueAndWait(enqueueCtx, req) + close(done) + }() + + // 1. Wait for the item to be submitted. + var item *internal.FlowItem + select { + case item = <-itemSubmitted: + case <-time.After(1 * time.Second): + t.Fatal("timed out waiting for item to be submitted to the processor") + } + + // 2.Wait for the TTL to expire (Real time). We do NOT call Step(). + // Wait for EnqueueAndWait to return due to the TTL expiry. + select { + case <-done: + // Success. Now validate that enough time actually passed. + duration := time.Since(startTime) + assert.GreaterOrEqual(t, duration, requestTTL-30*time.Millisecond, // tolerance for CI environments + "EnqueueAndWait returned faster than the TTL allows, indicating the timer did not function correctly") + case <-time.After(1 * time.Second): + t.Fatal("timed out waiting for EnqueueAndWait to return after TTL expiry") + } + + // 4. Assertions for EnqueueAndWait's return values. + require.Error(t, err, "EnqueueAndWait should return an error when TTL expires post-distribution") + assert.ErrorIs(t, err, types.ErrEvicted, "error should wrap ErrEvicted") + assert.ErrorIs(t, err, types.ErrTTLExpired, "error should wrap the underlying cause (types.ErrTTLExpired)") + assert.Equal(t, types.QueueOutcomeEvictedTTL, outcome, "outcome should be EvictedTTL") + + // 5. Assert FlowItem final state. + finalState := item.FinalState() + require.NotNil(t, finalState, "Item should have been finalized asynchronously by the controller") + assert.Equal(t, types.QueueOutcomeEvictedTTL, finalState.Outcome, + "Item's internal outcome must match the returned outcome") + }) + }) +} + +// TestFlowController_WorkerManagement covers the lifecycle of the shard processors (workers), including startup, +// reconciliation (garbage collection), and shutdown. +func TestFlowController_WorkerManagement(t *testing.T) { + t.Parallel() + + // Reconciliation validates that the controller correctly identifies and shuts down workers whose shards no longer + // exist in the registry. + t.Run("Reconciliation", func(t *testing.T) { + t.Parallel() + + // Setup: A registry that initially knows about "shard-A" and "stale-shard", but later only reports "shard-A". + mockRegistry := &mockRegistryClient{ + ShardStatsFunc: func() []contracts.ShardStats { + // The current state of the world according to the registry. + return []contracts.ShardStats{{ID: "shard-A"}} + }} + h := newUnitHarness(t, t.Context(), Config{}, mockRegistry) + + // Pre-populate the controller with initial workers, simulating a previous state. + initialShards := []string{"shard-A", "stale-shard"} + for _, shardID := range initialShards { + currentShardID := shardID + // Initialize the processor mocks with the channel needed to synchronize startup. 
+ h.mockProcessorFactory.processors[currentShardID] = &mockShardProcessor{runStarted: make(chan struct{})} + shard := &mocks.MockRegistryShard{IDFunc: func() string { return currentShardID }} + // Start the worker using the internal mechanism. + h.fc.getOrStartWorker(shard) + } + require.Len(t, h.mockProcessorFactory.processors, 2, "pre-condition: initial workers not set up correctly") + + // Wait for all worker goroutines to have started and captured their contexts. + for id, p := range h.mockProcessorFactory.processors { + proc := p + select { + case <-proc.runStarted: + // Worker is running. + case <-time.After(2 * time.Second): + t.Fatalf("timed out waiting for worker %s to start", id) + } + } + + // Act: Manually trigger the reconciliation logic. + h.fc.reconcileProcessors() + + t.Run("StaleWorkerIsCancelled", func(t *testing.T) { + staleProc := h.mockProcessorFactory.processors["stale-shard"] + require.NotNil(t, staleProc.Context(), "precondition: stale processor context should have been captured") + // The context of the removed worker must be cancelled to signal shutdown. + select { + case <-staleProc.Context().Done(): + // Success: Context was cancelled. + case <-time.After(100 * time.Millisecond): + t.Error("context of the stale worker was not cancelled during reconciliation") + } + }) + + t.Run("ActiveWorkerIsNotCancelled", func(t *testing.T) { + activeProc := h.mockProcessorFactory.processors["shard-A"] + require.NotNil(t, activeProc.Context(), "precondition: active processor context should have been captured") + // The context of an active worker must remain open. + select { + case <-activeProc.Context().Done(): + t.Error("context of the active worker was incorrectly cancelled during reconciliation") + default: + // Success: Context is still active. + } + }) + + t.Run("WorkerMapIsUpdated", func(t *testing.T) { + // The stale worker must be removed from the controller's concurrent map. + _, ok := h.fc.workers.Load("stale-shard") + assert.False(t, ok, "stale worker must be deleted from the controller's map") + _, ok = h.fc.workers.Load("shard-A") + assert.True(t, ok, "active worker must remain in the controller's map") + }) + }) + + // Validates that the reconciliation loop runs periodically based on the configured interval. + t.Run("Reconciliation_IsTriggeredByTicker", func(t *testing.T) { + t.Parallel() + const reconciliationInterval = 10 * time.Second + mockRegistry := &mockRegistryClient{} + + // Count the number of times the reconciliation logic (which calls ShardStats) runs. + var reconcileCount atomic.Int32 + mockRegistry.ShardStatsFunc = func() []contracts.ShardStats { + reconcileCount.Add(1) + return nil + } + + h := newUnitHarness(t, t.Context(), Config{ProcessorReconciliationInterval: reconciliationInterval}, mockRegistry) + // Ensure we are using the FakeClock specifically for this test, as we need Step/HasWaiters. + require.NotNil(t, h.mockClock, "This test requires the harness to be using FakeClock") + + // Wait for the reconciliation loop to start and create the ticker. + // This prevents a race where the clock is stepped before the ticker is registered with the FakeClock. + require.Eventually(t, h.mockClock.HasWaiters, time.Second, 10*time.Millisecond, + "reconciliation ticker was not created") + + // Advance the clock to trigger the first reconciliation. 
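+ // Step advances the fake clock by exactly one interval, firing the ticker once; the Eventually polls below absorb the asynchronous hand-off to the reconciliation goroutine.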
+ h.mockClock.Step(reconciliationInterval) + + assert.Eventually(t, func() bool { + return reconcileCount.Load() == 1 + }, time.Second, 10*time.Millisecond, "reconciliation was not triggered by the first ticker event") + + // Advance the clock again to ensure it continues to fire. + h.mockClock.Step(reconciliationInterval) + assert.Eventually(t, func() bool { + return reconcileCount.Load() == 2 + }, time.Second, 10*time.Millisecond, "reconciliation did not fire on the second ticker event") + }) + + // Validates the atomicity of worker creation and ensures resource cleanup for the loser of the race. + t.Run("WorkerCreationRace", func(t *testing.T) { + t.Parallel() + + // This test orchestrates a deterministic race condition. + factoryEntered := make(chan *mockShardProcessor, 2) + continueFactory := make(chan struct{}) + // Map to store the construction context for each processor instance, allowing us to verify cleanup. + constructionContexts := sync.Map{} + + h := newUnitHarness(t, t.Context(), Config{}, nil) + + // Inject a custom factory to control the timing of worker creation. + h.fc.shardProcessorFactory = func( + ctx context.Context, // The context created by getOrStartWorker for the potential new processor. + shard contracts.RegistryShard, + _ contracts.SaturationDetector, + _ clock.WithTicker, + _ time.Duration, + _ int, + _ logr.Logger, + ) shardProcessor { + // This function is called by getOrStartWorker before the LoadOrStore check. + proc := &mockShardProcessor{runStarted: make(chan struct{})} + constructionContexts.Store(proc, ctx) // Capture the construction context. + + // Signal entry and then block, allowing another goroutine to enter. + factoryEntered <- proc + <-continueFactory + return proc + } + + shard := newMockShard("race-shard").build() + var wg sync.WaitGroup + wg.Add(2) + + // Start two goroutines that will race to create the same worker. + go func() { + defer wg.Done() + h.fc.getOrStartWorker(shard) + }() + go func() { + defer wg.Done() + h.fc.getOrStartWorker(shard) + }() + + // 1. Wait for both goroutines to enter the factory and create their respective processor instances. + proc1 := <-factoryEntered + proc2 := <-factoryEntered + + // 2. Unblock both goroutines, allowing them to race to workers.LoadOrStore. + close(continueFactory) + wg.Wait() + + // 3. Identify the winner and the loser. + actual, ok := h.fc.workers.Load("race-shard") + require.True(t, ok, "a worker must have been successfully stored in the map") + + storedWorker := actual.(*managedWorker) + winnerProc := storedWorker.processor.(*mockShardProcessor) + + var loserProc *mockShardProcessor + if winnerProc == proc1 { + loserProc = proc2 + } else { + loserProc = proc1 + } + + // 4. Validate the state of the winning processor. + // Wait for the Run method to be called on the winner (only the winner should start). + select { + case <-winnerProc.runStarted: + // Success. + case <-time.After(1 * time.Second): + t.Fatal("timed out waiting for the winning worker's Run method to be called") + } + + // The winning processor's context must remain active. + require.NotNil(t, winnerProc.Context(), "winner's context should not be nil (Run was called)") + select { + case <-winnerProc.Context().Done(): + t.Error("context of the winning worker should not be cancelled") + default: + // Success + } + + // 5. Validate the state of the losing processor and resource cleanup. + // The losing processor's Run method must NOT be called. 
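+ // A non-blocking receive suffices here: if Run had been invoked on the loser, runStarted would already be readable.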
+ select { + case <-loserProc.runStarted: + t.Error("Run was incorrectly called on the losing worker") + default: + // Success + } + + // Verify the context created for the loser during construction was cancelled by getOrStartWorker. + loserCtxRaw, ok := constructionContexts.Load(loserProc) + require.True(t, ok, "loser processor construction context should have been captured") + loserCtx := loserCtxRaw.(context.Context) + + select { + case <-loserCtx.Done(): + // Success: Context was cancelled, preventing resource leaks. + case <-time.After(100 * time.Millisecond): + t.Error("context of the losing worker was not cancelled, this will leak resources") + } + }) +} + +// Helper function to create a realistic mock registry environment for integration/concurrency tests. +func setupRegistryForConcurrency(t *testing.T, numShards int, flowKey types.FlowKey) *mockRegistryClient { + t.Helper() + mockRegistry := &mockRegistryClient{} + shards := make([]contracts.RegistryShard, numShards) + + // Configure the shards and their dependencies required by the real ShardProcessor implementation. + for i := range numShards { + // Capture loop variables for closures. + shardID := fmt.Sprintf("shard-%d", i) + // Use high-fidelity mock queues (MockManagedQueue) that implement the necessary interfaces and synchronization. + currentQueue := &mocks.MockManagedQueue{FlowKeyV: flowKey} + + shards[i] = &mocks.MockRegistryShard{ + IDFunc: func() string { return shardID }, + ManagedQueueFunc: func(_ types.FlowKey) (contracts.ManagedQueue, error) { + return currentQueue, nil + }, + // Configuration required for ShardProcessor initialization and dispatch logic. + AllOrderedPriorityLevelsFunc: func() []int { return []int{flowKey.Priority} }, + PriorityBandAccessorFunc: func(priority int) (framework.PriorityBandAccessor, error) { + if priority == flowKey.Priority { + return &frameworkmocks.MockPriorityBandAccessor{ + PriorityV: priority, + IterateQueuesFunc: func(f func(framework.FlowQueueAccessor) bool) { + f(currentQueue.FlowQueueAccessor()) + }, + }, nil + } + return nil, fmt.Errorf("unexpected priority %d", priority) + }, + // Configure dispatch policies (FIFO). + IntraFlowDispatchPolicyFunc: func(_ types.FlowKey) (framework.IntraFlowDispatchPolicy, error) { + return &frameworkmocks.MockIntraFlowDispatchPolicy{ + SelectItemFunc: func(qa framework.FlowQueueAccessor) (types.QueueItemAccessor, error) { + return qa.PeekHead() + }, + }, nil + }, + InterFlowDispatchPolicyFunc: func(_ int) (framework.InterFlowDispatchPolicy, error) { + return &frameworkmocks.MockInterFlowDispatchPolicy{ + SelectQueueFunc: func(band framework.PriorityBandAccessor) (framework.FlowQueueAccessor, error) { + return currentQueue.FlowQueueAccessor(), nil + }, + }, nil + }, + // Configure stats reporting based on the live state of the mock queues. + StatsFunc: func() contracts.ShardStats { + return contracts.ShardStats{ + ID: shardID, + TotalLen: uint64(currentQueue.Len()), + TotalByteSize: currentQueue.ByteSize(), + PerPriorityBandStats: map[int]contracts.PriorityBandStats{ + flowKey.Priority: { + Len: uint64(currentQueue.Len()), + ByteSize: currentQueue.ByteSize(), + CapacityBytes: 1e9, // Effectively unlimited capacity to ensure dispatch success. + }, + }, + } + }, + } + } + + // Configure the registry connection. 
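+ // Every connection observes the same static shard set; per-shard load is reported live from the mock queues via each shard's StatsFunc.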
+ mockRegistry.WithConnectionFunc = func(_ types.FlowKey, fn func(conn contracts.ActiveFlowConnection) error) error { + return fn(&mockActiveFlowConnection{ActiveShardsV: shards}) + } + mockRegistry.ShardStatsFunc = func() []contracts.ShardStats { + stats := make([]contracts.ShardStats, len(shards)) + for i, shard := range shards { + stats[i] = shard.Stats() + } + return stats + } + return mockRegistry +} + +// TestFlowController_Concurrency_Distribution performs an integration test under high contention, using real +// ShardProcessors. +// It validates the thread-safety of the distribution logic and the overall system throughput. +func TestFlowController_Concurrency_Distribution(t *testing.T) { + const ( + numShards = 4 + numGoroutines = 50 + numRequests = 200 + ) + + // Arrange + mockRegistry := setupRegistryForConcurrency(t, numShards, defaultFlowKey) + + // Initialize the integration harness with real ShardProcessors. + h := newIntegrationHarness(t, t.Context(), Config{ + // Use a generous buffer to focus the test on distribution logic rather than backpressure. + EnqueueChannelBufferSize: numRequests, + DefaultRequestTTL: 5 * time.Second, + ExpiryCleanupInterval: 100 * time.Millisecond, + }, mockRegistry) + + // Act: Hammer the controller concurrently. + var wg sync.WaitGroup + wg.Add(numGoroutines) + outcomes := make(chan types.QueueOutcome, numRequests) + + for i := range numGoroutines { + goroutineID := i + go func() { + defer wg.Done() + for j := range numRequests / numGoroutines { + req := newTestRequest(defaultFlowKey) + req.IDV = fmt.Sprintf("req-distrib-%d-%d", goroutineID, j) + + // Use a reasonable timeout for the individual request context. + reqCtx, cancel := context.WithTimeout(t.Context(), 5*time.Second) + defer cancel() + + ctx := logr.NewContext(reqCtx, logr.Discard()) + outcome, err := h.fc.EnqueueAndWait(ctx, req) + if err != nil { + // Use t.Errorf for concurrent tests to report failures without halting execution. + t.Errorf("EnqueueAndWait failed unexpectedly under load: %v", err) + } + outcomes <- outcome + } + }() + } + + // Wait for all requests to complete. + wg.Wait() + close(outcomes) + + // Assert: All requests should be successfully dispatched. + successCount := 0 + for outcome := range outcomes { + if outcome == types.QueueOutcomeDispatched { + successCount++ + } + } + require.Equal(t, numRequests, successCount, + "all concurrent requests must be dispatched successfully without errors or data races") +} + +// TestFlowController_Concurrency_Backpressure specifically targets the blocking submission path (SubmitOrBlock) by +// configuring the processors with zero buffer capacity. +func TestFlowController_Concurrency_Backpressure(t *testing.T) { + if testing.Short() { + t.Skip("Skipping concurrency integration test in short mode.") + } + t.Parallel() + + const ( + numShards = 2 + numGoroutines = 20 + // Fewer requests than the distribution test, as the blocking path is inherently slower. + numRequests = 40 + ) + + // Arrange: Set up the registry environment. + mockRegistry := setupRegistryForConcurrency(t, numShards, defaultFlowKey) + + // Use the integration harness with a configuration designed to induce backpressure. + h := newIntegrationHarness(t, t.Context(), Config{ + // Zero buffer forces immediate use of SubmitOrBlock if the processor loop is busy. + EnqueueChannelBufferSize: 0, + // Generous TTL to ensure timeouts are not the cause of failure. 
+ DefaultRequestTTL: 10 * time.Second, + ExpiryCleanupInterval: 100 * time.Millisecond, + }, mockRegistry) + + // Act: Concurrently submit requests. + var wg sync.WaitGroup + wg.Add(numGoroutines) + outcomes := make(chan types.QueueOutcome, numRequests) + + for i := range numGoroutines { + goroutineID := i + go func() { + defer wg.Done() + for j := range numRequests / numGoroutines { + req := newTestRequest(defaultFlowKey) + req.IDV = fmt.Sprintf("req-backpressure-%d-%d", goroutineID, j) + + // Use a reasonable timeout for the individual request context to ensure the test finishes promptly if a + // deadlock occurs. + reqCtx, cancel := context.WithTimeout(t.Context(), 5*time.Second) + defer cancel() + + outcome, err := h.fc.EnqueueAndWait(logr.NewContext(reqCtx, logr.Discard()), req) + if err != nil { + t.Errorf("EnqueueAndWait failed unexpectedly under backpressure for request %s: %v", req.ID(), err) + } + outcomes <- outcome + } + }() + } + wg.Wait() + close(outcomes) + + // Assert: Verify successful dispatch despite high contention and zero buffer. + successCount := 0 + for outcome := range outcomes { + if outcome == types.QueueOutcomeDispatched { + successCount++ + } + } + require.Equal(t, numRequests, successCount, + "all concurrent requests should be dispatched successfully even under high contention and zero buffer capacity") +} diff --git a/pkg/epp/flowcontrol/controller/doc.go b/pkg/epp/flowcontrol/controller/doc.go index 8c96bbc18..0d2ea3687 100644 --- a/pkg/epp/flowcontrol/controller/doc.go +++ b/pkg/epp/flowcontrol/controller/doc.go @@ -14,109 +14,48 @@ See the License for the specific language governing permissions and limitations under the License. */ -// Package controller contains the implementation of the `FlowController` engine. +// Package controller contains the implementation of the FlowController engine. // // # Overview // -// The `FlowController` is the central processing engine of the flow control system. It is a sharded, high-throughput -// component responsible for managing the lifecycle of all incoming requests—from initial submission via the synchronous -// `EnqueueAndWait` method to a terminal outcome (dispatch, rejection, or eviction). It achieves this by orchestrating -// its dependencies—the `contracts.FlowRegistry`, the pluggable `Policy` framework, and the -// `contracts.SaturationDetector`—to make continuous, state-aware decisions. +// The FlowController is the central processing engine of the Flow Control layer. It acts as a stateless supervisor that +// orchestrates a pool of stateful workers (internal.ShardProcessor), managing the lifecycle of all incoming requests +// from initial submission to a terminal outcome (dispatch, rejection, or eviction). // -// # Architecture: The Processor-Shard Relationship +// # Architecture: Supervisor-Worker Pattern // -// The `FlowController` engine is designed around a clear separation of state and execution. This "control plane vs. -// data plane" separation is key to enabling dynamic, concurrent-safe configuration updates. +// This package implements a supervisor-worker pattern to achieve high throughput and dynamic scalability. // -// - The `contracts.FlowRegistry` is the **control plane**. It is the single source of truth for all configuration. -// When an administrative action occurs (e.g., `RegisterOrUpdateFlow`), the `contracts.FlowRegistry` is responsible -// for safely applying that change to each of its managed `contracts.RegistryShard` instances. 
+// - The FlowController (Supervisor): The public-facing API of the system. Its primary responsibilities are to execute +// a distribution algorithm to select the optimal worker for a new request and to manage the lifecycle of the worker +// pool, ensuring it stays synchronized with the underlying shard topology defined by the contracts.FlowRegistry. +// - The internal.ShardProcessor (Worker): A stateful, single-goroutine actor responsible for the entire lifecycle of +// requests on a single shard. The supervisor manages a pool of these workers, one for each contracts.RegistryShard. // -// - The `contracts.RegistryShard` is the **concurrent-safe state port**. It defines the contract for a state store -// that holds the `contracts.ManagedQueue` and framework `Policy` instances for a single shard. +// # Concurrency Model // -// - The `internal.ShardProcessor` is the **data plane worker**. It is given a single `contracts.RegistryShard` to -// operate on. Its main `dispatchCycle` continuously acquires a read lock on the shard to get a consistent view of -// the active queues and policies, and then executes its dispatch logic. +// The FlowController is designed to be highly concurrent and thread-safe. It acts primarily as a stateless distributor. // -// This separation is what enables dynamic updates. The `internal.ShardProcessor` is stateless; it simply executes -// against the state presented by its `contracts.RegistryShard` on each cycle. This allows the control plane -// (`contracts.FlowRegistry`) to safely change that state in the background. +// - EnqueueAndWait: Can be called concurrently by many goroutines. +// - Worker Management: Uses a sync.Map (workers) for concurrent access and lazy initialization of workers. +// - Supervision: A single background goroutine (run) manages the worker pool lifecycle (garbage collection). // -// # Architectural Deep Dive: The `EnqueueAndWait` Model +// It achieves high throughput by minimizing shared state and relying on the internal ShardProcessors to handle state +// mutations serially (using an actor model). // -// A fundamental design choice is the synchronous, blocking `EnqueueAndWait` method. In the context of the Gateway API -// Inference Extension's Endpoint Picker (EPP), which operates as an Envoy External Processing (`ext_proc`) server, this -// model is deliberately chosen for its simplicity and robustness. +// # Request Lifecycle and Ownership // -// - Alignment with `ext_proc`: The `ext_proc` protocol is stream-based. A single goroutine within the EPP manages the -// stream for a given HTTP request. `EnqueueAndWait` fits this perfectly: the request-handling goroutine calls it, -// blocks, and upon return, has the definitive outcome. It can then immediately act on that outcome, maintaining -// clear request-goroutine affinity. +// A request (represented internally as a FlowItem) has a lifecycle managed cooperatively by the Controller and a +// Processor. Defining ownership is critical for ensuring an item is finalized exactly once. // -// - Simplified State Management: The state of a "waiting" request is implicitly managed by the blocked goroutine's -// stack and its `context.Context`. The `FlowController` only needs to signal this specific goroutine to unblock it. +// 1. Submission (Controller): The Controller attempts to hand off the item to a Processor. +// 2. Handoff: +// - Success: Ownership transfers to the Processor, which is now responsible for Finalization. 
+// - Failure: Ownership remains with the Controller, which must Finalize the item. +// 3. Processing (Processor): The Processor enqueues, manages, and eventually dispatches or rejects the item. +// 4. Finalization: The terminal outcome is set. This can happen: +// - Synchronously: The Processor determines the outcome (e.g., Dispatch, Capacity Rejection). +// - Asynchronously: The Controller observes the request's Context expiry (TTL/Cancellation) and calls Finalize. // -// - Direct Backpressure: If queues are full, `EnqueueAndWait` returns `types.ErrQueueAtCapacity`. This provides -// immediate, direct backpressure to the earliest point of contact. -// -// # Architectural Deep Dive: The Sharded Model & JSQ-Bytes -// -// The `FlowController` is built on a sharded architecture to enable parallel processing and prevent a central dispatch -// loop from becoming a bottleneck. The `FlowController` consists of a top-level manager and a pool of independent -// `internal.ShardProcessor` workers. The `contracts.FlowRegistry` guarantees that every logical flow is represented by -// a distinct queue instance on every active shard. -// -// This architecture trades deterministic global state for high throughput and scalability. The key challenge, and the -// system's most critical assumption, revolves around ensuring this distributed model can still achieve global fairness -// objectives. -// -// ## The Critical Assumption: Homogeneity Within Flows -// -// The effectiveness of the sharded model hinges on a critical assumption: while the system as a whole manages a -// heterogeneous set of flows, the traffic *within a single logical flow* is assumed to be roughly homogeneous in its -// characteristics. A logical flow is intended to represent a single workload or tenant; therefore, the most -// unpredictable variables (effecting decode behavior) are expected to be statistically similar *within* that flow. -// -// ## The Hedge: Join the Shortest Queue by Bytes (JSQ-Bytes) -// -// To make this assumption as robust as possible, the `FlowController` uses a "Join the Shortest Queue by Bytes -// (JSQ-Bytes)" algorithm. `ByteSize` is an excellent proxy for the resources the `FlowController` explicitly manages -// (host memory pressure and queuing capacity) and is also a reasonable proxy for prefill compute time. -// -// Crucially, the goal of the distributor is not to perfectly predict backend compute time, but to intelligently balance -// the load at the controller level. JSQ-Bytes achieves this by: -// -// 1. Reflecting True Load: It distributes work based on each shard's current queue size in bytes—a direct measure of -// its memory and capacity congestion. -// -// 2. Adapting to Congestion: The byte-size of a queue is a real-time signal of a shard's overall congestion. If a -// shard is slow (e.g., due to long-decoding downstream requests), its queues will remain full, and JSQ-Bytes will -// adaptively steer new work away. -// -// 3. Hedging Against Assumption Violations: This adaptive, self-correcting nature makes it a powerful hedge. It -// doesn't just distribute; it actively *load balances* based on the most relevant feedback available. -// -// # Architectural Deep Dive: Pre-Policy Gating -// -// Before policies are invoked, the `internal.ShardProcessor` applies an `internal.BandFilter`. This function determines -// which flows within a priority band are eligible for a given operation (e.g., dispatch). 
This pattern is a deliberate -// architectural choice to decouple the logic of *viability* from the logic of *selection*. -// -// - An `internal.BandFilter` (e.g., the `internal.NewSaturationFilter`) determines if a flow is viable based on -// external signals like backend load. -// - The `framework.InterFlowDispatchPolicy` then selects from among the viable candidates based on its own fairness -// logic. -// -// This abstraction provides two major benefits: -// -// 1. Low Contributor Burden: It makes the mental model for policy contributors significantly simpler. An author of a -// new fairness policy does not need to be concerned with the complexities of saturation detection or other gating -// concerns. They are given a simple, pre-filtered view of the world and can focus solely on their selection logic. -// -// 2. Correctness by Construction: The `internal.subsetPriorityBandAccessor` wrapper guarantees that a policy operates -// on a consistent, filtered view, regardless of which accessor method it calls (`FlowIDs`, `Queue`, etc.). This -// prevents an entire class of subtle bugs where a policy might otherwise see a stale or unfiltered view of the -// system state. +// The FlowItem uses atomic operations to safely coordinate the Finalization state across goroutines. package controller diff --git a/pkg/epp/flowcontrol/controller/internal/doc.go b/pkg/epp/flowcontrol/controller/internal/doc.go index 3f39b5791..0599d5387 100644 --- a/pkg/epp/flowcontrol/controller/internal/doc.go +++ b/pkg/epp/flowcontrol/controller/internal/doc.go @@ -14,34 +14,23 @@ See the License for the specific language governing permissions and limitations under the License. */ -// Package internal provides the core worker implementation for the `controller.FlowController`. +// Package internal provides the core worker implementation for the controller.FlowController. // -// The components in this package are the private, internal building blocks of the `controller` package. This separation -// enforces a clean public API at the `controller` level and allows the internal mechanics of the engine to evolve -// independently. +// The components in this package are the private, internal building blocks of the controller. This separation enforces +// a clean public API at the `controller` level and allows the internal mechanics of the engine to evolve independently. // -// # Design Philosophy: A Single-Writer Actor Model +// # Design Philosophy: The Single-Writer Actor Model // -// The concurrency model for this package is deliberately built around a single-writer, channel-based actor pattern, as -// implemented in the `ShardProcessor`. While a simple lock-based approach might seem easier, it is insufficient for the -// system's requirements. The "enqueue" operation is a complex, stateful transaction that requires a **hierarchical -// capacity check** against both the overall shard and a request's specific priority band. +// The concurrency model for this package is built around a single-writer, channel-based actor pattern, as implemented +// in the ShardProcessor. All state-mutating operations for a given shard (primarily enqueuing new requests) are +// funneled through a single Run goroutine. // -// A coarse, shard-wide lock would be required to make this transaction atomic, creating a major performance bottleneck -// and causing head-of-line blocking at the top-level `controller.FlowController`. 
The single-writer model, where all -// state mutations are funneled through a single goroutine, makes this transaction atomic *without locks*. +// This design makes complex, multi-step transactions (like a hierarchical capacity check against both a shard's total +// limit and a priority band's limit) inherently atomic without locks. This avoids the performance bottleneck of a +// coarse, shard-wide lock and allows the top-level Controller to remain decoupled and highly concurrent. // -// This design provides two critical benefits: -// 1. **Decoupling:** The `controller.FlowController` is decoupled via a non-blocking channel send, allowing for much -// higher throughput. -// 2. **Backpressure:** The state of the channel buffer serves as a high-fidelity, real-time backpressure signal, -// enabling more intelligent load balancing. +// # Key Components // -// # Future-Proofing for Complex Transactions -// -// This model's true power is that it provides a robust foundation for future features like **displacement** (a -// high-priority item evicting lower-priority ones). This is an "all-or-nothing" atomic transaction that is -// exceptionally difficult to implement correctly in a lock-free or coarse-grained locking model without significant -// performance penalties. The single-writer model contains the performance cost of such a potentially long transaction -// to the single `ShardProcessor`, preventing it from blocking the entire `controller.FlowController`. +// - ShardProcessor: The implementation of the worker actor. Manages the lifecycle of requests for a single shard. +// - FlowItem: The internal representation of a request, managing its state and synchronization across goroutines. package internal diff --git a/pkg/epp/flowcontrol/controller/internal/filter.go b/pkg/epp/flowcontrol/controller/internal/filter.go deleted file mode 100644 index 0a0669224..000000000 --- a/pkg/epp/flowcontrol/controller/internal/filter.go +++ /dev/null @@ -1,144 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package internal - -import ( - "context" - - "github.com/go-logr/logr" - - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/framework" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -// BandFilter is a function that acts as a pre-policy gate. It takes a complete view of a priority band and returns a -// potentially filtered `framework.PriorityBandAccessor` containing only the flows that are viable candidates for a -// subsequent policy decision. It can also return a boolean signal to pause the entire operation for the band. -// -// This abstraction decouples the logic of determining *viability* (e.g., is a flow subject to backpressure?) from the -// logic of *selection* (e.g., which of the viable flows is the fairest to pick next?). 
This separation simplifies the -// mental model for policy authors, who can focus solely on selection logic without needing to account for external -// gating signals. -// -// Because filters are applied before the band is passed to a policy, they are inherently composable. Multiple filters -// can be chained to apply different viability criteria. For example, a future filter could be developed to temporarily -// exclude a "misbehaving" flow that is causing repeated errors, quarantining it from policy consideration. -// -// A nil returned `PriorityBandAccessor` indicates that no filtering was necessary and the original accessor should be -// used. This provides a zero-allocation fast path for the common case where no flows are being filtered. -type BandFilter func( - ctx context.Context, - band framework.PriorityBandAccessor, - logger logr.Logger, -) (filteredBand framework.PriorityBandAccessor, shouldPause bool) - -// NewSaturationFilter creates a `BandFilter` that uses the provided `contracts.SaturationDetector` to determine which -// flows are dispatchable. This is the standard filter used in the production `FlowController` for the dispatch -// operation. -func NewSaturationFilter(sd contracts.SaturationDetector) BandFilter { - return func( - ctx context.Context, - band framework.PriorityBandAccessor, - logger logr.Logger, - ) (framework.PriorityBandAccessor, bool) { - // Phase 1: Implement the current global saturation check. - if sd.IsSaturated(ctx) { - logger.V(logutil.VERBOSE).Info("System saturated, pausing dispatch for this shard.") - return nil, true // Pause dispatching for all bands. - } - - // Phase 2 (Future): This is where per-flow saturation logic would go. - // It would iterate `band`, call `IsSaturated(ctx, flowID)`, and build a filtered map of allowed flows, - // then return `newSubsetPriorityBandAccessor(band, allowedFlows)`. - // For now, no per-flow filtering is done. Return nil to signal the fast path. - return nil, false // Do not pause, and do not filter any flows. - } -} - -// subsetPriorityBandAccessor provides a view of a priority band that is restricted to a specific subset of flows. -// It implements the `framework.PriorityBandAccessor` interface, ensuring that any policy operating on it will only -// see the allowed flows, regardless of which accessor method is used. This provides correctness by construction. -// -// For performance, it pre-computes a slice of the allowed flows at creation time, making subsequent calls to -// `FlowKeys()` an O(1) operation with zero allocations. -type subsetPriorityBandAccessor struct { - originalAccessor framework.PriorityBandAccessor - allowedFlows map[types.FlowKey]struct{} - allowedFlowsSlice []types.FlowKey -} - -var _ framework.PriorityBandAccessor = &subsetPriorityBandAccessor{} - -// newSubsetPriorityBandAccessor creates a new filtered view of a priority band. -func newSubsetPriorityBandAccessor(original framework.PriorityBandAccessor, allowed []types.FlowKey) *subsetPriorityBandAccessor { - // Pre-compute the map for efficient lookups in `Queue()` and `IterateQueues()`. - allowedMap := make(map[types.FlowKey]struct{}, len(allowed)) - for _, k := range allowed { - allowedMap[k] = struct{}{} - } - - return &subsetPriorityBandAccessor{ - originalAccessor: original, - allowedFlows: allowedMap, - allowedFlowsSlice: allowed, - } -} - -// Priority returns the numerical priority level of this band. 
-func (s *subsetPriorityBandAccessor) Priority() uint { - return s.originalAccessor.Priority() -} - -// PriorityName returns the human-readable name of this priority band. -func (s *subsetPriorityBandAccessor) PriorityName() string { - return s.originalAccessor.PriorityName() -} - -// FlowKeys returns a slice of the composite `types.FlowKey`s for every flow instance currently active within this -// priority band that are in the allowed subset. -// This is an O(1) operation because the slice is pre-computed at creation. -func (s *subsetPriorityBandAccessor) FlowKeys() []types.FlowKey { - return s.allowedFlowsSlice -} - -// Queue returns a `framework.FlowQueueAccessor` for the specified `ID` within this priority band, but only if it is -// in the allowed subset. This is an O(1) map lookup. If the flow is not in the allowed subset, it returns nil. -func (s *subsetPriorityBandAccessor) Queue(id string) framework.FlowQueueAccessor { - key := types.FlowKey{ID: id, Priority: s.Priority()} - if _, ok := s.allowedFlows[key]; !ok { - return nil - } - return s.originalAccessor.Queue(id) -} - -// IterateQueues executes the given `callback` for each `framework.FlowQueueAccessor` in the allowed subset of this -// priority band. The iteration stops if the callback returns false. -// This implementation delegates to the original accessor's iterator and applies the filter, which is more robust and -// efficient than iterating over a pre-computed slice of IDs. -func (s *subsetPriorityBandAccessor) IterateQueues(callback func(queue framework.FlowQueueAccessor) bool) { - s.originalAccessor.IterateQueues(func(queue framework.FlowQueueAccessor) bool { - if _, ok := s.allowedFlows[queue.FlowKey()]; ok { - // This queue is in the allowed set, so execute the callback. - if !callback(queue) { - return false // The callback requested to stop, so we stop the outer iteration too. - } - } - return true // Continue iterating over the original set. - }) -} diff --git a/pkg/epp/flowcontrol/controller/internal/filter_test.go b/pkg/epp/flowcontrol/controller/internal/filter_test.go deleted file mode 100644 index ceff9e83f..000000000 --- a/pkg/epp/flowcontrol/controller/internal/filter_test.go +++ /dev/null @@ -1,174 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package internal - -import ( - "context" - "sort" - "testing" - - "github.com/go-logr/logr" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts/mocks" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/framework" - frameworkmocks "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/framework/mocks" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types" -) - -func TestNewSaturationFilter(t *testing.T) { - t.Parallel() - - testCases := []struct { - name string - isSaturated bool - expectShouldPause bool - expectFilteredBandNil bool - }{ - { - name: "should not pause or filter when system is not saturated", - isSaturated: false, - expectShouldPause: false, - expectFilteredBandNil: true, // nil band signals the fast path - }, - { - name: "should pause when system is saturated", - isSaturated: true, - expectShouldPause: true, - expectFilteredBandNil: true, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - t.Parallel() - - // --- ARRANGE --- - mockSD := &mocks.MockSaturationDetector{IsSaturatedFunc: func(ctx context.Context) bool { return tc.isSaturated }} - filter := NewSaturationFilter(mockSD) - require.NotNil(t, filter, "NewSaturationFilter should not return nil") - - mockBand := &frameworkmocks.MockPriorityBandAccessor{} - - // --- ACT --- - filteredBand, shouldPause := filter(context.Background(), mockBand, logr.Discard()) - - // --- ASSERT --- - assert.Equal(t, tc.expectShouldPause, shouldPause, "The filter's pause signal should match the expected value") - - if tc.expectFilteredBandNil { - assert.Nil(t, filteredBand, "Expected filtered band to be nil") - } else { - assert.NotNil(t, filteredBand, "Expected a non-nil filtered band") - } - }) - } -} - -func TestSubsetPriorityBandAccessor(t *testing.T) { - t.Parallel() - - // --- ARRANGE --- - // Setup a mock original accessor that knows about three flows. - flowAKey := types.FlowKey{ID: "flow-a", Priority: 10} - flowBKey := types.FlowKey{ID: "flow-b", Priority: 10} - flowCKey := types.FlowKey{ID: "flow-c", Priority: 10} - - mockQueueA := &frameworkmocks.MockFlowQueueAccessor{FlowKeyV: flowAKey} - mockQueueB := &frameworkmocks.MockFlowQueueAccessor{FlowKeyV: flowBKey} - mockQueueC := &frameworkmocks.MockFlowQueueAccessor{FlowKeyV: flowCKey} - - originalAccessor := &frameworkmocks.MockPriorityBandAccessor{ - PriorityV: 10, - PriorityNameV: "High", - FlowKeysFunc: func() []types.FlowKey { - return []types.FlowKey{flowAKey, flowBKey, flowCKey} - }, - QueueFunc: func(id string) framework.FlowQueueAccessor { - switch id { - case "flow-a": - return mockQueueA - case "flow-b": - return mockQueueB - case "flow-c": - return mockQueueC - } - return nil - }, - IterateQueuesFunc: func(callback func(queue framework.FlowQueueAccessor) bool) { - if !callback(mockQueueA) { - return - } - if !callback(mockQueueB) { - return - } - callback(mockQueueC) - }, - } - - // Create a subset view that only allows two of the flows. 
- allowedFlows := []types.FlowKey{flowAKey, flowCKey} - subsetAccessor := newSubsetPriorityBandAccessor(originalAccessor, allowedFlows) - require.NotNil(t, subsetAccessor, "newSubsetPriorityBandAccessor should not return nil") - - t.Run("should pass through priority and name", func(t *testing.T) { - t.Parallel() - assert.Equal(t, uint(10), subsetAccessor.Priority(), "Priority() should pass through from the original accessor") - assert.Equal(t, "High", subsetAccessor.PriorityName(), - "PriorityName() should pass through from the original accessor") - }) - - t.Run("should only return allowed flow keys", func(t *testing.T) { - t.Parallel() - flowKeys := subsetAccessor.FlowKeys() - // Sort for consistent comparison, as the pre-computed slice order is not guaranteed. - sort.Slice(flowKeys, func(i, j int) bool { - return flowKeys[i].ID < flowKeys[j].ID - }) - assert.Equal(t, []types.FlowKey{flowAKey, flowCKey}, flowKeys, "FlowKeys() should only return the allowed subset") - }) - - t.Run("should only return queues for allowed flows", func(t *testing.T) { - t.Parallel() - assert.Same(t, mockQueueA, subsetAccessor.Queue("flow-a"), "Should return queue for allowed flow 'a'") - assert.Nil(t, subsetAccessor.Queue("flow-b"), "Should not return queue for disallowed flow 'b'") - assert.Same(t, mockQueueC, subsetAccessor.Queue("flow-c"), "Should return queue for allowed flow 'c'") - }) - - t.Run("should only iterate over allowed queues", func(t *testing.T) { - t.Parallel() - var iterated []string - subsetAccessor.IterateQueues(func(queue framework.FlowQueueAccessor) bool { - iterated = append(iterated, queue.FlowKey().ID) - return true - }) - // Sort for consistent comparison, as iteration order is not guaranteed. - sort.Strings(iterated) - assert.Equal(t, []string{"flow-a", "flow-c"}, iterated, "IterateQueues() should only visit allowed flows") - }) - - t.Run("should stop iteration if callback returns false", func(t *testing.T) { - t.Parallel() - var iterated []string - subsetAccessor.IterateQueues(func(queue framework.FlowQueueAccessor) bool { - iterated = append(iterated, queue.FlowKey().ID) - return false // Exit after the first item. - }) - assert.Len(t, iterated, 1, "Iteration should have stopped after one item") - }) -} diff --git a/pkg/epp/flowcontrol/controller/internal/item.go b/pkg/epp/flowcontrol/controller/internal/item.go index 86aeb8a0c..f0d5d3286 100644 --- a/pkg/epp/flowcontrol/controller/internal/item.go +++ b/pkg/epp/flowcontrol/controller/internal/item.go @@ -17,141 +17,177 @@ limitations under the License. package internal import ( + "context" + "errors" + "fmt" + "strconv" "sync" "sync/atomic" "time" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" ) -// flowItem is the internal representation of a request managed by the `FlowController`. It implements the -// `types.QueueItemAccessor` interface, which is the primary view of the item used by queue and policy implementations. -// It wraps the original `types.FlowControlRequest` and adds metadata for queuing, lifecycle management, and policy -// interaction. +// FinalState encapsulates the terminal outcome of a FlowItem's lifecycle. +type FinalState struct { + Outcome types.QueueOutcome + Err error +} + +// FlowItem is the internal representation of a request managed by the Flow Controller. 
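+// It implements types.QueueItemAccessor. A typical consumer waits for the terminal state via Done(); the following
+// is an illustrative sketch only (item and ctx stand in for the caller's values, and the cases may be combined with
+// any other select branches the caller needs):
+//
+//	select {
+//	case finalState := <-item.Done():
+//		// finalState.Outcome and finalState.Err describe the terminal state.
+//	case <-ctx.Done():
+//		// Caller-side cancellation; the item may still be finalized asynchronously.
+//	}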
+// +// # Lifecycle Management +// +// Finalization (determining outcome) can be initiated by the Controller (e.g., Context expiry) or the Processor (e.g., +// Dispatch/Reject). It sets the outcome and signals the waiting goroutine. // -// # Concurrency +// # Synchronization // -// The `finalize` method is the primary point of concurrency concern. It is designed to be atomic and idempotent through -// the use of `sync.Once`. This guarantees that an item's final outcome can be set exactly once, even if multiple -// goroutines (e.g., the main dispatch loop and the expiry cleanup loop) race to finalize it. All other fields are set -// at creation time and are not modified thereafter, making them safe for concurrent access. -type flowItem struct { - // enqueueTime is the timestamp when the item was logically accepted by the `FlowController`. - enqueueTime time.Time - // effectiveTTL is the actual time-to-live assigned to this item. - effectiveTTL time.Duration - // originalRequest is the underlying request object. +// Atomic operations synchronize state across the Controller and Processor goroutines: +// - finalState (atomic.Pointer): Safely publishes the outcome. +// - handle (atomic.Pointer): Safely publishes the queue admission status. +type FlowItem struct { + // --- Immutable fields during a single lifecycle --- + + enqueueTime time.Time + effectiveTTL time.Duration originalRequest types.FlowControlRequest - // handle is the unique identifier for this item within a specific queue instance. - handle types.QueueItemHandle - - // done is closed exactly once when the item is finalized (dispatched or evicted/rejected). - done chan struct{} - // err stores the final error state if the item was not successfully dispatched. - // It is written to exactly once, protected by `onceFinalize`. - err atomic.Value // Stores error - // outcome stores the final `types.QueueOutcome` of the item's lifecycle. - // It is written to exactly once, protected by `onceFinalize`. - outcome atomic.Value // Stores `types.QueueOutcome` - // onceFinalize ensures the `finalize()` logic is idempotent. + + // --- Synchronized State --- + + // handle stores the types.QueueItemHandle atomically. + // Written by the Processor (SetHandle) when admitted. + // Read by inferOutcome (called by Finalize) to infer the outcome (Rejected vs. Evicted). + // Distinguishing between pre-admission (Rejection) and post-admission (Eviction) during asynchronous finalization + // relies on whether this handle is nil or non-nil. + handle atomic.Pointer[types.QueueItemHandle] + + // finalState holds the result of the finalization. Stored atomically once. + // Use FinalState() for safe access. + finalState atomic.Pointer[FinalState] + + // --- Finalization Signaling --- + + // done is the channel used to signal the completion of the item's lifecycle. + // Buffered to size 1 to prevent Finalize from blocking. + done chan *FinalState + + // onceFinalize ensures the finalization logic runs exactly once per lifecycle. onceFinalize sync.Once } -// ensure flowItem implements the interface. -var _ types.QueueItemAccessor = &flowItem{} +var _ types.QueueItemAccessor = &FlowItem{} -// NewItem creates a new `flowItem`, which is the internal representation of a request inside the `FlowController`. -// This constructor is exported so that the parent `controller` package can create items to be passed into the -// `internal` package's processors. It initializes the item with a "NotYetFinalized" outcome and an open `done` channel. 
-func NewItem(req types.FlowControlRequest, effectiveTTL time.Duration, enqueueTime time.Time) *flowItem { - fi := &flowItem{ +// NewItem allocates and initializes a new FlowItem for a request lifecycle. +func NewItem(req types.FlowControlRequest, effectiveTTL time.Duration, enqueueTime time.Time) *FlowItem { + return &FlowItem{ enqueueTime: enqueueTime, effectiveTTL: effectiveTTL, originalRequest: req, - done: make(chan struct{}), + done: make(chan *FinalState, 1), } - // Initialize the outcome to its zero state. - fi.outcome.Store(types.QueueOutcomeNotYetFinalized) - return fi } -// EnqueueTime returns the time the item was logically accepted by the `FlowController` for queuing. This is used as the -// basis for TTL calculations. -func (fi *flowItem) EnqueueTime() time.Time { return fi.enqueueTime } +// EnqueueTime returns the time the item was logically accepted by the FlowController. +func (fi *FlowItem) EnqueueTime() time.Time { return fi.enqueueTime } -// EffectiveTTL returns the actual time-to-live assigned to this item by the `FlowController`. -func (fi *flowItem) EffectiveTTL() time.Duration { return fi.effectiveTTL } +// EffectiveTTL returns the actual time-to-live assigned to this item. +func (fi *FlowItem) EffectiveTTL() time.Duration { return fi.effectiveTTL } -// OriginalRequest returns the original, underlying `types.FlowControlRequest` object. -func (fi *flowItem) OriginalRequest() types.FlowControlRequest { return fi.originalRequest } +// OriginalRequest returns the original types.FlowControlRequest object. +func (fi *FlowItem) OriginalRequest() types.FlowControlRequest { return fi.originalRequest } -// Handle returns the `types.QueueItemHandle` that uniquely identifies this item within a specific queue instance. It -// returns nil if the item has not yet been added to a queue. -func (fi *flowItem) Handle() types.QueueItemHandle { return fi.handle } +// Done returns a read-only channel that will receive the FinalState pointer exactly once. +func (fi *FlowItem) Done() <-chan *FinalState { return fi.done } -// SetHandle associates a `types.QueueItemHandle` with this item. This method is called by a `framework.SafeQueue` -// implementation immediately after the item is added to the queue. -func (fi *flowItem) SetHandle(handle types.QueueItemHandle) { fi.handle = handle } +// FinalState returns the FinalState if the item has been finalized, or nil otherwise. +// Safe for concurrent access. +func (fi *FlowItem) FinalState() *FinalState { return fi.finalState.Load() } -// Done returns a channel that is closed when the item has been finalized (e.g., dispatched or evicted). -// This is the primary mechanism for consumers to wait for an item's outcome. It is designed to be used in a `select` -// statement, allowing the caller to simultaneously wait for other events, such as context cancellation. -// -// # Example Usage -// -// select { -// case <-item.Done(): -// outcome, err := item.FinalState() -// // ... handle outcome -// case <-ctx.Done(): -// // ... handle cancellation -// } -func (fi *flowItem) Done() <-chan struct{} { - return fi.done +// Handle returns the types.QueueItemHandle for this item within a queue. +// Returns nil if the item is not in a queue. Safe for concurrent access. +func (fi *FlowItem) Handle() types.QueueItemHandle { + ptr := fi.handle.Load() + if ptr == nil { + return nil + } + return *ptr } -// FinalState returns the terminal outcome and error for the item. 
-// -// CRITICAL: This method must only be called after the channel returned by `Done()` has been closed. Calling it before -// the item is finalized may result in a race condition where the final state has not yet been written. -func (fi *flowItem) FinalState() (types.QueueOutcome, error) { - outcomeVal := fi.outcome.Load() - errVal := fi.err.Load() - - var finalOutcome types.QueueOutcome - if oc, ok := outcomeVal.(types.QueueOutcome); ok { - finalOutcome = oc - } else { - // This case should not happen if finalize is always called correctly, but we default to a safe value. - finalOutcome = types.QueueOutcomeNotYetFinalized - } +// SetHandle associates a types.QueueItemHandle with this item. Called by the queue implementation (via Processor). +// Safe for concurrent access. +func (fi *FlowItem) SetHandle(handle types.QueueItemHandle) { fi.handle.Store(&handle) } - var finalErr error - if e, ok := errVal.(error); ok { - finalErr = e - } - return finalOutcome, finalErr +// Finalize determines the item's terminal state based on the provided cause (e.g., Context error) and the item's +// current admission status (queued or not). +// +// This method is intended for asynchronous finalization initiated by the Controller (e.g., TTL expiry). +// It is idempotent. +func (fi *FlowItem) Finalize(cause error) { + fi.onceFinalize.Do(func() { + // Atomically load the handle to determine if the item was admitted to a queue. + // This synchronization is critical for correctly inferring the outcome across goroutines. + isQueued := fi.Handle() != nil + outcome, finalErr := inferOutcome(cause, isQueued) + fi.finalizeInternal(outcome, finalErr) + }) } -// finalize sets the item's terminal state (`outcome`, `error`) and closes its `done` channel idempotently using -// `sync.Once`. This is the single, internal point where an item's lifecycle within the `FlowController` concludes. -func (fi *flowItem) finalize(outcome types.QueueOutcome, err error) { +// FinalizeWithOutcome sets the item's terminal state explicitly. +// +// This method is intended for synchronous finalization by the Processor (Dispatch, Reject) or the Controller +// (Distribution failure). +// It is idempotent. +func (fi *FlowItem) FinalizeWithOutcome(outcome types.QueueOutcome, err error) { fi.onceFinalize.Do(func() { - if err != nil { - fi.err.Store(err) - } - fi.outcome.Store(outcome) - close(fi.done) + fi.finalizeInternal(outcome, err) }) } -// isFinalized checks if the item has been finalized without blocking. It is used internally by the `ShardProcessor` as -// a defensive check to avoid operating on items that have already been completed. -func (fi *flowItem) isFinalized() bool { - select { - case <-fi.done: - return true +// finalizeInternal is the core finalization logic. It must be called within the sync.Once.Do block. +// It captures the state, stores it atomically, and signals the Done channel. +func (fi *FlowItem) finalizeInternal(outcome types.QueueOutcome, err error) { + finalState := &FinalState{ + Outcome: outcome, + Err: err, + } + + // Atomically store the pointer. This is the critical memory barrier that publishes the state safely. 
+ fi.finalState.Store(finalState) + + duration := time.Since(fi.enqueueTime) + flowKey := fi.originalRequest.FlowKey() + metrics.RecordFlowControlRequestQueueDuration(flowKey.ID, strconv.Itoa(flowKey.Priority), outcome.String(), duration) + + fi.done <- finalState + close(fi.done) +} + +// inferOutcome determines the correct QueueOutcome and Error based on the cause of finalization and whether the item +// was already admitted to a queue. +func inferOutcome(cause error, isQueued bool) (types.QueueOutcome, error) { + var specificErr error + var outcomeIfEvicted types.QueueOutcome + switch { + case errors.Is(cause, types.ErrTTLExpired) || errors.Is(cause, context.DeadlineExceeded): + specificErr = types.ErrTTLExpired + outcomeIfEvicted = types.QueueOutcomeEvictedTTL + case errors.Is(cause, context.Canceled): + specificErr = fmt.Errorf("%w: %w", types.ErrContextCancelled, cause) + outcomeIfEvicted = types.QueueOutcomeEvictedContextCancelled default: - return false + // Handle other potential causes (e.g., custom context errors). + specificErr = cause + outcomeIfEvicted = types.QueueOutcomeEvictedOther } + + if isQueued { + // The item was in the queue when it expired/cancelled. + return outcomeIfEvicted, fmt.Errorf("%w: %w", types.ErrEvicted, specificErr) + } + + // The item was not yet in the queue (e.g., buffered in enqueueChan). + // We treat this as a rejection, as it never formally consumed queue capacity. + return types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, specificErr) } diff --git a/pkg/epp/flowcontrol/controller/internal/item_test.go b/pkg/epp/flowcontrol/controller/internal/item_test.go index d50aaed41..9b7b627c2 100644 --- a/pkg/epp/flowcontrol/controller/internal/item_test.go +++ b/pkg/epp/flowcontrol/controller/internal/item_test.go @@ -18,6 +18,7 @@ package internal import ( "context" + "errors" "testing" "time" @@ -28,25 +29,208 @@ import ( typesmocks "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types/mocks" ) -func TestItem(t *testing.T) { +func TestFlowItem_New(t *testing.T) { t.Parallel() + req := typesmocks.NewMockFlowControlRequest(100, "req-1", types.FlowKey{}) - t.Run("should correctly set and get handle", func(t *testing.T) { - t.Parallel() - item := &flowItem{} - handle := &typesmocks.MockQueueItemHandle{} - item.SetHandle(handle) - assert.Same(t, handle, item.Handle(), "Handle() should retrieve the same handle instance set by SetHandle()") - }) - - t.Run("should have a non-finalized state upon creation", func(t *testing.T) { - t.Parallel() - key := types.FlowKey{ID: "flow-a", Priority: 10} - req := typesmocks.NewMockFlowControlRequest(100, "req-1", key, context.Background()) - item := NewItem(req, time.Minute, time.Now()) - require.NotNil(t, item, "NewItem should not return nil") - outcome, err := item.FinalState() - assert.Equal(t, types.QueueOutcomeNotYetFinalized, outcome, "A new item's outcome should be NotYetFinalized") - assert.NoError(t, err, "A new item should have a nil error") - }) + enqueueTime := time.Now() + item := NewItem(req, time.Minute, enqueueTime) + + require.NotNil(t, item, "NewItem should not return a nil item") + assert.Equal(t, enqueueTime, item.EnqueueTime(), "EnqueueTime should be populated") + assert.Equal(t, time.Minute, item.EffectiveTTL(), "EffectiveTTL should be populated") + assert.Same(t, req, item.OriginalRequest(), "OriginalRequest should be populated") + assert.Nil(t, item.FinalState(), "a new item must not have a final state") + select { + case <-item.Done(): + t.Fatal("Done() 
channel for a new item must block, but it was closed") + default: + // This is the expected path, as the channel would have blocked. + } +} + +func TestFlowItem_Handle(t *testing.T) { + t.Parallel() + item := &FlowItem{} + handle := &typesmocks.MockQueueItemHandle{} + item.SetHandle(handle) + assert.Same(t, handle, item.Handle(), "Handle() must retrieve the identical handle instance set by SetHandle()") +} + +func TestFlowItem_Finalize_Idempotency(t *testing.T) { + t.Parallel() + now := time.Now() + req := typesmocks.NewMockFlowControlRequest(100, "req-1", types.FlowKey{}) + + testCases := []struct { + name string + firstCall func(item *FlowItem) + secondCall func(item *FlowItem) + expectedOutcome types.QueueOutcome + expectedErrIs error + }{ + { + name: "Finalize then Finalize", + firstCall: func(item *FlowItem) { + item.Finalize(types.ErrTTLExpired) + }, + secondCall: func(item *FlowItem) { + item.Finalize(context.Canceled) + }, + expectedOutcome: types.QueueOutcomeRejectedOther, + expectedErrIs: types.ErrTTLExpired, + }, + { + name: "Finalize then FinalizeWithOutcome", + firstCall: func(item *FlowItem) { + item.Finalize(types.ErrTTLExpired) + }, + secondCall: func(item *FlowItem) { + item.FinalizeWithOutcome(types.QueueOutcomeDispatched, nil) + }, + expectedOutcome: types.QueueOutcomeRejectedOther, + expectedErrIs: types.ErrTTLExpired, + }, + { + name: "FinalizeWithOutcome then FinalizeWithOutcome", + firstCall: func(item *FlowItem) { + item.FinalizeWithOutcome(types.QueueOutcomeDispatched, nil) + }, + secondCall: func(item *FlowItem) { + item.FinalizeWithOutcome(types.QueueOutcomeRejectedCapacity, errors.New("rejected")) + }, + expectedOutcome: types.QueueOutcomeDispatched, + expectedErrIs: nil, + }, + { + name: "FinalizeWithOutcome then Finalize", + firstCall: func(item *FlowItem) { + item.FinalizeWithOutcome(types.QueueOutcomeDispatched, nil) + }, + secondCall: func(item *FlowItem) { + item.Finalize(types.ErrTTLExpired) + }, + expectedOutcome: types.QueueOutcomeDispatched, + expectedErrIs: nil, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + item := NewItem(req, time.Minute, now) + + // First call + tc.firstCall(item) + + // Second call + tc.secondCall(item) + + // Check FinalState() + finalState := item.FinalState() + require.NotNil(t, finalState, "FinalState should not be nil") + assert.Equal(t, tc.expectedOutcome, finalState.Outcome, "Outcome should match the first call") + if tc.expectedErrIs != nil { + assert.ErrorIs(t, finalState.Err, tc.expectedErrIs, "Error should match the first call") + } else { + assert.NoError(t, finalState.Err, "Error should be nil") + } + + // Check Done channel + select { + case state, ok := <-item.Done(): + require.True(t, ok, "Done channel should be readable") + assert.Equal(t, tc.expectedOutcome, state.Outcome, "Done channel outcome should match the first call") + if tc.expectedErrIs != nil { + assert.ErrorIs(t, state.Err, tc.expectedErrIs, "Done channel error should match the first call") + } else { + assert.NoError(t, state.Err, "Done channel error should be nil") + } + case <-time.After(50 * time.Millisecond): + t.Fatal("Done channel should have received the state") + } + }) + } +} + +func TestFlowItem_Finalize_InferOutcome(t *testing.T) { + t.Parallel() + now := time.Now() + + testCases := []struct { + name string + cause error + isQueued bool + expectOutcome types.QueueOutcome + expectErrIs error + }{ + { + name: "queued TTL expired", + cause: types.ErrTTLExpired, + isQueued: true, + 
expectOutcome: types.QueueOutcomeEvictedTTL, + expectErrIs: types.ErrTTLExpired, + }, + { + name: "queued context cancelled", + cause: context.Canceled, + isQueued: true, + expectOutcome: types.QueueOutcomeEvictedContextCancelled, + expectErrIs: types.ErrContextCancelled, + }, + { + name: "queued other error", + cause: errors.New("other cause"), + isQueued: true, + expectOutcome: types.QueueOutcomeEvictedOther, + expectErrIs: types.ErrEvicted, + }, + { + name: "not queued TTL expired", + cause: types.ErrTTLExpired, + isQueued: false, + expectOutcome: types.QueueOutcomeRejectedOther, + expectErrIs: types.ErrTTLExpired, + }, + { + name: "not queued context cancelled", + cause: context.Canceled, + isQueued: false, + expectOutcome: types.QueueOutcomeRejectedOther, + expectErrIs: types.ErrContextCancelled, + }, + { + name: "nil cause queued", + cause: nil, + isQueued: true, + expectOutcome: types.QueueOutcomeEvictedOther, + expectErrIs: types.ErrEvicted, + }, + { + name: "nil cause not queued", + cause: nil, + isQueued: false, + expectOutcome: types.QueueOutcomeRejectedOther, + expectErrIs: types.ErrRejected, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + req := typesmocks.NewMockFlowControlRequest(100, "req-1", types.FlowKey{}) + item := NewItem(req, time.Minute, now) + if tc.isQueued { + item.SetHandle(&typesmocks.MockQueueItemHandle{}) + } + + item.Finalize(tc.cause) + + finalState := item.FinalState() + require.NotNil(t, finalState, "FinalState should not be nil") + assert.Equal(t, tc.expectOutcome, finalState.Outcome, "Unexpected outcome") + require.Error(t, finalState.Err, "An error should be set") + assert.ErrorIs(t, finalState.Err, tc.expectErrIs, "Unexpected error type") + }) + } } diff --git a/pkg/epp/flowcontrol/controller/internal/processor.go b/pkg/epp/flowcontrol/controller/internal/processor.go index 7f9c8ee3a..2370fd646 100644 --- a/pkg/epp/flowcontrol/controller/internal/processor.go +++ b/pkg/epp/flowcontrol/controller/internal/processor.go @@ -26,7 +26,7 @@ import ( "time" "github.com/go-logr/logr" - "sigs.k8s.io/controller-runtime/pkg/log" + "k8s.io/utils/clock" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/framework" @@ -34,124 +34,144 @@ import ( logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) -const ( - // enqueueChannelBufferSize sets the size of the buffered channel that accepts incoming requests for the shard - // processor. This buffer acts as a "shock absorber," decoupling the upstream distributor from the processor's main - // loop and allowing the system to handle short, intense bursts of traffic without blocking the distributor. - enqueueChannelBufferSize = 100 +// maxCleanupWorkers caps the number of concurrent workers for background cleanup tasks. This prevents a single shard +// from overwhelming the Go scheduler with too many goroutines. +const maxCleanupWorkers = 4 - // maxCleanupWorkers caps the number of concurrent workers for background cleanup tasks. This prevents a single shard - // from overwhelming the Go scheduler with too many goroutines. - maxCleanupWorkers = 4 -) - -var ( - // errInterFlow is a sentinel error for failures during the inter-flow dispatch phase (e.g., a - // `framework.InterFlowDispatchPolicy` fails to select a queue). 
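The test matrix above pins down inferOutcome's two-axis contract: the cause picks the specific error, and admission status (handle set or not) picks Evicted versus Rejected. A compact standalone restatement of that contract, assuming Go 1.20+ for the dual %w wrapping; the sentinel variables are local stand-ins for the ones in the types package:

package main

import (
    "context"
    "errors"
    "fmt"
)

// Local stand-ins for the sentinels in pkg/epp/flowcontrol/types.
var (
    errTTLExpired       = errors.New("TTL expired")
    errContextCancelled = errors.New("context cancelled")
    errEvicted          = errors.New("evicted")
    errRejected         = errors.New("rejected")
)

// classify mirrors inferOutcome's shape: cause -> specific error, isQueued -> wrapper.
func classify(cause error, isQueued bool) error {
    specific := cause
    switch {
    case errors.Is(cause, errTTLExpired), errors.Is(cause, context.DeadlineExceeded):
        specific = errTTLExpired
    case errors.Is(cause, context.Canceled):
        specific = fmt.Errorf("%w: %w", errContextCancelled, cause)
    }
    if isQueued {
        return fmt.Errorf("%w: %w", errEvicted, specific) // admitted -> eviction
    }
    return fmt.Errorf("%w: %w", errRejected, specific) // never admitted -> rejection
}

func main() {
    for _, queued := range []bool{true, false} {
        err := classify(context.Canceled, queued)
        fmt.Printf("queued=%v -> evicted=%v rejected=%v\n",
            queued, errors.Is(err, errEvicted), errors.Is(err, errRejected))
    }
}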
- //
- // Strategy: When this error is encountered, the dispatch cycle aborts processing for the current priority band and
- // immediately moves to the next, promoting work conservation. A failure in one band should not halt progress in
- // others.
- errInterFlow = errors.New("inter-flow policy failure")
-
- // errIntraFlow is a sentinel error for failures *after* a specific flow's queue has been selected (e.g., a
- // `framework.IntraFlowDispatchPolicy` fails or a queue `Remove` fails).
- //
- // Strategy: When this error is encountered, the dispatch cycle aborts processing for the entire priority band for the
- // current cycle. This acts as a critical circuit breaker. A stateless inter-flow policy could otherwise repeatedly
- // select the same problematic queue in a tight loop of failures. Halting the band for one cycle prevents this.
- errIntraFlow = errors.New("intra-flow operation failure")
-)
+// ErrProcessorBusy is a sentinel error returned by the processor's Submit method indicating that the processor's
+// internal buffer is momentarily full and cannot accept new work.
+var ErrProcessorBusy = errors.New("shard processor is busy")

-// clock defines an interface for getting the current time, allowing for dependency injection in tests.
-type clock interface {
- Now() time.Time
-}
-
-// ShardProcessor is the core worker of the `controller.FlowController`. It is paired one-to-one with a
-// `contracts.RegistryShard` instance and is responsible for all request lifecycle operations on that shard, including
-// enqueueing, dispatching, and expiry cleanup. It acts as the "data plane" worker that executes against the
-// concurrent-safe state provided by its shard.
+// ShardProcessor is the core worker of the FlowController.
+//
+// It is paired one-to-one with a RegistryShard instance and is responsible for all request lifecycle operations on that
+// shard, from the point an item is successfully submitted to it.
//
-// For a full rationale on the single-writer concurrency model, see the package-level documentation in `doc.go`.
+// # Request Lifecycle Management & Ownership
//
-// # Concurrency Guarantees and Race Conditions
+// The ShardProcessor takes ownership of a FlowItem only after it has been successfully sent to its internal enqueueChan
+// via Submit or SubmitOrBlock (i.e., when these methods return nil).
+// Once the Processor takes ownership, it is solely responsible for ensuring that item.Finalize() or
+// item.FinalizeWithOutcome() is called exactly once for that item, under all circumstances (dispatch, rejection, sweep,
+// or shutdown).
//
-// This model provides two key guarantees:
+// If Submit or SubmitOrBlock returns an error, ownership remains with the caller (the Controller), which must then
+// handle the finalization.
//
-// 1. **Safe Enqueueing**: The `Run` method's goroutine has exclusive ownership of all operations that *add* items to
-// queues. This makes the "check-then-act" sequence in `enqueue` (calling `hasCapacity` then `managedQ.Add`)
-// inherently atomic from a writer's perspective, preventing capacity breaches. While the background
-// `runExpiryCleanup` goroutine can concurrently *remove* items, this is a benign race; a concurrent removal only
-// creates more available capacity, ensuring the `hasCapacity` check remains valid.
+// # Concurrency Model
//
-// 2. 
**Idempotent Finalization**: The primary internal race is between the main `dispatchCycle` and the background -// `runExpiryCleanup` goroutine, which might try to finalize the same `flowItem` simultaneously. This race is -// resolved by the `flowItem.finalize` method, which uses `sync.Once` to guarantee that only one of these goroutines -// can set the item's final state. +// To ensure correctness and high performance, the processor uses a single-goroutine, actor-based model. The main run +// loop is the sole writer for all state-mutating operations. This makes complex transactions (like capacity checks) +// inherently atomic without coarse-grained locks. type ShardProcessor struct { - shard contracts.RegistryShard - dispatchFilter BandFilter - clock clock - expiryCleanupInterval time.Duration - logger logr.Logger - - // enqueueChan is the entry point for new requests to be processed by this shard's `Run` loop. - enqueueChan chan *flowItem - // wg is used to wait for background tasks like expiry cleanup to complete on shutdown. + shard contracts.RegistryShard + saturationDetector contracts.SaturationDetector + clock clock.WithTicker + cleanupSweepInterval time.Duration + logger logr.Logger + + // lifecycleCtx controls the processor's lifetime. Monitored by Submit* methods for safe shutdown. + lifecycleCtx context.Context + + // enqueueChan is the entry point for new requests. + enqueueChan chan *FlowItem + + // wg is used to wait for background tasks (cleanup sweep) to complete on shutdown. wg sync.WaitGroup isShuttingDown atomic.Bool shutdownOnce sync.Once } -// NewShardProcessor creates a new `ShardProcessor` instance. +// NewShardProcessor creates a new ShardProcessor instance. func NewShardProcessor( + ctx context.Context, shard contracts.RegistryShard, - dispatchFilter BandFilter, - clock clock, - expiryCleanupInterval time.Duration, + saturationDetector contracts.SaturationDetector, + clock clock.WithTicker, + cleanupSweepInterval time.Duration, + enqueueChannelBufferSize int, logger logr.Logger, ) *ShardProcessor { return &ShardProcessor{ - shard: shard, - dispatchFilter: dispatchFilter, - clock: clock, - expiryCleanupInterval: expiryCleanupInterval, - logger: logger, - // A buffered channel decouples the processor from the distributor, allowing for a fast, asynchronous handoff of new - // requests. - enqueueChan: make(chan *flowItem, enqueueChannelBufferSize), + shard: shard, + saturationDetector: saturationDetector, + clock: clock, + cleanupSweepInterval: cleanupSweepInterval, + logger: logger, + lifecycleCtx: ctx, + enqueueChan: make(chan *FlowItem, enqueueChannelBufferSize), } } -// Run is the main operational loop for the shard processor. It must be run as a goroutine. +// Submit attempts a non-blocking handoff of an item to the processor's internal enqueue channel. +// +// Ownership Contract: +// - Returns nil: The item was successfully handed off. +// The ShardProcessor takes responsibility for calling Finalize on the item. +// - Returns error: The item was not handed off. +// Ownership of the FlowItem remains with the caller, who is responsible for calling Finalize. // -// # Loop Strategy: Interleaving Enqueue and Dispatch +// Possible errors: +// - ErrProcessorBusy: The processor's input channel is full. +// - types.ErrFlowControllerNotRunning: The processor is shutting down. +func (sp *ShardProcessor) Submit(item *FlowItem) error { + if sp.isShuttingDown.Load() { + return types.ErrFlowControllerNotRunning + } + select { // The default case makes this select non-blocking. 
+ case sp.enqueueChan <- item: + return nil // Ownership transferred. + case <-sp.lifecycleCtx.Done(): + return types.ErrFlowControllerNotRunning + default: + return ErrProcessorBusy + } +} + +// SubmitOrBlock performs a blocking handoff of an item to the processor's internal enqueue channel. +// It waits until the item is handed off, the caller's context is cancelled, or the processor shuts down. // -// The loop uses a `select` statement to interleave two primary tasks: -// 1. Accepting new requests from the `enqueueChan`. -// 2. Attempting to dispatch existing requests from queues via `dispatchCycle`. +// Ownership Contract: +// - Returns nil: The item was successfully handed off. +// The ShardProcessor takes responsibility for calling Finalize on the item. +// - Returns error: The item was not handed off. +// Ownership of the FlowItem remains with the caller, who is responsible for calling Finalize. // -// This strategy is crucial for balancing responsiveness and throughput. When a new item arrives, it is immediately -// enqueued, and a dispatch cycle is triggered. This gives high-priority new arrivals a chance to be dispatched quickly. -// When no new items are arriving, the loop's `default` case continuously calls `dispatchCycle` to drain the existing -// backlog, ensuring work continues. +// Possible errors: +// - ctx.Err(): The provided context was cancelled or its deadline exceeded. +// - types.ErrFlowControllerNotRunning: The processor is shutting down. +func (sp *ShardProcessor) SubmitOrBlock(ctx context.Context, item *FlowItem) error { + if sp.isShuttingDown.Load() { + return types.ErrFlowControllerNotRunning + } + + select { // The absence of a default case makes this call blocking. + case sp.enqueueChan <- item: + return nil // Ownership transferred. + case <-ctx.Done(): + return ctx.Err() + case <-sp.lifecycleCtx.Done(): + return types.ErrFlowControllerNotRunning + } +} + +// Run is the main operational loop for the shard processor. It must be run as a goroutine. +// It uses a `select` statement to interleave accepting new requests with dispatching existing ones, balancing +// responsiveness with throughput. func (sp *ShardProcessor) Run(ctx context.Context) { sp.logger.V(logutil.DEFAULT).Info("Shard processor run loop starting.") defer sp.logger.V(logutil.DEFAULT).Info("Shard processor run loop stopped.") sp.wg.Add(1) - go sp.runExpiryCleanup(ctx) + go sp.runCleanupSweep(ctx) // This is the main worker loop. It continuously processes incoming requests and dispatches queued requests until the // context is cancelled. The `select` statement has three cases: // // 1. Context Cancellation: The highest priority is shutting down. If the context's `Done` channel is closed, the // loop will drain all queues and exit. This is the primary exit condition. - // // 2. New Item Arrival: If an item is available on `enqueueChan`, it will be processed. This ensures that the // processor is responsive to new work. - // // 3. Default (Dispatch): If neither of the above cases is ready, the `default` case executes, ensuring the loop is // non-blocking. It continuously attempts to dispatch items from the existing backlog, preventing starvation and // ensuring queues are drained. @@ -175,8 +195,9 @@ func (sp *ShardProcessor) Run(ctx context.Context) { sp.enqueue(item) sp.dispatchCycle(ctx) default: + // If no new items are arriving, continuously try to dispatch from the backlog. 
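Read from the caller's side, the ownership contract above reduces to: finalize only when the handoff fails. A hedged sketch of how a Controller-style caller might combine the two methods (the busy fallback to SubmitOrBlock is illustrative, not necessarily the Controller's actual policy):

// distribute hands an item to a processor; on a failed handoff the caller
// still owns the item and must finalize it itself.
func distribute(ctx context.Context, sp *ShardProcessor, item *FlowItem) {
    err := sp.Submit(item)
    if errors.Is(err, ErrProcessorBusy) {
        // Busy: degrade to a blocking handoff bounded by the request's context.
        err = sp.SubmitOrBlock(ctx, item)
    }
    if err != nil {
        // Handoff failed; the item never reached a queue, so Finalize infers a
        // rejection outcome from the cause.
        item.Finalize(err)
    }
}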
if !sp.dispatchCycle(ctx) { - // If no work was done, yield to other goroutines to prevent a tight, busy-loop when idle, but allow for + // If no work was done, yield to the scheduler to prevent a tight, busy-loop when idle, while still allowing for // immediate rescheduling. runtime.Gosched() } @@ -184,91 +205,66 @@ func (sp *ShardProcessor) Run(ctx context.Context) { } } -// Enqueue sends a new flow item to the processor's internal channel for asynchronous processing by its main `Run` loop. -// If the processor is shutting down, it immediately finalizes the item with a shutdown error. -func (sp *ShardProcessor) Enqueue(item *flowItem) { - if sp.isShuttingDown.Load() { - item.finalize(types.QueueOutcomeRejectedOther, - fmt.Errorf("%w: %w", types.ErrRejected, types.ErrFlowControllerShutdown)) - return - } - sp.enqueueChan <- item -} - -// enqueue is the internal implementation for adding a new item to a managed queue. It is always run from the single -// main `Run` goroutine, making its "check-then-act" logic for capacity safe. -func (sp *ShardProcessor) enqueue(item *flowItem) { +// enqueue processes an item received from the enqueueChan. +// It handles capacity checks, checks for external finalization, and either admits the item to a queue or rejects it. +func (sp *ShardProcessor) enqueue(item *FlowItem) { req := item.OriginalRequest() key := req.FlowKey() - logger := log.FromContext(req.Context()).WithName("enqueue").WithValues( - "flowKey", key, - "flowID", key.ID, - "priority", key.Priority, - "reqID", req.ID(), - "reqByteSize", req.ByteSize(), - ) + // --- Optimistic External Finalization Check --- + // Check if the item was finalized by the Controller (due to TTL/cancellation) while it was buffered in enqueueChan. + // This is an optimistic check to avoid unnecessary processing on items already considered dead. + // The ultimate guarantee of cleanup for any races is the runCleanupSweep mechanism. + if finalState := item.FinalState(); finalState != nil { + sp.logger.V(logutil.TRACE).Info("Item finalized externally before processing, discarding.", + "outcome", finalState.Outcome, "err", finalState.Err, "flowKey", key, "reqID", req.ID()) + return + } + // --- Configuration Validation --- managedQ, err := sp.shard.ManagedQueue(key) if err != nil { finalErr := fmt.Errorf("configuration error: failed to get queue for flow key %s: %w", key, err) - logger.Error(finalErr, "Rejecting item.") - item.finalize(types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, finalErr)) + sp.logger.Error(finalErr, "Rejecting item.", "flowKey", key, "reqID", req.ID()) + item.FinalizeWithOutcome(types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, finalErr)) return } band, err := sp.shard.PriorityBandAccessor(key.Priority) if err != nil { finalErr := fmt.Errorf("configuration error: failed to get priority band for priority %d: %w", key.Priority, err) - logger.Error(finalErr, "Rejecting item.") - item.finalize(types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, finalErr)) + sp.logger.Error(finalErr, "Rejecting item.", "flowKey", key, "reqID", req.ID()) + item.FinalizeWithOutcome(types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, finalErr)) return } - logger = logger.WithValues("priorityName", band.PriorityName()) + // --- Capacity Check --- + // This check is safe because it is performed by the single-writer Run goroutine. if !sp.hasCapacity(key.Priority, req.ByteSize()) { - // This is an expected outcome, not a system error. 
Log at the default level with rich context. - stats := sp.shard.Stats() - bandStats := stats.PerPriorityBandStats[key.Priority] - logger.V(logutil.DEFAULT).Info("Rejecting request, queue at capacity", - "outcome", types.QueueOutcomeRejectedCapacity, - "shardTotalBytes", stats.TotalByteSize, - "shardCapacityBytes", stats.TotalCapacityBytes, - "bandTotalBytes", bandStats.ByteSize, - "bandCapacityBytes", bandStats.CapacityBytes, - ) - item.finalize(types.QueueOutcomeRejectedCapacity, fmt.Errorf("%w: %w", types.ErrRejected, types.ErrQueueAtCapacity)) + sp.logger.V(logutil.DEBUG).Info("Rejecting request, queue at capacity", + "flowKey", key, "reqID", req.ID(), "priorityName", band.PriorityName(), "reqByteSize", req.ByteSize()) + item.FinalizeWithOutcome(types.QueueOutcomeRejectedCapacity, fmt.Errorf("%w: %w", + types.ErrRejected, types.ErrQueueAtCapacity)) return } - // This is an optimistic check to prevent a needless add/remove cycle for an item that was finalized (e.g., context - // cancelled) during the handoff to this processor. A race condition still exists where an item can be finalized - // after this check but before the `Add` call completes. - // - // This is considered acceptable because: - // 1. The race window is extremely small. - // 2. The background `runExpiryCleanup` goroutine acts as the ultimate guarantor of correctness, as it will - // eventually find and evict any finalized item that slips through this check and is added to a queue. - if item.isFinalized() { - outcome, err := item.FinalState() - logger.V(logutil.VERBOSE).Info("Item finalized before adding to queue, ignoring.", "outcome", outcome, "err", err) - return - } - - // This is the point of commitment. After this call, the item is officially in the queue and is the responsibility of - // the dispatch or cleanup loops to finalize. + // --- Commitment Point --- + // The item is admitted. The ManagedQueue.Add implementation is responsible for calling item.SetHandle() atomically. if err := managedQ.Add(item); err != nil { finalErr := fmt.Errorf("failed to add item to queue for flow key %s: %w", key, err) - logger.Error(finalErr, "Rejecting item.") - item.finalize(types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, finalErr)) + sp.logger.Error(finalErr, "Rejecting item post-admission.", + "flowKey", key, "reqID", req.ID(), "priorityName", band.PriorityName()) + item.FinalizeWithOutcome(types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, finalErr)) return } - logger.V(logutil.TRACE).Info("Item enqueued.") + sp.logger.V(logutil.TRACE).Info("Item enqueued.", + "flowKey", key, "reqID", req.ID(), "priorityName", band.PriorityName()) } -// hasCapacity checks if the shard and the specific priority band have enough capacity to accommodate an item of a given -// size. -func (sp *ShardProcessor) hasCapacity(priority uint, itemByteSize uint64) bool { +// hasCapacity checks if the shard and the specific priority band have enough capacity. +// This check reflects actual resource utilization, including "zombie" items (finalized but unswept), to prevent +// physical resource overcommitment. +func (sp *ShardProcessor) hasCapacity(priority int, itemByteSize uint64) bool { if itemByteSize == 0 { return true } @@ -278,311 +274,201 @@ func (sp *ShardProcessor) hasCapacity(priority uint, itemByteSize uint64) bool { } bandStats, ok := stats.PerPriorityBandStats[priority] if !ok { - // This should not happen if the registry is consistent, but we fail closed just in case. 
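The capacity check above is a check-then-act sequence that is safe only because the Run goroutine is the sole admitter. The same idea in isolation, as a self-contained toy (admitReq and runAdmitter are hypothetical):

package main

import "fmt"

type admitReq struct {
    size  uint64
    reply chan bool
}

// runAdmitter serializes all check-then-act admissions on one goroutine, so
// "used+size <= capacity" cannot be invalidated by a concurrent admit.
func runAdmitter(capacity uint64, reqs <-chan admitReq) {
    var used uint64
    for r := range reqs {
        if used+r.size <= capacity {
            used += r.size // check and act are atomic w.r.t. other admits
            r.reply <- true
        } else {
            r.reply <- false
        }
    }
}

func main() {
    reqs := make(chan admitReq)
    go runAdmitter(250, reqs)
    for _, sz := range []uint64{100, 100, 100} {
        reply := make(chan bool, 1)
        reqs <- admitReq{size: sz, reply: reply}
        fmt.Println("admitted:", <-reply) // true, true, false
    }
}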
- return false + return false // Fail closed if configuration is inconsistent. } return bandStats.ByteSize+itemByteSize <= bandStats.CapacityBytes } -// dispatchCycle attempts to dispatch a single item by iterating through all priority bands from highest to lowest. +// dispatchCycle attempts to dispatch a single item by iterating through priority bands from highest to lowest. // It applies the configured policies for each band to select an item and then attempts to dispatch it. // It returns true if an item was successfully dispatched, and false otherwise. +// It enforces Head-of-Line (HoL) blocking if the selected item is saturated. // -// # Error Handling Philosophy -// -// The engine employs a robust, two-tiered error handling strategy to isolate failures and maximize system availability. -// This is managed via the `errInterFlow` and `errIntraFlow` sentinel errors. -// -// - Inter-Flow Failures: If a failure occurs while selecting a flow (e.g., the `InterFlowDispatchPolicy` fails), the -// processor aborts the *current priority band* and immediately moves to the next one. This promotes work -// conservation, ensuring a single misconfigured band does not halt progress for the entire system. +// # Work Conservation and Head-of-Line (HoL) Blocking // -// - Intra-Flow Failures: If a failure occurs *after* a flow has been selected (e.g., the `IntraFlowDispatchPolicy` -// fails), the processor aborts the *entire priority band* for the current cycle. This is a critical circuit -// breaker. An inter-flow policy that is not stateful with respect to past failures could otherwise repeatedly -// select the same problematic queue, causing a tight loop of failures. Halting the band for one cycle prevents -// this. +// The cycle attempts to be work-conserving by skipping bands where selection fails. +// However, if a selected item is saturated (cannot be scheduled), the cycle stops immediately. This enforces HoL +// blocking to respect the policy's decision and prevent priority inversion, where dispatching lower-priority work might +// exacerbate the saturation affecting the high-priority item. func (sp *ShardProcessor) dispatchCycle(ctx context.Context) bool { - baseLogger := sp.logger.WithName("dispatchCycle") - - // FUTURE EXTENSION POINT: The iteration over priority bands is currently a simple, strict-priority loop. - // This could be abstracted into a third policy tier (e.g., an `InterBandDispatchPolicy`) if more complex scheduling - // between bands, such as Weighted Fair Queuing (WFQ), is ever required. For now, strict priority is sufficient. for _, priority := range sp.shard.AllOrderedPriorityLevels() { originalBand, err := sp.shard.PriorityBandAccessor(priority) if err != nil { - baseLogger.Error(err, "Failed to get PriorityBandAccessor, skipping band", "priority", priority) + sp.logger.Error(err, "Failed to get PriorityBandAccessor, skipping band", "priority", priority) continue } - logger := baseLogger.WithValues("priority", priority, "priorityName", originalBand.PriorityName()) - // Apply the configured filter to get a view of only the dispatchable flows. - dispatchableBand, shouldPause := sp.dispatchFilter(ctx, originalBand, logger) - if shouldPause { - return false // A global gate told us to stop the entire cycle. - } - if dispatchableBand == nil { - // A nil return from the filter indicates the fast path: no filtering was needed. - dispatchableBand = originalBand - } - - // Pass the (potentially filtered) band to the policies. 
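The control flow described in the doc comment above distinguishes per-band failures (skip and continue) from saturation (halt the whole sweep). A toy model of just that branching, outside the real policy and registry machinery (band and sweep are invented names):

package main

import "fmt"

type band struct {
    name      string
    pick      func() (string, error) // stand-in for policy selection
    saturated bool
}

// sweep mirrors dispatchCycle's control flow: selection errors are per-band
// (continue), saturation is global (return false), success returns true.
func sweep(bands []band) bool {
    for _, b := range bands {
        item, err := b.pick()
        if err != nil || item == "" {
            continue // work conservation: try the next band
        }
        if b.saturated {
            return false // HoL blocking: do not dispatch lower-priority work
        }
        fmt.Println("dispatched", item, "from", b.name)
        return true
    }
    return false
}

func main() {
    fmt.Println(sweep([]band{
        {name: "critical", pick: func() (string, error) { return "req-1", nil }, saturated: true},
        {name: "standard", pick: func() (string, error) { return "req-2", nil }},
    })) // false: the saturated critical item blocks the standard band
}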
- item, err := sp.selectItem(dispatchableBand, logger)
+ item, err := sp.selectItem(originalBand)
 if err != nil {
- // The error handling strategy depends on the type of failure (inter- vs. intra-flow).
- if errors.Is(err, errIntraFlow) {
- logger.Error(err, "Intra-flow policy failure, skipping priority band for this cycle")
- } else {
- logger.Error(err, "Inter-flow policy or configuration failure, skipping priority band for this cycle")
- }
- continue
+ sp.logger.Error(err, "Failed to select item, skipping priority band for this cycle",
+ "priority", priority, "priorityName", originalBand.PriorityName())
+ continue // Continue to the next band to maximize work conservation.
 }
 if item == nil {
- // This is the common case where a priority band has no items to dispatch.
- logger.V(logutil.TRACE).Info("No item selected by dispatch policies, skipping band")
 continue
 }
- logger = logger.WithValues(
- "flowKey", item.OriginalRequest().FlowKey(),
- "flowID", item.OriginalRequest().FlowKey().ID,
- "flowPriority", item.OriginalRequest().FlowKey().Priority,
- "reqID", item.OriginalRequest().ID(),
- "reqByteSize", item.OriginalRequest().ByteSize())
-
- if err := sp.dispatchItem(item, logger); err != nil {
- // All errors from dispatchItem are considered intra-flow and unrecoverable for this band in this cycle.
- logger.Error(err, "Failed to dispatch item, skipping priority band for this cycle")
- continue
+
+ // --- Viability Check (Saturation/HoL Blocking) ---
+ req := item.OriginalRequest()
+ candidatePods := req.CandidatePodsForScheduling()
+ if sp.saturationDetector.IsSaturated(ctx, candidatePods) {
+ sp.logger.V(logutil.DEBUG).Info("Policy's chosen item is saturated; enforcing HoL blocking.",
+ "flowKey", req.FlowKey(), "reqID", req.ID(), "priorityName", originalBand.PriorityName())
+ // Stop the dispatch cycle entirely to respect the policy's strict-priority decision and prevent priority
+ // inversion where lower-priority work might exacerbate the saturation affecting high-priority work.
+ return false
+ }
+
+ // --- Dispatch ---
+ if err := sp.dispatchItem(item); err != nil {
+ sp.logger.Error(err, "Failed to dispatch item, skipping priority band for this cycle",
+ "flowKey", req.FlowKey(), "reqID", req.ID(), "priorityName", originalBand.PriorityName())
+ continue // Continue to the next band to maximize work conservation.
 }
- // A successful dispatch occurred, so we return true to signal that work was done.
 return true
 }
- // No items were dispatched in this cycle across all priority bands.
 return false
 }

-// selectItem applies the configured inter- and intra-flow dispatch policies to select a single item from a priority
-// band.
-func (sp *ShardProcessor) selectItem(
- band framework.PriorityBandAccessor,
- logger logr.Logger,
-) (types.QueueItemAccessor, error) {
+// selectItem applies the configured inter- and intra-flow dispatch policies to select a single item. 
+func (sp *ShardProcessor) selectItem(band framework.PriorityBandAccessor) (types.QueueItemAccessor, error) { interP, err := sp.shard.InterFlowDispatchPolicy(band.Priority()) if err != nil { - return nil, fmt.Errorf("%w: could not get InterFlowDispatchPolicy: %w", errInterFlow, err) + return nil, fmt.Errorf("could not get InterFlowDispatchPolicy: %w", err) } queue, err := interP.SelectQueue(band) if err != nil { - return nil, fmt.Errorf("%w: InterFlowDispatchPolicy %q failed to select queue: %w", - errInterFlow, interP.Name(), err) + return nil, fmt.Errorf("InterFlowDispatchPolicy %q failed to select queue: %w", interP.Name(), err) } if queue == nil { - logger.V(logutil.TRACE).Info("No queue selected by InterFlowDispatchPolicy") return nil, nil } key := queue.FlowKey() - logger = logger.WithValues( - "selectedFlowKey", key, - "selectedFlowID", key.ID, - "selectedFlowPriority", key.Priority) intraP, err := sp.shard.IntraFlowDispatchPolicy(key) if err != nil { - // This is an intra-flow failure because we have already successfully selected a queue. - return nil, fmt.Errorf("%w: could not get IntraFlowDispatchPolicy for flow %q: %w", errIntraFlow, key, err) + return nil, fmt.Errorf("could not get IntraFlowDispatchPolicy for flow %s: %w", key, err) } item, err := intraP.SelectItem(queue) if err != nil { - return nil, fmt.Errorf("%w: IntraFlowDispatchPolicy %q failed to select item for flow %q: %w", - errIntraFlow, intraP.Name(), key, err) - } - if item == nil { - logger.V(logutil.TRACE).Info("No item selected by IntraFlowDispatchPolicy") - return nil, nil + return nil, fmt.Errorf("IntraFlowDispatchPolicy %q failed to select item for flow %s: %w", intraP.Name(), key, err) } return item, nil } -// dispatchItem handles the final steps of dispatching an item after it has been selected by policies. This includes -// removing it from its queue, checking for last-minute expiry, and finalizing its outcome. -func (sp *ShardProcessor) dispatchItem(itemAcc types.QueueItemAccessor, logger logr.Logger) error { - logger = logger.WithName("dispatchItem") - +// dispatchItem handles the final steps of dispatching an item: removing it from the queue and finalizing its outcome. +func (sp *ShardProcessor) dispatchItem(itemAcc types.QueueItemAccessor) error { req := itemAcc.OriginalRequest() - // We must look up the queue by its specific priority, as a flow might have draining queues at other levels. - managedQ, err := sp.shard.ManagedQueue(req.FlowKey()) + key := req.FlowKey() + managedQ, err := sp.shard.ManagedQueue(key) if err != nil { - return fmt.Errorf("%w: failed to get ManagedQueue for flow %q: %w", errIntraFlow, req.FlowKey(), err) + return fmt.Errorf("failed to get ManagedQueue for flow %s: %w", key, err) } - // The core mutation: remove the item from the queue. removedItemAcc, err := managedQ.Remove(itemAcc.Handle()) if err != nil { - // This can happen benignly if the item was already removed by the expiry cleanup loop between the time it was - // selected by the policy and the time this function is called. - logger.V(logutil.VERBOSE).Info("Item already removed from queue, likely by expiry cleanup", "err", err) - return fmt.Errorf("%w: failed to remove item %q from queue for flow %q: %w", - errIntraFlow, req.ID(), req.FlowKey(), err) + // This happens benignly if the item was already removed by the cleanup sweep loop. + // We log it at a low level for visibility but return nil so the dispatch cycle proceeds. 
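selectItem above chains two policy tiers: the inter-flow policy narrows a band to one queue, and that queue's intra-flow policy picks the item. A reduced sketch of the same pipeline with stand-in interfaces (the real framework interfaces carry far more context than plain strings):

type queuePicker interface {
    SelectQueue(queues []string) (string, error) // inter-flow: which flow next
}

type itemPicker interface {
    SelectItem(queue string) (string, error) // intra-flow: which request next
}

// pickNext chains the two tiers; an empty result at the first tier means
// "nothing to dispatch" rather than an error.
func pickNext(qp queuePicker, ip itemPicker, queues []string) (string, error) {
    q, err := qp.SelectQueue(queues)
    if err != nil || q == "" {
        return "", err
    }
    return ip.SelectItem(q)
}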
+ sp.logger.V(logutil.DEBUG).Info("Failed to remove item during dispatch (likely already finalized and swept).", + "flowKey", key, "reqID", req.ID(), "error", err) + return nil } - removedItem, ok := removedItemAcc.(*flowItem) - if !ok { - // This indicates a severe logic error where a queue returns an item of an unexpected type. This violates a - // core system invariant: all items managed by the processor must be of type *flowItem. This is an unrecoverable - // state for this shard. - unexpectedItemErr := fmt.Errorf("%w: internal error: item %q of type %T is not a *flowItem", - errIntraFlow, removedItemAcc.OriginalRequest().ID(), removedItemAcc) - panic(unexpectedItemErr) - } - - // Final check for expiry/cancellation right before dispatch. - isExpired, outcome, expiryErr := checkItemExpiry(removedItem, sp.clock.Now()) - if isExpired { - // Ensure we always have a non-nil error to wrap for consistent logging and error handling. - finalErr := expiryErr - if finalErr == nil { - finalErr = errors.New("item finalized before dispatch") - } - logger.V(logutil.VERBOSE).Info("Item expired at time of dispatch, evicting", "outcome", outcome, - "err", finalErr) - removedItem.finalize(outcome, fmt.Errorf("%w: %w", types.ErrEvicted, finalErr)) - // Return an error to signal that the dispatch did not succeed. - return fmt.Errorf("%w: item %q expired before dispatch: %w", errIntraFlow, req.ID(), finalErr) - } - - // Finalize the item as dispatched. - removedItem.finalize(types.QueueOutcomeDispatched, nil) - logger.V(logutil.TRACE).Info("Item dispatched.") + removedItem := removedItemAcc.(*FlowItem) + sp.logger.V(logutil.TRACE).Info("Item dispatched.", "flowKey", req.FlowKey(), "reqID", req.ID()) + removedItem.FinalizeWithOutcome(types.QueueOutcomeDispatched, nil) return nil } -// checkItemExpiry checks if an item has been cancelled (via its context) or has exceeded its TTL. It returns true if -// the item is expired, along with the corresponding outcome and error. -// -// This function provides "defense in depth" against race conditions. It is the authoritative check that is called from -// multiple locations (the dispatch loop and the cleanup loop) to determine if an item should be evicted. Its first -// action is to check if the item has *already* been finalized by a competing goroutine, ensuring that the final outcome -// is decided exactly once. -func checkItemExpiry( - itemAcc types.QueueItemAccessor, - now time.Time, -) (isExpired bool, outcome types.QueueOutcome, err error) { - item, ok := itemAcc.(*flowItem) - if !ok { - // This indicates a severe logic error where a queue returns an item of an unexpected type. This violates a - // core system invariant: all items managed by the processor must be of type *flowItem. This is an unrecoverable - // state for this shard. - unexpectedItemErr := fmt.Errorf("internal error: item %q of type %T is not a *flowItem", - itemAcc.OriginalRequest().ID(), itemAcc) - panic(unexpectedItemErr) - } - - // This check is a critical defense against race conditions. If another goroutine (e.g., the cleanup loop) has - // already finalized this item, we must respect that outcome. - if item.isFinalized() { - outcome, err := item.FinalState() - return true, outcome, err - } - - // Check if the request's context has been cancelled. - if ctxErr := item.OriginalRequest().Context().Err(); ctxErr != nil { - return true, types.QueueOutcomeEvictedContextCancelled, fmt.Errorf("%w: %w", types.ErrContextCancelled, ctxErr) - } - - // Check if the item has outlived its TTL. 
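dispatchItem above tolerates a failed Remove because the cleanup sweep, introduced in the next hunk, may have already collected the item. The sweep itself is a predicate-driven batch removal; a minimal generic sketch of that shape (sweepFinalized is hypothetical, standing in for ManagedQueue.Cleanup):

// sweepFinalized removes every element matching pred and returns the removals,
// mirroring ManagedQueue.Cleanup's shape on a plain slice.
func sweepFinalized[T any](items []T, pred func(T) bool) (kept, removed []T) {
    for _, it := range items {
        if pred(it) {
            removed = append(removed, it)
        } else {
            kept = append(kept, it)
        }
    }
    return kept, removed
}

// Usage against FlowItems: a "zombie" is anything already finalized.
// kept, zombies := sweepFinalized(queue, func(it *FlowItem) bool {
//     return it.FinalState() != nil
// })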
- if item.EffectiveTTL() > 0 && now.Sub(item.EnqueueTime()) > item.EffectiveTTL() { - return true, types.QueueOutcomeEvictedTTL, types.ErrTTLExpired - } - - return false, types.QueueOutcomeNotYetFinalized, nil -} - -// runExpiryCleanup starts a background goroutine that periodically scans all queues on the shard for expired items. -func (sp *ShardProcessor) runExpiryCleanup(ctx context.Context) { +// runCleanupSweep starts a background goroutine that periodically scans all queues for externally finalized items +// ("zombie" items) and removes them in batches. +func (sp *ShardProcessor) runCleanupSweep(ctx context.Context) { defer sp.wg.Done() - logger := sp.logger.WithName("runExpiryCleanup") - logger.V(logutil.DEFAULT).Info("Shard expiry cleanup goroutine starting.") - defer logger.V(logutil.DEFAULT).Info("Shard expiry cleanup goroutine stopped.") + logger := sp.logger.WithName("runCleanupSweep") + logger.V(logutil.DEFAULT).Info("Shard cleanup sweep goroutine starting.") + defer logger.V(logutil.DEFAULT).Info("Shard cleanup sweep goroutine stopped.") - ticker := time.NewTicker(sp.expiryCleanupInterval) + ticker := sp.clock.NewTicker(sp.cleanupSweepInterval) defer ticker.Stop() for { select { case <-ctx.Done(): return - case now := <-ticker.C: - sp.cleanupExpired(now) + case <-ticker.C(): + sp.sweepFinalizedItems() } } } -// cleanupExpired performs a single scan of all queues on the shard, removing and finalizing any items that have -// expired due to TTL or context cancellation. -func (sp *ShardProcessor) cleanupExpired(now time.Time) { - processFn := func(managedQ contracts.ManagedQueue, queueLogger logr.Logger) { - // This predicate identifies items to be removed by the Cleanup call. - predicate := func(item types.QueueItemAccessor) bool { - isExpired, _, _ := checkItemExpiry(item, now) - return isExpired +// sweepFinalizedItems performs a single scan of all queues, removing finalized items in batch and releasing their +// memory. +func (sp *ShardProcessor) sweepFinalizedItems() { + processFn := func(managedQ contracts.ManagedQueue, logger logr.Logger) { + key := managedQ.FlowQueueAccessor().FlowKey() + predicate := func(itemAcc types.QueueItemAccessor) bool { + return itemAcc.(*FlowItem).FinalState() != nil } - removedItems, err := managedQ.Cleanup(predicate) if err != nil { - queueLogger.Error(err, "Error during ManagedQueue Cleanup") + logger.Error(err, "Error during ManagedQueue Cleanup", "flowKey", key) } - - // Finalize all the items that were removed. - sp.finalizeExpiredItems(removedItems, now, queueLogger) + logger.V(logutil.DEBUG).Info("Swept finalized items and released capacity.", + "flowKey", key, "count", len(removedItems)) } - sp.processAllQueuesConcurrently("cleanupExpired", processFn) + sp.processAllQueuesConcurrently("sweepFinalizedItems", processFn) } -// shutdown handles the graceful termination of the processor. It uses sync.Once to guarantee that the shutdown logic is -// executed exactly once, regardless of whether it's triggered by context cancellation or the closing of the enqueue -// channel. +// shutdown handles the graceful termination of the processor, ensuring all pending items (in channel and queues) are +// Finalized. func (sp *ShardProcessor) shutdown() { sp.shutdownOnce.Do(func() { - // Set the atomic bool so that any new calls to Enqueue will fail fast. sp.isShuttingDown.Store(true) sp.logger.V(logutil.DEFAULT).Info("Shard processor shutting down.") - // Drain the channel BEFORE closing it. 
This prevents a panic from any goroutine that is currently blocked trying to - // send to the channel. We read until it's empty. - DrainLoop: + DrainLoop: // Drain the enqueueChan to finalize buffered items. for { select { case item := <-sp.enqueueChan: - if item == nil { // This is a safeguard against logic errors in the distributor. + if item == nil { continue } - item.finalize(types.QueueOutcomeRejectedOther, - fmt.Errorf("%w: %w", types.ErrRejected, types.ErrFlowControllerShutdown)) + // Finalize buffered items. + item.FinalizeWithOutcome(types.QueueOutcomeRejectedOther, + fmt.Errorf("%w: %w", types.ErrRejected, types.ErrFlowControllerNotRunning)) default: - // The channel is empty, we can now safely close it. break DrainLoop } } - close(sp.enqueueChan) - - // Evict all remaining items from the queues. + // We do not close enqueueChan because external goroutines (Controller) send on it. + // The channel will be garbage collected when the processor terminates. sp.evictAll() }) } -// evictAll drains all queues on the shard and finalizes every item with a shutdown error. This is called when the -// processor is shutting down to ensure no requests are left in a pending state. +// evictAll drains all queues on the shard, finalizes every item, and releases their memory. func (sp *ShardProcessor) evictAll() { - processFn := func(managedQ contracts.ManagedQueue, queueLogger logr.Logger) { + processFn := func(managedQ contracts.ManagedQueue, logger logr.Logger) { + key := managedQ.FlowQueueAccessor().FlowKey() removedItems, err := managedQ.Drain() if err != nil { - queueLogger.Error(err, "Error during ManagedQueue Drain") + logger.Error(err, "Error during ManagedQueue Drain", "flowKey", key) } - // Finalize all the items that were removed. - getOutcome := func(_ types.QueueItemAccessor) (types.QueueOutcome, error) { - return types.QueueOutcomeEvictedOther, fmt.Errorf("%w: %w", types.ErrEvicted, types.ErrFlowControllerShutdown) + outcome := types.QueueOutcomeEvictedOther + errShutdown := fmt.Errorf("%w: %w", types.ErrEvicted, types.ErrFlowControllerNotRunning) + for _, i := range removedItems { + item, ok := i.(*FlowItem) + if !ok { + logger.Error(fmt.Errorf("internal error: unexpected type %T", i), + "Panic condition detected during shutdown", "flowKey", key) + continue + } + + // Finalization is idempotent; safe to call even if already finalized externally. + item.FinalizeWithOutcome(outcome, errShutdown) + logger.V(logutil.TRACE).Info("Item evicted during shutdown.", + "flowKey", key, "reqID", item.OriginalRequest().ID()) } - sp.finalizeItems(removedItems, queueLogger, getOutcome) } sp.processAllQueuesConcurrently("evictAll", processFn) } @@ -650,38 +536,3 @@ func (sp *ShardProcessor) processAllQueuesConcurrently( close(tasks) // Close the channel to signal workers to exit. wg.Wait() // Wait for all workers to finish. } - -// finalizeItems is a helper to iterate over a slice of items, safely cast them, and finalize them with an outcome -// determined by the `getOutcome` function. 
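The DrainLoop in shutdown above empties enqueueChan without closing it, since Submit callers may still be sending and closing the channel under them would panic. The idiom in isolation (drainPending is a hypothetical generic helper):

// drainPending empties ch without closing it; senders that lost the race are
// expected to check a shutdown flag (as Submit does) and stop on their own.
func drainPending[T any](ch chan T, onItem func(T)) {
    for {
        select {
        case v := <-ch:
            onItem(v)
        default:
            return // channel momentarily empty; safe to stop draining
        }
    }
}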
-func (sp *ShardProcessor) finalizeItems( - items []types.QueueItemAccessor, - logger logr.Logger, - getOutcome func(item types.QueueItemAccessor) (types.QueueOutcome, error), -) { - for _, i := range items { - item, ok := i.(*flowItem) - if !ok { - unexpectedItemErr := fmt.Errorf("internal error: item %q of type %T is not a *flowItem", - i.OriginalRequest().ID(), i) - logger.Error(unexpectedItemErr, "Panic condition detected during finalization", "item", i) - continue - } - - outcome, err := getOutcome(i) - item.finalize(outcome, err) - logger.V(logutil.TRACE).Info("Item finalized", "reqID", item.OriginalRequest().ID(), - "outcome", outcome, "err", err) - } -} - -// finalizeExpiredItems is a specialized version of finalizeItems for items that are known to be expired. It determines -// the precise reason for expiry and finalizes the item accordingly. -func (sp *ShardProcessor) finalizeExpiredItems(items []types.QueueItemAccessor, now time.Time, logger logr.Logger) { - getOutcome := func(item types.QueueItemAccessor) (types.QueueOutcome, error) { - // We don't need the `isExpired` boolean here because we know it's true, but this function conveniently returns the - // precise outcome and error. - _, outcome, expiryErr := checkItemExpiry(item, now) - return outcome, fmt.Errorf("%w: %w", types.ErrEvicted, expiryErr) - } - sp.finalizeItems(items, logger, getOutcome) -} diff --git a/pkg/epp/flowcontrol/controller/internal/processor_test.go b/pkg/epp/flowcontrol/controller/internal/processor_test.go index 67657a9a4..73fc5b13d 100644 --- a/pkg/epp/flowcontrol/controller/internal/processor_test.go +++ b/pkg/epp/flowcontrol/controller/internal/processor_test.go @@ -14,29 +14,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -// -// A Note on the Testing Strategy for `ShardProcessor` -// -// The `ShardProcessor` is a complex concurrent orchestrator. Testing it with concrete implementations would lead to -// flaky, non-deterministic tests. Therefore, we use a high-fidelity `testHarness` with stateful mocks to enable -// reliable and deterministic testing. This is a deliberate and necessary choice for several key reasons: -// -// 1. Deterministic Race Simulation: The harness allows us to pause mock execution at critical moments, making it -// possible to deterministically simulate and verify the processor's behavior during race conditions (e.g., the -// dispatch vs. expiry race). This is impossible with concrete implementations without resorting to unreliable -// sleeps. -// -// 2. Failure Mode Simulation: We can trigger specific, on-demand errors from dependencies to verify the processor's -// resilience and complex error-handling logic (e.g., the `errIntraFlow` circuit breaker). -// -// 3. Interaction and Isolation Testing: Mocks allow us to isolate the `ShardProcessor` from its dependencies. This -// ensures that tests are verifying the processor's orchestration logic (i.e., that it calls its dependencies -// correctly) and are not affected by confounding bugs in those dependencies. -// -// In summary, this is a prerequisite for reliably testing a concurrent engine, not just a simple data -// structure. 
-// - package internal import ( @@ -44,7 +21,7 @@ import ( "errors" "fmt" "os" - "slices" + "sort" "sync" "sync/atomic" "testing" @@ -53,9 +30,11 @@ import ( "github.com/go-logr/logr" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + testclock "k8s.io/utils/clock/testing" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts/mocks" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/framework" @@ -79,28 +58,6 @@ func TestMain(m *testing.M) { os.Exit(m.Run()) } -// mockClock allows for controlling time in tests. -type mockClock struct { - mu sync.Mutex - currentTime time.Time -} - -func newMockClock() *mockClock { - return &mockClock{currentTime: time.Now()} -} - -func (c *mockClock) Now() time.Time { - c.mu.Lock() - defer c.mu.Unlock() - return c.currentTime -} - -func (c *mockClock) Advance(d time.Duration) { - c.mu.Lock() - defer c.mu.Unlock() - c.currentTime = c.currentTime.Add(d) -} - // testHarness provides a unified, mock-based testing environment for the ShardProcessor. It centralizes all mock state // and provides helper methods for setting up tests and managing the processor's lifecycle. type testHarness struct { @@ -114,15 +71,16 @@ type testHarness struct { startSignal chan struct{} // Core components under test - processor *ShardProcessor - mockClock *mockClock - logger logr.Logger + processor *ShardProcessor + clock *testclock.FakeClock + logger logr.Logger + saturationDetector *mocks.MockSaturationDetector // --- Centralized Mock State --- // The harness's mutex protects the single source of truth for all mock state. mu sync.Mutex queues map[types.FlowKey]*mocks.MockManagedQueue - priorityFlows map[uint][]types.FlowKey // Key: `priority` + priorityFlows map[int][]types.FlowKey // Key: `priority` // Customizable policy logic for tests to override. interFlowPolicySelectQueue func(band framework.PriorityBandAccessor) (framework.FlowQueueAccessor, error) @@ -133,14 +91,16 @@ type testHarness struct { func newTestHarness(t *testing.T, expiryCleanupInterval time.Duration) *testHarness { t.Helper() h := &testHarness{ - t: t, - MockRegistryShard: &mocks.MockRegistryShard{}, - mockClock: newMockClock(), - logger: logr.Discard(), - startSignal: make(chan struct{}), - queues: make(map[types.FlowKey]*mocks.MockManagedQueue), - priorityFlows: make(map[uint][]types.FlowKey), + t: t, + MockRegistryShard: &mocks.MockRegistryShard{}, + clock: testclock.NewFakeClock(time.Now()), + logger: logr.Discard(), + saturationDetector: &mocks.MockSaturationDetector{}, + startSignal: make(chan struct{}), + queues: make(map[types.FlowKey]*mocks.MockManagedQueue), + priorityFlows: make(map[int][]types.FlowKey), } + h.ctx, h.cancel = context.WithCancel(context.Background()) // Wire up the harness to provide the mock implementations for the shard's dependencies. h.ManagedQueueFunc = h.managedQueue @@ -153,22 +113,24 @@ func newTestHarness(t *testing.T, expiryCleanupInterval time.Duration) *testHarn h.StatsFunc = func() contracts.ShardStats { return contracts.ShardStats{ TotalCapacityBytes: 1e9, - PerPriorityBandStats: map[uint]contracts.PriorityBandStats{ + PerPriorityBandStats: map[int]contracts.PriorityBandStats{ testFlow.Priority: {CapacityBytes: 1e9}, }, } } - // Use a default pass-through filter. 
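The harness now drives time through k8s.io/utils/clock/testing rather than a hand-rolled mock, and the same fake clock feeds the processor's cleanup ticker. A small standalone sketch of deterministic time-stepping (NewFakeClock, Step, and NewTicker are the library's real API; the select around them is illustrative):

package main

import (
    "fmt"
    "time"

    testclock "k8s.io/utils/clock/testing"
)

func main() {
    fc := testclock.NewFakeClock(time.Now())
    ticker := fc.NewTicker(time.Second) // fires only when the fake clock advances
    defer ticker.Stop()

    fc.Step(time.Second) // deterministically advance past one interval
    select {
    case <-ticker.C():
        fmt.Println("tick observed without any real-time sleep")
    case <-time.After(time.Second):
        fmt.Println("no tick (would indicate a wiring bug)")
    }
}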
- filter := func( - ctx context.Context, - band framework.PriorityBandAccessor, - logger logr.Logger, - ) (framework.PriorityBandAccessor, bool) { - return nil, false - } - h.processor = NewShardProcessor(h, filter, h.mockClock, expiryCleanupInterval, h.logger) + h.processor = NewShardProcessor( + h.ctx, + h, + h.saturationDetector, + h.clock, + expiryCleanupInterval, + 100, + h.logger) require.NotNil(t, h.processor, "NewShardProcessor should not return nil") + + t.Cleanup(func() { h.Stop() }) + return h } @@ -202,23 +164,22 @@ func (h *testHarness) Stop() { } // waitForFinalization blocks until an item is finalized or a timeout is reached. -func (h *testHarness) waitForFinalization(item *flowItem) (types.QueueOutcome, error) { +func (h *testHarness) waitForFinalization(item *FlowItem) (types.QueueOutcome, error) { h.t.Helper() select { - case <-item.Done(): - return item.FinalState() + case finalState := <-item.Done(): + return finalState.Outcome, finalState.Err case <-time.After(testWaitTimeout): h.t.Fatalf("Timed out waiting for item %q to be finalized", item.OriginalRequest().ID()) return types.QueueOutcomeNotYetFinalized, nil } } -// newTestItem creates a new flowItem for testing purposes. -func (h *testHarness) newTestItem(id string, key types.FlowKey, ttl time.Duration) *flowItem { +// newTestItem creates a new FlowItem for testing purposes. +func (h *testHarness) newTestItem(id string, key types.FlowKey, ttl time.Duration) *FlowItem { h.t.Helper() - ctx := log.IntoContext(context.Background(), h.logger) - req := typesmocks.NewMockFlowControlRequest(100, id, key, ctx) - return NewItem(req, ttl, h.mockClock.Now()) + req := typesmocks.NewMockFlowControlRequest(100, id, key) + return NewItem(req, ttl, h.clock.Now()) } // addQueue centrally registers a new mock queue for a given flow, ensuring all harness components are aware of it. @@ -226,13 +187,9 @@ func (h *testHarness) addQueue(key types.FlowKey) *mocks.MockManagedQueue { h.t.Helper() h.mu.Lock() defer h.mu.Unlock() - mockQueue := &mocks.MockManagedQueue{FlowKeyV: key} h.queues[key] = mockQueue - - // Add the key to the correct priority band, creating the band if needed. h.priorityFlows[key.Priority] = append(h.priorityFlows[key.Priority], key) - return mockQueue } @@ -249,20 +206,23 @@ func (h *testHarness) managedQueue(key types.FlowKey) (contracts.ManagedQueue, e } // allOrderedPriorityLevels provides the mock implementation for the `RegistryShard` interface. -func (h *testHarness) allOrderedPriorityLevels() []uint { +func (h *testHarness) allOrderedPriorityLevels() []int { h.mu.Lock() defer h.mu.Unlock() - prios := make([]uint, 0, len(h.priorityFlows)) + prios := make([]int, 0, len(h.priorityFlows)) for p := range h.priorityFlows { prios = append(prios, p) } - slices.Sort(prios) + sort.Slice(prios, func(i, j int) bool { + return prios[i] > prios[j] + }) + return prios } // priorityBandAccessor provides the mock implementation for the `RegistryShard` interface. It acts as a factory for a // fully-configured, stateless mock that is safe for concurrent use. -func (h *testHarness) priorityBandAccessor(p uint) (framework.PriorityBandAccessor, error) { +func (h *testHarness) priorityBandAccessor(p int) (framework.PriorityBandAccessor, error) { band := &frameworkmocks.MockPriorityBandAccessor{PriorityV: p} // Safely get a snapshot of the flow IDs under a lock. 
@@ -288,7 +248,7 @@ func (h *testHarness) priorityBandAccessor(p uint) (framework.PriorityBandAccess } // interFlowDispatchPolicy provides the mock implementation for the `contracts.RegistryShard` interface. -func (h *testHarness) interFlowDispatchPolicy(p uint) (framework.InterFlowDispatchPolicy, error) { +func (h *testHarness) interFlowDispatchPolicy(p int) (framework.InterFlowDispatchPolicy, error) { policy := &frameworkmocks.MockInterFlowDispatchPolicy{} // If the test provided a custom implementation, use it. if h.interFlowPolicySelectQueue != nil { @@ -331,9 +291,9 @@ func (h *testHarness) intraFlowDispatchPolicy(types.FlowKey) (framework.IntraFlo func TestShardProcessor(t *testing.T) { t.Parallel() - // Lifecycle tests use the processor's main `Run` loop to verify the complete end-to-end lifecycle of a request, from + // Integration tests use the processor's main `Run` loop to verify the complete end-to-end lifecycle of a request, from // `Enqueue` to its final outcome. - t.Run("Lifecycle", func(t *testing.T) { + t.Run("Integration", func(t *testing.T) { t.Parallel() t.Run("should dispatch item successfully", func(t *testing.T) { @@ -341,12 +301,11 @@ func TestShardProcessor(t *testing.T) { // --- ARRANGE --- h := newTestHarness(t, testCleanupTick) item := h.newTestItem("req-dispatch-success", testFlow, testTTL) - h.addQueue(types.FlowKey{ID: testFlow.ID, Priority: testFlow.Priority}) + h.addQueue(testFlow) // --- ACT --- h.Start() - defer h.Stop() - h.processor.Enqueue(item) + require.NoError(t, h.processor.Submit(item), "precondition: Submit should not fail") h.Go() // --- ASSERT --- @@ -362,15 +321,14 @@ func TestShardProcessor(t *testing.T) { item := h.newTestItem("req-capacity-reject", testFlow, testTTL) h.addQueue(testFlow) h.StatsFunc = func() contracts.ShardStats { - return contracts.ShardStats{PerPriorityBandStats: map[uint]contracts.PriorityBandStats{ + return contracts.ShardStats{PerPriorityBandStats: map[int]contracts.PriorityBandStats{ testFlow.Priority: {CapacityBytes: 50}, // 50 is less than item size of 100 }} } // --- ACT --- h.Start() - defer h.Stop() - h.processor.Enqueue(item) + require.NoError(t, h.processor.Submit(item), "precondition: Submit should not fail") h.Go() // --- ASSERT --- @@ -393,7 +351,7 @@ func TestShardProcessor(t *testing.T) { // --- ACT --- h.Start() defer h.Stop() - h.processor.Enqueue(item) + require.NoError(t, h.processor.Submit(item), "precondition: Submit should not fail") h.Go() // --- ASSERT --- @@ -413,94 +371,12 @@ func TestShardProcessor(t *testing.T) { // --- ACT --- h.Start() h.Go() - // Stop the processor, then immediately try to enqueue. - h.Stop() - h.processor.Enqueue(item) - - // --- ASSERT --- - outcome, err := h.waitForFinalization(item) - assert.Equal(t, types.QueueOutcomeRejectedOther, outcome, "The outcome should be RejectedOther") - require.Error(t, err, "An eviction on shutdown should produce an error") - assert.ErrorIs(t, err, types.ErrFlowControllerShutdown, "The error should be of type ErrFlowControllerShutdown") - }) - - t.Run("should evict item on TTL expiry via background cleanup", func(t *testing.T) { - t.Parallel() - // --- ARRANGE --- - h := newTestHarness(t, testCleanupTick) - item := h.newTestItem("req-expired-evict", testFlow, testShortTTL) - h.addQueue(testFlow) - - // --- ACT --- - h.Start() - defer h.Stop() - h.processor.Enqueue(item) - h.Go() - - // Let time pass for the item to expire and for the background cleanup to run. 
- h.mockClock.Advance(testShortTTL * 2) - time.Sleep(testCleanupTick * 3) // Allow the cleanup goroutine time to run. + h.Stop() // Stop the processor, then immediately try to enqueue. + require.ErrorIs(t, h.processor.Submit(item), types.ErrFlowControllerNotRunning, + "Submit should return ErrFlowControllerNotRunning on shutdown") // --- ASSERT --- - outcome, err := h.waitForFinalization(item) - assert.Equal(t, types.QueueOutcomeEvictedTTL, outcome, "The final outcome should be EvictedTTL") - require.Error(t, err, "A TTL eviction should produce an error") - assert.ErrorIs(t, err, types.ErrTTLExpired, "The error should be of type ErrTTLExpired") - }) - - t.Run("should evict item at moment of dispatch if TTL has expired", func(t *testing.T) { - t.Parallel() - // --- ARRANGE --- - h := newTestHarness(t, 1*time.Hour) // Disable background cleanup to isolate dispatch logic. - item := h.newTestItem("req-expired-dispatch-evict", testFlow, testShortTTL) - mockQueue := h.addQueue(testFlow) - require.NoError(t, mockQueue.Add(item), "Adding item to mock queue should not fail") - - // Have the policy select the item, but then advance time so it's expired by the time dispatchItem actually runs. - h.interFlowPolicySelectQueue = func(band framework.PriorityBandAccessor) (framework.FlowQueueAccessor, error) { - h.mockClock.Advance(testShortTTL * 2) - return mockQueue.FlowQueueAccessor(), nil - } - - // --- ACT --- - h.Start() - defer h.Stop() - h.Go() - - // The run loop will pick up the item and attempt dispatch, which will fail internally. - // We need a small sleep to allow the non-blocking run loop to process. - time.Sleep(50 * time.Millisecond) - - // --- ASSERT --- - outcome, err := h.waitForFinalization(item) - assert.Equal(t, types.QueueOutcomeEvictedTTL, outcome, "The final outcome should be EvictedTTL") - require.Error(t, err, "An eviction at dispatch time should produce an error") - assert.ErrorIs(t, err, types.ErrTTLExpired, "The error should be of type ErrTTLExpired") - }) - - t.Run("should evict item on context cancellation", func(t *testing.T) { - t.Parallel() - // --- ARRANGE --- - h := newTestHarness(t, testCleanupTick) - ctx, cancel := context.WithCancel(context.Background()) - req := typesmocks.NewMockFlowControlRequest(100, "req-ctx-cancel", testFlow, ctx) - item := NewItem(req, testTTL, h.mockClock.Now()) - h.addQueue(testFlow) - - // --- ACT --- - h.Start() - defer h.Stop() - h.processor.Enqueue(item) - h.Go() - cancel() // Cancel the context *after* the item is enqueued. - time.Sleep(testCleanupTick * 3) // Allow the cleanup goroutine time to run. 
- - // --- ASSERT --- - outcome, err := h.waitForFinalization(item) - assert.Equal(t, types.QueueOutcomeEvictedContextCancelled, outcome, - "The outcome should be EvictedContextCancelled") - require.Error(t, err, "A context cancellation eviction should produce an error") - assert.ErrorIs(t, err, types.ErrContextCancelled, "The error should be of type ErrContextCancelled") + assert.Nil(t, item.FinalState(), "Item should not be finalized by the processor") }) t.Run("should evict a queued item on shutdown", func(t *testing.T) { @@ -525,7 +401,8 @@ func TestShardProcessor(t *testing.T) { outcome, err := h.waitForFinalization(item) assert.Equal(t, types.QueueOutcomeEvictedOther, outcome, "The outcome should be EvictedOther") require.Error(t, err, "An eviction on shutdown should produce an error") - assert.ErrorIs(t, err, types.ErrFlowControllerShutdown, "The error should be of type ErrFlowControllerShutdown") + assert.ErrorIs(t, err, types.ErrFlowControllerNotRunning, + "The error should be of type ErrFlowControllerNotRunning") }) t.Run("should handle concurrent enqueues and dispatch all items", func(t *testing.T) { @@ -534,8 +411,8 @@ func TestShardProcessor(t *testing.T) { h := newTestHarness(t, testCleanupTick) const numConcurrentItems = 20 q := h.addQueue(testFlow) - itemsToTest := make([]*flowItem, 0, numConcurrentItems) - for i := 0; i < numConcurrentItems; i++ { + itemsToTest := make([]*FlowItem, 0, numConcurrentItems) + for i := range numConcurrentItems { item := h.newTestItem(fmt.Sprintf("req-concurrent-%d", i), testFlow, testTTL) itemsToTest = append(itemsToTest, item) } @@ -546,9 +423,9 @@ func TestShardProcessor(t *testing.T) { var wg sync.WaitGroup for _, item := range itemsToTest { wg.Add(1) - go func(fi *flowItem) { + go func(fi *FlowItem) { defer wg.Done() - h.processor.Enqueue(fi) + require.NoError(t, h.processor.Submit(fi), "Submit should not fail") }(item) } h.Go() @@ -576,16 +453,26 @@ func TestShardProcessor(t *testing.T) { // Use channels to pause the dispatch cycle right before it would remove the item. policyCanProceed := make(chan struct{}) itemIsBeingDispatched := make(chan struct{}) + var signalOnce sync.Once + var removedItem types.QueueItemAccessor require.NoError(t, q.Add(item)) // Add the item directly to the queue. // Override the queue's `RemoveFunc` to pause the dispatch goroutine at a critical moment. q.RemoveFunc = func(h types.QueueItemHandle) (types.QueueItemAccessor, error) { - close(itemIsBeingDispatched) // 1. Signal that dispatch is happening. - <-policyCanProceed // 2. Wait for the test to tell us to continue. - // 4. After we unblock, the item will have already been finalized by the cleanup logic, so we simulate the - // real-world outcome of a failed remove. - return nil, fmt.Errorf("item with handle %v not found", h) + var err error + signalOnce.Do(func() { + removedItem = item + close(itemIsBeingDispatched) // 1. Signal that dispatch is happening. + <-policyCanProceed // 2. Wait for the test to tell us to continue. + // 4. After we unblock, the item will have already been finalized by the cleanup logic. + // We simulate the item no longer being found. + err = fmt.Errorf("item with handle %v not found", h) + }) + if removedItem == item { + return item, nil // Return the item on the first call + } + return nil, err // Return error on subsequent calls } // --- ACT --- @@ -594,20 +481,23 @@ func TestShardProcessor(t *testing.T) { h.Go() // Wait for the dispatch cycle to select our item and pause inside our mock `RemoveFunc`. 
-			<-itemIsBeingDispatched
+			select {
+			case <-itemIsBeingDispatched:
+			case <-time.After(testWaitTimeout):
+				t.Fatal("Timed out waiting for item to be dispatched")
+			}
 
 			// 3. The dispatch goroutine is now paused. We can now safely win the "race" by running cleanup logic.
-			h.mockClock.Advance(testShortTTL * 2)
-			h.processor.cleanupExpired(h.mockClock.Now()) // This will remove and finalize the item.
+			h.clock.Step(testShortTTL * 2)
+			item.Finalize(types.ErrTTLExpired) // This finalizes the item with EvictedTTL, matching the assertions below.
 
-			// 5. Un-pause the dispatch goroutine. It will now fail to remove the item and the `dispatchCycle` will
-			// correctly conclude without finalizing the item a second time.
+			// 5. Un-pause the dispatch goroutine.
 			close(policyCanProceed)
 
 			// --- ASSERT ---
-			// The item's final state should be from the cleanup logic (EvictedTTL), not the dispatch logic.
+			// The item's final state should be from the Finalize call above.
 			outcome, err := h.waitForFinalization(item)
-			assert.Equal(t, types.QueueOutcomeEvictedTTL, outcome, "The outcome should be EvictedTTL from the cleanup routine")
+			assert.Equal(t, types.QueueOutcomeEvictedTTL, outcome, "The outcome should be EvictedTTL from the Finalize call")
 			require.Error(t, err, "A TTL eviction should produce an error")
 			assert.ErrorIs(t, err, types.ErrTTLExpired, "The error should be of type ErrTTLExpired")
 		})
@@ -647,7 +537,7 @@ func TestShardProcessor(t *testing.T) {
 			h.Start()
 			defer h.Stop()
 			h.Go()
-			h.processor.Enqueue(nil)
+			require.NoError(t, h.processor.Submit(nil), "Submit should not fail")
 
 			// --- ASSERT ---
 			// Allow a moment for the processor to potentially process the nil item.
@@ -666,32 +556,32 @@ func TestShardProcessor(t *testing.T) {
 		testCases := []struct {
 			name         string
 			setupHarness func(h *testHarness)
-			item         *flowItem
-			assert       func(t *testing.T, h *testHarness, item *flowItem)
+			item         *FlowItem
+			assert       func(t *testing.T, h *testHarness, item *FlowItem)
 		}{
 			{
 				name: "should reject item on registry queue lookup failure",
 				setupHarness: func(h *testHarness) {
 					h.ManagedQueueFunc = func(types.FlowKey) (contracts.ManagedQueue, error) { return nil, testErr }
 				},
-				assert: func(t *testing.T, h *testHarness, item *flowItem) {
-					outcome, err := item.FinalState()
-					assert.Equal(t, types.QueueOutcomeRejectedOther, outcome, "Outcome should be RejectedOther")
-					require.Error(t, err, "An error should be returned")
-					assert.ErrorIs(t, err, testErr, "The underlying error should be preserved")
+				assert: func(t *testing.T, h *testHarness, item *FlowItem) {
+					assert.Equal(t, types.QueueOutcomeRejectedOther, item.FinalState().Outcome,
+						"Outcome should be RejectedOther")
+					require.Error(t, item.FinalState().Err, "An error should be returned")
+					assert.ErrorIs(t, item.FinalState().Err, testErr, "The underlying error should be preserved")
 				},
 			},
 			{
 				name: "should reject item on registry priority band lookup failure",
 				setupHarness: func(h *testHarness) {
 					h.addQueue(testFlow)
-					h.PriorityBandAccessorFunc = func(uint) (framework.PriorityBandAccessor, error) { return nil, testErr }
+					h.PriorityBandAccessorFunc = func(int) (framework.PriorityBandAccessor, error) { return nil, testErr }
 				},
-				assert: func(t *testing.T, h *testHarness, item *flowItem) {
-					outcome, err := item.FinalState()
-					assert.Equal(t, types.QueueOutcomeRejectedOther, outcome, "Outcome should be RejectedOther")
-					require.Error(t, err, "An error should be returned")
-					assert.ErrorIs(t, err, testErr, "The underlying error should be preserved")
+				assert: func(t *testing.T, h *testHarness, item
*FlowItem) { + assert.Equal(t, types.QueueOutcomeRejectedOther, item.FinalState().Outcome, + "Outcome should be RejectedOther") + require.Error(t, item.FinalState().Err, "An error should be returned") + assert.ErrorIs(t, item.FinalState().Err, testErr, "The underlying error should be preserved") }, }, { @@ -700,11 +590,11 @@ func TestShardProcessor(t *testing.T) { mockQueue := h.addQueue(testFlow) mockQueue.AddFunc = func(types.QueueItemAccessor) error { return testErr } }, - assert: func(t *testing.T, h *testHarness, item *flowItem) { - outcome, err := item.FinalState() - assert.Equal(t, types.QueueOutcomeRejectedOther, outcome, "Outcome should be RejectedOther") - require.Error(t, err, "An error should be returned") - assert.ErrorIs(t, err, testErr, "The underlying error should be preserved") + assert: func(t *testing.T, h *testHarness, item *FlowItem) { + assert.Equal(t, types.QueueOutcomeRejectedOther, item.FinalState().Outcome, + "Outcome should be RejectedOther") + require.Error(t, item.FinalState().Err, "An error should be returned") + assert.ErrorIs(t, item.FinalState().Err, testErr, "The underlying error should be preserved") }, }, { @@ -721,17 +611,16 @@ func TestShardProcessor(t *testing.T) { assert.Equal(t, 0, addCallCount, "Queue.Add should not have been called for a finalized item") }) }, - item: func() *flowItem { + item: func() *FlowItem { // Create a pre-finalized item. item := newTestHarness(t, 0).newTestItem("req-finalized", testFlow, testTTL) - item.finalize(types.QueueOutcomeDispatched, nil) + item.FinalizeWithOutcome(types.QueueOutcomeDispatched, nil) return item }(), - assert: func(t *testing.T, h *testHarness, item *flowItem) { + assert: func(t *testing.T, h *testHarness, item *FlowItem) { // The item was already finalized, so its state should not change. - outcome, err := item.FinalState() - assert.Equal(t, types.QueueOutcomeDispatched, outcome, "Outcome should remain unchanged") - assert.NoError(t, err, "Error should remain unchanged") + assert.Equal(t, types.QueueOutcomeDispatched, item.FinalState().Outcome, "Outcome should remain unchanged") + assert.NoError(t, item.FinalState().Err, "Error should remain unchanged") }, }, } @@ -776,7 +665,7 @@ func TestShardProcessor(t *testing.T) { itemByteSize: 1, stats: contracts.ShardStats{ TotalCapacityBytes: 200, TotalByteSize: 100, - PerPriorityBandStats: map[uint]contracts.PriorityBandStats{ + PerPriorityBandStats: map[int]contracts.PriorityBandStats{ testFlow.Priority: {ByteSize: 50, CapacityBytes: 50}, }, }, @@ -787,7 +676,7 @@ func TestShardProcessor(t *testing.T) { itemByteSize: 1, stats: contracts.ShardStats{ TotalCapacityBytes: 200, TotalByteSize: 100, - PerPriorityBandStats: map[uint]contracts.PriorityBandStats{}, // Missing stats for priority 10 + PerPriorityBandStats: map[int]contracts.PriorityBandStats{}, // Missing stats for priority 10 }, expectHasCap: false, }, @@ -796,7 +685,7 @@ func TestShardProcessor(t *testing.T) { itemByteSize: 10, stats: contracts.ShardStats{ TotalCapacityBytes: 200, TotalByteSize: 100, - PerPriorityBandStats: map[uint]contracts.PriorityBandStats{ + PerPriorityBandStats: map[int]contracts.PriorityBandStats{ testFlow.Priority: {ByteSize: 50, CapacityBytes: 100}, }, }, @@ -836,17 +725,19 @@ func TestShardProcessor(t *testing.T) { expectDidDispatch: false, }, { - name: "should stop dispatching when filter signals pause", + name: "should block dispatch on HoL saturation", setupHarness: func(h *testHarness) { - // Add an item that *could* be dispatched to prove the pause is effective. 
- q := h.addQueue(testFlow) - require.NoError(t, q.Add(h.newTestItem("item", testFlow, testTTL))) - h.processor.dispatchFilter = func( - _ context.Context, - _ framework.PriorityBandAccessor, - _ logr.Logger, - ) (framework.PriorityBandAccessor, bool) { - return nil, true // Signal pause. + // Add a high-priority item that will be selected but is saturated. + qHigh := h.addQueue(testFlow) // priority 10 + require.NoError(t, qHigh.Add(h.newTestItem("item-high", testFlow, testTTL))) + + // Add a low-priority, viable item. + keyLow := types.FlowKey{ID: "flow-low", Priority: 5} + qLow := h.addQueue(keyLow) + require.NoError(t, qLow.Add(h.newTestItem("item-low", keyLow, testTTL))) + + h.saturationDetector.IsSaturatedFunc = func(_ context.Context, _ []metrics.PodMetrics) bool { + return true } }, expectDidDispatch: false, @@ -854,7 +745,7 @@ func TestShardProcessor(t *testing.T) { { name: "should skip band on priority band accessor error", setupHarness: func(h *testHarness) { - h.PriorityBandAccessorFunc = func(uint) (framework.PriorityBandAccessor, error) { + h.PriorityBandAccessorFunc = func(int) (framework.PriorityBandAccessor, error) { return nil, registryErr } }, @@ -955,62 +846,18 @@ func TestShardProcessor(t *testing.T) { } }) - t.Run("should use filtered view of queues when filter is active", func(t *testing.T) { - t.Parallel() - // --- ARRANGE --- - h := newTestHarness(t, testCleanupTick) - flowB := types.FlowKey{ID: "flow-b", Priority: testFlow.Priority} - h.addQueue(testFlow) - qB := h.addQueue(flowB) - itemB := h.newTestItem("item-b", flowB, testTTL) - require.NoError(t, qB.Add(itemB)) - - // This filter only allows flow-b. - h.processor.dispatchFilter = func( - _ context.Context, - originalBand framework.PriorityBandAccessor, - _ logr.Logger, - ) (framework.PriorityBandAccessor, bool) { - return newSubsetPriorityBandAccessor(originalBand, []types.FlowKey{flowB}), false - } - - // This policy will be given the filtered view, so it should only see flow-b. - h.interFlowPolicySelectQueue = func(band framework.PriorityBandAccessor) (framework.FlowQueueAccessor, error) { - var flowIDs []string - band.IterateQueues(func(fqa framework.FlowQueueAccessor) bool { - flowIDs = append(flowIDs, fqa.FlowKey().ID) - return true - }) - // This is the core assertion of the test. - require.ElementsMatch(t, []string{flowB.ID}, flowIDs, "Policy should only see the filtered flow") - - // Select flow-b to prove the chain works. 
- q, _ := h.managedQueue(flowB) - return q.FlowQueueAccessor(), nil - } - - // --- ACT --- - dispatched := h.processor.dispatchCycle(context.Background()) - - // --- ASSERT --- - assert.True(t, dispatched, "An item should have been dispatched from the filtered flow") - outcome, err := itemB.FinalState() - assert.Equal(t, types.QueueOutcomeDispatched, outcome, "The dispatched item's outcome should be correct") - assert.NoError(t, err, "The dispatched item should not have an error") - }) - t.Run("should guarantee strict priority by starving lower priority items", func(t *testing.T) { t.Parallel() // --- ARRANGE --- h := newTestHarness(t, testCleanupTick) - keyHigh := types.FlowKey{ID: "flow-high", Priority: 10} - keyLow := types.FlowKey{ID: "flow-low", Priority: 20} + keyHigh := types.FlowKey{ID: "flow-high", Priority: 20} + keyLow := types.FlowKey{ID: "flow-low", Priority: 10} qHigh := h.addQueue(keyHigh) qLow := h.addQueue(keyLow) const numItems = 3 - highPrioItems := make([]*flowItem, numItems) - lowPrioItems := make([]*flowItem, numItems) + highPrioItems := make([]*FlowItem, numItems) + lowPrioItems := make([]*FlowItem, numItems) for i := range numItems { // Add high priority items. itemH := h.newTestItem(fmt.Sprintf("req-high-%d", i), keyHigh, testTTL) @@ -1032,9 +879,9 @@ func TestShardProcessor(t *testing.T) { // Verify all high-priority items are gone and low-priority items remain. for _, item := range highPrioItems { - outcome, err := item.FinalState() - assert.Equal(t, types.QueueOutcomeDispatched, outcome, "High-priority item should be dispatched") - assert.NoError(t, err, "Dispatched high-priority item should not have an error") + assert.Equal(t, types.QueueOutcomeDispatched, item.FinalState().Outcome, + "High-priority item should be dispatched") + assert.NoError(t, item.FinalState().Err, "Dispatched high-priority item should not have an error") } assert.Equal(t, numItems, qLow.Len(), "Low-priority queue should still be full") @@ -1066,19 +913,6 @@ func TestShardProcessor(t *testing.T) { }, expectedErr: registryErr, }, - { - name: "on queue remove failure", - setupMocks: func(h *testHarness) { - h.ManagedQueueFunc = func(types.FlowKey) (contracts.ManagedQueue, error) { - return &mocks.MockManagedQueue{ - RemoveFunc: func(types.QueueItemHandle) (types.QueueItemAccessor, error) { - return nil, registryErr - }, - }, nil - } - }, - expectedErr: registryErr, - }, } for _, tc := range testCases { @@ -1087,18 +921,19 @@ func TestShardProcessor(t *testing.T) { h := newTestHarness(t, testCleanupTick) tc.setupMocks(h) item := h.newTestItem("req-dispatch-fail", testFlow, testTTL) - err := h.processor.dispatchItem(item, h.logger) + err := h.processor.dispatchItem(item) require.Error(t, err, "dispatchItem should return an error") assert.ErrorIs(t, err, tc.expectedErr, "The underlying registry error should be preserved") }) } }) - t.Run("should evict item that expires at moment of dispatch", func(t *testing.T) { + t.Run("should not dispatch already finalized item", func(t *testing.T) { t.Parallel() // --- ARRANGE --- h := newTestHarness(t, testCleanupTick) - item := h.newTestItem("req-expired-dispatch", testFlow, testShortTTL) + item := h.newTestItem("req-already-finalized", testFlow, testTTL) + item.FinalizeWithOutcome(types.QueueOutcomeRejectedOther, errors.New("already done")) h.ManagedQueueFunc = func(types.FlowKey) (contracts.ManagedQueue, error) { return &mocks.MockManagedQueue{ @@ -1109,69 +944,61 @@ func TestShardProcessor(t *testing.T) { } // --- ACT --- - 
h.mockClock.Advance(testShortTTL * 2) // Make the item expire. - err := h.processor.dispatchItem(item, h.logger) + err := h.processor.dispatchItem(item) // --- ASSERT --- - // First, check the error returned by `dispatchItem`. - require.Error(t, err, "dispatchItem should return an error for an expired item") - assert.ErrorIs(t, err, types.ErrTTLExpired, "The error should be of type ErrTTLExpired") - - // Second, check the final state of the item itself. - outcome, finalErr := item.FinalState() - assert.Equal(t, types.QueueOutcomeEvictedTTL, outcome, "The item's final outcome should be EvictedTTL") - require.Error(t, finalErr, "The item's final state should contain an error") - assert.ErrorIs(t, finalErr, types.ErrTTLExpired, "The item's final error should be of type ErrTTLExpired") + require.NoError(t, err, "dispatchItem should return no error for an already finalized item") + + // Check the final state of the item itself - it should not have changed. + finalState := item.FinalState() + require.NotNil(t, finalState, "Item must be finalized") + assert.Equal(t, types.QueueOutcomeRejectedOther, finalState.Outcome, + "The item's final outcome should be RejectedOther") + assert.ErrorContains(t, finalState.Err, "already done", + "The error should be the one from the first Finalize call") }) + }) - t.Run("should panic if queue returns item of wrong type", func(t *testing.T) { + t.Run("cleanup and utility methods", func(t *testing.T) { + t.Parallel() + + t.Run("should sweep externally finalized items", func(t *testing.T) { t.Parallel() // --- ARRANGE --- h := newTestHarness(t, testCleanupTick) - badItem := &typesmocks.MockQueueItemAccessor{ - OriginalRequestV: typesmocks.NewMockFlowControlRequest(0, "bad-item", testFlow, context.Background()), - } + item := h.newTestItem("req-external-finalized", testFlow, testTTL) + q := h.addQueue(testFlow) + require.NoError(t, q.Add(item), "Failed to add item to queue") - h.ManagedQueueFunc = func(types.FlowKey) (contracts.ManagedQueue, error) { - return &mocks.MockManagedQueue{ - RemoveFunc: func(types.QueueItemHandle) (types.QueueItemAccessor, error) { - return badItem, nil - }, - }, nil - } + // Externally finalize the item + item.Finalize(context.Canceled) + require.NotNil(t, item.FinalState(), "Item should be finalized") - itemToDispatch := h.newTestItem("req-dispatch-panic", testFlow, testTTL) - expectedPanicMsg := fmt.Sprintf("%s: internal error: item %q of type %T is not a *flowItem", - errIntraFlow, "bad-item", badItem) + // --- ACT --- + h.processor.sweepFinalizedItems() - // --- ACT & ASSERT --- - assert.PanicsWithError(t, expectedPanicMsg, func() { - _ = h.processor.dispatchItem(itemToDispatch, h.logger) - }, "A type mismatch from a queue should cause a panic") + // --- ASSERT --- + assert.Equal(t, 0, q.Len(), "Queue should be empty after sweep") + finalState := item.FinalState() + assert.Equal(t, types.QueueOutcomeEvictedContextCancelled, finalState.Outcome, + "Outcome should be EvictedContextCancelled") + assert.ErrorIs(t, finalState.Err, types.ErrContextCancelled, "Error should be ErrContextCancelled") }) - }) - - t.Run("cleanup and utility methods", func(t *testing.T) { - t.Parallel() - t.Run("should remove and finalize expired items", func(t *testing.T) { + t.Run("should not sweep items not finalized", func(t *testing.T) { t.Parallel() // --- ARRANGE --- h := newTestHarness(t, testCleanupTick) - // Create an item that is already expired relative to the cleanup time. 
- item := h.newTestItem("req-expired", testFlow, 1*time.Millisecond) + item := h.newTestItem("req-not-finalized", testFlow, testTTL) q := h.addQueue(testFlow) - require.NoError(t, q.Add(item)) - cleanupTime := h.mockClock.Now().Add(10 * time.Millisecond) + require.NoError(t, q.Add(item), "Failed to add item to queue") // --- ACT --- - h.processor.cleanupExpired(cleanupTime) + h.processor.sweepFinalizedItems() // --- ASSERT --- - outcome, err := item.FinalState() - assert.Equal(t, types.QueueOutcomeEvictedTTL, outcome, "Item outcome should be EvictedTTL") - require.Error(t, err, "Item should have an error") - assert.ErrorIs(t, err, types.ErrTTLExpired, "Item error should be ErrTTLExpired") + assert.Equal(t, 1, q.Len(), "Queue should still contain the item") + assert.Nil(t, item.FinalState(), "Item should not be finalized") }) t.Run("should evict all items on shutdown", func(t *testing.T) { @@ -1186,18 +1013,19 @@ func TestShardProcessor(t *testing.T) { h.processor.evictAll() // --- ASSERT --- - outcome, err := item.FinalState() - assert.Equal(t, types.QueueOutcomeEvictedOther, outcome, "Item outcome should be EvictedOther") - require.Error(t, err, "Item should have an error") - assert.ErrorIs(t, err, types.ErrFlowControllerShutdown, "Item error should be ErrFlowControllerShutdown") + assert.Equal(t, types.QueueOutcomeEvictedOther, item.FinalState().Outcome, + "Item outcome should be EvictedOther") + require.Error(t, item.FinalState().Err, "Item should have an error") + assert.ErrorIs(t, item.FinalState().Err, types.ErrFlowControllerNotRunning, + "Item error should be ErrFlowControllerNotRunning") }) t.Run("should handle registry errors gracefully during concurrent processing", func(t *testing.T) { t.Parallel() // --- ARRANGE --- h := newTestHarness(t, testCleanupTick) - h.AllOrderedPriorityLevelsFunc = func() []uint { return []uint{testFlow.Priority} } - h.PriorityBandAccessorFunc = func(p uint) (framework.PriorityBandAccessor, error) { + h.AllOrderedPriorityLevelsFunc = func() []int { return []int{testFlow.Priority} } + h.PriorityBandAccessorFunc = func(p int) (framework.PriorityBandAccessor, error) { return nil, errors.New("registry error") } @@ -1208,25 +1036,6 @@ func TestShardProcessor(t *testing.T) { }, "processAllQueuesConcurrently should not panic on registry errors") }) - t.Run("should handle items of an unexpected type gracefully during finalization", func(t *testing.T) { - t.Parallel() - // --- ARRANGE --- - h := newTestHarness(t, testCleanupTick) - item := &typesmocks.MockQueueItemAccessor{ - OriginalRequestV: typesmocks.NewMockFlowControlRequest(0, "bad-item", testFlow, context.Background()), - } - items := []types.QueueItemAccessor{item} - - // --- ACT & ASSERT --- - // The test passes if this call completes without panicking. - assert.NotPanics(t, func() { - getOutcome := func(types.QueueItemAccessor) (types.QueueOutcome, error) { - return types.QueueOutcomeEvictedOther, nil - } - h.processor.finalizeItems(items, h.logger, getOutcome) - }, "finalizeItems should not panic on unexpected item types") - }) - t.Run("should process all queues with a worker pool", func(t *testing.T) { t.Parallel() // --- ARRANGE --- @@ -1257,122 +1066,120 @@ func TestShardProcessor(t *testing.T) { }) }) }) -} -func TestCheckItemExpiry(t *testing.T) { - t.Parallel() + t.Run("Public API", func(t *testing.T) { + t.Parallel() - // --- ARRANGE --- - now := time.Now() - ctxCancelled, cancel := context.WithCancel(context.Background()) - cancel() // Cancel the context immediately. 
- - testCases := []struct { - name string - item types.QueueItemAccessor - now time.Time - expectExpired bool - expectOutcome types.QueueOutcome - expectErr error - }{ - { - name: "should not be expired if TTL is not reached and context is active", - item: NewItem( - typesmocks.NewMockFlowControlRequest(100, "req-not-expired", testFlow, context.Background()), - testTTL, - now), - now: now.Add(30 * time.Second), - expectExpired: false, - expectOutcome: types.QueueOutcomeNotYetFinalized, - expectErr: nil, - }, - { - name: "should not be expired if TTL is disabled (0)", - item: NewItem( - typesmocks.NewMockFlowControlRequest(100, "req-not-expired-no-ttl", testFlow, context.Background()), - 0, - now), - now: now.Add(30 * time.Second), - expectExpired: false, - expectOutcome: types.QueueOutcomeNotYetFinalized, - expectErr: nil, - }, - { - name: "should be expired if TTL is exceeded", - item: NewItem( - typesmocks.NewMockFlowControlRequest(100, "req-ttl-expired", testFlow, context.Background()), - time.Second, - now), - now: now.Add(2 * time.Second), - expectExpired: true, - expectOutcome: types.QueueOutcomeEvictedTTL, - expectErr: types.ErrTTLExpired, - }, - { - name: "should be expired if context is cancelled", - item: NewItem( - typesmocks.NewMockFlowControlRequest(100, "req-ctx-cancelled", testFlow, ctxCancelled), - testTTL, - now), - now: now, - expectExpired: true, - expectOutcome: types.QueueOutcomeEvictedContextCancelled, - expectErr: types.ErrContextCancelled, - }, - { - name: "should be expired if already finalized", - item: func() types.QueueItemAccessor { - i := NewItem( - typesmocks.NewMockFlowControlRequest(100, "req-finalized", testFlow, context.Background()), - testTTL, - now) - i.finalize(types.QueueOutcomeDispatched, nil) - return i - }(), - now: now, - expectExpired: true, - expectOutcome: types.QueueOutcomeDispatched, - expectErr: nil, - }, - } + t.Run("Submit", func(t *testing.T) { + t.Parallel() + + t.Run("should return ErrProcessorBusy when channel is full", func(t *testing.T) { + t.Parallel() + h := newTestHarness(t, testCleanupTick) + h.processor.enqueueChan = make(chan *FlowItem, 1) + h.processor.enqueueChan <- h.newTestItem("item-filler", testFlow, testTTL) // Fill the channel to capacity. + + // The next submit should be non-blocking and fail immediately. 
+ err := h.processor.Submit(h.newTestItem("item-to-reject", testFlow, testTTL)) + require.Error(t, err, "Submit must return an error when the channel is full") + assert.ErrorIs(t, err, ErrProcessorBusy, "The returned error must be ErrProcessorBusy") + }) + + t.Run("should return ErrFlowControllerNotRunning if lifecycleCtx is cancelled", func(t *testing.T) { + t.Parallel() + h := newTestHarness(t, testCleanupTick) + h.Start() + h.Go() // Ensure the Run loop has started + h.cancel() // Cancel the lifecycle context + h.Stop() // Wait for the processor to fully stop + + item := h.newTestItem("item-ctx-cancel", testFlow, testTTL) + err := h.processor.Submit(item) + require.ErrorIs(t, err, types.ErrFlowControllerNotRunning, + "Submit must return ErrFlowControllerNotRunning when lifecycleCtx is cancelled") + assert.Nil(t, item.FinalState(), "Item should not be finalized by Submit") + + err = h.processor.SubmitOrBlock(context.Background(), item) + require.ErrorIs(t, err, types.ErrFlowControllerNotRunning, + "SubmitOrBlock must return ErrFlowControllerNotRunning when lifecycleCtx is cancelled") + assert.Nil(t, item.FinalState(), "Item should not be finalized by SubmitOrBlock") + }) + }) - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { + t.Run("SubmitOrBlock", func(t *testing.T) { t.Parallel() - // --- ACT --- - isExpired, outcome, err := checkItemExpiry(tc.item, tc.now) - // --- ASSERT --- - assert.Equal(t, tc.expectExpired, isExpired, "Expired status should match expected value") - assert.Equal(t, tc.expectOutcome, outcome, "Outcome should match expected value") - - if tc.expectErr != nil { - require.Error(t, err, "An error was expected") - // Use ErrorIs for sentinel errors, ErrorContains for general messages. - if errors.Is(tc.expectErr, types.ErrTTLExpired) || errors.Is(tc.expectErr, types.ErrContextCancelled) { - assert.ErrorIs(t, err, tc.expectErr, "The specific error type should be correct") - } else { - assert.ErrorContains(t, err, tc.expectErr.Error(), "The error message should contain the expected text") + t.Run("should block when channel is full and succeed when space becomes available", func(t *testing.T) { + t.Parallel() + h := newTestHarness(t, testCleanupTick) + h.processor.enqueueChan = make(chan *FlowItem, 1) + h.processor.enqueueChan <- h.newTestItem("item-filler", testFlow, testTTL) // Fill the channel to capacity. + + itemToSubmit := h.newTestItem("item-to-block", testFlow, testTTL) + submitErr := make(chan error, 1) + + // Run `SubmitOrBlock` in a separate goroutine, as it will block. + go func() { + submitErr <- h.processor.SubmitOrBlock(context.Background(), itemToSubmit) + }() + + // Prove that the call is blocking by ensuring it hasn't returned an error yet. + time.Sleep(20 * time.Millisecond) + require.Len(t, submitErr, 0, "SubmitOrBlock should be blocking and not have returned yet") + <-h.processor.enqueueChan // Make space in the channel. This should unblock the goroutine. 
+ + select { + case err := <-submitErr: + require.NoError(t, err, "SubmitOrBlock should succeed and return no error after being unblocked") + case <-time.After(testWaitTimeout): + t.Fatal("SubmitOrBlock did not return after space was made in the channel") } - } else { - assert.NoError(t, err, "No error was expected") - } - }) - } + }) - t.Run("should panic on item of an unexpected type", func(t *testing.T) { - t.Parallel() - // --- ARRANGE --- - badItem := &typesmocks.MockQueueItemAccessor{ - OriginalRequestV: typesmocks.NewMockFlowControlRequest(0, "item-bad-type", testFlow, context.Background()), - } + t.Run("should unblock and return context error on cancellation", func(t *testing.T) { + t.Parallel() + h := newTestHarness(t, testCleanupTick) + h.processor.enqueueChan = make(chan *FlowItem) // Use an unbuffered channel to guarantee the first send blocks. + itemToSubmit := h.newTestItem("item-to-cancel", testFlow, testTTL) + submitErr := make(chan error, 1) + ctx, cancel := context.WithCancel(context.Background()) + + // Run `SubmitOrBlock` in a separate goroutine, as it will block. + go func() { + submitErr <- h.processor.SubmitOrBlock(ctx, itemToSubmit) + }() + + // Prove that the call is blocking. + time.Sleep(20 * time.Millisecond) + require.Len(t, submitErr, 0, "SubmitOrBlock should be blocking and not have returned yet") + cancel() // Cancel the context. This should unblock the goroutine. + + select { + case err := <-submitErr: + require.Error(t, err, "SubmitOrBlock should return an error after context cancellation") + assert.ErrorIs(t, err, context.Canceled, "The returned error must be context.Canceled") + case <-time.After(testWaitTimeout): + t.Fatal("SubmitOrBlock did not return after context was cancelled") + } + }) + + t.Run("should reject immediately if shutting down", func(t *testing.T) { + t.Parallel() + h := newTestHarness(t, testCleanupTick) + item := h.newTestItem("req-shutdown-reject", testFlow, testTTL) + h.addQueue(testFlow) + + h.Start() + h.Go() + h.Stop() // Stop the processor, then immediately try to enqueue. + err := h.processor.SubmitOrBlock(context.Background(), item) - expectedPanicMsg := fmt.Sprintf("internal error: item %q of type %T is not a *flowItem", - badItem.OriginalRequestV.ID(), badItem) + require.Error(t, err, "SubmitOrBlock should return an error when shutting down") + assert.ErrorIs(t, err, types.ErrFlowControllerNotRunning, "The error should be ErrFlowControllerNotRunning") - // --- ACT & ASSERT --- - assert.PanicsWithError(t, expectedPanicMsg, func() { - _, _, _ = checkItemExpiry(badItem, time.Now()) - }, "A type mismatch from a queue should cause a panic") + // Item should not be finalized by the processor + assert.Nil(t, item.FinalState(), "Item should not be finalized by the processor") + }) + }) }) } diff --git a/pkg/epp/flowcontrol/framework/mocks/mocks.go b/pkg/epp/flowcontrol/framework/mocks/mocks.go index b8715b779..ff8441fde 100644 --- a/pkg/epp/flowcontrol/framework/mocks/mocks.go +++ b/pkg/epp/flowcontrol/framework/mocks/mocks.go @@ -67,14 +67,14 @@ var _ framework.FlowQueueAccessor = &MockFlowQueueAccessor{} // Simple accessors are configured with public value fields (e.g., `PriorityV`). // Complex methods with logic are configured with function fields (e.g., `IterateQueuesFunc`). 
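
(Editor's note, not part of the patch.) The `Submit`/`SubmitOrBlock` tests above encode a two-tier admission contract: `Submit` is non-blocking and fails fast with `ErrProcessorBusy` when the enqueue channel is full, `SubmitOrBlock` waits for capacity and honors context cancellation, and both reject with `types.ErrFlowControllerNotRunning` once the processor's lifecycle context is cancelled. Below is a minimal caller-side sketch of that contract; the `submitter` interface and the sentinel error variables are local stand-ins, not the actual flowcontrol types.

```go
package example

import (
	"context"
	"errors"
	"fmt"
)

// Local stand-ins for the sentinels exercised by the tests above
// (internal.ErrProcessorBusy and types.ErrFlowControllerNotRunning).
var (
	errProcessorBusy = errors.New("processor busy")
	errNotRunning    = errors.New("flow controller not running")
)

// submitter is a hypothetical stand-in for the ShardProcessor's public surface.
type submitter interface {
	Submit(item any) error
	SubmitOrBlock(ctx context.Context, item any) error
}

// enqueue tries the non-blocking fast path first and only falls back to the
// blocking path on transient backpressure.
func enqueue(ctx context.Context, p submitter, item any) error {
	err := p.Submit(item)
	switch {
	case err == nil:
		return nil // Accepted; ownership transfers to the processor.
	case errors.Is(err, errProcessorBusy):
		// Channel full: block until space frees up, the request context is
		// cancelled, or the processor shuts down.
		return p.SubmitOrBlock(ctx, item)
	case errors.Is(err, errNotRunning):
		return fmt.Errorf("rejecting upstream: %w", err) // Terminal; do not retry.
	default:
		return err
	}
}
```

On the shutdown path the item is never finalized by the processor, which the tests above pin down by asserting that `item.FinalState()` is nil.
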
type MockPriorityBandAccessor struct {
-	PriorityV     uint
+	PriorityV     int
 	PriorityNameV string
 	FlowKeysFunc  func() []types.FlowKey
 	QueueFunc     func(flowID string) framework.FlowQueueAccessor
 	IterateQueuesFunc func(callback func(queue framework.FlowQueueAccessor) (keepIterating bool))
 }
 
-func (m *MockPriorityBandAccessor) Priority() uint { return m.PriorityV }
+func (m *MockPriorityBandAccessor) Priority() int { return m.PriorityV }
 func (m *MockPriorityBandAccessor) PriorityName() string { return m.PriorityNameV }
 
 func (m *MockPriorityBandAccessor) FlowKeys() []types.FlowKey {
diff --git a/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs.go b/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs.go
index edcc02ac6..7addb9d13 100644
--- a/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs.go
+++ b/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs.go
@@ -26,6 +26,32 @@ import (
 )
 
 // FCFSPolicyName is the name of the FCFS policy implementation.
+//
+// This policy implements a First-Come, First-Served (FCFS) strategy by selecting the item with the earliest logical
+// enqueue time.
+//
+// # Behavior and Queue Pairing
+//
+// The behavioral guarantees of this policy are critically dependent on the capabilities of the `framework.SafeQueue`
+// it is paired with. The system distinguishes between:
+//   - "Logical Enqueue Time": The timestamp when a request first arrives at the `controller.FlowController`.
+//   - "Physical Enqueue Time": The timestamp when a request is added to a specific shard's queue, which happens later.
+//
+// This policy's behavior changes accordingly:
+//   - Paired with a `CapabilityPriorityConfigurable` queue, it provides strict FCFS ordering based on logical enqueue
+//     time, aligning with this policy's vended `framework.ItemComparator`. This configuration ensures that requests
+//     are processed in the order they arrived at the controller, providing the most intuitive behavior.
+//   - Paired with a `CapabilityFIFO` queue, it provides approximate FCFS ordering based on physical arrival order at
+//     the `framework.SafeQueue`. This configuration offers higher performance at the cost of strict logical-time
+//     ordering, as the `controller.FlowController`'s "bounce-and-retry" mechanic for Draining shards means a bounced
+//     request may be processed after a request that logically arrived later.
+//
+// Given that true end-to-end ordering is non-deterministic in a distributed system, this policy defaults to pairing
+// with a `CapabilityFIFO` queue (like "ListQueue") to prioritize performance and high throughput. For users who
+// require the strictest possible logical-time ordering that this layer can provide, explicitly pairing this policy
+// with a `CapabilityPriorityConfigurable` queue is recommended.
 const FCFSPolicyName = "FCFS"
 
 func init() {
@@ -35,7 +61,9 @@ func init() {
 	})
 }
 
-// fcfs (First-Come, First-Served) implements the `framework.IntraFlowDispatchPolicy` interface.
+// fcfs is the internal implementation of the FCFS policy.
+// See the documentation for the exported `FCFSPolicyName` constant for detailed user-facing information about its
+// behavior.
 type fcfs struct {
 	comparator framework.ItemComparator
 }
@@ -70,9 +98,10 @@ func (p *fcfs) Comparator() framework.ItemComparator {
 	return p.comparator
 }
 
-// RequiredQueueCapabilities specifies that this policy needs a queue that supports FIFO operations.
+// RequiredQueueCapabilities returns an empty slice, indicating that this policy can operate with any queue.
+// See the `FCFSPolicyName` constant's documentation for details on the behavioral trade-offs.
 func (p *fcfs) RequiredQueueCapabilities() []framework.QueueCapability {
-	return []framework.QueueCapability{framework.CapabilityFIFO}
+	return []framework.QueueCapability{}
 }
 
 // --- enqueueTimeComparator ---
diff --git a/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs_test.go b/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs_test.go
index 45c144238..cc6bceecf 100644
--- a/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs_test.go
+++ b/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs_test.go
@@ -41,8 +41,7 @@ func TestFCFS_RequiredQueueCapabilities(t *testing.T) {
 	t.Parallel()
 	policy := newFCFS()
 	caps := policy.RequiredQueueCapabilities()
-	require.Len(t, caps, 1, "RequiredQueueCapabilities should return one capability")
-	assert.Equal(t, framework.CapabilityFIFO, caps[0], "Required capability should be FIFO")
+	require.Empty(t, caps, "No required capabilities should be returned")
 }
 
 func TestFCFS_SelectItem(t *testing.T) {
diff --git a/pkg/epp/flowcontrol/framework/plugins/queue/listqueue/listqueue.go b/pkg/epp/flowcontrol/framework/plugins/queue/listqueue/listqueue.go
index 8e123b631..792e3a46d 100644
--- a/pkg/epp/flowcontrol/framework/plugins/queue/listqueue/listqueue.go
+++ b/pkg/epp/flowcontrol/framework/plugins/queue/listqueue/listqueue.go
@@ -14,8 +14,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-// Package listqueue provides a simple, concurrent-safe queue implementation using a standard library
-// `container/list.List` as the underlying data structure for FIFO (First-In, First-Out) behavior.
+// Package listqueue provides a high-performance, concurrent-safe FIFO (First-In, First-Out) implementation of
+// the `framework.SafeQueue`, based on the standard library's `container/list`.
 package listqueue
 
 import (
@@ -28,7 +28,28 @@ import (
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types"
 )
 
-// ListQueueName is the name of the list queue implementation.
+// ListQueueName is the name of the list-based queue implementation.
+//
+// This queue provides a high-performance, low-overhead implementation based on a standard `container/list`.
+// It advertises the `CapabilityFIFO` capability.
+//
+// # Behavioral Guarantees
+//
+// The core guarantee of this queue is strict physical First-In, First-Out (FIFO) ordering. It processes items in the
+// exact order they are added to the queue on a specific shard.
+//
+// # Performance and Trade-offs
+//
+// Because the physical insertion order may not match a request's logical arrival time (due to the
+// `controller.FlowController`'s internal "bounce-and-retry" mechanic), this queue provides an approximate FCFS
+// behavior from a system-wide perspective.
+//
+// Given that true end-to-end ordering is non-deterministic in a distributed system, this high-performance queue is the
+// recommended default for most FCFS-like policies. It prioritizes throughput and efficiency, which aligns with the
+// primary goal of the Flow Control system.
+//
+// For workloads that require the strictest possible logical-time ordering that this layer can provide, explicitly
+// using a queue that supports `CapabilityPriorityConfigurable` is the appropriate choice.
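
(Editor's note.) The two doc comments above turn on the distinction between logical and physical ordering. Here is a self-contained sketch of the idea behind the policy's enqueue-time comparator; the `item` type and names are illustrative, not the actual `fcfs` types.

```go
package main

import (
	"fmt"
	"sort"
	"time"
)

// item is a hypothetical stand-in for a queued request; the real policy compares
// queue items through its vended framework.ItemComparator.
type item struct {
	id          string
	enqueueTime time.Time // logical enqueue time: arrival at the FlowController
}

// byEnqueueTime mirrors the comparator's rule: the earlier logical enqueue time
// wins, regardless of physical insertion order.
func byEnqueueTime(a, b item) bool { return a.enqueueTime.Before(b.enqueueTime) }

func main() {
	now := time.Now()
	// Physical order as a bounce might produce it: "b" arrived logically first,
	// but was re-inserted after "a" when its original shard drained.
	q := []item{
		{id: "a", enqueueTime: now.Add(10 * time.Millisecond)},
		{id: "b", enqueueTime: now},
	}
	// A CapabilityPriorityConfigurable queue keeps items ordered by the comparator,
	// so "b" dispatches first despite its later physical insert.
	sort.Slice(q, func(i, j int) bool { return byEnqueueTime(q[i], q[j]) })
	fmt.Println(q[0].id, "dispatches first") // prints: b dispatches first
}
```

A plain `CapabilityFIFO` queue (such as "ListQueue") would dispatch "a" first, since it preserves only physical insertion order; that is precisely the approximate-FCFS trade-off both doc comments describe.
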
const ListQueueName = "ListQueue" func init() { @@ -39,8 +60,8 @@ func init() { }) } -// listQueue implements the `framework.SafeQueue` interface using a standard `container/list.List` for FIFO behavior. -// This implementation is concurrent-safe. +// listQueue is the internal implementation of the ListQueue. +// See the documentation for the exported `ListQueueName` constant for detailed user-facing information. type listQueue struct { requests *list.List byteSize atomic.Uint64 diff --git a/pkg/epp/flowcontrol/framework/policies.go b/pkg/epp/flowcontrol/framework/policies.go index 5fc3e646d..eeea034eb 100644 --- a/pkg/epp/flowcontrol/framework/policies.go +++ b/pkg/epp/flowcontrol/framework/policies.go @@ -168,7 +168,7 @@ type FlowQueueAccessor interface { // Conformance: Implementations MUST ensure all methods are goroutine-safe for concurrent access. type PriorityBandAccessor interface { // Priority returns the numerical priority level of this band. - Priority() uint + Priority() int // PriorityName returns the human-readable name of this priority band. PriorityName() string diff --git a/pkg/epp/flowcontrol/registry/config.go b/pkg/epp/flowcontrol/registry/config.go index d404e78f7..af345a665 100644 --- a/pkg/epp/flowcontrol/registry/config.go +++ b/pkg/epp/flowcontrol/registry/config.go @@ -98,7 +98,7 @@ type Config struct { // the correct element within this specific configuration instance, preventing common "pointer-to-loop-variable" // errors, especially across deep copies or partitioning. // It is populated during validation and when the config is copied or partitioned. - priorityBandMap map[uint]*PriorityBandConfig + priorityBandMap map[int]*PriorityBandConfig // Factory functions used for plugin instantiation during configuration validation. // These enable dependency injection for unit testing the validation logic. @@ -112,9 +112,9 @@ type Config struct { // that operate at this priority level. type PriorityBandConfig struct { // Priority is the unique numerical priority level for this band. - // Convention: Lower numerical values indicate higher priority (e.g., 0 is highest). + // Convention: Highest numeric value corresponds to highest priority (centered on 0). // Required. - Priority uint + Priority int // PriorityName is a human-readable name for this priority band (e.g., "Critical", "Standard"). // It must be unique across all priority bands in the configuration. @@ -140,20 +140,6 @@ type PriorityBandConfig struct { MaxBytes uint64 } -// NewConfig performs validation and initialization, returning a guaranteed-valid `Config` object. -// This is the required constructor for creating a new configuration. It applies provided functional options (primarily -// for testing) and does not mutate the input `cfg`. -func NewConfig(cfg Config, opts ...configOption) (*Config, error) { - newCfg := cfg.deepCopy() - for _, opt := range opts { - opt(newCfg) - } - if err := newCfg.validateAndApplyDefaults(); err != nil { - return nil, err - } - return newCfg, nil -} - // ============================================================================= // Shard-Level Configuration // ============================================================================= @@ -170,13 +156,13 @@ type ShardConfig struct { // priorityBandMap provides O(1) lookups of `ShardPriorityBandConfig` by priority level. // It serves as a correctness mechanism, ensuring that accessors return a safe, stable pointer to the correct element // within this specific shard configuration instance. 
- priorityBandMap map[uint]*ShardPriorityBandConfig + priorityBandMap map[int]*ShardPriorityBandConfig } // ShardPriorityBandConfig holds the partitioned configuration for a single priority band within a single shard. type ShardPriorityBandConfig struct { // Priority is the unique numerical priority level for this band. - Priority uint + Priority int // PriorityName is a unique human-readable name for this priority band. PriorityName string // IntraFlowDispatchPolicy is the name of the policy for dispatch within a flow's queue. @@ -192,7 +178,7 @@ type ShardPriorityBandConfig struct { // getBandConfig finds and returns the shard-level configuration for a specific priority level. // Returns an error wrapping `contracts.ErrPriorityBandNotFound` if the priority is not configured. -func (sc *ShardConfig) getBandConfig(priority uint) (*ShardPriorityBandConfig, error) { +func (sc *ShardConfig) getBandConfig(priority int) (*ShardPriorityBandConfig, error) { if band, ok := sc.priorityBandMap[priority]; ok { return band, nil } @@ -205,52 +191,55 @@ func (sc *ShardConfig) getBandConfig(priority uint) (*ShardPriorityBandConfig, e // --- Validation and Defaulting --- -// validateAndApplyDefaults checks the global configuration for validity (including plugin compatibility) and mutates -// the receiver to populate any empty fields with system defaults. It also initializes internal lookup maps. -func (c *Config) validateAndApplyDefaults() error { +// ValidateAndApplyDefaults checks the global configuration for validity and then creates a new `Config` object, +// populating any empty fields with system defaults. +// It does not mutate the receiver. +func (c *Config) ValidateAndApplyDefaults() (*Config, error) { + cfg := c.deepCopy() + // Apply defaults to top-level fields. - if c.InitialShardCount <= 0 { - c.InitialShardCount = defaultInitialShardCount + if cfg.InitialShardCount <= 0 { + cfg.InitialShardCount = defaultInitialShardCount } - if c.FlowGCTimeout <= 0 { - c.FlowGCTimeout = defaultFlowGCTimeout + if cfg.FlowGCTimeout <= 0 { + cfg.FlowGCTimeout = defaultFlowGCTimeout } - if c.EventChannelBufferSize <= 0 { - c.EventChannelBufferSize = defaultEventChannelBufferSize + if cfg.EventChannelBufferSize <= 0 { + cfg.EventChannelBufferSize = defaultEventChannelBufferSize } // Ensure the DI factories are initialized for production use if `NewConfig` was called without options. - if c.interFlowDispatchPolicyFactory == nil { - c.interFlowDispatchPolicyFactory = inter.NewPolicyFromName + if cfg.interFlowDispatchPolicyFactory == nil { + cfg.interFlowDispatchPolicyFactory = inter.NewPolicyFromName } - if c.intraFlowDispatchPolicyFactory == nil { - c.intraFlowDispatchPolicyFactory = intra.NewPolicyFromName + if cfg.intraFlowDispatchPolicyFactory == nil { + cfg.intraFlowDispatchPolicyFactory = intra.NewPolicyFromName } - if c.queueFactory == nil { - c.queueFactory = queue.NewQueueFromName + if cfg.queueFactory == nil { + cfg.queueFactory = queue.NewQueueFromName } - if len(c.PriorityBands) == 0 { - return errors.New("config validation failed: at least one priority band must be defined") + if len(cfg.PriorityBands) == 0 { + return nil, errors.New("config validation failed: at least one priority band must be defined") } // Validate and default each priority band. 
- priorities := make(map[uint]struct{}) + priorities := make(map[int]struct{}) priorityNames := make(map[string]struct{}) - c.priorityBandMap = make(map[uint]*PriorityBandConfig, len(c.PriorityBands)) + cfg.priorityBandMap = make(map[int]*PriorityBandConfig, len(cfg.PriorityBands)) - for i := range c.PriorityBands { - band := &c.PriorityBands[i] + for i := range cfg.PriorityBands { + band := &cfg.PriorityBands[i] if _, exists := priorities[band.Priority]; exists { - return fmt.Errorf("config validation failed: duplicate priority level %d found", band.Priority) + return nil, fmt.Errorf("config validation failed: duplicate priority level %d found", band.Priority) } priorities[band.Priority] = struct{}{} if band.PriorityName == "" { - return fmt.Errorf("config validation failed: PriorityName is required for priority band %d", band.Priority) + return nil, fmt.Errorf("config validation failed: PriorityName is required for priority band %d", band.Priority) } if _, exists := priorityNames[band.PriorityName]; exists { - return fmt.Errorf("config validation failed: duplicate priority name %q found", band.PriorityName) + return nil, fmt.Errorf("config validation failed: duplicate priority name %q found", band.PriorityName) } priorityNames[band.PriorityName] = struct{}{} @@ -267,12 +256,12 @@ func (c *Config) validateAndApplyDefaults() error { band.MaxBytes = defaultPriorityBandMaxBytes } - if err := c.validateBandCompatibility(*band); err != nil { - return err + if err := cfg.validateBandCompatibility(*band); err != nil { + return nil, err } - c.priorityBandMap[band.Priority] = band + cfg.priorityBandMap[band.Priority] = band } - return nil + return cfg, nil } // validateBandCompatibility verifies that a band's configured queue type has the necessary capabilities. @@ -326,7 +315,7 @@ func (c *Config) partition(shardIndex, totalShards int) *ShardConfig { shardCfg := &ShardConfig{ MaxBytes: partitionUint64(c.MaxBytes, shardIndex, totalShards), PriorityBands: make([]ShardPriorityBandConfig, len(c.PriorityBands)), - priorityBandMap: make(map[uint]*ShardPriorityBandConfig, len(c.PriorityBands)), + priorityBandMap: make(map[int]*ShardPriorityBandConfig, len(c.PriorityBands)), } for i, template := range c.PriorityBands { @@ -423,6 +412,18 @@ func withQueueFactory(factory queueFactory) configOption { } } +// newConfig creates a new validated and defaulted `Config` object. +// It applies provided test-only functional options before validation and defaulting. +// It does not mutate the input `cfg`. +// test-only +func newConfig(cfg Config, opts ...configOption) (*Config, error) { + newCfg := cfg.deepCopy() + for _, opt := range opts { + opt(newCfg) + } + return newCfg.ValidateAndApplyDefaults() +} + // --- Internal Utilities --- // deepCopy creates a deep copy of the `Config` object. @@ -436,7 +437,6 @@ func (c *Config) deepCopy() *Config { FlowGCTimeout: c.FlowGCTimeout, EventChannelBufferSize: c.EventChannelBufferSize, PriorityBands: make([]PriorityBandConfig, len(c.PriorityBands)), - priorityBandMap: make(map[uint]*PriorityBandConfig, len(c.PriorityBands)), interFlowDispatchPolicyFactory: c.interFlowDispatchPolicyFactory, intraFlowDispatchPolicyFactory: c.intraFlowDispatchPolicyFactory, queueFactory: c.queueFactory, @@ -445,18 +445,21 @@ func (c *Config) deepCopy() *Config { // PriorityBandConfig contains only value types, so a slice copy is sufficient for a deep copy. 
copy(newCfg.PriorityBands, c.PriorityBands) - // Crucial: We must rebuild the map and take the address of the elements within the new slice (`newCfg.PriorityBands`) - // to ensure the map pointers are correct for the newly created `Config` instance. - for i := range newCfg.PriorityBands { - band := &newCfg.PriorityBands[i] - newCfg.priorityBandMap[band.Priority] = band + if c.priorityBandMap != nil { + newCfg.priorityBandMap = make(map[int]*PriorityBandConfig, len(c.PriorityBands)) + // Crucial: We must rebuild the map and take the address of the elements within the new slice (`newCfg.PriorityBands`) + // to ensure the map pointers are correct for the newly created `Config` instance. + for i := range newCfg.PriorityBands { + band := &newCfg.PriorityBands[i] + newCfg.priorityBandMap[band.Priority] = band + } } return newCfg } // getBandConfig finds and returns the global configuration template for a specific priority level. // Returns an error wrapping `contracts.ErrPriorityBandNotFound` if the priority is not configured. -func (c *Config) getBandConfig(priority uint) (*PriorityBandConfig, error) { +func (c *Config) getBandConfig(priority int) (*PriorityBandConfig, error) { if band, ok := c.priorityBandMap[priority]; ok { return band, nil } diff --git a/pkg/epp/flowcontrol/registry/config_test.go b/pkg/epp/flowcontrol/registry/config_test.go index 376282ffe..47814ae6e 100644 --- a/pkg/epp/flowcontrol/registry/config_test.go +++ b/pkg/epp/flowcontrol/registry/config_test.go @@ -35,7 +35,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/framework/plugins/queue/listqueue" ) -func TestConfig_NewConfig(t *testing.T) { +func TestConfig_ValidateAndApplyDefaults(t *testing.T) { t.Parallel() testCases := []struct { @@ -211,16 +211,26 @@ func TestConfig_NewConfig(t *testing.T) { name: "ShouldError_WhenQueueFactoryFails", input: Config{ PriorityBands: []PriorityBandConfig{{ - Priority: 1, - PriorityName: "High", - Queue: queue.RegisteredQueueName("failing-queue"), + Priority: 1, + PriorityName: "High", + Queue: queue.RegisteredQueueName("failing-queue"), + IntraFlowDispatchPolicy: intra.RegisteredPolicyName("policy-with-req"), }}, }, expectErr: true, - opts: []configOption{withQueueFactory( - func(_ queue.RegisteredQueueName, _ framework.ItemComparator) (framework.SafeQueue, error) { + opts: []configOption{ + withIntraFlowDispatchPolicyFactory( // Forces queue instance creation for validating capabilities. + func(name intra.RegisteredPolicyName) (framework.IntraFlowDispatchPolicy, error) { + return &mocks.MockIntraFlowDispatchPolicy{ + NameV: string(name), + RequiredQueueCapabilitiesV: []framework.QueueCapability{"required-capability"}, + }, nil + }, + ), + withQueueFactory(func(_ queue.RegisteredQueueName, _ framework.ItemComparator) (framework.SafeQueue, error) { return nil, errors.New("queue creation failed") - })}, + }), + }, }, { name: "ShouldError_WhenPolicyFactoryFails", @@ -276,20 +286,24 @@ func TestConfig_NewConfig(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { t.Parallel() - originalInputCopy := tc.input.deepCopy() - newCfg, err := NewConfig(tc.input, tc.opts...) + originalInput := tc.input.deepCopy() + validatedCfg, err := newConfig(tc.input, tc.opts...) 
+
			if tc.expectErr {
-				require.Error(t, err, "NewConfig should have returned an error")
+				require.Error(t, err, "expected an error but got nil")
				if tc.expectedErrIs != nil {
-					assert.ErrorIs(t, err, tc.expectedErrIs, "Error should wrap the expected error type")
+					assert.ErrorIs(t, err, tc.expectedErrIs, "error should wrap the expected error type")
				}
-				assert.Nil(t, newCfg, "On error, the returned config should be nil")
+				assert.Nil(t, validatedCfg, "validatedCfg should be nil on error")
			} else {
-				require.NoError(t, err, "NewConfig should not have returned an error")
-				require.NotNil(t, newCfg, "On success, the returned config should not be nil")
+				require.NoError(t, err, "expected no error but got: %v", err)
+				require.NotNil(t, validatedCfg, "validatedCfg should not be nil on success")
				if tc.assertion != nil {
-					tc.assertion(t, *originalInputCopy, newCfg)
+					tc.assertion(t, *originalInput, validatedCfg)
				}
+
+				// Ensure the original config is not mutated.
+				assert.Equal(t, *originalInput, tc.input, "input config should not be mutated")
			}
		})
	}
@@ -297,7 +311,7 @@ func TestConfig_NewConfig(t *testing.T) {
 func TestConfig_Partition(t *testing.T) {
	t.Parallel()

-	baseCfg, err := NewConfig(Config{
+	baseCfg, err := newConfig(Config{
		MaxBytes: 103, // Will not distribute evenly
		PriorityBands: []PriorityBandConfig{
			{Priority: 1, PriorityName: "High", MaxBytes: 55}, // Will not distribute evenly
@@ -381,7 +395,7 @@ func TestConfig_Partition(t *testing.T) {
 func TestConfig_GetBandConfig(t *testing.T) {
	t.Parallel()

-	cfg, err := NewConfig(Config{
+	cfg, err := newConfig(Config{
		PriorityBands: []PriorityBandConfig{
			{Priority: 10, PriorityName: "High"},
		},
@@ -417,7 +431,7 @@ func TestConfig_DeepCopy(t *testing.T) {
		},
	}
	// Create a fully initialized "original" config to be the source of the copy.
-	original, err := NewConfig(baseCfg)
+	original, err := newConfig(baseCfg)
	require.NoError(t, err, "Setup for deep copy should not fail")

	t.Run("ShouldReturnNil_ForNilReceiver", func(t *testing.T) {
@@ -471,7 +485,7 @@ func TestConfig_DeepCopy(t *testing.T) {
 func TestShardConfig_GetBandConfig(t *testing.T) {
	t.Parallel()

-	baseCfg, err := NewConfig(Config{
+	baseCfg, err := newConfig(Config{
		PriorityBands: []PriorityBandConfig{
			{Priority: 10, PriorityName: "High"},
			{Priority: 20, PriorityName: "Low"},
diff --git a/pkg/epp/flowcontrol/registry/connection.go b/pkg/epp/flowcontrol/registry/connection.go
index 995f23c13..cb9831655 100644
--- a/pkg/epp/flowcontrol/registry/connection.go
+++ b/pkg/epp/flowcontrol/registry/connection.go
@@ -31,13 +31,13 @@ type connection struct {

 var _ contracts.ActiveFlowConnection = &connection{}

-// Shards returns a stable snapshot of accessors for all internal state shards.
-func (c *connection) Shards() []contracts.RegistryShard {
+// ActiveShards returns a stable snapshot of accessors for all shards currently in the Active state.
+func (c *connection) ActiveShards() []contracts.RegistryShard {
	c.registry.mu.RLock()
	defer c.registry.mu.RUnlock()

	// Return a copy to ensure the caller cannot modify the registry's internal slice.
- shardsCopy := make([]contracts.RegistryShard, len(c.registry.allShards)) - for i, s := range c.registry.allShards { + shardsCopy := make([]contracts.RegistryShard, len(c.registry.activeShards)) + for i, s := range c.registry.activeShards { shardsCopy[i] = s } return shardsCopy diff --git a/pkg/epp/flowcontrol/registry/managedqueue_test.go b/pkg/epp/flowcontrol/registry/managedqueue_test.go index f5e3d7fa7..64e5ab80c 100644 --- a/pkg/epp/flowcontrol/registry/managedqueue_test.go +++ b/pkg/epp/flowcontrol/registry/managedqueue_test.go @@ -98,7 +98,7 @@ type mockStatsPropagator struct { byteSizeDelta atomic.Int64 } -func (p *mockStatsPropagator) propagate(_ uint, lenDelta, byteSizeDelta int64) { +func (p *mockStatsPropagator) propagate(_ int, lenDelta, byteSizeDelta int64) { p.lenDelta.Add(lenDelta) p.byteSizeDelta.Add(byteSizeDelta) } diff --git a/pkg/epp/flowcontrol/registry/registry.go b/pkg/epp/flowcontrol/registry/registry.go index 95d604ede..3a73ef706 100644 --- a/pkg/epp/flowcontrol/registry/registry.go +++ b/pkg/epp/flowcontrol/registry/registry.go @@ -37,7 +37,7 @@ import ( // propagateStatsDeltaFunc defines the callback function used to propagate statistics changes (deltas) up the hierarchy // (Queue -> Shard -> Registry). // Implementations MUST be non-blocking (relying on atomics). -type propagateStatsDeltaFunc func(priority uint, lenDelta, byteSizeDelta int64) +type propagateStatsDeltaFunc func(priority int, lenDelta, byteSizeDelta int64) // bandStats holds the aggregated atomic statistics for a single priority band across all shards. type bandStats struct { @@ -120,7 +120,7 @@ type FlowRegistry struct { // Globally aggregated statistics, updated atomically via lock-free propagation. totalByteSize atomic.Int64 totalLen atomic.Int64 - perPriorityBandStats map[uint]*bandStats // Keyed by priority. + perPriorityBandStats map[int]*bandStats // Keyed by priority. // --- Administrative state (protected by `mu`) --- @@ -148,17 +148,13 @@ func withClock(clk clock.WithTickerAndDelayedExecution) RegistryOption { // NewFlowRegistry creates and initializes a new `FlowRegistry` instance. func NewFlowRegistry(config Config, logger logr.Logger, opts ...RegistryOption) (*FlowRegistry, error) { - validatedConfig, err := NewConfig(config) - if err != nil { - return nil, fmt.Errorf("master configuration is invalid: %w", err) - } - + cfg := config.deepCopy() fr := &FlowRegistry{ - config: validatedConfig, + config: cfg, logger: logger.WithName("flow-registry"), activeShards: []*registryShard{}, drainingShards: make(map[string]*registryShard), - perPriorityBandStats: make(map[uint]*bandStats, len(validatedConfig.PriorityBands)), + perPriorityBandStats: make(map[int]*bandStats, len(cfg.PriorityBands)), } for _, opt := range opts { @@ -173,7 +169,7 @@ func NewFlowRegistry(config Config, logger logr.Logger, opts ...RegistryOption) fr.perPriorityBandStats[band.Priority] = &bandStats{} } - if err := fr.updateShardCount(validatedConfig.InitialShardCount); err != nil { + if err := fr.updateShardCount(cfg.InitialShardCount); err != nil { return nil, fmt.Errorf("failed to initialize shards: %w", err) } fr.logger.V(logging.DEFAULT).Info("FlowRegistry initialized successfully") @@ -198,7 +194,7 @@ func (fr *FlowRegistry) Run(ctx context.Context) { } } -// --- `contracts.FlowRegistryClient` Implementation --- +// --- `contracts.FlowRegistryDataPlane` Implementation --- // Connect establishes a session for a given flow, acquiring a lifecycle lease. // This is the primary entry point for the data path. 
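With this change, `NewFlowRegistry` no longer validates its input; validation moves to `Config.ValidateAndApplyDefaults`, and the constructor only deep-copies what it is given. A minimal sketch of the resulting call sequence (the `buildRegistry` wrapper is hypothetical; the error wrapping is illustrative):

```go
// Sketch only: validate first, then construct, as the updated test harness does.
func buildRegistry(ctx context.Context, config Config, logger logr.Logger) (*FlowRegistry, error) {
	validated, err := config.ValidateAndApplyDefaults() // returns a defaulted copy; config is not mutated
	if err != nil {
		return nil, fmt.Errorf("flow registry config is invalid: %w", err)
	}
	fr, err := NewFlowRegistry(*validated, logger) // the constructor now trusts its input
	if err != nil {
		return nil, err
	}
	go fr.Run(ctx) // start the background GC loop
	return fr, nil
}
```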
@@ -275,7 +271,7 @@ func (fr *FlowRegistry) prepareNewFlow(key types.FlowKey) (*flowState, error) { return &flowState{key: key}, nil } -// --- `contracts.FlowRegistryAdmin` Implementation --- +// --- `contracts.FlowRegistryObserver` Implementation --- // Stats returns globally aggregated statistics for the entire `FlowRegistry`. // @@ -289,7 +285,7 @@ func (fr *FlowRegistry) Stats() contracts.AggregateStats { TotalCapacityBytes: fr.config.MaxBytes, TotalByteSize: uint64(fr.totalByteSize.Load()), TotalLen: uint64(fr.totalLen.Load()), - PerPriorityBandStats: make(map[uint]contracts.PriorityBandStats, len(fr.config.PriorityBands)), + PerPriorityBandStats: make(map[int]contracts.PriorityBandStats, len(fr.config.PriorityBands)), } for p, s := range fr.perPriorityBandStats { @@ -592,7 +588,7 @@ func (fr *FlowRegistry) updateAllShardsCacheLocked() { } // propagateStatsDelta is the top-level, lock-free aggregator for all statistics. -func (fr *FlowRegistry) propagateStatsDelta(priority uint, lenDelta, byteSizeDelta int64) { +func (fr *FlowRegistry) propagateStatsDelta(priority int, lenDelta, byteSizeDelta int64) { stats, ok := fr.perPriorityBandStats[priority] if !ok { panic(fmt.Sprintf("invariant violation: priority band (%d) stats missing during propagation", priority)) diff --git a/pkg/epp/flowcontrol/registry/registry_test.go b/pkg/epp/flowcontrol/registry/registry_test.go index 5ffa600d0..b5bc322cb 100644 --- a/pkg/epp/flowcontrol/registry/registry_test.go +++ b/pkg/epp/flowcontrol/registry/registry_test.go @@ -73,9 +73,12 @@ func newRegistryTestHarness(t *testing.T, opts harnessOptions) *registryTestHarn config.InitialShardCount = opts.initialShardCount } + validatedCfg, err := config.ValidateAndApplyDefaults() + require.NoError(t, err, "Test setup: validating config should not fail") + fakeClock := testclock.NewFakeClock(time.Now()) registryOpts := []RegistryOption{withClock(fakeClock)} - fr, err := NewFlowRegistry(config, logr.Discard(), registryOpts...) + fr, err := NewFlowRegistry(*validatedCfg, logr.Discard(), registryOpts...) require.NoError(t, err, "Test setup: NewFlowRegistry should not fail") // Start the GC loop in the background. 
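The `uint`-to-`int` priority change threads through the whole stats pipeline (queue → shard → registry). For orientation, a minimal sketch of the lock-free propagation pattern the comments describe, using illustrative types rather than the repository's own:

```go
import (
	"fmt"
	"sync/atomic"
)

type exampleBandStats struct {
	len, byteSize atomic.Int64
}

type exampleShard struct {
	priorityBands map[int]*exampleBandStats // structure is immutable after init; lock-free reads are safe
	totalLen      atomic.Int64
	totalByteSize atomic.Int64
	onStatsDelta  func(priority int, lenDelta, byteSizeDelta int64)
}

// Each level applies the delta with atomics and forwards it upward, so the hot
// path never takes a lock.
func (s *exampleShard) propagateStatsDelta(priority int, lenDelta, byteSizeDelta int64) {
	band, ok := s.priorityBands[priority]
	if !ok {
		panic(fmt.Sprintf("invariant violation: unknown priority band %d", priority))
	}
	band.len.Add(lenDelta)
	band.byteSize.Add(byteSizeDelta)
	s.totalLen.Add(lenDelta)
	s.totalByteSize.Add(byteSizeDelta)
	s.onStatsDelta(priority, lenDelta, byteSizeDelta) // registry-level aggregation
}
```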
@@ -132,69 +135,9 @@ func (h *registryTestHarness) openConnectionOnFlow(key types.FlowKey) { func TestFlowRegistry_New(t *testing.T) { t.Parallel() - t.Run("ShouldApplyDefaults_WhenInitialized", func(t *testing.T) { - t.Parallel() - config := Config{PriorityBands: []PriorityBandConfig{{Priority: highPriority, PriorityName: "DefaultedBand"}}} - fr, err := NewFlowRegistry(config, logr.Discard()) - require.NoError(t, err, "Creating a valid registry with defaults should not fail") - assert.Equal(t, defaultInitialShardCount, fr.config.InitialShardCount, "InitialShardCount should be defaulted") - assert.Equal(t, defaultFlowGCTimeout, fr.config.FlowGCTimeout, "FlowGCTimeout should be defaulted") - assert.Equal(t, defaultEventChannelBufferSize, fr.config.EventChannelBufferSize, - "EventChannelBufferSize should be defaulted") - assert.Len(t, fr.allShards, defaultInitialShardCount, - "Registry should be initialized with the default number of shards") - bandConf, err := fr.config.getBandConfig(highPriority) - require.NoError(t, err, "Getting the defaulted band config should not fail") - assert.Equal(t, defaultPriorityBandMaxBytes, bandConf.MaxBytes, "Priority band MaxBytes should be defaulted") - }) - - t.Run("ShouldFail_OnInvalidConfiguration", func(t *testing.T) { - t.Parallel() - testCases := []struct { - name string - config Config - expectErrSubStr string - }{ - { - name: "WhenNoPriorityBandsAreDefined", - config: Config{}, - expectErrSubStr: "at least one priority band must be defined", - }, - { - name: "WhenPriorityLevelsAreDuplicated", - config: Config{ - PriorityBands: []PriorityBandConfig{ - {Priority: highPriority, PriorityName: "A"}, - {Priority: highPriority, PriorityName: "B"}, - }, - }, - expectErrSubStr: fmt.Sprintf("duplicate priority level %d", highPriority), - }, - { - name: "WhenPriorityNamesAreDuplicated", - config: Config{ - PriorityBands: []PriorityBandConfig{ - {Priority: highPriority, PriorityName: "A"}, - {Priority: lowPriority, PriorityName: "A"}, - }, - }, - expectErrSubStr: `duplicate priority name "A"`, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - t.Parallel() - _, err := NewFlowRegistry(tc.config, logr.Discard()) - require.Error(t, err, "NewFlowRegistry should fail with an invalid config") - assert.Contains(t, err.Error(), tc.expectErrSubStr, "Error message should contain the expected reason") - }) - } - }) - t.Run("ShouldFail_WhenInitialShardCreationFails", func(t *testing.T) { t.Parallel() - config, err := NewConfig( + config, err := newConfig( Config{PriorityBands: []PriorityBandConfig{{Priority: highPriority, PriorityName: "A"}}}, withInterFlowDispatchPolicyFactory(func(inter.RegisteredPolicyName) (framework.InterFlowDispatchPolicy, error) { return nil, errors.New("injected factory failure") @@ -261,7 +204,7 @@ func TestFlowRegistry_WithConnection_AndHandle(t *testing.T) { assert.ErrorContains(t, err, "injected factory failure", "The returned error must propagate the reason") }) - t.Run("Handle_Shards_ShouldReturnAllShardsAndBeACopy", func(t *testing.T) { + t.Run("Handle_Shards_ShouldReturnAllActiveShardsAndBeACopy", func(t *testing.T) { t.Parallel() // Create a registry with a known mixed topology of Active and Draining shards. 
h := newRegistryTestHarness(t, harnessOptions{initialShardCount: 3}) @@ -272,9 +215,9 @@ func TestFlowRegistry_WithConnection_AndHandle(t *testing.T) { key := types.FlowKey{ID: "test-flow", Priority: highPriority} err = h.fr.WithConnection(key, func(conn contracts.ActiveFlowConnection) error { - shards := conn.Shards() + shards := conn.ActiveShards() - assert.Len(t, shards, 3, "Shards() must return all configured shards, including Draining ones") + assert.Len(t, shards, 2, "ActiveShards() must only return the Active shards") // Assert it's a copy by maliciously modifying it. require.NotEmpty(t, shards, "Test setup assumes shards are present") @@ -285,8 +228,8 @@ func TestFlowRegistry_WithConnection_AndHandle(t *testing.T) { require.NoError(t, err) // Prove the registry's internal state was not mutated by the modification. - assert.NotNil(t, h.fr.allShards[0], - "Modifying the slice returned by Shards() must not affect the registry's internal state") + assert.NotNil(t, h.fr.activeShards[0], + "Modifying the slice returned by ActiveShards() must not affect the registry's internal state") }) } @@ -323,6 +266,9 @@ func TestFlowRegistry_Stats(t *testing.T) { require.Len(t, shardStats, 2, "Should return stats for 2 shards") var totalShardLen, totalShardBytes uint64 for _, ss := range shardStats { + assert.True(t, ss.IsActive, "All shards should be active in this test") + assert.NotEmpty(t, ss.PerPriorityBandStats, "Each shard should have stats for its priority bands") + assert.NotEmpty(t, ss.ID, "Each shard should have a non-empty ID") totalShardLen += ss.TotalLen totalShardBytes += ss.TotalByteSize } @@ -541,14 +487,6 @@ func TestFlowRegistry_UpdateShardCount(t *testing.T) { expectedPartitionedGlobalCapacities: map[uint64]int{25: 4}, expectedPartitionedBandCapacities: map[uint64]int{12: 2, 13: 2}, }, - { - name: "Succeeds_ScaleUp_FromZero", - initialShardCount: 0, - targetShardCount: 4, - expectedActiveCount: 4, - expectedPartitionedGlobalCapacities: map[uint64]int{25: 4}, - expectedPartitionedBandCapacities: map[uint64]int{12: 2, 13: 2}, - }, { name: "Succeeds_ScaleDown_ToOne", initialShardCount: 3, @@ -589,7 +527,7 @@ func TestFlowRegistry_UpdateShardCount(t *testing.T) { } h := newRegistryTestHarness(t, harnessOptions{config: &config}) - key := types.FlowKey{ID: "flow", Priority: 10} + key := types.FlowKey{ID: "flow", Priority: highPriority} h.openConnectionOnFlow(key) err := h.fr.updateShardCount(tc.targetShardCount) @@ -600,24 +538,19 @@ func TestFlowRegistry_UpdateShardCount(t *testing.T) { require.NoError(t, err, "UpdateShardCount should not have returned an error") } - var finalActiveCount, finalDrainingCount int globalCapacities := make(map[uint64]int) bandCapacities := make(map[uint64]int) - err = h.fr.WithConnection(key, func(conn contracts.ActiveFlowConnection) error { - for _, shard := range conn.Shards() { - if shard.IsActive() { - finalActiveCount++ - stats := shard.Stats() - globalCapacities[stats.TotalCapacityBytes]++ - bandCapacities[stats.PerPriorityBandStats[highPriority].CapacityBytes]++ - h.assertFlowExists(key, "Shard %s should contain the existing flow", shard.ID()) - } else { - finalDrainingCount++ - } - } - return nil - }) - require.NoError(t, err, "WithConnection should not fail") + + h.fr.mu.RLock() + finalActiveCount := len(h.fr.activeShards) + finalDrainingCount := len(h.fr.drainingShards) + for _, shard := range h.fr.activeShards { + stats := shard.Stats() + globalCapacities[stats.TotalCapacityBytes]++ + 
bandCapacities[stats.PerPriorityBandStats[highPriority].CapacityBytes]++ + h.assertFlowExists(key, "Shard %s should contain the existing flow", shard.ID()) + } + h.fr.mu.RUnlock() expectedDrainingCount := 0 if tc.initialShardCount > tc.expectedActiveCount { diff --git a/pkg/epp/flowcontrol/registry/shard.go b/pkg/epp/flowcontrol/registry/shard.go index 36032e42c..4a7918e79 100644 --- a/pkg/epp/flowcontrol/registry/shard.go +++ b/pkg/epp/flowcontrol/registry/shard.go @@ -18,7 +18,7 @@ package registry import ( "fmt" - "slices" + "sort" "sync" "sync/atomic" @@ -76,7 +76,7 @@ type registryShard struct { // onStatsDelta is the callback used to propagate statistics changes up to the parent registry. onStatsDelta propagateStatsDeltaFunc // orderedPriorityLevels is a cached, sorted list of priority levels. - orderedPriorityLevels []uint + orderedPriorityLevels []int // --- State Protected by `mu` --- @@ -88,7 +88,7 @@ type registryShard struct { // config holds the partitioned configuration for this shard, derived from the `FlowRegistry`'s global `Config`. config *ShardConfig // priorityBands is the primary lookup table for all managed queues on this shard. - priorityBands map[uint]*priorityBand + priorityBands map[int]*priorityBand // --- Concurrent-Safe State (Atomics) --- @@ -116,7 +116,7 @@ func newShard( logger: shardLogger, config: config, onStatsDelta: onStatsDelta, - priorityBands: make(map[uint]*priorityBand, len(config.PriorityBands)), + priorityBands: make(map[int]*priorityBand, len(config.PriorityBands)), } for _, bandConfig := range config.PriorityBands { @@ -133,8 +133,9 @@ func newShard( } s.orderedPriorityLevels = append(s.orderedPriorityLevels, bandConfig.Priority) } - - slices.Sort(s.orderedPriorityLevels) + sort.Slice(s.orderedPriorityLevels, func(i, j int) bool { + return s.orderedPriorityLevels[i] > s.orderedPriorityLevels[j] + }) s.logger.V(logging.DEFAULT).Info("Registry shard initialized successfully", "priorityBandCount", len(s.priorityBands), "orderedPriorities", s.orderedPriorityLevels) return s, nil @@ -184,7 +185,7 @@ func (s *registryShard) IntraFlowDispatchPolicy(key types.FlowKey) (framework.In // InterFlowDispatchPolicy retrieves a priority band's configured `framework.InterFlowDispatchPolicy`. // This read is lock-free as the policy instance is immutable after the shard is initialized. -func (s *registryShard) InterFlowDispatchPolicy(priority uint) (framework.InterFlowDispatchPolicy, error) { +func (s *registryShard) InterFlowDispatchPolicy(priority int) (framework.InterFlowDispatchPolicy, error) { // This read is safe because the `priorityBands` map structure is immutable after initialization. band, ok := s.priorityBands[priority] if !ok { @@ -195,7 +196,7 @@ func (s *registryShard) InterFlowDispatchPolicy(priority uint) (framework.InterF } // PriorityBandAccessor retrieves a read-only view for a given priority level. -func (s *registryShard) PriorityBandAccessor(priority uint) (framework.PriorityBandAccessor, error) { +func (s *registryShard) PriorityBandAccessor(priority int) (framework.PriorityBandAccessor, error) { s.mu.RLock() defer s.mu.RUnlock() @@ -209,7 +210,7 @@ func (s *registryShard) PriorityBandAccessor(priority uint) (framework.PriorityB // AllOrderedPriorityLevels returns a cached, sorted slice of all configured priority levels for this shard. // This is a lock-free read. 
-func (s *registryShard) AllOrderedPriorityLevels() []uint { +func (s *registryShard) AllOrderedPriorityLevels() []int { return s.orderedPriorityLevels } @@ -227,10 +228,12 @@ func (s *registryShard) Stats() contracts.ShardStats { // Casts from `int64` to `uint64` are safe because the non-negative invariant is strictly enforced at the // `managedQueue` level. stats := contracts.ShardStats{ + ID: s.id, + IsActive: s.IsActive(), TotalCapacityBytes: s.config.MaxBytes, TotalByteSize: uint64(s.totalByteSize.Load()), TotalLen: uint64(s.totalLen.Load()), - PerPriorityBandStats: make(map[uint]contracts.PriorityBandStats, len(s.priorityBands)), + PerPriorityBandStats: make(map[int]contracts.PriorityBandStats, len(s.priorityBands)), } for priority, band := range s.priorityBands { @@ -325,7 +328,7 @@ func (s *registryShard) updateConfig(newConfig *ShardConfig) { // propagateStatsDelta is the single point of entry for all statistics changes within the shard. // It atomically updates the relevant band's stats, the shard's total stats, and propagates the delta to the parent // registry. -func (s *registryShard) propagateStatsDelta(priority uint, lenDelta, byteSizeDelta int64) { +func (s *registryShard) propagateStatsDelta(priority int, lenDelta, byteSizeDelta int64) { // This read is safe because the `priorityBands` map structure is immutable after initialization. band, ok := s.priorityBands[priority] if !ok { @@ -355,7 +358,7 @@ type priorityBandAccessor struct { var _ framework.PriorityBandAccessor = &priorityBandAccessor{} // Priority returns the numerical priority level of this band. -func (a *priorityBandAccessor) Priority() uint { return a.band.config.Priority } +func (a *priorityBandAccessor) Priority() int { return a.band.config.Priority } // PriorityName returns the human-readable name of this priority band. func (a *priorityBandAccessor) PriorityName() string { return a.band.config.PriorityName } diff --git a/pkg/epp/flowcontrol/registry/shard_test.go b/pkg/epp/flowcontrol/registry/shard_test.go index 214497e41..23bf81325 100644 --- a/pkg/epp/flowcontrol/registry/shard_test.go +++ b/pkg/epp/flowcontrol/registry/shard_test.go @@ -37,11 +37,11 @@ import ( const ( // highPriority is the priority level for the "High" priority band in the test harness config. - highPriority uint = 10 + highPriority int = 20 // lowPriority is the priority level for the "Low" priority band in the test harness config. - lowPriority uint = 20 + lowPriority int = 10 // nonExistentPriority is a priority that is known not to exist in the test harness config. - nonExistentPriority uint = 99 + nonExistentPriority int = 99 ) // --- Test Harness and Mocks --- @@ -59,7 +59,7 @@ type shardTestHarness struct { // newShardTestHarness initializes a `shardTestHarness` with a default configuration. 
func newShardTestHarness(t *testing.T) *shardTestHarness { t.Helper() - globalConfig, err := NewConfig(Config{ + globalConfig, err := newConfig(Config{ PriorityBands: []PriorityBandConfig{ {Priority: highPriority, PriorityName: "High"}, {Priority: lowPriority, PriorityName: "Low"}, @@ -133,7 +133,7 @@ func TestShard_New(t *testing.T) { assert.Equal(t, "test-shard-1", h.shard.ID(), "Shard ID must match the value provided during construction") assert.True(t, h.shard.IsActive(), "A newly created shard must be initialized in the Active state") - assert.Equal(t, []uint{highPriority, lowPriority}, h.shard.AllOrderedPriorityLevels(), + assert.Equal(t, []int{highPriority, lowPriority}, h.shard.AllOrderedPriorityLevels(), "Shard must report configured priority levels sorted numerically (highest priority first)") bandHigh, ok := h.shard.priorityBands[highPriority] @@ -146,7 +146,7 @@ func TestShard_New(t *testing.T) { t.Run("ShouldFail_WhenInterFlowPolicyFactoryFails", func(t *testing.T) { t.Parallel() - shardConfig, _ := NewConfig(Config{PriorityBands: []PriorityBandConfig{ + shardConfig, _ := newConfig(Config{PriorityBands: []PriorityBandConfig{ {Priority: highPriority, PriorityName: "High"}, }}) failingFactory := func(inter.RegisteredPolicyName) (framework.InterFlowDispatchPolicy, error) { @@ -165,6 +165,8 @@ func TestShard_Stats(t *testing.T) { stats := h.shard.Stats() + assert.Equal(t, h.shard.ID(), stats.ID, "Stats ID must match the shard ID") + assert.True(t, stats.IsActive, "Shard must report itself as active in the stats snapshot") assert.Equal(t, uint64(2), stats.TotalLen, "Total shard length must aggregate counts from all bands") assert.Equal(t, uint64(150), stats.TotalByteSize, "Total shard byte size must aggregate sizes from all bands") diff --git a/pkg/epp/flowcontrol/types/errors.go b/pkg/epp/flowcontrol/types/errors.go index f7dffbd4d..8c966bb45 100644 --- a/pkg/epp/flowcontrol/types/errors.go +++ b/pkg/epp/flowcontrol/types/errors.go @@ -43,11 +43,8 @@ var ( // The following errors can occur before a request is formally added to a `framework.SafeQueue`. When returned by // `FlowController.EnqueueAndWait()`, these specific errors will typically be wrapped by `ErrRejected`. var ( - // ErrNilRequest indicates that a nil `types.FlowControlRequest` was provided. - ErrNilRequest = errors.New("FlowControlRequest cannot be nil") - // ErrQueueAtCapacity indicates that a request could not be enqueued because queue capacity limits were met. - ErrQueueAtCapacity = errors.New("queue at capacity and displacement failed to make space") + ErrQueueAtCapacity = errors.New("queue at capacity") ) // --- Post-Enqueue Eviction Errors --- @@ -68,10 +65,10 @@ var ( // --- General `controller.FlowController` Errors --- var ( - // ErrFlowControllerShutdown indicates that an operation could not complete or an item was evicted because the - // `controller.FlowController` is shutting down or has stopped. + // ErrFlowControllerNotRunning indicates that an operation could not complete or an item was evicted because the + // `controller.FlowController` is not running or is in the process of shutting down. // // When returned by `FlowController.EnqueueAndWait()`, this will be wrapped by `ErrRejected` (if rejection happens // before internal queuing) or `ErrEvicted` (if eviction happens after internal queuing). 
-	ErrFlowControllerShutdown = errors.New("FlowController is shutting down")
+	ErrFlowControllerNotRunning = errors.New("flow controller is not running")
 )
diff --git a/pkg/epp/flowcontrol/types/flow.go b/pkg/epp/flowcontrol/types/flow.go
index 71017c13c..2af2d5bd0 100644
--- a/pkg/epp/flowcontrol/types/flow.go
+++ b/pkg/epp/flowcontrol/types/flow.go
@@ -41,7 +41,7 @@ type FlowKey struct {
	//
	// Because the `FlowKey` is immutable, changing the priority of traffic requires using a new `FlowKey`; the old flow
	// instance will be automatically garbage collected by the registry when it becomes idle.
-	Priority uint
+	Priority int
 }

 func (k FlowKey) String() string {
@@ -49,13 +49,13 @@ func (k FlowKey) String() string {
 }

 // Compare provides a stable comparison function for two FlowKey instances, suitable for use with sorting algorithms.
-// It returns -1 if the key is less than the other, 0 if they are equal, and 1 if the key is greater than the other.
-// The comparison is performed first by `Priority` (ascending, higher priority first) and then by `ID` (ascending).
+// It returns -1 if the key sorts before the other, 0 if they are equal, and 1 if it sorts after the other.
+// The comparison is performed first by `Priority` (descending, so higher priority levels sort first) and then by `ID` (ascending).
 func (k FlowKey) Compare(other FlowKey) int {
-	if k.Priority < other.Priority { // Lower number means higher priority
+	if k.Priority > other.Priority { // Higher number means higher priority
		return -1
	}
-	if k.Priority > other.Priority {
+	if k.Priority < other.Priority {
		return 1
	}
	return strings.Compare(k.ID, other.ID)
diff --git a/pkg/epp/flowcontrol/types/mocks/mocks.go b/pkg/epp/flowcontrol/types/mocks/mocks.go
index dbef031d7..5fabf3683 100644
--- a/pkg/epp/flowcontrol/types/mocks/mocks.go
+++ b/pkg/epp/flowcontrol/types/mocks/mocks.go
@@ -19,19 +19,19 @@ limitations under the License.
 package mocks

 import (
-	"context"
	"time"

+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types"
 )

 // MockFlowControlRequest provides a mock implementation of the `types.FlowControlRequest` interface.
 type MockFlowControlRequest struct {
-	Ctx                  context.Context
-	FlowKeyV             types.FlowKey
-	ByteSizeV            uint64
-	InitialEffectiveTTLV time.Duration
-	IDV                  string
+	FlowKeyV                    types.FlowKey
+	ByteSizeV                   uint64
+	InitialEffectiveTTLV        time.Duration
+	IDV                         string
+	CandidatePodsForSchedulingV []*metrics.FakePodMetrics
 }

 // NewMockFlowControlRequest creates a new `MockFlowControlRequest` instance.
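Because the priority semantics flip here (a higher number now means higher priority), a short worked example of the new `Compare` ordering may help; the keys below are hypothetical:

```go
package main

import (
	"fmt"
	"sort"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types"
)

func main() {
	keys := []types.FlowKey{
		{ID: "b", Priority: 10},
		{ID: "a", Priority: 20},
		{ID: "a", Priority: 10},
	}
	sort.Slice(keys, func(i, j int) bool { return keys[i].Compare(keys[j]) < 0 })
	// Priority 20 sorts first (highest priority); the two Priority-10 keys tie
	// on priority and fall back to ID order: "a" before "b".
	fmt.Println(keys)
}
```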
@@ -39,25 +39,27 @@ func NewMockFlowControlRequest(
	byteSize uint64,
	id string,
	key types.FlowKey,
-	ctx context.Context,
 ) *MockFlowControlRequest {
-	if ctx == nil {
-		ctx = context.Background()
-	}
	return &MockFlowControlRequest{
		ByteSizeV: byteSize,
		IDV:       id,
		FlowKeyV:  key,
-		Ctx:       ctx,
	}
 }

-func (m *MockFlowControlRequest) Context() context.Context { return m.Ctx }
 func (m *MockFlowControlRequest) FlowKey() types.FlowKey { return m.FlowKeyV }
 func (m *MockFlowControlRequest) ByteSize() uint64 { return m.ByteSizeV }
 func (m *MockFlowControlRequest) InitialEffectiveTTL() time.Duration { return m.InitialEffectiveTTLV }
 func (m *MockFlowControlRequest) ID() string { return m.IDV }

+func (m *MockFlowControlRequest) CandidatePodsForScheduling() []metrics.PodMetrics {
+	pods := make([]metrics.PodMetrics, len(m.CandidatePodsForSchedulingV))
+	for i, pod := range m.CandidatePodsForSchedulingV {
+		pods[i] = pod
+	}
+	return pods
+}
+
 var _ types.FlowControlRequest = &MockFlowControlRequest{}

 // MockQueueItemHandle provides a mock implementation of the `types.QueueItemHandle` interface.
@@ -104,7 +106,6 @@ func NewMockQueueItemAccessor(byteSize uint64, reqID string, key types.FlowKey)
			byteSize,
			reqID,
			key,
-			context.Background(),
		),
		HandleV: &MockQueueItemHandle{},
	}
diff --git a/pkg/epp/flowcontrol/types/request.go b/pkg/epp/flowcontrol/types/request.go
index 756940704..e427b0aba 100644
--- a/pkg/epp/flowcontrol/types/request.go
+++ b/pkg/epp/flowcontrol/types/request.go
@@ -17,8 +17,9 @@ limitations under the License.
 package types

 import (
-	"context"
	"time"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
 )

 // FlowControlRequest is the contract for an incoming request submitted to the `controller.FlowController`. It
@@ -28,11 +29,6 @@ import (
 // wraps this object with its own internal structures (which implement `QueueItemAccessor`) to manage the request's
 // lifecycle without modifying the original.
 type FlowControlRequest interface {
-	// Context returns the request's context. The `controller.FlowController` uses this for monitoring cancellation (e.g.,
-	// if the client disconnects or a request-scoped timeout occurs), which can lead to the request being evicted from a
-	// queue.
-	Context() context.Context
-
	// FlowKey returns the composite key that uniquely identifies the flow instance this request belongs to.
	// The `controller.FlowController` uses this key as the primary identifier to look up the correct
	// `contracts.ManagedQueue` and configured `framework.IntraFlowDispatchPolicy` from a `contracts.RegistryShard`.
@@ -49,6 +45,11 @@ type FlowControlRequest interface {
	// applied.
	InitialEffectiveTTL() time.Duration

+	// CandidatePodsForScheduling passes through a set of candidate pods a request may be admitted to.
+	// This is necessary for invoking `contracts.SaturationDetector.IsSaturated`, but it is otherwise unused in the Flow
+	// Control system.
+	CandidatePodsForScheduling() []metrics.PodMetrics
+
	// ID returns an optional, user-facing unique identifier for this specific request. It is intended for logging,
	// tracing, and observability. The `controller.FlowController` does not use this ID for dispatching decisions; it uses
	// the internal, opaque `QueueItemHandle`.
@@ -92,7 +93,8 @@ type QueueItemAccessor interface {
	OriginalRequest() FlowControlRequest

	// EnqueueTime is the timestamp when the item was logically accepted by the `controller.FlowController` for queuing
-	// (i.e., when `controller.FlowController.EnqueueAndWait()` was called).
+ // (i.e., when `controller.FlowController.EnqueueAndWait()` was called). It does not reflect the time the request + // landed in a `framework.SafeQueue` instance. EnqueueTime() time.Time // EffectiveTTL is the actual Time-To-Live assigned to this item by the `controller.FlowController`, taking into diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go index 7f8122195..bfc4e147a 100644 --- a/pkg/epp/handlers/request.go +++ b/pkg/epp/handlers/request.go @@ -17,7 +17,6 @@ limitations under the License. package handlers import ( - "fmt" "strconv" "time" @@ -29,6 +28,13 @@ import ( errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" ) +const ( + // defaultFairnessID is the default fairness ID used when no ID is provided in the request. + // This ensures that requests without explicit fairness identifiers are still grouped and managed by the Flow Control + // system. + defaultFairnessID = "default-flow" +) + func (s *StreamingServer) HandleRequestHeaders(reqCtx *RequestContext, req *extProcPb.ProcessingRequest_RequestHeaders) error { reqCtx.RequestReceivedTimestamp = time.Now() @@ -42,14 +48,7 @@ func (s *StreamingServer) HandleRequestHeaders(reqCtx *RequestContext, req *extP if pod == nil { return errutil.Error{Code: errutil.Internal, Msg: "no pods available in datastore"} } - pool, err := s.datastore.PoolGet() - if err != nil { - return err - } - if len(pool.Spec.TargetPorts) != 1 { - return fmt.Errorf("expected 1 target port, got %d", len(pool.Spec.TargetPorts)) - } - reqCtx.TargetEndpoint = pod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPorts[0].Number)) + reqCtx.TargetEndpoint = pod.GetIPAddress() + ":" + pod.GetPort() reqCtx.RequestSize = 0 reqCtx.reqHeaderResp = s.generateRequestHeaderResponse(reqCtx) return nil @@ -80,6 +79,11 @@ func (s *StreamingServer) HandleRequestHeaders(reqCtx *RequestContext, req *extP delete(reqCtx.Request.Headers, header.Key) } } + + if reqCtx.FairnessID == "" { + reqCtx.FairnessID = defaultFairnessID + } + return nil } diff --git a/pkg/epp/handlers/request_test.go b/pkg/epp/handlers/request_test.go index 4ae207803..a3ef90cb4 100644 --- a/pkg/epp/handlers/request_test.go +++ b/pkg/epp/handlers/request_test.go @@ -21,6 +21,7 @@ import ( configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/stretchr/testify/assert" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metadata" ) @@ -66,3 +67,32 @@ func TestHandleRequestHeaders(t *testing.T) { t.Errorf("expected fairness ID header to be removed from request headers, but it was not") } } + +func TestHandleRequestHeaders_DefaultFairnessID(t *testing.T) { + t.Parallel() + + server := &StreamingServer{} + reqCtx := &RequestContext{ + Request: &Request{ + Headers: make(map[string]string), + }, + } + + req := &extProcPb.ProcessingRequest_RequestHeaders{ + RequestHeaders: &extProcPb.HttpHeaders{ + Headers: &configPb.HeaderMap{ + Headers: []*configPb.HeaderValue{ + { + Key: "x-test-header", + Value: "test-value", + }, + }, + }, + EndOfStream: false, + }, + } + + err := server.HandleRequestHeaders(reqCtx, req) + assert.NoError(t, err, "expected no error") + assert.Equal(t, defaultFairnessID, reqCtx.FairnessID, "expected fairness ID to be defaulted") +} diff --git a/pkg/epp/handlers/response.go b/pkg/epp/handlers/response.go index d0c3b020a..9c2f44be5 100644 --- a/pkg/epp/handlers/response.go +++ b/pkg/epp/handlers/response.go @@ -19,6 +19,7 @@ package handlers import ( 
"context" "encoding/json" + "fmt" "strings" configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" @@ -41,8 +42,7 @@ func (s *StreamingServer) HandleResponseBody(ctx context.Context, reqCtx *Reques logger := log.FromContext(ctx) responseBytes, err := json.Marshal(response) if err != nil { - logger.V(logutil.DEFAULT).Error(err, "error marshalling responseBody") - return reqCtx, err + return reqCtx, fmt.Errorf("error marshalling responseBody - %w", err) } if response["usage"] != nil { usg := response["usage"].(map[string]any) @@ -63,24 +63,28 @@ func (s *StreamingServer) HandleResponseBody(ctx context.Context, reqCtx *Reques reqCtx.ResponseComplete = true reqCtx.respBodyResp = generateResponseBodyResponses(responseBytes, true, reqCtx, logger) - return reqCtx, nil + + return s.director.HandleResponseBodyComplete(ctx, reqCtx) } // The function is to handle streaming response if the modelServer is streaming. func (s *StreamingServer) HandleResponseBodyModelStreaming(ctx context.Context, reqCtx *RequestContext, responseText string) { + logger := log.FromContext(ctx) + _, err := s.director.HandleResponseBodyStreaming(ctx, reqCtx) + if err != nil { + logger.Error(err, "error in HandleResponseBodyStreaming") + } if strings.Contains(responseText, streamingEndMsg) { reqCtx.ResponseComplete = true resp := parseRespForUsage(ctx, responseText) reqCtx.Usage = resp.Usage metrics.RecordInputTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, resp.Usage.PromptTokens) metrics.RecordOutputTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, resp.Usage.CompletionTokens) - if s.director != nil { - s.director.HandleResponseBodyComplete(ctx, reqCtx) + _, err := s.director.HandleResponseBodyComplete(ctx, reqCtx) + if err != nil { + logger.Error(err, "error in HandleResponseBodyComplete") } } - if s.director != nil { - s.director.HandleResponseBodyChunk(ctx, reqCtx) - } } func (s *StreamingServer) HandleResponseHeaders(ctx context.Context, reqCtx *RequestContext, resp *extProcPb.ProcessingRequest_ResponseHeaders) (*RequestContext, error) { @@ -92,7 +96,7 @@ func (s *StreamingServer) HandleResponseHeaders(ctx context.Context, reqCtx *Req } } - reqCtx, err := s.director.HandleResponse(ctx, reqCtx) + reqCtx, err := s.director.HandleResponseReceived(ctx, reqCtx) return reqCtx, err } @@ -173,16 +177,6 @@ func generateResponseBodyResponses( continue } - // Add metrics to usage if present - if usage, ok := obj["usage"].(map[string]interface{}); ok && usage != nil { - usage["ttft_ms"] = reqCtx.TTFT - usage["predicted_ttft_ms"] = reqCtx.PredictedTTFT - usage["tpot_observations_ms"] = reqCtx.TPOTObservations - usage["predicted_tpot_observations_ms"] = reqCtx.PredictedTPOTObservations - usage["avg_tpot_ms"] = reqCtx.AvgTPOT - usage["avg_predicted_tpot_ms"] = reqCtx.AvgPredictedTPOT - } - // Re-marshal and reconstruct SSE format if modifiedBytes, err := json.Marshal(obj); err != nil { logger.Error(err, "failed to re-marshal modified JSON", "obj", obj) diff --git a/pkg/epp/handlers/response_test.go b/pkg/epp/handlers/response_test.go index 6eb7734e4..63b2de0da 100644 --- a/pkg/epp/handlers/response_test.go +++ b/pkg/epp/handlers/response_test.go @@ -23,6 +23,7 @@ import ( "github.com/google/go-cmp/cmp" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -59,6 +60,27 @@ data: [DONE] ` ) +type mockDirector struct{} + +func (m *mockDirector) HandleResponseBodyStreaming(ctx context.Context, reqCtx *RequestContext) 
(*RequestContext, error) { + return reqCtx, nil +} +func (m *mockDirector) HandleResponseBodyComplete(ctx context.Context, reqCtx *RequestContext) (*RequestContext, error) { + return reqCtx, nil +} +func (m *mockDirector) HandleResponseReceived(ctx context.Context, reqCtx *RequestContext) (*RequestContext, error) { + return reqCtx, nil +} +func (m *mockDirector) HandlePreRequest(ctx context.Context, reqCtx *RequestContext) (*RequestContext, error) { + return reqCtx, nil +} +func (m *mockDirector) GetRandomPod() *backend.Pod { + return &backend.Pod{} +} +func (m *mockDirector) HandleRequest(ctx context.Context, reqCtx *RequestContext) (*RequestContext, error) { + return reqCtx, nil +} + func TestHandleResponseBody(t *testing.T) { ctx := logutil.NewTestLoggerIntoContext(context.Background()) @@ -83,6 +105,7 @@ func TestHandleResponseBody(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { server := &StreamingServer{} + server.director = &mockDirector{} reqCtx := test.reqCtx if reqCtx == nil { reqCtx = &RequestContext{} @@ -143,6 +166,7 @@ func TestHandleStreamedResponseBody(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { server := &StreamingServer{} + server.director = &mockDirector{} reqCtx := test.reqCtx if reqCtx == nil { reqCtx = &RequestContext{} diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go index 0e0a1d03d..274dcf32c 100644 --- a/pkg/epp/handlers/server.go +++ b/pkg/epp/handlers/server.go @@ -33,7 +33,6 @@ import ( v1 "sigs.k8s.io/gateway-api-inference-extension/api/v1" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" - backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" @@ -55,9 +54,9 @@ func NewStreamingServer(datastore Datastore, director Director) *StreamingServer type Director interface { HandleRequest(ctx context.Context, reqCtx *RequestContext) (*RequestContext, error) - HandleResponse(ctx context.Context, reqCtx *RequestContext) (*RequestContext, error) - HandleResponseBodyChunk(ctx context.Context, reqCtx *RequestContext) error - HandleResponseBodyComplete(ctx context.Context, reqCtx *RequestContext) error + HandleResponseReceived(ctx context.Context, reqCtx *RequestContext) (*RequestContext, error) + HandleResponseBodyStreaming(ctx context.Context, reqCtx *RequestContext) (*RequestContext, error) + HandleResponseBodyComplete(ctx context.Context, reqCtx *RequestContext) (*RequestContext, error) GetRandomPod() *backend.Pod } @@ -85,7 +84,6 @@ type RequestContext struct { ObjectiveKey string RequestReceivedTimestamp time.Time ResponseCompleteTimestamp time.Time - LastTokenTimestamp time.Time RequestSize int Usage Usage ResponseSize int @@ -93,24 +91,12 @@ type RequestContext struct { ResponseStatusCode string RequestRunning bool Request *Request - GeneratedTokenCount int - LastSeenMetrics map[string]*backendmetrics.MetricsState - SchedulingResult *schedulingtypes.SchedulingResult SchedulingRequest *schedulingtypes.LLMRequest RequestState StreamRequestState modelServerStreaming bool - // -- New fields for latency predictor -- - TTFT float64 - PredictedTTFT float64 - AvgTPOT float64 - AvgPredictedTPOT float64 - TokenSampler *requtil.TokenSampler - TPOTObservations []float64 - PredictedTPOTObservations []float64 - Response 
*Response reqHeaderResp *extProcPb.ProcessingResponse @@ -138,7 +124,7 @@ const ( HeaderRequestResponseComplete StreamRequestState = 1 BodyRequestResponsesComplete StreamRequestState = 2 TrailerRequestResponsesComplete StreamRequestState = 3 - ResponseRecieved StreamRequestState = 4 + ResponseReceived StreamRequestState = 4 HeaderResponseResponseComplete StreamRequestState = 5 BodyResponseResponsesComplete StreamRequestState = 6 TrailerResponseResponsesComplete StreamRequestState = 7 @@ -195,9 +181,6 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer) return nil } if recvErr != nil { - // This error occurs very frequently, though it doesn't seem to have any impact. - // TODO Figure out if we can remove this noise. - logger.V(logutil.DEFAULT).Error(err, "Cannot receive stream request") return status.Errorf(codes.Unknown, "cannot receive stream request: %v", err) } @@ -272,7 +255,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer) loggerTrace.Info("model server is streaming response") } } - reqCtx.RequestState = ResponseRecieved + reqCtx.RequestState = ResponseReceived var responseErr error reqCtx, responseErr = s.HandleResponseHeaders(ctx, reqCtx, v) @@ -399,7 +382,7 @@ func (r *RequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProces return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) } } - if r.RequestState == ResponseRecieved && r.respHeaderResp != nil { + if r.RequestState == ResponseReceived && r.respHeaderResp != nil { loggerTrace.Info("Sending response header response", "obj", r.respHeaderResp) if err := srv.Send(r.respHeaderResp); err != nil { return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) diff --git a/pkg/epp/metrics/collectors/inference_pool_test.go b/pkg/epp/metrics/collectors/inference_pool_test.go index dcac3b37d..af2923e50 100644 --- a/pkg/epp/metrics/collectors/inference_pool_test.go +++ b/pkg/epp/metrics/collectors/inference_pool_test.go @@ -40,7 +40,7 @@ var ( Name: "pod1", }, } - pod1NamespacedName = types.NamespacedName{Name: pod1.Name, Namespace: pod1.Namespace} + pod1NamespacedName = types.NamespacedName{Name: pod1.Name + "-rank-0", Namespace: pod1.Namespace} pod1Metrics = &backendmetrics.MetricsState{ WaitingQueueSize: 100, KVCacheUsagePercent: 0.2, @@ -50,10 +50,10 @@ var ( func TestNoMetricsCollected(t *testing.T) { pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) - datastore := datastore.NewDatastore(context.Background(), pmf) + ds := datastore.NewDatastore(context.Background(), pmf, 0) collector := &inferencePoolMetricsCollector{ - ds: datastore, + ds: ds, } if err := testutil.CollectAndCompare(collector, strings.NewReader(""), ""); err != nil { @@ -68,7 +68,7 @@ func TestMetricsCollected(t *testing.T) { }, } pmf := backendmetrics.NewPodMetricsFactory(pmc, time.Millisecond) - ds := datastore.NewDatastore(context.Background(), pmf) + ds := datastore.NewDatastore(context.Background(), pmf, 0) scheme := runtime.NewScheme() fakeClient := fake.NewClientBuilder(). @@ -94,7 +94,7 @@ func TestMetricsCollected(t *testing.T) { err := testutil.CollectAndCompare(collector, strings.NewReader(` # HELP inference_pool_per_pod_queue_size [ALPHA] The total number of requests pending in the model server queue for each underlying pod. 
# TYPE inference_pool_per_pod_queue_size gauge - inference_pool_per_pod_queue_size{model_server_pod="pod1",name="test-pool"} 100 + inference_pool_per_pod_queue_size{model_server_pod="pod1-rank-0",name="test-pool"} 100 `), "inference_pool_per_pod_queue_size") if err != nil { t.Fatal(err) diff --git a/pkg/epp/metrics/metrics.go b/pkg/epp/metrics/metrics.go index f5910099e..e8deaaab3 100644 --- a/pkg/epp/metrics/metrics.go +++ b/pkg/epp/metrics/metrics.go @@ -31,34 +31,34 @@ import ( ) const ( - InferenceModelComponent = "inference_model" - InferencePoolComponent = "inference_pool" - InferenceExtension = "inference_extension" + InferenceObjectiveComponent = "inference_objective" + InferencePoolComponent = "inference_pool" + InferenceExtension = "inference_extension" ) var ( - // Inference Model Metrics + // Inference Objective Metrics requestCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_total", - Help: metricsutil.HelpMsgWithStability("Counter of inference model requests broken out for each model and target model.", compbasemetrics.ALPHA), + Help: metricsutil.HelpMsgWithStability("Counter of inference objective requests broken out for each model and target model.", compbasemetrics.ALPHA), }, []string{"model_name", "target_model_name"}, ) requestErrCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_error_total", - Help: metricsutil.HelpMsgWithStability("Counter of inference model requests errors broken out for each model and target model.", compbasemetrics.ALPHA), + Help: metricsutil.HelpMsgWithStability("Counter of inference objective requests errors broken out for each model and target model.", compbasemetrics.ALPHA), }, []string{"model_name", "target_model_name", "error_code"}, ) requestTTFT = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_ttft_seconds", Help: metricsutil.HelpMsgWithStability("Inference model TTFT distribution in seconds for each model and target model.", compbasemetrics.ALPHA), Buckets: []float64{ @@ -71,7 +71,7 @@ var ( requestTTFTGauge = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_ttft_seconds_gauge", Help: metricsutil.HelpMsgWithStability("Inference model TTFT gauge in seconds for each model and target model.", compbasemetrics.ALPHA), }, @@ -80,7 +80,7 @@ var ( requestPredictedTTFT = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_predicted_ttft_seconds", Help: metricsutil.HelpMsgWithStability("Inference model Predicted TTFT distribution in seconds for each model and target model.", compbasemetrics.ALPHA), Buckets: []float64{ @@ -93,7 +93,7 @@ var ( requestPredictedTTFTGauge = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_predicted_ttft_seconds_gauge", Help: metricsutil.HelpMsgWithStability("Inference model Predicted TTFT gauge in seconds for each model and target model.", compbasemetrics.ALPHA), }, @@ -103,7 +103,7 @@ var ( // New metrics for TTFT prediction duration requestTTFTPredictionDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Subsystem: 
InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_ttft_prediction_duration_seconds", Help: metricsutil.HelpMsgWithStability("Duration taken to generate TTFT predictions in seconds for each model and target model.", compbasemetrics.ALPHA), Buckets: []float64{ @@ -115,7 +115,7 @@ var ( requestTTFTPredictionDurationGauge = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_ttft_prediction_duration_seconds_gauge", Help: metricsutil.HelpMsgWithStability("Latest duration taken to generate TTFT predictions in seconds for each model and target model.", compbasemetrics.ALPHA), }, @@ -124,7 +124,7 @@ var ( requestTPOT = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_tpot_seconds", Help: metricsutil.HelpMsgWithStability("Inference model TPOT distribution in seconds for each model and target model.", compbasemetrics.ALPHA), Buckets: []float64{ @@ -137,7 +137,7 @@ var ( requestTPOTGauge = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_tpot_seconds_gauge", Help: metricsutil.HelpMsgWithStability("Inference model TPOT gauge in seconds for each model and target model.", compbasemetrics.ALPHA), }, @@ -145,7 +145,7 @@ var ( ) requestPredictedTPOT = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_predicted_tpot_seconds", Help: metricsutil.HelpMsgWithStability("Inference model Predicted TPOT distribution in seconds for each model and target model.", compbasemetrics.ALPHA), Buckets: []float64{ @@ -158,7 +158,7 @@ var ( requestPredictedTPOTGauge = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_predicted_tpot_seconds_gauge", Help: metricsutil.HelpMsgWithStability("Inference model Predicted TPOT gauge in seconds for each model and target model.", compbasemetrics.ALPHA), }, @@ -168,7 +168,7 @@ var ( // New metrics for TPOT prediction duration requestTPOTPredictionDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_tpot_prediction_duration_seconds", Help: metricsutil.HelpMsgWithStability("Duration taken to generate TPOT predictions in seconds for each model and target model.", compbasemetrics.ALPHA), Buckets: []float64{ @@ -180,7 +180,7 @@ var ( requestTPOTPredictionDurationGauge = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_tpot_prediction_duration_seconds_gauge", Help: metricsutil.HelpMsgWithStability("Latest duration taken to generate TPOT predictions in seconds for each model and target model.", compbasemetrics.ALPHA), }, @@ -190,7 +190,7 @@ var ( // SLO Violation Metrics requestTTFTSLOViolation = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_ttft_slo_violation", Help: metricsutil.HelpMsgWithStability("Boolean indicator (0 or 1) of whether the last TTFT measurement violated the SLO threshold for each model and target model.", compbasemetrics.ALPHA), }, @@ -199,7 +199,7 @@ var ( requestTTFTSLOViolationCounter = 
prometheus.NewCounterVec( prometheus.CounterOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_ttft_slo_violation_total", Help: metricsutil.HelpMsgWithStability("Counter of TTFT SLO violations for each model and target model.", compbasemetrics.ALPHA), }, @@ -208,7 +208,7 @@ var ( requestTPOTSLOViolation = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_tpot_slo_violation", Help: metricsutil.HelpMsgWithStability("Boolean indicator (0 or 1) of whether the last TPOT measurement violated the SLO threshold for each model and target model.", compbasemetrics.ALPHA), }, @@ -217,7 +217,7 @@ var ( requestTPOTSLOViolationCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_tpot_slo_violation_total", Help: metricsutil.HelpMsgWithStability("Counter of TPOT SLO violations for each model and target model.", compbasemetrics.ALPHA), }, @@ -227,7 +227,7 @@ var ( // SLO threshold gauges (for dynamic threshold management) requestTTFTSLOThreshold = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_ttft_slo_threshold_seconds", Help: metricsutil.HelpMsgWithStability("Current TTFT SLO threshold in seconds for each model and target model.", compbasemetrics.ALPHA), }, @@ -236,7 +236,7 @@ var ( requestTPOTSLOThreshold = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_tpot_slo_threshold_seconds", Help: metricsutil.HelpMsgWithStability("Current TPOT SLO threshold in seconds for each model and target model.", compbasemetrics.ALPHA), }, @@ -245,9 +245,9 @@ var ( requestLatencies = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_duration_seconds", - Help: metricsutil.HelpMsgWithStability("Inference model response latency distribution in seconds for each model and target model.", compbasemetrics.ALPHA), + Help: metricsutil.HelpMsgWithStability("Inference objective response latency distribution in seconds for each model and target model.", compbasemetrics.ALPHA), Buckets: []float64{ 0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, 4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600, @@ -258,9 +258,9 @@ var ( requestSizes = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "request_sizes", - Help: metricsutil.HelpMsgWithStability("Inference model requests size distribution in bytes for each model and target model.", compbasemetrics.ALPHA), + Help: metricsutil.HelpMsgWithStability("Inference objective requests size distribution in bytes for each model and target model.", compbasemetrics.ALPHA), // Use buckets ranging from 1000 bytes (1KB) to 10^9 bytes (1GB). 
Buckets: []float64{ 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, // More fine-grained up to 64KB @@ -273,9 +273,9 @@ var ( responseSizes = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "response_sizes", - Help: metricsutil.HelpMsgWithStability("Inference model responses size distribution in bytes for each model and target model.", compbasemetrics.ALPHA), + Help: metricsutil.HelpMsgWithStability("Inference objective responses size distribution in bytes for each model and target model.", compbasemetrics.ALPHA), // Most models have a response token < 8192 tokens. Each token, on average, has 4 characters. // 8192 * 4 = 32768. Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32778, 65536}, @@ -285,9 +285,9 @@ var ( inputTokens = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "input_tokens", - Help: metricsutil.HelpMsgWithStability("Inference model input token count distribution for requests in each model.", compbasemetrics.ALPHA), + Help: metricsutil.HelpMsgWithStability("Inference objective input token count distribution for requests in each model.", compbasemetrics.ALPHA), // Most models have an input context window less than 1 million tokens. Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32778, 65536, 131072, 262144, 524288, 1048576}, }, @@ -296,9 +296,9 @@ var ( outputTokens = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "output_tokens", - Help: metricsutil.HelpMsgWithStability("Inference model output token count distribution for requests in each model.", compbasemetrics.ALPHA), + Help: metricsutil.HelpMsgWithStability("Inference objective output token count distribution for requests in each model.", compbasemetrics.ALPHA), // Most models generate output less than 8192 tokens.
Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192}, }, @@ -307,9 +307,9 @@ var ( runningRequests = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "running_requests", - Help: metricsutil.HelpMsgWithStability("Inference model number of running requests in each model.", compbasemetrics.ALPHA), + Help: metricsutil.HelpMsgWithStability("Inference objective number of running requests in each model.", compbasemetrics.ALPHA), }, []string{"model_name"}, ) @@ -317,9 +317,9 @@ var ( // NTPOT - Normalized Time Per Output Token NormalizedTimePerOutputToken = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Subsystem: InferenceModelComponent, + Subsystem: InferenceObjectiveComponent, Name: "normalized_time_per_output_token_seconds", - Help: metricsutil.HelpMsgWithStability("Inference model latency divided by number of output tokens in seconds for each model and target model.", compbasemetrics.ALPHA), + Help: metricsutil.HelpMsgWithStability("Inference objective latency divided by number of output tokens in seconds for each model and target model.", compbasemetrics.ALPHA), // From few milliseconds per token to multiple seconds per token Buckets: []float64{ 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, @@ -422,6 +422,28 @@ var ( }, []string{"commit", "build_ref"}, ) + + // Flow Control Metrics + flowControlRequestQueueDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: InferenceExtension, + Name: "flow_control_request_queue_duration_seconds", + Help: metricsutil.HelpMsgWithStability("Distribution of the total time requests spend in the EPP flow control layer, measured from the start of the EnqueueAndWait call until a final outcome is reached.", compbasemetrics.ALPHA), + Buckets: []float64{ + 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, + }, + }, + []string{"fairness_id", "priority", "outcome"}, + ) + + flowControlQueueSize = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Subsystem: InferenceExtension, + Name: "flow_control_queue_size", + Help: metricsutil.HelpMsgWithStability("Current number of requests being actively managed by the EPP flow control layer, from the start of the EnqueueAndWait call until a final outcome is reached.", compbasemetrics.ALPHA), + }, + []string{"fairness_id", "priority"}, + ) ) var registerMetrics sync.Once @@ -473,7 +495,8 @@ func Register(customCollectors ...prometheus.Collector) { metrics.Registry.MustRegister(PrefixCacheSize) metrics.Registry.MustRegister(PrefixCacheHitRatio) metrics.Registry.MustRegister(PrefixCacheHitLength) - + metrics.Registry.MustRegister(flowControlRequestQueueDuration) + metrics.Registry.MustRegister(flowControlQueueSize) for _, collector := range customCollectors { metrics.Registry.MustRegister(collector) } @@ -500,6 +523,8 @@ func Reset() { PrefixCacheSize.Reset() PrefixCacheHitRatio.Reset() PrefixCacheHitLength.Reset() + flowControlRequestQueueDuration.Reset() + flowControlQueueSize.Reset() requestTPOT.Reset() requestTTFT.Reset() @@ -770,6 +795,21 @@ func RecordInferenceExtensionInfo(commitSha, buildRef string) { InferenceExtensionInfo.WithLabelValues(commitSha, buildRef).Set(1) } +// RecordFlowControlRequestQueueDuration records the duration a request spent in the Flow Control layer. 
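+// The duration is observed in seconds. The outcome label carries the terminal
+// flow control result (the tests below use "Dispatched" and "RejectedCapacity").
+//
+// A minimal usage sketch of the three helpers together (hypothetical call
+// site; enqueueAndWait stands in for the real flow control entry point):
+//
+//	start := time.Now()
+//	IncFlowControlQueueSize(fairnessID, priority)
+//	outcome := enqueueAndWait(ctx, req) // hypothetical; yields the outcome string
+//	DecFlowControlQueueSize(fairnessID, priority)
+//	RecordFlowControlRequestQueueDuration(fairnessID, priority, outcome, time.Since(start))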
+func RecordFlowControlRequestQueueDuration(fairnessID, priority, outcome string, duration time.Duration) { + flowControlRequestQueueDuration.WithLabelValues(fairnessID, priority, outcome).Observe(duration.Seconds()) +} + +// IncFlowControlQueueSize increments the Flow Control queue size gauge. +func IncFlowControlQueueSize(fairnessID, priority string) { + flowControlQueueSize.WithLabelValues(fairnessID, priority).Inc() +} + +// DecFlowControlQueueSize decrements the Flow Control queue size gauge. +func DecFlowControlQueueSize(fairnessID, priority string) { + flowControlQueueSize.WithLabelValues(fairnessID, priority).Dec() +} + // SetTTFTSLOThreshold sets the TTFT SLO threshold for a model. // This allows dynamic threshold management and makes the threshold visible in metrics. func SetTTFTSLOThreshold(modelName, targetModelName string, threshold float64) { diff --git a/pkg/epp/metrics/metrics_test.go b/pkg/epp/metrics/metrics_test.go index f1bb23f64..736d6854c 100644 --- a/pkg/epp/metrics/metrics_test.go +++ b/pkg/epp/metrics/metrics_test.go @@ -22,6 +22,9 @@ import ( "testing" "time" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/require" "k8s.io/component-base/metrics/testutil" "sigs.k8s.io/controller-runtime/pkg/metrics" @@ -30,25 +33,32 @@ import ( ) const ( - RequestTotalMetric = InferenceModelComponent + "_request_total" - RequestErrorTotalMetric = InferenceModelComponent + "_request_error_total" - RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds" - RequestSizesMetric = InferenceModelComponent + "_request_sizes" - ResponseSizesMetric = InferenceModelComponent + "_response_sizes" - InputTokensMetric = InferenceModelComponent + "_input_tokens" - OutputTokensMetric = InferenceModelComponent + "_output_tokens" - NormalizedTimePerOutputTokenMetric = InferenceModelComponent + "_normalized_time_per_output_token_seconds" - RunningRequestsMetric = InferenceModelComponent + "_running_requests" + RequestTotalMetric = InferenceObjectiveComponent + "_request_total" + RequestErrorTotalMetric = InferenceObjectiveComponent + "_request_error_total" + RequestLatenciesMetric = InferenceObjectiveComponent + "_request_duration_seconds" + RequestSizesMetric = InferenceObjectiveComponent + "_request_sizes" + ResponseSizesMetric = InferenceObjectiveComponent + "_response_sizes" + InputTokensMetric = InferenceObjectiveComponent + "_input_tokens" + OutputTokensMetric = InferenceObjectiveComponent + "_output_tokens" + NormalizedTimePerOutputTokenMetric = InferenceObjectiveComponent + "_normalized_time_per_output_token_seconds" + RunningRequestsMetric = InferenceObjectiveComponent + "_running_requests" KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization" QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size" PerPodQueueSizeMetrics = InferencePoolComponent + "_per_pod_queue_size" - RequestTTFTSecondsMetric = InferenceModelComponent + "_request_ttft_seconds" - RequestTPOTSecondsMetric = InferenceModelComponent + "_request_tpot_seconds" - RequestTTFTPredictionsMAPEMetric = InferenceModelComponent + "_request_ttft_predictions_mape" - RequestTPOTPredictionsMAPEMetric = InferenceModelComponent + "_request_tpot_predictions_mape" + RequestTTFTSecondsMetric = InferenceObjectiveComponent + "_request_ttft_seconds" + RequestTPOTSecondsMetric = InferenceObjectiveComponent + "_request_tpot_seconds" + RequestTTFTPredictionsMAPEMetric = InferenceObjectiveComponent + 
"_request_ttft_predictions_mape" + RequestTPOTPredictionsMAPEMetric = InferenceObjectiveComponent + "_request_tpot_predictions_mape" ) +func TestMain(m *testing.M) { + // Register all metrics once for the entire test suite. + Register() + os.Exit(m.Run()) +} + func TestRecordRequestCounterandSizes(t *testing.T) { + Reset() type requests struct { modelName string targetModelName string @@ -82,7 +92,6 @@ func TestRecordRequestCounterandSizes(t *testing.T) { }, }, }} - Register() for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { for _, req := range scenario.reqs { @@ -118,6 +127,7 @@ func TestRecordRequestCounterandSizes(t *testing.T) { } func TestRecordRequestErrorCounter(t *testing.T) { + Reset() type requests struct { modelName string targetModelName string @@ -154,7 +164,6 @@ func TestRecordRequestErrorCounter(t *testing.T) { }, }, } - Register() for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { for _, req := range scenario.reqs { @@ -178,6 +187,7 @@ func TestRecordRequestErrorCounter(t *testing.T) { } func TestRecordRequestLatencies(t *testing.T) { + Reset() ctx := logutil.NewTestLoggerIntoContext(context.Background()) timeBaseline := time.Now() type requests struct { @@ -233,7 +243,6 @@ func TestRecordRequestLatencies(t *testing.T) { invalid: true, }, } - Register() for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { for _, req := range scenario.reqs { @@ -260,6 +269,7 @@ func TestRecordRequestLatencies(t *testing.T) { } func TestRecordNormalizedTimePerOutputToken(t *testing.T) { + Reset() ctx := logutil.NewTestLoggerIntoContext(context.Background()) timeBaseline := time.Now() type tokenRequests struct { @@ -334,7 +344,6 @@ func TestRecordNormalizedTimePerOutputToken(t *testing.T) { invalid: true, }, } - Register() for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { for _, req := range scenario.reqs { @@ -361,6 +370,7 @@ func TestRecordNormalizedTimePerOutputToken(t *testing.T) { } func TestRecordResponseMetrics(t *testing.T) { + Reset() type responses struct { modelName string targetModelName string @@ -404,7 +414,6 @@ func TestRecordResponseMetrics(t *testing.T) { }, }, }} - Register() for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { for _, resp := range scenario.resp { @@ -455,6 +464,7 @@ func TestRecordResponseMetrics(t *testing.T) { } func TestRunningRequestsMetrics(t *testing.T) { + Reset() type request struct { modelName string complete bool // true -> request is completed, false -> running request @@ -487,7 +497,6 @@ func TestRunningRequestsMetrics(t *testing.T) { }, } - Register() for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { for _, req := range scenario.requests { @@ -515,6 +524,7 @@ func TestRunningRequestsMetrics(t *testing.T) { } func TestInferencePoolMetrics(t *testing.T) { + Reset() scenarios := []struct { name string poolName string @@ -528,7 +538,6 @@ func TestInferencePoolMetrics(t *testing.T) { queueSizeAvg: 0.4, }, } - Register() for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { RecordInferencePoolAvgKVCache(scenario.poolName, scenario.kvCacheAvg) @@ -564,6 +573,7 @@ func TestInferencePoolMetrics(t *testing.T) { } func TestPluginProcessingLatencies(t *testing.T) { + Reset() type pluginLatency struct { extensionPoint string pluginType string @@ -604,7 +614,6 @@ func TestPluginProcessingLatencies(t *testing.T) { }, }, } - Register() for _, scenario := range scenarios { 
t.Run(scenario.name, func(t *testing.T) { for _, latency := range scenario.latencies { @@ -628,6 +637,7 @@ func TestPluginProcessingLatencies(t *testing.T) { } func TestSchedulerE2ELatency(t *testing.T) { + Reset() scenarios := []struct { name string durations []time.Duration @@ -647,7 +657,6 @@ func TestSchedulerE2ELatency(t *testing.T) { }, }, } - Register() for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { for _, duration := range scenario.durations { @@ -671,6 +680,7 @@ func TestSchedulerE2ELatency(t *testing.T) { } func TestPrefixCacheMetrics(t *testing.T) { + Reset() const ( PrefixCacheSizeMetric = InferenceExtension + "_prefix_indexer_size" PrefixCacheHitRatioMetric = InferenceExtension + "_prefix_indexer_hit_ratio" @@ -717,7 +727,6 @@ func TestPrefixCacheMetrics(t *testing.T) { }, } - Register() t.Run(scenario.name, func(t *testing.T) { // Record cache size metrics for _, size := range scenario.cacheSizes { @@ -772,3 +781,104 @@ func TestPrefixCacheMetrics(t *testing.T) { } }) } + +func getHistogramVecLabelValues(t *testing.T, h *prometheus.HistogramVec, labelValues ...string) (*dto.Histogram, error) { + t.Helper() + m, err := h.GetMetricWithLabelValues(labelValues...) + if err != nil { + return nil, err + } + metricDto := &dto.Metric{} + if err := m.(prometheus.Histogram).Write(metricDto); err != nil { + return nil, err + } + return metricDto.GetHistogram(), nil +} + +func TestFlowControlQueueDurationMetric(t *testing.T) { + Reset() + + records := []struct { + fairnessID string + priority string + outcome string + duration time.Duration + }{ + {fairnessID: "user-a", priority: "100", outcome: "Dispatched", duration: 10 * time.Millisecond}, + {fairnessID: "user-a", priority: "100", outcome: "Dispatched", duration: 20 * time.Millisecond}, + {fairnessID: "user-b", priority: "100", outcome: "RejectedCapacity", duration: 5 * time.Millisecond}, + {fairnessID: "user-a", priority: "50", outcome: "Dispatched", duration: 100 * time.Millisecond}, + } + + for _, rec := range records { + RecordFlowControlRequestQueueDuration(rec.fairnessID, rec.priority, rec.outcome, rec.duration) + } + + testCases := []struct { + name string + labels prometheus.Labels + expectCount uint64 + expectSum float64 + }{ + { + name: "user-a, prio 100, dispatched", + labels: prometheus.Labels{"fairness_id": "user-a", "priority": "100", "outcome": "Dispatched"}, + expectCount: 2, + expectSum: 0.03, // 0.01 + 0.02 + }, + { + name: "user-b, prio 100, rejected", + labels: prometheus.Labels{"fairness_id": "user-b", "priority": "100", "outcome": "RejectedCapacity"}, + expectCount: 1, + expectSum: 0.005, + }, + { + name: "user-a, prio 50, dispatched", + labels: prometheus.Labels{"fairness_id": "user-a", "priority": "50", "outcome": "Dispatched"}, + expectCount: 1, + expectSum: 0.1, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + labels := []string{tc.labels["fairness_id"], tc.labels["priority"], tc.labels["outcome"]} + hist, err := getHistogramVecLabelValues(t, flowControlRequestQueueDuration, labels...) 
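+			// The histogram DTO exposes the cumulative sample count and sum for this
+			// label set; the sum is checked with a small delta to absorb float rounding.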
+ require.NoError(t, err, "Failed to get histogram for labels %v", tc.labels) + require.Equal(t, tc.expectCount, hist.GetSampleCount(), "Sample count mismatch for labels %v", tc.labels) + require.InDelta(t, tc.expectSum, hist.GetSampleSum(), 0.00001, "Sample sum mismatch for labels %v", tc.labels) + }) + } +} + +func TestFlowControlQueueSizeMetric(t *testing.T) { + Reset() + + // Basic Inc/Dec + IncFlowControlQueueSize("user-a", "100") + val, err := testutil.GetGaugeMetricValue(flowControlQueueSize.WithLabelValues("user-a", "100")) + require.NoError(t, err, "Failed to get gauge value for user-a/100 after Inc") + require.Equal(t, 1.0, val, "Gauge value should be 1 after Inc for user-a/100") + + DecFlowControlQueueSize("user-a", "100") + val, err = testutil.GetGaugeMetricValue(flowControlQueueSize.WithLabelValues("user-a", "100")) + require.NoError(t, err, "Failed to get gauge value for user-a/100 after Dec") + require.Equal(t, 0.0, val, "Gauge value should be 0 after Dec for user-a/100") + + // Multiple labels + IncFlowControlQueueSize("user-b", "200") + IncFlowControlQueueSize("user-b", "200") + val, err = testutil.GetGaugeMetricValue(flowControlQueueSize.WithLabelValues("user-b", "200")) + require.NoError(t, err, "Failed to get gauge value for user-b/200") + require.Equal(t, 2.0, val, "Gauge value should be 2 for user-b/200") + + DecFlowControlQueueSize("user-b", "200") + val, err = testutil.GetGaugeMetricValue(flowControlQueueSize.WithLabelValues("user-b", "200")) + require.NoError(t, err, "Failed to get gauge value for user-b/200 after one Dec") + require.Equal(t, 1.0, val, "Gauge value should be 1 for user-b/200 after one Dec") + + // Non-existent labels + val, err = testutil.GetGaugeMetricValue(flowControlQueueSize.WithLabelValues("user-c", "100")) + require.NoError(t, err, "Failed to get gauge value for non-existent user-c/100") + require.Equal(t, 0.0, val, "Gauge value for non-existent labels should be 0") +} diff --git a/pkg/epp/metrics/testdata/input_tokens_metric b/pkg/epp/metrics/testdata/input_tokens_metric index 245c7dfa7..5ec493f52 100644 --- a/pkg/epp/metrics/testdata/input_tokens_metric +++ b/pkg/epp/metrics/testdata/input_tokens_metric @@ -1,68 +1,68 @@ -# HELP inference_model_input_tokens [ALPHA] Inference model input token count distribution for requests in each model. 
-# TYPE inference_model_input_tokens histogram -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1"} 0 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="8"} 0 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="16"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="32"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="64"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="128"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="256"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="512"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1024"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="2048"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="4096"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="8192"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="16384"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="32778"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="65536"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="131072"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="262144"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="524288"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1.048576e+06"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2 -inference_model_input_tokens_sum{model_name="m10",target_model_name="t10"} 30 -inference_model_input_tokens_count{model_name="m10",target_model_name="t10"} 2 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1"} 0 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="8"} 0 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="16"} 0 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="32"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="64"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="128"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="256"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="512"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1024"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="2048"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="4096"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="8192"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="16384"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="32778"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="65536"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="131072"} 1 
-inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="262144"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="524288"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1.048576e+06"} 1 -inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 -inference_model_input_tokens_sum{model_name="m10",target_model_name="t11"} 30 -inference_model_input_tokens_count{model_name="m10",target_model_name="t11"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1"} 0 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="8"} 0 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="16"} 0 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="32"} 0 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="64"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="128"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="256"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="512"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1024"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="2048"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="4096"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="8192"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="16384"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="32778"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="65536"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="131072"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="262144"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="524288"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1.048576e+06"} 1 -inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 -inference_model_input_tokens_sum{model_name="m20",target_model_name="t20"} 40 -inference_model_input_tokens_count{model_name="m20",target_model_name="t20"} 1 +# HELP inference_objective_input_tokens [ALPHA] Inference objective input token count distribution for requests in each model. 
+# TYPE inference_objective_input_tokens histogram +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1"} 0 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="8"} 0 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="16"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="32"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="64"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="128"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="256"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="512"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1024"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="2048"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="4096"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="8192"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="16384"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="32778"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="65536"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="131072"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="262144"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="524288"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1.048576e+06"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2 +inference_objective_input_tokens_sum{model_name="m10",target_model_name="t10"} 30 +inference_objective_input_tokens_count{model_name="m10",target_model_name="t10"} 2 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1"} 0 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="8"} 0 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="16"} 0 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="32"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="64"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="128"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="256"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="512"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1024"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="2048"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="4096"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="8192"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="16384"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="32778"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="65536"} 1 
+inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="131072"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="262144"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="524288"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1.048576e+06"} 1 +inference_objective_input_tokens_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_objective_input_tokens_sum{model_name="m10",target_model_name="t11"} 30 +inference_objective_input_tokens_count{model_name="m10",target_model_name="t11"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1"} 0 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="8"} 0 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="16"} 0 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="32"} 0 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="64"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="128"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="256"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="512"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1024"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="2048"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="4096"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="8192"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="16384"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="32778"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="65536"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="131072"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="262144"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="524288"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1.048576e+06"} 1 +inference_objective_input_tokens_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_objective_input_tokens_sum{model_name="m20",target_model_name="t20"} 40 +inference_objective_input_tokens_count{model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric b/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric index bb6e93737..0a9c83ea4 100644 --- a/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric +++ b/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric @@ -1,50 +1,50 @@ -# HELP inference_model_normalized_time_per_output_token_seconds [ALPHA] Inference model latency divided by number of output tokens in seconds for each model and target model. 
-# TYPE inference_model_normalized_time_per_output_token_seconds histogram -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.001"} 0 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.002"} 0 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.01"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.02"} 2 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 2 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 2 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 2 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.5"} 2 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 2 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="2.0"} 2 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="5.0"} 2 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="10.0"} 2 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="+Inf"} 2 -inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t10"} 0.03 -inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t10"} 2 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.001"} 0 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.002"} 0 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.005"} 0 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.01"} 0 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.02"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.05"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.1"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.2"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.5"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="1.0"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="2.0"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="5.0"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="10.0"} 1 
-inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="+Inf"} 1 -inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t11"} 0.02 -inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t11"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.001"} 0 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.002"} 0 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.005"} 0 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.01"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.02"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.05"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.1"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.2"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.5"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="1.0"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="2.0"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="5.0"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="10.0"} 1 -inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="+Inf"} 1 -inference_model_normalized_time_per_output_token_seconds_sum{model_name="m20", target_model_name="t20"} 0.006 -inference_model_normalized_time_per_output_token_seconds_count{model_name="m20", target_model_name="t20"} 1 +# HELP inference_objective_normalized_time_per_output_token_seconds [ALPHA] Inference objective latency divided by number of output tokens in seconds for each model and target model. 
+# TYPE inference_objective_normalized_time_per_output_token_seconds histogram +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.001"} 0 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.002"} 0 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.01"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.02"} 2 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 2 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 2 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 2 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.5"} 2 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 2 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="2.0"} 2 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="5.0"} 2 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="10.0"} 2 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="+Inf"} 2 +inference_objective_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t10"} 0.03 +inference_objective_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t10"} 2 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.001"} 0 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.002"} 0 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.005"} 0 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.01"} 0 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.02"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.05"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.1"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.2"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.5"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="1.0"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="2.0"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="5.0"} 1 
+inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="10.0"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="+Inf"} 1 +inference_objective_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t11"} 0.02 +inference_objective_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t11"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.001"} 0 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.002"} 0 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.005"} 0 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.01"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.02"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.05"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.1"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.2"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.5"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="1.0"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="2.0"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="5.0"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="10.0"} 1 +inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="+Inf"} 1 +inference_objective_normalized_time_per_output_token_seconds_sum{model_name="m20", target_model_name="t20"} 0.006 +inference_objective_normalized_time_per_output_token_seconds_count{model_name="m20", target_model_name="t20"} 1 diff --git a/pkg/epp/metrics/testdata/output_tokens_metric b/pkg/epp/metrics/testdata/output_tokens_metric index 40bbe3272..5b71ca0a3 100644 --- a/pkg/epp/metrics/testdata/output_tokens_metric +++ b/pkg/epp/metrics/testdata/output_tokens_metric @@ -1,47 +1,47 @@ -# HELP inference_model_output_tokens [ALPHA] Inference model output token count distribution for requests in each model. 
-# TYPE inference_model_output_tokens histogram -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="1"} 0 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="8"} 0 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="16"} 0 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="32"} 0 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="64"} 0 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="128"} 1 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="256"} 2 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="512"} 2 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="1024"} 2 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="2048"} 2 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="4096"} 2 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="8192"} 2 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2 -inference_model_output_tokens_sum{model_name="m10",target_model_name="t10"} 300 -inference_model_output_tokens_count{model_name="m10",target_model_name="t10"} 2 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="1"} 0 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="8"} 0 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="16"} 0 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="32"} 0 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="64"} 0 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="128"} 0 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="256"} 0 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="512"} 1 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="1024"} 1 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="2048"} 1 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="4096"} 1 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="8192"} 1 -inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 -inference_model_output_tokens_sum{model_name="m10",target_model_name="t11"} 300 -inference_model_output_tokens_count{model_name="m10",target_model_name="t11"} 1 -inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="1"} 0 -inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="8"} 0 -inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="16"} 0 -inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="32"} 0 -inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="64"} 0 -inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="128"} 0 -inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="256"} 0 -inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="512"} 1 -inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="1024"} 1 
-inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="2048"} 1 -inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="4096"} 1 -inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="8192"} 1 -inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 -inference_model_output_tokens_sum{model_name="m20",target_model_name="t20"} 400 -inference_model_output_tokens_count{model_name="m20",target_model_name="t20"} 1 +# HELP inference_objective_output_tokens [ALPHA] Inference objective output token count distribution for requests in each model. +# TYPE inference_objective_output_tokens histogram +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t10",le="1"} 0 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t10",le="8"} 0 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t10",le="16"} 0 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t10",le="32"} 0 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t10",le="64"} 0 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t10",le="128"} 1 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t10",le="256"} 2 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t10",le="512"} 2 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t10",le="1024"} 2 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t10",le="2048"} 2 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t10",le="4096"} 2 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t10",le="8192"} 2 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2 +inference_objective_output_tokens_sum{model_name="m10",target_model_name="t10"} 300 +inference_objective_output_tokens_count{model_name="m10",target_model_name="t10"} 2 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t11",le="1"} 0 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t11",le="8"} 0 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t11",le="16"} 0 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t11",le="32"} 0 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t11",le="64"} 0 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t11",le="128"} 0 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t11",le="256"} 0 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t11",le="512"} 1 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t11",le="1024"} 1 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t11",le="2048"} 1 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t11",le="4096"} 1 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t11",le="8192"} 1 +inference_objective_output_tokens_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_objective_output_tokens_sum{model_name="m10",target_model_name="t11"} 300 +inference_objective_output_tokens_count{model_name="m10",target_model_name="t11"} 1 
+inference_objective_output_tokens_bucket{model_name="m20",target_model_name="t20",le="1"} 0 +inference_objective_output_tokens_bucket{model_name="m20",target_model_name="t20",le="8"} 0 +inference_objective_output_tokens_bucket{model_name="m20",target_model_name="t20",le="16"} 0 +inference_objective_output_tokens_bucket{model_name="m20",target_model_name="t20",le="32"} 0 +inference_objective_output_tokens_bucket{model_name="m20",target_model_name="t20",le="64"} 0 +inference_objective_output_tokens_bucket{model_name="m20",target_model_name="t20",le="128"} 0 +inference_objective_output_tokens_bucket{model_name="m20",target_model_name="t20",le="256"} 0 +inference_objective_output_tokens_bucket{model_name="m20",target_model_name="t20",le="512"} 1 +inference_objective_output_tokens_bucket{model_name="m20",target_model_name="t20",le="1024"} 1 +inference_objective_output_tokens_bucket{model_name="m20",target_model_name="t20",le="2048"} 1 +inference_objective_output_tokens_bucket{model_name="m20",target_model_name="t20",le="4096"} 1 +inference_objective_output_tokens_bucket{model_name="m20",target_model_name="t20",le="8192"} 1 +inference_objective_output_tokens_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_objective_output_tokens_sum{model_name="m20",target_model_name="t20"} 400 +inference_objective_output_tokens_count{model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/epp/metrics/testdata/request_duration_seconds_metric b/pkg/epp/metrics/testdata/request_duration_seconds_metric index 6c70b4ba9..cd6f0c061 100644 --- a/pkg/epp/metrics/testdata/request_duration_seconds_metric +++ b/pkg/epp/metrics/testdata/request_duration_seconds_metric @@ -1,116 +1,116 @@ -# HELP inference_model_request_duration_seconds [ALPHA] Inference model response latency distribution in seconds for each model and target model. 
-# TYPE inference_model_request_duration_seconds histogram -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.025"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.4"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.6"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.8"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1.25"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1.5"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="2"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="3"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="4"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="5"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="6"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="8"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="10"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="15"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="20"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="30"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="45"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="60"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="120"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="180"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="240"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="300"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="360"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="480"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="600"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="900"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1200"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1800"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", 
target_model_name="t10", le="2700"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="3600"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="Inf"} 2 -inference_model_request_duration_seconds_sum{model_name="m10", target_model_name="t10"} 1.61 -inference_model_request_duration_seconds_count{model_name="m10", target_model_name="t10"} 2 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.005"} 0 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.025"} 0 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.05"} 0 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.1"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.2"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.4"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.6"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.8"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1.25"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1.5"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="2"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="3"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="4"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="5"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="6"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="8"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="10"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="15"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="20"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="30"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="45"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="60"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="120"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="180"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="240"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="300"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="360"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="480"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="600"} 1 
-inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="900"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1200"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1800"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="2700"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="3600"} 1 -inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 -inference_model_request_duration_seconds_sum{model_name="m10",target_model_name="t11"} 0.06 -inference_model_request_duration_seconds_count{model_name="m10",target_model_name="t11"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.005"} 0 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.025"} 0 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.05"} 0 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.1"} 0 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.2"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.4"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.6"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.8"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1.25"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1.5"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="2"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="3"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="4"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="5"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="6"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="8"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="10"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="15"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="20"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="30"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="45"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="60"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="120"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="180"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="240"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="300"} 1 
-inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="360"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="480"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="600"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="900"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1200"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1800"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="2700"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="3600"} 1 -inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 -inference_model_request_duration_seconds_sum{model_name="m20",target_model_name="t20"} 0.12 -inference_model_request_duration_seconds_count{model_name="m20",target_model_name="t20"} 1 +# HELP inference_objective_request_duration_seconds [ALPHA] Inference objective response latency distribution in seconds for each model and target model. +# TYPE inference_objective_request_duration_seconds histogram +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.025"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.4"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.6"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.8"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1.25"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1.5"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="2"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="3"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="4"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="5"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="6"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="8"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="10"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="15"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="20"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", 
target_model_name="t10", le="30"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="45"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="60"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="120"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="180"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="240"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="300"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="360"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="480"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="600"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="900"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1200"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1800"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="2700"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="3600"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="Inf"} 2 +inference_objective_request_duration_seconds_sum{model_name="m10", target_model_name="t10"} 1.61 +inference_objective_request_duration_seconds_count{model_name="m10", target_model_name="t10"} 2 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.005"} 0 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.025"} 0 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.05"} 0 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.1"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.2"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.4"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.6"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.8"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1.25"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1.5"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="2"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="3"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="4"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="5"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="6"} 1 
+inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="8"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="10"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="15"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="20"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="30"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="45"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="60"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="120"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="180"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="240"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="300"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="360"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="480"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="600"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="900"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1200"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1800"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="2700"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="3600"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_objective_request_duration_seconds_sum{model_name="m10",target_model_name="t11"} 0.06 +inference_objective_request_duration_seconds_count{model_name="m10",target_model_name="t11"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.005"} 0 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.025"} 0 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.05"} 0 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.1"} 0 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.2"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.4"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.6"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.8"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1.25"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1.5"} 1 
+inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="2"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="3"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="4"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="5"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="6"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="8"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="10"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="15"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="20"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="30"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="45"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="60"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="120"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="180"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="240"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="300"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="360"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="480"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="600"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="900"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1200"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1800"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="2700"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="3600"} 1 +inference_objective_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_objective_request_duration_seconds_sum{model_name="m20",target_model_name="t20"} 0.12 +inference_objective_request_duration_seconds_count{model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/epp/metrics/testdata/request_error_total_metric b/pkg/epp/metrics/testdata/request_error_total_metric index 31036eb60..2a2e55364 100644 --- a/pkg/epp/metrics/testdata/request_error_total_metric +++ b/pkg/epp/metrics/testdata/request_error_total_metric @@ -1,5 +1,5 @@ -# HELP inference_model_request_error_total [ALPHA] Counter of inference model requests errors broken out for each model and target model. 
-# TYPE inference_model_request_error_total counter -inference_model_request_error_total{error_code="Internal", model_name="m10",target_model_name="t10"} 2 -inference_model_request_error_total{error_code="ModelServerError", model_name="m10",target_model_name="t11"} 1 -inference_model_request_error_total{error_code="InferencePoolResourceExhausted", model_name="m20",target_model_name="t20"} 1 +# HELP inference_objective_request_error_total [ALPHA] Counter of inference objective requests errors broken out for each model and target model. +# TYPE inference_objective_request_error_total counter +inference_objective_request_error_total{error_code="Internal", model_name="m10",target_model_name="t10"} 2 +inference_objective_request_error_total{error_code="ModelServerError", model_name="m10",target_model_name="t11"} 1 +inference_objective_request_error_total{error_code="InferencePoolResourceExhausted", model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/epp/metrics/testdata/request_sizes_metric b/pkg/epp/metrics/testdata/request_sizes_metric index ceca532e2..74e672591 100644 --- a/pkg/epp/metrics/testdata/request_sizes_metric +++ b/pkg/epp/metrics/testdata/request_sizes_metric @@ -1,86 +1,86 @@ -# HELP inference_model_request_sizes [ALPHA] Inference model requests size distribution in bytes for each model and target model. -# TYPE inference_model_request_sizes histogram -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="64"} 0 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="128"} 0 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="256"} 0 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="512"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1024"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="2048"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="4096"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="8192"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="16384"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="32768"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="65536"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="131072"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="262144"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="524288"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.048576e+06"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="2.097152e+06"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="4.194304e+06"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="8.388608e+06"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.6777216e+07"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="3.3554432e+07"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="6.7108864e+07"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.34217728e+08"} 2 
-inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="2.68435456e+08"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="5.36870912e+08"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.073741824e+09"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2 -inference_model_request_sizes_sum{model_name="m10",target_model_name="t10"} 1700 -inference_model_request_sizes_count{model_name="m10",target_model_name="t10"} 2 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="64"} 0 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="128"} 0 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="256"} 0 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="512"} 0 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1024"} 0 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="2048"} 0 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="4096"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="8192"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="16384"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="32768"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="65536"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="131072"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="262144"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="524288"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.048576e+06"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="2.097152e+06"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="4.194304e+06"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="8.388608e+06"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.6777216e+07"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="3.3554432e+07"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="6.7108864e+07"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.34217728e+08"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="2.68435456e+08"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="5.36870912e+08"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.073741824e+09"} 1 -inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 -inference_model_request_sizes_sum{model_name="m10",target_model_name="t11"} 2480 -inference_model_request_sizes_count{model_name="m10",target_model_name="t11"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="64"} 0 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="128"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="256"} 1 
-inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="512"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1024"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="2048"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="4096"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="8192"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="16384"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="32768"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="65536"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="131072"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="262144"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="524288"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.048576e+06"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="2.097152e+06"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="4.194304e+06"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="8.388608e+06"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.6777216e+07"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="3.3554432e+07"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="6.7108864e+07"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.34217728e+08"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="2.68435456e+08"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="5.36870912e+08"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.073741824e+09"} 1 -inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 -inference_model_request_sizes_sum{model_name="m20",target_model_name="t20"} 80 -inference_model_request_sizes_count{model_name="m20",target_model_name="t20"} 1 +# HELP inference_objective_request_sizes [ALPHA] Inference objective requests size distribution in bytes for each model and target model. 
+# TYPE inference_objective_request_sizes histogram +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="64"} 0 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="128"} 0 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="256"} 0 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="512"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1024"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="2048"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="4096"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="8192"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="16384"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="32768"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="65536"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="131072"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="262144"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="524288"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.048576e+06"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="2.097152e+06"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="4.194304e+06"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="8.388608e+06"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.6777216e+07"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="3.3554432e+07"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="6.7108864e+07"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.34217728e+08"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="2.68435456e+08"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="5.36870912e+08"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.073741824e+09"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2 +inference_objective_request_sizes_sum{model_name="m10",target_model_name="t10"} 1700 +inference_objective_request_sizes_count{model_name="m10",target_model_name="t10"} 2 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="64"} 0 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="128"} 0 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="256"} 0 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="512"} 0 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1024"} 0 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="2048"} 0 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="4096"} 1 
+inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="8192"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="16384"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="32768"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="65536"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="131072"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="262144"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="524288"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.048576e+06"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="2.097152e+06"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="4.194304e+06"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="8.388608e+06"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.6777216e+07"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="3.3554432e+07"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="6.7108864e+07"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.34217728e+08"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="2.68435456e+08"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="5.36870912e+08"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.073741824e+09"} 1 +inference_objective_request_sizes_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_objective_request_sizes_sum{model_name="m10",target_model_name="t11"} 2480 +inference_objective_request_sizes_count{model_name="m10",target_model_name="t11"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="64"} 0 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="128"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="256"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="512"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1024"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="2048"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="4096"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="8192"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="16384"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="32768"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="65536"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="131072"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="262144"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="524288"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.048576e+06"} 
1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="2.097152e+06"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="4.194304e+06"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="8.388608e+06"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.6777216e+07"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="3.3554432e+07"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="6.7108864e+07"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.34217728e+08"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="2.68435456e+08"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="5.36870912e+08"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.073741824e+09"} 1 +inference_objective_request_sizes_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_objective_request_sizes_sum{model_name="m20",target_model_name="t20"} 80 +inference_objective_request_sizes_count{model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/epp/metrics/testdata/request_total_metric b/pkg/epp/metrics/testdata/request_total_metric index 9c6f48a36..a6200fdc9 100644 --- a/pkg/epp/metrics/testdata/request_total_metric +++ b/pkg/epp/metrics/testdata/request_total_metric @@ -1,5 +1,5 @@ -# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. -# TYPE inference_model_request_total counter -inference_model_request_total{model_name="m10", target_model_name="t10"} 2 -inference_model_request_total{model_name="m10", target_model_name="t11"} 1 -inference_model_request_total{model_name="m20", target_model_name="t20"} 1 +# HELP inference_objective_request_total [ALPHA] Counter of inference objective requests broken out for each model and target model. +# TYPE inference_objective_request_total counter +inference_objective_request_total{model_name="m10", target_model_name="t10"} 2 +inference_objective_request_total{model_name="m10", target_model_name="t11"} 1 +inference_objective_request_total{model_name="m20", target_model_name="t20"} 1 diff --git a/pkg/epp/metrics/testdata/response_sizes_metric b/pkg/epp/metrics/testdata/response_sizes_metric index 7f981090c..a9ad76ecb 100644 --- a/pkg/epp/metrics/testdata/response_sizes_metric +++ b/pkg/epp/metrics/testdata/response_sizes_metric @@ -1,56 +1,56 @@ -# HELP inference_model_response_sizes [ALPHA] Inference model responses size distribution in bytes for each model and target model. 
-# TYPE inference_model_response_sizes histogram -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="1"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="8"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="16"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="32"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="64"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="128"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="256"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="512"} 1 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="1024"} 1 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="2048"} 2 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="4096"} 2 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="8192"} 2 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="16384"} 2 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="32778"} 2 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="65536"} 2 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2 -inference_model_response_sizes_sum{model_name="m10",target_model_name="t10"} 1700 -inference_model_response_sizes_count{model_name="m10",target_model_name="t10"} 2 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="1"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="8"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="16"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="32"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="64"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="128"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="256"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="512"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="1024"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="2048"} 0 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="4096"} 1 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="8192"} 1 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="16384"} 1 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="32778"} 1 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="65536"} 1 -inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 -inference_model_response_sizes_sum{model_name="m10",target_model_name="t11"} 2480 -inference_model_response_sizes_count{model_name="m10",target_model_name="t11"} 1 -inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="1"} 0 -inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="8"} 0 
-inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="16"} 0 -inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="32"} 0 -inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="64"} 0 -inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="128"} 1 -inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="256"} 1 -inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="512"} 1 -inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="1024"} 1 -inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="2048"} 1 -inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="4096"} 1 -inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="8192"} 1 -inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="16384"} 1 -inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="32778"} 1 -inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="65536"} 1 -inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 -inference_model_response_sizes_sum{model_name="m20",target_model_name="t20"} 80 -inference_model_response_sizes_count{model_name="m20",target_model_name="t20"} 1 +# HELP inference_objective_response_sizes [ALPHA] Inference objective responses size distribution in bytes for each model and target model. +# TYPE inference_objective_response_sizes histogram +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="1"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="8"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="16"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="32"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="64"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="128"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="256"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="512"} 1 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="1024"} 1 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="2048"} 2 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="4096"} 2 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="8192"} 2 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="16384"} 2 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="32778"} 2 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="65536"} 2 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2 +inference_objective_response_sizes_sum{model_name="m10",target_model_name="t10"} 1700 +inference_objective_response_sizes_count{model_name="m10",target_model_name="t10"} 2 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="1"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="8"} 0 
+inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="16"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="32"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="64"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="128"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="256"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="512"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="1024"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="2048"} 0 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="4096"} 1 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="8192"} 1 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="16384"} 1 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="32778"} 1 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="65536"} 1 +inference_objective_response_sizes_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_objective_response_sizes_sum{model_name="m10",target_model_name="t11"} 2480 +inference_objective_response_sizes_count{model_name="m10",target_model_name="t11"} 1 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="1"} 0 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="8"} 0 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="16"} 0 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="32"} 0 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="64"} 0 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="128"} 1 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="256"} 1 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="512"} 1 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="1024"} 1 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="2048"} 1 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="4096"} 1 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="8192"} 1 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="16384"} 1 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="32778"} 1 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="65536"} 1 +inference_objective_response_sizes_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_objective_response_sizes_sum{model_name="m20",target_model_name="t20"} 80 +inference_objective_response_sizes_count{model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/epp/metrics/testdata/running_requests_metrics b/pkg/epp/metrics/testdata/running_requests_metrics index a880e4998..962a50fbf 100644 --- a/pkg/epp/metrics/testdata/running_requests_metrics +++ b/pkg/epp/metrics/testdata/running_requests_metrics @@ -1,4 +1,4 @@ -# HELP inference_model_running_requests 
[ALPHA] Inference model number of running requests in each model. -# TYPE inference_model_running_requests gauge -inference_model_running_requests{model_name="m1"} 1 -inference_model_running_requests{model_name="m2"} 1 +# HELP inference_objective_running_requests [ALPHA] Inference objective number of running requests in each model. +# TYPE inference_objective_running_requests gauge +inference_objective_running_requests{model_name="m1"} 1 +inference_objective_running_requests{model_name="m2"} 1
diff --git a/pkg/epp/plugins/handle.go b/pkg/epp/plugins/handle.go
index 8c9153cf1..c074e9076 100644
--- a/pkg/epp/plugins/handle.go
+++ b/pkg/epp/plugins/handle.go
@@ -19,6 +19,8 @@ package plugins
 import (
     "context"
     "fmt"
+
+    backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
 )

 // Handle provides plugins a set of standard data and tools to work with
@@ -27,6 +29,9 @@ type Handle interface {
     Context() context.Context

     HandlePlugins
+
+    // PodList lists pods matching the given predicate.
+    PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics
 }

 // HandlePlugins defines a set of APIs to work with instantiated plugins
@@ -44,10 +49,14 @@ type HandlePlugins interface {
     GetAllPluginsWithNames() map[string]Plugin
 }

+// PodListFunc is a function type that filters and returns a list of pod metrics
+type PodListFunc func(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics
+
 // eppHandle is an implementation of the interface plugins.Handle
 type eppHandle struct {
     ctx context.Context
     HandlePlugins
+    podList PodListFunc
 }

 // Context returns a context the plugins can use, if they need one
@@ -84,12 +93,18 @@ func (h *eppHandlePlugins) GetAllPluginsWithNames() map[string]Plugin {
     return h.plugins
 }

-func NewEppHandle(ctx context.Context) Handle {
+// PodList lists pods matching the given predicate.
+func (h *eppHandle) PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics {
+    return h.podList(predicate)
+}
+
+func NewEppHandle(ctx context.Context, podList PodListFunc) Handle {
     return &eppHandle{
         ctx: ctx,
         HandlePlugins: &eppHandlePlugins{
             plugins: map[string]Plugin{},
         },
+        podList: podList,
     }
 }
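Note: the PodList hook added to plugins.Handle above gives plugins read access to the tracked pod set. The following is a minimal illustrative sketch, not part of this patch, of how a plugin body might call it; `handle` is assumed to be the plugins.Handle passed to the plugin, and a real plugin would usually supply a narrower predicate than the always-true one shown.

    // List every pod the endpoint picker currently tracks.
    allPods := handle.PodList(func(backendmetrics.PodMetrics) bool { return true })
    for _, pm := range allPods {
        _ = pm // inspect each entry (pod identity, scraped metrics) as needed
    }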
diff --git a/pkg/epp/requestcontrol/admission.go b/pkg/epp/requestcontrol/admission.go
new file mode 100644
index 000000000..69fd5adf8
--- /dev/null
+++ b/pkg/epp/requestcontrol/admission.go
@@ -0,0 +1,216 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package requestcontrol
+
+import (
+    "context"
+    "time"
+
+    "sigs.k8s.io/controller-runtime/pkg/log"
+
+    backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+    "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types"
+    "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers"
+    errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
+    logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+    requtil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/request"
+)
+
+// AdmissionController defines the interface for making admission control decisions.
+// Implementations of this interface determine whether an incoming inference request should be accepted or rejected
+// based on various criteria such as system load, fairness, priority, and available capacity.
+type AdmissionController interface {
+    // Admit determines if a request should be admitted.
+    // It is called by the Director for each incoming request.
+    //
+    // Args:
+    //   ctx: The request context, carrying deadlines, cancellation signals, and logger.
+    //   reqCtx: The handlers.RequestContext containing details about the incoming request.
+    //   candidatePods: A list of potential backend pods that can serve the request.
+    //   priority: The priority level of the request, as determined by the InferenceObjective.
+    //
+    // Returns:
+    //   - nil: If the request is admitted and should proceed to scheduling.
+    //   - errutil.Error: If the request is rejected.
+    Admit(
+        ctx context.Context,
+        reqCtx *handlers.RequestContext,
+        candidatePods []backendmetrics.PodMetrics,
+        priority int,
+    ) error
+}
+
+// saturationDetector defines the minimal interface required for checking if the backend pool is saturated.
+type saturationDetector interface {
+    IsSaturated(ctx context.Context, candidatePods []backendmetrics.PodMetrics) bool
+}
+
+// flowController defines the minimal interface required by FlowControlAdmissionController for enqueuing requests and
+// waiting for an admission outcome.
+type flowController interface {
+    EnqueueAndWait(ctx context.Context, req types.FlowControlRequest) (types.QueueOutcome, error)
+}
+
+// rejectIfSheddableAndSaturated checks if a request should be immediately rejected because it's sheddable
+// (priority < 0) and the system is saturated.
+func rejectIfSheddableAndSaturated(
+    ctx context.Context,
+    sd saturationDetector,
+    reqCtx *handlers.RequestContext,
+    candidatePods []backendmetrics.PodMetrics,
+    priority int,
+) error {
+    if requtil.IsSheddable(priority) {
+        logger := log.FromContext(ctx)
+        if sd.IsSaturated(ctx, candidatePods) {
+            logger.V(logutil.TRACE).Info("Request rejected: system saturated and request is sheddable",
+                "requestID", reqCtx.SchedulingRequest.RequestId)
+            return errutil.Error{
+                Code: errutil.InferencePoolResourceExhausted,
+                Msg:  "system saturated, sheddable request dropped",
+            }
+        }
+    }
+    return nil
+}
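Note: to make the AdmissionController contract concrete, here is a minimal hypothetical implementation, not part of this patch, built only from identifiers the file above defines. It drops sheddable traffic when there is nothing to schedule onto and admits everything else.

    type rejectWhenNoCandidatesController struct{}

    func (c *rejectWhenNoCandidatesController) Admit(
        ctx context.Context,
        reqCtx *handlers.RequestContext,
        candidatePods []backendmetrics.PodMetrics,
        priority int,
    ) error {
        // Sheddable traffic (priority < 0) is rejected when no candidate pods exist.
        if requtil.IsSheddable(priority) && len(candidatePods) == 0 {
            return errutil.Error{
                Code: errutil.InferencePoolResourceExhausted,
                Msg:  "no candidate pods available for sheddable request",
            }
        }
        return nil
    }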
+
+// --- LegacyAdmissionController ---
+
+// LegacyAdmissionController implements saturation-based admission control.
+// It rejects sheddable requests (priority < 0) if the saturationDetector indicates that the system is currently
+// saturated. Non-sheddable requests always bypass the saturation check.
+type LegacyAdmissionController struct {
+    saturationDetector saturationDetector
+}
+
+// NewLegacyAdmissionController creates a new LegacyAdmissionController.
+func NewLegacyAdmissionController(sd saturationDetector) *LegacyAdmissionController {
+    return &LegacyAdmissionController{saturationDetector: sd}
+}
+
+// Admit implements the AdmissionController interface for the legacy strategy.
+// It checks for saturation only for requests with priority < 0.
+func (lac *LegacyAdmissionController) Admit(
+    ctx context.Context,
+    reqCtx *handlers.RequestContext,
+    candidatePods []backendmetrics.PodMetrics,
+    priority int,
+) error {
+    logger := log.FromContext(ctx)
+    logger.V(logutil.TRACE).Info("Executing LegacyAdmissionController",
+        "priority", priority, "fairnessID", reqCtx.FairnessID)
+    if err := rejectIfSheddableAndSaturated(ctx, lac.saturationDetector, reqCtx, candidatePods, priority); err != nil {
+        return err
+    }
+    logger.V(logutil.TRACE).Info("Request admitted", "requestID", reqCtx.SchedulingRequest.RequestId)
+    return nil
+}
+
+// --- FlowControlAdmissionController ---
+
+// FlowControlAdmissionController delegates admission decisions to the Flow Control layer.
+// It first checks if the request is sheddable and the system is saturated, rejecting immediately if both conditions
+// are true. Otherwise, it uses the provided flowController to enqueue the request and await an outcome.
+type FlowControlAdmissionController struct {
+    saturationDetector saturationDetector
+    flowController     flowController
+}
+
+// NewFlowControlAdmissionController creates a new FlowControlAdmissionController.
+// It requires a SaturationDetector and a flowController instance.
+func NewFlowControlAdmissionController(sd saturationDetector, fc flowController) *FlowControlAdmissionController {
+    return &FlowControlAdmissionController{
+        saturationDetector: sd,
+        flowController:     fc,
+    }
+}
+
+// Admit implements the AdmissionController interface by checking for saturation on sheddable requests first, then
+// deferring to the Flow Control system.
+func (fcac *FlowControlAdmissionController) Admit(
+    ctx context.Context,
+    reqCtx *handlers.RequestContext,
+    candidatePods []backendmetrics.PodMetrics,
+    priority int,
+) error {
+    logger := log.FromContext(ctx)
+    logger.V(logutil.TRACE).Info("Executing FlowControlAdmissionController",
+        "requestID", reqCtx.SchedulingRequest.RequestId, "priority", priority, "fairnessID", reqCtx.FairnessID)
+    if err := rejectIfSheddableAndSaturated(ctx, fcac.saturationDetector, reqCtx, candidatePods, priority); err != nil {
+        return err
+    }
+
+    logger.V(logutil.TRACE).Info("Request proceeding to flow control", "requestID", reqCtx.SchedulingRequest.RequestId)
+
+    fcReq := &flowControlRequest{
+        requestID:       reqCtx.SchedulingRequest.RequestId,
+        fairnessID:      reqCtx.FairnessID,
+        priority:        priority,
+        requestByteSize: uint64(reqCtx.RequestSize),
+        candidatePods:   candidatePods,
+    }
+
+    outcome, err := fcac.flowController.EnqueueAndWait(ctx, fcReq)
+    logger.V(logutil.DEBUG).Info("Flow control outcome",
+        "requestID", reqCtx.SchedulingRequest.RequestId, "outcome", outcome, "error", err)
+    return translateFlowControlOutcome(outcome, err)
+}
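Note: a caller would typically pick one of the two strategies at startup. A hedged wiring sketch follows; the flag name and the `detector`/`flowCtrl` instances are hypothetical stand-ins, since the Director-side wiring is not part of this hunk.

    var admission AdmissionController
    if enableFlowControl { // hypothetical feature flag
        admission = NewFlowControlAdmissionController(detector, flowCtrl)
    } else {
        admission = NewLegacyAdmissionController(detector)
    }
    // Per request: err := admission.Admit(ctx, reqCtx, candidatePods, priority)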
+func (r *flowControlRequest) ByteSize() uint64 { return r.requestByteSize } +func (r *flowControlRequest) CandidatePodsForScheduling() []backendmetrics.PodMetrics { + return r.candidatePods +} +func (r *flowControlRequest) FlowKey() types.FlowKey { + return types.FlowKey{ID: r.fairnessID, Priority: r.priority} +} + +// translateFlowControlOutcome maps the context-rich outcome of the Flow Control layer to the public errutil.Error +// contract used by the Director. +func translateFlowControlOutcome(outcome types.QueueOutcome, err error) error { + msg := "request rejected by flow control" + if err != nil { + msg = err.Error() + } + + switch outcome { + case types.QueueOutcomeDispatched: + return nil + case types.QueueOutcomeRejectedCapacity: + return errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: msg} + case types.QueueOutcomeEvictedTTL: + return errutil.Error{Code: errutil.ServiceUnavailable, Msg: "request timed out in queue: " + msg} + case types.QueueOutcomeEvictedContextCancelled: + return errutil.Error{Code: errutil.ServiceUnavailable, Msg: "client disconnected: " + msg} + case types.QueueOutcomeRejectedOther, types.QueueOutcomeEvictedOther: + return errutil.Error{Code: errutil.Internal, Msg: "internal flow control error: " + msg} + default: + return errutil.Error{Code: errutil.Internal, Msg: "unhandled flow control outcome: " + msg} + } +} diff --git a/pkg/epp/requestcontrol/admission_test.go b/pkg/epp/requestcontrol/admission_test.go new file mode 100644 index 000000000..085778200 --- /dev/null +++ b/pkg/epp/requestcontrol/admission_test.go @@ -0,0 +1,282 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package requestcontrol + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + fctypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" + schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +// --- Mocks --- + +type mockSaturationDetector struct { + isSaturated bool +} + +func (m *mockSaturationDetector) IsSaturated(_ context.Context, _ []backendmetrics.PodMetrics) bool { + return m.isSaturated +} + +type mockFlowController struct { + outcome fctypes.QueueOutcome + err error + called bool +} + +func (m *mockFlowController) EnqueueAndWait( + _ context.Context, + _ fctypes.FlowControlRequest, +) (fctypes.QueueOutcome, error) { + m.called = true + return m.outcome, m.err +} + +func TestLegacyAdmissionController_Admit(t *testing.T) { + t.Parallel() + ctx := logutil.NewTestLoggerIntoContext(context.Background()) + candidatePods := []backendmetrics.PodMetrics{} + reqCtx := &handlers.RequestContext{ + SchedulingRequest: &schedulingtypes.LLMRequest{RequestId: "test-req"}, + } + + testCases := []struct { + name string + priority int + isSaturated bool + expectErr bool + expectErrCode string + expectErrSubstr string + }{ + { + name: "non_sheddable_saturated_admit", + priority: 0, + isSaturated: true, + expectErr: false, + }, + { + name: "sheddable_not_saturated_admit", + priority: -1, + isSaturated: false, + expectErr: false, + }, + { + name: "sheddable_saturated_reject", + priority: -1, + isSaturated: true, + expectErr: true, + expectErrCode: errutil.InferencePoolResourceExhausted, + expectErrSubstr: "system saturated, sheddable request dropped", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + saturationDetector := &mockSaturationDetector{isSaturated: tc.isSaturated} + ac := NewLegacyAdmissionController(saturationDetector) + + err := ac.Admit(ctx, reqCtx, candidatePods, tc.priority) + + if !tc.expectErr { + assert.NoError(t, err, "Admit() should not have returned an error for scenario: %s", tc.name) + } else { + require.Error(t, err, "Admit() should have returned an error for scenario: %s", tc.name) + var e errutil.Error + if assert.ErrorAs(t, err, &e, "error should be of type errutil.Error") { + assert.Equal(t, tc.expectErrCode, e.Code, "incorrect error code for scenario: %s", tc.name) + assert.Contains(t, e.Msg, tc.expectErrSubstr, "incorrect error message substring for scenario: %s", tc.name) + } + } + }) + } +} + +func TestFlowControlRequestAdapter(t *testing.T) { + t.Parallel() + candidatePods := []backendmetrics.PodMetrics{&backendmetrics.FakePodMetrics{}} + + testCases := []struct { + name string + requestID string + fairnessID string + priority int + requestByteSize uint64 + expectFlowKey fctypes.FlowKey + }{ + { + name: "simple", + requestID: "req-1", + fairnessID: "flow-1", + priority: 10, + requestByteSize: 1024, + expectFlowKey: fctypes.FlowKey{ID: "flow-1", Priority: 10}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + fcReq := &flowControlRequest{ + requestID: tc.requestID, + fairnessID: tc.fairnessID, + priority: tc.priority, + 
requestByteSize: tc.requestByteSize, + candidatePods: candidatePods, + } + + assert.Equal(t, tc.requestID, fcReq.ID(), "ID() mismatch") + assert.Equal(t, tc.requestByteSize, fcReq.ByteSize(), "ByteSize() mismatch") + assert.Equal(t, candidatePods, fcReq.CandidatePodsForScheduling(), "CandidatePodsForScheduling() mismatch") + assert.Equal(t, tc.expectFlowKey, fcReq.FlowKey(), "FlowKey() mismatch") + assert.Zero(t, fcReq.InitialEffectiveTTL(), "InitialEffectiveTTL() should be zero") + }) + } +} +func TestFlowControlAdmissionController_Admit(t *testing.T) { + t.Parallel() + ctx := logutil.NewTestLoggerIntoContext(context.Background()) + candidatePods := []backendmetrics.PodMetrics{} + + reqCtx := &handlers.RequestContext{ + SchedulingRequest: &schedulingtypes.LLMRequest{RequestId: "test-req"}, + } + + testCases := []struct { + name string + priority int + isSaturated bool + fcOutcome fctypes.QueueOutcome + fcErr error + expectErr bool + expectErrCode string + expectErrSubstr string + expectFCSkipped bool + }{ + { + name: "sheddable_saturated_reject", + priority: -1, + isSaturated: true, + expectErr: true, + expectErrCode: errutil.InferencePoolResourceExhausted, + expectErrSubstr: "system saturated, sheddable request dropped", + expectFCSkipped: true, + }, + { + name: "sheddable_not_saturated_dispatch", + priority: -1, + isSaturated: false, + fcOutcome: fctypes.QueueOutcomeDispatched, + expectErr: false, + }, + { + name: "non_sheddable_saturated_dispatch", + priority: 0, + isSaturated: true, + fcOutcome: fctypes.QueueOutcomeDispatched, + expectErr: false, + }, + { + name: "fc_reject_capacity", + priority: 0, + fcOutcome: fctypes.QueueOutcomeRejectedCapacity, + expectErr: true, + expectErrCode: errutil.InferencePoolResourceExhausted, + expectErrSubstr: "request rejected by flow control", + }, + { + name: "fc_evict_ttl", + priority: 0, + fcOutcome: fctypes.QueueOutcomeEvictedTTL, + fcErr: errors.New("timeout"), + expectErr: true, + expectErrCode: errutil.ServiceUnavailable, + expectErrSubstr: "request timed out in queue: timeout", + }, + { + name: "fc_evict_context_cancelled", + priority: 0, + fcOutcome: fctypes.QueueOutcomeEvictedContextCancelled, + expectErr: true, + expectErrCode: errutil.ServiceUnavailable, + expectErrSubstr: "client disconnected", + }, + { + name: "fc_reject_other", + priority: 0, + fcOutcome: fctypes.QueueOutcomeRejectedOther, + expectErr: true, + expectErrCode: errutil.Internal, + expectErrSubstr: "internal flow control error", + }, + { + name: "fc_evict_other", + priority: 0, + fcOutcome: fctypes.QueueOutcomeEvictedOther, + fcErr: errors.New("internal error"), + expectErr: true, + expectErrCode: errutil.Internal, + expectErrSubstr: "internal flow control error: internal error", + }, + { + name: "fc_unhandled_outcome", + priority: 0, + fcOutcome: fctypes.QueueOutcomeNotYetFinalized, + expectErr: true, + expectErrCode: errutil.Internal, + expectErrSubstr: "unhandled flow control outcome", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + sd := &mockSaturationDetector{isSaturated: tc.isSaturated} + fc := &mockFlowController{outcome: tc.fcOutcome, err: tc.fcErr} + ac := NewFlowControlAdmissionController(sd, fc) + + err := ac.Admit(ctx, reqCtx, candidatePods, tc.priority) + + if tc.expectFCSkipped { + assert.False(t, fc.called, "FlowController should not have been called for scenario: %s", tc.name) + } else { + assert.True(t, fc.called, "FlowController should have been called for scenario: %s", tc.name) + } + + if !tc.expectErr 
{ + assert.NoError(t, err, "Admit() returned an unexpected error for scenario: %s", tc.name) + } else { + require.Error(t, err, "Admit() should have returned an error for scenario: %s", tc.name) + var e errutil.Error + if assert.ErrorAs(t, err, &e, "error should be of type errutil.Error") { + assert.Equal(t, tc.expectErrCode, e.Code, "incorrect error code for scenario: %s", tc.name) + assert.Contains(t, e.Msg, tc.expectErrSubstr, "incorrect error message substring for scenario: %s", tc.name) + } + } + }) + } +} diff --git a/pkg/epp/requestcontrol/director.go b/pkg/epp/requestcontrol/director.go index 662491bdb..feda7edca 100644 --- a/pkg/epp/requestcontrol/director.go +++ b/pkg/epp/requestcontrol/director.go @@ -33,7 +33,6 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metadata" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" @@ -50,55 +49,6 @@ type Datastore interface { PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics } -/* -NOTE: To support this refined logic, the `handlers.RequestContext` struct -(defined in a different package) would need to be updated as follows: - -type RequestContext struct { - // ... existing fields ... - RequestReceivedTimestamp time.Time - FirstTokenTimestamp time.Time - ResponseCompleteTimestamp time.Time - IsModelServerStreaming func() bool - ResponseComplete bool - Prompt string - LastSeenMetrics *backend.Metrics - // ... etc ... - - // -- New fields for latency predictor -- - PredictedTTFT float64 // The predicted TTFT in milliseconds - PredictedTPOT float64 // The predicted TPOT in milliseconds - TTFT float64 // Actual Time To First Token in milliseconds - LastTokenTimestamp time.Time // Timestamp of the last token received - TPOTObservations []float64 // All actual inter-token latencies (for which we have predictions) - PredictedTPOTObservations []float64 // Predicted inter-token latencies (only for sampled tokens) - GeneratedTokenCount int // Current number of tokens generated -} - -*/ - -const ( - subsetHintNamespace = "envoy.lb.subset_hint" - subsetHintKey = "x-gateway-destination-endpoint-subset" -) - -const ( - // Poisson sampling parameters for predictions - defaultSamplingMean = 100 // Mean interval between prediction samples (tokens) - maxSampledTokens = 20 // Maximum number of prediction samples per request -) - -// calculateRunningAverage calculates the running average efficiently -func calculateRunningAverage(currentAvg float64, newValue float64, count int) float64 { - if count == 0 { - return 0 - } - if count == 1 { - return newValue - } - return currentAvg + (newValue-currentAvg)/float64(count) -} - // parseFloatHeader retrieves a header by name, parses it as a float64, // and returns the value or an error if the header is missing or invalid. func parseFloatHeader(reqCtx *handlers.RequestContext, headerName string) (float64, bool, error) { @@ -148,46 +98,43 @@ type Scheduler interface { Schedule(ctx context.Context, request *schedulingtypes.LLMRequest, candidatePods []schedulingtypes.Pod) (result *schedulingtypes.SchedulingResult, err error) } -// SaturationDetector provides a signal indicating whether the backends are considered saturated. 
-type SaturationDetector interface {
-	IsSaturated(ctx context.Context, candidatePods []backendmetrics.PodMetrics) bool
-}
-
 // NewDirectorWithConfig creates a new Director instance with all dependencies.
-func NewDirectorWithConfig(datastore datastore.Datastore, scheduler Scheduler, saturationDetector SaturationDetector, config *Config) *Director {
+func NewDirectorWithConfig(
+	datastore Datastore,
+	scheduler Scheduler,
+	admissionController AdmissionController,
+	config *Config,
+) *Director {
 	return &Director{
-		datastore:                   datastore,
-		scheduler:                   scheduler,
-		saturationDetector:          saturationDetector,
-		preRequestPlugins:           config.preRequestPlugins,
-		postResponsePlugins:         config.postResponsePlugins,
-		postResponseChunkPlugins:    config.postResponseChunkPlugins,
-		postResponseCompletePlugins: config.postResponseCompletePlugins,
-		defaultPriority:             0, // define default priority explicitly
+		datastore:             datastore,
+		scheduler:             scheduler,
+		admissionController:   admissionController,
+		requestControlPlugins: *config,
+		defaultPriority:       0, // define default priority explicitly
 	}
 }
 
-// Director orchestrates the request handling flow, including scheduling.
+// Director orchestrates the request handling flow after initial parsing by the handler.
+// Its responsibilities include:
+// - Retrieving request metadata and relevant objectives.
+// - Determining candidate pods.
+// - Performing admission control via the AdmissionController.
+// - Scheduling the request to target pod(s) via the Scheduler.
+// - Running PreRequest plugins.
+// - Preparing the request context for the Envoy ext_proc filter to route the request.
+// - Running the ResponseReceived, ResponseStreaming, and ResponseComplete plugins.
 type Director struct {
-	datastore                   datastore.Datastore
-	scheduler                   Scheduler
-	saturationDetector          SaturationDetector
-	preRequestPlugins           []PreRequest
-	postResponsePlugins         []PostResponse
-	postResponseChunkPlugins    []PostResponseChunk
-	postResponseCompletePlugins []PostResponseComplete
+	datastore             Datastore
+	scheduler             Scheduler
+	admissionController   AdmissionController
+	requestControlPlugins Config
 
 	// we just need a pointer to an int variable since priority is a pointer in InferenceObjective
 	// no need to set this in the constructor, since the value we want is the default int val
 	// and value types cannot be nil
 	defaultPriority int
 }
 
-// HandleRequest orchestrates the request lifecycle:
-// 1. Parses request details.
-// 2. Calls admitRequest for admission control.
-// 3. Calls Scheduler.Schedule if request is approved.
-// 4. Calls prepareRequest to populate RequestContext with result and call PreRequest plugins.
-//
+// HandleRequest orchestrates the request lifecycle.
 // It always returns the requestContext even in the error case, as the request context is used in error handling.
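+// On rejection by the admission controller, the returned error is an errutil.Error
+// whose Code (for example, errutil.InferencePoolResourceExhausted) is expected to be
+// translated into the client-facing response status.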
 func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) {
 	logger := log.FromContext(ctx)
@@ -206,20 +153,17 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
 	}
 	reqCtx.Request.Body["model"] = reqCtx.TargetModelName
 
-	prompt, err := requtil.ExtractPromptFromRequestBody(requestBodyMap)
+	requestBody, err := requtil.ExtractRequestBody(reqCtx.Request.Body)
 	if err != nil {
-		return reqCtx, err
+		return reqCtx, errutil.Error{Code: errutil.BadRequest, Msg: fmt.Errorf("failed to extract request data: %w", err).Error()}
 	}
+
 	infObjective := d.datastore.ObjectiveGet(reqCtx.ObjectiveKey)
 	if infObjective == nil {
 		logger.V(logutil.VERBOSE).Info("No associated InferenceObjective found, using default", "objectiveKey", reqCtx.ObjectiveKey)
-		priority := d.defaultPriority
-		if strings.Contains(reqCtx.ObjectiveKey, "sheddable") {
-			priority = -1
-		}
 		infObjective = &v1alpha2.InferenceObjective{
 			Spec: v1alpha2.InferenceObjectiveSpec{
-				Priority: &priority,
+				Priority: &d.defaultPriority,
 			},
 		}
 	} else if infObjective.Spec.Priority == nil {
@@ -247,7 +191,7 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
 	reqCtx.SchedulingRequest = &schedulingtypes.LLMRequest{
 		RequestId:   reqCtx.Request.Headers[requtil.RequestIdHeaderKey],
 		TargetModel: reqCtx.TargetModelName,
-		Prompt:      prompt,
+		Body:        requestBody,
 		Headers:     reqCtx.Request.Headers,
 		TTFTSLO:     ttftSLO,
 		AvgTPOTSLO:  avgTPOTSLO,
@@ -266,25 +210,17 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
 		return reqCtx, errutil.Error{Code: errutil.ServiceUnavailable, Msg: "failed to find candidate pods for serving the request"}
 	}
 
-	// TODO
-	// 1. Create datastore request object
-	// 2. Read/Write and maybe Drop to it during Schedule() and admitRequest()
-	// 3. Add it to the scheduled pod's RequestPriorityQueue
-	// 4. Drop from pod's RequestPriorityQueue and datastore global map when request is fully processed
-
-	//
+	if err := d.admissionController.Admit(ctx, reqCtx, candidatePods, *infObjective.Spec.Priority); err != nil {
+		logger.V(logutil.DEFAULT).Info("Request rejected by admission control", "error", err)
+		return reqCtx, err
+	}
 	result, err := d.scheduler.Schedule(ctx, reqCtx.SchedulingRequest, d.toSchedulerPodMetrics(candidatePods))
 	if err != nil {
 		return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()}
 	}
 
-	// Admission Control check
-	if err := d.admitRequest(ctx, candidatePods, reqCtx.SchedulingRequest, *infObjective.Spec.Priority, reqCtx.FairnessID); err != nil {
-		return reqCtx, err
-	}
-
-	// --- 4. Prepare Request (Populates RequestContext and call PreRequest plugins) ---
+	// Prepare Request (populates the RequestContext and calls PreRequest plugins).
 	// Insert target endpoint to instruct Envoy to route requests to the specified target pod and attach the port number.
 	// Invoke PreRequest registered plugins.
 	reqCtx, err = d.prepareRequest(ctx, reqCtx, result)
@@ -295,33 +231,6 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
 	return reqCtx, nil
 }
 
-// admitRequest handles admission control to decide whether or not to accept the request
-// based on the request priority and system saturation state.
-func (d *Director) admitRequest(ctx context.Context, candidatePods []backendmetrics.PodMetrics, request *schedulingtypes.LLMRequest, requestPriority int, fairnessID string) error { - logger := log.FromContext(ctx) - - logger.V(logutil.DEBUG).Info("Entering Flow Control", "priority", requestPriority, "fairnessID", fairnessID) - - // This will be removed in favor of a more robust implementation (Flow Control) in the very near future. - // TODO: Make this a configurable value. - // Tracking issue https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/1347 - if requestPriority >= 0 { - logger.V(logutil.DEBUG).Info("Non-sheddable request bypassing saturation check.") - return nil - } else { - logger.V(logutil.DEBUG).Info("Sheddable request subject to saturation check.") - } - - if d.saturationDetector.IsSaturated(ctx, candidatePods) || !request.HasValidPod { // Assuming non-nil Saturation Detector - return errutil.Error{ - Code: errutil.InferencePoolResourceExhausted, - Msg: "system saturated, sheddable request dropped", - } - } - - return nil -} - // getCandidatePodsForScheduling gets the list of relevant endpoints for the scheduling cycle from the datastore. // according to EPP protocol, if "x-gateway-destination-endpoint-subset" is set on the request metadata and specifies // a subset of endpoints, only these endpoints will be considered as candidates for the scheduler. @@ -357,7 +266,7 @@ func (d *Director) getCandidatePodsForScheduling(ctx context.Context, requestMet podTotalCount := 0 podFilteredList := d.datastore.PodList(func(pm backendmetrics.PodMetrics) bool { podTotalCount++ - if _, found := endpoints[pm.GetPod().Address]; found { + if _, found := endpoints[pm.GetPod().GetIPAddress()]; found { return true } return false @@ -376,20 +285,12 @@ func (d *Director) prepareRequest(ctx context.Context, reqCtx *handlers.RequestC return reqCtx, errutil.Error{Code: errutil.Internal, Msg: "results must be greater than zero"} } // primary profile is used to set destination - pool, err := d.datastore.PoolGet() - if err != nil { - return reqCtx, err - } targetPods := []*backend.Pod{} - if len(pool.Spec.TargetPorts) != 1 { - return reqCtx, errutil.Error{Code: errutil.BadRequest, Msg: "targetPorts should have length 1"} - } - targetPort := int(pool.Spec.TargetPorts[0].Number) targetEndpoints := []string{} for _, pod := range result.ProfileResults[result.PrimaryProfileName].TargetPods { curPod := pod.GetPod() - curEndpoint := net.JoinHostPort(curPod.Address, strconv.Itoa(targetPort)) + curEndpoint := net.JoinHostPort(curPod.GetIPAddress(), curPod.GetPort()) targetPods = append(targetPods, curPod) targetEndpoints = append(targetEndpoints, curEndpoint) } @@ -400,10 +301,7 @@ func (d *Director) prepareRequest(ctx context.Context, reqCtx *handlers.RequestC reqCtx.TargetPod = targetPods[0] reqCtx.TargetEndpoint = multiEndpointString - d.runPreRequestPlugins(ctx, reqCtx.SchedulingRequest, result, targetPort) - reqCtx.SchedulingResult = result - reqCtx.LastSeenMetrics = make(map[string]*backendmetrics.MetricsState) - RefreshLastSeenMetrics(ctx, reqCtx) + d.runPreRequestPlugins(ctx, reqCtx.SchedulingRequest, result) return reqCtx, nil } @@ -417,36 +315,47 @@ func (d *Director) toSchedulerPodMetrics(pods []backendmetrics.PodMetrics) []sch return pm } -// HandleResponseHeaders is called when the first chunk of the response arrives. 
-func (d *Director) HandleResponse(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { - logger := log.FromContext(ctx).WithValues("stage", "headers") - logger.V(logutil.DEBUG).Info("Entering HandleResponseHeaders") +// HandleResponseReceived is called when the response headers are received. +func (d *Director) HandleResponseReceived(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { + response := &Response{ + RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey], + Headers: reqCtx.Response.Headers, + } - d.runPostResponsePlugins(ctx, reqCtx) + // TODO: to extend fallback functionality, handle cases where target pod is unavailable + // https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/1224 + d.runResponseReceivedPlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod) - logger.V(logutil.DEBUG).Info("Exiting HandleResponseHeaders") return reqCtx, nil } -func (d *Director) HandleResponseBodyChunk(ctx context.Context, reqCtx *handlers.RequestContext) error { +// HandleResponseBodyStreaming is called every time a chunk of the response body is received. +func (d *Director) HandleResponseBodyStreaming(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { logger := log.FromContext(ctx).WithValues("stage", "bodyChunk") logger.V(logutil.TRACE).Info("Entering HandleResponseBodyChunk") + response := &Response{ + RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey], + Headers: reqCtx.Response.Headers, + } - d.runPostResponseChunkPlugins(ctx, reqCtx) + d.runResponseStreamingPlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod) logger.V(logutil.TRACE).Info("Exiting HandleResponseBodyChunk") - return nil + return reqCtx, nil } // HandleResponseBodyComplete is called when the response body is fully received. -// It runs the PostResponseComplete plugins. 
-func (d *Director) HandleResponseBodyComplete(ctx context.Context, reqCtx *handlers.RequestContext) error { +func (d *Director) HandleResponseBodyComplete(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { logger := log.FromContext(ctx).WithValues("stage", "bodyChunk") logger.V(logutil.DEBUG).Info("Entering HandleResponseBodyComplete") + response := &Response{ + RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey], + Headers: reqCtx.Response.Headers, + } - d.runPostResponseCompletePlugins(ctx, reqCtx) + d.runResponseCompletePlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod) logger.V(logutil.DEBUG).Info("Exiting HandleResponseBodyComplete") - return nil + return reqCtx, nil } func (d *Director) GetRandomPod() *backend.Pod { @@ -460,45 +369,46 @@ func (d *Director) GetRandomPod() *backend.Pod { } func (d *Director) runPreRequestPlugins(ctx context.Context, request *schedulingtypes.LLMRequest, - schedulingResult *schedulingtypes.SchedulingResult, targetPort int) { + schedulingResult *schedulingtypes.SchedulingResult) { loggerDebug := log.FromContext(ctx).V(logutil.DEBUG) - for _, plugin := range d.preRequestPlugins { - loggerDebug.Info("Running pre-request plugin", "plugin", plugin.TypedName()) + for _, plugin := range d.requestControlPlugins.preRequestPlugins { + loggerDebug.Info("Running PreRequest plugin", "plugin", plugin.TypedName()) before := time.Now() - plugin.PreRequest(ctx, request, schedulingResult, targetPort) + plugin.PreRequest(ctx, request, schedulingResult) metrics.RecordPluginProcessingLatency(PreRequestExtensionPoint, plugin.TypedName().Type, plugin.TypedName().Name, time.Since(before)) - loggerDebug.Info("Completed running pre-request plugin successfully", "plugin", plugin.TypedName()) + loggerDebug.Info("Completed running PreRequest plugin successfully", "plugin", plugin.TypedName()) } } -func (d *Director) runPostResponsePlugins(ctx context.Context, reqCtx *handlers.RequestContext) { +func (d *Director) runResponseReceivedPlugins(ctx context.Context, request *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) { loggerDebug := log.FromContext(ctx).V(logutil.DEBUG) - for _, plugin := range d.postResponsePlugins { - loggerDebug.Info("Running post-response plugin", "plugin", plugin.TypedName()) + for _, plugin := range d.requestControlPlugins.responseReceivedPlugins { + loggerDebug.Info("Running ResponseReceived plugin", "plugin", plugin.TypedName()) before := time.Now() - plugin.PostResponse(ctx, reqCtx) - metrics.RecordPluginProcessingLatency(PostResponseExtensionPoint, plugin.TypedName().Type, plugin.TypedName().Name, time.Since(before)) - loggerDebug.Info("Completed running post-response plugin successfully", "plugin", plugin.TypedName()) + plugin.ResponseReceived(ctx, request, response, targetPod) + metrics.RecordPluginProcessingLatency(ResponseReceivedExtensionPoint, plugin.TypedName().Type, plugin.TypedName().Name, time.Since(before)) + loggerDebug.Info("Completed running ResponseReceived plugin successfully", "plugin", plugin.TypedName()) } } -func (d *Director) runPostResponseChunkPlugins(ctx context.Context, reqCtx *handlers.RequestContext) { +func (d *Director) runResponseStreamingPlugins(ctx context.Context, request *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) { loggerTrace := log.FromContext(ctx).V(logutil.TRACE) - for _, plugin := range d.postResponseChunkPlugins { - loggerTrace.Info("Running post-response chunk plugin", "plugin", 
plugin.TypedName().Type) + for _, plugin := range d.requestControlPlugins.responseStreamingPlugins { + loggerTrace.Info("Running ResponseStreaming plugin", "plugin", plugin.TypedName()) before := time.Now() - plugin.PostResponseChunk(ctx, reqCtx) - metrics.RecordPluginProcessingLatency(PostResponseChunkExtensionPoint, plugin.TypedName().Type, plugin.TypedName().Name, time.Since(before)) + plugin.ResponseStreaming(ctx, request, response, targetPod) + metrics.RecordPluginProcessingLatency(ResponseStreamingExtensionPoint, plugin.TypedName().Type, plugin.TypedName().Name, time.Since(before)) + loggerTrace.Info("Completed running ResponseStreaming plugin successfully", "plugin", plugin.TypedName()) } } -func (d *Director) runPostResponseCompletePlugins(ctx context.Context, reqCtx *handlers.RequestContext) { +func (d *Director) runResponseCompletePlugins(ctx context.Context, request *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) { loggerDebug := log.FromContext(ctx).V(logutil.DEBUG) - for _, plugin := range d.postResponseCompletePlugins { - loggerDebug.Info("Running post-response complete plugin", "plugin", plugin.TypedName().Type) + for _, plugin := range d.requestControlPlugins.responseCompletePlugins { + loggerDebug.Info("Running ResponseComplete plugin", "plugin", plugin.TypedName()) before := time.Now() - plugin.PostResponseComplete(ctx, reqCtx) - metrics.RecordPluginProcessingLatency(PostResponseCompleteExtensionPoint, plugin.TypedName().Type, plugin.TypedName().Name, time.Since(before)) - loggerDebug.Info("Completed running post-response complete plugin successfully", "plugin", plugin.TypedName()) + plugin.ResponseComplete(ctx, request, response, targetPod) + metrics.RecordPluginProcessingLatency(ResponseCompleteExtensionPoint, plugin.TypedName().Type, plugin.TypedName().Name, time.Since(before)) + loggerDebug.Info("Completed running ResponseComplete plugin successfully", "plugin", plugin.TypedName()) } } diff --git a/pkg/epp/requestcontrol/director_test.go b/pkg/epp/requestcontrol/director_test.go index 61a8b31be..c3111ed5e 100644 --- a/pkg/epp/requestcontrol/director_test.go +++ b/pkg/epp/requestcontrol/director_test.go @@ -40,7 +40,6 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" latencypredictor "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/latencypredictorasync" @@ -55,12 +54,17 @@ import ( // --- Mocks --- -type mockSaturationDetector struct { - isSaturated bool +type mockAdmissionController struct { + admitErr error } -func (m *mockSaturationDetector) IsSaturated(_ context.Context, _ []backendmetrics.PodMetrics) bool { - return m.isSaturated +func (m *mockAdmissionController) Admit( + _ context.Context, + _ *handlers.RequestContext, + _ []backendmetrics.PodMetrics, + _ int, +) error { + return m.admitErr } // Updated mock scheduler to handle the new Schedule method signature @@ -143,27 +147,7 @@ func (ds *mockDatastore) PodList(predicate func(backendmetrics.PodMetrics) bool) return res } -func (ds *mockDatastore) PodDelete(namespacedName types.NamespacedName) {} -func (ds *mockDatastore) PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool { return true } -func (ds *mockDatastore) 
ObjectiveSet(infObjective *v1alpha2.InferenceObjective) {} -func (ds *mockDatastore) ObjectiveDelete(namespacedName types.NamespacedName) {} -func (ds *mockDatastore) ObjectiveGetAll() []*v1alpha2.InferenceObjective { return nil } -func (ds *mockDatastore) PodAddRequest(podName types.NamespacedName, requestID string, tpot float64) error { - return nil -} -func (ds *mockDatastore) PodRemoveRequest(podName types.NamespacedName, requestID string) error { - return nil -} -func (ds *mockDatastore) PodUpdateRequest(podName types.NamespacedName, requestID string, tpot float64) error { - return nil -} -func (ds *mockDatastore) PodGetRunningRequests(podName types.NamespacedName) (*datalayer.RequestPriorityQueue, error) { - return nil, nil -} -func (ds *mockDatastore) PodGetRequestCount(podName types.NamespacedName) (int, error) { return 0, nil } -func (ds *mockDatastore) Clear() {} -// mockPredictor implements the Predictor interface for testing. type mockPredictor struct { PredictFunc func(ctx context.Context, req latencypredictor.PredictionRequest) (*latencypredictor.PredictionResponse, error) trainingSamples []latencypredictor.TrainingEntry @@ -201,7 +185,6 @@ func (m *mockPredictor) AddTrainingDataBulk(entry []latencypredictor.TrainingEnt m.trainingSamples = append(m.trainingSamples, entry...) return nil } - func TestDirector_HandleRequest(t *testing.T) { ctx := logutil.NewTestLoggerIntoContext(context.Background()) @@ -229,7 +212,7 @@ func TestDirector_HandleRequest(t *testing.T) { // Datastore setup pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) - ds := datastore.NewDatastore(t.Context(), pmf) + ds := datastore.NewDatastore(t.Context(), pmf, 0) ds.ObjectiveSet(ioFoodReview) ds.ObjectiveSet(ioFoodReviewResolve) ds.ObjectiveSet(ioFoodReviewSheddable) @@ -279,6 +262,8 @@ func TestDirector_HandleRequest(t *testing.T) { Pod: &schedulingtypes.PodMetrics{ Pod: &backend.Pod{ Address: "192.168.1.100", + Port: "8000", + MetricsHost: "192.168.1.100:8000", NamespacedName: types.NamespacedName{Name: "pod1", Namespace: "default"}, }, }, @@ -287,6 +272,8 @@ func TestDirector_HandleRequest(t *testing.T) { Pod: &schedulingtypes.PodMetrics{ Pod: &backend.Pod{ Address: "192.168.2.100", + Port: "8000", + MetricsHost: "192.168.2.100:8000", NamespacedName: types.NamespacedName{Name: "pod2", Namespace: "default"}, }, }, @@ -295,6 +282,8 @@ func TestDirector_HandleRequest(t *testing.T) { Pod: &schedulingtypes.PodMetrics{ Pod: &backend.Pod{ Address: "192.168.4.100", + Port: "8000", + MetricsHost: "192.168.4.100:8000", NamespacedName: types.NamespacedName{Name: "pod4", Namespace: "default"}, }, }, @@ -310,10 +299,9 @@ func TestDirector_HandleRequest(t *testing.T) { &schedulingtypes.ScoredPod{ Pod: &schedulingtypes.PodMetrics{ Pod: &backend.Pod{ - Address: "192.168.1.100", - NamespacedName: types.NamespacedName{Name: "pod1", Namespace: "default"}, - RunningRequests: &datalayer.RequestPriorityQueue{}, // Add empty queue - Labels: map[string]string{"app": "inference"}, + Address: "192.168.1.100", + NamespacedName: types.NamespacedName{Name: "pod1", Namespace: "default"}, + Labels: map[string]string{"app": "inference"}, }, }, }, @@ -323,10 +311,9 @@ func TestDirector_HandleRequest(t *testing.T) { &schedulingtypes.ScoredPod{ Pod: &schedulingtypes.PodMetrics{ Pod: &backend.Pod{ - Address: "192.168.1.100", - NamespacedName: types.NamespacedName{Name: "pod1", Namespace: "default"}, - RunningRequests: &datalayer.RequestPriorityQueue{}, // Add empty queue - Labels: 
map[string]string{"app": "inference"}, + Address: "192.168.1.100", + NamespacedName: types.NamespacedName{Name: "pod1", Namespace: "default"}, + Labels: map[string]string{"app": "inference"}, }, }, }: 0.8, // 80% prefix cache score @@ -337,24 +324,24 @@ func TestDirector_HandleRequest(t *testing.T) { } tests := []struct { - name string - reqBodyMap map[string]any - mockSaturationDetector *mockSaturationDetector - inferenceObjectiveName string - schedulerMockSetup func(m *mockScheduler) - predictorMockSetup func(m *mockPredictor) // NEW: Add predictor setup - wantErrCode string // Expected errutil code string - wantReqCtx *handlers.RequestContext // Fields to check in the returned RequestContext - wantMutatedBodyModel string // Expected model in reqCtx.Request.Body after PostDispatch - targetModelName string // Expected model name after target model resolution + name string + reqBodyMap map[string]any + mockAdmissionController *mockAdmissionController + inferenceObjectiveName string + schedulerMockSetup func(m *mockScheduler) + predictorMockSetup func(m *mockPredictor) + wantErrCode string // Expected errutil code string + wantReqCtx *handlers.RequestContext // Fields to check in the returned RequestContext + wantMutatedBodyModel string // Expected model in reqCtx.Request.Body after PostDispatch + targetModelName string // Expected model name after target model resolution }{ { - name: "successful completions request (critical, saturation ignored)", + name: "successful completions request", reqBodyMap: map[string]any{ "model": model, "prompt": "critical prompt", }, - mockSaturationDetector: &mockSaturationDetector{isSaturated: true}, + mockAdmissionController: &mockAdmissionController{admitErr: nil}, schedulerMockSetup: func(m *mockScheduler) { m.scheduleResults = defaultSuccessfulScheduleResults }, @@ -362,9 +349,10 @@ func TestDirector_HandleRequest(t *testing.T) { ObjectiveKey: objectiveName, TargetModelName: model, TargetPod: &backend.Pod{ - NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, - Address: "192.168.1.100", - RunningRequests: &datalayer.RequestPriorityQueue{}, // Empty but initialized + NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, + Address: "192.168.1.100", + Port: "8000", + MetricsHost: "192.168.1.100:8000", }, TargetEndpoint: "192.168.1.100:8000,192.168.2.100:8000,192.168.4.100:8000", }, @@ -373,39 +361,7 @@ func TestDirector_HandleRequest(t *testing.T) { targetModelName: model, }, { - name: "non-critical request dropped due to saturation", - reqBodyMap: map[string]any{ - "model": modelSheddable, - "prompt": "test prompt", - }, - mockSaturationDetector: &mockSaturationDetector{isSaturated: true}, - schedulerMockSetup: func(m *mockScheduler) { - m.scheduleResults = defaultSuccessfulScheduleResults - }, - wantReqCtx: &handlers.RequestContext{ - ObjectiveKey: objectiveNameSheddable, - TargetModelName: model, - TargetPod: &backend.Pod{ - NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, - Address: "192.168.1.100", - RunningRequests: &datalayer.RequestPriorityQueue{}, // Empty but initialized - }, - TargetEndpoint: "192.168.1.100:8000,192.168.2.100:8000,192.168.4.100:8000", - }, - predictorMockSetup: func(m *mockPredictor) { - // Mock prediction that violates SLOs - m.PredictFunc = func(ctx context.Context, req latencypredictor.PredictionRequest) (*latencypredictor.PredictionResponse, error) { - return &latencypredictor.PredictionResponse{ - TTFT: 150.0, // Above SLO of 100 - TPOT: 80.0, // Above SLO of 
50 - }, nil - } - }, - inferenceObjectiveName: objectiveNameSheddable, - wantErrCode: errutil.InferencePoolResourceExhausted, - }, - { - name: "successful chat completions request (default critical, saturation ignored)", + name: "successful chat completions request", reqBodyMap: map[string]any{ "model": model, "messages": []any{ @@ -415,7 +371,7 @@ func TestDirector_HandleRequest(t *testing.T) { }, }, }, - mockSaturationDetector: &mockSaturationDetector{isSaturated: true}, + mockAdmissionController: &mockAdmissionController{admitErr: nil}, schedulerMockSetup: func(m *mockScheduler) { m.scheduleResults = defaultSuccessfulScheduleResults }, @@ -424,6 +380,8 @@ func TestDirector_HandleRequest(t *testing.T) { TargetPod: &backend.Pod{ NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, Address: "192.168.1.100", + Port: "8000", + MetricsHost: "192.168.1.100:8000", }, TargetEndpoint: "192.168.1.100:8000,192.168.2.100:8000,192.168.4.100:8000", }, @@ -436,7 +394,7 @@ func TestDirector_HandleRequest(t *testing.T) { "model": model, // Critical model "prompt": "test prompt", }, - mockSaturationDetector: &mockSaturationDetector{isSaturated: true}, + mockAdmissionController: &mockAdmissionController{admitErr: nil}, schedulerMockSetup: func(m *mockScheduler) { m.scheduleResults = defaultSuccessfulScheduleResults }, @@ -452,9 +410,10 @@ func TestDirector_HandleRequest(t *testing.T) { wantReqCtx: &handlers.RequestContext{ TargetModelName: model, TargetPod: &backend.Pod{ - NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, - Address: "192.168.1.100", - RunningRequests: &datalayer.RequestPriorityQueue{}, // Empty but initialized + NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, + Address: "192.168.1.100", + Port: "8000", + MetricsHost: "192.168.1.100:8000", }, TargetEndpoint: "192.168.1.100:8000,192.168.2.100:8000,192.168.4.100:8000", }, @@ -476,6 +435,7 @@ func TestDirector_HandleRequest(t *testing.T) { }, }, }, + mockAdmissionController: &mockAdmissionController{admitErr: nil}, schedulerMockSetup: func(m *mockScheduler) { m.scheduleResults = defaultSuccessfulScheduleResults }, @@ -483,9 +443,10 @@ func TestDirector_HandleRequest(t *testing.T) { ObjectiveKey: objectiveName, TargetModelName: model, TargetPod: &backend.Pod{ - NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, - Address: "192.168.1.100", - RunningRequests: &datalayer.RequestPriorityQueue{}, // Empty but initialized + NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, + Address: "192.168.1.100", + Port: "8000", + MetricsHost: "192.168.1.100:8000", }, TargetEndpoint: "192.168.1.100:8000,192.168.2.100:8000,192.168.4.100:8000", }, @@ -499,7 +460,7 @@ func TestDirector_HandleRequest(t *testing.T) { "model": modelSheddable, "prompt": "sheddable prompt", }, - mockSaturationDetector: &mockSaturationDetector{isSaturated: false}, + mockAdmissionController: &mockAdmissionController{admitErr: nil}, schedulerMockSetup: func(m *mockScheduler) { m.scheduleResults = defaultSuccessfulScheduleResults }, @@ -507,9 +468,10 @@ func TestDirector_HandleRequest(t *testing.T) { ObjectiveKey: objectiveNameSheddable, TargetModelName: modelSheddable, TargetPod: &backend.Pod{ - NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, - Address: "192.168.1.100", - RunningRequests: &datalayer.RequestPriorityQueue{}, // Empty but initialized + NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, + Address: "192.168.1.100", 
+ Port: "8000", + MetricsHost: "192.168.1.100:8000", }, TargetEndpoint: "192.168.1.100:8000,192.168.2.100:8000,192.168.4.100:8000", }, @@ -523,7 +485,7 @@ func TestDirector_HandleRequest(t *testing.T) { "model": modelWithResolvedTarget, "prompt": "prompt for target resolution", }, - mockSaturationDetector: &mockSaturationDetector{isSaturated: false}, + mockAdmissionController: &mockAdmissionController{admitErr: nil}, schedulerMockSetup: func(m *mockScheduler) { m.scheduleResults = defaultSuccessfulScheduleResults }, @@ -531,9 +493,10 @@ func TestDirector_HandleRequest(t *testing.T) { ObjectiveKey: objectiveNameResolve, TargetModelName: "resolved-target-model-A", TargetPod: &backend.Pod{ - NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, - Address: "192.168.1.100", - RunningRequests: &datalayer.RequestPriorityQueue{}, // Empty but initialized + NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, + Address: "192.168.1.100", + Port: "8000", + MetricsHost: "192.168.1.100:8000", }, TargetEndpoint: "192.168.1.100:8000,192.168.2.100:8000,192.168.4.100:8000", }, @@ -550,9 +513,10 @@ func TestDirector_HandleRequest(t *testing.T) { ObjectiveKey: "food-review-1", TargetModelName: "food-review-1", TargetPod: &backend.Pod{ - NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, - Address: "192.168.1.100", - RunningRequests: &datalayer.RequestPriorityQueue{}, // Empty but initialized + NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, + Address: "192.168.1.100", + Port: "8000", + MetricsHost: "192.168.1.100:8000", }, TargetEndpoint: "192.168.1.100:8000,192.168.2.100:8000,192.168.4.100:8000", }, @@ -561,28 +525,27 @@ func TestDirector_HandleRequest(t *testing.T) { "model": "food-review-1", "prompt": "test prompt", }, - mockSaturationDetector: &mockSaturationDetector{isSaturated: false}, - inferenceObjectiveName: "food-review-1", - targetModelName: "food-review-1", + mockAdmissionController: &mockAdmissionController{admitErr: nil}, + inferenceObjectiveName: "food-review-1", + targetModelName: "food-review-1", }, { - name: "request dropped (sheddable, saturated)", + name: "request rejected by admission controller", reqBodyMap: map[string]any{ "model": modelSheddable, "prompt": "sheddable prompt", }, - inferenceObjectiveName: objectiveNameSheddable, - mockSaturationDetector: &mockSaturationDetector{isSaturated: true}, - wantErrCode: errutil.InferencePoolResourceExhausted, + inferenceObjectiveName: objectiveNameSheddable, + mockAdmissionController: &mockAdmissionController{admitErr: errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: "simulated admission rejection"}}, + wantErrCode: errutil.InferencePoolResourceExhausted, }, { - name: "model not found, expect err", - reqBodyMap: map[string]any{"prompt": "p"}, - mockSaturationDetector: &mockSaturationDetector{isSaturated: false}, - wantErrCode: errutil.BadRequest, + name: "model not found, expect err", + reqBodyMap: map[string]any{"prompt": "p"}, + mockAdmissionController: &mockAdmissionController{admitErr: nil}, + wantErrCode: errutil.BadRequest, }, - { name: "prompt or messages not found, expect err", reqBodyMap: map[string]any{"model": model}, @@ -602,6 +565,7 @@ func TestDirector_HandleRequest(t *testing.T) { "model": model, "prompt": "prompt that causes scheduler error", }, + mockAdmissionController: &mockAdmissionController{admitErr: nil}, schedulerMockSetup: func(m *mockScheduler) { m.scheduleErr = errors.New("simulated scheduler failure") }, @@ 
-614,6 +578,7 @@ func TestDirector_HandleRequest(t *testing.T) { "model": model, "prompt": "prompt for nil,nil scheduler return", }, + mockAdmissionController: &mockAdmissionController{admitErr: nil}, schedulerMockSetup: func(m *mockScheduler) { m.scheduleResults = nil m.scheduleErr = nil @@ -636,9 +601,9 @@ func TestDirector_HandleRequest(t *testing.T) { if test.predictorMockSetup != nil { mockPred = &mockPredictor{} test.predictorMockSetup(mockPred) - director = NewDirectorWithConfig(ds, mockSched, test.mockSaturationDetector, NewConfig()) + director = NewDirectorWithConfig(ds, mockSched, test.mockAdmissionController, NewConfig()) } else { - director = NewDirectorWithConfig(ds, mockSched, test.mockSaturationDetector, NewConfig()) + director = NewDirectorWithConfig(ds, mockSched, test.mockAdmissionController, NewConfig()) } reqCtx := &handlers.RequestContext{ @@ -674,15 +639,7 @@ func TestDirector_HandleRequest(t *testing.T) { assert.Equal(t, test.wantReqCtx.ObjectiveKey, returnedReqCtx.ObjectiveKey, "reqCtx.Model mismatch") assert.Equal(t, test.wantReqCtx.TargetModelName, returnedReqCtx.TargetModelName, "reqCtx.ResolvedTargetModel mismatch") - if test.wantReqCtx != nil && test.wantReqCtx.TargetPod != nil { - expected := test.wantReqCtx.TargetPod - actual := returnedReqCtx.TargetPod - - assert.Equal(t, expected.NamespacedName, actual.NamespacedName, "NamespacedName mismatch") - assert.Equal(t, expected.Address, actual.Address, "Address mismatch") - assert.Equal(t, expected.Labels, actual.Labels, "Labels mismatch") - // Skip RunningRequests comparison - it's not relevant to the test - } + assert.Equal(t, test.wantReqCtx.TargetPod, returnedReqCtx.TargetPod, "reqCtx.TargetPod mismatch") assert.Equal(t, test.wantReqCtx.TargetEndpoint, returnedReqCtx.TargetEndpoint, "reqCtx.TargetEndpoint mismatch") } @@ -766,13 +723,13 @@ func TestGetCandidatePodsForScheduling(t *testing.T) { ds := &mockDatastore{pods: testInput} for _, test := range tests { t.Run(test.name, func(t *testing.T) { - director := NewDirectorWithConfig(ds, &mockScheduler{}, &mockSaturationDetector{}, NewConfig()) + director := NewDirectorWithConfig(ds, &mockScheduler{}, &mockAdmissionController{}, NewConfig()) got := director.getCandidatePodsForScheduling(context.Background(), test.metadata) diff := cmp.Diff(test.output, got, cmpopts.SortSlices(func(a, b backendmetrics.PodMetrics) bool { return a.GetPod().NamespacedName.String() < b.GetPod().NamespacedName.String() - }), cmpopts.IgnoreUnexported(backendmetrics.FakePodMetrics{})) + })) if diff != "" { t.Errorf("Unexpected output (-want +got): %v", diff) } @@ -809,10 +766,29 @@ func TestGetRandomPod(t *testing.T) { }, } + scheme := runtime.NewScheme() + _ = clientgoscheme.AddToScheme(scheme) + _ = v1alpha2.Install(scheme) + _ = v1.Install(scheme) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + pool := &v1.InferencePool{ + Spec: v1.InferencePoolSpec{ + TargetPorts: []v1.Port{ + {Number: 8000}, + }, + }, + } + for _, test := range tests { t.Run(test.name, func(t *testing.T) { pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Millisecond) - ds := datastore.NewDatastore(t.Context(), pmf) + ds := datastore.NewDatastore(t.Context(), pmf, 0) + err := ds.PoolSet(t.Context(), fakeClient, pool) + if err != nil { + t.Errorf("unexpected error setting pool: %s", err) + } for _, pod := range test.storePods { ds.PodUpdateOrAddIfNotExist(pod) } @@ -829,13 +805,13 @@ func TestGetRandomPod(t *testing.T) { } } -func TestDirector_HandleResponse(t *testing.T) { - pr1 := newTestPostResponse("pr1") +func TestDirector_HandleResponseReceived(t *testing.T) { + pr1 := newTestResponseReceived("pr1") ctx := logutil.NewTestLoggerIntoContext(context.Background()) - ds := datastore.NewDatastore(t.Context(), nil) + ds := datastore.NewDatastore(t.Context(), nil, 0) mockSched := &mockScheduler{} - director := NewDirectorWithConfig(ds, mockSched, nil, NewConfig().WithPostResponsePlugins(pr1)) + director := NewDirectorWithConfig(ds, mockSched, &mockAdmissionController{}, NewConfig().WithResponseReceivedPlugins(pr1)) reqCtx := &handlers.RequestContext{ Request: &handlers.Request{ @@ -850,7 +826,7 @@ func TestDirector_HandleResponse(t *testing.T) { TargetPod: &backend.Pod{NamespacedName: types.NamespacedName{Namespace: "namespace1", Name: "test-pod-name"}}, } - _, err := director.HandleResponse(ctx, reqCtx) + _, err := director.HandleResponseReceived(ctx, reqCtx) if err != nil { t.Fatalf("HandleResponse() returned unexpected error: %v", err) } @@ -866,31 +842,143 @@ func TestDirector_HandleResponse(t *testing.T) { } } +func TestDirector_HandleResponseStreaming(t *testing.T) { + ps1 := newTestResponseStreaming("ps1") + + ctx := logutil.NewTestLoggerIntoContext(context.Background()) + ds := datastore.NewDatastore(t.Context(), nil, 0) + mockSched := &mockScheduler{} + director := NewDirectorWithConfig(ds, mockSched, nil, NewConfig().WithResponseStreamingPlugins(ps1)) + + reqCtx := &handlers.RequestContext{ + Request: &handlers.Request{ + Headers: map[string]string{ + requtil.RequestIdHeaderKey: "test-req-id-for-streaming", + }, + }, + Response: &handlers.Response{ + Headers: map[string]string{"X-Test-Streaming-Header": "StreamValue"}, + }, + TargetPod: &backend.Pod{NamespacedName: types.NamespacedName{Namespace: "namespace1", Name: "test-pod-name"}}, + } + + _, err := director.HandleResponseBodyStreaming(ctx, reqCtx) + if err != nil { + t.Fatalf("HandleResponseBodyStreaming() returned unexpected error: %v", err) + } + + if diff := cmp.Diff("test-req-id-for-streaming", ps1.lastRespOnStreaming.RequestId); diff != "" { + t.Errorf("Scheduler.OnStreaming RequestId mismatch (-want +got):\n%s", diff) + } + if diff := cmp.Diff(reqCtx.Response.Headers, ps1.lastRespOnStreaming.Headers); diff != "" { + t.Errorf("Scheduler.OnStreaming Headers mismatch (-want +got):\n%s", diff) + } + if diff := cmp.Diff("namespace1/test-pod-name", ps1.lastTargetPodOnStreaming); diff != "" { + t.Errorf("Scheduler.OnStreaming TargetPodName mismatch (-want +got):\n%s", diff) + } +} + +func TestDirector_HandleResponseComplete(t *testing.T) { + pc1 := newTestResponseComplete("pc1") + + ctx := logutil.NewTestLoggerIntoContext(context.Background()) + ds := datastore.NewDatastore(t.Context(), nil, 0) + mockSched := &mockScheduler{} + director := NewDirectorWithConfig(ds, mockSched, nil, 
NewConfig().WithResponseCompletePlugins(pc1)) + + reqCtx := &handlers.RequestContext{ + Request: &handlers.Request{ + Headers: map[string]string{ + requtil.RequestIdHeaderKey: "test-req-id-for-complete", + }, + }, + Response: &handlers.Response{ + Headers: map[string]string{"X-Test-Complete-Header": "CompleteValue"}, + }, + TargetPod: &backend.Pod{NamespacedName: types.NamespacedName{Namespace: "namespace1", Name: "test-pod-name"}}, + } + + _, err := director.HandleResponseBodyComplete(ctx, reqCtx) + if err != nil { + t.Fatalf("HandleResponseBodyComplete() returned unexpected error: %v", err) + } + + if diff := cmp.Diff("test-req-id-for-complete", pc1.lastRespOnComplete.RequestId); diff != "" { + t.Errorf("Scheduler.OnComplete RequestId mismatch (-want +got):\n%s", diff) + } + if diff := cmp.Diff(reqCtx.Response.Headers, pc1.lastRespOnComplete.Headers); diff != "" { + t.Errorf("Scheduler.OnComplete Headers mismatch (-want +got):\n%s", diff) + } + if diff := cmp.Diff("namespace1/test-pod-name", pc1.lastTargetPodOnComplete); diff != "" { + t.Errorf("Scheduler.OnComplete TargetPodName mismatch (-want +got):\n%s", diff) + } +} + const ( - testPostResponseType = "test-post-response" + testResponseReceivedType = "test-response-received" + testPostStreamingType = "test-response-streaming" + testPostCompleteType = "test-response-complete" ) -type testPostResponse struct { +type testResponseReceived struct { tn plugins.TypedName lastRespOnResponse *Response lastTargetPodOnResponse string } -func newTestPostResponse(name string) *testPostResponse { - return &testPostResponse{ - tn: plugins.TypedName{Type: testPostResponseType, Name: name}, +type testResponseStreaming struct { + tn plugins.TypedName + lastRespOnStreaming *Response + lastTargetPodOnStreaming string +} + +type testResponseComplete struct { + tn plugins.TypedName + lastRespOnComplete *Response + lastTargetPodOnComplete string +} + +func newTestResponseReceived(name string) *testResponseReceived { + return &testResponseReceived{ + tn: plugins.TypedName{Type: testResponseReceivedType, Name: name}, } } -func (p *testPostResponse) TypedName() plugins.TypedName { - return p.tn +func newTestResponseStreaming(name string) *testResponseStreaming { + return &testResponseStreaming{ + tn: plugins.TypedName{Type: testPostStreamingType, Name: name}, + } } -func (p *testPostResponse) PostResponse(_ context.Context, reqCtx *handlers.RequestContext) { - response := &Response{ - RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey], - Headers: reqCtx.Response.Headers, +func newTestResponseComplete(name string) *testResponseComplete { + return &testResponseComplete{ + tn: plugins.TypedName{Type: testPostCompleteType, Name: name}, } +} + +func (p *testResponseReceived) TypedName() plugins.TypedName { + return p.tn +} + +func (p *testResponseStreaming) TypedName() plugins.TypedName { + return p.tn +} + +func (p *testResponseComplete) TypedName() plugins.TypedName { + return p.tn +} + +func (p *testResponseReceived) ResponseReceived(_ context.Context, _ *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) { p.lastRespOnResponse = response - p.lastTargetPodOnResponse = reqCtx.TargetPod.NamespacedName.String() + p.lastTargetPodOnResponse = targetPod.NamespacedName.String() +} + +func (p *testResponseStreaming) ResponseStreaming(_ context.Context, _ *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) { + p.lastRespOnStreaming = response + p.lastTargetPodOnStreaming = targetPod.NamespacedName.String() +} + +func 
(p *testResponseComplete) ResponseComplete(_ context.Context, _ *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) {
+	p.lastRespOnComplete = response
+	p.lastTargetPodOnComplete = targetPod.NamespacedName.String()
 }
diff --git a/pkg/epp/requestcontrol/plugins.go b/pkg/epp/requestcontrol/plugins.go
index 1bb56062a..30f31f070 100644
--- a/pkg/epp/requestcontrol/plugins.go
+++ b/pkg/epp/requestcontrol/plugins.go
@@ -19,39 +19,41 @@ package requestcontrol
 import (
 	"context"
 
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
 )
 
 const (
-	PreRequestExtensionPoint           = "PreRequest"
-	PostResponseExtensionPoint         = "PostResponse"
-	PostResponseChunkExtensionPoint    = "PostResponseChunk"
-	PostResponseCompleteExtensionPoint = "PostResponseComplete"
+	PreRequestExtensionPoint        = "PreRequest"
+	ResponseReceivedExtensionPoint  = "ResponseReceived"
+	ResponseStreamingExtensionPoint = "ResponseStreaming"
+	ResponseCompleteExtensionPoint  = "ResponseComplete"
 )
 
-// PreRequest is called by the director after a getting result from scheduling layer but
+// PreRequest is called by the director after getting a result from the scheduling layer and
 // before a request is sent to the selected model server.
 type PreRequest interface {
 	plugins.Plugin
-	PreRequest(ctx context.Context, request *types.LLMRequest, schedulingResult *types.SchedulingResult, targetPort int)
+	PreRequest(ctx context.Context, request *types.LLMRequest, schedulingResult *types.SchedulingResult)
 }
 
-// PostResponse is called by the director after a successful response is recieved or first chunk if streaming.
-type PostResponse interface {
+// ResponseReceived is called by the director after the response headers are successfully received,
+// which indicates the beginning of the response handling by the model server.
+// The given pod argument is the pod that served the request.
+type ResponseReceived interface {
 	plugins.Plugin
-	PostResponse(ctx context.Context, reqCtx *handlers.RequestContext)
+	ResponseReceived(ctx context.Context, request *types.LLMRequest, response *Response, targetPod *backend.Pod)
 }
 
-// PostResponseChunk is called by the director if in streaming mode after each successful response chunk.
-type PostResponseChunk interface {
+// ResponseStreaming is called by the director after each chunk of a streaming response is sent.
+type ResponseStreaming interface {
 	plugins.Plugin
-	PostResponseChunk(ctx context.Context, reqCtx *handlers.RequestContext)
+	ResponseStreaming(ctx context.Context, request *types.LLMRequest, response *Response, targetPod *backend.Pod)
 }
 
-// PostResponseComplete is called by the director if in streaming mode after the final successful response chunk is sent.
+// ResponseComplete is called by the director after the complete response is sent.
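+//
+// A minimal implementation sketch (illustrative only; logComplete is a hypothetical
+// plugin, and log here is sigs.k8s.io/controller-runtime/pkg/log):
+//
+//	type logComplete struct{ tn plugins.TypedName }
+//
+//	func (p *logComplete) TypedName() plugins.TypedName { return p.tn }
+//
+//	func (p *logComplete) ResponseComplete(ctx context.Context, _ *types.LLMRequest, response *Response, targetPod *backend.Pod) {
+//		log.FromContext(ctx).Info("response complete", "requestID", response.RequestId, "pod", targetPod.NamespacedName)
+//	}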
+type ResponseComplete interface { plugins.Plugin - PostResponseComplete(ctx context.Context, reqCtx *handlers.RequestContext) + ResponseComplete(ctx context.Context, request *types.LLMRequest, response *Response, targetPod *backend.Pod) } diff --git a/pkg/epp/requestcontrol/plugins/slorequest/slo_request_tracker.go b/pkg/epp/requestcontrol/plugins/slorequest/slo_request_tracker.go deleted file mode 100644 index cc57a9963..000000000 --- a/pkg/epp/requestcontrol/plugins/slorequest/slo_request_tracker.go +++ /dev/null @@ -1,177 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package slorequest - -import ( - "context" - "time" - - "github.com/go-logr/logr" - "github.com/google/uuid" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/log" - - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/latencypredictorasync" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol" - scheduling_types "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" - requtil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/request" -) - -const ( - SLORequestTrackerPluginType = "slo-request-tracker" -) - -type SLORequestTracker struct { - tn plugins.TypedName - latencypredictor latencypredictorasync.PredictorInterface - datastore datastore.Datastore -} - -var _ requestcontrol.PreRequest = &SLORequestTracker{} -var _ requestcontrol.PostResponse = &SLORequestTracker{} -var _ requestcontrol.PostResponseChunk = &SLORequestTracker{} -var _ requestcontrol.PostResponseComplete = &SLORequestTracker{} - -func New(latencypredictor latencypredictorasync.PredictorInterface, datastore datastore.Datastore) *SLORequestTracker { - return &SLORequestTracker{ - tn: plugins.TypedName{Type: SLORequestTrackerPluginType, Name: SLORequestTrackerPluginType}, - latencypredictor: latencypredictor, - datastore: datastore, - } -} - -func (t *SLORequestTracker) TypedName() plugins.TypedName { - return t.tn -} - -func (s *SLORequestTracker) WithName(name string) *SLORequestTracker { - s.tn.Name = name - return s -} - -func (t *SLORequestTracker) PreRequest(ctx context.Context, request *scheduling_types.LLMRequest, schedulingResult *scheduling_types.SchedulingResult, targetPort int) { - logger := log.FromContext(ctx) - - if schedulingResult == nil || len(schedulingResult.ProfileResults) == 0 { - logger.V(logutil.DEBUG).Info("SLORequestTracker: Skipping PreRequest because no scheduling result was provided.") - return - } - - targetPod := schedulingResult.ProfileResults[schedulingResult.PrimaryProfileName].TargetPods[0].GetPod() - - podName := 
types.NamespacedName{ - Name: targetPod.NamespacedName.Name, - Namespace: targetPod.NamespacedName.Namespace, - } - - logger.V(logutil.DEBUG).Info("request ID for SLO tracking", "requestID", request.Headers[requtil.RequestIdHeaderKey], "podName", podName) - if request.Headers[requtil.RequestIdHeaderKey] == "" { - request.Headers[requtil.RequestIdHeaderKey] = uuid.New().String() - logger.V(logutil.DEBUG).Info("Generated new request ID for SLO tracking", "requestID", request.Headers[requtil.RequestIdHeaderKey]) - logger.V(logutil.DEBUG).Info("request headers for SLO tracking", "requestHeaders", request.Headers) - } - - err := t.datastore.PodAddRequest(podName, request.Headers[requtil.RequestIdHeaderKey], request.AvgTPOTSLO) - if err != nil { - logger.V(logutil.DEBUG).Error(err, "SLORequestTracker: Failed to add request to pod running queue", "podName", podName, "requestID", request.Headers[requtil.RequestIdHeaderKey]) - } -} - -func (t *SLORequestTracker) PostResponse(ctx context.Context, reqCtx *handlers.RequestContext) { - logger := log.FromContext(ctx) - targetPod := reqCtx.TargetPod - if !t.CheckPredictor(logger, targetPod) { - return - } - - if err := requestcontrol.ProcessHeaderForLatencyPrediction(ctx, t.latencypredictor, reqCtx); err != nil { - logger.V(logutil.DEBUG).Error(err, "ProcessHeader in latencypredictor failed") - } - -} - -func (t *SLORequestTracker) PostResponseChunk(ctx context.Context, reqCtx *handlers.RequestContext) { - logger := log.FromContext(ctx) - targetPod := reqCtx.TargetPod - if !t.CheckPredictor(logger, targetPod) { - return - } - - now := time.Now() - - if reqCtx.TTFT == 0 { - requestcontrol.ProcessFirstTokenForLatencyPrediction(ctx, t.latencypredictor, reqCtx, now) - } else { - requestcontrol.ProcessTokenForLatencyPrediction(ctx, t.latencypredictor, reqCtx, now) - } - -} - -func (t *SLORequestTracker) PostResponseComplete(ctx context.Context, reqCtx *handlers.RequestContext) { - logger := log.FromContext(ctx) - request := reqCtx.SchedulingRequest - targetPod := reqCtx.TargetPod - if !t.CheckPredictor(logger, targetPod) { - return - } - - if reqCtx.TTFT > 0 { - logger.V(logutil.DEBUG).Info("Averages calculated", "avgActualTTFT", reqCtx.TTFT, "avgPredictedTTFT", reqCtx.PredictedTTFT) - metrics.RecordRequestTTFT(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.TTFT/1000) - metrics.RecordRequestPredictedTTFT(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.PredictedTTFT/1000) - if reqCtx.SchedulingRequest.TTFTSLO > 0 { - metrics.RecordRequestTTFTWithSLO(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.TTFT, reqCtx.SchedulingRequest.TTFTSLO) - } - } - - if reqCtx.AvgTPOT > 0 { - logger.V(logutil.DEBUG).Info("Averages calculated", "avgActualTPOT", reqCtx.AvgTPOT, "avgPredictedTPOT", reqCtx.AvgPredictedTPOT) - metrics.RecordRequestTPOT(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.AvgTPOT/1000) - metrics.RecordRequestPredictedTPOT(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.AvgPredictedTPOT/1000) - if reqCtx.SchedulingRequest.AvgTPOTSLO > 0 { - metrics.RecordRequestTPOTWithSLO(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.AvgTPOT, reqCtx.SchedulingRequest.AvgTPOTSLO) - } - } - logger.V(logutil.DEBUG).Info("SLO Aware Routing Mode", "PredictorBasedScheduling", request.PredictorBasedScheduling) - - podName := types.NamespacedName{ - Name: targetPod.NamespacedName.Name, - Namespace: targetPod.NamespacedName.Namespace, - } - - if err := t.datastore.PodRemoveRequest(podName, 
request.Headers[requtil.RequestIdHeaderKey]); err != nil { - logger.V(logutil.DEBUG).Error(err, "SLORequestTracker: Failed to remove request from queue", "requestID", request.Headers[requtil.RequestIdHeaderKey]) - } -} - -func (t *SLORequestTracker) CheckPredictor(logger logr.Logger, targetPod *backend.Pod) bool { - if targetPod == nil { - logger.V(logutil.DEBUG).Info("SLORequestTracker: Skipping PostResponse because no target pod was provided.") - return false - } - if t.latencypredictor == nil { - logger.V(logutil.DEBUG).Info("SLORequestTracker: Skipping PostResponse because predictor missing") - return false - } - return true -} diff --git a/pkg/epp/requestcontrol/request_control_config.go b/pkg/epp/requestcontrol/request_control_config.go index 32b68a38b..ffa6c6609 100644 --- a/pkg/epp/requestcontrol/request_control_config.go +++ b/pkg/epp/requestcontrol/request_control_config.go @@ -23,19 +23,19 @@ import ( // NewConfig creates a new Config object and returns its pointer. func NewConfig() *Config { return &Config{ - preRequestPlugins: []PreRequest{}, - postResponsePlugins: []PostResponse{}, - postResponseChunkPlugins: []PostResponseChunk{}, - postResponseCompletePlugins: []PostResponseComplete{}, + preRequestPlugins: []PreRequest{}, + responseReceivedPlugins: []ResponseReceived{}, + responseStreamingPlugins: []ResponseStreaming{}, + responseCompletePlugins: []ResponseComplete{}, } } // Config provides a configuration for the requestcontrol plugins. type Config struct { - preRequestPlugins []PreRequest - postResponsePlugins []PostResponse - postResponseChunkPlugins []PostResponseChunk - postResponseCompletePlugins []PostResponseComplete + preRequestPlugins []PreRequest + responseReceivedPlugins []ResponseReceived + responseStreamingPlugins []ResponseStreaming + responseCompletePlugins []ResponseComplete } // WithPreRequestPlugins sets the given plugins as the PreRequest plugins. @@ -45,40 +45,44 @@ func (c *Config) WithPreRequestPlugins(plugins ...PreRequest) *Config { return c } -// WithPostResponsePlugins sets the given plugins as the PostResponse plugins. -// If the Config has PostResponse plugins already, this call replaces the existing plugins with the given ones. -func (c *Config) WithPostResponsePlugins(plugins ...PostResponse) *Config { - c.postResponsePlugins = plugins +// WithResponseReceivedPlugins sets the given plugins as the ResponseReceived plugins. +// If the Config has ResponseReceived plugins already, this call replaces the existing plugins with the given ones. +func (c *Config) WithResponseReceivedPlugins(plugins ...ResponseReceived) *Config { + c.responseReceivedPlugins = plugins return c } -// WithPostResponsePlugins sets the given plugins as the PostResponse plugins. -// If the Config has PostResponse plugins already, this call replaces the existing plugins with the given ones. -func (c *Config) WithPostResponseChunkPlugins(plugins ...PostResponseChunk) *Config { - c.postResponseChunkPlugins = plugins +// WithResponseStreamingPlugins sets the given plugins as the ResponseStreaming plugins. +// If the Config has ResponseStreaming plugins already, this call replaces the existing plugins with the given ones. +func (c *Config) WithResponseStreamingPlugins(plugins ...ResponseStreaming) *Config { + c.responseStreamingPlugins = plugins return c } -// WithPostResponseCompletePlugins sets the given plugins as the PostResponseComplete plugins. -// If the Config has PostResponseComplete plugins already, this call replaces the existing plugins with the given ones. 
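For orientation, a minimal sketch (not part of this diff) of a requestcontrol plugin written against the renamed response hooks; the `loggingPlugin` type and its behavior are hypothetical:

```go
package example

import (
	"context"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
	schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
)

// loggingPlugin is a hypothetical plugin that observes the full response lifecycle.
type loggingPlugin struct {
	tn plugins.TypedName
}

// Compile-time checks against the renamed interfaces.
var (
	_ requestcontrol.ResponseReceived  = &loggingPlugin{}
	_ requestcontrol.ResponseStreaming = &loggingPlugin{}
	_ requestcontrol.ResponseComplete  = &loggingPlugin{}
)

func (p *loggingPlugin) TypedName() plugins.TypedName { return p.tn }

// ResponseReceived fires once the model server has returned response headers.
func (p *loggingPlugin) ResponseReceived(_ context.Context, _ *schedulingtypes.LLMRequest, _ *requestcontrol.Response, _ *backend.Pod) {
}

// ResponseStreaming fires per streamed chunk, so it must stay cheap.
func (p *loggingPlugin) ResponseStreaming(_ context.Context, _ *schedulingtypes.LLMRequest, _ *requestcontrol.Response, _ *backend.Pod) {
}

// ResponseComplete fires after the final chunk; a natural place for
// end-to-end accounting keyed by the serving pod.
func (p *loggingPlugin) ResponseComplete(_ context.Context, _ *schedulingtypes.LLMRequest, _ *requestcontrol.Response, targetPod *backend.Pod) {
	_ = targetPod.NamespacedName.String()
}
```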
-func (c *Config) WithPostResponseCompletePlugins(plugins ...PostResponseComplete) *Config { - c.postResponseCompletePlugins = plugins +// WithResponseCompletePlugins sets the given plugins as the ResponseComplete plugins. +// If the Config has ResponseComplete plugins already, this call replaces the existing plugins with the given ones. +func (c *Config) WithResponseCompletePlugins(plugins ...ResponseComplete) *Config { + c.responseCompletePlugins = plugins return c } +// AddPlugins adds the given plugins to the Config. +// The type of each plugin is checked and added to the corresponding list of plugins in the Config. +// If a plugin implements multiple plugin interfaces, it will be added to each corresponding list. + func (c *Config) AddPlugins(pluginObjects ...plugins.Plugin) { for _, plugin := range pluginObjects { if preRequestPlugin, ok := plugin.(PreRequest); ok { c.preRequestPlugins = append(c.preRequestPlugins, preRequestPlugin) } - if postResponsePlugin, ok := plugin.(PostResponse); ok { - c.postResponsePlugins = append(c.postResponsePlugins, postResponsePlugin) + if responseReceivedPlugin, ok := plugin.(ResponseReceived); ok { + c.responseReceivedPlugins = append(c.responseReceivedPlugins, responseReceivedPlugin) } - if postResponseChunkPlugin, ok := plugin.(PostResponseChunk); ok { - c.postResponseChunkPlugins = append(c.postResponseChunkPlugins, postResponseChunkPlugin) + if responseStreamingPlugin, ok := plugin.(ResponseStreaming); ok { + c.responseStreamingPlugins = append(c.responseStreamingPlugins, responseStreamingPlugin) } - if postResponseCompletePlugin, ok := plugin.(PostResponseComplete); ok { - c.postResponseCompletePlugins = append(c.postResponseCompletePlugins, postResponseCompletePlugin) + if responseCompletePlugin, ok := plugin.(ResponseComplete); ok { + c.responseCompletePlugins = append(c.responseCompletePlugins, responseCompletePlugin) } } } diff --git a/pkg/epp/requestcontrol/types.go b/pkg/epp/requestcontrol/types.go index 8604e1dda..c881ed713 100644 --- a/pkg/epp/requestcontrol/types.go +++ b/pkg/epp/requestcontrol/types.go @@ -16,7 +16,7 @@ limitations under the License. package requestcontrol -// Response contains information from the response received to be passed to PostResponse plugins +// Response contains information from the response received to be passed to the Response requestcontrol plugins type Response struct { // RequestId is the Envoy generated Id for the request being processed RequestId string diff --git a/pkg/epp/saturationdetector/saturationdetector_test.go b/pkg/epp/saturationdetector/saturationdetector_test.go index 7d46143c3..0b861d90a 100644 --- a/pkg/epp/saturationdetector/saturationdetector_test.go +++ b/pkg/epp/saturationdetector/saturationdetector_test.go @@ -26,133 +26,19 @@ import ( "github.com/go-logr/logr" "github.com/google/go-cmp/cmp" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer" ) -// --- Mock Implementations --- - -type mockDatastore struct { - pods []backendmetrics.PodMetrics -} - -// PodGetAll returns all pod metrics from the fake datastore. 
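A short sketch of how registration behaves under the `AddPlugins` method shown above, assuming only the interfaces defined in this diff; `buildConfig` is a hypothetical helper:

```go
package example

import (
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
)

// buildConfig shows the registration path: AddPlugins type-asserts each
// object against PreRequest, ResponseReceived, ResponseStreaming and
// ResponseComplete, so one object implementing several of those interfaces
// is appended to every matching list.
func buildConfig(objs ...plugins.Plugin) *requestcontrol.Config {
	cfg := requestcontrol.NewConfig()
	cfg.AddPlugins(objs...)
	return cfg
}
```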
-func (fds *mockDatastore) PodGetAll() []backendmetrics.PodMetrics { - return fds.pods -} - -func (fds *mockDatastore) PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics { - res := []backendmetrics.PodMetrics{} - for _, pm := range fds.pods { - if predicate(pm) { - res = append(res, pm) - } - } - return res -} - -// Helper function to create a properly initialized fake pod metrics -func newMockPodMetrics(name string, metrics *backendmetrics.MetricsState) backendmetrics.PodMetrics { - // Create a proper k8s pod - k8sPod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: "ns1", - Labels: map[string]string{"app": "test"}, - }, - Status: corev1.PodStatus{ - PodIP: "192.168.1.1", +func newMockPodMetrics(name string, metrics *backendmetrics.MetricsState) *backendmetrics.FakePodMetrics { + return &backendmetrics.FakePodMetrics{ + Pod: &backend.Pod{ + NamespacedName: types.NamespacedName{Name: name, Namespace: "ns1"}, }, + Metrics: metrics, } - - // Use the proper constructor - fakePodMetrics := backendmetrics.NewFakePodMetrics(k8sPod) - - // Create a custom fake that can return the specified metrics - return &testPodMetrics{ - FakePodMetrics: fakePodMetrics, - customMetrics: metrics, - } -} - -// testPodMetrics wraps FakePodMetrics to allow custom metrics for testing -type testPodMetrics struct { - *backendmetrics.FakePodMetrics - customMetrics *backendmetrics.MetricsState -} - -// AddRequest implements metrics.PodMetrics. -// Subtle: this method shadows the method (*FakePodMetrics).AddRequest of testPodMetrics.FakePodMetrics. -func (t *testPodMetrics) AddRequest(requestID string, tpot float64) bool { - panic("unimplemented") -} - -// ContainsRequest implements metrics.PodMetrics. -// Subtle: this method shadows the method (*FakePodMetrics).ContainsRequest of testPodMetrics.FakePodMetrics. -func (t *testPodMetrics) ContainsRequest(requestID string) bool { - panic("unimplemented") -} - -// GetPod implements metrics.PodMetrics. -// Subtle: this method shadows the method (*FakePodMetrics).GetPod of testPodMetrics.FakePodMetrics. -func (t *testPodMetrics) GetPod() *backend.Pod { - return t.FakePodMetrics.GetPod() -} - -// GetRequestCount implements metrics.PodMetrics. -// Subtle: this method shadows the method (*FakePodMetrics).GetRequestCount of testPodMetrics.FakePodMetrics. -func (t *testPodMetrics) GetRequestCount() int { - panic("unimplemented") -} - -// GetRunningRequests implements metrics.PodMetrics. -// Subtle: this method shadows the method (*FakePodMetrics).GetRunningRequests of testPodMetrics.FakePodMetrics. -func (t *testPodMetrics) GetRunningRequests() *datalayer.RequestPriorityQueue { - panic("unimplemented") -} - -// PeekRequestPriorityQueue implements metrics.PodMetrics. -func (t *testPodMetrics) PeekRequestPriorityQueue() *datalayer.Request { - panic("unimplemented") -} - -// RemoveRequest implements metrics.PodMetrics. -// Subtle: this method shadows the method (*FakePodMetrics).RemoveRequest of testPodMetrics.FakePodMetrics. -func (t *testPodMetrics) RemoveRequest(requestID string) bool { - panic("unimplemented") -} - -// StopRefreshLoop implements metrics.PodMetrics. -// Subtle: this method shadows the method (*FakePodMetrics).StopRefreshLoop of testPodMetrics.FakePodMetrics. -func (t *testPodMetrics) StopRefreshLoop() { - panic("unimplemented") -} - -// String implements metrics.PodMetrics. -// Subtle: this method shadows the method (*FakePodMetrics).String of testPodMetrics.FakePodMetrics. 
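The table-driven cases that follow all exercise one capacity rule. A condensed restatement, written as if inside the saturationdetector package (the `Config` field names are the ones these tests reference; the detector's real implementation may differ in detail):

```go
// hasCapacity restates the per-pod rule: metrics must be fresh, and both
// load signals must sit at or below their thresholds. Equality counts as
// having capacity; staleness strictly beyond the threshold does not.
func hasCapacity(m *backendmetrics.MetricsState, now time.Time, cfg *Config) bool {
	if m == nil {
		return false // nil metrics never demonstrate capacity
	}
	return now.Sub(m.UpdateTime) <= cfg.MetricsStalenessThreshold &&
		m.WaitingQueueSize <= cfg.QueueDepthThreshold &&
		m.KVCacheUsagePercent <= cfg.KVCacheUtilThreshold
}

// The detector reports saturation exactly when no candidate pod passes
// hasCapacity, which is why an empty candidate list is saturated.
```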
-func (t *testPodMetrics) String() string { - panic("unimplemented") -} - -// UpdatePod implements metrics.PodMetrics. -// Subtle: this method shadows the method (*FakePodMetrics).UpdatePod of testPodMetrics.FakePodMetrics. -func (t *testPodMetrics) UpdatePod(*corev1.Pod) { - panic("unimplemented") -} - -// UpdateRequest implements metrics.PodMetrics. -// Subtle: this method shadows the method (*FakePodMetrics).UpdateRequest of testPodMetrics.FakePodMetrics. -func (t *testPodMetrics) UpdateRequest(requestID string, tpot float64) bool { - panic("unimplemented") -} - -// Override GetMetrics to return custom metrics for testing -func (t *testPodMetrics) GetMetrics() *backendmetrics.MetricsState { - return t.customMetrics // Return exactly what was passed, including nil } // --- Tests --- @@ -228,16 +114,16 @@ func TestDetector_IsSaturated(t *testing.T) { } tests := []struct { - name string - config *Config - pods []backendmetrics.PodMetrics - expectedSaturat bool + name string + config *Config + pods []backendmetrics.PodMetrics + expectedSaturation bool }{ { - name: "No pods in datastore", - config: defaultConfig, - pods: []backendmetrics.PodMetrics{}, - expectedSaturat: true, // No capacity = saturated + name: "No candidate pods", + config: defaultConfig, + pods: []backendmetrics.PodMetrics{}, + expectedSaturation: true, // No capacity = saturated }, { name: "Single pod with good capacity", @@ -247,11 +133,9 @@ func TestDetector_IsSaturated(t *testing.T) { UpdateTime: baseTime, WaitingQueueSize: 2, KVCacheUsagePercent: 0.5, - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), }, - expectedSaturat: false, + expectedSaturation: false, }, { name: "Single pod with stale metrics", @@ -261,11 +145,9 @@ func TestDetector_IsSaturated(t *testing.T) { UpdateTime: baseTime.Add(-200 * time.Millisecond), // Stale WaitingQueueSize: 1, KVCacheUsagePercent: 0.1, - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), }, - expectedSaturat: true, + expectedSaturation: true, }, { name: "Single pod with high queue depth", @@ -275,11 +157,9 @@ func TestDetector_IsSaturated(t *testing.T) { UpdateTime: baseTime, WaitingQueueSize: 10, // Exceeds threshold 5 KVCacheUsagePercent: 0.1, - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), }, - expectedSaturat: true, + expectedSaturation: true, }, { name: "Single pod with high KV cache utilization", @@ -289,11 +169,9 @@ func TestDetector_IsSaturated(t *testing.T) { UpdateTime: baseTime, WaitingQueueSize: 1, KVCacheUsagePercent: 0.95, // Exceeds threshold 0.90 - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), }, - expectedSaturat: true, + expectedSaturation: true, }, { name: "Single pod with nil metrics", @@ -301,7 +179,7 @@ func TestDetector_IsSaturated(t *testing.T) { pods: []backendmetrics.PodMetrics{ newMockPodMetrics("pod1", nil), }, - expectedSaturat: true, + expectedSaturation: true, }, { name: "Multiple pods, all good capacity", @@ -311,18 +189,14 @@ func TestDetector_IsSaturated(t *testing.T) { UpdateTime: baseTime, WaitingQueueSize: 1, KVCacheUsagePercent: 0.1, - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), newMockPodMetrics("pod2", &backendmetrics.MetricsState{ UpdateTime: baseTime.Add(-10 * time.Millisecond), WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), }, - expectedSaturat: false, + expectedSaturation: false, }, { name: "Multiple 
pods, one good, one bad (stale)", @@ -332,18 +206,14 @@ func TestDetector_IsSaturated(t *testing.T) { UpdateTime: baseTime, // Good WaitingQueueSize: 1, KVCacheUsagePercent: 0.1, - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), newMockPodMetrics("pod2", &backendmetrics.MetricsState{ UpdateTime: baseTime.Add(-300 * time.Millisecond), // Stale WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), }, - expectedSaturat: false, // One good pod is enough + expectedSaturation: false, // One good pod is enough }, { name: "Multiple pods, one good, one bad (high queue)", @@ -353,18 +223,14 @@ func TestDetector_IsSaturated(t *testing.T) { UpdateTime: baseTime, WaitingQueueSize: 1, KVCacheUsagePercent: 0.1, - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), newMockPodMetrics("pod2", &backendmetrics.MetricsState{ UpdateTime: baseTime, WaitingQueueSize: 15, // Bad queue KVCacheUsagePercent: 0.2, - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), }, - expectedSaturat: false, + expectedSaturation: false, }, { name: "Multiple pods, all bad capacity", @@ -374,25 +240,19 @@ func TestDetector_IsSaturated(t *testing.T) { UpdateTime: baseTime.Add(-200 * time.Millisecond), // Stale WaitingQueueSize: 1, KVCacheUsagePercent: 0.1, - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), newMockPodMetrics("pod2", &backendmetrics.MetricsState{ UpdateTime: baseTime, WaitingQueueSize: 20, // High queue KVCacheUsagePercent: 0.2, - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), newMockPodMetrics("pod3", &backendmetrics.MetricsState{ UpdateTime: baseTime, WaitingQueueSize: 1, KVCacheUsagePercent: 0.99, // High KV - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), }, - expectedSaturat: true, + expectedSaturation: true, }, { name: "Queue depth exactly at threshold", @@ -402,11 +262,9 @@ func TestDetector_IsSaturated(t *testing.T) { UpdateTime: baseTime, WaitingQueueSize: defaultConfig.QueueDepthThreshold, // Exactly at threshold (good) KVCacheUsagePercent: 0.1, - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), }, - expectedSaturat: false, + expectedSaturation: false, }, { name: "KV cache exactly at threshold", @@ -416,11 +274,9 @@ func TestDetector_IsSaturated(t *testing.T) { UpdateTime: baseTime, WaitingQueueSize: 1, KVCacheUsagePercent: defaultConfig.KVCacheUtilThreshold, // Exactly at threshold (good) - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), }, - expectedSaturat: false, + expectedSaturation: false, }, { name: "Metrics age just over staleness threshold", @@ -430,11 +286,9 @@ func TestDetector_IsSaturated(t *testing.T) { UpdateTime: baseTime.Add(-defaultConfig.MetricsStalenessThreshold - time.Nanosecond), // Just over (stale) WaitingQueueSize: 1, KVCacheUsagePercent: 0.1, - ActiveModels: make(map[string]int), - WaitingModels: make(map[string]int), }), }, - expectedSaturat: true, + expectedSaturation: true, }, } @@ -442,8 +296,8 @@ func TestDetector_IsSaturated(t *testing.T) { t.Run(test.name, func(t *testing.T) { detector := NewDetector(test.config, logr.Discard()) - if got := detector.IsSaturated(context.Background(), test.pods); got != test.expectedSaturat { - t.Errorf("IsSaturated() = %v, want %v", got, test.expectedSaturat) + if got := detector.IsSaturated(context.Background(), test.pods); got != 
test.expectedSaturation { + t.Errorf("IsSaturated() = %v, want %v", got, test.expectedSaturation) } }) } diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go index bd9e2c96e..8b68132dc 100644 --- a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go +++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go @@ -149,3 +149,35 @@ func (i *indexer) reportLRUSize(ctx context.Context, interval time.Duration) { i.mu.RUnlock() } } + +// RemovePod removes a pod and its associated entries from the indexer. +func (i *indexer) RemovePod(pod ServerID) { + i.mu.RLock() + lruCache, exists := i.podToLRU[pod] + i.mu.RUnlock() + + if !exists { + return + } + + // Remove all hashes associated with the pod from hashToPods (triggers eviction callbacks). + for _, hash := range lruCache.Keys() { + lruCache.Remove(hash) + } + + i.mu.Lock() + delete(i.podToLRU, pod) + i.mu.Unlock() +} + +// Pods returns the list of all pods currently tracked in the indexer. +func (i *indexer) Pods() []ServerID { + i.mu.RLock() + defer i.mu.RUnlock() + + pods := make([]ServerID, 0, len(i.podToLRU)) + for pod := range i.podToLRU { + pods = append(pods, pod) + } + return pods +} diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go index 6d4fcc5f4..c35af8e27 100644 --- a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go +++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go @@ -46,3 +46,63 @@ func TestIndexer_AddAndGet(t *testing.T) { servers = i.Get(BlockHash(4)) assert.Empty(t, servers, "Cache should not contain non-existent hash") } + +func TestIndexer_RemovePodAndEviction(t *testing.T) { + const indexerSize = 10 + + i := newIndexer(context.Background(), indexerSize) + + server1 := ServerID{Namespace: "default", Name: "server1"} + server2 := ServerID{Namespace: "default", Name: "server2"} + + // Add indexerSize hashes to both servers + var hashes []BlockHash + for j := 0; j < indexerSize; j++ { + h := BlockHash(j) + hashes = append(hashes, h) + i.Add([]BlockHash{h}, server1) + i.Add([]BlockHash{h}, server2) + } + + // Ensure all entries are added + assert.Equal(t, indexerSize, i.podToLRU[server1].Len(), "server1 should have 10 entries") + assert.Equal(t, indexerSize, i.podToLRU[server2].Len(), "server2 should have 10 entries") + + // Ensure each hash in hashToPods maps to both server1 and server2 + for _, h := range hashes { + pods := i.hashToPods[h] + assert.Len(t, pods, 2, "Each hash should be associated with exactly 2 pods") + assert.Contains(t, pods, server1, "hash should be associated with server1") + assert.Contains(t, pods, server2, "hash should be associated with server2") + } + + // Add indexerSize hash to server1 → should evict BlockHash(0) + evictedHash := BlockHash(0) + newHash := BlockHash(indexerSize) + i.Add([]BlockHash{newHash}, server1) + + // server1 LRU should still be at max capacity + assert.Equal(t, indexerSize, i.podToLRU[server1].Len(), "server1 LRU should maintain max size") + + // BlockHash(0) should no longer have server1 in hashToPods + pods := i.Get(evictedHash) + assert.NotContains(t, pods, server1, "server1 should be evicted from hashToPods for hash 0") + assert.Contains(t, pods, server2, "server2 should still have hash 0") + + // Remove server2 + i.RemovePod(server2) + + // hashToPods for hash 0 should now be empty + pods = i.Get(evictedHash) + assert.NotContains(t, pods, 
server2, "server2 should be removed from hash 0") + assert.Empty(t, pods, "hash 0 should have no pods after both eviction and removal") + + // All remaining hashes should map only to server1 + for hash, pods := range i.hashToPods { + assert.Len(t, pods, 1, "hash %v should have only 1 pod after server2 removal", hash) + assert.Contains(t, pods, server1, "hash %v should only contain server1", hash) + } + + // Ensure hashToPods contains exactly indexerSize hashes (post-eviction and server2 removal) + assert.Len(t, i.hashToPods, indexerSize, "hashToPods should contain %d hashes after cleanup", indexerSize) +} diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go index c87f8e8bf..d0986e25c 100644 --- a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go +++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go @@ -22,11 +22,13 @@ import ( "encoding/json" "fmt" "sync" + "time" "github.com/cespare/xxhash/v2" k8stypes "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/log" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol" @@ -37,7 +39,7 @@ import ( const ( // vLLM default token block size is 16, and a good guess of average characters per token is 4. - DefaultHashBlockSize = 64 + DefaultBlockSize = 64 // The maximum number of blocks to match. Two long requests with the same prefix up to this // limit will be indistinguishable. // This parameter provides a trade-off between cache size, prefix matching speed and matching @@ -57,16 +59,23 @@ const ( PrefixCachePluginType = "prefix-cache-scorer" ) +const ( + PodActiveCheckInterval = 2 * time.Minute + + // An estimated average characters per token, used since the request we cached is not tokenized. + averageCharactersPerToken = 4 +) + var DefaultConfig = Config{ - HashBlockSize: DefaultHashBlockSize, + DefaultBlockSize: DefaultBlockSize, MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks, LRUCapacityPerServer: DefaultLRUCapacityPerServer, } type Config struct { - // The input prompt is broken into sizes of HashBlockSize to calculate block hashes . Requests + // The input prompt is broken into sizes of BlockSize to calculate block hashes . Requests // with length shorter than the block size will be ignored. - HashBlockSize int `json:"hashBlockSize"` + DefaultBlockSize int `json:"blockSize"` // MaxPrefixBlocksToMatch is the maximum number of prefix blocks to match. Input beyond this limit will // be ignored. MaxPrefixBlocksToMatch int `json:"maxPrefixBlocksToMatch"` @@ -93,6 +102,8 @@ type podSet map[ServerID]struct{} type Indexer interface { Get(hash BlockHash) podSet Add(hashes []BlockHash, server ServerID) + RemovePod(server ServerID) + Pods() []ServerID } // BlockHash is a hash of the block of request body. @@ -130,13 +141,15 @@ func (s *SchedulingContextState) Clone() plugins.StateData { } // compile-time type assertion -var _ framework.Scorer = &Plugin{} -var _ requestcontrol.PreRequest = &Plugin{} +var ( + _ framework.Scorer = &Plugin{} + _ requestcontrol.PreRequest = &Plugin{} +) // PrefixCachePluginFactory defines the factory function for Prefix plugin. 
func PrefixCachePluginFactory(name string, rawParameters json.RawMessage, handle plugins.Handle) (plugins.Plugin, error) { parameters := Config{ - HashBlockSize: DefaultHashBlockSize, + DefaultBlockSize: DefaultBlockSize, MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks, LRUCapacityPerServer: DefaultLRUCapacityPerServer, } @@ -147,7 +160,9 @@ func PrefixCachePluginFactory(name string, rawParameters json.RawMessage, handle } } - return New(handle.Context(), parameters).WithName(name), nil + p := New(handle.Context(), parameters).WithName(name) + go p.CleanUpInactivePods(handle.Context(), handle) + return p, nil } // New initializes a new prefix Plugin and returns its pointer. @@ -182,9 +197,8 @@ func (p *Plugin) WithName(name string) *Plugin { // Score returns the scoring result for the given list of pods based on context. func (p *Plugin) Score(ctx context.Context, cycleState *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 { - // pre score step, hashing prompt and find longest prefix match. - hashes := hashPrompt(ctx, request, p.config.HashBlockSize, p.config.MaxPrefixBlocksToMatch) + hashes := hashPrompt(ctx, request, getBlockSize(pods, p.config.DefaultBlockSize), p.config.MaxPrefixBlocksToMatch) state := &SchedulingContextState{ PrefixHashes: hashes, PrefixCacheServers: p.matchLongestPrefix(ctx, hashes), @@ -212,7 +226,7 @@ func (p *Plugin) Score(ctx context.Context, cycleState *types.CycleState, reques } // PreRequest records in the plugin cache the result of the scheduling selection. -func (p *Plugin) PreRequest(ctx context.Context, request *types.LLMRequest, schedulingResult *types.SchedulingResult, _ int) { +func (p *Plugin) PreRequest(ctx context.Context, request *types.LLMRequest, schedulingResult *types.SchedulingResult) { primaryProfileResult := schedulingResult.ProfileResults[schedulingResult.PrimaryProfileName] targetPod := primaryProfileResult.TargetPods[0].GetPod() // get the first pod of the primary profile @@ -235,7 +249,9 @@ func (p *Plugin) PreRequest(ctx context.Context, request *types.LLMRequest, sche total := len(state.PrefixHashes) matchLen := state.PrefixCacheServers[ServerID(targetPod.NamespacedName)] - metrics.RecordPrefixCacheMatch(matchLen*p.config.HashBlockSize, total*p.config.HashBlockSize) + + blockSize := getBlockSize(primaryProfileResult.TargetPods, p.config.DefaultBlockSize) + metrics.RecordPrefixCacheMatch(matchLen*blockSize, total*blockSize) } // matchLongestPrefix returns a map of servers and length of prefix that each server caches. @@ -254,45 +270,81 @@ func (p *Plugin) matchLongestPrefix(ctx context.Context, hashes []BlockHash) map for server := range cachedServers { // Update servers with their longest prefix match. res[server]++ - } } } return res } +// CleanUpInactivePods starts a goroutine that watches for inactive pods. 
+func (m *Plugin) CleanUpInactivePods(ctx context.Context, handle plugins.Handle) { + logger := log.FromContext(ctx).V(logutil.VERBOSE) + ticker := time.NewTicker(PodActiveCheckInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + activePodMetrics := handle.PodList(func(_ backendmetrics.PodMetrics) bool { return true }) + activePods := make(map[ServerID]struct{}, len(activePodMetrics)) + for _, pm := range activePodMetrics { + activePods[ServerID(pm.GetPod().NamespacedName)] = struct{}{} + } + + for _, pod := range m.indexer.Pods() { + if _, ok := activePods[pod]; !ok { + m.indexer.RemovePod(pod) + logger.Info("Removed pod not in active set", "pod", pod) + } + } + } + } +} + // hashPrompt divides the prompt into blocks and calculate the prefix cache for each block. -// hash(0) is the hash of the model name, since different models generally don't share prefix cache. +// hash[0] is calculated including the model name and cache_salt(if provided), since different models generally don't share prefix cache. // For block i, hash(i) = hash(block i content, hash(i-1)). func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize int, maxPrefixBlocks int) []BlockHash { loggerDebug := log.FromContext(ctx).V(logutil.DEBUG) - prompt := []byte(request.Prompt) - if len(prompt) < cacheBlockSize { - loggerDebug.Info("Request body too small for prefix cache", "size", len(prompt), "block size", cacheBlockSize) + if request == nil || request.Body == nil { + loggerDebug.Info("Request or request data is nil, skipping hashing") return nil } - if len(prompt) > cacheBlockSize*maxPrefixBlocks { - loggerDebug.Info("Truncating input", "size", len(prompt), "max prefix blocks", maxPrefixBlocks, "block size", cacheBlockSize) - prompt = prompt[:maxPrefixBlocks*cacheBlockSize] + + userInput, err := getUserInputBytes(request) + if err != nil { + loggerDebug.Error(err, "Failed to get user input bytes") + return nil } - // Split the body into blocks of size cacheBlockSize. The +1 is to account for the model. + + if len(userInput) < cacheBlockSize { + loggerDebug.Info("Request body too small for prefix cache", "size", len(userInput), "block size", cacheBlockSize) + return nil + } + if len(userInput) > cacheBlockSize*maxPrefixBlocks { + loggerDebug.Info("Truncating input", "size", len(userInput), "max prefix blocks", maxPrefixBlocks, "block size", cacheBlockSize) + userInput = userInput[:maxPrefixBlocks*cacheBlockSize] + } + // Split the body into blocks of size cacheBlockSize. // If the last block is smaller than cacheBlockSize, it will be ignored. - res := make([]BlockHash, 0, 1+len(prompt)/cacheBlockSize) + res := make([]BlockHash, 0, len(userInput)/cacheBlockSize) // Add the model to the first block hash so that different models have different hashes even with the same body. - - firstBlockSize := cacheBlockSize - if len(prompt) < cacheBlockSize { - firstBlockSize = len(prompt) + h := xxhash.New() + _, _ = h.Write([]byte(request.TargetModel)) + if cacheSalt := request.Body.CacheSalt(); cacheSalt != "" { + _, _ = h.Write([]byte(cacheSalt)) } - firstBlock := prompt[0:firstBlockSize] - firstBlockWithModel := append([]byte(request.TargetModel), firstBlock...) - res = append(res, BlockHash(xxhash.Sum64(firstBlockWithModel))) - - for i := cacheBlockSize; i+cacheBlockSize <= len(prompt); i += cacheBlockSize { - block := prompt[i : i+cacheBlockSize] - prevBlockHash := res[len(res)-1] - block = append(block, toBytes(prevBlockHash)...) 
- res = append(res, BlockHash(xxhash.Sum64(block))) + + prevBlockHash := BlockHash(h.Sum64()) + for i := 0; i+cacheBlockSize <= len(userInput); i += cacheBlockSize { + h.Reset() + _, _ = h.Write(userInput[i : i+cacheBlockSize]) + _, _ = h.Write(toBytes(prevBlockHash)) + res = append(res, BlockHash(h.Sum64())) + + prevBlockHash = res[len(res)-1] } return res } @@ -302,3 +354,28 @@ func toBytes(i BlockHash) []byte { binary.LittleEndian.PutUint64(bytes, uint64(i)) return bytes } + +func getUserInputBytes(request *types.LLMRequest) ([]byte, error) { + if request.Body.Completions != nil { // assumed to be valid if not nil + return []byte(request.Body.Completions.Prompt), nil + } + + // must be chat-completions request at this point, return bytes of entire messages + return json.Marshal(request.Body.ChatCompletions.Messages) +} + +func getBlockSize(pods []types.Pod, defaultBlockSize int) int { + if len(pods) == 0 { + return defaultBlockSize + } + + // Since all PODs originate from the same inference pool, they are considered to have identical configurations. + // Therefore, using the CacheBlockSize value from the first POD suffices. + if pod := pods[0]; pod.GetMetrics() != nil { + cacheBlockSize := pod.GetMetrics().CacheBlockSize + if cacheBlockSize > 0 { + return cacheBlockSize * averageCharactersPerToken + } + } + return defaultBlockSize +} diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go index 3fbac2ce1..59a09db52 100644 --- a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go +++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go @@ -19,7 +19,6 @@ package prefix import ( "context" "fmt" - "math" "math/rand" "strings" "testing" @@ -29,28 +28,32 @@ import ( k8stypes "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" ) -func TestPrefixPlugin(t *testing.T) { - +func TestPrefixPluginCompletion(t *testing.T) { config := Config{ - HashBlockSize: 4, + DefaultBlockSize: 4, MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks, LRUCapacityPerServer: DefaultLRUCapacityPerServer, } plugin := New(context.Background(), config) - pod1 := &types.PodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}} - pod2 := &types.PodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}} + pod1 := &types.PodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, MetricsState: backendmetrics.NewMetricsState()} + pod2 := &types.PodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, MetricsState: backendmetrics.NewMetricsState()} pods := []types.Pod{pod1, pod2} // First request. 
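The rewritten `hashPrompt` above chains block hashes from a model-and-salt seed. A self-contained restatement of just that scheme (it omits the plugin's truncation and chat-completions handling):

```go
package main

import (
	"encoding/binary"
	"fmt"

	"github.com/cespare/xxhash/v2"
)

// chainHashes mirrors the scheme: the seed hash mixes the target model and
// optional cache salt, and each block is hashed together with the previous
// hash, so two requests share hash i only if they share the model, the salt
// and the entire prefix up to block i.
func chainHashes(model, cacheSalt string, input []byte, blockSize int) []uint64 {
	h := xxhash.New()
	_, _ = h.Write([]byte(model))
	if cacheSalt != "" {
		_, _ = h.Write([]byte(cacheSalt))
	}
	prev := h.Sum64()

	var res []uint64
	for i := 0; i+blockSize <= len(input); i += blockSize {
		h.Reset()
		_, _ = h.Write(input[i : i+blockSize])
		var b [8]byte
		binary.LittleEndian.PutUint64(b[:], prev)
		_, _ = h.Write(b[:])
		prev = h.Sum64()
		res = append(res, prev)
	}
	return res
}

func main() {
	a := chainHashes("m", "", []byte("aaaabbbb"), 4)
	b := chainHashes("m", "", []byte("aaaacccc"), 4)
	fmt.Println(a[0] == b[0], a[1] == b[1]) // true false: only the first block is shared
}
```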
req1 := &types.LLMRequest{ RequestId: uuid.NewString(), TargetModel: "test-model1", - Prompt: "aaaaaa", + Body: &types.LLMRequestBody{ + Completions: &types.CompletionsRequest{ + Prompt: "aaaaaa", + }, + }, } scores := plugin.Score(context.Background(), types.NewCycleState(), req1, pods) state, err := plugins.ReadPluginStateKey[*SchedulingContextState](plugin.pluginState, req1.RequestId, plugins.StateKey(plugin.TypedName().String())) @@ -70,7 +73,7 @@ func TestPrefixPlugin(t *testing.T) { "default": {TargetPods: []types.Pod{pod1}}, }, } - plugin.PreRequest(context.Background(), req1, schedulingResult, 0) + plugin.PreRequest(context.Background(), req1, schedulingResult) plugin.wg.Wait() // Second request doesn't share any prefix with first one. It should be added to the cache but @@ -78,7 +81,11 @@ func TestPrefixPlugin(t *testing.T) { req2 := &types.LLMRequest{ RequestId: uuid.NewString(), TargetModel: "test-model2", - Prompt: "bbbbbb", + Body: &types.LLMRequestBody{ + Completions: &types.CompletionsRequest{ + Prompt: "bbbbbb", + }, + }, } scores = plugin.Score(context.Background(), types.NewCycleState(), req2, pods) state, err = plugins.ReadPluginStateKey[*SchedulingContextState](plugin.pluginState, req2.RequestId, plugins.StateKey(plugin.TypedName().String())) @@ -98,14 +105,18 @@ func TestPrefixPlugin(t *testing.T) { "default": {TargetPods: []types.Pod{pod2}}, }, } - plugin.PreRequest(context.Background(), req2, schedulingResult, 0) + plugin.PreRequest(context.Background(), req2, schedulingResult) plugin.wg.Wait() // Third request shares partial prefix with first one. req3 := &types.LLMRequest{ RequestId: uuid.NewString(), TargetModel: "test-model1", - Prompt: "aaaabbbb", + Body: &types.LLMRequestBody{ + Completions: &types.CompletionsRequest{ + Prompt: "aaaabbbb", + }, + }, } scores = plugin.Score(context.Background(), types.NewCycleState(), req3, pods) state, err = plugins.ReadPluginStateKey[*SchedulingContextState](plugin.pluginState, req3.RequestId, plugins.StateKey(plugin.TypedName().String())) @@ -124,14 +135,18 @@ func TestPrefixPlugin(t *testing.T) { "default": {TargetPods: []types.Pod{pod1}}, }, } - plugin.PreRequest(context.Background(), req3, schedulingResult, 0) + plugin.PreRequest(context.Background(), req3, schedulingResult) plugin.wg.Wait() // 4th request is same as req3 except the model is different, still no match. req4 := &types.LLMRequest{ RequestId: uuid.NewString(), TargetModel: "test-model-new", - Prompt: "aaaabbbb", + Body: &types.LLMRequestBody{ + Completions: &types.CompletionsRequest{ + Prompt: "aaaabbbb", + }, + }, } scores = plugin.Score(context.Background(), types.NewCycleState(), req4, pods) state, err = plugins.ReadPluginStateKey[*SchedulingContextState](plugin.pluginState, req4.RequestId, plugins.StateKey(plugin.TypedName().String())) @@ -150,14 +165,18 @@ func TestPrefixPlugin(t *testing.T) { "default": {TargetPods: []types.Pod{pod1}}, }, } - plugin.PreRequest(context.Background(), req4, schedulingResult, 0) + plugin.PreRequest(context.Background(), req4, schedulingResult) plugin.wg.Wait() // 5th request shares partial prefix with 3rd one. 
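For reference while reading the score assertions in these tests, a pod's score is simply the matched-block fraction; a minimal restatement:

```go
package main

import "fmt"

// prefixScore restates the scoring math the tests rely on: a pod's score is
// the fraction of the request's prefix blocks it is believed to cache.
func prefixScore(matchedBlocks, totalBlocks int) float64 {
	if totalBlocks == 0 {
		return 0
	}
	return float64(matchedBlocks) / float64(totalBlocks)
}

func main() {
	fmt.Println(prefixScore(2, 3)) // e.g. 2 of 3 blocks cached → 0.666…
}
```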
req5 := &types.LLMRequest{ RequestId: uuid.NewString(), TargetModel: "test-model1", - Prompt: "aaaabbbbcccc", + Body: &types.LLMRequestBody{ + Completions: &types.CompletionsRequest{ + Prompt: "aaaabbbbcccc", + }, + }, } scores = plugin.Score(context.Background(), types.NewCycleState(), req5, pods) state, err = plugins.ReadPluginStateKey[*SchedulingContextState](plugin.pluginState, req5.RequestId, plugins.StateKey(plugin.TypedName().String())) @@ -176,8 +195,153 @@ func TestPrefixPlugin(t *testing.T) { "default": {TargetPods: []types.Pod{pod1}}, }, } - plugin.PreRequest(context.Background(), req5, schedulingResult, 0) + plugin.PreRequest(context.Background(), req5, schedulingResult) + plugin.wg.Wait() +} + +func TestPrefixPluginChatCompletions(t *testing.T) { + config := Config{ + DefaultBlockSize: 4, + MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks, + LRUCapacityPerServer: DefaultLRUCapacityPerServer, + } + plugin := New(context.Background(), config) + + pod1 := &types.PodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, MetricsState: &backendmetrics.MetricsState{}} + pods := []types.Pod{pod1} + + // Test with chat completions request + req1 := &types.LLMRequest{ + RequestId: uuid.NewString(), + TargetModel: "test-model1", + Body: &types.LLMRequestBody{ + ChatCompletions: &types.ChatCompletionsRequest{ + Messages: []types.Message{ + {Role: "user", Content: types.Content{Raw: "hello world"}}, + {Role: "assistant", Content: types.Content{Raw: "hi there"}}, + }, + }, + }, + } + scores := plugin.Score(context.Background(), types.NewCycleState(), req1, pods) + state, err := plugins.ReadPluginStateKey[*SchedulingContextState](plugin.pluginState, req1.RequestId, plugins.StateKey(plugin.TypedName().String())) + assert.NoError(t, err) + t.Logf("Chat completions - Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers) + // Should have some hashes for the JSON-encoded messages + assert.Greater(t, len(state.PrefixHashes), 1, "should have hashes for chat completions") + assert.Equal(t, 0, len(state.PrefixCacheServers), "there shouldn't be any cached servers initially") + assert.Equal(t, float64(0), scores[pod1], "score for pod1") +} + +func TestPrefixPluginChatCompletionsGrowth(t *testing.T) { + config := Config{ + DefaultBlockSize: 8, // Use larger block size for more predictable JSON marshaling + MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks, + LRUCapacityPerServer: DefaultLRUCapacityPerServer, + } + plugin := New(context.Background(), config) + + pod1 := &types.PodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, MetricsState: &backendmetrics.MetricsState{}} + pod2 := &types.PodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, MetricsState: &backendmetrics.MetricsState{}} + pods := []types.Pod{pod1, pod2} + + // First request with initial conversation + req1 := &types.LLMRequest{ + RequestId: uuid.NewString(), + TargetModel: "test-model1", + Body: &types.LLMRequestBody{ + ChatCompletions: &types.ChatCompletionsRequest{ + Messages: []types.Message{ + {Role: "system", Content: types.Content{Raw: "You are a helpful assistant"}}, + {Role: "user", Content: types.Content{Raw: "Hello, how are you?"}}, + }, + }, + }, + } + scores := plugin.Score(context.Background(), types.NewCycleState(), req1, pods) + state, err := plugins.ReadPluginStateKey[*SchedulingContextState](plugin.pluginState, req1.RequestId, plugins.StateKey(plugin.TypedName().String())) + assert.NoError(t, err) + 
t.Logf("Initial conversation - Hashes %+v, cached servers: %+v", len(state.PrefixHashes), state.PrefixCacheServers) + initialHashCount := len(state.PrefixHashes) + assert.Greater(t, initialHashCount, 1, "should have hashes for chat completions") + assert.Equal(t, 0, len(state.PrefixCacheServers), "there shouldn't be any cached servers initially") + assert.Equal(t, float64(0), scores[pod1], "score for pod1") + assert.Equal(t, float64(0), scores[pod2], "score for pod2") + + // Simulate pod1 was picked + schedulingResult := &types.SchedulingResult{ + PrimaryProfileName: "default", + ProfileResults: map[string]*types.ProfileRunResult{ + "default": {TargetPods: []types.Pod{pod1}}, + }, + } + plugin.PreRequest(context.Background(), req1, schedulingResult) + plugin.wg.Wait() + + // Second request adds assistant response and new user message (conversation grows) + req2 := &types.LLMRequest{ + RequestId: uuid.NewString(), + TargetModel: "test-model1", + Body: &types.LLMRequestBody{ + ChatCompletions: &types.ChatCompletionsRequest{ + Messages: []types.Message{ + {Role: "system", Content: types.Content{Raw: "You are a helpful assistant"}}, + {Role: "user", Content: types.Content{Raw: "Hello, how are you?"}}, + {Role: "assistant", Content: types.Content{Raw: "I'm doing well, thank you! How can I help you today?"}}, + {Role: "user", Content: types.Content{Raw: "Can you explain how prefix caching works?"}}, + }, + }, + }, + } + scores = plugin.Score(context.Background(), types.NewCycleState(), req2, pods) + state, err = plugins.ReadPluginStateKey[*SchedulingContextState](plugin.pluginState, req2.RequestId, plugins.StateKey(plugin.TypedName().String())) + assert.NoError(t, err) + t.Logf("Extended conversation - Hashes %+v, cached servers: %+v", len(state.PrefixHashes), state.PrefixCacheServers) + extendedHashCount := len(state.PrefixHashes) + assert.Greater(t, extendedHashCount, initialHashCount, "extended conversation should have more hashes") + assert.Greater(t, len(state.PrefixCacheServers), 0, "should have cached servers from prefix match") + + // Calculate expected score - pod1 should have cached the initial prefix + cachedBlocks := state.PrefixCacheServers[ServerID(pod1.GetPod().NamespacedName)] + expectedScore := float64(cachedBlocks) / float64(extendedHashCount) + assert.Equal(t, expectedScore, scores[pod1], "pod1 should have prefix cache hit") + assert.Equal(t, float64(0), scores[pod2], "pod2 should have no cache hit") + + // Simulate pod1 was picked again + plugin.PreRequest(context.Background(), req2, schedulingResult) plugin.wg.Wait() + + // Third request continues the conversation even further + req3 := &types.LLMRequest{ + RequestId: uuid.NewString(), + TargetModel: "test-model1", + Body: &types.LLMRequestBody{ + ChatCompletions: &types.ChatCompletionsRequest{ + Messages: []types.Message{ + {Role: "system", Content: types.Content{Raw: "You are a helpful assistant"}}, + {Role: "user", Content: types.Content{Raw: "Hello, how are you?"}}, + {Role: "assistant", Content: types.Content{Raw: "I'm doing well, thank you! 
How can I help you today?"}}, + {Role: "user", Content: types.Content{Raw: "Can you explain how prefix caching works?"}}, + {Role: "assistant", Content: types.Content{Raw: "Prefix caching is a technique where..."}}, + {Role: "user", Content: types.Content{Raw: "That's very helpful, thank you!"}}, + }, + }, + }, + } + scores = plugin.Score(context.Background(), types.NewCycleState(), req3, pods) + state, err = plugins.ReadPluginStateKey[*SchedulingContextState](plugin.pluginState, req3.RequestId, plugins.StateKey(plugin.TypedName().String())) + assert.NoError(t, err) + t.Logf("Long conversation - Hashes %+v, cached servers: %+v", len(state.PrefixHashes), state.PrefixCacheServers) + longHashCount := len(state.PrefixHashes) + assert.Greater(t, longHashCount, extendedHashCount, "long conversation should have even more hashes") + assert.Greater(t, len(state.PrefixCacheServers), 0, "should have cached servers from prefix match") + + // pod1 should have an even higher cache hit rate now + cachedBlocks = state.PrefixCacheServers[ServerID(pod1.GetPod().NamespacedName)] + expectedScore = float64(cachedBlocks) / float64(longHashCount) + assert.Equal(t, expectedScore, scores[pod1], "pod1 should have higher prefix cache hit") + assert.Greater(t, scores[pod1], float64(0.5), "cache hit rate should be substantial for growing conversation") + assert.Equal(t, float64(0), scores[pod2], "pod2 should still have no cache hit") } // TestPrefixPluginStress is a stress test for the prefix scoring plugin, using prompts of increasing length. @@ -185,7 +349,7 @@ func BenchmarkPrefixPluginStress(b *testing.B) { blockSize := 4 maxPrefixBlocks := 50000 config := Config{ - HashBlockSize: blockSize, + DefaultBlockSize: blockSize, MaxPrefixBlocksToMatch: maxPrefixBlocks, LRUCapacityPerServer: DefaultLRUCapacityPerServer, } @@ -193,45 +357,44 @@ func BenchmarkPrefixPluginStress(b *testing.B) { plugin := New(context.Background(), config) types.NewCycleState() var promptLen []int - for i := 1; i <= 1024; i++ { + for i := 1; i <= 1024; { promptLen = append(promptLen, i) + i += 10 } promptLen = append(promptLen, 2048, 4096, 8192, 10000, 20000, 50000) - for _, i := range promptLen { - // Generate increasing-length random prompts - prompt := randomPrompt(4 + i) - pod := &types.PodMetrics{ - Pod: &backend.Pod{ - NamespacedName: k8stypes.NamespacedName{ - Name: fmt.Sprintf("random-pod-%d", i), + for i, v := range promptLen { + b.Run(fmt.Sprintf("messages_%d_length_%d", i, v), func(b *testing.B) { + // Generate increasing-length random prompts + prompt := randomPrompt(4 + v) + pod := &types.PodMetrics{ + Pod: &backend.Pod{ + NamespacedName: k8stypes.NamespacedName{ + Name: fmt.Sprintf("random-pod-%d", v), + }, }, - }, - } - - pods := []types.Pod{pod} - req := &types.LLMRequest{ - RequestId: uuid.NewString(), - TargetModel: "model-stress", - Prompt: prompt, - } - - // First cycle: simulate scheduling and insert prefix info into the cache - plugin.Score(context.Background(), types.NewCycleState(), req, pods) - schedulingResult := &types.SchedulingResult{ - PrimaryProfileName: "default", - ProfileResults: map[string]*types.ProfileRunResult{ - "default": {TargetPods: []types.Pod{pod}}, - }, - } - plugin.PreRequest(context.Background(), req, schedulingResult, 0) - plugin.wg.Wait() + } + + pods := []types.Pod{pod} + req := &types.LLMRequest{ + RequestId: uuid.NewString(), + TargetModel: "model-stress", + Body: &types.LLMRequestBody{ + Completions: &types.CompletionsRequest{ + Prompt: prompt, + }, + }, + } + + b.ResetTimer() + // Benchmark 
the scoring operation + scores := plugin.Score(context.Background(), types.NewCycleState(), req, pods) + _ = scores // Use the result to prevent optimization + + // Clean up state for next iteration + plugin.pluginState.Delete(req.RequestId) + }) - // Second cycle: validate internal state - state, err := plugins.ReadPluginStateKey[*SchedulingContextState](plugin.pluginState, req.RequestId, plugins.StateKey(plugin.TypedName().String())) - assert.NoError(b, err) - expectedHashes := int(math.Min(float64(maxPrefixBlocks), float64(len(req.Prompt)/blockSize))) - assert.Equal(b, expectedHashes, len(state.PrefixHashes), "number of hashes is incorrect") } } @@ -244,3 +407,75 @@ func randomPrompt(n int) string { } return sb.String() } + +// BenchmarkPrefixPluginChatCompletionsStress is a stress test for chat completions with varying message counts and lengths +func BenchmarkPrefixPluginChatCompletionsStress(b *testing.B) { + blockSize := 8 + maxPrefixBlocks := 50000 + config := Config{ + DefaultBlockSize: blockSize, + MaxPrefixBlocksToMatch: maxPrefixBlocks, + LRUCapacityPerServer: DefaultLRUCapacityPerServer, + } + plugin := New(context.Background(), config) + + // Test scenarios: varying number of messages and message lengths + scenarios := []struct { + messageCount int + messageLength int + }{ + {2, 50}, // Short conversation, short messages + {2, 500}, // Short conversation, long messages + {5, 100}, // Medium conversation, medium messages + {10, 200}, // Long conversation, medium messages + {20, 100}, // Very long conversation, medium messages + {50, 50}, // Very long conversation, short messages + {100, 25}, // Extremely long conversation, very short messages + } + + for _, scenario := range scenarios { + b.Run(fmt.Sprintf("messages_%d_length_%d", scenario.messageCount, scenario.messageLength), func(b *testing.B) { + // Generate messages for this scenario + messages := make([]types.Message, scenario.messageCount) + messages[0] = types.Message{Role: "system", Content: types.Content{Raw: "You are a helpful assistant."}} + + for i := 1; i < scenario.messageCount; i++ { + role := "user" + if i%2 == 0 { + role = "assistant" + } + content := randomPrompt(scenario.messageLength) + messages[i] = types.Message{Role: role, Content: types.Content{Raw: content}} + } + + pod := &types.PodMetrics{ + Pod: &backend.Pod{ + NamespacedName: k8stypes.NamespacedName{ + Name: fmt.Sprintf("chat-pod-%d-%d", scenario.messageCount, scenario.messageLength), + }, + }, + } + pods := []types.Pod{pod} + + req := &types.LLMRequest{ + RequestId: uuid.NewString(), + TargetModel: "chat-model-stress", + Body: &types.LLMRequestBody{ + ChatCompletions: &types.ChatCompletionsRequest{ + Messages: messages, + }, + }, + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + // Benchmark the scoring operation + scores := plugin.Score(context.Background(), types.NewCycleState(), req, pods) + _ = scores // Use the result to prevent optimization + + // Clean up state for next iteration + plugin.pluginState.Delete(req.RequestId) + } + }) + } +} diff --git a/pkg/epp/requestcontrol/latencypredictor_helper.go b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/latencypredictor_helper.go similarity index 78% rename from pkg/epp/requestcontrol/latencypredictor_helper.go rename to pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/latencypredictor_helper.go index 8cd840391..ed86e5aa4 100644 --- a/pkg/epp/requestcontrol/latencypredictor_helper.go +++ 
b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/latencypredictor_helper.go @@ -11,7 +11,7 @@ distributed under the License. */ // Package requestcontrol contains helpers to decouple latency-predictor logic. -package requestcontrol +package slo_aware_router import ( "context" @@ -24,19 +24,24 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" latencypredictor "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/latencypredictorasync" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" requtil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/request" ) -// RefreshLastSeenMetrics updates reqCtx.LastSeenMetrics from the latest scheduling result. -func RefreshLastSeenMetrics(ctx context.Context, reqCtx *handlers.RequestContext) { - if sr := reqCtx.SchedulingResult; sr != nil { +const ( + // Poisson sampling parameters for predictions + defaultSamplingMean = 100 // Mean interval between prediction samples (tokens) + maxSampledTokens = 20 // Maximum number of prediction samples per request +) + +// RefreshLastSeenMetrics updates sloCtx.LastSeenMetrics from the latest scheduling result. +func RefreshLastSeenMetrics(ctx context.Context, sloCtx *SLORequestContext) { + if sr := sloCtx.SchedulingResult; sr != nil { if pr := sr.ProfileResults[sr.PrimaryProfileName]; pr != nil && pr.TargetPods != nil { for profileName, profileResult := range sr.ProfileResults { if profileResult != nil && profileResult.TargetPods != nil && len(profileResult.TargetPods) > 0 { - reqCtx.LastSeenMetrics[profileName] = profileResult.TargetPods[0].GetMetrics().Clone() + sloCtx.LastSeenMetrics[profileName] = profileResult.TargetPods[0].GetMetrics().Clone() } } } @@ -99,32 +104,32 @@ func GetTargetPodForProfile( return targetPod } -// GetMetricsForPrediction retrieves the latest metrics for prediction from reqCtx.LastSeenMetrics. -func GetLatestMetricsForProfile(ctx context.Context, reqCtx *handlers.RequestContext, profileName string) (*backendmetrics.MetricsState, error) { - if len(reqCtx.LastSeenMetrics) == 0 { +// GetLatestMetricsForProfile retrieves the latest metrics for prediction from sloCtx.LastSeenMetrics. +func GetLatestMetricsForProfile(ctx context.Context, sloCtx *SLORequestContext, profileName string) (*backendmetrics.MetricsState, error) { + if len(sloCtx.LastSeenMetrics) == 0 { return nil, fmt.Errorf("no last seen metrics available for prediction") } // Use the primary profile's metrics for prediction - if metrics, exists := reqCtx.LastSeenMetrics[profileName]; exists { + if metrics, exists := sloCtx.LastSeenMetrics[profileName]; exists { return metrics, nil } log.FromContext(ctx).V(logutil.DEBUG).Info("No metrics found for profile, trying primary profile", "profile_name", profileName) - primaryProfileName := reqCtx.SchedulingResult.PrimaryProfileName - if metrics, exists := reqCtx.LastSeenMetrics[primaryProfileName]; exists { + primaryProfileName := sloCtx.SchedulingResult.PrimaryProfileName + if metrics, exists := sloCtx.LastSeenMetrics[primaryProfileName]; exists { return metrics, nil } return nil, fmt.Errorf("no metrics found for primary profile %s", primaryProfileName) } -// ProcessHeader refreshes metrics, applies TTFT prediction, updates reqCtx.PredictedTTFT and timestamp. +// ProcessHeaderForLatencyPrediction refreshes metrics, applies TTFT prediction, updates sloCtx.PredictedTTFT and timestamp.
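+// Call-order sketch (inferred from requestcontrol_hooks.go in this package, not a normative API): +// ResponseReceived -> ProcessHeaderForLatencyPrediction(ctx, predictor, sloCtx) +// first streamed token -> ProcessFirstTokenForLatencyPrediction(ctx, predictor, sloCtx, now) +// later streamed tokens -> ProcessTokenForLatencyPrediction(ctx, predictor, sloCtx, now)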
func ProcessHeaderForLatencyPrediction( ctx context.Context, predictor latencypredictor.PredictorInterface, - reqCtx *handlers.RequestContext, + sloCtx *SLORequestContext, ) error { logger := log.FromContext(ctx) @@ -133,18 +138,18 @@ func ProcessHeaderForLatencyPrediction( // Build prediction request //check if prefill profile name is set, if not use primary profile name - m, err := GetLatestMetricsForProfile(ctx, reqCtx, "prefill") + m, err := GetLatestMetricsForProfile(ctx, sloCtx, "prefill") if err != nil { logger.V(logutil.DEBUG).Info("Skipping prediction due to missing metrics", "error", err) return err } - targetPod := GetTargetPodForProfile(ctx, reqCtx.SchedulingResult, "prefill") - prefix_cache_score := GetPrefixCacheScoreForPod(ctx, reqCtx.SchedulingResult, targetPod, "prefill") + targetPod := GetTargetPodForProfile(ctx, sloCtx.SchedulingResult, "prefill") + prefix_cache_score := GetPrefixCacheScoreForPod(ctx, sloCtx.SchedulingResult, targetPod, "prefill") in := latencypredictor.PredictionRequest{ KVCachePercentage: m.KVCacheUsagePercent, - InputTokenLength: len(strings.Fields(reqCtx.SchedulingRequest.Prompt)), + InputTokenLength: len(strings.Fields(sloCtx.SchedulingRequest.Body.Completions.Prompt)), NumRequestWaiting: m.WaitingQueueSize, NumRequestRunning: m.RunningQueueSize, NumTokensGenerated: 0, @@ -157,55 +162,55 @@ func ProcessHeaderForLatencyPrediction( dur := time.Since(start) if err != nil { logger.V(logutil.DEBUG).Error(err, "header TTFT predict failed", "duration_ms", dur.Milliseconds()) - reqCtx.PredictedTTFT = 0 + sloCtx.PredictedTTFT = 0 } else if p == nil { logger.V(logutil.DEBUG).Info("header TTFT predict nil", "duration_ms", dur.Milliseconds()) - reqCtx.PredictedTTFT = 0 + sloCtx.PredictedTTFT = 0 } else { logger.V(logutil.DEBUG).Info("header TTFT succeeded", "value_ms", p.TTFT, "duration_ms", dur.Milliseconds()) - metrics.RecordRequestTTFTPredictionDuration(ctx, reqCtx.TargetModelName, reqCtx.IncomingModelName, dur.Seconds()) + metrics.RecordRequestTTFTPredictionDuration(ctx, sloCtx.SchedulingRequest.TargetModel, sloCtx.IncomingModelName, dur.Seconds()) - reqCtx.PredictedTTFT = p.TTFT + sloCtx.PredictedTTFT = p.TTFT } // Advance timestamp for first token reference - reqCtx.LastTokenTimestamp = time.Now() - RefreshLastSeenMetrics(ctx, reqCtx) + sloCtx.LastTokenTimestamp = time.Now() + RefreshLastSeenMetrics(ctx, sloCtx) return err } -// ProcessFirstToken records actual TTFT, trains, predicts first TPOT, updates reqCtx, and advances timestamp. +// ProcessFirstToken records actual TTFT, trains, predicts first TPOT, updates sloCtx, and advances timestamp. 
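+// The observed and predicted TPOT averages below are maintained incrementally via +// calculateRunningAverage (avg' = avg + (x - avg)/n, defined at the end of this file), +// so no per-token history rescan is needed.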
func ProcessFirstTokenForLatencyPrediction( ctx context.Context, predictor latencypredictor.PredictorInterface, - reqCtx *handlers.RequestContext, + sloCtx *SLORequestContext, now time.Time, ) { logger := log.FromContext(ctx) // Initialize sampler - if reqCtx.TokenSampler == nil { - requestID := reqCtx.Request.Headers[requtil.RequestIdHeaderKey] - reqCtx.TokenSampler = requtil.NewTokenSampler(requestID, defaultSamplingMean, maxSampledTokens) - logger.V(logutil.DEBUG).Info("Initialized token sampler for first token", "request_id", requestID, "next_prediction_token", reqCtx.TokenSampler.GetNextSampleToken()) + if sloCtx.TokenSampler == nil { + requestID := sloCtx.SchedulingRequest.Headers[requtil.RequestIdHeaderKey] + sloCtx.TokenSampler = requtil.NewTokenSampler(requestID, defaultSamplingMean, maxSampledTokens) + logger.V(logutil.DEBUG).Info("Initialized token sampler for first token", "request_id", requestID, "next_prediction_token", sloCtx.TokenSampler.GetNextSampleToken()) } // Actual TTFT - reqCtx.TTFT = float64(now.Sub(reqCtx.RequestReceivedTimestamp).Milliseconds()) - reqCtx.GeneratedTokenCount = 1 - m, err := GetLatestMetricsForProfile(ctx, reqCtx, "prefill") + sloCtx.TTFT = float64(now.Sub(sloCtx.RequestReceivedTimestamp).Milliseconds()) + sloCtx.GeneratedTokenCount = 1 + m, err := GetLatestMetricsForProfile(ctx, sloCtx, "prefill") if err != nil { logger.V(logutil.DEBUG).Info("Skipping prediction due to missing metrics", "error", err) return } - targetPod := GetTargetPodForProfile(ctx, reqCtx.SchedulingResult, "prefill") - prefix_cache_score := GetPrefixCacheScoreForPod(ctx, reqCtx.SchedulingResult, targetPod, "prefill") + targetPod := GetTargetPodForProfile(ctx, sloCtx.SchedulingResult, "prefill") + prefix_cache_score := GetPrefixCacheScoreForPod(ctx, sloCtx.SchedulingResult, targetPod, "prefill") // Train TTFT entry := latencypredictor.TrainingEntry{ KVCachePercentage: m.KVCacheUsagePercent, - InputTokenLength: len(strings.Fields(reqCtx.SchedulingRequest.Prompt)), - ActualTTFT: reqCtx.TTFT, + InputTokenLength: len(strings.Fields(sloCtx.SchedulingRequest.Body.Completions.Prompt)), + ActualTTFT: sloCtx.TTFT, ActualTPOT: 0, Timestamp: now, NumRequestWaiting: m.WaitingQueueSize, @@ -216,7 +221,7 @@ func ProcessFirstTokenForLatencyPrediction( if err := predictor.AddTrainingDataBulk([]latencypredictor.TrainingEntry{entry}); err != nil { logger.V(logutil.DEBUG).Error(err, "record TTFT training failed") } - m, err = GetLatestMetricsForProfile(ctx, reqCtx, reqCtx.SchedulingResult.PrimaryProfileName) + m, err = GetLatestMetricsForProfile(ctx, sloCtx, sloCtx.SchedulingResult.PrimaryProfileName) if err != nil { logger.V(logutil.DEBUG).Info("Skipping first TPOT prediction due to missing metrics", "error", err) @@ -226,10 +231,10 @@ func ProcessFirstTokenForLatencyPrediction( // Predict first TPOT in := latencypredictor.PredictionRequest{ KVCachePercentage: m.KVCacheUsagePercent, - InputTokenLength: len(strings.Fields(reqCtx.SchedulingRequest.Prompt)), + InputTokenLength: len(strings.Fields(sloCtx.SchedulingRequest.Body.Completions.Prompt)), NumRequestWaiting: m.WaitingQueueSize, NumRequestRunning: m.RunningQueueSize, - NumTokensGenerated: reqCtx.GeneratedTokenCount, + NumTokensGenerated: sloCtx.GeneratedTokenCount, PrefixCacheScore: 0, } start := time.Now() @@ -237,48 +242,48 @@ func ProcessFirstTokenForLatencyPrediction( dur := time.Since(start) if err != nil || p == nil { logger.V(logutil.DEBUG).Error(err, "first TPOT predict failed", "duration_ms", dur.Milliseconds()) - 
reqCtx.PredictedTPOTObservations = append(reqCtx.PredictedTPOTObservations, 0) - reqCtx.AvgPredictedTPOT = calculateRunningAverage(reqCtx.AvgPredictedTPOT, 0, len(reqCtx.PredictedTPOTObservations)) + sloCtx.PredictedTPOTObservations = append(sloCtx.PredictedTPOTObservations, 0) + sloCtx.AvgPredictedTPOT = calculateRunningAverage(sloCtx.AvgPredictedTPOT, 0, len(sloCtx.PredictedTPOTObservations)) } else { logger.V(logutil.DEBUG).Info("first TPOT succeeded", "value_ms", p.TPOT, "duration_ms", dur.Milliseconds()) - reqCtx.PredictedTPOTObservations = append(reqCtx.PredictedTPOTObservations, p.TPOT) - reqCtx.AvgPredictedTPOT = calculateRunningAverage(reqCtx.AvgPredictedTPOT, p.TPOT, len(reqCtx.PredictedTPOTObservations)) + sloCtx.PredictedTPOTObservations = append(sloCtx.PredictedTPOTObservations, p.TPOT) + sloCtx.AvgPredictedTPOT = calculateRunningAverage(sloCtx.AvgPredictedTPOT, p.TPOT, len(sloCtx.PredictedTPOTObservations)) } - metrics.RecordRequestTPOTPredictionDuration(ctx, reqCtx.TargetModelName, reqCtx.IncomingModelName, dur.Seconds()) + metrics.RecordRequestTPOTPredictionDuration(ctx, sloCtx.SchedulingRequest.TargetModel, sloCtx.IncomingModelName, dur.Seconds()) // Advance timestamp - reqCtx.LastTokenTimestamp = now + sloCtx.LastTokenTimestamp = now // Refresh metrics - RefreshLastSeenMetrics(ctx, reqCtx) + RefreshLastSeenMetrics(ctx, sloCtx) } -// ProcessToken records actual inter-token latency, trains, predicts sampled TPOT, updates reqCtx, and advances timestamp. +// ProcessToken records actual inter-token latency, trains, predicts sampled TPOT, updates sloCtx, and advances timestamp. func ProcessTokenForLatencyPrediction( ctx context.Context, predictor latencypredictor.PredictorInterface, - reqCtx *handlers.RequestContext, + sloCtx *SLORequestContext, now time.Time, ) { logger := log.FromContext(ctx) // Initialize sampler if not yet - if reqCtx.TokenSampler == nil { - requestID := reqCtx.Request.Headers[requtil.RequestIdHeaderKey] - reqCtx.TokenSampler = requtil.NewTokenSampler(requestID, defaultSamplingMean, maxSampledTokens) - logger.V(logutil.DEBUG).Info("Initialized token sampler for subsequent tokens", "request_id", requestID, "next_prediction_token", reqCtx.TokenSampler.GetNextSampleToken()) + if sloCtx.TokenSampler == nil { + requestID := sloCtx.SchedulingRequest.Headers[requtil.RequestIdHeaderKey] + sloCtx.TokenSampler = requtil.NewTokenSampler(requestID, defaultSamplingMean, maxSampledTokens) + logger.V(logutil.DEBUG).Info("Initialized token sampler for subsequent tokens", "request_id", requestID, "next_prediction_token", sloCtx.TokenSampler.GetNextSampleToken()) } // Inter-token latency - latencyMs := float64(now.Sub(reqCtx.LastTokenTimestamp).Milliseconds()) - reqCtx.GeneratedTokenCount++ + latencyMs := float64(now.Sub(sloCtx.LastTokenTimestamp).Milliseconds()) + sloCtx.GeneratedTokenCount++ //log the inter-token latency for predicted samples - if reqCtx.GeneratedTokenCount == 2 || reqCtx.TokenSampler.ShouldPredict(reqCtx.GeneratedTokenCount) { //tricky logic, since next sample token is always +1 from current token - reqCtx.TPOTObservations = append(reqCtx.TPOTObservations, latencyMs) - reqCtx.AvgTPOT = calculateRunningAverage(reqCtx.AvgTPOT, latencyMs, len(reqCtx.TPOTObservations)) + if sloCtx.GeneratedTokenCount == 2 || sloCtx.TokenSampler.ShouldPredict(sloCtx.GeneratedTokenCount) { //tricky logic, since next sample token is always +1 from current token + sloCtx.TPOTObservations = append(sloCtx.TPOTObservations, latencyMs) + sloCtx.AvgTPOT = 
calculateRunningAverage(sloCtx.AvgTPOT, latencyMs, len(sloCtx.TPOTObservations)) } - m, err := GetLatestMetricsForProfile(ctx, reqCtx, reqCtx.SchedulingResult.PrimaryProfileName) + m, err := GetLatestMetricsForProfile(ctx, sloCtx, sloCtx.SchedulingResult.PrimaryProfileName) if err != nil { logger.V(logutil.DEBUG).Info("Skipping first TPOT prediction due to missing metrics", "error", err) @@ -287,13 +292,13 @@ func ProcessTokenForLatencyPrediction( // Record actual TPOT entry := latencypredictor.TrainingEntry{ KVCachePercentage: m.KVCacheUsagePercent, - InputTokenLength: len(strings.Fields(reqCtx.SchedulingRequest.Prompt)), + InputTokenLength: len(strings.Fields(sloCtx.SchedulingRequest.Body.Completions.Prompt)), ActualTTFT: 0, ActualTPOT: latencyMs, Timestamp: now, NumRequestWaiting: m.WaitingQueueSize, NumRequestRunning: m.RunningQueueSize, - NumTokensGenerated: reqCtx.GeneratedTokenCount - 1, + NumTokensGenerated: sloCtx.GeneratedTokenCount - 1, PrefixCacheScore: 0, // TPOT does not use prefix cache score } if err := predictor.AddTrainingDataBulk([]latencypredictor.TrainingEntry{entry}); err != nil { @@ -301,13 +306,13 @@ func ProcessTokenForLatencyPrediction( } // Sampled predict - if reqCtx.TokenSampler.ShouldPredict(reqCtx.GeneratedTokenCount) { + if sloCtx.TokenSampler.ShouldPredict(sloCtx.GeneratedTokenCount) { in := latencypredictor.PredictionRequest{ KVCachePercentage: m.KVCacheUsagePercent, - InputTokenLength: len(strings.Fields(reqCtx.SchedulingRequest.Prompt)), + InputTokenLength: len(strings.Fields(sloCtx.SchedulingRequest.Body.Completions.Prompt)), NumRequestWaiting: m.WaitingQueueSize, NumRequestRunning: m.RunningQueueSize, - NumTokensGenerated: reqCtx.GeneratedTokenCount, + NumTokensGenerated: sloCtx.GeneratedTokenCount, PrefixCacheScore: 0, // TPOT does not use prefix cache score } start := time.Now() @@ -315,22 +320,22 @@ func ProcessTokenForLatencyPrediction( dur := time.Since(start) if err != nil || p == nil { logger.V(logutil.DEBUG).Error(err, "TPOT predict failed", "duration_ms", dur.Milliseconds()) - reqCtx.PredictedTPOTObservations = append(reqCtx.PredictedTPOTObservations, 0) - reqCtx.AvgPredictedTPOT = calculateRunningAverage(reqCtx.AvgPredictedTPOT, 0, len(reqCtx.PredictedTPOTObservations)) + sloCtx.PredictedTPOTObservations = append(sloCtx.PredictedTPOTObservations, 0) + sloCtx.AvgPredictedTPOT = calculateRunningAverage(sloCtx.AvgPredictedTPOT, 0, len(sloCtx.PredictedTPOTObservations)) } else { logger.V(logutil.DEBUG).Info("TPOT predict succeeded", "value_ms", p.TPOT, "duration_ms", dur.Milliseconds()) - reqCtx.PredictedTPOTObservations = append(reqCtx.PredictedTPOTObservations, p.TPOT) - reqCtx.AvgPredictedTPOT = calculateRunningAverage(reqCtx.AvgPredictedTPOT, p.TPOT, len(reqCtx.PredictedTPOTObservations)) + sloCtx.PredictedTPOTObservations = append(sloCtx.PredictedTPOTObservations, p.TPOT) + sloCtx.AvgPredictedTPOT = calculateRunningAverage(sloCtx.AvgPredictedTPOT, p.TPOT, len(sloCtx.PredictedTPOTObservations)) } - metrics.RecordRequestTPOTPredictionDuration(ctx, reqCtx.TargetModelName, reqCtx.IncomingModelName, dur.Seconds()) + metrics.RecordRequestTPOTPredictionDuration(ctx, sloCtx.SchedulingRequest.TargetModel, sloCtx.IncomingModelName, dur.Seconds()) - reqCtx.TokenSampler.RecordPrediction(reqCtx.GeneratedTokenCount) + sloCtx.TokenSampler.RecordPrediction(sloCtx.GeneratedTokenCount) } // Advance timestamp - reqCtx.LastTokenTimestamp = now + sloCtx.LastTokenTimestamp = now // Refresh metrics - RefreshLastSeenMetrics(ctx, reqCtx) + 
RefreshLastSeenMetrics(ctx, sloCtx) } // PredictWithMetrics predicts TTFT or TPOT based on provided metrics state and token count. @@ -488,19 +493,19 @@ func BulkPredictWithMetrics( } // Fixed DebugPrintRawScores for map[string]map[Pod]float64 structure -func DebugPrintRawScores(ctx context.Context, reqCtx *handlers.RequestContext) { +func DebugPrintRawScores(ctx context.Context, sloCtx *SLORequestContext) { logger := log.FromContext(ctx) - if reqCtx.SchedulingResult == nil || reqCtx.SchedulingResult.AllProfileRunResults == nil { + if sloCtx.SchedulingResult == nil || sloCtx.SchedulingResult.AllProfileRunResults == nil { logger.V(logutil.DEBUG).Info("No raw scheduling results available for debug") return } logger.V(logutil.DEBUG).Info("=== RAW SCHEDULING RESULTS DEBUG START ===", - "total_profiles", len(reqCtx.SchedulingResult.AllProfileRunResults)) + "total_profiles", len(sloCtx.SchedulingResult.AllProfileRunResults)) // Print raw results for all profiles - for profileName, profileResult := range reqCtx.SchedulingResult.AllProfileRunResults { + for profileName, profileResult := range sloCtx.SchedulingResult.AllProfileRunResults { if profileResult == nil { logger.V(logutil.DEBUG).Info("Profile result is nil", "profile", profileName) continue @@ -652,3 +657,14 @@ func GetPrefixCacheScoreForPod( "profile", targetProfile) return 0.0 } + +// calculateRunningAverage calculates the running average efficiently +func calculateRunningAverage(currentAvg float64, newValue float64, count int) float64 { + if count == 0 { + return 0 + } + if count == 1 { + return newValue + } + return currentAvg + (newValue-currentAvg)/float64(count) +} diff --git a/pkg/epp/scheduling/framework/plugins/scorer/slo_scorer.go b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/plugin.go similarity index 88% rename from pkg/epp/scheduling/framework/plugins/scorer/slo_scorer.go rename to pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/plugin.go index bdf6a1aae..bff535722 100644 --- a/pkg/epp/scheduling/framework/plugins/scorer/slo_scorer.go +++ b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/plugin.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package scorer +package slo_aware_router import ( "context" @@ -29,10 +29,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" latencypredictor "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/latencypredictorasync" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix" schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" @@ -55,9 +53,9 @@ const ( ) const ( - SLOScorerPluginType = "slo-scorer" - MinScore = 0 - MaxScore = 100 + SLOAwareRouterPluginType = "slo-aware-routing" + MinScore = 0 + MaxScore = 100 ) var SLOBufferFactor = func() float64 { @@ -163,50 +161,52 @@ type PodPredictionResult struct { PrefixCacheScore float64 // Prefix cache score for the pod } -type SLOScorer struct { - tn plugins.TypedName - predictor latencypredictor.PredictorInterface - datastore datastore.Datastore - headroomStrategy HeadroomStrategy +type SLOAwareRouter struct { + tn plugins.TypedName + latencypredictor latencypredictor.PredictorInterface + runningRequestLists map[types.NamespacedName]*RequestPriorityQueue + sloContextStore map[string]*SLORequestContext + headroomStrategy HeadroomStrategy } -func (s *SLOScorer) Dependencies() []plugins.TypedName { +func (s *SLOAwareRouter) Dependencies() []plugins.TypedName { return []plugins.TypedName{ {Type: "prefix-cache-scorer", Name: "prefix-cache-scorer"}, } } -var _ framework.Scorer = &SLOScorer{} +var _ framework.Scorer = &SLOAwareRouter{} -func NewSLOScorer(predictor latencypredictor.PredictorInterface, datastore datastore.Datastore, strategy HeadroomStrategy) *SLOScorer { - return &SLOScorer{ - tn: plugins.TypedName{Type: SLOScorerPluginType, Name: SLOScorerPluginType}, - predictor: predictor, - datastore: datastore, - headroomStrategy: strategy, +func NewSLOAwareRouter(latencypredictor latencypredictor.PredictorInterface, strategy HeadroomStrategy) *SLOAwareRouter { + return &SLOAwareRouter{ + tn: plugins.TypedName{Type: SLOAwareRouterPluginType, Name: SLOAwareRouterPluginType}, + latencypredictor: latencypredictor, + runningRequestLists: make(map[types.NamespacedName]*RequestPriorityQueue), + sloContextStore: make(map[string]*SLORequestContext), + headroomStrategy: strategy, } } -func (s *SLOScorer) TypedName() plugins.TypedName { +func (s *SLOAwareRouter) TypedName() plugins.TypedName { return s.tn } -func (s *SLOScorer) WithName(name string) *SLOScorer { +func (s *SLOAwareRouter) WithName(name string) *SLOAwareRouter { s.tn.Name = name return s } // SetHeadroomStrategy allows runtime configuration of headroom selection strategy -func (s *SLOScorer) SetHeadroomStrategy(strategy HeadroomStrategy) { +func (s *SLOAwareRouter) SetHeadroomStrategy(strategy HeadroomStrategy) { s.headroomStrategy = strategy } // GetHeadroomStrategy returns the current headroom selection strategy -func (s *SLOScorer) GetHeadroomStrategy() HeadroomStrategy { +func (s *SLOAwareRouter) GetHeadroomStrategy() HeadroomStrategy { return s.headroomStrategy } -func (s *SLOScorer) epsilonGreedyAffinityGate( +func (s *SLOAwareRouter) epsilonGreedyAffinityGate( ctx context.Context, candidates []PodPredictionResult, r *rand.Rand, @@ -239,10 +239,10 @@ func (s *SLOScorer) epsilonGreedyAffinityGate( return 
eligible, true } -func (s *SLOScorer) Score(ctx context.Context, state *schedulingtypes.CycleState, request *schedulingtypes.LLMRequest, pods []schedulingtypes.Pod) map[schedulingtypes.Pod]float64 { +func (s *SLOAwareRouter) Score(ctx context.Context, state *schedulingtypes.CycleState, request *schedulingtypes.LLMRequest, pods []schedulingtypes.Pod) map[schedulingtypes.Pod]float64 { logger := log.FromContext(ctx) - if s.predictor == nil { - logger.V(logutil.DEBUG).Info("SLOScorer: no predictor configured, returning nil scores") + if s.latencypredictor == nil { + logger.V(logutil.DEBUG).Info("SLOAwareRouter: no predictor configured, returning nil scores") return nil } @@ -345,7 +345,7 @@ func (s *SLOScorer) Score(ctx context.Context, state *schedulingtypes.CycleState // selectFromPositiveHeadroomPods selects a pod from positive headroom pods using headroom strategy // Updated to incorporate TTFTHeadroom with a configurable blend vs TPOT headroom. -func (s *SLOScorer) selectFromPositiveHeadroomPods(ctx context.Context, posHeadroomPods []PodPredictionResult, r *rand.Rand) schedulingtypes.Pod { +func (s *SLOAwareRouter) selectFromPositiveHeadroomPods(ctx context.Context, posHeadroomPods []PodPredictionResult, r *rand.Rand) schedulingtypes.Pod { logger := log.FromContext(ctx) if len(posHeadroomPods) == 1 { @@ -458,6 +458,6 @@ func (s *SLOScorer) selectFromPositiveHeadroomPods(ctx context.Context, posHeadr // If no pod was selected (shouldn't happen), fallback to first pod if selectedPod == nil { - selectedPod = candidates[0].Pod + selectedPod = posHeadroomPods[0].Pod } return selectedPod @@ -465,7 +466,7 @@ func (s *SLOScorer) selectFromPositiveHeadroomPods(ctx context.Context, posHeadr // selectFromNegativeHeadroomPods selects a pod from negative headroom pods using hierarchical TTFT/TPOT logic // Modified to strictly prefer pods with 0 running requests -func (s *SLOScorer) selectFromNegativeHeadroomPods(ctx context.Context, negHeadroomPods []PodPredictionResult, r *rand.Rand) schedulingtypes.Pod { +func (s *SLOAwareRouter) selectFromNegativeHeadroomPods(ctx context.Context, negHeadroomPods []PodPredictionResult, r *rand.Rand) schedulingtypes.Pod { logger := log.FromContext(ctx) if len(negHeadroomPods) == 1 { @@ -500,7 +501,7 @@ func (s *SLOScorer) selectFromNegativeHeadroomPods(ctx context.Context, negHeadr } // selectFromNegativeHeadroomPodsInternal handles the actual selection logic for negative headroom pods -func (s *SLOScorer) selectFromNegativeHeadroomPodsInternal(ctx context.Context, negHeadroomPods []PodPredictionResult, r *rand.Rand) schedulingtypes.Pod { +func (s *SLOAwareRouter) selectFromNegativeHeadroomPodsInternal(ctx context.Context, negHeadroomPods []PodPredictionResult, r *rand.Rand) schedulingtypes.Pod { if len(negHeadroomPods) == 1 { return negHeadroomPods[0].Pod } @@ -543,7 +544,7 @@ func (s *SLOScorer) selectFromNegativeHeadroomPodsInternal(ctx context.Context, // weightPodsByBlendedDeficit applies blended weighting using TTFT and TPOT deficits. // Lower blended deficit => higher weight.
-func (ps *SLOScorer) weightPodsByBlendedDeficit( +func (ps *SLOAwareRouter) weightPodsByBlendedDeficit( ctx context.Context, pods []PodPredictionResult, choices *[]Choice, @@ -643,7 +644,7 @@ func (ps *SLOScorer) weightPodsByBlendedDeficit( } } -func (s *SLOScorer) handleNegativeHeadroomPodsHierarchical( +func (s *SLOAwareRouter) handleNegativeHeadroomPodsHierarchical( ctx context.Context, negHeadroomPods []PodPredictionResult, choices *[]Choice, @@ -700,7 +701,7 @@ func (s *SLOScorer) handleNegativeHeadroomPodsHierarchical( } // generatePredictions creates prediction results for all candidate pods -func (s *SLOScorer) generatePredictions(ctx context.Context, state *schedulingtypes.CycleState, request *schedulingtypes.LLMRequest, candidatePods []schedulingtypes.Pod) []PodPredictionResult { +func (s *SLOAwareRouter) generatePredictions(ctx context.Context, state *schedulingtypes.CycleState, request *schedulingtypes.LLMRequest, candidatePods []schedulingtypes.Pod) []PodPredictionResult { logger := log.FromContext(ctx) predictions := make([]PodPredictionResult, 0, len(candidatePods)) @@ -712,10 +713,8 @@ func (s *SLOScorer) generatePredictions(ctx context.Context, state *schedulingty // Get prefix cache score for the pod prefixCacheScore := s.getPrefixCacheScoreForPod(ctx, state, pod) - // TODO update the request in the datastore request tracker - // Generate prediction - prediction, err := requestcontrol.PredictWithMetrics(ctx, s.predictor, pod.GetMetrics(), request.Prompt, 1, prefixCacheScore) + prediction, err := PredictWithMetrics(ctx, s.latencypredictor, pod.GetMetrics(), request.Body.Completions.Prompt, 1, prefixCacheScore) if err != nil { logger.V(logutil.DEBUG).Info("Skipping pod due to prediction error", "pod", pod.GetPod().String(), "error", err) predResult.Error = err @@ -754,31 +753,31 @@ func (s *SLOScorer) generatePredictions(ctx context.Context, state *schedulingty return predictions } -func (s *SLOScorer) getPodMinTPOTSLO(pod schedulingtypes.Pod) float64 { +func (s *SLOAwareRouter) getPodMinTPOTSLO(pod schedulingtypes.Pod) float64 { podName := types.NamespacedName{ Name: pod.GetPod().NamespacedName.Name, Namespace: pod.GetPod().NamespacedName.Namespace, } - if runningReqs, err := s.datastore.PodGetRunningRequests(podName); err == nil && runningReqs != nil { + if runningReqs, ok := s.runningRequestLists[podName]; ok && runningReqs.GetSize() > 0 { if topReq := runningReqs.Peek(); topReq != nil { return topReq.TPOT } } - return 0 + return 0 // no running requests or no TPOT SLOs } -func (s *SLOScorer) getPodRunningRequestCount(pod schedulingtypes.Pod) int { +func (s *SLOAwareRouter) getPodRunningRequestCount(pod schedulingtypes.Pod) int { podName := types.NamespacedName{ Name: pod.GetPod().NamespacedName.Name, Namespace: pod.GetPod().NamespacedName.Namespace, } - if runningReqs, err := s.datastore.PodGetRequestCount(podName); err == nil { - return runningReqs + if runningReqs, ok := s.runningRequestLists[podName]; ok { + return runningReqs.GetSize() } - return 0 + return 0 // no running requests } -func (s *SLOScorer) validatePrediction( +func (s *SLOAwareRouter) validatePrediction( pred *latencypredictor.PredictionResponse, req *schedulingtypes.LLMRequest, podMinTPOTSLO float64, @@ -803,7 +802,7 @@ func (s *SLOScorer) validatePrediction( return } -func (s *SLOScorer) getPrefixCacheScoreForPod(ctx context.Context, cycleState *schedulingtypes.CycleState, pod schedulingtypes.Pod) float64 { +func (s *SLOAwareRouter) getPrefixCacheScoreForPod(ctx context.Context, cycleState 
*schedulingtypes.CycleState, pod schedulingtypes.Pod) float64 { log.FromContext(ctx).V(logutil.DEBUG).Info("Running getPrefixCacheScoreForPod, getting prefix cache score for pod", "pod", pod.GetPod().String()) plugintype := prefix.PrefixCachePluginType pluginname := prefix.PrefixCachePluginType @@ -838,7 +837,7 @@ func (s *SLOScorer) getPrefixCacheScoreForPod(ctx context.Context, cycleState *s } // updateRequestContextWithPredictions updates the request context with prediction data -func (s *SLOScorer) updateRequestContextWithPredictions(request *schedulingtypes.LLMRequest, predictions []PodPredictionResult) { +func (s *SLOAwareRouter) updateRequestContextWithPredictions(request *schedulingtypes.LLMRequest, predictions []PodPredictionResult) { for _, pred := range predictions { if pred.Error == nil { podKey := pred.Pod.GetPod().String() diff --git a/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/requestcontrol_hooks.go b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/requestcontrol_hooks.go new file mode 100644 index 000000000..b1c66d5a9 --- /dev/null +++ b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/requestcontrol_hooks.go @@ -0,0 +1,218 @@ +package slo_aware_router + +import ( + "context" + "fmt" + "time" + + "github.com/go-logr/logr" + "github.com/google/uuid" + "sigs.k8s.io/controller-runtime/pkg/log" + + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol" + schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + requtil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/request" +) + +var _ requestcontrol.PreRequest = &SLOAwareRouter{} +var _ requestcontrol.ResponseReceived = &SLOAwareRouter{} +var _ requestcontrol.ResponseStreaming = &SLOAwareRouter{} +var _ requestcontrol.ResponseComplete = &SLOAwareRouter{} + +type SLORequestContext struct { + SchedulingRequest schedulingtypes.LLMRequest + TargetPod *backend.Pod + SchedulingResult *schedulingtypes.SchedulingResult + LastSeenMetrics map[string]*backendmetrics.MetricsState + LastTokenTimestamp time.Time + RequestReceivedTimestamp time.Time + GeneratedTokenCount int + IncomingModelName string + TTFT float64 + PredictedTTFT float64 + AvgTPOT float64 + AvgPredictedTPOT float64 + TokenSampler *requtil.TokenSampler + TPOTObservations []float64 + PredictedTPOTObservations []float64 +} + +func NewSLORequestContext(request *schedulingtypes.LLMRequest) *SLORequestContext { + return &SLORequestContext{ + SchedulingRequest: *request, + LastSeenMetrics: make(map[string]*backendmetrics.MetricsState), + } +} + +func (s *SLOAwareRouter) getSLOContextForRequest(request *schedulingtypes.LLMRequest) (*SLORequestContext, error) { + id := request.Headers[requtil.RequestIdHeaderKey] + if ctx, exists := s.sloContextStore[id]; exists { + return ctx, nil + } + return nil, fmt.Errorf("SLO context not found for request ID: %s", id) +} + +func (s *SLOAwareRouter) setSLOContextForRequest(request *schedulingtypes.LLMRequest, ctx *SLORequestContext) { + id := request.Headers[requtil.RequestIdHeaderKey] + s.sloContextStore[id] = ctx +} + +func (s *SLOAwareRouter) deleteSLOContextForRequest(request *schedulingtypes.LLMRequest) { + id := 
request.Headers[requtil.RequestIdHeaderKey] + delete(s.sloContextStore, id) +} + +// --- RequestControl Hooks --- + +func (t *SLOAwareRouter) PreRequest(ctx context.Context, request *schedulingtypes.LLMRequest, schedulingResult *schedulingtypes.SchedulingResult) { + logger := log.FromContext(ctx) + + if schedulingResult == nil || len(schedulingResult.ProfileResults) == 0 { + logger.V(logutil.DEBUG).Info("SLOAwareRouter: Skipping PreRequest because no scheduling result was provided.") + return + } + + targetPod := schedulingResult.ProfileResults[schedulingResult.PrimaryProfileName].TargetPods[0].GetPod() + + podName := types.NamespacedName{ + Name: targetPod.NamespacedName.Name, + Namespace: targetPod.NamespacedName.Namespace, + } + + logger.V(logutil.DEBUG).Info("request ID for SLO tracking", "requestID", request.Headers[requtil.RequestIdHeaderKey], "podName", podName) + if request.Headers[requtil.RequestIdHeaderKey] == "" { + request.Headers[requtil.RequestIdHeaderKey] = uuid.New().String() + logger.V(logutil.DEBUG).Info("Generated new request ID for SLO tracking", "requestID", request.Headers[requtil.RequestIdHeaderKey]) + logger.V(logutil.DEBUG).Info("request headers for SLO tracking", "requestHeaders", request.Headers) + } + + id := request.Headers[requtil.RequestIdHeaderKey] + podRequestList, ok := t.runningRequestLists[podName] + if !ok { + podRequestList = NewRequestPriorityQueue() + t.runningRequestLists[podName] = podRequestList + } + + added := podRequestList.Add(id, request.AvgTPOTSLO) + if !added { + logger.V(logutil.DEBUG).Info("SLOAwareRouter: Item already exists in queue", "podName", podName, "requestID", id) + } + + // Set up SLO request context + sloCtx := NewSLORequestContext(request) + sloCtx.TargetPod = targetPod + sloCtx.SchedulingResult = schedulingResult + RefreshLastSeenMetrics(ctx, sloCtx) + t.setSLOContextForRequest(request, sloCtx) +} + +func (t *SLOAwareRouter) ResponseReceived(ctx context.Context, request *schedulingtypes.LLMRequest, response *requestcontrol.Response, targetPod *backend.Pod) { + logger := log.FromContext(ctx) + id := request.Headers[requtil.RequestIdHeaderKey] + + sloCtx, err := t.getSLOContextForRequest(request) + if err != nil { + logger.V(logutil.DEBUG).Error(err, "SLOAwareRouter: Failed to get SLO context for request", "requestID", id) + return + } + + if !t.CheckPredictor(logger, targetPod) { + return + } + + if err := ProcessHeaderForLatencyPrediction(ctx, t.latencypredictor, sloCtx); err != nil { + logger.V(logutil.DEBUG).Error(err, "ProcessHeader in latencypredictor failed") + } + +} + +func (t *SLOAwareRouter) ResponseStreaming(ctx context.Context, request *schedulingtypes.LLMRequest, response *requestcontrol.Response, pod *backend.Pod) { + logger := log.FromContext(ctx) + if !t.CheckPredictor(logger, pod) { + return + } + + now := time.Now() + sloCtx, err := t.getSLOContextForRequest(request) + if err != nil { + id := request.Headers[requtil.RequestIdHeaderKey] + logger.V(logutil.DEBUG).Error(err, "SLOAwareRouter.ResponseStreaming: Failed to get SLO context for request", "requestID", id) + return + } + + if sloCtx.TTFT == 0 { + ProcessFirstTokenForLatencyPrediction(ctx, t.latencypredictor, sloCtx, now) + } else { + ProcessTokenForLatencyPrediction(ctx, t.latencypredictor, sloCtx, now) + } + +} + +func (t *SLOAwareRouter) ResponseComplete(ctx context.Context, request *schedulingtypes.LLMRequest, response *requestcontrol.Response, pod *backend.Pod) { + logger := log.FromContext(ctx) + targetPod := pod + if !t.CheckPredictor(logger, 
targetPod) { + return + } + + sloCtx, err := t.getSLOContextForRequest(request) + if err != nil { + id := request.Headers[requtil.RequestIdHeaderKey] + logger.V(logutil.DEBUG).Error(err, "SLOAwareRouter.ResponseComplete: Failed to get SLO context for request", "requestID", id) + return + } + + if sloCtx.TTFT > 0 { + logger.V(logutil.DEBUG).Info("Averages calculated", "avgActualTTFT", sloCtx.TTFT, "avgPredictedTTFT", sloCtx.PredictedTTFT) + metrics.RecordRequestTTFT(ctx, sloCtx.IncomingModelName, request.TargetModel, sloCtx.TTFT/1000) + metrics.RecordRequestPredictedTTFT(ctx, sloCtx.IncomingModelName, request.TargetModel, sloCtx.PredictedTTFT/1000) + if sloCtx.SchedulingRequest.TTFTSLO > 0 { + metrics.RecordRequestTTFTWithSLO(ctx, sloCtx.IncomingModelName, request.TargetModel, sloCtx.TTFT, sloCtx.SchedulingRequest.TTFTSLO) + } + } + + if sloCtx.AvgTPOT > 0 { + logger.V(logutil.DEBUG).Info("Averages calculated", "avgActualTPOT", sloCtx.AvgTPOT, "avgPredictedTPOT", sloCtx.AvgPredictedTPOT) + metrics.RecordRequestTPOT(ctx, sloCtx.IncomingModelName, request.TargetModel, sloCtx.AvgTPOT/1000) + metrics.RecordRequestPredictedTPOT(ctx, sloCtx.IncomingModelName, request.TargetModel, sloCtx.AvgPredictedTPOT/1000) + if sloCtx.SchedulingRequest.AvgTPOTSLO > 0 { + metrics.RecordRequestTPOTWithSLO(ctx, sloCtx.IncomingModelName, request.TargetModel, sloCtx.AvgTPOT, sloCtx.SchedulingRequest.AvgTPOTSLO) + } + } + logger.V(logutil.DEBUG).Info("SLO Aware Routing Mode", "PredictorBasedScheduling", request.PredictorBasedScheduling) + + podName := types.NamespacedName{ + Name: targetPod.NamespacedName.Name, + Namespace: targetPod.NamespacedName.Namespace, + } + + id := request.Headers[requtil.RequestIdHeaderKey] + podRequestList, ok := t.runningRequestLists[podName] + if !ok { + // No queue exists for this pod; skip removal to avoid a nil dereference. + err := fmt.Errorf("no running request list found for pod %s", podName.String()) + logger.V(logutil.DEBUG).Error(err, "SLOAwareRouter: Failed to remove request from queue", "requestID", id) + } else if _, removed := podRequestList.Remove(id); !removed { + logger.V(logutil.DEBUG).Info("SLOAwareRouter: Item not found in queue", "podName", podName, "requestID", id) + } + t.deleteSLOContextForRequest(request) +} + +func (t *SLOAwareRouter) CheckPredictor(logger logr.Logger, targetPod *backend.Pod) bool { + if targetPod == nil { + logger.V(logutil.DEBUG).Info("SLOAwareRouter: Skipping response processing because no target pod was provided.") + return false + } + if t.latencypredictor == nil { + logger.V(logutil.DEBUG).Info("SLOAwareRouter: Skipping response processing because no predictor is configured") + return false + } + return true +} diff --git a/pkg/epp/datalayer/running_request_queue.go b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/running_request_queue.go similarity index 99% rename from pkg/epp/datalayer/running_request_queue.go rename to pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/running_request_queue.go index 29bef911a..1199be641 100644 --- a/pkg/epp/datalayer/running_request_queue.go +++ b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/running_request_queue.go @@ -1,4 +1,4 @@ -package datalayer +package slo_aware_router import ( "container/heap" diff --git a/pkg/epp/datalayer/running_request_queue_test.go b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/running_request_queue_test.go similarity index 99% rename from pkg/epp/datalayer/running_request_queue_test.go rename to pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/running_request_queue_test.go index bac82106d..a8eba5fe1 100644
--- a/pkg/epp/datalayer/running_request_queue_test.go +++ b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/running_request_queue_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package datalayer +package slo_aware_router import ( "fmt" diff --git a/pkg/epp/scheduling/framework/plugins/picker/common.go b/pkg/epp/scheduling/framework/plugins/picker/common.go index 4bbc300da..c8655840f 100644 --- a/pkg/epp/scheduling/framework/plugins/picker/common.go +++ b/pkg/epp/scheduling/framework/plugins/picker/common.go @@ -16,6 +16,13 @@ limitations under the License. package picker +import ( + "math/rand/v2" + "time" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + const ( DefaultMaxNumOfEndpoints = 1 // common default to all pickers ) @@ -24,3 +31,14 @@ const ( type pickerParameters struct { MaxNumOfEndpoints int `json:"maxNumOfEndpoints"` } + +func shuffleScoredPods(scoredPods []*types.ScoredPod) { + // Rand package is not safe for concurrent use, so we create a new instance. + // Source: https://pkg.go.dev/math/rand/v2#pkg-overview + randomGenerator := rand.New(rand.NewPCG(uint64(time.Now().UnixNano()), 0)) + + // Shuffle in-place + randomGenerator.Shuffle(len(scoredPods), func(i, j int) { + scoredPods[i], scoredPods[j] = scoredPods[j], scoredPods[i] + }) +} diff --git a/pkg/epp/scheduling/framework/plugins/picker/max_score_picker.go b/pkg/epp/scheduling/framework/plugins/picker/max_score_picker.go index 325f735fa..33e99bd06 100644 --- a/pkg/epp/scheduling/framework/plugins/picker/max_score_picker.go +++ b/pkg/epp/scheduling/framework/plugins/picker/max_score_picker.go @@ -20,9 +20,7 @@ import ( "context" "encoding/json" "fmt" - "math/rand" "slices" - "time" "sigs.k8s.io/controller-runtime/pkg/log" @@ -85,15 +83,8 @@ func (p *MaxScorePicker) Pick(ctx context.Context, cycleState *types.CycleState, log.FromContext(ctx).V(logutil.DEBUG).Info("Selecting pods from candidates sorted by max score", "max-num-of-endpoints", p.maxNumOfEndpoints, "num-of-candidates", len(scoredPods), "scored-pods", scoredPods) - // TODO: merge this with the logic in RandomPicker - // Rand package is not safe for concurrent use, so we create a new instance. 
- // Source: https://pkg.go.dev/math/rand#pkg-overview - randomGenerator := rand.New(rand.NewSource(time.Now().UnixNano())) - // Shuffle in-place - needed for random tie break when scores are equal - randomGenerator.Shuffle(len(scoredPods), func(i, j int) { - scoredPods[i], scoredPods[j] = scoredPods[j], scoredPods[i] - }) + shuffleScoredPods(scoredPods) slices.SortStableFunc(scoredPods, func(i, j *types.ScoredPod) int { // highest score first if i.Score > j.Score { diff --git a/pkg/epp/scheduling/framework/plugins/picker/picker_test.go b/pkg/epp/scheduling/framework/plugins/picker/picker_test.go index 741a49d59..022328efd 100644 --- a/pkg/epp/scheduling/framework/plugins/picker/picker_test.go +++ b/pkg/epp/scheduling/framework/plugins/picker/picker_test.go @@ -18,6 +18,7 @@ package picker import ( "context" + "math" "testing" "github.com/google/go-cmp/cmp" @@ -138,8 +139,8 @@ func TestPickMaxScorePicker(t *testing.T) { func TestPickWeightedRandomPicker(t *testing.T) { const ( - testIterations = 1000 - tolerance = 0.2 // 20% tolerance in [0,1] range + testIterations = 10000 + tolerance = 0.05 // Verify within tolerance ±5% ) pod1 := &types.PodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}} @@ -197,14 +198,14 @@ func TestPickWeightedRandomPicker(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { picker := NewWeightedRandomPicker(test.maxPods) - selectionCounts := make(map[string]int) - // Calculate expected probabilities based on scores + // Summarize the total score of all pods totalScore := 0.0 for _, pod := range test.input { totalScore += pod.Score } + // Calculate expected probabilities based on scores expectedProbabilities := make(map[string]float64) for _, pod := range test.input { podName := pod.GetPod().NamespacedName.Name @@ -216,20 +217,19 @@ func TestPickWeightedRandomPicker(t *testing.T) { } // Initialize selection counters for each pod + selectionCounts := make(map[string]int) for _, pod := range test.input { podName := pod.GetPod().NamespacedName.Name selectionCounts[podName] = 0 } // Run multiple iterations to gather statistical data - for i := 0; i < testIterations; i++ { + for range testIterations { result := picker.Pick(context.Background(), types.NewCycleState(), test.input) // Count selections for probability analysis - if len(result.TargetPods) > 0 { - selectedPodName := result.TargetPods[0].GetPod().NamespacedName.Name - selectionCounts[selectedPodName]++ - } + selectedPodName := result.TargetPods[0].GetPod().NamespacedName.Name + selectionCounts[selectedPodName]++ } // Verify probability distribution @@ -237,11 +237,7 @@ func TestPickWeightedRandomPicker(t *testing.T) { actualCount := selectionCounts[podName] actualProb := float64(actualCount) / float64(testIterations) - toleranceValue := expectedProb * tolerance - lowerBound := expectedProb - toleranceValue - upperBound := expectedProb + toleranceValue - - if actualProb < lowerBound || actualProb > upperBound { + if math.Abs(actualProb-expectedProb) > tolerance { t.Errorf("Pod %s: expected probability %.3f ±%.1f%%, got %.3f (count: %d/%d)", podName, expectedProb, tolerance*100, actualProb, actualCount, testIterations) } else { diff --git a/pkg/epp/scheduling/framework/plugins/picker/random_picker.go b/pkg/epp/scheduling/framework/plugins/picker/random_picker.go index 87a1747fc..10ad68469 100644 --- a/pkg/epp/scheduling/framework/plugins/picker/random_picker.go +++ b/pkg/epp/scheduling/framework/plugins/picker/random_picker.go @@ -20,8 +20,6 @@ import 
( "context" "encoding/json" "fmt" - "math/rand" - "time" "sigs.k8s.io/controller-runtime/pkg/log" @@ -84,15 +82,8 @@ func (p *RandomPicker) Pick(ctx context.Context, _ *types.CycleState, scoredPods log.FromContext(ctx).V(logutil.DEBUG).Info("Selecting pods from candidates randomly", "max-num-of-endpoints", p.maxNumOfEndpoints, "num-of-candidates", len(scoredPods), "scored-pods", scoredPods) - // TODO: merge this with the logic in MaxScorePicker - // Rand package is not safe for concurrent use, so we create a new instance. - // Source: https://pkg.go.dev/math/rand#pkg-overview - randomGenerator := rand.New(rand.NewSource(time.Now().UnixNano())) - // Shuffle in-place - randomGenerator.Shuffle(len(scoredPods), func(i, j int) { - scoredPods[i], scoredPods[j] = scoredPods[j], scoredPods[i] - }) + shuffleScoredPods(scoredPods) // if we have enough pods to return keep only the relevant subset if p.maxNumOfEndpoints < len(scoredPods) { diff --git a/pkg/epp/scheduling/framework/plugins/scorer/kvcache_utilization.go b/pkg/epp/scheduling/framework/plugins/scorer/kvcache_utilization.go index 48d982cd8..6db2c23e8 100644 --- a/pkg/epp/scheduling/framework/plugins/scorer/kvcache_utilization.go +++ b/pkg/epp/scheduling/framework/plugins/scorer/kvcache_utilization.go @@ -65,9 +65,8 @@ func (s *KVCacheUtilizationScorer) WithName(name string) *KVCacheUtilizationScor } // Score returns the scoring result for the given list of pods based on context. -func (s *KVCacheUtilizationScorer) Score(_ context.Context, _ *types.CycleState, req *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 { +func (s *KVCacheUtilizationScorer) Score(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 { scores := make(map[types.Pod]float64, len(pods)) - for _, pod := range pods { scores[pod] = 1 - pod.GetMetrics().KVCacheUsagePercent } diff --git a/pkg/epp/scheduling/framework/plugins/scorer/lora_affinity.go b/pkg/epp/scheduling/framework/plugins/scorer/lora_affinity.go index d3cbad4b4..fc5b8f7c4 100644 --- a/pkg/epp/scheduling/framework/plugins/scorer/lora_affinity.go +++ b/pkg/epp/scheduling/framework/plugins/scorer/lora_affinity.go @@ -65,7 +65,6 @@ func (s *LoraAffinityScorer) WithName(name string) *LoraAffinityScorer { } func (s *LoraAffinityScorer) Score(_ context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 { - scores := make(map[types.Pod]float64, len(pods)) if request.PredictorBasedScheduling { diff --git a/pkg/epp/scheduling/framework/plugins/scorer/queue.go b/pkg/epp/scheduling/framework/plugins/scorer/queue.go index 9f9fd763a..0db645283 100644 --- a/pkg/epp/scheduling/framework/plugins/scorer/queue.go +++ b/pkg/epp/scheduling/framework/plugins/scorer/queue.go @@ -67,8 +67,7 @@ func (s *QueueScorer) WithName(name string) *QueueScorer { } // Score returns the scoring result for the given list of pods based on context. 
-func (s *QueueScorer) Score(_ context.Context, _ *types.CycleState, req *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 { - +func (s *QueueScorer) Score(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 { minQueueSize := math.MaxInt maxQueueSize := math.MinInt diff --git a/pkg/epp/scheduling/framework/plugins/test/filter/request_header_based_filter.go b/pkg/epp/scheduling/framework/plugins/test/filter/request_header_based_filter.go index bf36d7782..2836755d4 100644 --- a/pkg/epp/scheduling/framework/plugins/test/filter/request_header_based_filter.go +++ b/pkg/epp/scheduling/framework/plugins/test/filter/request_header_based_filter.go @@ -73,7 +73,7 @@ func (f *HeaderBasedTestingFilter) Filter(_ context.Context, _ *types.CycleState podAddressMap := make(map[string]types.Pod, len(pods)) for _, pod := range pods { - podAddressMap[pod.GetPod().Address] = pod + podAddressMap[pod.GetPod().GetIPAddress()] = pod } endpoints := strings.Split(headerValue, ",") diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go index 056723dbf..15c5b6658 100644 --- a/pkg/epp/scheduling/types/types.go +++ b/pkg/epp/scheduling/types/types.go @@ -17,20 +17,25 @@ limitations under the License. package types import ( + "encoding/json" + "errors" "fmt" + "strings" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" ) +const nilString = "" + // LLMRequest is a structured representation of the fields we parse out of the LLMRequest body. type LLMRequest struct { // RequestId is the Envoy generated Id for the request being processed RequestId string // TargetModel is the final target model after traffic split. TargetModel string - // Prompt is the prompt that was sent in the request body. - Prompt string + // Body contains the request-body fields that we parse out as user input. + Body *LLMRequestBody + // Headers is a map of the request headers. Headers map[string]string @@ -40,8 +45,6 @@ type LLMRequest struct { AvgTPOTSLO float64 // PredictorBasedScheduling indicates whether to use predictor based scheduling. - // ### TODO Move below fields to the datalayer request object - PredictorBasedScheduling bool //PredictedTTFTForScheduling is the map of pod names to predicted TTFT values for scheduling.
PredictedTTFTForScheduling map[string]float64 @@ -53,7 +56,150 @@ type LLMRequest struct { } func (r *LLMRequest) String() string { - return fmt.Sprintf("RequestID: %s, TargetModel: %s, PromptLength: %d, Headers: %v", r.RequestId, r.TargetModel, len(r.Prompt), r.Headers) + if r == nil { + return nilString + } + + return fmt.Sprintf("RequestID: %s, TargetModel: %s, Body: %s, Headers: %v", + r.RequestId, r.TargetModel, r.Body, r.Headers) +} + +// LLMRequestBody contains the request-body fields that we parse out as user input, +// to be used in forming scheduling decisions. +// An LLMRequestBody must contain exactly one of CompletionsRequest or ChatCompletionsRequest. +type LLMRequestBody struct { + // CompletionsRequest is the representation of the OpenAI /v1/completions request body. + Completions *CompletionsRequest `json:"completions,omitempty"` + // ChatCompletionsRequest is the representation of the OpenAI /v1/chat/completions request body. + ChatCompletions *ChatCompletionsRequest `json:"chat_completions,omitempty"` +} + +func (r *LLMRequestBody) CacheSalt() string { + if r.ChatCompletions == nil && r.Completions == nil { + return "" + } + + if r.ChatCompletions != nil { + return r.ChatCompletions.CacheSalt + } + + return r.Completions.CacheSalt +} + +// CompletionsRequest is a structured representation of the fields we parse out of the /v1/completions request +// body. For detailed body fields, please refer to https://platform.openai.com/docs/api-reference/completions. +// This struct includes fields usable for plugins and scheduling decisions, not the entire +// API spec. +type CompletionsRequest struct { + // Prompt is the prompt that was sent in the request body. + Prompt string `json:"prompt,omitempty"` + // CacheSalt is an optional request parameter to isolate prefix caches for security reasons. + CacheSalt string `json:"cache_salt,omitempty"` +} + +func (r *CompletionsRequest) String() string { + if r == nil { + return nilString + } + + return fmt.Sprintf("{PromptLength: %d}", len(r.Prompt)) +} + +// ChatCompletionsRequest is a structured representation of the fields we parse out of the /v1/chat/completions +// request body. For detailed body fields, please refer to https://platform.openai.com/docs/api-reference/chat. +// This struct includes fields usable for plugins and scheduling decisions, not the entire +// API spec. +type ChatCompletionsRequest struct { + /* parameters from the official OpenAI chat-completions API */ + Messages []Message `json:"messages,omitempty"` + Tools []interface{} `json:"tools,omitempty"` + /* parameters from the HuggingFace transformers chat-templates API */ + Documents []interface{} `json:"documents,omitempty"` + ChatTemplate string `json:"chat_template,omitempty"` + ReturnAssistantTokensMask bool `json:"return_assistant_tokens_mask,omitempty"` + ContinueFinalMessage bool `json:"continue_final_message,omitempty"` + AddGenerationPrompt bool `json:"add_generation_prompt,omitempty"` + ChatTemplateKWArgs map[string]interface{} `json:"chat_template_kwargs,omitempty"` + // CacheSalt is an optional request parameter to isolate prefix caches for security reasons. + CacheSalt string `json:"cache_salt,omitempty"` +} + +func (r *ChatCompletionsRequest) String() string { + if r == nil { + return nilString + } + + messagesLen := 0 + for _, msg := range r.Messages { + messagesLen += len(msg.Content.PlainText()) + } + return fmt.Sprintf("{MessagesLength: %d}", messagesLen) +} + +// Message represents a single message in a chat-completions request.
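+// Both OpenAI content encodings are accepted by Content.UnmarshalJSON below; illustrative payloads: +// {"role": "user", "content": "hello"} +// {"role": "user", "content": [{"type": "text", "text": "hello"}]}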
+type Message struct { + // Role is the message role; typical values are 'user', 'assistant', ... + Role string `json:"role,omitempty"` + // Content holds the content of this message, either raw text or structured blocks + Content Content `json:"content,omitempty"` +} + +type Content struct { + Raw string + Structured []ContentBlock +} + +type ContentBlock struct { + Type string `json:"type"` + Text string `json:"text,omitempty"` + ImageURL ImageBlock `json:"image_url,omitempty"` +} + +type ImageBlock struct { + Url string `json:"url,omitempty"` +} + +// UnmarshalJSON accepts both content formats: a raw string or a list of structured blocks +func (mc *Content) UnmarshalJSON(data []byte) error { + // Raw format + var str string + if err := json.Unmarshal(data, &str); err == nil { + mc.Raw = str + return nil + } + + // Block format + var blocks []ContentBlock + if err := json.Unmarshal(data, &blocks); err == nil { + mc.Structured = blocks + return nil + } + + return errors.New("content format not supported") +} + +func (mc Content) MarshalJSON() ([]byte, error) { + if mc.Raw != "" { + return json.Marshal(mc.Raw) + } + if mc.Structured != nil { + return json.Marshal(mc.Structured) + } + return json.Marshal("") +} + +func (mc Content) PlainText() string { + if mc.Raw != "" { + return mc.Raw + } + var sb strings.Builder + for _, block := range mc.Structured { + if block.Type == "text" { + sb.WriteString(block.Text) + sb.WriteString(" ") + } + } + return sb.String() } type Pod interface { @@ -69,8 +215,9 @@ type ScoredPod struct { func (pm *PodMetrics) String() string { if pm == nil { - return "" + return nilString } + return fmt.Sprintf("%+v", *pm) } diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index de3cb1023..361fff080 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -43,6 +43,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" latencypredictor "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/latencypredictorasync" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector" ) // ExtProcServerRunner provides methods to manage an external process server.
@@ -57,7 +58,7 @@ type ExtProcServerRunner struct { RefreshPrometheusMetricsInterval time.Duration MetricsStalenessThreshold time.Duration Director *requestcontrol.Director - SaturationDetector requestcontrol.SaturationDetector + SaturationDetector *saturationdetector.Detector UseExperimentalDatalayerV2 bool // Pluggable data layer feature flag LatencyPredictor latencypredictor.PredictorInterface @@ -84,6 +85,7 @@ const ( DefaultTotalRunningRequestsMetric = "vllm:num_requests_running" // default for --total-running-requests-metric DefaultKvCacheUsagePercentageMetric = "vllm:gpu_cache_usage_perc" // default for --kv-cache-usage-percentage-metric DefaultLoraInfoMetric = "vllm:lora_requests_info" // default for --lora-info-metric + DefaultCacheInfoMetric = "vllm:cache_config_info" // default for --cache-info-metric DefaultCertPath = "" // default for --cert-path DefaultConfigFile = "" // default for --config-file DefaultConfigText = "" // default for --config-text @@ -162,8 +164,7 @@ func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { cert, err = tlsutil.CreateSelfSignedTLSCertificate(logger) } if err != nil { - logger.Error(err, "Failed to create self signed certificate") - return err + return fmt.Errorf("failed to create self signed certificate - %w", err) } creds := credentials.NewTLS(&tls.Config{ diff --git a/pkg/epp/server/server_test.go b/pkg/epp/server/server_test.go index 175406400..bdff7b527 100644 --- a/pkg/epp/server/server_test.go +++ b/pkg/epp/server/server_test.go @@ -27,7 +27,6 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metadata" testutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" @@ -55,7 +54,6 @@ func TestServer(t *testing.T) { director := &testDirector{} ctx, cancel, ds, _ := utils.PrepareForTestStreamingServer([]*v1alpha2.InferenceObjective{model}, []*v1.Pod{{ObjectMeta: metav1.ObjectMeta{Name: podName}}}, "test-pool1", namespace, poolPort) - streamingServer := handlers.NewStreamingServer(ds, director) testListener, errChan := utils.SetupTestStreamingServer(t, ctx, ds, streamingServer) @@ -174,11 +172,6 @@ type testDirector struct { requestHeaders map[string]string } -// GetDatastore implements handlers.Director. 
-func (ts *testDirector) GetDatastore() datastore.Datastore { - panic("unimplemented") -} - func (ts *testDirector) HandleRequest(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { ts.requestHeaders = reqCtx.Request.Headers @@ -187,22 +180,15 @@ func (ts *testDirector) HandleRequest(ctx context.Context, reqCtx *handlers.Requ return reqCtx, nil } -func (ts *testDirector) HandleResponse(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { +func (ts *testDirector) HandleResponseReceived(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { return reqCtx, nil } -func (ts *testDirector) HandleResponseBodyChunk(ctx context.Context, reqCtx *handlers.RequestContext) error { - // Implement logic for handling response body chunk if needed - return nil -} - -func (ts *testDirector) HandleResponseBodyComplete(ctx context.Context, reqCtx *handlers.RequestContext) error { - // Implement logic for handling response body chunk if needed - return nil +func (ts *testDirector) HandleResponseBodyStreaming(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { + return reqCtx, nil } -func (ts *testDirector) HandleResponseTrailers(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { - // Implement logic for handling response body chunk if needed +func (ts *testDirector) HandleResponseBodyComplete(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { return reqCtx, nil } diff --git a/pkg/epp/util/request/body.go b/pkg/epp/util/request/body.go index 46de1fa54..07877415f 100644 --- a/pkg/epp/util/request/body.go +++ b/pkg/epp/util/request/body.go @@ -17,70 +17,43 @@ limitations under the License. package request import ( - "fmt" + "encoding/json" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" ) -func ExtractPromptFromRequestBody(body map[string]any) (string, error) { - if _, ok := body["messages"]; ok { - return extractPromptFromMessagesField(body) +// ExtractRequestBody extracts the LLMRequestBody from the given request body map. 
+func ExtractRequestBody(rawBody map[string]any) (*types.LLMRequestBody, error) { + // Convert map back to JSON bytes + jsonBytes, err := json.Marshal(rawBody) + if err != nil { + return nil, errutil.Error{Code: errutil.BadRequest, Msg: "invalid request body"} } - return extractPromptField(body) -} -func extractPromptField(body map[string]any) (string, error) { - prompt, ok := body["prompt"] - if !ok { - return "", errutil.Error{Code: errutil.BadRequest, Msg: "prompt not found in request"} - } - promptStr, ok := prompt.(string) - if !ok { - return "", errutil.Error{Code: errutil.BadRequest, Msg: "prompt is not a string"} + // Try completions request first + var completions types.CompletionsRequest + if err = json.Unmarshal(jsonBytes, &completions); err == nil && completions.Prompt != "" { + return &types.LLMRequestBody{Completions: &completions}, nil } - return promptStr, nil -} -func extractPromptFromMessagesField(body map[string]any) (string, error) { - messages, ok := body["messages"] - if !ok { - return "", errutil.Error{Code: errutil.BadRequest, Msg: "messages not found in request"} - } - messageList, ok := messages.([]any) - if !ok { - return "", errutil.Error{Code: errutil.BadRequest, Msg: "messages is not a list"} - } - if len(messageList) == 0 { - return "", errutil.Error{Code: errutil.BadRequest, Msg: "messages is empty"} + // Try chat completions + var chatCompletions types.ChatCompletionsRequest + if err = json.Unmarshal(jsonBytes, &chatCompletions); err != nil { + return nil, errutil.Error{Code: errutil.BadRequest, Msg: "invalid request format"} } - prompt := "" - for _, msg := range messageList { - msgMap, ok := msg.(map[string]any) - if !ok { - continue - } - content, ok := msgMap["content"] - if !ok { - continue - } - contentStr, ok := content.(string) - if !ok { - continue - } - role, ok := msgMap["role"] - if !ok { - continue - } - roleStr, ok := role.(string) - if !ok { - continue - } - prompt += constructChatMessage(roleStr, contentStr) + if err = validateChatCompletionsMessages(chatCompletions.Messages); err != nil { + return nil, errutil.Error{Code: errutil.BadRequest, Msg: "invalid chat-completions request: " + err.Error()} } - return prompt, nil + + return &types.LLMRequestBody{ChatCompletions: &chatCompletions}, nil } -func constructChatMessage(role string, content string) string { - return fmt.Sprintf("<|im_start|>%s\n%s<|im_end|>\n", role, content) +func validateChatCompletionsMessages(messages []types.Message) error { + if len(messages) == 0 { + return errutil.Error{Code: errutil.BadRequest, Msg: "chat-completions request must have at least one message"} + } + + return nil } diff --git a/pkg/epp/util/request/body_test.go b/pkg/epp/util/request/body_test.go index ce5a93921..5f5ec5c23 100644 --- a/pkg/epp/util/request/body_test.go +++ b/pkg/epp/util/request/body_test.go @@ -18,16 +18,30 @@ package request import ( "testing" + + "github.com/google/go-cmp/cmp" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" ) -func TestExtractPromptFromRequestBody(t *testing.T) { +func TestExtractRequestData(t *testing.T) { tests := []struct { name string body map[string]any - want string + want *types.LLMRequestBody wantErr bool - errType error }{ + { + name: "completions request body", + body: map[string]any{ + "model": "test", + "prompt": "test prompt", + }, + want: &types.LLMRequestBody{ + Completions: &types.CompletionsRequest{ + Prompt: "test prompt", + }, + }, + }, { name: "chat completions request body", body: map[string]any{ @@ -39,169 +53,339 @@ 
func TestExtractPromptFromRequestBody(t *testing.T) { map[string]any{ "role": "user", "content": "hello", }, + }, + }, + want: &types.LLMRequestBody{ + ChatCompletions: &types.ChatCompletionsRequest{ + Messages: []types.Message{ + {Role: "system", Content: types.Content{Raw: "this is a system message"}}, + {Role: "user", Content: types.Content{Raw: "hello"}}, + }, + }, + }, + }, + { + name: "chat completions request body with multi-modal content", + body: map[string]any{ + "model": "test", + "messages": []any{ map[string]any{ - "role": "assistant", "content": "hi, what can I do for you?", + "role": "system", + "content": []map[string]any{ + { + "type": "text", + "text": "Describe this image in one sentence.", + }, + }, + }, + map[string]any{ + "role": "user", + "content": []map[string]any{ + { + "type": "image_url", + "image_url": map[string]any{ + "url": "https://example.com/images/dui.jpg.", + }, + }, + }, + }, + }, + }, + want: &types.LLMRequestBody{ + ChatCompletions: &types.ChatCompletionsRequest{ + Messages: []types.Message{ + {Role: "system", Content: types.Content{ + Structured: []types.ContentBlock{ + { + Text: "Describe this image in one sentence.", + Type: "text", + }, + }, + }}, + {Role: "user", Content: types.Content{ + Structured: []types.ContentBlock{ + { + Type: "image_url", + ImageURL: types.ImageBlock{Url: "https://example.com/images/dui.jpg."}, + }, + }, + }}, }, }, }, - want: "<|im_start|>system\nthis is a system message<|im_end|>\n" + - "<|im_start|>user\nhello<|im_end|>\n" + - "<|im_start|>assistant\nhi, what can I do for you?<|im_end|>\n", }, { - name: "completions request body", + name: "chat completions with all optional fields", body: map[string]any{ - "model": "test", - "prompt": "test prompt", + "model": "test", + "messages": []any{ + map[string]any{"role": "user", "content": "hello"}, + }, + "tools": []any{map[string]any{"type": "function"}}, + "documents": []any{map[string]any{"content": "doc"}}, + "chat_template": "custom template", + "return_assistant_tokens_mask": true, + "continue_final_message": true, + "add_generation_prompt": true, + "chat_template_kwargs": map[string]any{"key": "value"}, }, - want: "test prompt", + want: &types.LLMRequestBody{ + ChatCompletions: &types.ChatCompletionsRequest{ + Messages: []types.Message{{Role: "user", Content: types.Content{Raw: "hello"}}}, + Tools: []any{map[string]any{"type": "function"}}, + Documents: []any{map[string]any{"content": "doc"}}, + ChatTemplate: "custom template", + ReturnAssistantTokensMask: true, + ContinueFinalMessage: true, + AddGenerationPrompt: true, + ChatTemplateKWArgs: map[string]any{"key": "value"}, + }, + }, + }, + { + name: "nil body", + body: nil, + wantErr: true, }, { name: "invalid prompt format", + body: map[string]any{ + "model": "test", + "prompt": 123, + }, + wantErr: true, + }, + { + name: "invalid messages format", + body: map[string]any{ + "model": "test", + "messages": "invalid", + }, + wantErr: true, + }, + { + name: "neither prompt nor messages", body: map[string]any{ "model": "test", - "prompt": []any{ - map[string]any{ - "role": "system", "content": "this is a system message", - }, - map[string]any{ - "role": "user", "content": "hello", - }, - map[string]any{ - "role": "assistant", "content": "hi, what can I", - }, + }, + wantErr: true, + }, + { + name: "empty messages array", + body: map[string]any{ + "model": "test", + "messages": []any{}, + }, + wantErr: true, + }, + { + name: "message with non-string role", + body: map[string]any{ + "model": "test", + "messages": []any{ + 
map[string]any{"role": 123, "content": "hello"}, }, }, wantErr: true, }, { - name: "invalid messaged format", + name: "message with non-string content", body: map[string]any{ "model": "test", - "messages": map[string]any{ - "role": "system", "content": "this is a system message", + "messages": []any{ + map[string]any{"role": "user", "content": 123}, }, }, wantErr: true, }, { - name: "prompt does not exist", + name: "invalid tools format", body: map[string]any{ "model": "test", + "messages": []any{ + map[string]any{"role": "user", "content": "hello"}, + }, + "tools": "invalid", }, wantErr: true, }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := ExtractPromptFromRequestBody(tt.body) - if (err != nil) != tt.wantErr { - t.Errorf("ExtractPromptFromRequestBody() error = %v, wantErr %v", err, tt.wantErr) - return - } - if got != tt.want { - t.Errorf("ExtractPromptFromRequestBody() got = %v, want %v", got, tt.want) - } - }) - } -} - -func TestExtractPromptField(t *testing.T) { - tests := []struct { - name string - body map[string]any - want string - wantErr bool - }{ { - name: "valid prompt", + name: "invalid documents format", body: map[string]any{ - "prompt": "test prompt", + "model": "test", + "messages": []any{ + map[string]any{"role": "user", "content": "hello"}, + }, + "documents": "invalid", }, - want: "test prompt", + wantErr: true, }, { - name: "prompt not found", - body: map[string]any{}, + name: "invalid chat_template format", + body: map[string]any{ + "model": "test", + "messages": []any{ + map[string]any{"role": "user", "content": "hello"}, + }, + "chat_template": 123, + }, wantErr: true, }, { - name: "non-string prompt", + name: "invalid return_assistant_tokens_mask format", body: map[string]any{ - "prompt": 123, + "model": "test", + "messages": []any{ + map[string]any{"role": "user", "content": "hello"}, + }, + "return_assistant_tokens_mask": "invalid", }, wantErr: true, }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := extractPromptField(tt.body) - if (err != nil) != tt.wantErr { - t.Errorf("extractPromptField() error = %v, wantErr %v", err, tt.wantErr) - return - } - if got != tt.want { - t.Errorf("extractPromptField() got = %v, want %v", got, tt.want) - } - }) - } -} - -func TestExtractPromptFromMessagesField(t *testing.T) { - tests := []struct { - name string - body map[string]any - want string - wantErr bool - }{ { - name: "valid messages", + name: "invalid continue_final_message format", body: map[string]any{ + "model": "test", "messages": []any{ - map[string]any{"role": "user", "content": "test1"}, - map[string]any{"role": "assistant", "content": "test2"}, + map[string]any{"role": "user", "content": "hello"}, }, + "continue_final_message": "invalid", }, - want: "<|im_start|>user\ntest1<|im_end|>\n<|im_start|>assistant\ntest2<|im_end|>\n", + wantErr: true, }, { - name: "invalid messages format", + name: "invalid add_generation_prompt format", body: map[string]any{ - "messages": "invalid", + "model": "test", + "messages": []any{ + map[string]any{"role": "user", "content": "hello"}, + }, + "add_generation_prompt": "invalid", }, wantErr: true, }, + { + name: "invalid chat_template_kwargs format", + body: map[string]any{ + "model": "test", + "messages": []any{ + map[string]any{"role": "user", "content": "hello"}, + }, + "chat_template_kwargs": "invalid", + }, + wantErr: true, + }, + { + name: "completions request with cache_salt", + body: map[string]any{ + "model": "test", + "prompt": "test prompt", + 
"cache_salt": "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==", + }, + want: &types.LLMRequestBody{ + Completions: &types.CompletionsRequest{ + Prompt: "test prompt", + CacheSalt: "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==", + }, + }, + }, + { + name: "chat completions request with cache_salt", + body: map[string]any{ + "model": "test", + "messages": []any{ + map[string]any{ + "role": "system", "content": "this is a system message", + }, + map[string]any{ + "role": "user", "content": "hello", + }, + }, + "cache_salt": "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==", + }, + want: &types.LLMRequestBody{ + ChatCompletions: &types.ChatCompletionsRequest{ + Messages: []types.Message{ + {Role: "system", Content: types.Content{Raw: "this is a system message"}}, + {Role: "user", Content: types.Content{Raw: "hello"}}, + }, + CacheSalt: "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==", + }, + }, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := extractPromptFromMessagesField(tt.body) + got, err := ExtractRequestBody(tt.body) if (err != nil) != tt.wantErr { - t.Errorf("extractPromptFromMessagesField() error = %v, wantErr %v", err, tt.wantErr) + t.Errorf("ExtractRequestBody() error = %v, wantErr %v", err, tt.wantErr) + return + } + if tt.wantErr { return } - if got != tt.want { - t.Errorf("extractPromptFromMessagesField() got = %v, want %v", got, tt.want) + + if diff := cmp.Diff(tt.want, got); diff != "" { + t.Errorf("ExtractRequestBody() mismatch (-want +got):\n%s", diff) } }) } } -func TestConstructChatMessage(t *testing.T) { - tests := []struct { - role string - content string - want string - }{ - {"user", "hello", "<|im_start|>user\nhello<|im_end|>\n"}, - {"assistant", "hi", "<|im_start|>assistant\nhi<|im_end|>\n"}, +// Benchmark tests for performance comparison +func BenchmarkExtractRequestData_Completions(b *testing.B) { + body := map[string]any{ + "model": "test", + "prompt": "test prompt", } - for _, tt := range tests { - if got := constructChatMessage(tt.role, tt.content); got != tt.want { - t.Errorf("constructChatMessage() = %v, want %v", got, tt.want) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := ExtractRequestBody(body) + if err != nil { + b.Fatal(err) + } + } +} + +func BenchmarkExtractRequestData_ChatCompletions(b *testing.B) { + body := map[string]any{ + "model": "test", + "messages": []any{ + map[string]any{"role": "user", "content": "hello"}, + }, + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := ExtractRequestBody(body) + if err != nil { + b.Fatal(err) + } + } +} + +func BenchmarkExtractRequestData_ChatCompletionsWithOptionals(b *testing.B) { + body := map[string]any{ + "model": "test", + "messages": []any{ + map[string]any{"role": "user", "content": "hello"}, + }, + "tools": []any{map[string]any{"type": "function"}}, + "documents": []any{map[string]any{"content": "doc"}}, + "chat_template": "custom template", + "return_assistant_tokens_mask": true, + "continue_final_message": true, + "add_generation_prompt": true, + "chat_template_kwargs": map[string]any{"key": "value"}, + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := ExtractRequestBody(body) + if err != nil { + b.Fatal(err) } } } diff --git a/pkg/epp/util/request/sheddable.go b/pkg/epp/util/request/sheddable.go new file mode 100644 index 000000000..c2f32c1f2 --- /dev/null +++ b/pkg/epp/util/request/sheddable.go @@ -0,0 +1,22 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package request + +// IsSheddable determines if a request is considered sheddable based on its priority. +func IsSheddable(priority int) bool { + return priority < 0 +} diff --git a/pkg/epp/util/testing/wrappers.go b/pkg/epp/util/testing/wrappers.go index 9e7f4a17b..7621bff96 100644 --- a/pkg/epp/util/testing/wrappers.go +++ b/pkg/epp/util/testing/wrappers.go @@ -179,7 +179,11 @@ func MakeInferencePool(name string) *InferencePoolWrapper { APIVersion: "inference.networking.k8s.io/v1", Kind: "InferencePool", }, - Spec: v1.InferencePoolSpec{}, + Spec: v1.InferencePoolSpec{ + TargetPorts: []v1.Port{ + {Number: 8000}, + }, + }, }, } } diff --git a/pkg/generator/main.go b/pkg/generator/main.go index 3cb70a508..c663588a9 100644 --- a/pkg/generator/main.go +++ b/pkg/generator/main.go @@ -35,6 +35,7 @@ import ( func main() { roots, err := loader.LoadRoots( "k8s.io/apimachinery/pkg/runtime/schema", // Needed to parse generated register functions. + "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha1", "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2", "sigs.k8s.io/gateway-api-inference-extension/api/v1", ) diff --git a/site-src/_includes/bbr.md b/site-src/_includes/bbr.md new file mode 100644 index 000000000..bfbbfd4c7 --- /dev/null +++ b/site-src/_includes/bbr.md @@ -0,0 +1,3 @@ +### Deploy the Body Based Router Extension (Optional) + +This guide has shown how to get started with serving a single base model type per L7 URL path. If you wish to continue on to model-aware routing, where more than one base model is served at the same L7 URL path, you will need the optional Body Based Routing (BBR) extension, which is described in the [`Serving Multiple GenAI Models`](serve-multiple-genai-models.md) section of the documentation. To try it out, retain the setup you have deployed so far from this guide and move on to the additional steps described in [that guide](serve-multiple-genai-models.md); otherwise, move on to the following section to clean up your setup.
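The `IsSheddable` helper above encodes the convention, described later in this diff on the priority-and-capacity page, that only requests with negative priority are shed. A minimal sketch of how an admission check might combine it with a saturation signal; `admit` and `poolSaturated` are illustrative names, not part of the diff:

```go
package main

import (
	"fmt"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/request"
)

// admit decides whether a request may proceed: when the pool is saturated,
// only sheddable (negative-priority) requests are rejected.
func admit(priority int, poolSaturated bool) bool {
	if poolSaturated && request.IsSheddable(priority) {
		return false
	}
	return true
}

func main() {
	fmt.Println(admit(-1, true))  // false: sheddable and the pool is saturated
	fmt.Println(admit(0, true))   // true: the default priority (0) is never shed
	fmt.Println(admit(-1, false)) // true: nothing is shed while there is capacity
}
```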
diff --git a/site-src/_includes/epp-latest.md b/site-src/_includes/epp-latest.md new file mode 100644 index 000000000..9f9a4e265 --- /dev/null +++ b/site-src/_includes/epp-latest.md @@ -0,0 +1,43 @@ +=== "GKE" + + ```bash + export GATEWAY_PROVIDER=gke + helm install vllm-llama3-8b-instruct \ + --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ + --set provider.name=$GATEWAY_PROVIDER \ + --version $IGW_CHART_VERSION \ + oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool + ``` + +=== "Istio" + + ```bash + export GATEWAY_PROVIDER=istio + helm install vllm-llama3-8b-instruct \ + --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ + --set provider.name=$GATEWAY_PROVIDER \ + --version $IGW_CHART_VERSION \ + oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool + ``` + +=== "Kgateway" + + ```bash + export GATEWAY_PROVIDER=none + helm install vllm-llama3-8b-instruct \ + --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ + --set provider.name=$GATEWAY_PROVIDER \ + --version $IGW_CHART_VERSION \ + oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool + ``` + +=== "Agentgateway" + + ```bash + export GATEWAY_PROVIDER=none + helm install vllm-llama3-8b-instruct \ + --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ + --set provider.name=$GATEWAY_PROVIDER \ + --version $IGW_CHART_VERSION \ + oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool + ``` diff --git a/site-src/_includes/epp.md b/site-src/_includes/epp.md new file mode 100644 index 000000000..69c5f42c0 --- /dev/null +++ b/site-src/_includes/epp.md @@ -0,0 +1,43 @@ +=== "GKE" + + ```bash + export GATEWAY_PROVIDER=gke + helm install vllm-llama3-8b-instruct \ + --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ + --set provider.name=$GATEWAY_PROVIDER \ + --version $IGW_CHART_VERSION \ + oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + ``` + +=== "Istio" + + ```bash + export GATEWAY_PROVIDER=istio + helm install vllm-llama3-8b-instruct \ + --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ + --set provider.name=$GATEWAY_PROVIDER \ + --version $IGW_CHART_VERSION \ + oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + ``` + +=== "Kgateway" + + ```bash + export GATEWAY_PROVIDER=none + helm install vllm-llama3-8b-instruct \ + --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ + --set provider.name=$GATEWAY_PROVIDER \ + --version $IGW_CHART_VERSION \ + oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + ``` + +=== "Agentgateway" + + ```bash + export GATEWAY_PROVIDER=none + helm install vllm-llama3-8b-instruct \ + --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ + --set provider.name=$GATEWAY_PROVIDER \ + --version $IGW_CHART_VERSION \ + oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + ``` diff --git a/site-src/_includes/infobj.md b/site-src/_includes/infobj.md new file mode 100644 index 000000000..c71b498bf --- /dev/null +++ b/site-src/_includes/infobj.md @@ -0,0 +1,5 @@ +??? example "Experimental" + + This project is still in an alpha state and breaking changes may occur in the future. 
+ +This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get an Inference Gateway up and running! diff --git a/site-src/_includes/intro.md b/site-src/_includes/intro.md new file mode 100644 index 000000000..c71b498bf --- /dev/null +++ b/site-src/_includes/intro.md @@ -0,0 +1,5 @@ +??? example "Experimental" + + This project is still in an alpha state and breaking changes may occur in the future. + +This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get an Inference Gateway up and running! diff --git a/site-src/_includes/model-server-cpu.md b/site-src/_includes/model-server-cpu.md new file mode 100644 index 000000000..c2cf00b4d --- /dev/null +++ b/site-src/_includes/model-server-cpu.md @@ -0,0 +1,14 @@ +=== "CPU-Based Model Server" + + ???+ warning + + CPU deployment can be unreliable, i.e., the pods may crash/restart because of resource constraints. + + This setup uses the official `vllm-cpu` image, which according to the documentation can run vLLM on the x86 CPU platform. + For this setup, we use approximately 9.5GB of memory and 12 CPUs for each replica. + + While it is possible to deploy the model server with fewer resources, this is not recommended. For example, in our tests, loading the model using 8GB of memory and 1 CPU was possible but took almost 3.5 minutes, and inference requests took an unreasonably long time. In general, there is a tradeoff between the memory and CPU we allocate to our pods and the performance. The more memory and CPU we allocate, the better the performance we can get. + + After running multiple configurations of these values, we decided in this sample to use 9.5GB of memory and 12 CPUs for each replica, which gives reasonable response times. You can increase those numbers and may get even better response times. To modify the allocated resources, adjust the numbers in [cpu-deployment.yaml](https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml) as needed. + + Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. diff --git a/site-src/_includes/model-server-gpu.md b/site-src/_includes/model-server-gpu.md new file mode 100644 index 000000000..f2d6e6bbf --- /dev/null +++ b/site-src/_includes/model-server-gpu.md @@ -0,0 +1,7 @@ +=== "GPU-Based Model Server" + + For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas as needed. + Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). + Ensure that the token grants access to this model. + + Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. diff --git a/site-src/_includes/model-server-sim.md b/site-src/_includes/model-server-sim.md new file mode 100644 index 000000000..3c1a7a4b8 --- /dev/null +++ b/site-src/_includes/model-server-sim.md @@ -0,0 +1,6 @@ +=== "vLLM Simulator Model Server" + + This option uses the [vLLM simulator](https://github.com/llm-d/llm-d-inference-sim/tree/main) to simulate a backend model server. + This setup uses the least amount of compute resources, does not require GPUs, and is ideal for test/dev environments. + + To deploy the vLLM simulator, run the following command.
diff --git a/site-src/_includes/model-server.md b/site-src/_includes/model-server.md new file mode 100644 index 000000000..47d8e54dc --- /dev/null +++ b/site-src/_includes/model-server.md @@ -0,0 +1,19 @@ + Three options are supported for running the model server: + + 1. GPU-based model server. + Requirements: a Hugging Face access token that grants access to the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). + + 1. CPU-based model server (not using GPUs). + The sample uses the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct). + + 1. [vLLM Simulator](https://github.com/llm-d/llm-d-inference-sim/tree/main) model server (not using GPUs). + The sample is configured to simulate the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model. + + Choose one of these options and follow the steps below. Please do not deploy more than one, as the deployments have the same name and will override each other. + +=== "GPU-Based Model Server" + + For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/gpu-deployment.yaml` as needed. + Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). Ensure that the token grants access to this model. + + Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. diff --git a/site-src/_includes/prereqs.md b/site-src/_includes/prereqs.md new file mode 100644 index 000000000..0c655a110 --- /dev/null +++ b/site-src/_includes/prereqs.md @@ -0,0 +1,11 @@ +A cluster with: + +- Support for one of the three most recent Kubernetes minor [releases](https://kubernetes.io/releases/). +- Support for services of type `LoadBalancer`. For kind clusters, follow [this guide](https://kind.sigs.k8s.io/docs/user/loadbalancer) + to get services of type LoadBalancer working. +- Support for [sidecar containers](https://kubernetes.io/docs/concepts/workloads/pods/sidecar-containers/) (enabled by default since Kubernetes v1.29) + to run the model server deployment. + +Tooling: + +- [Helm](https://helm.sh/docs/intro/install/) installed. diff --git a/site-src/_includes/test.md b/site-src/_includes/test.md new file mode 100644 index 000000000..0fbdd2cdd --- /dev/null +++ b/site-src/_includes/test.md @@ -0,0 +1,15 @@ +### Try it out + + Wait until the gateway is ready. + + ```bash + IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}') + PORT=80 + + curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ + "model": "food-review-1", + "prompt": "Write as if you were a critic: San Francisco", + "max_tokens": 100, + "temperature": 0 + }' + ``` diff --git a/site-src/api-types/inferencemodel.md b/site-src/api-types/inferencemodel.md deleted file mode 100644 index 54fe57397..000000000 --- a/site-src/api-types/inferencemodel.md +++ /dev/null @@ -1,19 +0,0 @@ -# Inference Model - -??? example "Alpha since v0.1.0" - - The `InferenceModel` resource is alpha and may have breaking changes in - future releases of the API. - -## Background - -An InferenceModel allows the Inference Workload Owner to define: - -- Which Model/LoRA adapter(s) to consume. - - Mapping from a client facing model name to the target model name in the InferencePool. 
- - InferenceModel allows for traffic splitting between adapters _in the same InferencePool_ to allow for new LoRA adapter versions to be easily rolled out. -- Criticality of the requests to the InferenceModel. - -## Spec - -The full spec of the InferenceModel is defined [here](/reference/spec/#inferencemodel). \ No newline at end of file diff --git a/site-src/api-types/inferenceobjective.md b/site-src/api-types/inferenceobjective.md new file mode 100644 index 000000000..8c48651d6 --- /dev/null +++ b/site-src/api-types/inferenceobjective.md @@ -0,0 +1,18 @@ +# Inference Objective + +??? example "Alpha since v1.0.0" + + The `InferenceObjective` resource is alpha and may have breaking changes in + future releases of the API. + +## Background + +The **InferenceObjective** API defines the serving objectives of the specific request it is associated with. This CRD currently houses only `Priority` but will be expanded to include fields such as SLO attainment. + +## Usage + +To associate a request to the InferencePool with a specific InferenceObjective, the system uses the `x-gateway-inference-objective` header, with its value set to the InferenceObjective's metadata name. The calling client must therefore set this header on the request to select an InferenceObjective. If no InferenceObjective is selected, default values are used. + +## Spec + +The full spec of the InferenceObjective is defined [here](/reference/x-v1a2-spec/#inferenceobjective). diff --git a/site-src/api-types/inferencepool.md b/site-src/api-types/inferencepool.md index c4481b1ad..8922d0d11 100644 --- a/site-src/api-types/inferencepool.md +++ b/site-src/api-types/inferencepool.md @@ -1,9 +1,8 @@ # Inference Pool -??? example "Alpha since v0.1.0" +??? success example "GA since v1.0.0" - The `InferencePool` resource is alpha and may have breaking changes in - future releases of the API. + The `InferencePool` resource has been graduated to v1 and is considered stable. ## Background diff --git a/site-src/api-types/inferencepoolimport.md b/site-src/api-types/inferencepoolimport.md new file mode 100644 index 000000000..aa58247b2 --- /dev/null +++ b/site-src/api-types/inferencepoolimport.md @@ -0,0 +1,26 @@ +# Inference Pool Import + +??? example "Alpha since v1.1.0" + + The `InferencePoolImport` resource is alpha and may have breaking changes in + future releases of the API. + +## Background + +The **InferencePoolImport** API is a cluster-local, controller-managed resource that represents an imported InferencePool. +It primarily communicates a relationship between an exported InferencePool and the exporting cluster name. It is not +user-authored; its status carries the effective import. Inference Platform Owners can reference the InferencePoolImport, +even if the local cluster does not have an InferencePool. In the context of Gateway API, this means that an HTTPRoute can +be configured to reference an InferencePoolImport to route matching requests to endpoints of backing InferencePools. + +Key ideas: + +- Map an exported InferencePool to its exporting controller and cluster. +- Name/namespace sameness with the exported InferencePool (avoids extra indirection). +- Conditions: Surface a controller-level status condition to indicate whether the InferencePoolImport is ready for use. +- Conditions: Surface parent-level status conditions to indicate whether the InferencePoolImport is referenced by a parent, + e.g. a Gateway.
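Picking up the InferenceObjective usage note above: the association is carried entirely by a request header, so a client in any language can opt in. A minimal Go sketch; the gateway URL and the objective name `high-priority` are placeholders:

```go
package main

import (
	"fmt"
	"net/http"
	"strings"
)

func main() {
	body := strings.NewReader(`{"model":"food-review-1","prompt":"hi","max_tokens":10}`)
	req, err := http.NewRequest(http.MethodPost, "http://inference-gateway/v1/completions", body)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	// Select the InferenceObjective named "high-priority" (a placeholder);
	// omitting the header falls back to the default values.
	req.Header.Set("x-gateway-inference-objective", "high-priority")
	fmt.Println(req.Header.Get("x-gateway-inference-objective"))
}
```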
+ +## Spec + +The full spec of the InferencePoolImport is defined [here](/reference/x-v1a1-spec/#inferencepoolimport). diff --git a/site-src/concepts/api-overview.md b/site-src/concepts/api-overview.md index ab07a1d2d..01ee25431 100644 --- a/site-src/concepts/api-overview.md +++ b/site-src/concepts/api-overview.md @@ -23,6 +23,6 @@ each aligning with a specific user persona in the Generative AI serving workflow InferencePool represents a set of Inference-focused Pods and an extension that will be used to route to them. Within the broader Gateway API resource model, this resource is considered a "backend". In practice, that means that you'd replace a Kubernetes Service with an InferencePool. This resource has some similarities to Service (a way to select Pods and specify a port), but has some unique capabilities. With InferencePool, you can configure a routing extension as well as inference-specific routing optimizations. For more information on this resource, refer to our [InferencePool documentation](/api-types/inferencepool) or go directly to the [InferencePool spec](/reference/spec/#inferencepool). -### InferenceModel +### InferenceObjective -An InferenceModel represents a model or adapter, and configuration associated with that model. This resource enables you to configure the relative criticality of a model, and allows you to seamlessly translate the requested model name to one or more backend model names. Multiple InferenceModels can be attached to an InferencePool. For more information on this resource, refer to our [InferenceModel documentation](/api-types/inferencemodel) or go directly to the [InferenceModel spec](/reference/spec/#inferencemodel). +An InferenceObjective represents the objectives of a specific request. A single InferenceObjective is associated with a request, and multiple requests with different InferenceObjectives can be attached to an InferencePool. For more information on this resource, refer to our [InferenceObjective documentation](/api-types/inferenceobjective) or go directly to the [InferenceObjective spec](/reference/spec/#inferenceobjective). diff --git a/site-src/concepts/priority-and-capacity.md b/site-src/concepts/priority-and-capacity.md new file mode 100644 index 000000000..367aebfdf --- /dev/null +++ b/site-src/concepts/priority-and-capacity.md @@ -0,0 +1,17 @@ +# Priority and Capacity + +The InferenceObjective defines `Priority`, which describes how requests interact with each other. Priority naturally interacts with total pool capacity, and properly understanding and configuring these behaviors is important in allowing a pool to handle requests of different priorities. + +## Priority (in flow control) + +It should be noted that priority is currently only used in [Capacity](#capacity); the description below is how Priority will be consumed in the `Flow Control` model. + +Priority is a simple stack rank; the higher the number, the higher the priority. Should no priority for a request be specified, the default value is zero. Requests of higher priority are _always_ selected first when requests are queued. Requests of equal priority currently operate on an FCFS (first-come, first-served) basis. + +## Capacity + +The current capacity model uses configurable [thresholds](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/35b14a10a9830d1a9e3850913539066ebc8fb317/pkg/epp/saturationdetector/saturationdetector.go#L49) to determine if the entire pool is saturated. The calculation simply iterates through each endpoint in the pool; only if all endpoints are above all thresholds is the pool considered `saturated`.
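A sketch of that capacity check, with a simplified endpoint type and hypothetical threshold values; the real logic lives in the `saturationdetector` package linked above:

```go
package main

import "fmt"

// endpointMetrics is a simplified stand-in for the per-endpoint metrics
// the saturation detector inspects.
type endpointMetrics struct {
	WaitingQueueSize    int
	KVCacheUsagePercent float64
}

// poolSaturated reports saturation only when every endpoint is above all
// thresholds; a single endpoint with headroom keeps the pool unsaturated.
func poolSaturated(endpoints []endpointMetrics, maxQueue int, maxKVCache float64) bool {
	for _, ep := range endpoints {
		if ep.WaitingQueueSize <= maxQueue || ep.KVCacheUsagePercent <= maxKVCache {
			return false
		}
	}
	return len(endpoints) > 0
}

func main() {
	pool := []endpointMetrics{
		{WaitingQueueSize: 12, KVCacheUsagePercent: 0.97},
		{WaitingQueueSize: 3, KVCacheUsagePercent: 0.41}, // headroom here
	}
	fmt.Println(poolSaturated(pool, 5, 0.9)) // false: one endpoint still has headroom
}
```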
In the event of saturation, all requests with a negative priority will be rejected, and other requests will be scheduled and queued on the model servers. + +## Future work + +The Flow Control system is nearing completion and will add more nuance to the Priority and Capacity model: proper priority enforcement, more articulate capacity tracking, queuing at the Inference Gateway level, etc. This documentation will be updated when the Flow Control implementation is complete. \ No newline at end of file diff --git a/site-src/concepts/roles-and-personas.md b/site-src/concepts/roles-and-personas.md index 0746adbfb..f1d17a59d 100644 --- a/site-src/concepts/roles-and-personas.md +++ b/site-src/concepts/roles-and-personas.md @@ -17,7 +17,7 @@ The Inference Platform Admin creates and manages the infrastructure necessary to An Inference Workload Owner persona owns and manages one or many Generative AI Workloads (LLM focused *currently*). This includes: -- Defining criticality +- Defining priority - Managing fine-tunes - LoRA Adapters - System Prompts diff --git a/site-src/contributing/index.md b/site-src/contributing/index.md index 7f56b828e..eb5da1710 100644 --- a/site-src/contributing/index.md +++ b/site-src/contributing/index.md @@ -46,4 +46,4 @@ doc. Feel free to add topics for discussion at an upcoming meeting. All meetings are recorded and automatically uploaded to the [WG-Serving meetings YouTube -playlist](https://www.youtube.com/playlist?list=PL69nYSiGNLP30qNanabU75ayPK7OPNAAS). +playlist](https://www.youtube.com/playlist?list=PL69nYSiGNLP2io2Gg92njBfh-DX9sk7O3). diff --git a/site-src/enhancements/overview.md b/site-src/enhancements/overview.md new file mode 100644 index 000000000..ad2355e60 --- /dev/null +++ b/site-src/enhancements/overview.md @@ -0,0 +1,6 @@ +# Inference Gateway Proposal process + +Our current proposal process is intentionally lightweight. If you have a proposal you are interested in sharing, please follow these steps: + +1. Cut an issue or bring a topic to the weekly meeting! +2. Assuming positive signal, or if more context is needed, add a proposal following the style and naming conventions shown here: https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals \ No newline at end of file diff --git a/site-src/gieps/giep-116/index.md deleted file mode 100644 index 4d678da22..000000000 --- a/site-src/gieps/giep-116/index.md +++ /dev/null @@ -1,47 +0,0 @@ -# GIEP-116: GIEP template - -* Issue: [#0](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/116) -* Status: Provisional|Implementable|Experimental|Standard|Deferred|Rejected|Withdrawn|Replaced - -(See status definitions [here](overview.md#status).) - -## TLDR - -(1-2 sentence summary of the proposal) - -## Goals - -(Primary goals of this proposal.) - -## Non-Goals - -(What is out of scope for this proposal.) - -## Introduction - -(Can link to external doc -- but we should bias towards copying -the content into the GEP as online documents are easier to lose --- e.g. owner messes up the permissions, accidental deletion) - -## API - -(... details, can point to PR with changes) - -## Conformance Details - -(This section describes the names to be used for the feature or -features in conformance tests and profiles.
- -These should be `CamelCase` names that specify the feature as -precisely as possible, and are particularly important for -Extended features, since they may be surfaced to users.) - -## Alternatives - -(List other design alternatives and why we did not go in that -direction) - -## References - -(Add any additional document links. Again, we should try to avoid -too much content not in version control to avoid broken links) diff --git a/site-src/gieps/giep-116/metadata.yaml b/site-src/gieps/giep-116/metadata.yaml deleted file mode 100644 index 56d101834..000000000 --- a/site-src/gieps/giep-116/metadata.yaml +++ /dev/null @@ -1,32 +0,0 @@ -apiVersion: internal.gateway.networking.k8s.io/v1alpha1 -kind: GIEPDetails -number: 116 -name: GIEP template -status: Completed -# Any authors who contribute to the GEP in any way should be listed here using -# their Github handle. -authors: - - robscott -relationships: - # obsoletes indicates that a GEP makes the linked GEP obsolete, and completely - # replaces that GEP. The obsoleted GEP MUST have its obsoletedBy field - # set back to this GEP, and MUST be moved to Declined. - obsoletes: {} - obsoletedBy: {} - # extends indicates that a GEP extends the linkned GEP, adding more detail - # or additional implementation. The extended GEP MUST have its extendedBy - # field set back to this GEP. - extends: {} - extendedBy: {} - # seeAlso indicates other GEPs that are relevant in some way without being - # covered by an existing relationship. - seeAlso: {} -# references is a list of hyperlinks to relevant external references. -# It's intended to be used for storing Github discussions, Google docs, etc. -references: {} -# featureNames is a list of the feature names introduced by the GEP, if there -# are any. This will allow us to track which feature was introduced by which GEP. -featureNames: {} -# changelog is a list of hyperlinks to PRs that make changes to the GEP, in -# ascending date order. -changelog: {} diff --git a/site-src/gieps/overview.md b/site-src/gieps/overview.md deleted file mode 100644 index 438a0ffe0..000000000 --- a/site-src/gieps/overview.md +++ /dev/null @@ -1,272 +0,0 @@ -# Gateway Inference Enhancement Proposal (GIEP) - -Gateway Inference Enhancement Proposals (GIEPs) serve a similar purpose to the -[GIEP](https://gateway-api.sigs.k8s.io/GIEPs/overview/) process for the main -Gateway API project: - -1. Ensure that changes to the API follow a known process and discussion in the - OSS community. -1. Make changes and proposals discoverable (current and future). -1. Document design ideas, tradeoffs, decisions that were made for historical - reference. -1. Record the results of larger community discussions. -1. Record changes to the GIEP process itself. - -## Process - -This diagram shows the state diagram of the GIEP process at a high level, but -the details are below. - -
- -```mermaid -flowchart TD - D([Discuss with
the community]) --> C - C([Issue Created]) -------> Memorandum - C([Issue Created]) --> Provisional - Provisional -->|If practical
work needed| Prototyping - Provisional -->|GIEP Doc PR
done| Implementable - Prototyping -->|GIEP Doc PR
done| Implementable - Implementable -->|Gateway API
work completed| Experimental - Experimental -->|Supported in
multiple implementations
+ Conformance tests| Standard - Standard -->|Entire change is GA or implemented| Completed -``` - -
- -## GIEP Definitions - -### GIEP States - -Each GIEP has a state, which tracks where it is in the GIEP process. - -GIEPs can move to some states from any other state: - - * **Declined**: The GIEP has been declined and further work will not occur. - * **Deferred:** We do not currently have bandwidth to handle this GIEP, it may - be revisited in the future. - * **Declined:** This proposal was considered by the community but ultimately - rejected. - * **Withdrawn:** This proposal was considered by the community but ultimately - withdrawn by the author. - -There is a special state to cover Memorandum GIEPs: - - * **Memorandum**: These GIEPs either: - * Document an agreement for further work, creating no spec changes - themselves, or - * Update the GIEP process. - -API GIEPs flow through a number of states, which generally correspond to the -level of stability of the change described in the GIEP: - - * **Provisional:** The goals described by this GIEP have consensus but - implementation details have not been agreed to yet. - * **Prototyping:** An extension of `Provisional` which can be opted in to in - order to indicate to the community that there are some active practical - tests and experiments going on which are intended to be a part of the - development of this GIEP. This may include APIs or code, but that content - _must_ not be distributed with releases. - * **Implementable:** The goals and implementation details described by this - GIEP have consensus but have not been fully implemented yet. - * **Experimental:** This GIEP has been implemented and is part of the - "Experimental" release channel. Breaking changes are still possible, up to - and including complete removal and moving to `Rejected`. - * **Standard:** This GIEP has been implemented and is part of the "Standard" - release channel. It should be quite stable. - * **Completed**: All implementation work on this API GIEP has been completed. - -### Relationships between GIEPs - -GIEPs can have relationships between them. At this time, there are three -possible relationships: - -* **Obsoletes** and its backreference **ObsoletedBy**: when a GIEP is made - obsolete by another GIEP, and has its functionality completely replaced. The - Obsoleted GIEP is moved to the **Declined** state. -* **Extends** and its backreference **ExtendedBy**: when a GIEP has additional - details or implementation added in another GIEP. -* **SeeAlso**: when a GIEP is relevant to another GIEP, but is not affected in - any other defined way. - -Relationships are tracked in the YAML metadata files accompanying each GIEP. - -### GIEP metadata file - -Each GIEP has a YAML file containing metadata alongside it, please keep it up to -date as changes to the GIEP occur. - -In particular, note the `authors`, and `changelog` fields, please keep those up -to date. - -## Process - -### 1. Discuss with the community - -Before creating a GIEP, share your high level idea with the community. There are -several places this may be done: - -- A [new GitHub - Discussion](https://github.com/kubernetes-sigs/gateway-api/discussions/new) -- On our [Slack Channel](https://kubernetes.slack.com/archives/CR0H13KGA) -- On one of our [community - meetings](https://gateway-api.sigs.k8s.io/contributing/?h=meetings#meetings) - -Please default to GitHub discussions: they work a lot like GitHub issues which -makes them easy to search. - -### 2. 
Create an Issue -[Create a GIEP -issue](https://github.com/kubernetes-sigs/gateway-api/issues/new?assignees=&labels=kind%2Ffeature&template=enhancement.md) -in the repo describing your change. At this point, you should copy the outcome -of any other conversations or documents into this document. - -### 3. Agree on the Goals -Although it can be tempting to start writing out all the details of your -proposal, it's important to first ensure we all agree on the goals. - -For API GIEPs, the first version of your GIEP should aim for a "Provisional" -status and leave out any implementation details, focusing primarily on "Goals" -and "Non-Goals". - -For Memorandum GIEPs, the first version of your GIEP will be the only one, as -Memorandums have only a single stage - `Accepted`. - -### 3. Document Implementation Details -Now that everyone agrees on the goals, it is time to start writing out your -proposed implementation details. These implementation details should be very -thorough, including the proposed API spec, and covering any relevant edge cases. -Note that it may be helpful to use a shared doc for part of this phase to enable -faster iteration on potential designs. - -It is likely that throughout this process, you will discuss a variety of -alternatives. Be sure to document all of these in the GIEP, and why we decided -against them. At this stage, the GIEP should be targeting the "Implementable" -stage. - -### 4. Implement the GIEP as "Experimental" - -With the GIEP marked as "Implementable", it is time to actually make those -proposed changes in our API. In some cases, these changes will be documentation -only, but in most cases, some API changes will also be required. It is important -that every new feature of the API is marked as "Experimental" when it is -introduced. Within the API, we use `` tags to denote -experimental fields. Within Golang packages (conformance tests, CLIs, e.t.c.) we -use the `experimental` Golang build tag to denote experimental functionality. - -Some other requirements must be met before marking a GIEP `Experimental`: - -- the graduation criteria to reach `Standard` MUST be filled out -- a proposed probationary period (see next section) must be included in the GIEP - and approved by maintainers. - -Before changes are released they MUST be documented. GIEPs that have not been -both implemented and documented before a release cut off will be excluded from -the release. - -#### Probationary Period - -Any GIEP in the `Experimental` phase is automatically under a "probationary -period" where it will come up for re-assessment if its graduation criteria are -not met within a given time period. GIEPs that wish to move into `Experimental` -status MUST document a proposed period (6 months is the suggested default) that -MUST be approved by maintainers. Maintainers MAY select an alternative time -duration for a probationary period if deemed appropriate, and will document -their reasoning. - -> **Rationale**: This probationary period exists to avoid GIEPs getting "stale" -> and to provide guidance to implementations about how relevant features should -> be used, given that they are not guaranteed to become supported. - -At the end of a probationary period if the GIEP has not been able to resolve its -graduation criteria it will move to "Rejected" status. In extenuating -circumstances an extension of that period may be accepted by approval from -maintainers. GIEPs which are `Rejected` in this way are removed from the -experimental CRDs and more or less put on hold. 
GIEPs may be allowed to move -back into `Experimental` status from `Rejected` for another probationary period -if a new strategy for achieving their graduation criteria can be established. -Any such plan to take a GIEP "off the shelf" must be reviewed and accepted by -the maintainers. - -> **Warning**: It is extremely important** that projects which implement -> `Experimental` features clearly document that these features may be removed in -> future releases. - -### 5. Graduate the GIEP to "Standard" - -Once this feature has met the [graduation -criteria](/concepts/versioning/#graduation-criteria), it is time to graduate it -to the "Standard" channel of the API. Depending on the feature, this may include -any of the following: - -1. Graduating the resource to beta -2. Graduating fields to "standard" by removing `` tags -3. Graduating a concept to "standard" by updating documentation - -### 6. Close out the GIEP issue - -The GIEP issue should only be closed once the feature has: -- Moved to the standard channel for distribution (if necessary) -- Moved to a "v1" `apiVersion` for CRDs -- been completely implemented and has wide acceptance (for process changes). - -In short, the GIEP issue should only be closed when the work is "done" (whatever -that means for that GIEP). - -## Format - -GIEPs should match the format of the template found in -[GIEP-696](/GIEPs/GIEP-696). - -## Out of scope - -What is out of scope: see [text from KEP][kep-when-to-use]. Examples: - -* Bug fixes -* Small changes (API validation, documentation, fixups). It is always possible - that the reviewers will determine a "small" change ends up requiring a GIEP. - -## FAQ - -#### Why is it named GIEP? -To avoid potential confusion if people start following the cross references to -the full GEP or KEP process. - -#### Why have a different process than mainline? -Gateway API has some differences with most upstream KEPs. Notably Gateway API -intentionally avoids including any implementation with the project, so this -process is focused entirely on the substance of the API. As this project is -based on CRDs it also has an entirely separately release process, and has -developed concepts like "release channels" that do not exist in upstream. - -#### Is it ok to discuss using shared docs, scratch docs etc? -Yes, this can be a helpful intermediate step when iterating on design details. -It is important that all major feedback, discussions, and alternatives -considered in that step are represented in the GIEP though. A key goal of GIEPs -is to show why we made a decision and which alternatives were considered. If -separate docs are used, it's important that we can still see all relevant -context and decisions in the final GIEP. - -#### When should I mark a GIEP as `Prototyping` as opposed to `Provisional`? -The `Prototyping` status carries the same base meaning as `Provisional` in that -consensus is not complete between stakeholders and we're not ready to move -toward releasing content yet. You should use `Prototyping` to indicate to your -fellow community members that we're in a state of active practical tests and -experiments which are intended to help us learn and iterate on the GIEP. These -can include distributing content, but not under any release channel. - -#### Should I implement support for `Experimental` channel features? 
-Ultimately one of the main ways to get something into `Standard` is for it to -mature through the `Experimental` phase, so we really _need_ people to implement -these features and provide feedback in order to have progress. That said, the -graduation of a feature past `Experimental` is not a forgone conclusion. Before -implementing an experimental feature, you should: - -* Clearly document that support for the feature is experimental and may - disappear in the future. -* Have a plan in place for how you would handle the removal of this feature from - the API. - -[kep]: https://github.com/kubernetes/enhancements -[kep-when-to-use]: - https://github.com/kubernetes/enhancements/tree/master/keps#do-i-have-to-use-the-kep-process diff --git a/site-src/guides/adapter-rollout.md b/site-src/guides/adapter-rollout.md index 0936d2913..7d6611c92 100644 --- a/site-src/guides/adapter-rollout.md +++ b/site-src/guides/adapter-rollout.md @@ -3,7 +3,6 @@ The goal of this guide is to show you how to perform incremental roll out operations, which gradually deploy new versions of your inference infrastructure. You can update LoRA adapters and Inference Pool with minimal service disruption. -This page also provides guidance on traffic splitting and rollbacks to help ensure reliable deployments for LoRA adapters rollout. LoRA adapter rollouts let you deploy new versions of LoRA adapters in phases, without altering the underlying base model or infrastructure. @@ -49,36 +48,7 @@ data: The new adapter version is applied to the model servers live, without requiring a restart. - -### Direct traffic to the new adapter version - -Modify the InferenceModel to configure a canary rollout with traffic splitting. In this example, 10% of traffic for food-review model will be sent to the new ***food-review-2*** adapter. - - -```bash -kubectl edit inferencemodel food-review -``` - -Change the targetModels list in InferenceModel to match the following: - - -```yaml -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferenceModel -metadata: - name: food-review -spec: - criticality: 1 - poolRef: - name: vllm-llama3-8b-instruct - targetModels: - - name: food-review-1 - weight: 90 - - name: food-review-2 - weight: 10 -``` - -The above configuration means one in every ten requests should be sent to the new version. Try it out: +Try it out: 1. Get the gateway IP: ```bash @@ -88,7 +58,7 @@ IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].va 2. Send a few requests as follows: ```bash curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ -"model": "food-review", +"model": "food-review-2", "prompt": "Write as if you were a critic: San Francisco", "max_tokens": 100, "temperature": 0 @@ -97,23 +67,6 @@ curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ ### Finish the rollout - -Modify the InferenceModel to direct 100% of the traffic to the latest version of the adapter. - -```yaml -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferenceModel -metadata: - name: food-review -spec: - criticality: 1 - poolRef: - name: vllm-llama3-8b-instruct - targetModels: - - name: food-review-2 - weight: 100 -``` - Unload the older versions from the servers by updating the LoRA syncer ConfigMap to list the older version under the `ensureNotExist` list: ```yaml @@ -137,5 +90,5 @@ data: source: Kawon/llama3.1-food-finetune_v14_r8 ``` -With this, all requests should be served by the new adapter version. 
+With this, the new adapter version should be available for all incoming requests. diff --git a/site-src/guides/epp-configuration/config-text.md b/site-src/guides/epp-configuration/config-text.md index 6df19db80..43a0e6cf7 100644 --- a/site-src/guides/epp-configuration/config-text.md +++ b/site-src/guides/epp-configuration/config-text.md @@ -1,17 +1,14 @@ -# Configuring Plugins via text +# Configuring Plugins via YAML The set of lifecycle hooks (plugins) that are used by the Inference Gateway (IGW) is determined by how -it is configured. The IGW can be configured in several ways, either by code or via text. +it is configured. The IGW is primarily configured via a configuration file. -If configured by code either a set of predetermined environment variables must be used or one must -fork the IGW and change code. - -A simpler way to congigure the IGW is to use a text based configuration. This text is in YAML format -and can either be in a file or specified in-line as a parameter. The configuration defines the set of +The YAML configuration can be specified either as a path to a file or in-line as a parameter. The configuration defines the set of plugins to be instantiated along with their parameters. Each plugin can also be given a name, enabling -the same plugin type to be instantiated multiple times, if needed. +the same plugin type to be instantiated multiple times, if needed (such as when configuring multiple scheduling profiles). -Also defined is a set of SchedulingProfiles, which determine the set of plugins to be used when scheduling a request. If one is not defailed, a default one names `default` will be added and will reference all of the +Also defined is a set of SchedulingProfiles, which determine the set of plugins to be used when scheduling a request. +If no scheduling profile is specified, a default profile, named `default`, will be added and will reference all of the instantiated plugins. The set of plugins instantiated can include a Profile Handler, which determines which SchedulingProfiles @@ -22,12 +19,9 @@ In addition, the set of instantiated plugins can also include a picker, which ch the request is scheduled after filtering and scoring. If one is not referenced in a SchedulingProfile, an instance of `MaxScorePicker` will be added to the SchedulingProfile in question. -It should be noted that while the configuration text looks like a Kubernetes Custom Resource, it is -**NOT** a Kubernetes Custom Resource. Kubernetes infrastructure is used to load the configuration -text and in the future will also help in versioning the text. - -It should also be noted that even when the configuration text is loaded from a file, it is loaded at -the Endpoint-Picker's (EPP) startup and changes to the file at runtime are ignored. +***NOTE***: While the configuration text looks like a Kubernetes CRD, it is +**NOT** a Kubernetes CRD. Specifically, the config is not reconciled upon, and is only read on startup. +This behavior is intentional, as augmenting the scheduling config without redeploying the EPP is not supported.
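+Because the configuration is only read at startup, applying a change means restarting the EPP. A minimal sketch, assuming a hypothetical EPP Deployment name (substitute your own):
+
+```bash
+# After updating the config file or the ConfigMap it is mounted from,
+# restart the EPP so the new configuration is read.
+kubectl rollout restart deployment/vllm-llama3-8b-instruct-epp
+kubectl rollout status deployment/vllm-llama3-8b-instruct-epp
+```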
The configuration text has the following form: ```yaml @@ -91,7 +85,7 @@ kind: EndpointPickerConfig plugins: - type: prefix-cache-scorer parameters: - hashBlockSize: 5 + blockSize: 5 maxPrefixBlocksToMatch: 256 lruCapacityPerServer: 31250 schedulingProfiles: @@ -158,7 +152,7 @@ spec: plugins: - type: prefix-cache-scorer parameters: - hashBlockSize: 5 + blockSize: 5 maxPrefixBlocksToMatch: 256 lruCapacityPerServer: 31250 schedulingProfiles: @@ -177,7 +171,7 @@ kind: EndpointPickerConfig plugins: - type: prefix-cache-scorer parameters: - hashBlockSize: 5 + blockSize: 5 maxPrefixBlocksToMatch: 256 lruCapacityPerServer: 31250 - type: single-profile-handler @@ -207,7 +201,7 @@ Scores pods based on the amount of the prompt is believed to be in the pod's KvC - *Type*: prefix-cache-scorer - *Parameters*: - - `hashBlockSize` specified the size of the blocks to break up the input prompt when + - `blockSize` specifies the size of the blocks to break up the input prompt when calculating the block hashes. If not specified defaults to `64` - `maxPrefixBlocksToMatch` specifies the maximum number of prefix blocks to match. If not specified defaults to `256` @@ -241,6 +235,15 @@ Picks a random pod from the list of candidates. - `maxNumOfEndpoints`: Maximum number of endpoints to pick from the list of candidates. If not specified defaults to `1`. +#### **WeightedRandomPicker** + +Picks pod(s) from the list of candidates based on weighted random sampling using the A-Res algorithm. + +- *Type*: weighted-random-picker +- *Parameters*: + - `maxNumOfEndpoints`: Maximum number of endpoints to pick from the list of candidates. If not + specified defaults to `1`. + #### **KvCacheScorer** Scores the candidate pods based on their KV cache utilization. diff --git a/site-src/guides/epp-configuration/flags.md b/site-src/guides/epp-configuration/flags.md new file mode 100644 index 000000000..715466822 --- /dev/null +++ b/site-src/guides/epp-configuration/flags.md @@ -0,0 +1,33 @@ +# EPP Configuration Flags + +This page documents selected configuration flags for the Endpoint Picker (EPP) binary. Most flags are self-explanatory via their `--help` descriptions; only flags with nuanced or non-obvious behavior are detailed here. + +## --pool-namespace + +**Description:** +Specifies the namespace of the InferencePool this Endpoint Picker is associated with. + +**Resolution order:** +1. If `--pool-namespace` is set to a non-empty value, its value is used. +2. If the flag is not set (i.e., left empty), the `NAMESPACE` environment variable is checked. If set, its value is used. +3. If neither is set, the namespace defaults to `default`. + +This allows the EPP to automatically use the namespace it is running in (when the `NAMESPACE` env var is set via the Kubernetes Downward API), without requiring explicit configuration. If you want to force the use of the default namespace, explicitly set `--pool-namespace=default`. If you want to use the environment variable or fallback, leave the flag unset or set it to an empty string.
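+For example, here is a minimal sketch of the three resolution cases, assuming a hypothetical `epp` binary name (substitute however your deployment invokes the Endpoint Picker):
+
+```bash
+# 1. An explicit flag wins: the EPP watches the InferencePool in "team-a".
+epp --pool-namespace=team-a
+
+# 2. Flag unset or empty, NAMESPACE env var set (e.g., via the Downward API): "team-b" is used.
+NAMESPACE=team-b epp --pool-namespace=""
+
+# 3. Neither set: the EPP falls back to the "default" namespace.
+epp
+```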
+ +**Example manifest snippet to set the env var from pod metadata:** + +```yaml +env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace +``` + +--- + +For a full list of flags, run: + +``` +EPP_BINARY --help +``` diff --git a/site-src/guides/epp-configuration/prefix-aware.md b/site-src/guides/epp-configuration/prefix-aware.md index 9c1074be9..88573c466 100644 --- a/site-src/guides/epp-configuration/prefix-aware.md +++ b/site-src/guides/epp-configuration/prefix-aware.md @@ -14,7 +14,7 @@ Like any other plugins, the prefix cache aware plugin can be enabled/disabled vi The prefix cache plugin exposes the following advanced configuration parameters: -* `hashBlockSize`: The plugin matches prefixes in the unit of blocks. This is the size +* `blockSize`: The plugin matches prefixes in the unit of blocks. This is the size of each block in number of bytes. vLLM default block size is 16 tokens. Assume 4 characters per token, the default is set to 64 in EPP. The default is recommended unless performance is critical for use cases with extremely long inputs. diff --git a/site-src/guides/ga-migration.md b/site-src/guides/ga-migration.md new file mode 100644 index 000000000..c56dea23d --- /dev/null +++ b/site-src/guides/ga-migration.md @@ -0,0 +1,274 @@ +# Inference Gateway: Migrating from v1alpha2 to v1 API + +## Introduction + +This guide provides a comprehensive walkthrough for migrating your Inference Gateway setup from the alpha `v1alpha2` API to the generally available `v1` API. +This document is intended for platform administrators and networking specialists +who are currently using the `v1alpha2` version of the Inference Gateway and +want to upgrade to the `v1` version to leverage the latest features and improvements. + +Before you start the migration, ensure you are familiar with the concepts and deployment of the Inference Gateway. + +*** + +## Before you begin + +Before starting the migration, it's important to determine if this guide is necessary for your setup. + +### Checking for Existing v1alpha2 APIs + +To check if you are actively using the `v1alpha2` Inference Gateway APIs, run the following command: + +```bash +kubectl get inferencepools.inference.networking.x-k8s.io --all-namespaces +``` + +* If this command returns one or more `InferencePool` resources, you are using the `v1alpha2` API and should proceed with this migration guide. +* If the command returns `No resources found`, you are not using the `v1alpha2` `InferencePool` and do not need to follow this migration guide. You can proceed with a fresh installation of the `v1` Inference Gateway. + +*** + +## Migration Paths + +There are two paths for migrating from `v1alpha2` to `v1`: + +1. **Simple Migration (with downtime):** This path is for users who can afford a short period of downtime. It involves deleting the old `v1alpha2` resources and CRDs before installing the new `v1` versions. +2. **Zero-Downtime Migration:** This path is for users who need to migrate without any service interruption. It involves running both `v1alpha2` and `v1` stacks side-by-side and gradually shifting traffic. + +*** + +## Simple Migration (with downtime) + +This approach is faster and simpler but will result in a brief period of downtime while the resources are being updated. It is the recommended path if you do not require a zero-downtime migration. + +### 1. 
Delete Existing v1alpha2 Resources + +**Option a: Uninstall using Helm.** + +```bash +helm uninstall +``` + +**Option b: Manually delete alpha `InferencePool` resources.** + +If you are not using Helm, you will need to manually delete all resources associated with your `v1alpha2` deployment. The key is to remove the `HTTPRoute`'s reference to the old `InferencePool` and then delete the `v1alpha2` resources themselves. + +1. **Update or Delete the `HTTPRoute`**: Modify the `HTTPRoute` to remove the `backendRef` that points to the `v1alpha2` `InferencePool`. +2. **Delete the `InferencePool` and associated resources**: You must delete the `v1alpha2` `InferencePool`, any `InferenceModel` (or `InferenceObjective`) resources that point to it, and the corresponding Endpoint Picker (EPP) Deployment and Service. +3. **Delete the `v1alpha2` CRDs**: Once all `v1alpha2` custom resources are deleted, you can remove the CRD definitions from your cluster. + ```bash + # Change the version to the one with which you installed the `v1alpha2` CRDs + export VERSION="v0.3.0" + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${VERSION}/manifests.yaml + ``` + +### 2. Install v1 Resources + +After cleaning up the old resources, you can proceed with a fresh installation of the `v1` Inference Gateway. +This involves deploying a new EPP image compatible with the `v1` API and installing the new `v1` CRDs. +You can then create a new `v1` `InferencePool` with its corresponding `InferenceObjective` resources, and a new HTTPRoute that directs traffic to your new `v1` `InferencePool`. + + +### 3. Verify the Deployment + +After a few minutes, verify that your new `v1` stack is correctly serving traffic. You should have a **`PROGRAMMED`** gateway. + +```bash +❯ kubectl get gateway -o wide +NAME CLASS ADDRESS PROGRAMMED AGE + inference-gateway True 10m +``` + +Curl the endpoint to make sure you are getting a successful response with a **200** response code. + +```bash +IP=$(kubectl get gateway/ -o jsonpath='{.status.addresses[0].value}') +PORT=80 + +curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ +"model": "", +"prompt": "", +"max_tokens": 100, +"temperature": 0 +}' +``` + +*** + +## Zero-Downtime Migration + +This migration path is designed for users who cannot afford any service interruption. It assumes you already have the stack shown in the following diagram. + +Inference Gateway Alpha Stage + +### A Note on Interacting with Multiple API Versions + +During the zero-downtime migration, both `v1alpha2` and `v1` CRDs will be installed on your cluster. This can create ambiguity when using `kubectl` to query for `InferencePool` resources. To ensure you are interacting with the correct version, you **must** use the full resource name: + +* **For v1alpha2**: `kubectl get inferencepools.inference.networking.x-k8s.io` +* **For v1**: `kubectl get inferencepools.inference.networking.k8s.io` + +The `v1` API also provides a convenient short name, `infpool`, which can be used to query `v1` resources specifically: + +```bash +kubectl get infpool +``` + +This guide will use these full names or the short name for `v1` to avoid ambiguity. + +*** + +### Stage 1: Side-by-side v1 Deployment + +In this stage, you will deploy the new `v1` `InferencePool` stack alongside the existing `v1alpha2` stack. This allows for a safe, gradual migration.
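+Before creating anything new, you can sanity-check which InferencePool CRDs are currently installed; a quick check using the full resource names from the note above:
+
+```bash
+# Before Stage 1 only the v1alpha2 CRD should be present; after step 1 below, both will be.
+kubectl get crd | grep inferencepools
+```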
+ +After finishing all the steps in this stage, you’ll have the infrastructure shown in the following diagram: + +Inference Gateway Migration Stage + +**1. Install v1 CRDs** + +```bash +RELEASE=v1.0.0 +kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/$RELEASE/config/crd/bases/inference.networking.x-k8s.io_inferenceobjectives.yaml +``` + +**2. Install the v1 `InferencePool`** + +Use Helm to install a new `v1` `InferencePool` with a distinct release name (e.g., `vllm-llama3-8b-instruct-ga`). + +```bash +helm install vllm-llama3-8b-instruct-ga \ + --set inferencePool.modelServers.matchLabels.app= \ + --set provider.name= \ + --version $RELEASE \ + oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool +``` + +**3. Create the v1 `InferenceObjective`** + +The `v1` API replaces `InferenceModel` with `InferenceObjective`. Create the new resources, referencing the new `v1` `InferencePool`. + +```yaml +kubectl apply -f - < + +You should have a **`PROGRAMMED`** gateway: +```bash +❯ kubectl get gateway -o wide +NAME CLASS ADDRESS PROGRAMMED AGE +inference-gateway inference-gateway True 10m +``` + +Curl the endpoint and verify a **200** response code: +```bash +IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}') +PORT=80 + +curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ +"model": "", +"prompt": "", +"max_tokens": 100, +"temperature": 0 +}' +``` + +**3. Clean Up v1alpha2 Resources** + +After confirming the `v1` stack is fully operational, safely remove the old `v1alpha2` resources. diff --git a/site-src/guides/getting-started-latest.md b/site-src/guides/getting-started-latest.md new file mode 100644 index 000000000..e436a5cfe --- /dev/null +++ b/site-src/guides/getting-started-latest.md @@ -0,0 +1,364 @@ +# Getting started with an Inference Gateway + +!!! warning "Unreleased/main branch" + This guide tracks **main**. It is intended for users who want the very latest features and fixes and are comfortable with potential breakage. + For the stable, tagged experience, see **Getting started (Released)**.
+ +--8<-- "site-src/_includes/intro.md" + +## **Prerequisites** + +--8<-- "site-src/_includes/prereqs.md" + +## **Steps** + +### Deploy Sample Model Server + +--8<-- "site-src/_includes/model-server-intro.md" + +--8<-- "site-src/_includes/model-server-gpu.md" + + ```bash + kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to the set of Llama models + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml + ``` + +--8<-- "site-src/_includes/model-server-cpu.md" + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml + ``` + +--8<-- "site-src/_includes/model-server-sim.md" + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/sim-deployment.yaml + ``` + +### Install the Inference Extension CRDs + +```bash +kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd +``` + +### Deploy the InferencePool and Endpoint Picker Extension + + Install an InferencePool named `vllm-llama3-8b-instruct` that selects endpoints with the label `app: vllm-llama3-8b-instruct` listening on port 8000. The Helm install command automatically installs the endpoint picker and InferencePool, along with provider-specific resources. + + Set the chart version and then select a tab to follow the provider-specific instructions. + + ```bash + export IGW_CHART_VERSION=v0 + ``` + +--8<-- "site-src/_includes/epp-latest.md" + +### Deploy an Inference Gateway + + Choose one of the following options to deploy an Inference Gateway. + +=== "GKE" + + 1. Enable the Google Kubernetes Engine API, the Compute Engine API, and the Network Services API, and configure proxy-only subnets when necessary. + See [Deploy Inference Gateways](https://cloud.google.com/kubernetes-engine/docs/how-to/deploy-gke-inference-gateway) + for detailed instructions. + + 2. Deploy the Inference Gateway: + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gateway.yaml + ``` + + Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: + + ```bash + $ kubectl get gateway inference-gateway + NAME CLASS ADDRESS PROGRAMMED AGE + inference-gateway inference-gateway True 22s + ``` + 3. Deploy the HTTPRoute + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/httproute.yaml + ``` + + 4. Confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True`: + + ```bash + kubectl get httproute llm-route -o yaml + ``` + +=== "Istio" + + Please note that this feature is currently in an experimental phase and is not intended for production use. + The implementation and user experience are subject to changes as we continue to iterate on this project. + + 1. Requirements + + - Gateway API [CRDs](https://gateway-api.sigs.k8s.io/guides/#installing-gateway-api) installed. + + 2.
Install Istio + + ``` + TAG=$(curl https://storage.googleapis.com/istio-build/dev/1.28-dev) + # on Linux + wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-linux-amd64.tar.gz + tar -xvf istioctl-$TAG-linux-amd64.tar.gz + # on macOS + wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-osx.tar.gz + tar -xvf istioctl-$TAG-osx.tar.gz + # on Windows + wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-win.zip + unzip istioctl-$TAG-win.zip + + ./istioctl install --set tag=$TAG --set hub=gcr.io/istio-testing --set values.pilot.env.ENABLE_GATEWAY_API_INFERENCE_EXTENSION=true + ``` + + 3. If your EPP uses secure serving with self-signed certs (default), temporarily bypass TLS verification: + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml + ``` + + 4. Deploy Gateway + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/gateway.yaml + ``` + + Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: + ```bash + $ kubectl get gateway inference-gateway + NAME CLASS ADDRESS PROGRAMMED AGE + inference-gateway inference-gateway True 22s + ``` + + 5. Deploy the HTTPRoute + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/httproute.yaml + ``` + + 6. Confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True`: + + ```bash + kubectl get httproute llm-route -o yaml + ``` + +=== "Kgateway" + + [Kgateway](https://kgateway.dev/) is a Gateway API and Inference Gateway + [conformant](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/conformance/reports/v1.0.0/gateway/kgateway) + gateway. Follow these steps to run Kgateway: + + 1. Requirements + + - [Helm](https://helm.sh/docs/intro/install/) installed. + - Gateway API [CRDs](https://gateway-api.sigs.k8s.io/guides/#installing-gateway-api) installed. + + 2. Set the Kgateway version and install the Kgateway CRDs. + + ```bash + KGTW_VERSION=v2.2.0-main + helm upgrade -i --create-namespace --namespace kgateway-system --version $KGTW_VERSION kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds + ``` + + 3. Install Kgateway + + ```bash + helm upgrade -i --namespace kgateway-system --version $KGTW_VERSION kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway --set inferenceExtension.enabled=true + ``` + + 4. Deploy the Gateway + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml + ``` + + Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: + ```bash + $ kubectl get gateway inference-gateway + NAME CLASS ADDRESS PROGRAMMED AGE + inference-gateway kgateway True 22s + ``` + + 5. Deploy the HTTPRoute + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/httproute.yaml + ``` + + 6. Confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True`: + + ```bash + kubectl get httproute llm-route -o yaml + ``` + +=== "Agentgateway" + + [Agentgateway](https://agentgateway.dev/) is a purpose-built proxy designed for AI workloads, and comes with native support for Inference Gateway. 
Agentgateway integrates with [Kgateway](https://kgateway.dev/) as its control plane. Follow these steps to run Kgateway with the agentgateway data plane: + + 1. Requirements + + - [Helm](https://helm.sh/docs/intro/install/) installed. + - Gateway API [CRDs](https://gateway-api.sigs.k8s.io/guides/#installing-gateway-api) installed. + + 2. Set the Kgateway version and install the Kgateway CRDs. + + ```bash + KGTW_VERSION=v2.2.0-main + helm upgrade -i --create-namespace --namespace kgateway-system --version $KGTW_VERSION kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds + ``` + + 3. Install Kgateway + + ```bash + helm upgrade -i --namespace kgateway-system --version $KGTW_VERSION kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway --set inferenceExtension.enabled=true --set agentgateway.enabled=true + ``` + + 4. Deploy the Gateway + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/agentgateway/gateway.yaml + ``` + + Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: + ```bash + $ kubectl get gateway inference-gateway + NAME CLASS ADDRESS PROGRAMMED AGE + inference-gateway agentgateway True 22s + ``` + + 5. Deploy the HTTPRoute + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/agentgateway/httproute.yaml + ``` + + 6. Confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True`: + + ```bash + kubectl get httproute llm-route -o yaml + ``` + +### Deploy InferenceObjective (Optional) + +Deploy the sample InferenceObjective which allows you to specify the priority of requests. + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferenceobjective.yaml + ``` + +--8<-- "site-src/_includes/test.md" + +--8<-- "site-src/_includes/bbr.md" + +### Cleanup + + The following instructions assume you would like to cleanup ALL resources that were created in this quickstart guide. + Please be careful not to delete resources you'd like to keep. + + 1. Uninstall the InferencePool, InferenceObjective and model server resources + + ```bash + helm uninstall vllm-llama3-8b-instruct + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferenceobjective.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/sim-deployment.yaml --ignore-not-found + kubectl delete secret hf-token --ignore-not-found + ``` + + 1. Uninstall the Gateway API Inference Extension CRDs + + ```bash + kubectl delete -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd --ignore-not-found + ``` + + 1. Choose one of the following options to cleanup the Inference Gateway.
+ +=== "GKE" + + ```bash + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/httproute.yaml --ignore-not-found + ``` + +=== "Istio" + + ```bash + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/httproute.yaml --ignore-not-found + + ``` + + The following steps assume you would like to clean up ALL Istio resources that were created in this quickstart guide. + + 1. Uninstall All Istio resources + + ```bash + istioctl uninstall -y --purge + ``` + + 2. Remove the Istio namespace + + ```bash + kubectl delete ns istio-system + ``` + +=== "Kgateway" + + ```bash + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/httproute.yaml --ignore-not-found + ``` + + The following steps assume you would like to cleanup ALL Kgateway resources that were created in this quickstart guide. + + 1. Uninstall Kgateway + + ```bash + helm uninstall kgateway -n kgateway-system + ``` + + 2. Uninstall the Kgateway CRDs. + + ```bash + helm uninstall kgateway-crds -n kgateway-system + ``` + + 3. Remove the Kgateway namespace. + + ```bash + kubectl delete ns kgateway-system + ``` + +=== "Agentgateway" + + ```bash + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/agentgateway/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/agentgateway/httproute.yaml --ignore-not-found + ``` + + The following steps assume you would like to cleanup ALL Kgateway resources that were created in this quickstart guide. + + 1. Uninstall Kgateway + + ```bash + helm uninstall kgateway -n kgateway-system + ``` + + 2. Uninstall the Kgateway CRDs. + + ```bash + helm uninstall kgateway-crds -n kgateway-system + ``` + + 3. Remove the Kgateway namespace. + + ```bash + kubectl delete ns kgateway-system + ``` diff --git a/site-src/guides/implementers.md b/site-src/guides/implementers.md index 6fce01657..d42fa8d91 100644 --- a/site-src/guides/implementers.md +++ b/site-src/guides/implementers.md @@ -157,8 +157,8 @@ An example of a similar approach is Kuadrant’s [WASM Shim](https://github.com/ Here are some tips for testing your controller end-to-end: - **Focus on Key Scenarios**: Add common scenarios like creating, updating, and deleting InferencePool resources, as well as different routing rules that target InferencePool backends. -- **Verify Routing Behaviors**: Design more complex routing scenarios and verify that requests are correctly routed to the appropriate model server pods within the InferencePool based on the InferenceModel configuration. 
-- **Test Error Handling**: Verify that the controller correctly handles scenarios like unsupported model names or resource constraints (if criticality-based shedding is implemented). Test with state transitions (such as constant requests while Pods behind EPP are being replaced and Pods behind InferencePool are being replaced) to ensure that the system is resilient to failures and can automatically recover by redirecting traffic to healthy Pods. +- **Verify Routing Behaviors**: Design more complex routing scenarios and verify that requests are correctly routed to the appropriate model server pods within the InferencePool. +- **Test Error Handling**: Verify that the controller correctly handles scenarios like unsupported model names or resource constraints (if priority-based shedding is implemented). Test with state transitions (such as constant requests while Pods behind EPP are being replaced and Pods behind InferencePool are being replaced) to ensure that the system is resilient to failures and can automatically recover by redirecting traffic to healthy Pods. - **Using Reference EPP Implementation + Echoserver**: You can use the [reference EPP implementation](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp) for testing your controller end-to-end. Instead of a full-fledged model server, a simple mock server (like the [echoserver](https://github.com/kubernetes-sigs/ingress-controller-conformance/tree/master/images/echoserver)) can be very useful for verifying routing to ensure the correct pod received the request. - **Performance Test**: Run end-to-end [benchmarks](https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark/) to make sure that your inference gateway can achieve the latency target that is desired. diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 4e40ebaf5..21b23849e 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -1,85 +1,124 @@ # Getting started with an Inference Gateway -??? example "Experimental" +--8<-- "site-src/_includes/intro.md" - This project is still in an alpha state and breaking changes may occur in the future. +## **Prerequisites** -???+ warning +--8<-- "site-src/_includes/prereqs.md" +## **Steps** - This page is out of date with the v1.0.0 release candidate. Updates under active development +### Deploy Sample Model Server -This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get an Inference Gateway up and running! +--8<-- "site-src/_includes/model-server-intro.md" -## **Prerequisites** +--8<-- "site-src/_includes/model-server-gpu.md" -A cluster with: - - Support for services of type `LoadBalancer`. For kind clusters, follow [this guide](https://kind.sigs.k8s.io/docs/user/loadbalancer) - to get services of type LoadBalancer working. - - Support for [sidecar containers](https://kubernetes.io/docs/concepts/workloads/pods/sidecar-containers/) (enabled by default since Kubernetes v1.29) - to run the model server deployment. 
+ ```bash + kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to the set of Llama models + kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/vllm/gpu-deployment.yaml + ``` -## **Steps** +--8<-- "site-src/_includes/model-server-cpu.md" -### Deploy Sample Model Server + ```bash + kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/vllm/cpu-deployment.yaml + ``` - Three options are supported for running the model server: +--8<-- "site-src/_includes/model-server-sim.md" - 1. GPU-based model server. - Requirements: a Hugging Face access token that grants access to the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). + ```bash + kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/vllm/sim-deployment.yaml + ``` - 1. CPU-based model server (not using GPUs). - The sample uses the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct). +### Install the Inference Extension CRDs - 1. [vLLM Simulator](https://github.com/llm-d/llm-d-inference-sim/tree/main) model server (not using GPUs). - The sample is configured to simulate the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model. +```bash +kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.0.2/manifests.yaml +``` - Choose one of these options and follow the steps below. Please do not deploy more than one, as the deployments have the same name and will override each other. +### Install the Gateway -=== "GPU-Based Model Server" + Choose one of the following options to install Gateway. - For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/gpu-deployment.yaml` as needed. - Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). Ensure that the token grants access to this model. +=== "GKE" - Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. + Nothing to install here, you can move to the next [section](#deploy-the-inferencepool-and-endpoint-picker-extension) - ```bash - kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to the set of Llama models - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml - ``` +=== "Istio" -=== "CPU-Based Model Server" + 1. Requirements + - Gateway API [CRDs](https://gateway-api.sigs.k8s.io/guides/#installing-gateway-api) installed. - This setup is using the formal `vllm-cpu` image, which according to the documentation can run vLLM on x86 CPU platform. - For this setup, we use approximately 9.5GB of memory and 12 CPUs for each replica. + 2. Install Istio - While it is possible to deploy the model server with less resources, this is not recommended. For example, in our tests, loading the model using 8GB of memory and 1 CPU was possible but took almost 3.5 minutes and inference requests took unreasonable time. In general, there is a tradeoff between the memory and CPU we allocate to our pods and the performance. 
The more memory and CPU we allocate the better performance we can get. + ``` + TAG=$(curl https://storage.googleapis.com/istio-build/dev/1.28-dev) + # on Linux + wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-linux-amd64.tar.gz + tar -xvf istioctl-$TAG-linux-amd64.tar.gz + # on macOS + wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-osx.tar.gz + tar -xvf istioctl-$TAG-osx.tar.gz + # on Windows + wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-win.zip + unzip istioctl-$TAG-win.zip - After running multiple configurations of these values we decided in this sample to use 9.5GB of memory and 12 CPUs for each replica, which gives reasonable response times. You can increase those numbers and potentially may even get better response times. For modifying the allocated resources, adjust the numbers in [cpu-deployment.yaml](https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml) as needed. + ./istioctl install --set tag=$TAG --set hub=gcr.io/istio-testing --set values.pilot.env.ENABLE_GATEWAY_API_INFERENCE_EXTENSION=true + ``` - Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. +=== "Kgateway" - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml - ``` + 1. Requirements -=== "vLLM Simulator Model Server" + - Gateway API [CRDs](https://gateway-api.sigs.k8s.io/guides/#installing-gateway-api) installed. + - [Helm](https://helm.sh/docs/intro/install/) installed. - This option uses the [vLLM simulator](https://github.com/llm-d/llm-d-inference-sim/tree/main) to simulate a backend model server. - This setup uses the least amount of compute resources, does not require GPU's, and is ideal for test/dev environments. + 2. Set the Kgateway version and install the Kgateway CRDs. - To deploy the vLLM simulator, run the following command. + ```bash + KGTW_VERSION=v2.2.0-main + helm upgrade -i --create-namespace --namespace kgateway-system --version $KGTW_VERSION kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds + ``` - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/sim-deployment.yaml - ``` + 3. Install Kgateway -### Install the Inference Extension CRDs + ```bash + helm upgrade -i --namespace kgateway-system --version $KGTW_VERSION kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway --set inferenceExtension.enabled=true + ``` + +=== "Agentgateway" + + 1. Requirements + + - Gateway API [CRDs](https://gateway-api.sigs.k8s.io/guides/#installing-gateway-api) installed. + - [Helm](https://helm.sh/docs/intro/install/) installed. + + 2. Set the Kgateway version and install the Kgateway CRDs. + + ```bash + KGTW_VERSION=v2.2.0-main + helm upgrade -i --create-namespace --namespace kgateway-system --version $KGTW_VERSION kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds + ``` + + 3. Install Kgateway + + ```bash + helm upgrade -i --namespace kgateway-system --version $KGTW_VERSION kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway --set inferenceExtension.enabled=true --set agentgateway.enabled=true + ``` + +### Deploy the InferencePool and Endpoint Picker Extension + + Install an InferencePool named `vllm-llama3-8b-instruct` that selects from endpoints with label `app: vllm-llama3-8b-instruct` and listening on port 8000. 
The Helm install command automatically installs the endpoint-picker, InferencePool along with provider specific resources. + + Set the chart version and then select a tab to follow the provider-specific instructions. ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/latest/download/manifests.yaml + export IGW_CHART_VERSION=v1.0.2 ``` +--8<-- "site-src/_includes/epp.md" + ### Deploy an Inference Gateway Choose one of the following options to deploy an Inference Gateway. @@ -90,10 +129,10 @@ A cluster with: See [Deploy Inference Gateways](https://cloud.google.com/kubernetes-engine/docs/how-to/deploy-gke-inference-gateway) for detailed instructions. - 2. Deploy Inference Gateway: + 2. Deploy the Inference Gateway: ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gateway.yaml + kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/gke/gateway.yaml ``` Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: @@ -106,7 +145,7 @@ A cluster with: 3. Deploy the HTTPRoute ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/httproute.yaml + kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/gke/httproute.yaml ``` 4. Confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True`: @@ -114,40 +153,19 @@ A cluster with: ```bash kubectl get httproute llm-route -o yaml ``` - + === "Istio" Please note that this feature is currently in an experimental phase and is not intended for production use. The implementation and user experience are subject to changes as we continue to iterate on this project. - 1. Requirements - - - Gateway API [CRDs](https://gateway-api.sigs.k8s.io/guides/#installing-gateway-api) installed. - - 2. Install Istio - - ``` - TAG=$(curl https://storage.googleapis.com/istio-build/dev/1.27-dev) - # on Linux - wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-linux-amd64.tar.gz - tar -xvf istioctl-$TAG-linux-amd64.tar.gz - # on macOS - wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-osx.tar.gz - tar -xvf istioctl-$TAG-osx.tar.gz - # on Windows - wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-win.zip - unzip istioctl-$TAG-win.zip - - ./istioctl install --set tag=$TAG --set hub=gcr.io/istio-testing --set values.pilot.env.ENABLE_GATEWAY_API_INFERENCE_EXTENSION=true - ``` - - 3. If you run the Endpoint Picker (EPP) with the `--secure-serving` flag set to `true` (the default mode), it is currently using a self-signed certificate. As a security measure, Istio does not trust self-signed certificates by default. As a temporary workaround, you can apply the destination rule to bypass TLS verification for EPP. A more secure TLS implementation in EPP is being discussed in [Issue 582](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/582). + 1. If your EPP uses secure serving with self-signed certs (default), temporarily bypass TLS verification: ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml ``` - 4. Deploy Gateway + 2. 
Deploy the Inference Gateway ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/gateway.yaml @@ -160,13 +178,13 @@ A cluster with: inference-gateway inference-gateway True 22s ``` - 6. Deploy the HTTPRoute + 3. Deploy the HTTPRoute ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/httproute.yaml ``` - 7. Confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True`: + 4. Confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True`: ```bash kubectl get httproute llm-route -o yaml @@ -174,32 +192,13 @@ A cluster with: === "Kgateway" - [Kgateway](https://kgateway.dev/) recently added support for inference extension as a **technical preview**. This means do not - run Kgateway with inference extension in production environments. Refer to [Issue 10411](https://github.com/kgateway-dev/kgateway/issues/10411) - for the list of caveats, supported features, etc. - - 1. Requirements - - - [Helm](https://helm.sh/docs/intro/install/) installed. - - Gateway API [CRDs](https://gateway-api.sigs.k8s.io/guides/#installing-gateway-api) installed. - - 2. Set the Kgateway version and install the Kgateway CRDs. - - ```bash - KGTW_VERSION=v2.0.4 - helm upgrade -i --create-namespace --namespace kgateway-system --version $KGTW_VERSION kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds - ``` - - 3. Install Kgateway - - ```bash - helm upgrade -i --namespace kgateway-system --version $KGTW_VERSION kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway --set inferenceExtension.enabled=true - ``` + [Kgateway](https://kgateway.dev/) is a Gateway API and Inference Gateway + [conformant](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/conformance/reports/v1.0.0/gateway/kgateway) gateway. Follow these steps to run Kgateway: - 4. Deploy the Gateway + 1. Deploy the Inference Gateway ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml + kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/kgateway/gateway.yaml ``` Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: @@ -209,13 +208,13 @@ A cluster with: inference-gateway kgateway True 22s ``` - 5. Deploy the HTTPRoute + 2. Deploy the HTTPRoute ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/httproute.yaml + kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/kgateway/httproute.yaml ``` - 6. Confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True`: + 3. Confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True`: ```bash kubectl get httproute llm-route -o yaml @@ -223,30 +222,14 @@ A cluster with: === "Agentgateway" - [Agentgateway](https://agentgateway.dev/) is a purpose-built proxy designed for AI workloads, and comes with native support for inference routing. Agentgateway integrates with [Kgateway](https://kgateway.dev/) as it's control plane. - - 1. Requirements - - - [Helm](https://helm.sh/docs/intro/install/) installed. 
- - Gateway API [CRDs](https://gateway-api.sigs.k8s.io/guides/#installing-gateway-api) installed. - - 2. Set the Kgateway version and install the Kgateway CRDs. - - ```bash - KGTW_VERSION=v2.0.4 - helm upgrade -i --create-namespace --namespace kgateway-system --version $KGTW_VERSION kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds - ``` - - 3. Install Kgateway - - ```bash - helm upgrade -i --namespace kgateway-system --version $KGTW_VERSION kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway --set inferenceExtension.enabled=true --set agentGateway.enabled=true - ``` + [Agentgateway](https://agentgateway.dev/) is a purpose-built proxy designed for AI workloads, and comes with native support for Inference Gateway. + Agentgateway integrates with [Kgateway](https://kgateway.dev/) as its control plane. Follow these steps to run Kgateway with the agentgateway + data plane: - 4. Deploy the Gateway + 1. Deploy the Inference Gateway ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/agentgateway/gateway.yaml + kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/agentgateway/gateway.yaml ``` Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: @@ -256,103 +239,72 @@ A cluster with: inference-gateway agentgateway True 22s ``` - 5. Deploy the HTTPRoute + 2. Deploy the HTTPRoute ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/agentgateway/httproute.yaml + kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/agentgateway/httproute.yaml ``` - 6. Confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True`: + 3. Confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True`: ```bash kubectl get httproute llm-route -o yaml ``` - -### Deploy the InferencePool and Endpoint Picker Extension - - Install an InferencePool named `vllm-llama3-8b-instruct` that selects from endpoints with label app: vllm-llama3-8b-instruct and listening on port 8000, you can run the following command: - - ```bash - export GATEWAY_PROVIDER=none # See [README](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/charts/inferencepool/README.md#configuration) for valid configurations - helm install vllm-llama3-8b-instruct \ - --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ - --set provider.name=$GATEWAY_PROVIDER \ - --version v0.5.1 \ - oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool - ``` - - The Helm install automatically installs the endpoint-picker, inferencepool along with provider specific resources. - ### Deploy InferenceObjective (Optional) - Deploy the sample InferenceObjective which allows you to specify priority of requests. +Deploy the sample InferenceObjective which allows you to specify the priority of requests.
```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferenceobjective.yaml + kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/inferenceobjective.yaml ``` +--8<-- "site-src/_includes/test.md" -### Try it out - - Wait until the gateway is ready. - - ```bash - IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}') - PORT=80 - - curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ - "model": "food-review-1", - "prompt": "Write as if you were a critic: San Francisco", - "max_tokens": 100, - "temperature": 0 - }' - ``` +--8<-- "site-src/_includes/bbr.md" ### Cleanup The following instructions assume you would like to cleanup ALL resources that were created in this quickstart guide. Please be careful not to delete resources you'd like to keep. - 1. Uninstall the InferencePool, InferenceModel, and model server resources + 1. Uninstall the InferencePool, InferenceObjective and model server resources ```bash - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool-resources.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferenceobjective.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found + helm uninstall vllm-llama3-8b-instruct + kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/inferenceobjective.yaml --ignore-not-found + kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found + kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found + kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/vllm/sim-deployment.yaml --ignore-not-found kubectl delete secret hf-token --ignore-not-found ``` - 1. 
Uninstall the Gateway API resources - - ```bash - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gateway.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/healthcheck.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gcp-backend-policy.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/httproute.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/gateway.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/httproute.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/httproute.yaml --ignore-not-found - ``` - 1. Uninstall the Gateway API Inference Extension CRDs ```bash - kubectl delete -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.0.2/manifests.yaml --ignore-not-found ``` - + 1. Choose one of the following options to cleanup the Inference Gateway. === "GKE" - No further clean up is needed. + ```bash + kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/gke/gateway.yaml --ignore-not-found + kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/gke/healthcheck.yaml --ignore-not-found + kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/gke/gcp-backend-policy.yaml --ignore-not-found + kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/gke/httproute.yaml --ignore-not-found + ``` === "Istio" - The following instructions assume you would like to clean up ALL Istio resources that were created in this quickstart guide. + ```bash + kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/istio/gateway.yaml --ignore-not-found + kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/istio/destination-rule.yaml --ignore-not-found + kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/istio/httproute.yaml --ignore-not-found + ``` + + The following steps assume you would like to clean up ALL Istio resources that were created in this quickstart guide. 1. 
Uninstall All Istio resources @@ -360,16 +312,20 @@ A cluster with: istioctl uninstall -y --purge ``` - 1. Remove the Istio namespace + 2. Remove the Istio namespace ```bash kubectl delete ns istio-system ``` - === "Kgateway" - The following instructions assume you would like to cleanup ALL Kgateway resources that were created in this quickstart guide. + ```bash + kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/kgateway/gateway.yaml --ignore-not-found + kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/kgateway/httproute.yaml --ignore-not-found + ``` + + The following steps assume you would like to cleanup ALL Kgateway resources that were created in this quickstart guide. 1. Uninstall Kgateway @@ -377,13 +333,13 @@ A cluster with: helm uninstall kgateway -n kgateway-system ``` - 1. Uninstall the Kgateway CRDs. + 2. Uninstall the Kgateway CRDs. ```bash helm uninstall kgateway-crds -n kgateway-system ``` - 1. Remove the Kgateway namespace. + 3. Remove the Kgateway namespace. ```bash kubectl delete ns kgateway-system @@ -391,7 +347,12 @@ A cluster with: === "Agentgateway" - The following instructions assume you would like to cleanup ALL Kgateway resources that were created in this quickstart guide. + ```bash + kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/agentgateway/gateway.yaml --ignore-not-found + kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.0.2/config/manifests/gateway/agentgateway/httproute.yaml --ignore-not-found + ``` + + The following steps assume you would like to cleanup ALL Kgateway resources that were created in this quickstart guide. 1. Uninstall Kgateway diff --git a/site-src/guides/inferencepool-rollout.md b/site-src/guides/inferencepool-rollout.md index f3d929466..b274a7262 100644 --- a/site-src/guides/inferencepool-rollout.md +++ b/site-src/guides/inferencepool-rollout.md @@ -34,7 +34,6 @@ teams can ensure stability and performance, quickly identifying and reverting an 1. **Deploy new infrastructure**: Create a new InferencePool configured with the new node(compute/accelerator) / model server / base model that you chose. 1. **Configure traffic splitting**: Use an HTTPRoute to split traffic between the existing InferencePool and the new InferencePool. The `backendRefs.weight` field controls the traffic percentage allocated to each pool. -1. **Maintain InferenceModel integrity**: Retain the existing InferenceModel configuration to ensure uniform model behavior across both node configurations or base model versions or model server versions. 1. **Preserve rollback capability**: Retain the original nodes and InferencePool during the roll out to facilitate a rollback if necessary. ## Example @@ -45,276 +44,10 @@ Follow the steps in the [main guide](index.md) ### Deploy new infrastructure You start with an existing InferencePool named vllm-llama3-8b-instruct. 
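+The traffic-splitting step described above is expressed on the HTTPRoute; a minimal sketch, assuming a hypothetical replacement pool named vllm-llama3-8b-instruct-new and an illustrative 90/10 split:
+
+```bash
+kubectl apply -f - <<EOF
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-route
+spec:
+  parentRefs:
+  - name: inference-gateway
+  rules:
+  - backendRefs:
+    # Existing pool keeps most of the traffic.
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-llama3-8b-instruct
+      weight: 90
+    # New pool receives a small canary share.
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-llama3-8b-instruct-new
+      weight: 10
+EOF
+```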
-To replace the original InferencePool, you create a new InferencePool named vllm-llama3-8b-instruct-new along with -InferenceModels and Endpoint Picker Extension configured with the updated node specifications of `nvidia-h100-80gb` accelerator type, +To replace the original InferencePool, you create a new InferencePool, configured to select the pods with the `nvidia-h100-80gb` accelerator type. -```yaml -kubectl apply -f - < ``` With this, all requests should be served by the new Inference Pool. diff --git a/site-src/guides/metrics-and-observability.md b/site-src/guides/metrics-and-observability.md index 34b0db100..d9dab803a 100644 --- a/site-src/guides/metrics-and-observability.md +++ b/site-src/guides/metrics-and-observability.md @@ -17,7 +17,7 @@ This guide describes the current state of exposed metrics and how to scrape them "max_tokens": 10, "temperature": 0, "stream": true, - "stream_options": {"include_usage": "true"} + "stream_options": {"include_usage": true} }' ``` @@ -32,15 +32,15 @@ This guide describes the current state of exposed metrics and how to scrape them | **Metric name** | **Metric Type** |
**Description**
|
**Labels**
| **Status** | |:---------------------------------------------|:-----------------|:------------------------------------------------------------------|:-----------------------------------------------------------------------------------|:------------| -| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | -| inference_model_request_error_total | Counter | The counter of requests errors broken out for each model. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | -| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_objective_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_objective_request_error_total | Counter | The counter of requests errors broken out for each model. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_objective_request_error_total | Counter | The counter of request errors broken out for each model. | `model_name`=<model-name>

`target_model_name`=<target-model-name> | ALPHA | | normalized_time_per_output_token_seconds | Distribution | Distribution of ntpot (response latency per output token) | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | -| inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | -| inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | -| inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | -| inference_model_output_tokens | Distribution | Distribution of output token count. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | -| inference_model_running_requests | Gauge | Number of running requests for each model. | `model_name`=<model-name> | ALPHA | +| inference_objective_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_objective_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_objective_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_objective_output_tokens | Distribution | Distribution of output token count. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_objective_running_requests | Gauge | Number of running requests for each model. | `model_name`=<model-name> | ALPHA | | inference_pool_average_kv_cache_utilization | Gauge | The average kv cache utilization for an inference server pool. | `name`=<inference-pool-name> | ALPHA | | inference_pool_average_queue_size | Gauge | The average number of requests pending in the model server queue. | `name`=<inference-pool-name> | ALPHA | | inference_pool_per_pod_queue_size | Gauge | The total number of queue for each model server pod under the inference pool | `model_server_pod`=<model-server-pod-name>
`name`=<inference-pool-name> | ALPHA | @@ -53,6 +53,15 @@ This guide describes the current state of exposed metrics and how to scrape them |:---------------------------|:-----------------|:-------------------------------------------------|:------------------------------------------|:------------| | lora_syncer_adapter_status | Gauge | Status of LoRA adapters (1=loaded, 0=not_loaded) | `adapter_name`=<adapter-id> | ALPHA | +### Flow Control Metrics (Experimental) + +These metrics provide insights into the experimental flow control layer within the EPP. + +| **Metric name** | **Metric Type** |
**Description**
|
**Labels**
| **Status** | +|:---|:---|:---|:---|:---| +| inference_extension_flow_control_request_queue_duration_seconds | Distribution | Distribution of the total time requests spend in the flow control layer. This is measured from the moment a request enters the `EnqueueAndWait` function until it reaches a final outcome (e.g., Dispatched, Rejected, Evicted). | `fairness_id`=<flow-id>
`priority`=<flow-priority>
`outcome`=<QueueOutcome> | ALPHA | +| inference_extension_flow_control_queue_size | Gauge | The current number of requests being actively managed by the flow control layer. This counts requests from the moment they enter the `EnqueueAndWait` function until they reach a final outcome. | `fairness_id`=<flow-id>
`priority`=<flow-priority> | ALPHA |
+

## Scrape Metrics & Pprof profiles

The metrics endpoints are exposed on different ports by default:
@@ -216,7 +225,7 @@ A template alert rule is available at [alert.yaml](../../tools/alerts/alert.yaml

```yaml
alert: HighInferenceRequestLatencyP99
-expr: histogram_quantile(0.99, rate(inference_model_request_duration_seconds_bucket[5m])) > 10.0 # Adjust threshold as needed (e.g., 10.0 seconds)
+expr: histogram_quantile(0.99, rate(inference_objective_request_duration_seconds_bucket[5m])) > 10.0 # Adjust threshold as needed (e.g., 10.0 seconds)
for: 5m
annotations:
  title: 'High latency (P99) for model {% raw %}{{ $labels.model_name }}{% endraw %}'
@@ -229,7 +238,7 @@ labels:

```yaml
alert: HighInferenceErrorRate
-expr: sum by (model_name) (rate(inference_model_request_error_total[5m])) / sum by (model_name) (rate(inference_model_request_total[5m])) > 0.05 # Adjust threshold as needed (e.g., 5% error rate)
+expr: sum by (model_name) (rate(inference_objective_request_error_total[5m])) / sum by (model_name) (rate(inference_objective_request_total[5m])) > 0.05 # Adjust threshold as needed (e.g., 5% error rate)
for: 5m
annotations:
  title: 'High error rate for model {% raw %}{{ $labels.model_name }}{% endraw %}'
diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md
index 92bfedaca..25198e88f 100644
--- a/site-src/guides/serve-multiple-genai-models.md
+++ b/site-src/guides/serve-multiple-genai-models.md
@@ -1,10 +1,8 @@
 # Serve multiple generative AI models

A company wants to deploy multiple large language models (LLMs) to a cluster to serve different workloads.
-For example, they might want to deploy a Gemma3 model for a chatbot interface and a DeepSeek model for a recommendation application.
-The company needs to ensure optimal serving performance for these LLMs.
-By using an Inference Gateway, you can deploy these LLMs on your cluster with your chosen accelerator configuration in an `InferencePool`.
-You can then route requests based on the model name (such as `chatbot` and `recommender`) and the `Criticality` property.
+For example, they might want to deploy a Gemma3 model for a chatbot interface and a DeepSeek model for a recommendation application (or, as in the example in this guide, a combination of a Llama3 model and a smaller Phi4 model). You may choose to locate these two models at two different L7 URL paths and follow the steps described in the [`Getting started`](index.md) guide for each model. However, you may also need to serve multiple models at the same L7 URL path and rely on parsing information such as
+the model name in LLM requests, as defined in the OpenAI API format that most models use. For such model-aware routing, you can use the Body-Based Routing feature as described in this guide.

## How

@@ -12,135 +10,235 @@ The following diagram illustrates how an Inference Gateway routes requests to di

The model name is extracted by [Body-Based routing](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) (BBR) from the request body to the header. The header is then matched to dispatch requests to different `InferencePool` (and their EPPs) instances.
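Concretely, BBR reads the `model` field of an OpenAI-format request body and surfaces it as a request header; a hedged illustration (the gateway address and model name are placeholders):

```bash
# The `model` field below is what BBR copies into the X-Gateway-Model-Name
# header; HTTPRoute header matches then select the matching InferencePool.
curl -X POST -i ${IP}:${PORT}/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "messages": [{"role": "user", "content": "Hello"}]
  }'
# After BBR runs, the request effectively carries:
#   X-Gateway-Model-Name: meta-llama/Llama-3.1-8B-Instruct
```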
-![Serving multiple generative AI models](../images/serve-mul-gen-AI-models.png)

-### Deploy Body-Based Routing
+### Example Model-Aware Routing using Body-Based Routing (BBR)

-To enable body-based routing, you need to deploy the Body-Based Routing ExtProc server using Helm. Depending on your Gateway provider, you can use one of the following commands:
+This guide assumes you have already set up the cluster for basic model serving as described in the [`Getting started`](index.md) guide; it describes the additional steps needed from that point onward to deploy and exercise an example of routing across multiple models.
+
+
+### Deploy Body-Based Routing Extension
+
+To enable body-based routing, you need to deploy the Body-Based Routing ExtProc server using Helm. This is a separate ExtProc server from the Endpoint Picker; when installed, it is automatically inserted at the start of the gateway's ExtProc chain, ahead of other ExtProc servers such as the EPP.
+
+First, install this server. Depending on your Gateway provider, you can use one of the following commands:

=== "GKE"

    ```bash
    helm install body-based-router \
-   --set provider.name=gke \
-   --version v0.5.1 \
-   oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing
+   --set provider.name=gke \
+   --version v1.0.0 \
+   oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing
    ```

=== "Istio"

    ```bash
    helm install body-based-router \
-   --set provider.name=istio \
-   --version v0.5.1 \
-   oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing
+   --set provider.name=istio \
+   --version v1.0.0 \
+   oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing
    ```

=== "Other"

    ```bash
    helm install body-based-router \
-   --version v0.5.1 \
-   oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing
+   --version v1.0.0 \
+   oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing
+   ```
+
+Once this is installed, verify that the BBR pod is running without errors using the command `kubectl get pods`.
+
+### Serving a Second Base Model
+Next, deploy the second base model, which will be served from the same L7 path (which is `/`) as the `meta-llama/Llama-3.1-8B-Instruct` model already being served after following the steps from the [`Getting started`](index.md) guide. In this example, the second model is `microsoft/Phi-4-mini-instruct`, a relatively small model (about 3B parameters) from Hugging Face. Note that for this exercise, there need to be at least 2 GPUs available on the system, one for each of the two models being served. Serve the second model via the following command.
+
+```bash
+kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/heads/main/config/manifests/bbr-example/vllm-phi4-mini.yaml
+```
+Once this is installed, and after allowing for model download and startup time, which can take several minutes, verify that the pod with this second LLM, phi4-mini, is running without errors using the command `kubectl get pods`.
+
+### Deploy the 2nd InferencePool and Endpoint Picker Extension
+We also want an InferencePool and Endpoint Picker for this second model, in addition to the Body-Based Router, so that we can schedule across multiple endpoints or LoRA adapters within each base model. Hence we create these for our second model as follows.
+
+=== "GKE"
+
+    ```bash
+    export GATEWAY_PROVIDER=gke
+    helm install vllm-phi4-mini-instruct \
+    --set inferencePool.modelServers.matchLabels.app=phi4-mini \
+    --set provider.name=$GATEWAY_PROVIDER \
+    --version v1.0.0 \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
+    ```
+
+=== "Istio"
+
+    ```bash
+    export GATEWAY_PROVIDER=istio
+    helm install vllm-phi4-mini-instruct \
+    --set inferencePool.modelServers.matchLabels.app=phi4-mini \
+    --set provider.name=$GATEWAY_PROVIDER \
+    --version v1.0.0 \
+    oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
    ```

+After executing this, verify that you see two InferencePools and two EPP pods, one per base model type, running without errors, using the commands `kubectl get inferencepools` and `kubectl get pods`.
+
### Configure HTTPRoute

-This example illustrates a conceptual example regarding how to use the `HTTPRoute` object to route based on model name like “chatbot” or “recommender” to `InferencePool`.
+Before configuring the HTTPRoutes for the models, we need to delete the prior HTTPRoute created for the vllm-llama3-8b-instruct model, because we will alter the routing to also match on the model name, as determined by the `X-Gateway-Model-Name` HTTP header that the BBR extension inserts after parsing the model name from the body of the LLM request message.
+
+```bash
+kubectl delete httproute llm-route
+```
+
+Now configure new HTTPRoutes, one per model we want to serve via BBR, using the following command, which configures both routes. Also examine this manifest file to see how the `X-Gateway-Model-Name` header is used for a header match in the Gateway's rules to route requests to the correct backend based on the model name. For convenience, the manifest is also listed below.
+
+```bash
+kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/bbr-example/httproute_bbr.yaml
+```

```yaml
+---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
-  name: routes-to-llms
+  name: llm-llama-route
spec:
  parentRefs:
-  - name: inference-gateway
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
  rules:
-  - matches:
-    - headers:
-      - type: Exact
-        name: X-Gateway-Model-Name # (1)!
-        value: chatbot
-      path:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-llama3-8b-instruct
+    matches:
+    - path:
        type: PathPrefix
        value: /
-    backendRefs:
-    - name: gemma3
-      group: inference.networking.x-k8s.io
+      headers:
+      - type: Exact
+        #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header.
+        name: X-Gateway-Model-Name # (1)!
+        value: 'meta-llama/Llama-3.1-8B-Instruct'
+    timeouts:
+      request: 300s
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-phi4-route
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
-  - matches:
-    - headers:
-      - type: Exact
-        name: X-Gateway-Model-Name # (2)!
-        value: recommender
-      path:
+      name: vllm-phi4-mini-instruct
+    matches:
+    - path:
        type: PathPrefix
        value: /
-    backendRefs:
-    - name: deepseek-r1
-      group: inference.networking.x-k8s.io
-      kind: InferencePool
+      headers:
+      - type: Exact
+        #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header.
+        name: X-Gateway-Model-Name
+        value: 'microsoft/Phi-4-mini-instruct'
+    timeouts:
+      request: 300s
+---
```

-1. [BBR](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header with key `X-Gateway-Model-Name`. The header can then be used in the `HTTPRoute` to route requests to different `InferencePool` instances.
-2. [BBR](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header with key `X-Gateway-Model-Name`. The header can then be used in the `HTTPRoute` to route requests to different `InferencePool` instances.

+Before testing the setup, confirm that the HTTPRoute status conditions include `Accepted=True` and `ResolvedRefs=True` for both routes using the following commands.

-## Try it out

+```bash
+kubectl get httproute llm-llama-route -o yaml
+```

-1. Get the gateway IP:
```bash
-IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=80
+kubectl get httproute llm-phi4-route -o yaml
```

+## Try it out
+
+1. Get the gateway IP:
+   ```bash
+   IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=80
+   ```
+
=== "Chat Completions API"

-    1. Send a few requests to model `chatbot` as follows:
-       ```bash
-       curl -X POST -i ${IP}:${PORT}/v1/chat/completions \
-       -H "Content-Type: application/json" \
-       -d '{
-       "model": "chatbot",
-       "messages": [{"role": "user", "content": "What is the color of the sky?"}],
-       "max_tokens": 100,
-       "temperature": 0
-       }'
-       ```
-
-    2. Send a few requests to model `recommender` as follows:
-       ```bash
-       curl -X POST -i ${IP}:${PORT}/v1/chat/completions \
-       -H "Content-Type: application/json" \
-       -d '{
-       "model": "recommender",
-       "messages": [{"role": "user", "content": "Give me restaurant recommendations in Paris"}],
-       "max_tokens": 100,
-       "temperature": 0
-       }'
-       ```
+    1. Send a few requests to the Llama model as follows:
+       ```bash
+       curl -X POST -i ${IP}:${PORT}/v1/chat/completions \
+       -H "Content-Type: application/json" \
+       -d '{
+       "model": "meta-llama/Llama-3.1-8B-Instruct",
+       "max_tokens": 100,
+       "temperature": 0,
+       "messages": [
+         {
+           "role": "developer",
+           "content": "You are a helpful assistant."
+         },
+         {
+           "role": "user",
+           "content": "Linux is said to be an open source kernel because "
+         }
+       ]
+       }'
+       ```
+
+    2. Send a few requests to the Phi4 model as follows:
+       ```bash
+       curl -X POST -i ${IP}:${PORT}/v1/chat/completions \
+       -H "Content-Type: application/json" \
+       -d '{
+       "model": "microsoft/Phi-4-mini-instruct",
+       "max_tokens": 100,
+       "temperature": 0,
+       "messages": [
+         {
+           "role": "developer",
+           "content": "You are a helpful assistant."
+         },
+         {
+           "role": "user",
+           "content": "2+2 is "
+         }
+       ]
+       }'
+       ```

=== "Completions API"

-    1.
Send a few requests to model `chatbot` as follows:
-       ```bash
-       curl -X POST -i ${IP}:${PORT}/v1/completions \
-       -H 'Content-Type: application/json' \
-       -d '{
-       "model": "chatbot",
-       "prompt": "What is the color of the sky",
-       "max_tokens": 100,
-       "temperature": 0
-       }'
-       ```
-
-    2. Send a few requests to model `recommender` as follows:
-       ```bash
-       curl -X POST -i ${IP}:${PORT}/v1/completions \
-       -H 'Content-Type: application/json' \
-       -d '{
-       "model": "recommender",
-       "prompt": "Give me restaurant recommendations in Paris",
-       "max_tokens": 100,
-       "temperature": 0
-       }'
-       ```
+    1. Send a few requests to the Llama model as follows:
+       ```bash
+       curl -X POST -i ${IP}:${PORT}/v1/completions \
+       -H "Content-Type: application/json" \
+       -d '{
+       "model": "meta-llama/Llama-3.1-8B-Instruct",
+       "prompt": "Linux is said to be an open source kernel because ",
+       "max_tokens": 100,
+       "temperature": 0
+       }'
+       ```
+
+    2. Send a few requests to the Phi4 model as follows:
+       ```bash
+       curl -X POST -i ${IP}:${PORT}/v1/completions \
+       -H "Content-Type: application/json" \
+       -d '{
+       "model": "microsoft/Phi-4-mini-instruct",
+       "prompt": "2+2 is ",
+       "max_tokens": 20,
+       "temperature": 0
+       }'
+       ```
+
diff --git a/site-src/guides/serve-multiple-lora-adapters.md b/site-src/guides/serve-multiple-lora-adapters.md
deleted file mode 100644
index 59cfe7208..000000000
--- a/site-src/guides/serve-multiple-lora-adapters.md
+++ /dev/null
@@ -1,98 +0,0 @@
-# Serve LoRA adapters on a shared pool
-A company wants to serve LLMs for document analysis and focuses on audiences in multiple languages, such as English and Spanish.
-They have a fine-tuned LoRA adapter for each language, but need to efficiently use their GPU and TPU capacity.
-You can use an Inference Gateway to deploy dynamic LoRA fine-tuned adapters for each language (for example, `english-bot` and `spanish-bot`) on a common base model and accelerator.
-This lets you reduce the number of required accelerators by densely packing multiple models in a shared pool.
-
-## How
-The following diagram illustrates how Inference Gateway serves multiple LoRA adapters on a shared pool.
-![Serving LoRA adapters on a shared pool](../images/serve-LoRA-adapters.png)
-This example illustrates how you can densely serve multiple LoRA adapters with distinct workload performance objectives on a common InferencePool.
-```yaml
-apiVersion: gateway.networking.x-k8s.io/v1alpha1
-kind: InferencePool
-metadata:
-  name: gemma3
-spec:
-  selector:
-    pool: gemma3
-```
-Let us say we have a couple of LoRA adapters named “english-bot” and “spanish-bot” for the Gemma3 base model.
-You can create an `InferenceModel` resource and associate these LoRA adapters to the relevant InferencePool resource.
-In this case, we associate these LoRA adapters to the gemma3 InferencePool resource created above.
-
-```yaml
-apiVersion: inference.networking.x-k8s.io/v1alpha2
-kind: InferenceModel
-metadata:
-  name: english-bot
-spec:
-  criticality: 1
-  poolRef:
-    name: gemma3
-
----
-apiVersion: inference.networking.x-k8s.io/v1alpha2
-kind: InferenceModel
-metadata:
-  name: spanish-bot
-spec:
-  criticality: 2
-  poolRef:
-    name: gemma3
-
-```
-Now, you can route your requests from the gateway using the `HTTPRoute` object.
-```yaml
-apiVersion: gateway.networking.k8s.io/v1
-kind: Gateway
-metadata:
-  name: inference-gateway
-spec:
-  listeners:
-  - protocol: HTTP
-    port: 80
-    name: http
-
----
-kind: HTTPRoute
-apiVersion: gateway.networking.k8s.io/v1
-metadata:
-  name: routes-to-llms
-spec:
-  parentRefs:
-  - name: inference-gateway
-  rules:
-  - matches:
-      path:
-        type: PathPrefix
-        value: /
-    backendRefs:
-    - name: gemma3
-      kind: InferencePool
-```
-
-## Try it out
-
-1. Get the gateway IP:
-```bash
-IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=80
-```
-2. Send a few requests to model "english-bot" as follows:
-```bash
-curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
-"model": "english-bot",
-"prompt": "What is the color of the sky",
-"max_tokens": 100,
-"temperature": 0
-}'
-```
-3. Send a few requests to model "spanish-bot" as follows:
-```bash
-curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
-"model": "spanish-bot",
-"prompt": "¿De qué color es...?",
-"max_tokens": 100,
-"temperature": 0
-}'
-``` \ No newline at end of file
diff --git a/site-src/guides/troubleshooting.md b/site-src/guides/troubleshooting.md
index c741d59bf..1f08619f7 100644
--- a/site-src/guides/troubleshooting.md
+++ b/site-src/guides/troubleshooting.md
@@ -16,7 +16,7 @@ This is a default gateway error, meaning the request never reached a backend ser

## 429 Too Many Requests

### `system saturated, sheddable request dropped`
-This error indicates that the entire inference pool has exceeded its saturation thresholds. This means the system is under heavy load and is shedding non-critical requests. To address this, check the following:
+This error indicates that the entire inference pool has exceeded its saturation thresholds. This means the system is under heavy load and is shedding low-priority requests. To address this, check the following:

* gateway-api-inference-extension version:
    * **v0.5.1 and earlier**: Verify you're using an `InferenceModel` and that its `criticality` is set to `Critical`. This ensures requests are queued on the model servers instead of being dropped.
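For the v0.5.1-and-earlier case above, a minimal `InferenceModel` sketch (assuming the `inference.networking.x-k8s.io/v1alpha2` API group used elsewhere in these docs; the model and pool names are placeholders) might look like:

```bash
# Sketch only: marks a model Critical so its requests are queued on the model
# servers rather than shed under saturation. Adjust names to your deployment.
kubectl apply -f - <<EOF
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: my-model
spec:
  modelName: my-model
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
EOF
```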
diff --git a/site-src/images/alpha-stage.png b/site-src/images/alpha-stage.png
new file mode 100644
index 000000000..7ba1ca5a7
Binary files /dev/null and b/site-src/images/alpha-stage.png differ
diff --git a/site-src/images/ga-stage.png b/site-src/images/ga-stage.png
new file mode 100644
index 000000000..5448c290d
Binary files /dev/null and b/site-src/images/ga-stage.png differ
diff --git a/site-src/images/inference-overview.svg b/site-src/images/inference-overview.svg
index a82c09e26..8524ebbea 100644
--- a/site-src/images/inference-overview.svg
+++ b/site-src/images/inference-overview.svg
@@ -1 +1 @@
- \ No newline at end of file
+ \ No newline at end of file
diff --git a/site-src/images/migration-stage.png b/site-src/images/migration-stage.png
new file mode 100644
index 000000000..9adac2ffe
Binary files /dev/null and b/site-src/images/migration-stage.png differ
diff --git a/site-src/images/resource-model.png b/site-src/images/resource-model.png
index 0910c5b7a..216c856e2 100644
Binary files a/site-src/images/resource-model.png and b/site-src/images/resource-model.png differ
diff --git a/site-src/images/serve-LoRA-adapters.png b/site-src/images/serve-LoRA-adapters.png
deleted file mode 100644
index e33dc708a..000000000
Binary files a/site-src/images/serve-LoRA-adapters.png and /dev/null differ
diff --git a/site-src/images/serve-mul-gen-AI-models.png b/site-src/images/serve-mul-gen-AI-models.png
deleted file mode 100644
index 957a054f1..000000000
Binary files a/site-src/images/serve-mul-gen-AI-models.png and /dev/null differ
diff --git a/site-src/implementations/gateways.md b/site-src/implementations/gateways.md
index 5ab1e0920..62abe017f 100644
--- a/site-src/implementations/gateways.md
+++ b/site-src/implementations/gateways.md
@@ -8,6 +8,7 @@ This project has several implementations that are planned or in progress:

* [Google Kubernetes Engine][4]
* [Istio][5]
* [Kgateway][6]
+* [Kubvernor][7]

[1]:#agentgateway
[2]:#alibaba-cloud-container-service-for-kubernetes
@@ -15,12 +16,17 @@
[4]:#google-kubernetes-engine
[5]:#istio
[6]:#kgateway
+[7]:#kubvernor

## Agentgateway

-[Agentgateway](https://agentgateway.dev/) is an open source Gateway API implementation focusing on AI use cases, including LLM consumption, LLM serving, agent-to-agent ([A2A](https://a2aproject.github.io/A2A/latest/)), and agent-to-tool ([MCP](https://modelcontextprotocol.io/introduction)). It is the first and only proxy designed specifically for the Kubernetes Gateway API, powered by a high performance and scalable Rust dataplane implementation.
+[Agentgateway](https://agentgateway.dev/) is an open source Gateway API and Inference Gateway
+[v1.0.0 conformant](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/conformance/reports/v1.0.0/gateway/kgateway/agentgateway)
+implementation focusing on AI use cases, including LLM consumption, LLM serving, agent-to-agent ([A2A](https://a2aproject.github.io/A2A/latest/)),
+and agent-to-tool ([MCP](https://modelcontextprotocol.io/introduction)). It is the first and only proxy designed specifically for the Kubernetes Gateway API
+that is powered by a high-performance, scalable Rust dataplane.

-Agentgateway comes with native support for Gateway API Inference Extension, powered by the [Kgateway](https://kgateway.dev/) control plane.
+Agentgateway can run independently or can be managed by [Kgateway](https://kgateway.dev/).
## Alibaba Cloud Container Service for Kubernetes

@@ -41,7 +47,6 @@ by [this Issue](https://github.com/AliyunContainerService/ack-gateway-api/issues

[ack-gie]:https://www.alibabacloud.com/help/en/ack/product-overview/ack-gateway-with-inference-extension
[ack-gie-usage]:https://www.alibabacloud.com/help/en/ack/ack-managed-and-ack-dedicated/user-guide/intelligent-routing-and-traffic-management-with-ack-gateway-inference-extension

-
## Envoy AI Gateway

[Envoy AI Gateway][aigw-home] is an open source project built on top of
@@ -88,8 +93,16 @@ Issue](https://github.com/istio/istio/issues/55768).

## Kgateway

-[Kgateway](https://kgateway.dev/) is a Gateway API Inference Extension
-[conformant](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/conformance/reports/v0.5.1/gateway/kgateway)
-gateway that can run [independently](https://gateway-api-inference-extension.sigs.k8s.io/guides/#__tabbed_3_3), as an [Istio waypoint](https://kgateway.dev/blog/extend-istio-ambient-kgateway-waypoint/),
-or within your [llm-d infrastructure](https://github.com/llm-d-incubation/llm-d-infra) to improve accelerator (GPU)
-utilization for AI inference workloads.
+[Kgateway](https://kgateway.dev/) is a Gateway API and Inference Gateway
+[v1.0.0 conformant](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/conformance/reports/v1.0.0/gateway/kgateway)
+implementation that can run [independently](https://gateway-api-inference-extension.sigs.k8s.io/guides/#__tabbed_3_3), as an
+[Istio waypoint](https://kgateway.dev/blog/extend-istio-ambient-kgateway-waypoint/), or within your
+[llm-d infrastructure](https://github.com/llm-d-incubation/llm-d-infra) to improve accelerator (GPU) utilization for AI inference workloads.
+
+## Kubvernor
+
+[Kubvernor Rust API Gateway][krg] is an open-source, highly experimental implementation of an API gateway controller in the Rust programming language. Currently, Kubvernor supports Envoy Proxy. The project aims to be as generic as possible so that Kubvernor can be used to manage and deploy different gateways (Envoy, Nginx, HAProxy, etc.). See the [usage][krgu] documentation.
+
+[krg]:https://github.com/kubvernor/kubvernor
+[krgu]: https://github.com/kubvernor/kubvernor/blob/main/README.md
+
diff --git a/site-src/implementations/model-servers.md b/site-src/implementations/model-servers.md
index da9968fad..ed57e1252 100644
--- a/site-src/implementations/model-servers.md
+++ b/site-src/implementations/model-servers.md
@@ -19,34 +19,29 @@

vLLM is configured as the default in the [endpoint picker extension](https://git

Triton specific metric names need to be specified when starting the EPP.

-### Option 1: Use Helm
+Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install the `inferencepool` via Helm. See the [`inferencepool` helm guide](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/charts/inferencepool/README.md) for more details.

-Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install the [`inferencepool` via helm](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/charts/inferencepool). See the [`inferencepool` helm guide](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/charts/inferencepool/README.md) for more details.
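Combining that `--set` flag with the chart install pattern shown earlier in these docs gives a sketch like the following (the release name, label selector, and chart version are illustrative assumptions):

```bash
# Sketch: install the InferencePool chart with Triton-specific metric scraping.
helm install triton-pool \
  --set inferencePool.modelServers.matchLabels.app=triton \
  --set inferencePool.modelServerType=triton-tensorrt-llm \
  --version v1.0.0 \
  oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
```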
+ Add the following to the `flags` in the helm chart as [flags to EPP](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/29ea29028496a638b162ff287c62c0087211bbe5/config/charts/inferencepool/values.yaml#L36)

-### Option 2: Edit EPP deployment yaml
-
- Add the following to the `args` of the [EPP deployment](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/manifests/inferencepool-resources.yaml#L32)
-
- ```
-- --total-queued-requests-metric
-- "nv_trt_llm_request_metrics{request_type=waiting}"
-- --kv-cache-usage-percentage-metric
-- "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
-- --lora-info-metric
-- "" # Set an empty metric to disable LoRA metric scraping as they are not supported by Triton yet.
+```
+- name=total-queued-requests-metric
+  value="nv_trt_llm_request_metrics{request_type=waiting}"
+- name=kv-cache-usage-percentage-metric
+  value="nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
+- name=lora-info-metric
+  value="" # Set an empty metric to disable LoRA metric scraping as they are not supported by Triton yet.
```

## SGLang

-### Edit EPP deployment yaml
+ Add the following `flags` when deploying via the helm chart in the [EPP deployment](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/29ea29028496a638b162ff287c62c0087211bbe5/config/charts/inferencepool/values.yaml#L36)

- Add the following to the `args` of the [EPP deployment](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/manifests/inferencepool-resources.yaml#L32)

 ```
-- --totalQueuedRequestsMetric
-- "sglang:num_queue_reqs"
-- --kvCacheUsagePercentageMetric
-- "sglang:token_usage"
-- --lora-info-metric
-- "" # Set an empty metric to disable LoRA metric scraping as they are not supported by SGLang yet.
-```
+- name=total-queued-requests-metric
+  value="sglang:num_queue_reqs"
+- name=kv-cache-usage-percentage-metric
+  value="sglang:token_usage"
+- name=lora-info-metric
+  value="" # Set an empty metric to disable LoRA metric scraping as they are not supported by SGLang yet.
+``` \ No newline at end of file
diff --git a/site-src/index.md b/site-src/index.md
index a2892c5b8..0fbb338f8 100644
--- a/site-src/index.md
+++ b/site-src/index.md
@@ -1,11 +1,5 @@
 # Introduction

-???+ warning
-
-
-    Some portions of this site may be out of date with the v1.0.0 release candidate.
-    Updates under active development!
-
Gateway API Inference Extension is an official Kubernetes project that
optimizes self-hosting Generative Models on Kubernetes.

The overall resource model focuses on 2 new inference-focused
@@ -31,6 +25,7 @@ The following specific terms to this project:
  performance, availability and capabilities to optimize routing. Includes things like [Prefix Cache](https://docs.vllm.ai/en/stable/design/v1/prefix_caching.html) status or [LoRA Adapters](https://docs.vllm.ai/en/stable/features/lora.html) availability.
- **Endpoint Picker(EPP)**: An implementation of an `Inference Scheduler` with additional Routing, Flow, and Request Control layers to allow for sophisticated routing strategies. Additional info on the architecture of the EPP [here](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/0683-epp-architecture-proposal).
+- **Body-Based Router (BBR)**: An additional (and optional) extension that extracts information from the body of an inference request, currently the model name attribute of an OpenAI API request, which the gateway can then use for model-aware functions such as routing and scheduling. It may be used along with the EPP to combine model picking with endpoint picking.

[Inference Gateway]:#concepts-and-definitions

@@ -49,7 +44,7 @@ in a higher level **AI Gateways** like [LiteLLM](https://www.litellm.ai/), [Gloo

- **Model-aware routing**: Instead of simply routing based on the path of the request, an **[inference gateway]** allows you to route to models based on the model names. This is enabled by support for GenAI Inference API specifications (such as OpenAI API) in the gateway implementations such as in Envoy Proxy. This model-aware routing also extends to Low-Rank Adaptation (LoRA) fine-tuned models.

-- **Serving priority**: an **[inference gateway]** allows you to specify the serving priority of your models. For example, you can specify that your models for online inference of chat tasks (which is more latency sensitive) have a higher [*Criticality*](/reference/spec/#criticality) than a model for latency tolerant tasks such as a summarization.
+- **Serving priority**: an **[inference gateway]** allows you to specify the serving priority of your models. For example, you can specify that your models for online inference of chat tasks (which are more latency-sensitive) have a higher [*Priority*](/reference/spec/#priority) than a model for latency-tolerant tasks such as summarization.

- **Model rollouts**: an **[inference gateway]** allows you to incrementally roll out new model versions by traffic splitting definitions based on the model names.

diff --git a/site-src/reference/spec.md b/site-src/reference/spec.md
index 11080fe24..666ebe36b 100644
--- a/site-src/reference/spec.md
+++ b/site-src/reference/spec.md
@@ -15,6 +15,32 @@
 inference.networking.k8s.io API group.

+#### ControllerName
+
+_Underlying type:_ _string_
+
+ControllerName is the name of a controller that manages ParentStatus. It must be a domain prefixed
+path.
+
+Valid values include:
+
+* "example.com/bar"
+
+Invalid values include:
+
+* "example.com" - must include path
+* "foo.example.com" - must include path
+
+_Validation:_
+- MaxLength: 253
+- MinLength: 1
+- Pattern: `^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$`
+
+_Appears in:_
+- [ParentStatus](#parentstatus)
+
+
+
 #### EndpointPickerFailureMode

 _Underlying type:_ _string_

@@ -124,7 +150,7 @@ _Appears in:_

| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `selector` _[LabelSelector](#labelselector)_ | Selector determines which Pods are members of this inference pool.

It matches Pods by their labels only within the same namespace; cross-namespace
selection is not supported.
The structure of this LabelSelector is intentionally simple to be compatible
with Kubernetes Service selectors, as some implementations may translate
this configuration into a Service resource. | | | -| `targetPorts` _[Port](#port) array_ | TargetPorts defines a list of ports that are exposed by this InferencePool.
Currently, the list may only include a single port definition. | | MaxItems: 1
MinItems: 1
| +| `targetPorts` _[Port](#port) array_ | TargetPorts defines a list of ports that are exposed by this InferencePool.
Every port will be treated as a distinct endpoint by EPP,

addressable as a 'podIP:portNumber' combination. | | MaxItems: 8
MinItems: 1
| | `endpointPickerRef` _[EndpointPickerRef](#endpointpickerref)_ | EndpointPickerRef is a reference to the Endpoint Picker extension and its
associated configuration. | | | @@ -329,6 +355,7 @@ _Appears in:_ | --- | --- | --- | --- | | `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#condition-v1-meta) array_ | Conditions is a list of status conditions that provide information about the observed
state of the InferencePool. This field is required to be set by the controller that
manages the InferencePool.
Supported condition types are:
* "Accepted"
* "ResolvedRefs" | | MaxItems: 8
| | `parentRef` _[ParentReference](#parentreference)_ | ParentRef is used to identify the parent resource that this status
is associated with. It is used to match the InferencePool with the parent
resource, such as a Gateway. | | | +| `controllerName` _[ControllerName](#controllername)_ | ControllerName is a domain/path string that indicates the name of the controller that
wrote this status. This corresponds with the GatewayClass controllerName field when the
parentRef references a Gateway kind.
Example: "example.net/gateway-controller".
The format of this field is DOMAIN "/" PATH, where DOMAIN and PATH are valid Kubernetes names:
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
Controllers MAY populate this field when writing status. When populating this field, controllers
should ensure that entries to status populated with their ControllerName are cleaned up when they
are no longer necessary. | | MaxLength: 253
MinLength: 1
Pattern: `^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$`
| #### Port diff --git a/site-src/reference/x-v1a1-spec.md b/site-src/reference/x-v1a1-spec.md new file mode 100644 index 000000000..55bec76e0 --- /dev/null +++ b/site-src/reference/x-v1a1-spec.md @@ -0,0 +1,126 @@ +# API Reference + +## Packages +- [inference.networking.x-k8s.io/v1alpha1](#inferencenetworkingx-k8siov1alpha1) + + +## inference.networking.x-k8s.io/v1alpha1 + +Package v1alpha1 contains API Schema definitions for the +inference.networking.x-k8s.io API group. + + +### Resource Types +- [InferencePoolImport](#inferencepoolimport) + + + +#### ClusterName + +_Underlying type:_ _string_ + +ClusterName is the name of a cluster that exported the InferencePool. + +_Validation:_ +- MaxLength: 253 +- MinLength: 1 + +_Appears in:_ +- [ExportingCluster](#exportingcluster) + + + +#### ControllerName + +_Underlying type:_ _string_ + +ControllerName is the name of a controller that manages a resource. It must be a domain prefixed path. + +Valid values include: + + - "example.com/bar" + +Invalid values include: + + - "example.com" - must include path + - "foo.example.com" - must include path + +_Validation:_ +- MaxLength: 253 +- MinLength: 1 +- Pattern: `^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$` + +_Appears in:_ +- [ImportController](#importcontroller) + + + +#### ExportingCluster + + + +ExportingCluster defines a cluster that exported the InferencePool that backs this InferencePoolImport. + + + +_Appears in:_ +- [ImportController](#importcontroller) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _[ClusterName](#clustername)_ | Name of the exporting cluster (must be unique within the list). | | MaxLength: 253
MinLength: 1
Required: \{\}
| + + +#### ImportController + + + +ImportController defines a controller that is responsible for managing the InferencePoolImport. + + + +_Appears in:_ +- [InferencePoolImportStatus](#inferencepoolimportstatus) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _[ControllerName](#controllername)_ | Name is a domain/path string that indicates the name of the controller that manages the
InferencePoolImport. Name corresponds to the GatewayClass controllerName field when the
controller will manage parents of type "Gateway". Otherwise, the name is implementation-specific.
Example: "example.net/import-controller".
The format of this field is DOMAIN "/" PATH, where DOMAIN and PATH are valid Kubernetes
names (https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names).
A controller MUST populate this field when writing status and ensure that entries to status
populated with their controller name are removed when they are no longer necessary. | | MaxLength: 253
MinLength: 1
Pattern: `^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$`
| +| `exportingClusters` _[ExportingCluster](#exportingcluster) array_ | ExportingClusters is a list of clusters that exported the InferencePool(s) that back the
InferencePoolImport. Required when the controller is responsible for CRUD'ing the InferencePoolImport
from the exported InferencePool(s). | | | +| `parents` _ParentStatus array_ | Parents is a list of parent resources, typically Gateways, that are associated with the
InferencePoolImport, and the status of the InferencePoolImport with respect to each parent.
Ancestor would be a more accurate name, but Parent is consistent with InferencePool terminology.
Required when the controller manages the InferencePoolImport as an HTTPRoute backendRef. The controller
must add an entry for each parent it manages and remove the parent entry when the controller no longer
considers the InferencePoolImport to be associated with that parent. | | | +| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#condition-v1-meta) array_ | Conditions track the state of the InferencePoolImport.
Known condition types are:
* "Accepted" | | MaxItems: 8
| + + +#### InferencePoolImport + + + +InferencePoolImport is the Schema for the InferencePoolImports API. + + + + + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `inference.networking.x-k8s.io/v1alpha1` | | | +| `kind` _string_ | `InferencePoolImport` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `status` _[InferencePoolImportStatus](#inferencepoolimportstatus)_ | Status defines the observed state of the InferencePoolImport. | | | + + +#### InferencePoolImportStatus + + + +InferencePoolImportStatus defines the observed state of the InferencePoolImport. + + + +_Appears in:_ +- [InferencePoolImport](#inferencepoolimport) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `controllers` _[ImportController](#importcontroller) array_ | Controllers is a list of controllers that are responsible for managing the InferencePoolImport. | | MaxItems: 8
Required: \{\}
| + + diff --git a/site-src/reference/x-spec.md b/site-src/reference/x-v1a2-spec.md similarity index 97% rename from site-src/reference/x-spec.md rename to site-src/reference/x-v1a2-spec.md index 9151dcda6..c1a57ce3f 100644 --- a/site-src/reference/x-spec.md +++ b/site-src/reference/x-v1a2-spec.md @@ -123,12 +123,6 @@ performance and latency goals for the model. These workloads are expected to operate within an InferencePool sharing compute capacity with other InferenceObjectives, defined by the Inference Platform Admin. -InferenceObjective's modelName (not the ObjectMeta name) is unique for a given InferencePool, -if the name is reused, an error will be shown on the status of a -InferenceObjective that attempted to reuse. The oldest InferenceObjective, based on -creation timestamp, will be selected to remain valid. In the event of a race -condition, one will be selected at random. - _Appears in:_ diff --git a/test/cel/inferencepool_test.go b/test/cel/inferencepool_test.go index 8b3ba3ea5..deccb6e03 100644 --- a/test/cel/inferencepool_test.go +++ b/test/cel/inferencepool_test.go @@ -80,6 +80,20 @@ func TestValidateInferencePool(t *testing.T) { }, wantErrors: []string{"port is required when kind is 'Service' or unspecified (defaults to 'Service')"}, }, + { + desc: "passes validation with multiple unique port numbers", + mutate: func(ip *v1.InferencePool) { + ip.Spec.TargetPorts = []v1.Port{{Number: 8000}, {Number: 80}, {Number: 8081}, {Number: 443}} + }, + wantErrors: nil, + }, + { + desc: "fails validation with port numbers containing duplicates", + mutate: func(ip *v1.InferencePool) { + ip.Spec.TargetPorts = []v1.Port{{Number: 8000}, {Number: 80}, {Number: 8000}, {Number: 443}} + }, + wantErrors: []string{"port number must be unique"}, + }, } for _, tc := range testCases { diff --git a/test/e2e/epp/e2e_suite_test.go b/test/e2e/epp/e2e_suite_test.go index e6e5b83ba..b41f108f8 100644 --- a/test/e2e/epp/e2e_suite_test.go +++ b/test/e2e/epp/e2e_suite_test.go @@ -17,7 +17,6 @@ limitations under the License. package epp import ( - "context" "errors" "fmt" "os" @@ -29,34 +28,20 @@ import ( "github.com/onsi/gomega" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" - rbacv1 "k8s.io/api/rbac/v1" apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/serializer" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/kubernetes" clientgoscheme "k8s.io/client-go/kubernetes/scheme" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/config" infextv1 "sigs.k8s.io/gateway-api-inference-extension/api/v1" infextv1a2 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" testutils "sigs.k8s.io/gateway-api-inference-extension/test/utils" ) const ( - // defaultExistsTimeout is the default timeout for a resource to exist in the api server. - defaultExistsTimeout = 30 * time.Second - // defaultReadyTimeout is the default timeout for a resource to report a ready state. - defaultReadyTimeout = 3 * time.Minute - // defaultModelReadyTimeout is the default timeout for the model server deployment to report a ready state. - defaultModelReadyTimeout = 10 * time.Minute // defaultCurlTimeout is the default timeout for the curl command to get a response. 
defaultCurlTimeout = 30 * time.Second - // defaultInterval is the default interval to check if a resource exists or ready conditions. - defaultInterval = time.Millisecond * 250 // defaultCurlInterval is the default interval to run the test curl command. defaultCurlInterval = time.Second * 5 // defaultNsName is the default name of the Namespace used for tests. Can override using the E2E_NS environment variable. @@ -100,13 +85,8 @@ const ( const e2eLeaderElectionEnabledEnvVar = "E2E_LEADER_ELECTION_ENABLED" var ( - ctx = context.Background() - cli client.Client + testConfig *testutils.TestConfig // Required for exec'ing in curl pod - kubeCli *kubernetes.Clientset - scheme = runtime.NewScheme() - cfg = config.GetConfigOrDie() - nsName string e2eImage string leaderElectionEnabled bool ) @@ -119,10 +99,12 @@ func TestAPIs(t *testing.T) { } var _ = ginkgo.BeforeSuite(func() { - nsName = os.Getenv("E2E_NS") + nsName := os.Getenv("E2E_NS") if nsName == "" { nsName = defaultNsName } + testConfig = testutils.NewTestConfig(nsName) + e2eImage = os.Getenv("E2E_IMAGE") gomega.Expect(e2eImage).NotTo(gomega.BeEmpty(), "E2E_IMAGE environment variable is not set") @@ -143,11 +125,11 @@ func setupInfra() { // run this before createNs to fail fast in case it doesn't. modelServerManifestPath := readModelServerManifestPath() - createNamespace(cli, nsName) + createNamespace(testConfig) modelServerManifestArray := getYamlsFromModelServerManifest(modelServerManifestPath) if strings.Contains(modelServerManifestArray[0], "hf-token") { - createHfSecret(cli, modelServerSecretManifest) + createHfSecret(testConfig, modelServerSecretManifest) } crds := map[string]string{ "inferencepools.inference.networking.x-k8s.io": xInferPoolManifest, @@ -155,19 +137,19 @@ func setupInfra() { "inferencepools.inference.networking.k8s.io": inferPoolManifest, } - createCRDs(cli, crds) + createCRDs(testConfig, crds) inferExtManifestPath := inferExtManifestDefault if leaderElectionEnabled { inferExtManifestPath = inferExtManifestLeaderElection } - createInferExt(cli, inferExtManifestPath) - createClient(cli, clientManifest) - createEnvoy(cli, envoyManifest) - createMetricsRbac(cli, metricsRbacManifest) + createInferExt(testConfig, inferExtManifestPath) + createClient(testConfig, clientManifest) + createEnvoy(testConfig, envoyManifest) + createMetricsRbac(testConfig, metricsRbacManifest) // Run this step last, as it requires additional time for the model server to become ready. ginkgo.By("Creating model server resources from manifest: " + modelServerManifestPath) - createModelServer(cli, modelServerManifestArray) + createModelServer(testConfig, modelServerManifestArray) } var _ = ginkgo.AfterSuite(func() { @@ -192,77 +174,57 @@ var _ = ginkgo.AfterSuite(func() { // setupSuite initializes the test suite by setting up the Kubernetes client, // loading required API schemes, and validating configuration. 
func setupSuite() { - gomega.ExpectWithOffset(1, cfg).NotTo(gomega.BeNil()) - - err := clientgoscheme.AddToScheme(scheme) + err := clientgoscheme.AddToScheme(testConfig.Scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) - err = apiextv1.AddToScheme(scheme) + err = apiextv1.AddToScheme(testConfig.Scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) - err = infextv1a2.Install(scheme) + err = infextv1a2.Install(testConfig.Scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) - err = infextv1.Install(scheme) + err = infextv1.Install(testConfig.Scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) - cli, err = client.New(cfg, client.Options{Scheme: scheme}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - gomega.Expect(cli).NotTo(gomega.BeNil()) - - kubeCli, err = kubernetes.NewForConfig(cfg) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - gomega.Expect(kubeCli).NotTo(gomega.BeNil()) + testConfig.CreateCli() } func cleanupResources() { - if cli == nil { + if testConfig.K8sClient == nil { return // could happen if BeforeSuite had an error } - gomega.Expect(testutils.DeleteClusterResources(ctx, cli)).To(gomega.Succeed()) - gomega.Expect(testutils.DeleteNamespacedResources(ctx, cli, nsName)).To(gomega.Succeed()) + gomega.Expect(testutils.DeleteClusterResources(testConfig)).To(gomega.Succeed()) + gomega.Expect(testutils.DeleteNamespacedResources(testConfig)).To(gomega.Succeed()) } func cleanupInferModelResources() { - gomega.Expect(testutils.DeleteInferenceObjectiveResources(ctx, cli, nsName)).To(gomega.Succeed()) -} - -func getTimeout(key string, fallback time.Duration) time.Duration { - if value, ok := os.LookupEnv(key); ok { - if parsed, err := time.ParseDuration(value); err == nil { - return parsed - } - } - return fallback + gomega.Expect(testutils.DeleteInferenceObjectiveResources(testConfig)).To(gomega.Succeed()) } var ( - existsTimeout = getTimeout("EXISTS_TIMEOUT", defaultExistsTimeout) - readyTimeout = getTimeout("READY_TIMEOUT", defaultReadyTimeout) - modelReadyTimeout = getTimeout("MODEL_READY_TIMEOUT", defaultModelReadyTimeout) - curlTimeout = getTimeout("CURL_TIMEOUT", defaultCurlTimeout) - interval = defaultInterval - curlInterval = defaultCurlInterval + curlTimeout = env.GetEnvDuration("CURL_TIMEOUT", defaultCurlTimeout, ginkgo.GinkgoLogr) + curlInterval = defaultCurlInterval ) -func createNamespace(k8sClient client.Client, ns string) { - ginkgo.By("Creating e2e namespace: " + ns) +func createNamespace(testConfig *testutils.TestConfig) { + ginkgo.By("Creating e2e namespace: " + testConfig.NsName) obj := &corev1.Namespace{ ObjectMeta: v1.ObjectMeta{ - Name: ns, + Name: testConfig.NsName, }, } - err := k8sClient.Create(ctx, obj) + err := testConfig.K8sClient.Create(testConfig.Context, obj) gomega.Expect(err).NotTo(gomega.HaveOccurred(), "Failed to create e2e test namespace") } // namespaceExists ensures that a specified namespace exists and is ready for use. 
-func namespaceExists(k8sClient client.Client, ns string) { - ginkgo.By("Ensuring namespace exists: " + ns) - testutils.EventuallyExists(ctx, func() error { - return k8sClient.Get(ctx, types.NamespacedName{Name: ns}, &corev1.Namespace{}) - }, existsTimeout, interval) +func namespaceExists(testConfig *testutils.TestConfig) { + ginkgo.By("Ensuring namespace exists: " + testConfig.NsName) + testutils.EventuallyExists(testConfig, func() error { + return testConfig.K8sClient.Get(testConfig.Context, + types.NamespacedName{Name: testConfig.NsName}, &corev1.Namespace{}) + }) } // readModelServerManifestPath reads from env var the absolute filepath to model server deployment for testing. @@ -275,57 +237,39 @@ func readModelServerManifestPath() string { func getYamlsFromModelServerManifest(modelServerManifestPath string) []string { ginkgo.By("Ensuring the model server manifest points to an existing file") - modelServerManifestArray := readYaml(modelServerManifestPath) + modelServerManifestArray := testutils.ReadYaml(modelServerManifestPath) gomega.Expect(modelServerManifestArray).NotTo(gomega.BeEmpty()) return modelServerManifestArray } // createCRDs creates the Inference Extension CRDs used for testing. -func createCRDs(k8sClient client.Client, crds map[string]string) { - for name, path := range crds { +func createCRDs(testConfig *testutils.TestConfig, crds map[string]string) { + for _, path := range crds { ginkgo.By("Creating CRD resource from manifest: " + path) - applyYAMLFile(k8sClient, path) - - // Wait for the CRD to exist. - crd := &apiextv1.CustomResourceDefinition{} - testutils.EventuallyExists(ctx, func() error { - return k8sClient.Get(ctx, types.NamespacedName{Name: name}, crd) - }, existsTimeout, interval) - - // Wait for the CRD to be established. - testutils.CRDEstablished(ctx, k8sClient, crd, readyTimeout, interval) + testutils.ApplyYAMLFile(testConfig, path) } } // createClient creates the client pod used for testing from the given filePath. -func createClient(k8sClient client.Client, filePath string) { +func createClient(testConfig *testutils.TestConfig, filePath string) { ginkgo.By("Creating client resources from manifest: " + filePath) - applyYAMLFile(k8sClient, filePath) - - // Wait for the pod to exist. - pod := &corev1.Pod{} - testutils.EventuallyExists(ctx, func() error { - return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: "curl"}, pod) - }, existsTimeout, interval) - - // Wait for the pod to be ready. - testutils.PodReady(ctx, k8sClient, pod, readyTimeout, interval) + testutils.ApplyYAMLFile(testConfig, filePath) } // createMetricsRbac creates the metrics RBAC resources from the manifest file. 
-func createMetricsRbac(k8sClient client.Client, filePath string) {
-	inManifests := readYaml(filePath)
+func createMetricsRbac(testConfig *testutils.TestConfig, filePath string) {
+	inManifests := testutils.ReadYaml(filePath)
 	ginkgo.By("Replacing placeholder namespace with E2E_NS environment variable")
 	outManifests := []string{}
 	for _, m := range inManifests {
-		outManifests = append(outManifests, strings.ReplaceAll(m, "$E2E_NS", nsName))
+		outManifests = append(outManifests, strings.ReplaceAll(m, "$E2E_NS", testConfig.NsName))
 	}

 	ginkgo.By("Creating RBAC resources for scraping metrics from manifest: " + filePath)
-	createObjsFromYaml(k8sClient, outManifests)
+	testutils.CreateObjsFromYaml(testConfig, outManifests)

 	// Wait for the SA token to exist.
-	testutils.EventuallyExists(ctx, func() error {
-		token, err := getMetricsReaderToken(k8sClient)
+	testutils.EventuallyExists(testConfig, func() error {
+		token, err := getMetricsReaderToken(testConfig.K8sClient)
 		if err != nil {
 			return err
 		}
@@ -333,30 +277,21 @@ func createMetricsRbac(k8sClient client.Client, filePath string) {
 			return errors.New("failed to get metrics reader token")
 		}
 		return nil
-	}, existsTimeout, interval)
+	})
 }

 // createModelServer creates the model server resources used for testing from the given manifests.
-func createModelServer(k8sClient client.Client, modelServerManifestArray []string) {
-	createObjsFromYaml(k8sClient, modelServerManifestArray)
-
-	// Wait for the deployment to exist.
-	deploy := &appsv1.Deployment{}
-	testutils.EventuallyExists(ctx, func() error {
-		return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: modelServerName}, deploy)
-	}, existsTimeout, interval)
-
-	// Wait for the deployment to be available.
-	testutils.DeploymentAvailable(ctx, k8sClient, deploy, modelReadyTimeout, interval)
+func createModelServer(testConfig *testutils.TestConfig, modelServerManifestArray []string) {
+	testutils.CreateObjsFromYaml(testConfig, modelServerManifestArray)
 }

 // createHfSecret reads HF_TOKEN from the env var and creates a secret that contains the access token.
-func createHfSecret(k8sClient client.Client, secretPath string) {
+func createHfSecret(testConfig *testutils.TestConfig, secretPath string) {
 	ginkgo.By("Ensuring the HF_TOKEN environment variable is set")
 	token := os.Getenv("HF_TOKEN")
 	gomega.Expect(token).NotTo(gomega.BeEmpty(), "HF_TOKEN is not set")

-	inManifests := readYaml(secretPath)
+	inManifests := testutils.ReadYaml(secretPath)
 	ginkgo.By("Replacing placeholder secret data with HF_TOKEN environment variable")
 	outManifests := []string{}
 	for _, m := range inManifests {
@@ -364,152 +299,49 @@ func createHfSecret(k8sClient client.Client, secretPath string) {
 	}

 	ginkgo.By("Creating model server secret resource")
-	createObjsFromYaml(k8sClient, outManifests)
-
-	// Wait for the secret to exist before proceeding with test.
-	testutils.EventuallyExists(ctx, func() error {
-		return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: "hf-token"}, &corev1.Secret{})
-	}, existsTimeout, interval)
+	testutils.CreateObjsFromYaml(testConfig, outManifests)
 }

 // createEnvoy creates the envoy proxy resources used for testing from the given filePath.
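The metrics-RBAC and HF-secret helpers above, like the envoy helper below, template their manifests by plain string replacement of $-prefixed placeholders on the docs that ReadYaml splits out, rather than via a template engine. A tiny runnable illustration (the namespace value here is made up; the real one comes from E2E_NS):

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Two YAML docs separated the way ReadYaml splits them ("\n---").
	manifest := `apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: $E2E_NS
---
apiVersion: v1
kind: Secret
metadata:
  name: hf-token
  namespace: $E2E_NS`

	// One replacer reused across all docs, as createInferExt does below.
	replacer := strings.NewReplacer("$E2E_NS", "inf-ext-e2e") // assumed namespace value
	for _, doc := range strings.Split(manifest, "\n---") {
		fmt.Println(replacer.Replace(doc))
		fmt.Println("---")
	}
}
```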
-func createEnvoy(k8sClient client.Client, filePath string) { - inManifests := readYaml(filePath) +func createEnvoy(testConfig *testutils.TestConfig, filePath string) { + inManifests := testutils.ReadYaml(filePath) ginkgo.By("Replacing placeholder namespace with E2E_NS environment variable") outManifests := []string{} for _, m := range inManifests { - outManifests = append(outManifests, strings.ReplaceAll(m, "$E2E_NS", nsName)) + outManifests = append(outManifests, strings.ReplaceAll(m, "$E2E_NS", testConfig.NsName)) } ginkgo.By("Creating envoy proxy resources from manifest: " + filePath) - createObjsFromYaml(k8sClient, outManifests) - - // Wait for the configmap to exist before proceeding with test. - cfgMap := &corev1.ConfigMap{} - testutils.EventuallyExists(ctx, func() error { - return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: envoyName}, cfgMap) - }, existsTimeout, interval) - - // Wait for the deployment to exist. - deploy := &appsv1.Deployment{} - testutils.EventuallyExists(ctx, func() error { - return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: envoyName}, deploy) - }, existsTimeout, interval) - - // Wait for the deployment to be available. - testutils.DeploymentAvailable(ctx, k8sClient, deploy, readyTimeout, interval) - - // Wait for the service to exist. - testutils.EventuallyExists(ctx, func() error { - return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: envoyName}, &corev1.Service{}) - }, existsTimeout, interval) + testutils.CreateObjsFromYaml(testConfig, outManifests) } // createInferExt creates the inference extension resources used for testing from the given filePath. -func createInferExt(k8sClient client.Client, filePath string) { - inManifests := readYaml(filePath) +func createInferExt(testConfig *testutils.TestConfig, filePath string) { + inManifests := testutils.ReadYaml(filePath) ginkgo.By("Replacing placeholders with environment variables") outManifests := []string{} + replacer := strings.NewReplacer( + "$E2E_NS", testConfig.NsName, + "$E2E_IMAGE", e2eImage, + ) for _, manifest := range inManifests { - replacer := strings.NewReplacer( - "$E2E_NS", nsName, - "$E2E_IMAGE", e2eImage, - ) outManifests = append(outManifests, replacer.Replace(manifest)) } ginkgo.By("Creating inference extension resources from manifest: " + filePath) - createObjsFromYaml(k8sClient, outManifests) - - // Wait for the serviceaccount to exist. - testutils.EventuallyExists(ctx, func() error { - return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: inferExtName}, &corev1.ServiceAccount{}) - }, existsTimeout, interval) - - // Wait for the role to exist. - testutils.EventuallyExists(ctx, func() error { - return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: "pod-read"}, &rbacv1.Role{}) - }, existsTimeout, interval) - - // Wait for the rolebinding to exist. - testutils.EventuallyExists(ctx, func() error { - return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: "pod-read-binding"}, &rbacv1.RoleBinding{}) - }, existsTimeout, interval) - - // Wait for the clusterrole to exist. - testutils.EventuallyExists(ctx, func() error { - return k8sClient.Get(ctx, types.NamespacedName{Name: "auth-reviewer"}, &rbacv1.ClusterRole{}) - }, existsTimeout, interval) - - // Wait for the clusterrolebinding to exist. 
- testutils.EventuallyExists(ctx, func() error { - return k8sClient.Get(ctx, types.NamespacedName{Name: "auth-reviewer-binding"}, &rbacv1.ClusterRoleBinding{}) - }, existsTimeout, interval) + testutils.CreateObjsFromYaml(testConfig, outManifests) // Wait for the deployment to exist. - deploy := &appsv1.Deployment{} - testutils.EventuallyExists(ctx, func() error { - return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: inferExtName}, deploy) - }, existsTimeout, interval) - + deploy := &appsv1.Deployment{ + ObjectMeta: v1.ObjectMeta{ + Name: inferExtName, + Namespace: testConfig.NsName, + }, + } if leaderElectionEnabled { // With leader election enabled, only 1 replica will be "Ready" at any given time (the leader). - testutils.DeploymentReadyReplicas(ctx, k8sClient, deploy, 1, modelReadyTimeout, interval) + testutils.DeploymentReadyReplicas(testConfig, deploy, 1) } else { - testutils.DeploymentAvailable(ctx, k8sClient, deploy, modelReadyTimeout, interval) - } - - // Wait for the service to exist. - testutils.EventuallyExists(ctx, func() error { - return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: inferExtName}, &corev1.Service{}) - }, existsTimeout, interval) -} - -// applyYAMLFile reads a file containing YAML (possibly multiple docs) -// and applies each object to the cluster. -func applyYAMLFile(k8sClient client.Client, filePath string) { - // Create the resources from the manifest file - createObjsFromYaml(k8sClient, readYaml(filePath)) -} - -func readYaml(filePath string) []string { - ginkgo.By("Reading YAML file: " + filePath) - yamlBytes, err := os.ReadFile(filePath) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - // Split multiple docs, if needed - return strings.Split(string(yamlBytes), "\n---") -} - -func createObjsFromYaml(k8sClient client.Client, docs []string) { - // For each doc, decode and create - decoder := serializer.NewCodecFactory(scheme).UniversalDeserializer() - for _, doc := range docs { - trimmed := strings.TrimSpace(doc) - if trimmed == "" { - continue - } - // Decode into a runtime.Object - obj, gvk, decodeErr := decoder.Decode([]byte(trimmed), nil, nil) - gomega.Expect(decodeErr).NotTo(gomega.HaveOccurred(), - "Failed to decode YAML document to a Kubernetes object") - - ginkgo.By(fmt.Sprintf("Decoded GVK: %s", gvk)) - - unstrObj, ok := obj.(*unstructured.Unstructured) - if !ok { - // Fallback if it's a typed object - unstrObj = &unstructured.Unstructured{} - // Convert typed to unstructured - err := scheme.Convert(obj, unstrObj, nil) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - - unstrObj.SetNamespace(nsName) - - // Create the object - err := k8sClient.Create(ctx, unstrObj) - gomega.Expect(err).NotTo(gomega.HaveOccurred(), - "Failed to create object from YAML") + testutils.DeploymentAvailable(testConfig, deploy) } } diff --git a/test/e2e/epp/e2e_test.go b/test/e2e/epp/e2e_test.go index e01240d8c..c3ff49e58 100644 --- a/test/e2e/epp/e2e_test.go +++ b/test/e2e/epp/e2e_test.go @@ -43,23 +43,23 @@ var _ = ginkgo.Describe("InferencePool", func() { var infObjective *v1alpha2.InferenceObjective ginkgo.BeforeEach(func() { ginkgo.By("Waiting for the namespace to exist.") - namespaceExists(cli, nsName) + namespaceExists(testConfig) ginkgo.By("Creating an InferenceObjective resource") - infObjective = newInferenceObjective(nsName) - gomega.Expect(cli.Create(ctx, infObjective)).To(gomega.Succeed()) + infObjective = newInferenceObjective(testConfig.NsName) + gomega.Expect(testConfig.K8sClient.Create(testConfig.Context, 
infObjective)).To(gomega.Succeed()) ginkgo.By("Ensuring the InferenceObjective resource exists in the namespace") gomega.Eventually(func() error { - return cli.Get(ctx, types.NamespacedName{Namespace: infObjective.Namespace, Name: infObjective.Name}, infObjective) - }, existsTimeout, interval).Should(gomega.Succeed()) + return testConfig.K8sClient.Get(testConfig.Context, types.NamespacedName{Namespace: infObjective.Namespace, Name: infObjective.Name}, infObjective) + }, testConfig.ExistsTimeout, testConfig.Interval).Should(gomega.Succeed()) }) ginkgo.AfterEach(func() { ginkgo.By("Deleting the InferenceObjective test resource.") cleanupInferModelResources() gomega.Eventually(func() error { - err := cli.Get(ctx, types.NamespacedName{Namespace: infObjective.Namespace, Name: infObjective.Name}, infObjective) + err := testConfig.K8sClient.Get(testConfig.Context, types.NamespacedName{Namespace: infObjective.Namespace, Name: infObjective.Name}, infObjective) if err == nil { return errors.New("InferenceObjective resource still exists") } @@ -67,7 +67,7 @@ var _ = ginkgo.Describe("InferencePool", func() { return nil } return nil - }, existsTimeout, interval).Should(gomega.Succeed()) + }, testConfig.ExistsTimeout, testConfig.Interval).Should(gomega.Succeed()) }) ginkgo.When("The Inference Extension is running", func() { @@ -89,7 +89,7 @@ var _ = ginkgo.Describe("InferencePool", func() { ginkgo.By("Verifying that exactly one EPP pod is ready") gomega.Eventually(func(g gomega.Gomega) { podList := &corev1.PodList{} - err := cli.List(ctx, podList, client.InNamespace(nsName), client.MatchingLabels{"app": inferExtName}) + err := testConfig.K8sClient.List(testConfig.Context, podList, client.InNamespace(testConfig.NsName), client.MatchingLabels{"app": inferExtName}) g.Expect(err).NotTo(gomega.HaveOccurred()) // The deployment should have 3 replicas for leader election. @@ -104,7 +104,7 @@ var _ = ginkgo.Describe("InferencePool", func() { } } g.Expect(readyPods).To(gomega.Equal(1), "Expected exactly one pod to be ready") - }, readyTimeout, interval).Should(gomega.Succeed()) + }, testConfig.ReadyTimeout, testConfig.Interval).Should(gomega.Succeed()) }) ginkgo.It("Should successfully failover and serve traffic after the leader pod is deleted", func() { @@ -121,26 +121,26 @@ var _ = ginkgo.Describe("InferencePool", func() { ginkgo.By("Found initial leader pod: " + oldLeaderPod.Name) ginkgo.By(fmt.Sprintf("Deleting leader pod %s to trigger failover", oldLeaderPod.Name)) - gomega.Expect(cli.Delete(ctx, oldLeaderPod)).To(gomega.Succeed()) + gomega.Expect(testConfig.K8sClient.Delete(testConfig.Context, oldLeaderPod)).To(gomega.Succeed()) ginkgo.By("STEP 3: Waiting for a new leader to be elected") // The deployment controller will create a new pod. We need to wait for the total number of pods // to be back to 3, and for one of the other pods to become the new leader. deploy := &appsv1.Deployment{} gomega.Eventually(func() error { - return cli.Get(ctx, types.NamespacedName{Namespace: nsName, Name: inferExtName}, deploy) - }, existsTimeout, interval).Should(gomega.Succeed()) + return testConfig.K8sClient.Get(testConfig.Context, types.NamespacedName{Namespace: testConfig.NsName, Name: inferExtName}, deploy) + }, testConfig.ExistsTimeout, testConfig.Interval).Should(gomega.Succeed()) // Wait for one replica to become ready again. 
- testutils.DeploymentReadyReplicas(ctx, cli, deploy, 1, readyTimeout, interval) + testutils.DeploymentReadyReplicas(testConfig, deploy, 1) // Also wait for the total number of replicas to be back to 3. gomega.Eventually(func(g gomega.Gomega) { d := &appsv1.Deployment{} - err := cli.Get(ctx, types.NamespacedName{Namespace: nsName, Name: inferExtName}, d) + err := testConfig.K8sClient.Get(testConfig.Context, types.NamespacedName{Namespace: testConfig.NsName, Name: inferExtName}, d) g.Expect(err).NotTo(gomega.HaveOccurred()) g.Expect(d.Status.Replicas).To(gomega.Equal(int32(3)), "Deployment should have 3 replicas") - }, readyTimeout, interval).Should(gomega.Succeed()) + }, testConfig.ReadyTimeout, testConfig.Interval).Should(gomega.Succeed()) ginkgo.By("STEP 4: Verifying a new, different leader is elected") var newLeaderPod *corev1.Pod @@ -152,7 +152,7 @@ var _ = ginkgo.Describe("InferencePool", func() { // This guards against a race condition where we might find the old leader // before its status is updated to NotReady. g.Expect(newLeaderPod.Name).NotTo(gomega.Equal(oldLeaderPod.Name), "The new leader should not be the same as the old deleted leader") - }, readyTimeout, interval).Should(gomega.Succeed()) + }, testConfig.ReadyTimeout, testConfig.Interval).Should(gomega.Succeed()) ginkgo.By("Found new leader pod: " + newLeaderPod.Name) ginkgo.By("STEP 5: Verifying the new leader is working correctly after failover") @@ -207,11 +207,11 @@ func verifyTrafficRouting() { // Ensure the expected responses include the InferenceObjective target model names. var expected []string expected = append(expected, targetModelName) - curlCmd := getCurlCommand(envoyName, nsName, envoyPort, modelName, curlTimeout, t.api, t.promptOrMessages, false) + curlCmd := getCurlCommand(envoyName, testConfig.NsName, envoyPort, modelName, curlTimeout, t.api, t.promptOrMessages, false) actual := make(map[string]int) gomega.Eventually(func() error { - resp, err := testutils.ExecCommandInPod(ctx, cfg, scheme, kubeCli, nsName, "curl", "curl", curlCmd) + resp, err := testutils.ExecCommandInPod(testConfig, "curl", "curl", curlCmd) if err != nil { return err } @@ -232,7 +232,7 @@ func verifyTrafficRouting() { return fmt.Errorf("actual (%v) != expected (%v); resp=%q", got, expected, resp) } return nil - }, readyTimeout, curlInterval).Should(gomega.Succeed()) + }, testConfig.ReadyTimeout, curlInterval).Should(gomega.Succeed()) } } @@ -241,37 +241,37 @@ func verifyMetrics() { ginkgo.By("Verifying metrics exposure") // Define the metrics we expect to see expectedMetrics := []string{ - "inference_model_request_total", - "inference_model_request_error_total", - "inference_model_request_duration_seconds", + "inference_objective_request_total", + "inference_objective_request_error_total", + "inference_objective_request_duration_seconds", // TODO: normalized_time_per_output_token_seconds is not actually recorded yet // "normalized_time_per_output_token_seconds", - "inference_model_request_sizes", - "inference_model_response_sizes", - "inference_model_input_tokens", - "inference_model_output_tokens", + "inference_objective_request_sizes", + "inference_objective_response_sizes", + "inference_objective_input_tokens", + "inference_objective_output_tokens", "inference_pool_average_kv_cache_utilization", "inference_pool_average_queue_size", "inference_pool_per_pod_queue_size", - "inference_model_running_requests", + "inference_objective_running_requests", "inference_pool_ready_pods", "inference_extension_info", } // Generate traffic by sending 
requests through the inference extension ginkgo.By("Generating traffic through the inference extension") - curlCmd := getCurlCommand(envoyName, nsName, envoyPort, modelName, curlTimeout, "/completions", "Write as if you were a critic: San Francisco", true) + curlCmd := getCurlCommand(envoyName, testConfig.NsName, envoyPort, modelName, curlTimeout, "/completions", "Write as if you were a critic: San Francisco", true) // Run the curl command multiple times to generate some metrics data for i := 0; i < 5; i++ { - _, err := testutils.ExecCommandInPod(ctx, cfg, scheme, kubeCli, nsName, "curl", "curl", curlCmd) + _, err := testutils.ExecCommandInPod(testConfig, "curl", "curl", curlCmd) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } // modify the curl command to generate some error metrics curlCmd[len(curlCmd)-1] = "invalid input" for i := 0; i < 5; i++ { - _, err := testutils.ExecCommandInPod(ctx, cfg, scheme, kubeCli, nsName, "curl", "curl", curlCmd) + _, err := testutils.ExecCommandInPod(testConfig, "curl", "curl", curlCmd) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } @@ -282,11 +282,11 @@ func verifyMetrics() { // Get the authorization token for reading metrics token := "" gomega.Eventually(func(g gomega.Gomega) { - t, err := getMetricsReaderToken(cli) + t, err := getMetricsReaderToken(testConfig.K8sClient) g.Expect(err).NotTo(gomega.HaveOccurred()) g.Expect(t).NotTo(gomega.BeEmpty()) token = t - }, existsTimeout, interval).Should(gomega.Succeed()) + }, testConfig.ExistsTimeout, testConfig.Interval).Should(gomega.Succeed()) // Construct the metric scraping curl command using Pod IP metricScrapeCmd := getMetricsScrapeCommand(podIP, token) @@ -294,7 +294,7 @@ func verifyMetrics() { ginkgo.By("Verifying that all expected metrics are present.") gomega.Eventually(func() error { // Execute the metrics scrape command inside the curl pod - resp, err := testutils.ExecCommandInPod(ctx, cfg, scheme, kubeCli, nsName, "curl", "curl", metricScrapeCmd) + resp, err := testutils.ExecCommandInPod(testConfig, "curl", "curl", metricScrapeCmd) if err != nil { return err } @@ -309,12 +309,12 @@ func verifyMetrics() { } } return nil - }, readyTimeout, curlInterval).Should(gomega.Succeed()) + }, testConfig.ReadyTimeout, curlInterval).Should(gomega.Succeed()) } func getMetricsReaderToken(k8sClient client.Client) (string, error) { secret := &corev1.Secret{} - err := k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: metricsReaderSecretName}, secret) + err := k8sClient.Get(testConfig.Context, types.NamespacedName{Namespace: testConfig.NsName, Name: metricsReaderSecretName}, secret) if err != nil { return "", err } @@ -327,7 +327,7 @@ func findReadyPod() *corev1.Pod { var readyPod *corev1.Pod gomega.Eventually(func(g gomega.Gomega) { podList := &corev1.PodList{} - err := cli.List(ctx, podList, client.InNamespace(nsName), client.MatchingLabels{"app": inferExtName}) + err := testConfig.K8sClient.List(testConfig.Context, podList, client.InNamespace(testConfig.NsName), client.MatchingLabels{"app": inferExtName}) g.Expect(err).NotTo(gomega.HaveOccurred()) foundReadyPod := false @@ -346,7 +346,7 @@ func findReadyPod() *corev1.Pod { } } g.Expect(foundReadyPod).To(gomega.BeTrue(), "No ready EPP pod found") - }, readyTimeout, interval).Should(gomega.Succeed()) + }, testConfig.ReadyTimeout, testConfig.Interval).Should(gomega.Succeed()) return readyPod } diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go index a215adcf5..f543c2845 100644 --- 
a/test/integration/epp/hermetic_test.go +++ b/test/integration/epp/hermetic_test.go @@ -133,11 +133,10 @@ func labelsToString(labels []label) string { func inferenceObjectiveRequestTotal(labels []label) string { return fmt.Sprintf(` - # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. - # TYPE inference_model_request_total counter - inference_model_request_total{%s} 1 - `, labelsToString(labels), - ) + # HELP inference_objective_request_total [ALPHA] Counter of inference objective requests broken out for each model and target model. + # TYPE inference_objective_request_total counter + inference_objective_request_total{%s} 1 + `, labelsToString(labels)) } func inferencePoolReadyPods(v int, labels []label) string { @@ -145,8 +144,7 @@ func inferencePoolReadyPods(v int, labels []label) string { # HELP inference_pool_ready_pods [ALPHA] The number of ready pods in the inference server pool. # TYPE inference_pool_ready_pods gauge inference_pool_ready_pods{%s} %d - `, labelsToString(labels), v, - ) + `, labelsToString(labels), v) } func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) { @@ -170,7 +168,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) { podState{index: 2, queueSize: 10, kvCacheUsage: 0.2}, ), wantMetrics: map[string]string{ - "inference_model_request_total": inferenceObjectiveRequestTotal([]label{ + "inference_objective_request_total": inferenceObjectiveRequestTotal([]label{ {"model_name", modelMyModel}, {"target_model_name", modelMyModelTarget}, }), @@ -242,7 +240,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) { ), wantMetrics: map[string]string{ - "inference_model_request_total": inferenceObjectiveRequestTotal([]label{ + "inference_objective_request_total": inferenceObjectiveRequestTotal([]label{ {"model_name", modelSQLLora}, {"target_model_name", modelSQLLoraTarget}, }), @@ -277,7 +275,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) { podState{index: 2, queueSize: 10, kvCacheUsage: 0.3, activeModels: []string{"foo"}}, ), wantMetrics: map[string]string{ - "inference_model_request_total": inferenceObjectiveRequestTotal([]label{ + "inference_objective_request_total": inferenceObjectiveRequestTotal([]label{ {"model_name", modelSQLLora}, {"target_model_name", modelSQLLoraTarget}, }), @@ -312,7 +310,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) { podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo"}}, ), wantMetrics: map[string]string{ - "inference_model_request_total": inferenceObjectiveRequestTotal([]label{ + "inference_objective_request_total": inferenceObjectiveRequestTotal([]label{ {"model_name", modelSQLLora}, {"target_model_name", modelSQLLoraTarget}, }), @@ -363,8 +361,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) { }, }, }, - }, - { + }, { Request: &extProcPb.ProcessingRequest_RequestBody{ RequestBody: &extProcPb.HttpBody{Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lo"), EndOfStream: false}, }, @@ -382,7 +379,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) { podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo", modelSheddableTarget}}, ), wantMetrics: map[string]string{ - "inference_model_request_total": inferenceObjectiveRequestTotal([]label{ + "inference_objective_request_total": inferenceObjectiveRequestTotal([]label{ {"model_name", 
modelSheddable}, {"target_model_name", modelSheddableTarget}, }), @@ -456,7 +453,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) { podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo", modelSheddableTarget}}, ), wantMetrics: map[string]string{ - "inference_model_request_total": inferenceObjectiveRequestTotal([]label{ + "inference_objective_request_total": inferenceObjectiveRequestTotal([]label{ {"model_name", modelDirect}, {"target_model_name", modelDirect}, }), @@ -705,32 +702,31 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) { }, }, }, - wantErr: false, - wantMetrics: map[string]string{`inference_model_input_tokens`: ` - # HELP inference_model_input_tokens [ALPHA] Inference model input token count distribution for requests in each model. - # TYPE inference_model_input_tokens histogram - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="1"} 0 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="8"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="16"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="32"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="64"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="128"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="256"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="512"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="1024"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="2048"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="4096"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="8192"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="16384"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="32778"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="65536"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="131072"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="262144"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="524288"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="1.048576e+06"} 1 - inference_model_input_tokens_bucket{model_name="",target_model_name="",le="+Inf"} 1 - inference_model_input_tokens_sum{model_name="",target_model_name=""} 7 - inference_model_input_tokens_count{model_name="",target_model_name=""} 1 + wantMetrics: map[string]string{`inference_objective_input_tokens`: ` + # HELP inference_objective_input_tokens [ALPHA] Inference objective input token count distribution for requests in each model. 
+ # TYPE inference_objective_input_tokens histogram + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="1"} 0 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="8"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="16"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="32"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="64"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="128"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="256"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="512"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="1024"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="2048"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="4096"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="8192"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="16384"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="32778"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="65536"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="131072"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="262144"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="524288"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="1.048576e+06"} 1 + inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="+Inf"} 1 + inference_objective_input_tokens_sum{model_name="",target_model_name=""} 7 + inference_objective_input_tokens_count{model_name="",target_model_name=""} 1 `}, wantResponses: []*extProcPb.ProcessingResponse{ integrationutils.NewResponseHeaders( @@ -808,7 +804,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) { ), wantMetrics: map[string]string{ - "inference_model_request_total": inferenceObjectiveRequestTotal([]label{ + "inference_objective_request_total": inferenceObjectiveRequestTotal([]label{ {"model_name", modelSQLLora}, {"target_model_name", modelSQLLoraTarget}, }), @@ -847,7 +843,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) { ), wantMetrics: map[string]string{ - "inference_model_request_total": inferenceObjectiveRequestTotal([]label{ + "inference_objective_request_total": inferenceObjectiveRequestTotal([]label{ {"model_name", modelSQLLora}, {"target_model_name", modelSQLLoraTarget}, }), @@ -974,7 +970,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) { responses, err := integrationutils.StreamedRequest(t, client, test.requests, len(test.wantResponses)) if err != nil && !test.wantErr { - t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) + t.Errorf("In test %s, unexpected error, got: %v, want error: %v", test.name, err, test.wantErr) } if diff := cmp.Diff(test.wantResponses, responses, protocmp.Transform(), @@ -982,13 +978,13 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) { return a.GetHeader().GetKey() < b.GetHeader().GetKey() }), ); diff != "" { - t.Errorf("Unexpected response, (-want +got): %v", diff) + t.Errorf("In test %s, 
unexpected response, (-want +got): %v", test.name, diff) } if len(test.wantMetrics) != 0 { for metricName, value := range test.wantMetrics { if err := metricsutils.GatherAndCompare(crmetrics.Registry, strings.NewReader(value), metricName); err != nil { - t.Error(err) + t.Error(fmt.Errorf("In test %s, %v", test.name, err)) } } } @@ -1013,11 +1009,11 @@ func setUpHermeticServer(t *testing.T, podAndMetrics map[*backend.Pod]*backendme } for pod := range podAndMetrics { - pod := epptestutil.MakePod(pod.NamespacedName.Name). + pod := epptestutil.MakePod(pod.PodName). Namespace(pod.NamespacedName.Namespace). ReadyCondition(). Labels(podLabels). - IP(pod.Address). + IP(pod.GetIPAddress()). Complete(). ObjRef() @@ -1063,7 +1059,7 @@ func setUpHermeticServer(t *testing.T, podAndMetrics map[*backend.Pod]*backendme // clear created pods for pod := range podAndMetrics { - pod := epptestutil.MakePod(pod.NamespacedName.Name). + pod := epptestutil.MakePod(pod.PodName). Namespace(pod.NamespacedName.Namespace).Complete().ObjRef() if err := k8sClient.Delete(context.Background(), pod); err != nil { @@ -1075,8 +1071,9 @@ func setUpHermeticServer(t *testing.T, podAndMetrics map[*backend.Pod]*backendme func fakePod(index int) *backend.Pod { return &backend.Pod{ - NamespacedName: types.NamespacedName{Name: fmt.Sprintf("pod-%v", index), Namespace: testNamespace}, + NamespacedName: types.NamespacedName{Name: fmt.Sprintf("pod-%v-rank-0", index), Namespace: testNamespace}, Address: fmt.Sprintf("192.168.1.%d", index+1), + PodName: fmt.Sprintf("pod-%v", index), Labels: make(map[string]string, 0), } } @@ -1157,7 +1154,7 @@ func BeforeSuite() func() { NamespacedName: types.NamespacedName{Namespace: testNamespace, Name: testPoolName}, GroupKind: schema.GroupKind{Group: v1.GroupVersion.Group, Kind: "InferencePool"}, } - serverRunner.Datastore = datastore.NewDatastore(context.Background(), pmf) + serverRunner.Datastore = datastore.NewDatastore(context.Background(), pmf, 0) kvCacheUtilizationScorer := scorer.NewKVCacheUtilizationScorer() queueingScorer := scorer.NewQueueScorer() @@ -1184,7 +1181,8 @@ func BeforeSuite() func() { } detector := saturationdetector.NewDetector(sdConfig, logger.WithName("saturation-detector")) serverRunner.SaturationDetector = detector - serverRunner.Director = requestcontrol.NewDirectorWithConfig(serverRunner.Datastore, scheduler, detector, requestcontrol.NewConfig()) + admissionController := requestcontrol.NewLegacyAdmissionController(detector) + serverRunner.Director = requestcontrol.NewDirectorWithConfig(serverRunner.Datastore, scheduler, admissionController, requestcontrol.NewConfig()) serverRunner.SecureServing = false if err := serverRunner.SetupWithManager(context.Background(), mgr); err != nil { diff --git a/test/integration/util.go b/test/integration/util.go index d305c9f8f..45647acc3 100644 --- a/test/integration/util.go +++ b/test/integration/util.go @@ -55,7 +55,13 @@ func SendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, return res, err } -func StreamedRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, requests []*extProcPb.ProcessingRequest, expectedResponses int) ([]*extProcPb.ProcessingResponse, error) { +// StreamedRequest sends a series of requests and collects the specified number of responses. 
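The rewritten body below replaces the old approach, where a background sleep flipped an unsynchronized boolean that was only checked between receives (so a blocked Recv could still hang forever), with a per-response receive guarded by select. The core pattern in isolation, as a runnable sketch:

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// recvWithTimeout wraps a blocking receive in a goroutine so the caller can
// bail out after d instead of hanging -- the same shape the new
// StreamedRequest below uses around client.Recv().
func recvWithTimeout[T any](recv func() (T, error), d time.Duration) (T, error) {
	type result struct {
		val T
		err error
	}
	ch := make(chan result, 1) // buffered, so a late send never blocks the goroutine
	go func() {
		v, err := recv()
		ch <- result{v, err}
	}()
	select {
	case <-time.After(d):
		var zero T
		return zero, errors.New("timed out waiting for response")
	case r := <-ch:
		return r.val, r.err
	}
}

func main() {
	fast := func() (string, error) { return "ok", nil }
	slow := func() (string, error) { time.Sleep(2 * time.Second); return "late", nil }
	fmt.Println(recvWithTimeout(fast, time.Second))           // ok <nil>
	fmt.Println(recvWithTimeout(slow, 100*time.Millisecond))  // "" timed out ...
}
```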
+func StreamedRequest( + t *testing.T, + client extProcPb.ExternalProcessor_ProcessClient, + requests []*extProcPb.ProcessingRequest, + expectedResponses int, +) ([]*extProcPb.ProcessingResponse, error) { for _, req := range requests { t.Logf("Sending request: %v", req) if err := client.Send(req); err != nil { @@ -63,27 +69,35 @@ func StreamedRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessCli return nil, err } } + responses := []*extProcPb.ProcessingResponse{} + for i := range expectedResponses { + type recvResult struct { + res *extProcPb.ProcessingResponse + err error + } + recvChan := make(chan recvResult, 1) - // Make an incredible simple timeout func in the case where - // there is less than the expected amount of responses; bail and fail. - var simpleTimeout bool - go func() { - time.Sleep(10 * time.Second) - simpleTimeout = true - }() + go func() { + res, err := client.Recv() + recvChan <- recvResult{res, err} + }() - for range expectedResponses { - if simpleTimeout { - break - } - res, err := client.Recv() - if err != nil && err != io.EOF { - t.Logf("Failed to receive: %v", err) - return nil, err + select { + case <-time.After(10 * time.Second): + t.Logf("Timeout waiting for response %d of %d", i+1, expectedResponses) + return responses, nil + case result := <-recvChan: + if result.err != nil { + if result.err == io.EOF { + return responses, nil + } + t.Logf("Failed to receive: %v", result.err) + return nil, result.err + } + t.Logf("Received response %+v", result.res) + responses = append(responses, result.res) } - t.Logf("Received response %+v", res) - responses = append(responses, res) } return responses, nil } diff --git a/test/testdata/configloader_1_test.yaml b/test/testdata/configloader_1_test.yaml index f1f167efb..db75a4265 100644 --- a/test/testdata/configloader_1_test.yaml +++ b/test/testdata/configloader_1_test.yaml @@ -9,7 +9,7 @@ plugins: type: test-profile-handler - type: test-two parameters: - hashBlockSize: 32 + blockSize: 32 - name: testPicker type: test-picker diff --git a/test/testdata/inferencepool-e2e.yaml b/test/testdata/inferencepool-e2e.yaml index b8d7fb697..77d454e6f 100644 --- a/test/testdata/inferencepool-e2e.yaml +++ b/test/testdata/inferencepool-e2e.yaml @@ -37,6 +37,87 @@ metadata: name: vllm-llama3-8b-instruct-epp namespace: $E2E_NS --- +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pod-read + namespace: $E2E_NS +rules: +- apiGroups: [ "inference.networking.x-k8s.io" ] + resources: [ "inferenceobjectives", "inferencepools" ] + verbs: [ "get", "watch", "list" ] +- apiGroups: [ "inference.networking.k8s.io" ] + resources: [ "inferencepools" ] + verbs: [ "get", "watch", "list" ] +- apiGroups: [ "" ] + resources: [ "pods" ] + verbs: [ "get", "watch", "list" ] +--- +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pod-read-binding + namespace: $E2E_NS +subjects: +- kind: ServiceAccount + name: vllm-llama3-8b-instruct-epp + namespace: $E2E_NS +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: pod-read +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: auth-reviewer +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: auth-reviewer-binding +subjects: +- kind: ServiceAccount + name: 
vllm-llama3-8b-instruct-epp + namespace: $E2E_NS +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: auth-reviewer +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: plugins-config + namespace: $E2E_NS +data: + default-plugins.yaml: | + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: queue-scorer + - type: kv-cache-utilization-scorer + - type: prefix-cache-scorer + schedulingProfiles: + - name: default + plugins: + - pluginRef: queue-scorer + - pluginRef: kv-cache-utilization-scorer + - pluginRef: prefix-cache-scorer +--- apiVersion: apps/v1 kind: Deployment metadata: @@ -100,84 +181,3 @@ spec: - name: plugins-config-volume configMap: name: plugins-config ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: plugins-config - namespace: $E2E_NS -data: - default-plugins.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: queue-scorer - - type: kv-cache-utilization-scorer - - type: prefix-cache-scorer - schedulingProfiles: - - name: default - plugins: - - pluginRef: queue-scorer - - pluginRef: kv-cache-utilization-scorer - - pluginRef: prefix-cache-scorer ---- -kind: Role -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: pod-read - namespace: $E2E_NS -rules: -- apiGroups: [ "inference.networking.x-k8s.io" ] - resources: [ "inferenceobjectives", "inferencepools" ] - verbs: [ "get", "watch", "list" ] -- apiGroups: [ "inference.networking.k8s.io" ] - resources: [ "inferencepools" ] - verbs: [ "get", "watch", "list" ] -- apiGroups: [ "" ] - resources: [ "pods" ] - verbs: [ "get", "watch", "list" ] ---- -kind: RoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: pod-read-binding - namespace: $E2E_NS -subjects: -- kind: ServiceAccount - name: vllm-llama3-8b-instruct-epp - namespace: $E2E_NS -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: pod-read ---- -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: auth-reviewer -rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create ---- -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: auth-reviewer-binding -subjects: -- kind: ServiceAccount - name: vllm-llama3-8b-instruct-epp - namespace: $E2E_NS -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: auth-reviewer diff --git a/test/testdata/inferencepool-leader-election-e2e.yaml b/test/testdata/inferencepool-leader-election-e2e.yaml index 9ba5dcb4a..976fbbd02 100644 --- a/test/testdata/inferencepool-leader-election-e2e.yaml +++ b/test/testdata/inferencepool-leader-election-e2e.yaml @@ -35,91 +35,6 @@ metadata: name: vllm-llama3-8b-instruct-epp namespace: $E2E_NS --- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vllm-llama3-8b-instruct-epp - namespace: $E2E_NS - labels: - app: vllm-llama3-8b-instruct-epp -spec: - replicas: 3 - selector: - matchLabels: - app: vllm-llama3-8b-instruct-epp - template: - metadata: - labels: - app: vllm-llama3-8b-instruct-epp - spec: - serviceAccountName: vllm-llama3-8b-instruct-epp - # Conservatively, this timeout should mirror the longest grace period of the pods within the pool - terminationGracePeriodSeconds: 130 - containers: - - name: epp - image: $E2E_IMAGE - imagePullPolicy: IfNotPresent - args: - - --pool-name - - "vllm-llama3-8b-instruct" - - 
--pool-namespace - - "$E2E_NS" - - --v - - "4" - - --zap-encoder - - "json" - - --grpc-port - - "9002" - - --grpc-health-port - - "9003" - - --ha-enable-leader-election - - "--config-file" - - "/config/default-plugins.yaml" - ports: - - containerPort: 9002 - - containerPort: 9003 - - name: metrics - containerPort: 9090 - livenessProbe: - grpc: - port: 9003 - service: liveness - initialDelaySeconds: 5 - periodSeconds: 10 - readinessProbe: - grpc: - port: 9003 - service: readiness - initialDelaySeconds: 5 - periodSeconds: 10 - volumeMounts: - - name: plugins-config-volume - mountPath: "/config" - volumes: - - name: plugins-config-volume - configMap: - name: plugins-config ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: plugins-config - namespace: $E2E_NS -data: - default-plugins.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: queue-scorer - - type: kv-cache-utilization-scorer - - type: prefix-cache-scorer - schedulingProfiles: - - name: default - plugins: - - pluginRef: queue-scorer - - pluginRef: kv-cache-utilization-scorer - - pluginRef: prefix-cache-scorer ---- kind: Role apiVersion: rbac.authorization.k8s.io/v1 metadata: @@ -207,3 +122,88 @@ roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: auth-reviewer +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: plugins-config + namespace: $E2E_NS +data: + default-plugins.yaml: | + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: queue-scorer + - type: kv-cache-utilization-scorer + - type: prefix-cache-scorer + schedulingProfiles: + - name: default + plugins: + - pluginRef: queue-scorer + - pluginRef: kv-cache-utilization-scorer + - pluginRef: prefix-cache-scorer +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-llama3-8b-instruct-epp + namespace: $E2E_NS + labels: + app: vllm-llama3-8b-instruct-epp +spec: + replicas: 3 + selector: + matchLabels: + app: vllm-llama3-8b-instruct-epp + template: + metadata: + labels: + app: vllm-llama3-8b-instruct-epp + spec: + serviceAccountName: vllm-llama3-8b-instruct-epp + # Conservatively, this timeout should mirror the longest grace period of the pods within the pool + terminationGracePeriodSeconds: 130 + containers: + - name: epp + image: $E2E_IMAGE + imagePullPolicy: IfNotPresent + args: + - --pool-name + - "vllm-llama3-8b-instruct" + - --pool-namespace + - "$E2E_NS" + - --v + - "4" + - --zap-encoder + - "json" + - --grpc-port + - "9002" + - --grpc-health-port + - "9003" + - --ha-enable-leader-election + - "--config-file" + - "/config/default-plugins.yaml" + ports: + - containerPort: 9002 + - containerPort: 9003 + - name: metrics + containerPort: 9090 + livenessProbe: + grpc: + port: 9003 + service: liveness + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + grpc: + port: 9003 + service: readiness + initialDelaySeconds: 5 + periodSeconds: 10 + volumeMounts: + - name: plugins-config-volume + mountPath: "/config" + volumes: + - name: plugins-config-volume + configMap: + name: plugins-config diff --git a/test/utils/handle.go b/test/utils/handle.go index 4a29dda87..273539f81 100644 --- a/test/utils/handle.go +++ b/test/utils/handle.go @@ -19,6 +19,7 @@ package utils import ( "context" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" ) @@ -33,6 +34,10 @@ func (h *testHandle) Context() context.Context { return h.ctx } +func (h 
*testHandle) PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics { + return []backendmetrics.PodMetrics{} +} + type testHandlePlugins struct { plugins map[string]plugins.Plugin } diff --git a/test/utils/server.go b/test/utils/server.go index 51eb33fa0..9cf907d29 100644 --- a/test/utils/server.go +++ b/test/utils/server.go @@ -50,7 +50,7 @@ func PrepareForTestStreamingServer(objectives []*v1alpha2.InferenceObjective, po pmc := &metrics.FakePodMetricsClient{} pmf := metrics.NewPodMetricsFactory(pmc, time.Second) - ds := datastore.NewDatastore(ctx, pmf) + ds := datastore.NewDatastore(ctx, pmf, 0) initObjs := []client.Object{} for _, objective := range objectives { diff --git a/test/utils/utils.go b/test/utils/utils.go index 8cc19b7ac..b65834cb6 100644 --- a/test/utils/utils.go +++ b/test/utils/utils.go @@ -20,6 +20,8 @@ import ( "bytes" "context" "fmt" + "os" + "strings" "time" "github.com/onsi/ginkgo/v2" @@ -30,25 +32,84 @@ import ( apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/serializer" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/remotecommand" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/config" v1 "sigs.k8s.io/gateway-api-inference-extension/api/v1" "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" ) +const ( + // defaultExistsTimeout is the default timeout for a resource to exist in the api server. + defaultExistsTimeout = 30 * time.Second + // defaultReadyTimeout is the default timeout for a resource to report a ready state. + defaultReadyTimeout = 3 * time.Minute + // defaultModelReadyTimeout is the default timeout for the model server deployment to report a ready state. + defaultModelReadyTimeout = 10 * time.Minute + // defaultInterval is the default interval to check if a resource exists or ready conditions. + defaultInterval = time.Millisecond * 250 +) + +// TestConfig groups various fields together for use in the test helpers +type TestConfig struct { + Context context.Context + KubeCli *kubernetes.Clientset + K8sClient client.Client + RestConfig *rest.Config + NsName string + Scheme *runtime.Scheme + ExistsTimeout time.Duration + ReadyTimeout time.Duration + ModelReadyTimeout time.Duration + Interval time.Duration +} + +// NewTestConfig creates a new TestConfig instance +func NewTestConfig(nsName string) *TestConfig { + cfg := config.GetConfigOrDie() + gomega.Expect(cfg).NotTo(gomega.BeNil()) + + kubeCli, err := kubernetes.NewForConfig(cfg) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(kubeCli).NotTo(gomega.BeNil()) + + return &TestConfig{ + Context: context.Background(), + KubeCli: kubeCli, + NsName: nsName, + RestConfig: cfg, + Scheme: runtime.NewScheme(), + ExistsTimeout: env.GetEnvDuration("EXISTS_TIMEOUT", defaultExistsTimeout, ginkgo.GinkgoLogr), + ReadyTimeout: env.GetEnvDuration("READY_TIMEOUT", defaultReadyTimeout, ginkgo.GinkgoLogr), + ModelReadyTimeout: env.GetEnvDuration("MODEL_READY_TIMEOUT", defaultModelReadyTimeout, ginkgo.GinkgoLogr), + Interval: defaultInterval, + } +} + +// CreateCli creates the Kubernetes client used in the tests, invoked after the scheme has been setup. 
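CreateCli below is deliberately a second step: NewTestConfig leaves K8sClient unset so the suite can register its API groups on TestConfig.Scheme first, and only then construct the controller-runtime client from RestConfig and the populated scheme. Sketched from setupSuite earlier in this diff (error checks elided; the real code asserts with gomega and also installs the inference-extension API groups):

```go
package e2e

import (
	apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
	clientgoscheme "k8s.io/client-go/kubernetes/scheme"

	testutils "sigs.k8s.io/gateway-api-inference-extension/test/utils"
)

func newSuiteConfig() *testutils.TestConfig {
	// 1. Kubeconfig, typed clientset, and the *_TIMEOUT env overrides.
	tc := testutils.NewTestConfig("inf-ext-e2e") // namespace name is illustrative
	// 2. Register every group the tests touch on the shared scheme.
	_ = clientgoscheme.AddToScheme(tc.Scheme)
	_ = apiextv1.AddToScheme(tc.Scheme)
	// 3. Only now build tc.K8sClient from tc.RestConfig + tc.Scheme.
	tc.CreateCli()
	return tc
}
```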
+func (testConfig *TestConfig) CreateCli() { + var err error + testConfig.K8sClient, err = client.New(testConfig.RestConfig, client.Options{Scheme: testConfig.Scheme}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(testConfig.K8sClient).NotTo(gomega.BeNil()) +} + // DeleteClusterResources deletes all cluster-scoped objects the tests typically create. -func DeleteClusterResources(ctx context.Context, cli client.Client) error { +func DeleteClusterResources(testConfig *TestConfig) error { binding := &rbacv1.ClusterRoleBinding{ ObjectMeta: metav1.ObjectMeta{ Name: "auth-reviewer-binding", }, } - err := cli.Delete(ctx, binding, client.PropagationPolicy(metav1.DeletePropagationForeground)) + err := testConfig.K8sClient.Delete(testConfig.Context, binding, client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } @@ -57,7 +118,7 @@ func DeleteClusterResources(ctx context.Context, cli client.Client) error { Name: "auth-reviewer", }, } - err = cli.Delete(ctx, role, client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = testConfig.K8sClient.Delete(testConfig.Context, role, client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } @@ -66,7 +127,7 @@ func DeleteClusterResources(ctx context.Context, cli client.Client) error { Name: "inference-gateway-sa-metrics-reader-role-binding", }, } - err = cli.Delete(ctx, metricsReaderBinding, client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = testConfig.K8sClient.Delete(testConfig.Context, metricsReaderBinding, client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } @@ -75,7 +136,7 @@ func DeleteClusterResources(ctx context.Context, cli client.Client) error { Name: "inference-gateway-metrics-reader", }, } - err = cli.Delete(ctx, metricsReaderRole, client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = testConfig.K8sClient.Delete(testConfig.Context, metricsReaderRole, client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } @@ -84,7 +145,7 @@ func DeleteClusterResources(ctx context.Context, cli client.Client) error { Name: "inferenceobjectives.inference.networking.x-k8s.io", }, } - err = cli.Delete(ctx, model, client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = testConfig.K8sClient.Delete(testConfig.Context, model, client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } @@ -93,7 +154,7 @@ func DeleteClusterResources(ctx context.Context, cli client.Client) error { Name: "inferencepools.inference.networking.x-k8s.io", }, } - err = cli.Delete(ctx, pool, client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = testConfig.K8sClient.Delete(testConfig.Context, pool, client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } @@ -102,49 +163,49 @@ func DeleteClusterResources(ctx context.Context, cli client.Client) error { // DeleteNamespacedResources deletes all namespace-scoped objects the tests typically create. // The given namespace will also be deleted if it's not "default". 
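DeleteClusterResources above and DeleteNamespacedResources below repeat one idiom throughout: delete with foreground propagation and treat NotFound as success, which keeps cleanup idempotent across partially failed runs. The idiom factored into a hypothetical helper:

```go
package e2e

import (
	"context"

	rbacv1 "k8s.io/api/rbac/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// deleteForeground captures the repeated pattern: foreground propagation
// removes dependents before the owner is gone, and NotFound means the
// object was already deleted, which cleanup treats as success.
func deleteForeground(ctx context.Context, c client.Client, obj client.Object) error {
	err := c.Delete(ctx, obj, client.PropagationPolicy(metav1.DeletePropagationForeground))
	if err != nil && !apierrors.IsNotFound(err) {
		return err
	}
	return nil
}

// Example: one of the cluster-scoped objects DeleteClusterResources removes.
func cleanupBinding(ctx context.Context, c client.Client) error {
	binding := &rbacv1.ClusterRoleBinding{
		ObjectMeta: metav1.ObjectMeta{Name: "auth-reviewer-binding"},
	}
	return deleteForeground(ctx, c, binding)
}
```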
-func DeleteNamespacedResources(ctx context.Context, cli client.Client, ns string) error { - if ns == "" { +func DeleteNamespacedResources(testConfig *TestConfig) error { + if testConfig.NsName == "" { return nil } - err := cli.DeleteAllOf(ctx, &appsv1.Deployment{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + err := testConfig.K8sClient.DeleteAllOf(testConfig.Context, &appsv1.Deployment{}, client.InNamespace(testConfig.NsName), client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } - err = cli.DeleteAllOf(ctx, &corev1.Service{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = testConfig.K8sClient.DeleteAllOf(testConfig.Context, &corev1.Service{}, client.InNamespace(testConfig.NsName), client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } - err = cli.DeleteAllOf(ctx, &corev1.Pod{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = testConfig.K8sClient.DeleteAllOf(testConfig.Context, &corev1.Pod{}, client.InNamespace(testConfig.NsName), client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } - err = cli.DeleteAllOf(ctx, &corev1.ConfigMap{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = testConfig.K8sClient.DeleteAllOf(testConfig.Context, &corev1.ConfigMap{}, client.InNamespace(testConfig.NsName), client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } - err = cli.DeleteAllOf(ctx, &corev1.Secret{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = testConfig.K8sClient.DeleteAllOf(testConfig.Context, &corev1.Secret{}, client.InNamespace(testConfig.NsName), client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } - err = cli.DeleteAllOf(ctx, &corev1.ServiceAccount{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = testConfig.K8sClient.DeleteAllOf(testConfig.Context, &corev1.ServiceAccount{}, client.InNamespace(testConfig.NsName), client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } - err = cli.DeleteAllOf(ctx, &v1.InferencePool{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = testConfig.K8sClient.DeleteAllOf(testConfig.Context, &v1.InferencePool{}, client.InNamespace(testConfig.NsName), client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } - err = cli.DeleteAllOf(ctx, &v1alpha2.InferenceObjective{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = testConfig.K8sClient.DeleteAllOf(testConfig.Context, &v1alpha2.InferenceObjective{}, client.InNamespace(testConfig.NsName), client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } - if ns != "default" { + if testConfig.NsName != "default" { ns := &corev1.Namespace{ ObjectMeta: metav1.ObjectMeta{ - Name: ns, + Name: testConfig.NsName, }, } - if err := cli.Delete(ctx, ns, client.PropagationPolicy(metav1.DeletePropagationForeground)); err != nil && !apierrors.IsNotFound(err) { + 
if err := testConfig.K8sClient.Delete(testConfig.Context, ns, client.PropagationPolicy(metav1.DeletePropagationForeground)); err != nil && !apierrors.IsNotFound(err) { return err } } @@ -152,11 +213,11 @@ func DeleteNamespacedResources(ctx context.Context, cli client.Client, ns string } // DeleteInferenceObjectiveResources deletes all InferenceObjective objects in the given namespace. -func DeleteInferenceObjectiveResources(ctx context.Context, cli client.Client, ns string) error { - if ns == "" { +func DeleteInferenceObjectiveResources(testConfig *TestConfig) error { + if testConfig.NsName == "" { return nil } - err := cli.DeleteAllOf(ctx, &v1alpha2.InferenceObjective{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + err := testConfig.K8sClient.DeleteAllOf(testConfig.Context, &v1alpha2.InferenceObjective{}, client.InNamespace(testConfig.NsName), client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } @@ -164,7 +225,7 @@ func DeleteInferenceObjectiveResources(ctx context.Context, cli client.Client, n } // PodReady checks if the given Pod reports the "Ready" status condition before the given timeout. -func PodReady(ctx context.Context, cli client.Client, pod *corev1.Pod, timeout, interval time.Duration) { +func PodReady(testConfig *TestConfig, pod *corev1.Pod) { ginkgo.By(fmt.Sprintf("Checking pod %s/%s status is: %s", pod.Namespace, pod.Name, corev1.PodReady)) conditions := []corev1.PodCondition{ { @@ -172,13 +233,14 @@ func PodReady(ctx context.Context, cli client.Client, pod *corev1.Pod, timeout, Status: corev1.ConditionTrue, }, } - gomega.Eventually(checkPodStatus, timeout, interval).WithArguments(ctx, cli, pod, conditions).Should(gomega.BeTrue()) + gomega.Eventually(checkPodStatus, testConfig.ExistsTimeout, testConfig.Interval). + WithArguments(testConfig, pod, conditions).Should(gomega.BeTrue()) } // checkPodStatus checks if the given Pod status matches the expected conditions. -func checkPodStatus(ctx context.Context, cli client.Client, pod *corev1.Pod, conditions []corev1.PodCondition) (bool, error) { +func checkPodStatus(testConfig *TestConfig, pod *corev1.Pod, conditions []corev1.PodCondition) (bool, error) { var fetchedPod corev1.Pod - if err := cli.Get(ctx, types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}, &fetchedPod); err != nil { + if err := testConfig.K8sClient.Get(testConfig.Context, types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}, &fetchedPod); err != nil { return false, err } found := 0 @@ -193,7 +255,7 @@ func checkPodStatus(ctx context.Context, cli client.Client, pod *corev1.Pod, con } // DeploymentAvailable checks if the given Deployment reports the "Available" status condition before the given timeout. -func DeploymentAvailable(ctx context.Context, cli client.Client, deploy *appsv1.Deployment, timeout, interval time.Duration) { +func DeploymentAvailable(testConfig *TestConfig, deploy *appsv1.Deployment) { ginkgo.By(fmt.Sprintf("Checking if deployment %s/%s status is: %s", deploy.Namespace, deploy.Name, appsv1.DeploymentAvailable)) conditions := []appsv1.DeploymentCondition{ { @@ -201,19 +263,21 @@ func DeploymentAvailable(ctx context.Context, cli client.Client, deploy *appsv1. Status: corev1.ConditionTrue, }, } - gomega.Eventually(checkDeploymentStatus, timeout, interval).WithArguments(ctx, cli, deploy, conditions).Should(gomega.BeTrue()) + gomega.Eventually(checkDeploymentStatus, testConfig.ModelReadyTimeout, testConfig.Interval). 
+		WithArguments(testConfig.Context, testConfig.K8sClient, deploy, conditions).
+		Should(gomega.BeTrue())
 }
 
 // DeploymentReadyReplicas checks if the given Deployment has at least `count` ready replicas before the given timeout.
-func DeploymentReadyReplicas(ctx context.Context, cli client.Client, deploy *appsv1.Deployment, count int, timeout, interval time.Duration) {
+func DeploymentReadyReplicas(testConfig *TestConfig, deploy *appsv1.Deployment, count int) {
 	ginkgo.By(fmt.Sprintf("Checking if deployment %s/%s has at least %d ready replica(s)", deploy.Namespace, deploy.Name, count))
 	gomega.Eventually(func(g gomega.Gomega) {
 		var fetchedDeploy appsv1.Deployment
-		err := cli.Get(ctx, types.NamespacedName{Namespace: deploy.Namespace, Name: deploy.Name}, &fetchedDeploy)
+		err := testConfig.K8sClient.Get(testConfig.Context, types.NamespacedName{Namespace: deploy.Namespace, Name: deploy.Name}, &fetchedDeploy)
 		g.Expect(err).NotTo(gomega.HaveOccurred())
 		g.Expect(fetchedDeploy.Status.ReadyReplicas).To(gomega.BeNumerically(">=", count),
 			fmt.Sprintf("Deployment only has %d ready replicas, want at least %d", fetchedDeploy.Status.ReadyReplicas, count))
-	}, timeout, interval).Should(gomega.Succeed())
+	}, testConfig.ModelReadyTimeout, testConfig.Interval).Should(gomega.Succeed())
 }
 
 // checkDeploymentStatus checks if the given Deployment status matches the expected conditions.
@@ -234,7 +298,7 @@ func checkDeploymentStatus(ctx context.Context, cli client.Client, deploy *appsv
 }
 
 // CRDEstablished checks if the given CRD reports the "Established" status condition before the given timeout.
-func CRDEstablished(ctx context.Context, cli client.Client, crd *apiextv1.CustomResourceDefinition, timeout, interval time.Duration) {
+func CRDEstablished(testConfig *TestConfig, crd *apiextv1.CustomResourceDefinition) {
 	ginkgo.By(fmt.Sprintf("Checking CRD %s status is: %s", crd.Name, apiextv1.Established))
 	conditions := []apiextv1.CustomResourceDefinitionCondition{
 		{
@@ -242,7 +306,9 @@ func CRDEstablished(ctx context.Context, cli client.Client, crd *apiextv1.Custom
 			Status: apiextv1.ConditionTrue,
 		},
 	}
-	gomega.Eventually(checkCrdStatus, timeout, interval).WithArguments(ctx, cli, crd, conditions).Should(gomega.BeTrue())
+	gomega.Eventually(checkCrdStatus, testConfig.ReadyTimeout, testConfig.Interval).
+		WithArguments(testConfig.Context, testConfig.K8sClient, crd, conditions).
+		Should(gomega.BeTrue())
 }
 
 // checkCrdStatus checks if the given CRD status matches the expected conditions.
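For callers, the migration is mechanical: every explicit `ctx`/`cli`/timeout argument collapses into the shared config. A hypothetical before-and-after call site (the `testutils` alias and variable names are illustrative, not taken from this diff):

```go
// Before: context, client, and timings threaded through every call.
testutils.CRDEstablished(ctx, k8sClient, crd, readyTimeout, interval)
testutils.DeploymentAvailable(ctx, k8sClient, deploy, modelReadyTimeout, interval)

// After: the TestConfig carries all of it once.
testutils.CRDEstablished(testConfig, crd)
testutils.DeploymentAvailable(testConfig, deploy)
```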
@@ -268,22 +334,14 @@ func checkCrdStatus(
 }
 
 // ExecCommandInPod runs a command in a given container of a given Pod, returning combined stdout+stderr.
-func ExecCommandInPod(
-	ctx context.Context,
-	cfg *rest.Config,
-	scheme *runtime.Scheme,
-	kubeClient *kubernetes.Clientset,
-	podNamespace, podName, containerName string,
-	cmd []string,
-) (string, error) {
-
-	parameterCodec := runtime.NewParameterCodec(scheme)
+func ExecCommandInPod(testConfig *TestConfig, podName, containerName string, cmd []string) (string, error) {
+	parameterCodec := runtime.NewParameterCodec(testConfig.Scheme)
 
-	req := kubeClient.CoreV1().RESTClient().
+	req := testConfig.KubeCli.CoreV1().RESTClient().
 		Post().
 		Resource("pods").
 		Name(podName).
-		Namespace(podNamespace).
+		Namespace(testConfig.NsName).
 		SubResource("exec").
 		VersionedParams(&corev1.PodExecOptions{
 			Container: containerName,
@@ -294,13 +352,13 @@ func ExecCommandInPod(
 			TTY:       false,
 		}, parameterCodec)
 
-	exec, err := remotecommand.NewSPDYExecutor(cfg, "POST", req.URL())
+	exec, err := remotecommand.NewSPDYExecutor(testConfig.RestConfig, "POST", req.URL())
 	if err != nil {
 		return "", fmt.Errorf("could not initialize executor: %w", err)
 	}
 
 	var stdout, stderr bytes.Buffer
-	execErr := exec.StreamWithContext(ctx, remotecommand.StreamOptions{
+	execErr := exec.StreamWithContext(testConfig.Context, remotecommand.StreamOptions{
 		Stdout: &stdout,
 		Stderr: &stderr,
 	})
@@ -316,8 +374,135 @@ func ExecCommandInPod(
 
 // EventuallyExists checks if a Kubernetes resource exists and returns nil if successful.
 // It takes a function `getResource` which retrieves the resource and returns an error if it doesn't exist.
-func EventuallyExists(ctx context.Context, getResource func() error, timeout, interval time.Duration) {
+func EventuallyExists(testConfig *TestConfig, getResource func() error) {
 	gomega.Eventually(func() error {
 		return getResource()
-	}, timeout, interval).Should(gomega.Succeed())
+	}, testConfig.ExistsTimeout, testConfig.Interval).Should(gomega.Succeed())
+}
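With the config in hand, an exec call needs only the pod, container, and command. A hypothetical invocation (pod and container names are illustrative):

```go
// Returns combined stdout+stderr, or an error if the SPDY stream fails.
out, err := testutils.ExecCommandInPod(testConfig, "vllm-sim-0", "vllm", []string{"ls", "/config"})
gomega.Expect(err).NotTo(gomega.HaveOccurred())
ginkgo.By("exec output: " + out)
```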
+
+// CreateObjsFromYaml creates Kubernetes objects from YAML documents and waits for them to be instantiated.
+func CreateObjsFromYaml(testConfig *TestConfig, docs []string) []string {
+	objNames := []string{}
+
+	// For each doc, decode and create
+	decoder := serializer.NewCodecFactory(testConfig.Scheme).UniversalDeserializer()
+	for _, doc := range docs {
+		trimmed := strings.TrimSpace(doc)
+		if trimmed == "" {
+			continue
+		}
+		// Decode into a runtime.Object
+		obj, gvk, decodeErr := decoder.Decode([]byte(trimmed), nil, nil)
+		gomega.Expect(decodeErr).NotTo(gomega.HaveOccurred(),
+			"Failed to decode YAML document to a Kubernetes object")
+
+		ginkgo.By(fmt.Sprintf("Decoded GVK: %s", gvk))
+
+		unstrObj, ok := obj.(*unstructured.Unstructured)
+		if !ok {
+			// Fallback if it's a typed object
+			unstrObj = &unstructured.Unstructured{}
+			// Convert typed to unstructured
+			err := testConfig.Scheme.Convert(obj, unstrObj, nil)
+			gomega.Expect(err).NotTo(gomega.HaveOccurred())
+		}
+
+		unstrObj.SetNamespace(testConfig.NsName)
+		kind := unstrObj.GetKind()
+		name := unstrObj.GetName()
+		objNames = append(objNames, kind+"/"+name)
+
+		// Create the object
+		err := testConfig.K8sClient.Create(testConfig.Context, unstrObj, &client.CreateOptions{})
+		gomega.Expect(err).NotTo(gomega.HaveOccurred(),
+			"Failed to create object from YAML")
+
+		// Wait for the created object to exist.
+		clientObj := getClientObject(kind)
+		EventuallyExists(testConfig, func() error {
+			return testConfig.K8sClient.Get(testConfig.Context,
+				types.NamespacedName{Namespace: testConfig.NsName, Name: name}, clientObj)
+		})
+
+		switch kind {
+		case "CustomResourceDefinition":
+			// Wait for the CRD to be established.
+			CRDEstablished(testConfig, clientObj.(*apiextv1.CustomResourceDefinition))
+		case "Deployment":
+			// Wait for the deployment to be available.
+			DeploymentAvailable(testConfig, clientObj.(*appsv1.Deployment))
+		case "Pod":
+			// Wait for the pod to be ready.
+			PodReady(testConfig, clientObj.(*corev1.Pod))
+		}
+	}
+	return objNames
+}
+
+// DeleteObjects deletes a set of Kubernetes objects given in the form kind/name.
+func DeleteObjects(testConfig *TestConfig, kindAndNames []string) {
+	for _, kindAndName := range kindAndNames {
+		split := strings.Split(kindAndName, "/")
+		clientObj := getClientObject(split[0])
+		err := testConfig.K8sClient.Get(testConfig.Context,
+			types.NamespacedName{Namespace: testConfig.NsName, Name: split[1]}, clientObj)
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+		err = testConfig.K8sClient.Delete(testConfig.Context, clientObj)
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+		gomega.Eventually(func() bool {
+			clientObj := getClientObject(split[0])
+			err := testConfig.K8sClient.Get(testConfig.Context,
+				types.NamespacedName{Namespace: testConfig.NsName, Name: split[1]}, clientObj)
+			return apierrors.IsNotFound(err)
+		}, testConfig.ExistsTimeout, testConfig.Interval).Should(gomega.BeTrue())
+	}
+}
+
+// ApplyYAMLFile reads a file containing YAML (possibly multiple docs)
+// and applies each object to the cluster.
+func ApplyYAMLFile(testConfig *TestConfig, filePath string) {
+	// Create the resources from the manifest file
+	CreateObjsFromYaml(testConfig, ReadYaml(filePath))
+}
+
+// ReadYaml is a helper that reads a Kubernetes YAML file and splits it on the "---" document separator.
+func ReadYaml(filePath string) []string {
+	ginkgo.By("Reading YAML file: " + filePath)
+	yamlBytes, err := os.ReadFile(filePath)
+	gomega.Expect(err).NotTo(gomega.HaveOccurred())
+
+	// Split multiple docs, if needed
+	return strings.Split(string(yamlBytes), "\n---")
+}
+
+func getClientObject(kind string) client.Object {
+	switch strings.ToLower(kind) {
+	case "clusterrole":
+		return &rbacv1.ClusterRole{}
+	case "clusterrolebinding":
+		return &rbacv1.ClusterRoleBinding{}
+	case "configmap":
+		return &corev1.ConfigMap{}
+	case "customresourcedefinition":
+		return &apiextv1.CustomResourceDefinition{}
+	case "deployment":
+		return &appsv1.Deployment{}
+	case "inferencepool":
+		return &v1.InferencePool{}
+	case "pod":
+		return &corev1.Pod{}
+	case "role":
+		return &rbacv1.Role{}
+	case "rolebinding":
+		return &rbacv1.RoleBinding{}
+	case "secret":
+		return &corev1.Secret{}
+	case "service":
+		return &corev1.Service{}
+	case "serviceaccount":
+		return &corev1.ServiceAccount{}
+	default:
+		ginkgo.Fail("unsupported K8S kind "+kind, 1)
+		return nil
+	}
 }
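Taken together, these helpers give e2e suites a declarative create/wait/teardown loop. A hypothetical Ginkgo usage (the manifest path and `testutils` alias are illustrative):

```go
var created []string

ginkgo.BeforeEach(func() {
	// Create every object in the manifest and block until each is ready.
	created = testutils.CreateObjsFromYaml(testConfig, testutils.ReadYaml("testdata/resources.yaml"))
})

ginkgo.AfterEach(func() {
	// Tear down using the kind/name handles CreateObjsFromYaml returned.
	testutils.DeleteObjects(testConfig, created)
})
```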
diff --git a/tools/alerts/alert.yaml b/tools/alerts/alert.yaml
index c712207a4..687123feb 100644
--- a/tools/alerts/alert.yaml
+++ b/tools/alerts/alert.yaml
@@ -5,7 +5,7 @@ groups:
       annotations:
         title: 'High latency (P99) for model {{ $labels.model_name }}'
         description: 'The 99th percentile request duration for model {{ $labels.model_name }} and target model {{ $labels.target_model_name }} has been consistently above 10.0 seconds for 5 minutes.'
-      expr: histogram_quantile(0.99, rate(inference_model_request_duration_seconds_bucket[5m])) > 10.0
+      expr: histogram_quantile(0.99, rate(inference_objective_request_duration_seconds_bucket[5m])) > 10.0
       for: 5m
       labels:
         severity: 'warning'
@@ -13,7 +13,7 @@ groups:
       annotations:
         title: 'High error rate for model {{ $labels.model_name }}'
         description: 'The error rate for model {{ $labels.model_name }} and target model {{ $labels.target_model_name }} has been consistently above 5% for 5 minutes.'
-      expr: sum by (model_name) (rate(inference_model_request_error_total[5m])) / sum by (model_name) (rate(inference_model_request_total[5m])) > 0.05
+      expr: sum by (model_name) (rate(inference_objective_request_error_total[5m])) / sum by (model_name) (rate(inference_objective_request_total[5m])) > 0.05
       for: 5m
       labels:
         severity: 'critical'
diff --git a/tools/dashboards/README.md b/tools/dashboards/README.md
index 21282bf23..564d8a6df 100644
--- a/tools/dashboards/README.md
+++ b/tools/dashboards/README.md
@@ -4,7 +4,7 @@ This documentation provides instructions for setting up grafana dashboards to se
 
 ## Requirements
 
-Please follow [metrics](https://gateway-api-inference-extension.sigs.k8s.io/guides/metrics/?h=metrics) page to configure the proxy to enable all metrics.
+Please follow the [metrics](https://gateway-api-inference-extension.sigs.k8s.io/guides/metrics-and-observability/) page to configure the proxy to enable all metrics.
 
 ## Load Inference Extension dashboard into Grafana
 
diff --git a/tools/dashboards/inference_gateway.json b/tools/dashboards/inference_gateway.json
index 244f4ab14..949b03ffe 100644
--- a/tools/dashboards/inference_gateway.json
+++ b/tools/dashboards/inference_gateway.json
@@ -36,7 +36,7 @@
         "showLineNumbers": false,
         "showMiniMap": false
       },
-      "content": "# Inferece Gateway Dashboard\n\nPlease see https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp/metrics for more details of underlying metrics used in the dashboard.",
+      "content": "# Inference Gateway Dashboard\n\nPlease see https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp/metrics for more details of underlying metrics used in the dashboard.",
       "mode": "markdown"
     },
     "pluginVersion": "10.2.4",
@@ -443,7 +443,7 @@
         {
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_objective_request_duration_seconds_bucket{}[$__rate_interval])))",
          "fullMetaSearch": false,
          "includeNullMetadata": false,
          "legendFormat": "95%",
@@ -458,7 +458,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_objective_request_duration_seconds_bucket{}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@@ -474,7 +474,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_objective_request_duration_seconds_bucket{}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@@ -576,7 +576,7 @@
          "disableTextWrap": false,
          "editorMode": "builder",
          "exemplar": false,
-          "expr": "sum by(model_name, target_model_name) (rate(inference_model_request_total{}[$__rate_interval]))",
+          "expr": "sum by(model_name, target_model_name) (rate(inference_objective_request_total{}[$__rate_interval]))",
          "fullMetaSearch": false,
          "includeNullMetadata": true,
          "interval": "",
@@ -678,7 +678,7 @@
          "disableTextWrap": false,
          "editorMode": "builder",
          "exemplar": false,
-          "expr": "sum by(error_code, model_name, target_model_name) (rate(inference_model_request_error_total[$__rate_interval]))",
+          "expr": "sum by(error_code, model_name, target_model_name) (rate(inference_objective_request_error_total[$__rate_interval]))",
          "fullMetaSearch": false,
          "includeNullMetadata": true,
          "interval": "",
@@ -775,7 +775,7 @@
        {
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_objective_request_sizes_bucket{}[$__rate_interval])))",
          "fullMetaSearch": false,
          "includeNullMetadata": false,
          "legendFormat": "95%",
@@ -790,7 +790,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_objective_request_sizes_bucket{}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@@ -806,7 +806,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_objective_request_sizes_bucket{}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@@ -903,7 +903,7 @@
        {
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_objective_response_sizes_bucket{}[$__rate_interval])))",
          "fullMetaSearch": false,
          "includeNullMetadata": false,
          "legendFormat": "95%",
@@ -918,7 +918,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_objective_response_sizes_bucket{}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@@ -934,7 +934,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_objective_response_sizes_bucket{}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@@ -1031,7 +1031,7 @@
        {
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_objective_input_tokens_bucket{}[$__rate_interval])))",
          "fullMetaSearch": false,
          "includeNullMetadata": false,
          "legendFormat": "95%",
@@ -1046,7 +1046,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_objective_input_tokens_bucket{}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@@ -1062,7 +1062,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_objective_input_tokens_bucket{}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@@ -1159,7 +1159,7 @@
        {
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_objective_output_tokens_bucket{}[$__rate_interval])))",
          "fullMetaSearch": false,
          "includeNullMetadata": false,
          "legendFormat": "95%",
@@ -1174,7 +1174,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_objective_output_tokens_bucket{}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@@ -1190,7 +1190,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_objective_output_tokens_bucket{}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@@ -1204,7 +1204,7 @@
      "type": "timeseries"
    }
  ],
-  "title": "Inference Model",
+  "title": "Inference Objective",
  "type": "row"
},
{