Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ test-integration: download-tokenizer install-dependencies ## Run integration tes
go test -ldflags="$(LDFLAGS)" -v -tags=integration_tests ./test/integration/

.PHONY: test-e2e
test-e2e: image-build ## Run end-to-end tests against a new kind cluster
test-e2e: image-build sidecar-image-build ## Run end-to-end tests against a new kind cluster
@printf "\033[33;1m==== Running End to End Tests ====\033[0m\n"
./test/scripts/run_e2e.sh

Expand Down
18 changes: 9 additions & 9 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ require (
k8s.io/client-go v0.34.1
sigs.k8s.io/controller-runtime v0.22.3
sigs.k8s.io/gateway-api v1.4.0
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20251016181044-831a919943ba
sigs.k8s.io/gateway-api-inference-extension v1.1.0-rc.1
)

require (
Expand Down Expand Up @@ -57,9 +57,9 @@ require (
github.com/google/btree v1.1.3 // indirect
github.com/google/cel-go v0.26.0 // indirect
github.com/google/gnostic-models v0.7.0 // indirect
github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6 // indirect
github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8 // indirect
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect
github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc // indirect
github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
Expand All @@ -77,7 +77,7 @@ require (
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.67.1 // indirect
github.com/prometheus/procfs v0.17.0 // indirect
github.com/prometheus/prometheus v0.306.0 // indirect
github.com/prometheus/prometheus v0.307.1 // indirect
github.com/redis/go-redis/v9 v9.11.0 // indirect
github.com/spf13/cobra v1.9.1 // indirect
github.com/spf13/pflag v1.0.7 // indirect
Expand All @@ -90,7 +90,7 @@ require (
github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect
github.com/x448/float16 v0.8.4 // indirect
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect
go.opentelemetry.io/otel v1.38.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 // indirect
Expand All @@ -105,19 +105,19 @@ require (
go.uber.org/zap v1.27.0 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 // indirect
golang.org/x/exp v0.0.0-20250808145144-a408d31f581a // indirect
golang.org/x/mod v0.28.0 // indirect
golang.org/x/net v0.44.0 // indirect
golang.org/x/oauth2 v0.31.0 // indirect
golang.org/x/sync v0.17.0 // indirect
golang.org/x/sys v0.36.0 // indirect
golang.org/x/term v0.35.0 // indirect
golang.org/x/text v0.29.0 // indirect
golang.org/x/time v0.12.0 // indirect
golang.org/x/time v0.13.0 // indirect
golang.org/x/tools v0.37.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250826171959-ef028d996bc1 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20250929231259-57b25ae835d4 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250922171735-9219d122eba9 // indirect
google.golang.org/protobuf v1.36.10 // indirect
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
Expand Down
128 changes: 64 additions & 64 deletions go.sum

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pkg/plugins/pre-request/pd_prerequest.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"encoding/json"
"fmt"
"net"
"strconv"

"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
Expand Down Expand Up @@ -68,7 +67,7 @@ func (p *PrefillHeaderHandler) WithName(name string) *PrefillHeaderHandler {
}

// PreRequest wires prefill SchedulerProfile result into a header to indicate prefill worker
func (p *PrefillHeaderHandler) PreRequest(_ context.Context, request *types.LLMRequest, schedulingResult *types.SchedulingResult, targetPort int) {
func (p *PrefillHeaderHandler) PreRequest(_ context.Context, request *types.LLMRequest, schedulingResult *types.SchedulingResult) {
if _, found := request.Headers[common.PrefillPodHeader]; found {
request.Headers[common.PrefillPodHeader] = "" // clear header, if already set
}
Expand All @@ -78,6 +77,7 @@ func (p *PrefillHeaderHandler) PreRequest(_ context.Context, request *types.LLMR
return // prefill profile failed to run or we chose not to run it, no-op in this case
}

prefillHostPort := net.JoinHostPort(prefillProfileRunResult.TargetPods[0].GetPod().Address, strconv.Itoa(targetPort))
targetPod := prefillProfileRunResult.TargetPods[0].GetPod()
prefillHostPort := net.JoinHostPort(targetPod.Address, targetPod.Port)
request.Headers[common.PrefillPodHeader] = prefillHostPort // in the form of <ip:port>
}
2 changes: 1 addition & 1 deletion pkg/plugins/scorer/active_request.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ func (s *ActiveRequest) Score(ctx context.Context, _ *types.CycleState, _ *types
// It creates a new request entry in the cache with its own TTL and
// increments the pod count for fast lookup.
func (s *ActiveRequest) PreRequest(ctx context.Context, request *types.LLMRequest,
schedulingResult *types.SchedulingResult, _ int) {
schedulingResult *types.SchedulingResult) {
debugLogger := log.FromContext(ctx).V(logutil.DEBUG)

for _, profileResult := range schedulingResult.ProfileResults { // schedulingResult guaranteed not to be nil
Expand Down
8 changes: 4 additions & 4 deletions pkg/plugins/scorer/active_request_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ func TestActiveRequestScorer_PreRequest(t *testing.T) {
}

// First request
scorer.PreRequest(ctx, request, schedulingResult, 0)
scorer.PreRequest(ctx, request, schedulingResult)

// Check cache and pod counts
compositeKey := "default/pod-a.test-request-1"
Expand All @@ -151,7 +151,7 @@ func TestActiveRequestScorer_PreRequest(t *testing.T) {
},
}

scorer.PreRequest(ctx, request2, schedulingResult2, 0)
scorer.PreRequest(ctx, request2, schedulingResult2)

// Check incremented count
scorer.mutex.RLock()
Expand Down Expand Up @@ -192,7 +192,7 @@ func TestActiveRequestScorer_ResponseComplete(t *testing.T) {
},
}

scorer.PreRequest(ctx, request, schedulingResult, 0)
scorer.PreRequest(ctx, request, schedulingResult)

// Verify initial state
compositeKey := "default/pod-a.test-request-1"
Expand Down Expand Up @@ -248,7 +248,7 @@ func TestActiveRequestScorer_TTLExpiration(t *testing.T) {
}

// Add request
scorer.PreRequest(ctx, request, schedulingResult, 0)
scorer.PreRequest(ctx, request, schedulingResult)

// Verify request is added
scorer.mutex.RLock()
Expand Down
2 changes: 1 addition & 1 deletion pkg/plugins/scorer/no_hit_lru.go
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ func (s *NoHitLRU) Score(ctx context.Context, cycleState *types.CycleState, requ

// PreRequest is called before a request is sent to the target pod.
// For cold requests, it updates the LRU cache to track which pods have been used recently.
func (s *NoHitLRU) PreRequest(ctx context.Context, request *types.LLMRequest, schedulingResult *types.SchedulingResult, _ int) {
func (s *NoHitLRU) PreRequest(ctx context.Context, request *types.LLMRequest, schedulingResult *types.SchedulingResult) {
logger := log.FromContext(ctx).V(logutil.DEBUG)

if schedulingResult == nil || len(schedulingResult.ProfileResults) == 0 {
Expand Down
8 changes: 4 additions & 4 deletions pkg/plugins/scorer/no_hit_lru_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ func TestNoHitLRUPreferLeastRecentlyUsedAfterColdRequests(t *testing.T) {
t.Run("initial cold request seeds cache", func(_ *testing.T) {
coldReqA := &types.LLMRequest{RequestId: "cold-1"}
scorer.Score(ctx, toPrefixState(make(map[prefix.ServerID]int)), coldReqA, pods)
scorer.PreRequest(ctx, coldReqA, requestToPod(podA), 0)
scorer.PreRequest(ctx, coldReqA, requestToPod(podA))
// After podA handles a cold request, other pods should score higher for new cold requests
assertHighestScoredPod(podB, "after-podA-used")
})
Expand Down Expand Up @@ -367,7 +367,7 @@ func TestNoHitLRUPreferLeastRecentlyUsedAfterColdRequests(t *testing.T) {
t.Fatalf("expected neutral score for warm request, got %f", score)
}
}
scorer.PreRequest(ctx, warmReq, requestToPod(podB), 0)
scorer.PreRequest(ctx, warmReq, requestToPod(podB))
postWarmReq := &types.LLMRequest{RequestId: "cold-after-warm"}
postWarmScores := scorer.Score(ctx, toPrefixState(make(map[prefix.ServerID]int)), postWarmReq, pods)
if postWarmScores[podB] <= postWarmScores[podA] {
Expand All @@ -379,7 +379,7 @@ func TestNoHitLRUPreferLeastRecentlyUsedAfterColdRequests(t *testing.T) {
// Simulate podB handling a cold request
coldReqB := &types.LLMRequest{RequestId: "cold-2"}
scorer.Score(ctx, toPrefixState(make(map[prefix.ServerID]int)), coldReqB, pods)
scorer.PreRequest(ctx, coldReqB, requestToPod(podB), 0)
scorer.PreRequest(ctx, coldReqB, requestToPod(podB))
// Now podC should score highest since both podA and podB have been used
assertHighestScoredPod(podC, "after-podB-used")
})
Expand All @@ -388,7 +388,7 @@ func TestNoHitLRUPreferLeastRecentlyUsedAfterColdRequests(t *testing.T) {
// Simulate podC handling a cold request
coldReqC := &types.LLMRequest{RequestId: "cold-3"}
scorer.Score(ctx, toPrefixState(make(map[prefix.ServerID]int)), coldReqC, pods)
scorer.PreRequest(ctx, coldReqC, requestToPod(podC), 0)
scorer.PreRequest(ctx, coldReqC, requestToPod(podC))
// Now podA should score highest again (LRU rotation)
assertHighestScoredPod(podA, "after-podC-used")
})
Expand Down
2 changes: 1 addition & 1 deletion pkg/scheduling/pd/scheduler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ func TestPDSchedule(t *testing.T) {

if test.wantRes2 != nil { // Checking the prefix match in the decode pod.
// make sure prefix plugin stores the prefix hit in cache, so we can test it in the following schedule call
prefixScorer.PreRequest(ctx, test.req, got, 0)
prefixScorer.PreRequest(ctx, test.req, got)
time.Sleep(time.Second)

got, err = scheduler.Schedule(ctx, test.req, test.input)
Expand Down
53 changes: 24 additions & 29 deletions test/e2e/e2e_suite_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package e2e

import (
"context"
"fmt"
"io"
"os/exec"
Expand All @@ -14,22 +13,19 @@ import (
"github.com/onsi/gomega"
"github.com/onsi/gomega/gexec"
apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
k8sruntime "k8s.io/apimachinery/pkg/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/config"
k8slog "sigs.k8s.io/controller-runtime/pkg/log"

infextv1 "sigs.k8s.io/gateway-api-inference-extension/api/v1"
infextv1a2 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env"
testutils "sigs.k8s.io/gateway-api-inference-extension/test/utils"
)

const (
// defaultExistsTimeout is the default timeout for a resource to exist in the api server.
defaultExistsTimeout = 30 * time.Second
// defaultReadyTimeout is the default timeout for a resource to report a ready state.
defaultReadyTimeout = 3 * time.Minute
// defaultModelReadyTimeout is the default timeout for the model server deployment to report a ready state.
defaultModelReadyTimeout = 10 * time.Minute
// defaultInterval is the default interval to check if a resource exists or ready conditions.
defaultInterval = time.Millisecond * 250
// xInferPoolManifest is the manifest for the inference pool CRD with 'inference.networking.x-k8s.io' group.
Expand Down Expand Up @@ -57,19 +53,16 @@ const (
)

var (
ctx = context.Background()
k8sClient client.Client
port string
scheme = k8sruntime.NewScheme()
port string

testConfig *testutils.TestConfig

eppTag = env.GetEnvString("EPP_TAG", "dev", ginkgo.GinkgoLogr)
vllmSimTag = env.GetEnvString("VLLM_SIMULATOR_TAG", "dev", ginkgo.GinkgoLogr)
routingSideCarTag = env.GetEnvString("ROUTING_SIDECAR_TAG", "v0.2.0", ginkgo.GinkgoLogr)
routingSideCarTag = env.GetEnvString("ROUTING_SIDECAR_TAG", "dev", ginkgo.GinkgoLogr)

existsTimeout = env.GetEnvDuration("EXISTS_TIMEOUT", defaultExistsTimeout, ginkgo.GinkgoLogr)
readyTimeout = env.GetEnvDuration("READY_TIMEOUT", defaultReadyTimeout, ginkgo.GinkgoLogr)
modelReadyTimeout = env.GetEnvDuration("MODEL_READY_TIMEOUT", defaultModelReadyTimeout, ginkgo.GinkgoLogr)
interval = defaultInterval
readyTimeout = env.GetEnvDuration("READY_TIMEOUT", defaultReadyTimeout, ginkgo.GinkgoLogr)
interval = defaultInterval
)

func TestEndToEnd(t *testing.T) {
Expand All @@ -83,16 +76,17 @@ var _ = ginkgo.BeforeSuite(func() {
port = "30080"

setupK8sCluster()
testConfig = testutils.NewTestConfig(nsName)
setupK8sClient()
createCRDs()
createEnvoy()
applyYAMLFile(rbacManifest)
applyYAMLFile(serviceAccountManifest)
applyYAMLFile(servicesManifest)
testutils.ApplyYAMLFile(testConfig, rbacManifest)
testutils.ApplyYAMLFile(testConfig, serviceAccountManifest)
testutils.ApplyYAMLFile(testConfig, servicesManifest)

infPoolYaml := readYaml(inferExtManifest)
infPoolYaml := testutils.ReadYaml(inferExtManifest)
infPoolYaml = substituteMany(infPoolYaml, map[string]string{"${POOL_NAME}": modelName + "-inference-pool"})
createObjsFromYaml(infPoolYaml)
testutils.CreateObjsFromYaml(testConfig, infPoolYaml)
})

var _ = ginkgo.AfterSuite(func() {
Expand Down Expand Up @@ -147,32 +141,33 @@ func setupK8sClient() {
k8sCfg := config.GetConfigOrDie()
gomega.ExpectWithOffset(1, k8sCfg).NotTo(gomega.BeNil())

err := clientgoscheme.AddToScheme(scheme)
err := clientgoscheme.AddToScheme(testConfig.Scheme)
gomega.Expect(err).NotTo(gomega.HaveOccurred())

err = apiextv1.AddToScheme(scheme)
err = infextv1.Install(testConfig.Scheme)
gomega.Expect(err).NotTo(gomega.HaveOccurred())

err = infextv1a2.Install(scheme)
err = apiextv1.AddToScheme(testConfig.Scheme)
gomega.Expect(err).NotTo(gomega.HaveOccurred())

k8sClient, err = client.New(k8sCfg, client.Options{Scheme: scheme})
err = infextv1a2.Install(testConfig.Scheme)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
gomega.Expect(k8sClient).NotTo(gomega.BeNil())

testConfig.CreateCli()

k8slog.SetLogger(ginkgo.GinkgoLogr)
}

// createCRDs creates the Inference Extension CRDs used for testing.
func createCRDs() {
crds := runKustomize(gieCrdsKustomize)
createObjsFromYaml(crds)
testutils.CreateObjsFromYaml(testConfig, crds)
}

func createEnvoy() {
manifests := readYaml(envoyManifest)
manifests := testutils.ReadYaml(envoyManifest)
ginkgo.By("Creating envoy proxy resources from manifest: " + envoyManifest)
createObjsFromYaml(manifests)
testutils.CreateObjsFromYaml(testConfig, manifests)
}

const kindClusterConfig = `
Expand Down
Loading