llm-d · github-actions · Oct 23, 2025 · Oct 23, 2025 · Oct 23, 2025 · Oct 23, 2025
diff --git a/Makefile b/Makefile
@@ -99,7 +99,7 @@ test-integration: download-tokenizer install-dependencies ## Run integration tes
 	go test -ldflags="$(LDFLAGS)" -v -tags=integration_tests ./test/integration/
 
 .PHONY: test-e2e
-test-e2e: image-build ## Run end-to-end tests against a new kind cluster
+test-e2e: image-build sidecar-image-build ## Run end-to-end tests against a new kind cluster
 	@printf "\033[33;1m==== Running End to End Tests ====\033[0m\n"
 	./test/scripts/run_e2e.sh
 

diff --git a/go.mod b/go.mod
@@ -22,7 +22,7 @@ require (
 	k8s.io/client-go v0.34.1
 	sigs.k8s.io/controller-runtime v0.22.3
 	sigs.k8s.io/gateway-api v1.4.0
-	sigs.k8s.io/gateway-api-inference-extension v0.0.0-20251016181044-831a919943ba
+	sigs.k8s.io/gateway-api-inference-extension v1.1.0-rc.1
 )
 
 require (
@@ -57,9 +57,9 @@ require (
 	github.com/google/btree v1.1.3 // indirect
 	github.com/google/cel-go v0.26.0 // indirect
 	github.com/google/gnostic-models v0.7.0 // indirect
-	github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6 // indirect
+	github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8 // indirect
 	github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect
-	github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc // indirect
+	github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853 // indirect
 	github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
@@ -77,7 +77,7 @@ require (
 	github.com/prometheus/client_model v0.6.2 // indirect
 	github.com/prometheus/common v0.67.1 // indirect
 	github.com/prometheus/procfs v0.17.0 // indirect
-	github.com/prometheus/prometheus v0.306.0 // indirect
+	github.com/prometheus/prometheus v0.307.1 // indirect
 	github.com/redis/go-redis/v9 v9.11.0 // indirect
 	github.com/spf13/cobra v1.9.1 // indirect
 	github.com/spf13/pflag v1.0.7 // indirect
@@ -90,7 +90,7 @@ require (
 	github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect
 	github.com/x448/float16 v0.8.4 // indirect
 	go.opentelemetry.io/auto/sdk v1.1.0 // indirect
-	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect
+	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect
 	go.opentelemetry.io/otel v1.38.0 // indirect
 	go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect
 	go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 // indirect
@@ -105,19 +105,19 @@ require (
 	go.uber.org/zap v1.27.0 // indirect
 	go.yaml.in/yaml/v2 v2.4.3 // indirect
 	go.yaml.in/yaml/v3 v3.0.4 // indirect
-	golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 // indirect
+	golang.org/x/exp v0.0.0-20250808145144-a408d31f581a // indirect
 	golang.org/x/mod v0.28.0 // indirect
 	golang.org/x/net v0.44.0 // indirect
 	golang.org/x/oauth2 v0.31.0 // indirect
 	golang.org/x/sync v0.17.0 // indirect
 	golang.org/x/sys v0.36.0 // indirect
 	golang.org/x/term v0.35.0 // indirect
 	golang.org/x/text v0.29.0 // indirect
-	golang.org/x/time v0.12.0 // indirect
+	golang.org/x/time v0.13.0 // indirect
 	golang.org/x/tools v0.37.0 // indirect
 	gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
-	google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 // indirect
-	google.golang.org/genproto/googleapis/rpc v0.0.0-20250826171959-ef028d996bc1 // indirect
+	google.golang.org/genproto/googleapis/api v0.0.0-20250929231259-57b25ae835d4 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20250922171735-9219d122eba9 // indirect
 	google.golang.org/protobuf v1.36.10 // indirect
 	gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
 	gopkg.in/inf.v0 v0.9.1 // indirect

diff --git a/go.sum b/go.sum
diff --git a/pkg/plugins/pre-request/pd_prerequest.go b/pkg/plugins/pre-request/pd_prerequest.go
@@ -6,7 +6,6 @@ import (
 	"encoding/json"
 	"fmt"
 	"net"
-	"strconv"
 
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
@@ -68,7 +67,7 @@ func (p *PrefillHeaderHandler) WithName(name string) *PrefillHeaderHandler {
 }
 
 // PreRequest wires prefill SchedulerProfile result into a header to indicate prefill worker
-func (p *PrefillHeaderHandler) PreRequest(_ context.Context, request *types.LLMRequest, schedulingResult *types.SchedulingResult, targetPort int) {
+func (p *PrefillHeaderHandler) PreRequest(_ context.Context, request *types.LLMRequest, schedulingResult *types.SchedulingResult) {
 	if _, found := request.Headers[common.PrefillPodHeader]; found {
 		request.Headers[common.PrefillPodHeader] = "" // clear header, if already set
 	}
@@ -78,6 +77,7 @@ func (p *PrefillHeaderHandler) PreRequest(_ context.Context, request *types.LLMR
 		return // prefill profile failed to run or we chose not to run it, no-op in this case
 	}
 
-	prefillHostPort := net.JoinHostPort(prefillProfileRunResult.TargetPods[0].GetPod().Address, strconv.Itoa(targetPort))
+	targetPod := prefillProfileRunResult.TargetPods[0].GetPod()
+	prefillHostPort := net.JoinHostPort(targetPod.Address, targetPod.Port)
 	request.Headers[common.PrefillPodHeader] = prefillHostPort // in the form of <ip:port>
 }
diff --git a/pkg/plugins/scorer/active_request.go b/pkg/plugins/scorer/active_request.go
@@ -167,7 +167,7 @@ func (s *ActiveRequest) Score(ctx context.Context, _ *types.CycleState, _ *types
 // It creates a new request entry in the cache with its own TTL and
 // increments the pod count for fast lookup.
 func (s *ActiveRequest) PreRequest(ctx context.Context, request *types.LLMRequest,
-	schedulingResult *types.SchedulingResult, _ int) {
+	schedulingResult *types.SchedulingResult) {
 	debugLogger := log.FromContext(ctx).V(logutil.DEBUG)
 
 	for _, profileResult := range schedulingResult.ProfileResults { // schedulingResult guaranteed not to be nil

diff --git a/pkg/plugins/scorer/active_request_test.go b/pkg/plugins/scorer/active_request_test.go
@@ -124,7 +124,7 @@ func TestActiveRequestScorer_PreRequest(t *testing.T) {
 	}
 
 	// First request
-	scorer.PreRequest(ctx, request, schedulingResult, 0)
+	scorer.PreRequest(ctx, request, schedulingResult)
 
 	// Check cache and pod counts
 	compositeKey := "default/pod-a.test-request-1"
@@ -151,7 +151,7 @@ func TestActiveRequestScorer_PreRequest(t *testing.T) {
 		},
 	}
 
-	scorer.PreRequest(ctx, request2, schedulingResult2, 0)
+	scorer.PreRequest(ctx, request2, schedulingResult2)
 
 	// Check incremented count
 	scorer.mutex.RLock()
@@ -192,7 +192,7 @@ func TestActiveRequestScorer_ResponseComplete(t *testing.T) {
 		},
 	}
 
-	scorer.PreRequest(ctx, request, schedulingResult, 0)
+	scorer.PreRequest(ctx, request, schedulingResult)
 
 	// Verify initial state
 	compositeKey := "default/pod-a.test-request-1"
@@ -248,7 +248,7 @@ func TestActiveRequestScorer_TTLExpiration(t *testing.T) {
 	}
 
 	// Add request
-	scorer.PreRequest(ctx, request, schedulingResult, 0)
+	scorer.PreRequest(ctx, request, schedulingResult)
 
 	// Verify request is added
 	scorer.mutex.RLock()

diff --git a/pkg/plugins/scorer/no_hit_lru.go b/pkg/plugins/scorer/no_hit_lru.go
@@ -256,7 +256,7 @@ func (s *NoHitLRU) Score(ctx context.Context, cycleState *types.CycleState, requ
 
 // PreRequest is called before a request is sent to the target pod.
 // For cold requests, it updates the LRU cache to track which pods have been used recently.
-func (s *NoHitLRU) PreRequest(ctx context.Context, request *types.LLMRequest, schedulingResult *types.SchedulingResult, _ int) {
+func (s *NoHitLRU) PreRequest(ctx context.Context, request *types.LLMRequest, schedulingResult *types.SchedulingResult) {
 	logger := log.FromContext(ctx).V(logutil.DEBUG)
 
 	if schedulingResult == nil || len(schedulingResult.ProfileResults) == 0 {

diff --git a/pkg/plugins/scorer/no_hit_lru_test.go b/pkg/plugins/scorer/no_hit_lru_test.go
@@ -337,7 +337,7 @@ func TestNoHitLRUPreferLeastRecentlyUsedAfterColdRequests(t *testing.T) {
 	t.Run("initial cold request seeds cache", func(_ *testing.T) {
 		coldReqA := &types.LLMRequest{RequestId: "cold-1"}
 		scorer.Score(ctx, toPrefixState(make(map[prefix.ServerID]int)), coldReqA, pods)
-		scorer.PreRequest(ctx, coldReqA, requestToPod(podA), 0)
+		scorer.PreRequest(ctx, coldReqA, requestToPod(podA))
 		// After podA handles a cold request, other pods should score higher for new cold requests
 		assertHighestScoredPod(podB, "after-podA-used")
 	})
@@ -367,7 +367,7 @@ func TestNoHitLRUPreferLeastRecentlyUsedAfterColdRequests(t *testing.T) {
 				t.Fatalf("expected neutral score for warm request, got %f", score)
 			}
 		}
-		scorer.PreRequest(ctx, warmReq, requestToPod(podB), 0)
+		scorer.PreRequest(ctx, warmReq, requestToPod(podB))
 		postWarmReq := &types.LLMRequest{RequestId: "cold-after-warm"}
 		postWarmScores := scorer.Score(ctx, toPrefixState(make(map[prefix.ServerID]int)), postWarmReq, pods)
 		if postWarmScores[podB] <= postWarmScores[podA] {
@@ -379,7 +379,7 @@ func TestNoHitLRUPreferLeastRecentlyUsedAfterColdRequests(t *testing.T) {
 		// Simulate podB handling a cold request
 		coldReqB := &types.LLMRequest{RequestId: "cold-2"}
 		scorer.Score(ctx, toPrefixState(make(map[prefix.ServerID]int)), coldReqB, pods)
-		scorer.PreRequest(ctx, coldReqB, requestToPod(podB), 0)
+		scorer.PreRequest(ctx, coldReqB, requestToPod(podB))
 		// Now podC should score highest since both podA and podB have been used
 		assertHighestScoredPod(podC, "after-podB-used")
 	})
@@ -388,7 +388,7 @@ func TestNoHitLRUPreferLeastRecentlyUsedAfterColdRequests(t *testing.T) {
 		// Simulate podC handling a cold request
 		coldReqC := &types.LLMRequest{RequestId: "cold-3"}
 		scorer.Score(ctx, toPrefixState(make(map[prefix.ServerID]int)), coldReqC, pods)
-		scorer.PreRequest(ctx, coldReqC, requestToPod(podC), 0)
+		scorer.PreRequest(ctx, coldReqC, requestToPod(podC))
 		// Now podA should score highest again (LRU rotation)
 		assertHighestScoredPod(podA, "after-podC-used")
 	})

diff --git a/pkg/scheduling/pd/scheduler_test.go b/pkg/scheduling/pd/scheduler_test.go
@@ -266,7 +266,7 @@ func TestPDSchedule(t *testing.T) {
 
 			if test.wantRes2 != nil { // Checking the prefix match in the decode pod.
 				// make sure prefix plugin stores the prefix hit in cache, so we can test it in the following schedule call
-				prefixScorer.PreRequest(ctx, test.req, got, 0)
+				prefixScorer.PreRequest(ctx, test.req, got)
 				time.Sleep(time.Second)
 
 				got, err = scheduler.Schedule(ctx, test.req, test.input)

diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go
@@ -1,7 +1,6 @@
 package e2e
 
 import (
-	"context"
 	"fmt"
 	"io"
 	"os/exec"
@@ -14,22 +13,19 @@ import (
 	"github.com/onsi/gomega"
 	"github.com/onsi/gomega/gexec"
 	apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
-	k8sruntime "k8s.io/apimachinery/pkg/runtime"
 	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
-	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/client/config"
 	k8slog "sigs.k8s.io/controller-runtime/pkg/log"
+
+	infextv1 "sigs.k8s.io/gateway-api-inference-extension/api/v1"
 	infextv1a2 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env"
+	testutils "sigs.k8s.io/gateway-api-inference-extension/test/utils"
 )
 
 const (
-	// defaultExistsTimeout is the default timeout for a resource to exist in the api server.
-	defaultExistsTimeout = 30 * time.Second
 	// defaultReadyTimeout is the default timeout for a resource to report a ready state.
 	defaultReadyTimeout = 3 * time.Minute
-	// defaultModelReadyTimeout is the default timeout for the model server deployment to report a ready state.
-	defaultModelReadyTimeout = 10 * time.Minute
 	// defaultInterval is the default interval to check if a resource exists or ready conditions.
 	defaultInterval = time.Millisecond * 250
 	// xInferPoolManifest is the manifest for the inference pool CRD with 'inference.networking.x-k8s.io' group.
@@ -57,19 +53,16 @@ const (
 )
 
 var (
-	ctx       = context.Background()
-	k8sClient client.Client
-	port      string
-	scheme    = k8sruntime.NewScheme()
+	port string
+
+	testConfig *testutils.TestConfig
 
 	eppTag            = env.GetEnvString("EPP_TAG", "dev", ginkgo.GinkgoLogr)
 	vllmSimTag        = env.GetEnvString("VLLM_SIMULATOR_TAG", "dev", ginkgo.GinkgoLogr)
-	routingSideCarTag = env.GetEnvString("ROUTING_SIDECAR_TAG", "v0.2.0", ginkgo.GinkgoLogr)
+	routingSideCarTag = env.GetEnvString("ROUTING_SIDECAR_TAG", "dev", ginkgo.GinkgoLogr)
 
-	existsTimeout     = env.GetEnvDuration("EXISTS_TIMEOUT", defaultExistsTimeout, ginkgo.GinkgoLogr)
-	readyTimeout      = env.GetEnvDuration("READY_TIMEOUT", defaultReadyTimeout, ginkgo.GinkgoLogr)
-	modelReadyTimeout = env.GetEnvDuration("MODEL_READY_TIMEOUT", defaultModelReadyTimeout, ginkgo.GinkgoLogr)
-	interval          = defaultInterval
+	readyTimeout = env.GetEnvDuration("READY_TIMEOUT", defaultReadyTimeout, ginkgo.GinkgoLogr)
+	interval     = defaultInterval
 )
 
 func TestEndToEnd(t *testing.T) {
@@ -83,16 +76,17 @@ var _ = ginkgo.BeforeSuite(func() {
 	port = "30080"
 
 	setupK8sCluster()
+	testConfig = testutils.NewTestConfig(nsName)
 	setupK8sClient()
 	createCRDs()
 	createEnvoy()
-	applyYAMLFile(rbacManifest)
-	applyYAMLFile(serviceAccountManifest)
-	applyYAMLFile(servicesManifest)
+	testutils.ApplyYAMLFile(testConfig, rbacManifest)
+	testutils.ApplyYAMLFile(testConfig, serviceAccountManifest)
+	testutils.ApplyYAMLFile(testConfig, servicesManifest)
 
-	infPoolYaml := readYaml(inferExtManifest)
+	infPoolYaml := testutils.ReadYaml(inferExtManifest)
 	infPoolYaml = substituteMany(infPoolYaml, map[string]string{"${POOL_NAME}": modelName + "-inference-pool"})
-	createObjsFromYaml(infPoolYaml)
+	testutils.CreateObjsFromYaml(testConfig, infPoolYaml)
 })
 
 var _ = ginkgo.AfterSuite(func() {
@@ -147,32 +141,33 @@ func setupK8sClient() {
 	k8sCfg := config.GetConfigOrDie()
 	gomega.ExpectWithOffset(1, k8sCfg).NotTo(gomega.BeNil())
 
-	err := clientgoscheme.AddToScheme(scheme)
+	err := clientgoscheme.AddToScheme(testConfig.Scheme)
 	gomega.Expect(err).NotTo(gomega.HaveOccurred())
 
-	err = apiextv1.AddToScheme(scheme)
+	err = infextv1.Install(testConfig.Scheme)
 	gomega.Expect(err).NotTo(gomega.HaveOccurred())
 
-	err = infextv1a2.Install(scheme)
+	err = apiextv1.AddToScheme(testConfig.Scheme)
 	gomega.Expect(err).NotTo(gomega.HaveOccurred())
 
-	k8sClient, err = client.New(k8sCfg, client.Options{Scheme: scheme})
+	err = infextv1a2.Install(testConfig.Scheme)
 	gomega.Expect(err).NotTo(gomega.HaveOccurred())
-	gomega.Expect(k8sClient).NotTo(gomega.BeNil())
+
+	testConfig.CreateCli()
 
 	k8slog.SetLogger(ginkgo.GinkgoLogr)
 }
 
 // createCRDs creates the Inference Extension CRDs used for testing.
 func createCRDs() {
 	crds := runKustomize(gieCrdsKustomize)
-	createObjsFromYaml(crds)
+	testutils.CreateObjsFromYaml(testConfig, crds)
 }
 
 func createEnvoy() {
-	manifests := readYaml(envoyManifest)
+	manifests := testutils.ReadYaml(envoyManifest)
 	ginkgo.By("Creating envoy proxy resources from manifest: " + envoyManifest)
-	createObjsFromYaml(manifests)
+	testutils.CreateObjsFromYaml(testConfig, manifests)
 }
 
 const kindClusterConfig = `