llm-d · nirrozenbaum · Jun 24, 2025 · Jun 23, 2025 · Jun 23, 2025
diff --git a/go.mod b/go.mod
@@ -16,7 +16,7 @@ require (
 	k8s.io/client-go v0.33.1
 	sigs.k8s.io/controller-runtime v0.21.0
 	sigs.k8s.io/gateway-api v1.3.0
-	sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250619193052-1362b1474677
+	sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250623153357-7df5d3dfdafa
 )
 
 require (

diff --git a/go.sum b/go.sum
@@ -277,6 +277,8 @@ sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250618193051-f66be2d1e763 h
 sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250618193051-f66be2d1e763/go.mod h1:JYY3NYINfBa4ULLEowjqb/E+SkY1iqk4N2miCplP528=
 sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250619193052-1362b1474677 h1:ImKlQzbdg2USp/+w6kJbvrGQLVUe9ISb8n/yyJhSJ+4=
 sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250619193052-1362b1474677/go.mod h1:JYY3NYINfBa4ULLEowjqb/E+SkY1iqk4N2miCplP528=
+sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250623153357-7df5d3dfdafa h1:A9+jI1H50lG6tHMOaQfk6FiKx+1ehCbU1X947cVlCu4=
+sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250623153357-7df5d3dfdafa/go.mod h1:JYY3NYINfBa4ULLEowjqb/E+SkY1iqk4N2miCplP528=
 sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8=
 sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo=
 sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=

diff --git a/pkg/plugins/filter/by_labels.go b/pkg/plugins/filter/by_labels.go
@@ -36,13 +36,13 @@ type ByLabels struct {
 	selector labels.Selector
 }
 
-// Name returns the name of the filter
-func (blf *ByLabels) Name() string {
+// Type returns the type of the filter
+func (blf *ByLabels) Type() string {
 	return blf.name
 }
 
 // Filter filters out all pods that do not satisfy the label selector
-func (blf *ByLabels) Filter(_ context.Context, _ *types.LLMRequest, _ *types.CycleState, pods []types.Pod) []types.Pod {
+func (blf *ByLabels) Filter(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) []types.Pod {
 	filtered := []types.Pod{}
 
 	for _, pod := range pods {

diff --git a/pkg/plugins/filter/pd_role_filter.go b/pkg/plugins/filter/pd_role_filter.go
@@ -24,13 +24,13 @@ var _ framework.Filter = &PrefillFilter{}
 // PrefillFilter - filters out pods that are not marked with role Prefill
 type PrefillFilter struct{}
 
-// Name returns the name of the filter
-func (pf *PrefillFilter) Name() string {
+// Type returns the type of the filter
+func (pf *PrefillFilter) Type() string {
 	return "prefill-filter"
 }
 
 // Filter filters out all pods that are not marked as "prefill"
-func (pf *PrefillFilter) Filter(_ context.Context, _ *types.LLMRequest, _ *types.CycleState, pods []types.Pod) []types.Pod {
+func (pf *PrefillFilter) Filter(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) []types.Pod {
 	filteredPods := []types.Pod{}
 
 	for _, pod := range pods {
@@ -48,13 +48,13 @@ var _ framework.Filter = &DecodeFilter{}
 // DecodeFilter - filters out pods that are not marked with role Decode or Both
 type DecodeFilter struct{}
 
-// Name returns the name of the filter
-func (df *DecodeFilter) Name() string {
+// Type returns the type of the filter
+func (df *DecodeFilter) Type() string {
 	return "decode-filter"
 }
 
 // Filter removes all pods that are not marked as "decode" or "both"
-func (df *DecodeFilter) Filter(_ context.Context, _ *types.LLMRequest, _ *types.CycleState, pods []types.Pod) []types.Pod {
+func (df *DecodeFilter) Filter(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) []types.Pod {
 	filteredPods := []types.Pod{}
 
 	for _, pod := range pods {

diff --git a/pkg/plugins/pre-request/pd_prerequest.go b/pkg/plugins/pre-request/pd_prerequest.go
@@ -25,8 +25,8 @@ func NewPrefillHeaderHandler() *PrefillHeaderHandler {
 // PrefillHeaderHandler PreRequest plugin
 type PrefillHeaderHandler struct{}
 
-// Name returns the PreRequest plugin name
-func (p *PrefillHeaderHandler) Name() string {
+// Type returns the type of the PreRequest plugin.
+func (p *PrefillHeaderHandler) Type() string {
 	return "prefill-header"
 }
 

diff --git a/pkg/plugins/profile/pd-profile-handler.go b/pkg/plugins/profile/pd-profile-handler.go
@@ -35,14 +35,14 @@ type PdProfileHandler struct {
 	prefixScorer *scorer.PrefixAwareScorer
 }
 
-// Name returns the name of the Profile Handler.
-func (h *PdProfileHandler) Name() string {
+// Type returns the type of the Profile Handler.
+func (h *PdProfileHandler) Type() string {
 	return name
 }
 
 // Pick selects the SchedulingProfiles to run from the list of candidate profiles, while taking into consideration the request properties and the
 // previously executed cycles along with their results.
-func (h *PdProfileHandler) Pick(ctx context.Context, request *types.LLMRequest, profiles map[string]*framework.SchedulerProfile,
+func (h *PdProfileHandler) Pick(ctx context.Context, _ *types.CycleState, request *types.LLMRequest, profiles map[string]*framework.SchedulerProfile,
 	profileResults map[string]*types.ProfileRunResult) map[string]*framework.SchedulerProfile {
 	if _, executed := profileResults[decode]; !executed {
 		// if decode profile was not executed yet, first let the scheduler run the decode profile
@@ -77,7 +77,7 @@ func (h *PdProfileHandler) Pick(ctx context.Context, request *types.LLMRequest,
 // ProcessResults handles the outcome of the profile runs after the selected profiles ran.
 // In case of an error in any of the profiles, the matching entry in the profileResults will contain nil, to indicate there was
 // an error while running the profile.
-func (h *PdProfileHandler) ProcessResults(_ context.Context, _ *types.LLMRequest,
+func (h *PdProfileHandler) ProcessResults(_ context.Context, _ *types.CycleState, _ *types.LLMRequest,
 	profileResults map[string]*types.ProfileRunResult) (*types.SchedulingResult, error) {
 	if profileResults[decode] == nil { // if decode profile failed to run, we should fail
 		return nil, errors.New("failed to find available decode workers")

diff --git a/pkg/plugins/scorer/kvcache-aware.go b/pkg/plugins/scorer/kvcache-aware.go
@@ -73,14 +73,14 @@ type KVCacheAwareScorer struct {
 	kvCacheIndexer *kvcache.Indexer
 }
 
-// Name returns the name of the scorer.
-func (s *KVCacheAwareScorer) Name() string {
+// Type returns the type of the scorer.
+func (s *KVCacheAwareScorer) Type() string {
 	return kvCacheAwareScorerName
 }
 
 // Score scores the provided pod based on the KVCache index state.
 // The returned scores are normalized to a range of 0-1.
-func (s *KVCacheAwareScorer) Score(ctx context.Context, request *types.LLMRequest, _ *types.CycleState, pods []types.Pod) map[types.Pod]float64 {
+func (s *KVCacheAwareScorer) Score(ctx context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
 	loggerDebug := log.FromContext(ctx).WithName(kvCacheAwareScorerName).V(logutil.DEBUG)
 	if request == nil {
 		loggerDebug.Info("Request is nil, skipping scoring")

diff --git a/pkg/plugins/scorer/load_aware_scorer.go b/pkg/plugins/scorer/load_aware_scorer.go
@@ -29,8 +29,8 @@ type LoadAwareScorer struct {
 	queueThreshold float64
 }
 
-// Name returns the scorer's name
-func (s *LoadAwareScorer) Name() string {
+// Type returns the type of the scorer.
+func (s *LoadAwareScorer) Type() string {
 	return "load-aware-scorer"
 }
 
@@ -41,7 +41,7 @@ func (s *LoadAwareScorer) Name() string {
 // Pod with requests in the queue will get score between 0.5 and 0.
 // Score 0 will get pod with number of requests in the queue equal to the threshold used in load-based filter (QueueingThresholdLoRA)
 // In future pods with additional capacity will get score higher than 0.5
-func (s *LoadAwareScorer) Score(_ context.Context, _ *types.LLMRequest, _ *types.CycleState, pods []types.Pod) map[types.Pod]float64 {
+func (s *LoadAwareScorer) Score(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
 	scoredPods := make(map[types.Pod]float64)
 
 	for _, pod := range pods {

diff --git a/pkg/plugins/scorer/prefix_aware.go b/pkg/plugins/scorer/prefix_aware.go
@@ -55,13 +55,13 @@ type PrefixAwareScorer struct {
 	podToPromptHits sync.Map
 }
 
-// Name returns the scorer's name
-func (s *PrefixAwareScorer) Name() string {
+// Type returns the type of the scorer.
+func (s *PrefixAwareScorer) Type() string {
 	return "prefix-aware-scorer"
 }
 
 // Score scores the target pods based on the longest prefix match.
-func (s *PrefixAwareScorer) Score(ctx context.Context, request *types.LLMRequest, _ *types.CycleState, pods []types.Pod) map[types.Pod]float64 {
+func (s *PrefixAwareScorer) Score(ctx context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
 	loggerDebug := log.FromContext(ctx).WithName(prefixAwareScorerName).V(logutil.DEBUG)
 	if request == nil {
 		loggerDebug.Info("Request is nil, skipping scoring")

diff --git a/pkg/plugins/scorer/prefix_aware_test.go b/pkg/plugins/scorer/prefix_aware_test.go
@@ -127,7 +127,7 @@ func TestPrefixAwareScorer(t *testing.T) {
 
 			// Score pods
 			pods := []types.Pod{pod1, pod2}
-			scores := s.Score(context.Background(), request, nil, pods)
+			scores := s.Score(context.Background(), nil, request, pods)
 
 			for p, score := range scores {
 				if score != test.expectedScores[p] {
@@ -168,7 +168,7 @@ func TestPrefixAwareScorerProfiling(t *testing.T) {
 			pods = append(pods, v)
 		}
 
-		scores := s.Score(context.Background(), request, nil, pods)
+		scores := s.Score(context.Background(), nil, request, pods)
 
 		highestScore := scores[name2Pod["pod"+strconv.Itoa(nPodsInStore-1)]]
 		if highestScore < 0.99 {

diff --git a/pkg/plugins/scorer/session_affinity.go b/pkg/plugins/scorer/session_affinity.go
@@ -35,13 +35,13 @@ func NewSessionAffinity() *SessionAffinity {
 type SessionAffinity struct {
 }
 
-// Name returns the scorer's name
-func (s *SessionAffinity) Name() string {
+// Type returns the type of the scorer.
+func (s *SessionAffinity) Type() string {
 	return "session-affinity-scorer"
 }
 
 // Score assign a high score to the pod used in previous requests and zero to others
-func (s *SessionAffinity) Score(ctx context.Context, request *types.LLMRequest, _ *types.CycleState, pods []types.Pod) map[types.Pod]float64 {
+func (s *SessionAffinity) Score(ctx context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
 	scoredPods := make(map[types.Pod]float64)
 	sessionToken := request.Headers[sessionTokenHeader]
 	podName := ""