Skip to content

Commit 0921f39

Browse files
committed
import igw@v1.5.0-rc.1
Signed-off-by: bobzetian <bobzetian@google.com>
1 parent d07d9b2 commit 0921f39

16 files changed (+113, −110 lines)

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ require (
3131
k8s.io/utils v0.0.0-20260108192941-914a6e750570
3232
sigs.k8s.io/controller-runtime v0.23.3
3333
sigs.k8s.io/gateway-api v1.5.1
34-
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20260324083816-c5a0052e14a4
34+
sigs.k8s.io/gateway-api-inference-extension v1.5.0-rc.1
3535
)
3636

3737
require (

pkg/plugins/multi/context_length_aware.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ func estimateContextLength(request *scheduling.LLMRequest) int {
237237

238238
// Handle regular completions
239239
if request.Body.Completions != nil {
240-
totalChars += len(request.Body.Completions.Prompt)
240+
totalChars += len(request.Body.Completions.Prompt.PlainText())
241241
}
242242

243243
// Convert characters to approximate token count

pkg/plugins/multi/context_length_aware_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ func TestContextLengthAwareWithTokenizedPromptInCycleState(t *testing.T) {
278278
TargetModel: "test-model",
279279
Body: &scheduling.LLMRequestBody{
280280
Completions: &scheduling.CompletionsRequest{
281-
Prompt: "some prompt text",
281+
Prompt: scheduling.Prompt{Raw: "some prompt text"},
282282
},
283283
},
284284
}
@@ -314,7 +314,7 @@ func TestContextLengthAwareFallbackWithoutTokenizedPrompt(t *testing.T) {
314314
TargetModel: "test-model",
315315
Body: &scheduling.LLMRequestBody{
316316
Completions: &scheduling.CompletionsRequest{
317-
Prompt: prompt,
317+
Prompt: scheduling.Prompt{Raw: prompt},
318318
},
319319
},
320320
}

pkg/plugins/preparedata/tokenizer.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,8 @@ func (p *TokenizerPlugin) tokenize(ctx context.Context, request *scheduling.LLMR
178178

179179
switch {
180180
case request.Body.Completions != nil:
181-
traceLogger.Info("Calling Render for completions", "prompt", request.Body.Completions.Prompt)
182-
tokenIDs, _, err = p.tokenizer.Render(request.Body.Completions.Prompt)
181+
traceLogger.Info("Calling Render for completions", "prompt", request.Body.Completions.Prompt.PlainText())
182+
tokenIDs, _, err = p.tokenizer.Render(request.Body.Completions.Prompt.PlainText())
183183
case request.Body.ChatCompletions != nil:
184184
renderReq := ChatCompletionsToRenderChatRequest(request.Body.ChatCompletions)
185185
traceLogger.Info("Calling RenderChat for chat completions", "messageCount", len(request.Body.ChatCompletions.Messages))

pkg/plugins/preparedata/tokenizer_scorer_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ func TestTokenizerScorer_Score(t *testing.T) {
8282
RequestId: "completions",
8383
Body: &scheduling.LLMRequestBody{
8484
Completions: &scheduling.CompletionsRequest{
85-
Prompt: "The quick brown fox",
85+
Prompt: scheduling.Prompt{Raw: "The quick brown fox"},
8686
},
8787
},
8888
},
@@ -109,7 +109,7 @@ func TestTokenizerScorer_Score(t *testing.T) {
109109
request: &scheduling.LLMRequest{
110110
RequestId: "fail-open",
111111
Body: &scheduling.LLMRequestBody{
112-
Completions: &scheduling.CompletionsRequest{Prompt: "fail"},
112+
Completions: &scheduling.CompletionsRequest{Prompt: scheduling.Prompt{Raw: "fail"}},
113113
},
114114
},
115115
tokenizer: &mockTokenizer{
@@ -169,7 +169,7 @@ func TestTokenizerScorer_SkipsWhenAlreadyInCycleState(t *testing.T) {
169169
request := &scheduling.LLMRequest{
170170
RequestId: "already-tokenized",
171171
Body: &scheduling.LLMRequestBody{
172-
Completions: &scheduling.CompletionsRequest{Prompt: "hello"},
172+
Completions: &scheduling.CompletionsRequest{Prompt: scheduling.Prompt{Raw: "hello"}},
173173
},
174174
}
175175

@@ -292,7 +292,7 @@ func TestTokenizerScorer_Render_NilMMFeatures(t *testing.T) {
292292
request := &scheduling.LLMRequest{
293293
RequestId: "text-completions",
294294
Body: &scheduling.LLMRequestBody{
295-
Completions: &scheduling.CompletionsRequest{Prompt: "hello"},
295+
Completions: &scheduling.CompletionsRequest{Prompt: scheduling.Prompt{Raw: "hello"}},
296296
},
297297
}
298298

pkg/plugins/profile/disagg_profile_handler_test.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ func profileNames(m map[string]scheduling.SchedulerProfile) []string {
6767
func completionsRequest(prompt string) *scheduling.LLMRequest {
6868
return &scheduling.LLMRequest{
6969
Body: &scheduling.LLMRequestBody{
70-
Completions: &scheduling.CompletionsRequest{Prompt: prompt},
70+
Completions: &scheduling.CompletionsRequest{Prompt: scheduling.Prompt{Raw: prompt}},
7171
},
7272
}
7373
}
@@ -95,7 +95,7 @@ func chatRequest(hasImage, hasVideo, hasAudio bool) *scheduling.LLMRequest {
9595

9696
// withPrompt adds a completions body to a chat request so the PD decider can estimate tokens.
9797
func withPrompt(req *scheduling.LLMRequest, prompt string) *scheduling.LLMRequest {
98-
req.Body.Completions = &scheduling.CompletionsRequest{Prompt: prompt}
98+
req.Body.Completions = &scheduling.CompletionsRequest{Prompt: scheduling.Prompt{Raw: prompt}}
9999
return req
100100
}
101101

@@ -404,7 +404,7 @@ func TestDisaggProfileHandler_Pick_PD(t *testing.T) {
404404
h := NewDisaggProfileHandler(defaultDecodeProfile, defaultPrefillProfile, "",
405405
decider, nil)
406406

407-
inputTokens := len(req.Body.Completions.Prompt) / AverageCharactersPerToken
407+
inputTokens := len(req.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken
408408
injectPrefixCache(tt.profileResults, tt.cachedTokens, inputTokens)
409409

410410
got := h.Pick(ctx, nil, req, profiles, tt.profileResults)
@@ -463,7 +463,7 @@ func TestDisaggProfileHandler_Pick_PD_Series(t *testing.T) {
463463
want []string
464464
}{
465465
{short, 0, []string{defaultPrefillProfile}},
466-
{short, len(short.Body.Completions.Prompt) / AverageCharactersPerToken, []string{}},
466+
{short, len(short.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken, []string{}},
467467
},
468468
},
469469
{
@@ -475,7 +475,7 @@ func TestDisaggProfileHandler_Pick_PD_Series(t *testing.T) {
475475
want []string
476476
}{
477477
{short, 0, []string{defaultPrefillProfile}},
478-
{long, len(short.Body.Completions.Prompt) / AverageCharactersPerToken, []string{defaultPrefillProfile}},
478+
{long, len(short.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken, []string{defaultPrefillProfile}},
479479
},
480480
},
481481
}
@@ -492,7 +492,7 @@ func TestDisaggProfileHandler_Pick_PD_Series(t *testing.T) {
492492
results := map[string]*scheduling.ProfileRunResult{
493493
defaultDecodeProfile: makeProfileRunResult("pod1"),
494494
}
495-
inputTokens := len(step.req.Body.Completions.Prompt) / AverageCharactersPerToken
495+
inputTokens := len(step.req.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken
496496
injectPrefixCache(results, step.cachedTokens, inputTokens)
497497
got := h.Pick(ctx, &scheduling.CycleState{}, step.req, profiles, results)
498498
assert.ElementsMatch(t, step.want, profileNames(got))
@@ -910,7 +910,7 @@ func TestDisaggProfileHandler_Pick_EPD_Full(t *testing.T) {
910910

911911
inputTokens := 0
912912
if tt.req.Body.Completions != nil {
913-
inputTokens = len(tt.req.Body.Completions.Prompt) / AverageCharactersPerToken
913+
inputTokens = len(tt.req.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken
914914
} else if tt.req.Body.ChatCompletions != nil {
915915
b, _ := json.Marshal(tt.req.Body.ChatCompletions.Messages)
916916
inputTokens = len(b) / AverageCharactersPerToken
@@ -1136,7 +1136,7 @@ func TestDisaggProfileHandler_Pick_NilDeciders(t *testing.T) {
11361136

11371137
// Inject prefix cache if needed for PD decider
11381138
if tt.req.Body.Completions != nil {
1139-
inputTokens := len(tt.req.Body.Completions.Prompt) / AverageCharactersPerToken
1139+
inputTokens := len(tt.req.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken
11401140
injectPrefixCache(tt.results, 0, inputTokens)
11411141
}
11421142

pkg/plugins/profile/pd_profile_handler.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ import (
2525
const (
2626
// PdProfileHandlerType is a legacy alias for DisaggProfileHandlerType.
2727
PdProfileHandlerType = "pd-profile-handler"
28-
defaultPrefixPluginType = prefix.PrefixCachePluginType
28+
defaultPrefixPluginType = prefix.PrefixCacheScorerPluginType
2929
defaultDeciderPluginName = PrefixBasedPDDeciderPluginType
3030
)
3131

pkg/plugins/profile/pd_profile_handler_test.go

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ func createRequest(prompt string) *scheduling.LLMRequest {
217217
return &scheduling.LLMRequest{
218218
Body: &scheduling.LLMRequestBody{
219219
Completions: &scheduling.CompletionsRequest{
220-
Prompt: prompt,
220+
Prompt: scheduling.Prompt{Raw: prompt},
221221
},
222222
},
223223
}
@@ -257,16 +257,16 @@ func TestPdProfileHandler_Pick(t *testing.T) {
257257
{
258258
name: "decode not executed yet → run decode",
259259
nonCachedTokensLimit: 10,
260-
prefixPluginType: prefix.PrefixCachePluginType,
261-
prefixPluginName: prefix.PrefixCachePluginType,
260+
prefixPluginType: prefix.PrefixCacheScorerPluginType,
261+
prefixPluginName: prefix.PrefixCacheScorerPluginType,
262262
profileResults: map[string]*scheduling.ProfileRunResult{},
263263
expectedProfiles: []string{defaultDecodeProfile},
264264
},
265265
{
266266
name: "decode failed (nil result) → run nothing",
267267
nonCachedTokensLimit: 10,
268-
prefixPluginType: prefix.PrefixCachePluginType,
269-
prefixPluginName: prefix.PrefixCachePluginType,
268+
prefixPluginType: prefix.PrefixCacheScorerPluginType,
269+
prefixPluginName: prefix.PrefixCacheScorerPluginType,
270270
profileResults: map[string]*scheduling.ProfileRunResult{
271271
defaultDecodeProfile: nil,
272272
},
@@ -275,8 +275,8 @@ func TestPdProfileHandler_Pick(t *testing.T) {
275275
{
276276
name: "all profiles already executed → run nothing",
277277
nonCachedTokensLimit: 10,
278-
prefixPluginType: prefix.PrefixCachePluginType,
279-
prefixPluginName: prefix.PrefixCachePluginType,
278+
prefixPluginType: prefix.PrefixCacheScorerPluginType,
279+
prefixPluginName: prefix.PrefixCacheScorerPluginType,
280280
profileResults: map[string]*scheduling.ProfileRunResult{
281281
defaultDecodeProfile: newMockProfileRunResult(DefaultTestPodPort, "pod1"),
282282
defaultPrefillProfile: newMockProfileRunResult(DefaultTestPodPort, "pod2"),
@@ -289,8 +289,8 @@ func TestPdProfileHandler_Pick(t *testing.T) {
289289
// In this case: prompt length is 35 chars (8 tokens), cached length is 2 tokens -> disaggregated prefill should trigger
290290
nonCachedTokensLimit: 4,
291291
cachedTokens: 2,
292-
prefixPluginType: prefix.PrefixCachePluginType,
293-
prefixPluginName: prefix.PrefixCachePluginType,
292+
prefixPluginType: prefix.PrefixCacheScorerPluginType,
293+
prefixPluginName: prefix.PrefixCacheScorerPluginType,
294294
profileResults: map[string]*scheduling.ProfileRunResult{
295295
defaultDecodeProfile: newMockProfileRunResult(DefaultTestPodPort, "pod1"),
296296
},
@@ -302,8 +302,8 @@ func TestPdProfileHandler_Pick(t *testing.T) {
302302
// In this case: prompt length is 35 chars (8 tokens), cached length is 5 tokens -> skip prefill
303303
nonCachedTokensLimit: 4,
304304
cachedTokens: 5,
305-
prefixPluginType: prefix.PrefixCachePluginType,
306-
prefixPluginName: prefix.PrefixCachePluginType,
305+
prefixPluginType: prefix.PrefixCacheScorerPluginType,
306+
prefixPluginName: prefix.PrefixCacheScorerPluginType,
307307
profileResults: map[string]*scheduling.ProfileRunResult{
308308
defaultDecodeProfile: newMockProfileRunResult(DefaultTestPodPort, "pod1"),
309309
},
@@ -327,7 +327,7 @@ func TestPdProfileHandler_Pick(t *testing.T) {
327327
assert.NoError(t, err)
328328

329329
// set prefix to the given cached tokens number for pod "pod1" in decode profile results
330-
inputTokens := len(request.Body.Completions.Prompt) / AverageCharactersPerToken
330+
inputTokens := len(request.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken
331331

332332
for profileName, profileRes := range tt.profileResults {
333333
if profileName == defaultDecodeProfile && profileRes != nil {
@@ -377,7 +377,7 @@ func TestPdProfileHandler_PickSeries(t *testing.T) {
377377
expectedProfiles: []string{defaultPrefillProfile},
378378
}, {
379379
request: request,
380-
cachedTokens: len(request.Body.Completions.Prompt) / AverageCharactersPerToken,
380+
cachedTokens: len(request.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken,
381381
expectedProfiles: []string{},
382382
}},
383383
}, {
@@ -391,7 +391,7 @@ func TestPdProfileHandler_PickSeries(t *testing.T) {
391391
expectedProfiles: []string{defaultPrefillProfile},
392392
}, {
393393
request: longerRequest,
394-
cachedTokens: len(request.Body.Completions.Prompt) / AverageCharactersPerToken,
394+
cachedTokens: len(request.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken,
395395
expectedProfiles: []string{},
396396
}},
397397
}, {
@@ -405,7 +405,7 @@ func TestPdProfileHandler_PickSeries(t *testing.T) {
405405
expectedProfiles: []string{defaultPrefillProfile},
406406
}, {
407407
request: longRequest,
408-
cachedTokens: len(request.Body.Completions.Prompt) / AverageCharactersPerToken,
408+
cachedTokens: len(request.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken,
409409
expectedProfiles: []string{defaultPrefillProfile},
410410
}},
411411
},
@@ -419,8 +419,8 @@ func TestPdProfileHandler_PickSeries(t *testing.T) {
419419
handler, err := NewPdProfileHandler(
420420
defaultPrefillProfile,
421421
defaultDecodeProfile,
422-
prefix.PrefixCachePluginType,
423-
prefix.PrefixCachePluginType,
422+
prefix.PrefixCacheScorerPluginType,
423+
prefix.PrefixCacheScorerPluginType,
424424
0,
425425
deciderPlugin,
426426
)
@@ -431,7 +431,7 @@ func TestPdProfileHandler_PickSeries(t *testing.T) {
431431
cs := &scheduling.CycleState{}
432432

433433
// set prefix to the given cached tokens number for pod "pod1" in decode profile results
434-
inputTokens := len(innerTest.request.Body.Completions.Prompt) / AverageCharactersPerToken
434+
inputTokens := len(innerTest.request.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken
435435

436436
for profileName, profileRes := range profileResults {
437437
if profileName == defaultDecodeProfile && profileRes != nil {
@@ -519,8 +519,8 @@ func TestPdProfileHandler_ProcessResults(t *testing.T) {
519519
handler, err := NewPdProfileHandler(
520520
defaultPrefillProfile,
521521
defaultDecodeProfile,
522-
prefix.PrefixCachePluginType,
523-
prefix.PrefixCachePluginType,
522+
prefix.PrefixCacheScorerPluginType,
523+
prefix.PrefixCacheScorerPluginType,
524524
tt.primaryPort,
525525
deciderPlugin,
526526
)

pkg/plugins/profile/prefix_based_pd_decider.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ func getUserInputLenInTokens(request *scheduling.LLMRequest) (int, error) {
153153
return 0, errors.New("request or request body is nil")
154154
}
155155
if request.Body.Completions != nil {
156-
return len(request.Body.Completions.Prompt) / AverageCharactersPerToken, nil
156+
return len(request.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken, nil
157157
}
158158
if request.Body.ChatCompletions == nil {
159159
return 0, errors.New("request has neither completions nor chat completions body")

pkg/plugins/scorer/no_hit_lru.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/interface/plugin"
1212
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/interface/requestcontrol"
1313
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/interface/scheduling"
14+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/plugins/requestcontrol/dataproducer/approximateprefix"
1415
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/plugins/scheduling/scorer/prefix"
1516
)
1617

@@ -66,7 +67,7 @@ func NoHitLRUFactory(name string, rawParameters json.RawMessage, handle plugin.H
6667
}
6768

6869
if parameters.PrefixPluginName == "" {
69-
parameters.PrefixPluginName = prefix.PrefixCachePluginType
70+
parameters.PrefixPluginName = prefix.PrefixCacheScorerPluginType
7071
}
7172

7273
// Note: We don't enforce that the prefix plugin exists here
@@ -77,8 +78,8 @@ func NoHitLRUFactory(name string, rawParameters json.RawMessage, handle plugin.H
7778

7879
// NewNoHitLRU creates a new NoHitLRU scorer
7980
func NewNoHitLRU(ctx context.Context, params *NoHitLRUParameters) *NoHitLRU {
80-
prefixPluginType := prefix.PrefixCachePluginType
81-
prefixPluginName := prefix.PrefixCachePluginType
81+
prefixPluginType := prefix.PrefixCacheScorerPluginType
82+
prefixPluginName := prefix.PrefixCacheScorerPluginType
8283
lruSize := defaultLRUSize
8384

8485
if params != nil {
@@ -140,7 +141,7 @@ func (s *NoHitLRU) isColdRequest(ctx context.Context, cycleState *scheduling.Cyc
140141

141142
// Read prefix cache state to determine if this is a cold request
142143
// This is treated as an optimization - if the state isn't available, we assume cold request
143-
prefixState, err := scheduling.ReadCycleStateKey[*prefix.SchedulingContextState](cycleState, plugin.StateKey(s.prefixPluginTypedName.String()))
144+
prefixState, err := scheduling.ReadCycleStateKey[*approximateprefix.SchedulingContextState](cycleState, plugin.StateKey(s.prefixPluginTypedName.String()))
144145

145146
if err != nil {
146147
logger.Info("No prefix cache state found, treating as cold request for LRU optimization", "error", err)

0 commit comments

Comments (0)