Skip to content

Commit 0921f39

Browse files
committed
import igw@v1.5.0-rc.1
Signed-off-by: bobzetian <bobzetian@google.com>
1 parent d07d9b2 commit 0921f39

16 files changed (+113, −110 lines)

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ require (
3131
k8s.io/utils v0.0.0-20260108192941-914a6e750570
3232
sigs.k8s.io/controller-runtime v0.23.3
3333
sigs.k8s.io/gateway-api v1.5.1
34-
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20260324083816-c5a0052e14a4
34+
sigs.k8s.io/gateway-api-inference-extension v1.5.0-rc.1
3535
)
3636

3737
require (

pkg/plugins/multi/context_length_aware.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ func estimateContextLength(request *scheduling.LLMRequest) int {
237237

238238
// Handle regular completions
239239
if request.Body.Completions != nil {
240-
totalChars += len(request.Body.Completions.Prompt)
240+
totalChars += len(request.Body.Completions.Prompt.PlainText())
241241
}
242242

243243
// Convert characters to approximate token count

pkg/plugins/multi/context_length_aware_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ func TestContextLengthAwareWithTokenizedPromptInCycleState(t *testing.T) {
278278
TargetModel: "test-model",
279279
Body: &scheduling.LLMRequestBody{
280280
Completions: &scheduling.CompletionsRequest{
281-
Prompt: "some prompt text",
281+
Prompt: scheduling.Prompt{Raw: "some prompt text"},
282282
},
283283
},
284284
}
@@ -314,7 +314,7 @@ func TestContextLengthAwareFallbackWithoutTokenizedPrompt(t *testing.T) {
314314
TargetModel: "test-model",
315315
Body: &scheduling.LLMRequestBody{
316316
Completions: &scheduling.CompletionsRequest{
317-
Prompt: prompt,
317+
Prompt: scheduling.Prompt{Raw: prompt},
318318
},
319319
},
320320
}

pkg/plugins/preparedata/tokenizer.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,8 @@ func (p *TokenizerPlugin) tokenize(ctx context.Context, request *scheduling.LLMR
178178

179179
switch {
180180
case request.Body.Completions != nil:
181-
traceLogger.Info("Calling Render for completions", "prompt", request.Body.Completions.Prompt)
182-
tokenIDs, _, err = p.tokenizer.Render(request.Body.Completions.Prompt)
181+
traceLogger.Info("Calling Render for completions", "prompt", request.Body.Completions.Prompt.PlainText())
182+
tokenIDs, _, err = p.tokenizer.Render(request.Body.Completions.Prompt.PlainText())
183183
case request.Body.ChatCompletions != nil:
184184
renderReq := ChatCompletionsToRenderChatRequest(request.Body.ChatCompletions)
185185
traceLogger.Info("Calling RenderChat for chat completions", "messageCount", len(request.Body.ChatCompletions.Messages))

pkg/plugins/preparedata/tokenizer_scorer_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ func TestTokenizerScorer_Score(t *testing.T) {
8282
RequestId: "completions",
8383
Body: &scheduling.LLMRequestBody{
8484
Completions: &scheduling.CompletionsRequest{
85-
Prompt: "The quick brown fox",
85+
Prompt: scheduling.Prompt{Raw: "The quick brown fox"},
8686
},
8787
},
8888
},
@@ -109,7 +109,7 @@ func TestTokenizerScorer_Score(t *testing.T) {
109109
request: &scheduling.LLMRequest{
110110
RequestId: "fail-open",
111111
Body: &scheduling.LLMRequestBody{
112-
Completions: &scheduling.CompletionsRequest{Prompt: "fail"},
112+
Completions: &scheduling.CompletionsRequest{Prompt: scheduling.Prompt{Raw: "fail"}},
113113
},
114114
},
115115
tokenizer: &mockTokenizer{
@@ -169,7 +169,7 @@ func TestTokenizerScorer_SkipsWhenAlreadyInCycleState(t *testing.T) {
169169
request := &scheduling.LLMRequest{
170170
RequestId: "already-tokenized",
171171
Body: &scheduling.LLMRequestBody{
172-
Completions: &scheduling.CompletionsRequest{Prompt: "hello"},
172+
Completions: &scheduling.CompletionsRequest{Prompt: scheduling.Prompt{Raw: "hello"}},
173173
},
174174
}
175175

@@ -292,7 +292,7 @@ func TestTokenizerScorer_Render_NilMMFeatures(t *testing.T) {
292292
request := &scheduling.LLMRequest{
293293
RequestId: "text-completions",
294294
Body: &scheduling.LLMRequestBody{
295-
Completions: &scheduling.CompletionsRequest{Prompt: "hello"},
295+
Completions: &scheduling.CompletionsRequest{Prompt: scheduling.Prompt{Raw: "hello"}},
296296
},
297297
}
298298

pkg/plugins/profile/disagg_profile_handler_test.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ func profileNames(m map[string]scheduling.SchedulerProfile) []string {
6767
func completionsRequest(prompt string) *scheduling.LLMRequest {
6868
return &scheduling.LLMRequest{
6969
Body: &scheduling.LLMRequestBody{
70-
Completions: &scheduling.CompletionsRequest{Prompt: prompt},
70+
Completions: &scheduling.CompletionsRequest{Prompt: scheduling.Prompt{Raw: prompt}},
7171
},
7272
}
7373
}
@@ -95,7 +95,7 @@ func chatRequest(hasImage, hasVideo, hasAudio bool) *scheduling.LLMRequest {
9595

9696
// withPrompt adds a completions body to a chat request so the PD decider can estimate tokens.
9797
func withPrompt(req *scheduling.LLMRequest, prompt string) *scheduling.LLMRequest {
98-
req.Body.Completions = &scheduling.CompletionsRequest{Prompt: prompt}
98+
req.Body.Completions = &scheduling.CompletionsRequest{Prompt: scheduling.Prompt{Raw: prompt}}
9999
return req
100100
}
101101

@@ -404,7 +404,7 @@ func TestDisaggProfileHandler_Pick_PD(t *testing.T) {
404404
h := NewDisaggProfileHandler(defaultDecodeProfile, defaultPrefillProfile, "",
405405
decider, nil)
406406

407-
inputTokens := len(req.Body.Completions.Prompt) / AverageCharactersPerToken
407+
inputTokens := len(req.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken
408408
injectPrefixCache(tt.profileResults, tt.cachedTokens, inputTokens)
409409

410410
got := h.Pick(ctx, nil, req, profiles, tt.profileResults)
@@ -463,7 +463,7 @@ func TestDisaggProfileHandler_Pick_PD_Series(t *testing.T) {
463463
want []string
464464
}{
465465
{short, 0, []string{defaultPrefillProfile}},
466-
{short, len(short.Body.Completions.Prompt) / AverageCharactersPerToken, []string{}},
466+
{short, len(short.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken, []string{}},
467467
},
468468
},
469469
{
@@ -475,7 +475,7 @@ func TestDisaggProfileHandler_Pick_PD_Series(t *testing.T) {
475475
want []string
476476
}{
477477
{short, 0, []string{defaultPrefillProfile}},
478-
{long, len(short.Body.Completions.Prompt) / AverageCharactersPerToken, []string{defaultPrefillProfile}},
478+
{long, len(short.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken, []string{defaultPrefillProfile}},
479479
},
480480
},
481481
}
@@ -492,7 +492,7 @@ func TestDisaggProfileHandler_Pick_PD_Series(t *testing.T) {
492492
results := map[string]*scheduling.ProfileRunResult{
493493
defaultDecodeProfile: makeProfileRunResult("pod1"),
494494
}
495-
inputTokens := len(step.req.Body.Completions.Prompt) / AverageCharactersPerToken
495+
inputTokens := len(step.req.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken
496496
injectPrefixCache(results, step.cachedTokens, inputTokens)
497497
got := h.Pick(ctx, &scheduling.CycleState{}, step.req, profiles, results)
498498
assert.ElementsMatch(t, step.want, profileNames(got))
@@ -910,7 +910,7 @@ func TestDisaggProfileHandler_Pick_EPD_Full(t *testing.T) {
910910

911911
inputTokens := 0
912912
if tt.req.Body.Completions != nil {
913-
inputTokens = len(tt.req.Body.Completions.Prompt) / AverageCharactersPerToken
913+
inputTokens = len(tt.req.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken
914914
} else if tt.req.Body.ChatCompletions != nil {
915915
b, _ := json.Marshal(tt.req.Body.ChatCompletions.Messages)
916916
inputTokens = len(b) / AverageCharactersPerToken
@@ -1136,7 +1136,7 @@ func TestDisaggProfileHandler_Pick_NilDeciders(t *testing.T) {
11361136

11371137
// Inject prefix cache if needed for PD decider
11381138
if tt.req.Body.Completions != nil {
1139-
inputTokens := len(tt.req.Body.Completions.Prompt) / AverageCharactersPerToken
1139+
inputTokens := len(tt.req.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken
11401140
injectPrefixCache(tt.results, 0, inputTokens)
11411141
}
11421142

pkg/plugins/profile/pd_profile_handler.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ import (
2525
const (
2626
// PdProfileHandlerType is a legacy alias for DisaggProfileHandlerType.
2727
PdProfileHandlerType = "pd-profile-handler"
28-
defaultPrefixPluginType = prefix.PrefixCachePluginType
28+
defaultPrefixPluginType = prefix.PrefixCacheScorerPluginType
2929
defaultDeciderPluginName = PrefixBasedPDDeciderPluginType
3030
)
3131

pkg/plugins/profile/pd_profile_handler_test.go

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ func createRequest(prompt string) *scheduling.LLMRequest {
217217
return &scheduling.LLMRequest{
218218
Body: &scheduling.LLMRequestBody{
219219
Completions: &scheduling.CompletionsRequest{
220-
Prompt: prompt,
220+
Prompt: scheduling.Prompt{Raw: prompt},
221221
},
222222
},
223223
}
@@ -257,16 +257,16 @@ func TestPdProfileHandler_Pick(t *testing.T) {
257257
{
258258
name: "decode not executed yet → run decode",
259259
nonCachedTokensLimit: 10,
260-
prefixPluginType: prefix.PrefixCachePluginType,
261-
prefixPluginName: prefix.PrefixCachePluginType,
260+
prefixPluginType: prefix.PrefixCacheScorerPluginType,
261+
prefixPluginName: prefix.PrefixCacheScorerPluginType,
262262
profileResults: map[string]*scheduling.ProfileRunResult{},
263263
expectedProfiles: []string{defaultDecodeProfile},
264264
},
265265
{
266266
name: "decode failed (nil result) → run nothing",
267267
nonCachedTokensLimit: 10,
268-
prefixPluginType: prefix.PrefixCachePluginType,
269-
prefixPluginName: prefix.PrefixCachePluginType,
268+
prefixPluginType: prefix.PrefixCacheScorerPluginType,
269+
prefixPluginName: prefix.PrefixCacheScorerPluginType,
270270
profileResults: map[string]*scheduling.ProfileRunResult{
271271
defaultDecodeProfile: nil,
272272
},
@@ -275,8 +275,8 @@ func TestPdProfileHandler_Pick(t *testing.T) {
275275
{
276276
name: "all profiles already executed → run nothing",
277277
nonCachedTokensLimit: 10,
278-
prefixPluginType: prefix.PrefixCachePluginType,
279-
prefixPluginName: prefix.PrefixCachePluginType,
278+
prefixPluginType: prefix.PrefixCacheScorerPluginType,
279+
prefixPluginName: prefix.PrefixCacheScorerPluginType,
280280
profileResults: map[string]*scheduling.ProfileRunResult{
281281
defaultDecodeProfile: newMockProfileRunResult(DefaultTestPodPort, "pod1"),
282282
defaultPrefillProfile: newMockProfileRunResult(DefaultTestPodPort, "pod2"),
@@ -289,8 +289,8 @@ func TestPdProfileHandler_Pick(t *testing.T) {
289289
// In this case: prompt length is 35 chars (8 tokens), cached length is 2 tokens -> disaggregated prefill should trigger
290290
nonCachedTokensLimit: 4,
291291
cachedTokens: 2,
292-
prefixPluginType: prefix.PrefixCachePluginType,
293-
prefixPluginName: prefix.PrefixCachePluginType,
292+
prefixPluginType: prefix.PrefixCacheScorerPluginType,
293+
prefixPluginName: prefix.PrefixCacheScorerPluginType,
294294
profileResults: map[string]*scheduling.ProfileRunResult{
295295
defaultDecodeProfile: newMockProfileRunResult(DefaultTestPodPort, "pod1"),
296296
},
@@ -302,8 +302,8 @@ func TestPdProfileHandler_Pick(t *testing.T) {
302302
// In this case: prompt length is 35 chars (8 tokens), cached length is 5 tokens -> skip prefill
303303
nonCachedTokensLimit: 4,
304304
cachedTokens: 5,
305-
prefixPluginType: prefix.PrefixCachePluginType,
306-
prefixPluginName: prefix.PrefixCachePluginType,
305+
prefixPluginType: prefix.PrefixCacheScorerPluginType,
306+
prefixPluginName: prefix.PrefixCacheScorerPluginType,
307307
profileResults: map[string]*scheduling.ProfileRunResult{
308308
defaultDecodeProfile: newMockProfileRunResult(DefaultTestPodPort, "pod1"),
309309
},
@@ -327,7 +327,7 @@ func TestPdProfileHandler_Pick(t *testing.T) {
327327
assert.NoError(t, err)
328328

329329
// set prefix to the given cached tokens number for pod "pod1" in decode profile results
330-
inputTokens := len(request.Body.Completions.Prompt) / AverageCharactersPerToken
330+
inputTokens := len(request.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken
331331

332332
for profileName, profileRes := range tt.profileResults {
333333
if profileName == defaultDecodeProfile && profileRes != nil {
@@ -377,7 +377,7 @@ func TestPdProfileHandler_PickSeries(t *testing.T) {
377377
expectedProfiles: []string{defaultPrefillProfile},
378378
}, {
379379
request: request,
380-
cachedTokens: len(request.Body.Completions.Prompt) / AverageCharactersPerToken,
380+
cachedTokens: len(request.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken,
381381
expectedProfiles: []string{},
382382
}},
383383
}, {
@@ -391,7 +391,7 @@ func TestPdProfileHandler_PickSeries(t *testing.T) {
391391
expectedProfiles: []string{defaultPrefillProfile},
392392
}, {
393393
request: longerRequest,
394-
cachedTokens: len(request.Body.Completions.Prompt) / AverageCharactersPerToken,
394+
cachedTokens: len(request.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken,
395395
expectedProfiles: []string{},
396396
}},
397397
}, {
@@ -405,7 +405,7 @@ func TestPdProfileHandler_PickSeries(t *testing.T) {
405405
expectedProfiles: []string{defaultPrefillProfile},
406406
}, {
407407
request: longRequest,
408-
cachedTokens: len(request.Body.Completions.Prompt) / AverageCharactersPerToken,
408+
cachedTokens: len(request.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken,
409409
expectedProfiles: []string{defaultPrefillProfile},
410410
}},
411411
},
@@ -419,8 +419,8 @@ func TestPdProfileHandler_PickSeries(t *testing.T) {
419419
handler, err := NewPdProfileHandler(
420420
defaultPrefillProfile,
421421
defaultDecodeProfile,
422-
prefix.PrefixCachePluginType,
423-
prefix.PrefixCachePluginType,
422+
prefix.PrefixCacheScorerPluginType,
423+
prefix.PrefixCacheScorerPluginType,
424424
0,
425425
deciderPlugin,
426426
)
@@ -431,7 +431,7 @@ func TestPdProfileHandler_PickSeries(t *testing.T) {
431431
cs := &scheduling.CycleState{}
432432

433433
// set prefix to the given cached tokens number for pod "pod1" in decode profile results
434-
inputTokens := len(innerTest.request.Body.Completions.Prompt) / AverageCharactersPerToken
434+
inputTokens := len(innerTest.request.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken
435435

436436
for profileName, profileRes := range profileResults {
437437
if profileName == defaultDecodeProfile && profileRes != nil {
@@ -519,8 +519,8 @@ func TestPdProfileHandler_ProcessResults(t *testing.T) {
519519
handler, err := NewPdProfileHandler(
520520
defaultPrefillProfile,
521521
defaultDecodeProfile,
522-
prefix.PrefixCachePluginType,
523-
prefix.PrefixCachePluginType,
522+
prefix.PrefixCacheScorerPluginType,
523+
prefix.PrefixCacheScorerPluginType,
524524
tt.primaryPort,
525525
deciderPlugin,
526526
)

pkg/plugins/profile/prefix_based_pd_decider.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ func getUserInputLenInTokens(request *scheduling.LLMRequest) (int, error) {
153153
return 0, errors.New("request or request body is nil")
154154
}
155155
if request.Body.Completions != nil {
156-
return len(request.Body.Completions.Prompt) / AverageCharactersPerToken, nil
156+
return len(request.Body.Completions.Prompt.PlainText()) / AverageCharactersPerToken, nil
157157
}
158158
if request.Body.ChatCompletions == nil {
159159
return 0, errors.New("request has neither completions nor chat completions body")

pkg/plugins/scorer/no_hit_lru.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/interface/plugin"
1212
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/interface/requestcontrol"
1313
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/interface/scheduling"
14+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/plugins/requestcontrol/dataproducer/approximateprefix"
1415
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/plugins/scheduling/scorer/prefix"
1516
)
1617

@@ -66,7 +67,7 @@ func NoHitLRUFactory(name string, rawParameters json.RawMessage, handle plugin.H
6667
}
6768

6869
if parameters.PrefixPluginName == "" {
69-
parameters.PrefixPluginName = prefix.PrefixCachePluginType
70+
parameters.PrefixPluginName = prefix.PrefixCacheScorerPluginType
7071
}
7172

7273
// Note: We don't enforce that the prefix plugin exists here
@@ -77,8 +78,8 @@ func NoHitLRUFactory(name string, rawParameters json.RawMessage, handle plugin.H
7778

7879
// NewNoHitLRU creates a new NoHitLRU scorer
7980
func NewNoHitLRU(ctx context.Context, params *NoHitLRUParameters) *NoHitLRU {
80-
prefixPluginType := prefix.PrefixCachePluginType
81-
prefixPluginName := prefix.PrefixCachePluginType
81+
prefixPluginType := prefix.PrefixCacheScorerPluginType
82+
prefixPluginName := prefix.PrefixCacheScorerPluginType
8283
lruSize := defaultLRUSize
8384

8485
if params != nil {
@@ -140,7 +141,7 @@ func (s *NoHitLRU) isColdRequest(ctx context.Context, cycleState *scheduling.Cyc
140141

141142
// Read prefix cache state to determine if this is a cold request
142143
// This is treated as an optimization - if the state isn't available, we assume cold request
143-
prefixState, err := scheduling.ReadCycleStateKey[*prefix.SchedulingContextState](cycleState, plugin.StateKey(s.prefixPluginTypedName.String()))
144+
prefixState, err := scheduling.ReadCycleStateKey[*approximateprefix.SchedulingContextState](cycleState, plugin.StateKey(s.prefixPluginTypedName.String()))
144145

145146
if err != nil {
146147
logger.Info("No prefix cache state found, treating as cold request for LRU optimization", "error", err)

0 commit comments

Comments (0)