Skip to content

Commit e8648e2

Browse files
committed
Update the PreRequest and ResponseComplete hooks to also track prefill pod requests in runningRequestsList
1 parent 6ccbbe9 commit e8648e2

File tree

3 files changed

+60
-5
lines changed

3 files changed

+60
-5
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ require (
2929
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20260128235548-fd30cb97714a
3030
)
3131

32-
replace sigs.k8s.io/gateway-api-inference-extension => github.com/RishabhSaini/gateway-api-inference-extension v0.0.0-20260202180902-7d961854f2e5
32+
replace sigs.k8s.io/gateway-api-inference-extension => github.com/RishabhSaini/gateway-api-inference-extension v0.0.0-20260203235024-2a01dcdc5417
3333

3434
require (
3535
cel.dev/expr v0.24.0 // indirect

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgv
1616
github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk=
1717
github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0=
1818
github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
19-
github.com/RishabhSaini/gateway-api-inference-extension v0.0.0-20260202180902-7d961854f2e5 h1:Rz2D9py5WWKYN08XT0JQ7/5QBaiF/4dEqkIb4GVsRPU=
20-
github.com/RishabhSaini/gateway-api-inference-extension v0.0.0-20260202180902-7d961854f2e5/go.mod h1:lvMpB9a+Lk+xBi5Pk6teUG+NqA16WR8nRpmBNFJbflU=
19+
github.com/RishabhSaini/gateway-api-inference-extension v0.0.0-20260203235024-2a01dcdc5417 h1:DQs9A9gfgXQIqMw35L6nOZ2HG7lwhV6HP8dvo8zt0zg=
20+
github.com/RishabhSaini/gateway-api-inference-extension v0.0.0-20260203235024-2a01dcdc5417/go.mod h1:UXD5NFf/ukKoCYWT3mPOnun7xGhqGeY4ac3VPJnbcOo=
2121
github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b h1:mimo19zliBX/vSQ6PWWSL9lK8qwHozUj03+zLoEB8O0=
2222
github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b/go.mod h1:fvzegU4vN3H1qMT+8wDmzjAcDONcgo2/SZ/TyfdUOFs=
2323
github.com/alicebob/miniredis/v2 v2.35.0 h1:QwLphYqCEAo1eu1TqPRN2jgVMPBweeQcR21jeqDCONI=

pkg/plugins/scorer/pd_slo_aware_router_hooks.go

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,40 @@ var _ requestcontrol.ResponseReceived = &PDSLOAwareRouter{}
4444
var _ requestcontrol.ResponseStreaming = &PDSLOAwareRouter{}
4545
var _ requestcontrol.ResponseComplete = &PDSLOAwareRouter{}
4646

47-
// PreRequest delegates to the base router
47+
// PreRequest tracks both prefill and decode pods in running request lists.
48+
// The base router tracks the decode pod (primary profile), and we additionally
49+
// track the prefill pod to ensure accurate load visibility during scoring.
4850
func (p *PDSLOAwareRouter) PreRequest(ctx context.Context, request *schedulingtypes.LLMRequest, schedulingResult *schedulingtypes.SchedulingResult) {
51+
logger := log.FromContext(ctx)
52+
53+
// Delegate to base router (tracks decode pod - primary profile)
4954
p.PredictedLatency.PreRequest(ctx, request, schedulingResult)
55+
56+
// P/D-specific: Also track prefill pod if it was selected
57+
if prefillResult, exists := schedulingResult.ProfileResults["prefill"]; exists && prefillResult != nil {
58+
if len(prefillResult.TargetEndpoints) > 0 {
59+
prefillPod := prefillResult.TargetEndpoints[0]
60+
requestID := request.Headers[requtil.RequestIdHeaderKey]
61+
62+
// Get average TPOT SLO to determine priority
63+
avgTPOTSLO, err := p.PredictedLatency.GetAvgTPOTSLO(request)
64+
if err != nil {
65+
logger.V(logutil.DEBUG).Info("Could not get SLO context for prefill tracking", "error", err)
66+
return
67+
}
68+
69+
// Track prefill pod in running requests
70+
p.PredictedLatency.AddToRunningRequests(
71+
prefillPod.GetMetadata().NamespacedName,
72+
requestID,
73+
avgTPOTSLO,
74+
)
75+
76+
logger.V(logutil.DEBUG).Info("Tracked prefill pod in running requests",
77+
"prefillPod", prefillPod.GetMetadata().NamespacedName.Name,
78+
"requestID", requestID)
79+
}
80+
}
5081
}
5182

5283
// ResponseReceived adds P/D-specific logic to extract prefill timing headers
@@ -79,8 +110,32 @@ func (p *PDSLOAwareRouter) ResponseStreaming(ctx context.Context, request *sched
79110
p.PredictedLatency.ResponseStreaming(ctx, request, response, pod)
80111
}
81112

82-
// ResponseComplete delegates to the base router
113+
// ResponseComplete cleans up both prefill and decode pod tracking.
114+
// We remove the prefill pod from running requests (if it was used) before
115+
// delegating to the base router, which removes the decode pod.
83116
func (p *PDSLOAwareRouter) ResponseComplete(ctx context.Context, request *schedulingtypes.LLMRequest, response *requestcontrol.Response, pod *datalayer.EndpointMetadata) {
117+
logger := log.FromContext(ctx)
118+
requestID := request.Headers[requtil.RequestIdHeaderKey]
119+
120+
// P/D-specific: Remove prefill pod from tracking if it was used
121+
schedulingResult, err := p.PredictedLatency.GetSchedulingResult(request)
122+
if err == nil && schedulingResult != nil {
123+
if prefillResult, exists := schedulingResult.ProfileResults["prefill"]; exists && prefillResult != nil {
124+
if len(prefillResult.TargetEndpoints) > 0 {
125+
prefillPod := prefillResult.TargetEndpoints[0]
126+
p.PredictedLatency.RemoveFromRunningRequests(
127+
prefillPod.GetMetadata().NamespacedName,
128+
requestID,
129+
)
130+
131+
logger.V(logutil.DEBUG).Info("Removed prefill pod from running requests",
132+
"prefillPod", prefillPod.GetMetadata().NamespacedName.Name,
133+
"requestID", requestID)
134+
}
135+
}
136+
}
137+
138+
// Delegate to base router (removes decode pod)
84139
p.PredictedLatency.ResponseComplete(ctx, request, response, pod)
85140
}
86141

0 commit comments

Comments
 (0)