Update PreRequest and ResponseComplete hooks to also track prefill pod requests in runningRequestsList

RishabhSaini · RishabhSaini · commit e8648e2db884 · 2026-02-03T18:55:19.000-05:00
diff --git a/go.mod b/go.mod
@@ -29,7 +29,7 @@ require (
 	sigs.k8s.io/gateway-api-inference-extension v0.0.0-20260128235548-fd30cb97714a
 )
 
-replace sigs.k8s.io/gateway-api-inference-extension => github.com/RishabhSaini/gateway-api-inference-extension v0.0.0-20260202180902-7d961854f2e5
+replace sigs.k8s.io/gateway-api-inference-extension => github.com/RishabhSaini/gateway-api-inference-extension v0.0.0-20260203235024-2a01dcdc5417
 
 require (
 	cel.dev/expr v0.24.0 // indirect
diff --git a/go.sum b/go.sum
@@ -16,8 +16,8 @@ github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgv
 github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk=
 github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0=
 github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
-github.com/RishabhSaini/gateway-api-inference-extension v0.0.0-20260202180902-7d961854f2e5 h1:Rz2D9py5WWKYN08XT0JQ7/5QBaiF/4dEqkIb4GVsRPU=
-github.com/RishabhSaini/gateway-api-inference-extension v0.0.0-20260202180902-7d961854f2e5/go.mod h1:lvMpB9a+Lk+xBi5Pk6teUG+NqA16WR8nRpmBNFJbflU=
+github.com/RishabhSaini/gateway-api-inference-extension v0.0.0-20260203235024-2a01dcdc5417 h1:DQs9A9gfgXQIqMw35L6nOZ2HG7lwhV6HP8dvo8zt0zg=
+github.com/RishabhSaini/gateway-api-inference-extension v0.0.0-20260203235024-2a01dcdc5417/go.mod h1:UXD5NFf/ukKoCYWT3mPOnun7xGhqGeY4ac3VPJnbcOo=
 github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b h1:mimo19zliBX/vSQ6PWWSL9lK8qwHozUj03+zLoEB8O0=
 github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b/go.mod h1:fvzegU4vN3H1qMT+8wDmzjAcDONcgo2/SZ/TyfdUOFs=
 github.com/alicebob/miniredis/v2 v2.35.0 h1:QwLphYqCEAo1eu1TqPRN2jgVMPBweeQcR21jeqDCONI=
diff --git a/pkg/plugins/scorer/pd_slo_aware_router_hooks.go b/pkg/plugins/scorer/pd_slo_aware_router_hooks.go
@@ -44,9 +44,40 @@ var _ requestcontrol.ResponseReceived = &PDSLOAwareRouter{}
 var _ requestcontrol.ResponseStreaming = &PDSLOAwareRouter{}
 var _ requestcontrol.ResponseComplete = &PDSLOAwareRouter{}
 
-// PreRequest delegates to the base router
+// PreRequest delegates to the base router and then also tracks the prefill pod.
+// The base router's PreRequest runs first (it is expected to track the decode pod of
+// the primary profile). If the "prefill" profile has a target endpoint, its first
+// endpoint is added to the running requests; a nil schedulingResult skips that step.
 func (p *PDSLOAwareRouter) PreRequest(ctx context.Context, request *schedulingtypes.LLMRequest, schedulingResult *schedulingtypes.SchedulingResult) {
+	logger := log.FromContext(ctx)
+
+	// Delegate to base router (expected to track the decode pod - primary profile)
 	p.PredictedLatency.PreRequest(ctx, request, schedulingResult)
+
+	// Reading ProfileResults through a nil result would panic, so bail out early.
+	if schedulingResult == nil {
+		return
+	}
+
+	// P/D-specific: nothing more to do unless a prefill endpoint was selected.
+	prefillResult, exists := schedulingResult.ProfileResults["prefill"]
+	if !exists || prefillResult == nil || len(prefillResult.TargetEndpoints) == 0 {
+		return
+	}
+	prefillPod := prefillResult.TargetEndpoints[0]
+	requestID := request.Headers[requtil.RequestIdHeaderKey]
+
+	// The average TPOT SLO is passed to AddToRunningRequests; skip tracking without it.
+	avgTPOTSLO, err := p.PredictedLatency.GetAvgTPOTSLO(request)
+	if err != nil {
+		logger.V(logutil.DEBUG).Info("Could not get SLO context for prefill tracking", "error", err)
+		return
+	}
+
+	prefillName := prefillPod.GetMetadata().NamespacedName
+	p.PredictedLatency.AddToRunningRequests(prefillName, requestID, avgTPOTSLO)
+	logger.V(logutil.DEBUG).Info("Tracked prefill pod in running requests",
+		"prefillPod", prefillName.Name, "requestID", requestID)
 }
 
 // ResponseReceived adds P/D-specific logic to extract prefill timing headers
@@ -79,8 +110,32 @@ func (p *PDSLOAwareRouter) ResponseStreaming(ctx context.Context, request *sched
 	p.PredictedLatency.ResponseStreaming(ctx, request, response, pod)
 }
 
-// ResponseComplete delegates to the base router
+// ResponseComplete removes prefill tracking and then delegates to the base router.
+// If the stored scheduling result has a "prefill" profile with a target endpoint, its first
+// endpoint is removed from running requests; a lookup error or nil result skips this silently.
 func (p *PDSLOAwareRouter) ResponseComplete(ctx context.Context, request *schedulingtypes.LLMRequest, response *requestcontrol.Response, pod *datalayer.EndpointMetadata) {
+	logger := log.FromContext(ctx)
+	requestID := request.Headers[requtil.RequestIdHeaderKey]
+
+	// P/D-specific: remove the first prefill endpoint from tracking, if one was scheduled
+	schedulingResult, err := p.PredictedLatency.GetSchedulingResult(request)
+	if err == nil && schedulingResult != nil {
+		if prefillResult, exists := schedulingResult.ProfileResults["prefill"]; exists && prefillResult != nil {
+			if len(prefillResult.TargetEndpoints) > 0 {
+				prefillPod := prefillResult.TargetEndpoints[0]
+				p.PredictedLatency.RemoveFromRunningRequests(
+					prefillPod.GetMetadata().NamespacedName,
+					requestID,
+				)
+
+				logger.V(logutil.DEBUG).Info("Removed prefill pod from running requests",
+					"prefillPod", prefillPod.GetMetadata().NamespacedName.Name,
+					"requestID", requestID)
+			}
+		}
+	}
+
+	// Delegate to base router last (presumably removes the decode pod - confirm in PredictedLatency)
 	p.PredictedLatency.ResponseComplete(ctx, request, response, pod)
 }
 

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@ require (`
`29`	`29`	`sigs.k8s.io/gateway-api-inference-extension v0.0.0-20260128235548-fd30cb97714a`
`30`	`30`	`)`
`31`	`31`
`32`		`-replace sigs.k8s.io/gateway-api-inference-extension => github.com/RishabhSaini/gateway-api-inference-extension v0.0.0-20260202180902-7d961854f2e5`
	`32`	`+replace sigs.k8s.io/gateway-api-inference-extension => github.com/RishabhSaini/gateway-api-inference-extension v0.0.0-20260203235024-2a01dcdc5417`
`33`	`33`
`34`	`34`	`require (`
`35`	`35`	`cel.dev/expr v0.24.0 // indirect`