@@ -44,9 +44,40 @@ var _ requestcontrol.ResponseReceived = &PDSLOAwareRouter{}
4444var _ requestcontrol.ResponseStreaming = & PDSLOAwareRouter {}
4545var _ requestcontrol.ResponseComplete = & PDSLOAwareRouter {}
4646
47- // PreRequest delegates to the base router
47+ // PreRequest tracks both prefill and decode pods in running request lists.
48+ // The base router tracks the decode pod (primary profile), and we additionally
49+ // track the prefill pod to ensure accurate load visibility during scoring.
4850func (p * PDSLOAwareRouter ) PreRequest (ctx context.Context , request * schedulingtypes.LLMRequest , schedulingResult * schedulingtypes.SchedulingResult ) {
51+ logger := log .FromContext (ctx )
52+
53+ // Delegate to base router (tracks decode pod - primary profile)
4954 p .PredictedLatency .PreRequest (ctx , request , schedulingResult )
55+
56+ // P/D-specific: Also track prefill pod if it was selected
57+ if prefillResult , exists := schedulingResult .ProfileResults ["prefill" ]; exists && prefillResult != nil {
58+ if len (prefillResult .TargetEndpoints ) > 0 {
59+ prefillPod := prefillResult .TargetEndpoints [0 ]
60+ requestID := request .Headers [requtil .RequestIdHeaderKey ]
61+
62+ // Get average TPOT SLO to determine priority
63+ avgTPOTSLO , err := p .PredictedLatency .GetAvgTPOTSLO (request )
64+ if err != nil {
65+ logger .V (logutil .DEBUG ).Info ("Could not get SLO context for prefill tracking" , "error" , err )
66+ return
67+ }
68+
69+ // Track prefill pod in running requests
70+ p .PredictedLatency .AddToRunningRequests (
71+ prefillPod .GetMetadata ().NamespacedName ,
72+ requestID ,
73+ avgTPOTSLO ,
74+ )
75+
76+ logger .V (logutil .DEBUG ).Info ("Tracked prefill pod in running requests" ,
77+ "prefillPod" , prefillPod .GetMetadata ().NamespacedName .Name ,
78+ "requestID" , requestID )
79+ }
80+ }
5081}
5182
5283// ResponseReceived adds P/D-specific logic to extract prefill timing headers
@@ -79,8 +110,32 @@ func (p *PDSLOAwareRouter) ResponseStreaming(ctx context.Context, request *sched
79110 p .PredictedLatency .ResponseStreaming (ctx , request , response , pod )
80111}
81112
82- // ResponseComplete delegates to the base router
113+ // ResponseComplete cleans up both prefill and decode pod tracking.
114+ // We remove the prefill pod from running requests (if it was used) before
115+ // delegating to the base router, which removes the decode pod.
83116func (p * PDSLOAwareRouter ) ResponseComplete (ctx context.Context , request * schedulingtypes.LLMRequest , response * requestcontrol.Response , pod * datalayer.EndpointMetadata ) {
117+ logger := log .FromContext (ctx )
118+ requestID := request .Headers [requtil .RequestIdHeaderKey ]
119+
120+ // P/D-specific: Remove prefill pod from tracking if it was used
121+ schedulingResult , err := p .PredictedLatency .GetSchedulingResult (request )
122+ if err == nil && schedulingResult != nil {
123+ if prefillResult , exists := schedulingResult .ProfileResults ["prefill" ]; exists && prefillResult != nil {
124+ if len (prefillResult .TargetEndpoints ) > 0 {
125+ prefillPod := prefillResult .TargetEndpoints [0 ]
126+ p .PredictedLatency .RemoveFromRunningRequests (
127+ prefillPod .GetMetadata ().NamespacedName ,
128+ requestID ,
129+ )
130+
131+ logger .V (logutil .DEBUG ).Info ("Removed prefill pod from running requests" ,
132+ "prefillPod" , prefillPod .GetMetadata ().NamespacedName .Name ,
133+ "requestID" , requestID )
134+ }
135+ }
136+ }
137+
138+ // Delegate to base router (removes decode pod)
84139 p .PredictedLatency .ResponseComplete (ctx , request , response , pod )
85140}
86141
0 commit comments