Skip to content

Commit 423ee58

Browse files
committed
Add the EndpointRoleLabel as a parameter for the predicted-latency-scorer
Get rid of the RequestBuilderStruct pattern and use helper funcs instead
1 parent 3be0cb9 commit 423ee58

File tree

7 files changed

+56
-95
lines changed

7 files changed

+56
-95
lines changed

pkg/epp/framework/plugins/scheduling/scorer/predictedlatency/latencypredictor_helper.go

Lines changed: 22 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ func getLatestMetricsForProfile(predictedLatencyCtx *predictedLatencyCtx) (*fwkd
6565
func processPreRequestForLatencyPrediction(
6666
ctx context.Context,
6767
predictor latencypredictor.PredictorInterface,
68-
requestBuilder PredictionRequestBuilder,
68+
endpointRoleLabel string,
6969
predictedLatencyCtx *predictedLatencyCtx,
7070
) error {
7171
logger := log.FromContext(ctx)
@@ -83,9 +83,9 @@ func processPreRequestForLatencyPrediction(
8383
target_endpoint_metadata := predictedLatencyCtx.targetMetadata
8484
prefix_cache_score := predictedLatencyCtx.prefixCacheScoresForEndpoints[target_endpoint_metadata.NamespacedName.Name]
8585

86-
// Build prediction request using the builder (ensures pod type is included for P/D)
87-
in := requestBuilder.BuildPredictionRequest(
88-
ctx,
86+
// Build prediction request (pod type is included if endpointRoleLabel is configured)
87+
in := buildPredictionRequest(
88+
endpointRoleLabel,
8989
target_endpoint_metadata,
9090
m,
9191
predictedLatencyCtx.schedulingRequest.Body.Completions.Prompt,
@@ -121,7 +121,7 @@ func processFirstTokenForLatencyPrediction(
121121
ctx context.Context,
122122
predictor latencypredictor.PredictorInterface,
123123
streamingMode bool,
124-
requestBuilder PredictionRequestBuilder,
124+
endpointRoleLabel string,
125125
predictedLatencyCtx *predictedLatencyCtx,
126126
now time.Time,
127127
samplingMean float64,
@@ -141,10 +141,10 @@ func processFirstTokenForLatencyPrediction(
141141
targetEndpointMetadata := predictedLatencyCtx.targetMetadata
142142
prefixCacheScore := predictedLatencyCtx.prefixCacheScoresForEndpoints[targetEndpointMetadata.NamespacedName.Name]
143143
logger.V(logutil.DEBUG).Info("Recording TTFT training data", "ttft_ms", predictedLatencyCtx.ttft, "prefixCacheScore", prefixCacheScore)
144-
recordTTFTTrainingData(ctx, predictor, requestBuilder, predictedLatencyCtx, m, targetEndpointMetadata, now, prefixCacheScore)
144+
recordTTFTTrainingData(ctx, predictor, endpointRoleLabel, predictedLatencyCtx, m, targetEndpointMetadata, now, prefixCacheScore)
145145

146146
if streamingMode {
147-
predictFirstTPOT(ctx, predictor, requestBuilder, predictedLatencyCtx, targetEndpointMetadata)
147+
predictFirstTPOT(ctx, predictor, endpointRoleLabel, predictedLatencyCtx, targetEndpointMetadata)
148148
}
149149

150150
// Advance timestamp
@@ -165,17 +165,16 @@ func initializeSampler(ctx context.Context, predictedLatencyCtx *predictedLatenc
165165
func recordTTFTTrainingData(
166166
ctx context.Context,
167167
predictor latencypredictor.PredictorInterface,
168-
requestBuilder PredictionRequestBuilder,
168+
endpointRoleLabel string,
169169
predictedLatencyCtx *predictedLatencyCtx,
170170
m *fwkdl.Metrics,
171171
targetEndpointMetadata *fwkdl.EndpointMetadata,
172172
now time.Time,
173173
prefixCacheScore float64,
174174
) {
175175
logger := log.FromContext(ctx)
176-
// Build training entry using the builder
177-
entry := requestBuilder.BuildTrainingEntry(
178-
ctx,
176+
entry := buildTrainingEntry(
177+
endpointRoleLabel,
179178
targetEndpointMetadata,
180179
m,
181180
predictedLatencyCtx.schedulingRequest.Body.Completions.Prompt,
@@ -193,7 +192,7 @@ func recordTTFTTrainingData(
193192
func predictFirstTPOT(
194193
ctx context.Context,
195194
predictor latencypredictor.PredictorInterface,
196-
requestBuilder PredictionRequestBuilder,
195+
endpointRoleLabel string,
197196
predictedLatencyCtx *predictedLatencyCtx,
198197
targetEndpointMetadata *fwkdl.EndpointMetadata,
199198
) {
@@ -205,9 +204,8 @@ func predictFirstTPOT(
205204
return
206205
}
207206

208-
// Build prediction request using the builder (ensures pod type is included for P/D)
209-
in := requestBuilder.BuildPredictionRequest(
210-
ctx,
207+
in := buildPredictionRequest(
208+
endpointRoleLabel,
211209
targetEndpointMetadata,
212210
m,
213211
predictedLatencyCtx.schedulingRequest.Body.Completions.Prompt,
@@ -233,7 +231,7 @@ func predictFirstTPOT(
233231
func processTokenForLatencyPrediction(
234232
ctx context.Context,
235233
predictor latencypredictor.PredictorInterface,
236-
requestBuilder PredictionRequestBuilder,
234+
endpointRoleLabel string,
237235
predictedLatencyCtx *predictedLatencyCtx,
238236
targetEndpointMetadata *fwkdl.EndpointMetadata,
239237
now time.Time,
@@ -265,9 +263,8 @@ func processTokenForLatencyPrediction(
265263
"error", err)
266264
return
267265
}
268-
// Record actual TPOT using builder
269-
entry := requestBuilder.BuildTrainingEntry(
270-
ctx,
266+
entry := buildTrainingEntry(
267+
endpointRoleLabel,
271268
targetEndpointMetadata,
272269
m,
273270
predictedLatencyCtx.schedulingRequest.Body.Completions.Prompt,
@@ -283,9 +280,8 @@ func processTokenForLatencyPrediction(
283280

284281
// Sampled predict
285282
if predictedLatencyCtx.tokenSampler.shouldPredict(predictedLatencyCtx.generatedTokenCount) {
286-
// Build prediction request using the builder (ensures pod type is included for P/D)
287-
in := requestBuilder.BuildPredictionRequest(
288-
ctx,
283+
in := buildPredictionRequest(
284+
endpointRoleLabel,
289285
targetEndpointMetadata,
290286
m,
291287
predictedLatencyCtx.schedulingRequest.Body.Completions.Prompt,
@@ -321,7 +317,7 @@ func bulkPredictWithMetrics(
321317
ctx context.Context,
322318
predictor latencypredictor.PredictorInterface,
323319
metricsStates []*fwkdl.Metrics,
324-
requestBuilder PredictionRequestBuilder,
320+
endpointRoleLabel string,
325321
targetEndpointsMetadatas []*fwkdl.EndpointMetadata,
326322
prompts []string,
327323
generatedTokenCounts []int,
@@ -353,11 +349,11 @@ func bulkPredictWithMetrics(
353349
}
354350
}
355351

356-
// Build bulk prediction requests using the builder
352+
// Build bulk prediction requests
357353
bulkRequests := make([]latencypredictor.PredictionRequest, len(metricsStates))
358354
for i := range metricsStates {
359-
bulkRequests[i] = requestBuilder.BuildPredictionRequest(
360-
ctx,
355+
bulkRequests[i] = buildPredictionRequest(
356+
endpointRoleLabel,
361357
targetEndpointsMetadatas[i],
362358
metricsStates[i],
363359
prompts[i],

pkg/epp/framework/plugins/scheduling/scorer/predictedlatency/latencypredictor_helper_test.go

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ func TestBulkPredictWithMetrics(t *testing.T) {
4040
{KVCacheUsagePercent: 0.5},
4141
{KVCacheUsagePercent: 0.6},
4242
}
43-
requestBuilder := &DefaultPredictionRequestBuilder{}
4443
pods := []*fwkdl.EndpointMetadata{
4544
{
4645
NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"},
@@ -53,7 +52,7 @@ func TestBulkPredictWithMetrics(t *testing.T) {
5352
generatedTokenCounts := []int{1, 1}
5453
prefixCacheScores := []float64{0.0, 0.0}
5554

56-
results, err := bulkPredictWithMetrics(context.Background(), mockPredictor, metricsStates, requestBuilder, pods, prompts, generatedTokenCounts, prefixCacheScores)
55+
results, err := bulkPredictWithMetrics(context.Background(), mockPredictor, metricsStates, "", pods, prompts, generatedTokenCounts, prefixCacheScores)
5756

5857
assert.NoError(t, err)
5958
assert.Len(t, results, 2)
@@ -71,7 +70,6 @@ func TestBulkPredictWithMetrics_Error(t *testing.T) {
7170
metricsStates := []*fwkdl.Metrics{
7271
{KVCacheUsagePercent: 0.5},
7372
}
74-
requestBuilder := &DefaultPredictionRequestBuilder{}
7573
pods := []*fwkdl.EndpointMetadata{
7674
{
7775
NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"},
@@ -81,7 +79,7 @@ func TestBulkPredictWithMetrics_Error(t *testing.T) {
8179
generatedTokenCounts := []int{1}
8280
prefixCacheScores := []float64{0.0}
8381

84-
results, err := bulkPredictWithMetrics(context.Background(), mockPredictor, metricsStates, requestBuilder, pods, prompts, generatedTokenCounts, prefixCacheScores)
82+
results, err := bulkPredictWithMetrics(context.Background(), mockPredictor, metricsStates, "", pods, prompts, generatedTokenCounts, prefixCacheScores)
8583

8684
assert.Error(t, err)
8785
assert.Nil(t, results)
@@ -90,7 +88,6 @@ func TestBulkPredictWithMetrics_Error(t *testing.T) {
9088
func TestBulkPredictWithMetrics_InputMismatch(t *testing.T) {
9189
mockPredictor := &mockPredictor{}
9290
metricsStates := []*fwkdl.Metrics{{}}
93-
requestBuilder := &DefaultPredictionRequestBuilder{}
9491
pods := []*fwkdl.EndpointMetadata{
9592
{
9693
NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"},
@@ -100,7 +97,7 @@ func TestBulkPredictWithMetrics_InputMismatch(t *testing.T) {
10097
generatedTokenCounts := []int{1}
10198
prefixCacheScores := []float64{0.0}
10299

103-
results, err := bulkPredictWithMetrics(context.Background(), mockPredictor, metricsStates, requestBuilder, pods, prompts, generatedTokenCounts, prefixCacheScores)
100+
results, err := bulkPredictWithMetrics(context.Background(), mockPredictor, metricsStates, "", pods, prompts, generatedTokenCounts, prefixCacheScores)
104101

105102
assert.Error(t, err)
106103
assert.Nil(t, results)
@@ -110,7 +107,6 @@ func TestBulkPredictWithMetrics_InputMismatch(t *testing.T) {
110107
func TestBulkPredictWithMetrics_NilMetricsState(t *testing.T) {
111108
mockPredictor := &mockPredictor{}
112109
metricsStates := []*fwkdl.Metrics{nil} // Nil metrics state
113-
requestBuilder := &DefaultPredictionRequestBuilder{}
114110
pods := []*fwkdl.EndpointMetadata{
115111
{
116112
NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"},
@@ -120,7 +116,7 @@ func TestBulkPredictWithMetrics_NilMetricsState(t *testing.T) {
120116
generatedTokenCounts := []int{1}
121117
prefixCacheScores := []float64{0.0}
122118

123-
results, err := bulkPredictWithMetrics(context.Background(), mockPredictor, metricsStates, requestBuilder, pods, prompts, generatedTokenCounts, prefixCacheScores)
119+
results, err := bulkPredictWithMetrics(context.Background(), mockPredictor, metricsStates, "", pods, prompts, generatedTokenCounts, prefixCacheScores)
124120

125121
assert.Error(t, err)
126122
assert.Nil(t, results)

pkg/epp/framework/plugins/scheduling/scorer/predictedlatency/prediction.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ func (s *PredictedLatency) generatePredictions(ctx context.Context, request *sch
6969
}
7070

7171
// Bulk predict
72-
bulkPredictions, err := bulkPredictWithMetrics(ctx, s.latencypredictor, metricsStates, s.requestBuilder, targetEndpointsMetadatas, prompts, generatedTokenCounts, prefixCacheScores)
72+
bulkPredictions, err := bulkPredictWithMetrics(ctx, s.latencypredictor, metricsStates, s.config.EndpointRoleLabel, targetEndpointsMetadatas, prompts, generatedTokenCounts, prefixCacheScores)
7373
if err != nil {
7474
logger.V(logutil.DEBUG).Error(err, "Bulk prediction failed")
7575
return nil, err

pkg/epp/framework/plugins/scheduling/scorer/predictedlatency/requestcontrol_hooks.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ func (t *PredictedLatency) PreRequest(ctx context.Context, request *schedulingty
187187
refreshLastSeenMetrics(ctx, predictedLatencyCtx)
188188
t.setPredictedLatencyContextForRequest(request, predictedLatencyCtx)
189189

190-
if err := processPreRequestForLatencyPrediction(ctx, t.latencypredictor, t.requestBuilder, predictedLatencyCtx); err != nil {
190+
if err := processPreRequestForLatencyPrediction(ctx, t.latencypredictor, t.config.EndpointRoleLabel, predictedLatencyCtx); err != nil {
191191
logger.V(logutil.DEBUG).Error(err, "Process PreRequest in latencypredictor failed")
192192
}
193193
}
@@ -233,8 +233,8 @@ func (t *PredictedLatency) ResponseReceived(ctx context.Context, request *schedu
233233
"prefillPod", prefillMetadata.NamespacedName.Name,
234234
"prefixCacheScore", prefixCacheScore)
235235

236-
entry := t.requestBuilder.BuildTrainingEntry(
237-
ctx,
236+
entry := buildTrainingEntry(
237+
t.config.EndpointRoleLabel,
238238
prefillMetadata,
239239
prefillMetrics,
240240
predictedLatencyCtx.schedulingRequest.Body.Completions.Prompt,
@@ -272,9 +272,9 @@ func (t *PredictedLatency) ResponseStreaming(ctx context.Context, request *sched
272272
}
273273

274274
if predictedLatencyCtx.ttft == 0 {
275-
processFirstTokenForLatencyPrediction(ctx, t.latencypredictor, t.config.StreamingMode, t.requestBuilder, predictedLatencyCtx, now, t.config.SamplingMean, t.config.MaxSampledTokens)
275+
processFirstTokenForLatencyPrediction(ctx, t.latencypredictor, t.config.StreamingMode, t.config.EndpointRoleLabel, predictedLatencyCtx, now, t.config.SamplingMean, t.config.MaxSampledTokens)
276276
} else {
277-
processTokenForLatencyPrediction(ctx, t.latencypredictor, t.requestBuilder, predictedLatencyCtx, targetMetadata, now, t.config.SamplingMean, t.config.MaxSampledTokens)
277+
processTokenForLatencyPrediction(ctx, t.latencypredictor, t.config.EndpointRoleLabel, predictedLatencyCtx, targetMetadata, now, t.config.SamplingMean, t.config.MaxSampledTokens)
278278
}
279279

280280
}
@@ -298,7 +298,7 @@ func (t *PredictedLatency) ResponseComplete(ctx context.Context, request *schedu
298298
}
299299
now := time.Now()
300300
if !t.config.StreamingMode {
301-
processFirstTokenForLatencyPrediction(ctx, t.latencypredictor, t.config.StreamingMode, t.requestBuilder, predictedLatencyCtx, now, t.config.SamplingMean, t.config.MaxSampledTokens)
301+
processFirstTokenForLatencyPrediction(ctx, t.latencypredictor, t.config.StreamingMode, t.config.EndpointRoleLabel, predictedLatencyCtx, now, t.config.SamplingMean, t.config.MaxSampledTokens)
302302
}
303303

304304
if predictedLatencyCtx.ttft > 0 {

pkg/epp/framework/plugins/scheduling/scorer/predictedlatency/requestcontrol_hooks_test.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@ func createTestRouter() *PredictedLatency {
6464
),
6565
// runningRequestLists is a sync.Map and needs no initialization
6666
latencypredictor: nil,
67-
requestBuilder: &DefaultPredictionRequestBuilder{},
6867
config: DefaultConfig,
6968
}
7069
}

pkg/epp/framework/plugins/scheduling/scorer/predictedlatency/scorer.go

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ import (
4040
type PredictedLatency struct {
4141
typedName plugin.TypedName
4242
latencypredictor latencypredictor.PredictorInterface
43-
requestBuilder PredictionRequestBuilder
4443
runningRequestLists sync.Map // Key: types.NamespacedName, Value: *requestPriorityQueue
4544
sloContextStore *ttlcache.Cache[string, *predictedLatencyCtx] // TTL cache for request contexts
4645
headroomStrategy headroomStrategy
@@ -68,6 +67,7 @@ type Config struct {
6867
ContextTTL time.Duration `json:"contextTTL,omitempty"`
6968
SelectionMode string `json:"selectionMode,omitempty"`
7069
StreamingMode bool `json:"streamingMode,omitempty"`
70+
EndpointRoleLabel string `json:"endpointRoleLabel,omitempty"`
7171
}
7272

7373
var DefaultConfig = Config{
@@ -164,7 +164,6 @@ func NewPredictedLatency(config Config, predictor latencypredictor.PredictorInte
164164
predictedLatency := &PredictedLatency{
165165
typedName: plugin.TypedName{Type: PredictedLatencyPluginType, Name: PredictedLatencyPluginType},
166166
latencypredictor: predictor,
167-
requestBuilder: &DefaultPredictionRequestBuilder{}, // Default, can be customized via SetRequestBuilder
168167
// runningRequestLists is a sync.Map and needs no initialization
169168
headroomStrategy: strategy,
170169
config: config,
@@ -215,16 +214,6 @@ func (s *PredictedLatency) WithName(name string) *PredictedLatency {
215214
return s
216215
}
217216

218-
// SetRequestBuilder sets a custom prediction request builder.
219-
// This allows external packages (e.g., llm-d-inference-scheduler) to customize
220-
// how prediction and training requests are constructed, for example to add
221-
// pod type information for disaggregated serving scenarios.
222-
func (s *PredictedLatency) SetRequestBuilder(builder PredictionRequestBuilder) {
223-
if builder != nil {
224-
s.requestBuilder = builder
225-
}
226-
}
227-
228217
func (s *PredictedLatency) epsilonGreedyAffinityGate(
229218
ctx context.Context,
230219
candidates []endpointPredictionResult,

0 commit comments

Comments (0)