From ca157f0e16f3e2e35e73602d80053aa876bc5617 Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Fri, 12 Dec 2025 11:22:20 -0500 Subject: [PATCH 01/12] prefill and decode predictor and training sidecars support for SLO --- .../inferencepool/PD-SLO-CHART-GUIDE.md | 334 ++++++++++++++++++ .../templates/_latency-predictor.tpl | 121 ++++++- .../inferencepool/templates/epp-config.yaml | 27 ++ .../inferencepool/values-pd-slo-example.yaml | 135 +++++++ config/charts/inferencepool/values.yaml | 73 +++- 5 files changed, 687 insertions(+), 3 deletions(-) create mode 100644 config/charts/inferencepool/PD-SLO-CHART-GUIDE.md create mode 100644 config/charts/inferencepool/values-pd-slo-example.yaml diff --git a/config/charts/inferencepool/PD-SLO-CHART-GUIDE.md b/config/charts/inferencepool/PD-SLO-CHART-GUIDE.md new file mode 100644 index 0000000000..37bf66c30b --- /dev/null +++ b/config/charts/inferencepool/PD-SLO-CHART-GUIDE.md @@ -0,0 +1,334 @@ +# PD-SLO Chart Configuration Guide + +This guide explains how to configure the inferencepool Helm chart for PD (Prefill-Decode) disaggregated scheduling with SLO-aware optimization. + +## Overview + +The chart now supports **two modes** for latency predictors: + +1. **Legacy Mode** (default): Single predictor for unified TTFT/TPOT prediction +2. **PD Mode**: Separate predictors for prefill and decode pods (required for PD-SLO scheduling) + +## Architecture Comparison + +### Legacy Mode (Non-PD) +``` +EPP Pod +├─ EPP Container +└─ Sidecars: + ├─ 1 Training Server (port 8000) + └─ N Prediction Servers (ports 8001+) + +Environment Variables: +- TRAINING_SERVER_URL=http://localhost:8000 +- PREDICTION_SERVER_URL=http://localhost:8001,... +``` + +### PD Mode (PD-SLO) +``` +EPP Pod +├─ EPP Container +└─ Sidecars: + ├─ Prefill Training Server (port 8000) + ├─ Prefill Prediction Server (port 8001) + ├─ Decode Training Server (port 8010) + └─ Decode Prediction Server (port 8011) + +Environment Variables: +- PREFILL_TRAINING_URL=http://localhost:8000 +- PREFILL_PREDICTION_URL=http://localhost:8001 +- DECODE_TRAINING_URL=http://localhost:8010 +- DECODE_PREDICTION_URL=http://localhost:8011 +``` + +## Enabling PD Mode + +### Step 1: Set `pdMode.enabled=true` + +```yaml +inferenceExtension: + latencyPredictor: + enabled: true + + pdMode: + enabled: true # Enable PD mode +``` + +### Step 2: Configure Predictor Types + +Configure each predictor type (prefill, decode) with separate ports and configurations: + +```yaml +inferenceExtension: + latencyPredictor: + enabled: true + pdMode: + enabled: true + predictors: + prefill: + trainingServer: + port: 8000 # Unique port for prefill training + # ... config + predictionServers: + count: 1 + startPort: 8001 + # ... config + + decode: + trainingServer: + port: 8010 # Different port from prefill + # ... config + predictionServers: + count: 1 + startPort: 8011 + # ... config +``` + +### Step 3: Deploy with PD-SLO values + +```bash +helm install my-pool ./inferencepool \ + --values values-pd-slo-example.yaml \ + --namespace llm-d +``` + +## Key Configuration Points + +### Port Allocation + +**Default Port Ranges** (adjust if you have conflicts): + +| Predictor | Training Port | Prediction Ports | +|-----------|---------------|------------------| +| Prefill | 8000 | 8001 | +| Decode | 8010 | 8011 | + +**Important**: Ports must not conflict! Each predictor needs unique ports. 
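To make the uniqueness requirement easy to check before installing, a small helper along these lines can scan a values file for colliding sidecar ports. This is an illustrative sketch rather than anything shipped with the chart; it assumes PyYAML is available and defaults to the `values-pd-slo-example.yaml` that accompanies this guide:

```python
# port_check.py -- illustrative helper (not part of the chart); assumes PyYAML is installed.
# Loads a values file and verifies that every PD-mode sidecar uses a unique port.
import sys
import yaml  # pip install pyyaml


def collect_ports(values_path: str) -> dict:
    """Return {container_name: port} for all PD-mode sidecars described in the values file."""
    with open(values_path) as f:
        values = yaml.safe_load(f)
    predictors = (values.get("inferenceExtension", {})
                        .get("latencyPredictor", {})
                        .get("pdMode", {})
                        .get("predictors", {}))
    ports = {}
    for name, cfg in predictors.items():
        ports[f"training-server-{name}"] = cfg["trainingServer"]["port"]
        start = cfg["predictionServers"]["startPort"]
        for i in range(cfg["predictionServers"]["count"]):
            ports[f"prediction-server-{name}-{i + 1}"] = start + i
    return ports


if __name__ == "__main__":
    ports = collect_ports(sys.argv[1] if len(sys.argv) > 1 else "values-pd-slo-example.yaml")
    duplicates = {p for p in ports.values() if list(ports.values()).count(p) > 1}
    for container, port in sorted(ports.items(), key=lambda kv: kv[1]):
        marker = "  <-- CONFLICT" if port in duplicates else ""
        print(f"{port}  {container}{marker}")
    sys.exit(1 if duplicates else 0)
```

Running it against a values file prints every sidecar with its port and exits non-zero if any port is reused, which catches the most common misconfiguration before Helm ever renders the templates.
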
+ +### Replica Counts + +**Recommended for MVP**: +- Training servers: `count: 1` (one per predictor type) +- Prediction servers: `count: 1` (one per predictor type) + +**Total sidecars**: 4 containers (2 training + 2 prediction) + +**For production**, you can increase prediction server replicas for higher throughput: +```yaml +predictionServers: + count: 3 # 3 prediction server replicas + startPort: 8001 # Ports: 8001, 8002, 8003 +``` + +### Image Configuration + +You can override images per predictor type or use global defaults: + +**Option 1: Use global defaults** (recommended) +```yaml +# Set global defaults in legacy section +trainingServer: + image: + hub: your-docker-repo + name: latencypredictor-training-server + tag: latest + +# PD predictors will inherit these by default +pdMode: + enabled: true + predictors: + prefill: + trainingServer: + port: 8000 + # No image specified - uses global default +``` + +**Option 2: Override per predictor** +```yaml +pdMode: + predictors: + prefill: + trainingServer: + image: + hub: custom-repo + name: prefill-training-server + tag: v2.0 +``` + +### Resource Configuration + +Similarly, resources can be global or per-predictor: + +```yaml +# Global defaults +trainingServer: + resources: + requests: + cpu: "2000m" + memory: "4Gi" + +# Override for specific predictor +pdMode: + predictors: + decode: + trainingServer: + resources: + requests: + cpu: "4000m" # Decode needs more CPU + memory: "8Gi" +``` + +## ConfigMaps Generated + +In PD mode, the chart creates separate ConfigMaps for each predictor: + +**Legacy Mode**: +- `-latency-predictor-training` +- `-latency-predictor-prediction` + +**PD Mode**: +- `-latency-predictor-prefill-training` +- `-latency-predictor-prefill-prediction` +- `-latency-predictor-decode-training` +- `-latency-predictor-decode-prediction` + +## Integration with llm-d-inference-scheduler + +The environment variables generated by this chart are consumed by `llm-d-inference-scheduler`'s `PDPredictorSet`: + +**File**: `llm-d-inference-scheduler/pkg/predictors/pd_predictors.go` + +```go +func NewPDPredictorSet(logger logr.Logger) (*PDPredictorSet, error) { + prefillConfig := &latencypredictor.Config{ + TrainingURL: getEnvOrDefault("PREFILL_TRAINING_URL", ""), + PredictionURLs: getPredictionURLs("PREFILL_PREDICTION_URL"), + // ... + } + + decodeConfig := &latencypredictor.Config{ + TrainingURL: getEnvOrDefault("DECODE_TRAINING_URL", ""), + PredictionURLs: getPredictionURLs("DECODE_PREDICTION_URL"), + // ... + } + // ... +} +``` + +## Validation + +After deployment, verify the sidecars are running: + +```bash +# Check pod has 5 containers (1 EPP + 4 sidecars) +kubectl get pods -n llm-d + +# Describe pod to see all containers +kubectl describe pod -n llm-d + +# Check environment variables in EPP container +kubectl exec -n llm-d -c epp -- env | grep -E "PREFILL|DECODE" + +# Expected output: +# PREFILL_TRAINING_URL=http://localhost:8000 +# PREFILL_PREDICTION_URL=http://localhost:8001 +# DECODE_TRAINING_URL=http://localhost:8010 +# DECODE_PREDICTION_URL=http://localhost:8011 + +# Check predictor health +kubectl exec -n llm-d -c training-server-prefill -- curl http://localhost:8000/healthz +kubectl exec -n llm-d -c training-server-decode -- curl http://localhost:8010/healthz +``` + +## Backward Compatibility + +**Important**: Legacy mode continues to work unchanged. 
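Because the two modes are distinguished purely by which environment variables the chart injects, a consumer can tell them apart at startup. The sketch below is a hypothetical illustration of that contract only, not the scheduler's actual implementation (the real consumer is `PDPredictorSet` in `llm-d-inference-scheduler/pkg/predictors/pd_predictors.go`):

```python
# Hypothetical sketch of the env-var contract between the chart and its consumers.
import os


def detect_predictor_mode() -> str:
    """Return 'pd' when the PD-mode variables are present, 'legacy' otherwise."""
    pd_vars = ("PREFILL_TRAINING_URL", "PREFILL_PREDICTION_URL",
               "DECODE_TRAINING_URL", "DECODE_PREDICTION_URL")
    if all(os.environ.get(v) for v in pd_vars):
        return "pd"
    if os.environ.get("TRAINING_SERVER_URL") and os.environ.get("PREDICTION_SERVER_URL"):
        return "legacy"
    raise RuntimeError("latency predictor environment variables are not set")


print(detect_predictor_mode())
```
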
To use legacy mode: + +```yaml +inferenceExtension: + latencyPredictor: + enabled: true + + # Do NOT set pdMode.enabled or set it to false + pdMode: + enabled: false + + # Use legacy configuration + trainingServer: + port: 8000 + # ... + + predictionServers: + count: 10 + startPort: 8001 + # ... +``` + +This will create the same deployment as before PD mode was added. + +## Troubleshooting + +### Port Conflicts + +**Symptom**: Containers failing to start with "address already in use" + +**Solution**: Ensure each predictor uses unique ports. Check: +```yaml +prefill.trainingServer.port != decode.trainingServer.port +prefill.predictionServers.startPort != decode.predictionServers.startPort +``` + +### Missing Environment Variables + +**Symptom**: llm-d-inference-scheduler logs show "PREFILL_TRAINING_URL must be set" + +**Solution**: Verify `pdMode.enabled=true` and check EPP pod environment: +```bash +kubectl exec -c epp -- env | grep -E "PREFILL|DECODE" +``` + +### ConfigMap Not Found + +**Symptom**: Containers failing with "configmap not found" + +**Solution**: Verify ConfigMaps were created: +```bash +kubectl get configmaps -n llm-d | grep latency-predictor +``` + +Should show: +- `-latency-predictor-prefill-training` +- `-latency-predictor-prefill-prediction` +- `-latency-predictor-decode-training` +- `-latency-predictor-decode-prediction` + +## Example Deployment + +See `values-pd-slo-example.yaml` for a complete working configuration. + +```bash +# Deploy with PD-SLO mode +helm install llm-d-epp ./inferencepool \ + --values values-pd-slo-example.yaml \ + --namespace llm-d \ + --create-namespace + +# Verify deployment +kubectl get pods -n llm-d +kubectl logs -n llm-d -c epp | grep "PD predictor" +``` + +## Summary + +| Feature | Legacy Mode | PD Mode | +|---------|-------------|---------| +| Enable flag | `latencyPredictor.enabled=true` | `latencyPredictor.pdMode.enabled=true` | +| Training servers | 1 | 2 (prefill + decode) | +| Prediction servers | N (configurable) | 1 per predictor type | +| Environment vars | `TRAINING_SERVER_URL`, `PREDICTION_SERVER_URL` | `PREFILL_*`, `DECODE_*` | +| ConfigMaps | 2 | 4 | +| Use case | Non-disaggregated scheduling | PD-SLO disaggregated scheduling | + +--- + +**Next Steps**: After deploying the EPP with PD mode, configure `llm-d-inference-scheduler` to use PD-SLO scheduling. See `llm-d-inference-scheduler/README-PD-SLO.md`. 
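Before doing so, a quick pre-flight probe of the four sidecar URLs can catch wiring problems early. The following stdlib-only sketch is illustrative and not shipped with the chart; it assumes a Python interpreter is available in the EPP container (for example via `kubectl exec ... -c epp`), since the sidecars listen on localhost within the EPP pod:

```python
# Optional pre-flight check (illustrative only): confirm that all four PD
# predictor sidecars answer on /healthz before enabling PD-SLO scheduling.
import os
import urllib.request

ENV_VARS = ["PREFILL_TRAINING_URL", "PREFILL_PREDICTION_URL",
            "DECODE_TRAINING_URL", "DECODE_PREDICTION_URL"]


def check_sidecars(timeout: float = 2.0) -> bool:
    ok = True
    for var in ENV_VARS:
        base = os.environ.get(var)
        if not base:
            print(f"{var} is not set")
            ok = False
            continue
        # *_PREDICTION_URL may be a comma-separated list when count > 1.
        for url in base.split(","):
            try:
                with urllib.request.urlopen(f"{url}/healthz", timeout=timeout) as resp:
                    print(f"{var}: {url} -> {resp.status}")
            except OSError as err:
                print(f"{var}: {url} unreachable ({err})")
                ok = False
    return ok


if __name__ == "__main__":
    raise SystemExit(0 if check_sidecars() else 1)
```
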
diff --git a/config/charts/inferencepool/templates/_latency-predictor.tpl b/config/charts/inferencepool/templates/_latency-predictor.tpl index 4ac7b7ed2e..7f02d0c63e 100644 --- a/config/charts/inferencepool/templates/_latency-predictor.tpl +++ b/config/charts/inferencepool/templates/_latency-predictor.tpl @@ -1,8 +1,23 @@ {{/* Latency Predictor Env +Supports both legacy mode (single predictor) and PD mode (multiple predictors) */}} {{- define "gateway-api-inference-extension.latencyPredictor.env" -}} {{- if .Values.inferenceExtension.latencyPredictor.enabled }} +{{- if .Values.inferenceExtension.latencyPredictor.pdMode.enabled }} +{{/* PD Mode: Generate environment variables for each predictor type */}} +{{- range $predictorName, $predictorConfig := .Values.inferenceExtension.latencyPredictor.pdMode.predictors }} +- name: {{ $predictorName | upper }}_TRAINING_URL + value: "http://localhost:{{ $predictorConfig.trainingServer.port }}" +- name: {{ $predictorName | upper }}_PREDICTION_URL + value: "{{- $count := int $predictorConfig.predictionServers.count -}} + {{- $startPort := int $predictorConfig.predictionServers.startPort -}} + {{- range $i := until $count -}} + {{- if $i }},{{ end }}http://localhost:{{ add $startPort $i }} + {{- end }}" +{{- end }} +{{- else }} +{{/* Legacy Mode: Single predictor environment variables */}} - name: PREDICTION_SERVER_URL value: "{{- $count := int .Values.inferenceExtension.latencyPredictor.predictionServers.count -}} {{- $startPort := int .Values.inferenceExtension.latencyPredictor.predictionServers.startPort -}} @@ -11,6 +26,7 @@ Latency Predictor Env {{- end }}" - name: TRAINING_SERVER_URL value: "http://localhost:{{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }}" +{{- end }} {{- range $key, $value := .Values.inferenceExtension.latencyPredictor.eppEnv }} - name: {{ $key }} value: {{ $value | quote }} @@ -20,9 +36,93 @@ Latency Predictor Env {{/* Latency Predictor Sidecar Containers +Supports both legacy mode (single predictor) and PD mode (multiple predictors) */}} {{- define "gateway-api-inference-extension.latencyPredictor.containers" -}} {{- if .Values.inferenceExtension.latencyPredictor.enabled }} +{{- if .Values.inferenceExtension.latencyPredictor.pdMode.enabled }} +{{/* PD Mode: Create training and prediction servers for each predictor type */}} +{{- range $predictorName, $predictorConfig := .Values.inferenceExtension.latencyPredictor.pdMode.predictors }} +# Training Server for {{ $predictorName }} predictor +- name: training-server-{{ $predictorName }} + image: {{ $predictorConfig.trainingServer.image.hub | default $.Values.inferenceExtension.latencyPredictor.trainingServer.image.hub }}/{{ $predictorConfig.trainingServer.image.name | default $.Values.inferenceExtension.latencyPredictor.trainingServer.image.name }}:{{ $predictorConfig.trainingServer.image.tag | default $.Values.inferenceExtension.latencyPredictor.trainingServer.image.tag }} + imagePullPolicy: {{ $predictorConfig.trainingServer.image.pullPolicy | default $.Values.inferenceExtension.latencyPredictor.trainingServer.image.pullPolicy }} + ports: + - containerPort: {{ $predictorConfig.trainingServer.port }} + name: train-{{ $predictorName }} + livenessProbe: + httpGet: + path: {{ $predictorConfig.trainingServer.livenessProbe.httpGet.path | default "/healthz" }} + port: {{ $predictorConfig.trainingServer.port }} + initialDelaySeconds: {{ $predictorConfig.trainingServer.livenessProbe.initialDelaySeconds | default 30 }} + periodSeconds: {{ 
$predictorConfig.trainingServer.livenessProbe.periodSeconds | default 20 }} + readinessProbe: + httpGet: + path: {{ $predictorConfig.trainingServer.readinessProbe.httpGet.path | default "/readyz" }} + port: {{ $predictorConfig.trainingServer.port }} + initialDelaySeconds: {{ $predictorConfig.trainingServer.readinessProbe.initialDelaySeconds | default 45 }} + periodSeconds: {{ $predictorConfig.trainingServer.readinessProbe.periodSeconds | default 10 }} + resources: + {{- toYaml ($predictorConfig.trainingServer.resources | default $.Values.inferenceExtension.latencyPredictor.trainingServer.resources) | nindent 4 }} + envFrom: + - configMapRef: + name: {{ include "gateway-api-inference-extension.name" $ }}-latency-predictor-{{ $predictorName }}-training + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "training-{{ $predictorName }}" + volumeMounts: + - name: training-server-{{ $predictorName }}-storage + mountPath: /models +{{- range $i := until (int $predictorConfig.predictionServers.count) }} +# Prediction Server {{ add $i 1 }} for {{ $predictorName }} predictor +- name: prediction-server-{{ $predictorName }}-{{ add $i 1 }} + image: {{ $predictorConfig.predictionServers.image.hub | default $.Values.inferenceExtension.latencyPredictor.predictionServers.image.hub }}/{{ $predictorConfig.predictionServers.image.name | default $.Values.inferenceExtension.latencyPredictor.predictionServers.image.name }}:{{ $predictorConfig.predictionServers.image.tag | default $.Values.inferenceExtension.latencyPredictor.predictionServers.image.tag }} + imagePullPolicy: {{ $predictorConfig.predictionServers.image.pullPolicy | default $.Values.inferenceExtension.latencyPredictor.predictionServers.image.pullPolicy }} + command: ["uvicorn"] + args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "{{ add $predictorConfig.predictionServers.startPort $i }}"] + ports: + - containerPort: {{ add $predictorConfig.predictionServers.startPort $i }} + name: pred-{{ $predictorName }}-{{ add $i 1 }} + livenessProbe: + httpGet: + path: {{ $predictorConfig.predictionServers.livenessProbe.httpGet.path | default "/healthz" }} + port: {{ add $predictorConfig.predictionServers.startPort $i }} + initialDelaySeconds: {{ $predictorConfig.predictionServers.livenessProbe.initialDelaySeconds | default 15 }} + periodSeconds: {{ $predictorConfig.predictionServers.livenessProbe.periodSeconds | default 15 }} + readinessProbe: + httpGet: + path: {{ $predictorConfig.predictionServers.readinessProbe.httpGet.path | default "/readyz" }} + port: {{ add $predictorConfig.predictionServers.startPort $i }} + initialDelaySeconds: {{ $predictorConfig.predictionServers.readinessProbe.initialDelaySeconds | default 10 }} + periodSeconds: {{ $predictorConfig.predictionServers.readinessProbe.periodSeconds | default 5 }} + failureThreshold: {{ $predictorConfig.predictionServers.readinessProbe.failureThreshold | default 10 }} + resources: + {{- toYaml ($predictorConfig.predictionServers.resources | default $.Values.inferenceExtension.latencyPredictor.predictionServers.resources) | nindent 4 }} + envFrom: + - configMapRef: + name: {{ include "gateway-api-inference-extension.name" $ }}-latency-predictor-{{ $predictorName }}-prediction + env: + - name: PREDICT_PORT + value: "{{ add $predictorConfig.predictionServers.startPort $i }}" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "prediction-{{ $predictorName }}-{{ add $i 1 }}" + - name: 
TRAINING_SERVER_URL + value: "http://localhost:{{ $predictorConfig.trainingServer.port }}" + volumeMounts: + - name: prediction-server-{{ $predictorName }}-{{ add $i 1 }}-storage + mountPath: /server_models +{{- end }} +{{- end }} +{{- else }} +{{/* Legacy Mode: Single predictor containers */}} # Training Server Sidecar Container - name: training-server image: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.hub }}/{{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.name }}:{{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.tag }} @@ -94,19 +194,36 @@ Latency Predictor Sidecar Containers {{- end }} {{- end }} {{- end }} +{{- end }} {{/* Latency Predictor Volumes +Supports both legacy mode (single predictor) and PD mode (multiple predictors) */}} {{- define "gateway-api-inference-extension.latencyPredictor.volumes" -}} {{- if .Values.inferenceExtension.latencyPredictor.enabled }} +{{- if .Values.inferenceExtension.latencyPredictor.pdMode.enabled }} +{{/* PD Mode: Create volumes for each predictor type */}} +{{- range $predictorName, $predictorConfig := .Values.inferenceExtension.latencyPredictor.pdMode.predictors }} +- name: training-server-{{ $predictorName }}-storage + emptyDir: + sizeLimit: {{ $predictorConfig.trainingServer.volumeSize | default $.Values.inferenceExtension.latencyPredictor.trainingServer.volumeSize }} +{{- range $i := until (int $predictorConfig.predictionServers.count) }} +- name: prediction-server-{{ $predictorName }}-{{ add $i 1 }}-storage + emptyDir: + sizeLimit: {{ $predictorConfig.predictionServers.volumeSize | default $.Values.inferenceExtension.latencyPredictor.predictionServers.volumeSize }} +{{- end }} +{{- end }} +{{- else }} +{{/* Legacy Mode: Single predictor volumes */}} - name: training-server-storage - emptyDir: + emptyDir: sizeLimit: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.volumeSize }} {{- range $i := until (int .Values.inferenceExtension.latencyPredictor.predictionServers.count) }} - name: prediction-server-{{ add $i 1 }}-storage - emptyDir: + emptyDir: sizeLimit: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.volumeSize }} {{- end }} {{- end }} {{- end }} +{{- end }} diff --git a/config/charts/inferencepool/templates/epp-config.yaml b/config/charts/inferencepool/templates/epp-config.yaml index f34d5cf218..be3807d539 100644 --- a/config/charts/inferencepool/templates/epp-config.yaml +++ b/config/charts/inferencepool/templates/epp-config.yaml @@ -75,6 +75,32 @@ data: {{- end }} --- {{- if .Values.inferenceExtension.latencyPredictor.enabled }} +{{- if .Values.inferenceExtension.latencyPredictor.pdMode.enabled }} +{{/* PD Mode: Create ConfigMaps for each predictor type */}} +{{- range $predictorName, $predictorConfig := .Values.inferenceExtension.latencyPredictor.pdMode.predictors }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "gateway-api-inference-extension.name" $ }}-latency-predictor-{{ $predictorName }}-training + namespace: {{ $.Release.Namespace }} +data: + {{- range $key, $value := $predictorConfig.trainingServer.config }} + {{ $key }}: {{ $value | quote }} + {{- end }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "gateway-api-inference-extension.name" $ }}-latency-predictor-{{ $predictorName }}-prediction + namespace: {{ $.Release.Namespace }} +data: + {{- range $key, $value := $predictorConfig.predictionServers.config }} + {{ $key }}: {{ $value | quote }} + {{- end }} +--- +{{- end }} +{{- else }} +{{/* Legacy 
Mode: Single predictor ConfigMaps */}} apiVersion: v1 kind: ConfigMap metadata: @@ -95,3 +121,4 @@ data: {{ $key }}: {{ $value | quote }} {{- end }} {{- end }} +{{- end }} diff --git a/config/charts/inferencepool/values-pd-slo-example.yaml b/config/charts/inferencepool/values-pd-slo-example.yaml new file mode 100644 index 0000000000..a2d70702bb --- /dev/null +++ b/config/charts/inferencepool/values-pd-slo-example.yaml @@ -0,0 +1,135 @@ +# Example values.yaml for PD-SLO Architecture +# This configuration enables separate predictors for prefill and decode pods +# Required for llm-d-inference-scheduler with PD disaggregation + +inferenceExtension: + latencyPredictor: + enabled: true + + # Enable PD Mode for disaggregated scheduling + pdMode: + enabled: true # Set to true to enable PD-SLO architecture + + predictors: + # Prefill predictor: Predicts TTFT for prefill pods + # Training data comes from prefill pod metrics + prefill: + trainingServer: + port: 8000 + image: + hub: your-docker-repo # Update with your Docker registry + name: latencypredictor-training-server + tag: latest + pullPolicy: Always + resources: + requests: + cpu: "2000m" + memory: "4Gi" + limits: + cpu: "4000m" + memory: "8Gi" + volumeSize: "20Gi" + config: + LATENCY_RETRAINING_INTERVAL_SEC: "1" + LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" + LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" + LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" + LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" + LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" + LATENCY_MODEL_TYPE: "xgboost" + LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" + LATENCY_QUANTILE_ALPHA: "0.9" + + predictionServers: + count: 1 # Number of prefill prediction server replicas + startPort: 8001 + image: + hub: your-docker-repo # Update with your Docker registry + name: latencypredictor-prediction-server + tag: latest + pullPolicy: Always + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + volumeSize: "10Gi" + config: + LATENCY_MODEL_TYPE: "xgboost" + PREDICT_HOST: "0.0.0.0" + LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" + LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" + LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" + LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" + + # Decode predictor: Predicts TTFT + TPOT for decode pods + # Training data comes from decode pod metrics + decode: + trainingServer: + port: 8010 # Different port from prefill + image: + hub: your-docker-repo # Update with your Docker registry + name: latencypredictor-training-server + tag: latest + pullPolicy: Always + resources: + requests: + cpu: "2000m" + memory: "4Gi" + limits: + cpu: "4000m" + memory: "8Gi" + volumeSize: "20Gi" + config: + LATENCY_RETRAINING_INTERVAL_SEC: "1" + LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" + LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" + LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" + LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" + LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" + LATENCY_MODEL_TYPE: "xgboost" + LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" + LATENCY_QUANTILE_ALPHA: "0.9" + + predictionServers: + count: 1 # Number of decode prediction server replicas + startPort: 8011 + image: + hub: your-docker-repo # Update with your Docker registry + name: latencypredictor-prediction-server + tag: latest + pullPolicy: Always + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + volumeSize: "10Gi" + config: + 
LATENCY_MODEL_TYPE: "xgboost" + PREDICT_HOST: "0.0.0.0" + LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" + LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" + LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" + LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" + +# This will generate the following environment variables in the EPP container: +# +# - PREFILL_TRAINING_URL=http://localhost:8000 +# - PREFILL_PREDICTION_URL=http://localhost:8001 +# - DECODE_TRAINING_URL=http://localhost:8010 +# - DECODE_PREDICTION_URL=http://localhost:8011 +# +# Total sidecar containers in EPP pod: +# - 1 prefill training server (port 8000) +# - 1 prefill prediction server (port 8001) +# - 1 decode training server (port 8010) +# - 1 decode prediction server (port 8011) +# = 4 total sidecar containers +# +# These environment variables are consumed by llm-d-inference-scheduler's +# PDPredictorSet (pkg/predictors/pd_predictors.go) diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index aba6bbfdae..5919c0c00e 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -74,7 +74,78 @@ inferenceExtension: # Latency Predictor Configuration latencyPredictor: enabled: false - + + # PD (Prefill-Decode) Disaggregation Mode + # When enabled, deploys separate predictors for prefill and decode pods + # Required for PD-SLO scheduling in llm-d-inference-scheduler + pdMode: + enabled: false # Set to true to enable PD-SLO architecture + predictors: + # Prefill predictor (predicts prefill pod TTFT) + prefill: + trainingServer: + port: 8000 + # Optional: Override global image/resources (falls back to global if not specified) + # image: + # hub: path/to/your/docker/repo + # name: latencypredictor-training-server + # tag: latest + # pullPolicy: Always + # resources: + # requests: + # cpu: "2000m" + # memory: "4Gi" + volumeSize: "20Gi" + config: + LATENCY_RETRAINING_INTERVAL_SEC: "1" + LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" + LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" + LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" + LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" + LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" + LATENCY_MODEL_TYPE: "xgboost" + LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" + LATENCY_QUANTILE_ALPHA: "0.9" + predictionServers: + count: 1 # Number of prefill prediction server replicas + startPort: 8001 + volumeSize: "10Gi" + config: + LATENCY_MODEL_TYPE: "xgboost" + PREDICT_HOST: "0.0.0.0" + LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" + LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" + LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" + LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" + + # Decode predictor (predicts decode pod TTFT + TPOT) + decode: + trainingServer: + port: 8010 # Different port from prefill + volumeSize: "20Gi" + config: + LATENCY_RETRAINING_INTERVAL_SEC: "1" + LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" + LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" + LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" + LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" + LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" + LATENCY_MODEL_TYPE: "xgboost" + LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" + LATENCY_QUANTILE_ALPHA: "0.9" + predictionServers: + count: 1 # Number of decode prediction server replicas + startPort: 8011 + volumeSize: "10Gi" + config: + LATENCY_MODEL_TYPE: "xgboost" + PREDICT_HOST: "0.0.0.0" + LOCAL_TTFT_MODEL_PATH: 
"/server_models/ttft.joblib" + LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" + LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" + LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" + + # Legacy Mode: Single Predictor Configuration (used when pdMode.enabled=false) # Training Server Configuration trainingServer: image: From 20b5577faf3ea9d3c62ed6acea04c1a48a14a7b1 Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Fri, 12 Dec 2025 12:39:18 -0500 Subject: [PATCH 02/12] resolve port conflicts --- .../inferencepool/templates/_latency-predictor.tpl | 14 ++++++++++++++ latencypredictor/build-deploy.sh | 0 2 files changed, 14 insertions(+) mode change 100644 => 100755 latencypredictor/build-deploy.sh diff --git a/config/charts/inferencepool/templates/_latency-predictor.tpl b/config/charts/inferencepool/templates/_latency-predictor.tpl index 7f02d0c63e..e17a4b971b 100644 --- a/config/charts/inferencepool/templates/_latency-predictor.tpl +++ b/config/charts/inferencepool/templates/_latency-predictor.tpl @@ -47,6 +47,13 @@ Supports both legacy mode (single predictor) and PD mode (multiple predictors) - name: training-server-{{ $predictorName }} image: {{ $predictorConfig.trainingServer.image.hub | default $.Values.inferenceExtension.latencyPredictor.trainingServer.image.hub }}/{{ $predictorConfig.trainingServer.image.name | default $.Values.inferenceExtension.latencyPredictor.trainingServer.image.name }}:{{ $predictorConfig.trainingServer.image.tag | default $.Values.inferenceExtension.latencyPredictor.trainingServer.image.tag }} imagePullPolicy: {{ $predictorConfig.trainingServer.image.pullPolicy | default $.Values.inferenceExtension.latencyPredictor.trainingServer.image.pullPolicy }} + command: ["uvicorn"] + args: + - "training_server:app" + - "--host" + - "0.0.0.0" + - "--port" + - "{{ $predictorConfig.trainingServer.port }}" ports: - containerPort: {{ $predictorConfig.trainingServer.port }} name: train-{{ $predictorName }} @@ -127,6 +134,13 @@ Supports both legacy mode (single predictor) and PD mode (multiple predictors) - name: training-server image: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.hub }}/{{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.name }}:{{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.tag }} imagePullPolicy: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.pullPolicy }} + command: ["uvicorn"] + args: + - "training_server:app" + - "--host" + - "0.0.0.0" + - "--port" + - "{{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }}" ports: - containerPort: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }} name: training-port diff --git a/latencypredictor/build-deploy.sh b/latencypredictor/build-deploy.sh old mode 100644 new mode 100755 From d9556ef03e1987c7fd17b1d7c472a7474600c543 Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Fri, 12 Dec 2025 14:08:15 -0500 Subject: [PATCH 03/12] make readme succinct --- .../inferencepool/PD-SLO-CHART-GUIDE.md | 368 +++++------------- .../inferencepool/values-pd-slo-example.yaml | 135 ------- 2 files changed, 95 insertions(+), 408 deletions(-) delete mode 100644 config/charts/inferencepool/values-pd-slo-example.yaml diff --git a/config/charts/inferencepool/PD-SLO-CHART-GUIDE.md b/config/charts/inferencepool/PD-SLO-CHART-GUIDE.md index 37bf66c30b..e1f0588ef8 100644 --- a/config/charts/inferencepool/PD-SLO-CHART-GUIDE.md +++ b/config/charts/inferencepool/PD-SLO-CHART-GUIDE.md @@ -1,32 +1,15 @@ # PD-SLO Chart 
Configuration Guide -This guide explains how to configure the inferencepool Helm chart for PD (Prefill-Decode) disaggregated scheduling with SLO-aware optimization. +Configure the inferencepool Helm chart for PD (Prefill-Decode) disaggregated scheduling with SLO-aware optimization. -## Overview +## Modes -The chart now supports **two modes** for latency predictors: +**Legacy Mode** (default): Single predictor, 1 training + N prediction servers +**PD Mode**: Two predictors (prefill + decode), 4 sidecar containers total -1. **Legacy Mode** (default): Single predictor for unified TTFT/TPOT prediction -2. **PD Mode**: Separate predictors for prefill and decode pods (required for PD-SLO scheduling) - -## Architecture Comparison - -### Legacy Mode (Non-PD) +### PD Mode Architecture ``` -EPP Pod -├─ EPP Container -└─ Sidecars: - ├─ 1 Training Server (port 8000) - └─ N Prediction Servers (ports 8001+) - -Environment Variables: -- TRAINING_SERVER_URL=http://localhost:8000 -- PREDICTION_SERVER_URL=http://localhost:8001,... -``` - -### PD Mode (PD-SLO) -``` -EPP Pod +EPP Pod (5 containers) ├─ EPP Container └─ Sidecars: ├─ Prefill Training Server (port 8000) @@ -34,30 +17,16 @@ EPP Pod ├─ Decode Training Server (port 8010) └─ Decode Prediction Server (port 8011) -Environment Variables: +Environment Variables (auto-generated): - PREFILL_TRAINING_URL=http://localhost:8000 - PREFILL_PREDICTION_URL=http://localhost:8001 - DECODE_TRAINING_URL=http://localhost:8010 - DECODE_PREDICTION_URL=http://localhost:8011 ``` -## Enabling PD Mode - -### Step 1: Set `pdMode.enabled=true` - -```yaml -inferenceExtension: - latencyPredictor: - enabled: true - - pdMode: - enabled: true # Enable PD mode -``` - -### Step 2: Configure Predictor Types - -Configure each predictor type (prefill, decode) with separate ports and configurations: +## Quick Start +**Minimal Configuration**: ```yaml inferenceExtension: latencyPredictor: @@ -67,268 +36,121 @@ inferenceExtension: predictors: prefill: trainingServer: - port: 8000 # Unique port for prefill training - # ... config + port: 8000 + resources: + requests: {cpu: "500m", memory: "1Gi"} predictionServers: count: 1 startPort: 8001 - # ... config - + resources: + requests: {cpu: "250m", memory: "512Mi"} decode: trainingServer: - port: 8010 # Different port from prefill - # ... config + port: 8010 # Must differ from prefill! + resources: + requests: {cpu: "500m", memory: "1Gi"} predictionServers: count: 1 startPort: 8011 - # ... config -``` - -### Step 3: Deploy with PD-SLO values - -```bash -helm install my-pool ./inferencepool \ - --values values-pd-slo-example.yaml \ - --namespace llm-d -``` - -## Key Configuration Points - -### Port Allocation - -**Default Port Ranges** (adjust if you have conflicts): - -| Predictor | Training Port | Prediction Ports | -|-----------|---------------|------------------| -| Prefill | 8000 | 8001 | -| Decode | 8010 | 8011 | - -**Important**: Ports must not conflict! Each predictor needs unique ports. 
- -### Replica Counts - -**Recommended for MVP**: -- Training servers: `count: 1` (one per predictor type) -- Prediction servers: `count: 1` (one per predictor type) - -**Total sidecars**: 4 containers (2 training + 2 prediction) - -**For production**, you can increase prediction server replicas for higher throughput: -```yaml -predictionServers: - count: 3 # 3 prediction server replicas - startPort: 8001 # Ports: 8001, 8002, 8003 -``` - -### Image Configuration - -You can override images per predictor type or use global defaults: - -**Option 1: Use global defaults** (recommended) -```yaml -# Set global defaults in legacy section -trainingServer: - image: - hub: your-docker-repo - name: latencypredictor-training-server - tag: latest - -# PD predictors will inherit these by default -pdMode: - enabled: true - predictors: - prefill: - trainingServer: - port: 8000 - # No image specified - uses global default -``` - -**Option 2: Override per predictor** -```yaml -pdMode: - predictors: - prefill: - trainingServer: - image: - hub: custom-repo - name: prefill-training-server - tag: v2.0 -``` - -### Resource Configuration - -Similarly, resources can be global or per-predictor: - -```yaml -# Global defaults -trainingServer: - resources: - requests: - cpu: "2000m" - memory: "4Gi" - -# Override for specific predictor -pdMode: - predictors: - decode: - trainingServer: - resources: - requests: - cpu: "4000m" # Decode needs more CPU - memory: "8Gi" + resources: + requests: {cpu: "250m", memory: "512Mi"} ``` -## ConfigMaps Generated - -In PD mode, the chart creates separate ConfigMaps for each predictor: - -**Legacy Mode**: -- `-latency-predictor-training` -- `-latency-predictor-prediction` - -**PD Mode**: -- `-latency-predictor-prefill-training` -- `-latency-predictor-prefill-prediction` -- `-latency-predictor-decode-training` -- `-latency-predictor-decode-prediction` - -## Integration with llm-d-inference-scheduler - -The environment variables generated by this chart are consumed by `llm-d-inference-scheduler`'s `PDPredictorSet`: - -**File**: `llm-d-inference-scheduler/pkg/predictors/pd_predictors.go` - -```go -func NewPDPredictorSet(logger logr.Logger) (*PDPredictorSet, error) { - prefillConfig := &latencypredictor.Config{ - TrainingURL: getEnvOrDefault("PREFILL_TRAINING_URL", ""), - PredictionURLs: getPredictionURLs("PREFILL_PREDICTION_URL"), - // ... - } - - decodeConfig := &latencypredictor.Config{ - TrainingURL: getEnvOrDefault("DECODE_TRAINING_URL", ""), - PredictionURLs: getPredictionURLs("DECODE_PREDICTION_URL"), - // ... - } - // ... -} -``` - -## Validation - -After deployment, verify the sidecars are running: - +**Deploy**: ```bash -# Check pod has 5 containers (1 EPP + 4 sidecars) -kubectl get pods -n llm-d - -# Describe pod to see all containers -kubectl describe pod -n llm-d - -# Check environment variables in EPP container -kubectl exec -n llm-d -c epp -- env | grep -E "PREFILL|DECODE" - -# Expected output: -# PREFILL_TRAINING_URL=http://localhost:8000 -# PREFILL_PREDICTION_URL=http://localhost:8001 -# DECODE_TRAINING_URL=http://localhost:8010 -# DECODE_PREDICTION_URL=http://localhost:8011 - -# Check predictor health -kubectl exec -n llm-d -c training-server-prefill -- curl http://localhost:8000/healthz -kubectl exec -n llm-d -c training-server-decode -- curl http://localhost:8010/healthz +helm install my-pool ./inferencepool -f values-pd-slo.yaml -n llm-d ``` -## Backward Compatibility +## Configuration Details -**Important**: Legacy mode continues to work unchanged. 
To use legacy mode: +### Required Settings -```yaml -inferenceExtension: - latencyPredictor: - enabled: true +| Component | Setting | Value | Notes | +|-----------|---------|-------|-------| +| Prefill Training | `port` | 8000 | Must differ from decode | +| Prefill Prediction | `startPort` | 8001 | | +| Decode Training | `port` | 8010 | Must differ from prefill | +| Decode Prediction | `startPort` | 8011 | | +| Prediction Count | `count` | 1 | Can increase for production | - # Do NOT set pdMode.enabled or set it to false - pdMode: - enabled: false +### Health Probes (Required) - # Use legacy configuration - trainingServer: - port: 8000 - # ... +Both training and prediction servers **must** have `livenessProbe` and `readinessProbe` configured with `httpGet.path` and `port`: - predictionServers: - count: 10 - startPort: 8001 - # ... -``` - -This will create the same deployment as before PD mode was added. - -## Troubleshooting - -### Port Conflicts - -**Symptom**: Containers failing to start with "address already in use" - -**Solution**: Ensure each predictor uses unique ports. Check: ```yaml -prefill.trainingServer.port != decode.trainingServer.port -prefill.predictionServers.startPort != decode.predictionServers.startPort -``` - -### Missing Environment Variables - -**Symptom**: llm-d-inference-scheduler logs show "PREFILL_TRAINING_URL must be set" - -**Solution**: Verify `pdMode.enabled=true` and check EPP pod environment: -```bash -kubectl exec -c epp -- env | grep -E "PREFILL|DECODE" -``` - -### ConfigMap Not Found - -**Symptom**: Containers failing with "configmap not found" - -**Solution**: Verify ConfigMaps were created: -```bash -kubectl get configmaps -n llm-d | grep latency-predictor -``` - -Should show: +prefill: + trainingServer: + livenessProbe: + httpGet: {path: /healthz, port: 8000} + initialDelaySeconds: 30 + readinessProbe: + httpGet: {path: /readyz, port: 8000} + initialDelaySeconds: 45 + predictionServers: + livenessProbe: + httpGet: {path: /healthz} + initialDelaySeconds: 15 + readinessProbe: + httpGet: {path: /readyz} + initialDelaySeconds: 10 +``` + +### Images and Resources + +**Override per predictor** or **use global defaults** from legacy section (see `values.yaml`). + +## Generated Resources + +**ConfigMaps** (4 in PD mode): - `-latency-predictor-prefill-training` - `-latency-predictor-prefill-prediction` - `-latency-predictor-decode-training` - `-latency-predictor-decode-prediction` -## Example Deployment +**Environment Variables** (auto-injected into EPP container): +- `PREFILL_TRAINING_URL`, `PREFILL_PREDICTION_URL` +- `DECODE_TRAINING_URL`, `DECODE_PREDICTION_URL` -See `values-pd-slo-example.yaml` for a complete working configuration. +These are consumed by `llm-d-inference-scheduler`'s `PDPredictorSet` for latency prediction. -```bash -# Deploy with PD-SLO mode -helm install llm-d-epp ./inferencepool \ - --values values-pd-slo-example.yaml \ - --namespace llm-d \ - --create-namespace +## Validation -# Verify deployment +```bash +# Check 5 containers (1 EPP + 4 sidecars) kubectl get pods -n llm-d -kubectl logs -n llm-d -c epp | grep "PD predictor" -``` +kubectl describe pod -n llm-d -## Summary +# Verify environment variables +kubectl exec -n llm-d -c epp -- env | grep -E "PREFILL|DECODE" +# Expected: PREFILL_TRAINING_URL=http://localhost:8000, etc. 
-| Feature | Legacy Mode | PD Mode | -|---------|-------------|---------| -| Enable flag | `latencyPredictor.enabled=true` | `latencyPredictor.pdMode.enabled=true` | -| Training servers | 1 | 2 (prefill + decode) | -| Prediction servers | N (configurable) | 1 per predictor type | -| Environment vars | `TRAINING_SERVER_URL`, `PREDICTION_SERVER_URL` | `PREFILL_*`, `DECODE_*` | -| ConfigMaps | 2 | 4 | -| Use case | Non-disaggregated scheduling | PD-SLO disaggregated scheduling | +# Test predictor health +kubectl exec -n llm-d -c training-server-prefill -- curl http://localhost:8000/healthz +kubectl exec -n llm-d -c training-server-decode -- curl http://localhost:8010/healthz +``` ---- +## Troubleshooting -**Next Steps**: After deploying the EPP with PD mode, configure `llm-d-inference-scheduler` to use PD-SLO scheduling. See `llm-d-inference-scheduler/README-PD-SLO.md`. +| Issue | Symptom | Solution | +|-------|---------|----------| +| Port conflict | `address already in use` | Ensure prefill/decode ports differ (8000 vs 8010) | +| Missing env vars | `PREFILL_TRAINING_URL must be set` | Verify `pdMode.enabled=true`, check `kubectl exec -c epp -- env` | +| ConfigMap missing | `configmap not found` | Check `kubectl get cm -n llm-d \| grep latency-predictor` (should show 4) | +| Pod pending | `Insufficient cpu/memory` | Reduce resource requests (500m/1Gi for training, 250m/512Mi for prediction) | +| Probe failures | Containers restarting | Verify probe paths (`/healthz`, `/readyz`) and ports are configured | + +## Important Notes + +1. **PD Mode**: Set `pdMode.enabled=true` to enable dual-predictor architecture +2. **Ports**: Training servers must use different ports (prefill: 8000, decode: 8010) +3. **Probes**: Both `livenessProbe` and `readinessProbe` with `httpGet.path` and `port` are required +4. **Resources**: Start with 500m/1Gi (training), 250m/512Mi (prediction) for MVP +5. **Prediction Count**: `count: 1` for MVP, increase for production throughput +6. **Backward Compatibility**: Legacy mode (single predictor) still works when `pdMode.enabled=false` +7. **Joint Optimization**: Currently uses fallback (best pod from each profile). Full joint optimization TBD. 
+ +## Reference + +- Chart values: `values.yaml` +- Scheduler guide: `llm-d-inference-scheduler/PD-SLO-GUIDE.md` +- Chart template: `templates/_latency-predictor.tpl` diff --git a/config/charts/inferencepool/values-pd-slo-example.yaml b/config/charts/inferencepool/values-pd-slo-example.yaml deleted file mode 100644 index a2d70702bb..0000000000 --- a/config/charts/inferencepool/values-pd-slo-example.yaml +++ /dev/null @@ -1,135 +0,0 @@ -# Example values.yaml for PD-SLO Architecture -# This configuration enables separate predictors for prefill and decode pods -# Required for llm-d-inference-scheduler with PD disaggregation - -inferenceExtension: - latencyPredictor: - enabled: true - - # Enable PD Mode for disaggregated scheduling - pdMode: - enabled: true # Set to true to enable PD-SLO architecture - - predictors: - # Prefill predictor: Predicts TTFT for prefill pods - # Training data comes from prefill pod metrics - prefill: - trainingServer: - port: 8000 - image: - hub: your-docker-repo # Update with your Docker registry - name: latencypredictor-training-server - tag: latest - pullPolicy: Always - resources: - requests: - cpu: "2000m" - memory: "4Gi" - limits: - cpu: "4000m" - memory: "8Gi" - volumeSize: "20Gi" - config: - LATENCY_RETRAINING_INTERVAL_SEC: "1" - LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" - LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" - LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" - LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" - LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" - LATENCY_MODEL_TYPE: "xgboost" - LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" - LATENCY_QUANTILE_ALPHA: "0.9" - - predictionServers: - count: 1 # Number of prefill prediction server replicas - startPort: 8001 - image: - hub: your-docker-repo # Update with your Docker registry - name: latencypredictor-prediction-server - tag: latest - pullPolicy: Always - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - volumeSize: "10Gi" - config: - LATENCY_MODEL_TYPE: "xgboost" - PREDICT_HOST: "0.0.0.0" - LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" - LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" - LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" - LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" - - # Decode predictor: Predicts TTFT + TPOT for decode pods - # Training data comes from decode pod metrics - decode: - trainingServer: - port: 8010 # Different port from prefill - image: - hub: your-docker-repo # Update with your Docker registry - name: latencypredictor-training-server - tag: latest - pullPolicy: Always - resources: - requests: - cpu: "2000m" - memory: "4Gi" - limits: - cpu: "4000m" - memory: "8Gi" - volumeSize: "20Gi" - config: - LATENCY_RETRAINING_INTERVAL_SEC: "1" - LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" - LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" - LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" - LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" - LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" - LATENCY_MODEL_TYPE: "xgboost" - LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" - LATENCY_QUANTILE_ALPHA: "0.9" - - predictionServers: - count: 1 # Number of decode prediction server replicas - startPort: 8011 - image: - hub: your-docker-repo # Update with your Docker registry - name: latencypredictor-prediction-server - tag: latest - pullPolicy: Always - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - volumeSize: "10Gi" - config: - 
LATENCY_MODEL_TYPE: "xgboost" - PREDICT_HOST: "0.0.0.0" - LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" - LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" - LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" - LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" - -# This will generate the following environment variables in the EPP container: -# -# - PREFILL_TRAINING_URL=http://localhost:8000 -# - PREFILL_PREDICTION_URL=http://localhost:8001 -# - DECODE_TRAINING_URL=http://localhost:8010 -# - DECODE_PREDICTION_URL=http://localhost:8011 -# -# Total sidecar containers in EPP pod: -# - 1 prefill training server (port 8000) -# - 1 prefill prediction server (port 8001) -# - 1 decode training server (port 8010) -# - 1 decode prediction server (port 8011) -# = 4 total sidecar containers -# -# These environment variables are consumed by llm-d-inference-scheduler's -# PDPredictorSet (pkg/predictors/pd_predictors.go) From 9d3c9db3df57b324205dd6ff0c8d038d0c2fade8 Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Tue, 16 Dec 2025 16:51:58 -0500 Subject: [PATCH 04/12] add podType to prediction and training --- sidecars/latencypredictorasync/types.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sidecars/latencypredictorasync/types.go b/sidecars/latencypredictorasync/types.go index c8eadefe23..2d2ebb67c7 100644 --- a/sidecars/latencypredictorasync/types.go +++ b/sidecars/latencypredictorasync/types.go @@ -133,6 +133,7 @@ type TrainingEntry struct { ActualTTFT float64 `json:"actual_ttft_ms"` ActualTPOT float64 `json:"actual_tpot_ms"` PrefixCacheScore float64 `json:"prefix_cache_score"` + PodType string `json:"pod_type,omitempty"` // "prefill", "decode", or "" for monolithic Timestamp time.Time `json:"timestamp"` } @@ -147,6 +148,7 @@ type PredictionRequest struct { NumRequestRunning int `json:"num_request_running"` NumTokensGenerated int `json:"num_tokens_generated"` PrefixCacheScore float64 `json:"prefix_cache_score"` + PodType string `json:"pod_type,omitempty"` // "prefill", "decode", or "" for monolithic } type PredictionResponse struct { From 3da1f2a2628b6914a6d87a4a87575ccd25659925 Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Tue, 16 Dec 2025 17:48:54 -0500 Subject: [PATCH 05/12] add pod type as categorical data to xgboost model --- latencypredictor/prediction_server.py | 36 ++++++++++++++------ latencypredictor/training_server.py | 48 +++++++++++++++++++++------ 2 files changed, 63 insertions(+), 21 deletions(-) diff --git a/latencypredictor/prediction_server.py b/latencypredictor/prediction_server.py index 581f83421c..263f55dc9d 100644 --- a/latencypredictor/prediction_server.py +++ b/latencypredictor/prediction_server.py @@ -219,14 +219,27 @@ def is_ready(self) -> bool: def _prepare_features_with_interaction(self, df: pd.DataFrame, model_type: str) -> pd.DataFrame: """ Prepare features with interaction terms to match training server. 
- + Args: df: DataFrame with raw features model_type: 'ttft' or 'tpot' - + Returns: DataFrame with engineered features including interactions """ + # Encode pod_type as categorical (common for both TTFT and TPOT) + # Convert to categorical with known categories for consistent encoding + if 'pod_type' in df.columns: + df['pod_type'] = df['pod_type'].fillna('') # Handle NaN + df['pod_type_cat'] = pd.Categorical( + df['pod_type'], + categories=['', 'prefill', 'decode'], # '' = monolithic, prefill, decode + ordered=False + ) + else: + # If pod_type column doesn't exist, create it as empty (monolithic) + df['pod_type_cat'] = pd.Categorical([''] * len(df), categories=['', 'prefill', 'decode'], ordered=False) + if model_type == "ttft": # Create interaction: prefix score * input length df['effective_input_tokens'] = (1-df['prefix_cache_score']) * df['input_token_length'] @@ -238,9 +251,9 @@ def _prepare_features_with_interaction(self, df: pd.DataFrame, model_type: str) # make it categorical for tree models (safe for LGB, XGB with enable_categorical) df['prefill_score_bucket'] = pd.Categorical(df['prefill_score_bucket'], categories=[0,1,2,3], ordered=True) - - - # Return TTFT features with interaction + + + # Return TTFT features with interaction and pod_type feature_cols = [ 'kv_cache_percentage', 'input_token_length', @@ -248,11 +261,12 @@ def _prepare_features_with_interaction(self, df: pd.DataFrame, model_type: str) 'num_request_running', 'prefix_cache_score', 'effective_input_tokens', - 'prefill_score_bucket' + 'prefill_score_bucket', + 'pod_type_cat' ] - + return df[feature_cols] - + else: # tpot # TPOT doesn't use prefix_cache_score, so no interaction needed feature_cols = [ @@ -260,9 +274,10 @@ def _prepare_features_with_interaction(self, df: pd.DataFrame, model_type: str) 'input_token_length', 'num_request_waiting', 'num_request_running', - 'num_tokens_generated' + 'num_tokens_generated', + 'pod_type_cat' ] - + return df[feature_cols] def load_models(self) -> bool: @@ -471,6 +486,7 @@ class PredictionRequest(BaseModel): num_request_running: int = Field(..., ge=0) num_tokens_generated: int = Field(..., ge=0) prefix_cache_score: float = Field(..., ge=0.0, le=1.0, description="Prefix cache hit ratio score (0.0 to 1.0)") + pod_type: Optional[str] = Field(default="", description="Pod type: 'prefill', 'decode', or '' for monolithic") class PredictionResponse(BaseModel): diff --git a/latencypredictor/training_server.py b/latencypredictor/training_server.py index 3e1e2751f4..7e7b09eda4 100644 --- a/latencypredictor/training_server.py +++ b/latencypredictor/training_server.py @@ -336,14 +336,27 @@ def _store_descaled_coefficients(self, model, scaler, feature_names, model_name) def _prepare_features_with_interaction(self, df: pd.DataFrame, model_type: str) -> pd.DataFrame: """ Prepare features with interaction terms for better model learning. 
- + Args: df: DataFrame with raw features model_type: 'ttft' or 'tpot' - + Returns: DataFrame with engineered features including interactions """ + # Encode pod_type as categorical (common for both TTFT and TPOT) + # Convert to categorical with known categories for consistent encoding + if 'pod_type' in df.columns: + df['pod_type'] = df['pod_type'].fillna('') # Handle NaN + df['pod_type_cat'] = pd.Categorical( + df['pod_type'], + categories=['', 'prefill', 'decode'], # '' = monolithic, prefill, decode + ordered=False + ) + else: + # If pod_type column doesn't exist, create it as empty (monolithic) + df['pod_type_cat'] = pd.Categorical([''] * len(df), categories=['', 'prefill', 'decode'], ordered=False) + if model_type == "ttft": # Create interaction: prefix score * input length # This captures that prefix caching benefit scales with input size @@ -358,7 +371,7 @@ def _prepare_features_with_interaction(self, df: pd.DataFrame, model_type: str) df['prefill_score_bucket'] = pd.Categorical(df['prefill_score_bucket'], categories=[0,1,2,3], ordered=True) - # Return TTFT features with interaction + # Return TTFT features with interaction and pod_type feature_cols = [ 'kv_cache_percentage', 'input_token_length', @@ -366,11 +379,12 @@ def _prepare_features_with_interaction(self, df: pd.DataFrame, model_type: str) 'num_request_running', 'prefix_cache_score', 'effective_input_tokens', - 'prefill_score_bucket' + 'prefill_score_bucket', + 'pod_type_cat' ] - + return df[feature_cols] - + else: # tpot # TPOT doesn't use prefix_cache_score, so no interaction needed feature_cols = [ @@ -378,9 +392,10 @@ def _prepare_features_with_interaction(self, df: pd.DataFrame, model_type: str) 'input_token_length', 'num_request_waiting', 'num_request_running', - 'num_tokens_generated' + 'num_tokens_generated', + 'pod_type_cat' ] - + return df[feature_cols] @@ -816,15 +831,24 @@ def predict(self, features: dict) -> Tuple[float, float, float, float]: if not isinstance(features[f], (int, float)): raise ValueError(f"Invalid type for feature {f}: expected number") - # Updated TTFT features to include prefix_cache_score + # Updated TTFT features to include prefix_cache_score and pod_type ttft_cols = ['kv_cache_percentage','input_token_length','num_request_waiting','num_request_running','prefix_cache_score'] tpot_cols = ['kv_cache_percentage','input_token_length','num_request_waiting','num_request_running','num_tokens_generated'] - + # Create DataFrames for predictions df_ttft = pd.DataFrame([{col: features[col] for col in ttft_cols}]) - # Add interaction term for TTFT + # Add pod_type if present (otherwise _prepare_features_with_interaction will default to '') + if 'pod_type' in features: + df_ttft['pod_type'] = features['pod_type'] + # Add interaction term for TTFT (includes pod_type encoding) df_ttft = self._prepare_features_with_interaction(df_ttft, model_type="ttft") + df_tpot = pd.DataFrame([{col: features[col] for col in tpot_cols}]) + # Add pod_type if present + if 'pod_type' in features: + df_tpot['pod_type'] = features['pod_type'] + # Add pod_type encoding for TPOT + df_tpot = self._prepare_features_with_interaction(df_tpot, model_type="tpot") if self.model_type == ModelType.BAYESIAN_RIDGE: # Use scaling for Bayesian Ridge @@ -1302,6 +1326,7 @@ class TrainingEntry(BaseModel): actual_tpot_ms: float = Field(..., ge=0.0) num_tokens_generated: int = Field(..., ge=0) prefix_cache_score: float = Field(..., ge=0.0, le=1.0, description="Prefix cache hit ratio score (0.0 to 1.0)") + pod_type: Optional[str] = 
Field(default="", description="Pod type: 'prefill', 'decode', or '' for monolithic") timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) class PredictionRequest(BaseModel): @@ -1311,6 +1336,7 @@ class PredictionRequest(BaseModel): num_request_running: int = Field(..., ge=0) num_tokens_generated: int = Field(..., ge=0) prefix_cache_score: float = Field(..., ge=0.0, le=1.0, description="Prefix cache hit ratio score (0.0 to 1.0)") + pod_type: Optional[str] = Field(default="", description="Pod type: 'prefill', 'decode', or '' for monolithic") class PredictionResponse(BaseModel): ttft_ms: float = Field(..., description=f"Predicted {settings.QUANTILE_ALPHA:.0%} quantile TTFT in milliseconds") From 5ab1540607bd833fdc350bf86fdf65c9d2db1396 Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Tue, 16 Dec 2025 18:03:14 -0500 Subject: [PATCH 06/12] bayesian ridge does not have cat data, use 1 hot encoding instead for pod_type --- latencypredictor/prediction_server.py | 21 ++++++++++++++++++--- latencypredictor/training_server.py | 20 ++++++++++++++++++-- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/latencypredictor/prediction_server.py b/latencypredictor/prediction_server.py index 263f55dc9d..261d2e0687 100644 --- a/latencypredictor/prediction_server.py +++ b/latencypredictor/prediction_server.py @@ -348,10 +348,17 @@ def predict(self, features: dict) -> Tuple[float, float]: #df_tpot = pd.DataFrame([tpot_raw_data]) if self.model_type == ModelType.BAYESIAN_RIDGE: - + # Bayesian Ridge can't handle categorical features directly + # Drop categorical bucket, but one-hot encode pod_type ttft_for_scale = df_ttft.drop(columns=['prefill_score_bucket'], errors='ignore') + if 'pod_type_cat' in ttft_for_scale.columns: + ttft_for_scale = pd.get_dummies(ttft_for_scale, columns=['pod_type_cat'], prefix='pod_type', drop_first=False) ttft_scaled = self.ttft_scaler.transform(ttft_for_scale) - tpot_scaled = self.tpot_scaler.transform(df_tpot) + + tpot_for_scale = df_tpot.copy() + if 'pod_type_cat' in tpot_for_scale.columns: + tpot_for_scale = pd.get_dummies(tpot_for_scale, columns=['pod_type_cat'], prefix='pod_type', drop_first=False) + tpot_scaled = self.tpot_scaler.transform(tpot_for_scale) ttft_pred_mean, ttft_std = self.ttft_model.predict(ttft_scaled, return_std=True) tpot_pred_mean, tpot_std = self.tpot_model.predict(tpot_scaled, return_std=True) @@ -431,9 +438,17 @@ def predict_batch(self, features_list: List[dict]) -> Tuple[np.ndarray, np.ndarr #df_tpot_batch = pd.DataFrame(tpot_raw_data) if self.model_type == ModelType.BAYESIAN_RIDGE: + # Bayesian Ridge can't handle categorical features directly + # Drop categorical bucket, but one-hot encode pod_type ttft_for_scale = df_ttft_batch.drop(columns=['prefill_score_bucket'], errors='ignore') + if 'pod_type_cat' in ttft_for_scale.columns: + ttft_for_scale = pd.get_dummies(ttft_for_scale, columns=['pod_type_cat'], prefix='pod_type', drop_first=False) ttft_scaled = self.ttft_scaler.transform(ttft_for_scale) - tpot_scaled = self.tpot_scaler.transform(df_tpot_batch) + + tpot_for_scale = df_tpot_batch.copy() + if 'pod_type_cat' in tpot_for_scale.columns: + tpot_for_scale = pd.get_dummies(tpot_for_scale, columns=['pod_type_cat'], prefix='pod_type', drop_first=False) + tpot_scaled = self.tpot_scaler.transform(tpot_for_scale) ttft_pred_mean, ttft_std = self.ttft_model.predict(ttft_scaled, return_std=True) tpot_pred_mean, tpot_std = self.tpot_model.predict(tpot_scaled, return_std=True) diff --git a/latencypredictor/training_server.py 
b/latencypredictor/training_server.py index 7e7b09eda4..2964b59ed6 100644 --- a/latencypredictor/training_server.py +++ b/latencypredictor/training_server.py @@ -434,11 +434,22 @@ def _train_model_with_scaling(self, features: pd.DataFrame, target: pd.Series, m raise ValueError("Empty training data") if features.isnull().any().any() or target.isnull().any(): raise ValueError("Training data contains NaN values") - if np.isinf(features.values).any() or np.isinf(target.values).any(): + # Check only numeric columns for infinity (categorical columns cause isinf to fail) + numeric_features = features.select_dtypes(include=[np.number]) + if len(numeric_features.columns) > 0 and np.isinf(numeric_features.values).any(): raise ValueError("Training data contains infinite values") + if np.isinf(target.values).any(): + raise ValueError("Target data contains infinite values") if self.model_type == ModelType.BAYESIAN_RIDGE: + # Bayesian Ridge can't handle categorical features directly + # Drop categorical bucket, but one-hot encode pod_type to preserve the information features = features.drop(columns=['prefill_score_bucket'], errors='ignore') + + # One-hot encode pod_type_cat if it exists (converts to numeric 0/1 columns) + if 'pod_type_cat' in features.columns: + features = pd.get_dummies(features, columns=['pod_type_cat'], prefix='pod_type', drop_first=False) + scaler = StandardScaler() features_scaled = scaler.fit_transform(features) if np.isnan(features_scaled).any() or np.isinf(features_scaled).any(): @@ -851,9 +862,14 @@ def predict(self, features: dict) -> Tuple[float, float, float, float]: df_tpot = self._prepare_features_with_interaction(df_tpot, model_type="tpot") if self.model_type == ModelType.BAYESIAN_RIDGE: - # Use scaling for Bayesian Ridge + # Use scaling for Bayesian Ridge - drop categorical bucket, one-hot encode pod_type df_ttft = df_ttft.drop(columns=['prefill_score_bucket'], errors='ignore') + if 'pod_type_cat' in df_ttft.columns: + df_ttft = pd.get_dummies(df_ttft, columns=['pod_type_cat'], prefix='pod_type', drop_first=False) ttft_scaled = self.ttft_scaler.transform(df_ttft) + + if 'pod_type_cat' in df_tpot.columns: + df_tpot = pd.get_dummies(df_tpot, columns=['pod_type_cat'], prefix='pod_type', drop_first=False) tpot_scaled = self.tpot_scaler.transform(df_tpot) ttft_pred_mean, ttft_std = self.ttft_model.predict(ttft_scaled, return_std=True) From 160658be8eee232c134a84a557713510d5ff3d39 Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Tue, 16 Dec 2025 18:21:36 -0500 Subject: [PATCH 07/12] bug with tpot pod_type feature --- latencypredictor/training_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/latencypredictor/training_server.py b/latencypredictor/training_server.py index 2964b59ed6..ef48bda225 100644 --- a/latencypredictor/training_server.py +++ b/latencypredictor/training_server.py @@ -661,6 +661,7 @@ def _create_default_model(self, model_type: str) -> Union[Tuple[BayesianRidge, S 'num_request_running': [0, ], 'num_tokens_generated': [1,] }) + features = self._prepare_features_with_interaction(features, "tpot") target = pd.Series([10.0]) return self._train_model_with_scaling(features, target, model_name=model_type) except Exception as e: From f71b83ffb9df00726d76a8583e3c94d22ea8c5f4 Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Tue, 16 Dec 2025 18:35:35 -0500 Subject: [PATCH 08/12] modify hardocoded feature orders --- latencypredictor/training_server.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git 
a/latencypredictor/training_server.py b/latencypredictor/training_server.py index ef48bda225..2e040b0c37 100644 --- a/latencypredictor/training_server.py +++ b/latencypredictor/training_server.py @@ -464,10 +464,10 @@ def _train_model_with_scaling(self, features: pd.DataFrame, target: pd.Series, m elif self.model_type == ModelType.XGBOOST: # XGBoost with quantile regression if model_name == "ttft": - # enforce your TTFT feature order + # enforce your TTFT feature order (including pod_type_cat) ttft_order = [ "kv_cache_percentage", "input_token_length", "num_request_waiting", - "num_request_running", "prefix_cache_score", "effective_input_tokens", "prefill_score_bucket" + "num_request_running", "prefix_cache_score", "effective_input_tokens", "prefill_score_bucket", "pod_type_cat" ] if list(features.columns) != ttft_order: try: @@ -517,15 +517,15 @@ def _train_model_with_scaling(self, features: pd.DataFrame, target: pd.Series, m elif model_name == "tpot": - tpot_order = ["kv_cache_percentage","input_token_length","num_request_waiting","num_request_running","num_tokens_generated"] + tpot_order = ["kv_cache_percentage","input_token_length","num_request_waiting","num_request_running","num_tokens_generated","pod_type_cat"] if list(features.columns) != tpot_order: try: features = features[tpot_order] except Exception as _: raise ValueError(f"TPOT features must be exactly {tpot_order}; got {list(features.columns)}") - mono_str = "(1,1,1,1,1)" + mono_str = "(1,1,1,1,1,0)" # pod_type_cat has no monotone constraint else: - mono_str = "(0,0,0,0,0)" # default + mono_str = "(0,0,0,0,0,0)" # default (6 features with pod_type_cat) model = xgb.XGBRegressor( n_estimators=200, # Number of trees to build (moderate value for balanced accuracy and speed) max_depth=6, # Depth of trees; 6 is typically a sweet spot balancing bias/variance @@ -689,10 +689,10 @@ def train(self): df_ttft = self._prepare_features_with_interaction(raw_ttft.copy(), model_type="ttft") print(f"TTFT training data size: {len(df_ttft)} with sample data: {df_ttft.columns.tolist()}") if len(df_ttft) >= settings.MIN_SAMPLES_FOR_RETRAIN: - # Updated TTFT features to include prefix_cache_score + # Updated TTFT features to include prefix_cache_score and pod_type_cat ttft_feature_cols_tree = [ 'kv_cache_percentage','input_token_length','num_request_waiting', - 'num_request_running','prefix_cache_score','effective_input_tokens','prefill_score_bucket' + 'num_request_running','prefix_cache_score','effective_input_tokens','prefill_score_bucket','pod_type_cat' ] ttft_feature_cols_br = [ 'kv_cache_percentage','input_token_length','num_request_waiting', @@ -702,7 +702,8 @@ def train(self): # Build X_ttft for all model types, then trim for BR X_ttft = df_ttft[ttft_feature_cols_tree] if self.model_type == ModelType.BAYESIAN_RIDGE: - X_ttft = X_ttft[ttft_feature_cols_br] + # For Bayesian Ridge, drop categorical features (handled by one-hot encoding in _train_model_with_scaling) + X_ttft = df_ttft # Use full df_ttft which will be processed by _train_model_with_scaling y_ttft = raw_ttft['actual_ttft_ms'] @@ -761,8 +762,8 @@ def train(self): df_tpot = pd.DataFrame(tpot_snap).dropna() df_tpot = df_tpot[df_tpot['actual_tpot_ms'] > 0] if len(df_tpot) >= settings.MIN_SAMPLES_FOR_RETRAIN: - # TPOT features remain unchanged - X_tpot = df_tpot[['kv_cache_percentage', 'input_token_length', 'num_request_waiting', 'num_request_running', 'num_tokens_generated']] + # TPOT features - use feature preparation to add pod_type_cat + X_tpot = 
self._prepare_features_with_interaction(df_tpot.copy(), model_type="tpot") y_tpot = df_tpot['actual_tpot_ms'] try: result = self._train_model_with_scaling(X_tpot, y_tpot, model_name="tpot") From 5e0c987ac4043324eeb25404f5fb4fdfc6febc9b Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Tue, 16 Dec 2025 18:53:36 -0500 Subject: [PATCH 09/12] add pod_Type_cat to training in other edge cases --- latencypredictor/training_server.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/latencypredictor/training_server.py b/latencypredictor/training_server.py index 2e040b0c37..ba4ea8e644 100644 --- a/latencypredictor/training_server.py +++ b/latencypredictor/training_server.py @@ -595,23 +595,24 @@ def _calculate_quantile_metrics_on_test(self, model, scaler, test_data, model_na if self.model_type == ModelType.BAYESIAN_RIDGE: feature_cols = [ 'kv_cache_percentage','input_token_length','num_request_waiting', - 'num_request_running','prefix_cache_score','effective_input_tokens' + 'num_request_running','prefix_cache_score','effective_input_tokens','pod_type_cat' ] else: # XGBoost or LightGBM feature_cols = [ 'kv_cache_percentage','input_token_length','num_request_waiting', - 'num_request_running','prefix_cache_score','effective_input_tokens','prefill_score_bucket' + 'num_request_running','prefix_cache_score','effective_input_tokens','prefill_score_bucket','pod_type_cat' ] else: # tpot - feature_cols = ['kv_cache_percentage', 'input_token_length', - 'num_request_waiting', 'num_request_running', 'num_tokens_generated'] + feature_cols = ['kv_cache_percentage', 'input_token_length', + 'num_request_waiting', 'num_request_running', 'num_tokens_generated', 'pod_type_cat'] - X = df_features[feature_cols] # ✅ Now has properly typed categorical! 
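The one-hot changes above hinge on `pd.get_dummies` yielding the same `pod_type_*` columns at fit time and at predict time; a single-row frame only produces a dummy column for the one category it actually contains. Below is a minimal sketch of pinning the dummy columns to a fixed layout. It assumes the three categories from the `pod_type` field description (`''`, `'prefill'`, `'decode'`); the `encode_pod_type` helper, the `POD_TYPES` list, and the toy frames are illustrative, not code from this series.

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Assumed categories, per the pod_type field description ('' = monolithic).
POD_TYPES = ["", "prefill", "decode"]
DUMMY_COLS = [f"pod_type_{t}" for t in POD_TYPES]

def encode_pod_type(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode pod_type_cat and pin the dummy columns to a fixed order,
    so fit-time and predict-time frames expose identical columns to the scaler."""
    if "pod_type_cat" not in df.columns:
        return df
    encoded = pd.get_dummies(df, columns=["pod_type_cat"], prefix="pod_type", drop_first=False)
    numeric_cols = [c for c in encoded.columns if c not in DUMMY_COLS]
    # reindex adds any dummy column absent from this batch, filled with 0.
    return encoded.reindex(columns=numeric_cols + DUMMY_COLS, fill_value=0)

# Fit on a toy two-row frame, then transform a single 'decode' row.
train = pd.DataFrame({
    "kv_cache_percentage": [0.2, 0.6],
    "num_tokens_generated": [4, 32],
    "pod_type_cat": ["prefill", "decode"],
})
scaler = StandardScaler().fit(encode_pod_type(train))
single = pd.DataFrame({"kv_cache_percentage": [0.4],
                       "num_tokens_generated": [8],
                       "pod_type_cat": ["decode"]})
print(scaler.transform(encode_pod_type(single)))
```

The same column-pinning concern applies to `predict_batch` and to the scaler fitted in `_train_model_with_scaling`; the sketch only demonstrates shape consistency, not the servers' actual feature sets.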
- - - + X = df_features[feature_cols] + # For Bayesian Ridge, one-hot encode pod_type_cat before scaling if self.model_type == ModelType.BAYESIAN_RIDGE and scaler is not None: + # One-hot encode pod_type_cat (Bayesian Ridge can't handle categorical features) + if 'pod_type_cat' in X.columns: + X = pd.get_dummies(X, columns=['pod_type_cat'], prefix='pod_type', drop_first=False) X = scaler.transform(X) y_true = df_raw[target_col].values @@ -1211,12 +1212,13 @@ def emit_metrics(model, coefficients, feats, prefix): if self.model_type == ModelType.BAYESIAN_RIDGE: ttft_feats = ["kv_cache_percentage","input_token_length","num_request_waiting", "num_request_running","prefix_cache_score","effective_input_tokens"] + tpot_feats = ["kv_cache_percentage","input_token_length","num_request_waiting", + "num_request_running","num_tokens_generated"] else: ttft_feats = ["kv_cache_percentage","input_token_length","num_request_waiting", - "num_request_running","prefix_cache_score","effective_input_tokens","prefill_score_bucket"] - - tpot_feats = ["kv_cache_percentage","input_token_length","num_request_waiting", - "num_request_running","num_tokens_generated"] + "num_request_running","prefix_cache_score","effective_input_tokens","prefill_score_bucket","pod_type_cat"] + tpot_feats = ["kv_cache_percentage","input_token_length","num_request_waiting", + "num_request_running","num_tokens_generated","pod_type_cat"] emit_metrics(ttft_model, self.ttft_coefficients, ttft_feats, "ttft") emit_metrics(tpot_model, self.tpot_coefficients, tpot_feats, "tpot") From 79ed9953e686b7ac4ca6b01d9156a54b078bfc49 Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Tue, 16 Dec 2025 19:34:20 -0500 Subject: [PATCH 10/12] remove pdMode since only 1 predictor now --- .../inferencepool/PD-SLO-CHART-GUIDE.md | 156 ------------------ .../templates/_latency-predictor.tpl | 129 +-------------- .../inferencepool/templates/epp-config.yaml | 27 --- config/charts/inferencepool/values.yaml | 73 +------- 4 files changed, 6 insertions(+), 379 deletions(-) delete mode 100644 config/charts/inferencepool/PD-SLO-CHART-GUIDE.md diff --git a/config/charts/inferencepool/PD-SLO-CHART-GUIDE.md b/config/charts/inferencepool/PD-SLO-CHART-GUIDE.md deleted file mode 100644 index e1f0588ef8..0000000000 --- a/config/charts/inferencepool/PD-SLO-CHART-GUIDE.md +++ /dev/null @@ -1,156 +0,0 @@ -# PD-SLO Chart Configuration Guide - -Configure the inferencepool Helm chart for PD (Prefill-Decode) disaggregated scheduling with SLO-aware optimization. 
- -## Modes - -**Legacy Mode** (default): Single predictor, 1 training + N prediction servers -**PD Mode**: Two predictors (prefill + decode), 4 sidecar containers total - -### PD Mode Architecture -``` -EPP Pod (5 containers) -├─ EPP Container -└─ Sidecars: - ├─ Prefill Training Server (port 8000) - ├─ Prefill Prediction Server (port 8001) - ├─ Decode Training Server (port 8010) - └─ Decode Prediction Server (port 8011) - -Environment Variables (auto-generated): -- PREFILL_TRAINING_URL=http://localhost:8000 -- PREFILL_PREDICTION_URL=http://localhost:8001 -- DECODE_TRAINING_URL=http://localhost:8010 -- DECODE_PREDICTION_URL=http://localhost:8011 -``` - -## Quick Start - -**Minimal Configuration**: -```yaml -inferenceExtension: - latencyPredictor: - enabled: true - pdMode: - enabled: true - predictors: - prefill: - trainingServer: - port: 8000 - resources: - requests: {cpu: "500m", memory: "1Gi"} - predictionServers: - count: 1 - startPort: 8001 - resources: - requests: {cpu: "250m", memory: "512Mi"} - decode: - trainingServer: - port: 8010 # Must differ from prefill! - resources: - requests: {cpu: "500m", memory: "1Gi"} - predictionServers: - count: 1 - startPort: 8011 - resources: - requests: {cpu: "250m", memory: "512Mi"} -``` - -**Deploy**: -```bash -helm install my-pool ./inferencepool -f values-pd-slo.yaml -n llm-d -``` - -## Configuration Details - -### Required Settings - -| Component | Setting | Value | Notes | -|-----------|---------|-------|-------| -| Prefill Training | `port` | 8000 | Must differ from decode | -| Prefill Prediction | `startPort` | 8001 | | -| Decode Training | `port` | 8010 | Must differ from prefill | -| Decode Prediction | `startPort` | 8011 | | -| Prediction Count | `count` | 1 | Can increase for production | - -### Health Probes (Required) - -Both training and prediction servers **must** have `livenessProbe` and `readinessProbe` configured with `httpGet.path` and `port`: - -```yaml -prefill: - trainingServer: - livenessProbe: - httpGet: {path: /healthz, port: 8000} - initialDelaySeconds: 30 - readinessProbe: - httpGet: {path: /readyz, port: 8000} - initialDelaySeconds: 45 - predictionServers: - livenessProbe: - httpGet: {path: /healthz} - initialDelaySeconds: 15 - readinessProbe: - httpGet: {path: /readyz} - initialDelaySeconds: 10 -``` - -### Images and Resources - -**Override per predictor** or **use global defaults** from legacy section (see `values.yaml`). - -## Generated Resources - -**ConfigMaps** (4 in PD mode): -- `-latency-predictor-prefill-training` -- `-latency-predictor-prefill-prediction` -- `-latency-predictor-decode-training` -- `-latency-predictor-decode-prediction` - -**Environment Variables** (auto-injected into EPP container): -- `PREFILL_TRAINING_URL`, `PREFILL_PREDICTION_URL` -- `DECODE_TRAINING_URL`, `DECODE_PREDICTION_URL` - -These are consumed by `llm-d-inference-scheduler`'s `PDPredictorSet` for latency prediction. - -## Validation - -```bash -# Check 5 containers (1 EPP + 4 sidecars) -kubectl get pods -n llm-d -kubectl describe pod -n llm-d - -# Verify environment variables -kubectl exec -n llm-d -c epp -- env | grep -E "PREFILL|DECODE" -# Expected: PREFILL_TRAINING_URL=http://localhost:8000, etc. 
- -# Test predictor health -kubectl exec -n llm-d -c training-server-prefill -- curl http://localhost:8000/healthz -kubectl exec -n llm-d -c training-server-decode -- curl http://localhost:8010/healthz -``` - -## Troubleshooting - -| Issue | Symptom | Solution | -|-------|---------|----------| -| Port conflict | `address already in use` | Ensure prefill/decode ports differ (8000 vs 8010) | -| Missing env vars | `PREFILL_TRAINING_URL must be set` | Verify `pdMode.enabled=true`, check `kubectl exec -c epp -- env` | -| ConfigMap missing | `configmap not found` | Check `kubectl get cm -n llm-d \| grep latency-predictor` (should show 4) | -| Pod pending | `Insufficient cpu/memory` | Reduce resource requests (500m/1Gi for training, 250m/512Mi for prediction) | -| Probe failures | Containers restarting | Verify probe paths (`/healthz`, `/readyz`) and ports are configured | - -## Important Notes - -1. **PD Mode**: Set `pdMode.enabled=true` to enable dual-predictor architecture -2. **Ports**: Training servers must use different ports (prefill: 8000, decode: 8010) -3. **Probes**: Both `livenessProbe` and `readinessProbe` with `httpGet.path` and `port` are required -4. **Resources**: Start with 500m/1Gi (training), 250m/512Mi (prediction) for MVP -5. **Prediction Count**: `count: 1` for MVP, increase for production throughput -6. **Backward Compatibility**: Legacy mode (single predictor) still works when `pdMode.enabled=false` -7. **Joint Optimization**: Currently uses fallback (best pod from each profile). Full joint optimization TBD. - -## Reference - -- Chart values: `values.yaml` -- Scheduler guide: `llm-d-inference-scheduler/PD-SLO-GUIDE.md` -- Chart template: `templates/_latency-predictor.tpl` diff --git a/config/charts/inferencepool/templates/_latency-predictor.tpl b/config/charts/inferencepool/templates/_latency-predictor.tpl index e17a4b971b..aadd43256a 100644 --- a/config/charts/inferencepool/templates/_latency-predictor.tpl +++ b/config/charts/inferencepool/templates/_latency-predictor.tpl @@ -1,23 +1,9 @@ {{/* -Latency Predictor Env -Supports both legacy mode (single predictor) and PD mode (multiple predictors) +Latency Predictor Environment Variables +Generates environment variables for training and prediction server URLs */}} {{- define "gateway-api-inference-extension.latencyPredictor.env" -}} {{- if .Values.inferenceExtension.latencyPredictor.enabled }} -{{- if .Values.inferenceExtension.latencyPredictor.pdMode.enabled }} -{{/* PD Mode: Generate environment variables for each predictor type */}} -{{- range $predictorName, $predictorConfig := .Values.inferenceExtension.latencyPredictor.pdMode.predictors }} -- name: {{ $predictorName | upper }}_TRAINING_URL - value: "http://localhost:{{ $predictorConfig.trainingServer.port }}" -- name: {{ $predictorName | upper }}_PREDICTION_URL - value: "{{- $count := int $predictorConfig.predictionServers.count -}} - {{- $startPort := int $predictorConfig.predictionServers.startPort -}} - {{- range $i := until $count -}} - {{- if $i }},{{ end }}http://localhost:{{ add $startPort $i }} - {{- end }}" -{{- end }} -{{- else }} -{{/* Legacy Mode: Single predictor environment variables */}} - name: PREDICTION_SERVER_URL value: "{{- $count := int .Values.inferenceExtension.latencyPredictor.predictionServers.count -}} {{- $startPort := int .Values.inferenceExtension.latencyPredictor.predictionServers.startPort -}} @@ -26,7 +12,6 @@ Supports both legacy mode (single predictor) and PD mode (multiple predictors) {{- end }}" - name: TRAINING_SERVER_URL value: 
"http://localhost:{{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }}" -{{- end }} {{- range $key, $value := .Values.inferenceExtension.latencyPredictor.eppEnv }} - name: {{ $key }} value: {{ $value | quote }} @@ -36,100 +21,10 @@ Supports both legacy mode (single predictor) and PD mode (multiple predictors) {{/* Latency Predictor Sidecar Containers -Supports both legacy mode (single predictor) and PD mode (multiple predictors) +Creates training and prediction server sidecar containers */}} {{- define "gateway-api-inference-extension.latencyPredictor.containers" -}} {{- if .Values.inferenceExtension.latencyPredictor.enabled }} -{{- if .Values.inferenceExtension.latencyPredictor.pdMode.enabled }} -{{/* PD Mode: Create training and prediction servers for each predictor type */}} -{{- range $predictorName, $predictorConfig := .Values.inferenceExtension.latencyPredictor.pdMode.predictors }} -# Training Server for {{ $predictorName }} predictor -- name: training-server-{{ $predictorName }} - image: {{ $predictorConfig.trainingServer.image.hub | default $.Values.inferenceExtension.latencyPredictor.trainingServer.image.hub }}/{{ $predictorConfig.trainingServer.image.name | default $.Values.inferenceExtension.latencyPredictor.trainingServer.image.name }}:{{ $predictorConfig.trainingServer.image.tag | default $.Values.inferenceExtension.latencyPredictor.trainingServer.image.tag }} - imagePullPolicy: {{ $predictorConfig.trainingServer.image.pullPolicy | default $.Values.inferenceExtension.latencyPredictor.trainingServer.image.pullPolicy }} - command: ["uvicorn"] - args: - - "training_server:app" - - "--host" - - "0.0.0.0" - - "--port" - - "{{ $predictorConfig.trainingServer.port }}" - ports: - - containerPort: {{ $predictorConfig.trainingServer.port }} - name: train-{{ $predictorName }} - livenessProbe: - httpGet: - path: {{ $predictorConfig.trainingServer.livenessProbe.httpGet.path | default "/healthz" }} - port: {{ $predictorConfig.trainingServer.port }} - initialDelaySeconds: {{ $predictorConfig.trainingServer.livenessProbe.initialDelaySeconds | default 30 }} - periodSeconds: {{ $predictorConfig.trainingServer.livenessProbe.periodSeconds | default 20 }} - readinessProbe: - httpGet: - path: {{ $predictorConfig.trainingServer.readinessProbe.httpGet.path | default "/readyz" }} - port: {{ $predictorConfig.trainingServer.port }} - initialDelaySeconds: {{ $predictorConfig.trainingServer.readinessProbe.initialDelaySeconds | default 45 }} - periodSeconds: {{ $predictorConfig.trainingServer.readinessProbe.periodSeconds | default 10 }} - resources: - {{- toYaml ($predictorConfig.trainingServer.resources | default $.Values.inferenceExtension.latencyPredictor.trainingServer.resources) | nindent 4 }} - envFrom: - - configMapRef: - name: {{ include "gateway-api-inference-extension.name" $ }}-latency-predictor-{{ $predictorName }}-training - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: SERVER_TYPE - value: "training-{{ $predictorName }}" - volumeMounts: - - name: training-server-{{ $predictorName }}-storage - mountPath: /models -{{- range $i := until (int $predictorConfig.predictionServers.count) }} -# Prediction Server {{ add $i 1 }} for {{ $predictorName }} predictor -- name: prediction-server-{{ $predictorName }}-{{ add $i 1 }} - image: {{ $predictorConfig.predictionServers.image.hub | default $.Values.inferenceExtension.latencyPredictor.predictionServers.image.hub }}/{{ $predictorConfig.predictionServers.image.name | default 
$.Values.inferenceExtension.latencyPredictor.predictionServers.image.name }}:{{ $predictorConfig.predictionServers.image.tag | default $.Values.inferenceExtension.latencyPredictor.predictionServers.image.tag }} - imagePullPolicy: {{ $predictorConfig.predictionServers.image.pullPolicy | default $.Values.inferenceExtension.latencyPredictor.predictionServers.image.pullPolicy }} - command: ["uvicorn"] - args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "{{ add $predictorConfig.predictionServers.startPort $i }}"] - ports: - - containerPort: {{ add $predictorConfig.predictionServers.startPort $i }} - name: pred-{{ $predictorName }}-{{ add $i 1 }} - livenessProbe: - httpGet: - path: {{ $predictorConfig.predictionServers.livenessProbe.httpGet.path | default "/healthz" }} - port: {{ add $predictorConfig.predictionServers.startPort $i }} - initialDelaySeconds: {{ $predictorConfig.predictionServers.livenessProbe.initialDelaySeconds | default 15 }} - periodSeconds: {{ $predictorConfig.predictionServers.livenessProbe.periodSeconds | default 15 }} - readinessProbe: - httpGet: - path: {{ $predictorConfig.predictionServers.readinessProbe.httpGet.path | default "/readyz" }} - port: {{ add $predictorConfig.predictionServers.startPort $i }} - initialDelaySeconds: {{ $predictorConfig.predictionServers.readinessProbe.initialDelaySeconds | default 10 }} - periodSeconds: {{ $predictorConfig.predictionServers.readinessProbe.periodSeconds | default 5 }} - failureThreshold: {{ $predictorConfig.predictionServers.readinessProbe.failureThreshold | default 10 }} - resources: - {{- toYaml ($predictorConfig.predictionServers.resources | default $.Values.inferenceExtension.latencyPredictor.predictionServers.resources) | nindent 4 }} - envFrom: - - configMapRef: - name: {{ include "gateway-api-inference-extension.name" $ }}-latency-predictor-{{ $predictorName }}-prediction - env: - - name: PREDICT_PORT - value: "{{ add $predictorConfig.predictionServers.startPort $i }}" - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: SERVER_TYPE - value: "prediction-{{ $predictorName }}-{{ add $i 1 }}" - - name: TRAINING_SERVER_URL - value: "http://localhost:{{ $predictorConfig.trainingServer.port }}" - volumeMounts: - - name: prediction-server-{{ $predictorName }}-{{ add $i 1 }}-storage - mountPath: /server_models -{{- end }} -{{- end }} -{{- else }} -{{/* Legacy Mode: Single predictor containers */}} # Training Server Sidecar Container - name: training-server image: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.hub }}/{{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.name }}:{{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.tag }} @@ -208,28 +103,13 @@ Supports both legacy mode (single predictor) and PD mode (multiple predictors) {{- end }} {{- end }} {{- end }} -{{- end }} {{/* Latency Predictor Volumes -Supports both legacy mode (single predictor) and PD mode (multiple predictors) +Creates emptyDir volumes for training and prediction server storage */}} {{- define "gateway-api-inference-extension.latencyPredictor.volumes" -}} {{- if .Values.inferenceExtension.latencyPredictor.enabled }} -{{- if .Values.inferenceExtension.latencyPredictor.pdMode.enabled }} -{{/* PD Mode: Create volumes for each predictor type */}} -{{- range $predictorName, $predictorConfig := .Values.inferenceExtension.latencyPredictor.pdMode.predictors }} -- name: training-server-{{ $predictorName }}-storage - emptyDir: - sizeLimit: {{ 
$predictorConfig.trainingServer.volumeSize | default $.Values.inferenceExtension.latencyPredictor.trainingServer.volumeSize }} -{{- range $i := until (int $predictorConfig.predictionServers.count) }} -- name: prediction-server-{{ $predictorName }}-{{ add $i 1 }}-storage - emptyDir: - sizeLimit: {{ $predictorConfig.predictionServers.volumeSize | default $.Values.inferenceExtension.latencyPredictor.predictionServers.volumeSize }} -{{- end }} -{{- end }} -{{- else }} -{{/* Legacy Mode: Single predictor volumes */}} - name: training-server-storage emptyDir: sizeLimit: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.volumeSize }} @@ -240,4 +120,3 @@ Supports both legacy mode (single predictor) and PD mode (multiple predictors) {{- end }} {{- end }} {{- end }} -{{- end }} diff --git a/config/charts/inferencepool/templates/epp-config.yaml b/config/charts/inferencepool/templates/epp-config.yaml index be3807d539..f34d5cf218 100644 --- a/config/charts/inferencepool/templates/epp-config.yaml +++ b/config/charts/inferencepool/templates/epp-config.yaml @@ -75,32 +75,6 @@ data: {{- end }} --- {{- if .Values.inferenceExtension.latencyPredictor.enabled }} -{{- if .Values.inferenceExtension.latencyPredictor.pdMode.enabled }} -{{/* PD Mode: Create ConfigMaps for each predictor type */}} -{{- range $predictorName, $predictorConfig := .Values.inferenceExtension.latencyPredictor.pdMode.predictors }} -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "gateway-api-inference-extension.name" $ }}-latency-predictor-{{ $predictorName }}-training - namespace: {{ $.Release.Namespace }} -data: - {{- range $key, $value := $predictorConfig.trainingServer.config }} - {{ $key }}: {{ $value | quote }} - {{- end }} ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "gateway-api-inference-extension.name" $ }}-latency-predictor-{{ $predictorName }}-prediction - namespace: {{ $.Release.Namespace }} -data: - {{- range $key, $value := $predictorConfig.predictionServers.config }} - {{ $key }}: {{ $value | quote }} - {{- end }} ---- -{{- end }} -{{- else }} -{{/* Legacy Mode: Single predictor ConfigMaps */}} apiVersion: v1 kind: ConfigMap metadata: @@ -121,4 +95,3 @@ data: {{ $key }}: {{ $value | quote }} {{- end }} {{- end }} -{{- end }} diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index 5919c0c00e..e1f847805a 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -75,78 +75,9 @@ inferenceExtension: latencyPredictor: enabled: false - # PD (Prefill-Decode) Disaggregation Mode - # When enabled, deploys separate predictors for prefill and decode pods - # Required for PD-SLO scheduling in llm-d-inference-scheduler - pdMode: - enabled: false # Set to true to enable PD-SLO architecture - predictors: - # Prefill predictor (predicts prefill pod TTFT) - prefill: - trainingServer: - port: 8000 - # Optional: Override global image/resources (falls back to global if not specified) - # image: - # hub: path/to/your/docker/repo - # name: latencypredictor-training-server - # tag: latest - # pullPolicy: Always - # resources: - # requests: - # cpu: "2000m" - # memory: "4Gi" - volumeSize: "20Gi" - config: - LATENCY_RETRAINING_INTERVAL_SEC: "1" - LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" - LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" - LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" - LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" - LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" - LATENCY_MODEL_TYPE: 
"xgboost" - LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" - LATENCY_QUANTILE_ALPHA: "0.9" - predictionServers: - count: 1 # Number of prefill prediction server replicas - startPort: 8001 - volumeSize: "10Gi" - config: - LATENCY_MODEL_TYPE: "xgboost" - PREDICT_HOST: "0.0.0.0" - LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" - LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" - LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" - LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" - - # Decode predictor (predicts decode pod TTFT + TPOT) - decode: - trainingServer: - port: 8010 # Different port from prefill - volumeSize: "20Gi" - config: - LATENCY_RETRAINING_INTERVAL_SEC: "1" - LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" - LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" - LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" - LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" - LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" - LATENCY_MODEL_TYPE: "xgboost" - LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" - LATENCY_QUANTILE_ALPHA: "0.9" - predictionServers: - count: 1 # Number of decode prediction server replicas - startPort: 8011 - volumeSize: "10Gi" - config: - LATENCY_MODEL_TYPE: "xgboost" - PREDICT_HOST: "0.0.0.0" - LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" - LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" - LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" - LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" - - # Legacy Mode: Single Predictor Configuration (used when pdMode.enabled=false) # Training Server Configuration + # Single unified predictor for both monolithic and PD disaggregation modes + # Uses pod_type feature to differentiate prefill vs decode training data trainingServer: image: hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars From 2fe13a2700971dbbc46666bccf77e0bcc608690a Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Tue, 16 Dec 2025 21:12:43 -0500 Subject: [PATCH 11/12] revert configs since llm-d-inference-scheduler and llm-d guide for pd should handle --- .../templates/_latency-predictor.tpl | 16 +++------------- config/charts/inferencepool/values.yaml | 4 +--- latencypredictor/build-deploy.sh | 0 3 files changed, 4 insertions(+), 16 deletions(-) mode change 100755 => 100644 latencypredictor/build-deploy.sh diff --git a/config/charts/inferencepool/templates/_latency-predictor.tpl b/config/charts/inferencepool/templates/_latency-predictor.tpl index aadd43256a..4ac7b7ed2e 100644 --- a/config/charts/inferencepool/templates/_latency-predictor.tpl +++ b/config/charts/inferencepool/templates/_latency-predictor.tpl @@ -1,6 +1,5 @@ {{/* -Latency Predictor Environment Variables -Generates environment variables for training and prediction server URLs +Latency Predictor Env */}} {{- define "gateway-api-inference-extension.latencyPredictor.env" -}} {{- if .Values.inferenceExtension.latencyPredictor.enabled }} @@ -21,7 +20,6 @@ Generates environment variables for training and prediction server URLs {{/* Latency Predictor Sidecar Containers -Creates training and prediction server sidecar containers */}} {{- define "gateway-api-inference-extension.latencyPredictor.containers" -}} {{- if .Values.inferenceExtension.latencyPredictor.enabled }} @@ -29,13 +27,6 @@ Creates training and prediction server sidecar containers - name: training-server image: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.hub }}/{{ 
.Values.inferenceExtension.latencyPredictor.trainingServer.image.name }}:{{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.tag }} imagePullPolicy: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.pullPolicy }} - command: ["uvicorn"] - args: - - "training_server:app" - - "--host" - - "0.0.0.0" - - "--port" - - "{{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }}" ports: - containerPort: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }} name: training-port @@ -106,16 +97,15 @@ Creates training and prediction server sidecar containers {{/* Latency Predictor Volumes -Creates emptyDir volumes for training and prediction server storage */}} {{- define "gateway-api-inference-extension.latencyPredictor.volumes" -}} {{- if .Values.inferenceExtension.latencyPredictor.enabled }} - name: training-server-storage - emptyDir: + emptyDir: sizeLimit: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.volumeSize }} {{- range $i := until (int .Values.inferenceExtension.latencyPredictor.predictionServers.count) }} - name: prediction-server-{{ add $i 1 }}-storage - emptyDir: + emptyDir: sizeLimit: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.volumeSize }} {{- end }} {{- end }} diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index e1f847805a..aba6bbfdae 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -74,10 +74,8 @@ inferenceExtension: # Latency Predictor Configuration latencyPredictor: enabled: false - + # Training Server Configuration - # Single unified predictor for both monolithic and PD disaggregation modes - # Uses pod_type feature to differentiate prefill vs decode training data trainingServer: image: hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars diff --git a/latencypredictor/build-deploy.sh b/latencypredictor/build-deploy.sh old mode 100755 new mode 100644 From 7ec9c0847476f2799ff72d09d4c1cc213f1422ac Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Tue, 16 Dec 2025 21:28:08 -0500 Subject: [PATCH 12/12] remove spurious whitespace changes --- latencypredictor/prediction_server.py | 3 --- latencypredictor/training_server.py | 7 ------- 2 files changed, 10 deletions(-) diff --git a/latencypredictor/prediction_server.py b/latencypredictor/prediction_server.py index 261d2e0687..5b8ae0fc64 100644 --- a/latencypredictor/prediction_server.py +++ b/latencypredictor/prediction_server.py @@ -219,11 +219,9 @@ def is_ready(self) -> bool: def _prepare_features_with_interaction(self, df: pd.DataFrame, model_type: str) -> pd.DataFrame: """ Prepare features with interaction terms to match training server. 
- Args: df: DataFrame with raw features model_type: 'ttft' or 'tpot' - Returns: DataFrame with engineered features including interactions """ @@ -277,7 +275,6 @@ def _prepare_features_with_interaction(self, df: pd.DataFrame, model_type: str) 'num_tokens_generated', 'pod_type_cat' ] - return df[feature_cols] def load_models(self) -> bool: diff --git a/latencypredictor/training_server.py b/latencypredictor/training_server.py index ba4ea8e644..d2e8cb1917 100644 --- a/latencypredictor/training_server.py +++ b/latencypredictor/training_server.py @@ -336,11 +336,9 @@ def _store_descaled_coefficients(self, model, scaler, feature_names, model_name) def _prepare_features_with_interaction(self, df: pd.DataFrame, model_type: str) -> pd.DataFrame: """ Prepare features with interaction terms for better model learning. - Args: df: DataFrame with raw features model_type: 'ttft' or 'tpot' - Returns: DataFrame with engineered features including interactions """ @@ -382,9 +380,7 @@ def _prepare_features_with_interaction(self, df: pd.DataFrame, model_type: str) 'prefill_score_bucket', 'pod_type_cat' ] - return df[feature_cols] - else: # tpot # TPOT doesn't use prefix_cache_score, so no interaction needed feature_cols = [ @@ -395,10 +391,8 @@ def _prepare_features_with_interaction(self, df: pd.DataFrame, model_type: str) 'num_tokens_generated', 'pod_type_cat' ] - return df[feature_cols] - def shutdown(self): """Signal the training thread to exit and join it.""" self._shutdown_event.set() @@ -848,7 +842,6 @@ def predict(self, features: dict) -> Tuple[float, float, float, float]: # Updated TTFT features to include prefix_cache_score and pod_type ttft_cols = ['kv_cache_percentage','input_token_length','num_request_waiting','num_request_running','prefix_cache_score'] tpot_cols = ['kv_cache_percentage','input_token_length','num_request_waiting','num_request_running','num_tokens_generated'] - # Create DataFrames for predictions df_ttft = pd.DataFrame([{col: features[col] for col in ttft_cols}]) # Add pod_type if present (otherwise _prepare_features_with_interaction will default to '')
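The comment above notes that `pod_type` is added only if present, with `_prepare_features_with_interaction` defaulting to `''`; the full body of that helper is not shown in this series. The sketch below is an illustrative stand-in (not the servers' implementation) for normalising a request-level `pod_type` into the `pod_type_cat` column the TTFT/TPOT feature lists expect; the `add_pod_type_cat` helper and the sample request are assumptions.

```python
import pandas as pd

# '' denotes a monolithic pod, per the pod_type field description.
VALID_POD_TYPES = ("", "prefill", "decode")

def add_pod_type_cat(df: pd.DataFrame, features: dict) -> pd.DataFrame:
    """Carry the request's pod_type into the frame, defaulting to '' when it is
    missing or unrecognised, and expose it as the categorical pod_type_cat column."""
    pod_type = features.get("pod_type") or ""
    if pod_type not in VALID_POD_TYPES:
        pod_type = ""  # treat unknown values as monolithic rather than failing
    out = df.copy()
    out["pod_type_cat"] = pd.Categorical([pod_type] * len(out), categories=list(VALID_POD_TYPES))
    return out

ttft_cols = ["kv_cache_percentage", "input_token_length", "num_request_waiting",
             "num_request_running", "prefix_cache_score"]
request = {"kv_cache_percentage": 0.3, "input_token_length": 256, "num_request_waiting": 2,
           "num_request_running": 1, "prefix_cache_score": 0.8, "pod_type": "prefill"}
df_ttft = pd.DataFrame([{c: request[c] for c in ttft_cols}])
df_ttft = add_pod_type_cat(df_ttft, request)
# pod_type_cat stays a pandas categorical for the tree models; Bayesian Ridge
# one-hot encodes it before scaling, as the earlier patches in this series do.
print(df_ttft.dtypes)
```

Whether unrecognised `pod_type` values should be coerced to `''` or rejected is a policy choice; the servers' own validation remains authoritative.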