llm-d-incubation
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 1 deletion b/‎.gitignore‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎backend/src/api/routes.py‎
Lines changed: 34 additions & 9 deletions b/‎backend/src/api/routes.py‎
Lines changed: 34 additions & 9 deletions
diff --git a/‎backend/src/context_intent/schema.py‎
Lines changed: 11 additions & 6 deletions b/‎backend/src/context_intent/schema.py‎
Lines changed: 11 additions & 6 deletions
diff --git a/‎backend/src/knowledge_base/benchmarks.py‎
Lines changed: 160 additions & 12 deletions b/‎backend/src/knowledge_base/benchmarks.py‎
Lines changed: 160 additions & 12 deletions
@@ -67,4 +67,5 @@ logs/
 
 # Data
 *.sql
-!scripts/schema.sql
+!scripts/schema.sql
+data/benchmarks_redhat_performance.json
@@ -466,16 +466,34 @@ class RankedRecommendationFromSpecRequest(BaseModel):
     output_tokens: int
     expected_qps: float
 
-    # SLO target fields
-    ttft_p95_target_ms: int
-    itl_p95_target_ms: int
-    e2e_p95_target_ms: int
+    # SLO target fields (generic - works with any percentile)
+    ttft_target_ms: int | None = None
+    itl_target_ms: int | None = None
+    e2e_target_ms: int | None = None
+    percentile: str = "p95"  # "mean", "p90", "p95", "p99"
+    
+    # Legacy p95 fields (for backwards compatibility)
+    ttft_p95_target_ms: int | None = None
+    itl_p95_target_ms: int | None = None
+    e2e_p95_target_ms: int | None = None
 
     # Ranking options
     min_accuracy: int | None = None
     max_cost: float | None = None
     include_near_miss: bool = True
     weights: BalancedWeights | None = None
+    
+    def get_ttft_target(self) -> int:
+        """Get TTFT target, preferring new field over legacy."""
+        return self.ttft_target_ms if self.ttft_target_ms is not None else (self.ttft_p95_target_ms or 500)
+    
+    def get_itl_target(self) -> int:
+        """Get ITL target, preferring new field over legacy."""
+        return self.itl_target_ms if self.itl_target_ms is not None else (self.itl_p95_target_ms or 50)
+    
+    def get_e2e_target(self) -> int:
+        """Get E2E target, preferring new field over legacy."""
+        return self.e2e_target_ms if self.e2e_target_ms is not None else (self.e2e_p95_target_ms or 5000)
 
 
 @app.post("/api/ranked-recommend")
@@ -566,13 +584,19 @@ async def ranked_recommend_from_spec(request: RankedRecommendationFromSpecReques
         RankedRecommendationsResponse with 5 ranked lists
     """
     try:
+        # Get SLO targets using helper methods (supports both new and legacy fields)
+        ttft_target = request.get_ttft_target()
+        itl_target = request.get_itl_target()
+        e2e_target = request.get_e2e_target()
+        percentile = request.percentile
+        
         logger.info(
             f"Received ranked recommendation from spec: use_case={request.use_case}, "
             f"user_count={request.user_count}, qps={request.expected_qps}"
         )
         logger.info(
-            f"  SLO targets: TTFT={request.ttft_p95_target_ms}ms, "
-            f"ITL={request.itl_p95_target_ms}ms, E2E={request.e2e_p95_target_ms}ms"
+            f"  SLO targets ({percentile}): TTFT={ttft_target}ms, "
+            f"ITL={itl_target}ms, E2E={e2e_target}ms"
         )
         logger.info(
             f"  Token config: {request.prompt_tokens} -> {request.output_tokens}"
@@ -599,9 +623,10 @@ async def ranked_recommend_from_spec(request: RankedRecommendationFromSpecReques
                 "expected_qps": request.expected_qps,
             },
             "slo_targets": {
-                "ttft_p95_target_ms": request.ttft_p95_target_ms,
-                "itl_p95_target_ms": request.itl_p95_target_ms,
-                "e2e_p95_target_ms": request.e2e_p95_target_ms,
+                "ttft_p95_target_ms": ttft_target,
+                "itl_p95_target_ms": itl_target,
+                "e2e_p95_target_ms": e2e_target,
+                "percentile": percentile,
             },
         }
 
 
@@ -1,6 +1,6 @@
 """Data schemas for deployment intent and specifications."""
 
-from typing import Literal
+from typing import Literal, Optional
 
 from pydantic import BaseModel, Field
 
@@ -14,11 +14,12 @@ class TrafficProfile(BaseModel):
 
 
 class SLOTargets(BaseModel):
-    """Service Level Objective targets for the deployment (p95 percentiles)."""
+    """Service Level Objective targets for the deployment."""
 
-    ttft_p95_target_ms: int = Field(..., description="Time to First Token p95 target (ms)")
-    itl_p95_target_ms: int = Field(..., description="Inter-Token Latency p95 target (ms/token)")
-    e2e_p95_target_ms: int = Field(..., description="End-to-end latency p95 target (ms)")
+    ttft_p95_target_ms: int = Field(..., description="Time to First Token target (ms)")
+    itl_p95_target_ms: int = Field(..., description="Inter-Token Latency target (ms/token)")
+    e2e_p95_target_ms: int = Field(..., description="End-to-end latency target (ms)")
+    percentile: str = Field(default="p95", description="Percentile for SLO comparison (mean, p90, p95, p99)")
 
 
 class GPUConfig(BaseModel):
@@ -51,7 +52,7 @@ class DeploymentIntent(BaseModel):
         "code_completion",
         "code_generation_detailed",
         "translation",
-        "content_creation",
+        "content_generation",
         "summarization_short",
         "document_analysis_rag",
         "long_document_summarization",
@@ -109,6 +110,9 @@ class DeploymentRecommendation(BaseModel):
     predicted_itl_p95_ms: int | None = None
     predicted_e2e_p95_ms: int | None = None
     predicted_throughput_qps: float | None = None
+    
+    # All percentile metrics from benchmark (for UI to display based on user selection)
+    benchmark_metrics: Optional[dict] = Field(default=None, description="All percentile metrics from benchmark")
 
     # Cost estimation (None when no viable config found)
     cost_per_hour_usd: float | None = None
@@ -148,6 +152,7 @@ def to_alternative_dict(self) -> dict:
             "cost_per_month_usd": self.cost_per_month_usd,
             "reasoning": self.reasoning,
             "scores": self.scores.model_dump() if self.scores else None,
+            "benchmark_metrics": self.benchmark_metrics,
         }
 
 
 
@@ -19,8 +19,13 @@
 
 These methods are kept for potential Phase 2 API endpoints, debugging, interactive
 testing, or future UI features that may need to display available options.
+
+ESTIMATED BENCHMARKS:
+- Estimated benchmark data is loaded from benchmarks_redhat_performance.json
+- These are marked with estimated=True and included in SLO queries
 """
 
+import json
 import logging
 import os
 from typing import Optional
@@ -66,9 +71,18 @@ def __init__(self, data: dict):
         self.e2e_p95 = data["e2e_p95"]
         self.e2e_p99 = data["e2e_p99"]
 
-        # Throughput
+        # TPS (tokens per second) metrics - all percentiles
+        self.tps_mean = data.get("tps_mean")
+        self.tps_p90 = data.get("tps_p90")
+        self.tps_p95 = data.get("tps_p95")
+        self.tps_p99 = data.get("tps_p99")
+
+        # Throughput (legacy fields kept for backwards compatibility)
         self.tokens_per_second = data["tokens_per_second"]
         self.requests_per_second = data["requests_per_second"]
+        
+        # Estimated flag (True for interpolated benchmarks)
+        self.estimated = data.get("estimated", False)
 
     def to_dict(self) -> dict:
         """Convert to dictionary."""
@@ -94,13 +108,17 @@ def to_dict(self) -> dict:
             "e2e_p90": self.e2e_p90,
             "e2e_p95": self.e2e_p95,
             "e2e_p99": self.e2e_p99,
+            "tps_mean": self.tps_mean,
+            "tps_p90": self.tps_p90,
+            "tps_p95": self.tps_p95,
+            "tps_p99": self.tps_p99,
             "tokens_per_second": self.tokens_per_second,
             "requests_per_second": self.requests_per_second,
         }
 
 
 class BenchmarkRepository:
-    """Repository for querying model benchmark data from PostgreSQL."""
+    """Repository for querying model benchmark data from PostgreSQL + estimated JSON."""
 
     def __init__(self, database_url: Optional[str] = None):
         """
@@ -114,6 +132,65 @@ def __init__(self, database_url: Optional[str] = None):
             "postgresql://postgres:compass@localhost:5432/compass"
         )
         self._test_connection()
+        
+        # Load estimated benchmarks from JSON
+        self._estimated_benchmarks = self._load_estimated_benchmarks()
+        logger.info(f"Loaded {len(self._estimated_benchmarks)} estimated benchmark configs")
+    
+    def _load_estimated_benchmarks(self) -> list[dict]:
+        """Load estimated benchmark data from JSON file."""
+        json_path = os.path.join(
+            os.path.dirname(__file__), "..", "..", "..", "data", 
+            "benchmarks_redhat_performance.json"
+        )
+        
+        try:
+            with open(json_path, 'r') as f:
+                data = json.load(f)
+            
+            # Filter only estimated benchmarks and convert to dict format
+            estimated = []
+            for b in data.get('benchmarks', []):
+                if b.get('estimated'):
+                    # Map JSON keys to database column names
+                    # Support both old format (hardware/hardware_count) and new format (hardware_type/gpu_count)
+                    hw = b.get('hardware') or b.get('hardware_type') or b.get('gpu_type') or ''
+                    hw_count = b.get('hardware_count') or b.get('gpu_count') or 1
+                    estimated.append({
+                        'model_hf_repo': b.get('model_id', b.get('model_name', '')),
+                        'hardware': hw,
+                        'hardware_count': hw_count,
+                        'framework': b.get('framework', 'vllm'),
+                        'framework_version': b.get('framework_version', '0.6.2'),
+                        'prompt_tokens': b.get('prompt_tokens', 512),
+                        'output_tokens': b.get('output_tokens', 256),
+                        'mean_input_tokens': b.get('prompt_tokens', 512),
+                        'mean_output_tokens': b.get('output_tokens', 256),
+                        'ttft_mean': b.get('ttft_mean', 0),
+                        'ttft_p90': b.get('ttft_p90', 0),
+                        'ttft_p95': b.get('ttft_p95', 0),
+                        'ttft_p99': b.get('ttft_p99', 0),
+                        'itl_mean': b.get('itl_mean', 0),
+                        'itl_p90': b.get('itl_p90', 0),
+                        'itl_p95': b.get('itl_p95', 0),
+                        'itl_p99': b.get('itl_p99', 0),
+                        'e2e_mean': b.get('e2e_mean', 0),
+                        'e2e_p90': b.get('e2e_p90', 0),
+                        'e2e_p95': b.get('e2e_p95', 0),
+                        'e2e_p99': b.get('e2e_p99', 0),
+                        # Support both field names (tps_mean or tokens_per_second_mean)
+                        'tps_mean': b.get('tps_mean') or b.get('tokens_per_second_mean', 0),
+                        'tps_p90': b.get('tps_p90') or b.get('tokens_per_second_p90', 0),
+                        'tps_p95': b.get('tps_p95') or b.get('tokens_per_second_p95', 0),
+                        'tps_p99': b.get('tps_p99') or b.get('tokens_per_second_p99', 0),
+                        'tokens_per_second': b.get('tps_mean') or b.get('tokens_per_second_mean') or b.get('tokens_per_second', 0),
+                        'requests_per_second': b.get('requests_per_second', 1.0),
+                        'estimated': True,
+                    })
+            return estimated
+        except Exception as e:
+            logger.warning(f"Could not load estimated benchmarks: {e}")
+            return []
 
     def _test_connection(self):
         """Test database connection on initialization."""
@@ -285,7 +362,8 @@ def find_configurations_meeting_slo(
         ttft_p95_max_ms: int,
         itl_p95_max_ms: int,
         e2e_p95_max_ms: int,
-        min_qps: float = 0
+        min_qps: float = 0,
+        percentile: str = "p95"
     ) -> list[BenchmarkData]:
         """
         Find all configurations that meet SLO requirements for a traffic profile.
@@ -299,30 +377,45 @@ def find_configurations_meeting_slo(
         Args:
             prompt_tokens: Target prompt length
             output_tokens: Target output length
-            ttft_p95_max_ms: Maximum acceptable TTFT p95 (ms)
-            itl_p95_max_ms: Maximum acceptable ITL p95 (ms/token)
-            e2e_p95_max_ms: Maximum acceptable E2E p95 (ms)
+            ttft_p95_max_ms: Maximum acceptable TTFT (ms) - parameter name kept for backwards compat
+            itl_p95_max_ms: Maximum acceptable ITL (ms/token) - parameter name kept for backwards compat
+            e2e_p95_max_ms: Maximum acceptable E2E (ms) - parameter name kept for backwards compat
             min_qps: Minimum required QPS
+            percentile: Which percentile column to use (mean, p90, p95, p99)
 
         Returns:
             List of benchmarks meeting all criteria (one per system configuration)
         """
+        # Map percentile to column suffix
+        valid_percentiles = {"mean", "p90", "p95", "p99"}
+        if percentile not in valid_percentiles:
+            logger.warning(f"Invalid percentile '{percentile}', defaulting to p95")
+            percentile = "p95"
+        
+        # Build column names based on percentile
+        ttft_col = f"ttft_{percentile}"
+        itl_col = f"itl_{percentile}"
+        e2e_col = f"e2e_{percentile}"
+        
+        logger.info(f"Querying benchmarks with percentile={percentile} (columns: {ttft_col}, {itl_col}, {e2e_col})")
+        
         # Use window function to rank benchmarks by requests_per_second within each
         # system configuration, then select only the highest QPS that meets SLO.
         # When multiple benchmarks exist at the same QPS, prefer the one with lowest E2E latency.
-        query = """
+        # NOTE: Using string formatting for column names is safe here since we validate percentile above
+        query = f"""
             WITH ranked_configs AS (
                 SELECT *,
                        ROW_NUMBER() OVER (
                            PARTITION BY model_hf_repo, hardware, hardware_count
-                           ORDER BY requests_per_second DESC, e2e_p95 ASC
+                           ORDER BY requests_per_second DESC, {e2e_col} ASC
                        ) as rn
                 FROM exported_summaries
                 WHERE prompt_tokens = %s
                   AND output_tokens = %s
-                  AND ttft_p95 <= %s
-                  AND itl_p95 <= %s
-                  AND e2e_p95 <= %s
+                  AND {ttft_col} <= %s
+                  AND {itl_col} <= %s
+                  AND {e2e_col} <= %s
                   AND requests_per_second >= %s
             )
             SELECT
@@ -349,9 +442,64 @@ def find_configurations_meeting_slo(
             rows = cursor.fetchall()
             cursor.close()
 
-            return [BenchmarkData(dict(row)) for row in rows]
+            results = [BenchmarkData(dict(row)) for row in rows]
         finally:
             conn.close()
+        
+        # Also include estimated benchmarks that meet SLO criteria
+        estimated_results = self._get_estimated_benchmarks_meeting_slo(
+            prompt_tokens=prompt_tokens,
+            output_tokens=output_tokens,
+            ttft_max_ms=ttft_p95_max_ms,
+            itl_max_ms=itl_p95_max_ms,
+            e2e_max_ms=e2e_p95_max_ms,
+            min_qps=min_qps,
+            percentile=percentile,
+        )
+        
+        logger.info(f"Found {len(results)} DB benchmarks + {len(estimated_results)} estimated benchmarks")
+        return results + estimated_results
+    
+    def _get_estimated_benchmarks_meeting_slo(
+        self,
+        prompt_tokens: int,
+        output_tokens: int,
+        ttft_max_ms: float,
+        itl_max_ms: float,
+        e2e_max_ms: float,
+        min_qps: float,
+        percentile: str = "p95",
+    ) -> list[BenchmarkData]:
+        """Filter estimated benchmarks by SLO criteria."""
+        ttft_col = f"ttft_{percentile}"
+        itl_col = f"itl_{percentile}"
+        e2e_col = f"e2e_{percentile}"
+        
+        # Group by model+hardware to get best config per system
+        best_configs = {}
+        
+        for bench in self._estimated_benchmarks:
+            # Filter by token config
+            if bench.get('prompt_tokens') != prompt_tokens or bench.get('output_tokens') != output_tokens:
+                continue
+            
+            # Check SLO criteria
+            ttft_val = bench.get(ttft_col, float('inf'))
+            itl_val = bench.get(itl_col, float('inf'))
+            e2e_val = bench.get(e2e_col, float('inf'))
+            rps = bench.get('requests_per_second', 0)
+            
+            if ttft_val > ttft_max_ms or itl_val > itl_max_ms or e2e_val > e2e_max_ms or rps < min_qps:
+                continue
+            
+            # Key by model+hardware+count
+            key = (bench['model_hf_repo'], bench['hardware'], bench['hardware_count'])
+            
+            # Keep best RPS config per system
+            if key not in best_configs or rps > best_configs[key].get('requests_per_second', 0):
+                best_configs[key] = bench
+        
+        return [BenchmarkData(bench) for bench in best_configs.values()]
 
     def get_available_models(self) -> list[str]:
         """Get list of all available models in the database."""