1919
These methods are kept for potential Phase 2 API endpoints, debugging, interactive
testing, or future UI features that may need to display available options.

ESTIMATED BENCHMARKS:
- Estimated benchmark data is loaded from benchmarks_redhat_performance.json
- These are marked with estimated=True and included in SLO queries
2226"""
2327
28+ import json
2429import logging
2530import os
2631from typing import Optional
@@ -66,9 +71,18 @@ def __init__(self, data: dict):
6671 self .e2e_p95 = data ["e2e_p95" ]
6772 self .e2e_p99 = data ["e2e_p99" ]
6873
69- # Throughput
74+ # TPS (tokens per second) metrics - all percentiles
75+ self .tps_mean = data .get ("tps_mean" )
76+ self .tps_p90 = data .get ("tps_p90" )
77+ self .tps_p95 = data .get ("tps_p95" )
78+ self .tps_p99 = data .get ("tps_p99" )
79+
80+ # Throughput (legacy fields kept for backwards compatibility)
7081 self .tokens_per_second = data ["tokens_per_second" ]
7182 self .requests_per_second = data ["requests_per_second" ]
83+
84+ # Estimated flag (True for interpolated benchmarks)
85+ self .estimated = data .get ("estimated" , False )
7286
7387 def to_dict (self ) -> dict :
7488 """Convert to dictionary."""
@@ -94,13 +108,17 @@ def to_dict(self) -> dict:
94108 "e2e_p90" : self .e2e_p90 ,
95109 "e2e_p95" : self .e2e_p95 ,
96110 "e2e_p99" : self .e2e_p99 ,
111+ "tps_mean" : self .tps_mean ,
112+ "tps_p90" : self .tps_p90 ,
113+ "tps_p95" : self .tps_p95 ,
114+ "tps_p99" : self .tps_p99 ,
97115 "tokens_per_second" : self .tokens_per_second ,
98116 "requests_per_second" : self .requests_per_second ,
99117 }
100118
101119
102120class BenchmarkRepository :
103- """Repository for querying model benchmark data from PostgreSQL."""
121+ """Repository for querying model benchmark data from PostgreSQL + estimated JSON ."""
104122
105123 def __init__ (self , database_url : Optional [str ] = None ):
106124 """
@@ -114,6 +132,65 @@ def __init__(self, database_url: Optional[str] = None):
114132 "postgresql://postgres:compass@localhost:5432/compass"
115133 )
116134 self ._test_connection ()
135+
136+ # Load estimated benchmarks from JSON
137+ self ._estimated_benchmarks = self ._load_estimated_benchmarks ()
138+ logger .info (f"Loaded { len (self ._estimated_benchmarks )} estimated benchmark configs" )
139+
def _load_estimated_benchmarks(self) -> list[dict]:
    """Load estimated benchmark rows from the bundled JSON file.

    Reads ``benchmarks_redhat_performance.json`` from the ``data`` directory
    three levels above this module, keeps only entries whose ``estimated``
    value is truthy, and maps each one onto the key names used for database
    rows.

    Loading is best-effort: if the file cannot be read or parsed, or its
    structure is not ``{"benchmarks": [...]}``, a warning is logged and an
    empty list is returned. Individual entries that are not JSON objects are
    skipped instead of discarding every other row.

    Returns:
        A list of row dictionaries, each carrying ``estimated=True``.
    """
    json_path = os.path.join(
        os.path.dirname(__file__), "..", "..", "..", "data",
        "benchmarks_redhat_performance.json"
    )

    # Keep the try body down to the I/O and parsing that can actually fail.
    # JSON is UTF-8 by specification, so do not rely on the locale encoding.
    try:
        with open(json_path, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.warning(f"Could not load estimated benchmarks: {e}")
        return []

    if not isinstance(data, dict):
        logger.warning(
            "Could not load estimated benchmarks: top-level JSON value is not an object"
        )
        return []

    entries = data.get('benchmarks', [])
    if not isinstance(entries, list):
        logger.warning(
            "Could not load estimated benchmarks: 'benchmarks' is not a list"
        )
        return []

    estimated = []
    for b in entries:
        # Ignore malformed entries and anything not flagged as estimated.
        if not isinstance(b, dict) or not b.get('estimated'):
            continue

        # Map JSON keys to database column names.
        # Support both old format (hardware/hardware_count) and new format
        # (hardware_type/gpu_count).
        hw = b.get('hardware') or b.get('hardware_type') or b.get('gpu_type') or ''
        hw_count = b.get('hardware_count') or b.get('gpu_count') or 1

        # NOTE(review): absent latency metrics default to 0 below, and 0
        # satisfies any "<= max" SLO comparison -- confirm this is intended.
        estimated.append({
            'model_hf_repo': b.get('model_id', b.get('model_name', '')),
            'hardware': hw,
            'hardware_count': hw_count,
            'framework': b.get('framework', 'vllm'),
            'framework_version': b.get('framework_version', '0.6.2'),
            'prompt_tokens': b.get('prompt_tokens', 512),
            'output_tokens': b.get('output_tokens', 256),
            'mean_input_tokens': b.get('prompt_tokens', 512),
            'mean_output_tokens': b.get('output_tokens', 256),
            'ttft_mean': b.get('ttft_mean', 0),
            'ttft_p90': b.get('ttft_p90', 0),
            'ttft_p95': b.get('ttft_p95', 0),
            'ttft_p99': b.get('ttft_p99', 0),
            'itl_mean': b.get('itl_mean', 0),
            'itl_p90': b.get('itl_p90', 0),
            'itl_p95': b.get('itl_p95', 0),
            'itl_p99': b.get('itl_p99', 0),
            'e2e_mean': b.get('e2e_mean', 0),
            'e2e_p90': b.get('e2e_p90', 0),
            'e2e_p95': b.get('e2e_p95', 0),
            'e2e_p99': b.get('e2e_p99', 0),
            # Support both field names (tps_mean or tokens_per_second_mean).
            'tps_mean': b.get('tps_mean') or b.get('tokens_per_second_mean', 0),
            'tps_p90': b.get('tps_p90') or b.get('tokens_per_second_p90', 0),
            'tps_p95': b.get('tps_p95') or b.get('tokens_per_second_p95', 0),
            'tps_p99': b.get('tps_p99') or b.get('tokens_per_second_p99', 0),
            'tokens_per_second': (
                b.get('tps_mean')
                or b.get('tokens_per_second_mean')
                or b.get('tokens_per_second', 0)
            ),
            'requests_per_second': b.get('requests_per_second', 1.0),
            'estimated': True,
        })
    return estimated
117194
118195 def _test_connection (self ):
119196 """Test database connection on initialization."""
@@ -285,7 +362,8 @@ def find_configurations_meeting_slo(
285362 ttft_p95_max_ms : int ,
286363 itl_p95_max_ms : int ,
287364 e2e_p95_max_ms : int ,
288- min_qps : float = 0
365+ min_qps : float = 0 ,
366+ percentile : str = "p95"
289367 ) -> list [BenchmarkData ]:
290368 """
291369 Find all configurations that meet SLO requirements for a traffic profile.
@@ -299,30 +377,45 @@ def find_configurations_meeting_slo(
299377 Args:
300378 prompt_tokens: Target prompt length
301379 output_tokens: Target output length
302- ttft_p95_max_ms: Maximum acceptable TTFT p95 (ms)
303- itl_p95_max_ms: Maximum acceptable ITL p95 (ms/token)
304- e2e_p95_max_ms: Maximum acceptable E2E p95 (ms)
380+ ttft_p95_max_ms: Maximum acceptable TTFT (ms) - parameter name kept for backwards compat
381+ itl_p95_max_ms: Maximum acceptable ITL (ms/token) - parameter name kept for backwards compat
382+ e2e_p95_max_ms: Maximum acceptable E2E (ms) - parameter name kept for backwards compat
305383 min_qps: Minimum required QPS
384+ percentile: Which percentile column to use (mean, p90, p95, p99)
306385
307386 Returns:
308387 List of benchmarks meeting all criteria (one per system configuration)
309388 """
389+ # Map percentile to column suffix
390+ valid_percentiles = {"mean" , "p90" , "p95" , "p99" }
391+ if percentile not in valid_percentiles :
392+ logger .warning (f"Invalid percentile '{ percentile } ', defaulting to p95" )
393+ percentile = "p95"
394+
395+ # Build column names based on percentile
396+ ttft_col = f"ttft_{ percentile } "
397+ itl_col = f"itl_{ percentile } "
398+ e2e_col = f"e2e_{ percentile } "
399+
400+ logger .info (f"Querying benchmarks with percentile={ percentile } (columns: { ttft_col } , { itl_col } , { e2e_col } )" )
401+
310402 # Use window function to rank benchmarks by requests_per_second within each
311403 # system configuration, then select only the highest QPS that meets SLO.
312404 # When multiple benchmarks exist at the same QPS, prefer the one with lowest E2E latency.
313- query = """
405+ # NOTE: Using string formatting for column names is safe here since we validate percentile above
406+ query = f"""
314407 WITH ranked_configs AS (
315408 SELECT *,
316409 ROW_NUMBER() OVER (
317410 PARTITION BY model_hf_repo, hardware, hardware_count
318- ORDER BY requests_per_second DESC, e2e_p95 ASC
411+ ORDER BY requests_per_second DESC, { e2e_col } ASC
319412 ) as rn
320413 FROM exported_summaries
321414 WHERE prompt_tokens = %s
322415 AND output_tokens = %s
323- AND ttft_p95 <= %s
324- AND itl_p95 <= %s
325- AND e2e_p95 <= %s
416+ AND { ttft_col } <= %s
417+ AND { itl_col } <= %s
418+ AND { e2e_col } <= %s
326419 AND requests_per_second >= %s
327420 )
328421 SELECT
@@ -349,9 +442,64 @@ def find_configurations_meeting_slo(
349442 rows = cursor .fetchall ()
350443 cursor .close ()
351444
352- return [BenchmarkData (dict (row )) for row in rows ]
445+ results = [BenchmarkData (dict (row )) for row in rows ]
353446 finally :
354447 conn .close ()
448+
449+ # Also include estimated benchmarks that meet SLO criteria
450+ estimated_results = self ._get_estimated_benchmarks_meeting_slo (
451+ prompt_tokens = prompt_tokens ,
452+ output_tokens = output_tokens ,
453+ ttft_max_ms = ttft_p95_max_ms ,
454+ itl_max_ms = itl_p95_max_ms ,
455+ e2e_max_ms = e2e_p95_max_ms ,
456+ min_qps = min_qps ,
457+ percentile = percentile ,
458+ )
459+
460+ logger .info (f"Found { len (results )} DB benchmarks + { len (estimated_results )} estimated benchmarks" )
461+ return results + estimated_results
462+
463+ def _get_estimated_benchmarks_meeting_slo (
464+ self ,
465+ prompt_tokens : int ,
466+ output_tokens : int ,
467+ ttft_max_ms : float ,
468+ itl_max_ms : float ,
469+ e2e_max_ms : float ,
470+ min_qps : float ,
471+ percentile : str = "p95" ,
472+ ) -> list [BenchmarkData ]:
473+ """Filter estimated benchmarks by SLO criteria."""
474+ ttft_col = f"ttft_{ percentile } "
475+ itl_col = f"itl_{ percentile } "
476+ e2e_col = f"e2e_{ percentile } "
477+
478+ # Group by model+hardware to get best config per system
479+ best_configs = {}
480+
481+ for bench in self ._estimated_benchmarks :
482+ # Filter by token config
483+ if bench .get ('prompt_tokens' ) != prompt_tokens or bench .get ('output_tokens' ) != output_tokens :
484+ continue
485+
486+ # Check SLO criteria
487+ ttft_val = bench .get (ttft_col , float ('inf' ))
488+ itl_val = bench .get (itl_col , float ('inf' ))
489+ e2e_val = bench .get (e2e_col , float ('inf' ))
490+ rps = bench .get ('requests_per_second' , 0 )
491+
492+ if ttft_val > ttft_max_ms or itl_val > itl_max_ms or e2e_val > e2e_max_ms or rps < min_qps :
493+ continue
494+
495+ # Key by model+hardware+count
496+ key = (bench ['model_hf_repo' ], bench ['hardware' ], bench ['hardware_count' ])
497+
498+ # Keep best RPS config per system
499+ if key not in best_configs or rps > best_configs [key ].get ('requests_per_second' , 0 ):
500+ best_configs [key ] = bench
501+
502+ return [BenchmarkData (bench ) for bench in best_configs .values ()]
355503
356504 def get_available_models (self ) -> list [str ]:
357505 """Get list of all available models in the database."""
0 commit comments