Skip to content

Commit 9e4d994

Browse files
authored
Merge pull request #48 from amito/feature/recommendations-1
Recommendations and UI work
2 parents fdf5c24 + 48a36d5 commit 9e4d994

File tree

51 files changed

+27170
-7609
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+27170
-7609
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -67,4 +67,5 @@ logs/
6767

6868
# Data
6969
*.sql
70-
!scripts/schema.sql
70+
!scripts/schema.sql
71+
data/benchmarks_redhat_performance.json

backend/src/api/routes.py

Lines changed: 34 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -466,16 +466,34 @@ class RankedRecommendationFromSpecRequest(BaseModel):
466466
output_tokens: int
467467
expected_qps: float
468468

469-
# SLO target fields
470-
ttft_p95_target_ms: int
471-
itl_p95_target_ms: int
472-
e2e_p95_target_ms: int
469+
# SLO target fields (generic - works with any percentile)
470+
ttft_target_ms: int | None = None
471+
itl_target_ms: int | None = None
472+
e2e_target_ms: int | None = None
473+
percentile: str = "p95" # "mean", "p90", "p95", "p99"
474+
475+
# Legacy p95 fields (for backwards compatibility)
476+
ttft_p95_target_ms: int | None = None
477+
itl_p95_target_ms: int | None = None
478+
e2e_p95_target_ms: int | None = None
473479

474480
# Ranking options
475481
min_accuracy: int | None = None
476482
max_cost: float | None = None
477483
include_near_miss: bool = True
478484
weights: BalancedWeights | None = None
485+
486+
def get_ttft_target(self) -> int:
    """Return the TTFT target in ms, preferring the new field over the legacy one.

    Resolution order: ``ttft_target_ms``, then ``ttft_p95_target_ms``, then
    the built-in default of 500. A field is skipped only when it is ``None``,
    so an explicit legacy value (including 0) is honored exactly like an
    explicit new-style value instead of being replaced by the default.
    """
    if self.ttft_target_ms is not None:
        return self.ttft_target_ms
    if self.ttft_p95_target_ms is not None:
        return self.ttft_p95_target_ms
    return 500
489+
490+
def get_itl_target(self) -> int:
    """Return the ITL target in ms/token, preferring the new field over the legacy one.

    Resolution order: ``itl_target_ms``, then ``itl_p95_target_ms``, then
    the built-in default of 50. A field is skipped only when it is ``None``,
    so an explicit legacy value (including 0) is honored exactly like an
    explicit new-style value instead of being replaced by the default.
    """
    if self.itl_target_ms is not None:
        return self.itl_target_ms
    if self.itl_p95_target_ms is not None:
        return self.itl_p95_target_ms
    return 50
493+
494+
def get_e2e_target(self) -> int:
    """Return the end-to-end latency target in ms, preferring the new field over the legacy one.

    Resolution order: ``e2e_target_ms``, then ``e2e_p95_target_ms``, then
    the built-in default of 5000. A field is skipped only when it is ``None``,
    so an explicit legacy value (including 0) is honored exactly like an
    explicit new-style value instead of being replaced by the default.
    """
    if self.e2e_target_ms is not None:
        return self.e2e_target_ms
    if self.e2e_p95_target_ms is not None:
        return self.e2e_p95_target_ms
    return 5000
479497

480498

481499
@app.post("/api/ranked-recommend")
@@ -566,13 +584,19 @@ async def ranked_recommend_from_spec(request: RankedRecommendationFromSpecReques
566584
RankedRecommendationsResponse with 5 ranked lists
567585
"""
568586
try:
587+
# Get SLO targets using helper methods (supports both new and legacy fields)
588+
ttft_target = request.get_ttft_target()
589+
itl_target = request.get_itl_target()
590+
e2e_target = request.get_e2e_target()
591+
percentile = request.percentile
592+
569593
logger.info(
570594
f"Received ranked recommendation from spec: use_case={request.use_case}, "
571595
f"user_count={request.user_count}, qps={request.expected_qps}"
572596
)
573597
logger.info(
574-
f" SLO targets: TTFT={request.ttft_p95_target_ms}ms, "
575-
f"ITL={request.itl_p95_target_ms}ms, E2E={request.e2e_p95_target_ms}ms"
598+
f" SLO targets ({percentile}): TTFT={ttft_target}ms, "
599+
f"ITL={itl_target}ms, E2E={e2e_target}ms"
576600
)
577601
logger.info(
578602
f" Token config: {request.prompt_tokens} -> {request.output_tokens}"
@@ -599,9 +623,10 @@ async def ranked_recommend_from_spec(request: RankedRecommendationFromSpecReques
599623
"expected_qps": request.expected_qps,
600624
},
601625
"slo_targets": {
602-
"ttft_p95_target_ms": request.ttft_p95_target_ms,
603-
"itl_p95_target_ms": request.itl_p95_target_ms,
604-
"e2e_p95_target_ms": request.e2e_p95_target_ms,
626+
"ttft_p95_target_ms": ttft_target,
627+
"itl_p95_target_ms": itl_target,
628+
"e2e_p95_target_ms": e2e_target,
629+
"percentile": percentile,
605630
},
606631
}
607632

backend/src/context_intent/schema.py

Lines changed: 11 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
"""Data schemas for deployment intent and specifications."""
22

3-
from typing import Literal
3+
from typing import Literal, Optional
44

55
from pydantic import BaseModel, Field
66

@@ -14,11 +14,12 @@ class TrafficProfile(BaseModel):
1414

1515

1616
class SLOTargets(BaseModel):
17-
"""Service Level Objective targets for the deployment (p95 percentiles)."""
17+
"""Service Level Objective targets for the deployment."""
1818

19-
ttft_p95_target_ms: int = Field(..., description="Time to First Token p95 target (ms)")
20-
itl_p95_target_ms: int = Field(..., description="Inter-Token Latency p95 target (ms/token)")
21-
e2e_p95_target_ms: int = Field(..., description="End-to-end latency p95 target (ms)")
19+
ttft_p95_target_ms: int = Field(..., description="Time to First Token target (ms)")
20+
itl_p95_target_ms: int = Field(..., description="Inter-Token Latency target (ms/token)")
21+
e2e_p95_target_ms: int = Field(..., description="End-to-end latency target (ms)")
22+
percentile: str = Field(default="p95", description="Percentile for SLO comparison (mean, p90, p95, p99)")
2223

2324

2425
class GPUConfig(BaseModel):
@@ -51,7 +52,7 @@ class DeploymentIntent(BaseModel):
5152
"code_completion",
5253
"code_generation_detailed",
5354
"translation",
54-
"content_creation",
55+
"content_generation",
5556
"summarization_short",
5657
"document_analysis_rag",
5758
"long_document_summarization",
@@ -109,6 +110,9 @@ class DeploymentRecommendation(BaseModel):
109110
predicted_itl_p95_ms: int | None = None
110111
predicted_e2e_p95_ms: int | None = None
111112
predicted_throughput_qps: float | None = None
113+
114+
# All percentile metrics from benchmark (for UI to display based on user selection)
115+
benchmark_metrics: Optional[dict] = Field(default=None, description="All percentile metrics from benchmark")
112116

113117
# Cost estimation (None when no viable config found)
114118
cost_per_hour_usd: float | None = None
@@ -148,6 +152,7 @@ def to_alternative_dict(self) -> dict:
148152
"cost_per_month_usd": self.cost_per_month_usd,
149153
"reasoning": self.reasoning,
150154
"scores": self.scores.model_dump() if self.scores else None,
155+
"benchmark_metrics": self.benchmark_metrics,
151156
}
152157

153158

backend/src/knowledge_base/benchmarks.py

Lines changed: 160 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -19,8 +19,13 @@
1919
2020
These methods are kept for potential Phase 2 API endpoints, debugging, interactive
2121
testing, or future UI features that may need to display available options.
22+
23+
ESTIMATED BENCHMARKS:
24+
- Estimated benchmark data is loaded from benchmarks_redhat_performance.json
25+
- These are marked with estimated=True and included in SLO queries
2226
"""
2327

28+
import json
2429
import logging
2530
import os
2631
from typing import Optional
@@ -66,9 +71,18 @@ def __init__(self, data: dict):
6671
self.e2e_p95 = data["e2e_p95"]
6772
self.e2e_p99 = data["e2e_p99"]
6873

69-
# Throughput
74+
# TPS (tokens per second) metrics - all percentiles
75+
self.tps_mean = data.get("tps_mean")
76+
self.tps_p90 = data.get("tps_p90")
77+
self.tps_p95 = data.get("tps_p95")
78+
self.tps_p99 = data.get("tps_p99")
79+
80+
# Throughput (legacy fields kept for backwards compatibility)
7081
self.tokens_per_second = data["tokens_per_second"]
7182
self.requests_per_second = data["requests_per_second"]
83+
84+
# Estimated flag (True for interpolated benchmarks)
85+
self.estimated = data.get("estimated", False)
7286

7387
def to_dict(self) -> dict:
7488
"""Convert to dictionary."""
@@ -94,13 +108,17 @@ def to_dict(self) -> dict:
94108
"e2e_p90": self.e2e_p90,
95109
"e2e_p95": self.e2e_p95,
96110
"e2e_p99": self.e2e_p99,
111+
"tps_mean": self.tps_mean,
112+
"tps_p90": self.tps_p90,
113+
"tps_p95": self.tps_p95,
114+
"tps_p99": self.tps_p99,
97115
"tokens_per_second": self.tokens_per_second,
98116
"requests_per_second": self.requests_per_second,
99117
}
100118

101119

102120
class BenchmarkRepository:
103-
"""Repository for querying model benchmark data from PostgreSQL."""
121+
"""Repository for querying model benchmark data from PostgreSQL + estimated JSON."""
104122

105123
def __init__(self, database_url: Optional[str] = None):
106124
"""
@@ -114,6 +132,65 @@ def __init__(self, database_url: Optional[str] = None):
114132
"postgresql://postgres:compass@localhost:5432/compass"
115133
)
116134
self._test_connection()
135+
136+
# Load estimated benchmarks from JSON
137+
self._estimated_benchmarks = self._load_estimated_benchmarks()
138+
logger.info(f"Loaded {len(self._estimated_benchmarks)} estimated benchmark configs")
139+
140+
def _load_estimated_benchmarks(self, json_path: Optional[str] = None) -> list[dict]:
    """Load estimated benchmark records from a JSON file.

    Only entries whose ``estimated`` flag is truthy are kept. Each kept entry
    is normalized to the key names used for database rows, accepting both the
    old JSON layout (``hardware`` / ``hardware_count``) and the new one
    (``hardware_type`` or ``gpu_type`` / ``gpu_count``).

    Loading is best-effort: an unreadable file, invalid JSON, or an
    unexpected top-level structure is logged as a warning and yields an
    empty list. Entries that are not JSON objects are skipped individually
    rather than discarding the whole file.

    Args:
        json_path: Path of the JSON file to read. Defaults to
            ``data/benchmarks_redhat_performance.json`` three directories
            above this module.

    Returns:
        A list of normalized benchmark dicts, each with ``estimated`` set to
        True. Empty when nothing could be loaded.
    """
    if json_path is None:
        json_path = os.path.join(
            os.path.dirname(__file__), "..", "..", "..", "data",
            "benchmarks_redhat_performance.json"
        )

    # Keep the try body limited to the I/O and parsing that can actually fail.
    # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        logger.warning(f"Could not load estimated benchmarks: {e}")
        return []

    entries = data.get('benchmarks', []) if isinstance(data, dict) else None
    if not isinstance(entries, list):
        logger.warning(
            f"Could not load estimated benchmarks: unexpected JSON structure in {json_path}"
        )
        return []

    latency_keys = [
        f"{metric}_{pct}"
        for metric in ("ttft", "itl", "e2e")
        for pct in ("mean", "p90", "p95", "p99")
    ]

    estimated = []
    for b in entries:
        if not isinstance(b, dict) or not b.get('estimated'):
            continue

        # Map JSON keys to database column names.
        # Support both old format (hardware/hardware_count) and new format
        # (hardware_type or gpu_type / gpu_count).
        hw = b.get('hardware') or b.get('hardware_type') or b.get('gpu_type') or ''
        hw_count = b.get('hardware_count') or b.get('gpu_count') or 1

        record = {
            'model_hf_repo': b.get('model_id', b.get('model_name', '')),
            'hardware': hw,
            'hardware_count': hw_count,
            'framework': b.get('framework', 'vllm'),
            'framework_version': b.get('framework_version', '0.6.2'),
            'prompt_tokens': b.get('prompt_tokens', 512),
            'output_tokens': b.get('output_tokens', 256),
            'mean_input_tokens': b.get('prompt_tokens', 512),
            'mean_output_tokens': b.get('output_tokens', 256),
        }

        # NOTE(review): a missing latency metric defaults to 0, which makes
        # the entry trivially satisfy any "<= target" SLO check for that
        # metric. Confirm this is intended for incomplete entries.
        for key in latency_keys:
            record[key] = b.get(key, 0)

        # Support both field names (tps_* or tokens_per_second_*).
        for pct in ("mean", "p90", "p95", "p99"):
            record[f"tps_{pct}"] = (
                b.get(f"tps_{pct}") or b.get(f"tokens_per_second_{pct}", 0)
            )

        record['tokens_per_second'] = (
            b.get('tps_mean')
            or b.get('tokens_per_second_mean')
            or b.get('tokens_per_second', 0)
        )
        record['requests_per_second'] = b.get('requests_per_second', 1.0)
        record['estimated'] = True
        estimated.append(record)

    return estimated
117194

118195
def _test_connection(self):
119196
"""Test database connection on initialization."""
@@ -285,7 +362,8 @@ def find_configurations_meeting_slo(
285362
ttft_p95_max_ms: int,
286363
itl_p95_max_ms: int,
287364
e2e_p95_max_ms: int,
288-
min_qps: float = 0
365+
min_qps: float = 0,
366+
percentile: str = "p95"
289367
) -> list[BenchmarkData]:
290368
"""
291369
Find all configurations that meet SLO requirements for a traffic profile.
@@ -299,30 +377,45 @@ def find_configurations_meeting_slo(
299377
Args:
300378
prompt_tokens: Target prompt length
301379
output_tokens: Target output length
302-
ttft_p95_max_ms: Maximum acceptable TTFT p95 (ms)
303-
itl_p95_max_ms: Maximum acceptable ITL p95 (ms/token)
304-
e2e_p95_max_ms: Maximum acceptable E2E p95 (ms)
380+
ttft_p95_max_ms: Maximum acceptable TTFT (ms) - parameter name kept for backwards compat
381+
itl_p95_max_ms: Maximum acceptable ITL (ms/token) - parameter name kept for backwards compat
382+
e2e_p95_max_ms: Maximum acceptable E2E (ms) - parameter name kept for backwards compat
305383
min_qps: Minimum required QPS
384+
percentile: Which percentile column to use (mean, p90, p95, p99)
306385
307386
Returns:
308387
List of benchmarks meeting all criteria (one per system configuration)
309388
"""
389+
# Map percentile to column suffix
390+
valid_percentiles = {"mean", "p90", "p95", "p99"}
391+
if percentile not in valid_percentiles:
392+
logger.warning(f"Invalid percentile '{percentile}', defaulting to p95")
393+
percentile = "p95"
394+
395+
# Build column names based on percentile
396+
ttft_col = f"ttft_{percentile}"
397+
itl_col = f"itl_{percentile}"
398+
e2e_col = f"e2e_{percentile}"
399+
400+
logger.info(f"Querying benchmarks with percentile={percentile} (columns: {ttft_col}, {itl_col}, {e2e_col})")
401+
310402
# Use window function to rank benchmarks by requests_per_second within each
311403
# system configuration, then select only the highest QPS that meets SLO.
312404
# When multiple benchmarks exist at the same QPS, prefer the one with lowest E2E latency.
313-
query = """
405+
# NOTE: Using string formatting for column names is safe here since we validate percentile above
406+
query = f"""
314407
WITH ranked_configs AS (
315408
SELECT *,
316409
ROW_NUMBER() OVER (
317410
PARTITION BY model_hf_repo, hardware, hardware_count
318-
ORDER BY requests_per_second DESC, e2e_p95 ASC
411+
ORDER BY requests_per_second DESC, {e2e_col} ASC
319412
) as rn
320413
FROM exported_summaries
321414
WHERE prompt_tokens = %s
322415
AND output_tokens = %s
323-
AND ttft_p95 <= %s
324-
AND itl_p95 <= %s
325-
AND e2e_p95 <= %s
416+
AND {ttft_col} <= %s
417+
AND {itl_col} <= %s
418+
AND {e2e_col} <= %s
326419
AND requests_per_second >= %s
327420
)
328421
SELECT
@@ -349,9 +442,64 @@ def find_configurations_meeting_slo(
349442
rows = cursor.fetchall()
350443
cursor.close()
351444

352-
return [BenchmarkData(dict(row)) for row in rows]
445+
results = [BenchmarkData(dict(row)) for row in rows]
353446
finally:
354447
conn.close()
448+
449+
# Also include estimated benchmarks that meet SLO criteria
450+
estimated_results = self._get_estimated_benchmarks_meeting_slo(
451+
prompt_tokens=prompt_tokens,
452+
output_tokens=output_tokens,
453+
ttft_max_ms=ttft_p95_max_ms,
454+
itl_max_ms=itl_p95_max_ms,
455+
e2e_max_ms=e2e_p95_max_ms,
456+
min_qps=min_qps,
457+
percentile=percentile,
458+
)
459+
460+
logger.info(f"Found {len(results)} DB benchmarks + {len(estimated_results)} estimated benchmarks")
461+
return results + estimated_results
462+
463+
def _get_estimated_benchmarks_meeting_slo(
    self,
    prompt_tokens: int,
    output_tokens: int,
    ttft_max_ms: float,
    itl_max_ms: float,
    e2e_max_ms: float,
    min_qps: float,
    percentile: str = "p95",
) -> list[BenchmarkData]:
    """Filter the in-memory estimated benchmarks by SLO criteria.

    An entry qualifies when its token configuration matches exactly, its
    TTFT/ITL/E2E values for the requested percentile are each at or below
    the given maximum, and its requests-per-second is at least ``min_qps``.
    One entry is returned per (model, hardware, hardware count) system: the
    one with the highest requests-per-second, with ties broken by the lowest
    E2E latency at the requested percentile.

    Args:
        prompt_tokens: Required prompt length (exact match).
        output_tokens: Required output length (exact match).
        ttft_max_ms: Maximum acceptable TTFT (ms).
        itl_max_ms: Maximum acceptable ITL (ms/token).
        e2e_max_ms: Maximum acceptable E2E latency (ms).
        min_qps: Minimum required requests per second.
        percentile: Suffix selecting which metric keys to compare, e.g.
            "p95" selects ``ttft_p95``, ``itl_p95`` and ``e2e_p95``.

    Returns:
        One ``BenchmarkData`` per qualifying system configuration.
    """
    ttft_col = f"ttft_{percentile}"
    itl_col = f"itl_{percentile}"
    e2e_col = f"e2e_{percentile}"

    def metric(bench: dict, col: str) -> float:
        # A missing key or an explicit None (JSON null) counts as "unknown"
        # and must never satisfy an upper bound, so map both to infinity
        # instead of letting None reach the comparison and raise TypeError.
        value = bench.get(col)
        return float('inf') if value is None else value

    # Best entry per system, stored as (rank, bench). A smaller rank is
    # better: higher RPS first, then lower E2E.
    best_configs = {}

    for bench in self._estimated_benchmarks:
        # Filter by token config
        if bench.get('prompt_tokens') != prompt_tokens or bench.get('output_tokens') != output_tokens:
            continue

        # Check SLO criteria
        ttft_val = metric(bench, ttft_col)
        itl_val = metric(bench, itl_col)
        e2e_val = metric(bench, e2e_col)
        rps = bench.get('requests_per_second') or 0  # None/missing -> 0

        if ttft_val > ttft_max_ms or itl_val > itl_max_ms or e2e_val > e2e_max_ms or rps < min_qps:
            continue

        # Key by model+hardware+count
        key = (bench['model_hf_repo'], bench['hardware'], bench['hardware_count'])

        rank = (-rps, e2e_val)
        if key not in best_configs or rank < best_configs[key][0]:
            best_configs[key] = (rank, bench)

    return [BenchmarkData(bench) for _, bench in best_configs.values()]
355503

356504
def get_available_models(self) -> list[str]:
357505
"""Get list of all available models in the database."""

0 commit comments

Comments
 (0)