Skip to content

Commit eedcb64

Browse files
authored
Merge pull request #53 from anfredette/pr48-updates
Fix UI-to-backend data flow and add LLM intent extraction endpoint
2 parents 9e4d994 + e42e779 commit eedcb64

File tree

2 files changed

+418
-194
lines changed

2 files changed

+418
-194
lines changed

backend/src/api/routes.py

Lines changed: 223 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from fastapi.middleware.cors import CORSMiddleware
1111
from pydantic import BaseModel
1212

13+
from ..context_intent.extractor import IntentExtractor
1314
from ..context_intent.schema import (
1415
ConversationMessage,
1516
DeploymentRecommendation,
@@ -115,6 +116,12 @@ class DeploymentStatusResponse(BaseModel):
115116
recommendations: list[str] | None = None
116117

117118

119+
class ExtractRequest(BaseModel):
    """Request body for the intent-extraction endpoint."""

    # Raw natural-language description of the user's deployment needs.
    text: str
123+
124+
118125
# Health check endpoint
119126
@app.get("/health")
120127
async def health_check():
@@ -247,6 +254,191 @@ async def list_use_cases():
247254
raise HTTPException(status_code=500, detail=str(e)) from e
248255

249256

257+
@app.post("/api/v1/extract")
async def extract_intent(request: ExtractRequest):
    """Extract business context from natural language using LLM.

    Takes a user's natural language description of their deployment needs
    and extracts structured intent using Ollama (Qwen 2.5 7B).

    Args:
        request: ExtractRequest with 'text' field containing user input.

    Returns:
        Structured intent with use_case, user_count, priority, etc.,
        serialized to a plain dict.

    Raises:
        HTTPException: 422 when extraction raises ValueError,
            500 on any other unexpected error.
    """
    logger.info("=" * 60)
    logger.info("EXTRACT INTENT REQUEST")
    logger.info("=" * 60)
    # Truncate the echoed input so oversized payloads don't flood the log.
    logger.info(f" Input text: {request.text[:200]}{'...' if len(request.text) > 200 else ''}")

    try:
        # Create intent extractor (uses workflow's LLM client)
        intent_extractor = IntentExtractor(workflow.llm_client)

        # Extract intent from natural language
        intent = intent_extractor.extract_intent(request.text)

        # Infer any missing fields based on use case
        intent = intent_extractor.infer_missing_fields(intent)

        logger.info(f" Extracted use_case: {intent.use_case}")
        logger.info(f" Extracted user_count: {intent.user_count}")
        logger.info(f" Extracted priority: {intent.latency_requirement}")
        logger.info("=" * 60)

        # Return as dict for JSON serialization.
        # Map latency_requirement to 'priority' for UI compatibility.
        result = intent.model_dump()
        result["priority"] = intent.latency_requirement
        return result

    except ValueError as e:
        # logger.exception records the traceback; plain logger.error(f"...")
        # discarded it, making extraction failures hard to debug.
        logger.exception("Intent extraction failed")
        raise HTTPException(status_code=422, detail=str(e)) from e
    except Exception as e:
        logger.exception("Unexpected error during intent extraction")
        raise HTTPException(status_code=500, detail=str(e)) from e
302+
303+
304+
def _round_to_nearest(value: float, nearest: int = 5) -> int:
305+
"""Round a value to the nearest multiple of `nearest`."""
306+
return int(round(value / nearest) * nearest)
307+
308+
309+
def _calculate_percentile_value(min_val: int, max_val: int, percentile: float = 0.75) -> int:
310+
"""Calculate value at given percentile between min and max, rounded to nearest 5."""
311+
value = min_val + (max_val - min_val) * percentile
312+
return _round_to_nearest(value, 5)
313+
314+
315+
@app.get("/api/v1/slo-defaults/{use_case}")
async def get_slo_defaults(use_case: str):
    """Get default SLO values for a use case.

    Returns SLO targets at the 75th percentile between min and max,
    rounded to the nearest 5.

    Args:
        use_case: Key into the use_case_slo_workload config.

    Raises:
        HTTPException: 404 when the config file or the use case is missing,
            500 when SLO data is incomplete or any other error occurs.
    """
    try:
        import json

        # Config lives four levels up from this module, under data/.
        json_path = (
            Path(__file__).parent.parent.parent.parent
            / "data" / "business_context" / "use_case" / "configs"
            / "usecase_slo_workload.json"
        )

        if not json_path.exists():
            logger.error(f"SLO workload config not found at: {json_path}")
            raise HTTPException(status_code=404, detail="SLO workload configuration not found")

        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        use_case_data = data.get("use_case_slo_workload", {}).get(use_case)
        if not use_case_data:
            raise HTTPException(status_code=404, detail=f"Use case '{use_case}' not found")

        slo_targets = use_case_data.get("slo_targets", {})

        defaults = {
            "use_case": use_case,
            "description": use_case_data.get("description", ""),
        }
        # Each metric gets the same min/max/default shape; direct indexing
        # intentionally raises KeyError (-> 500 below) when data is missing.
        for metric in ("ttft_ms", "itl_ms", "e2e_ms"):
            bounds = slo_targets[metric]
            defaults[metric] = {
                "min": bounds["min"],
                "max": bounds["max"],
                "default": _calculate_percentile_value(bounds["min"], bounds["max"], 0.75),
            }

        return {"success": True, "slo_defaults": defaults}

    except HTTPException:
        raise
    except KeyError as e:
        logger.error(f"Missing SLO data for {use_case}: {e}")
        raise HTTPException(status_code=500, detail=f"Missing SLO data: {e}") from e
    except Exception as e:
        logger.error(f"Failed to get SLO defaults for {use_case}: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
374+
375+
376+
@app.get("/api/v1/expected-rps/{use_case}")
async def get_expected_rps(use_case: str, user_count: int = 1000):
    """Calculate expected RPS for a use case based on workload patterns.

    Uses research-backed workload distribution parameters:
    - active_fraction: percentage of users active at any time
    - requests_per_active_user_per_min: request rate per active user

    Formula: expected_rps = (user_count * active_fraction * requests_per_min) / 60

    Args:
        use_case: Key into the use_case_slo_workload config.
        user_count: Total user population to size for (default 1000).

    Raises:
        HTTPException: 404 when the config file or the use case is missing,
            500 when workload data is incomplete or any other error occurs.
    """
    try:
        import json

        # Config lives four levels up from this module, under data/.
        json_path = (
            Path(__file__).parent.parent.parent.parent
            / "data" / "business_context" / "use_case" / "configs"
            / "usecase_slo_workload.json"
        )

        if not json_path.exists():
            logger.error(f"SLO workload config not found at: {json_path}")
            raise HTTPException(status_code=404, detail="SLO workload configuration not found")

        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        use_case_data = data.get("use_case_slo_workload", {}).get(use_case)
        if not use_case_data:
            raise HTTPException(status_code=404, detail=f"Use case '{use_case}' not found")

        workload = use_case_data.get("workload", {})

        # Required parameters - direct indexing intentionally raises
        # KeyError (-> 500 below) when the data is missing.
        active_fraction = workload["active_fraction"]
        requests_per_min = workload["requests_per_active_user_per_min"]
        peak_multiplier = workload.get("peak_multiplier", 2.0)
        distribution = workload.get("distribution", "poisson")

        # Calculate expected RPS using research-based formula
        expected_concurrent = int(user_count * active_fraction)
        expected_rps = (expected_concurrent * requests_per_min) / 60
        expected_rps = max(1, round(expected_rps, 2))  # Minimum 1 RPS, round to 2 decimals

        # Calculate peak RPS for capacity planning
        peak_rps = expected_rps * peak_multiplier

        return {
            "success": True,
            "use_case": use_case,
            "user_count": user_count,
            "workload_params": {
                "active_fraction": active_fraction,
                "requests_per_active_user_per_min": requests_per_min,
                "peak_multiplier": peak_multiplier,
                "distribution": distribution
            },
            "expected_rps": expected_rps,
            "expected_concurrent_users": expected_concurrent,
            "peak_rps": round(peak_rps, 2)
        }

    except HTTPException:
        raise
    except KeyError as e:
        logger.error(f"Missing workload data for {use_case}: {e}")
        raise HTTPException(status_code=500, detail=f"Missing workload data: {e}") from e
    except Exception as e:
        logger.error(f"Failed to calculate expected RPS for {use_case}: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
440+
441+
250442
# Get benchmark data
251443
@app.get("/api/v1/benchmarks")
252444
async def get_benchmarks():
@@ -589,23 +781,40 @@ async def ranked_recommend_from_spec(request: RankedRecommendationFromSpecReques
589781
itl_target = request.get_itl_target()
590782
e2e_target = request.get_e2e_target()
591783
percentile = request.percentile
592-
593-
logger.info(
594-
f"Received ranked recommendation from spec: use_case={request.use_case}, "
595-
f"user_count={request.user_count}, qps={request.expected_qps}"
596-
)
597-
logger.info(
598-
f" SLO targets ({percentile}): TTFT={ttft_target}ms, "
599-
f"ITL={itl_target}ms, E2E={e2e_target}ms"
600-
)
601-
logger.info(
602-
f" Token config: {request.prompt_tokens} -> {request.output_tokens}"
603-
)
784+
785+
# Log complete request for debugging
786+
logger.info("=" * 60)
787+
logger.info("RANKED-RECOMMEND-FROM-SPEC REQUEST")
788+
logger.info("=" * 60)
789+
logger.info(f" use_case: {request.use_case}")
790+
logger.info(f" user_count: {request.user_count}")
791+
logger.info(f" latency_requirement: {request.latency_requirement}")
792+
logger.info(f" budget_constraint: {request.budget_constraint}")
793+
logger.info(f" hardware_preference: {request.hardware_preference}")
794+
logger.info(f" prompt_tokens: {request.prompt_tokens}")
795+
logger.info(f" output_tokens: {request.output_tokens}")
796+
logger.info(f" expected_qps: {request.expected_qps}")
797+
logger.info(f" percentile: {percentile}")
798+
logger.info(f" ttft_target_ms (raw): {request.ttft_target_ms}")
799+
logger.info(f" itl_target_ms (raw): {request.itl_target_ms}")
800+
logger.info(f" e2e_target_ms (raw): {request.e2e_target_ms}")
801+
logger.info(f" ttft_p95_target_ms (legacy): {request.ttft_p95_target_ms}")
802+
logger.info(f" itl_p95_target_ms (legacy): {request.itl_p95_target_ms}")
803+
logger.info(f" e2e_p95_target_ms (legacy): {request.e2e_p95_target_ms}")
804+
logger.info(f" -> Resolved TTFT: {ttft_target}ms")
805+
logger.info(f" -> Resolved ITL: {itl_target}ms")
806+
logger.info(f" -> Resolved E2E: {e2e_target}ms")
807+
logger.info(f" min_accuracy: {request.min_accuracy}")
808+
logger.info(f" max_cost: {request.max_cost}")
809+
logger.info(f" include_near_miss: {request.include_near_miss}")
604810
if request.weights:
605811
logger.info(
606-
f" Weights: A={request.weights.accuracy}, P={request.weights.price}, "
607-
f"L={request.weights.latency}, C={request.weights.complexity}"
812+
f" weights: accuracy={request.weights.accuracy}, price={request.weights.price}, "
813+
f"latency={request.weights.latency}, complexity={request.weights.complexity}"
608814
)
815+
else:
816+
logger.info(" weights: None (using defaults)")
817+
logger.info("=" * 60)
609818

610819
# Build specifications dict for workflow
611820
specifications = {

0 commit comments

Comments
 (0)