|
10 | 10 | from fastapi.middleware.cors import CORSMiddleware |
11 | 11 | from pydantic import BaseModel |
12 | 12 |
|
| 13 | +from ..context_intent.extractor import IntentExtractor |
13 | 14 | from ..context_intent.schema import ( |
14 | 15 | ConversationMessage, |
15 | 16 | DeploymentRecommendation, |
@@ -115,6 +116,12 @@ class DeploymentStatusResponse(BaseModel): |
115 | 116 | recommendations: list[str] | None = None |
116 | 117 |
|
117 | 118 |
|
class ExtractRequest(BaseModel):
    """Request body for POST /api/v1/extract: intent extraction from natural language."""

    # Free-form natural-language description of the user's deployment needs;
    # forwarded verbatim to the LLM-backed IntentExtractor.
    text: str
118 | 125 | # Health check endpoint |
119 | 126 | @app.get("/health") |
120 | 127 | async def health_check(): |
@@ -247,6 +254,191 @@ async def list_use_cases(): |
247 | 254 | raise HTTPException(status_code=500, detail=str(e)) from e |
248 | 255 |
|
249 | 256 |
|
@app.post("/api/v1/extract")
async def extract_intent(request: ExtractRequest):
    """Extract business context from natural language using LLM.

    Takes a user's natural language description of their deployment needs
    and extracts structured intent using Ollama (Qwen 2.5 7B).

    Args:
        request: ExtractRequest with 'text' field containing user input

    Returns:
        Structured intent with use_case, user_count, priority, etc.

    Raises:
        HTTPException: 422 when extraction rejects the input, 500 otherwise.
    """
    separator = "=" * 60
    logger.info(separator)
    logger.info("EXTRACT INTENT REQUEST")
    logger.info(separator)
    logger.info(f"  Input text: {request.text[:200]}{'...' if len(request.text) > 200 else ''}")

    try:
        # Reuse the workflow's LLM client rather than creating a new connection.
        extractor = IntentExtractor(workflow.llm_client)

        # Two-phase extraction: parse the free text, then backfill anything
        # the user did not state from use-case defaults.
        intent = extractor.extract_intent(request.text)
        intent = extractor.infer_missing_fields(intent)

        logger.info(f"  Extracted use_case: {intent.use_case}")
        logger.info(f"  Extracted user_count: {intent.user_count}")
        logger.info(f"  Extracted priority: {intent.latency_requirement}")
        logger.info(separator)

        # Serialize for JSON and mirror latency_requirement under the
        # 'priority' key the UI expects.
        payload = intent.model_dump()
        payload["priority"] = intent.latency_requirement
        return payload

    except ValueError as e:
        logger.error(f"Intent extraction failed: {e}")
        raise HTTPException(status_code=422, detail=str(e)) from e
    except Exception as e:
        logger.error(f"Unexpected error during intent extraction: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
| 302 | + |
| 303 | + |
| 304 | +def _round_to_nearest(value: float, nearest: int = 5) -> int: |
| 305 | + """Round a value to the nearest multiple of `nearest`.""" |
| 306 | + return int(round(value / nearest) * nearest) |
| 307 | + |
| 308 | + |
def _calculate_percentile_value(min_val: int, max_val: int, percentile: float = 0.75) -> int:
    """Interpolate between *min_val* and *max_val* at *percentile*, snapped to a multiple of 5."""
    span = max_val - min_val
    interpolated = min_val + span * percentile
    # Inline of _round_to_nearest(interpolated, 5): same round()-based snap.
    return int(round(interpolated / 5) * 5)
| 313 | + |
| 314 | + |
@app.get("/api/v1/slo-defaults/{use_case}")
async def get_slo_defaults(use_case: str):
    """Get default SLO values for a use case.

    Returns SLO targets at the 75th percentile between min and max,
    rounded to the nearest 5.

    Args:
        use_case: Key into the use_case_slo_workload config.

    Raises:
        HTTPException: 404 when the config file or use case is missing,
            500 when required SLO fields are absent or loading fails.
    """
    try:
        import json
        json_path = Path(__file__).parent.parent.parent.parent / "data" / "business_context" / "use_case" / "configs" / "usecase_slo_workload.json"

        if not json_path.exists():
            logger.error(f"SLO workload config not found at: {json_path}")
            raise HTTPException(status_code=404, detail="SLO workload configuration not found")

        with open(json_path, 'r') as f:
            data = json.load(f)

        use_case_data = data.get("use_case_slo_workload", {}).get(use_case)
        if not use_case_data:
            raise HTTPException(status_code=404, detail=f"Use case '{use_case}' not found")

        slo_targets = use_case_data.get("slo_targets", {})

        def _slo_range(metric: str) -> dict:
            # Required metric: a missing key raises KeyError, which the
            # handler below converts to a 500 with a clear message.
            bounds = slo_targets[metric]
            return {
                "min": bounds["min"],
                "max": bounds["max"],
                "default": _calculate_percentile_value(bounds["min"], bounds["max"], 0.75),
            }

        defaults = {
            "use_case": use_case,
            "description": use_case_data.get("description", ""),
            "ttft_ms": _slo_range("ttft_ms"),
            "itl_ms": _slo_range("itl_ms"),
            "e2e_ms": _slo_range("e2e_ms"),
        }

        return {"success": True, "slo_defaults": defaults}

    except HTTPException:
        raise
    except KeyError as e:
        logger.error(f"Missing SLO data for {use_case}: {e}")
        raise HTTPException(status_code=500, detail=f"Missing SLO data: {e}") from e
    except Exception as e:
        logger.error(f"Failed to get SLO defaults for {use_case}: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
| 374 | + |
| 375 | + |
@app.get("/api/v1/expected-rps/{use_case}")
async def get_expected_rps(use_case: str, user_count: int = 1000):
    """Calculate expected RPS for a use case based on workload patterns.

    Uses research-backed workload distribution parameters:
    - active_fraction: percentage of users active at any time
    - requests_per_active_user_per_min: request rate per active user

    Formula: expected_rps = (user_count * active_fraction * requests_per_min) / 60

    Raises:
        HTTPException: 404 when the config or use case is missing,
            500 when required workload fields are absent or loading fails.
    """
    try:
        import json
        json_path = Path(__file__).parent.parent.parent.parent / "data" / "business_context" / "use_case" / "configs" / "usecase_slo_workload.json"

        if not json_path.exists():
            logger.error(f"SLO workload config not found at: {json_path}")
            raise HTTPException(status_code=404, detail="SLO workload configuration not found")

        with open(json_path, 'r') as f:
            config = json.load(f)

        entry = config.get("use_case_slo_workload", {}).get(use_case)
        if not entry:
            raise HTTPException(status_code=404, detail=f"Use case '{use_case}' not found")

        workload = entry.get("workload", {})

        # Required fields: a missing key raises KeyError -> 500 below.
        active_fraction = workload["active_fraction"]
        per_user_rate = workload["requests_per_active_user_per_min"]
        # Optional fields fall back to sensible planning defaults.
        peak_multiplier = workload.get("peak_multiplier", 2.0)
        distribution = workload.get("distribution", "poisson")

        # Research-based formula; floor at 1 RPS and keep 2-decimal precision.
        concurrent_users = int(user_count * active_fraction)
        steady_rps = max(1, round((concurrent_users * per_user_rate) / 60, 2))

        # Peak rate for capacity planning headroom.
        peak_rps = steady_rps * peak_multiplier

        return {
            "success": True,
            "use_case": use_case,
            "user_count": user_count,
            "workload_params": {
                "active_fraction": active_fraction,
                "requests_per_active_user_per_min": per_user_rate,
                "peak_multiplier": peak_multiplier,
                "distribution": distribution
            },
            "expected_rps": steady_rps,
            "expected_concurrent_users": concurrent_users,
            "peak_rps": round(peak_rps, 2)
        }

    except HTTPException:
        raise
    except KeyError as e:
        logger.error(f"Missing workload data for {use_case}: {e}")
        raise HTTPException(status_code=500, detail=f"Missing workload data: {e}") from e
    except Exception as e:
        logger.error(f"Failed to calculate expected RPS for {use_case}: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
| 440 | + |
| 441 | + |
250 | 442 | # Get benchmark data |
251 | 443 | @app.get("/api/v1/benchmarks") |
252 | 444 | async def get_benchmarks(): |
@@ -589,23 +781,40 @@ async def ranked_recommend_from_spec(request: RankedRecommendationFromSpecReques |
589 | 781 | itl_target = request.get_itl_target() |
590 | 782 | e2e_target = request.get_e2e_target() |
591 | 783 | percentile = request.percentile |
592 | | - |
593 | | - logger.info( |
594 | | - f"Received ranked recommendation from spec: use_case={request.use_case}, " |
595 | | - f"user_count={request.user_count}, qps={request.expected_qps}" |
596 | | - ) |
597 | | - logger.info( |
598 | | - f" SLO targets ({percentile}): TTFT={ttft_target}ms, " |
599 | | - f"ITL={itl_target}ms, E2E={e2e_target}ms" |
600 | | - ) |
601 | | - logger.info( |
602 | | - f" Token config: {request.prompt_tokens} -> {request.output_tokens}" |
603 | | - ) |
| 784 | + |
| 785 | + # Log complete request for debugging |
| 786 | + logger.info("=" * 60) |
| 787 | + logger.info("RANKED-RECOMMEND-FROM-SPEC REQUEST") |
| 788 | + logger.info("=" * 60) |
| 789 | + logger.info(f" use_case: {request.use_case}") |
| 790 | + logger.info(f" user_count: {request.user_count}") |
| 791 | + logger.info(f" latency_requirement: {request.latency_requirement}") |
| 792 | + logger.info(f" budget_constraint: {request.budget_constraint}") |
| 793 | + logger.info(f" hardware_preference: {request.hardware_preference}") |
| 794 | + logger.info(f" prompt_tokens: {request.prompt_tokens}") |
| 795 | + logger.info(f" output_tokens: {request.output_tokens}") |
| 796 | + logger.info(f" expected_qps: {request.expected_qps}") |
| 797 | + logger.info(f" percentile: {percentile}") |
| 798 | + logger.info(f" ttft_target_ms (raw): {request.ttft_target_ms}") |
| 799 | + logger.info(f" itl_target_ms (raw): {request.itl_target_ms}") |
| 800 | + logger.info(f" e2e_target_ms (raw): {request.e2e_target_ms}") |
| 801 | + logger.info(f" ttft_p95_target_ms (legacy): {request.ttft_p95_target_ms}") |
| 802 | + logger.info(f" itl_p95_target_ms (legacy): {request.itl_p95_target_ms}") |
| 803 | + logger.info(f" e2e_p95_target_ms (legacy): {request.e2e_p95_target_ms}") |
| 804 | + logger.info(f" -> Resolved TTFT: {ttft_target}ms") |
| 805 | + logger.info(f" -> Resolved ITL: {itl_target}ms") |
| 806 | + logger.info(f" -> Resolved E2E: {e2e_target}ms") |
| 807 | + logger.info(f" min_accuracy: {request.min_accuracy}") |
| 808 | + logger.info(f" max_cost: {request.max_cost}") |
| 809 | + logger.info(f" include_near_miss: {request.include_near_miss}") |
604 | 810 | if request.weights: |
605 | 811 | logger.info( |
606 | | - f" Weights: A={request.weights.accuracy}, P={request.weights.price}, " |
607 | | - f"L={request.weights.latency}, C={request.weights.complexity}" |
| 812 | + f" weights: accuracy={request.weights.accuracy}, price={request.weights.price}, " |
| 813 | + f"latency={request.weights.latency}, complexity={request.weights.complexity}" |
608 | 814 | ) |
| 815 | + else: |
| 816 | + logger.info(" weights: None (using defaults)") |
| 817 | + logger.info("=" * 60) |
609 | 818 |
|
610 | 819 | # Build specifications dict for workflow |
611 | 820 | specifications = { |
|
0 commit comments