7 | 7 | "experience_class": "instant|conversational|interactive|deferred|batch", |
8 | 8 | "user_count": <integer>, |
9 | 9 | "domain_specialization": ["general"|"code"|"multilingual"|"enterprise"], |
| 10 | + "preferred_gpu_type": "<GPU type if mentioned (H100, H200, A100, L4), or 'Any GPU' if not specified>", |
10 | 11 | "accuracy_priority": "low|medium|high", |
11 | 12 | "cost_priority": "low|medium|high", |
12 | 13 | "latency_priority": "low|medium|high", |
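The schema fields above can be validated deterministically after the model replies. The sketch below is a hypothetical helper (not part of this PR) that parses the model's JSON output and falls back to the baselines the prompt describes; the field names and allowed values are taken from the schema in this diff, everything else is an assumption.

```python
import json

# Allowed values taken from the schema above; the validator itself
# is a hypothetical sketch, not code from this PR.
PRIORITY_LEVELS = {"low", "medium", "high"}
EXPERIENCE_CLASSES = {"instant", "conversational", "interactive", "deferred", "batch"}

def parse_intent(raw: str) -> dict:
    """Parse the model's JSON reply, falling back to safe defaults."""
    try:
        intent = json.loads(raw)
    except json.JSONDecodeError:
        intent = {}
    if intent.get("experience_class") not in EXPERIENCE_CLASSES:
        intent["experience_class"] = "conversational"
    for key in ("accuracy_priority", "cost_priority", "latency_priority"):
        if intent.get(key) not in PRIORITY_LEVELS:
            intent[key] = "medium"  # "medium" is the baseline per the prompt
    if not isinstance(intent.get("user_count"), int):
        intent["user_count"] = 1
    if not intent.get("preferred_gpu_type"):
        intent["preferred_gpu_type"] = "Any GPU"  # default when no GPU is mentioned
    return intent
```

Guarding every field this way means a malformed or truncated model reply degrades to neutral defaults instead of crashing downstream scoring.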
@@ -63,13 +64,23 @@ def build_intent_extraction_prompt(user_message: str, conversation_history: list |
63 | 64 | 1. **Use case**: What type of application (chatbot, customer service, code generation, summarization, etc.) |
64 | 65 | 2. **User count**: How many users or scale mentioned (estimate if not explicit) |
65 | 66 | 3. **Domain specialization**: Any specific domains mentioned (code, multilingual, enterprise, etc.) |
| 67 | +4. **Latency requirement**: How important is low latency? (very_high = sub-500ms, high = sub-2s, medium = 2-5s, low = >5s acceptable) |
| 68 | +5. **Throughput priority**: Is high request volume more important than low latency? |
| 69 | +6. **Budget constraint**: How price-sensitive are they? |
| 70 | +7. **Preferred GPU**: If the user mentions a specific GPU type (H100, H200, A100, A100-80, L4, B200), extract it
66 | 72 |
67 | 73 | Be intelligent about inference: |
68 | 74 | - "thousands of users" → estimate specific number |
69 | 75 | - "document Q&A" or "knowledge base" or "document search" → use_case: document_analysis_rag |
70 | 76 | - "RAG" or "retrieval" → use_case: document_analysis_rag |
71 | 77 | - "chatbot" or "customer service" or "conversational" → use_case: chatbot_conversational |
72 | 78 | - "summarize document" or "summarization" → use_case: summarization_short or long_document_summarization |
| 79 | +- "running on h200" or "h200" or "H200" → preferred_gpu_type: "H200" |
| 80 | +- "h100" or "H100" → preferred_gpu_type: "H100" |
| 81 | +- "a100" or "A100" → preferred_gpu_type: "A100" |
| 82 | +- "l4" or "L4" → preferred_gpu_type: "L4" |
| 83 | +- No GPU mentioned → preferred_gpu_type: "Any GPU" |
73 | 84 |
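The GPU-mention rules above are simple enough to back with a deterministic fallback in case the model misses one. This is a sketch of such a helper, not code from this PR; it mirrors the prompt's mapping rules, checking longer names first so "H200" is never reduced to a shorter match.

```python
import re

# Known GPU types from the prompt rules above, longest names first
# so "A100-80" is not swallowed by the plain "A100" pattern.
_GPU_PATTERNS = ["A100-80", "H200", "H100", "B200", "A100", "L4"]

def extract_gpu_type(message: str) -> str:
    """Return the first GPU type mentioned, or 'Any GPU' if none (sketch)."""
    for gpu in _GPU_PATTERNS:
        # \b word boundaries keep "L4" from matching inside e.g. "L40"
        if re.search(rf"\b{re.escape(gpu)}\b", message, re.IGNORECASE):
            return gpu
    return "Any GPU"
```

Running the regex fallback on the raw user message and preferring its result when the model returns an unknown GPU string keeps the extraction case-insensitive, exactly as the "h200"/"H200" examples above require.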
74 | 85 | Priority extraction (for scoring weights - use "medium" as baseline, adjust based on context): |
75 | 86 | - accuracy_priority: "high" if the user indicates accuracy matters (quality is important, accuracy is critical, wants the best model or top quality); "low" if the user says "good enough" or that accuracy is less important.