Skip to content

Commit c9023dd

Browse files
authored
Merge pull request #65 from yuvalluria/feat/gpu-preference-extraction
feat: Add GPU preference extraction from natural language
2 parents 9d3a221 + 73651a5 commit c9023dd

File tree

4 files changed

+46
-3
lines changed

4 files changed

+46
-3
lines changed

backend/src/context_intent/schema.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,12 @@ class DeploymentIntent(BaseModel):
7070
description="Domain requirements (general, code, multilingual, enterprise)",
7171
)
7272

73+
# Hardware preference extracted from natural language
74+
preferred_gpu_type: str = Field(
75+
default="Any GPU",
76+
description="User's preferred GPU type if mentioned (e.g., H100, H200, A100, L4) or 'Any GPU' if not specified"
77+
)
78+
7379
# Priority hints extracted from natural language (used for weight calculation)
7480
accuracy_priority: Literal["low", "medium", "high"] = Field(
7581
default="medium", description="Accuracy/quality importance"

backend/src/llm/prompts.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"experience_class": "instant|conversational|interactive|deferred|batch",
88
"user_count": <integer>,
99
"domain_specialization": ["general"|"code"|"multilingual"|"enterprise"],
10+
"preferred_gpu_type": "<GPU type if mentioned (H100, H200, A100, L4), or 'Any GPU' if not specified>",
1011
"accuracy_priority": "low|medium|high",
1112
"cost_priority": "low|medium|high",
1213
"latency_priority": "low|medium|high",
@@ -63,13 +64,23 @@ def build_intent_extraction_prompt(user_message: str, conversation_history: list
6364
1. **Use case**: What type of application (chatbot, customer service, code generation, summarization, etc.)
6465
2. **User count**: How many users or scale mentioned (estimate if not explicit)
6566
3. **Domain specialization**: Any specific domains mentioned (code, multilingual, enterprise, etc.)
67+
4. **Latency requirement**: How important is low latency? (very_high = sub-500ms, high = sub-2s, medium = 2-5s, low = >5s acceptable)
68+
5. **Throughput priority**: Is high request volume more important than low latency?
69+
6. **Budget constraint**: How price-sensitive are they?
70+
7. **Domain specialization**: Any specific domains mentioned (code, multilingual, enterprise, etc.)
71+
8. **Preferred GPU**: If user mentions a specific GPU type (H100, H200, A100, A100-80, L4, B200), extract it
6672
6773
Be intelligent about inference:
6874
- "thousands of users" → estimate specific number
6975
- "document Q&A" or "knowledge base" or "document search" → use_case: document_analysis_rag
7076
- "RAG" or "retrieval" → use_case: document_analysis_rag
7177
- "chatbot" or "customer service" or "conversational" → use_case: chatbot_conversational
7278
- "summarize document" or "summarization" → use_case: summarization_short or long_document_summarization
79+
- "running on h200" or "h200" or "H200" → preferred_gpu_type: "H200"
80+
- "h100" or "H100" → preferred_gpu_type: "H100"
81+
- "a100" or "A100" → preferred_gpu_type: "A100"
82+
- "l4" or "L4" → preferred_gpu_type: "L4"
83+
- No GPU mentioned → preferred_gpu_type: "Any GPU"
7384
7485
Priority extraction (for scoring weights - use "medium" as baseline, adjust based on context):
7586
- accuracy_priority: "high" if user mentions accuracy matters, quality is important, accuracy is critical, best model, or top quality. "low" if user says good enough or accuracy less important.

backend/src/recommendation/capacity_planner.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,22 @@ def plan_all_capacities(
184184
)
185185
return []
186186

187+
# Filter by preferred GPU type if specified (skip if "Any GPU")
188+
if intent.preferred_gpu_type and intent.preferred_gpu_type.lower() != "any gpu":
189+
preferred_gpu = intent.preferred_gpu_type.upper()
190+
original_count = len(matching_configs)
191+
matching_configs = [
192+
c for c in matching_configs
193+
if c.hardware.upper() == preferred_gpu
194+
]
195+
logger.info(
196+
f"Filtered by preferred GPU '{preferred_gpu}': "
197+
f"{original_count} → {len(matching_configs)} configs"
198+
)
199+
if not matching_configs:
200+
logger.warning(f"No configurations found for preferred GPU: {preferred_gpu}")
201+
return []
202+
187203
# Build model lookup from catalog for scoring
188204
# Models not in catalog will get accuracy score = 0
189205
all_models = self.catalog.get_all_models()

ui/app.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2287,7 +2287,11 @@ def extract_business_context(user_input: str) -> Optional[dict]:
22872287
)
22882288
if response.status_code == 200:
22892289
result = response.json()
2290-
logger.info(f"LLM extraction successful: {result.get('use_case')}, priorities: acc={result.get('accuracy_priority')}, cost={result.get('cost_priority')}, lat={result.get('latency_priority')}, comp={result.get('complexity_priority')}")
2290+
# Map preferred_gpu_type to hardware for UI compatibility
2291+
if 'preferred_gpu_type' in result:
2292+
gpu = result['preferred_gpu_type']
2293+
result['hardware'] = None if gpu == "Any GPU" else gpu
2294+
logger.info(f"LLM extraction successful: {result.get('use_case')}, hardware={result.get('hardware')}, priorities: acc={result.get('accuracy_priority')}, cost={result.get('cost_priority')}, lat={result.get('latency_priority')}, comp={result.get('complexity_priority')}")
22912295
return result
22922296
else:
22932297
logger.warning(f"LLM extraction API returned status {response.status_code}: {response.text[:200]}")
@@ -2391,14 +2395,20 @@ def mock_extraction(user_input: str) -> dict:
23912395
user_count = int(num)
23922396
break
23932397

2394-
# Detect hardware
2398+
# Detect hardware preference
23952399
hardware = None
2396-
if "h100" in text_lower:
2400+
if "h200" in text_lower:
2401+
hardware = "H200"
2402+
elif "h100" in text_lower:
23972403
hardware = "H100"
23982404
elif "a100" in text_lower:
23992405
hardware = "A100"
2406+
elif "l4" in text_lower:
2407+
hardware = "L4"
24002408
elif "l40" in text_lower:
24012409
hardware = "L40S"
2410+
elif "b200" in text_lower:
2411+
hardware = "B200"
24022412

24032413
# Detect priority from user input
24042414
priority = "balanced" # default

0 commit comments

Comments
 (0)