Skip to content

Commit b452605

Browse files
committed
fix: reliable priority extraction and simplified intent prompt
Priority extraction fix:
- Add deterministic keyword-based priority enforcement because the LLM
  (qwen2.5:7b) unreliably inferred priorities from use case type rather than
  from explicit user statements. Priorities now only reflect what the user
  actually said. The SLO profiles already handle use-case-appropriate
  latency/throughput targets.

Prompt simplification (for smaller LLMs):
- Remove experience_class, complexity_priority, and additional_context from
  the LLM prompt schema. experience_class is still inferred deterministically
  from use_case in post-processing; complexity_priority and additional_context
  were never consumed downstream.

use_case normalization improvements (follow-up to PR #180):
- Lowercase use_case before alias/fuzzy lookup so mixed-case LLM responses
  like "Text_Summarization" are handled correctly.
- Add logger.warning when an unrecognized use_case cannot be resolved by the
  alias map or a fuzzy match, aiding production debugging.
- Remove stale complexity_priority from test helper _base_intent().
- Add 4 unit tests for case-insensitive normalization.

Assisted-by: Claude <noreply@anthropic.com>
Signed-off-by: Andre Fredette <afredette@redhat.com>
1 parent cb1a6fa commit b452605

4 files changed

Lines changed: 138 additions & 46 deletions

File tree

src/planner/intent_extraction/extractor.py

Lines changed: 105 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def extract_intent(
111111
logger.info(f"[EXTRACTED INTENT] {extracted}")
112112

113113
# Validate and parse into Pydantic model
114-
intent = self._parse_extracted_intent(extracted)
114+
intent = self._parse_extracted_intent(extracted, user_message)
115115
logger.info(f"Extracted intent: use_case={intent.use_case}, users={intent.user_count}")
116116

117117
return intent
@@ -120,12 +120,13 @@ def extract_intent(
120120
logger.error(f"Failed to extract intent: {e}")
121121
raise ValueError(f"Intent extraction failed: {e}") from e
122122

123-
def _parse_extracted_intent(self, raw_data: dict) -> DeploymentIntent:
123+
def _parse_extracted_intent(self, raw_data: dict, user_message: str = "") -> DeploymentIntent:
124124
"""
125125
Parse and validate raw LLM output into DeploymentIntent.
126126
127127
Args:
128128
raw_data: Raw dict from LLM
129+
user_message: Original user message for priority validation
129130
130131
Returns:
131132
Validated DeploymentIntent
@@ -134,20 +135,21 @@ def _parse_extracted_intent(self, raw_data: dict) -> DeploymentIntent:
134135
ValueError: If data is invalid
135136
"""
136137
# Handle common LLM mistakes
137-
cleaned_data = self._clean_llm_output(raw_data)
138+
cleaned_data = self._clean_llm_output(raw_data, user_message)
138139

139140
try:
140141
return DeploymentIntent(**cleaned_data)
141142
except Exception as e:
142143
logger.error(f"Failed to parse intent from: {cleaned_data}")
143144
raise ValueError(f"Invalid intent data: {e}") from e
144145

145-
def _clean_llm_output(self, data: dict) -> dict:
146+
def _clean_llm_output(self, data: dict, user_message: str = "") -> dict:
146147
"""
147148
Clean common LLM output mistakes.
148149
149150
Args:
150151
data: Raw LLM output
152+
user_message: Original user message for priority validation
151153
152154
Returns:
153155
Cleaned data dict
@@ -161,7 +163,8 @@ def _clean_llm_output(self, data: dict) -> dict:
161163
cleaned["use_case"] = cleaned["use_case"].split("|")[0].strip()
162164

163165
# Normalize hallucinated use_case values
164-
use_case = cleaned.get("use_case", "")
166+
use_case = cleaned.get("use_case", "").lower()
167+
cleaned["use_case"] = use_case
165168
valid_use_cases = list(get_args(DeploymentIntent.model_fields["use_case"].annotation))
166169
if use_case not in valid_use_cases:
167170
mapped = _USE_CASE_ALIASES.get(use_case)
@@ -173,6 +176,10 @@ def _clean_llm_output(self, data: dict) -> dict:
173176
if close:
174177
logger.info("Fuzzy-matched use_case '%s' -> '%s'", use_case, close[0])
175178
cleaned["use_case"] = close[0]
179+
else:
180+
logger.warning(
181+
"Unrecognized use_case '%s' — no alias or fuzzy match found", use_case
182+
)
176183

177184
# Infer experience_class if not provided
178185
if "experience_class" not in cleaned or not cleaned.get("experience_class"):
@@ -266,7 +273,6 @@ def _clean_llm_output(self, data: dict) -> dict:
266273
"accuracy_priority",
267274
"cost_priority",
268275
"latency_priority",
269-
"complexity_priority",
270276
]:
271277
if priority_field in cleaned:
272278
# Normalize to lowercase and validate
@@ -282,12 +288,104 @@ def _clean_llm_output(self, data: dict) -> dict:
282288
# Field not provided by LLM, default to medium
283289
cleaned[priority_field] = "medium"
284290

291+
# Enforce explicit-only priority extraction.
292+
# Priorities must only reflect explicit user statements, not be inferred
293+
# from the use case type. The SLO profile already handles
294+
# use-case-appropriate latency/throughput targets.
295+
if user_message:
296+
msg_lower = user_message.lower()
297+
self._enforce_explicit_priority(
298+
cleaned,
299+
"accuracy_priority",
300+
msg_lower,
301+
high_keywords=[
302+
"accuracy",
303+
"quality",
304+
"best model",
305+
"top quality",
306+
"most accurate",
307+
"accuracy is critical",
308+
"accuracy is paramount",
309+
"accuracy matters",
310+
],
311+
low_keywords=["good enough", "accuracy less important", "accuracy doesn't matter"],
312+
)
313+
self._enforce_explicit_priority(
314+
cleaned,
315+
"cost_priority",
316+
msg_lower,
317+
high_keywords=[
318+
"cost-effective",
319+
"cost effective",
320+
"budget",
321+
"cost-sensitive",
322+
"cost sensitive",
323+
"minimize cost",
324+
"cheap",
325+
"affordable",
326+
"budget is tight",
327+
],
328+
low_keywords=[
329+
"cost doesn't matter",
330+
"cost is not",
331+
"budget is unlimited",
332+
"money is no object",
333+
],
334+
)
335+
self._enforce_explicit_priority(
336+
cleaned,
337+
"latency_priority",
338+
msg_lower,
339+
high_keywords=[
340+
"low latency",
341+
"fast response",
342+
"speed is important",
343+
"latency is critical",
344+
"latency is important",
345+
"real-time",
346+
"instant response",
347+
],
348+
low_keywords=[
349+
"latency less important",
350+
"latency doesn't matter",
351+
"async",
352+
"latency is not",
353+
],
354+
)
285355
# Remove any unexpected fields that aren't in the schema
286356
valid_fields = DeploymentIntent.model_fields.keys()
287357
cleaned = {k: v for k, v in cleaned.items() if k in valid_fields}
288358

289359
return cleaned
290360

361+
@staticmethod
362+
def _enforce_explicit_priority(
363+
cleaned: dict,
364+
field: str,
365+
msg_lower: str,
366+
high_keywords: list[str],
367+
low_keywords: list[str],
368+
) -> None:
369+
"""Reset a priority field to 'medium' unless the user message contains
370+
explicit keywords that justify 'high' or 'low'."""
371+
has_high = any(kw in msg_lower for kw in high_keywords)
372+
has_low = any(kw in msg_lower for kw in low_keywords)
373+
374+
if has_high:
375+
if cleaned.get(field) != "high":
376+
logger.info(f"Overriding {field} to 'high' based on explicit user keywords")
377+
cleaned[field] = "high"
378+
elif has_low:
379+
if cleaned.get(field) != "low":
380+
logger.info(f"Overriding {field} to 'low' based on explicit user keywords")
381+
cleaned[field] = "low"
382+
elif cleaned.get(field) != "medium":
383+
logger.info(
384+
f"Resetting {field} from '{cleaned.get(field)}' to 'medium' "
385+
f"(no explicit user statement found)"
386+
)
387+
cleaned[field] = "medium"
388+
291389
def infer_missing_fields(self, intent: DeploymentIntent) -> DeploymentIntent:
292390
"""
293391
Infer missing optional fields based on available information.
@@ -302,11 +400,7 @@ def infer_missing_fields(self, intent: DeploymentIntent) -> DeploymentIntent:
302400
if intent.domain_specialization == ["general"]:
303401
if intent.use_case in ["code_generation_detailed", "code_completion"]:
304402
intent.domain_specialization = ["general", "code"]
305-
elif intent.use_case == "translation" or (
306-
"multilingual" in intent.additional_context.lower()
307-
if intent.additional_context
308-
else False
309-
):
403+
elif intent.use_case == "translation":
310404
intent.domain_specialization = ["general", "multilingual"]
311405

312406
return intent

src/planner/llm/prompts.py

Lines changed: 13 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,17 @@
44
Expected JSON schema:
55
{
66
"use_case": "chatbot_conversational|code_completion|code_generation_detailed|translation|content_generation|summarization_short|document_analysis_rag|long_document_summarization|research_legal_analysis",
7-
"experience_class": "instant|conversational|interactive|deferred|batch",
87
"user_count": <integer>,
98
"domain_specialization": ["general"|"code"|"multilingual"|"enterprise"],
109
"preferred_gpu_types": ["<list of GPU types if mentioned, empty list if not specified>"],
1110
"preferred_models": ["<list of model IDs in HuggingFace format if mentioned, empty list if not specified>"],
1211
"accuracy_priority": "low|medium|high",
1312
"cost_priority": "low|medium|high",
14-
"latency_priority": "low|medium|high",
15-
"complexity_priority": "low|medium|high",
16-
"additional_context": "<any other relevant details mentioned>"
13+
"latency_priority": "low|medium|high"
1714
}
1815
16+
All priority fields default to "medium". Set to "high" or "low" only when the user explicitly mentions that priority.
17+
1918
Use case descriptions:
2019
- chatbot_conversational: Real-time conversational chatbots (short prompts, short responses)
2120
- code_completion: Fast code completion/autocomplete (short prompts, short completions)
@@ -28,13 +27,6 @@
2827
- research_legal_analysis: Research/legal document analysis (very long prompts, detailed analysis)
2928
3029
CRITICAL: The use_case value MUST be exactly one of the 9 values listed above. Do not invent variations like "text_summarization" or "chatbot" — use the exact strings shown.
31-
32-
Experience class guidance:
33-
- instant: Sub-200ms response time - code completion, autocomplete
34-
- conversational: Real-time user interaction - chatbots, interactive tools
35-
- interactive: User waiting but tolerates slight delay - RAG Q&A, content generation
36-
- deferred: Quality over speed - long summarization, detailed analysis
37-
- batch: Background/async processing - research, legal analysis
3830
"""
3931

4032

@@ -82,6 +74,16 @@ def build_intent_extraction_prompt(
8274
- "chatbot" or "customer service" or "conversational" → use_case: chatbot_conversational
8375
- "content" or "marketing" or "blog" or "content generation" → use_case: content_generation
8476
- "summarize document" or "summarization" → use_case: summarization_short or long_document_summarization
77+
- "legal" or "regulatory" or "legal analysis" or "legal document" → use_case: research_legal_analysis (NOT document_analysis_rag)
78+
- "research" or "research analysis" → use_case: research_legal_analysis
79+
- "code generation" or "generate code" or "implementing features" or "full code" → use_case: code_generation_detailed (NOT chatbot_conversational, NOT code_completion)
80+
- "code completion" or "autocomplete" or "IDE completion" → use_case: code_completion (NOT code_generation_detailed)
81+
82+
Domain specialization rules:
83+
- If use_case is code_completion or code_generation_detailed → domain_specialization MUST include "code"
84+
- If the request mentions "translation" or "multilingual" or multiple languages → include "multilingual"
85+
- If the request mentions "enterprise" or "knowledge base" → include "enterprise"
86+
- Default to ["general"] only when none of the above apply
8587
8688
GPU extraction examples (canonical names: L4, A100-40, A100-80, H100, H200, B200):
8789
- "running on h200" or "h200" or "H200" → preferred_gpu_types: ["H200"]
@@ -98,22 +100,6 @@ def build_intent_extraction_prompt(
98100
- "run mistral small" → preferred_models: ["mistralai/Mistral-Small-24B-Instruct-2501"]
99101
- No model mentioned → preferred_models: []
100102
101-
Priority extraction (for scoring weights - use "medium" as baseline, adjust based on context):
102-
- accuracy_priority: "high" if user mentions accuracy matters, quality is important, accuracy is critical, best model, or top quality. "low" if user says good enough or accuracy less important.
103-
- cost_priority: "high" if user EXPLICITLY says cost-effective, cost-sensitive, budget constrained, minimize cost, cost is important, or budget is tight. "low" ONLY if user EXPLICITLY says "cost doesn't matter" or "budget is unlimited" or "money is no object". Default to "medium" if not mentioned. DO NOT infer from GPU choice.
104-
- latency_priority: "high" if user mentions low latency needed, fast response critical, speed is important, real-time performance required, or instant responses needed. "low" if user says latency less important or async/batch is acceptable. Default to "medium" if not mentioned.
105-
- complexity_priority: "high" if user wants simple deployment, easy setup. "low" if they're okay with complex setups.
106-
IMPORTANT - Priority Extraction Rules (FOLLOW STRICTLY):
107-
- Only extract priorities from EXPLICIT user statements about priorities, not from hardware choices or use case type
108-
- Hardware preference (H100, L4, etc.) does NOT imply cost_priority - user may have GPUs available or budget allocated
109-
- Use case type does NOT imply latency_priority - default to "medium" unless user explicitly mentions speed/latency concerns
110-
- When in doubt, use "medium" - only deviate when user EXPLICITLY states a priority
111-
Examples:
112-
- "chatbot with H100" → cost_priority: "medium" (H100 doesn't mean cost doesn't matter)
113-
- "low latency is important" → latency_priority: "high" (explicit statement)
114-
- "cost-effective solution" → cost_priority: "high" (explicit statement)
115-
- "chatbot for 300 users, low latency important, H100 gpus" → latency_priority: "high", cost_priority: "medium" (only latency explicitly mentioned)
116-
117103
{INTENT_EXTRACTION_SCHEMA}
118104
"""
119105
return prompt

src/planner/shared/schemas/intent.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,6 @@ class DeploymentIntent(BaseModel):
5454
latency_priority: Literal["low", "medium", "high"] = Field(
5555
default="medium", description="Latency importance"
5656
)
57-
complexity_priority: Literal["low", "medium", "high"] = Field(
58-
default="medium", description="Preference for simpler deployments"
59-
)
60-
61-
additional_context: str | None = Field(
62-
None, description="Any other relevant details from conversation"
63-
)
6457

6558

6659
class ConversationMessage(BaseModel):

tests/unit/test_intent_extractor.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ def _base_intent(**overrides) -> dict:
2121
"accuracy_priority": "medium",
2222
"cost_priority": "medium",
2323
"latency_priority": "medium",
24-
"complexity_priority": "medium",
2524
}
2625
data.update(overrides)
2726
return data
@@ -100,6 +99,26 @@ def test_clean_llm_output_fuzzy_matches_close_typos(extractor, typo, expected):
10099
assert cleaned["use_case"] == expected
101100

102101

102+
# --- Case-insensitive normalization ---
103+
104+
105+
@pytest.mark.unit
106+
@pytest.mark.parametrize(
107+
"mixed_case, expected",
108+
[
109+
("Text_Summarization", "summarization_short"),
110+
("CHATBOT", "chatbot_conversational"),
111+
("Code_Completion", "code_completion"),
112+
("DOCUMENT_ANALYSIS_RAG", "document_analysis_rag"),
113+
],
114+
)
115+
def test_clean_llm_output_handles_case_insensitive(extractor, mixed_case, expected):
116+
"""Mixed-case use_case values are lowercased before alias/fuzzy matching."""
117+
raw = _base_intent(use_case=mixed_case)
118+
cleaned = extractor._clean_llm_output(raw)
119+
assert cleaned["use_case"] == expected
120+
121+
103122
# --- Garbage values are NOT matched ---
104123

105124

0 commit comments

Comments (0)