chore: typecheck (mypy) fixes

amito · amito · commit 6883fe280f0a · 2026-03-10T10:15:56.000+02:00
Signed-off-by: Amit Oren <amoren@redhat.com>
diff --git a/src/neuralnav/api/routes/configuration.py b/src/neuralnav/api/routes/configuration.py
@@ -443,10 +443,10 @@ async def list_all_deployments():
 
         deployments = []
         for deployment_id in deployment_ids:
-            status = manager.get_inferenceservice_status(deployment_id)
+            svc_status = manager.get_inferenceservice_status(deployment_id)
             pods = manager.get_deployment_pods(deployment_id)
 
-            deployments.append({"deployment_id": deployment_id, "status": status, "pods": pods})
+            deployments.append({"deployment_id": deployment_id, "status": svc_status, "pods": pods})
 
         return {
             "success": True,
diff --git a/src/neuralnav/api/routes/recommendation.py b/src/neuralnav/api/routes/recommendation.py
@@ -1,6 +1,7 @@
 """Recommendation endpoints."""
 
 import logging
+from typing import Literal
 
 from fastapi import APIRouter, HTTPException, status
 from pydantic import BaseModel
@@ -49,7 +50,7 @@ class RankedRecommendationFromSpecRequest(BaseModel):
     ttft_target_ms: int
     itl_target_ms: int
     e2e_target_ms: int
-    percentile: str = "p95"  # "mean", "p90", "p95", "p99"
+    percentile: Literal["mean", "p90", "p95", "p99"] = "p95"
 
     # Ranking options
     min_accuracy: int | None = None
@@ -92,7 +93,7 @@ async def simple_recommend(request: SimpleRecommendationRequest):
                     recommendation=recommendation, namespace="default"
                 )
                 deployment_id = yaml_result["deployment_id"]
-                yaml_files = yaml_result["files"]
+                yaml_files: dict = yaml_result["files"]
                 logger.info(
                     f"Auto-generated YAML files for {deployment_id}: {list(yaml_files.keys())}"
                 )
@@ -272,7 +273,9 @@ async def test_endpoint(message: str = "I need a chatbot for 1000 users"):
         return {
             "success": True,
             "model": recommendation.model_name,
-            "gpu_config": f"{recommendation.gpu_config.gpu_count}x {recommendation.gpu_config.gpu_type}",
+            "gpu_config": f"{recommendation.gpu_config.gpu_count}x {recommendation.gpu_config.gpu_type}"
+            if recommendation.gpu_config
+            else "N/A",
             "cost_per_month": f"${recommendation.cost_per_month_usd:.2f}",
             "meets_slo": recommendation.meets_slo,
             "reasoning": recommendation.reasoning,
diff --git a/src/neuralnav/configuration/generator.py b/src/neuralnav/configuration/generator.py
@@ -77,7 +77,7 @@ def generate_deployment_id(self, recommendation: DeploymentRecommendation) -> st
         use_case = recommendation.intent.use_case.replace("_", "-")
 
         # Clean model name: remove special chars, keep alphanumeric and hyphens
-        model_name = recommendation.model_id.split("/")[-1].lower()
+        model_name = (recommendation.model_id or "unknown").split("/")[-1].lower()
         model_name = re.sub(r"[^a-z0-9-]", "-", model_name)
         # Remove consecutive hyphens
         model_name = re.sub(r"-+", "-", model_name).strip("-")
@@ -120,6 +120,8 @@ def _prepare_template_context(
         traffic = recommendation.traffic_profile
         slo = recommendation.slo_targets
 
+        assert gpu_config is not None, "gpu_config is required for template context"
+
         # Calculate GPU hourly rate from ModelCatalog
         gpu_info = self._catalog.get_gpu_type(gpu_config.gpu_type)
         gpu_hourly_rate = gpu_info.cost_per_hour_usd if gpu_info else 1.0
@@ -166,7 +168,8 @@ def _prepare_template_context(
         # Calculate max_num_seqs based on expected QPS and latency
         # Rule of thumb: concurrent requests = QPS × avg_latency_seconds
         avg_latency_sec = slo.e2e_p95_target_ms / 1000.0
-        max_num_seqs = max(32, int(traffic.expected_qps * avg_latency_sec * 1.5))
+        expected_qps = traffic.expected_qps or 0.0
+        max_num_seqs = max(32, int(expected_qps * avg_latency_sec * 1.5))
 
         # Max batched tokens (vLLM parameter)
         max_num_batched_tokens = max_num_seqs * (traffic.prompt_tokens + traffic.output_tokens)
@@ -228,7 +231,7 @@ def _prepare_template_context(
 
     def generate_all(
         self, recommendation: DeploymentRecommendation, namespace: str = "default"
-    ) -> dict[str, str]:
+    ) -> dict[str, Any]:
         """
         Generate all deployment YAML files.
 
@@ -237,7 +240,7 @@ def generate_all(
             namespace: Kubernetes namespace
 
         Returns:
-            Dictionary mapping config type to file path
+            Dictionary with deployment_id, namespace, files, and metadata
         """
         deployment_id = self.generate_deployment_id(recommendation)
         context = self._prepare_template_context(recommendation, deployment_id, namespace)
diff --git a/src/neuralnav/knowledge_base/model_catalog.py b/src/neuralnav/knowledge_base/model_catalog.py
@@ -57,11 +57,11 @@ def __init__(self, data: dict):
         self.memory_gb = data["memory_gb"]
         self.compute_capability = data["compute_capability"]
         self.typical_use_cases = data["typical_use_cases"]
-        self.cost_per_hour_usd = data["cost_per_hour_usd"]  # Base/minimum price
+        self.cost_per_hour_usd: float = data["cost_per_hour_usd"]  # Base/minimum price
         # Cloud provider-specific pricing (optional)
-        self.cost_per_hour_aws = data.get("cost_per_hour_aws")
-        self.cost_per_hour_gcp = data.get("cost_per_hour_gcp")
-        self.cost_per_hour_azure = data.get("cost_per_hour_azure")
+        self.cost_per_hour_aws: float | None = data.get("cost_per_hour_aws")
+        self.cost_per_hour_gcp: float | None = data.get("cost_per_hour_gcp")
+        self.cost_per_hour_azure: float | None = data.get("cost_per_hour_azure")
         self.availability = data["availability"]
         self.notes = data.get("notes", "")
 
@@ -75,11 +75,11 @@ def get_cost_for_provider(self, provider: str | None = None) -> float:
         Returns:
             Cost per hour in USD
         """
-        if provider == "aws" and self.cost_per_hour_aws:
+        if provider == "aws" and self.cost_per_hour_aws is not None:
             return self.cost_per_hour_aws
-        elif provider == "gcp" and self.cost_per_hour_gcp:
+        elif provider == "gcp" and self.cost_per_hour_gcp is not None:
             return self.cost_per_hour_gcp
-        elif provider == "azure" and self.cost_per_hour_azure:
+        elif provider == "azure" and self.cost_per_hour_azure is not None:
             return self.cost_per_hour_azure
         return self.cost_per_hour_usd
 
@@ -313,13 +313,25 @@ def get_cost_breakdown(
             "hourly_rate_azure": gpu.cost_per_hour_azure,
             "cost_per_hour_total": gpu.cost_per_hour_usd * total_gpus,
             "cost_per_month_base": gpu.cost_per_hour_usd * total_gpus * hours_per_month,
-            "cost_per_month_aws": (gpu.cost_per_hour_aws or gpu.cost_per_hour_usd)
+            "cost_per_month_aws": (
+                gpu.cost_per_hour_aws
+                if gpu.cost_per_hour_aws is not None
+                else gpu.cost_per_hour_usd
+            )
             * total_gpus
             * hours_per_month,
-            "cost_per_month_gcp": (gpu.cost_per_hour_gcp or gpu.cost_per_hour_usd)
+            "cost_per_month_gcp": (
+                gpu.cost_per_hour_gcp
+                if gpu.cost_per_hour_gcp is not None
+                else gpu.cost_per_hour_usd
+            )
             * total_gpus
             * hours_per_month,
-            "cost_per_month_azure": (gpu.cost_per_hour_azure or gpu.cost_per_hour_usd)
+            "cost_per_month_azure": (
+                gpu.cost_per_hour_azure
+                if gpu.cost_per_hour_azure is not None
+                else gpu.cost_per_hour_usd
+            )
             * total_gpus
             * hours_per_month,
         }
diff --git a/src/neuralnav/llm/ollama_client.py b/src/neuralnav/llm/ollama_client.py
@@ -1,9 +1,11 @@
 """Ollama client wrapper for LLM interactions."""
 
+from __future__ import annotations
+
 import json
 import logging
 import os
-from typing import Any
+from typing import Any, Literal
 
 try:
     import ollama
@@ -30,14 +32,15 @@ def __init__(self, model: str | None = None, host: str | None = None):
             host: Optional Ollama host URL. Falls back to OLLAMA_HOST env var,
                   then localhost:11434.
         """
-        self.model = model or os.getenv("OLLAMA_MODEL", "qwen2.5:7b")
+        default_model = os.getenv("OLLAMA_MODEL", "qwen2.5:7b")
+        self.model: str = model if model else default_model
         self.host = host or os.getenv("OLLAMA_HOST")
 
+        self._client: ollama.Client | None = None
         if OLLAMA_AVAILABLE:
             client_kwargs = {"host": self.host} if self.host else {}
             self._client = ollama.Client(**client_kwargs)
         else:
-            self._client = None
             logger.error("Ollama library not installed. Install with: pip install ollama")
 
     def chat(
@@ -57,7 +60,7 @@ def chat(
         Returns:
             Response dict with 'message' containing 'content'
         """
-        if not OLLAMA_AVAILABLE:
+        if not OLLAMA_AVAILABLE or not self._client:
             raise RuntimeError("Ollama library not available")
 
         try:
@@ -71,16 +74,13 @@ def chat(
                     f"[LLM PROMPT] {last_msg.get('content', '')[:500]}..."
                 )  # Log first 500 chars at debug level
 
-            kwargs = {
-                "model": self.model,
-                "messages": messages,
-                "options": {"temperature": temperature},
-            }
-
-            if format_json:
-                kwargs["format"] = "json"
-
-            response = self._client.chat(**kwargs)
+            fmt: Literal["", "json"] = "json" if format_json else ""
+            response = self._client.chat(  # type: ignore[call-overload]
+                model=self.model,
+                messages=messages,
+                format=fmt,
+                options={"temperature": temperature},
+            )
 
             # Log the full response
             response_content = response.get("message", {}).get("content", "")
@@ -93,7 +93,7 @@ def chat(
             logger.info("[LLM RESPONSE CONTENT - END]")
             logger.info("=" * 80)
 
-            return response
+            return dict(response)
 
         except Exception as e:
             logger.error(f"Error calling Ollama: {e}")
@@ -122,7 +122,7 @@ def generate_completion(
 
         messages = [{"role": "user", "content": prompt}]
         response = self.chat(messages, format_json=format_json, temperature=temperature)
-        return response["message"]["content"]
+        return str(response["message"]["content"])
 
     def extract_structured_data(
         self,
@@ -152,7 +152,8 @@ def extract_structured_data(
         )
 
         try:
-            return json.loads(response_text)
+            result: dict[str, Any] = json.loads(response_text)
+            return result
         except json.JSONDecodeError as e:
             logger.error(f"Failed to parse JSON response: {response_text}")
             logger.error(f"JSON error: {e}")
diff --git a/src/neuralnav/llm/prompts.py b/src/neuralnav/llm/prompts.py
@@ -35,7 +35,9 @@
 """
 
 
-def build_intent_extraction_prompt(user_message: str, conversation_history: list = None) -> str:
+def build_intent_extraction_prompt(
+    user_message: str, conversation_history: list | None = None
+) -> str:
     """
     Build prompt for extracting deployment intent from user conversation.
 
diff --git a/src/neuralnav/llm/prompts_experimental.py b/src/neuralnav/llm/prompts_experimental.py
@@ -26,7 +26,7 @@
 
 
 def build_conversational_prompt(
-    user_message: str, current_understanding: dict, conversation_history: list = None
+    user_message: str, current_understanding: dict, conversation_history: list | None = None
 ) -> str:
     """
     Build prompt for conversational AI responses.
diff --git a/src/neuralnav/orchestration/workflow.py b/src/neuralnav/orchestration/workflow.py
@@ -205,10 +205,13 @@ def generate_recommendation_from_specs(self, specifications: dict) -> Deployment
         all_configs.sort(key=lambda x: x.scores.balanced_score if x.scores else 0, reverse=True)
         best_recommendation = all_configs[0]
 
+        gpu_cfg = best_recommendation.gpu_config
         logger.info(
             f"Selected: {best_recommendation.model_name} on "
-            f"{best_recommendation.gpu_config.gpu_count}x {best_recommendation.gpu_config.gpu_type} "
+            f"{gpu_cfg.gpu_count}x {gpu_cfg.gpu_type} "
             f"(balanced score: {best_recommendation.scores.balanced_score if best_recommendation.scores else 0:.1f})"
+            if gpu_cfg
+            else f"Selected: {best_recommendation.model_name} (no GPU config)"
         )
 
         # Add top 3 alternatives
diff --git a/src/neuralnav/recommendation/scorer.py b/src/neuralnav/recommendation/scorer.py
@@ -17,6 +17,7 @@
 import logging
 import re
 from pathlib import Path
+from typing import Literal
 
 logger = logging.getLogger(__name__)
 
@@ -97,7 +98,7 @@ def _load_slo_ranges(self) -> dict:
             with open(config_path) as f:
                 data = json.load(f)
             logger.debug(f"Loaded SLO ranges from {config_path}")
-            return data.get("use_case_slo_workload", {})
+            return dict(data.get("use_case_slo_workload", {}))
         except (FileNotFoundError, json.JSONDecodeError) as e:
             logger.warning(f"Could not load SLO ranges from {config_path}: {e}")
             return {}
@@ -244,9 +245,9 @@ def score_latency(
         target_ttft_ms: int,
         target_itl_ms: int,
         target_e2e_ms: int,
-        use_case: str = None,
+        use_case: str | None = None,
         near_miss_tolerance: float = 0.0,
-    ) -> tuple[int, str]:
+    ) -> tuple[int, Literal["compliant", "near_miss", "exceeds"]]:
         """
         Score latency using CAPPED RANGE SCORING.
 
@@ -294,6 +295,7 @@ def score_latency(
         worst_ratio = max(ratios)
 
         # Determine SLO status using the tolerance passed from config_finder
+        slo_status: Literal["compliant", "near_miss", "exceeds"]
         if worst_ratio <= 1.0:
             slo_status = "compliant"
         elif worst_ratio <= (1.0 + near_miss_tolerance):
diff --git a/src/neuralnav/shared/schemas/specification.py b/src/neuralnav/shared/schemas/specification.py
@@ -1,5 +1,7 @@
 """Specification-related schemas for traffic profiles and SLO targets."""
 
+from typing import Literal
+
 from pydantic import BaseModel, Field
 
 from .intent import DeploymentIntent
@@ -19,7 +21,7 @@ class SLOTargets(BaseModel):
     ttft_p95_target_ms: int = Field(..., description="Time to First Token target (ms)")
     itl_p95_target_ms: int = Field(..., description="Inter-Token Latency target (ms/token)")
     e2e_p95_target_ms: int = Field(..., description="End-to-end latency target (ms)")
-    percentile: str = Field(
+    percentile: Literal["mean", "p90", "p95", "p99"] = Field(
         default="p95", description="Percentile for SLO comparison (mean, p90, p95, p99)"
     )