Skip to content

Commit 5d9f022

Browse files
authored
Merge pull request #112 from anfredette/yaml-gen-test
fix: use correct GPU node selector labels and improve prod probe timeouts
2 parents c585412 + 2defd27 commit 5d9f022

File tree

4 files changed

+24
-6
lines changed

4 files changed

+24
-6
lines changed

data/configuration/model_catalog.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,6 +1280,7 @@
12801280
{
12811281
"gpu_type": "L4",
12821282
"aliases": ["NVIDIA-L4", "L4"],
1283+
"node_selector_label": "NVIDIA-L4",
12831284
"memory_gb": 24,
12841285
"compute_capability": "8.9",
12851286
"typical_use_cases": ["inference"],
@@ -1293,6 +1294,7 @@
12931294
{
12941295
"gpu_type": "A10G",
12951296
"aliases": ["NVIDIA-A10G", "A10G"],
1297+
"node_selector_label": "NVIDIA-A10G",
12961298
"memory_gb": 24,
12971299
"compute_capability": "8.6",
12981300
"typical_use_cases": ["inference"],
@@ -1306,6 +1308,7 @@
13061308
{
13071309
"gpu_type": "A100-40",
13081310
"aliases": ["NVIDIA-A100-40GB", "A100-40", "A100-40GB"],
1311+
"node_selector_label": "NVIDIA-A100-SXM4-40GB",
13091312
"memory_gb": 40,
13101313
"compute_capability": "8.0",
13111314
"typical_use_cases": ["inference", "training"],
@@ -1319,6 +1322,7 @@
13191322
{
13201323
"gpu_type": "A100-80",
13211324
"aliases": ["NVIDIA-A100-80GB", "A100-80", "A100-80GB"],
1325+
"node_selector_label": "NVIDIA-A100-SXM4-80GB",
13221326
"memory_gb": 80,
13231327
"compute_capability": "8.0",
13241328
"typical_use_cases": ["inference", "training"],
@@ -1332,6 +1336,7 @@
13321336
{
13331337
"gpu_type": "H100",
13341338
"aliases": ["NVIDIA-H100", "H100", "H100-80GB"],
1339+
"node_selector_label": "NVIDIA-H100-80GB-HBM3",
13351340
"memory_gb": 80,
13361341
"compute_capability": "9.0",
13371342
"typical_use_cases": ["inference", "training"],
@@ -1345,6 +1350,7 @@
13451350
{
13461351
"gpu_type": "H200",
13471352
"aliases": ["NVIDIA-H200", "H200", "H200-141GB"],
1353+
"node_selector_label": "NVIDIA-H200-141GB-HBM3",
13481354
"memory_gb": 141,
13491355
"compute_capability": "9.0",
13501356
"typical_use_cases": ["inference", "training"],
@@ -1358,6 +1364,7 @@
13581364
{
13591365
"gpu_type": "B200",
13601366
"aliases": ["NVIDIA-B200", "B200"],
1367+
"node_selector_label": "NVIDIA-B200",
13611368
"memory_gb": 192,
13621369
"compute_capability": "10.0",
13631370
"typical_use_cases": ["inference", "training"],
@@ -1371,6 +1378,7 @@
13711378
{
13721379
"gpu_type": "MI300X",
13731380
"aliases": ["AMD-MI300X", "MI300X", "AMD-Instinct-MI300X"],
1381+
"node_selector_label": "AMD-Instinct-MI300X",
13741382
"memory_gb": 192,
13751383
"compute_capability": "N/A",
13761384
"typical_use_cases": ["inference", "training"],

src/neuralnav/configuration/generator.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ class DeploymentGenerator:
2121
"""Generate deployment configurations from recommendations."""
2222

2323
# vLLM version to use
24-
VLLM_VERSION = "v0.6.2"
24+
VLLM_VERSION = "latest"
2525

2626
def __init__(self, output_dir: str | None = None, simulator_mode: bool = False):
2727
"""
@@ -122,9 +122,15 @@ def _prepare_template_context(
122122

123123
assert gpu_config is not None, "gpu_config is required for template context"
124124

125-
# Calculate GPU hourly rate from ModelCatalog
125+
# Look up GPU info from ModelCatalog
126126
gpu_info = self._catalog.get_gpu_type(gpu_config.gpu_type)
127-
gpu_hourly_rate = gpu_info.cost_per_hour_usd if gpu_info else 1.0
127+
if gpu_info is None:
128+
raise ValueError(
129+
f"Unknown GPU type '{gpu_config.gpu_type}'. "
130+
f"Add it to the GPU catalog in data/configuration/model_catalog.json."
131+
)
132+
gpu_hourly_rate = gpu_info.cost_per_hour_usd
133+
gpu_node_selector_label = gpu_info.node_selector_label
128134

129135
# Determine resource requests based on GPU type
130136
gpu_type = gpu_config.gpu_type
@@ -187,6 +193,7 @@ def _prepare_template_context(
187193
"simulator_mode": self.simulator_mode,
188194
# GPU configuration
189195
"gpu_type": gpu_config.gpu_type,
196+
"gpu_node_selector_label": gpu_node_selector_label,
190197
"gpu_count": gpu_config.gpu_count,
191198
"tensor_parallel": gpu_config.tensor_parallel,
192199
"gpus_per_replica": gpu_config.tensor_parallel, # GPUs per pod

src/neuralnav/configuration/templates/kserve-inferenceservice.yaml.j2

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,24 +78,25 @@ spec:
7878
{% if simulator_mode %}
7979
initialDelaySeconds: 10
8080
{% else %}
81-
initialDelaySeconds: 120
81+
initialDelaySeconds: 600
8282
{% endif %}
8383
periodSeconds: 30
8484
timeoutSeconds: 10
85+
failureThreshold: 5
8586
readinessProbe:
8687
httpGet:
8788
path: /health
8889
port: 8080
8990
{% if simulator_mode %}
9091
initialDelaySeconds: 5
9192
{% else %}
92-
initialDelaySeconds: 60
93+
initialDelaySeconds: 120
9394
{% endif %}
9495
periodSeconds: 10
9596
timeoutSeconds: 5
9697
{% if not simulator_mode %}
9798
nodeSelector:
98-
nvidia.com/gpu.product: {{ gpu_type }}
99+
nvidia.com/gpu.product: {{ gpu_node_selector_label }}
99100
tolerations:
100101
- key: nvidia.com/gpu
101102
operator: Exists

src/neuralnav/knowledge_base/model_catalog.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ class GPUType:
5454
def __init__(self, data: dict):
5555
self.gpu_type = data["gpu_type"]
5656
self.aliases = data.get("aliases", [data["gpu_type"]]) # Default to primary name
57+
self.node_selector_label = data.get("node_selector_label", self.aliases[0])
5758
self.memory_gb = data["memory_gb"]
5859
self.compute_capability = data["compute_capability"]
5960
self.typical_use_cases = data["typical_use_cases"]
@@ -88,6 +89,7 @@ def to_dict(self) -> dict:
8889
return {
8990
"gpu_type": self.gpu_type,
9091
"aliases": self.aliases,
92+
"node_selector_label": self.node_selector_label,
9193
"memory_gb": self.memory_gb,
9294
"compute_capability": self.compute_capability,
9395
"typical_use_cases": self.typical_use_cases,

0 commit comments

Comments (0)