Skip to content

Commit 80e0b5d

Browse files
committed
fix: use correct GPU node selector labels and improve prod probe timeouts
The generated YAML used short GPU names (e.g., "L4") for the nvidia.com/gpu.product node selector, but K8s nodes use labels like "NVIDIA-L4". Add node_selector_label field to GPU catalog and use it in YAML generation. Also increase production liveness probe delay from 120s to 600s to allow time for model loading, and update vLLM image to latest.

Signed-off-by: Andre Fredette <afredette@redhat.com>
1 parent 62d0219 commit 80e0b5d

File tree

4 files changed

+24
-6
lines changed

4 files changed

+24
-6
lines changed

data/configuration/model_catalog.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,6 +1280,7 @@
12801280
{
12811281
"gpu_type": "L4",
12821282
"aliases": ["NVIDIA-L4", "L4"],
1283+
"node_selector_label": "NVIDIA-L4",
12831284
"memory_gb": 24,
12841285
"compute_capability": "8.9",
12851286
"typical_use_cases": ["inference"],
@@ -1293,6 +1294,7 @@
12931294
{
12941295
"gpu_type": "A10G",
12951296
"aliases": ["NVIDIA-A10G", "A10G"],
1297+
"node_selector_label": "NVIDIA-A10G",
12961298
"memory_gb": 24,
12971299
"compute_capability": "8.6",
12981300
"typical_use_cases": ["inference"],
@@ -1306,6 +1308,7 @@
13061308
{
13071309
"gpu_type": "A100-40",
13081310
"aliases": ["NVIDIA-A100-40GB", "A100-40", "A100-40GB"],
1311+
"node_selector_label": "NVIDIA-A100-SXM4-40GB",
13091312
"memory_gb": 40,
13101313
"compute_capability": "8.0",
13111314
"typical_use_cases": ["inference", "training"],
@@ -1319,6 +1322,7 @@
13191322
{
13201323
"gpu_type": "A100-80",
13211324
"aliases": ["NVIDIA-A100-80GB", "A100-80", "A100-80GB"],
1325+
"node_selector_label": "NVIDIA-A100-SXM4-80GB",
13221326
"memory_gb": 80,
13231327
"compute_capability": "8.0",
13241328
"typical_use_cases": ["inference", "training"],
@@ -1332,6 +1336,7 @@
13321336
{
13331337
"gpu_type": "H100",
13341338
"aliases": ["NVIDIA-H100", "H100", "H100-80GB"],
1339+
"node_selector_label": "NVIDIA-H100-80GB-HBM3",
13351340
"memory_gb": 80,
13361341
"compute_capability": "9.0",
13371342
"typical_use_cases": ["inference", "training"],
@@ -1345,6 +1350,7 @@
13451350
{
13461351
"gpu_type": "H200",
13471352
"aliases": ["NVIDIA-H200", "H200", "H200-141GB"],
1353+
"node_selector_label": "NVIDIA-H200-141GB-HBM3",
13481354
"memory_gb": 141,
13491355
"compute_capability": "9.0",
13501356
"typical_use_cases": ["inference", "training"],
@@ -1358,6 +1364,7 @@
13581364
{
13591365
"gpu_type": "B200",
13601366
"aliases": ["NVIDIA-B200", "B200"],
1367+
"node_selector_label": "NVIDIA-B200",
13611368
"memory_gb": 192,
13621369
"compute_capability": "10.0",
13631370
"typical_use_cases": ["inference", "training"],
@@ -1371,6 +1378,7 @@
13711378
{
13721379
"gpu_type": "MI300X",
13731380
"aliases": ["AMD-MI300X", "MI300X", "AMD-Instinct-MI300X"],
1381+
"node_selector_label": "AMD-Instinct-MI300X",
13741382
"memory_gb": 192,
13751383
"compute_capability": "N/A",
13761384
"typical_use_cases": ["inference", "training"],

src/neuralnav/configuration/generator.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ class DeploymentGenerator:
2121
"""Generate deployment configurations from recommendations."""
2222

2323
# vLLM version to use
24-
VLLM_VERSION = "v0.6.2"
24+
VLLM_VERSION = "latest"
2525

2626
def __init__(self, output_dir: str | None = None, simulator_mode: bool = False):
2727
"""
@@ -122,9 +122,15 @@ def _prepare_template_context(
122122

123123
assert gpu_config is not None, "gpu_config is required for template context"
124124

125-
# Calculate GPU hourly rate from ModelCatalog
125+
# Look up GPU info from ModelCatalog
126126
gpu_info = self._catalog.get_gpu_type(gpu_config.gpu_type)
127-
gpu_hourly_rate = gpu_info.cost_per_hour_usd if gpu_info else 1.0
127+
if gpu_info is None:
128+
raise ValueError(
129+
f"Unknown GPU type '{gpu_config.gpu_type}'. "
130+
f"Add it to the GPU catalog in data/configuration/model_catalog.json."
131+
)
132+
gpu_hourly_rate = gpu_info.cost_per_hour_usd
133+
gpu_node_selector_label = gpu_info.node_selector_label
128134

129135
# Determine resource requests based on GPU type
130136
gpu_type = gpu_config.gpu_type
@@ -187,6 +193,7 @@ def _prepare_template_context(
187193
"simulator_mode": self.simulator_mode,
188194
# GPU configuration
189195
"gpu_type": gpu_config.gpu_type,
196+
"gpu_node_selector_label": gpu_node_selector_label,
190197
"gpu_count": gpu_config.gpu_count,
191198
"tensor_parallel": gpu_config.tensor_parallel,
192199
"gpus_per_replica": gpu_config.tensor_parallel, # GPUs per pod

src/neuralnav/configuration/templates/kserve-inferenceservice.yaml.j2

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,24 +78,25 @@ spec:
7878
{% if simulator_mode %}
7979
initialDelaySeconds: 10
8080
{% else %}
81-
initialDelaySeconds: 120
81+
initialDelaySeconds: 600
8282
{% endif %}
8383
periodSeconds: 30
8484
timeoutSeconds: 10
85+
failureThreshold: 5
8586
readinessProbe:
8687
httpGet:
8788
path: /health
8889
port: 8080
8990
{% if simulator_mode %}
9091
initialDelaySeconds: 5
9192
{% else %}
92-
initialDelaySeconds: 60
93+
initialDelaySeconds: 120
9394
{% endif %}
9495
periodSeconds: 10
9596
timeoutSeconds: 5
9697
{% if not simulator_mode %}
9798
nodeSelector:
98-
nvidia.com/gpu.product: {{ gpu_type }}
99+
nvidia.com/gpu.product: {{ gpu_node_selector_label }}
99100
tolerations:
100101
- key: nvidia.com/gpu
101102
operator: Exists

src/neuralnav/knowledge_base/model_catalog.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ class GPUType:
5454
def __init__(self, data: dict):
5555
self.gpu_type = data["gpu_type"]
5656
self.aliases = data.get("aliases", [data["gpu_type"]]) # Default to primary name
57+
self.node_selector_label = data.get("node_selector_label", self.aliases[0])
5758
self.memory_gb = data["memory_gb"]
5859
self.compute_capability = data["compute_capability"]
5960
self.typical_use_cases = data["typical_use_cases"]
@@ -88,6 +89,7 @@ def to_dict(self) -> dict:
8889
return {
8990
"gpu_type": self.gpu_type,
9091
"aliases": self.aliases,
92+
"node_selector_label": self.node_selector_label,
9193
"memory_gb": self.memory_gb,
9294
"compute_capability": self.compute_capability,
9395
"typical_use_cases": self.typical_use_cases,

0 commit comments

Comments (0)