Skip to content

Commit 379ad55

Browse files
committed
fix: use correct GPU node selector labels and improve prod probe timeouts
The generated YAML used short GPU names (e.g., "L4") for the nvidia.com/gpu.product node selector, but K8s nodes use labels like "NVIDIA-L4". Add a node_selector_label field to the GPU catalog and use it in YAML generation. Also increase the production liveness probe initial delay from 120s to 600s to allow time for model loading, and update the vLLM image to latest.

Signed-off-by: Andre Fredette <afredette@redhat.com>
1 parent 2eb667f commit 379ad55

File tree

4 files changed

+18
-5
lines changed

4 files changed

+18
-5
lines changed

data/configuration/model_catalog.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,6 +1280,7 @@
12801280
{
12811281
"gpu_type": "L4",
12821282
"aliases": ["NVIDIA-L4", "L4"],
1283+
"node_selector_label": "NVIDIA-L4",
12831284
"memory_gb": 24,
12841285
"compute_capability": "8.9",
12851286
"typical_use_cases": ["inference"],
@@ -1293,6 +1294,7 @@
12931294
{
12941295
"gpu_type": "A10G",
12951296
"aliases": ["NVIDIA-A10G", "A10G"],
1297+
"node_selector_label": "NVIDIA-A10G",
12961298
"memory_gb": 24,
12971299
"compute_capability": "8.6",
12981300
"typical_use_cases": ["inference"],
@@ -1306,6 +1308,7 @@
13061308
{
13071309
"gpu_type": "A100-40",
13081310
"aliases": ["NVIDIA-A100-40GB", "A100-40", "A100-40GB"],
1311+
"node_selector_label": "NVIDIA-A100-SXM4-40GB",
13091312
"memory_gb": 40,
13101313
"compute_capability": "8.0",
13111314
"typical_use_cases": ["inference", "training"],
@@ -1319,6 +1322,7 @@
13191322
{
13201323
"gpu_type": "A100-80",
13211324
"aliases": ["NVIDIA-A100-80GB", "A100-80", "A100-80GB"],
1325+
"node_selector_label": "NVIDIA-A100-SXM4-80GB",
13221326
"memory_gb": 80,
13231327
"compute_capability": "8.0",
13241328
"typical_use_cases": ["inference", "training"],
@@ -1332,6 +1336,7 @@
13321336
{
13331337
"gpu_type": "H100",
13341338
"aliases": ["NVIDIA-H100", "H100", "H100-80GB"],
1339+
"node_selector_label": "NVIDIA-H100-80GB-HBM3",
13351340
"memory_gb": 80,
13361341
"compute_capability": "9.0",
13371342
"typical_use_cases": ["inference", "training"],
@@ -1345,6 +1350,7 @@
13451350
{
13461351
"gpu_type": "H200",
13471352
"aliases": ["NVIDIA-H200", "H200", "H200-141GB"],
1353+
"node_selector_label": "NVIDIA-H200-141GB-HBM3",
13481354
"memory_gb": 141,
13491355
"compute_capability": "9.0",
13501356
"typical_use_cases": ["inference", "training"],
@@ -1358,6 +1364,7 @@
13581364
{
13591365
"gpu_type": "B200",
13601366
"aliases": ["NVIDIA-B200", "B200"],
1367+
"node_selector_label": "NVIDIA-B200",
13611368
"memory_gb": 192,
13621369
"compute_capability": "10.0",
13631370
"typical_use_cases": ["inference", "training"],
@@ -1371,6 +1378,7 @@
13711378
{
13721379
"gpu_type": "MI300X",
13731380
"aliases": ["AMD-MI300X", "MI300X", "AMD-Instinct-MI300X"],
1381+
"node_selector_label": "AMD-Instinct-MI300X",
13741382
"memory_gb": 192,
13751383
"compute_capability": "N/A",
13761384
"typical_use_cases": ["inference", "training"],

src/neuralnav/configuration/generator.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ class DeploymentGenerator:
2121
"""Generate deployment configurations from recommendations."""
2222

2323
# vLLM version to use
24-
VLLM_VERSION = "v0.6.2"
24+
VLLM_VERSION = "latest"
2525

2626
def __init__(self, output_dir: str | None = None, simulator_mode: bool = False):
2727
"""
@@ -120,9 +120,10 @@ def _prepare_template_context(
120120
traffic = recommendation.traffic_profile
121121
slo = recommendation.slo_targets
122122

123-
# Calculate GPU hourly rate from ModelCatalog
123+
# Look up GPU info from ModelCatalog
124124
gpu_info = self._catalog.get_gpu_type(gpu_config.gpu_type)
125125
gpu_hourly_rate = gpu_info.cost_per_hour_usd if gpu_info else 1.0
126+
gpu_node_selector_label = gpu_info.node_selector_label if gpu_info else gpu_config.gpu_type
126127

127128
# Determine resource requests based on GPU type
128129
gpu_type = gpu_config.gpu_type
@@ -184,6 +185,7 @@ def _prepare_template_context(
184185
"simulator_mode": self.simulator_mode,
185186
# GPU configuration
186187
"gpu_type": gpu_config.gpu_type,
188+
"gpu_node_selector_label": gpu_node_selector_label,
187189
"gpu_count": gpu_config.gpu_count,
188190
"tensor_parallel": gpu_config.tensor_parallel,
189191
"gpus_per_replica": gpu_config.tensor_parallel, # GPUs per pod

src/neuralnav/configuration/templates/kserve-inferenceservice.yaml.j2

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,24 +78,25 @@ spec:
7878
{% if simulator_mode %}
7979
initialDelaySeconds: 10
8080
{% else %}
81-
initialDelaySeconds: 120
81+
initialDelaySeconds: 600
8282
{% endif %}
8383
periodSeconds: 30
8484
timeoutSeconds: 10
85+
failureThreshold: 5
8586
readinessProbe:
8687
httpGet:
8788
path: /health
8889
port: 8080
8990
{% if simulator_mode %}
9091
initialDelaySeconds: 5
9192
{% else %}
92-
initialDelaySeconds: 60
93+
initialDelaySeconds: 120
9394
{% endif %}
9495
periodSeconds: 10
9596
timeoutSeconds: 5
9697
{% if not simulator_mode %}
9798
nodeSelector:
98-
nvidia.com/gpu.product: {{ gpu_type }}
99+
nvidia.com/gpu.product: {{ gpu_node_selector_label }}
99100
tolerations:
100101
- key: nvidia.com/gpu
101102
operator: Exists

src/neuralnav/knowledge_base/model_catalog.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ class GPUType:
5454
def __init__(self, data: dict):
5555
self.gpu_type = data["gpu_type"]
5656
self.aliases = data.get("aliases", [data["gpu_type"]]) # Default to primary name
57+
self.node_selector_label = data.get("node_selector_label", data["gpu_type"])
5758
self.memory_gb = data["memory_gb"]
5859
self.compute_capability = data["compute_capability"]
5960
self.typical_use_cases = data["typical_use_cases"]
@@ -88,6 +89,7 @@ def to_dict(self) -> dict:
8889
return {
8990
"gpu_type": self.gpu_type,
9091
"aliases": self.aliases,
92+
"node_selector_label": self.node_selector_label,
9193
"memory_gb": self.memory_gb,
9294
"compute_capability": self.compute_capability,
9395
"typical_use_cases": self.typical_use_cases,

0 commit comments

Comments
 (0)