Skip to content

Commit 379ad55

Browse files
committed
fix: use correct GPU node selector labels and improve prod probe timeouts
The generated YAML used short GPU names (e.g., "L4") for the nvidia.com/gpu.product node selector, but K8s nodes use labels like "NVIDIA-L4". Add a node_selector_label field to the GPU catalog and use it in YAML generation. Also increase the production liveness probe initial delay from 120s to 600s to allow time for model loading, and update the vLLM image to latest.

Signed-off-by: Andre Fredette <afredette@redhat.com>
1 parent 2eb667f commit 379ad55

File tree

4 files changed

+18
-5
lines changed

4 files changed

+18
-5
lines changed

data/configuration/model_catalog.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,6 +1280,7 @@
12801280
{
12811281
"gpu_type": "L4",
12821282
"aliases": ["NVIDIA-L4", "L4"],
1283+
"node_selector_label": "NVIDIA-L4",
12831284
"memory_gb": 24,
12841285
"compute_capability": "8.9",
12851286
"typical_use_cases": ["inference"],
@@ -1293,6 +1294,7 @@
12931294
{
12941295
"gpu_type": "A10G",
12951296
"aliases": ["NVIDIA-A10G", "A10G"],
1297+
"node_selector_label": "NVIDIA-A10G",
12961298
"memory_gb": 24,
12971299
"compute_capability": "8.6",
12981300
"typical_use_cases": ["inference"],
@@ -1306,6 +1308,7 @@
13061308
{
13071309
"gpu_type": "A100-40",
13081310
"aliases": ["NVIDIA-A100-40GB", "A100-40", "A100-40GB"],
1311+
"node_selector_label": "NVIDIA-A100-SXM4-40GB",
13091312
"memory_gb": 40,
13101313
"compute_capability": "8.0",
13111314
"typical_use_cases": ["inference", "training"],
@@ -1319,6 +1322,7 @@
13191322
{
13201323
"gpu_type": "A100-80",
13211324
"aliases": ["NVIDIA-A100-80GB", "A100-80", "A100-80GB"],
1325+
"node_selector_label": "NVIDIA-A100-SXM4-80GB",
13221326
"memory_gb": 80,
13231327
"compute_capability": "8.0",
13241328
"typical_use_cases": ["inference", "training"],
@@ -1332,6 +1336,7 @@
13321336
{
13331337
"gpu_type": "H100",
13341338
"aliases": ["NVIDIA-H100", "H100", "H100-80GB"],
1339+
"node_selector_label": "NVIDIA-H100-80GB-HBM3",
13351340
"memory_gb": 80,
13361341
"compute_capability": "9.0",
13371342
"typical_use_cases": ["inference", "training"],
@@ -1345,6 +1350,7 @@
13451350
{
13461351
"gpu_type": "H200",
13471352
"aliases": ["NVIDIA-H200", "H200", "H200-141GB"],
1353+
"node_selector_label": "NVIDIA-H200-141GB-HBM3",
13481354
"memory_gb": 141,
13491355
"compute_capability": "9.0",
13501356
"typical_use_cases": ["inference", "training"],
@@ -1358,6 +1364,7 @@
13581364
{
13591365
"gpu_type": "B200",
13601366
"aliases": ["NVIDIA-B200", "B200"],
1367+
"node_selector_label": "NVIDIA-B200",
13611368
"memory_gb": 192,
13621369
"compute_capability": "10.0",
13631370
"typical_use_cases": ["inference", "training"],
@@ -1371,6 +1378,7 @@
13711378
{
13721379
"gpu_type": "MI300X",
13731380
"aliases": ["AMD-MI300X", "MI300X", "AMD-Instinct-MI300X"],
1381+
"node_selector_label": "AMD-Instinct-MI300X",
13741382
"memory_gb": 192,
13751383
"compute_capability": "N/A",
13761384
"typical_use_cases": ["inference", "training"],

src/neuralnav/configuration/generator.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ class DeploymentGenerator:
2121
"""Generate deployment configurations from recommendations."""
2222

2323
# vLLM version to use
24-
VLLM_VERSION = "v0.6.2"
24+
VLLM_VERSION = "latest"
2525

2626
def __init__(self, output_dir: str | None = None, simulator_mode: bool = False):
2727
"""
@@ -120,9 +120,10 @@ def _prepare_template_context(
120120
traffic = recommendation.traffic_profile
121121
slo = recommendation.slo_targets
122122

123-
# Calculate GPU hourly rate from ModelCatalog
123+
# Look up GPU info from ModelCatalog
124124
gpu_info = self._catalog.get_gpu_type(gpu_config.gpu_type)
125125
gpu_hourly_rate = gpu_info.cost_per_hour_usd if gpu_info else 1.0
126+
gpu_node_selector_label = gpu_info.node_selector_label if gpu_info else gpu_config.gpu_type
126127

127128
# Determine resource requests based on GPU type
128129
gpu_type = gpu_config.gpu_type
@@ -184,6 +185,7 @@ def _prepare_template_context(
184185
"simulator_mode": self.simulator_mode,
185186
# GPU configuration
186187
"gpu_type": gpu_config.gpu_type,
188+
"gpu_node_selector_label": gpu_node_selector_label,
187189
"gpu_count": gpu_config.gpu_count,
188190
"tensor_parallel": gpu_config.tensor_parallel,
189191
"gpus_per_replica": gpu_config.tensor_parallel, # GPUs per pod

src/neuralnav/configuration/templates/kserve-inferenceservice.yaml.j2

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,24 +78,25 @@ spec:
7878
{% if simulator_mode %}
7979
initialDelaySeconds: 10
8080
{% else %}
81-
initialDelaySeconds: 120
81+
initialDelaySeconds: 600
8282
{% endif %}
8383
periodSeconds: 30
8484
timeoutSeconds: 10
85+
failureThreshold: 5
8586
readinessProbe:
8687
httpGet:
8788
path: /health
8889
port: 8080
8990
{% if simulator_mode %}
9091
initialDelaySeconds: 5
9192
{% else %}
92-
initialDelaySeconds: 60
93+
initialDelaySeconds: 120
9394
{% endif %}
9495
periodSeconds: 10
9596
timeoutSeconds: 5
9697
{% if not simulator_mode %}
9798
nodeSelector:
98-
nvidia.com/gpu.product: {{ gpu_type }}
99+
nvidia.com/gpu.product: {{ gpu_node_selector_label }}
99100
tolerations:
100101
- key: nvidia.com/gpu
101102
operator: Exists

src/neuralnav/knowledge_base/model_catalog.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ class GPUType:
5454
def __init__(self, data: dict):
5555
self.gpu_type = data["gpu_type"]
5656
self.aliases = data.get("aliases", [data["gpu_type"]]) # Default to primary name
57+
self.node_selector_label = data.get("node_selector_label", data["gpu_type"])
5758
self.memory_gb = data["memory_gb"]
5859
self.compute_capability = data["compute_capability"]
5960
self.typical_use_cases = data["typical_use_cases"]
@@ -88,6 +89,7 @@ def to_dict(self) -> dict:
8889
return {
8990
"gpu_type": self.gpu_type,
9091
"aliases": self.aliases,
92+
"node_selector_label": self.node_selector_label,
9193
"memory_gb": self.memory_gb,
9294
"compute_capability": self.compute_capability,
9395
"typical_use_cases": self.typical_use_cases,

0 commit comments

Comments
 (0)