Skip to content

Commit 80e0b5d

Browse files
committed
fix: use correct GPU node selector labels and improve prod probe timeouts
The generated YAML used short GPU names (e.g., "L4") for the nvidia.com/gpu.product node selector, but K8s nodes use labels like "NVIDIA-L4". Add node_selector_label field to GPU catalog and use it in YAML generation. Also increase production liveness probe delay from 120s to 600s to allow time for model loading, and update vLLM image to latest.

Signed-off-by: Andre Fredette <afredette@redhat.com>
1 parent 62d0219 commit 80e0b5d

File tree

4 files changed

+24
-6
lines changed

4 files changed

+24
-6
lines changed

data/configuration/model_catalog.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,6 +1280,7 @@
12801280
{
12811281
"gpu_type": "L4",
12821282
"aliases": ["NVIDIA-L4", "L4"],
1283+
"node_selector_label": "NVIDIA-L4",
12831284
"memory_gb": 24,
12841285
"compute_capability": "8.9",
12851286
"typical_use_cases": ["inference"],
@@ -1293,6 +1294,7 @@
12931294
{
12941295
"gpu_type": "A10G",
12951296
"aliases": ["NVIDIA-A10G", "A10G"],
1297+
"node_selector_label": "NVIDIA-A10G",
12961298
"memory_gb": 24,
12971299
"compute_capability": "8.6",
12981300
"typical_use_cases": ["inference"],
@@ -1306,6 +1308,7 @@
13061308
{
13071309
"gpu_type": "A100-40",
13081310
"aliases": ["NVIDIA-A100-40GB", "A100-40", "A100-40GB"],
1311+
"node_selector_label": "NVIDIA-A100-SXM4-40GB",
13091312
"memory_gb": 40,
13101313
"compute_capability": "8.0",
13111314
"typical_use_cases": ["inference", "training"],
@@ -1319,6 +1322,7 @@
13191322
{
13201323
"gpu_type": "A100-80",
13211324
"aliases": ["NVIDIA-A100-80GB", "A100-80", "A100-80GB"],
1325+
"node_selector_label": "NVIDIA-A100-SXM4-80GB",
13221326
"memory_gb": 80,
13231327
"compute_capability": "8.0",
13241328
"typical_use_cases": ["inference", "training"],
@@ -1332,6 +1336,7 @@
13321336
{
13331337
"gpu_type": "H100",
13341338
"aliases": ["NVIDIA-H100", "H100", "H100-80GB"],
1339+
"node_selector_label": "NVIDIA-H100-80GB-HBM3",
13351340
"memory_gb": 80,
13361341
"compute_capability": "9.0",
13371342
"typical_use_cases": ["inference", "training"],
@@ -1345,6 +1350,7 @@
13451350
{
13461351
"gpu_type": "H200",
13471352
"aliases": ["NVIDIA-H200", "H200", "H200-141GB"],
1353+
"node_selector_label": "NVIDIA-H200-141GB-HBM3",
13481354
"memory_gb": 141,
13491355
"compute_capability": "9.0",
13501356
"typical_use_cases": ["inference", "training"],
@@ -1358,6 +1364,7 @@
13581364
{
13591365
"gpu_type": "B200",
13601366
"aliases": ["NVIDIA-B200", "B200"],
1367+
"node_selector_label": "NVIDIA-B200",
13611368
"memory_gb": 192,
13621369
"compute_capability": "10.0",
13631370
"typical_use_cases": ["inference", "training"],
@@ -1371,6 +1378,7 @@
13711378
{
13721379
"gpu_type": "MI300X",
13731380
"aliases": ["AMD-MI300X", "MI300X", "AMD-Instinct-MI300X"],
1381+
"node_selector_label": "AMD-Instinct-MI300X",
13741382
"memory_gb": 192,
13751383
"compute_capability": "N/A",
13761384
"typical_use_cases": ["inference", "training"],

src/neuralnav/configuration/generator.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ class DeploymentGenerator:
2121
"""Generate deployment configurations from recommendations."""
2222

2323
# vLLM version to use
24-
VLLM_VERSION = "v0.6.2"
24+
VLLM_VERSION = "latest"
2525

2626
def __init__(self, output_dir: str | None = None, simulator_mode: bool = False):
2727
"""
@@ -122,9 +122,15 @@ def _prepare_template_context(
122122

123123
assert gpu_config is not None, "gpu_config is required for template context"
124124

125-
# Calculate GPU hourly rate from ModelCatalog
125+
# Look up GPU info from ModelCatalog
126126
gpu_info = self._catalog.get_gpu_type(gpu_config.gpu_type)
127-
gpu_hourly_rate = gpu_info.cost_per_hour_usd if gpu_info else 1.0
127+
if gpu_info is None:
128+
raise ValueError(
129+
f"Unknown GPU type '{gpu_config.gpu_type}'. "
130+
f"Add it to the GPU catalog in data/configuration/model_catalog.json."
131+
)
132+
gpu_hourly_rate = gpu_info.cost_per_hour_usd
133+
gpu_node_selector_label = gpu_info.node_selector_label
128134

129135
# Determine resource requests based on GPU type
130136
gpu_type = gpu_config.gpu_type
@@ -187,6 +193,7 @@ def _prepare_template_context(
187193
"simulator_mode": self.simulator_mode,
188194
# GPU configuration
189195
"gpu_type": gpu_config.gpu_type,
196+
"gpu_node_selector_label": gpu_node_selector_label,
190197
"gpu_count": gpu_config.gpu_count,
191198
"tensor_parallel": gpu_config.tensor_parallel,
192199
"gpus_per_replica": gpu_config.tensor_parallel, # GPUs per pod

src/neuralnav/configuration/templates/kserve-inferenceservice.yaml.j2

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,24 +78,25 @@ spec:
7878
{% if simulator_mode %}
7979
initialDelaySeconds: 10
8080
{% else %}
81-
initialDelaySeconds: 120
81+
initialDelaySeconds: 600
8282
{% endif %}
8383
periodSeconds: 30
8484
timeoutSeconds: 10
85+
failureThreshold: 5
8586
readinessProbe:
8687
httpGet:
8788
path: /health
8889
port: 8080
8990
{% if simulator_mode %}
9091
initialDelaySeconds: 5
9192
{% else %}
92-
initialDelaySeconds: 60
93+
initialDelaySeconds: 120
9394
{% endif %}
9495
periodSeconds: 10
9596
timeoutSeconds: 5
9697
{% if not simulator_mode %}
9798
nodeSelector:
98-
nvidia.com/gpu.product: {{ gpu_type }}
99+
nvidia.com/gpu.product: {{ gpu_node_selector_label }}
99100
tolerations:
100101
- key: nvidia.com/gpu
101102
operator: Exists

src/neuralnav/knowledge_base/model_catalog.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ class GPUType:
5454
def __init__(self, data: dict):
5555
self.gpu_type = data["gpu_type"]
5656
self.aliases = data.get("aliases", [data["gpu_type"]]) # Default to primary name
57+
self.node_selector_label = data.get("node_selector_label", self.aliases[0])
5758
self.memory_gb = data["memory_gb"]
5859
self.compute_capability = data["compute_capability"]
5960
self.typical_use_cases = data["typical_use_cases"]
@@ -88,6 +89,7 @@ def to_dict(self) -> dict:
8889
return {
8990
"gpu_type": self.gpu_type,
9091
"aliases": self.aliases,
92+
"node_selector_label": self.node_selector_label,
9193
"memory_gb": self.memory_gb,
9294
"compute_capability": self.compute_capability,
9395
"typical_use_cases": self.typical_use_cases,

0 commit comments

Comments (0)