Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions data/configuration/model_catalog.json
Original file line number Diff line number Diff line change
Expand Up @@ -1280,6 +1280,7 @@
{
"gpu_type": "L4",
"aliases": ["NVIDIA-L4", "L4"],
"node_selector_label": "NVIDIA-L4",
"memory_gb": 24,
"compute_capability": "8.9",
"typical_use_cases": ["inference"],
Expand All @@ -1293,6 +1294,7 @@
{
"gpu_type": "A10G",
"aliases": ["NVIDIA-A10G", "A10G"],
"node_selector_label": "NVIDIA-A10G",
"memory_gb": 24,
"compute_capability": "8.6",
"typical_use_cases": ["inference"],
Expand All @@ -1306,6 +1308,7 @@
{
"gpu_type": "A100-40",
"aliases": ["NVIDIA-A100-40GB", "A100-40", "A100-40GB"],
"node_selector_label": "NVIDIA-A100-SXM4-40GB",
"memory_gb": 40,
"compute_capability": "8.0",
"typical_use_cases": ["inference", "training"],
Expand All @@ -1319,6 +1322,7 @@
{
"gpu_type": "A100-80",
"aliases": ["NVIDIA-A100-80GB", "A100-80", "A100-80GB"],
"node_selector_label": "NVIDIA-A100-SXM4-80GB",
"memory_gb": 80,
"compute_capability": "8.0",
"typical_use_cases": ["inference", "training"],
Expand All @@ -1332,6 +1336,7 @@
{
"gpu_type": "H100",
"aliases": ["NVIDIA-H100", "H100", "H100-80GB"],
"node_selector_label": "NVIDIA-H100-80GB-HBM3",
"memory_gb": 80,
"compute_capability": "9.0",
"typical_use_cases": ["inference", "training"],
Expand All @@ -1345,6 +1350,7 @@
{
"gpu_type": "H200",
"aliases": ["NVIDIA-H200", "H200", "H200-141GB"],
"node_selector_label": "NVIDIA-H200-141GB-HBM3",
"memory_gb": 141,
"compute_capability": "9.0",
"typical_use_cases": ["inference", "training"],
Expand All @@ -1358,6 +1364,7 @@
{
"gpu_type": "B200",
"aliases": ["NVIDIA-B200", "B200"],
"node_selector_label": "NVIDIA-B200",
"memory_gb": 192,
"compute_capability": "10.0",
"typical_use_cases": ["inference", "training"],
Expand All @@ -1371,6 +1378,7 @@
{
"gpu_type": "MI300X",
"aliases": ["AMD-MI300X", "MI300X", "AMD-Instinct-MI300X"],
"node_selector_label": "AMD-Instinct-MI300X",
"memory_gb": 192,
"compute_capability": "N/A",
"typical_use_cases": ["inference", "training"],
Expand Down
13 changes: 10 additions & 3 deletions src/neuralnav/configuration/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class DeploymentGenerator:
"""Generate deployment configurations from recommendations."""

# vLLM version to use
VLLM_VERSION = "v0.6.2"
VLLM_VERSION = "latest"
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🌐 Web query:

What is the current recommended way to pin vLLM container images for production deployments: a specific release tag, an immutable digest, or latest? Please check the official vLLM image/release documentation.

💡 Result:

For production, pin the vLLM image to a specific released version tag (e.g., vllm/vllm-openai:vX.Y.Z) and, if you need strict immutability/reproducibility, pin that tag’s digest (e.g., vllm/vllm-openai:vX.Y.Z@sha256:...). Avoid relying on latest.

  • The official Docker docs use :latest in run examples, but when showing how to extend the image they use a versioned tag and explicitly warn to match the vLLM version to the base image (implying version pinning for correctness/reproducibility). [1]
  • vLLM maintainers note that Docker Hub images/tags can be delayed/missing, and that the Docker Hub latest tag is not always promptly updated to the newest GitHub release—creating reproducibility issues. [2]

Sources: official “Using Docker” docs [1]; vLLM forum maintainer response on Docker Hub tagging/latest lag [2].

[1] https://docs.vllm.ai/en/stable/deployment/docker/
[2] https://discuss.vllm.ai/t/vllm-openai-dockerhub-missing-0-16-tags/2399


Pin the vLLM image to a specific release version tag instead of latest.

Using latest makes generated manifests non-reproducible and creates supply-chain risk. The Docker Hub latest tag is not always promptly updated to the newest vLLM release, and pods can pull different images on restart or scale-out without code changes. For production YAML, pin to a tested release tag (e.g., vX.Y.Z) or, for strict immutability, include the image digest.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/neuralnav/configuration/generator.py` at line 24, the VLLM_VERSION
constant is currently set to "latest", which makes manifests non-reproducible.
Change the VLLM_VERSION value in generator.py to a specific release tag
(e.g., "vX.Y.Z") or an image digest ("sha256:<digest>") instead of "latest", and
update any code that composes the image string to use this pinned value so the
generated YAML references the fixed tag/digest. Note that a digest is appended
to the image name with "@" rather than ":" (image@sha256:<digest>), so the
composition code must handle that case. Ensure the chosen tag/digest corresponds
to a tested vLLM release before committing.


def __init__(self, output_dir: str | None = None, simulator_mode: bool = False):
"""
Expand Down Expand Up @@ -122,9 +122,15 @@ def _prepare_template_context(

assert gpu_config is not None, "gpu_config is required for template context"

# Calculate GPU hourly rate from ModelCatalog
# Look up GPU info from ModelCatalog
gpu_info = self._catalog.get_gpu_type(gpu_config.gpu_type)
gpu_hourly_rate = gpu_info.cost_per_hour_usd if gpu_info else 1.0
if gpu_info is None:
raise ValueError(
f"Unknown GPU type '{gpu_config.gpu_type}'. "
f"Add it to the GPU catalog in data/configuration/model_catalog.json."
)
Comment on lines +127 to +131
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Use a typed generation error that the API can map to 4xx.

Fail-fast is the right behavior, but raising a bare ValueError here now bubbles into the configuration routes’ generic Exception handlers and returns HTTP 500 for what is effectively invalid input/catalog drift. Please raise a domain-specific exception here, or ensure callers translate this path to 400/422 instead of 500.

🧰 Tools
🪛 Ruff (0.15.5)

[warning] 126-129: Avoid specifying long messages outside the exception class

(TRY003)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/neuralnav/configuration/generator.py` around lines 125 - 129, Replace the
bare ValueError raised when gpu_info is None with a domain-specific typed
exception so the API can map it to a 4xx error; for example, define or reuse an
exception like InvalidGenerationInputError or InvalidGPUTypeError (subclassing a
common GenerationError or BadRequestError) and raise that instead of ValueError
in the block that checks gpu_info (referencing gpu_config.gpu_type and the
gpu_info lookup in generator.py); ensure the new exception is imported/defined
in the module so callers or global handlers can translate it to HTTP 400/422.

gpu_hourly_rate = gpu_info.cost_per_hour_usd
gpu_node_selector_label = gpu_info.node_selector_label

# Determine resource requests based on GPU type
gpu_type = gpu_config.gpu_type
Expand Down Expand Up @@ -187,6 +193,7 @@ def _prepare_template_context(
"simulator_mode": self.simulator_mode,
# GPU configuration
"gpu_type": gpu_config.gpu_type,
"gpu_node_selector_label": gpu_node_selector_label,
"gpu_count": gpu_config.gpu_count,
"tensor_parallel": gpu_config.tensor_parallel,
"gpus_per_replica": gpu_config.tensor_parallel, # GPUs per pod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,24 +78,25 @@ spec:
{% if simulator_mode %}
initialDelaySeconds: 10
{% else %}
initialDelaySeconds: 120
initialDelaySeconds: 600
{% endif %}
periodSeconds: 30
timeoutSeconds: 10
failureThreshold: 5
readinessProbe:
httpGet:
path: /health
port: 8080
{% if simulator_mode %}
initialDelaySeconds: 5
{% else %}
initialDelaySeconds: 60
initialDelaySeconds: 120
{% endif %}
periodSeconds: 10
timeoutSeconds: 5
{% if not simulator_mode %}
nodeSelector:
nvidia.com/gpu.product: {{ gpu_type }}
nvidia.com/gpu.product: {{ gpu_node_selector_label }}
Comment on lines 98 to +99
Copy link

@coderabbitai coderabbitai bot Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

printf '\n== GPU catalog entries with selector labels ==\n'
rg -n '"gpu_type":|"node_selector_label":' data/configuration/model_catalog.json

printf '\n== Hardcoded accelerator selector/resource keys ==\n'
rg -n 'nvidia\.com/gpu(\.product)?' src/neuralnav/configuration src/neuralnav/knowledge_base

Repository: redhat-et/neuralnav

Length of output: 1580


🏁 Script executed:

#!/bin/bash
set -euo pipefail

printf '\n== Full kserve-inferenceservice.yaml.j2 template ==\n'
cat -n src/neuralnav/configuration/templates/kserve-inferenceservice.yaml.j2 | head -120

printf '\n== Context around lines 98-99 (node selector) ==\n'
sed -n '90,110p' src/neuralnav/configuration/templates/kserve-inferenceservice.yaml.j2 | cat -n

printf '\n== Check template variables passed to this template ==\n'
rg -B5 -A5 'kserve-inferenceservice.yaml.j2' src/neuralnav/configuration

Repository: redhat-et/neuralnav

Length of output: 6173


🏁 Script executed:

#!/bin/bash
set -euo pipefail

printf '\n== Template context preparation function ==\n'
sed -n '/_prepare_template_context/,/^[[:space:]]*def /p' src/neuralnav/configuration/generator.py | head -80

printf '\n== Check for any gpu_selector_key usage ==\n'
rg -n 'gpu_selector_key' src/neuralnav/configuration

Repository: redhat-et/neuralnav

Length of output: 3349


🏁 Script executed:

#!/bin/bash
set -euo pipefail

printf '\n== Full context dict in _prepare_template_context ==\n'
sed -n '/_prepare_template_context/,/^[[:space:]]*def /p' src/neuralnav/configuration/generator.py | tail -50

printf '\n== Check ModelCatalog GPU info structure ==\n'
rg -B10 -A10 'class.*GPU.*Info' src/neuralnav/configuration

Repository: redhat-et/neuralnav

Length of output: 2075


🏁 Script executed:

#!/bin/bash
set -eulo pipefail

printf '\n== Find ModelCatalog and GPUInfo definitions ==\n'
fd -e py | xargs grep -l 'class.*ModelCatalog\|class.*GPUInfo' | head -5

printf '\n== Check model_catalog.json structure (GPU entries) ==\n'
sed -n '1275,1385p' data/configuration/model_catalog.json | head -40

printf '\n== Full context dict (end of function) ==\n'
sed -n '/_prepare_template_context/,/return context/p' src/neuralnav/configuration/generator.py | tail -30

Repository: redhat-et/neuralnav

Length of output: 2768


🏁 Script executed:

#!/bin/bash
set -euo pipefail

printf '\n== Complete _prepare_template_context function ==\n'
python3 << 'PYEOF'
with open('src/neuralnav/configuration/generator.py', 'r') as f:
    content = f.read()
    start = content.find('def _prepare_template_context(')
    if start == -1:
        print("Function not found")
    else:
        # Find the return statement
        remaining = content[start:]
        # Look for "return context" at the beginning of a line
        import re
        match = re.search(r'\n\s*return context\n', remaining)
        if match:
            end = start + match.end()
            func = content[start:end]
            print(func)
        else:
            print("Return statement not found")
PYEOF

printf '\n== Check if gpu_selector_key is used anywhere ==\n'
rg -n 'gpu_selector_key' src/

Repository: redhat-et/neuralnav

Length of output: 5645


🏁 Script executed:

#!/bin/bash
set -euo pipefail

printf '\n== Check model_catalog.py for GPU info structure ==\n'
sed -n '/class.*GPU/,/^class /p' src/neuralnav/knowledge_base/model_catalog.py | head -40

printf '\n== Check if AMD entries need different resource keys ==\n'
rg -i 'amd\.com|AMD.*gpu' src/neuralnav data/ || echo "No AMD resource keys found"

Repository: redhat-et/neuralnav

Length of output: 1903


Vendor-specific selector and resource keys prevent AMD GPU deployments.

The node selector fix corrects the label value on line 99, but the template still hardcodes NVIDIA-specific keys in three places, which blocks AMD catalog entries like MI300X:

  • Line 99: nvidia.com/gpu.product (node selector key)
  • Lines 64, 68: nvidia.com/gpu (resource requests/limits)
  • Line 101: nvidia.com/gpu (toleration key)

AMD GPUs require different key names. Extract the vendor prefix from the catalog GPU type and use it consistently across the template. This requires adding a vendor identifier to the template context (not just the label value).

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/neuralnav/configuration/templates/kserve-inferenceservice.yaml.j2` around
lines 98 - 99, The template hardcodes NVIDIA keys (nvidia.com/gpu and
nvidia.com/gpu.product) which prevents AMD GPUs; add a vendor identifier to the
template context (e.g., vendor_prefix) and replace the three hardcoded
occurrences with dynamic keys: use nodeSelector key "{{ vendor_prefix
}}.com/gpu.product" instead of nvidia.com/gpu.product, replace resource
request/limit keys "nvidia.com/gpu" with "{{ vendor_prefix }}.com/gpu", and
replace the toleration key "nvidia.com/gpu" with "{{ vendor_prefix }}.com/gpu";
update the code that renders this Jinja template to pass vendor_prefix (derived
from the catalog GPU type) alongside gpu_node_selector_label so all GPU
vendor-specific keys are consistent.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Skipping this for now. The NVIDIA and AMD GPU operators use fundamentally different resource keys and conventions — it's not as simple as swapping a vendor prefix string. For example, AMD uses amd.com/gpu with different toleration and node selector patterns. Adding a vendor_prefix variable would give a false sense of AMD support without actually working end-to-end. We'll address this properly when AMD deployments are needed.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@anfredette, understood — that's a fair point. A superficial vendor_prefix substitution would be misleading since AMD's device plugin uses different toleration patterns, node selector keys, and resource naming conventions than the NVIDIA GPU operator. Deferring until AMD deployments are concretely needed is the right call.


📝 Learnings were identified and not saved (knowledge base disabled). Enable

tolerations:
- key: nvidia.com/gpu
operator: Exists
Expand Down
2 changes: 2 additions & 0 deletions src/neuralnav/knowledge_base/model_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class GPUType:
def __init__(self, data: dict):
self.gpu_type = data["gpu_type"]
self.aliases = data.get("aliases", [data["gpu_type"]]) # Default to primary name
self.node_selector_label = data.get("node_selector_label", self.aliases[0])
self.memory_gb = data["memory_gb"]
Comment on lines 54 to 58
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Require node_selector_label instead of silently defaulting it.

This fallback can reintroduce the exact broken scheduling path this PR is fixing: if a future GPU entry omits node_selector_label, YAML generation will quietly fall back to an alias and may produce an unschedulable nvidia.com/gpu.product selector. Please fail fast when the field is missing.

Suggested change
     def __init__(self, data: dict):
         self.gpu_type = data["gpu_type"]
         self.aliases = data.get("aliases", [data["gpu_type"]])  # Default to primary name
-        self.node_selector_label = data.get("node_selector_label", self.aliases[0])
+        try:
+            self.node_selector_label = data["node_selector_label"]
+        except KeyError as e:
+            raise ValueError(
+                f"GPU type '{self.gpu_type}' is missing required node_selector_label"
+            ) from e
         self.memory_gb = data["memory_gb"]
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def __init__(self, data: dict):
self.gpu_type = data["gpu_type"]
self.aliases = data.get("aliases", [data["gpu_type"]]) # Default to primary name
self.node_selector_label = data.get("node_selector_label", self.aliases[0])
self.memory_gb = data["memory_gb"]
def __init__(self, data: dict):
self.gpu_type = data["gpu_type"]
self.aliases = data.get("aliases", [data["gpu_type"]]) # Default to primary name
try:
self.node_selector_label = data["node_selector_label"]
except KeyError as e:
raise ValueError(
f"GPU type '{self.gpu_type}' is missing required node_selector_label"
) from e
self.memory_gb = data["memory_gb"]
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/neuralnav/knowledge_base/model_catalog.py` around lines 54 - 58, The
constructor currently silently defaults node_selector_label from aliases which
can reintroduce broken scheduling; update the __init__ in the class that parses
the GPU entry (the __init__ that sets gpu_type, aliases, node_selector_label,
memory_gb) to require data["node_selector_label"] and raise a clear exception
(e.g., ValueError) if it's missing instead of using data.get(...,
self.aliases[0]); do not fallback to aliases and ensure the error message names
the missing node_selector_label and the gpu_type (or data) to aid debugging.

self.compute_capability = data["compute_capability"]
self.typical_use_cases = data["typical_use_cases"]
Expand Down Expand Up @@ -88,6 +89,7 @@ def to_dict(self) -> dict:
return {
"gpu_type": self.gpu_type,
"aliases": self.aliases,
"node_selector_label": self.node_selector_label,
"memory_gb": self.memory_gb,
"compute_capability": self.compute_capability,
"typical_use_cases": self.typical_use_cases,
Expand Down
9 changes: 8 additions & 1 deletion ui/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,14 @@ def main():

# Tab-based navigation (6 tabs)
tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(
["Define Use Case", "Technical Specification", "Recommendations", "Deployment", "Deployment Management", "Configuration"]
[
"Define Use Case",
"Technical Specification",
"Recommendations",
"Deployment",
"Deployment Management",
"Configuration",
]
)

with tab1:
Expand Down
12 changes: 9 additions & 3 deletions ui/components/deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,14 +185,18 @@ def _render_deploy_to_cluster_button(selected_config: dict):
use_container_width=True,
type="primary",
disabled=already_deployed,
help="Already deployed to cluster" if already_deployed else "Deploy to Kubernetes cluster (YAML auto-generated)",
help="Already deployed to cluster"
if already_deployed
else "Deploy to Kubernetes cluster (YAML auto-generated)",
key="deploy_to_cluster_btn",
):
# Check cluster accessibility when the user clicks
with st.spinner("Checking cluster connectivity..."):
status = check_cluster_status()
if not status.get("accessible", False):
st.error("Kubernetes cluster is not accessible. Please ensure the cluster is running and try again.")
st.error(
"Kubernetes cluster is not accessible. Please ensure the cluster is running and try again."
)
return

with st.spinner("Deploying to Kubernetes cluster..."):
Expand All @@ -208,7 +212,9 @@ def _render_deploy_to_cluster_button(selected_config: dict):
st.session_state.deployment_yaml_files = files
st.session_state.deployment_yaml_generated = True

st.success(f"Successfully deployed to cluster! Deployment ID: `{result.get('deployment_id')}`")
st.success(
f"Successfully deployed to cluster! Deployment ID: `{result.get('deployment_id')}`"
)

deployment_result = result.get("deployment_result", {})
for applied_file in deployment_result.get("applied_files", []):
Expand Down
37 changes: 23 additions & 14 deletions ui/components/deployment_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import pandas as pd
import streamlit as st
from api_client import delete_deployment, get_k8s_status, load_all_deployments
from api_client import delete_deployment, load_all_deployments


def render_deployment_management_tab():
Expand Down Expand Up @@ -46,12 +46,14 @@ def render_deployment_management_tab():
status = dep.get("status", {})
pods = dep.get("pods", [])
ready = status.get("ready", False)
table_data.append({
"Status": "Ready" if ready else "Pending",
"Name": dep["deployment_id"],
"Pods": len(pods),
"Ready": "Yes" if ready else "No",
})
table_data.append(
{
"Status": "Ready" if ready else "Pending",
"Name": dep["deployment_id"],
"Pods": len(pods),
"Ready": "Yes" if ready else "No",
}
)

df = pd.DataFrame(table_data)
st.dataframe(df, use_container_width=True, hide_index=True)
Expand Down Expand Up @@ -257,14 +259,21 @@ def _run_inference_test(deployment_id: str, prompt: str, max_tokens: int, temper
start_time = time.time()

curl_cmd = [
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this use the `requests` library instead of shelling out to `curl`?

"curl", "-s", "-X", "POST",
"curl",
"-s",
"-X",
"POST",
"http://localhost:8080/v1/completions",
"-H", "Content-Type: application/json",
"-d", json.dumps({
"prompt": prompt,
"max_tokens": max_tokens,
"temperature": temperature,
}),
"-H",
"Content-Type: application/json",
"-d",
json.dumps(
{
"prompt": prompt,
"max_tokens": max_tokens,
"temperature": temperature,
}
),
]

with st.expander("Debug Info"):
Expand Down
6 changes: 4 additions & 2 deletions ui/components/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
upload_benchmarks,
)


_TAB_INDEX = 5 # Configuration is the 6th tab (0-indexed)


Expand All @@ -33,7 +32,10 @@ def _on_mode_change():
result = update_deployment_mode(new_mode)
if result:
st.session_state.deployment_mode_selection = st.session_state.deployment_mode_radio
st.session_state["_mode_msg"] = ("success", f"Deployment mode set to **{st.session_state.deployment_mode_radio}**.")
st.session_state["_mode_msg"] = (
"success",
f"Deployment mode set to **{st.session_state.deployment_mode_radio}**.",
)
else:
st.session_state["_mode_msg"] = ("error", "Failed to update deployment mode.")
st.session_state["_pending_tab"] = _TAB_INDEX
Expand Down
Loading