From f0324805764226777478cd6bdb7b52fffc251299 Mon Sep 17 00:00:00 2001
From: Milind Waykole <mwaykole@redhat.com>
Date: Mon, 15 Sep 2025 22:21:27 +0530
Subject: [PATCH 01/12] Add LLMD model server tests and utilities

---
 .../model_server/llmd/__init__.py             |   8 +
 .../model_server/llmd/conftest.py             | 173 +++++
 .../model_server/llmd/test_llmd_oci_cpu.py    |  34 +
 .../model_server/llmd/test_llmd_s3.py         |  37 +
 .../model_serving/model_server/llmd/utils.py  |  62 ++
 utilities/llmd_constants.py                   |  32 +
 utilities/llmd_utils.py                       | 669 ++++++++++++++++++
 utilities/manifests/opt125m_cpu.py            |  27 +
 8 files changed, 1042 insertions(+)
 create mode 100644 tests/model_serving/model_server/llmd/__init__.py
 create mode 100644 tests/model_serving/model_server/llmd/conftest.py
 create mode 100644 tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py
 create mode 100644 tests/model_serving/model_server/llmd/test_llmd_s3.py
 create mode 100644 tests/model_serving/model_server/llmd/utils.py
 create mode 100644 utilities/llmd_constants.py
 create mode 100644 utilities/llmd_utils.py
 create mode 100644 utilities/manifests/opt125m_cpu.py

diff --git a/tests/model_serving/model_server/llmd/__init__.py b/tests/model_serving/model_server/llmd/__init__.py
new file mode 100644
index 000000000..3d0e7e903
--- /dev/null
+++ b/tests/model_serving/model_server/llmd/__init__.py
@@ -0,0 +1,8 @@
+"""
+LLM Deployment (LLMD) tests module.
+
+This module contains tests for LLM deployment functionality including:
+- Gateway resource creation and management
+- LLMInferenceService deployment and configuration
+- Integration testing between gateway and inference services
+"""
diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py
new file mode 100644
index 000000000..af8c5594c
--- /dev/null
+++ b/tests/model_serving/model_server/llmd/conftest.py
@@ -0,0 +1,173 @@
+from typing import Generator
+
+import pytest
+from _pytest.fixtures import FixtureRequest
+from kubernetes.dynamic import DynamicClient
+from ocp_resources.gateway import Gateway
+from ocp_resources.llm_inference_service import LLMInferenceService
+from ocp_resources.namespace import Namespace
+from ocp_resources.secret import Secret
+from ocp_resources.service_account import ServiceAccount
+
+from utilities.constants import Timeout
+from utilities.infra import s3_endpoint_secret
+from utilities.llmd_utils import create_gateway, create_llmisvc
+from utilities.llmd_constants import (
+    DEFAULT_GATEWAY_NAMESPACE,
+    VLLM_STORAGE_OCI,
+    VLLM_CPU_IMAGE,
+    DEFAULT_S3_STORAGE_PATH,
+)
+
+
+@pytest.fixture(scope="class")
+def gateway_namespace(admin_client: DynamicClient) -> str:
+    return DEFAULT_GATEWAY_NAMESPACE
+
+
+@pytest.fixture(scope="class")
+def llmd_s3_secret(
+    admin_client: DynamicClient,
+    unprivileged_model_namespace: Namespace,
+    aws_access_key_id: str,
+    aws_secret_access_key: str,
+    models_s3_bucket_name: str,
+    models_s3_bucket_region: str,
+    models_s3_bucket_endpoint: str,
+) -> Generator[Secret, None, None]:
+    with s3_endpoint_secret(
+        client=admin_client,
+        name="llmd-s3-secret",
+        namespace=unprivileged_model_namespace.name,
+        aws_access_key=aws_access_key_id,
+        aws_secret_access_key=aws_secret_access_key,
+        aws_s3_region=models_s3_bucket_region,
+        aws_s3_bucket=models_s3_bucket_name,
+        aws_s3_endpoint=models_s3_bucket_endpoint,
+    ) as secret:
+        yield secret
+
+
+@pytest.fixture(scope="class")
+def llmd_s3_service_account(
+    admin_client: DynamicClient, 
+    llmd_s3_secret: Secret
+) -> Generator[ServiceAccount, None, None]:
+    with ServiceAccount(
+        client=admin_client,
+        namespace=llmd_s3_secret.namespace,
+        name="llmd-s3-service-account",
+        secrets=[{"name": llmd_s3_secret.name}],
+    ) as sa:
+        yield sa
+
+
+@pytest.fixture(scope="class")
+def llmd_gateway(
+    request: FixtureRequest,
+    admin_client: DynamicClient,
+    gateway_namespace: str,
+) -> Generator[Gateway, None, None]:
+    if isinstance(request.param, str):
+        gateway_class_name = request.param
+        kwargs = {}
+    else:
+        gateway_class_name = request.param.get("gateway_class_name", "openshift-default")
+        kwargs = {k: v for k, v in request.param.items() if k != "gateway_class_name"}
+    with create_gateway(
+        client=admin_client,
+        namespace=gateway_namespace,
+        gateway_class_name=gateway_class_name,
+        wait_for_condition=True,
+        timeout=Timeout.TIMEOUT_5MIN,
+        **kwargs
+    ) as gateway:
+        yield gateway
+
+
+@pytest.fixture(scope="class")
+def llmd_inference_service(
+    request: FixtureRequest,
+    admin_client: DynamicClient,
+    unprivileged_model_namespace: Namespace,
+) -> Generator[LLMInferenceService, None, None]:
+    if isinstance(request.param, str):
+        name_suffix = request.param
+        kwargs = {}
+    else:
+        name_suffix = request.param.get("name_suffix", "basic")
+        kwargs = {k: v for k, v in request.param.items() if k != "name_suffix"}
+    
+    service_name = kwargs.get("name", f"llm-{name_suffix}")
+
+    if "llmd_gateway" in request.fixturenames:
+        request.getfixturevalue("llmd_gateway")
+    
+    container_resources = kwargs.get(
+        "container_resources",
+        {
+            "limits": {"cpu": "1", "memory": "10Gi"},
+            "requests": {"cpu": "100m", "memory": "8Gi"},
+        },
+    )
+
+    with create_llmisvc(
+        client=admin_client,
+        name=service_name,
+        namespace=unprivileged_model_namespace.name,
+        storage_uri=kwargs.get("storage_uri", VLLM_STORAGE_OCI),
+        container_image=kwargs.get("container_image", VLLM_CPU_IMAGE),
+        container_resources=container_resources,
+        wait=True,
+        timeout=Timeout.TIMEOUT_15MIN,
+        **{k: v for k, v in kwargs.items() if k != "name"}
+    ) as llm_service:
+        yield llm_service
+
+
+@pytest.fixture(scope="class")
+def llmd_inference_service_s3(
+    request: FixtureRequest,
+    admin_client: DynamicClient,
+    unprivileged_model_namespace: Namespace,
+    llmd_s3_secret: Secret,
+    llmd_s3_service_account: ServiceAccount,
+) -> Generator[LLMInferenceService, None, None]:
+    if isinstance(request.param, str):
+        name_suffix = request.param
+        kwargs = {"storage_path": DEFAULT_S3_STORAGE_PATH}
+    else:
+        name_suffix = request.param.get("name_suffix", "s3")
+        kwargs = {k: v for k, v in request.param.items() if k != "name_suffix"}
+    
+    service_name = kwargs.get("name", f"llm-{name_suffix}")
+    
+    if "storage_key" not in kwargs:
+        kwargs["storage_key"] = llmd_s3_secret.name
+    
+    if "storage_path" not in kwargs:
+        kwargs["storage_path"] = DEFAULT_S3_STORAGE_PATH
+    
+    container_resources = kwargs.get(
+        "container_resources",
+        {
+            "limits": {"cpu": "1", "memory": "10Gi"},
+            "requests": {"cpu": "100m", "memory": "8Gi"},
+        },
+    )
+
+    with create_llmisvc(
+        client=admin_client,
+        name=service_name,
+        namespace=unprivileged_model_namespace.name,
+        storage_key=kwargs.get("storage_key"),
+        storage_path=kwargs.get("storage_path"),
+        container_image=kwargs.get("container_image", VLLM_CPU_IMAGE),
+        container_resources=container_resources,
+        service_account=llmd_s3_service_account.name,
+        wait=True,
+        timeout=Timeout.TIMEOUT_15MIN,
+        **{k: v for k, v in kwargs.items() if k not in ["name", "storage_key", "storage_path", "container_image", "container_resources"]}
+    ) as llm_service:
+        yield llm_service
+
diff --git a/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py
new file mode 100644
index 000000000..e4d851e1c
--- /dev/null
+++ b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py
@@ -0,0 +1,34 @@
+import pytest
+
+from tests.model_serving.model_server.llmd.utils import verify_llm_service_status, verify_gateway_status
+from utilities.constants import Protocols
+from utilities.llmd_utils import verify_inference_response_llmd
+
+from utilities.llmd_constants import BASIC_LLMD_PARAMS
+from utilities.manifests.opt125m_cpu import OPT125M_CPU_INFERENCE_CONFIG
+
+pytestmark = [
+    pytest.mark.llmd_cpu,
+]
+
+
+@pytest.mark.parametrize(
+    "unprivileged_model_namespace, llmd_gateway, llmd_inference_service",
+    BASIC_LLMD_PARAMS,
+    indirect=True,
+)
+class TestLLMDOCICPUInference:
+    """LLMD inference testing with OCI storage and CPU runtime using vLLM."""
+
+    def test_llmd_oci(self, llmd_gateway, llmd_inference_service):
+        assert verify_gateway_status(llmd_gateway), "Gateway should be ready"
+        assert verify_llm_service_status(llmd_inference_service), "LLMInferenceService should be ready"
+        
+        verify_inference_response_llmd(
+            llm_service=llmd_inference_service,
+            inference_config=OPT125M_CPU_INFERENCE_CONFIG,
+            inference_type="chat_completions",
+            protocol=Protocols.HTTP,
+            use_default_query=True,
+            insecure=True,
+        )
diff --git a/tests/model_serving/model_server/llmd/test_llmd_s3.py b/tests/model_serving/model_server/llmd/test_llmd_s3.py
new file mode 100644
index 000000000..0774b0b46
--- /dev/null
+++ b/tests/model_serving/model_server/llmd/test_llmd_s3.py
@@ -0,0 +1,37 @@
+import pytest
+
+from tests.model_serving.model_server.llmd.utils import verify_llm_service_status, verify_gateway_status
+from utilities.constants import Protocols
+from utilities.llmd_utils import verify_inference_response_llmd
+
+from utilities.manifests.opt125m_cpu import OPT125M_CPU_INFERENCE_CONFIG
+
+pytestmark = [
+    pytest.mark.llmd_cpu,
+]
+
+
+@pytest.mark.parametrize(
+    "unprivileged_model_namespace, llmd_gateway, llmd_inference_service_s3",
+    [
+        ({"name": "llmd-s3-test"}, "openshift-default", {"storage_path": "opt-125m/"})
+    ],
+    indirect=True,
+)
+@pytest.mark.usefixtures("valid_aws_config")
+class TestLLMDS3Inference:
+    """LLMD inference testing with S3 storage."""
+
+    def test_llmd_s3(self, llmd_gateway, llmd_inference_service_s3):
+        assert verify_gateway_status(llmd_gateway), "Gateway should be ready"
+        assert verify_llm_service_status(llmd_inference_service_s3), "LLMInferenceService should be ready"
+        
+        verify_inference_response_llmd(
+            llm_service=llmd_inference_service_s3,
+            inference_config=OPT125M_CPU_INFERENCE_CONFIG,
+            inference_type="chat_completions",
+            protocol=Protocols.HTTP,
+            use_default_query=True,
+            insecure=True,
+        )
+
diff --git a/tests/model_serving/model_server/llmd/utils.py b/tests/model_serving/model_server/llmd/utils.py
new file mode 100644
index 000000000..0a1c591ff
--- /dev/null
+++ b/tests/model_serving/model_server/llmd/utils.py
@@ -0,0 +1,62 @@
+"""
+Utility functions for LLM Deployment (LLMD) tests.
+
+This module provides helper functions for LLMD test operations using ocp_resources.
+Follows the established model server utils pattern for consistency.
+"""
+
+from ocp_resources.gateway import Gateway
+from ocp_resources.llm_inference_service import LLMInferenceService
+from simple_logger.logger import get_logger
+
+
+LOGGER = get_logger(name=__name__)
+
+
+def verify_gateway_status(gateway: Gateway) -> bool:
+    """
+    Verify that a Gateway is properly configured and programmed.
+    
+    Args:
+        gateway (Gateway): The Gateway resource to verify
+        
+    Returns:
+        bool: True if gateway is properly configured, False otherwise
+    """
+    if not gateway.exists:
+        LOGGER.warning(f"Gateway {gateway.name} does not exist")
+        return False
+    
+    conditions = gateway.instance.status.get("conditions", [])
+    for condition in conditions:
+        if condition["type"] == "Programmed" and condition["status"] == "True":
+            LOGGER.info(f"Gateway {gateway.name} is programmed and ready")
+            return True
+    
+    LOGGER.warning(f"Gateway {gateway.name} is not in Programmed state")
+    return False
+
+
+def verify_llm_service_status(llm_service: LLMInferenceService) -> bool:
+    """
+    Verify that an LLMInferenceService is properly configured and ready.
+    
+    Args:
+        llm_service (LLMInferenceService): The LLMInferenceService resource to verify
+        
+    Returns:
+        bool: True if service is properly configured, False otherwise
+    """
+    if not llm_service.exists:
+        LOGGER.warning(f"LLMInferenceService {llm_service.name} does not exist")
+        return False
+    
+    conditions = llm_service.instance.status.get("conditions", [])
+    for condition in conditions:
+        if condition["type"] == "Ready" and condition["status"] == "True":
+            LOGGER.info(f"LLMInferenceService {llm_service.name} is ready")
+            return True
+    
+    LOGGER.warning(f"LLMInferenceService {llm_service.name} is not in Ready state")
+    return False
+
diff --git a/utilities/llmd_constants.py b/utilities/llmd_constants.py
new file mode 100644
index 000000000..74546814b
--- /dev/null
+++ b/utilities/llmd_constants.py
@@ -0,0 +1,32 @@
+"""Centralized constants for LLMD (LLM Deployment) utilities and tests."""
+
+from utilities.constants import Timeout
+
+DEFAULT_GATEWAY_NAME = "openshift-ai-inference"
+DEFAULT_GATEWAY_NAMESPACE = "openshift-ingress"
+OPENSHIFT_DEFAULT_GATEWAY_CLASS = "openshift-default"
+
+KSERVE_GATEWAY_LABEL = "serving.kserve.io/gateway"
+KSERVE_INGRESS_GATEWAY = "kserve-ingress-gateway"
+
+DEFAULT_LLM_ENDPOINT = "/v1/chat/completions"
+DEFAULT_MAX_TOKENS = 50
+DEFAULT_TEMPERATURE = 0.0
+DEFAULT_TIMEOUT = Timeout.TIMEOUT_30SEC
+
+VLLM_STORAGE_OCI = "oci://quay.io/mwaykole/test:opt-125m"
+VLLM_CPU_IMAGE = "quay.io/pierdipi/vllm-cpu:latest"
+DEFAULT_LLMD_REPLICAS = 1
+DEFAULT_S3_STORAGE_PATH = "opt-125m"
+
+DEFAULT_STORAGE_URI = VLLM_STORAGE_OCI
+DEFAULT_CONTAINER_IMAGE = VLLM_CPU_IMAGE
+
+DEFAULT_CPU_LIMIT = "1"
+DEFAULT_MEMORY_LIMIT = "10Gi"
+DEFAULT_CPU_REQUEST = "100m"
+DEFAULT_MEMORY_REQUEST = "8Gi"
+
+BASIC_LLMD_PARAMS = [
+    ({"name": "llmd-comprehensive-test"}, "openshift-default", "basic")
+]
diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py
new file mode 100644
index 000000000..424166ead
--- /dev/null
+++ b/utilities/llmd_utils.py
@@ -0,0 +1,669 @@
+"""Utilities for LLM Deployment (LLMD) resources."""
+import json
+import re
+import shlex
+from contextlib import contextmanager
+from string import Template
+from typing import Any, Dict, Generator, Optional
+
+from kubernetes.dynamic import DynamicClient
+from ocp_resources.gateway import Gateway
+from ocp_resources.llm_inference_service import LLMInferenceService
+from pyhelper_utils.shell import run_command
+from simple_logger.logger import get_logger
+from timeout_sampler import retry, TimeoutWatch
+
+from utilities.certificates_utils import get_ca_bundle
+from utilities.constants import HTTPRequest, Timeout
+from utilities.exceptions import InferenceResponseError
+from utilities.infra import get_services_by_isvc_label
+from utilities.llmd_constants import (
+    DEFAULT_GATEWAY_NAME,
+    DEFAULT_GATEWAY_NAMESPACE,
+    OPENSHIFT_DEFAULT_GATEWAY_CLASS,
+    DEFAULT_LLM_ENDPOINT,
+    DEFAULT_TIMEOUT,
+)
+
+LOGGER = get_logger(name=__name__)
+
+
+@contextmanager
+def create_gateway(
+    client: DynamicClient,
+    name: str = DEFAULT_GATEWAY_NAME,
+    namespace: str = DEFAULT_GATEWAY_NAMESPACE,
+    gateway_class_name: str = OPENSHIFT_DEFAULT_GATEWAY_CLASS,
+    listeners: Optional[list[Dict[str, Any]]] = None,
+    infrastructure: Optional[Dict[str, Any]] = None,
+    wait_for_condition: bool = True,
+    timeout: int = 300,
+    teardown: bool = True,
+) -> Generator[Gateway, None, None]:
+    """
+    Context manager to create and manage a Gateway resource using ocp_resources.
+    
+    Args:
+        client: Kubernetes dynamic client
+        name: Gateway name
+        namespace: Gateway namespace  
+        gateway_class_name: The name of the GatewayClass resource
+        listeners: List of listener configurations
+        infrastructure: Infrastructure configuration
+        wait_for_condition: Whether to wait for the gateway to be programmed
+        timeout: Timeout in seconds for waiting
+        teardown: Whether to clean up the resource
+        
+    Yields:
+        Gateway: The created Gateway resource
+    """
+    if listeners is None:
+        listeners = [
+            {
+                "name": "http",
+                "port": 80,
+                "protocol": "HTTP",
+                "allowedRoutes": {"namespaces": {"from": "All"}},
+            }
+        ]
+
+    if infrastructure is None:
+        infrastructure = {"labels": {"serving.kserve.io/gateway": "kserve-ingress-gateway"}}
+    try:
+        existing_gateway = Gateway(
+            client=client,
+            name=name,
+            namespace=namespace,
+            api_group="gateway.networking.k8s.io",
+        )
+        if existing_gateway.exists:
+            LOGGER.info(f"Cleaning up existing Gateway {name} in namespace {namespace}")
+            existing_gateway.delete(wait=True, timeout=Timeout.TIMEOUT_2MIN)
+    except Exception as e:
+        LOGGER.debug(f"No existing Gateway to clean up: {e}")
+    gateway_body = {
+        "apiVersion": "gateway.networking.k8s.io/v1",
+        "kind": "Gateway",
+        "metadata": {
+            "name": name,
+            "namespace": namespace,
+        },
+        "spec": {
+            "gatewayClassName": gateway_class_name,
+            "listeners": listeners,
+            "infrastructure": infrastructure,
+        }
+    }
+    
+    with Gateway(
+        client=client,
+        teardown=teardown,
+        kind_dict=gateway_body,
+        api_group="gateway.networking.k8s.io",
+    ) as gateway:
+        if wait_for_condition:
+            LOGGER.info(f"Waiting for Gateway {name} to be programmed...")
+            gateway.wait_for_condition(
+                condition="Programmed",
+                status="True",
+                timeout=timeout,
+            )
+            LOGGER.info(f"Gateway {name} is programmed and ready")
+        
+        yield gateway
+
+
+@contextmanager
+def create_llmisvc(
+    client: DynamicClient,
+    name: str,
+    namespace: str,
+    storage_uri: Optional[str] = None,
+    storage_key: Optional[str] = None,
+    storage_path: Optional[str] = None,
+    replicas: int = 1,
+    wait: bool = True,
+    enable_auth: bool = False,
+    router_config: Optional[Dict[str, Any]] = None,
+    container_image: Optional[str] = None,
+    container_resources: Optional[Dict[str, Any]] = None,
+    container_env: Optional[list[Dict[str, str]]] = None,
+    liveness_probe: Optional[Dict[str, Any]] = None,
+    readiness_probe: Optional[Dict[str, Any]] = None,
+    image_pull_secrets: Optional[list[str]] = None,
+    service_account: Optional[str] = None,
+    volumes: Optional[list[Dict[str, Any]]] = None,
+    volume_mounts: Optional[list[Dict[str, Any]]] = None,
+    annotations: Optional[Dict[str, str]] = None,
+    labels: Optional[Dict[str, str]] = None,
+    timeout: int = Timeout.TIMEOUT_15MIN,
+    teardown: bool = True,
+) -> Generator[LLMInferenceService, Any, None]:
+    """
+    Create LLMInferenceService object following the pattern of create_isvc.
+
+    Args:
+        client: DynamicClient object
+        name: LLMInferenceService name
+        namespace: Namespace name
+        storage_uri: Storage URI (e.g., 'oci://quay.io/user/model:tag') - used if storage_key/storage_path not provided
+        storage_key: S3 secret name for authentication (alternative to storage_uri)
+        storage_path: S3 path to model (alternative to storage_uri)
+        replicas: Number of replicas
+        wait: Wait for LLMInferenceService to be ready
+        enable_auth: Enable authentication
+        router_config: Router configuration (scheduler, route, gateway)
+        container_image: Container image
+        container_resources: Container resource requirements
+        container_env: Container environment variables
+        liveness_probe: Liveness probe configuration
+        readiness_probe: Readiness probe configuration
+        image_pull_secrets: Image pull secrets
+        service_account: Service account name
+        volumes: Volume configurations
+        volume_mounts: Volume mount configurations
+        annotations: Additional annotations
+        labels: Additional labels
+        timeout: Timeout for waiting
+        teardown: Whether to clean up on exit
+
+    Yields:
+        LLMInferenceService: LLMInferenceService object
+    """
+    if labels is None:
+        labels = {}
+    
+    if annotations is None:
+        annotations = {}
+
+    if storage_key and storage_path:
+        model_config = {
+            "uri": f"s3://ods-ci-wisdom/{storage_path}",
+        }
+    elif storage_uri:
+        model_config = {
+            "uri": storage_uri,
+        }
+    else:
+        raise ValueError("Provide either storage_uri or (storage_key and storage_path) for the model")
+
+    if router_config is None:
+        router_config = {
+            "scheduler": {},
+            "route": {},
+            "gateway": {}
+        }
+    
+    if container_resources is None:
+        raise ValueError("container_resources must be provided for LLMInferenceService")
+    
+    if container_env is None:
+        container_env = [
+            {
+                "name": "VLLM_LOGGING_LEVEL",
+                "value": "DEBUG"
+            }
+        ]
+    template_config: Dict[str, Any] = {
+        "containers": []
+    }
+    
+    main_container: Dict[str, Any] = {
+        "name": "main"
+    }
+    
+    if container_image:
+        main_container["image"] = container_image
+    else:
+        raise ValueError("container_image must be provided for LLMInferenceService")
+    
+    if container_resources:
+        main_container["resources"] = container_resources
+    
+    if container_env:
+        main_container["env"] = container_env
+    
+    if liveness_probe:
+        main_container["livenessProbe"] = liveness_probe
+    
+    if readiness_probe:
+        main_container["readinessProbe"] = readiness_probe
+    
+    if volume_mounts:
+        main_container["volumeMounts"] = volume_mounts
+    
+    template_config["containers"].append(main_container)
+    
+    if volumes:
+        template_config["volumes"] = volumes
+    
+    if service_account:
+        template_config["serviceAccountName"] = service_account
+    
+    if image_pull_secrets:
+        template_config["imagePullSecrets"] = [{"name": secret} for secret in image_pull_secrets]
+
+    if enable_auth:
+        annotations["serving.kserve.io/auth"] = "true"
+
+    LOGGER.info(f"Creating LLMInferenceService {name} in namespace {namespace}")
+    
+    with LLMInferenceService(
+        client=client,
+        name=name,
+        namespace=namespace,
+        model=model_config,
+        replicas=replicas,
+        router=router_config,
+        template=template_config,
+        annotations=annotations,
+        label=labels,
+        teardown=teardown,
+    ) as llm_service:
+        timeout_watch = TimeoutWatch(timeout=timeout)
+
+        if wait:
+            LOGGER.info(f"Waiting for LLMInferenceService {name} to be ready...")
+            llm_service.wait_for_condition(
+                condition="Ready",
+                status="True",
+                timeout=timeout_watch.remaining_time(),
+            )
+            LOGGER.info(f"LLMInferenceService {name} is ready")
+
+        yield llm_service
+
+
+def get_llm_inference_url(llm_service: LLMInferenceService) -> str:
+    """
+    Get the inference URL for an LLMInferenceService.
+    
+    This function attempts to resolve the URL in the following order:
+    1. External URL from service status
+    2. Service discovery via labels
+    3. Fallback to service name pattern
+    
+    Args:
+        llm_service: The LLMInferenceService resource
+        
+    Returns:
+        str: The inference URL (full URL including protocol and path)
+        
+    Raises:
+        ValueError: If the inference URL cannot be determined
+    """
+    if llm_service.instance.status and llm_service.instance.status.get("url"):
+        url = llm_service.instance.status["url"]
+        LOGGER.debug(f"Using external URL for {llm_service.name}: {url}")
+        return url
+    
+    try:
+        services = get_services_by_isvc_label(
+            client=llm_service.client,
+            isvc=llm_service,
+            runtime_name=None,
+        )
+        if services:
+            internal_url = f"http://{services[0].name}.{llm_service.namespace}.svc.cluster.local"
+            LOGGER.debug(f"Using service discovery URL for {llm_service.name}: {internal_url}")
+            return internal_url
+    except Exception as e:
+        LOGGER.warning(f"Could not get service for LLMInferenceService {llm_service.name}: {e}")
+    fallback_url = f"http://{llm_service.name}.{llm_service.namespace}.svc.cluster.local"
+    LOGGER.debug(f"Using fallback URL for {llm_service.name}: {fallback_url}")
+    return fallback_url
+
+
+def verify_inference_response_llmd(
+    llm_service: LLMInferenceService,
+    inference_config: Dict[str, Any],
+    inference_type: str,
+    protocol: str,
+    model_name: Optional[str] = None,
+    inference_input: Optional[Any] = None,
+    use_default_query: bool = False,
+    expected_response_text: Optional[str] = None,
+    insecure: bool = False,
+    token: Optional[str] = None,
+    authorized_user: Optional[bool] = None,
+) -> None:
+    """
+    Verify the LLM inference response following the pattern of verify_inference_response.
+
+    Args:
+        llm_service: LLMInferenceService resource to test
+        inference_config: Inference configuration dictionary
+        inference_type: Type of inference ('infer', 'streaming', etc.)
+        protocol: Protocol to use ('http', 'grpc')
+        model_name: Name of the model (defaults to service name)
+        inference_input: Input for inference (optional)
+        use_default_query: Whether to use default query from config
+        expected_response_text: Expected response text for validation
+        insecure: Whether to use insecure connections
+        token: Authentication token (optional)
+        authorized_user: Whether user should be authorized (optional)
+
+    Raises:
+        InferenceResponseError: If inference response is invalid
+        ValueError: If inference response validation fails
+    """
+
+    model_name = model_name or llm_service.name
+    inference = LLMUserInference(
+        llm_service=llm_service,
+        inference_config=inference_config,
+        inference_type=inference_type,
+        protocol=protocol,
+    )
+
+    res = inference.run_inference_flow(
+        model_name=model_name,
+        inference_input=inference_input,
+        use_default_query=use_default_query,
+        token=token,
+        insecure=insecure,
+    )
+
+    if authorized_user is False:
+        _validate_unauthorized_response(res, token, inference)
+    else:
+        _validate_authorized_response(
+            res=res,
+            inference=inference,
+            inference_config=inference_config,
+            inference_type=inference_type,
+            expected_response_text=expected_response_text,
+            use_default_query=use_default_query,
+            model_name=model_name,
+        )
+
+
+class LLMUserInference:
+    """
+    LLM-specific inference handler following the pattern of UserInference.
+    """
+    
+    STREAMING = "streaming"
+    INFER = "infer"
+    
+    def __init__(
+        self,
+        llm_service: LLMInferenceService,
+        inference_config: Dict[str, Any],
+        inference_type: str,
+        protocol: str,
+    ) -> None:
+        self.llm_service = llm_service
+        self.inference_config = inference_config
+        self.inference_type = inference_type
+        self.protocol = protocol
+        self.runtime_config = self.get_runtime_config()
+        
+    def get_runtime_config(self) -> Dict[str, Any]:
+        """Get runtime config from inference config based on inference type and protocol."""
+        if inference_type_config := self.inference_config.get(self.inference_type):
+            protocol = "http" if self.protocol.lower() in ["http", "https"] else self.protocol
+            if data := inference_type_config.get(protocol):
+                return data
+            else:
+                raise ValueError(f"Protocol {protocol} not supported for inference type {self.inference_type}")
+        else:
+            raise ValueError(f"Inference type {self.inference_type} not supported in config")
+    
+    @property
+    def inference_response_text_key_name(self) -> Optional[str]:
+        """Get inference response text key name from runtime config."""
+        return self.runtime_config.get("response_fields_map", {}).get("response_output")
+    
+    @property
+    def inference_response_key_name(self) -> str:
+        """Get inference response key name from runtime config."""
+        return self.runtime_config.get("response_fields_map", {}).get("response", "output")
+    
+    def get_inference_body(
+        self,
+        model_name: str,
+        inference_input: Optional[Any] = None,
+        use_default_query: bool = False,
+    ) -> str:
+        """Get inference body for LLM request."""
+        if not use_default_query and inference_input is None:
+            raise ValueError("Either pass `inference_input` or set `use_default_query` to True")
+        
+        if use_default_query:
+            default_query_config = self.inference_config.get("default_query_model")
+            if not default_query_config:
+                raise ValueError(f"Missing default query config for {model_name}")
+            
+            if self.inference_config.get("support_multi_default_queries"):
+                query_config = default_query_config.get(self.inference_type)
+                if not query_config:
+                    raise ValueError(f"Missing default query for inference type {self.inference_type}")
+                template_str = query_config.get("query_input", "")
+            else:
+                template_str = default_query_config.get("query_input", "")
+            
+            # Use template substitution for model name
+            template = Template(template_str)
+            body = template.safe_substitute(model_name=model_name)
+        else:
+            # For custom input, create OpenAI-compatible format
+            if isinstance(inference_input, str):
+                body = json.dumps({
+                    "model": model_name,
+                    "messages": [{"role": "user", "content": inference_input}],
+                    "max_tokens": 100,
+                    "temperature": 0.0
+                })
+            else:
+                body = json.dumps(inference_input)
+        
+        return body
+    
+    def generate_command(
+        self,
+        model_name: str,
+        inference_input: Optional[str] = None,
+        use_default_query: bool = False,
+        insecure: bool = False,
+        token: Optional[str] = None,
+    ) -> str:
+        """Generate curl command string for LLM inference."""
+        base_url = get_llm_inference_url(self.llm_service)
+        endpoint_url = f"{base_url}{DEFAULT_LLM_ENDPOINT}"
+
+        body = self.get_inference_body(
+            model_name=model_name,
+            inference_input=inference_input,
+            use_default_query=use_default_query,
+        )
+
+        header = HTTPRequest.CONTENT_JSON.replace("-H ", "").replace("'", "")
+        cmd_exec = "curl -i -s"
+        cmd = f"{cmd_exec} -X POST -d '{body}' -H {header} -H 'Accept: application/json'"
+
+        if token:
+            cmd += f" {HTTPRequest.AUTH_HEADER.format(token=token)}"
+
+        if insecure:
+            cmd += " --insecure"
+        else:
+            try:
+                from ocp_resources.resource import get_client  # type: ignore
+                client = get_client()
+                ca_bundle = get_ca_bundle(client=client, deployment_mode="raw")
+                if ca_bundle:
+                    cmd += f" --cacert {ca_bundle}"
+                else:
+                    cmd += " --insecure"
+            except Exception:
+                cmd += " --insecure"
+
+        cmd += f" --max-time {DEFAULT_TIMEOUT} {endpoint_url}"
+        return cmd
+
+    @retry(wait_timeout=Timeout.TIMEOUT_30SEC, sleep=5)
+    def run_inference(
+        self,
+        model_name: str,
+        inference_input: Optional[str] = None,
+        use_default_query: bool = False,
+        insecure: bool = False,
+        token: Optional[str] = None,
+    ) -> str:
+        """Run inference command and return raw output."""
+        cmd = self.generate_command(
+            model_name=model_name,
+            inference_input=inference_input,
+            use_default_query=use_default_query,
+            insecure=insecure,
+            token=token,
+        )
+
+        res, out, err = run_command(command=shlex.split(cmd), verify_stderr=False, check=False)
+        if res:
+            return out
+        raise ValueError(f"Inference failed with error: {err}\nOutput: {out}\nCommand: {cmd}")
+
+    def run_inference_flow(
+        self,
+        model_name: str,
+        inference_input: Optional[str] = None,
+        use_default_query: bool = False,
+        insecure: bool = False,
+        token: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """Run LLM inference using the same high-level flow as inference_utils."""
+        out = self.run_inference(
+            model_name=model_name,
+            inference_input=inference_input,
+            use_default_query=use_default_query,
+            insecure=insecure,
+            token=token,
+        )
+        return {"output": out}
+
+
+def _validate_unauthorized_response(res: Dict[str, Any], token: Optional[str], inference: LLMUserInference) -> None:
+    """Validate response for unauthorized users."""
+    auth_header = "x-ext-auth-reason"
+    
+    if auth_reason := re.search(rf"{auth_header}: (.*)", res["output"], re.MULTILINE):
+        reason = auth_reason.group(1).lower()
+        
+        if token:
+            assert re.search(r"not (?:authenticated|authorized)", reason)
+        else:
+            assert "credential not found" in reason
+    else:
+        forbidden_patterns = ["Forbidden", "401", "403", "Unauthorized"]
+        output = res["output"]
+        
+        if any(pattern in output for pattern in forbidden_patterns):
+            return
+        
+        raise ValueError(f"Auth header {auth_header} not found in response. Response: {output}")
+
+
+def _validate_authorized_response(
+    res: Dict[str, Any],
+    inference: LLMUserInference,
+    inference_config: Dict[str, Any],
+    inference_type: str,
+    expected_response_text: Optional[str],
+    use_default_query: bool,
+    model_name: str,
+) -> None:
+    """Validate response for authorized users."""
+    
+    use_regex = False
+    
+    if use_default_query:
+        expected_response_text_config = inference_config.get("default_query_model", {})
+        use_regex = expected_response_text_config.get("use_regex", False)
+        
+        if not expected_response_text_config:
+            raise ValueError(f"Missing default_query_model config for inference {inference_config}")
+        
+        if inference_config.get("support_multi_default_queries"):
+            query_config = expected_response_text_config.get(inference_type)
+            if not query_config:
+                raise ValueError(f"Missing default_query_model config for inference type {inference_type}")
+            expected_response_text = query_config.get("query_output", "")
+            use_regex = query_config.get("use_regex", False)
+        else:
+            expected_response_text = expected_response_text_config.get("query_output")
+        
+        if not expected_response_text:
+            raise ValueError(f"Missing response text key for inference {inference_config}")
+        
+        if isinstance(expected_response_text, str):
+            expected_response_text = Template(expected_response_text).safe_substitute(model_name=model_name)
+        elif isinstance(expected_response_text, dict):
+            expected_response_text = Template(expected_response_text.get("response_output")).safe_substitute(
+                model_name=model_name
+            )
+    
+    if inference.inference_response_text_key_name:
+        if inference_type == inference.STREAMING:
+            if output := re.findall(
+                rf"{inference.inference_response_text_key_name}\": \"(.*)\"",
+                res[inference.inference_response_key_name],
+                re.MULTILINE,
+            ):
+                assert "".join(output) == expected_response_text, (
+                    f"Expected: {expected_response_text} does not match response: {output}"
+                )
+        elif inference_type == inference.INFER or use_regex:
+            formatted_res = json.dumps(res[inference.inference_response_text_key_name]).replace(" ", "")
+            if use_regex:
+                assert re.search(expected_response_text, formatted_res), (
+                    f"Expected: {expected_response_text} not found in: {formatted_res}"
+                )
+            else:
+                formatted_res = json.dumps(res[inference.inference_response_key_name]).replace(" ", "")
+                assert formatted_res == expected_response_text, (
+                    f"Expected: {expected_response_text} does not match output: {formatted_res}"
+                )
+        else:
+            response = res[inference.inference_response_key_name]
+            if isinstance(response, list):
+                response = response[0]
+            
+            if isinstance(response, dict):
+                response_text = response[inference.inference_response_text_key_name]
+                assert response_text == expected_response_text, (
+                    f"Expected: {expected_response_text} does not match response: {response_text}"
+                )
+            else:
+                raise InferenceResponseError(
+                    "Inference response output does not match expected output format."
+                    f"Expected: {expected_response_text}.\nResponse: {res}"
+                )
+    else:
+        raise InferenceResponseError(f"Inference response output not found in response. Response: {res}")
+
+
+def validate_llm_response_format(response_data: Dict[str, Any]) -> bool:
+    """Validate that a response follows the expected LLM inference format."""
+    if "choices" not in response_data:
+        raise InferenceResponseError("Response missing required field: 'choices'")
+    
+    if not isinstance(response_data["choices"], list):
+        raise InferenceResponseError("'choices' field must be a list")
+    
+    if not response_data["choices"]:
+        raise InferenceResponseError("'choices' list cannot be empty")
+    
+    choice = response_data["choices"][0]
+    if "message" not in choice:
+        raise InferenceResponseError("Choice missing 'message' field")
+    
+    message = choice["message"]
+    if "content" not in message:
+        raise InferenceResponseError("Message missing 'content' field")
+    
+    if not isinstance(message["content"], str):
+        raise InferenceResponseError("Message content must be a string")
+    
+    return True
diff --git a/utilities/manifests/opt125m_cpu.py b/utilities/manifests/opt125m_cpu.py
new file mode 100644
index 000000000..2326e991d
--- /dev/null
+++ b/utilities/manifests/opt125m_cpu.py
@@ -0,0 +1,27 @@
+OPT125M_CPU_INFERENCE_CONFIG = {
+    "default_query_model": {
+        "query_input": "What is the boiling point of water?",
+        "query_output": r'.*',  # Accept any valid response
+        "use_regex": True,
+    },
+    "chat_completions": {
+        "http": {
+            "endpoint": "v1/chat/completions",
+            "header": "Content-Type:application/json",
+            "body": '{"model": "$model_name", "messages": [{"role": "user", "content": "$query_input"}], "max_tokens": 50, "temperature": 0.0, "stream": false}',
+            "response_fields_map": {
+                "response_output": "output",
+            },
+        },
+    },
+    "completions": {
+        "http": {
+            "endpoint": "v1/completions",
+            "header": "Content-Type:application/json",
+            "body": '{"model": "$model_name", "prompt": "$query_input", "max_tokens": 50, "temperature": 0.0}',
+            "response_fields_map": {
+                "response_output": "output",
+            },
+        },
+    },
+}

From cdd0a82559b09c3e57de24b7bf0ba05615c169b1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 15 Sep 2025 16:56:33 +0000
Subject: [PATCH 02/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../model_server/llmd/conftest.py             |  26 ++--
 .../model_server/llmd/test_llmd_oci_cpu.py    |   2 +-
 .../model_server/llmd/test_llmd_s3.py         |   7 +-
 .../model_serving/model_server/llmd/utils.py  |  17 ++-
 utilities/llmd_constants.py                   |   4 +-
 utilities/llmd_utils.py                       | 135 ++++++++----------
 6 files changed, 88 insertions(+), 103 deletions(-)

diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py
index af8c5594c..cd2c86773 100644
--- a/tests/model_serving/model_server/llmd/conftest.py
+++ b/tests/model_serving/model_server/llmd/conftest.py
@@ -50,8 +50,7 @@ def llmd_s3_secret(
 
 @pytest.fixture(scope="class")
 def llmd_s3_service_account(
-    admin_client: DynamicClient, 
-    llmd_s3_secret: Secret
+    admin_client: DynamicClient, llmd_s3_secret: Secret
 ) -> Generator[ServiceAccount, None, None]:
     with ServiceAccount(
         client=admin_client,
@@ -80,7 +79,7 @@ def llmd_gateway(
         gateway_class_name=gateway_class_name,
         wait_for_condition=True,
         timeout=Timeout.TIMEOUT_5MIN,
-        **kwargs
+        **kwargs,
     ) as gateway:
         yield gateway
 
@@ -97,12 +96,12 @@ def llmd_inference_service(
     else:
         name_suffix = request.param.get("name_suffix", "basic")
         kwargs = {k: v for k, v in request.param.items() if k != "name_suffix"}
-    
+
     service_name = kwargs.get("name", f"llm-{name_suffix}")
 
     if "llmd_gateway" in request.fixturenames:
         request.getfixturevalue("llmd_gateway")
-    
+
     container_resources = kwargs.get(
         "container_resources",
         {
@@ -120,7 +119,7 @@ def llmd_inference_service(
         container_resources=container_resources,
         wait=True,
         timeout=Timeout.TIMEOUT_15MIN,
-        **{k: v for k, v in kwargs.items() if k != "name"}
+        **{k: v for k, v in kwargs.items() if k != "name"},
     ) as llm_service:
         yield llm_service
 
@@ -139,15 +138,15 @@ def llmd_inference_service_s3(
     else:
         name_suffix = request.param.get("name_suffix", "s3")
         kwargs = {k: v for k, v in request.param.items() if k != "name_suffix"}
-    
+
     service_name = kwargs.get("name", f"llm-{name_suffix}")
-    
+
     if "storage_key" not in kwargs:
         kwargs["storage_key"] = llmd_s3_secret.name
-    
+
     if "storage_path" not in kwargs:
         kwargs["storage_path"] = DEFAULT_S3_STORAGE_PATH
-    
+
     container_resources = kwargs.get(
         "container_resources",
         {
@@ -167,7 +166,10 @@ def llmd_inference_service_s3(
         service_account=llmd_s3_service_account.name,
         wait=True,
         timeout=Timeout.TIMEOUT_15MIN,
-        **{k: v for k, v in kwargs.items() if k not in ["name", "storage_key", "storage_path", "container_image", "container_resources"]}
+        **{
+            k: v
+            for k, v in kwargs.items()
+            if k not in ["name", "storage_key", "storage_path", "container_image", "container_resources"]
+        },
     ) as llm_service:
         yield llm_service
-
diff --git a/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py
index e4d851e1c..031ce6e48 100644
--- a/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py
+++ b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py
@@ -23,7 +23,7 @@ class TestLLMDOCICPUInference:
     def test_llmd_oci(self, llmd_gateway, llmd_inference_service):
         assert verify_gateway_status(llmd_gateway), "Gateway should be ready"
         assert verify_llm_service_status(llmd_inference_service), "LLMInferenceService should be ready"
-        
+
         verify_inference_response_llmd(
             llm_service=llmd_inference_service,
             inference_config=OPT125M_CPU_INFERENCE_CONFIG,
diff --git a/tests/model_serving/model_server/llmd/test_llmd_s3.py b/tests/model_serving/model_server/llmd/test_llmd_s3.py
index 0774b0b46..03c899431 100644
--- a/tests/model_serving/model_server/llmd/test_llmd_s3.py
+++ b/tests/model_serving/model_server/llmd/test_llmd_s3.py
@@ -13,9 +13,7 @@
 
 @pytest.mark.parametrize(
     "unprivileged_model_namespace, llmd_gateway, llmd_inference_service_s3",
-    [
-        ({"name": "llmd-s3-test"}, "openshift-default", {"storage_path": "opt-125m/"})
-    ],
+    [({"name": "llmd-s3-test"}, "openshift-default", {"storage_path": "opt-125m/"})],
     indirect=True,
 )
 @pytest.mark.usefixtures("valid_aws_config")
@@ -25,7 +23,7 @@ class TestLLMDS3Inference:
     def test_llmd_s3(self, llmd_gateway, llmd_inference_service_s3):
         assert verify_gateway_status(llmd_gateway), "Gateway should be ready"
         assert verify_llm_service_status(llmd_inference_service_s3), "LLMInferenceService should be ready"
-        
+
         verify_inference_response_llmd(
             llm_service=llmd_inference_service_s3,
             inference_config=OPT125M_CPU_INFERENCE_CONFIG,
@@ -34,4 +32,3 @@ def test_llmd_s3(self, llmd_gateway, llmd_inference_service_s3):
             use_default_query=True,
             insecure=True,
         )
-
diff --git a/tests/model_serving/model_server/llmd/utils.py b/tests/model_serving/model_server/llmd/utils.py
index 0a1c591ff..b80fcff7c 100644
--- a/tests/model_serving/model_server/llmd/utils.py
+++ b/tests/model_serving/model_server/llmd/utils.py
@@ -16,23 +16,23 @@
 def verify_gateway_status(gateway: Gateway) -> bool:
     """
     Verify that a Gateway is properly configured and programmed.
-    
+
     Args:
         gateway (Gateway): The Gateway resource to verify
-        
+
     Returns:
         bool: True if gateway is properly configured, False otherwise
     """
     if not gateway.exists:
         LOGGER.warning(f"Gateway {gateway.name} does not exist")
         return False
-    
+
     conditions = gateway.instance.status.get("conditions", [])
     for condition in conditions:
         if condition["type"] == "Programmed" and condition["status"] == "True":
             LOGGER.info(f"Gateway {gateway.name} is programmed and ready")
             return True
-    
+
     LOGGER.warning(f"Gateway {gateway.name} is not in Programmed state")
     return False
 
@@ -40,23 +40,22 @@ def verify_gateway_status(gateway: Gateway) -> bool:
 def verify_llm_service_status(llm_service: LLMInferenceService) -> bool:
     """
     Verify that an LLMInferenceService is properly configured and ready.
-    
+
     Args:
         llm_service (LLMInferenceService): The LLMInferenceService resource to verify
-        
+
     Returns:
         bool: True if service is properly configured, False otherwise
     """
     if not llm_service.exists:
         LOGGER.warning(f"LLMInferenceService {llm_service.name} does not exist")
         return False
-    
+
     conditions = llm_service.instance.status.get("conditions", [])
     for condition in conditions:
         if condition["type"] == "Ready" and condition["status"] == "True":
             LOGGER.info(f"LLMInferenceService {llm_service.name} is ready")
             return True
-    
+
     LOGGER.warning(f"LLMInferenceService {llm_service.name} is not in Ready state")
     return False
-
diff --git a/utilities/llmd_constants.py b/utilities/llmd_constants.py
index 74546814b..90751f30b 100644
--- a/utilities/llmd_constants.py
+++ b/utilities/llmd_constants.py
@@ -27,6 +27,4 @@
 DEFAULT_CPU_REQUEST = "100m"
 DEFAULT_MEMORY_REQUEST = "8Gi"
 
-BASIC_LLMD_PARAMS = [
-    ({"name": "llmd-comprehensive-test"}, "openshift-default", "basic")
-]
+BASIC_LLMD_PARAMS = [({"name": "llmd-comprehensive-test"}, "openshift-default", "basic")]
diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py
index 424166ead..87640dc18 100644
--- a/utilities/llmd_utils.py
+++ b/utilities/llmd_utils.py
@@ -1,4 +1,5 @@
 """Utilities for LLM Deployment (LLMD) resources."""
+
 import json
 import re
 import shlex
@@ -42,18 +43,18 @@ def create_gateway(
 ) -> Generator[Gateway, None, None]:
     """
     Context manager to create and manage a Gateway resource using ocp_resources.
-    
+
     Args:
         client: Kubernetes dynamic client
         name: Gateway name
-        namespace: Gateway namespace  
+        namespace: Gateway namespace
         gateway_class_name: The name of the GatewayClass resource
         listeners: List of listener configurations
         infrastructure: Infrastructure configuration
         wait_for_condition: Whether to wait for the gateway to be programmed
         timeout: Timeout in seconds for waiting
         teardown: Whether to clean up the resource
-        
+
     Yields:
         Gateway: The created Gateway resource
     """
@@ -92,9 +93,9 @@ def create_gateway(
             "gatewayClassName": gateway_class_name,
             "listeners": listeners,
             "infrastructure": infrastructure,
-        }
+        },
     }
-    
+
     with Gateway(
         client=client,
         teardown=teardown,
@@ -109,7 +110,7 @@ def create_gateway(
                 timeout=timeout,
             )
             LOGGER.info(f"Gateway {name} is programmed and ready")
-        
+
         yield gateway
 
 
@@ -172,7 +173,7 @@ def create_llmisvc(
     """
     if labels is None:
         labels = {}
-    
+
     if annotations is None:
         annotations = {}
 
@@ -188,58 +189,45 @@ def create_llmisvc(
         raise ValueError("Provide either storage_uri or (storage_key and storage_path) for the model")
 
     if router_config is None:
-        router_config = {
-            "scheduler": {},
-            "route": {},
-            "gateway": {}
-        }
-    
+        router_config = {"scheduler": {}, "route": {}, "gateway": {}}
+
     if container_resources is None:
         raise ValueError("container_resources must be provided for LLMInferenceService")
-    
+
     if container_env is None:
-        container_env = [
-            {
-                "name": "VLLM_LOGGING_LEVEL",
-                "value": "DEBUG"
-            }
-        ]
-    template_config: Dict[str, Any] = {
-        "containers": []
-    }
-    
-    main_container: Dict[str, Any] = {
-        "name": "main"
-    }
-    
+        container_env = [{"name": "VLLM_LOGGING_LEVEL", "value": "DEBUG"}]
+    template_config: Dict[str, Any] = {"containers": []}
+
+    main_container: Dict[str, Any] = {"name": "main"}
+
     if container_image:
         main_container["image"] = container_image
     else:
         raise ValueError("container_image must be provided for LLMInferenceService")
-    
+
     if container_resources:
         main_container["resources"] = container_resources
-    
+
     if container_env:
         main_container["env"] = container_env
-    
+
     if liveness_probe:
         main_container["livenessProbe"] = liveness_probe
-    
+
     if readiness_probe:
         main_container["readinessProbe"] = readiness_probe
-    
+
     if volume_mounts:
         main_container["volumeMounts"] = volume_mounts
-    
+
     template_config["containers"].append(main_container)
-    
+
     if volumes:
         template_config["volumes"] = volumes
-    
+
     if service_account:
         template_config["serviceAccountName"] = service_account
-    
+
     if image_pull_secrets:
         template_config["imagePullSecrets"] = [{"name": secret} for secret in image_pull_secrets]
 
@@ -247,7 +235,7 @@ def create_llmisvc(
         annotations["serving.kserve.io/auth"] = "true"
 
     LOGGER.info(f"Creating LLMInferenceService {name} in namespace {namespace}")
-    
+
     with LLMInferenceService(
         client=client,
         name=name,
@@ -277,18 +265,18 @@ def create_llmisvc(
 def get_llm_inference_url(llm_service: LLMInferenceService) -> str:
     """
     Get the inference URL for an LLMInferenceService.
-    
+
     This function attempts to resolve the URL in the following order:
     1. External URL from service status
     2. Service discovery via labels
     3. Fallback to service name pattern
-    
+
     Args:
         llm_service: The LLMInferenceService resource
-        
+
     Returns:
         str: The inference URL (full URL including protocol and path)
-        
+
     Raises:
         ValueError: If the inference URL cannot be determined
     """
@@ -296,7 +284,7 @@ def get_llm_inference_url(llm_service: LLMInferenceService) -> str:
         url = llm_service.instance.status["url"]
         LOGGER.debug(f"Using external URL for {llm_service.name}: {url}")
         return url
-    
+
     try:
         services = get_services_by_isvc_label(
             client=llm_service.client,
@@ -382,10 +370,10 @@ class LLMUserInference:
     """
     LLM-specific inference handler following the pattern of UserInference.
     """
-    
+
     STREAMING = "streaming"
     INFER = "infer"
-    
+
     def __init__(
         self,
         llm_service: LLMInferenceService,
@@ -398,7 +386,7 @@ def __init__(
         self.inference_type = inference_type
         self.protocol = protocol
         self.runtime_config = self.get_runtime_config()
-        
+
     def get_runtime_config(self) -> Dict[str, Any]:
         """Get runtime config from inference config based on inference type and protocol."""
         if inference_type_config := self.inference_config.get(self.inference_type):
@@ -409,17 +397,17 @@ def get_runtime_config(self) -> Dict[str, Any]:
                 raise ValueError(f"Protocol {protocol} not supported for inference type {self.inference_type}")
         else:
             raise ValueError(f"Inference type {self.inference_type} not supported in config")
-    
+
     @property
     def inference_response_text_key_name(self) -> Optional[str]:
         """Get inference response text key name from runtime config."""
         return self.runtime_config.get("response_fields_map", {}).get("response_output")
-    
+
     @property
     def inference_response_key_name(self) -> str:
         """Get inference response key name from runtime config."""
         return self.runtime_config.get("response_fields_map", {}).get("response", "output")
-    
+
     def get_inference_body(
         self,
         model_name: str,
@@ -429,12 +417,12 @@ def get_inference_body(
         """Get inference body for LLM request."""
         if not use_default_query and inference_input is None:
             raise ValueError("Either pass `inference_input` or set `use_default_query` to True")
-        
+
         if use_default_query:
             default_query_config = self.inference_config.get("default_query_model")
             if not default_query_config:
                 raise ValueError(f"Missing default query config for {model_name}")
-            
+
             if self.inference_config.get("support_multi_default_queries"):
                 query_config = default_query_config.get(self.inference_type)
                 if not query_config:
@@ -442,7 +430,7 @@ def get_inference_body(
                 template_str = query_config.get("query_input", "")
             else:
                 template_str = default_query_config.get("query_input", "")
-            
+
             # Use template substitution for model name
             template = Template(template_str)
             body = template.safe_substitute(model_name=model_name)
@@ -453,13 +441,13 @@ def get_inference_body(
                     "model": model_name,
                     "messages": [{"role": "user", "content": inference_input}],
                     "max_tokens": 100,
-                    "temperature": 0.0
+                    "temperature": 0.0,
                 })
             else:
                 body = json.dumps(inference_input)
-        
+
         return body
-    
+
     def generate_command(
         self,
         model_name: str,
@@ -490,6 +478,7 @@ def generate_command(
         else:
             try:
                 from ocp_resources.resource import get_client  # type: ignore
+
                 client = get_client()
                 ca_bundle = get_ca_bundle(client=client, deployment_mode="raw")
                 if ca_bundle:
@@ -547,10 +536,10 @@ def run_inference_flow(
 def _validate_unauthorized_response(res: Dict[str, Any], token: Optional[str], inference: LLMUserInference) -> None:
     """Validate response for unauthorized users."""
     auth_header = "x-ext-auth-reason"
-    
+
     if auth_reason := re.search(rf"{auth_header}: (.*)", res["output"], re.MULTILINE):
         reason = auth_reason.group(1).lower()
-        
+
         if token:
             assert re.search(r"not (?:authenticated|authorized)", reason)
         else:
@@ -558,10 +547,10 @@ def _validate_unauthorized_response(res: Dict[str, Any], token: Optional[str], i
     else:
         forbidden_patterns = ["Forbidden", "401", "403", "Unauthorized"]
         output = res["output"]
-        
+
         if any(pattern in output for pattern in forbidden_patterns):
             return
-        
+
         raise ValueError(f"Auth header {auth_header} not found in response. Response: {output}")
 
 
@@ -575,16 +564,16 @@ def _validate_authorized_response(
     model_name: str,
 ) -> None:
     """Validate response for authorized users."""
-    
+
     use_regex = False
-    
+
     if use_default_query:
         expected_response_text_config = inference_config.get("default_query_model", {})
         use_regex = expected_response_text_config.get("use_regex", False)
-        
+
         if not expected_response_text_config:
             raise ValueError(f"Missing default_query_model config for inference {inference_config}")
-        
+
         if inference_config.get("support_multi_default_queries"):
             query_config = expected_response_text_config.get(inference_type)
             if not query_config:
@@ -593,17 +582,17 @@ def _validate_authorized_response(
             use_regex = query_config.get("use_regex", False)
         else:
             expected_response_text = expected_response_text_config.get("query_output")
-        
+
         if not expected_response_text:
             raise ValueError(f"Missing response text key for inference {inference_config}")
-        
+
         if isinstance(expected_response_text, str):
             expected_response_text = Template(expected_response_text).safe_substitute(model_name=model_name)
         elif isinstance(expected_response_text, dict):
             expected_response_text = Template(expected_response_text.get("response_output")).safe_substitute(
                 model_name=model_name
             )
-    
+
     if inference.inference_response_text_key_name:
         if inference_type == inference.STREAMING:
             if output := re.findall(
@@ -629,7 +618,7 @@ def _validate_authorized_response(
             response = res[inference.inference_response_key_name]
             if isinstance(response, list):
                 response = response[0]
-            
+
             if isinstance(response, dict):
                 response_text = response[inference.inference_response_text_key_name]
                 assert response_text == expected_response_text, (
@@ -648,22 +637,22 @@ def validate_llm_response_format(response_data: Dict[str, Any]) -> bool:
     """Validate that a response follows the expected LLM inference format."""
     if "choices" not in response_data:
         raise InferenceResponseError("Response missing required field: 'choices'")
-    
+
     if not isinstance(response_data["choices"], list):
         raise InferenceResponseError("'choices' field must be a list")
-    
+
     if not response_data["choices"]:
         raise InferenceResponseError("'choices' list cannot be empty")
-    
+
     choice = response_data["choices"][0]
     if "message" not in choice:
         raise InferenceResponseError("Choice missing 'message' field")
-    
+
     message = choice["message"]
     if "content" not in message:
         raise InferenceResponseError("Message missing 'content' field")
-    
+
     if not isinstance(message["content"], str):
         raise InferenceResponseError("Message content must be a string")
-    
+
     return True

From 16a563bb12572b0d72319f0917667fde604f0408 Mon Sep 17 00:00:00 2001
From: Milind Waykole <mwaykole@redhat.com>
Date: Tue, 16 Sep 2025 12:55:40 +0530
Subject: [PATCH 03/12] fix precommit

Signed-off-by: Milind Waykole <mwaykole@redhat.com>
---
 pytest.ini                                    |  1 +
 .../model_server/llmd/conftest.py             |  1 -
 utilities/llmd_utils.py                       | 22 +++++++++----------
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/pytest.ini b/pytest.ini
index 80c149a5e..85ed14ab2 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -33,6 +33,7 @@ markers =
     gpu: Mark tests which require GPU resources
     multinode: Mark tests which require multiple nodes
     keda: Mark tests which are testing KEDA scaling
+    llmd_cpu: Mark tests which are testing LLMD (LLM Deployment) with CPU resources
 
     # Model Registry:
     custom_namespace: mark tests that are to be run with custom namespace
diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py
index cd2c86773..9130c6b6b 100644
--- a/tests/model_serving/model_server/llmd/conftest.py
+++ b/tests/model_serving/model_server/llmd/conftest.py
@@ -101,7 +101,6 @@ def llmd_inference_service(
 
     if "llmd_gateway" in request.fixturenames:
         request.getfixturevalue("llmd_gateway")
-
     container_resources = kwargs.get(
         "container_resources",
         {
diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py
index 87640dc18..4395b792e 100644
--- a/utilities/llmd_utils.py
+++ b/utilities/llmd_utils.py
@@ -353,7 +353,7 @@ def verify_inference_response_llmd(
     )
 
     if authorized_user is False:
-        _validate_unauthorized_response(res, token, inference)
+        _validate_unauthorized_response(res=res, token=token, inference=inference)
     else:
         _validate_authorized_response(
             res=res,
@@ -432,7 +432,7 @@ def get_inference_body(
                 template_str = default_query_config.get("query_input", "")
 
             # Use template substitution for model name
-            template = Template(template_str)
+            template = Template(template=template_str)
             body = template.safe_substitute(model_name=model_name)
         else:
             # For custom input, create OpenAI-compatible format
@@ -457,7 +457,7 @@ def generate_command(
         token: Optional[str] = None,
     ) -> str:
         """Generate curl command string for LLM inference."""
-        base_url = get_llm_inference_url(self.llm_service)
+        base_url = get_llm_inference_url(llm_service=self.llm_service)
         endpoint_url = f"{base_url}{DEFAULT_LLM_ENDPOINT}"
 
         body = self.get_inference_body(
@@ -477,8 +477,7 @@ def generate_command(
             cmd += " --insecure"
         else:
             try:
-                from ocp_resources.resource import get_client  # type: ignore
-
+                from ocp_resources.resource import get_client
                 client = get_client()
                 ca_bundle = get_ca_bundle(client=client, deployment_mode="raw")
                 if ca_bundle:
@@ -587,12 +586,13 @@ def _validate_authorized_response(
             raise ValueError(f"Missing response text key for inference {inference_config}")
 
         if isinstance(expected_response_text, str):
-            expected_response_text = Template(expected_response_text).safe_substitute(model_name=model_name)
+            expected_response_text = Template(template=expected_response_text).safe_substitute(model_name=model_name)
         elif isinstance(expected_response_text, dict):
-            expected_response_text = Template(expected_response_text.get("response_output")).safe_substitute(
-                model_name=model_name
-            )
-
+            response_output = expected_response_text.get("response_output")
+            if response_output is not None:
+                expected_response_text = Template(template=response_output).safe_substitute(
+                    model_name=model_name
+                )
     if inference.inference_response_text_key_name:
         if inference_type == inference.STREAMING:
             if output := re.findall(
@@ -605,7 +605,7 @@ def _validate_authorized_response(
                 )
         elif inference_type == inference.INFER or use_regex:
             formatted_res = json.dumps(res[inference.inference_response_text_key_name]).replace(" ", "")
-            if use_regex:
+            if use_regex and expected_response_text is not None:
                 assert re.search(expected_response_text, formatted_res), (
                     f"Expected: {expected_response_text} not found in: {formatted_res}"
                 )

From 8453cda0f80fa2b2bee2a1664850c84e58b24954 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 16 Sep 2025 07:31:09 +0000
Subject: [PATCH 04/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 utilities/llmd_utils.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py
index 4395b792e..9d5c94c01 100644
--- a/utilities/llmd_utils.py
+++ b/utilities/llmd_utils.py
@@ -478,6 +478,7 @@ def generate_command(
         else:
             try:
                 from ocp_resources.resource import get_client
+
                 client = get_client()
                 ca_bundle = get_ca_bundle(client=client, deployment_mode="raw")
                 if ca_bundle:
@@ -590,9 +591,7 @@ def _validate_authorized_response(
         elif isinstance(expected_response_text, dict):
             response_output = expected_response_text.get("response_output")
             if response_output is not None:
-                expected_response_text = Template(template=response_output).safe_substitute(
-                    model_name=model_name
-                )
+                expected_response_text = Template(template=response_output).safe_substitute(model_name=model_name)
     if inference.inference_response_text_key_name:
         if inference_type == inference.STREAMING:
             if output := re.findall(

From 41c26e8b6eb995f0881b8c6bb413322df2256af0 Mon Sep 17 00:00:00 2001
From: Milind Waykole <mwaykole@redhat.com>
Date: Tue, 16 Sep 2025 13:01:32 +0530
Subject: [PATCH 05/12] fix precommit

Signed-off-by: Milind Waykole <mwaykole@redhat.com>
---
 tests/model_serving/model_server/llmd/__init__.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/tests/model_serving/model_server/llmd/__init__.py b/tests/model_serving/model_server/llmd/__init__.py
index 3d0e7e903..e69de29bb 100644
--- a/tests/model_serving/model_server/llmd/__init__.py
+++ b/tests/model_serving/model_server/llmd/__init__.py
@@ -1,8 +0,0 @@
-"""
-LLM Deployment (LLMD) tests module.
-
-This module contains tests for LLM deployment functionality including:
-- Gateway resource creation and management
-- LLMInferenceService deployment and configuration
-- Integration testing between gateway and inference services
-"""

From 3a1373595e604d46fffde9b3d2a1abcde3aac171 Mon Sep 17 00:00:00 2001
From: Milind Waykole <mwaykole@redhat.com>
Date: Tue, 16 Sep 2025 20:07:33 +0530
Subject: [PATCH 06/12] fix precommit

Signed-off-by: Milind Waykole <mwaykole@redhat.com>
---
 tests/model_serving/model_server/llmd/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py
index 9130c6b6b..6464f1f6d 100644
--- a/tests/model_serving/model_server/llmd/conftest.py
+++ b/tests/model_serving/model_server/llmd/conftest.py
@@ -100,7 +100,7 @@ def llmd_inference_service(
     service_name = kwargs.get("name", f"llm-{name_suffix}")
 
     if "llmd_gateway" in request.fixturenames:
-        request.getfixturevalue("llmd_gateway")
+        request.getfixturevalue(argname="llmd_gateway")
     container_resources = kwargs.get(
         "container_resources",
         {

From 75978bf602db2d78706c85300fdb7ca39e676578 Mon Sep 17 00:00:00 2001
From: Milind Waykole <mwaykole@redhat.com>
Date: Tue, 16 Sep 2025 20:51:15 +0530
Subject: [PATCH 07/12] fix precommit

Signed-off-by: Milind Waykole <mwaykole@redhat.com>
---
 utilities/llmd_utils.py | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py
index 9d5c94c01..dc04bf09d 100644
--- a/utilities/llmd_utils.py
+++ b/utilities/llmd_utils.py
@@ -630,28 +630,3 @@ def _validate_authorized_response(
                 )
     else:
         raise InferenceResponseError(f"Inference response output not found in response. Response: {res}")
-
-
-def validate_llm_response_format(response_data: Dict[str, Any]) -> bool:
-    """Validate that a response follows the expected LLM inference format."""
-    if "choices" not in response_data:
-        raise InferenceResponseError("Response missing required field: 'choices'")
-
-    if not isinstance(response_data["choices"], list):
-        raise InferenceResponseError("'choices' field must be a list")
-
-    if not response_data["choices"]:
-        raise InferenceResponseError("'choices' list cannot be empty")
-
-    choice = response_data["choices"][0]
-    if "message" not in choice:
-        raise InferenceResponseError("Choice missing 'message' field")
-
-    message = choice["message"]
-    if "content" not in message:
-        raise InferenceResponseError("Message missing 'content' field")
-
-    if not isinstance(message["content"], str):
-        raise InferenceResponseError("Message content must be a string")
-
-    return True

From 2d17c0ccf579c8869f9c25e02032506df4f0b2e2 Mon Sep 17 00:00:00 2001
From: Milind Waykole <mwaykole@redhat.com>
Date: Tue, 23 Sep 2025 18:01:59 +0530
Subject: [PATCH 08/12] fix gateway thing

Signed-off-by: Milind Waykole <mwaykole@redhat.com>
---
 .../model_server/llmd/__init__.py             |  1 +
 .../model_server/llmd/conftest.py             | 10 ++++
 .../model_server/llmd/test_llmd_oci_cpu.py    | 12 +++--
 .../model_server/llmd/test_llmd_s3.py         | 12 +++--
 .../model_serving/model_server/llmd/utils.py  | 46 +++++++++++++++++++
 utilities/llmd_utils.py                       | 34 +++++++++++---
 6 files changed, 103 insertions(+), 12 deletions(-)

diff --git a/tests/model_serving/model_server/llmd/__init__.py b/tests/model_serving/model_server/llmd/__init__.py
index e69de29bb..98274df1a 100644
--- a/tests/model_serving/model_server/llmd/__init__.py
+++ b/tests/model_serving/model_server/llmd/__init__.py
@@ -0,0 +1 @@
+"""LLMD (LLM Deployment) test module for OpenDataHub and OpenShift AI."""
diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py
index 6464f1f6d..2807754e7 100644
--- a/tests/model_serving/model_server/llmd/conftest.py
+++ b/tests/model_serving/model_server/llmd/conftest.py
@@ -67,18 +67,28 @@ def llmd_gateway(
     admin_client: DynamicClient,
     gateway_namespace: str,
 ) -> Generator[Gateway, None, None]:
+    """
+    Pytest fixture for LLMD Gateway management.
+    
+    Implements persistent gateway strategy:
+    - Reuses existing gateways if available
+    - Creates new gateway only if needed
+    - Does not delete gateway in teardown
+    """
     if isinstance(request.param, str):
         gateway_class_name = request.param
         kwargs = {}
     else:
         gateway_class_name = request.param.get("gateway_class_name", "openshift-default")
         kwargs = {k: v for k, v in request.param.items() if k != "gateway_class_name"}
+        
     with create_gateway(
         client=admin_client,
         namespace=gateway_namespace,
         gateway_class_name=gateway_class_name,
         wait_for_condition=True,
         timeout=Timeout.TIMEOUT_5MIN,
+        teardown=False,  # Don't delete gateway in teardown
         **kwargs,
     ) as gateway:
         yield gateway
diff --git a/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py
index 031ce6e48..adc5e64c2 100644
--- a/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py
+++ b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py
@@ -1,9 +1,12 @@
 import pytest
 
-from tests.model_serving.model_server.llmd.utils import verify_llm_service_status, verify_gateway_status
+from tests.model_serving.model_server.llmd.utils import (
+    verify_llm_service_status,
+    verify_gateway_status,
+    verify_llmd_pods_not_restarted,
+)
 from utilities.constants import Protocols
 from utilities.llmd_utils import verify_inference_response_llmd
-
 from utilities.llmd_constants import BASIC_LLMD_PARAMS
 from utilities.manifests.opt125m_cpu import OPT125M_CPU_INFERENCE_CONFIG
 
@@ -20,7 +23,8 @@
 class TestLLMDOCICPUInference:
     """LLMD inference testing with OCI storage and CPU runtime using vLLM."""
 
-    def test_llmd_oci(self, llmd_gateway, llmd_inference_service):
+    def test_llmd_oci(self, admin_client, llmd_gateway, llmd_inference_service):
+        """Test LLMD inference with OCI storage and CPU runtime."""
         assert verify_gateway_status(llmd_gateway), "Gateway should be ready"
         assert verify_llm_service_status(llmd_inference_service), "LLMInferenceService should be ready"
 
@@ -32,3 +36,5 @@ def test_llmd_oci(self, llmd_gateway, llmd_inference_service):
             use_default_query=True,
             insecure=True,
         )
+        
+        verify_llmd_pods_not_restarted(client=admin_client, llm_service=llmd_inference_service)
diff --git a/tests/model_serving/model_server/llmd/test_llmd_s3.py b/tests/model_serving/model_server/llmd/test_llmd_s3.py
index 03c899431..0e2d2ecb2 100644
--- a/tests/model_serving/model_server/llmd/test_llmd_s3.py
+++ b/tests/model_serving/model_server/llmd/test_llmd_s3.py
@@ -1,9 +1,12 @@
 import pytest
 
-from tests.model_serving.model_server.llmd.utils import verify_llm_service_status, verify_gateway_status
+from tests.model_serving.model_server.llmd.utils import (
+    verify_llm_service_status,
+    verify_gateway_status,
+    verify_llmd_pods_not_restarted,
+)
 from utilities.constants import Protocols
 from utilities.llmd_utils import verify_inference_response_llmd
-
 from utilities.manifests.opt125m_cpu import OPT125M_CPU_INFERENCE_CONFIG
 
 pytestmark = [
@@ -20,7 +23,8 @@
 class TestLLMDS3Inference:
     """LLMD inference testing with S3 storage."""
 
-    def test_llmd_s3(self, llmd_gateway, llmd_inference_service_s3):
+    def test_llmd_s3(self, admin_client, llmd_gateway, llmd_inference_service_s3):
+        """Test LLMD inference with S3 storage."""
         assert verify_gateway_status(llmd_gateway), "Gateway should be ready"
         assert verify_llm_service_status(llmd_inference_service_s3), "LLMInferenceService should be ready"
 
@@ -32,3 +36,5 @@ def test_llmd_s3(self, llmd_gateway, llmd_inference_service_s3):
             use_default_query=True,
             insecure=True,
         )
+        
+        verify_llmd_pods_not_restarted(client=admin_client, llm_service=llmd_inference_service_s3)
diff --git a/tests/model_serving/model_server/llmd/utils.py b/tests/model_serving/model_server/llmd/utils.py
index b80fcff7c..8e9929c18 100644
--- a/tests/model_serving/model_server/llmd/utils.py
+++ b/tests/model_serving/model_server/llmd/utils.py
@@ -5,10 +5,14 @@
 Follows the established model server utils pattern for consistency.
 """
 
+from kubernetes.dynamic import DynamicClient
 from ocp_resources.gateway import Gateway
 from ocp_resources.llm_inference_service import LLMInferenceService
+from ocp_resources.pod import Pod
 from simple_logger.logger import get_logger
 
+from utilities.exceptions import PodContainersRestartError
+
 
 LOGGER = get_logger(name=__name__)
 
@@ -59,3 +63,45 @@ def verify_llm_service_status(llm_service: LLMInferenceService) -> bool:
 
     LOGGER.warning(f"LLMInferenceService {llm_service.name} is not in Ready state")
     return False
+
+
+def verify_llmd_pods_not_restarted(client: DynamicClient, llm_service: LLMInferenceService) -> None:
+    """
+    Verify that LLMD inference pods containers have not restarted.
+    
+    This function checks for container restarts in pods related to the specific LLMInferenceService.
+    
+    Args:
+        client (DynamicClient): DynamicClient instance
+        llm_service (LLMInferenceService): The LLMInferenceService to check pods for
+        
+    Raises:
+        PodContainersRestartError: If any containers in LLMD pods have restarted
+    """
+    LOGGER.info(f"Verifying that pods for LLMInferenceService {llm_service.name} have not restarted")
+    
+    restarted_containers = {}
+    
+    for pod in Pod.get(
+        dyn_client=client,
+        namespace=llm_service.namespace,
+        label_selector=f"{Pod.ApiGroup.APP_KUBERNETES_IO}/part-of=llminferenceservice,{Pod.ApiGroup.APP_KUBERNETES_IO}/name={llm_service.name}",
+    ):
+        labels = pod.instance.metadata.get("labels", {})
+        if labels.get("kserve.io/component") == "workload":
+            LOGGER.debug(f"Checking pod {pod.name} for container restarts")
+            
+            if pod.instance.status.containerStatuses:
+                if _restarted_containers := [
+                    container.name for container in pod.instance.status.containerStatuses 
+                    if container.restartCount > 0
+                ]:
+                    restarted_containers[pod.name] = _restarted_containers
+                    LOGGER.warning(f"Pod {pod.name} has restarted containers: {_restarted_containers}")
+    
+    if restarted_containers:
+        error_msg = f"LLMD inference containers restarted for {llm_service.name}: {restarted_containers}"
+        LOGGER.error(error_msg)
+        raise PodContainersRestartError(error_msg)
+    
+    LOGGER.info(f"All pods for LLMInferenceService {llm_service.name} have no container restarts")
diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py
index dc04bf09d..55a044a1f 100644
--- a/utilities/llmd_utils.py
+++ b/utilities/llmd_utils.py
@@ -39,10 +39,15 @@ def create_gateway(
     infrastructure: Optional[Dict[str, Any]] = None,
     wait_for_condition: bool = True,
     timeout: int = 300,
-    teardown: bool = True,
+    teardown: bool = False,
 ) -> Generator[Gateway, None, None]:
     """
     Context manager to create and manage a Gateway resource using ocp_resources.
+    
+    This function implements smart gateway management:
+    - Only creates gateway if it doesn't already exist
+    - Reuses existing gateways to avoid conflicts
+    - Does not delete gateway in teardown (persistent gateway strategy)
 
     Args:
         client: Kubernetes dynamic client
@@ -53,10 +58,10 @@ def create_gateway(
         infrastructure: Infrastructure configuration
         wait_for_condition: Whether to wait for the gateway to be programmed
         timeout: Timeout in seconds for waiting
-        teardown: Whether to clean up the resource
+        teardown: Whether to clean up the resource (default: False for persistent strategy)
 
     Yields:
-        Gateway: The created Gateway resource
+        Gateway: The Gateway resource (existing or newly created)
     """
     if listeners is None:
         listeners = [
@@ -70,6 +75,8 @@ def create_gateway(
 
     if infrastructure is None:
         infrastructure = {"labels": {"serving.kserve.io/gateway": "kserve-ingress-gateway"}}
+
+    # Check if gateway already exists
     try:
         existing_gateway = Gateway(
             client=client,
@@ -78,10 +85,24 @@ def create_gateway(
             api_group="gateway.networking.k8s.io",
         )
         if existing_gateway.exists:
-            LOGGER.info(f"Cleaning up existing Gateway {name} in namespace {namespace}")
-            existing_gateway.delete(wait=True, timeout=Timeout.TIMEOUT_2MIN)
+            LOGGER.info(f"Using existing Gateway {name} in namespace {namespace}")
+            
+            if wait_for_condition:
+                LOGGER.info(f"Waiting for existing Gateway {name} to be programmed...")
+                existing_gateway.wait_for_condition(
+                    condition="Programmed",
+                    status="True",
+                    timeout=timeout,
+                )
+                LOGGER.info(f"Existing Gateway {name} is programmed and ready")
+            
+            # Yield the existing gateway without teardown
+            yield existing_gateway
+            return
     except Exception as e:
-        LOGGER.debug(f"No existing Gateway to clean up: {e}")
+        LOGGER.debug(f"No existing Gateway found, will create new one: {e}")
+
+    # Create new gateway only if it doesn't exist
     gateway_body = {
         "apiVersion": "gateway.networking.k8s.io/v1",
         "kind": "Gateway",
@@ -96,6 +117,7 @@ def create_gateway(
         },
     }
 
+    LOGGER.info(f"Creating new Gateway {name} in namespace {namespace}")
     with Gateway(
         client=client,
         teardown=teardown,

From 9c72e80d441e19562a88422f05574fe200e1405e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 23 Sep 2025 12:33:50 +0000
Subject: [PATCH 09/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../model_server/llmd/conftest.py             |  4 ++--
 .../model_server/llmd/test_llmd_oci_cpu.py    |  2 +-
 .../model_server/llmd/test_llmd_s3.py         |  2 +-
 .../model_serving/model_server/llmd/utils.py  | 19 +++++++++----------
 utilities/llmd_utils.py                       |  6 +++---
 5 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py
index 2807754e7..e8cf4637a 100644
--- a/tests/model_serving/model_server/llmd/conftest.py
+++ b/tests/model_serving/model_server/llmd/conftest.py
@@ -69,7 +69,7 @@ def llmd_gateway(
 ) -> Generator[Gateway, None, None]:
     """
     Pytest fixture for LLMD Gateway management.
-    
+
     Implements persistent gateway strategy:
     - Reuses existing gateways if available
     - Creates new gateway only if needed
@@ -81,7 +81,7 @@ def llmd_gateway(
     else:
         gateway_class_name = request.param.get("gateway_class_name", "openshift-default")
         kwargs = {k: v for k, v in request.param.items() if k != "gateway_class_name"}
-        
+
     with create_gateway(
         client=admin_client,
         namespace=gateway_namespace,
diff --git a/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py
index adc5e64c2..57481b29f 100644
--- a/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py
+++ b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py
@@ -36,5 +36,5 @@ def test_llmd_oci(self, admin_client, llmd_gateway, llmd_inference_service):
             use_default_query=True,
             insecure=True,
         )
-        
+
         verify_llmd_pods_not_restarted(client=admin_client, llm_service=llmd_inference_service)
diff --git a/tests/model_serving/model_server/llmd/test_llmd_s3.py b/tests/model_serving/model_server/llmd/test_llmd_s3.py
index 0e2d2ecb2..ec09e3257 100644
--- a/tests/model_serving/model_server/llmd/test_llmd_s3.py
+++ b/tests/model_serving/model_server/llmd/test_llmd_s3.py
@@ -36,5 +36,5 @@ def test_llmd_s3(self, admin_client, llmd_gateway, llmd_inference_service_s3):
             use_default_query=True,
             insecure=True,
         )
-        
+
         verify_llmd_pods_not_restarted(client=admin_client, llm_service=llmd_inference_service_s3)
diff --git a/tests/model_serving/model_server/llmd/utils.py b/tests/model_serving/model_server/llmd/utils.py
index 8e9929c18..dbf5d49f6 100644
--- a/tests/model_serving/model_server/llmd/utils.py
+++ b/tests/model_serving/model_server/llmd/utils.py
@@ -68,20 +68,20 @@ def verify_llm_service_status(llm_service: LLMInferenceService) -> bool:
 def verify_llmd_pods_not_restarted(client: DynamicClient, llm_service: LLMInferenceService) -> None:
     """
     Verify that LLMD inference pods containers have not restarted.
-    
+
     This function checks for container restarts in pods related to the specific LLMInferenceService.
-    
+
     Args:
         client (DynamicClient): DynamicClient instance
         llm_service (LLMInferenceService): The LLMInferenceService to check pods for
-        
+
     Raises:
         PodContainersRestartError: If any containers in LLMD pods have restarted
     """
     LOGGER.info(f"Verifying that pods for LLMInferenceService {llm_service.name} have not restarted")
-    
+
     restarted_containers = {}
-    
+
     for pod in Pod.get(
         dyn_client=client,
         namespace=llm_service.namespace,
@@ -90,18 +90,17 @@ def verify_llmd_pods_not_restarted(client: DynamicClient, llm_service: LLMInfere
         labels = pod.instance.metadata.get("labels", {})
         if labels.get("kserve.io/component") == "workload":
             LOGGER.debug(f"Checking pod {pod.name} for container restarts")
-            
+
             if pod.instance.status.containerStatuses:
                 if _restarted_containers := [
-                    container.name for container in pod.instance.status.containerStatuses 
-                    if container.restartCount > 0
+                    container.name for container in pod.instance.status.containerStatuses if container.restartCount > 0
                 ]:
                     restarted_containers[pod.name] = _restarted_containers
                     LOGGER.warning(f"Pod {pod.name} has restarted containers: {_restarted_containers}")
-    
+
     if restarted_containers:
         error_msg = f"LLMD inference containers restarted for {llm_service.name}: {restarted_containers}"
         LOGGER.error(error_msg)
         raise PodContainersRestartError(error_msg)
-    
+
     LOGGER.info(f"All pods for LLMInferenceService {llm_service.name} have no container restarts")
diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py
index 55a044a1f..19912a09a 100644
--- a/utilities/llmd_utils.py
+++ b/utilities/llmd_utils.py
@@ -43,7 +43,7 @@ def create_gateway(
 ) -> Generator[Gateway, None, None]:
     """
     Context manager to create and manage a Gateway resource using ocp_resources.
-    
+
     This function implements smart gateway management:
     - Only creates gateway if it doesn't already exist
     - Reuses existing gateways to avoid conflicts
@@ -86,7 +86,7 @@ def create_gateway(
         )
         if existing_gateway.exists:
             LOGGER.info(f"Using existing Gateway {name} in namespace {namespace}")
-            
+
             if wait_for_condition:
                 LOGGER.info(f"Waiting for existing Gateway {name} to be programmed...")
                 existing_gateway.wait_for_condition(
@@ -95,7 +95,7 @@ def create_gateway(
                     timeout=timeout,
                 )
                 LOGGER.info(f"Existing Gateway {name} is programmed and ready")
-            
+
             # Yield the existing gateway without teardown
             yield existing_gateway
             return

From 4f3035642e672a6e8694928ccbbe65e0e2d5d1ad Mon Sep 17 00:00:00 2001
From: Milind Waykole <mwaykole@redhat.com>
Date: Tue, 23 Sep 2025 18:10:56 +0530
Subject: [PATCH 10/12] fix gateway thing

Signed-off-by: Milind Waykole <mwaykole@redhat.com>
---
 tests/model_serving/model_server/llmd/conftest.py | 15 ++++++++-------
 utilities/llmd_utils.py                           | 13 +++++++------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py
index e8cf4637a..ca465b1e0 100644
--- a/tests/model_serving/model_server/llmd/conftest.py
+++ b/tests/model_serving/model_server/llmd/conftest.py
@@ -11,7 +11,7 @@
 
 from utilities.constants import Timeout
 from utilities.infra import s3_endpoint_secret
-from utilities.llmd_utils import create_gateway, create_llmisvc
+from utilities.llmd_utils import create_llmd_gateway, create_llmisvc
 from utilities.llmd_constants import (
     DEFAULT_GATEWAY_NAMESPACE,
     VLLM_STORAGE_OCI,
@@ -68,12 +68,13 @@ def llmd_gateway(
     gateway_namespace: str,
 ) -> Generator[Gateway, None, None]:
     """
-    Pytest fixture for LLMD Gateway management.
-
-    Implements persistent gateway strategy:
-    - Reuses existing gateways if available
+    Pytest fixture for LLMD Gateway management using create_llmd_gateway.
+    
+    Implements persistent LLMD gateway strategy:
+    - Reuses existing gateways if available  
     - Creates new gateway only if needed
     - Does not delete gateway in teardown
+    - Uses LLMD-specific gateway configuration
     """
     if isinstance(request.param, str):
         gateway_class_name = request.param
@@ -81,8 +82,8 @@ def llmd_gateway(
     else:
         gateway_class_name = request.param.get("gateway_class_name", "openshift-default")
         kwargs = {k: v for k, v in request.param.items() if k != "gateway_class_name"}
-
-    with create_gateway(
+        
+    with create_llmd_gateway(
         client=admin_client,
         namespace=gateway_namespace,
         gateway_class_name=gateway_class_name,
diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py
index 19912a09a..565205d31 100644
--- a/utilities/llmd_utils.py
+++ b/utilities/llmd_utils.py
@@ -30,7 +30,7 @@
 
 
 @contextmanager
-def create_gateway(
+def create_llmd_gateway(
     client: DynamicClient,
     name: str = DEFAULT_GATEWAY_NAME,
     namespace: str = DEFAULT_GATEWAY_NAMESPACE,
@@ -42,17 +42,18 @@ def create_gateway(
     teardown: bool = False,
 ) -> Generator[Gateway, None, None]:
     """
-    Context manager to create and manage a Gateway resource using ocp_resources.
-
-    This function implements smart gateway management:
+    Context manager to create and manage LLMD Gateway resources using ocp_resources.
+    
+    This function implements smart LLMD gateway management:
     - Only creates gateway if it doesn't already exist
     - Reuses existing gateways to avoid conflicts
     - Does not delete gateway in teardown (persistent gateway strategy)
+    - Specifically designed for LLMD (LLM Deployment) infrastructure
 
     Args:
         client: Kubernetes dynamic client
-        name: Gateway name
-        namespace: Gateway namespace
+        name: Gateway name (defaults to openshift-ai-inference)
+        namespace: Gateway namespace (defaults to openshift-ingress)
         gateway_class_name: The name of the GatewayClass resource
         listeners: List of listener configurations
         infrastructure: Infrastructure configuration

From 76624a8aef2de6108772e5afd6f68f05ea8dd345 Mon Sep 17 00:00:00 2001
From: Milind Waykole <mwaykole@redhat.com>
Date: Tue, 23 Sep 2025 18:19:26 +0530
Subject: [PATCH 11/12] fix: break long line in utils.py to meet flake8 line
 length requirement

---
 tests/model_serving/model_server/llmd/utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/model_serving/model_server/llmd/utils.py b/tests/model_serving/model_server/llmd/utils.py
index dbf5d49f6..337872645 100644
--- a/tests/model_serving/model_server/llmd/utils.py
+++ b/tests/model_serving/model_server/llmd/utils.py
@@ -85,7 +85,10 @@ def verify_llmd_pods_not_restarted(client: DynamicClient, llm_service: LLMInfere
     for pod in Pod.get(
         dyn_client=client,
         namespace=llm_service.namespace,
-        label_selector=f"{Pod.ApiGroup.APP_KUBERNETES_IO}/part-of=llminferenceservice,{Pod.ApiGroup.APP_KUBERNETES_IO}/name={llm_service.name}",
+        label_selector=(
+            f"{Pod.ApiGroup.APP_KUBERNETES_IO}/part-of=llminferenceservice,"
+            f"{Pod.ApiGroup.APP_KUBERNETES_IO}/name={llm_service.name}"
+        ),
     ):
         labels = pod.instance.metadata.get("labels", {})
         if labels.get("kserve.io/component") == "workload":

From e8bdd8dd512affcfbde6d15effa73b79939b4428 Mon Sep 17 00:00:00 2001
From: Milind Waykole <mwaykole@redhat.com>
Date: Tue, 23 Sep 2025 18:20:38 +0530
Subject: [PATCH 12/12] fix gateway thing

Signed-off-by: Milind Waykole <mwaykole@redhat.com>
---
 tests/model_serving/model_server/llmd/conftest.py | 6 +++---
 utilities/llmd_utils.py                           | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py
index ca465b1e0..05ac74284 100644
--- a/tests/model_serving/model_server/llmd/conftest.py
+++ b/tests/model_serving/model_server/llmd/conftest.py
@@ -69,9 +69,9 @@ def llmd_gateway(
 ) -> Generator[Gateway, None, None]:
     """
     Pytest fixture for LLMD Gateway management using create_llmd_gateway.
-    
+
     Implements persistent LLMD gateway strategy:
-    - Reuses existing gateways if available  
+    - Reuses existing gateways if available
     - Creates new gateway only if needed
     - Does not delete gateway in teardown
     - Uses LLMD-specific gateway configuration
@@ -82,7 +82,7 @@ def llmd_gateway(
     else:
         gateway_class_name = request.param.get("gateway_class_name", "openshift-default")
         kwargs = {k: v for k, v in request.param.items() if k != "gateway_class_name"}
-        
+
     with create_llmd_gateway(
         client=admin_client,
         namespace=gateway_namespace,
diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py
index 565205d31..62db07bd7 100644
--- a/utilities/llmd_utils.py
+++ b/utilities/llmd_utils.py
@@ -43,7 +43,7 @@ def create_llmd_gateway(
 ) -> Generator[Gateway, None, None]:
     """
     Context manager to create and manage LLMD Gateway resources using ocp_resources.
-    
+
     This function implements smart LLMD gateway management:
     - Only creates gateway if it doesn't already exist
     - Reuses existing gateways to avoid conflicts