From f0324805764226777478cd6bdb7b52fffc251299 Mon Sep 17 00:00:00 2001 From: Milind Waykole Date: Mon, 15 Sep 2025 22:21:27 +0530 Subject: [PATCH 01/12] Add LLMD model server tests and utilities --- .../model_server/llmd/__init__.py | 8 + .../model_server/llmd/conftest.py | 173 +++++ .../model_server/llmd/test_llmd_oci_cpu.py | 34 + .../model_server/llmd/test_llmd_s3.py | 37 + .../model_serving/model_server/llmd/utils.py | 62 ++ utilities/llmd_constants.py | 32 + utilities/llmd_utils.py | 669 ++++++++++++++++++ utilities/manifests/opt125m_cpu.py | 27 + 8 files changed, 1042 insertions(+) create mode 100644 tests/model_serving/model_server/llmd/__init__.py create mode 100644 tests/model_serving/model_server/llmd/conftest.py create mode 100644 tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py create mode 100644 tests/model_serving/model_server/llmd/test_llmd_s3.py create mode 100644 tests/model_serving/model_server/llmd/utils.py create mode 100644 utilities/llmd_constants.py create mode 100644 utilities/llmd_utils.py create mode 100644 utilities/manifests/opt125m_cpu.py diff --git a/tests/model_serving/model_server/llmd/__init__.py b/tests/model_serving/model_server/llmd/__init__.py new file mode 100644 index 000000000..3d0e7e903 --- /dev/null +++ b/tests/model_serving/model_server/llmd/__init__.py @@ -0,0 +1,8 @@ +""" +LLM Deployment (LLMD) tests module. + +This module contains tests for LLM deployment functionality including: +- Gateway resource creation and management +- LLMInferenceService deployment and configuration +- Integration testing between gateway and inference services +""" diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py new file mode 100644 index 000000000..af8c5594c --- /dev/null +++ b/tests/model_serving/model_server/llmd/conftest.py @@ -0,0 +1,173 @@ +from typing import Generator + +import pytest +from _pytest.fixtures import FixtureRequest +from kubernetes.dynamic import DynamicClient +from ocp_resources.gateway import Gateway +from ocp_resources.llm_inference_service import LLMInferenceService +from ocp_resources.namespace import Namespace +from ocp_resources.secret import Secret +from ocp_resources.service_account import ServiceAccount + +from utilities.constants import Timeout +from utilities.infra import s3_endpoint_secret +from utilities.llmd_utils import create_gateway, create_llmisvc +from utilities.llmd_constants import ( + DEFAULT_GATEWAY_NAMESPACE, + VLLM_STORAGE_OCI, + VLLM_CPU_IMAGE, + DEFAULT_S3_STORAGE_PATH, +) + + +@pytest.fixture(scope="class") +def gateway_namespace(admin_client: DynamicClient) -> str: + return DEFAULT_GATEWAY_NAMESPACE + + +@pytest.fixture(scope="class") +def llmd_s3_secret( + admin_client: DynamicClient, + unprivileged_model_namespace: Namespace, + aws_access_key_id: str, + aws_secret_access_key: str, + models_s3_bucket_name: str, + models_s3_bucket_region: str, + models_s3_bucket_endpoint: str, +) -> Generator[Secret, None, None]: + with s3_endpoint_secret( + client=admin_client, + name="llmd-s3-secret", + namespace=unprivileged_model_namespace.name, + aws_access_key=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + aws_s3_region=models_s3_bucket_region, + aws_s3_bucket=models_s3_bucket_name, + aws_s3_endpoint=models_s3_bucket_endpoint, + ) as secret: + yield secret + + +@pytest.fixture(scope="class") +def llmd_s3_service_account( + admin_client: DynamicClient, + llmd_s3_secret: Secret +) -> Generator[ServiceAccount, None, None]: + with ServiceAccount( + client=admin_client, + namespace=llmd_s3_secret.namespace, + name="llmd-s3-service-account", + secrets=[{"name": llmd_s3_secret.name}], + ) as sa: + yield sa + + +@pytest.fixture(scope="class") +def llmd_gateway( + request: FixtureRequest, + admin_client: DynamicClient, + gateway_namespace: str, +) -> Generator[Gateway, None, None]: + if isinstance(request.param, str): + gateway_class_name = request.param + kwargs = {} + else: + gateway_class_name = request.param.get("gateway_class_name", "openshift-default") + kwargs = {k: v for k, v in request.param.items() if k != "gateway_class_name"} + with create_gateway( + client=admin_client, + namespace=gateway_namespace, + gateway_class_name=gateway_class_name, + wait_for_condition=True, + timeout=Timeout.TIMEOUT_5MIN, + **kwargs + ) as gateway: + yield gateway + + +@pytest.fixture(scope="class") +def llmd_inference_service( + request: FixtureRequest, + admin_client: DynamicClient, + unprivileged_model_namespace: Namespace, +) -> Generator[LLMInferenceService, None, None]: + if isinstance(request.param, str): + name_suffix = request.param + kwargs = {} + else: + name_suffix = request.param.get("name_suffix", "basic") + kwargs = {k: v for k, v in request.param.items() if k != "name_suffix"} + + service_name = kwargs.get("name", f"llm-{name_suffix}") + + if "llmd_gateway" in request.fixturenames: + request.getfixturevalue("llmd_gateway") + + container_resources = kwargs.get( + "container_resources", + { + "limits": {"cpu": "1", "memory": "10Gi"}, + "requests": {"cpu": "100m", "memory": "8Gi"}, + }, + ) + + with create_llmisvc( + client=admin_client, + name=service_name, + namespace=unprivileged_model_namespace.name, + storage_uri=kwargs.get("storage_uri", VLLM_STORAGE_OCI), + container_image=kwargs.get("container_image", VLLM_CPU_IMAGE), + container_resources=container_resources, + wait=True, + timeout=Timeout.TIMEOUT_15MIN, + **{k: v for k, v in kwargs.items() if k != "name"} + ) as llm_service: + yield llm_service + + +@pytest.fixture(scope="class") +def llmd_inference_service_s3( + request: FixtureRequest, + admin_client: DynamicClient, + unprivileged_model_namespace: Namespace, + llmd_s3_secret: Secret, + llmd_s3_service_account: ServiceAccount, +) -> Generator[LLMInferenceService, None, None]: + if isinstance(request.param, str): + name_suffix = request.param + kwargs = {"storage_path": DEFAULT_S3_STORAGE_PATH} + else: + name_suffix = request.param.get("name_suffix", "s3") + kwargs = {k: v for k, v in request.param.items() if k != "name_suffix"} + + service_name = kwargs.get("name", f"llm-{name_suffix}") + + if "storage_key" not in kwargs: + kwargs["storage_key"] = llmd_s3_secret.name + + if "storage_path" not in kwargs: + kwargs["storage_path"] = DEFAULT_S3_STORAGE_PATH + + container_resources = kwargs.get( + "container_resources", + { + "limits": {"cpu": "1", "memory": "10Gi"}, + "requests": {"cpu": "100m", "memory": "8Gi"}, + }, + ) + + with create_llmisvc( + client=admin_client, + name=service_name, + namespace=unprivileged_model_namespace.name, + storage_key=kwargs.get("storage_key"), + storage_path=kwargs.get("storage_path"), + container_image=kwargs.get("container_image", VLLM_CPU_IMAGE), + container_resources=container_resources, + service_account=llmd_s3_service_account.name, + wait=True, + timeout=Timeout.TIMEOUT_15MIN, + **{k: v for k, v in kwargs.items() if k not in ["name", "storage_key", "storage_path", "container_image", "container_resources"]} + ) as llm_service: + yield llm_service + diff --git a/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py new file mode 100644 index 000000000..e4d851e1c --- /dev/null +++ b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py @@ -0,0 +1,34 @@ +import pytest + +from tests.model_serving.model_server.llmd.utils import verify_llm_service_status, verify_gateway_status +from utilities.constants import Protocols +from utilities.llmd_utils import verify_inference_response_llmd + +from utilities.llmd_constants import BASIC_LLMD_PARAMS +from utilities.manifests.opt125m_cpu import OPT125M_CPU_INFERENCE_CONFIG + +pytestmark = [ + pytest.mark.llmd_cpu, +] + + +@pytest.mark.parametrize( + "unprivileged_model_namespace, llmd_gateway, llmd_inference_service", + BASIC_LLMD_PARAMS, + indirect=True, +) +class TestLLMDOCICPUInference: + """LLMD inference testing with OCI storage and CPU runtime using vLLM.""" + + def test_llmd_oci(self, llmd_gateway, llmd_inference_service): + assert verify_gateway_status(llmd_gateway), "Gateway should be ready" + assert verify_llm_service_status(llmd_inference_service), "LLMInferenceService should be ready" + + verify_inference_response_llmd( + llm_service=llmd_inference_service, + inference_config=OPT125M_CPU_INFERENCE_CONFIG, + inference_type="chat_completions", + protocol=Protocols.HTTP, + use_default_query=True, + insecure=True, + ) diff --git a/tests/model_serving/model_server/llmd/test_llmd_s3.py b/tests/model_serving/model_server/llmd/test_llmd_s3.py new file mode 100644 index 000000000..0774b0b46 --- /dev/null +++ b/tests/model_serving/model_server/llmd/test_llmd_s3.py @@ -0,0 +1,37 @@ +import pytest + +from tests.model_serving.model_server.llmd.utils import verify_llm_service_status, verify_gateway_status +from utilities.constants import Protocols +from utilities.llmd_utils import verify_inference_response_llmd + +from utilities.manifests.opt125m_cpu import OPT125M_CPU_INFERENCE_CONFIG + +pytestmark = [ + pytest.mark.llmd_cpu, +] + + +@pytest.mark.parametrize( + "unprivileged_model_namespace, llmd_gateway, llmd_inference_service_s3", + [ + ({"name": "llmd-s3-test"}, "openshift-default", {"storage_path": "opt-125m/"}) + ], + indirect=True, +) +@pytest.mark.usefixtures("valid_aws_config") +class TestLLMDS3Inference: + """LLMD inference testing with S3 storage.""" + + def test_llmd_s3(self, llmd_gateway, llmd_inference_service_s3): + assert verify_gateway_status(llmd_gateway), "Gateway should be ready" + assert verify_llm_service_status(llmd_inference_service_s3), "LLMInferenceService should be ready" + + verify_inference_response_llmd( + llm_service=llmd_inference_service_s3, + inference_config=OPT125M_CPU_INFERENCE_CONFIG, + inference_type="chat_completions", + protocol=Protocols.HTTP, + use_default_query=True, + insecure=True, + ) + diff --git a/tests/model_serving/model_server/llmd/utils.py b/tests/model_serving/model_server/llmd/utils.py new file mode 100644 index 000000000..0a1c591ff --- /dev/null +++ b/tests/model_serving/model_server/llmd/utils.py @@ -0,0 +1,62 @@ +""" +Utility functions for LLM Deployment (LLMD) tests. + +This module provides helper functions for LLMD test operations using ocp_resources. +Follows the established model server utils pattern for consistency. +""" + +from ocp_resources.gateway import Gateway +from ocp_resources.llm_inference_service import LLMInferenceService +from simple_logger.logger import get_logger + + +LOGGER = get_logger(name=__name__) + + +def verify_gateway_status(gateway: Gateway) -> bool: + """ + Verify that a Gateway is properly configured and programmed. + + Args: + gateway (Gateway): The Gateway resource to verify + + Returns: + bool: True if gateway is properly configured, False otherwise + """ + if not gateway.exists: + LOGGER.warning(f"Gateway {gateway.name} does not exist") + return False + + conditions = gateway.instance.status.get("conditions", []) + for condition in conditions: + if condition["type"] == "Programmed" and condition["status"] == "True": + LOGGER.info(f"Gateway {gateway.name} is programmed and ready") + return True + + LOGGER.warning(f"Gateway {gateway.name} is not in Programmed state") + return False + + +def verify_llm_service_status(llm_service: LLMInferenceService) -> bool: + """ + Verify that an LLMInferenceService is properly configured and ready. + + Args: + llm_service (LLMInferenceService): The LLMInferenceService resource to verify + + Returns: + bool: True if service is properly configured, False otherwise + """ + if not llm_service.exists: + LOGGER.warning(f"LLMInferenceService {llm_service.name} does not exist") + return False + + conditions = llm_service.instance.status.get("conditions", []) + for condition in conditions: + if condition["type"] == "Ready" and condition["status"] == "True": + LOGGER.info(f"LLMInferenceService {llm_service.name} is ready") + return True + + LOGGER.warning(f"LLMInferenceService {llm_service.name} is not in Ready state") + return False + diff --git a/utilities/llmd_constants.py b/utilities/llmd_constants.py new file mode 100644 index 000000000..74546814b --- /dev/null +++ b/utilities/llmd_constants.py @@ -0,0 +1,32 @@ +"""Centralized constants for LLMD (LLM Deployment) utilities and tests.""" + +from utilities.constants import Timeout + +DEFAULT_GATEWAY_NAME = "openshift-ai-inference" +DEFAULT_GATEWAY_NAMESPACE = "openshift-ingress" +OPENSHIFT_DEFAULT_GATEWAY_CLASS = "openshift-default" + +KSERVE_GATEWAY_LABEL = "serving.kserve.io/gateway" +KSERVE_INGRESS_GATEWAY = "kserve-ingress-gateway" + +DEFAULT_LLM_ENDPOINT = "/v1/chat/completions" +DEFAULT_MAX_TOKENS = 50 +DEFAULT_TEMPERATURE = 0.0 +DEFAULT_TIMEOUT = Timeout.TIMEOUT_30SEC + +VLLM_STORAGE_OCI = "oci://quay.io/mwaykole/test:opt-125m" +VLLM_CPU_IMAGE = "quay.io/pierdipi/vllm-cpu:latest" +DEFAULT_LLMD_REPLICAS = 1 +DEFAULT_S3_STORAGE_PATH = "opt-125m" + +DEFAULT_STORAGE_URI = VLLM_STORAGE_OCI +DEFAULT_CONTAINER_IMAGE = VLLM_CPU_IMAGE + +DEFAULT_CPU_LIMIT = "1" +DEFAULT_MEMORY_LIMIT = "10Gi" +DEFAULT_CPU_REQUEST = "100m" +DEFAULT_MEMORY_REQUEST = "8Gi" + +BASIC_LLMD_PARAMS = [ + ({"name": "llmd-comprehensive-test"}, "openshift-default", "basic") +] diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py new file mode 100644 index 000000000..424166ead --- /dev/null +++ b/utilities/llmd_utils.py @@ -0,0 +1,669 @@ +"""Utilities for LLM Deployment (LLMD) resources.""" +import json +import re +import shlex +from contextlib import contextmanager +from string import Template +from typing import Any, Dict, Generator, Optional + +from kubernetes.dynamic import DynamicClient +from ocp_resources.gateway import Gateway +from ocp_resources.llm_inference_service import LLMInferenceService +from pyhelper_utils.shell import run_command +from simple_logger.logger import get_logger +from timeout_sampler import retry, TimeoutWatch + +from utilities.certificates_utils import get_ca_bundle +from utilities.constants import HTTPRequest, Timeout +from utilities.exceptions import InferenceResponseError +from utilities.infra import get_services_by_isvc_label +from utilities.llmd_constants import ( + DEFAULT_GATEWAY_NAME, + DEFAULT_GATEWAY_NAMESPACE, + OPENSHIFT_DEFAULT_GATEWAY_CLASS, + DEFAULT_LLM_ENDPOINT, + DEFAULT_TIMEOUT, +) + +LOGGER = get_logger(name=__name__) + + +@contextmanager +def create_gateway( + client: DynamicClient, + name: str = DEFAULT_GATEWAY_NAME, + namespace: str = DEFAULT_GATEWAY_NAMESPACE, + gateway_class_name: str = OPENSHIFT_DEFAULT_GATEWAY_CLASS, + listeners: Optional[list[Dict[str, Any]]] = None, + infrastructure: Optional[Dict[str, Any]] = None, + wait_for_condition: bool = True, + timeout: int = 300, + teardown: bool = True, +) -> Generator[Gateway, None, None]: + """ + Context manager to create and manage a Gateway resource using ocp_resources. + + Args: + client: Kubernetes dynamic client + name: Gateway name + namespace: Gateway namespace + gateway_class_name: The name of the GatewayClass resource + listeners: List of listener configurations + infrastructure: Infrastructure configuration + wait_for_condition: Whether to wait for the gateway to be programmed + timeout: Timeout in seconds for waiting + teardown: Whether to clean up the resource + + Yields: + Gateway: The created Gateway resource + """ + if listeners is None: + listeners = [ + { + "name": "http", + "port": 80, + "protocol": "HTTP", + "allowedRoutes": {"namespaces": {"from": "All"}}, + } + ] + + if infrastructure is None: + infrastructure = {"labels": {"serving.kserve.io/gateway": "kserve-ingress-gateway"}} + try: + existing_gateway = Gateway( + client=client, + name=name, + namespace=namespace, + api_group="gateway.networking.k8s.io", + ) + if existing_gateway.exists: + LOGGER.info(f"Cleaning up existing Gateway {name} in namespace {namespace}") + existing_gateway.delete(wait=True, timeout=Timeout.TIMEOUT_2MIN) + except Exception as e: + LOGGER.debug(f"No existing Gateway to clean up: {e}") + gateway_body = { + "apiVersion": "gateway.networking.k8s.io/v1", + "kind": "Gateway", + "metadata": { + "name": name, + "namespace": namespace, + }, + "spec": { + "gatewayClassName": gateway_class_name, + "listeners": listeners, + "infrastructure": infrastructure, + } + } + + with Gateway( + client=client, + teardown=teardown, + kind_dict=gateway_body, + api_group="gateway.networking.k8s.io", + ) as gateway: + if wait_for_condition: + LOGGER.info(f"Waiting for Gateway {name} to be programmed...") + gateway.wait_for_condition( + condition="Programmed", + status="True", + timeout=timeout, + ) + LOGGER.info(f"Gateway {name} is programmed and ready") + + yield gateway + + +@contextmanager +def create_llmisvc( + client: DynamicClient, + name: str, + namespace: str, + storage_uri: Optional[str] = None, + storage_key: Optional[str] = None, + storage_path: Optional[str] = None, + replicas: int = 1, + wait: bool = True, + enable_auth: bool = False, + router_config: Optional[Dict[str, Any]] = None, + container_image: Optional[str] = None, + container_resources: Optional[Dict[str, Any]] = None, + container_env: Optional[list[Dict[str, str]]] = None, + liveness_probe: Optional[Dict[str, Any]] = None, + readiness_probe: Optional[Dict[str, Any]] = None, + image_pull_secrets: Optional[list[str]] = None, + service_account: Optional[str] = None, + volumes: Optional[list[Dict[str, Any]]] = None, + volume_mounts: Optional[list[Dict[str, Any]]] = None, + annotations: Optional[Dict[str, str]] = None, + labels: Optional[Dict[str, str]] = None, + timeout: int = Timeout.TIMEOUT_15MIN, + teardown: bool = True, +) -> Generator[LLMInferenceService, Any, None]: + """ + Create LLMInferenceService object following the pattern of create_isvc. + + Args: + client: DynamicClient object + name: LLMInferenceService name + namespace: Namespace name + storage_uri: Storage URI (e.g., 'oci://quay.io/user/model:tag') - used if storage_key/storage_path not provided + storage_key: S3 secret name for authentication (alternative to storage_uri) + storage_path: S3 path to model (alternative to storage_uri) + replicas: Number of replicas + wait: Wait for LLMInferenceService to be ready + enable_auth: Enable authentication + router_config: Router configuration (scheduler, route, gateway) + container_image: Container image + container_resources: Container resource requirements + container_env: Container environment variables + liveness_probe: Liveness probe configuration + readiness_probe: Readiness probe configuration + image_pull_secrets: Image pull secrets + service_account: Service account name + volumes: Volume configurations + volume_mounts: Volume mount configurations + annotations: Additional annotations + labels: Additional labels + timeout: Timeout for waiting + teardown: Whether to clean up on exit + + Yields: + LLMInferenceService: LLMInferenceService object + """ + if labels is None: + labels = {} + + if annotations is None: + annotations = {} + + if storage_key and storage_path: + model_config = { + "uri": f"s3://ods-ci-wisdom/{storage_path}", + } + elif storage_uri: + model_config = { + "uri": storage_uri, + } + else: + raise ValueError("Provide either storage_uri or (storage_key and storage_path) for the model") + + if router_config is None: + router_config = { + "scheduler": {}, + "route": {}, + "gateway": {} + } + + if container_resources is None: + raise ValueError("container_resources must be provided for LLMInferenceService") + + if container_env is None: + container_env = [ + { + "name": "VLLM_LOGGING_LEVEL", + "value": "DEBUG" + } + ] + template_config: Dict[str, Any] = { + "containers": [] + } + + main_container: Dict[str, Any] = { + "name": "main" + } + + if container_image: + main_container["image"] = container_image + else: + raise ValueError("container_image must be provided for LLMInferenceService") + + if container_resources: + main_container["resources"] = container_resources + + if container_env: + main_container["env"] = container_env + + if liveness_probe: + main_container["livenessProbe"] = liveness_probe + + if readiness_probe: + main_container["readinessProbe"] = readiness_probe + + if volume_mounts: + main_container["volumeMounts"] = volume_mounts + + template_config["containers"].append(main_container) + + if volumes: + template_config["volumes"] = volumes + + if service_account: + template_config["serviceAccountName"] = service_account + + if image_pull_secrets: + template_config["imagePullSecrets"] = [{"name": secret} for secret in image_pull_secrets] + + if enable_auth: + annotations["serving.kserve.io/auth"] = "true" + + LOGGER.info(f"Creating LLMInferenceService {name} in namespace {namespace}") + + with LLMInferenceService( + client=client, + name=name, + namespace=namespace, + model=model_config, + replicas=replicas, + router=router_config, + template=template_config, + annotations=annotations, + label=labels, + teardown=teardown, + ) as llm_service: + timeout_watch = TimeoutWatch(timeout=timeout) + + if wait: + LOGGER.info(f"Waiting for LLMInferenceService {name} to be ready...") + llm_service.wait_for_condition( + condition="Ready", + status="True", + timeout=timeout_watch.remaining_time(), + ) + LOGGER.info(f"LLMInferenceService {name} is ready") + + yield llm_service + + +def get_llm_inference_url(llm_service: LLMInferenceService) -> str: + """ + Get the inference URL for an LLMInferenceService. + + This function attempts to resolve the URL in the following order: + 1. External URL from service status + 2. Service discovery via labels + 3. Fallback to service name pattern + + Args: + llm_service: The LLMInferenceService resource + + Returns: + str: The inference URL (full URL including protocol and path) + + Raises: + ValueError: If the inference URL cannot be determined + """ + if llm_service.instance.status and llm_service.instance.status.get("url"): + url = llm_service.instance.status["url"] + LOGGER.debug(f"Using external URL for {llm_service.name}: {url}") + return url + + try: + services = get_services_by_isvc_label( + client=llm_service.client, + isvc=llm_service, + runtime_name=None, + ) + if services: + internal_url = f"http://{services[0].name}.{llm_service.namespace}.svc.cluster.local" + LOGGER.debug(f"Using service discovery URL for {llm_service.name}: {internal_url}") + return internal_url + except Exception as e: + LOGGER.warning(f"Could not get service for LLMInferenceService {llm_service.name}: {e}") + fallback_url = f"http://{llm_service.name}.{llm_service.namespace}.svc.cluster.local" + LOGGER.debug(f"Using fallback URL for {llm_service.name}: {fallback_url}") + return fallback_url + + +def verify_inference_response_llmd( + llm_service: LLMInferenceService, + inference_config: Dict[str, Any], + inference_type: str, + protocol: str, + model_name: Optional[str] = None, + inference_input: Optional[Any] = None, + use_default_query: bool = False, + expected_response_text: Optional[str] = None, + insecure: bool = False, + token: Optional[str] = None, + authorized_user: Optional[bool] = None, +) -> None: + """ + Verify the LLM inference response following the pattern of verify_inference_response. + + Args: + llm_service: LLMInferenceService resource to test + inference_config: Inference configuration dictionary + inference_type: Type of inference ('infer', 'streaming', etc.) + protocol: Protocol to use ('http', 'grpc') + model_name: Name of the model (defaults to service name) + inference_input: Input for inference (optional) + use_default_query: Whether to use default query from config + expected_response_text: Expected response text for validation + insecure: Whether to use insecure connections + token: Authentication token (optional) + authorized_user: Whether user should be authorized (optional) + + Raises: + InferenceResponseError: If inference response is invalid + ValueError: If inference response validation fails + """ + + model_name = model_name or llm_service.name + inference = LLMUserInference( + llm_service=llm_service, + inference_config=inference_config, + inference_type=inference_type, + protocol=protocol, + ) + + res = inference.run_inference_flow( + model_name=model_name, + inference_input=inference_input, + use_default_query=use_default_query, + token=token, + insecure=insecure, + ) + + if authorized_user is False: + _validate_unauthorized_response(res, token, inference) + else: + _validate_authorized_response( + res=res, + inference=inference, + inference_config=inference_config, + inference_type=inference_type, + expected_response_text=expected_response_text, + use_default_query=use_default_query, + model_name=model_name, + ) + + +class LLMUserInference: + """ + LLM-specific inference handler following the pattern of UserInference. + """ + + STREAMING = "streaming" + INFER = "infer" + + def __init__( + self, + llm_service: LLMInferenceService, + inference_config: Dict[str, Any], + inference_type: str, + protocol: str, + ) -> None: + self.llm_service = llm_service + self.inference_config = inference_config + self.inference_type = inference_type + self.protocol = protocol + self.runtime_config = self.get_runtime_config() + + def get_runtime_config(self) -> Dict[str, Any]: + """Get runtime config from inference config based on inference type and protocol.""" + if inference_type_config := self.inference_config.get(self.inference_type): + protocol = "http" if self.protocol.lower() in ["http", "https"] else self.protocol + if data := inference_type_config.get(protocol): + return data + else: + raise ValueError(f"Protocol {protocol} not supported for inference type {self.inference_type}") + else: + raise ValueError(f"Inference type {self.inference_type} not supported in config") + + @property + def inference_response_text_key_name(self) -> Optional[str]: + """Get inference response text key name from runtime config.""" + return self.runtime_config.get("response_fields_map", {}).get("response_output") + + @property + def inference_response_key_name(self) -> str: + """Get inference response key name from runtime config.""" + return self.runtime_config.get("response_fields_map", {}).get("response", "output") + + def get_inference_body( + self, + model_name: str, + inference_input: Optional[Any] = None, + use_default_query: bool = False, + ) -> str: + """Get inference body for LLM request.""" + if not use_default_query and inference_input is None: + raise ValueError("Either pass `inference_input` or set `use_default_query` to True") + + if use_default_query: + default_query_config = self.inference_config.get("default_query_model") + if not default_query_config: + raise ValueError(f"Missing default query config for {model_name}") + + if self.inference_config.get("support_multi_default_queries"): + query_config = default_query_config.get(self.inference_type) + if not query_config: + raise ValueError(f"Missing default query for inference type {self.inference_type}") + template_str = query_config.get("query_input", "") + else: + template_str = default_query_config.get("query_input", "") + + # Use template substitution for model name + template = Template(template_str) + body = template.safe_substitute(model_name=model_name) + else: + # For custom input, create OpenAI-compatible format + if isinstance(inference_input, str): + body = json.dumps({ + "model": model_name, + "messages": [{"role": "user", "content": inference_input}], + "max_tokens": 100, + "temperature": 0.0 + }) + else: + body = json.dumps(inference_input) + + return body + + def generate_command( + self, + model_name: str, + inference_input: Optional[str] = None, + use_default_query: bool = False, + insecure: bool = False, + token: Optional[str] = None, + ) -> str: + """Generate curl command string for LLM inference.""" + base_url = get_llm_inference_url(self.llm_service) + endpoint_url = f"{base_url}{DEFAULT_LLM_ENDPOINT}" + + body = self.get_inference_body( + model_name=model_name, + inference_input=inference_input, + use_default_query=use_default_query, + ) + + header = HTTPRequest.CONTENT_JSON.replace("-H ", "").replace("'", "") + cmd_exec = "curl -i -s" + cmd = f"{cmd_exec} -X POST -d '{body}' -H {header} -H 'Accept: application/json'" + + if token: + cmd += f" {HTTPRequest.AUTH_HEADER.format(token=token)}" + + if insecure: + cmd += " --insecure" + else: + try: + from ocp_resources.resource import get_client # type: ignore + client = get_client() + ca_bundle = get_ca_bundle(client=client, deployment_mode="raw") + if ca_bundle: + cmd += f" --cacert {ca_bundle}" + else: + cmd += " --insecure" + except Exception: + cmd += " --insecure" + + cmd += f" --max-time {DEFAULT_TIMEOUT} {endpoint_url}" + return cmd + + @retry(wait_timeout=Timeout.TIMEOUT_30SEC, sleep=5) + def run_inference( + self, + model_name: str, + inference_input: Optional[str] = None, + use_default_query: bool = False, + insecure: bool = False, + token: Optional[str] = None, + ) -> str: + """Run inference command and return raw output.""" + cmd = self.generate_command( + model_name=model_name, + inference_input=inference_input, + use_default_query=use_default_query, + insecure=insecure, + token=token, + ) + + res, out, err = run_command(command=shlex.split(cmd), verify_stderr=False, check=False) + if res: + return out + raise ValueError(f"Inference failed with error: {err}\nOutput: {out}\nCommand: {cmd}") + + def run_inference_flow( + self, + model_name: str, + inference_input: Optional[str] = None, + use_default_query: bool = False, + insecure: bool = False, + token: Optional[str] = None, + ) -> Dict[str, Any]: + """Run LLM inference using the same high-level flow as inference_utils.""" + out = self.run_inference( + model_name=model_name, + inference_input=inference_input, + use_default_query=use_default_query, + insecure=insecure, + token=token, + ) + return {"output": out} + + +def _validate_unauthorized_response(res: Dict[str, Any], token: Optional[str], inference: LLMUserInference) -> None: + """Validate response for unauthorized users.""" + auth_header = "x-ext-auth-reason" + + if auth_reason := re.search(rf"{auth_header}: (.*)", res["output"], re.MULTILINE): + reason = auth_reason.group(1).lower() + + if token: + assert re.search(r"not (?:authenticated|authorized)", reason) + else: + assert "credential not found" in reason + else: + forbidden_patterns = ["Forbidden", "401", "403", "Unauthorized"] + output = res["output"] + + if any(pattern in output for pattern in forbidden_patterns): + return + + raise ValueError(f"Auth header {auth_header} not found in response. Response: {output}") + + +def _validate_authorized_response( + res: Dict[str, Any], + inference: LLMUserInference, + inference_config: Dict[str, Any], + inference_type: str, + expected_response_text: Optional[str], + use_default_query: bool, + model_name: str, +) -> None: + """Validate response for authorized users.""" + + use_regex = False + + if use_default_query: + expected_response_text_config = inference_config.get("default_query_model", {}) + use_regex = expected_response_text_config.get("use_regex", False) + + if not expected_response_text_config: + raise ValueError(f"Missing default_query_model config for inference {inference_config}") + + if inference_config.get("support_multi_default_queries"): + query_config = expected_response_text_config.get(inference_type) + if not query_config: + raise ValueError(f"Missing default_query_model config for inference type {inference_type}") + expected_response_text = query_config.get("query_output", "") + use_regex = query_config.get("use_regex", False) + else: + expected_response_text = expected_response_text_config.get("query_output") + + if not expected_response_text: + raise ValueError(f"Missing response text key for inference {inference_config}") + + if isinstance(expected_response_text, str): + expected_response_text = Template(expected_response_text).safe_substitute(model_name=model_name) + elif isinstance(expected_response_text, dict): + expected_response_text = Template(expected_response_text.get("response_output")).safe_substitute( + model_name=model_name + ) + + if inference.inference_response_text_key_name: + if inference_type == inference.STREAMING: + if output := re.findall( + rf"{inference.inference_response_text_key_name}\": \"(.*)\"", + res[inference.inference_response_key_name], + re.MULTILINE, + ): + assert "".join(output) == expected_response_text, ( + f"Expected: {expected_response_text} does not match response: {output}" + ) + elif inference_type == inference.INFER or use_regex: + formatted_res = json.dumps(res[inference.inference_response_text_key_name]).replace(" ", "") + if use_regex: + assert re.search(expected_response_text, formatted_res), ( + f"Expected: {expected_response_text} not found in: {formatted_res}" + ) + else: + formatted_res = json.dumps(res[inference.inference_response_key_name]).replace(" ", "") + assert formatted_res == expected_response_text, ( + f"Expected: {expected_response_text} does not match output: {formatted_res}" + ) + else: + response = res[inference.inference_response_key_name] + if isinstance(response, list): + response = response[0] + + if isinstance(response, dict): + response_text = response[inference.inference_response_text_key_name] + assert response_text == expected_response_text, ( + f"Expected: {expected_response_text} does not match response: {response_text}" + ) + else: + raise InferenceResponseError( + "Inference response output does not match expected output format." + f"Expected: {expected_response_text}.\nResponse: {res}" + ) + else: + raise InferenceResponseError(f"Inference response output not found in response. Response: {res}") + + +def validate_llm_response_format(response_data: Dict[str, Any]) -> bool: + """Validate that a response follows the expected LLM inference format.""" + if "choices" not in response_data: + raise InferenceResponseError("Response missing required field: 'choices'") + + if not isinstance(response_data["choices"], list): + raise InferenceResponseError("'choices' field must be a list") + + if not response_data["choices"]: + raise InferenceResponseError("'choices' list cannot be empty") + + choice = response_data["choices"][0] + if "message" not in choice: + raise InferenceResponseError("Choice missing 'message' field") + + message = choice["message"] + if "content" not in message: + raise InferenceResponseError("Message missing 'content' field") + + if not isinstance(message["content"], str): + raise InferenceResponseError("Message content must be a string") + + return True diff --git a/utilities/manifests/opt125m_cpu.py b/utilities/manifests/opt125m_cpu.py new file mode 100644 index 000000000..2326e991d --- /dev/null +++ b/utilities/manifests/opt125m_cpu.py @@ -0,0 +1,27 @@ +OPT125M_CPU_INFERENCE_CONFIG = { + "default_query_model": { + "query_input": "What is the boiling point of water?", + "query_output": r'.*', # Accept any valid response + "use_regex": True, + }, + "chat_completions": { + "http": { + "endpoint": "v1/chat/completions", + "header": "Content-Type:application/json", + "body": '{"model": "$model_name", "messages": [{"role": "user", "content": "$query_input"}], "max_tokens": 50, "temperature": 0.0, "stream": false}', + "response_fields_map": { + "response_output": "output", + }, + }, + }, + "completions": { + "http": { + "endpoint": "v1/completions", + "header": "Content-Type:application/json", + "body": '{"model": "$model_name", "prompt": "$query_input", "max_tokens": 50, "temperature": 0.0}', + "response_fields_map": { + "response_output": "output", + }, + }, + }, +} From cdd0a82559b09c3e57de24b7bf0ba05615c169b1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Sep 2025 16:56:33 +0000 Subject: [PATCH 02/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../model_server/llmd/conftest.py | 26 ++-- .../model_server/llmd/test_llmd_oci_cpu.py | 2 +- .../model_server/llmd/test_llmd_s3.py | 7 +- .../model_serving/model_server/llmd/utils.py | 17 ++- utilities/llmd_constants.py | 4 +- utilities/llmd_utils.py | 135 ++++++++---------- 6 files changed, 88 insertions(+), 103 deletions(-) diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py index af8c5594c..cd2c86773 100644 --- a/tests/model_serving/model_server/llmd/conftest.py +++ b/tests/model_serving/model_server/llmd/conftest.py @@ -50,8 +50,7 @@ def llmd_s3_secret( @pytest.fixture(scope="class") def llmd_s3_service_account( - admin_client: DynamicClient, - llmd_s3_secret: Secret + admin_client: DynamicClient, llmd_s3_secret: Secret ) -> Generator[ServiceAccount, None, None]: with ServiceAccount( client=admin_client, @@ -80,7 +79,7 @@ def llmd_gateway( gateway_class_name=gateway_class_name, wait_for_condition=True, timeout=Timeout.TIMEOUT_5MIN, - **kwargs + **kwargs, ) as gateway: yield gateway @@ -97,12 +96,12 @@ def llmd_inference_service( else: name_suffix = request.param.get("name_suffix", "basic") kwargs = {k: v for k, v in request.param.items() if k != "name_suffix"} - + service_name = kwargs.get("name", f"llm-{name_suffix}") if "llmd_gateway" in request.fixturenames: request.getfixturevalue("llmd_gateway") - + container_resources = kwargs.get( "container_resources", { @@ -120,7 +119,7 @@ def llmd_inference_service( container_resources=container_resources, wait=True, timeout=Timeout.TIMEOUT_15MIN, - **{k: v for k, v in kwargs.items() if k != "name"} + **{k: v for k, v in kwargs.items() if k != "name"}, ) as llm_service: yield llm_service @@ -139,15 +138,15 @@ def llmd_inference_service_s3( else: name_suffix = request.param.get("name_suffix", "s3") kwargs = {k: v for k, v in request.param.items() if k != "name_suffix"} - + service_name = kwargs.get("name", f"llm-{name_suffix}") - + if "storage_key" not in kwargs: kwargs["storage_key"] = llmd_s3_secret.name - + if "storage_path" not in kwargs: kwargs["storage_path"] = DEFAULT_S3_STORAGE_PATH - + container_resources = kwargs.get( "container_resources", { @@ -167,7 +166,10 @@ def llmd_inference_service_s3( service_account=llmd_s3_service_account.name, wait=True, timeout=Timeout.TIMEOUT_15MIN, - **{k: v for k, v in kwargs.items() if k not in ["name", "storage_key", "storage_path", "container_image", "container_resources"]} + **{ + k: v + for k, v in kwargs.items() + if k not in ["name", "storage_key", "storage_path", "container_image", "container_resources"] + }, ) as llm_service: yield llm_service - diff --git a/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py index e4d851e1c..031ce6e48 100644 --- a/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py +++ b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py @@ -23,7 +23,7 @@ class TestLLMDOCICPUInference: def test_llmd_oci(self, llmd_gateway, llmd_inference_service): assert verify_gateway_status(llmd_gateway), "Gateway should be ready" assert verify_llm_service_status(llmd_inference_service), "LLMInferenceService should be ready" - + verify_inference_response_llmd( llm_service=llmd_inference_service, inference_config=OPT125M_CPU_INFERENCE_CONFIG, diff --git a/tests/model_serving/model_server/llmd/test_llmd_s3.py b/tests/model_serving/model_server/llmd/test_llmd_s3.py index 0774b0b46..03c899431 100644 --- a/tests/model_serving/model_server/llmd/test_llmd_s3.py +++ b/tests/model_serving/model_server/llmd/test_llmd_s3.py @@ -13,9 +13,7 @@ @pytest.mark.parametrize( "unprivileged_model_namespace, llmd_gateway, llmd_inference_service_s3", - [ - ({"name": "llmd-s3-test"}, "openshift-default", {"storage_path": "opt-125m/"}) - ], + [({"name": "llmd-s3-test"}, "openshift-default", {"storage_path": "opt-125m/"})], indirect=True, ) @pytest.mark.usefixtures("valid_aws_config") @@ -25,7 +23,7 @@ class TestLLMDS3Inference: def test_llmd_s3(self, llmd_gateway, llmd_inference_service_s3): assert verify_gateway_status(llmd_gateway), "Gateway should be ready" assert verify_llm_service_status(llmd_inference_service_s3), "LLMInferenceService should be ready" - + verify_inference_response_llmd( llm_service=llmd_inference_service_s3, inference_config=OPT125M_CPU_INFERENCE_CONFIG, @@ -34,4 +32,3 @@ def test_llmd_s3(self, llmd_gateway, llmd_inference_service_s3): use_default_query=True, insecure=True, ) - diff --git a/tests/model_serving/model_server/llmd/utils.py b/tests/model_serving/model_server/llmd/utils.py index 0a1c591ff..b80fcff7c 100644 --- a/tests/model_serving/model_server/llmd/utils.py +++ b/tests/model_serving/model_server/llmd/utils.py @@ -16,23 +16,23 @@ def verify_gateway_status(gateway: Gateway) -> bool: """ Verify that a Gateway is properly configured and programmed. - + Args: gateway (Gateway): The Gateway resource to verify - + Returns: bool: True if gateway is properly configured, False otherwise """ if not gateway.exists: LOGGER.warning(f"Gateway {gateway.name} does not exist") return False - + conditions = gateway.instance.status.get("conditions", []) for condition in conditions: if condition["type"] == "Programmed" and condition["status"] == "True": LOGGER.info(f"Gateway {gateway.name} is programmed and ready") return True - + LOGGER.warning(f"Gateway {gateway.name} is not in Programmed state") return False @@ -40,23 +40,22 @@ def verify_gateway_status(gateway: Gateway) -> bool: def verify_llm_service_status(llm_service: LLMInferenceService) -> bool: """ Verify that an LLMInferenceService is properly configured and ready. - + Args: llm_service (LLMInferenceService): The LLMInferenceService resource to verify - + Returns: bool: True if service is properly configured, False otherwise """ if not llm_service.exists: LOGGER.warning(f"LLMInferenceService {llm_service.name} does not exist") return False - + conditions = llm_service.instance.status.get("conditions", []) for condition in conditions: if condition["type"] == "Ready" and condition["status"] == "True": LOGGER.info(f"LLMInferenceService {llm_service.name} is ready") return True - + LOGGER.warning(f"LLMInferenceService {llm_service.name} is not in Ready state") return False - diff --git a/utilities/llmd_constants.py b/utilities/llmd_constants.py index 74546814b..90751f30b 100644 --- a/utilities/llmd_constants.py +++ b/utilities/llmd_constants.py @@ -27,6 +27,4 @@ DEFAULT_CPU_REQUEST = "100m" DEFAULT_MEMORY_REQUEST = "8Gi" -BASIC_LLMD_PARAMS = [ - ({"name": "llmd-comprehensive-test"}, "openshift-default", "basic") -] +BASIC_LLMD_PARAMS = [({"name": "llmd-comprehensive-test"}, "openshift-default", "basic")] diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py index 424166ead..87640dc18 100644 --- a/utilities/llmd_utils.py +++ b/utilities/llmd_utils.py @@ -1,4 +1,5 @@ """Utilities for LLM Deployment (LLMD) resources.""" + import json import re import shlex @@ -42,18 +43,18 @@ def create_gateway( ) -> Generator[Gateway, None, None]: """ Context manager to create and manage a Gateway resource using ocp_resources. - + Args: client: Kubernetes dynamic client name: Gateway name - namespace: Gateway namespace + namespace: Gateway namespace gateway_class_name: The name of the GatewayClass resource listeners: List of listener configurations infrastructure: Infrastructure configuration wait_for_condition: Whether to wait for the gateway to be programmed timeout: Timeout in seconds for waiting teardown: Whether to clean up the resource - + Yields: Gateway: The created Gateway resource """ @@ -92,9 +93,9 @@ def create_gateway( "gatewayClassName": gateway_class_name, "listeners": listeners, "infrastructure": infrastructure, - } + }, } - + with Gateway( client=client, teardown=teardown, @@ -109,7 +110,7 @@ def create_gateway( timeout=timeout, ) LOGGER.info(f"Gateway {name} is programmed and ready") - + yield gateway @@ -172,7 +173,7 @@ def create_llmisvc( """ if labels is None: labels = {} - + if annotations is None: annotations = {} @@ -188,58 +189,45 @@ def create_llmisvc( raise ValueError("Provide either storage_uri or (storage_key and storage_path) for the model") if router_config is None: - router_config = { - "scheduler": {}, - "route": {}, - "gateway": {} - } - + router_config = {"scheduler": {}, "route": {}, "gateway": {}} + if container_resources is None: raise ValueError("container_resources must be provided for LLMInferenceService") - + if container_env is None: - container_env = [ - { - "name": "VLLM_LOGGING_LEVEL", - "value": "DEBUG" - } - ] - template_config: Dict[str, Any] = { - "containers": [] - } - - main_container: Dict[str, Any] = { - "name": "main" - } - + container_env = [{"name": "VLLM_LOGGING_LEVEL", "value": "DEBUG"}] + template_config: Dict[str, Any] = {"containers": []} + + main_container: Dict[str, Any] = {"name": "main"} + if container_image: main_container["image"] = container_image else: raise ValueError("container_image must be provided for LLMInferenceService") - + if container_resources: main_container["resources"] = container_resources - + if container_env: main_container["env"] = container_env - + if liveness_probe: main_container["livenessProbe"] = liveness_probe - + if readiness_probe: main_container["readinessProbe"] = readiness_probe - + if volume_mounts: main_container["volumeMounts"] = volume_mounts - + template_config["containers"].append(main_container) - + if volumes: template_config["volumes"] = volumes - + if service_account: template_config["serviceAccountName"] = service_account - + if image_pull_secrets: template_config["imagePullSecrets"] = [{"name": secret} for secret in image_pull_secrets] @@ -247,7 +235,7 @@ def create_llmisvc( annotations["serving.kserve.io/auth"] = "true" LOGGER.info(f"Creating LLMInferenceService {name} in namespace {namespace}") - + with LLMInferenceService( client=client, name=name, @@ -277,18 +265,18 @@ def create_llmisvc( def get_llm_inference_url(llm_service: LLMInferenceService) -> str: """ Get the inference URL for an LLMInferenceService. - + This function attempts to resolve the URL in the following order: 1. External URL from service status 2. Service discovery via labels 3. Fallback to service name pattern - + Args: llm_service: The LLMInferenceService resource - + Returns: str: The inference URL (full URL including protocol and path) - + Raises: ValueError: If the inference URL cannot be determined """ @@ -296,7 +284,7 @@ def get_llm_inference_url(llm_service: LLMInferenceService) -> str: url = llm_service.instance.status["url"] LOGGER.debug(f"Using external URL for {llm_service.name}: {url}") return url - + try: services = get_services_by_isvc_label( client=llm_service.client, @@ -382,10 +370,10 @@ class LLMUserInference: """ LLM-specific inference handler following the pattern of UserInference. """ - + STREAMING = "streaming" INFER = "infer" - + def __init__( self, llm_service: LLMInferenceService, @@ -398,7 +386,7 @@ def __init__( self.inference_type = inference_type self.protocol = protocol self.runtime_config = self.get_runtime_config() - + def get_runtime_config(self) -> Dict[str, Any]: """Get runtime config from inference config based on inference type and protocol.""" if inference_type_config := self.inference_config.get(self.inference_type): @@ -409,17 +397,17 @@ def get_runtime_config(self) -> Dict[str, Any]: raise ValueError(f"Protocol {protocol} not supported for inference type {self.inference_type}") else: raise ValueError(f"Inference type {self.inference_type} not supported in config") - + @property def inference_response_text_key_name(self) -> Optional[str]: """Get inference response text key name from runtime config.""" return self.runtime_config.get("response_fields_map", {}).get("response_output") - + @property def inference_response_key_name(self) -> str: """Get inference response key name from runtime config.""" return self.runtime_config.get("response_fields_map", {}).get("response", "output") - + def get_inference_body( self, model_name: str, @@ -429,12 +417,12 @@ def get_inference_body( """Get inference body for LLM request.""" if not use_default_query and inference_input is None: raise ValueError("Either pass `inference_input` or set `use_default_query` to True") - + if use_default_query: default_query_config = self.inference_config.get("default_query_model") if not default_query_config: raise ValueError(f"Missing default query config for {model_name}") - + if self.inference_config.get("support_multi_default_queries"): query_config = default_query_config.get(self.inference_type) if not query_config: @@ -442,7 +430,7 @@ def get_inference_body( template_str = query_config.get("query_input", "") else: template_str = default_query_config.get("query_input", "") - + # Use template substitution for model name template = Template(template_str) body = template.safe_substitute(model_name=model_name) @@ -453,13 +441,13 @@ def get_inference_body( "model": model_name, "messages": [{"role": "user", "content": inference_input}], "max_tokens": 100, - "temperature": 0.0 + "temperature": 0.0, }) else: body = json.dumps(inference_input) - + return body - + def generate_command( self, model_name: str, @@ -490,6 +478,7 @@ def generate_command( else: try: from ocp_resources.resource import get_client # type: ignore + client = get_client() ca_bundle = get_ca_bundle(client=client, deployment_mode="raw") if ca_bundle: @@ -547,10 +536,10 @@ def run_inference_flow( def _validate_unauthorized_response(res: Dict[str, Any], token: Optional[str], inference: LLMUserInference) -> None: """Validate response for unauthorized users.""" auth_header = "x-ext-auth-reason" - + if auth_reason := re.search(rf"{auth_header}: (.*)", res["output"], re.MULTILINE): reason = auth_reason.group(1).lower() - + if token: assert re.search(r"not (?:authenticated|authorized)", reason) else: @@ -558,10 +547,10 @@ def _validate_unauthorized_response(res: Dict[str, Any], token: Optional[str], i else: forbidden_patterns = ["Forbidden", "401", "403", "Unauthorized"] output = res["output"] - + if any(pattern in output for pattern in forbidden_patterns): return - + raise ValueError(f"Auth header {auth_header} not found in response. Response: {output}") @@ -575,16 +564,16 @@ def _validate_authorized_response( model_name: str, ) -> None: """Validate response for authorized users.""" - + use_regex = False - + if use_default_query: expected_response_text_config = inference_config.get("default_query_model", {}) use_regex = expected_response_text_config.get("use_regex", False) - + if not expected_response_text_config: raise ValueError(f"Missing default_query_model config for inference {inference_config}") - + if inference_config.get("support_multi_default_queries"): query_config = expected_response_text_config.get(inference_type) if not query_config: @@ -593,17 +582,17 @@ def _validate_authorized_response( use_regex = query_config.get("use_regex", False) else: expected_response_text = expected_response_text_config.get("query_output") - + if not expected_response_text: raise ValueError(f"Missing response text key for inference {inference_config}") - + if isinstance(expected_response_text, str): expected_response_text = Template(expected_response_text).safe_substitute(model_name=model_name) elif isinstance(expected_response_text, dict): expected_response_text = Template(expected_response_text.get("response_output")).safe_substitute( model_name=model_name ) - + if inference.inference_response_text_key_name: if inference_type == inference.STREAMING: if output := re.findall( @@ -629,7 +618,7 @@ def _validate_authorized_response( response = res[inference.inference_response_key_name] if isinstance(response, list): response = response[0] - + if isinstance(response, dict): response_text = response[inference.inference_response_text_key_name] assert response_text == expected_response_text, ( @@ -648,22 +637,22 @@ def validate_llm_response_format(response_data: Dict[str, Any]) -> bool: """Validate that a response follows the expected LLM inference format.""" if "choices" not in response_data: raise InferenceResponseError("Response missing required field: 'choices'") - + if not isinstance(response_data["choices"], list): raise InferenceResponseError("'choices' field must be a list") - + if not response_data["choices"]: raise InferenceResponseError("'choices' list cannot be empty") - + choice = response_data["choices"][0] if "message" not in choice: raise InferenceResponseError("Choice missing 'message' field") - + message = choice["message"] if "content" not in message: raise InferenceResponseError("Message missing 'content' field") - + if not isinstance(message["content"], str): raise InferenceResponseError("Message content must be a string") - + return True From 16a563bb12572b0d72319f0917667fde604f0408 Mon Sep 17 00:00:00 2001 From: Milind Waykole Date: Tue, 16 Sep 2025 12:55:40 +0530 Subject: [PATCH 03/12] fix precommit Signed-off-by: Milind Waykole --- pytest.ini | 1 + .../model_server/llmd/conftest.py | 1 - utilities/llmd_utils.py | 22 +++++++++---------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pytest.ini b/pytest.ini index 80c149a5e..85ed14ab2 100644 --- a/pytest.ini +++ b/pytest.ini @@ -33,6 +33,7 @@ markers = gpu: Mark tests which require GPU resources multinode: Mark tests which require multiple nodes keda: Mark tests which are testing KEDA scaling + llmd_cpu: Mark tests which are testing LLMD (LLM Deployment) with CPU resources # Model Registry: custom_namespace: mark tests that are to be run with custom namespace diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py index cd2c86773..9130c6b6b 100644 --- a/tests/model_serving/model_server/llmd/conftest.py +++ b/tests/model_serving/model_server/llmd/conftest.py @@ -101,7 +101,6 @@ def llmd_inference_service( if "llmd_gateway" in request.fixturenames: request.getfixturevalue("llmd_gateway") - container_resources = kwargs.get( "container_resources", { diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py index 87640dc18..4395b792e 100644 --- a/utilities/llmd_utils.py +++ b/utilities/llmd_utils.py @@ -353,7 +353,7 @@ def verify_inference_response_llmd( ) if authorized_user is False: - _validate_unauthorized_response(res, token, inference) + _validate_unauthorized_response(res=res, token=token, inference=inference) else: _validate_authorized_response( res=res, @@ -432,7 +432,7 @@ def get_inference_body( template_str = default_query_config.get("query_input", "") # Use template substitution for model name - template = Template(template_str) + template = Template(template=template_str) body = template.safe_substitute(model_name=model_name) else: # For custom input, create OpenAI-compatible format @@ -457,7 +457,7 @@ def generate_command( token: Optional[str] = None, ) -> str: """Generate curl command string for LLM inference.""" - base_url = get_llm_inference_url(self.llm_service) + base_url = get_llm_inference_url(llm_service=self.llm_service) endpoint_url = f"{base_url}{DEFAULT_LLM_ENDPOINT}" body = self.get_inference_body( @@ -477,8 +477,7 @@ def generate_command( cmd += " --insecure" else: try: - from ocp_resources.resource import get_client # type: ignore - + from ocp_resources.resource import get_client client = get_client() ca_bundle = get_ca_bundle(client=client, deployment_mode="raw") if ca_bundle: @@ -587,12 +586,13 @@ def _validate_authorized_response( raise ValueError(f"Missing response text key for inference {inference_config}") if isinstance(expected_response_text, str): - expected_response_text = Template(expected_response_text).safe_substitute(model_name=model_name) + expected_response_text = Template(template=expected_response_text).safe_substitute(model_name=model_name) elif isinstance(expected_response_text, dict): - expected_response_text = Template(expected_response_text.get("response_output")).safe_substitute( - model_name=model_name - ) - + response_output = expected_response_text.get("response_output") + if response_output is not None: + expected_response_text = Template(template=response_output).safe_substitute( + model_name=model_name + ) if inference.inference_response_text_key_name: if inference_type == inference.STREAMING: if output := re.findall( @@ -605,7 +605,7 @@ def _validate_authorized_response( ) elif inference_type == inference.INFER or use_regex: formatted_res = json.dumps(res[inference.inference_response_text_key_name]).replace(" ", "") - if use_regex: + if use_regex and expected_response_text is not None: assert re.search(expected_response_text, formatted_res), ( f"Expected: {expected_response_text} not found in: {formatted_res}" ) From 8453cda0f80fa2b2bee2a1664850c84e58b24954 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Sep 2025 07:31:09 +0000 Subject: [PATCH 04/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- utilities/llmd_utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py index 4395b792e..9d5c94c01 100644 --- a/utilities/llmd_utils.py +++ b/utilities/llmd_utils.py @@ -478,6 +478,7 @@ def generate_command( else: try: from ocp_resources.resource import get_client + client = get_client() ca_bundle = get_ca_bundle(client=client, deployment_mode="raw") if ca_bundle: @@ -590,9 +591,7 @@ def _validate_authorized_response( elif isinstance(expected_response_text, dict): response_output = expected_response_text.get("response_output") if response_output is not None: - expected_response_text = Template(template=response_output).safe_substitute( - model_name=model_name - ) + expected_response_text = Template(template=response_output).safe_substitute(model_name=model_name) if inference.inference_response_text_key_name: if inference_type == inference.STREAMING: if output := re.findall( From 41c26e8b6eb995f0881b8c6bb413322df2256af0 Mon Sep 17 00:00:00 2001 From: Milind Waykole Date: Tue, 16 Sep 2025 13:01:32 +0530 Subject: [PATCH 05/12] fix precommit Signed-off-by: Milind Waykole --- tests/model_serving/model_server/llmd/__init__.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/model_serving/model_server/llmd/__init__.py b/tests/model_serving/model_server/llmd/__init__.py index 3d0e7e903..e69de29bb 100644 --- a/tests/model_serving/model_server/llmd/__init__.py +++ b/tests/model_serving/model_server/llmd/__init__.py @@ -1,8 +0,0 @@ -""" -LLM Deployment (LLMD) tests module. - -This module contains tests for LLM deployment functionality including: -- Gateway resource creation and management -- LLMInferenceService deployment and configuration -- Integration testing between gateway and inference services -""" From 3a1373595e604d46fffde9b3d2a1abcde3aac171 Mon Sep 17 00:00:00 2001 From: Milind Waykole Date: Tue, 16 Sep 2025 20:07:33 +0530 Subject: [PATCH 06/12] fix precommit Signed-off-by: Milind Waykole --- tests/model_serving/model_server/llmd/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py index 9130c6b6b..6464f1f6d 100644 --- a/tests/model_serving/model_server/llmd/conftest.py +++ b/tests/model_serving/model_server/llmd/conftest.py @@ -100,7 +100,7 @@ def llmd_inference_service( service_name = kwargs.get("name", f"llm-{name_suffix}") if "llmd_gateway" in request.fixturenames: - request.getfixturevalue("llmd_gateway") + request.getfixturevalue(argname="llmd_gateway") container_resources = kwargs.get( "container_resources", { From 75978bf602db2d78706c85300fdb7ca39e676578 Mon Sep 17 00:00:00 2001 From: Milind Waykole Date: Tue, 16 Sep 2025 20:51:15 +0530 Subject: [PATCH 07/12] fix precommit Signed-off-by: Milind Waykole --- utilities/llmd_utils.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py index 9d5c94c01..dc04bf09d 100644 --- a/utilities/llmd_utils.py +++ b/utilities/llmd_utils.py @@ -630,28 +630,3 @@ def _validate_authorized_response( ) else: raise InferenceResponseError(f"Inference response output not found in response. Response: {res}") - - -def validate_llm_response_format(response_data: Dict[str, Any]) -> bool: - """Validate that a response follows the expected LLM inference format.""" - if "choices" not in response_data: - raise InferenceResponseError("Response missing required field: 'choices'") - - if not isinstance(response_data["choices"], list): - raise InferenceResponseError("'choices' field must be a list") - - if not response_data["choices"]: - raise InferenceResponseError("'choices' list cannot be empty") - - choice = response_data["choices"][0] - if "message" not in choice: - raise InferenceResponseError("Choice missing 'message' field") - - message = choice["message"] - if "content" not in message: - raise InferenceResponseError("Message missing 'content' field") - - if not isinstance(message["content"], str): - raise InferenceResponseError("Message content must be a string") - - return True From 2d17c0ccf579c8869f9c25e02032506df4f0b2e2 Mon Sep 17 00:00:00 2001 From: Milind Waykole Date: Tue, 23 Sep 2025 18:01:59 +0530 Subject: [PATCH 08/12] fix gateway thing Signed-off-by: Milind Waykole --- .../model_server/llmd/__init__.py | 1 + .../model_server/llmd/conftest.py | 10 ++++ .../model_server/llmd/test_llmd_oci_cpu.py | 12 +++-- .../model_server/llmd/test_llmd_s3.py | 12 +++-- .../model_serving/model_server/llmd/utils.py | 46 +++++++++++++++++++ utilities/llmd_utils.py | 34 +++++++++++--- 6 files changed, 103 insertions(+), 12 deletions(-) diff --git a/tests/model_serving/model_server/llmd/__init__.py b/tests/model_serving/model_server/llmd/__init__.py index e69de29bb..98274df1a 100644 --- a/tests/model_serving/model_server/llmd/__init__.py +++ b/tests/model_serving/model_server/llmd/__init__.py @@ -0,0 +1 @@ +"""LLMD (LLM Deployment) test module for OpenDataHub and OpenShift AI.""" diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py index 6464f1f6d..2807754e7 100644 --- a/tests/model_serving/model_server/llmd/conftest.py +++ b/tests/model_serving/model_server/llmd/conftest.py @@ -67,18 +67,28 @@ def llmd_gateway( admin_client: DynamicClient, gateway_namespace: str, ) -> Generator[Gateway, None, None]: + """ + Pytest fixture for LLMD Gateway management. + + Implements persistent gateway strategy: + - Reuses existing gateways if available + - Creates new gateway only if needed + - Does not delete gateway in teardown + """ if isinstance(request.param, str): gateway_class_name = request.param kwargs = {} else: gateway_class_name = request.param.get("gateway_class_name", "openshift-default") kwargs = {k: v for k, v in request.param.items() if k != "gateway_class_name"} + with create_gateway( client=admin_client, namespace=gateway_namespace, gateway_class_name=gateway_class_name, wait_for_condition=True, timeout=Timeout.TIMEOUT_5MIN, + teardown=False, # Don't delete gateway in teardown **kwargs, ) as gateway: yield gateway diff --git a/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py index 031ce6e48..adc5e64c2 100644 --- a/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py +++ b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py @@ -1,9 +1,12 @@ import pytest -from tests.model_serving.model_server.llmd.utils import verify_llm_service_status, verify_gateway_status +from tests.model_serving.model_server.llmd.utils import ( + verify_llm_service_status, + verify_gateway_status, + verify_llmd_pods_not_restarted, +) from utilities.constants import Protocols from utilities.llmd_utils import verify_inference_response_llmd - from utilities.llmd_constants import BASIC_LLMD_PARAMS from utilities.manifests.opt125m_cpu import OPT125M_CPU_INFERENCE_CONFIG @@ -20,7 +23,8 @@ class TestLLMDOCICPUInference: """LLMD inference testing with OCI storage and CPU runtime using vLLM.""" - def test_llmd_oci(self, llmd_gateway, llmd_inference_service): + def test_llmd_oci(self, admin_client, llmd_gateway, llmd_inference_service): + """Test LLMD inference with OCI storage and CPU runtime.""" assert verify_gateway_status(llmd_gateway), "Gateway should be ready" assert verify_llm_service_status(llmd_inference_service), "LLMInferenceService should be ready" @@ -32,3 +36,5 @@ def test_llmd_oci(self, llmd_gateway, llmd_inference_service): use_default_query=True, insecure=True, ) + + verify_llmd_pods_not_restarted(client=admin_client, llm_service=llmd_inference_service) diff --git a/tests/model_serving/model_server/llmd/test_llmd_s3.py b/tests/model_serving/model_server/llmd/test_llmd_s3.py index 03c899431..0e2d2ecb2 100644 --- a/tests/model_serving/model_server/llmd/test_llmd_s3.py +++ b/tests/model_serving/model_server/llmd/test_llmd_s3.py @@ -1,9 +1,12 @@ import pytest -from tests.model_serving.model_server.llmd.utils import verify_llm_service_status, verify_gateway_status +from tests.model_serving.model_server.llmd.utils import ( + verify_llm_service_status, + verify_gateway_status, + verify_llmd_pods_not_restarted, +) from utilities.constants import Protocols from utilities.llmd_utils import verify_inference_response_llmd - from utilities.manifests.opt125m_cpu import OPT125M_CPU_INFERENCE_CONFIG pytestmark = [ @@ -20,7 +23,8 @@ class TestLLMDS3Inference: """LLMD inference testing with S3 storage.""" - def test_llmd_s3(self, llmd_gateway, llmd_inference_service_s3): + def test_llmd_s3(self, admin_client, llmd_gateway, llmd_inference_service_s3): + """Test LLMD inference with S3 storage.""" assert verify_gateway_status(llmd_gateway), "Gateway should be ready" assert verify_llm_service_status(llmd_inference_service_s3), "LLMInferenceService should be ready" @@ -32,3 +36,5 @@ def test_llmd_s3(self, llmd_gateway, llmd_inference_service_s3): use_default_query=True, insecure=True, ) + + verify_llmd_pods_not_restarted(client=admin_client, llm_service=llmd_inference_service_s3) diff --git a/tests/model_serving/model_server/llmd/utils.py b/tests/model_serving/model_server/llmd/utils.py index b80fcff7c..8e9929c18 100644 --- a/tests/model_serving/model_server/llmd/utils.py +++ b/tests/model_serving/model_server/llmd/utils.py @@ -5,10 +5,14 @@ Follows the established model server utils pattern for consistency. """ +from kubernetes.dynamic import DynamicClient from ocp_resources.gateway import Gateway from ocp_resources.llm_inference_service import LLMInferenceService +from ocp_resources.pod import Pod from simple_logger.logger import get_logger +from utilities.exceptions import PodContainersRestartError + LOGGER = get_logger(name=__name__) @@ -59,3 +63,45 @@ def verify_llm_service_status(llm_service: LLMInferenceService) -> bool: LOGGER.warning(f"LLMInferenceService {llm_service.name} is not in Ready state") return False + + +def verify_llmd_pods_not_restarted(client: DynamicClient, llm_service: LLMInferenceService) -> None: + """ + Verify that LLMD inference pods containers have not restarted. + + This function checks for container restarts in pods related to the specific LLMInferenceService. + + Args: + client (DynamicClient): DynamicClient instance + llm_service (LLMInferenceService): The LLMInferenceService to check pods for + + Raises: + PodContainersRestartError: If any containers in LLMD pods have restarted + """ + LOGGER.info(f"Verifying that pods for LLMInferenceService {llm_service.name} have not restarted") + + restarted_containers = {} + + for pod in Pod.get( + dyn_client=client, + namespace=llm_service.namespace, + label_selector=f"{Pod.ApiGroup.APP_KUBERNETES_IO}/part-of=llminferenceservice,{Pod.ApiGroup.APP_KUBERNETES_IO}/name={llm_service.name}", + ): + labels = pod.instance.metadata.get("labels", {}) + if labels.get("kserve.io/component") == "workload": + LOGGER.debug(f"Checking pod {pod.name} for container restarts") + + if pod.instance.status.containerStatuses: + if _restarted_containers := [ + container.name for container in pod.instance.status.containerStatuses + if container.restartCount > 0 + ]: + restarted_containers[pod.name] = _restarted_containers + LOGGER.warning(f"Pod {pod.name} has restarted containers: {_restarted_containers}") + + if restarted_containers: + error_msg = f"LLMD inference containers restarted for {llm_service.name}: {restarted_containers}" + LOGGER.error(error_msg) + raise PodContainersRestartError(error_msg) + + LOGGER.info(f"All pods for LLMInferenceService {llm_service.name} have no container restarts") diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py index dc04bf09d..55a044a1f 100644 --- a/utilities/llmd_utils.py +++ b/utilities/llmd_utils.py @@ -39,10 +39,15 @@ def create_gateway( infrastructure: Optional[Dict[str, Any]] = None, wait_for_condition: bool = True, timeout: int = 300, - teardown: bool = True, + teardown: bool = False, ) -> Generator[Gateway, None, None]: """ Context manager to create and manage a Gateway resource using ocp_resources. + + This function implements smart gateway management: + - Only creates gateway if it doesn't already exist + - Reuses existing gateways to avoid conflicts + - Does not delete gateway in teardown (persistent gateway strategy) Args: client: Kubernetes dynamic client @@ -53,10 +58,10 @@ def create_gateway( infrastructure: Infrastructure configuration wait_for_condition: Whether to wait for the gateway to be programmed timeout: Timeout in seconds for waiting - teardown: Whether to clean up the resource + teardown: Whether to clean up the resource (default: False for persistent strategy) Yields: - Gateway: The created Gateway resource + Gateway: The Gateway resource (existing or newly created) """ if listeners is None: listeners = [ @@ -70,6 +75,8 @@ def create_gateway( if infrastructure is None: infrastructure = {"labels": {"serving.kserve.io/gateway": "kserve-ingress-gateway"}} + + # Check if gateway already exists try: existing_gateway = Gateway( client=client, @@ -78,10 +85,24 @@ def create_gateway( api_group="gateway.networking.k8s.io", ) if existing_gateway.exists: - LOGGER.info(f"Cleaning up existing Gateway {name} in namespace {namespace}") - existing_gateway.delete(wait=True, timeout=Timeout.TIMEOUT_2MIN) + LOGGER.info(f"Using existing Gateway {name} in namespace {namespace}") + + if wait_for_condition: + LOGGER.info(f"Waiting for existing Gateway {name} to be programmed...") + existing_gateway.wait_for_condition( + condition="Programmed", + status="True", + timeout=timeout, + ) + LOGGER.info(f"Existing Gateway {name} is programmed and ready") + + # Yield the existing gateway without teardown + yield existing_gateway + return except Exception as e: - LOGGER.debug(f"No existing Gateway to clean up: {e}") + LOGGER.debug(f"No existing Gateway found, will create new one: {e}") + + # Create new gateway only if it doesn't exist gateway_body = { "apiVersion": "gateway.networking.k8s.io/v1", "kind": "Gateway", @@ -96,6 +117,7 @@ def create_gateway( }, } + LOGGER.info(f"Creating new Gateway {name} in namespace {namespace}") with Gateway( client=client, teardown=teardown, From 9c72e80d441e19562a88422f05574fe200e1405e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Sep 2025 12:33:50 +0000 Subject: [PATCH 09/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../model_server/llmd/conftest.py | 4 ++-- .../model_server/llmd/test_llmd_oci_cpu.py | 2 +- .../model_server/llmd/test_llmd_s3.py | 2 +- .../model_serving/model_server/llmd/utils.py | 19 +++++++++---------- utilities/llmd_utils.py | 6 +++--- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py index 2807754e7..e8cf4637a 100644 --- a/tests/model_serving/model_server/llmd/conftest.py +++ b/tests/model_serving/model_server/llmd/conftest.py @@ -69,7 +69,7 @@ def llmd_gateway( ) -> Generator[Gateway, None, None]: """ Pytest fixture for LLMD Gateway management. - + Implements persistent gateway strategy: - Reuses existing gateways if available - Creates new gateway only if needed @@ -81,7 +81,7 @@ def llmd_gateway( else: gateway_class_name = request.param.get("gateway_class_name", "openshift-default") kwargs = {k: v for k, v in request.param.items() if k != "gateway_class_name"} - + with create_gateway( client=admin_client, namespace=gateway_namespace, diff --git a/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py index adc5e64c2..57481b29f 100644 --- a/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py +++ b/tests/model_serving/model_server/llmd/test_llmd_oci_cpu.py @@ -36,5 +36,5 @@ def test_llmd_oci(self, admin_client, llmd_gateway, llmd_inference_service): use_default_query=True, insecure=True, ) - + verify_llmd_pods_not_restarted(client=admin_client, llm_service=llmd_inference_service) diff --git a/tests/model_serving/model_server/llmd/test_llmd_s3.py b/tests/model_serving/model_server/llmd/test_llmd_s3.py index 0e2d2ecb2..ec09e3257 100644 --- a/tests/model_serving/model_server/llmd/test_llmd_s3.py +++ b/tests/model_serving/model_server/llmd/test_llmd_s3.py @@ -36,5 +36,5 @@ def test_llmd_s3(self, admin_client, llmd_gateway, llmd_inference_service_s3): use_default_query=True, insecure=True, ) - + verify_llmd_pods_not_restarted(client=admin_client, llm_service=llmd_inference_service_s3) diff --git a/tests/model_serving/model_server/llmd/utils.py b/tests/model_serving/model_server/llmd/utils.py index 8e9929c18..dbf5d49f6 100644 --- a/tests/model_serving/model_server/llmd/utils.py +++ b/tests/model_serving/model_server/llmd/utils.py @@ -68,20 +68,20 @@ def verify_llm_service_status(llm_service: LLMInferenceService) -> bool: def verify_llmd_pods_not_restarted(client: DynamicClient, llm_service: LLMInferenceService) -> None: """ Verify that LLMD inference pods containers have not restarted. - + This function checks for container restarts in pods related to the specific LLMInferenceService. - + Args: client (DynamicClient): DynamicClient instance llm_service (LLMInferenceService): The LLMInferenceService to check pods for - + Raises: PodContainersRestartError: If any containers in LLMD pods have restarted """ LOGGER.info(f"Verifying that pods for LLMInferenceService {llm_service.name} have not restarted") - + restarted_containers = {} - + for pod in Pod.get( dyn_client=client, namespace=llm_service.namespace, @@ -90,18 +90,17 @@ def verify_llmd_pods_not_restarted(client: DynamicClient, llm_service: LLMInfere labels = pod.instance.metadata.get("labels", {}) if labels.get("kserve.io/component") == "workload": LOGGER.debug(f"Checking pod {pod.name} for container restarts") - + if pod.instance.status.containerStatuses: if _restarted_containers := [ - container.name for container in pod.instance.status.containerStatuses - if container.restartCount > 0 + container.name for container in pod.instance.status.containerStatuses if container.restartCount > 0 ]: restarted_containers[pod.name] = _restarted_containers LOGGER.warning(f"Pod {pod.name} has restarted containers: {_restarted_containers}") - + if restarted_containers: error_msg = f"LLMD inference containers restarted for {llm_service.name}: {restarted_containers}" LOGGER.error(error_msg) raise PodContainersRestartError(error_msg) - + LOGGER.info(f"All pods for LLMInferenceService {llm_service.name} have no container restarts") diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py index 55a044a1f..19912a09a 100644 --- a/utilities/llmd_utils.py +++ b/utilities/llmd_utils.py @@ -43,7 +43,7 @@ def create_gateway( ) -> Generator[Gateway, None, None]: """ Context manager to create and manage a Gateway resource using ocp_resources. - + This function implements smart gateway management: - Only creates gateway if it doesn't already exist - Reuses existing gateways to avoid conflicts @@ -86,7 +86,7 @@ def create_gateway( ) if existing_gateway.exists: LOGGER.info(f"Using existing Gateway {name} in namespace {namespace}") - + if wait_for_condition: LOGGER.info(f"Waiting for existing Gateway {name} to be programmed...") existing_gateway.wait_for_condition( @@ -95,7 +95,7 @@ def create_gateway( timeout=timeout, ) LOGGER.info(f"Existing Gateway {name} is programmed and ready") - + # Yield the existing gateway without teardown yield existing_gateway return From 4f3035642e672a6e8694928ccbbe65e0e2d5d1ad Mon Sep 17 00:00:00 2001 From: Milind Waykole Date: Tue, 23 Sep 2025 18:10:56 +0530 Subject: [PATCH 10/12] fix gateway thing Signed-off-by: Milind Waykole --- tests/model_serving/model_server/llmd/conftest.py | 15 ++++++++------- utilities/llmd_utils.py | 13 +++++++------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py index e8cf4637a..ca465b1e0 100644 --- a/tests/model_serving/model_server/llmd/conftest.py +++ b/tests/model_serving/model_server/llmd/conftest.py @@ -11,7 +11,7 @@ from utilities.constants import Timeout from utilities.infra import s3_endpoint_secret -from utilities.llmd_utils import create_gateway, create_llmisvc +from utilities.llmd_utils import create_llmd_gateway, create_llmisvc from utilities.llmd_constants import ( DEFAULT_GATEWAY_NAMESPACE, VLLM_STORAGE_OCI, @@ -68,12 +68,13 @@ def llmd_gateway( gateway_namespace: str, ) -> Generator[Gateway, None, None]: """ - Pytest fixture for LLMD Gateway management. - - Implements persistent gateway strategy: - - Reuses existing gateways if available + Pytest fixture for LLMD Gateway management using create_llmd_gateway. + + Implements persistent LLMD gateway strategy: + - Reuses existing gateways if available - Creates new gateway only if needed - Does not delete gateway in teardown + - Uses LLMD-specific gateway configuration """ if isinstance(request.param, str): gateway_class_name = request.param @@ -81,8 +82,8 @@ def llmd_gateway( else: gateway_class_name = request.param.get("gateway_class_name", "openshift-default") kwargs = {k: v for k, v in request.param.items() if k != "gateway_class_name"} - - with create_gateway( + + with create_llmd_gateway( client=admin_client, namespace=gateway_namespace, gateway_class_name=gateway_class_name, diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py index 19912a09a..565205d31 100644 --- a/utilities/llmd_utils.py +++ b/utilities/llmd_utils.py @@ -30,7 +30,7 @@ @contextmanager -def create_gateway( +def create_llmd_gateway( client: DynamicClient, name: str = DEFAULT_GATEWAY_NAME, namespace: str = DEFAULT_GATEWAY_NAMESPACE, @@ -42,17 +42,18 @@ def create_gateway( teardown: bool = False, ) -> Generator[Gateway, None, None]: """ - Context manager to create and manage a Gateway resource using ocp_resources. - - This function implements smart gateway management: + Context manager to create and manage LLMD Gateway resources using ocp_resources. + + This function implements smart LLMD gateway management: - Only creates gateway if it doesn't already exist - Reuses existing gateways to avoid conflicts - Does not delete gateway in teardown (persistent gateway strategy) + - Specifically designed for LLMD (LLM Deployment) infrastructure Args: client: Kubernetes dynamic client - name: Gateway name - namespace: Gateway namespace + name: Gateway name (defaults to openshift-ai-inference) + namespace: Gateway namespace (defaults to openshift-ingress) gateway_class_name: The name of the GatewayClass resource listeners: List of listener configurations infrastructure: Infrastructure configuration From 76624a8aef2de6108772e5afd6f68f05ea8dd345 Mon Sep 17 00:00:00 2001 From: Milind Waykole Date: Tue, 23 Sep 2025 18:19:26 +0530 Subject: [PATCH 11/12] fix: break long line in utils.py to meet flake8 line length requirement --- tests/model_serving/model_server/llmd/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/model_serving/model_server/llmd/utils.py b/tests/model_serving/model_server/llmd/utils.py index dbf5d49f6..337872645 100644 --- a/tests/model_serving/model_server/llmd/utils.py +++ b/tests/model_serving/model_server/llmd/utils.py @@ -85,7 +85,10 @@ def verify_llmd_pods_not_restarted(client: DynamicClient, llm_service: LLMInfere for pod in Pod.get( dyn_client=client, namespace=llm_service.namespace, - label_selector=f"{Pod.ApiGroup.APP_KUBERNETES_IO}/part-of=llminferenceservice,{Pod.ApiGroup.APP_KUBERNETES_IO}/name={llm_service.name}", + label_selector=( + f"{Pod.ApiGroup.APP_KUBERNETES_IO}/part-of=llminferenceservice," + f"{Pod.ApiGroup.APP_KUBERNETES_IO}/name={llm_service.name}" + ), ): labels = pod.instance.metadata.get("labels", {}) if labels.get("kserve.io/component") == "workload": From e8bdd8dd512affcfbde6d15effa73b79939b4428 Mon Sep 17 00:00:00 2001 From: Milind Waykole Date: Tue, 23 Sep 2025 18:20:38 +0530 Subject: [PATCH 12/12] fix gateway thing Signed-off-by: Milind Waykole --- tests/model_serving/model_server/llmd/conftest.py | 6 +++--- utilities/llmd_utils.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/model_serving/model_server/llmd/conftest.py b/tests/model_serving/model_server/llmd/conftest.py index ca465b1e0..05ac74284 100644 --- a/tests/model_serving/model_server/llmd/conftest.py +++ b/tests/model_serving/model_server/llmd/conftest.py @@ -69,9 +69,9 @@ def llmd_gateway( ) -> Generator[Gateway, None, None]: """ Pytest fixture for LLMD Gateway management using create_llmd_gateway. - + Implements persistent LLMD gateway strategy: - - Reuses existing gateways if available + - Reuses existing gateways if available - Creates new gateway only if needed - Does not delete gateway in teardown - Uses LLMD-specific gateway configuration @@ -82,7 +82,7 @@ def llmd_gateway( else: gateway_class_name = request.param.get("gateway_class_name", "openshift-default") kwargs = {k: v for k, v in request.param.items() if k != "gateway_class_name"} - + with create_llmd_gateway( client=admin_client, namespace=gateway_namespace, diff --git a/utilities/llmd_utils.py b/utilities/llmd_utils.py index 565205d31..62db07bd7 100644 --- a/utilities/llmd_utils.py +++ b/utilities/llmd_utils.py @@ -43,7 +43,7 @@ def create_llmd_gateway( ) -> Generator[Gateway, None, None]: """ Context manager to create and manage LLMD Gateway resources using ocp_resources. - + This function implements smart LLMD gateway management: - Only creates gateway if it doesn't already exist - Reuses existing gateways to avoid conflicts