Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,22 @@ def pytest_addoption(parser: Parser) -> None:
action="store_true",
)

# KServe health check options
kserve_health_group = parser.getgroup(name="KServe Health")
kserve_health_group.addoption(
"--skip-kserve-health-check",
help="Skip KServe component health check",
action="store_true",
)

# LLMD health check options
llmd_health_group = parser.getgroup(name="LLMD Health")
llmd_health_group.addoption(
"--skip-llmd-health-check",
help="Skip LLMD infrastructure dependency health check",
action="store_true",
)

# HuggingFace options
hf_group.addoption("--hf-access-token", default=os.environ.get("HF_ACCESS_TOKEN"), help="HF access token")
# Model Registry options
Expand Down
84 changes: 84 additions & 0 deletions tests/model_serving/model_server/kserve/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import pytest
from kubernetes.dynamic import DynamicClient
from ocp_resources.data_science_cluster import DataScienceCluster
from ocp_resources.deployment import Deployment
from pytest_testconfig import config as py_config
from simple_logger.logger import get_logger

from utilities.constants import DscComponents

LOGGER = get_logger(name=__name__)

# Controller deployments (in the applications namespace) that must be
# Available for KServe model serving to function; checked by verify_kserve_health.
KSERVE_CONTROLLER_DEPLOYMENTS: list[str] = [
    "kserve-controller-manager",
    "odh-model-controller",
]


def _skip_unless_deployment_available(deployment: Deployment) -> None:
    """Skip the session unless *deployment* has an Available condition with status True.

    Args:
        deployment: An existing Deployment resource to inspect.
    """
    for condition in deployment.instance.status.get("conditions", []):
        if condition.type == "Available":
            if condition.status != "True":
                pytest.skip(f"KServe deployment {deployment.name} is not Available: {condition.get('reason')}")
            return
    pytest.skip(f"KServe deployment {deployment.name} has no Available condition")


def verify_kserve_health(admin_client: DynamicClient, dsc_resource: DataScienceCluster) -> None:
    """Verify that KServe components are healthy and ready to serve models.

    Checks management state, DSC ready condition, and controller deployment availability.
    Raises pytest.skip on any failure so downstream kserve tests are skipped.

    Args:
        admin_client: Admin dynamic client used to look up deployments.
        dsc_resource: DataScienceCluster whose spec/status gates KServe.
    """
    applications_namespace = py_config["applications_namespace"]

    # KServe must be explicitly Managed in the DSC spec.
    kserve_management_state = dsc_resource.instance.spec.components[DscComponents.KSERVE].managementState
    if kserve_management_state != DscComponents.ManagementState.MANAGED:
        pytest.skip(f"KServe managementState is {kserve_management_state}, expected Managed")

    # for/else: the else branch fires only when no matching condition type was found.
    for condition in dsc_resource.instance.status.conditions:
        if condition.type == DscComponents.COMPONENT_MAPPING[DscComponents.KSERVE]:
            if condition.status != "True":
                pytest.skip(f"KServe DSC condition is not ready: {condition.status}, reason: {condition.get('reason')}")
            break
    else:
        pytest.skip("KserveReady condition not found in DSC status")

    # Every controller deployment must exist and be Available.
    for name in KSERVE_CONTROLLER_DEPLOYMENTS:
        deployment = Deployment(
            client=admin_client,
            name=name,
            namespace=applications_namespace,
        )
        if not deployment.exists:
            pytest.skip(f"KServe deployment {name} not found in {applications_namespace}")
        _skip_unless_deployment_available(deployment=deployment)

    LOGGER.info("KServe component health check passed")


@pytest.fixture(scope="session", autouse=True)
def kserve_health_check(
    request: pytest.FixtureRequest,
    admin_client: DynamicClient,
    dsc_resource: DataScienceCluster,
) -> None:
    """Session-scoped health gate for all kserve tests.

    Skips all tests under tests/model_serving/model_server/kserve/ when
    KServe components are not healthy.
    """
    session = request.session

    # Explicit opt-out via the CLI flag.
    if session.config.getoption("--skip-kserve-health-check"):
        LOGGER.warning("Skipping KServe health check, got --skip-kserve-health-check")
        return

    # Component-health tests themselves must not be gated by this fixture.
    for item in session.items:
        if item.get_closest_marker("component_health") is not None:
            LOGGER.info("Skipping KServe health gate because selected tests include component_health marker")
            return

    verify_kserve_health(admin_client=admin_client, dsc_resource=dsc_resource)
142 changes: 142 additions & 0 deletions tests/model_serving/model_server/llmd/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,166 @@
import yaml
from _pytest.fixtures import FixtureRequest
from kubernetes.dynamic import DynamicClient
from ocp_resources.cluster_service_version import ClusterServiceVersion
from ocp_resources.config_map import ConfigMap
from ocp_resources.deployment import Deployment
from ocp_resources.gateway import Gateway
from ocp_resources.llm_inference_service import LLMInferenceService
from ocp_resources.namespace import Namespace
from ocp_resources.resource import Resource
from ocp_resources.role import Role
from ocp_resources.role_binding import RoleBinding
from ocp_resources.service_account import ServiceAccount
from pytest_testconfig import config as py_config

from tests.model_serving.model_server.llmd.llmd_configs import TinyLlamaOciConfig
from tests.model_serving.model_server.llmd.utils import wait_for_llmisvc, wait_for_llmisvc_pods_ready
from utilities.constants import Timeout
from utilities.infra import create_inference_token, s3_endpoint_secret, update_configmap_data
from utilities.llmd_utils import create_llmd_gateway
from utilities.logger import RedactedString
from utilities.resources.kuadrant import Kuadrant
from utilities.resources.leader_worker_set_operator import LeaderWorkerSetOperator

LOGGER = structlog.get_logger(name=__name__)
logging.getLogger("timeout_sampler").setLevel(logging.WARNING)

# DSC status condition type that gates LLMInferenceService support.
LLMD_DSC_CONDITION: str = "KserveLLMInferenceServiceDependencies"

# Operator CSV name prefix -> namespace where the CSV must be Succeeded.
LLMD_REQUIRED_OPERATORS: dict[str, str] = {
    "cert-manager-operator": "cert-manager-operator",
    "authorino-operator": "openshift-operators",
    "rhcl-operator": "openshift-operators",
}

# Dependency deployment name -> namespace; each must exist and be Available.
LLMD_REQUIRED_DEPLOYMENTS: dict[str, str] = {
    "cert-manager-operator-controller-manager": "cert-manager-operator",
    "cert-manager": "cert-manager",
    "cert-manager-webhook": "cert-manager",
    "authorino-operator": "openshift-operators",
    "kuadrant-operator-controller-manager": "openshift-operators",
}

# Same KServe stack as tests/model_serving/model_server/kserve/conftest.py plus LLM-ISVC controller.
LLMD_KSERVE_CONTROLLER_DEPLOYMENTS: list[str] = [
    "kserve-controller-manager",
    "odh-model-controller",
    "llmisvc-controller-manager",
]


def _verify_operator_csv(admin_client: DynamicClient, csv_prefix: str, namespace: str) -> None:
    """Xfail unless a Succeeded CSV whose name starts with *csv_prefix* exists in *namespace*."""
    csv_candidates = ClusterServiceVersion.get(client=admin_client, namespace=namespace)
    # any() short-circuits on the first healthy CSV, same as the early-return loop it replaces.
    operator_healthy = any(
        csv_resource.name.startswith(csv_prefix) and csv_resource.status == csv_resource.Status.SUCCEEDED
        for csv_resource in csv_candidates
    )
    if not operator_healthy:
        pytest.xfail(f"Operator CSV {csv_prefix} not found or not Succeeded in {namespace}")


def _xfail_unless_available(status, not_available_msg: str, missing_condition_msg: str) -> None:
    """Xfail the session unless *status* carries an Available condition whose status is True.

    Args:
        status: A resource's ``instance.status`` object supporting ``.get("conditions", [])``.
        not_available_msg: Message prefix used when the condition exists but is not True;
            the condition reason is appended.
        missing_condition_msg: Message used when no Available condition is present.
    """
    for condition in status.get("conditions", []):
        if condition.type == "Available":
            if condition.status != "True":
                pytest.xfail(f"{not_available_msg}: {condition.get('reason')}")
            return
    pytest.xfail(missing_condition_msg)


def verify_llmd_health(admin_client: DynamicClient, dsc_resource: Resource) -> None:
    """Verify LLMD infrastructure dependencies are healthy.

    Checks DSC condition, required operator CSVs, dependency and KServe controller
    deployments, optional LeaderWorkerSetOperator CR (LWS is optional),
    and Kuadrant CR. Xfails on the first unhealthy dependency found.

    Args:
        admin_client: Admin dynamic client used for all resource lookups.
        dsc_resource: DataScienceCluster whose status gates LLMD support.
    """
    # 1. DSC condition for LLMD dependencies
    # for/else: the else branch fires only when no matching condition type was found.
    for condition in dsc_resource.instance.status.conditions:
        if condition.type == LLMD_DSC_CONDITION:
            if condition.status != "True":
                pytest.xfail(
                    f"{LLMD_DSC_CONDITION} is not ready: {condition.status}, reason: {condition.get('reason')}"
                )
            break
    else:
        pytest.xfail(f"{LLMD_DSC_CONDITION} condition not found in DSC status")

    # 2. Operator CSVs
    for csv_prefix, namespace in LLMD_REQUIRED_OPERATORS.items():
        _verify_operator_csv(admin_client=admin_client, csv_prefix=csv_prefix, namespace=namespace)

    # 3. Controller deployments (dependency operators, then the KServe/LLMD stack)
    for name, namespace in LLMD_REQUIRED_DEPLOYMENTS.items():
        deployment = Deployment(client=admin_client, name=name, namespace=namespace)
        if not deployment.exists:
            pytest.xfail(f"LLMD dependency deployment {name} not found in {namespace}")
        _xfail_unless_available(
            status=deployment.instance.status,
            not_available_msg=f"Deployment {name} in {namespace} is not Available",
            missing_condition_msg=f"Deployment {name} in {namespace} has no Available condition",
        )

    applications_namespace = py_config["applications_namespace"]
    for name in LLMD_KSERVE_CONTROLLER_DEPLOYMENTS:
        deployment = Deployment(client=admin_client, name=name, namespace=applications_namespace)
        if not deployment.exists:
            pytest.xfail(f"KServe/LLMD controller deployment {name} not found in {applications_namespace}")
        _xfail_unless_available(
            status=deployment.instance.status,
            not_available_msg=f"Deployment {name} in {applications_namespace} is not Available",
            missing_condition_msg=f"Deployment {name} in {applications_namespace} has no Available condition",
        )

    # 4. LeaderWorkerSetOperator CR (optional)
    lws_operator = LeaderWorkerSetOperator(client=admin_client, name="cluster")
    if lws_operator.exists:
        _xfail_unless_available(
            status=lws_operator.instance.status,
            not_available_msg="LeaderWorkerSetOperator is not Available",
            missing_condition_msg="LeaderWorkerSetOperator has no Available condition",
        )
    else:
        LOGGER.warning("LeaderWorkerSetOperator cluster CR not found; LWS is optional for LLMD (RHOAIENG-52057)")

    # 5. Kuadrant CR
    kuadrant = Kuadrant(client=admin_client, name="kuadrant", namespace="kuadrant-system")
    if not kuadrant.exists:
        pytest.xfail("Kuadrant 'kuadrant' CR not found")

    LOGGER.info("LLMD component health check passed")


@pytest.fixture(scope="session", autouse=True)
def llmd_health_check(
    request: pytest.FixtureRequest,
    admin_client: DynamicClient,
    dsc_resource: Resource,
) -> None:
    """Session-scoped health gate for all LLMD tests.

    Marks LLMD tests as xfail when required infrastructure dependencies are unhealthy
    (see verify_llmd_health). Use --skip-llmd-health-check to disable.
    """
    session = request.session

    # Explicit opt-out via the CLI flag.
    if session.config.getoption("--skip-llmd-health-check"):
        LOGGER.warning("Skipping LLMD health check, got --skip-llmd-health-check")
        return

    # Component-health tests themselves must not be gated by this fixture.
    for item in session.items:
        if item.get_closest_marker("component_health") is not None:
            LOGGER.info("Skipping LLMD health gate because selected tests include component_health marker")
            return

    verify_llmd_health(admin_client=admin_client, dsc_resource=dsc_resource)


AuthEntry = namedtuple(typename="AuthEntry", field_names=["service", "token"])


Expand Down
22 changes: 22 additions & 0 deletions utilities/resources/kuadrant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""Kuadrant custom resource for Kuadrant API management."""

from typing import Any

from ocp_resources.resource import NamespacedResource

from utilities.constants import ApiGroups


class Kuadrant(NamespacedResource):
    """Kuadrant is the Schema for the kuadrants API.

    Namespaced kuadrant.io custom resource; all construction arguments are
    handled by NamespacedResource, so no __init__ override is needed.
    """

    api_group: str = ApiGroups.KUADRANT_IO

    def to_dict(self) -> None:
        """Build ``self.res``; default to an empty spec when no kind_dict or yaml_file is given."""
        super().to_dict()

        if not self.kind_dict and not self.yaml_file:
            self.res["spec"] = {}
20 changes: 20 additions & 0 deletions utilities/resources/leader_worker_set_operator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""LeaderWorkerSetOperator custom resource for OpenShift LWS operator."""

from typing import Any

from ocp_resources.resource import Resource


class LeaderWorkerSetOperator(Resource):
    """LeaderWorkerSetOperator is the Schema for the leaderworkersetoperators API.

    Cluster-scoped operator.openshift.io custom resource for the OpenShift
    LeaderWorkerSet (LWS) operator; all construction arguments are handled by
    Resource, so no __init__ override is needed.
    """

    api_group: str = "operator.openshift.io"

    def to_dict(self) -> None:
        """Build ``self.res``; default to an empty spec when no kind_dict or yaml_file is given."""
        super().to_dict()

        if not self.kind_dict and not self.yaml_file:
            self.res["spec"] = {}
Loading