Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,22 @@ def pytest_addoption(parser: Parser) -> None:
action="store_true",
)

# KServe health check options
kserve_health_group = parser.getgroup(name="KServe Health")
kserve_health_group.addoption(
"--skip-kserve-health-check",
help="Skip KServe component health check",
action="store_true",
)

# LLMD health check options
llmd_health_group = parser.getgroup(name="LLMD Health")
llmd_health_group.addoption(
"--skip-llmd-health-check",
help="Skip LLMD infrastructure dependency health check",
action="store_true",
)

# HuggingFace options
hf_group.addoption("--hf-access-token", default=os.environ.get("HF_ACCESS_TOKEN"), help="HF access token")
# Model Registry options
Expand Down
84 changes: 84 additions & 0 deletions tests/model_serving/model_server/kserve/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import pytest
from kubernetes.dynamic import DynamicClient
from ocp_resources.data_science_cluster import DataScienceCluster
from ocp_resources.deployment import Deployment
from pytest_testconfig import config as py_config
from simple_logger.logger import get_logger

from utilities.constants import DscComponents

LOGGER = get_logger(name=__name__)

# Names of the deployments whose "Available" condition is checked by the
# KServe health gate; each is looked up in the applications namespace.
KSERVE_CONTROLLER_DEPLOYMENTS: list[str] = ["kserve-controller-manager", "odh-model-controller"]


def verify_kserve_health(admin_client: DynamicClient, dsc_resource: DataScienceCluster) -> None:
    """Verify that KServe components are healthy and ready to serve models.

    Checks, in order:

    1. The KServe ``managementState`` in the DSC spec is ``Managed``.
    2. The KServe ready condition in the DSC status has status ``"True"``.
    3. Every deployment in ``KSERVE_CONTROLLER_DEPLOYMENTS`` exists in the
       applications namespace and reports an ``Available`` condition with
       status ``"True"``.

    Args:
        admin_client: Client used to look up the controller deployments.
        dsc_resource: DataScienceCluster whose spec and status are inspected.

    Raises:
        Calls ``pytest.skip`` on the first failed check so downstream kserve
        tests are skipped. A DSC or deployment that has no status/conditions
        yet is treated as "condition not found" rather than crashing.
    """
    applications_namespace = py_config["applications_namespace"]
    dsc_instance = dsc_resource.instance

    # Guard against a DSC that has no kserve component entry at all; it is
    # reported as "managementState is None" instead of an AttributeError.
    kserve_component = dsc_instance.spec.components[DscComponents.KSERVE]
    kserve_management_state = kserve_component.managementState if kserve_component is not None else None
    if kserve_management_state != DscComponents.ManagementState.MANAGED:
        pytest.skip(f"KServe managementState is {kserve_management_state}, expected Managed")

    # status / conditions can be absent on a freshly created DSC; fall back
    # to an empty list so the "not found" skip below is reached.
    dsc_status = dsc_instance.status
    dsc_conditions = (dsc_status.conditions if dsc_status is not None else None) or []
    ready_condition_type = DscComponents.COMPONENT_MAPPING[DscComponents.KSERVE]
    kserve_condition = next(
        (condition for condition in dsc_conditions if condition.type == ready_condition_type),
        None,
    )
    if kserve_condition is None:
        pytest.skip("KserveReady condition not found in DSC status")
    if kserve_condition.status != "True":
        pytest.skip(
            f"KServe DSC condition is not ready: {kserve_condition.status}, reason: {kserve_condition.get('reason')}"
        )

    for name in KSERVE_CONTROLLER_DEPLOYMENTS:
        deployment = Deployment(
            client=admin_client,
            name=name,
            namespace=applications_namespace,
        )
        if not deployment.exists:
            pytest.skip(f"KServe deployment {name} not found in {applications_namespace}")

        # Same defensive handling as for the DSC: no status or a null
        # conditions list means "no Available condition".
        deployment_status = deployment.instance.status
        deployment_conditions = (deployment_status.get("conditions") if deployment_status is not None else None) or []
        available_condition = next(
            (condition for condition in deployment_conditions if condition.type == "Available"),
            None,
        )
        if available_condition is None:
            pytest.skip(f"KServe deployment {name} has no Available condition")
        if available_condition.status != "True":
            pytest.skip(f"KServe deployment {name} is not Available: {available_condition.get('reason')}")

    LOGGER.info("KServe component health check passed")


@pytest.fixture(scope="session", autouse=True)
def kserve_health_check(
    request: pytest.FixtureRequest,
    admin_client: DynamicClient,
    dsc_resource: DataScienceCluster,
) -> None:
    """Session-scoped, autouse health gate for the kserve test directory.

    Runs ``verify_kserve_health`` once per session unless one of the bypass
    conditions below applies:

    * ``--skip-kserve-health-check`` was passed on the command line.
    * Any item collected for this session carries the ``component_health``
      marker. The check spans all session items, so a single marked test
      disables the gate for the whole run.
    """
    session = request.session

    if session.config.getoption("--skip-kserve-health-check"):
        LOGGER.warning("Skipping KServe health check, got --skip-kserve-health-check")
        return

    health_tests_selected = any(
        mark.name == "component_health" for item in session.items for mark in item.iter_markers()
    )
    if health_tests_selected:
        LOGGER.info("Skipping KServe health gate because selected tests include component_health marker")
        return

    verify_kserve_health(admin_client=admin_client, dsc_resource=dsc_resource)
115 changes: 115 additions & 0 deletions tests/model_serving/model_server/llmd/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@
import yaml
from _pytest.fixtures import FixtureRequest
from kubernetes.dynamic import DynamicClient
from ocp_resources.cluster_service_version import ClusterServiceVersion
from ocp_resources.config_map import ConfigMap
from ocp_resources.deployment import Deployment
from ocp_resources.gateway import Gateway
from ocp_resources.llm_inference_service import LLMInferenceService
from ocp_resources.namespace import Namespace
from ocp_resources.resource import Resource
from ocp_resources.role import Role
from ocp_resources.role_binding import RoleBinding
from ocp_resources.service_account import ServiceAccount
Expand All @@ -24,10 +27,122 @@
from utilities.llmd_constants import LLMDGateway
from utilities.llmd_utils import create_llmd_gateway
from utilities.logger import RedactedString
from utilities.resources.kuadrant import Kuadrant
from utilities.resources.leader_worker_set_operator import LeaderWorkerSetOperator

LOGGER = structlog.get_logger(name=__name__)
logging.getLogger("timeout_sampler").setLevel(logging.WARNING)

# DSC status condition type that reports readiness of the LLMD dependencies.
LLMD_DSC_CONDITION: str = "KserveLLMInferenceServiceDependencies"

# Operator CSV name prefix -> namespace in which that CSV is looked up.
LLMD_REQUIRED_OPERATORS: dict[str, str] = {
    "cert-manager-operator": "cert-manager-operator",
    "authorino-operator": "openshift-operators",
    "rhcl-operator": "openshift-operators",
}

# Deployment name -> namespace; each must exist and report Available.
LLMD_REQUIRED_DEPLOYMENTS: dict[str, str] = {
    # cert-manager
    "cert-manager-operator-controller-manager": "cert-manager-operator",
    "cert-manager": "cert-manager",
    "cert-manager-webhook": "cert-manager",
    # operators installed into openshift-operators
    "authorino-operator": "openshift-operators",
    "kuadrant-operator-controller-manager": "openshift-operators",
    # leader worker set
    "lws-controller-manager": "openshift-lws-operator",
}


def _verify_operator_csv(admin_client: DynamicClient, csv_prefix: str, namespace: str) -> None:
    """xfail unless a Succeeded CSV whose name starts with ``csv_prefix`` exists in ``namespace``.

    Args:
        admin_client: Client used to list ClusterServiceVersions.
        csv_prefix: Leading part of the CSV name to match.
        namespace: Namespace in which CSVs are listed.
    """
    operator_ready = any(
        candidate.name.startswith(csv_prefix) and candidate.status == candidate.Status.SUCCEEDED
        for candidate in ClusterServiceVersion.get(client=admin_client, namespace=namespace)
    )
    if not operator_ready:
        pytest.xfail(f"Operator CSV {csv_prefix} not found or not Succeeded in {namespace}")


def _xfail_unless_available(resource: Resource, description: str) -> None:
    """xfail unless ``resource`` reports an ``Available`` condition with status ``"True"``.

    Args:
        resource: Existing resource whose ``status.conditions`` is inspected.
        description: Human-readable subject used as the prefix of the xfail message.
    """
    # A resource without status, or with a null conditions list, is treated
    # as having no Available condition instead of raising.
    status = resource.instance.status
    conditions = (status.get("conditions") if status is not None else None) or []
    available_condition = next(
        (condition for condition in conditions if condition.type == "Available"),
        None,
    )
    if available_condition is None:
        pytest.xfail(f"{description} has no Available condition")
    if available_condition.status != "True":
        pytest.xfail(f"{description} is not Available: {available_condition.get('reason')}")


def verify_llmd_health(admin_client: DynamicClient, dsc_resource: Resource) -> None:
    """Verify LLMD infrastructure dependencies are healthy.

    Checks, in order:

    1. The ``LLMD_DSC_CONDITION`` condition in the DSC status is ``"True"``.
    2. Every operator in ``LLMD_REQUIRED_OPERATORS`` has a Succeeded CSV.
    3. Every deployment in ``LLMD_REQUIRED_DEPLOYMENTS`` exists and is Available.
    4. The ``cluster`` LeaderWorkerSetOperator CR exists and is Available.
    5. The ``kuadrant`` Kuadrant CR exists in ``kuadrant-system`` (existence
       only; its conditions are not inspected).

    Args:
        admin_client: Client used for all resource lookups.
        dsc_resource: DSC resource whose status conditions are inspected.

    Raises:
        Calls ``pytest.xfail`` on the first failed check.
    """
    # 1. DSC condition for LLMD dependencies
    # status / conditions may be absent; fall back to an empty list so the
    # "not found" xfail is reported rather than an AttributeError/TypeError.
    dsc_status = dsc_resource.instance.status
    dsc_conditions = (dsc_status.conditions if dsc_status is not None else None) or []
    llmd_condition = next(
        (condition for condition in dsc_conditions if condition.type == LLMD_DSC_CONDITION),
        None,
    )
    if llmd_condition is None:
        pytest.xfail(f"{LLMD_DSC_CONDITION} condition not found in DSC status")
    if llmd_condition.status != "True":
        pytest.xfail(
            f"{LLMD_DSC_CONDITION} is not ready: {llmd_condition.status}, reason: {llmd_condition.get('reason')}"
        )

    # 2. Operator CSVs
    for csv_prefix, namespace in LLMD_REQUIRED_OPERATORS.items():
        _verify_operator_csv(admin_client=admin_client, csv_prefix=csv_prefix, namespace=namespace)

    # 3. Controller deployments
    for name, namespace in LLMD_REQUIRED_DEPLOYMENTS.items():
        deployment = Deployment(client=admin_client, name=name, namespace=namespace)
        if not deployment.exists:
            pytest.xfail(f"LLMD dependency deployment {name} not found in {namespace}")
        _xfail_unless_available(resource=deployment, description=f"Deployment {name} in {namespace}")

    # 4. LeaderWorkerSetOperator CR
    lws_operator = LeaderWorkerSetOperator(client=admin_client, name="cluster")
    if not lws_operator.exists:
        pytest.xfail("LeaderWorkerSetOperator 'cluster' CR not found")
    _xfail_unless_available(resource=lws_operator, description="LeaderWorkerSetOperator")

    # 5. Kuadrant CR
    kuadrant = Kuadrant(client=admin_client, name="kuadrant", namespace="kuadrant-system")
    if not kuadrant.exists:
        pytest.xfail("Kuadrant 'kuadrant' CR not found")

    LOGGER.info("LLMD component health check passed")


@pytest.fixture(scope="session", autouse=True)
def llmd_health_check(
    request: pytest.FixtureRequest,
    admin_client: DynamicClient,
    dsc_resource: Resource,
) -> None:
    """Session-scoped, autouse health gate for the LLMD test directory.

    Runs ``verify_llmd_health`` once per session (which reports unhealthy
    dependencies through ``pytest.xfail``) unless one of the bypass
    conditions below applies:

    * ``--skip-llmd-health-check`` was passed on the command line.
    * Any item collected for this session carries the ``component_health``
      marker. The check spans all session items, so a single marked test
      disables the gate for the whole run.
    """
    session = request.session

    if session.config.getoption("--skip-llmd-health-check"):
        LOGGER.warning("Skipping LLMD health check, got --skip-llmd-health-check")
        return

    health_tests_selected = any(
        mark.name == "component_health" for item in session.items for mark in item.iter_markers()
    )
    if health_tests_selected:
        LOGGER.info("Skipping LLMD health gate because selected tests include component_health marker")
        return

    verify_llmd_health(admin_client=admin_client, dsc_resource=dsc_resource)


# Lightweight record with two fields, in order: service, token.
AuthEntry = namedtuple(
    typename="AuthEntry",
    field_names=("service", "token"),
)


Expand Down
22 changes: 22 additions & 0 deletions utilities/resources/kuadrant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""Kuadrant custom resource for Kuadrant API management."""

from typing import Any

from ocp_resources.resource import NamespacedResource

from utilities.constants import ApiGroups


class Kuadrant(NamespacedResource):
    """Namespaced wrapper for the Kuadrant custom resource (kuadrants API)."""

    # API group under which the Kuadrant kind is registered.
    api_group: str = ApiGroups.KUADRANT_IO

    def __init__(self, **kwargs: Any) -> None:
        """Forward all keyword arguments unchanged to ``NamespacedResource``."""
        super().__init__(**kwargs)

    def to_dict(self) -> None:
        """Build the resource dict and default ``spec`` to an empty mapping.

        The empty ``spec`` is only set when the resource was not created
        from ``kind_dict`` or ``yaml_file``.
        """
        super().to_dict()

        if self.kind_dict or self.yaml_file:
            return

        self.res["spec"] = {}
20 changes: 20 additions & 0 deletions utilities/resources/leader_worker_set_operator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""LeaderWorkerSetOperator custom resource for OpenShift LWS operator."""

from typing import Any

from ocp_resources.resource import Resource


class LeaderWorkerSetOperator(Resource):
    """Wrapper for the LeaderWorkerSetOperator custom resource (leaderworkersetoperators API)."""

    # API group under which the LeaderWorkerSetOperator kind is registered.
    api_group: str = "operator.openshift.io"

    def __init__(self, **kwargs: Any) -> None:
        """Forward all keyword arguments unchanged to ``Resource``."""
        super().__init__(**kwargs)

    def to_dict(self) -> None:
        """Build the resource dict and default ``spec`` to an empty mapping.

        The empty ``spec`` is only set when the resource was not created
        from ``kind_dict`` or ``yaml_file``.
        """
        super().to_dict()

        if self.kind_dict or self.yaml_file:
            return

        self.res["spec"] = {}