Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,22 @@ def pytest_addoption(parser: Parser) -> None:
action="store_true",
)

# KServe health check options
kserve_health_group = parser.getgroup(name="KServe Health")
kserve_health_group.addoption(
"--skip-kserve-health-check",
help="Skip KServe component health check",
action="store_true",
)

# LLMD health check options
llmd_health_group = parser.getgroup(name="LLMD Health")
llmd_health_group.addoption(
"--skip-llmd-health-check",
help="Skip LLMD infrastructure dependency health check",
action="store_true",
)

# HuggingFace options
hf_group.addoption("--hf-access-token", default=os.environ.get("HF_ACCESS_TOKEN"), help="HF access token")
# Model Registry options
Expand Down
84 changes: 84 additions & 0 deletions tests/model_serving/model_server/kserve/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import pytest
from kubernetes.dynamic import DynamicClient
from ocp_resources.data_science_cluster import DataScienceCluster
from ocp_resources.deployment import Deployment
from pytest_testconfig import config as py_config
from simple_logger.logger import get_logger

from utilities.constants import DscComponents

LOGGER = get_logger(name=__name__)

# Controller deployments (in the applications namespace) that must be
# Available for KServe model serving to function; checked by verify_kserve_health.
KSERVE_CONTROLLER_DEPLOYMENTS: list[str] = [
    "kserve-controller-manager",
    "odh-model-controller",
]


def _skip_unless_deployment_available(deployment: Deployment) -> None:
    """Skip the session unless *deployment* has an Available condition with status True.

    Args:
        deployment: An existing Deployment resource to inspect.
    """
    for condition in deployment.instance.status.get("conditions", []):
        if condition.type == "Available":
            if condition.status != "True":
                pytest.skip(f"KServe deployment {deployment.name} is not Available: {condition.get('reason')}")
            return
    pytest.skip(f"KServe deployment {deployment.name} has no Available condition")


def verify_kserve_health(admin_client: DynamicClient, dsc_resource: DataScienceCluster) -> None:
    """Verify that KServe components are healthy and ready to serve models.

    Checks management state, DSC ready condition, and controller deployment availability.
    Raises pytest.skip on any failure so downstream kserve tests are skipped.

    Args:
        admin_client: Admin dynamic client used to look up deployments.
        dsc_resource: DataScienceCluster whose spec/status gates KServe.
    """
    applications_namespace = py_config["applications_namespace"]

    # KServe must be explicitly Managed in the DSC spec.
    kserve_management_state = dsc_resource.instance.spec.components[DscComponents.KSERVE].managementState
    if kserve_management_state != DscComponents.ManagementState.MANAGED:
        pytest.skip(f"KServe managementState is {kserve_management_state}, expected Managed")

    # for/else: the else branch fires only when no matching condition type was found.
    for condition in dsc_resource.instance.status.conditions:
        if condition.type == DscComponents.COMPONENT_MAPPING[DscComponents.KSERVE]:
            if condition.status != "True":
                pytest.skip(f"KServe DSC condition is not ready: {condition.status}, reason: {condition.get('reason')}")
            break
    else:
        pytest.skip("KserveReady condition not found in DSC status")

    # Every controller deployment must exist and be Available.
    for name in KSERVE_CONTROLLER_DEPLOYMENTS:
        deployment = Deployment(
            client=admin_client,
            name=name,
            namespace=applications_namespace,
        )
        if not deployment.exists:
            pytest.skip(f"KServe deployment {name} not found in {applications_namespace}")
        _skip_unless_deployment_available(deployment=deployment)

    LOGGER.info("KServe component health check passed")


@pytest.fixture(scope="session", autouse=True)
def kserve_health_check(
    request: pytest.FixtureRequest,
    admin_client: DynamicClient,
    dsc_resource: DataScienceCluster,
) -> None:
    """Session-scoped health gate for all kserve tests.

    Skips all tests under tests/model_serving/model_server/kserve/ when
    KServe components are not healthy.
    """
    session = request.session

    # Explicit opt-out via the CLI flag.
    if session.config.getoption("--skip-kserve-health-check"):
        LOGGER.warning("Skipping KServe health check, got --skip-kserve-health-check")
        return

    # Component-health tests themselves must not be gated by this fixture.
    for item in session.items:
        if item.get_closest_marker("component_health") is not None:
            LOGGER.info("Skipping KServe health gate because selected tests include component_health marker")
            return

    verify_kserve_health(admin_client=admin_client, dsc_resource=dsc_resource)
142 changes: 142 additions & 0 deletions tests/model_serving/model_server/llmd/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,166 @@
import yaml
from _pytest.fixtures import FixtureRequest
from kubernetes.dynamic import DynamicClient
from ocp_resources.cluster_service_version import ClusterServiceVersion
from ocp_resources.config_map import ConfigMap
from ocp_resources.deployment import Deployment
from ocp_resources.gateway import Gateway
from ocp_resources.llm_inference_service import LLMInferenceService
from ocp_resources.namespace import Namespace
from ocp_resources.resource import Resource
from ocp_resources.role import Role
from ocp_resources.role_binding import RoleBinding
from ocp_resources.service_account import ServiceAccount
from pytest_testconfig import config as py_config

from tests.model_serving.model_server.llmd.llmd_configs import TinyLlamaOciConfig
from tests.model_serving.model_server.llmd.utils import wait_for_llmisvc, wait_for_llmisvc_pods_ready
from utilities.constants import Timeout
from utilities.infra import create_inference_token, s3_endpoint_secret, update_configmap_data
from utilities.llmd_utils import create_llmd_gateway
from utilities.logger import RedactedString
from utilities.resources.kuadrant import Kuadrant
from utilities.resources.leader_worker_set_operator import LeaderWorkerSetOperator

LOGGER = structlog.get_logger(name=__name__)
logging.getLogger("timeout_sampler").setLevel(logging.WARNING)

# DSC status condition type that gates LLMInferenceService support.
LLMD_DSC_CONDITION: str = "KserveLLMInferenceServiceDependencies"

# Operator CSV name prefix -> namespace where the CSV must be Succeeded.
LLMD_REQUIRED_OPERATORS: dict[str, str] = {
    "cert-manager-operator": "cert-manager-operator",
    "authorino-operator": "openshift-operators",
    "rhcl-operator": "openshift-operators",
}

# Dependency deployment name -> namespace; each must exist and be Available.
LLMD_REQUIRED_DEPLOYMENTS: dict[str, str] = {
    "cert-manager-operator-controller-manager": "cert-manager-operator",
    "cert-manager": "cert-manager",
    "cert-manager-webhook": "cert-manager",
    "authorino-operator": "openshift-operators",
    "kuadrant-operator-controller-manager": "openshift-operators",
}

# Same KServe stack as tests/model_serving/model_server/kserve/conftest.py plus LLM-ISVC controller.
LLMD_KSERVE_CONTROLLER_DEPLOYMENTS: list[str] = [
    "kserve-controller-manager",
    "odh-model-controller",
    "llmisvc-controller-manager",
]


def _verify_operator_csv(admin_client: DynamicClient, csv_prefix: str, namespace: str) -> None:
    """Xfail unless a Succeeded CSV whose name starts with *csv_prefix* exists in *namespace*."""
    csv_candidates = ClusterServiceVersion.get(client=admin_client, namespace=namespace)
    # any() short-circuits on the first healthy CSV, same as the early-return loop it replaces.
    operator_healthy = any(
        csv_resource.name.startswith(csv_prefix) and csv_resource.status == csv_resource.Status.SUCCEEDED
        for csv_resource in csv_candidates
    )
    if not operator_healthy:
        pytest.xfail(f"Operator CSV {csv_prefix} not found or not Succeeded in {namespace}")


def _xfail_unless_available(status, not_available_msg: str, missing_condition_msg: str) -> None:
    """Xfail the session unless *status* carries an Available condition whose status is True.

    Args:
        status: A resource's ``instance.status`` object supporting ``.get("conditions", [])``.
        not_available_msg: Message prefix used when the condition exists but is not True;
            the condition reason is appended.
        missing_condition_msg: Message used when no Available condition is present.
    """
    for condition in status.get("conditions", []):
        if condition.type == "Available":
            if condition.status != "True":
                pytest.xfail(f"{not_available_msg}: {condition.get('reason')}")
            return
    pytest.xfail(missing_condition_msg)


def verify_llmd_health(admin_client: DynamicClient, dsc_resource: Resource) -> None:
    """Verify LLMD infrastructure dependencies are healthy.

    Checks DSC condition, required operator CSVs, dependency and KServe controller
    deployments, optional LeaderWorkerSetOperator CR (LWS is optional),
    and Kuadrant CR. Xfails on the first unhealthy dependency found.

    Args:
        admin_client: Admin dynamic client used for all resource lookups.
        dsc_resource: DataScienceCluster whose status gates LLMD support.
    """
    # 1. DSC condition for LLMD dependencies
    # for/else: the else branch fires only when no matching condition type was found.
    for condition in dsc_resource.instance.status.conditions:
        if condition.type == LLMD_DSC_CONDITION:
            if condition.status != "True":
                pytest.xfail(
                    f"{LLMD_DSC_CONDITION} is not ready: {condition.status}, reason: {condition.get('reason')}"
                )
            break
    else:
        pytest.xfail(f"{LLMD_DSC_CONDITION} condition not found in DSC status")

    # 2. Operator CSVs
    for csv_prefix, namespace in LLMD_REQUIRED_OPERATORS.items():
        _verify_operator_csv(admin_client=admin_client, csv_prefix=csv_prefix, namespace=namespace)

    # 3. Controller deployments (dependency operators, then the KServe/LLMD stack)
    for name, namespace in LLMD_REQUIRED_DEPLOYMENTS.items():
        deployment = Deployment(client=admin_client, name=name, namespace=namespace)
        if not deployment.exists:
            pytest.xfail(f"LLMD dependency deployment {name} not found in {namespace}")
        _xfail_unless_available(
            status=deployment.instance.status,
            not_available_msg=f"Deployment {name} in {namespace} is not Available",
            missing_condition_msg=f"Deployment {name} in {namespace} has no Available condition",
        )

    applications_namespace = py_config["applications_namespace"]
    for name in LLMD_KSERVE_CONTROLLER_DEPLOYMENTS:
        deployment = Deployment(client=admin_client, name=name, namespace=applications_namespace)
        if not deployment.exists:
            pytest.xfail(f"KServe/LLMD controller deployment {name} not found in {applications_namespace}")
        _xfail_unless_available(
            status=deployment.instance.status,
            not_available_msg=f"Deployment {name} in {applications_namespace} is not Available",
            missing_condition_msg=f"Deployment {name} in {applications_namespace} has no Available condition",
        )

    # 4. LeaderWorkerSetOperator CR (optional)
    lws_operator = LeaderWorkerSetOperator(client=admin_client, name="cluster")
    if lws_operator.exists:
        _xfail_unless_available(
            status=lws_operator.instance.status,
            not_available_msg="LeaderWorkerSetOperator is not Available",
            missing_condition_msg="LeaderWorkerSetOperator has no Available condition",
        )
    else:
        LOGGER.warning("LeaderWorkerSetOperator cluster CR not found; LWS is optional for LLMD (RHOAIENG-52057)")

    # 5. Kuadrant CR
    kuadrant = Kuadrant(client=admin_client, name="kuadrant", namespace="kuadrant-system")
    if not kuadrant.exists:
        pytest.xfail("Kuadrant 'kuadrant' CR not found")

    LOGGER.info("LLMD component health check passed")


@pytest.fixture(scope="session", autouse=True)
def llmd_health_check(
    request: pytest.FixtureRequest,
    admin_client: DynamicClient,
    dsc_resource: Resource,
) -> None:
    """Session-scoped health gate for all LLMD tests.

    Marks LLMD tests as xfail when required infrastructure dependencies are unhealthy
    (see verify_llmd_health). Use --skip-llmd-health-check to disable.
    """
    session = request.session

    # Explicit opt-out via the CLI flag.
    if session.config.getoption("--skip-llmd-health-check"):
        LOGGER.warning("Skipping LLMD health check, got --skip-llmd-health-check")
        return

    # Component-health tests themselves must not be gated by this fixture.
    for item in session.items:
        if item.get_closest_marker("component_health") is not None:
            LOGGER.info("Skipping LLMD health gate because selected tests include component_health marker")
            return

    verify_llmd_health(admin_client=admin_client, dsc_resource=dsc_resource)


AuthEntry = namedtuple(typename="AuthEntry", field_names=["service", "token"])


Expand Down
22 changes: 22 additions & 0 deletions utilities/resources/kuadrant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""Kuadrant custom resource for Kuadrant API management."""

from typing import Any

from ocp_resources.resource import NamespacedResource

from utilities.constants import ApiGroups


class Kuadrant(NamespacedResource):
    """Kuadrant is the Schema for the kuadrants API.

    Namespaced kuadrant.io custom resource; all construction arguments are
    handled by NamespacedResource, so no __init__ override is needed.
    """

    api_group: str = ApiGroups.KUADRANT_IO

    def to_dict(self) -> None:
        """Build ``self.res``; default to an empty spec when no kind_dict or yaml_file is given."""
        super().to_dict()

        if not self.kind_dict and not self.yaml_file:
            self.res["spec"] = {}
20 changes: 20 additions & 0 deletions utilities/resources/leader_worker_set_operator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""LeaderWorkerSetOperator custom resource for OpenShift LWS operator."""

from typing import Any

from ocp_resources.resource import Resource


class LeaderWorkerSetOperator(Resource):
    """LeaderWorkerSetOperator is the Schema for the leaderworkersetoperators API.

    Cluster-scoped operator.openshift.io custom resource for the OpenShift
    LeaderWorkerSet (LWS) operator; all construction arguments are handled by
    Resource, so no __init__ override is needed.
    """

    api_group: str = "operator.openshift.io"

    def to_dict(self) -> None:
        """Build ``self.res``; default to an empty spec when no kind_dict or yaml_file is given."""
        super().to_dict()

        if not self.kind_dict and not self.yaml_file:
            self.res["spec"] = {}
Loading