|
8 | 8 | import yaml |
9 | 9 | from _pytest.fixtures import FixtureRequest |
10 | 10 | from kubernetes.dynamic import DynamicClient |
| 11 | +from ocp_resources.cluster_service_version import ClusterServiceVersion |
11 | 12 | from ocp_resources.config_map import ConfigMap |
| 13 | +from ocp_resources.deployment import Deployment |
12 | 14 | from ocp_resources.gateway import Gateway |
| 15 | +from ocp_resources.gateway_class import GatewayClass |
13 | 16 | from ocp_resources.llm_inference_service import LLMInferenceService |
14 | 17 | from ocp_resources.namespace import Namespace |
| 18 | +from ocp_resources.resource import Resource |
15 | 19 | from ocp_resources.role import Role |
16 | 20 | from ocp_resources.role_binding import RoleBinding |
17 | 21 | from ocp_resources.service_account import ServiceAccount |
|
24 | 28 | from utilities.llmd_constants import LLMDGateway |
25 | 29 | from utilities.llmd_utils import create_llmd_gateway |
26 | 30 | from utilities.logger import RedactedString |
| 31 | +from utilities.resources.kuadrant import Kuadrant |
| 32 | +from utilities.resources.leader_worker_set_operator import LeaderWorkerSetOperator |
27 | 33 |
|
# Module-level logger used by the fixtures and health checks in this file.
LOGGER = get_logger(name=__name__)
# Raise the "timeout_sampler" logger threshold to WARNING so that its
# lower-severity records are not emitted.
logging.getLogger(name="timeout_sampler").setLevel(level=logging.WARNING)
30 | 36 |
|
# Type of the DSC status condition that verify_llmd_health() looks up.
LLMD_DSC_CONDITION: str = "KserveLLMInferenceServiceDependencies"

# Operator CSV name prefix -> namespace in which a Succeeded CSV is expected.
LLMD_REQUIRED_OPERATORS: dict[str, str] = {
    "cert-manager-operator": "cert-manager-operator",
    "authorino-operator": "openshift-operators",
    "rhcl-operator": "openshift-operators",
}

# Deployment name -> namespace; each deployment must exist and be Available.
LLMD_REQUIRED_DEPLOYMENTS: dict[str, str] = {
    "cert-manager-operator-controller-manager": "cert-manager-operator",
    "cert-manager": "cert-manager",
    "cert-manager-webhook": "cert-manager",
    "authorino-operator": "openshift-operators",
    "kuadrant-operator-controller-manager": "openshift-operators",
    "lws-controller-manager": "openshift-lws-operator",
}
| 53 | + |
| 54 | + |
def _verify_operator_csv(admin_client: DynamicClient, csv_prefix: str, namespace: str) -> None:
    """Skip the current test unless a matching operator CSV has succeeded.

    Lists the ClusterServiceVersions in ``namespace`` and looks for one whose
    name starts with ``csv_prefix`` and whose status equals
    ``Status.SUCCEEDED``. When none matches, ``pytest.skip`` is called.

    Args:
        admin_client: Client used to list the ClusterServiceVersions.
        csv_prefix: Prefix the CSV name must start with.
        namespace: Namespace to search.
    """
    csv_succeeded = any(
        candidate.name.startswith(csv_prefix) and candidate.status == candidate.Status.SUCCEEDED
        for candidate in ClusterServiceVersion.get(client=admin_client, namespace=namespace)
    )
    if not csv_succeeded:
        pytest.skip(f"Operator CSV {csv_prefix} not found or not Succeeded in {namespace}")
| 60 | + |
| 61 | + |
def _get_status_conditions(resource: Resource) -> list:
    """Return ``status.conditions`` of ``resource``, or ``[]`` when absent.

    A missing ``status`` or a missing/empty ``conditions`` field both yield an
    empty list, so callers can iterate the result unconditionally.
    """
    status = resource.instance.status
    if status is None:
        return []
    return status.get("conditions") or []


def _skip_unless_condition_true(resource: Resource, condition_type: str, description: str) -> None:
    """Skip the current test unless ``resource`` reports the condition as "True".

    Only the first condition whose type equals ``condition_type`` is examined.

    Args:
        resource: Resource whose status conditions are inspected.
        condition_type: Condition type to look for (e.g. "Available").
        description: Human-readable subject used at the start of skip messages.
    """
    for condition in _get_status_conditions(resource=resource):
        if condition.type == condition_type:
            if condition.status != "True":
                pytest.skip(f"{description} is not {condition_type}: {condition.get('reason')}")
            return

    pytest.skip(f"{description} has no {condition_type} condition")


def verify_llmd_health(admin_client: DynamicClient, dsc_resource: Resource) -> None:
    """Verify LLMD infrastructure dependencies are healthy.

    Checks, in order: the DSC condition named by ``LLMD_DSC_CONDITION``, the
    required operator CSVs, the controller deployments, the
    LeaderWorkerSetOperator CR, the GatewayClass, and the Kuadrant CR.
    The first failing check calls ``pytest.skip`` with a message naming the
    unhealthy dependency; if every check passes, an info message is logged.

    Args:
        admin_client: Client used to query cluster resources.
        dsc_resource: DSC resource whose status conditions are inspected.
    """
    # 1. DSC condition for LLMD dependencies.
    # Conditions are read through the helper so that a DSC without
    # status.conditions leads to a skip rather than a TypeError.
    for condition in _get_status_conditions(resource=dsc_resource):
        if condition.type == LLMD_DSC_CONDITION:
            if condition.status != "True":
                pytest.skip(f"{LLMD_DSC_CONDITION} is not ready: {condition.status}, reason: {condition.get('reason')}")
            break
    else:
        pytest.skip(f"{LLMD_DSC_CONDITION} condition not found in DSC status")

    # 2. Operator CSVs
    for csv_prefix, namespace in LLMD_REQUIRED_OPERATORS.items():
        _verify_operator_csv(admin_client=admin_client, csv_prefix=csv_prefix, namespace=namespace)

    # 3. Controller deployments
    for name, namespace in LLMD_REQUIRED_DEPLOYMENTS.items():
        deployment = Deployment(client=admin_client, name=name, namespace=namespace)
        if not deployment.exists:
            pytest.skip(f"LLMD dependency deployment {name} not found in {namespace}")

        _skip_unless_condition_true(
            resource=deployment,
            condition_type="Available",
            description=f"Deployment {name} in {namespace}",
        )

    # 4. LeaderWorkerSetOperator CR
    lws_operator = LeaderWorkerSetOperator(client=admin_client, name="cluster")
    if not lws_operator.exists:
        pytest.skip("LeaderWorkerSetOperator 'cluster' CR not found")

    _skip_unless_condition_true(
        resource=lws_operator,
        condition_type="Available",
        description="LeaderWorkerSetOperator",
    )

    # 5. GatewayClass
    gateway_class = GatewayClass(client=admin_client, name="openshift-default")
    if not gateway_class.exists:
        pytest.skip("GatewayClass 'openshift-default' not found")

    _skip_unless_condition_true(
        resource=gateway_class,
        condition_type="Accepted",
        description="GatewayClass 'openshift-default'",
    )

    # 6. Kuadrant CR (existence only; its status conditions are not inspected)
    kuadrant = Kuadrant(client=admin_client, name="kuadrant", namespace="kuadrant-system")
    if not kuadrant.exists:
        pytest.skip("Kuadrant 'kuadrant' CR not found")

    LOGGER.info("LLMD component health check passed")
| 136 | + |
| 137 | + |
@pytest.fixture(scope="session", autouse=True)
def llmd_health_check(
    request: pytest.FixtureRequest,
    admin_client: DynamicClient,
    dsc_resource: Resource,
) -> None:
    """Session-scoped, autouse health gate for all LLMD tests.

    Runs ``verify_llmd_health`` so that tests under
    tests/model_serving/model_server/llmd/ are skipped when the LLMD
    infrastructure dependencies are not healthy. The gate is bypassed when
    ``--skip-llmd-health-check`` is set, or when any test in the session
    carries the ``component_health`` marker.
    """
    if request.session.config.getoption("--skip-llmd-health-check"):
        LOGGER.warning("Skipping LLMD health check, got --skip-llmd-health-check")
    elif any(
        marker.name == "component_health"
        for test_item in request.session.items
        for marker in test_item.iter_markers()
    ):
        LOGGER.info("Skipping LLMD health gate because selected tests include component_health marker")
    else:
        verify_llmd_health(admin_client=admin_client, dsc_resource=dsc_resource)
| 159 | + |
| 160 | + |
# Lightweight record with two fields: ``service`` and ``token``.
AuthEntry = namedtuple(typename="AuthEntry", field_names=("service", "token"))
32 | 162 |
|
33 | 163 |
|
|
0 commit comments