Skip to content

Commit 7a7c780

Browse files
committed
Add LLMD infrastructure health check as autouse session fixture
Adds an automatic pre-test health gate for all LLMD tests. It checks the DSC LLMD dependency condition, the cert-manager, Authorino, and RHCL operators, the required deployments, the LeaderWorkerSetOperator CR, the GatewayClass, and the Kuadrant CR. Tests are skipped with a descriptive reason if any check fails. Includes a --skip-llmd-health-check CLI option and wrapper resource classes for LeaderWorkerSetOperator and Kuadrant.

Made-with: Cursor
1 parent 0cef219 commit 7a7c780

File tree

4 files changed

+180
-0
lines changed

4 files changed

+180
-0
lines changed

conftest.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,14 @@ def pytest_addoption(parser: Parser) -> None:
204204
action="store_true",
205205
)
206206

207+
# LLMD health check options
208+
llmd_health_group = parser.getgroup(name="LLMD Health")
209+
llmd_health_group.addoption(
210+
"--skip-llmd-health-check",
211+
help="Skip LLMD infrastructure dependency health check",
212+
action="store_true",
213+
)
214+
207215
# HuggingFace options
208216
hf_group.addoption("--hf-access-token", default=os.environ.get("HF_ACCESS_TOKEN"), help="HF access token")
209217
# Model Registry options

tests/model_serving/model_server/llmd/conftest.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,14 @@
88
import yaml
99
from _pytest.fixtures import FixtureRequest
1010
from kubernetes.dynamic import DynamicClient
11+
from ocp_resources.cluster_service_version import ClusterServiceVersion
1112
from ocp_resources.config_map import ConfigMap
13+
from ocp_resources.deployment import Deployment
1214
from ocp_resources.gateway import Gateway
15+
from ocp_resources.gateway_class import GatewayClass
1316
from ocp_resources.llm_inference_service import LLMInferenceService
1417
from ocp_resources.namespace import Namespace
18+
from ocp_resources.resource import Resource
1519
from ocp_resources.role import Role
1620
from ocp_resources.role_binding import RoleBinding
1721
from ocp_resources.service_account import ServiceAccount
@@ -24,10 +28,136 @@
2428
from utilities.llmd_constants import LLMDGateway
2529
from utilities.llmd_utils import create_llmd_gateway
2630
from utilities.logger import RedactedString
31+
from utilities.resources.kuadrant import Kuadrant
32+
from utilities.resources.leader_worker_set_operator import LeaderWorkerSetOperator
2733

2834
LOGGER = get_logger(name=__name__)
2935
logging.getLogger("timeout_sampler").setLevel(logging.WARNING)
3036

37+
# Type of the DSC status condition that verify_llmd_health requires to be "True".
LLMD_DSC_CONDITION: str = "KserveLLMInferenceServiceDependencies"

# Operator CSV name prefix -> namespace in which a Succeeded CSV with that prefix is looked up.
LLMD_REQUIRED_OPERATORS: dict[str, str] = {
    "cert-manager-operator": "cert-manager-operator",
    "authorino-operator": "openshift-operators",
    "rhcl-operator": "openshift-operators",
}

# Deployment name -> namespace; each deployment must exist and report an "Available" condition.
LLMD_REQUIRED_DEPLOYMENTS: dict[str, str] = {
    "cert-manager-operator-controller-manager": "cert-manager-operator",
    "cert-manager": "cert-manager",
    "cert-manager-webhook": "cert-manager",
    "authorino-operator": "openshift-operators",
    "kuadrant-operator-controller-manager": "openshift-operators",
    "lws-controller-manager": "openshift-lws-operator",
}
53+
54+
55+
def _verify_operator_csv(admin_client: DynamicClient, csv_prefix: str, namespace: str) -> None:
    """Skip unless an operator CSV with the given name prefix has succeeded.

    Lists the ClusterServiceVersions in ``namespace`` and looks for one whose
    name starts with ``csv_prefix`` and whose status equals ``Status.SUCCEEDED``.
    When no such CSV is found, ``pytest.skip`` is called with a reason naming
    the prefix and the namespace.

    Args:
        admin_client: Client used to list ClusterServiceVersions.
        csv_prefix: Leading part of the CSV name to match.
        namespace: Namespace in which the CSVs are listed.
    """
    installed_csvs = ClusterServiceVersion.get(client=admin_client, namespace=namespace)
    operator_ready = any(
        candidate.name.startswith(csv_prefix) and candidate.status == candidate.Status.SUCCEEDED
        for candidate in installed_csvs
    )
    if not operator_ready:
        pytest.skip(f"Operator CSV {csv_prefix} not found or not Succeeded in {namespace}")
60+
61+
62+
def _skip_unless_condition_true(resource: Resource, condition_type: str, description: str) -> None:
    """Skip unless ``resource`` reports ``condition_type`` with status "True".

    Args:
        resource: Resource whose ``status.conditions`` list is inspected.
        condition_type: Condition type to look for (e.g. "Available").
        description: Label for the resource, used as the start of the skip reason.
    """
    status = resource.instance.status
    # A resource without status (or without conditions) is handled like one
    # that lacks the wanted condition, instead of failing on the lookup.
    conditions = status.get("conditions") if status is not None else None

    for condition in conditions or []:
        if condition.type != condition_type:
            continue
        if condition.status != "True":
            pytest.skip(f"{description} is not {condition_type}: {condition.get('reason')}")
        return

    pytest.skip(f"{description} has no {condition_type} condition")


def verify_llmd_health(admin_client: DynamicClient, dsc_resource: Resource) -> None:
    """Verify LLMD infrastructure dependencies are healthy.

    Checks, in order: the DSC condition named by ``LLMD_DSC_CONDITION``, the
    operator CSVs in ``LLMD_REQUIRED_OPERATORS``, the deployments in
    ``LLMD_REQUIRED_DEPLOYMENTS``, the LeaderWorkerSetOperator ``cluster`` CR,
    the ``openshift-default`` GatewayClass, and the ``kuadrant`` Kuadrant CR.
    The first failing check ends the verification through ``pytest.skip`` with
    a reason describing that check.

    Args:
        admin_client: Client used to query the cluster resources.
        dsc_resource: DSC resource whose status conditions are inspected.
    """
    # 1. DSC condition for LLMD dependencies
    # Missing status/conditions falls through to the "not found" skip below.
    dsc_conditions = getattr(dsc_resource.instance.status, "conditions", None) or []
    for condition in dsc_conditions:
        if condition.type == LLMD_DSC_CONDITION:
            if condition.status != "True":
                pytest.skip(f"{LLMD_DSC_CONDITION} is not ready: {condition.status}, reason: {condition.get('reason')}")
            break
    else:
        pytest.skip(f"{LLMD_DSC_CONDITION} condition not found in DSC status")

    # 2. Operator CSVs
    for csv_prefix, namespace in LLMD_REQUIRED_OPERATORS.items():
        _verify_operator_csv(admin_client=admin_client, csv_prefix=csv_prefix, namespace=namespace)

    # 3. Controller deployments
    for name, namespace in LLMD_REQUIRED_DEPLOYMENTS.items():
        deployment = Deployment(client=admin_client, name=name, namespace=namespace)
        if not deployment.exists:
            pytest.skip(f"LLMD dependency deployment {name} not found in {namespace}")
        _skip_unless_condition_true(
            resource=deployment,
            condition_type="Available",
            description=f"Deployment {name} in {namespace}",
        )

    # 4. LeaderWorkerSetOperator CR
    lws_operator = LeaderWorkerSetOperator(client=admin_client, name="cluster")
    if not lws_operator.exists:
        pytest.skip("LeaderWorkerSetOperator 'cluster' CR not found")
    _skip_unless_condition_true(
        resource=lws_operator,
        condition_type="Available",
        description="LeaderWorkerSetOperator",
    )

    # 5. GatewayClass
    gateway_class = GatewayClass(client=admin_client, name="openshift-default")
    if not gateway_class.exists:
        pytest.skip("GatewayClass 'openshift-default' not found")
    _skip_unless_condition_true(
        resource=gateway_class,
        condition_type="Accepted",
        description="GatewayClass 'openshift-default'",
    )

    # 6. Kuadrant CR (only its existence is checked)
    kuadrant = Kuadrant(client=admin_client, name="kuadrant", namespace="kuadrant-system")
    if not kuadrant.exists:
        pytest.skip("Kuadrant 'kuadrant' CR not found")

    LOGGER.info("LLMD component health check passed")
136+
137+
138+
@pytest.fixture(scope="session", autouse=True)
def llmd_health_check(
    request: pytest.FixtureRequest,
    admin_client: DynamicClient,
    dsc_resource: Resource,
) -> None:
    """Gate the LLMD tests on infrastructure health, once per session.

    The gate is bypassed when ``--skip-llmd-health-check`` is given, or when
    any item in the session carries the ``component_health`` marker.
    Otherwise ``verify_llmd_health`` runs and skips when a dependency is not
    healthy.
    """
    if request.session.config.getoption("--skip-llmd-health-check"):
        LOGGER.warning("Skipping LLMD health check, got --skip-llmd-health-check")
        return

    health_suite_selected = any(
        marker.name == "component_health"
        for test_item in request.session.items
        for marker in test_item.iter_markers()
    )
    if not health_suite_selected:
        verify_llmd_health(admin_client=admin_client, dsc_resource=dsc_resource)
        return

    LOGGER.info("Skipping LLMD health gate because selected tests include component_health marker")
159+
160+
31161
AuthEntry = namedtuple(typename="AuthEntry", field_names=["service", "token"])
32162

33163

utilities/resources/kuadrant.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
"""Kuadrant custom resource for Kuadrant API management."""
2+
3+
from typing import Any
4+
5+
from ocp_resources.resource import NamespacedResource
6+
7+
from utilities.constants import ApiGroups
8+
9+
10+
class Kuadrant(NamespacedResource):
    """Namespaced wrapper for the Kuadrant custom resource (kuadrants API)."""

    # API group under which the Kuadrant kind is looked up.
    api_group: str = ApiGroups.KUADRANT_IO

    def __init__(self, **kwargs: Any) -> None:
        """Pass every keyword argument through to ``NamespacedResource``."""
        super().__init__(**kwargs)

    def to_dict(self) -> None:
        """Build the resource body, adding an empty ``spec`` when none is supplied.

        The empty ``spec`` is set only when neither ``kind_dict`` nor
        ``yaml_file`` was provided.
        """
        super().to_dict()

        if self.kind_dict or self.yaml_file:
            return

        self.res["spec"] = {}
utilities/resources/leader_worker_set_operator.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"""LeaderWorkerSetOperator custom resource for OpenShift LWS operator."""
2+
3+
from typing import Any
4+
5+
from ocp_resources.resource import Resource
6+
7+
8+
class LeaderWorkerSetOperator(Resource):
    """Wrapper for the LeaderWorkerSetOperator custom resource (leaderworkersetoperators API)."""

    # API group under which the LeaderWorkerSetOperator kind is looked up.
    api_group: str = "operator.openshift.io"

    def __init__(self, **kwargs: Any) -> None:
        """Pass every keyword argument through to ``Resource``."""
        super().__init__(**kwargs)

    def to_dict(self) -> None:
        """Build the resource body, adding an empty ``spec`` when none is supplied.

        The empty ``spec`` is set only when neither ``kind_dict`` nor
        ``yaml_file`` was provided.
        """
        super().to_dict()

        if self.kind_dict or self.yaml_file:
            return

        self.res["spec"] = {}

0 commit comments

Comments
 (0)