Skip to content

Commit d996140

Browse files
rnetserdbasunag
authored and committed
Add Cluster sanity checks before test execution (opendatahub-io#235)
* Create size-labeler.yml * Delete .github/workflows/size-labeler.yml * model mesh - add auth tests * xx * feat: cluster sanity * feat: cluster sanity * feat: cluster sanity * feat: cluster sanity add readme * fix: tix str typo * fix: address comments * fix: address review comments * fix: address comment * fix: use dsci from global config * fix: remove duplicate fixture
1 parent ac485b2 commit d996140

File tree

8 files changed

+91
-17
lines changed

8 files changed

+91
-17
lines changed

conftest.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,18 @@ def pytest_addoption(parser: Parser) -> None:
136136
action="store_false",
137137
)
138138

139+
# Cluster sanity options
140+
cluster_sanity_group.addoption(
141+
"--cluster-sanity-skip-check",
142+
help="Skip cluster_sanity check",
143+
action="store_true",
144+
)
145+
cluster_sanity_group.addoption(
146+
"--cluster-sanity-skip-rhoai-check",
147+
help="Skip RHOAI/ODH-related resources (DSCI and DSC) checks",
148+
action="store_true",
149+
)
150+
139151

140152
def pytest_cmdline_main(config: Any) -> None:
141153
config.option.basetemp = py_config["tmp_base_dir"] = f"{config.option.basetemp}-{shortuuid.uuid()}"

docs/GETTING_STARTED.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,11 @@ uv run pytest -k test_name
6262
By default, RHOAI distribution is set.
6363
To run on ODH, pass `--tc=distribution:upstream` to pytest.
6464

65+
### Skip cluster sanity checks
66+
By default, cluster sanity checks are run to make sure the cluster is ready for tests.
67+
To skip all cluster sanity checks, pass `--cluster-sanity-skip-check`.
68+
To skip only the RHOAI/ODH-related checks (DSCI and DSC; for example when running in upstream), pass `--cluster-sanity-skip-rhoai-check`.
69+
6570
### jira integration
6671
To skip running tests which have open bugs, [pytest_jira](https://github.com/rhevm-qe-automation/pytest_jira) plugin is used.
6772
To run tests with jira integration, you need to set `PYTEST_JIRA_URL` and `PYTEST_JIRA_TOKEN` environment variables.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ dependencies = [
6666
"openshift-python-wrapper>=11.0.38",
6767
"semver>=3.0.4",
6868
"sqlalchemy>=2.0.40",
69+
"pytest-order>=1.3.0",
6970
]
7071

7172
[project.urls]

tests/conftest.py

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,21 @@
11
import base64
22
import os
33
import shutil
4-
from typing import Any, Generator
4+
from typing import Any, Callable, Generator
55

66
import pytest
77
import shortuuid
88
import yaml
99
from _pytest.tmpdir import TempPathFactory
1010
from ocp_resources.config_map import ConfigMap
11+
from ocp_resources.dsc_initialization import DSCInitialization
12+
from ocp_resources.node import Node
1113
from ocp_resources.pod import Pod
1214
from ocp_resources.secret import Secret
1315
from ocp_resources.service import Service
1416
from pyhelper_utils.shell import run_command
1517
from pytest import FixtureRequest, Config
1618
from kubernetes.dynamic import DynamicClient
17-
from kubernetes.dynamic.exceptions import ResourceNotFoundError
1819
from ocp_resources.data_science_cluster import DataScienceCluster
1920
from ocp_resources.namespace import Namespace
2021
from ocp_resources.resource import get_client
@@ -25,6 +26,7 @@
2526
from utilities.exceptions import ClusterLoginError
2627
from utilities.general import get_s3_secret_dict
2728
from utilities.infra import (
29+
verify_cluster_sanity,
2830
create_ns,
2931
get_dsci_applications_namespace,
3032
get_operator_distribution,
@@ -277,12 +279,14 @@ def unprivileged_client(
277279
raise ClusterLoginError(user=non_admin_user_name)
278280

279281

282+
@pytest.fixture(scope="session")
283+
def dsci_resource(admin_client: DynamicClient) -> DSCInitialization:
284+
return DSCInitialization(client=admin_client, name=py_config["dsci_name"], ensure_exists=True)
285+
286+
280287
@pytest.fixture(scope="session")
281288
def dsc_resource(admin_client: DynamicClient) -> DataScienceCluster:
282-
name = py_config["dsc_name"]
283-
for dsc in DataScienceCluster.get(dyn_client=admin_client, name=name):
284-
return dsc
285-
raise ResourceNotFoundError(f"DSC resource {name} not found")
289+
return DataScienceCluster(client=admin_client, name=py_config["dsc_name"], ensure_exists=True)
286290

287291

288292
@pytest.fixture(scope="module")
@@ -444,3 +448,33 @@ def minio_data_connection(
444448
},
445449
) as minio_secret:
446450
yield minio_secret
451+
452+
453+
@pytest.fixture(scope="session")
454+
def nodes(admin_client: DynamicClient) -> Generator[list[Node], Any, Any]:
455+
yield list(Node.get(dyn_client=admin_client))
456+
457+
458+
@pytest.fixture(scope="session")
459+
def junitxml_plugin(
460+
request: FixtureRequest, record_testsuite_property: Callable[[str, object], None]
461+
) -> Callable[[str, object], None] | None:
462+
return record_testsuite_property if request.config.pluginmanager.has_plugin("junitxml") else None
463+
464+
465+
@pytest.fixture(scope="session", autouse=True)
466+
@pytest.mark.early(order=0)
467+
def cluster_sanity_scope_session(
468+
request: FixtureRequest,
469+
nodes: list[Node],
470+
dsci_resource: DSCInitialization,
471+
dsc_resource: DataScienceCluster,
472+
junitxml_plugin: Callable[[str, object], None],
473+
) -> None:
474+
verify_cluster_sanity(
475+
request=request,
476+
nodes=nodes,
477+
dsc_resource=dsc_resource,
478+
dsci_resource=dsci_resource,
479+
junitxml_property=junitxml_plugin,
480+
)

tests/model_serving/model_server/multi_node/conftest.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,6 @@
2828
from utilities.serving_runtime import ServingRuntimeFromTemplate
2929

3030

31-
@pytest.fixture(scope="session")
32-
def nodes(admin_client: DynamicClient) -> list[Node]:
33-
return list(Node.get(dyn_client=admin_client))
34-
35-
3631
@pytest.fixture(scope="session")
3732
def nvidia_gpu_nodes(nodes: list[Node]) -> list[Node]:
3833
return [node for node in nodes if "nvidia.com/gpu.present" in node.labels.keys()]

utilities/exceptions.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations
2+
13
from typing import Optional
24

35
from ocp_resources.service import Service
@@ -112,3 +114,6 @@ class TooManyPodsError(Exception):
112114

113115
class UnexpectedFailureError(Exception):
114116
pass
117+
118+
class ResourceNotReadyError(Exception):
119+
pass

utilities/infra.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,11 @@
33
import shlex
44
from contextlib import contextmanager
55
from functools import cache
6-
from typing import Any, Generator, Optional, Set, Callable
6+
7+
from typing import Any, Callable, Generator, Optional, Set
8+
79
import kubernetes
10+
import pytest
811
from _pytest.fixtures import FixtureRequest
912
from kubernetes.dynamic import DynamicClient
1013
from kubernetes.dynamic.exceptions import ResourceNotFoundError, ResourceNotUniqueError
@@ -29,6 +32,7 @@
2932
from ocp_resources.service import Service
3033
from ocp_resources.service_account import ServiceAccount
3134
from ocp_resources.serving_runtime import ServingRuntime
35+
3236
from pyhelper_utils.shell import run_command
3337
from pytest_testconfig import config as py_config
3438
from packaging.version import parse, Version
@@ -722,13 +726,12 @@ def get_product_version(admin_client: DynamicClient) -> Version:
722726
return Version.parse(operator_version)
723727

724728

725-
def get_dsci_applications_namespace(client: DynamicClient, dsci_name: str = "default-dsci") -> str:
729+
def get_dsci_applications_namespace(client: DynamicClient) -> str:
726730
"""
727731
Get the namespace where DSCI applications are deployed.
728732
729733
Args:
730734
client (DynamicClient): DynamicClient object
731-
dsci_name (str): DSCI name
732735
733736
Returns:
734737
str: Namespace where DSCI applications are deployed.
@@ -738,6 +741,7 @@ def get_dsci_applications_namespace(client: DynamicClient, dsci_name: str = "def
738741
MissingResourceError: If DSCI not found
739742
740743
"""
744+
dsci_name = py_config["dsci_name"]
741745
dsci = DSCInitialization(client=client, name=dsci_name)
742746

743747
if dsci.exists:
@@ -801,7 +805,11 @@ def wait_for_serverless_pods_deletion(resource: Project | Namespace, admin_clien
801805
pod.wait_deleted(timeout=Timeout.TIMEOUT_1MIN)
802806

803807

804-
@retry(wait_timeout=Timeout.TIMEOUT_30SEC, sleep=1, exceptions_dict={ResourceNotFoundError: []})
808+
@retry(
809+
wait_timeout=Timeout.TIMEOUT_30SEC,
810+
sleep=1,
811+
exceptions_dict={ResourceNotFoundError: []},
812+
)
805813
def wait_for_isvc_pods(client: DynamicClient, isvc: InferenceService, runtime_name: str | None = None) -> list[Pod]:
806814
"""
807815
Wait for ISVC pods.
@@ -903,6 +911,7 @@ def verify_cluster_sanity(
903911
wait_for_dsci_status_ready(dsci_resource=dsci_resource)
904912
wait_for_dsc_status_ready(dsc_resource=dsc_resource)
905913

914+
906915
except (ResourceNotReadyError, NodeUnschedulableError, NodeNotReadyError) as ex:
907916
error_msg = f"Cluster sanity check failed: {str(ex)}"
908917
# return_code set to 99 to not collide with https://docs.pytest.org/en/stable/reference/exit-codes.html

uv.lock

Lines changed: 15 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)