Skip to content

Commit 9c9953f

Browse files
rnetser authored and dbasunag committed
Add Cluster sanity checks before test execution (opendatahub-io#235)
* Create size-labeler.yml * Delete .github/workflows/size-labeler.yml * model mesh - add auth tests * xx * feat: cluster sanity * feat: cluster sanity * feat: cluster sanity * feat: cluster sanity add readme * fix: tix str typo * fix: address comments * fix: address review comments * fix: address comment * fix: use dsci from global config * fix: remove duplicate fixture
1 parent 13803ce commit 9c9953f

File tree

8 files changed

+89
-17
lines changed

8 files changed

+89
-17
lines changed

conftest.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,18 @@ def pytest_addoption(parser: Parser) -> None:
137137
default=False,
138138
)
139139

140+
# Cluster sanity options
141+
cluster_sanity_group.addoption(
142+
"--cluster-sanity-skip-check",
143+
help="Skip cluster_sanity check",
144+
action="store_true",
145+
)
146+
cluster_sanity_group.addoption(
147+
"--cluster-sanity-skip-rhoai-check",
148+
help="Skip RHOAI/ODH-related resources (DSCI and DSC) checks",
149+
action="store_true",
150+
)
151+
140152

141153
def pytest_cmdline_main(config: Any) -> None:
142154
config.option.basetemp = py_config["tmp_base_dir"] = f"{config.option.basetemp}-{shortuuid.uuid()}"

docs/GETTING_STARTED.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,11 @@ uv run pytest -k test_name
6262
By default, the RHOAI distribution is set.
6363
To run on ODH, pass `--tc=distribution:upstream` to pytest.
6464

65+
### Skip cluster sanity checks
66+
By default, cluster sanity checks run before test execution to verify that the cluster is ready for tests.
67+
To skip all cluster sanity checks, pass `--cluster-sanity-skip-check` to pytest.
68+
To skip only the RHOAI/ODH-related checks (DSCI and DSC), for example when running on upstream, pass `--cluster-sanity-skip-rhoai-check`.
69+
6570
### jira integration
6671
The [pytest_jira](https://github.com/rhevm-qe-automation/pytest_jira) plugin is used to skip tests that have open bugs.
6772
To run tests with jira integration, you need to set `PYTEST_JIRA_URL` and `PYTEST_JIRA_TOKEN` environment variables.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ dependencies = [
6666
"openshift-python-wrapper>=11.0.38",
6767
"semver>=3.0.4",
6868
"sqlalchemy>=2.0.40",
69+
"pytest-order>=1.3.0",
6970
]
7071

7172
[project.urls]

tests/conftest.py

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,21 @@
11
import base64
22
import os
33
import shutil
4-
from typing import Any, Generator
4+
from typing import Any, Callable, Generator
55

66
import pytest
77
import shortuuid
88
import yaml
99
from _pytest.tmpdir import TempPathFactory
1010
from ocp_resources.config_map import ConfigMap
11+
from ocp_resources.dsc_initialization import DSCInitialization
12+
from ocp_resources.node import Node
1113
from ocp_resources.pod import Pod
1214
from ocp_resources.secret import Secret
1315
from ocp_resources.service import Service
1416
from pyhelper_utils.shell import run_command
1517
from pytest import FixtureRequest, Config
1618
from kubernetes.dynamic import DynamicClient
17-
from kubernetes.dynamic.exceptions import ResourceNotFoundError
1819
from ocp_resources.data_science_cluster import DataScienceCluster
1920
from ocp_resources.namespace import Namespace
2021
from ocp_resources.resource import get_client
@@ -25,6 +26,7 @@
2526
from utilities.exceptions import ClusterLoginError
2627
from utilities.general import get_s3_secret_dict
2728
from utilities.infra import (
29+
verify_cluster_sanity,
2830
create_ns,
2931
get_dsci_applications_namespace,
3032
get_operator_distribution,
@@ -277,12 +279,14 @@ def unprivileged_client(
277279
raise ClusterLoginError(user=non_admin_user_name)
278280

279281

282+
@pytest.fixture(scope="session")
283+
def dsci_resource(admin_client: DynamicClient) -> DSCInitialization:
284+
return DSCInitialization(client=admin_client, name=py_config["dsci_name"], ensure_exists=True)
285+
286+
280287
@pytest.fixture(scope="session")
281288
def dsc_resource(admin_client: DynamicClient) -> DataScienceCluster:
282-
name = py_config["dsc_name"]
283-
for dsc in DataScienceCluster.get(dyn_client=admin_client, name=name):
284-
return dsc
285-
raise ResourceNotFoundError(f"DSC resource {name} not found")
289+
return DataScienceCluster(client=admin_client, name=py_config["dsc_name"], ensure_exists=True)
286290

287291

288292
@pytest.fixture(scope="module")
@@ -444,3 +448,33 @@ def minio_data_connection(
444448
},
445449
) as minio_secret:
446450
yield minio_secret
451+
452+
453+
@pytest.fixture(scope="session")
454+
def nodes(admin_client: DynamicClient) -> Generator[list[Node], Any, Any]:
455+
yield list(Node.get(dyn_client=admin_client))
456+
457+
458+
@pytest.fixture(scope="session")
459+
def junitxml_plugin(
460+
request: FixtureRequest, record_testsuite_property: Callable[[str, object], None]
461+
) -> Callable[[str, object], None] | None:
462+
return record_testsuite_property if request.config.pluginmanager.has_plugin("junitxml") else None
463+
464+
465+
@pytest.fixture(scope="session", autouse=True)
466+
@pytest.mark.early(order=0)
467+
def cluster_sanity_scope_session(
468+
request: FixtureRequest,
469+
nodes: list[Node],
470+
dsci_resource: DSCInitialization,
471+
dsc_resource: DataScienceCluster,
472+
junitxml_plugin: Callable[[str, object], None],
473+
) -> None:
474+
verify_cluster_sanity(
475+
request=request,
476+
nodes=nodes,
477+
dsc_resource=dsc_resource,
478+
dsci_resource=dsci_resource,
479+
junitxml_property=junitxml_plugin,
480+
)

tests/model_serving/model_server/multi_node/conftest.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,6 @@
2828
from utilities.serving_runtime import ServingRuntimeFromTemplate
2929

3030

31-
@pytest.fixture(scope="session")
32-
def nodes(admin_client: DynamicClient) -> list[Node]:
33-
return list(Node.get(dyn_client=admin_client))
34-
35-
3631
@pytest.fixture(scope="session")
3732
def nvidia_gpu_nodes(nodes: list[Node]) -> list[Node]:
3833
return [node for node in nodes if "nvidia.com/gpu.present" in node.labels.keys()]

utilities/exceptions.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations
2+
13
from typing import Optional
24

35
from ocp_resources.service import Service
@@ -112,3 +114,6 @@ class TooManyPodsError(Exception):
112114

113115
class UnexpectedFailureError(Exception):
114116
pass
117+
118+
class ResourceNotReadyError(Exception):
119+
pass

utilities/infra.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@
66
import tempfile
77
from contextlib import contextmanager
88
from functools import cache
9-
from typing import Any, Generator, Optional, Set, Callable
9+
from typing import Any, Generator, Optional, Set, Callable
1010
from json import JSONDecodeError
1111

1212
import kubernetes
13+
import pytest
1314
from _pytest.fixtures import FixtureRequest
1415
from kubernetes.dynamic import DynamicClient
1516
from kubernetes.dynamic.exceptions import ResourceNotFoundError, ResourceNotUniqueError
@@ -34,6 +35,7 @@
3435
from ocp_resources.service import Service
3536
from ocp_resources.service_account import ServiceAccount
3637
from ocp_resources.serving_runtime import ServingRuntime
38+
3739
from pyhelper_utils.shell import run_command
3840
from pytest_testconfig import config as py_config
3941
from packaging.version import parse, Version
@@ -727,13 +729,12 @@ def get_product_version(admin_client: DynamicClient) -> Version:
727729
return Version.parse(operator_version)
728730

729731

730-
def get_dsci_applications_namespace(client: DynamicClient, dsci_name: str = "default-dsci") -> str:
732+
def get_dsci_applications_namespace(client: DynamicClient) -> str:
731733
"""
732734
Get the namespace where DSCI applications are deployed.
733735
734736
Args:
735737
client (DynamicClient): DynamicClient object
736-
dsci_name (str): DSCI name
737738
738739
Returns:
739740
str: Namespace where DSCI applications are deployed.
@@ -743,6 +744,7 @@ def get_dsci_applications_namespace(client: DynamicClient, dsci_name: str = "def
743744
MissingResourceError: If DSCI not found
744745
745746
"""
747+
dsci_name = py_config["dsci_name"]
746748
dsci = DSCInitialization(client=client, name=dsci_name)
747749

748750
if dsci.exists:
@@ -806,7 +808,11 @@ def wait_for_serverless_pods_deletion(resource: Project | Namespace, admin_clien
806808
pod.wait_deleted(timeout=Timeout.TIMEOUT_1MIN)
807809

808810

809-
@retry(wait_timeout=Timeout.TIMEOUT_30SEC, sleep=1, exceptions_dict={ResourceNotFoundError: []})
811+
@retry(
812+
wait_timeout=Timeout.TIMEOUT_30SEC,
813+
sleep=1,
814+
exceptions_dict={ResourceNotFoundError: []},
815+
)
810816
def wait_for_isvc_pods(client: DynamicClient, isvc: InferenceService, runtime_name: str | None = None) -> list[Pod]:
811817
"""
812818
Wait for ISVC pods.
@@ -908,6 +914,7 @@ def verify_cluster_sanity(
908914
wait_for_dsci_status_ready(dsci_resource=dsci_resource)
909915
wait_for_dsc_status_ready(dsc_resource=dsc_resource)
910916

917+
911918
except (ResourceNotReadyError, NodeUnschedulableError, NodeNotReadyError) as ex:
912919
error_msg = f"Cluster sanity check failed: {str(ex)}"
913920
# return_code set to 99 to not collide with https://docs.pytest.org/en/stable/reference/exit-codes.html

uv.lock

Lines changed: 15 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)