diff --git a/conftest.py b/conftest.py index fa87c8152..7d78be57d 100644 --- a/conftest.py +++ b/conftest.py @@ -307,6 +307,8 @@ def updated_global_config(admin_client: DynamicClient, config: Config) -> None: distribution = get_operator_distribution(client=admin_client) if distribution == "Open Data Hub": py_config["distribution"] = "upstream" + # override the operator namespace + py_config["operator_namespace"] = "opendatahub-operators" elif distribution.startswith("OpenShift AI"): py_config["distribution"] = "downstream" diff --git a/tests/cluster_health/test_cluster_health.py b/tests/cluster_health/test_cluster_health.py index 9b3e76151..148711b06 100644 --- a/tests/cluster_health/test_cluster_health.py +++ b/tests/cluster_health/test_cluster_health.py @@ -1,10 +1,15 @@ import pytest - +from kubernetes.dynamic import DynamicClient from ocp_resources.data_science_cluster import DataScienceCluster from ocp_resources.dsc_initialization import DSCInitialization from ocp_resources.node import Node from ocp_utilities.infra import assert_nodes_in_healthy_condition, assert_nodes_schedulable +from utilities.general import wait_for_pods_running from utilities.infra import wait_for_dsci_status_ready, wait_for_dsc_status_ready +from pytest_testconfig import config as py_config +from simple_logger.logger import get_logger + +LOGGER = get_logger(name=__name__) @pytest.mark.cluster_health @@ -21,3 +26,22 @@ def test_data_science_cluster_initialization_healthy(dsci_resource: DSCInitializ @pytest.mark.cluster_health def test_data_science_cluster_healthy(dsc_resource: DataScienceCluster): wait_for_dsc_status_ready(dsc_resource=dsc_resource) + + +@pytest.mark.parametrize( + "namespace_name", + [ + pytest.param( + py_config["operator_namespace"], + id="test_operator_namespace_pod_healthy", + ), + pytest.param( + py_config["applications_namespace"], + id="test_application_namespace_pod_healthy", + ), + ], +) +@pytest.mark.cluster_health +def test_pods_cluster_healthy(admin_client: DynamicClient, namespace_name: str): + LOGGER.info(f"Testing Pods in namespace {namespace_name} for cluster health") + wait_for_pods_running(admin_client=admin_client, namespace_name=namespace_name) diff --git a/tests/global_config.py b/tests/global_config.py index 0a5fef450..c4819b202 100644 --- a/tests/global_config.py +++ b/tests/global_config.py @@ -1,3 +1,5 @@ +from utilities.constants import RHOAI_OPERATOR_NAMESPACE + global config # type:ignore[unused-ignore] dsc_name: str = "default-dsc" @@ -9,6 +11,7 @@ distribution: str = "downstream" applications_namespace: str = "redhat-ods-applications" model_registry_namespace: str = "rhoai-model-registries" +operator_namespace: str = RHOAI_OPERATOR_NAMESPACE for _dir in dir(): val = locals()[_dir] diff --git a/tests/model_registry/conftest.py b/tests/model_registry/conftest.py index 7920b429d..d11504988 100644 --- a/tests/model_registry/conftest.py +++ b/tests/model_registry/conftest.py @@ -34,8 +34,7 @@ from tests.model_registry.rbac.utils import wait_for_oauth_openshift_deployment from tests.model_registry.utils import generate_namespace_name, get_rest_headers -from utilities.general import generate_random_name - +from utilities.general import generate_random_name, wait_for_pods_running from tests.model_registry.constants import ( MR_OPERATOR_NAME, @@ -49,7 +48,6 @@ from tests.model_registry.utils import ( get_endpoint_from_mr_service, get_mr_service_by_label, - wait_for_pods_running, get_model_registry_objects, get_model_registry_metadata_resources, ) diff --git a/tests/model_registry/model_catalog/utils.py b/tests/model_registry/model_catalog/utils.py index 976732834..16704d22c 100644 --- a/tests/model_registry/model_catalog/utils.py +++ b/tests/model_registry/model_catalog/utils.py @@ -14,7 +14,8 @@ CATALOG_TYPE, DEFAULT_CATALOG_FILE, ) -from tests.model_registry.utils import get_model_catalog_pod, wait_for_pods_running, get_rest_headers +from tests.model_registry.utils import get_model_catalog_pod, get_rest_headers +from utilities.general import wait_for_pods_running LOGGER = get_logger(name=__name__) diff --git a/tests/model_registry/rest_api/conftest.py b/tests/model_registry/rest_api/conftest.py index 482b04eae..7bf32612f 100644 --- a/tests/model_registry/rest_api/conftest.py +++ b/tests/model_registry/rest_api/conftest.py @@ -10,13 +10,12 @@ execute_model_registry_patch_command, get_mr_deployment, ) -from utilities.general import generate_random_name +from utilities.general import generate_random_name, wait_for_pods_running from ocp_resources.deployment import Deployment from tests.model_registry.utils import ( get_model_registry_deployment_template_dict, apply_mysql_args_and_volume_mounts, add_mysql_certs_volumes_to_deployment, - wait_for_pods_running, get_mr_standard_labels, get_mysql_config, ) diff --git a/tests/model_registry/utils.py b/tests/model_registry/utils.py index db7fc0c2e..7ac0c4528 100644 --- a/tests/model_registry/utils.py +++ b/tests/model_registry/utils.py @@ -13,8 +13,7 @@ from ocp_resources.model_registry_modelregistry_opendatahub_io import ModelRegistry from kubernetes.dynamic.exceptions import ResourceNotFoundError from simple_logger.logger import get_logger -from timeout_sampler import TimeoutExpiredError, TimeoutSampler, retry -from kubernetes.dynamic.exceptions import NotFoundError +from timeout_sampler import retry from tests.model_registry.constants import ( MR_DB_IMAGE_DIGEST, MODEL_REGISTRY_DB_SECRET_STR_DATA, @@ -213,74 +212,6 @@ def get_model_registry_db_label_dict(db_resource_name: str) -> dict[str, str]: } -def get_pod_container_error_status(pod: Pod) -> str | None: - """ - Check container error status for a given pod and if any containers is in waiting state, return that information - """ - pod_instance_status = pod.instance.status - for container_status in pod_instance_status.get("containerStatuses", []): - if waiting_container := container_status.get("state", {}).get("waiting"): - return waiting_container["reason"] if waiting_container.get("reason") else waiting_container - return "" - - -def get_not_running_pods(pods: list[Pod]) -> list[dict[str, Any]]: - # Gets all the non-running pods from a given namespace. - # Note: We need to keep track of pods marked for deletion as not running. This would ensure any - # pod that was spun up in place of pod marked for deletion, are not ignored - pods_not_running = [] - try: - for pod in pods: - pod_instance = pod.instance - if container_status_error := get_pod_container_error_status(pod=pod): - pods_not_running.append({pod.name: container_status_error}) - - if pod_instance.metadata.get("deletionTimestamp") or pod_instance.status.phase not in ( - pod.Status.RUNNING, - pod.Status.SUCCEEDED, - ): - pods_not_running.append({pod.name: pod.status}) - except (ResourceNotFoundError, NotFoundError) as exc: - LOGGER.warning("Ignoring pod that disappeared during cluster sanity check: %s", exc) - return pods_not_running - - -def wait_for_pods_running( - admin_client: DynamicClient, - namespace_name: str, - number_of_consecutive_checks: int = 1, -) -> bool | None: - """ - Waits for all pods in a given namespace to reach Running/Completed state. To avoid catching all pods in running - state too soon, use number_of_consecutive_checks with appropriate values. - """ - samples = TimeoutSampler( - wait_timeout=180, - sleep=5, - func=get_not_running_pods, - pods=list(Pod.get(dyn_client=admin_client, namespace=namespace_name)), - exceptions_dict={NotFoundError: [], ResourceNotFoundError: []}, - ) - sample = None - try: - current_check = 0 - for sample in samples: - if not sample: - current_check += 1 - if current_check >= number_of_consecutive_checks: - return True - else: - current_check = 0 - except TimeoutExpiredError: - if sample: - LOGGER.error( - f"timeout waiting for all pods in namespace {namespace_name} to reach " - f"running state, following pods are in not running state: {sample}" - ) - raise - return None - - @retry(exceptions_dict={TimeoutError: []}, wait_timeout=Timeout.TIMEOUT_2MIN, sleep=5) def wait_for_new_running_mr_pod( admin_client: DynamicClient, diff --git a/utilities/general.py b/utilities/general.py index ba9b330b8..6e7aac376 100644 --- a/utilities/general.py +++ b/utilities/general.py @@ -1,6 +1,6 @@ import base64 import re -from typing import List, Tuple +from typing import List, Tuple, Any import uuid from kubernetes.dynamic import DynamicClient @@ -15,6 +15,7 @@ from utilities.exceptions import UnexpectedResourceCountError, ResourceValueMismatch from ocp_resources.resource import Resource from timeout_sampler import retry +from timeout_sampler import TimeoutExpiredError, TimeoutSampler # Constants for image validation SHA256_DIGEST_PATTERN = r"@sha256:[a-f0-9]{64}$" @@ -384,3 +385,71 @@ def wait_for_container_status(pod: Pod, container_name: str, expected_status: st LOGGER.info(f"Container {container_name} is in the expected status {expected_status}") return True raise ResourceValueMismatch(f"Container {container_name} is not in the expected status {container_status.state}") + + +def get_pod_container_error_status(pod: Pod) -> str | None: + """ + Check container error status for a given pod and if any containers is in waiting state, return that information + """ + pod_instance_status = pod.instance.status + for container_status in pod_instance_status.get("containerStatuses", []): + if waiting_container := container_status.get("state", {}).get("waiting"): + return waiting_container["reason"] if waiting_container.get("reason") else waiting_container + return "" + + +def get_not_running_pods(pods: list[Pod]) -> list[dict[str, Any]]: + # Gets all the non-running pods from a given namespace. + # Note: We need to keep track of pods marked for deletion as not running. This would ensure any + # pod that was spun up in place of pod marked for deletion, are not ignored + pods_not_running = [] + try: + for pod in pods: + pod_instance = pod.instance + if container_status_error := get_pod_container_error_status(pod=pod): + pods_not_running.append({pod.name: container_status_error}) + + if pod_instance.metadata.get("deletionTimestamp") or pod_instance.status.phase not in ( + pod.Status.RUNNING, + pod.Status.SUCCEEDED, + ): + pods_not_running.append({pod.name: pod.status}) + except (ResourceNotFoundError, NotFoundError) as exc: + LOGGER.warning("Ignoring pod that disappeared during cluster sanity check: %s", exc) + return pods_not_running + + +def wait_for_pods_running( + admin_client: DynamicClient, + namespace_name: str, + number_of_consecutive_checks: int = 1, +) -> bool | None: + """ + Waits for all pods in a given namespace to reach Running/Completed state. To avoid catching all pods in running + state too soon, use number_of_consecutive_checks with appropriate values. + """ + samples = TimeoutSampler( + wait_timeout=180, + sleep=5, + func=get_not_running_pods, + pods=list(Pod.get(dyn_client=admin_client, namespace=namespace_name)), + exceptions_dict={NotFoundError: [], ResourceNotFoundError: []}, + ) + sample = None + try: + current_check = 0 + for sample in samples: + if not sample: + current_check += 1 + if current_check >= number_of_consecutive_checks: + return True + else: + current_check = 0 + except TimeoutExpiredError: + if sample: + LOGGER.error( + f"timeout waiting for all pods in namespace {namespace_name} to reach " + f"running state, following pods are in not running state: {sample}" + ) + raise + return None