Skip to content

Commit bfffd3d

Browse files
committed
Add pod checks to cluster health (#674)
1 parent 8d2c467 commit bfffd3d

File tree

8 files changed

+105
-78
lines changed

8 files changed

+105
-78
lines changed

conftest.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,8 @@ def updated_global_config(admin_client: DynamicClient, config: Config) -> None:
307307
distribution = get_operator_distribution(client=admin_client)
308308
if distribution == "Open Data Hub":
309309
py_config["distribution"] = "upstream"
310+
# override the operator namespace
311+
py_config["operator_namespace"] = "opendatahub-operators"
310312

311313
elif distribution.startswith("OpenShift AI"):
312314
py_config["distribution"] = "downstream"

tests/cluster_health/test_cluster_health.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
import pytest
2-
2+
from kubernetes.dynamic import DynamicClient
33
from ocp_resources.data_science_cluster import DataScienceCluster
44
from ocp_resources.dsc_initialization import DSCInitialization
55
from ocp_resources.node import Node
66
from ocp_utilities.infra import assert_nodes_in_healthy_condition, assert_nodes_schedulable
7+
from utilities.general import wait_for_pods_running
78
from utilities.infra import wait_for_dsci_status_ready, wait_for_dsc_status_ready
9+
from pytest_testconfig import config as py_config
10+
from simple_logger.logger import get_logger
11+
12+
LOGGER = get_logger(name=__name__)
813

914

1015
@pytest.mark.cluster_health
@@ -21,3 +26,22 @@ def test_data_science_cluster_initialization_healthy(dsci_resource: DSCInitializ
2126
@pytest.mark.cluster_health
2227
def test_data_science_cluster_healthy(dsc_resource: DataScienceCluster):
2328
wait_for_dsc_status_ready(dsc_resource=dsc_resource)
29+
30+
31+
@pytest.mark.parametrize(
32+
"namespace_name",
33+
[
34+
pytest.param(
35+
py_config["operator_namespace"],
36+
id="test_operator_namespace_pod_healthy",
37+
),
38+
pytest.param(
39+
py_config["applications_namespace"],
40+
id="test_application_namespace_pod_healthy",
41+
),
42+
],
43+
)
44+
@pytest.mark.cluster_health
45+
def test_pods_cluster_healthy(admin_client: DynamicClient, namespace_name: str):
46+
LOGGER.info(f"Testing Pods in namespace {namespace_name} for cluster health")
47+
wait_for_pods_running(admin_client=admin_client, namespace_name=namespace_name)

tests/global_config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from utilities.constants import RHOAI_OPERATOR_NAMESPACE
2+
13
global config # type:ignore[unused-ignore]
24

35
dsc_name: str = "default-dsc"
@@ -9,6 +11,7 @@
911
distribution: str = "downstream"
1012
applications_namespace: str = "redhat-ods-applications"
1113
model_registry_namespace: str = "rhoai-model-registries"
14+
operator_namespace: str = RHOAI_OPERATOR_NAMESPACE
1215

1316
for _dir in dir():
1417
val = locals()[_dir]

tests/model_registry/conftest.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,7 @@
3434

3535
from tests.model_registry.rbac.utils import wait_for_oauth_openshift_deployment
3636
from tests.model_registry.utils import generate_namespace_name, get_rest_headers
37-
from utilities.general import generate_random_name
38-
37+
from utilities.general import generate_random_name, wait_for_pods_running
3938

4039
from tests.model_registry.constants import (
4140
MR_OPERATOR_NAME,
@@ -49,7 +48,6 @@
4948
from tests.model_registry.utils import (
5049
get_endpoint_from_mr_service,
5150
get_mr_service_by_label,
52-
wait_for_pods_running,
5351
get_model_registry_objects,
5452
get_model_registry_metadata_resources,
5553
)

tests/model_registry/model_catalog/utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
CATALOG_TYPE,
1515
DEFAULT_CATALOG_FILE,
1616
)
17-
from tests.model_registry.utils import get_model_catalog_pod, wait_for_pods_running, get_rest_headers
17+
from tests.model_registry.utils import get_model_catalog_pod, get_rest_headers
18+
from utilities.general import wait_for_pods_running
1819

1920
LOGGER = get_logger(name=__name__)
2021

tests/model_registry/rest_api/conftest.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,12 @@
1010
execute_model_registry_patch_command,
1111
get_mr_deployment,
1212
)
13-
from utilities.general import generate_random_name
13+
from utilities.general import generate_random_name, wait_for_pods_running
1414
from ocp_resources.deployment import Deployment
1515
from tests.model_registry.utils import (
1616
get_model_registry_deployment_template_dict,
1717
apply_mysql_args_and_volume_mounts,
1818
add_mysql_certs_volumes_to_deployment,
19-
wait_for_pods_running,
2019
get_mr_standard_labels,
2120
get_mysql_config,
2221
)

tests/model_registry/utils.py

Lines changed: 1 addition & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@
1313
from ocp_resources.model_registry_modelregistry_opendatahub_io import ModelRegistry
1414
from kubernetes.dynamic.exceptions import ResourceNotFoundError
1515
from simple_logger.logger import get_logger
16-
from timeout_sampler import TimeoutExpiredError, TimeoutSampler, retry
17-
from kubernetes.dynamic.exceptions import NotFoundError
16+
from timeout_sampler import retry
1817
from tests.model_registry.constants import (
1918
MR_DB_IMAGE_DIGEST,
2019
MODEL_REGISTRY_DB_SECRET_STR_DATA,
@@ -213,74 +212,6 @@ def get_model_registry_db_label_dict(db_resource_name: str) -> dict[str, str]:
213212
}
214213

215214

216-
def get_pod_container_error_status(pod: Pod) -> str | None:
217-
"""
218-
Check container error status for a given pod and, if any container is in a waiting state, return that information
219-
"""
220-
pod_instance_status = pod.instance.status
221-
for container_status in pod_instance_status.get("containerStatuses", []):
222-
if waiting_container := container_status.get("state", {}).get("waiting"):
223-
return waiting_container["reason"] if waiting_container.get("reason") else waiting_container
224-
return ""
225-
226-
227-
def get_not_running_pods(pods: list[Pod]) -> list[dict[str, Any]]:
228-
# Gets all the non-running pods from a given namespace.
229-
# Note: We need to keep track of pods marked for deletion as not running. This would ensure any
230-
# pod that was spun up in place of a pod marked for deletion is not ignored
231-
pods_not_running = []
232-
try:
233-
for pod in pods:
234-
pod_instance = pod.instance
235-
if container_status_error := get_pod_container_error_status(pod=pod):
236-
pods_not_running.append({pod.name: container_status_error})
237-
238-
if pod_instance.metadata.get("deletionTimestamp") or pod_instance.status.phase not in (
239-
pod.Status.RUNNING,
240-
pod.Status.SUCCEEDED,
241-
):
242-
pods_not_running.append({pod.name: pod.status})
243-
except (ResourceNotFoundError, NotFoundError) as exc:
244-
LOGGER.warning("Ignoring pod that disappeared during cluster sanity check: %s", exc)
245-
return pods_not_running
246-
247-
248-
def wait_for_pods_running(
249-
admin_client: DynamicClient,
250-
namespace_name: str,
251-
number_of_consecutive_checks: int = 1,
252-
) -> bool | None:
253-
"""
254-
Waits for all pods in a given namespace to reach Running/Completed state. To avoid catching all pods in running
255-
state too soon, use number_of_consecutive_checks with appropriate values.
256-
"""
257-
samples = TimeoutSampler(
258-
wait_timeout=180,
259-
sleep=5,
260-
func=get_not_running_pods,
261-
pods=list(Pod.get(dyn_client=admin_client, namespace=namespace_name)),
262-
exceptions_dict={NotFoundError: [], ResourceNotFoundError: []},
263-
)
264-
sample = None
265-
try:
266-
current_check = 0
267-
for sample in samples:
268-
if not sample:
269-
current_check += 1
270-
if current_check >= number_of_consecutive_checks:
271-
return True
272-
else:
273-
current_check = 0
274-
except TimeoutExpiredError:
275-
if sample:
276-
LOGGER.error(
277-
f"timeout waiting for all pods in namespace {namespace_name} to reach "
278-
f"running state, following pods are in not running state: {sample}"
279-
)
280-
raise
281-
return None
282-
283-
284215
@retry(exceptions_dict={TimeoutError: []}, wait_timeout=Timeout.TIMEOUT_2MIN, sleep=5)
285216
def wait_for_new_running_mr_pod(
286217
admin_client: DynamicClient,

utilities/general.py

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import base64
22
import re
3-
from typing import List, Tuple
3+
from typing import List, Tuple, Any
44
import uuid
55

66
from kubernetes.dynamic import DynamicClient
@@ -15,6 +15,7 @@
1515
from utilities.exceptions import UnexpectedResourceCountError, ResourceValueMismatch
1616
from ocp_resources.resource import Resource
1717
from timeout_sampler import retry
18+
from timeout_sampler import TimeoutExpiredError, TimeoutSampler
1819

1920
# Constants for image validation
2021
SHA256_DIGEST_PATTERN = r"@sha256:[a-f0-9]{64}$"
@@ -384,3 +385,71 @@ def wait_for_container_status(pod: Pod, container_name: str, expected_status: st
384385
LOGGER.info(f"Container {container_name} is in the expected status {expected_status}")
385386
return True
386387
raise ResourceValueMismatch(f"Container {container_name} is not in the expected status {container_status.state}")
388+
389+
390+
def get_pod_container_error_status(pod: Pod) -> str | None:
391+
"""
392+
Check container error status for a given pod and, if any container is in a waiting state, return that information
393+
"""
394+
pod_instance_status = pod.instance.status
395+
for container_status in pod_instance_status.get("containerStatuses", []):
396+
if waiting_container := container_status.get("state", {}).get("waiting"):
397+
return waiting_container["reason"] if waiting_container.get("reason") else waiting_container
398+
return ""
399+
400+
401+
def get_not_running_pods(pods: list[Pod]) -> list[dict[str, Any]]:
402+
# Gets all the non-running pods from a given namespace.
403+
# Note: We need to keep track of pods marked for deletion as not running. This would ensure any
404+
# pod that was spun up in place of a pod marked for deletion is not ignored
405+
pods_not_running = []
406+
try:
407+
for pod in pods:
408+
pod_instance = pod.instance
409+
if container_status_error := get_pod_container_error_status(pod=pod):
410+
pods_not_running.append({pod.name: container_status_error})
411+
412+
if pod_instance.metadata.get("deletionTimestamp") or pod_instance.status.phase not in (
413+
pod.Status.RUNNING,
414+
pod.Status.SUCCEEDED,
415+
):
416+
pods_not_running.append({pod.name: pod.status})
417+
except (ResourceNotFoundError, NotFoundError) as exc:
418+
LOGGER.warning("Ignoring pod that disappeared during cluster sanity check: %s", exc)
419+
return pods_not_running
420+
421+
422+
def wait_for_pods_running(
423+
admin_client: DynamicClient,
424+
namespace_name: str,
425+
number_of_consecutive_checks: int = 1,
426+
) -> bool | None:
427+
"""
428+
Waits for all pods in a given namespace to reach Running/Completed state. To avoid catching all pods in running
429+
state too soon, use number_of_consecutive_checks with appropriate values.
430+
"""
431+
samples = TimeoutSampler(
432+
wait_timeout=180,
433+
sleep=5,
434+
func=get_not_running_pods,
435+
pods=list(Pod.get(dyn_client=admin_client, namespace=namespace_name)),
436+
exceptions_dict={NotFoundError: [], ResourceNotFoundError: []},
437+
)
438+
sample = None
439+
try:
440+
current_check = 0
441+
for sample in samples:
442+
if not sample:
443+
current_check += 1
444+
if current_check >= number_of_consecutive_checks:
445+
return True
446+
else:
447+
current_check = 0
448+
except TimeoutExpiredError:
449+
if sample:
450+
LOGGER.error(
451+
f"timeout waiting for all pods in namespace {namespace_name} to reach "
452+
f"running state, following pods are in not running state: {sample}"
453+
)
454+
raise
455+
return None

0 commit comments

Comments
 (0)