Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,8 @@ def updated_global_config(admin_client: DynamicClient, config: Config) -> None:
distribution = get_operator_distribution(client=admin_client)
if distribution == "Open Data Hub":
py_config["distribution"] = "upstream"
# override the operator namespace
py_config["operator_namespace"] = "opendatahub-operators"

elif distribution.startswith("OpenShift AI"):
py_config["distribution"] = "downstream"
Expand Down
26 changes: 25 additions & 1 deletion tests/cluster_health/test_cluster_health.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
import pytest

from kubernetes.dynamic import DynamicClient
from ocp_resources.data_science_cluster import DataScienceCluster
from ocp_resources.dsc_initialization import DSCInitialization
from ocp_resources.node import Node
from ocp_utilities.infra import assert_nodes_in_healthy_condition, assert_nodes_schedulable
from utilities.general import wait_for_pods_running
from utilities.infra import wait_for_dsci_status_ready, wait_for_dsc_status_ready
from pytest_testconfig import config as py_config
from simple_logger.logger import get_logger

LOGGER = get_logger(name=__name__)


@pytest.mark.cluster_health
Expand All @@ -21,3 +26,22 @@ def test_data_science_cluster_initialization_healthy(dsci_resource: DSCInitializ
@pytest.mark.cluster_health
def test_data_science_cluster_healthy(dsc_resource: DataScienceCluster):
    """Cluster-health check: the DataScienceCluster must reach Ready status (raises on timeout)."""
    wait_for_dsc_status_ready(dsc_resource=dsc_resource)


@pytest.mark.parametrize(
    "namespace_name",
    [
        pytest.param(
            # NOTE(review): py_config is read at module import/collection time, so the
            # distribution-specific overrides must already be applied by then — confirm
            # conftest ordering guarantees this.
            py_config["operator_namespace"],
            id="test_operator_namespace_pod_healthy",
        ),
        pytest.param(
            py_config["applications_namespace"],
            id="test_application_namespace_pod_healthy",
        ),
    ],
)
@pytest.mark.cluster_health
def test_pods_cluster_healthy(admin_client: DynamicClient, namespace_name: str):
    """Cluster-health check: all pods in the given namespace must be Running/Completed."""
    LOGGER.info(f"Testing Pods in namespace {namespace_name} for cluster health")
    # wait_for_pods_running raises on timeout, which fails the test.
    wait_for_pods_running(admin_client=admin_client, namespace_name=namespace_name)
3 changes: 3 additions & 0 deletions tests/global_config.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from utilities.constants import RHOAI_OPERATOR_NAMESPACE

global config # type:ignore[unused-ignore]

dsc_name: str = "default-dsc"
Expand All @@ -9,6 +11,7 @@
distribution: str = "downstream"
applications_namespace: str = "redhat-ods-applications"
model_registry_namespace: str = "rhoai-model-registries"
operator_namespace: str = RHOAI_OPERATOR_NAMESPACE

for _dir in dir():
val = locals()[_dir]
Expand Down
4 changes: 1 addition & 3 deletions tests/model_registry/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@

from tests.model_registry.rbac.utils import wait_for_oauth_openshift_deployment
from tests.model_registry.utils import generate_namespace_name, get_rest_headers
from utilities.general import generate_random_name

from utilities.general import generate_random_name, wait_for_pods_running

from tests.model_registry.constants import (
MR_OPERATOR_NAME,
Expand All @@ -49,7 +48,6 @@
from tests.model_registry.utils import (
get_endpoint_from_mr_service,
get_mr_service_by_label,
wait_for_pods_running,
get_model_registry_objects,
get_model_registry_metadata_resources,
)
Expand Down
3 changes: 2 additions & 1 deletion tests/model_registry/model_catalog/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
CATALOG_TYPE,
DEFAULT_CATALOG_FILE,
)
from tests.model_registry.utils import get_model_catalog_pod, wait_for_pods_running, get_rest_headers
from tests.model_registry.utils import get_model_catalog_pod, get_rest_headers
from utilities.general import wait_for_pods_running

LOGGER = get_logger(name=__name__)

Expand Down
3 changes: 1 addition & 2 deletions tests/model_registry/rest_api/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,12 @@
execute_model_registry_patch_command,
get_mr_deployment,
)
from utilities.general import generate_random_name
from utilities.general import generate_random_name, wait_for_pods_running
from ocp_resources.deployment import Deployment
from tests.model_registry.utils import (
get_model_registry_deployment_template_dict,
apply_mysql_args_and_volume_mounts,
add_mysql_certs_volumes_to_deployment,
wait_for_pods_running,
get_mr_standard_labels,
get_mysql_config,
)
Expand Down
71 changes: 1 addition & 70 deletions tests/model_registry/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@
from ocp_resources.model_registry_modelregistry_opendatahub_io import ModelRegistry
from kubernetes.dynamic.exceptions import ResourceNotFoundError
from simple_logger.logger import get_logger
from timeout_sampler import TimeoutExpiredError, TimeoutSampler, retry
from kubernetes.dynamic.exceptions import NotFoundError
from timeout_sampler import retry
from tests.model_registry.constants import (
MR_DB_IMAGE_DIGEST,
MODEL_REGISTRY_DB_SECRET_STR_DATA,
Expand Down Expand Up @@ -213,74 +212,6 @@ def get_model_registry_db_label_dict(db_resource_name: str) -> dict[str, str]:
}


def get_pod_container_error_status(pod: Pod) -> str | None:
    """
    Return the waiting-state error of the first waiting container in *pod*.

    Scans the pod's containerStatuses; for the first container in a "waiting"
    state, returns its reason (or the stringified waiting state when no reason
    is set, so callers still get diagnostic detail).

    Returns:
        The waiting reason/state as a string, or None when no container is waiting.
    """
    pod_instance_status = pod.instance.status
    for container_status in pod_instance_status.get("containerStatuses", []):
        if waiting_state := container_status.get("state", {}).get("waiting"):
            # Coerce to str so the return value always matches the annotation.
            return waiting_state.get("reason") or str(waiting_state)
    # Was `return ""`; None is equally falsy for callers' walrus checks and
    # matches the declared `str | None` return type.
    return None


def get_not_running_pods(pods: list[Pod]) -> list[dict[str, Any]]:
    """
    Return the pods from *pods* that are not in Running/Succeeded state.

    Pods marked for deletion (deletionTimestamp set) are reported as not running
    so that replacement pods spun up in their place are not ignored.

    Args:
        pods: pod objects to inspect.

    Returns:
        A list of {pod_name: status_or_error} entries; empty when all pods are healthy.
    """
    pods_not_running = []
    for pod in pods:
        # try/except per pod (was around the whole loop): one pod disappearing
        # no longer aborts the check and silently skips the remaining pods.
        try:
            pod_instance = pod.instance
            if container_status_error := get_pod_container_error_status(pod=pod):
                pods_not_running.append({pod.name: container_status_error})

            if pod_instance.metadata.get("deletionTimestamp") or pod_instance.status.phase not in (
                pod.Status.RUNNING,
                pod.Status.SUCCEEDED,
            ):
                pods_not_running.append({pod.name: pod.status})
        except (ResourceNotFoundError, NotFoundError) as exc:
            LOGGER.warning("Ignoring pod that disappeared during cluster sanity check: %s", exc)
    return pods_not_running


def wait_for_pods_running(
    admin_client: DynamicClient,
    namespace_name: str,
    number_of_consecutive_checks: int = 1,
    wait_timeout: int = 180,
) -> bool | None:
    """
    Wait for all pods in a namespace to reach Running/Completed state.

    To avoid declaring success while the pod set is still churning, require
    `number_of_consecutive_checks` consecutive clean samples.

    Args:
        admin_client: client used to list pods in the namespace.
        namespace_name: namespace whose pods are checked.
        number_of_consecutive_checks: consecutive all-running samples required.
        wait_timeout: total seconds to wait before giving up (default 180).

    Returns:
        True once the required number of consecutive clean checks is observed.

    Raises:
        TimeoutExpiredError: when pods are still not running at timeout.
    """
    # Re-list pods on every sample. Previously a pre-fetched pod list was passed
    # to the sampler, so pods created after the first listing were never checked
    # and already-deleted pods were re-queried on every retry.
    samples = TimeoutSampler(
        wait_timeout=wait_timeout,
        sleep=5,
        func=lambda: get_not_running_pods(
            pods=list(Pod.get(dyn_client=admin_client, namespace=namespace_name))
        ),
        exceptions_dict={NotFoundError: [], ResourceNotFoundError: []},
    )
    sample = None
    try:
        current_check = 0
        for sample in samples:
            if not sample:
                current_check += 1
                if current_check >= number_of_consecutive_checks:
                    return True
            else:
                # Any not-running pod resets the consecutive-clean streak.
                current_check = 0
    except TimeoutExpiredError:
        if sample:
            LOGGER.error(
                f"timeout waiting for all pods in namespace {namespace_name} to reach "
                f"running state, following pods are in not running state: {sample}"
            )
        raise
    return None


@retry(exceptions_dict={TimeoutError: []}, wait_timeout=Timeout.TIMEOUT_2MIN, sleep=5)
def wait_for_new_running_mr_pod(
admin_client: DynamicClient,
Expand Down
71 changes: 70 additions & 1 deletion utilities/general.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import base64
import re
from typing import List, Tuple
from typing import List, Tuple, Any
import uuid

from kubernetes.dynamic import DynamicClient
Expand All @@ -15,6 +15,7 @@
from utilities.exceptions import UnexpectedResourceCountError, ResourceValueMismatch
from ocp_resources.resource import Resource
from timeout_sampler import retry
from timeout_sampler import TimeoutExpiredError, TimeoutSampler

# Constants for image validation
SHA256_DIGEST_PATTERN = r"@sha256:[a-f0-9]{64}$"
Expand Down Expand Up @@ -384,3 +385,71 @@ def wait_for_container_status(pod: Pod, container_name: str, expected_status: st
LOGGER.info(f"Container {container_name} is in the expected status {expected_status}")
return True
raise ResourceValueMismatch(f"Container {container_name} is not in the expected status {container_status.state}")


def get_pod_container_error_status(pod: Pod) -> str | None:
    """
    Return the waiting-state error of the first waiting container in *pod*.

    Scans containerStatuses and, for the first container in a "waiting" state,
    returns its reason; when no reason is set, the whole waiting state is
    stringified so callers still get diagnostic detail.

    Returns:
        The waiting reason/state as a string, or None when no container is waiting.
    """
    pod_instance_status = pod.instance.status
    for container_status in pod_instance_status.get("containerStatuses", []):
        if waiting_state := container_status.get("state", {}).get("waiting"):
            # str() coercion keeps the return value consistent with the annotation.
            return waiting_state.get("reason") or str(waiting_state)
    # Was `return ""`: callers only test truthiness (walrus `if err := ...`),
    # so returning None is behavior-compatible and matches `str | None`.
    return None


def get_not_running_pods(pods: list[Pod]) -> list[dict[str, Any]]:
    """
    Return the pods from *pods* that are not Running/Succeeded.

    Pods marked for deletion (deletionTimestamp set) count as not running, so a
    replacement pod spun up in place of a deleted one is not ignored.

    Args:
        pods: pod objects to inspect.

    Returns:
        A list of {pod_name: status_or_error} entries; empty when all are healthy.
    """
    pods_not_running = []
    for pod in pods:
        # Per-pod try/except (previously wrapped the whole loop): a pod that
        # disappears mid-check no longer aborts inspection of the remaining
        # pods, which could return a partial (falsely clean) result.
        try:
            pod_instance = pod.instance
            if container_status_error := get_pod_container_error_status(pod=pod):
                pods_not_running.append({pod.name: container_status_error})

            if pod_instance.metadata.get("deletionTimestamp") or pod_instance.status.phase not in (
                pod.Status.RUNNING,
                pod.Status.SUCCEEDED,
            ):
                pods_not_running.append({pod.name: pod.status})
        except (ResourceNotFoundError, NotFoundError) as exc:
            LOGGER.warning("Ignoring pod that disappeared during cluster sanity check: %s", exc)
    return pods_not_running


def wait_for_pods_running(
    admin_client: DynamicClient,
    namespace_name: str,
    number_of_consecutive_checks: int = 1,
    wait_timeout: int = 180,
) -> bool | None:
    """
    Wait for all pods in a namespace to reach Running/Completed state.

    To avoid catching all pods in running state too soon, require
    `number_of_consecutive_checks` consecutive clean samples.

    Args:
        admin_client: client used to list pods.
        namespace_name: namespace whose pods are checked.
        number_of_consecutive_checks: consecutive all-running samples required.
        wait_timeout: total seconds to wait before giving up (default 180).

    Returns:
        True once enough consecutive clean checks are observed.

    Raises:
        TimeoutExpiredError: when pods are still not running at timeout.
    """
    # The pod list must be re-fetched for every sample. The previous code passed
    # `pods=list(Pod.get(...))` to TimeoutSampler, which evaluated the listing
    # once at construction: pods created later were never checked, and deleted
    # pods were re-queried on every retry.
    samples = TimeoutSampler(
        wait_timeout=wait_timeout,
        sleep=5,
        func=lambda: get_not_running_pods(
            pods=list(Pod.get(dyn_client=admin_client, namespace=namespace_name))
        ),
        exceptions_dict={NotFoundError: [], ResourceNotFoundError: []},
    )
    sample = None
    try:
        current_check = 0
        for sample in samples:
            if not sample:
                current_check += 1
                if current_check >= number_of_consecutive_checks:
                    return True
            else:
                # A dirty sample resets the consecutive-clean counter.
                current_check = 0
    except TimeoutExpiredError:
        if sample:
            LOGGER.error(
                f"timeout waiting for all pods in namespace {namespace_name} to reach "
                f"running state, following pods are in not running state: {sample}"
            )
        raise
    return None