Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions tests/model_registry/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
get_mr_service_by_label,
get_model_registry_deployment_template_dict,
get_model_registry_db_label_dict,
wait_for_pods_running,
)
from utilities.constants import Annotations, Protocols, DscComponents
from model_registry import ModelRegistry as ModelRegistryClient
Expand Down Expand Up @@ -221,8 +222,7 @@ def after_call(self, response: Response, case: Case) -> None:

@pytest.fixture(scope="class")
def updated_dsc_component_state_scope_class(
request: FixtureRequest,
dsc_resource: DataScienceCluster,
request: FixtureRequest, dsc_resource: DataScienceCluster, admin_client: DynamicClient
) -> Generator[DataScienceCluster, Any, Any]:
original_components = dsc_resource.instance.spec.components
with ResourceEditor(patches={dsc_resource: {"spec": {"components": request.param["component_patch"]}}}):
Expand All @@ -233,6 +233,11 @@ def updated_dsc_component_state_scope_class(
name=dsc_resource.instance.spec.components.modelregistry.registriesNamespace, ensure_exists=True
)
namespace.wait_for_status(status=Namespace.Status.ACTIVE)
wait_for_pods_running(
admin_client=admin_client,
namespace_name=py_config["applications_namespace"],
number_of_consecutive_checks=6,
)
yield dsc_resource

for component_name, value in request.param["component_patch"].items():
Expand Down
75 changes: 74 additions & 1 deletion tests/model_registry/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,21 @@

from kubernetes.dynamic import DynamicClient
from ocp_resources.namespace import Namespace
from ocp_resources.pod import Pod
from ocp_resources.service import Service
from ocp_resources.model_registry import ModelRegistry
from kubernetes.dynamic.exceptions import ResourceNotFoundError

from simple_logger.logger import get_logger
from timeout_sampler import TimeoutExpiredError, TimeoutSampler
from kubernetes.dynamic.exceptions import NotFoundError
from tests.model_registry.constants import MR_DB_IMAGE_DIGEST
from utilities.exceptions import ProtocolNotSupportedError, TooManyServicesError
from utilities.constants import Protocols, Annotations

ADDRESS_ANNOTATION_PREFIX: str = "routing.opendatahub.io/external-address-"

LOGGER = get_logger(name=__name__)


def get_mr_service_by_label(client: DynamicClient, ns: Namespace, mr_instance: ModelRegistry) -> Service:
"""
Expand Down Expand Up @@ -156,3 +161,71 @@ def get_model_registry_db_label_dict(db_resource_name: str) -> dict[str, str]:
Annotations.KubernetesIo.INSTANCE: db_resource_name,
Annotations.KubernetesIo.PART_OF: db_resource_name,
}


def get_pod_container_error_status(pod: Pod) -> str | dict[str, Any]:
    """
    Return the waiting-state error for the first waiting container found in a pod.

    Args:
        pod (Pod): pod whose `status.containerStatuses` are inspected.

    Returns:
        The waiting `reason` string when one is set, the raw waiting-state mapping
        when the waiting state has no reason, or an empty string when no container
        is in a waiting state.
    """
    pod_instance_status = pod.instance.status
    for container_status in pod_instance_status.get("containerStatuses", []):
        # A populated "waiting" state means the container could not start
        # (e.g. ImagePullBackOff, CrashLoopBackOff).
        if waiting_state := container_status.get("state", {}).get("waiting"):
            # Prefer the human-readable reason; fall back to the whole waiting mapping.
            return waiting_state.get("reason") or waiting_state
    return ""


def get_not_running_pods(pods: list[Pod]) -> list[dict[str, Any]]:
    """
    Return one `{pod_name: detail}` entry for every pod that is not healthy.

    A pod is reported when any of its containers is in a waiting state, when it is
    marked for deletion, or when its phase is neither Running nor Succeeded.

    Note: pods marked for deletion are deliberately reported as not running, so that
    replacement pods spun up in their place are not ignored by callers that re-check.

    Args:
        pods (list[Pod]): pods to inspect.

    Returns:
        list[dict[str, Any]]: entries keyed by pod name; a pod may contribute two
        entries (container error and phase/deletion) — callers only test emptiness.
    """
    pods_not_running = []
    for pod in pods:
        # Handle disappearance per pod: a pod vanishing mid-iteration must not
        # abort inspection of the remaining pods.
        try:
            pod_instance = pod.instance
            if container_status_error := get_pod_container_error_status(pod=pod):
                pods_not_running.append({pod.name: container_status_error})

            if pod_instance.metadata.get("deletionTimestamp") or pod_instance.status.phase not in (
                Pod.Status.RUNNING,
                Pod.Status.SUCCEEDED,
            ):
                # NOTE(review): records `pod.status` (resource-level attribute), not
                # `pod_instance.status.phase` — presumably intentional; confirm.
                pods_not_running.append({pod.name: pod.status})
        except (ResourceNotFoundError, NotFoundError) as exc:
            LOGGER.warning("Ignoring pod that disappeared during cluster sanity check: %s", exc)
    return pods_not_running


def wait_for_pods_running(
    admin_client: DynamicClient,
    namespace_name: str,
    number_of_consecutive_checks: int = 1,
) -> bool | None:
    """
    Wait until all pods in a namespace are Running/Succeeded.

    The pod list is re-fetched on every sampling iteration, so pods created after
    this call (e.g. replacements for pods marked for deletion) are also checked.
    To avoid declaring success on a transiently-quiet namespace, require
    `number_of_consecutive_checks` consecutive all-clear samples.

    Args:
        admin_client (DynamicClient): client used to list pods.
        namespace_name (str): namespace to inspect.
        number_of_consecutive_checks (int): consecutive clean samples required
            before returning True.

    Returns:
        bool | None: True once the required consecutive clean checks are observed.

    Raises:
        TimeoutExpiredError: if the namespace does not settle within the timeout.
    """

    def _not_running_pods() -> list[dict[str, Any]]:
        # Re-list on each sample: a one-time snapshot would keep checking stale
        # pod objects and never see replacement pods.
        return get_not_running_pods(pods=list(Pod.get(dyn_client=admin_client, namespace=namespace_name)))

    samples = TimeoutSampler(
        wait_timeout=180,
        sleep=5,
        func=_not_running_pods,
        exceptions_dict={NotFoundError: [], ResourceNotFoundError: []},
    )
    sample = None
    try:
        consecutive_clean_checks = 0
        for sample in samples:
            if not sample:
                consecutive_clean_checks += 1
                if consecutive_clean_checks >= number_of_consecutive_checks:
                    return True
            else:
                # Any unhealthy pod resets the streak.
                consecutive_clean_checks = 0
    except TimeoutExpiredError:
        if sample:
            LOGGER.error(
                f"timeout waiting for all pods in namespace {namespace_name} to reach "
                f"running state, following pods are in not running state: {sample}"
            )
        raise
    return None