Skip to content

Commit bfffd3d

Browse files
committed
Add pod checks to cluster health (#674)
1 parent 8d2c467 commit bfffd3d

File tree

8 files changed

+105
-78
lines changed

8 files changed

+105
-78
lines changed

conftest.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,8 @@ def updated_global_config(admin_client: DynamicClient, config: Config) -> None:
307307
distribution = get_operator_distribution(client=admin_client)
308308
if distribution == "Open Data Hub":
309309
py_config["distribution"] = "upstream"
310+
# override the operator namespace
311+
py_config["operator_namespace"] = "opendatahub-operators"
310312

311313
elif distribution.startswith("OpenShift AI"):
312314
py_config["distribution"] = "downstream"

tests/cluster_health/test_cluster_health.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
import pytest
2-
2+
from kubernetes.dynamic import DynamicClient
33
from ocp_resources.data_science_cluster import DataScienceCluster
44
from ocp_resources.dsc_initialization import DSCInitialization
55
from ocp_resources.node import Node
66
from ocp_utilities.infra import assert_nodes_in_healthy_condition, assert_nodes_schedulable
7+
from utilities.general import wait_for_pods_running
78
from utilities.infra import wait_for_dsci_status_ready, wait_for_dsc_status_ready
9+
from pytest_testconfig import config as py_config
10+
from simple_logger.logger import get_logger
11+
12+
LOGGER = get_logger(name=__name__)
813

914

1015
@pytest.mark.cluster_health
@@ -21,3 +26,22 @@ def test_data_science_cluster_initialization_healthy(dsci_resource: DSCInitializ
2126
@pytest.mark.cluster_health
2227
def test_data_science_cluster_healthy(dsc_resource: DataScienceCluster):
2328
wait_for_dsc_status_ready(dsc_resource=dsc_resource)
29+
30+
31+
@pytest.mark.parametrize(
32+
"namespace_name",
33+
[
34+
pytest.param(
35+
py_config["operator_namespace"],
36+
id="test_operator_namespace_pod_healthy",
37+
),
38+
pytest.param(
39+
py_config["applications_namespace"],
40+
id="test_application_namespace_pod_healthy",
41+
),
42+
],
43+
)
44+
@pytest.mark.cluster_health
45+
def test_pods_cluster_healthy(admin_client: DynamicClient, namespace_name: str):
46+
LOGGER.info(f"Testing Pods in namespace {namespace_name} for cluster health")
47+
wait_for_pods_running(admin_client=admin_client, namespace_name=namespace_name)

tests/global_config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from utilities.constants import RHOAI_OPERATOR_NAMESPACE
2+
13
global config # type:ignore[unused-ignore]
24

35
dsc_name: str = "default-dsc"
@@ -9,6 +11,7 @@
911
distribution: str = "downstream"
1012
applications_namespace: str = "redhat-ods-applications"
1113
model_registry_namespace: str = "rhoai-model-registries"
14+
operator_namespace: str = RHOAI_OPERATOR_NAMESPACE
1215

1316
for _dir in dir():
1417
val = locals()[_dir]

tests/model_registry/conftest.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,7 @@
3434

3535
from tests.model_registry.rbac.utils import wait_for_oauth_openshift_deployment
3636
from tests.model_registry.utils import generate_namespace_name, get_rest_headers
37-
from utilities.general import generate_random_name
38-
37+
from utilities.general import generate_random_name, wait_for_pods_running
3938

4039
from tests.model_registry.constants import (
4140
MR_OPERATOR_NAME,
@@ -49,7 +48,6 @@
4948
from tests.model_registry.utils import (
5049
get_endpoint_from_mr_service,
5150
get_mr_service_by_label,
52-
wait_for_pods_running,
5351
get_model_registry_objects,
5452
get_model_registry_metadata_resources,
5553
)

tests/model_registry/model_catalog/utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
CATALOG_TYPE,
1515
DEFAULT_CATALOG_FILE,
1616
)
17-
from tests.model_registry.utils import get_model_catalog_pod, wait_for_pods_running, get_rest_headers
17+
from tests.model_registry.utils import get_model_catalog_pod, get_rest_headers
18+
from utilities.general import wait_for_pods_running
1819

1920
LOGGER = get_logger(name=__name__)
2021

tests/model_registry/rest_api/conftest.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,12 @@
1010
execute_model_registry_patch_command,
1111
get_mr_deployment,
1212
)
13-
from utilities.general import generate_random_name
13+
from utilities.general import generate_random_name, wait_for_pods_running
1414
from ocp_resources.deployment import Deployment
1515
from tests.model_registry.utils import (
1616
get_model_registry_deployment_template_dict,
1717
apply_mysql_args_and_volume_mounts,
1818
add_mysql_certs_volumes_to_deployment,
19-
wait_for_pods_running,
2019
get_mr_standard_labels,
2120
get_mysql_config,
2221
)

tests/model_registry/utils.py

Lines changed: 1 addition & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@
1313
from ocp_resources.model_registry_modelregistry_opendatahub_io import ModelRegistry
1414
from kubernetes.dynamic.exceptions import ResourceNotFoundError
1515
from simple_logger.logger import get_logger
16-
from timeout_sampler import TimeoutExpiredError, TimeoutSampler, retry
17-
from kubernetes.dynamic.exceptions import NotFoundError
16+
from timeout_sampler import retry
1817
from tests.model_registry.constants import (
1918
MR_DB_IMAGE_DIGEST,
2019
MODEL_REGISTRY_DB_SECRET_STR_DATA,
@@ -213,74 +212,6 @@ def get_model_registry_db_label_dict(db_resource_name: str) -> dict[str, str]:
213212
}
214213

215214

216-
def get_pod_container_error_status(pod: Pod) -> str | None:
217-
"""
218-
Check container error status for a given pod and, if any container is in a waiting state, return that information
219-
"""
220-
pod_instance_status = pod.instance.status
221-
for container_status in pod_instance_status.get("containerStatuses", []):
222-
if waiting_container := container_status.get("state", {}).get("waiting"):
223-
return waiting_container["reason"] if waiting_container.get("reason") else waiting_container
224-
return ""
225-
226-
227-
def get_not_running_pods(pods: list[Pod]) -> list[dict[str, Any]]:
228-
# Gets all the non-running pods from a given namespace.
229-
# Note: We need to keep track of pods marked for deletion as not running. This would ensure any
230-
# pod that was spun up in place of a pod marked for deletion is not ignored
231-
pods_not_running = []
232-
try:
233-
for pod in pods:
234-
pod_instance = pod.instance
235-
if container_status_error := get_pod_container_error_status(pod=pod):
236-
pods_not_running.append({pod.name: container_status_error})
237-
238-
if pod_instance.metadata.get("deletionTimestamp") or pod_instance.status.phase not in (
239-
pod.Status.RUNNING,
240-
pod.Status.SUCCEEDED,
241-
):
242-
pods_not_running.append({pod.name: pod.status})
243-
except (ResourceNotFoundError, NotFoundError) as exc:
244-
LOGGER.warning("Ignoring pod that disappeared during cluster sanity check: %s", exc)
245-
return pods_not_running
246-
247-
248-
def wait_for_pods_running(
249-
admin_client: DynamicClient,
250-
namespace_name: str,
251-
number_of_consecutive_checks: int = 1,
252-
) -> bool | None:
253-
"""
254-
Waits for all pods in a given namespace to reach Running/Completed state. To avoid catching all pods in running
255-
state too soon, use number_of_consecutive_checks with appropriate values.
256-
"""
257-
samples = TimeoutSampler(
258-
wait_timeout=180,
259-
sleep=5,
260-
func=get_not_running_pods,
261-
pods=list(Pod.get(dyn_client=admin_client, namespace=namespace_name)),
262-
exceptions_dict={NotFoundError: [], ResourceNotFoundError: []},
263-
)
264-
sample = None
265-
try:
266-
current_check = 0
267-
for sample in samples:
268-
if not sample:
269-
current_check += 1
270-
if current_check >= number_of_consecutive_checks:
271-
return True
272-
else:
273-
current_check = 0
274-
except TimeoutExpiredError:
275-
if sample:
276-
LOGGER.error(
277-
f"timeout waiting for all pods in namespace {namespace_name} to reach "
278-
f"running state, following pods are in not running state: {sample}"
279-
)
280-
raise
281-
return None
282-
283-
284215
@retry(exceptions_dict={TimeoutError: []}, wait_timeout=Timeout.TIMEOUT_2MIN, sleep=5)
285216
def wait_for_new_running_mr_pod(
286217
admin_client: DynamicClient,

utilities/general.py

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import base64
22
import re
3-
from typing import List, Tuple
3+
from typing import List, Tuple, Any
44
import uuid
55

66
from kubernetes.dynamic import DynamicClient
@@ -15,6 +15,7 @@
1515
from utilities.exceptions import UnexpectedResourceCountError, ResourceValueMismatch
1616
from ocp_resources.resource import Resource
1717
from timeout_sampler import retry
18+
from timeout_sampler import TimeoutExpiredError, TimeoutSampler
1819

1920
# Constants for image validation
2021
SHA256_DIGEST_PATTERN = r"@sha256:[a-f0-9]{64}$"
@@ -384,3 +385,71 @@ def wait_for_container_status(pod: Pod, container_name: str, expected_status: st
384385
LOGGER.info(f"Container {container_name} is in the expected status {expected_status}")
385386
return True
386387
raise ResourceValueMismatch(f"Container {container_name} is not in the expected status {container_status.state}")
388+
389+
390+
def get_pod_container_error_status(pod: Pod) -> str | None:
391+
"""
392+
Check container error status for a given pod and, if any container is in a waiting state, return that information
393+
"""
394+
pod_instance_status = pod.instance.status
395+
for container_status in pod_instance_status.get("containerStatuses", []):
396+
if waiting_container := container_status.get("state", {}).get("waiting"):
397+
return waiting_container["reason"] if waiting_container.get("reason") else waiting_container
398+
return ""
399+
400+
401+
def get_not_running_pods(pods: list[Pod]) -> list[dict[str, Any]]:
402+
# Gets all the non-running pods from a given namespace.
403+
# Note: We need to keep track of pods marked for deletion as not running. This would ensure any
404+
# pod that was spun up in place of a pod marked for deletion is not ignored
405+
pods_not_running = []
406+
try:
407+
for pod in pods:
408+
pod_instance = pod.instance
409+
if container_status_error := get_pod_container_error_status(pod=pod):
410+
pods_not_running.append({pod.name: container_status_error})
411+
412+
if pod_instance.metadata.get("deletionTimestamp") or pod_instance.status.phase not in (
413+
pod.Status.RUNNING,
414+
pod.Status.SUCCEEDED,
415+
):
416+
pods_not_running.append({pod.name: pod.status})
417+
except (ResourceNotFoundError, NotFoundError) as exc:
418+
LOGGER.warning("Ignoring pod that disappeared during cluster sanity check: %s", exc)
419+
return pods_not_running
420+
421+
422+
def wait_for_pods_running(
423+
admin_client: DynamicClient,
424+
namespace_name: str,
425+
number_of_consecutive_checks: int = 1,
426+
) -> bool | None:
427+
"""
428+
Waits for all pods in a given namespace to reach Running/Completed state. To avoid catching all pods in running
429+
state too soon, use number_of_consecutive_checks with appropriate values.
430+
"""
431+
samples = TimeoutSampler(
432+
wait_timeout=180,
433+
sleep=5,
434+
func=get_not_running_pods,
435+
pods=list(Pod.get(dyn_client=admin_client, namespace=namespace_name)),
436+
exceptions_dict={NotFoundError: [], ResourceNotFoundError: []},
437+
)
438+
sample = None
439+
try:
440+
current_check = 0
441+
for sample in samples:
442+
if not sample:
443+
current_check += 1
444+
if current_check >= number_of_consecutive_checks:
445+
return True
446+
else:
447+
current_check = 0
448+
except TimeoutExpiredError:
449+
if sample:
450+
LOGGER.error(
451+
f"timeout waiting for all pods in namespace {namespace_name} to reach "
452+
f"running state, following pods are in not running state: {sample}"
453+
)
454+
raise
455+
return None

0 commit comments

Comments
 (0)