|
1 | 1 | import base64 |
2 | 2 | import re |
3 | | -from typing import List, Tuple |
| 3 | +from typing import List, Tuple, Any |
4 | 4 | import uuid |
5 | 5 |
|
6 | 6 | from kubernetes.dynamic import DynamicClient |
|
15 | 15 | from utilities.exceptions import UnexpectedResourceCountError, ResourceValueMismatch |
16 | 16 | from ocp_resources.resource import Resource |
17 | 17 | from timeout_sampler import retry |
| 18 | +from timeout_sampler import TimeoutExpiredError, TimeoutSampler |
18 | 19 |
|
19 | 20 | # Constants for image validation |
20 | 21 | SHA256_DIGEST_PATTERN = r"@sha256:[a-f0-9]{64}$" |
@@ -384,3 +385,71 @@ def wait_for_container_status(pod: Pod, container_name: str, expected_status: st |
384 | 385 | LOGGER.info(f"Container {container_name} is in the expected status {expected_status}") |
385 | 386 | return True |
386 | 387 | raise ResourceValueMismatch(f"Container {container_name} is not in the expected status {container_status.state}") |
| 388 | + |
| 389 | + |
def get_pod_container_error_status(pod: Pod) -> str | dict[str, Any] | None:
    """
    Return the waiting-state error of the first waiting container in *pod*.

    Inspects ``status.containerStatuses`` of the pod; when a container is in
    the ``waiting`` state, returns its ``reason`` string (e.g.
    "CrashLoopBackOff") if populated, otherwise the raw waiting-state mapping.

    Args:
        pod (Pod): Pod whose container statuses should be inspected.

    Returns:
        The waiting reason string, the waiting-state mapping when no reason is
        set, or None when no container is waiting. None is falsy, matching the
        previous empty-string sentinel, so truthiness-based callers behave
        identically.
    """
    pod_instance_status = pod.instance.status
    for container_status in pod_instance_status.get("containerStatuses", []):
        if waiting_container := container_status.get("state", {}).get("waiting"):
            # Prefer the short machine-readable reason; fall back to the full
            # waiting mapping when the reason field is absent or empty.
            return waiting_container.get("reason") or waiting_container
    return None
| 399 | + |
| 400 | + |
def get_not_running_pods(pods: list[Pod]) -> list[dict[str, Any]]:
    """
    Collect all pods from *pods* that are not in Running/Succeeded state.

    A pod marked for deletion (``deletionTimestamp`` set) is counted as not
    running so that a replacement pod spun up in its place is not ignored.
    A single pod may contribute two entries: one for a waiting-container error
    and one for its phase/deletion state (preserved from the original logic).

    Args:
        pods (list[Pod]): Pods to inspect.

    Returns:
        list[dict[str, Any]]: One ``{pod_name: error_or_status}`` mapping per
        finding; empty list when every pod is Running/Succeeded.
    """
    pods_not_running = []
    for pod in pods:
        # Handle NotFound per pod: a pod deleted between listing and
        # inspection must not abort the check for the remaining pods
        # (the previous loop-wide try/except dropped them all).
        try:
            pod_instance = pod.instance
            if container_status_error := get_pod_container_error_status(pod=pod):
                pods_not_running.append({pod.name: container_status_error})

            if pod_instance.metadata.get("deletionTimestamp") or pod_instance.status.phase not in (
                pod.Status.RUNNING,
                pod.Status.SUCCEEDED,
            ):
                pods_not_running.append({pod.name: pod.status})
        except (ResourceNotFoundError, NotFoundError) as exc:
            LOGGER.warning("Ignoring pod that disappeared during cluster sanity check: %s", exc)
    return pods_not_running
| 420 | + |
| 421 | + |
def wait_for_pods_running(
    admin_client: DynamicClient,
    namespace_name: str,
    number_of_consecutive_checks: int = 1,
) -> bool | None:
    """
    Wait for all pods in a namespace to reach Running/Succeeded state.

    To avoid declaring success too soon (e.g. while pods are still being
    churned), require ``number_of_consecutive_checks`` clean samples in a row.

    Args:
        admin_client (DynamicClient): Dynamic client used to list pods.
        namespace_name (str): Namespace whose pods are checked.
        number_of_consecutive_checks (int): Consecutive clean samples required
            before returning True. Defaults to 1.

    Returns:
        True once the required number of consecutive clean checks is reached;
        None is unreachable in practice (the sampler raises on timeout).

    Raises:
        TimeoutExpiredError: If pods are still not running after the timeout.
    """

    def _not_running_pods() -> list[dict[str, Any]]:
        # Re-list the pods on every sample so pods created after the wait
        # started (e.g. replacements for deleted pods) are also checked; a
        # single up-front listing would go stale immediately and would keep
        # re-checking pods that no longer exist.
        return get_not_running_pods(pods=list(Pod.get(dyn_client=admin_client, namespace=namespace_name)))

    samples = TimeoutSampler(
        wait_timeout=180,
        sleep=5,
        func=_not_running_pods,
        # Transient NotFound during re-listing is retried, not fatal.
        exceptions_dict={NotFoundError: [], ResourceNotFoundError: []},
    )
    sample = None
    try:
        consecutive_passes = 0
        for sample in samples:
            if not sample:
                consecutive_passes += 1
                if consecutive_passes >= number_of_consecutive_checks:
                    return True
            else:
                # Any non-running pod resets the consecutive-pass streak.
                consecutive_passes = 0
    except TimeoutExpiredError:
        if sample:
            LOGGER.error(
                f"timeout waiting for all pods in namespace {namespace_name} to reach "
                f"running state, following pods are in not running state: {sample}"
            )
        raise
    return None
0 commit comments