From 9d56ef16be50c29700eb60cdf08d0964fb89df68 Mon Sep 17 00:00:00 2001 From: rnetser Date: Mon, 28 Apr 2025 20:17:53 +0300 Subject: [PATCH 1/2] fix: mm container --- utilities/infra.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/utilities/infra.py b/utilities/infra.py index 5e97ec9a3..2c19d93d1 100644 --- a/utilities/infra.py +++ b/utilities/infra.py @@ -28,7 +28,7 @@ from ocp_resources.pod import Pod from ocp_resources.project_project_openshift_io import Project from ocp_resources.project_request import ProjectRequest -from ocp_resources.resource import ResourceEditor, get_client +from ocp_resources.resource import Resource, ResourceEditor, get_client from ocp_resources.role import Role from ocp_resources.route import Route from ocp_resources.secret import Secret @@ -627,6 +627,7 @@ def verify_no_failed_pods( """ wait_for_isvc_pods(client=client, isvc=isvc, runtime_name=runtime_name) + deployment_mode = isvc.instance.metadata.annotations.get("serving.kserve.io/deploymentMode") LOGGER.info("Verifying no failed pods") for pods in TimeoutSampler( @@ -640,6 +641,14 @@ def verify_no_failed_pods( ready_pods = 0 failed_pods: dict[str, Any] = {} + container_wait_base_errors = ["InvalidImageName"] + container_terminated_base_errors = [Resource.Status.ERROR] + + # For Model Mesh, if image pulling takes longer, pod may be in CrashLoopBackOff state but recover with retries. + if deployment_mode != KServeDeploymentType.MODEL_MESH: + container_wait_base_errors.append(Resource.Status.CRASH_LOOPBACK_OFF) + container_terminated_base_errors.append(Resource.Status.CRASH_LOOPBACK_OFF) + if pods: for pod in pods: for condition in pod.instance.status.conditions: @@ -658,17 +667,11 @@ def verify_no_failed_pods( ): is_waiting_pull_back_off = ( wait_state := container_status.state.waiting - ) and wait_state.reason in ( - pod.Status.CRASH_LOOPBACK_OFF, - "InvalidImageName", - ) + ) and wait_state.reason in container_wait_base_errors is_terminated_error = ( terminate_state := container_status.state.terminated - ) and terminate_state.reason in ( - pod.Status.ERROR, - pod.Status.CRASH_LOOPBACK_OFF, - ) + ) and terminate_state.reason in container_terminated_base_errors if is_waiting_pull_back_off or is_terminated_error: failed_pods[pod.name] = pod_status From 2a70bcfad570bbceb485fdb7b7f2712dc0737dde Mon Sep 17 00:00:00 2001 From: rnetser Date: Mon, 28 Apr 2025 20:20:51 +0300 Subject: [PATCH 2/2] fix: update condition --- utilities/infra.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/utilities/infra.py b/utilities/infra.py index 2c19d93d1..b53345dd8 100644 --- a/utilities/infra.py +++ b/utilities/infra.py @@ -627,7 +627,6 @@ def verify_no_failed_pods( """ wait_for_isvc_pods(client=client, isvc=isvc, runtime_name=runtime_name) - deployment_mode = isvc.instance.metadata.annotations.get("serving.kserve.io/deploymentMode") LOGGER.info("Verifying no failed pods") for pods in TimeoutSampler( @@ -645,7 +644,9 @@ def verify_no_failed_pods( container_terminated_base_errors = [Resource.Status.ERROR] # For Model Mesh, if image pulling takes longer, pod may be in CrashLoopBackOff state but recover with retries. - if deployment_mode != KServeDeploymentType.MODEL_MESH: + if ( + deployment_mode := isvc.instance.metadata.annotations.get("serving.kserve.io/deploymentMode") + ) and deployment_mode != KServeDeploymentType.MODEL_MESH: container_wait_base_errors.append(Resource.Status.CRASH_LOOPBACK_OFF) container_terminated_base_errors.append(Resource.Status.CRASH_LOOPBACK_OFF)