Skip to content

Commit 4e1d91f

Browse files
authored
verify_no_failed_pods - exclude CrashLoopBackOff container failures for Model Mesh deployments (#278)
* fix: mm container * fix: update condition
1 parent f0c32cf commit 4e1d91f

File tree

1 file changed

+13
-9
lines changed

1 file changed

+13
-9
lines changed

utilities/infra.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from ocp_resources.pod import Pod
2929
from ocp_resources.project_project_openshift_io import Project
3030
from ocp_resources.project_request import ProjectRequest
31-
from ocp_resources.resource import ResourceEditor, get_client
31+
from ocp_resources.resource import Resource, ResourceEditor, get_client
3232
from ocp_resources.role import Role
3333
from ocp_resources.route import Route
3434
from ocp_resources.secret import Secret
@@ -640,6 +640,16 @@ def verify_no_failed_pods(
640640
ready_pods = 0
641641
failed_pods: dict[str, Any] = {}
642642

643+
container_wait_base_errors = ["InvalidImageName"]
644+
container_terminated_base_errors = [Resource.Status.ERROR]
645+
646+
# For Model Mesh, image pulling can take longer, so a pod may temporarily be in CrashLoopBackOff and recover after retries; CrashLoopBackOff is treated as a failure only for other deployment modes.
647+
if (
648+
deployment_mode := isvc.instance.metadata.annotations.get("serving.kserve.io/deploymentMode")
649+
) and deployment_mode != KServeDeploymentType.MODEL_MESH:
650+
container_wait_base_errors.append(Resource.Status.CRASH_LOOPBACK_OFF)
651+
container_terminated_base_errors.append(Resource.Status.CRASH_LOOPBACK_OFF)
652+
643653
if pods:
644654
for pod in pods:
645655
for condition in pod.instance.status.conditions:
@@ -658,17 +668,11 @@ def verify_no_failed_pods(
658668
):
659669
is_waiting_pull_back_off = (
660670
wait_state := container_status.state.waiting
661-
) and wait_state.reason in (
662-
pod.Status.CRASH_LOOPBACK_OFF,
663-
"InvalidImageName",
664-
)
671+
) and wait_state.reason in container_wait_base_errors
665672

666673
is_terminated_error = (
667674
terminate_state := container_status.state.terminated
668-
) and terminate_state.reason in (
669-
pod.Status.ERROR,
670-
pod.Status.CRASH_LOOPBACK_OFF,
671-
)
675+
) and terminate_state.reason in container_terminated_base_errors
672676

673677
if is_waiting_pull_back_off or is_terminated_error:
674678
failed_pods[pod.name] = pod_status

0 commit comments

Comments
 (0)