Skip to content

Commit 7debf78

Browse files
rnetserdbasunag
authored and committed
verify_no_failed_pods - exclude container failures for Model Mesh deployments (opendatahub-io#278)
* fix: mm container
* fix: update condition
1 parent 9ae33a3 commit 7debf78

File tree

1 file changed

+13
-9
lines changed

1 file changed

+13
-9
lines changed

utilities/infra.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from ocp_resources.pod import Pod
3030
from ocp_resources.project_project_openshift_io import Project
3131
from ocp_resources.project_request import ProjectRequest
32-
from ocp_resources.resource import ResourceEditor, get_client
32+
from ocp_resources.resource import Resource, ResourceEditor, get_client
3333
from ocp_resources.role import Role
3434
from ocp_resources.route import Route
3535
from ocp_resources.secret import Secret
@@ -636,6 +636,16 @@ def verify_no_failed_pods(
636636
ready_pods = 0
637637
failed_pods: dict[str, Any] = {}
638638

639+
container_wait_base_errors = ["InvalidImageName"]
640+
container_terminated_base_errors = [Resource.Status.ERROR]
641+
642+
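# For Model Mesh, a slow image pull can leave the pod in CrashLoopBackOff temporarily before it recovers on retry, so CrashLoopBackOff is only counted as a failure when the deploymentMode annotation is set to a non-Model Mesh mode.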
643+
if (
644+
deployment_mode := isvc.instance.metadata.annotations.get("serving.kserve.io/deploymentMode")
645+
) and deployment_mode != KServeDeploymentType.MODEL_MESH:
646+
container_wait_base_errors.append(Resource.Status.CRASH_LOOPBACK_OFF)
647+
container_terminated_base_errors.append(Resource.Status.CRASH_LOOPBACK_OFF)
648+
639649
if pods:
640650
for pod in pods:
641651
for condition in pod.instance.status.conditions:
@@ -654,17 +664,11 @@ def verify_no_failed_pods(
654664
):
655665
is_waiting_pull_back_off = (
656666
wait_state := container_status.state.waiting
657-
) and wait_state.reason in (
658-
pod.Status.CRASH_LOOPBACK_OFF,
659-
"InvalidImageName",
660-
)
667+
) and wait_state.reason in container_wait_base_errors
661668

662669
is_terminated_error = (
663670
terminate_state := container_status.state.terminated
664-
) and terminate_state.reason in (
665-
pod.Status.ERROR,
666-
pod.Status.CRASH_LOOPBACK_OFF,
667-
)
671+
) and terminate_state.reason in container_terminated_base_errors
668672

669673
if is_waiting_pull_back_off or is_terminated_error:
670674
failed_pods[pod.name] = pod_status

0 commit comments

Comments
 (0)