2929from ocp_resources .pod import Pod
3030from ocp_resources .project_project_openshift_io import Project
3131from ocp_resources .project_request import ProjectRequest
32- from ocp_resources .resource import ResourceEditor , get_client
32+ from ocp_resources .resource import Resource , ResourceEditor , get_client
3333from ocp_resources .role import Role
3434from ocp_resources .route import Route
3535from ocp_resources .secret import Secret
@@ -636,6 +636,16 @@ def verify_no_failed_pods(
636636 ready_pods = 0
637637 failed_pods : dict [str , Any ] = {}
638638
639+ container_wait_base_errors = ["InvalidImageName" ]
640+ container_terminated_base_errors = [Resource .Status .ERROR ]
641+
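642+ # For Model Mesh, if image pulling takes a long time, the pod may enter CrashLoopBackOff but recover after retries, so CrashLoopBackOff is added to the failure reasons only when the deploymentMode annotation is set and is not Model Mesh.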
643+ if (
644+ deployment_mode := isvc .instance .metadata .annotations .get ("serving.kserve.io/deploymentMode" )
645+ ) and deployment_mode != KServeDeploymentType .MODEL_MESH :
646+ container_wait_base_errors .append (Resource .Status .CRASH_LOOPBACK_OFF )
647+ container_terminated_base_errors .append (Resource .Status .CRASH_LOOPBACK_OFF )
648+
639649 if pods :
640650 for pod in pods :
641651 for condition in pod .instance .status .conditions :
@@ -654,17 +664,11 @@ def verify_no_failed_pods(
654664 ):
655665 is_waiting_pull_back_off = (
656666 wait_state := container_status .state .waiting
657- ) and wait_state .reason in (
658- pod .Status .CRASH_LOOPBACK_OFF ,
659- "InvalidImageName" ,
660- )
667+ ) and wait_state .reason in container_wait_base_errors
661668
662669 is_terminated_error = (
663670 terminate_state := container_status .state .terminated
664- ) and terminate_state .reason in (
665- pod .Status .ERROR ,
666- pod .Status .CRASH_LOOPBACK_OFF ,
667- )
671+ ) and terminate_state .reason in container_terminated_base_errors
668672
669673 if is_waiting_pull_back_off or is_terminated_error :
670674 failed_pods [pod .name ] = pod_status
0 commit comments