4040from utilities .constants import KServeDeploymentType
4141from utilities .constants import Annotations
4242from utilities .exceptions import FailedPodsError
43- from timeout_sampler import TimeoutExpiredError , TimeoutSampler
43+ from timeout_sampler import TimeoutExpiredError , TimeoutSampler , retry
4444import utilities .general
4545
4646LOGGER = get_logger (name = __name__ )
@@ -576,17 +576,20 @@ def verify_no_failed_pods(
576576 timeout : int = Timeout .TIMEOUT_5MIN ,
577577) -> None :
578578 """
579- Verify no failed pods.
579+ Verify pods created and no failed pods.
580580
581581 Args:
582582 client (DynamicClient): DynamicClient object
583583 isvc (InferenceService): InferenceService object
584584 runtime_name (str): ServingRuntime name
585585 timeout (int): Time to wait for the pod.
586+
586587 Raises:
587- FailedPodsError: If any pod is in failed state
588+ FailedPodsError: If any pod is in failed state
588589
589590 """
591+ wait_for_isvc_pods (client = client , isvc = isvc , runtime_name = runtime_name )
592+
590593 LOGGER .info ("Verifying no failed pods" )
591594 for pods in TimeoutSampler (
592595 wait_timeout = timeout ,
@@ -612,12 +615,16 @@ def verify_no_failed_pods(
612615 pod_status = pod .instance .status
613616
614617 if pod_status .containerStatuses :
615- for container_status in pod_status .containerStatuses :
618+ for container_status in pod_status .get ("containerStatuses" , []) + pod_status .get (
619+ "initContainerStatuses" , []
620+ ):
616621 is_waiting_pull_back_off = (
617622 wait_state := container_status .state .waiting
618623 ) and wait_state .reason in (
619624 pod .Status .IMAGE_PULL_BACK_OFF ,
620625 pod .Status .CRASH_LOOPBACK_OFF ,
626+ pod .Status .ERR_IMAGE_PULL ,
627+ "InvalidImageName" ,
621628 )
622629
623630 is_terminated_error = (
@@ -630,11 +637,6 @@ def verify_no_failed_pods(
630637 if is_waiting_pull_back_off or is_terminated_error :
631638 failed_pods [pod .name ] = pod_status
632639
633- if init_container_status := pod_status .initContainerStatuses :
634- if container_terminated := init_container_status [0 ].lastState .terminated :
635- if container_terminated .reason == "Error" :
636- failed_pods [pod .name ] = pod_status
637-
638640 elif pod_status .phase in (
639641 pod .Status .CRASH_LOOPBACK_OFF ,
640642 pod .Status .FAILED ,
@@ -781,3 +783,23 @@ def wait_for_serverless_pods_deletion(resource: Project | Namespace, admin_clien
781783 ):
782784 LOGGER .info (f"Waiting for { KServeDeploymentType .SERVERLESS } pod { pod .name } to be deleted" )
783785 pod .wait_deleted (timeout = Timeout .TIMEOUT_1MIN )
786+
787+
@retry(wait_timeout=Timeout.TIMEOUT_30SEC, sleep=1, exceptions_dict={ResourceNotFoundError: []})
def wait_for_isvc_pods(client: DynamicClient, isvc: InferenceService, runtime_name: str | None = None) -> list[Pod]:
    """
    Fetch the pods that belong to an InferenceService, retrying until the lookup succeeds.

    The lookup is wrapped by ``retry`` with a ``Timeout.TIMEOUT_30SEC`` budget and a
    1-second sleep between attempts; ``ResourceNotFoundError`` is listed in
    ``exceptions_dict`` so it does not abort the retry loop.
    NOTE(review): presumably ``get_pods_by_isvc_label`` raises ``ResourceNotFoundError``
    while no matching pod exists yet - confirm against that helper.

    Args:
        client (DynamicClient): DynamicClient object
        isvc (InferenceService): InferenceService object
        runtime_name (str | None): ServingRuntime name, forwarded to the pod lookup

    Returns:
        list[Pod]: A list of all matching pods

    Raises:
        TimeoutExpiredError: If pods do not exist before the retry budget is exhausted

    """
    LOGGER.info("Waiting for pods to be created")
    isvc_pods = get_pods_by_isvc_label(client=client, isvc=isvc, runtime_name=runtime_name)
    return isvc_pods
0 commit comments