Skip to content

Commit c3f586e

Browse files
authored
Merge branch 'main' into cherry-pick
2 parents 4377d51 + 494adec commit c3f586e

File tree

5 files changed

+33
-12
lines changed

5 files changed

+33
-12
lines changed

tests/model_explainability/trustyai_service/drift/conftest.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@ def mlserver_runtime(
6363
protocol_versions=["v2"],
6464
annotations={
6565
f"{ApiGroups.OPENDATAHUB_IO}/accelerator-name": "",
66-
f"{ApiGroups.OPENDATAHUB_IO}/recommended-accelerators": [Labels.Nvidia.NVIDIA_COM_GPU],
6766
f"{ApiGroups.OPENDATAHUB_IO}/template-display-name": "KServe MLServer",
6867
"prometheus.kserve.io/path": "/metrics",
6968
"prometheus.io/port": str(Ports.REST_PORT),

tests/model_explainability/trustyai_service/trustyai_service_utils.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
from utilities.inference_utils import Inference, UserInference
2222

2323
LOGGER = get_logger(name=__name__)
24-
TIMEOUT_30SEC: int = 30
2524

2625
TRUSTYAI_SERVICE_NAME: str = "trustyai-service"
2726

utilities/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,7 @@ class Nvidia:
187187

188188

189189
class Timeout:
190+
TIMEOUT_30SEC: int = 30
190191
TIMEOUT_1MIN: int = 60
191192
TIMEOUT_2MIN: int = 2 * TIMEOUT_1MIN
192193
TIMEOUT_4MIN: int = 4 * TIMEOUT_1MIN

utilities/inference_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ def run_inference_flow(
399399
except JSONDecodeError:
400400
return {"output": out}
401401

402-
@retry(wait_timeout=30, sleep=5)
402+
@retry(wait_timeout=Timeout.TIMEOUT_30SEC, sleep=5)
403403
def run_inference(self, cmd: str) -> str:
404404
"""
405405
Run inference command

utilities/infra.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
from utilities.constants import KServeDeploymentType
4141
from utilities.constants import Annotations
4242
from utilities.exceptions import FailedPodsError
43-
from timeout_sampler import TimeoutExpiredError, TimeoutSampler
43+
from timeout_sampler import TimeoutExpiredError, TimeoutSampler, retry
4444
import utilities.general
4545

4646
LOGGER = get_logger(name=__name__)
@@ -576,17 +576,20 @@ def verify_no_failed_pods(
576576
timeout: int = Timeout.TIMEOUT_5MIN,
577577
) -> None:
578578
"""
579-
Verify no failed pods.
579+
Verify that pods are created and that none of them have failed.
580580
581581
Args:
582582
client (DynamicClient): DynamicClient object
583583
isvc (InferenceService): InferenceService object
584584
runtime_name (str): ServingRuntime name
585585
timeout (int): Time to wait for the pod.
586+
586587
Raises:
587-
FailedPodsError: If any pod is in failed state
588+
FailedPodsError: If any pod is in failed state
588589
589590
"""
591+
wait_for_isvc_pods(client=client, isvc=isvc, runtime_name=runtime_name)
592+
590593
LOGGER.info("Verifying no failed pods")
591594
for pods in TimeoutSampler(
592595
wait_timeout=timeout,
@@ -612,12 +615,16 @@ def verify_no_failed_pods(
612615
pod_status = pod.instance.status
613616

614617
if pod_status.containerStatuses:
615-
for container_status in pod_status.containerStatuses:
618+
for container_status in pod_status.get("containerStatuses", []) + pod_status.get(
619+
"initContainerStatuses", []
620+
):
616621
is_waiting_pull_back_off = (
617622
wait_state := container_status.state.waiting
618623
) and wait_state.reason in (
619624
pod.Status.IMAGE_PULL_BACK_OFF,
620625
pod.Status.CRASH_LOOPBACK_OFF,
626+
pod.Status.ERR_IMAGE_PULL,
627+
"InvalidImageName",
621628
)
622629

623630
is_terminated_error = (
@@ -630,11 +637,6 @@ def verify_no_failed_pods(
630637
if is_waiting_pull_back_off or is_terminated_error:
631638
failed_pods[pod.name] = pod_status
632639

633-
if init_container_status := pod_status.initContainerStatuses:
634-
if container_terminated := init_container_status[0].lastState.terminated:
635-
if container_terminated.reason == "Error":
636-
failed_pods[pod.name] = pod_status
637-
638640
elif pod_status.phase in (
639641
pod.Status.CRASH_LOOPBACK_OFF,
640642
pod.Status.FAILED,
@@ -781,3 +783,23 @@ def wait_for_serverless_pods_deletion(resource: Project | Namespace, admin_clien
781783
):
782784
LOGGER.info(f"Waiting for {KServeDeploymentType.SERVERLESS} pod {pod.name} to be deleted")
783785
pod.wait_deleted(timeout=Timeout.TIMEOUT_1MIN)
786+
787+
788+
@retry(wait_timeout=Timeout.TIMEOUT_30SEC, sleep=1, exceptions_dict={ResourceNotFoundError: []})
789+
def wait_for_isvc_pods(client: DynamicClient, isvc: InferenceService, runtime_name: str | None = None) -> list[Pod]:
790+
"""
791+
Wait for ISVC pods.
792+
793+
Args:
794+
client (DynamicClient): DynamicClient object
795+
isvc (InferenceService): InferenceService object
796+
runtime_name (str | None): ServingRuntime name
797+
798+
Returns:
799+
list[Pod]: A list of all matching pods
800+
801+
Raises:
802+
TimeoutExpiredError: If pods do not exist
803+
"""
804+
LOGGER.info("Waiting for pods to be created")
805+
return get_pods_by_isvc_label(client=client, isvc=isvc, runtime_name=runtime_name)

0 commit comments

Comments
 (0)