Skip to content

Commit feda520

Browse files
committed
refactor(lmeval): improve vLLM model readiness check
Replace the manual timeout and stabilization loop with TimeoutSampler. Add specific exceptions (ResourceNotFoundError, UnexpectedResourceCountError). Use the component=predictor label selector for pod filtering. Use collect_pod_information for better logging.

Signed-off-by: Shehan Saleem <ssaleem@redhat.com>
rh-pre-commit.version: 2.3.2
rh-pre-commit.check-secrets: ENABLED
1 parent a2cd3a1 commit feda520

File tree

2 files changed

+37
-32
lines changed

2 files changed

+37
-32
lines changed

tests/model_explainability/lm_eval/utils.py

Lines changed: 33 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,4 @@
11
import re
2-
import time
32

43
import pandas as pd
54
import structlog
@@ -8,10 +7,16 @@
87
from ocp_resources.lm_eval_job import LMEvalJob
98
from ocp_resources.pod import Pod
109
from pyhelper_utils.general import tts
11-
from timeout_sampler import TimeoutExpiredError
10+
from timeout_sampler import TimeoutExpiredError, TimeoutSampler
1211

1312
from utilities.constants import Timeout
14-
from utilities.exceptions import PodLogMissMatchError, UnexpectedFailureError
13+
from utilities.exceptions import (
14+
PodLogMissMatchError,
15+
ResourceNotFoundError,
16+
UnexpectedFailureError,
17+
UnexpectedResourceCountError,
18+
)
19+
from utilities.general import collect_pod_information
1520

1621
LOGGER = structlog.get_logger(name=__name__)
1722

@@ -116,7 +121,6 @@ def wait_for_vllm_model_ready(
116121
inference_service_name: str,
117122
max_wait_time: int = 600,
118123
check_interval: int = 10,
119-
stabilization_wait: int = 10,
120124
) -> Pod:
121125
"""Wait for vLLM model to download and be ready to serve requests.
122126
@@ -126,12 +130,12 @@ def wait_for_vllm_model_ready(
126130
inference_service_name: Name of the inference service
127131
max_wait_time: Maximum time to wait in seconds
128132
check_interval: Time between checks in seconds
129-
stabilization_wait: Seconds to wait after model is ready for server stabilization
130133
131134
Returns:
132135
The predictor pod once model is ready
133136
134137
Raises:
138+
ResourceNotFoundError: If no predictor pod is found
135139
UnexpectedFailureError: If model fails to load or pod encounters errors
136140
"""
137141
LOGGER.info("Waiting for vLLM model to download and load...")
@@ -140,49 +144,46 @@ def wait_for_vllm_model_ready(
140144
Pod.get(
141145
dyn_client=client,
142146
namespace=namespace,
143-
label_selector=f"serving.kserve.io/inferenceservice={inference_service_name}",
147+
label_selector=f"serving.kserve.io/inferenceservice={inference_service_name},component=predictor",
144148
)
145149
)
146150

147151
if not predictor_pods:
148-
raise UnexpectedFailureError("No predictor pod found for inference service")
152+
raise ResourceNotFoundError(f"No predictor pod found for inference service '{inference_service_name}'.")
149153

150-
predictor_pods = [pod for pod in predictor_pods if "predictor" in pod.name]
151-
152-
if not predictor_pods:
153-
raise UnexpectedFailureError("No predictor pod found for inference service")
154+
if len(predictor_pods) != 1:
155+
raise UnexpectedResourceCountError(
156+
f"Expected exactly 1 predictor pod for inference service '{inference_service_name}', "
157+
f"but found {len(predictor_pods)}: {[pod.name for pod in predictor_pods]}"
158+
)
154159

155160
predictor_pod = predictor_pods[0]
156161
LOGGER.info(f"Predictor pod: {predictor_pod.name}")
157162

158-
elapsed_time = 0
159-
model_loaded = False
160-
161-
while elapsed_time < max_wait_time:
163+
def _check_model_ready() -> bool:
162164
try:
163165
pod_logs = predictor_pod.log(container="kserve-container")
164-
165166
if "Uvicorn running on" in pod_logs or "Application startup complete" in pod_logs:
166167
LOGGER.info("vLLM server is running and ready!")
167-
model_loaded = True
168-
break
168+
return True
169169
else:
170-
LOGGER.info(f"Model still loading... (waited {elapsed_time}s)")
170+
LOGGER.info("Model still loading..")
171+
return False
171172
except (ApiException, OSError) as e:
172173
LOGGER.info(f"Could not get pod logs yet: {e}")
174+
return False
173175

174-
time.sleep(check_interval)
175-
elapsed_time += check_interval
176-
177-
if not model_loaded:
178-
try:
179-
full_logs = predictor_pod.log(container="kserve-container")
180-
LOGGER.error(f"vLLM pod failed to start within {max_wait_time}s. Full logs:\n{full_logs}")
181-
except (ApiException, OSError) as e:
182-
LOGGER.error(f"Could not retrieve pod logs: {e}")
183-
raise UnexpectedFailureError(f"vLLM model failed to load within {max_wait_time} seconds")
184-
185-
LOGGER.info(f"Model loaded! Waiting {stabilization_wait} more seconds for server stabilization.")
186-
time.sleep(stabilization_wait)
176+
try:
177+
for sample in TimeoutSampler(
178+
wait_timeout=max_wait_time,
179+
sleep=check_interval,
180+
func=_check_model_ready,
181+
):
182+
if sample:
183+
break
184+
except TimeoutExpiredError as e:
185+
LOGGER.error(f"vLLM pod failed to start within {max_wait_time} seconds")
186+
collect_pod_information(pod=predictor_pod)
187+
raise UnexpectedFailureError(f"vLLM model failed to load within {max_wait_time} seconds") from e
187188

188189
return predictor_pod

utilities/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -131,3 +131,7 @@ class ExceptionUserLogin(Exception):
131131

132132
class UnexpectedValueError(Exception):
133133
"""Unexpected value found"""
134+
135+
136+
class ResourceNotFoundError(Exception):
137+
"""Resource not found"""

0 commit comments

Comments (0)